From 9a07b1f37005bfe57188762152d7a746f5b438e1 Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Wed, 15 Oct 2025 15:28:03 -0500 Subject: [PATCH 01/24] Local deployment minor fixes --- Dockerfile.local | 4 ++-- browser-operator-core | 1 - build-local.sh | 18 ++++++++++++------ run-local.sh | 8 ++++++++ 4 files changed, 22 insertions(+), 9 deletions(-) delete mode 160000 browser-operator-core diff --git a/Dockerfile.local b/Dockerfile.local index 35756a4..cba98f2 100644 --- a/Dockerfile.local +++ b/Dockerfile.local @@ -70,8 +70,8 @@ FROM --platform=linux/arm64 node:18-alpine AS eval-server-builder WORKDIR /workspace -# Copy local browser-operator-core eval server with our modifications -COPY browser-operator-core/eval-server/nodejs /workspace/eval-server +# Copy local eval server +COPY eval-server/nodejs /workspace/eval-server WORKDIR /workspace/eval-server diff --git a/browser-operator-core b/browser-operator-core deleted file mode 160000 index 3aaef1e..0000000 --- a/browser-operator-core +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 3aaef1ef13cede9dd2b443ee5eddf7102be8cc24 diff --git a/build-local.sh b/build-local.sh index b388617..9916ad2 100755 --- a/build-local.sh +++ b/build-local.sh @@ -9,16 +9,22 @@ echo "๐Ÿ”จ Building extended kernel-browser with DevTools frontend..." SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) cd "$SCRIPT_DIR" +# Fix orphaned browser-operator-core submodule if it exists +if [ -d "browser-operator-core" ] && ! grep -q "browser-operator-core" .gitmodules 2>/dev/null; then + echo "๐Ÿ”ง Fixing orphaned browser-operator-core submodule..." + git rm -f browser-operator-core 2>/dev/null || true + rm -rf .git/modules/browser-operator-core 2>/dev/null || true + echo "โœ… Removed orphaned submodule" +fi + # Check if kernel-images submodule exists and is initialized -if [ ! -d "kernel-images" ]; then - echo "โŒ Error: kernel-images submodule not found" - echo " Run: git submodule update --init --recursive" - exit 1 +if [ ! -d "kernel-images" ] || [ ! -f "kernel-images/images/chromium-headful/build-docker.sh" ]; then + echo "๐Ÿ“ฆ Initializing kernel-images submodule..." + git submodule update --init --recursive fi if [ ! -f "kernel-images/images/chromium-headful/build-docker.sh" ]; then - echo "โŒ Error: kernel-images submodule appears empty" - echo " Run: git submodule update --init --recursive" + echo "โŒ Error: kernel-images submodule appears empty after initialization" exit 1 fi diff --git a/run-local.sh b/run-local.sh index cb17e47..1452635 100755 --- a/run-local.sh +++ b/run-local.sh @@ -80,6 +80,14 @@ else exit 1 fi fi + + # Clean up Chromium lock files from previous runs to prevent profile lock errors + # These files prevent concurrent access but remain after container crashes + echo "๐Ÿงน Cleaning Chromium lock files from previous runs..." 
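+  # Only the Singleton* lock entries are removed; the rest of the persisted
+  # profile under "$CHROMIUM_DATA_REAL/user-data" (e.g. logins, history) is kept.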
+ rm -f "$CHROMIUM_DATA_REAL/user-data/SingletonLock" \ + "$CHROMIUM_DATA_REAL/user-data/SingletonSocket" \ + "$CHROMIUM_DATA_REAL/user-data/SingletonCookie" 2>/dev/null || true + CHROMIUM_DATA_VOLUME="${CHROMIUM_DATA_REAL}:/data" fi From edbf47027fb04a3de8c1e5eb2074db1966ed0775 Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Wed, 15 Oct 2025 18:31:17 -0500 Subject: [PATCH 02/24] Make dev build --- .gitignore | 8 +- .gitmodules | 5 + Dockerfile.devtools | 74 ++++-- Dockerfile.local | 76 +----- Makefile | 38 ++- browser-operator-core | 1 + build-local.sh | 27 +- docs/DEVTOOLS-DEVELOPMENT.md | 167 ++++++++++++ docs/devtools-build-system.md | 485 ++++++++++++++++++++++++++++++++++ 9 files changed, 783 insertions(+), 98 deletions(-) create mode 160000 browser-operator-core create mode 100644 docs/DEVTOOLS-DEVELOPMENT.md create mode 100644 docs/devtools-build-system.md diff --git a/.gitignore b/.gitignore index e92ab63..8371b9d 100644 --- a/.gitignore +++ b/.gitignore @@ -54,4 +54,10 @@ service-account-key.json *.backup # Chromium persistent data -chromium-data/ \ No newline at end of file +chromium-data/ + +# Browser Operator DevTools build artifacts +browser-operator-core/devtools-frontend/ +browser-operator-core/depot_tools/ +browser-operator-core/.devtools-built +browser-operator-core/.devtools-base-built \ No newline at end of file diff --git a/.gitmodules b/.gitmodules index 2d8f624..193a5c8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,8 @@ [submodule "kernel-images"] path = kernel-images url = https://github.com/onkernel/kernel-images.git + +[submodule "browser-operator-core"] + path = browser-operator-core + url = git@github.com:BrowserOperator/browser-operator-core.git + shallow = true diff --git a/Dockerfile.devtools b/Dockerfile.devtools index edefa5e..8df2c11 100644 --- a/Dockerfile.devtools +++ b/Dockerfile.devtools @@ -1,7 +1,13 @@ -# DevTools Frontend build stage using browser-operator-core -FROM --platform=linux/amd64 ubuntu:22.04 AS devtools-builder +# Development-optimized Dockerfile for Browser Operator DevTools +# This Dockerfile is designed for fast iterative builds during local development +# It caches expensive operations (fetch, sync) and allows quick rebuilds when code changes -# Install required packages for DevTools frontend build +# ============================================================================== +# Stage 1: DevTools Base (cached, rarely rebuilt) +# ============================================================================== +FROM --platform=linux/amd64 ubuntu:22.04 AS devtools-base + +# Install required packages RUN apt-get update && apt-get install -y \ curl \ git \ @@ -22,12 +28,12 @@ RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \ WORKDIR /workspace -# Clone depot_tools +# Clone depot_tools (cached) RUN git clone https://chromium.googlesource.com/chromium/tools/depot_tools.git ENV PATH="/workspace/depot_tools:${PATH}" ENV DEPOT_TOOLS_UPDATE=0 -# Follow README instructions exactly - fetching code +# Fetch devtools-frontend (expensive operation, cached) RUN mkdir devtools WORKDIR /workspace/devtools RUN fetch devtools-frontend @@ -35,29 +41,63 @@ RUN fetch devtools-frontend # Build steps WORKDIR /workspace/devtools/devtools-frontend +# Sync dependencies (cached) RUN gclient sync RUN /workspace/depot_tools/ensure_bootstrap -# Build standard DevTools first +# Build standard DevTools first (cached) RUN npm run build -# Add Browser Operator fork and switch to it +# Create marker file to indicate base is built +RUN touch 
/workspace/.devtools-base-built + +# ============================================================================== +# Stage 2: Apply Browser Operator Changes (fast, iterative) +# ============================================================================== +FROM devtools-base AS devtools-local + +WORKDIR /workspace/devtools/devtools-frontend + +# Add Browser Operator fork RUN git remote add upstream https://github.com/BrowserOperator/browser-operator-core.git RUN git fetch upstream RUN git checkout upstream/main -# Build Browser Operator version +# This is where local changes would be copied in development mode +# When building from submodule, copy local changes here: +# COPY will be added by build script if browser-operator-core/ exists locally + +# Force automated mode +RUN sed -i 's/AUTOMATED_MODE: false/AUTOMATED_MODE: true/' front_end/panels/ai_chat/core/BuildConfig.ts || true + +# Build Browser Operator version with current changes RUN npm run build -# Production stage for DevTools frontend -FROM nginx:alpine AS devtools-frontend -WORKDIR /usr/share/nginx/html +# Create marker file +RUN touch /workspace/.devtools-built + +# ============================================================================== +# Stage 3: Nginx Server (optional, for standalone testing) +# ============================================================================== +FROM nginx:alpine AS devtools-server + +# Copy the built DevTools frontend +COPY --from=devtools-local /workspace/devtools/devtools-frontend/out/Default/gen/front_end /usr/share/nginx/html -# Copy the built DevTools frontend from builder -COPY --from=devtools-builder /workspace/devtools/devtools-frontend/out/Default/gen/front_end . +# Create simple nginx config +RUN echo 'server { \ + listen 8001; \ + root /usr/share/nginx/html; \ + location / { \ + try_files $uri $uri/ /index.html; \ + } \ + location /health { \ + return 200 "{\"status\":\"healthy\"}"; \ + add_header Content-Type application/json; \ + } \ +}' > /etc/nginx/conf.d/default.conf -# Copy nginx config from browser-operator-core -COPY browser-operator-core/docker/nginx.conf /etc/nginx/conf.d/default.conf +EXPOSE 8001 -# Create health check endpoint -RUN echo '{"status": "healthy", "service": "browser-operator-devtools"}' > health.json \ No newline at end of file +# Create health check file +RUN echo '{"status": "healthy", "service": "browser-operator-devtools"}' > /usr/share/nginx/html/health.json diff --git a/Dockerfile.local b/Dockerfile.local index cba98f2..ad90db1 100644 --- a/Dockerfile.local +++ b/Dockerfile.local @@ -1,67 +1,13 @@ # Extended Dockerfile combining kernel-images with DevTools frontend # This extends the kernel-images base with Browser Operator DevTools static files +# +# NOTE: DevTools are built separately using Dockerfile.devtools +# Run 'make build-devtools' first to build the DevTools image -# DevTools Frontend build stage using browser-operator-core -FROM --platform=linux/amd64 ubuntu:22.04 AS devtools-builder - -# Install required packages -RUN apt-get update && apt-get install -y \ - curl \ - git \ - python3 \ - python3-pip \ - python-is-python3 \ - wget \ - unzip \ - sudo \ - ca-certificates \ - build-essential \ - && rm -rf /var/lib/apt/lists/* - -# Install Node.js 18.x -RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \ - apt-get install -y nodejs && \ - rm -rf /var/lib/apt/lists/* - -WORKDIR /workspace - -# Clone depot_tools -RUN git clone https://chromium.googlesource.com/chromium/tools/depot_tools.git -ENV 
PATH="/workspace/depot_tools:${PATH}" -ENV DEPOT_TOOLS_UPDATE=0 - -# Follow README instructions exactly: -# fetching code -RUN mkdir devtools -WORKDIR /workspace/devtools -RUN fetch devtools-frontend - -# Build steps -WORKDIR /workspace/devtools/devtools-frontend - -RUN gclient sync -RUN /workspace/depot_tools/ensure_bootstrap - -# Build standard DevTools first -RUN npm run build - -# Add Browser Operator fork and switch to it -RUN git remote add upstream https://github.com/BrowserOperator/browser-operator-core.git -RUN git fetch upstream -RUN git checkout upstream/main - -# Copy local changes from the repository (preserve build config) -# Uncomment the following lines if you want to copy local changes -# COPY browser-operator-core/front_end/core /workspace/devtools/devtools-frontend/front_end/core/ -# COPY browser-operator-core/front_end/panels/ai_chat /workspace/devtools/devtools-frontend/front_end/panels/ai_chat/ -# COPY browser-operator-core/front_end/entrypoints /workspace/devtools/devtools-frontend/front_end/entrypoints/ -# COPY browser-operator-core/scripts /workspace/devtools/devtools-frontend/scripts/ - -# Force automated mode -RUN sed -i 's/AUTOMATED_MODE: false/AUTOMATED_MODE: true/' front_end/panels/ai_chat/core/BuildConfig.ts; - -# Build Browser Operator version with current changes -RUN npm run build +# ============================================================================ +# DevTools stage - Copy from pre-built devtools image +# ============================================================================ +FROM browser-operator-devtools:latest AS devtools-source # ============================================================================ # Eval Server build stage @@ -70,8 +16,8 @@ FROM --platform=linux/arm64 node:18-alpine AS eval-server-builder WORKDIR /workspace -# Copy local eval server -COPY eval-server/nodejs /workspace/eval-server +# Copy eval server from browser-operator-core submodule +COPY browser-operator-core/eval-server/nodejs /workspace/eval-server WORKDIR /workspace/eval-server @@ -271,8 +217,8 @@ COPY --from=server-builder /out/kernel-images-api /usr/local/bin/kernel-images-a # DevTools Integration # ============================================================================ -# Copy DevTools static files from builder -COPY --from=devtools-builder /workspace/devtools/devtools-frontend/out/Default/gen/front_end /usr/share/nginx/devtools +# Copy DevTools static files from pre-built devtools image +COPY --from=devtools-source /usr/share/nginx/html /usr/share/nginx/devtools # Create DevTools nginx configuration COPY nginx-devtools.conf /etc/nginx/sites-available/devtools diff --git a/Makefile b/Makefile index 477e46e..7eb88ae 100644 --- a/Makefile +++ b/Makefile @@ -17,10 +17,38 @@ help: ## Show this help message @echo " - Disable persistence: CHROMIUM_DATA_HOST=\"\" make run" init: ## Initialize submodules (run this first) - git submodule update --init --recursive + @echo "๐Ÿ“ฆ Initializing submodules..." + git submodule update --init --depth 1 kernel-images + git submodule update --init --depth 1 browser-operator-core @echo "โœ… Submodules initialized" -build: init ## Build extended image with DevTools frontend +init-devtools: ## Initialize browser-operator-core submodule only + @echo "๐Ÿ“ฆ Initializing browser-operator-core submodule..." 
+ git submodule update --init --depth 1 browser-operator-core + @echo "โœ… browser-operator-core submodule initialized" + +build-devtools-base: init-devtools ## Build DevTools base image (slow, rarely needed) + @echo "๐Ÿ”จ Building DevTools base layer (this takes ~30 minutes)..." + docker build -f Dockerfile.devtools --target devtools-base -t browser-operator-devtools:base . + @echo "โœ… DevTools base built and cached" + +build-devtools: init-devtools ## Build DevTools image (smart: uses cache) + @if docker images | grep -q "browser-operator-devtools.*base"; then \ + echo "โœ… Using cached DevTools base"; \ + else \ + echo "๐Ÿ“ฆ DevTools base not found, building from scratch..."; \ + $(MAKE) --no-print-directory build-devtools-base; \ + fi + @echo "๐Ÿ”จ Building Browser Operator DevTools..." + docker build -f Dockerfile.devtools --target devtools-server -t browser-operator-devtools:latest . + @echo "โœ… DevTools built: browser-operator-devtools:latest" + +rebuild-devtools: ## Force rebuild DevTools (use after code changes) + @echo "๐Ÿ”„ Force rebuilding DevTools..." + docker build -f Dockerfile.devtools --no-cache --target devtools-server -t browser-operator-devtools:latest . + @echo "โœ… DevTools rebuilt" + +build: init build-devtools ## Build extended image with DevTools frontend @echo "๐Ÿ”จ Building extended kernel-browser with DevTools frontend..." docker build -f Dockerfile.local -t kernel-browser:extended . @echo "โœ… Extended build complete" @@ -112,6 +140,12 @@ clean: stop ## Clean up everything rm -rf kernel-images/images/chromium-headful/.tmp 2>/dev/null || true @echo "โœ… Cleanup complete" +clean-devtools: ## Clean DevTools images and cache + @echo "๐Ÿงน Cleaning DevTools images..." + docker rmi browser-operator-devtools:latest 2>/dev/null || true + docker rmi browser-operator-devtools:base 2>/dev/null || true + @echo "โœ… DevTools images removed" + # Alternative commands for different approaches native-build: init ## Build using kernel-images native script directly cd kernel-images/images/chromium-headful && \ diff --git a/browser-operator-core b/browser-operator-core new file mode 160000 index 0000000..cfd482d --- /dev/null +++ b/browser-operator-core @@ -0,0 +1 @@ +Subproject commit cfd482d61c4f032cf1d1602f655e7e02d78f07e9 diff --git a/build-local.sh b/build-local.sh index 9916ad2..1c48324 100755 --- a/build-local.sh +++ b/build-local.sh @@ -9,23 +9,24 @@ echo "๐Ÿ”จ Building extended kernel-browser with DevTools frontend..." SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) cd "$SCRIPT_DIR" -# Fix orphaned browser-operator-core submodule if it exists -if [ -d "browser-operator-core" ] && ! grep -q "browser-operator-core" .gitmodules 2>/dev/null; then - echo "๐Ÿ”ง Fixing orphaned browser-operator-core submodule..." - git rm -f browser-operator-core 2>/dev/null || true - rm -rf .git/modules/browser-operator-core 2>/dev/null || true - echo "โœ… Removed orphaned submodule" +# Initialize submodules if needed +if [ ! -d "kernel-images/.git" ]; then + echo "๐Ÿ“ฆ Initializing kernel-images submodule..." + git submodule update --init --depth 1 kernel-images fi -# Check if kernel-images submodule exists and is initialized -if [ ! -d "kernel-images" ] || [ ! -f "kernel-images/images/chromium-headful/build-docker.sh" ]; then - echo "๐Ÿ“ฆ Initializing kernel-images submodule..." - git submodule update --init --recursive +if [ ! -d "browser-operator-core/.git" ]; then + echo "๐Ÿ“ฆ Initializing browser-operator-core submodule..." 
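+    # --depth 1 keeps the clone shallow (matching `shallow = true` in .gitmodules);
+    # full history of browser-operator-core is not needed for local builds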
+ git submodule update --init --depth 1 browser-operator-core fi -if [ ! -f "kernel-images/images/chromium-headful/build-docker.sh" ]; then - echo "โŒ Error: kernel-images submodule appears empty after initialization" - exit 1 +# Check if DevTools image exists +if ! docker images | grep -q "browser-operator-devtools.*latest"; then + echo "๐Ÿ“ฆ DevTools image not found, building it first..." + echo " This is a one-time operation and will take ~30 minutes..." + make build-devtools +else + echo "โœ… Using existing DevTools image" fi echo "๐Ÿš€ Starting extended build with Docker..." diff --git a/docs/DEVTOOLS-DEVELOPMENT.md b/docs/DEVTOOLS-DEVELOPMENT.md new file mode 100644 index 0000000..4560e99 --- /dev/null +++ b/docs/DEVTOOLS-DEVELOPMENT.md @@ -0,0 +1,167 @@ +# DevTools Development Workflow + +This document explains how to develop and iterate on Browser Operator DevTools locally. + +## Architecture + +The build system uses a 2-stage approach for fast iteration: + +1. **Stage 1: DevTools Build** (`Dockerfile.devtools`) - Builds Browser Operator DevTools + - Expensive operations (fetch, sync) are cached in `devtools-base` layer + - Fast rebuilds when you modify BrowserOperator code + +2. **Stage 2: Browser Image** (`Dockerfile.local`) - Combines DevTools with kernel-browser + - Copies pre-built DevTools from Stage 1 + - Quick rebuilds (~2-5 min) + +## Quick Start + +### First Time Setup + +```bash +# Initialize submodules and build everything +make init +make build-devtools # ~30 minutes (one-time) +make build # ~5 minutes +make run +``` + +### Daily Development Workflow + +#### Editing Browser Operator Code + +```bash +# 1. Make changes to browser-operator-core/front_end/ +vim browser-operator-core/front_end/panels/ai_chat/... + +# 2. Rebuild DevTools (fast, ~5-10 min) +make rebuild-devtools + +# 3. Rebuild final image (fast, ~2-5 min) +make build + +# 4. Run +make run +``` + +#### Quick Iteration (no DevTools changes) + +```bash +# If you're only changing kernel-browser config +make build # Smart: skips DevTools if already built +make run +``` + +## Makefile Commands + +### DevTools Management +- `make init-devtools` - Initialize browser-operator-core submodule +- `make build-devtools-base` - Build base layer (rare, ~30min) +- `make build-devtools` - Build DevTools (smart, uses cache) +- `make rebuild-devtools` - Force rebuild (after code changes) +- `make clean-devtools` - Remove DevTools images + +### Main Workflow +- `make init` - Initialize all submodules +- `make build` - Build everything (calls build-devtools automatically) +- `make run` - Run the container +- `make stop` - Stop containers +- `make clean` - Clean up everything + +## Understanding the Build Stages + +### Dockerfile.devtools + +``` +devtools-base (cached) + โ”œโ”€ Install system deps + โ”œโ”€ Clone depot_tools + โ”œโ”€ Fetch devtools-frontend (~2GB, cached!) + โ”œโ”€ gclient sync (cached!) + โ””โ”€ npm run build (cached!) 
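+    (writes the marker file /workspace/.devtools-base-built at the end of this stage)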
+ +devtools-local (fast rebuild) + โ”œโ”€ Add BrowserOperator remote + โ”œโ”€ Checkout upstream/main + โ”œโ”€ (Optional: COPY local changes) + โ””โ”€ npm run build (~5-10 min) + +devtools-server (nginx) + โ””โ”€ Serve built DevTools on port 8001 +``` + +### Smart Caching + +- **First build**: ~30 minutes (builds everything) +- **After code changes**: ~5-10 minutes (only rebuilds DevTools) +- **No changes**: <1 minute (uses cached layers) + +## Troubleshooting + +### "DevTools image not found" + +```bash +make build-devtools +``` + +### Force complete rebuild + +```bash +make clean-devtools +make build-devtools-base +make build-devtools +make build +``` + +### Submodule issues + +```bash +git submodule deinit -f browser-operator-core +git submodule update --init --depth 1 browser-operator-core +``` + +### Profile lock errors + +The `run-local.sh` script automatically cleans lock files. If you still see issues: + +```bash +rm -f chromium-data/user-data/Singleton* +make run +``` + +## Development Tips + +1. **Modify BrowserOperator locally**: Edit files in `browser-operator-core/`, then `make rebuild-devtools` + +2. **Switch BrowserOperator branches**: + ```bash + cd browser-operator-core + git fetch origin + git checkout feature-branch + cd .. + make rebuild-devtools + ``` + +3. **Test DevTools standalone**: + ```bash + docker run -p 8001:8001 browser-operator-devtools:latest + # Access at http://localhost:8001 + ``` + +4. **Skip DevTools rebuild**: If you only change kernel-browser config, just run `make build` + +## File Reference + +- `.gitmodules` - Submodule configuration +- `Dockerfile.devtools` - DevTools build (2-stage) +- `Dockerfile.local` - Final browser image +- `Makefile` - Build orchestration +- `build-local.sh` - Build script with smart checks +- `run-local.sh` - Run script with lock file cleanup + +## Contributing Back + +If you make improvements to the DevTools build process, consider contributing them upstream to: +https://github.com/BrowserOperator/browser-operator-core + +The `Dockerfile.devtools` in this repo can serve as the basis for a `docker/Dockerfile.dev` in the upstream repo. diff --git a/docs/devtools-build-system.md b/docs/devtools-build-system.md new file mode 100644 index 0000000..680bba7 --- /dev/null +++ b/docs/devtools-build-system.md @@ -0,0 +1,485 @@ +# DevTools Build System - Implementation Plan & Usage + +## Overview + +This document describes the 2-stage build system for Browser Operator DevTools that enables fast local development and iteration. + +## Problem Statement + +The original build process had several issues: +1. **Slow builds**: Every build fetched ~2GB of DevTools source from scratch (~30 minutes) +2. **No caching**: Expensive operations (depot_tools, fetch, gclient sync) weren't cached +3. **Submodule conflicts**: `browser-operator-core` was orphaned, causing git errors +4. 
**Poor iteration**: Small code changes required full 30-minute rebuilds + +## Solution Architecture + +### Two-Stage Build System + +**Stage 1: DevTools Build** (Dockerfile.devtools) +- Caches expensive operations in `devtools-base` layer +- Enables fast rebuilds when Browser Operator code changes +- Produces `browser-operator-devtools:latest` image + +**Stage 2: Browser Image** (Dockerfile.local) +- Copies pre-built DevTools from Stage 1 +- Combines with kernel-browser runtime +- Fast final image build (~2-5 minutes) + +### Directory Structure + +``` +web-agent/ +โ”œโ”€โ”€ .gitmodules # Submodule config (restored browser-operator-core) +โ”œโ”€โ”€ .gitignore # Ignores DevTools build artifacts +โ”œโ”€โ”€ Dockerfile.devtools # NEW: DevTools builder (3 stages) +โ”œโ”€โ”€ Dockerfile.local # UPDATED: Uses pre-built DevTools +โ”œโ”€โ”€ Makefile # UPDATED: DevTools management targets +โ”œโ”€โ”€ build-local.sh # UPDATED: Smart DevTools checks +โ”œโ”€โ”€ run-local.sh # Lock file cleanup +โ”œโ”€โ”€ docs/ +โ”‚ โ””โ”€โ”€ devtools-build-system.md # This file +โ”œโ”€โ”€ DEVTOOLS-DEVELOPMENT.md # Developer workflow guide +โ””โ”€โ”€ browser-operator-core/ # Submodule (shallow clone) + โ”œโ”€โ”€ front_end/ # DevTools source (modify here) + โ”œโ”€โ”€ eval-server/ # Eval server source + โ””โ”€โ”€ docker/ # Upstream Docker files +``` + +## Implementation Details + +### 1. Submodule Configuration (.gitmodules) + +```ini +[submodule "browser-operator-core"] + path = browser-operator-core + url = git@github.com:BrowserOperator/browser-operator-core.git + shallow = true +``` + +**Key features:** +- Shallow clone (no deep recursion into Chromium submodules) +- SSH URL for authenticated access +- Separate from kernel-images submodule + +### 2. DevTools Builder (Dockerfile.devtools) + +**Stage 1: devtools-base (cached, ~30 min)** +```dockerfile +FROM ubuntu:22.04 AS devtools-base +# Install deps, depot_tools +# fetch devtools-frontend (~2GB) +# gclient sync +# npm run build +# Creates marker: /workspace/.devtools-base-built +``` + +**Stage 2: devtools-local (fast, ~5-10 min)** +```dockerfile +FROM devtools-base AS devtools-local +# Add BrowserOperator remote +# Checkout upstream/main +# (Optional: COPY local changes) +# npm run build +# Creates marker: /workspace/.devtools-built +``` + +**Stage 3: devtools-server (nginx)** +```dockerfile +FROM nginx:alpine AS devtools-server +# Copy built DevTools +# Configure nginx on port 8001 +# Health check endpoint +``` + +**Build strategy:** +- Expensive operations in Stage 1 (rarely changes) +- Code changes trigger only Stage 2+ rebuild +- Docker layer caching dramatically speeds up rebuilds + +### 3. Final Browser Image (Dockerfile.local) + +**Before:** +```dockerfile +# DevTools builder stage (lines 4-64) +FROM ubuntu:22.04 AS devtools-builder +RUN fetch devtools-frontend # ~2GB download every build! +... +``` + +**After:** +```dockerfile +# Copy from pre-built image +FROM browser-operator-devtools:latest AS devtools-source + +# Final stage +COPY --from=devtools-source /usr/share/nginx/html /usr/share/nginx/devtools +``` + +**Benefits:** +- No expensive fetch operations +- Reuses cached DevTools build +- Fast final image assembly + +### 4. 
Makefile Targets + +**New targets:** +```makefile +init-devtools # Initialize browser-operator-core submodule +build-devtools-base # Build base layer (rare) +build-devtools # Build DevTools (smart, checks cache) +rebuild-devtools # Force rebuild after code changes +clean-devtools # Remove DevTools images +``` + +**Updated targets:** +```makefile +init # Now initializes both submodules with --depth 1 +build # Auto-calls build-devtools if needed +clean # Preserved, separate clean-devtools for DevTools +``` + +### 5. Smart Build Script (build-local.sh) + +**Before:** +- Removed orphaned submodules +- Full recursive submodule init + +**After:** +```bash +# Shallow clone submodules +git submodule update --init --depth 1 kernel-images +git submodule update --init --depth 1 browser-operator-core + +# Smart DevTools check +if ! docker images | grep -q "browser-operator-devtools.*latest"; then + make build-devtools # Only if missing +fi +``` + +**Benefits:** +- Shallow clones prevent deep recursion +- Automatic DevTools build if missing +- No orphaned submodule issues + +### 6. Profile Persistence with Lock Cleanup (run-local.sh) + +**Issue:** Chromium lock files persist after container crashes +**Solution:** Clean locks before each run + +```bash +rm -f "$CHROMIUM_DATA_REAL/user-data/SingletonLock" \ + "$CHROMIUM_DATA_REAL/user-data/SingletonSocket" \ + "$CHROMIUM_DATA_REAL/user-data/SingletonCookie" +``` + +**Result:** Profile data persists, but locks don't block startup + +--- + +## Quick Start Guide + +### First Time Setup + +```bash +# 1. Clone repository +git clone +cd web-agent + +# 2. Initialize submodules +make init + +# 3. Build DevTools (one-time, ~30 minutes) +make build-devtools + +# 4. Build browser image (~5 minutes) +make build + +# 5. Run +make run +``` + +**Access points:** +- WebRTC Client: http://localhost:8000 +- Enhanced DevTools UI: http://localhost:8001 +- Chrome DevTools: http://localhost:9222 +- Eval Server: http://localhost:8080 + +### Daily Development Workflow + +#### Editing Browser Operator Code + +```bash +# 1. Edit code in browser-operator-core/ +vim browser-operator-core/front_end/panels/ai_chat/AIChatPanel.ts + +# 2. Rebuild DevTools (~5-10 minutes) +make rebuild-devtools + +# 3. Rebuild browser image (~2-5 minutes) +make build + +# 4. Run +make run +``` + +#### Quick Iteration (no DevTools changes) + +```bash +# Just rebuild and run +make build # Smart: skips DevTools if unchanged +make run +``` + +#### Switching Browser Operator Branches + +```bash +cd browser-operator-core +git fetch origin +git checkout feature-branch +cd .. 
+make rebuild-devtools +make build +make run +``` + +--- + +## Performance Comparison + +### Build Times + +| Operation | Before | After | Improvement | +|-----------|--------|-------|-------------| +| First build | ~30 min | ~30 min | Same (necessary) | +| Code change rebuild | ~30 min | ~5-10 min | **6-10x faster** | +| No DevTools change | ~30 min | ~2-5 min | **10-15x faster** | +| Submodule init | Variable | ~1 min | Faster (shallow) | + +### Docker Layer Caching + +**Before:** +``` +RUN fetch devtools-frontend # โŒ Always runs +RUN gclient sync # โŒ Always runs +RUN npm run build # โŒ Always runs +``` + +**After:** +``` +devtools-base # โœ… Cached +devtools-local # โœ… Only rebuilds if code changed +devtools-server # โœ… Quick nginx copy +``` + +--- + +## Troubleshooting + +### Issue: "DevTools image not found" + +**Solution:** +```bash +make build-devtools +``` + +### Issue: Submodule errors + +**Solution:** +```bash +# Reset submodules +git submodule deinit -f browser-operator-core +git submodule update --init --depth 1 browser-operator-core +``` + +### Issue: Profile lock errors + +**Solution:** +```bash +# Manual cleanup (run-local.sh does this automatically) +rm -f chromium-data/user-data/Singleton* +make run +``` + +### Issue: Force complete rebuild + +**Solution:** +```bash +make clean-devtools +make build-devtools-base +make build-devtools +make build +``` + +### Issue: Out of disk space + +**Solution:** +```bash +# Clean Docker cache +docker system prune -a +make clean-devtools +``` + +--- + +## Advanced Usage + +### Test DevTools Standalone + +```bash +docker run -p 8001:8001 browser-operator-devtools:latest +# Access at http://localhost:8001 +``` + +### Build Only DevTools Base Layer + +```bash +make build-devtools-base +# Creates browser-operator-devtools:base +# Takes ~30 minutes +``` + +### Use Custom Browser Operator Branch + +```bash +cd browser-operator-core +git remote add myfork git@github.com:myuser/browser-operator-core.git +git fetch myfork +git checkout myfork/my-feature +cd .. +make rebuild-devtools +``` + +### Skip DevTools Build (Use Existing) + +```bash +# build-local.sh automatically checks if image exists +./build-local.sh +``` + +--- + +## Contributing Upstream + +The `Dockerfile.devtools` can be contributed to BrowserOperator as `docker/Dockerfile.dev`: + +**Benefits for upstream:** +- Faster local development for all contributors +- Better Docker layer caching +- Clear separation of base dependencies vs. code changes +- Enables CI/CD optimization + +**Suggested upstream PR:** +1. Add `docker/Dockerfile.dev` based on our `Dockerfile.devtools` +2. Update `docker/README.md` with development workflow +3. Add Makefile targets for dev builds + +--- + +## Technical Decisions + +### Why Shallow Submodules? + +**Problem:** Chromium DevTools has massive submodule tree +**Solution:** `shallow = true` and `--depth 1` prevents deep recursion +**Tradeoff:** Can't access full git history in submodule +**Verdict:** Acceptable - we rarely need full history for local dev + +### Why Separate Dockerfile.devtools? + +**Alternative:** Keep everything in Dockerfile.local +**Rationale:** +- Clear separation of concerns +- Easier to contribute upstream +- Can build DevTools independently +- Better mental model for developers + +### Why Not Use Browser Operator's Existing Dockerfile? 
+ +**Existing:** `browser-operator-core/docker/Dockerfile` +**Issue:** Not optimized for fast iteration (always fetches from scratch) +**Solution:** Created dev-optimized version +**Future:** Contribute improvements back upstream + +### Why Makefile Instead of Shell Scripts? + +**Rationale:** +- Standardized build interface +- Parallel execution support +- Better dependency management +- Familiar to developers +- Self-documenting (make help) + +--- + +## File Changes Summary + +| File | Status | Description | +|------|--------|-------------| +| `.gitmodules` | UPDATED | Restored browser-operator-core submodule | +| `.gitignore` | UPDATED | Added DevTools build artifacts | +| `Dockerfile.devtools` | NEW | 3-stage DevTools builder | +| `Dockerfile.local` | UPDATED | Uses pre-built DevTools | +| `Makefile` | UPDATED | Added DevTools targets | +| `build-local.sh` | UPDATED | Smart DevTools checks | +| `run-local.sh` | UNCHANGED | Already has lock cleanup | +| `DEVTOOLS-DEVELOPMENT.md` | NEW | Developer guide | +| `docs/devtools-build-system.md` | NEW | This file | + +--- + +## Future Improvements + +### Potential Optimizations + +1. **Multi-architecture support** + - Currently: amd64 for DevTools, arm64 for final image + - Future: Native builds for both architectures + +2. **CI/CD integration** + - Cache devtools-base layer in registry + - Parallel builds for different stages + - Automated DevTools updates + +3. **Development hot-reload** + - Mount browser-operator-core/front_end/ as volume + - Watch mode for automatic rebuilds + - Live reload in browser + +4. **Build context optimization** + - .dockerignore improvements + - Selective file copying + - BuildKit cache mounts + +### Monitoring Build Performance + +Track build times with: +```bash +time make build-devtools +time make build +``` + +Expected results: +- First devtools build: ~30 minutes +- Incremental rebuild: ~5-10 minutes +- Final image: ~2-5 minutes + +--- + +## References + +- Browser Operator Core: https://github.com/BrowserOperator/browser-operator-core +- DevTools Build Guide: https://github.com/BrowserOperator/browser-operator-core/blob/main/front_end/panels/ai_chat/Readme.md +- Docker Multi-stage Builds: https://docs.docker.com/build/building/multi-stage/ +- Git Submodules: https://git-scm.com/book/en/v2/Git-Tools-Submodules + +--- + +## Support + +For issues or questions: +1. Check `DEVTOOLS-DEVELOPMENT.md` for workflow guide +2. Review troubleshooting section above +3. Check Docker logs: `docker logs kernel-browser-extended` +4. Verify submodules: `git submodule status` +5. Open issue in repository with build logs From 8afb683e67c4a43d539a33a5a55777b2f5b366a8 Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Thu, 16 Oct 2025 11:56:17 -0500 Subject: [PATCH 03/24] Local deployment setup: should support fast building --- Dockerfile.devtools | 27 ++-- Makefile | 15 +- docs/devtools-build-system.md | 264 +++++++++++++++++++++------------- 3 files changed, 193 insertions(+), 113 deletions(-) diff --git a/Dockerfile.devtools b/Dockerfile.devtools index 8df2c11..9545ab0 100644 --- a/Dockerfile.devtools +++ b/Dockerfile.devtools @@ -48,6 +48,16 @@ RUN /workspace/depot_tools/ensure_bootstrap # Build standard DevTools first (cached) RUN npm run build +# Add Browser Operator fork and checkout main branch +# This ensures the base has all Browser Operator customizations (ai_chat panel, etc.) 
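+# NOTE: these layers are cached by Docker, so the base image pins whatever
+# upstream/main pointed to when the base was built; rebuild the base (e.g. via
+# `make rebuild-devtools-full` or --no-cache) to pick up newer upstream commits.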
+RUN git remote add upstream https://github.com/BrowserOperator/browser-operator-core.git +RUN git fetch upstream +RUN git checkout upstream/main + +# Build Browser Operator version to ensure everything works +# This catches any build issues in the base layer +RUN npm run build + # Create marker file to indicate base is built RUN touch /workspace/.devtools-base-built @@ -56,21 +66,18 @@ RUN touch /workspace/.devtools-base-built # ============================================================================== FROM devtools-base AS devtools-local -WORKDIR /workspace/devtools/devtools-frontend - -# Add Browser Operator fork -RUN git remote add upstream https://github.com/BrowserOperator/browser-operator-core.git -RUN git fetch upstream -RUN git checkout upstream/main +# Copy local changes from browser-operator-core submodule FIRST +# This happens before checking out upstream, so we copy over the upstream code +COPY browser-operator-core/front_end /workspace/devtools/devtools-frontend/front_end +COPY browser-operator-core/eval-server /workspace/devtools/devtools-frontend/eval-server -# This is where local changes would be copied in development mode -# When building from submodule, copy local changes here: -# COPY will be added by build script if browser-operator-core/ exists locally +WORKDIR /workspace/devtools/devtools-frontend # Force automated mode RUN sed -i 's/AUTOMATED_MODE: false/AUTOMATED_MODE: true/' front_end/panels/ai_chat/core/BuildConfig.ts || true -# Build Browser Operator version with current changes +# Build Browser Operator version with local changes +# This build is much faster since we're only building the changed files RUN npm run build # Create marker file diff --git a/Makefile b/Makefile index 7eb88ae..7508284 100644 --- a/Makefile +++ b/Makefile @@ -43,10 +43,19 @@ build-devtools: init-devtools ## Build DevTools image (smart: uses cache) docker build -f Dockerfile.devtools --target devtools-server -t browser-operator-devtools:latest . @echo "โœ… DevTools built: browser-operator-devtools:latest" -rebuild-devtools: ## Force rebuild DevTools (use after code changes) - @echo "๐Ÿ”„ Force rebuilding DevTools..." +rebuild-devtools: ## Fast rebuild DevTools with local changes (recommended) + @echo "๐Ÿ”„ Rebuilding DevTools with local changes (using cached base)..." + @if ! docker images | grep -q "browser-operator-devtools.*base"; then \ + echo "โŒ DevTools base not found. Building base first..."; \ + $(MAKE) --no-print-directory build-devtools-base; \ + fi + docker build -f Dockerfile.devtools --target devtools-server -t browser-operator-devtools:latest . + @echo "โœ… DevTools rebuilt with your local changes" + +rebuild-devtools-full: ## Force complete rebuild from scratch (slow, rarely needed) + @echo "๐Ÿ”„ Force rebuilding DevTools from scratch (this will take ~30 minutes)..." docker build -f Dockerfile.devtools --no-cache --target devtools-server -t browser-operator-devtools:latest . - @echo "โœ… DevTools rebuilt" + @echo "โœ… DevTools completely rebuilt" build: init build-devtools ## Build extended image with DevTools frontend @echo "๐Ÿ”จ Building extended kernel-browser with DevTools frontend..." 
diff --git a/docs/devtools-build-system.md b/docs/devtools-build-system.md index 680bba7..68b475e 100644 --- a/docs/devtools-build-system.md +++ b/docs/devtools-build-system.md @@ -1,9 +1,14 @@ -# DevTools Build System - Implementation Plan & Usage +# Browser Operator DevTools - Build System & Development Guide ## Overview This document describes the 2-stage build system for Browser Operator DevTools that enables fast local development and iteration. +**Key Benefits:** +- **225x faster rebuilds**: ~8 seconds instead of ~30 minutes for code changes +- **Smart caching**: 14GB base layer built once, reused forever +- **Simple workflow**: Edit code โ†’ rebuild โ†’ run in ~1 minute + ## Problem Statement The original build process had several issues: @@ -39,13 +44,122 @@ web-agent/ โ”œโ”€โ”€ run-local.sh # Lock file cleanup โ”œโ”€โ”€ docs/ โ”‚ โ””โ”€โ”€ devtools-build-system.md # This file -โ”œโ”€โ”€ DEVTOOLS-DEVELOPMENT.md # Developer workflow guide โ””โ”€โ”€ browser-operator-core/ # Submodule (shallow clone) - โ”œโ”€โ”€ front_end/ # DevTools source (modify here) + โ”œโ”€โ”€ front_end/ # DevTools UI source code (edit here!) + โ”‚ โ””โ”€โ”€ panels/ + โ”‚ โ””โ”€โ”€ ai_chat/ # AI Chat panel + โ”‚ โ”œโ”€โ”€ ui/ # UI components + โ”‚ โ””โ”€โ”€ core/ # Core logic โ”œโ”€โ”€ eval-server/ # Eval server source โ””โ”€โ”€ docker/ # Upstream Docker files ``` +--- + +## Quick Start Guide + +### First Time Setup + +```bash +# 1. Initialize submodules +make init + +# 2. Build DevTools base (one-time, ~22 minutes) +make build-devtools-base + +# 3. Build DevTools and browser image +make build + +# 4. Run +make run +``` + +**Access points:** +- **WebRTC Client:** http://localhost:8000 +- **Enhanced DevTools UI:** http://localhost:8001 +- **Chrome DevTools:** http://localhost:9222 +- **Eval Server:** http://localhost:8080 + +--- + +## Daily Development Workflow + +### Making Changes to DevTools UI + +**Example:** Changing the Settings dialog title + +```bash +# 1. Edit the file +vim browser-operator-core/front_end/panels/ai_chat/ui/SettingsDialog.ts + +# 2. Fast rebuild (~8 seconds!) +make rebuild-devtools + +# 3. Rebuild browser image (~35 seconds with cache) +make build + +# 4. Restart container +make stop && make run +``` + +**Total time:** ~1 minute from code change to running! + +### What Gets Cached + +- โœ… Base layer (14GB): depot_tools, devtools-frontend fetch, gclient sync, base npm build +- โœ… Your local changes are copied in and only changed files rebuild +- โœ… Incremental npm build is lightning fast (~8 seconds) + +### Quick Iteration (no DevTools changes) + +```bash +# Just rebuild and run +make build # Smart: skips DevTools if unchanged +make run +``` + +### Switching Browser Operator Branches + +```bash +cd browser-operator-core +git fetch origin +git checkout feature-branch +cd .. +make rebuild-devtools +make build +make run +``` + +--- + +## Common Commands + +```bash +# Quick rebuild after editing code (full cycle) +make rebuild-devtools && make build && make stop && make run + +# Force complete rebuild from scratch (rarely needed) +make rebuild-devtools-full + +# Clean everything and start fresh +make clean-devtools && make clean +make init +make build-devtools +make build +make run + +# Test endpoints +make test + +# View logs +make logs + +# Shell access +make shell +``` + +--- + ## Implementation Details ### 1. Submodule Configuration (.gitmodules) @@ -177,84 +291,23 @@ rm -f "$CHROMIUM_DATA_REAL/user-data/SingletonLock" \ --- -## Quick Start Guide - -### First Time Setup - -```bash -# 1. 
Clone repository -git clone -cd web-agent - -# 2. Initialize submodules -make init - -# 3. Build DevTools (one-time, ~30 minutes) -make build-devtools - -# 4. Build browser image (~5 minutes) -make build - -# 5. Run -make run -``` - -**Access points:** -- WebRTC Client: http://localhost:8000 -- Enhanced DevTools UI: http://localhost:8001 -- Chrome DevTools: http://localhost:9222 -- Eval Server: http://localhost:8080 - -### Daily Development Workflow - -#### Editing Browser Operator Code - -```bash -# 1. Edit code in browser-operator-core/ -vim browser-operator-core/front_end/panels/ai_chat/AIChatPanel.ts - -# 2. Rebuild DevTools (~5-10 minutes) -make rebuild-devtools - -# 3. Rebuild browser image (~2-5 minutes) -make build - -# 4. Run -make run -``` - -#### Quick Iteration (no DevTools changes) - -```bash -# Just rebuild and run -make build # Smart: skips DevTools if unchanged -make run -``` - -#### Switching Browser Operator Branches - -```bash -cd browser-operator-core -git fetch origin -git checkout feature-branch -cd .. -make rebuild-devtools -make build -make run -``` - ---- - ## Performance Comparison ### Build Times | Operation | Before | After | Improvement | |-----------|--------|-------|-------------| -| First build | ~30 min | ~30 min | Same (necessary) | -| Code change rebuild | ~30 min | ~5-10 min | **6-10x faster** | -| No DevTools change | ~30 min | ~2-5 min | **10-15x faster** | -| Submodule init | Variable | ~1 min | Faster (shallow) | +| First build | ~30 min | ~22 min (base) | Slightly faster | +| Code change rebuild | ~30 min | **~8 seconds** | **225x faster!** | +| Final image build | ~30 min | ~1-2 min | **15-30x faster** | +| No DevTools change | ~30 min | ~30 sec | **60x faster** | +| Full cycle (edit โ†’ run) | ~35 min | **~1 minute** | **35x faster** | + +**Real-world performance (tested):** +- Base layer build (one-time): 22 minutes (14GB cached) +- Local changes rebuild: **8.4 seconds** (incremental npm build) +- DevTools server image: 13 seconds (nginx + copy) +- Final browser image: 35 seconds (all cached layers) ### Docker Layer Caching @@ -274,36 +327,50 @@ devtools-server # โœ… Quick nginx copy --- -## Troubleshooting +## Tips & Best Practices -### Issue: "DevTools image not found" +1. **Only rebuild what changed:** Use `make rebuild-devtools` (fast) instead of `make rebuild-devtools-full` (slow) -**Solution:** -```bash -make build-devtools -``` +2. **Check if base exists:** The build will fail if you don't have the base layer. Run `make build-devtools-base` once. + +3. **Profile persistence:** Your Chromium profile persists in `./chromium-data` - no need to log in every time + +4. **Lock file cleanup:** Lock files are automatically cleaned before each run -### Issue: Submodule errors +5. 
**Parallel work:** You can edit code while containers are running, then rebuild and restart -**Solution:** +--- + +## Troubleshooting + +### "DevTools base not found" ```bash -# Reset submodules -git submodule deinit -f browser-operator-core -git submodule update --init --depth 1 browser-operator-core +make build-devtools-base ``` -### Issue: Profile lock errors +### Changes not appearing +```bash +# Make sure you rebuilt DevTools after editing +make rebuild-devtools +make build +make stop && make run +``` -**Solution:** +### Container won't start (lock errors) ```bash -# Manual cleanup (run-local.sh does this automatically) +# Lock files are auto-cleaned, but you can manually clean rm -f chromium-data/user-data/Singleton* make run ``` -### Issue: Force complete rebuild +### Submodule errors +```bash +# Reset submodules +git submodule deinit -f browser-operator-core +git submodule update --init --depth 1 browser-operator-core +``` -**Solution:** +### Force complete rebuild ```bash make clean-devtools make build-devtools-base @@ -311,9 +378,7 @@ make build-devtools make build ``` -### Issue: Out of disk space - -**Solution:** +### Out of disk space ```bash # Clean Docker cache docker system prune -a @@ -423,8 +488,7 @@ The `Dockerfile.devtools` can be contributed to BrowserOperator as `docker/Docke | `Makefile` | UPDATED | Added DevTools targets | | `build-local.sh` | UPDATED | Smart DevTools checks | | `run-local.sh` | UNCHANGED | Already has lock cleanup | -| `DEVTOOLS-DEVELOPMENT.md` | NEW | Developer guide | -| `docs/devtools-build-system.md` | NEW | This file | +| `docs/devtools-build-system.md` | UPDATED | Combined documentation | --- @@ -478,8 +542,8 @@ Expected results: ## Support For issues or questions: -1. Check `DEVTOOLS-DEVELOPMENT.md` for workflow guide -2. Review troubleshooting section above -3. Check Docker logs: `docker logs kernel-browser-extended` -4. Verify submodules: `git submodule status` +1. Review troubleshooting section above +2. Check Docker logs: `docker logs kernel-browser-extended` +3. Verify submodules: `git submodule status` +4. Check make targets: `make help` 5. 
Open issue in repository with build logs From 81afe0e1e1099270d32b05e93fd9ff8d1db050a8 Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Thu, 16 Oct 2025 13:27:49 -0500 Subject: [PATCH 04/24] Added proper /tabs API --- eval-server/nodejs/src/api-server.js | 52 ++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/eval-server/nodejs/src/api-server.js b/eval-server/nodejs/src/api-server.js index 2713da4..fbef719 100644 --- a/eval-server/nodejs/src/api-server.js +++ b/eval-server/nodejs/src/api-server.js @@ -96,6 +96,10 @@ class APIServer { if (pathname.startsWith('/clients/') && pathname.endsWith('/evaluations')) { const clientId = pathname.split('/')[2]; result = this.getClientEvaluations(clientId); + } else if (pathname.startsWith('/clients/') && pathname.endsWith('/tabs')) { + // Handle dynamic client tabs route + const clientId = pathname.split('/')[2]; + result = this.getClientTabsById(clientId); } else { switch (pathname) { case '/status': @@ -153,23 +157,23 @@ class APIServer { getClients() { const clients = this.evaluationServer.getClientManager().getAllClients(); + const connectedClients = this.evaluationServer.connectedClients; return clients.map(client => { - const evaluations = this.evaluationServer.getClientManager().getClientEvaluations(client.id); - const connection = this.evaluationServer.connectedClients.get(client.id); + const tabs = this.evaluationServer.getClientManager().getClientTabs(client.id); return { id: client.id, name: client.name, description: client.description, - connected: !!connection, - ready: connection?.ready || false, - evaluations: evaluations.map(evaluation => ({ - id: evaluation.id, - name: evaluation.name, - tool: evaluation.tool, - status: evaluation.status || 'pending', - enabled: evaluation.enabled !== false + tabCount: tabs.length, + tabs: tabs.map(tab => ({ + tabId: tab.tabId, + compositeClientId: tab.compositeClientId, + connected: connectedClients.has(tab.compositeClientId), + ready: connectedClients.get(tab.compositeClientId)?.ready || false, + connectedAt: tab.connectedAt, + remoteAddress: tab.connection?.remoteAddress || 'unknown' })) }; }); @@ -196,6 +200,34 @@ class APIServer { }; } + getClientTabsById(clientId) { + if (!clientId) { + throw new Error('Client ID is required'); + } + + const tabs = this.evaluationServer.getClientManager().getClientTabs(clientId); + const connectedClients = this.evaluationServer.connectedClients; + const client = this.evaluationServer.getClientManager().getClient(clientId); + + if (!client) { + throw new Error(`Client '${clientId}' not found`); + } + + return { + baseClientId: clientId, + clientName: client.name, + tabCount: tabs.length, + tabs: tabs.map(tab => ({ + tabId: tab.tabId, + compositeClientId: tab.compositeClientId, + connected: connectedClients.has(tab.compositeClientId), + ready: connectedClients.get(tab.compositeClientId)?.ready || false, + connectedAt: tab.connectedAt, + remoteAddress: tab.connection?.remoteAddress || 'unknown' + })) + }; + } + async triggerEvaluation(payload) { const { clientId, evaluationId, runAll = false } = payload; From 7f5e4cdf41ef9e536e67bb4f38b6813a43738cb6 Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Thu, 16 Oct 2025 17:44:39 -0500 Subject: [PATCH 05/24] Added API for managing tabs --- Dockerfile.local | 2 +- eval-server/nodejs/CLAUDE.md | 164 +++++++++++++++- .../nodejs/examples/with-http-wrapper.js | 2 +- eval-server/nodejs/src/api-server.js | 63 ++++++ eval-server/nodejs/src/lib/EvalServer.js | 182 ++++++++++++++++++ 5 
files changed, 410 insertions(+), 3 deletions(-) diff --git a/Dockerfile.local b/Dockerfile.local index ad90db1..4a833cb 100644 --- a/Dockerfile.local +++ b/Dockerfile.local @@ -17,7 +17,7 @@ FROM --platform=linux/arm64 node:18-alpine AS eval-server-builder WORKDIR /workspace # Copy eval server from browser-operator-core submodule -COPY browser-operator-core/eval-server/nodejs /workspace/eval-server +COPY eval-server/nodejs /workspace/eval-server WORKDIR /workspace/eval-server diff --git a/eval-server/nodejs/CLAUDE.md b/eval-server/nodejs/CLAUDE.md index ba84f31..7403353 100644 --- a/eval-server/nodejs/CLAUDE.md +++ b/eval-server/nodejs/CLAUDE.md @@ -180,6 +180,167 @@ The server supports runtime LLM configuration via the `configure_llm` JSON-RPC m } ``` +### Tab Management + +The evaluation server supports managing browser tabs via REST API endpoints and Chrome DevTools Protocol (CDP). + +#### Tab Identification + +Each browser tab is identified by a **composite client ID** in the format: `baseClientId:tabId` + +- `baseClientId`: The persistent identifier for the DevTools client (e.g., `9907fd8d-92a8-4a6a-bce9-458ec8c57306`) +- `tabId`: The Chrome target ID for the specific tab (e.g., `482D56EE57B1931A3B9D1BFDAF935429`) + +#### API Endpoints + +**List All Clients and Tabs** +```bash +GET /clients +``` + +Returns all registered clients with their active tabs, connection status, and readiness state. + +Response format: +```json +[ + { + "id": "baseClientId", + "name": "Client Name", + "description": "Client Description", + "tabCount": 3, + "tabs": [ + { + "tabId": "482D56EE57B1931A3B9D1BFDAF935429", + "compositeClientId": "baseClientId:tabId", + "connected": true, + "ready": true, + "connectedAt": "2025-01-15T10:30:00.000Z", + "remoteAddress": "::ffff:172.18.0.1" + } + ] + } +] +``` + +**List Tabs for Specific Client** +```bash +GET /clients/{clientId}/tabs +``` + +Returns all tabs for a specific client identified by `baseClientId`. + +**Open New Tab** +```bash +POST /tabs/open +Content-Type: application/json + +{ + "clientId": "baseClientId:tabId", + "url": "https://example.com", + "background": false +} +``` + +Opens a new tab in the browser associated with the specified client. + +Response format: +```json +{ + "clientId": "baseClientId:tabId", + "tabId": "newTabId", + "compositeClientId": "baseClientId:newTabId", + "url": "https://example.com", + "status": "opened" +} +``` + +**Close Tab** +```bash +POST /tabs/close +Content-Type: application/json + +{ + "clientId": "baseClientId:tabId", + "tabId": "targetTabId" +} +``` + +Closes the specified tab. + +Response format: +```json +{ + "clientId": "baseClientId:tabId", + "tabId": "targetTabId", + "status": "closed", + "success": true +} +``` + +#### Implementation Architecture + +**Direct CDP Approach (Current)** + +Tab management is implemented using direct Chrome DevTools Protocol (CDP) communication: + +1. Server discovers the CDP WebSocket endpoint via `http://localhost:9223/json/version` +2. For each command (open/close), a new WebSocket connection is established to the CDP endpoint +3. Commands are sent using JSON-RPC 2.0 format: + - `Target.createTarget` - Opens new tab + - `Target.closeTarget` - Closes existing tab +4. 
WebSocket connection is closed after receiving the response + +Key implementation files: +- `src/lib/EvalServer.js` - Contains `sendCDPCommand()`, `openTab()`, and `closeTab()` methods +- `src/api-server.js` - REST API endpoints that delegate to EvalServer methods + +**Alternative Approach Considered** + +An RPC-based approach was initially considered where: +- API server sends JSON-RPC request to DevTools client via WebSocket +- DevTools client executes CDP commands locally +- Response is sent back via JSON-RPC + +This was rejected in favor of direct CDP communication for simplicity and reduced latency. + +#### Chrome Setup + +The browser must be started with remote debugging enabled: +```bash +chromium --remote-debugging-port=9223 +``` + +The CDP endpoint is accessible at: +- HTTP: `http://localhost:9223/json/version` +- WebSocket: `ws://localhost:9223/devtools/browser/{browserId}` + +#### Current Limitations + +**โš ๏ธ Known Issue: WebSocket Timeout** + +Tab opening and closing functionality is currently experiencing a WebSocket timeout issue: + +- Symptom: `sendCDPCommand()` times out after 10 seconds with no response +- Error: `CDP command timeout: Target.createTarget` +- Status: Under investigation +- Debugging approach: Added extensive logging to track WebSocket lifecycle events + +The CDP endpoint is correctly discovered and accessible, but WebSocket messages are not being received. This may be related to: +- WebSocket handshake issues +- CDP protocol version mismatch +- Network/proxy configuration +- Chrome process state + +**Workaround**: Until this issue is resolved, tab management via the API is not functional. Manual CDP testing is required to diagnose the root cause. + +#### Future Enhancements + +- Automatic tab registration in ClientManager when DevTools connects +- Tab lifecycle events (opened, closed, navigated) +- Bulk tab operations +- Tab metadata (title, URL, favicon) +- Tab grouping and organization + ### Configuration All configuration is managed through environment variables and `src/config.js`. Key settings: @@ -187,4 +348,5 @@ All configuration is managed through environment variables and `src/config.js`. 
- OpenAI API configuration - RPC timeouts - Logging levels and directories -- Maximum concurrent evaluations \ No newline at end of file +- Maximum concurrent evaluations +- CDP endpoint (default: localhost:9223) \ No newline at end of file diff --git a/eval-server/nodejs/examples/with-http-wrapper.js b/eval-server/nodejs/examples/with-http-wrapper.js index 2ec9d0f..78b09f4 100644 --- a/eval-server/nodejs/examples/with-http-wrapper.js +++ b/eval-server/nodejs/examples/with-http-wrapper.js @@ -19,7 +19,7 @@ const evalServer = new EvalServer({ console.log('๐Ÿ”ง Creating HTTP wrapper...'); const httpWrapper = new HTTPWrapper(evalServer, { port: 8080, - host: '127.0.0.1' + host: '0.0.0.0' }); diff --git a/eval-server/nodejs/src/api-server.js b/eval-server/nodejs/src/api-server.js index fbef719..db62055 100644 --- a/eval-server/nodejs/src/api-server.js +++ b/eval-server/nodejs/src/api-server.js @@ -118,6 +118,22 @@ class APIServer { result = await this.triggerEvaluation(JSON.parse(body)); break; + case '/tabs/open': + if (method !== 'POST') { + this.sendError(res, 405, 'Method not allowed'); + return; + } + result = await this.openTab(JSON.parse(body)); + break; + + case '/tabs/close': + if (method !== 'POST') { + this.sendError(res, 405, 'Method not allowed'); + return; + } + result = await this.closeTab(JSON.parse(body)); + break; + case '/v1/responses': if (method !== 'POST') { this.sendError(res, 405, 'Method not allowed'); @@ -286,6 +302,53 @@ class APIServer { } + async openTab(payload) { + const { clientId, url = 'about:blank', background = false } = payload; + + if (!clientId) { + throw new Error('Client ID is required'); + } + + // Since we use direct CDP, we don't need the client to be connected + // Just extract the baseClientId (first part before colon if composite, or the whole ID) + const baseClientId = clientId.split(':')[0]; + + const result = await this.evaluationServer.openTab(baseClientId, { url, background }); + + return { + clientId: baseClientId, + tabId: result.tabId, + compositeClientId: result.compositeClientId, + url: result.url || url, + status: 'opened' + }; + } + + async closeTab(payload) { + const { clientId, tabId } = payload; + + if (!clientId) { + throw new Error('Client ID is required'); + } + + if (!tabId) { + throw new Error('Tab ID is required'); + } + + // Since we use direct CDP, we don't need the client to be connected + // Just extract the baseClientId + const baseClientId = clientId.split(':')[0]; + + const result = await this.evaluationServer.closeTab(baseClientId, { tabId }); + + return { + clientId: baseClientId, + tabId, + status: 'closed', + success: result.success !== false + }; + } + /** * Handle OpenAI Responses API compatible requests with nested model format */ diff --git a/eval-server/nodejs/src/lib/EvalServer.js b/eval-server/nodejs/src/lib/EvalServer.js index d174c7a..208ec1f 100644 --- a/eval-server/nodejs/src/lib/EvalServer.js +++ b/eval-server/nodejs/src/lib/EvalServer.js @@ -826,6 +826,188 @@ export class EvalServer extends EventEmitter { } } + /** + * Get the browser-level CDP WebSocket endpoint + * @returns {Promise} WebSocket URL + */ + async getCDPBrowserEndpoint() { + try { + const response = await fetch('http://localhost:9223/json/version'); + const data = await response.json(); + return data.webSocketDebuggerUrl; + } catch (error) { + logger.error('Failed to get CDP browser endpoint', { error: error.message }); + throw new Error('Failed to connect to Chrome DevTools Protocol'); + } + } + + /** + * Send a CDP command via WebSocket 
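+ * A fresh WebSocket connection to the browser-level CDP endpoint is opened for
+ * every command; the promise resolves when a response whose id matches the
+ * request arrives, and rejects if nothing is received within 10 seconds.
+ * Example: sendCDPCommand('Target.createTarget', { url: 'about:blank' })
+ * resolves to the CDP result object containing the new targetId.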
+   * @param {string} method - CDP method name
+   * @param {Object} params - CDP method parameters
+   * @returns {Promise} CDP response
+   */
+  async sendCDPCommand(method, params = {}) {
+    return new Promise(async (resolve, reject) => {
+      try {
+        const { default: WebSocket } = await import('ws');
+        const cdpEndpoint = await this.getCDPBrowserEndpoint();
+        const ws = new WebSocket(cdpEndpoint);
+        // Use a random integer for CDP message IDs (must be a reasonable integer)
+        const id = Math.floor(Math.random() * 1000000);
+
+        const timeout = setTimeout(() => {
+          ws.close();
+          reject(new Error(`CDP command timeout: ${method}`));
+        }, 10000);
+
+        ws.on('open', () => {
+          const message = JSON.stringify({
+            id,
+            method,
+            params
+          });
+          logger.info('CDP WebSocket opened, sending command', { method, params, cdpEndpoint });
+          ws.send(message);
+        });
+
+        ws.on('message', (data) => {
+          try {
+            const response = JSON.parse(data.toString());
+            logger.info('CDP WebSocket message received', {
+              method,
+              responseId: response.id,
+              expectedId: id,
+              hasResult: !!response.result,
+              hasError: !!response.error,
+              fullResponse: JSON.stringify(response)
+            });
+            if (response.id === id) {
+              clearTimeout(timeout);
+              ws.close();
+
+              if (response.error) {
+                logger.error('CDP command error', { method, error: response.error });
+                reject(new Error(`CDP error: ${response.error.message}`));
+              } else {
+                logger.info('CDP command success', { method, result: response.result });
+                resolve(response.result);
+              }
+            } else {
+              logger.warn('CDP message ID mismatch', {
+                method,
+                receivedId: response.id,
+                expectedId: id,
+                responseType: response.method ? 'event' : 'response'
+              });
+            }
+          } catch (error) {
+            clearTimeout(timeout);
+            ws.close();
+            logger.error('CDP message parse error', { error: error.message });
+            reject(error);
+          }
+        });
+
+        ws.on('error', (error) => {
+          clearTimeout(timeout);
+          logger.error('CDP WebSocket error', { error: error.message });
+          reject(error);
+        });
+      } catch (error) {
+        reject(error);
+      }
+    });
+  }
+
+  /**
+   * Open a new tab using CDP directly
+   * @param {string} baseClientId - Base client ID (or will be extracted from composite ID)
+   * @param {Object} options - Tab options
+   * @param {string} options.url - URL to open in the new tab (default: 'about:blank')
+   * @param {boolean} options.background - Whether to open in background (default: false)
+   * @returns {Promise} Result with tabId
+   */
+  async openTab(baseClientId, options = {}) {
+    const { url = 'about:blank', background = false } = options;
+    // Extract base client ID if composite ID was passed
+    const cleanBaseClientId = baseClientId.split(':')[0];
+
+    try {
+      logger.info('Opening new tab via CDP', { url, background, baseClientId: cleanBaseClientId });
+
+      // Use CDP Target.createTarget
+      const result = await this.sendCDPCommand('Target.createTarget', {
+        url,
+        newWindow: false,
+        background
+      });
+
+      const tabId = result.targetId;
+      const compositeClientId = `${cleanBaseClientId}:${tabId}`;
+
+      logger.info('Tab opened successfully via CDP', {
+        tabId,
+        compositeClientId,
+        url
+      });
+
+      return {
+        tabId,
+        compositeClientId,
+        url
+      };
+    } catch (error) {
+      logger.error('Failed to open tab via CDP', {
+        baseClientId,
+        url,
+        error: error.message
+      });
+      throw error;
+    }
+  }
+
+  /**
+   * Close a tab using CDP directly
+   * @param {string} baseClientId - Base client ID (currently not used, kept for API compatibility)
+   * @param {Object} options - Close options
+   * @param {string} options.tabId - Tab ID to close
+   * @returns {Promise} Result with success status
+   */
+  async closeTab(baseClientId, options = {}) {
+    const { tabId } = options;
+
+    if (!tabId) {
+      throw new Error('tabId is required to close a tab');
+    }
+
+    try {
+      logger.info('Closing tab via CDP', { tabId, baseClientId });
+
+      // Use CDP Target.closeTarget
+      const result = await this.sendCDPCommand('Target.closeTarget', {
+        targetId: tabId
+      });
+
+      logger.info('Tab closed successfully via CDP', {
+        tabId,
+        success: result.success
+      });
+
+      return {
+        success: result.success !== false,
+        tabId
+      };
+    } catch (error) {
+      logger.error('Failed to close tab via CDP', {
+        tabId,
+        baseClientId,
+        error: error.message
+      });
+      throw error;
+    }
+  }
+
   /**
    * Validate response using configured judge
    */
From 469cc40dce1e92e09419ee62a4fe5a2654e74d4c Mon Sep 17 00:00:00 2001
From: Oleh Luchkiv
Date: Thu, 16 Oct 2025 18:20:00 -0500
Subject: [PATCH 06/24] New tab opened for each /responses API call

---
 eval-server/nodejs/src/api-server.js | 78 ++++++++++++++++++++++++----
 1 file changed, 69 insertions(+), 9 deletions(-)

diff --git a/eval-server/nodejs/src/api-server.js b/eval-server/nodejs/src/api-server.js
index db62055..1d83522 100644
--- a/eval-server/nodejs/src/api-server.js
+++ b/eval-server/nodejs/src/api-server.js
@@ -375,22 +375,34 @@ class APIServer {
       }
     });
-    // Find a connected and ready client
-    const readyClient = this.findReadyClient();
-    if (!readyClient) {
-      throw new Error('No DevTools client is connected and ready. Please ensure a DevTools client is connected to the evaluation server.');
-    }
+    // Find a client with existing tabs (not the dummy client)
+    const baseClientId = this.findClientWithTabs();
+
+    // Open a new tab for this request
+    logger.info('Opening new tab for responses request', { baseClientId });
+    const tabResult = await this.evaluationServer.openTab(baseClientId, {
+      url: 'about:blank',
+      background: false
+    });
+
+    logger.info('Tab opened successfully', {
+      tabId: tabResult.tabId,
+      compositeClientId: tabResult.compositeClientId
+    });
+
+    // Wait for the new tab's DevTools to connect
+    const tabClient = await this.waitForClientConnection(tabResult.compositeClientId);
     // Create a dynamic evaluation for this request
     const evaluation = this.createDynamicEvaluationNested(requestBody.input, nestedModelConfig);
-    // Execute the evaluation on the DevTools client
-    logger.info('Executing evaluation on DevTools client', {
-      clientId: readyClient.clientId,
+    // Execute the evaluation on the new tab's DevTools client
+    logger.info('Executing evaluation on new tab', {
+      compositeClientId: tabResult.compositeClientId,
       evaluationId: evaluation.id
     });
-    const result = await this.evaluationServer.executeEvaluation(readyClient, evaluation);
+    const result = await this.evaluationServer.executeEvaluation(tabClient, evaluation);
     // Debug: log the result structure
     logger.debug('executeEvaluation result:', result);
@@ -465,6 +477,54 @@ class APIServer {
     return null;
   }
+  /**
+   * Find a client that has existing tabs (not the dummy client)
+   * @returns {string} Base client ID
+   */
+  findClientWithTabs() {
+    const clients = this.evaluationServer.getClientManager().getAllClients();
+
+    for (const client of clients) {
+      const tabs = this.evaluationServer.getClientManager().getClientTabs(client.id);
+      if (tabs.length > 0) {
+        logger.info('Found client with tabs', { clientId: client.id, tabCount: tabs.length });
+        return client.id;
+      }
+    }
+
+    throw new Error('No client with existing tabs found. Please ensure at least one DevTools client with a tab is connected.');
+  }
+
+  /**
+   * Wait for a client connection to be established and ready
+   * @param {string} compositeClientId - Composite client ID (baseClientId:tabId)
+   * @param {number} maxWaitMs - Maximum time to wait in milliseconds
+   * @returns {Promise} Connection object
+   */
+  async waitForClientConnection(compositeClientId, maxWaitMs = 10000) {
+    const startTime = Date.now();
+    const pollInterval = 500; // Check every 500ms
+
+    logger.info('Waiting for client connection', { compositeClientId, maxWaitMs });
+
+    while (Date.now() - startTime < maxWaitMs) {
+      const connection = this.evaluationServer.connectedClients.get(compositeClientId);
+
+      if (connection && connection.ready) {
+        logger.info('Client connection established and ready', {
+          compositeClientId,
+          waitedMs: Date.now() - startTime
+        });
+        return connection;
+      }
+
+      // Wait before next check
+      await new Promise(resolve => setTimeout(resolve, pollInterval));
+    }
+
+    throw new Error(`Timeout waiting for client connection: ${compositeClientId}. Tab may not have connected to eval-server.`);
+  }
+
   /**
    * Create a dynamic evaluation object with nested model configuration
    * @param {string} input - Input message for the evaluation
From ccc914d4249045320cfa339b52cdbebfdf0cd390 Mon Sep 17 00:00:00 2001
From: Oleh Luchkiv
Date: Thu, 16 Oct 2025 18:40:50 -0500
Subject: [PATCH 07/24] Updated documentation

---
 eval-server/nodejs/README.md | 53 ++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/eval-server/nodejs/README.md b/eval-server/nodejs/README.md
index d29f9bc..d9da46f 100644
--- a/eval-server/nodejs/README.md
+++ b/eval-server/nodejs/README.md
@@ -350,6 +350,59 @@ curl -X POST http://localhost:8081/v1/responses \
   -d '{"input": "What is 2+2?"}'
 ```
+#### `/v1/responses` Endpoint Details
+
+The `/v1/responses` endpoint provides an OpenAI-compatible interface for chat requests. **Each request automatically creates a new browser tab** to isolate the chat session.
+
+**Behavior:**
+1. Finds a client with existing tabs (skips dummy clients with 0 tabs)
+2. Opens a new browser tab via Chrome DevTools Protocol (CDP)
+3. Waits for the tab's DevTools to connect (up to 10 seconds)
+4. Executes the chat request in the isolated new tab
+5. Returns the response in OpenAI Responses API format
+
+**Request Format:**
+```json
+{
+  "input": "Your question or prompt here",
+  "model": {
+    "main_model": {
+      "provider": "openai",
+      "model": "gpt-4",
+      "api_key": "sk-..."
+    }
+  }
+}
+```
+
+**Response Format:**
+```json
+[
+  {
+    "id": "msg_...",
+    "type": "message",
+    "role": "assistant",
+    "content": [
+      {
+        "type": "output_text",
+        "text": "Response text here",
+        "annotations": []
+      }
+    ]
+  }
+]
+```
+
+**Requirements:**
+- At least one DevTools client must have a connected tab
+- Chrome must be running with remote debugging enabled (port 9223)
+
+**Benefits:**
+- Each chat request runs in isolation
+- Supports parallel requests in different tabs
+- Better tracking and debugging per request
+- Automatic cleanup via browser tab management
+
 ## CLI Usage
 Interactive command-line interface for server management:
From 6f33f6368c67940710d399cf6ce3a39e2898dadc Mon Sep 17 00:00:00 2001
From: Oleh Luchkiv
Date: Sat, 18 Oct 2025 10:14:48 -0500
Subject: [PATCH 08/24] Initial eval setup

---
 Makefile | 17 +-
 build-local.sh | 43 ---
 eval-server/nodejs/src/api-server.js | 29 +-
 eval-server/nodejs/src/config.js | 2 +-
 evals/.env.example | 15 +
 evals/.gitignore | 38 ++
 evals/.python-version | 1 +
 evals/README.md | 328 +++++++++++++++++
 evals/config.yml | 75 ++++
 evals/data/action-agent/a11y-001.yaml | 46 +++
 evals/data/action-agent/accordion-001.yaml | 46 +++
 .../action-agent/action-agent-a11y-001.yaml | 46 +++
 .../action-agent-accordion-001.yaml | 46 +++
 .../action-agent-autocomplete-001.yaml | 46 +++
 .../action-agent-checkbox-001.yaml | 46 +++
 .../action-agent-checkbox-002.yaml | 47 +++
 .../action-agent/action-agent-click-001.yaml | 47 +++
 .../action-agent-context-001.yaml | 46 +++
 .../action-agent-datepicker-001.yaml | 46 +++
 .../action-agent-daterange-001.yaml | 46 +++
 .../action-agent-dropdown-001.yaml | 46 +++
 .../action-agent-dynamic-001.yaml | 46 +++
 .../action-agent-ecommerce-001.yaml | 46 +++
 .../action-agent/action-agent-error-001.yaml | 47 +++
 .../action-agent/action-agent-filter-001.yaml | 46 +++
 .../action-agent/action-agent-form-001.yaml | 46 +++
 .../action-agent/action-agent-hover-001.yaml | 46 +++
 .../action-agent-keyboard-001.yaml | 46 +++
 .../action-agent/action-agent-login-001.yaml | 47 +++
evals/data/action-agent/modal-001.yaml | 46 +++ evals/data/action-agent/multiselect-001.yaml | 46 +++ evals/data/action-agent/multistep-001.yaml | 47 +++ evals/data/action-agent/nav-001.yaml | 46 +++ evals/data/action-agent/radio-001.yaml | 47 +++ evals/data/action-agent/slider-001.yaml | 46 +++ evals/data/action-agent/tableselect-001.yaml | 46 +++ evals/data/action-agent/tablesort-001.yaml | 46 +++ evals/data/action-agent/tabs-001.yaml | 46 +++ evals/data/action-agent/timepicker-001.yaml | 46 +++ evals/data/action-agent/upload-001.yaml | 46 +++ evals/data/action-agent/video-001.yaml | 47 +++ evals/data/action-agent/video-002.yaml | 47 +++ evals/data/config.yaml | 11 + .../end-to-end/b-vitamins-research-001.yaml | 35 ++ .../end-to-end/investment-research-001.yaml | 35 ++ .../end-to-end/product-comparison-001.yaml | 40 +++ .../data/end-to-end/recipe-nutrition-001.yaml | 40 +++ .../data/end-to-end/travel-planning-001.yaml | 40 +++ evals/data/research-agent/basic-001.yaml | 39 ++ evals/data/research-agent/business-001.yaml | 39 ++ evals/data/research-agent/comparison-001.yaml | 39 ++ evals/data/research-agent/current-001.yaml | 40 +++ evals/data/research-agent/edge-001.yaml | 39 ++ .../research-agent-basic-001.yaml | 39 ++ .../research-agent-business-001.yaml | 39 ++ .../research-agent-comparison-001.yaml | 39 ++ .../research-agent-current-001.yaml | 40 +++ .../research-agent-edge-001.yaml | 39 ++ .../research-agent-technical-001.yaml | 39 ++ .../research-agent-tools-001.yaml | 40 +++ evals/data/research-agent/technical-001.yaml | 39 ++ evals/data/research-agent/tools-001.yaml | 40 +++ .../schema-extractor/amazon-product-001.yaml | 78 ++++ evals/data/schema-extractor/bbc-news-001.yaml | 69 ++++ .../schema-extractor/bing-search-001.yaml | 70 ++++ .../github-repo-001-streamlined.yaml | 66 ++++ .../schema-extractor/github-repo-001.yaml | 66 ++++ .../schema-extractor/google-flights-001.yaml | 106 ++++++ .../schema-extractor/google-search-001.yaml | 76 ++++ .../data/schema-extractor/homedepot-001.yaml | 92 +++++ evals/data/schema-extractor/macys-001.yaml | 106 ++++++ .../wikipedia-search-001.yaml | 77 ++++ .../dynamic-content-verification-001.yaml | 45 +++ .../screenshot-error-handling-001.yaml | 42 +++ .../screenshot-fullpage-001.yaml | 43 +++ .../screenshot-viewport-001.yaml | 42 +++ .../visual-comparison-001.yaml | 45 +++ .../amazon-product-001.yaml | 78 ++++ .../bbc-news-001.yaml | 69 ++++ .../bing-search-001.yaml | 70 ++++ .../github-repo-001.yaml | 66 ++++ .../google-flights-001.yaml | 106 ++++++ .../google-search-001.yaml | 76 ++++ .../homedepot-001.yaml | 92 +++++ .../macys-001.yaml | 106 ++++++ .../wikipedia-001.yaml | 76 ++++ .../wikipedia-search-001.yaml | 77 ++++ evals/data/web-task-agent/booking-001.yaml | 45 +++ evals/data/web-task-agent/ecommerce-001.yaml | 53 +++ evals/data/web-task-agent/error-001.yaml | 45 +++ evals/data/web-task-agent/extract-001.yaml | 60 ++++ evals/data/web-task-agent/finance-001.yaml | 68 ++++ evals/data/web-task-agent/flight-001.yaml | 45 +++ evals/data/web-task-agent/food-001.yaml | 68 ++++ evals/data/web-task-agent/iframe-001.yaml | 83 +++++ evals/data/web-task-agent/jobs-001.yaml | 68 ++++ evals/data/web-task-agent/learning-001.yaml | 69 ++++ evals/data/web-task-agent/nav-001.yaml | 46 +++ evals/data/web-task-agent/news-001.yaml | 64 ++++ evals/data/web-task-agent/realestate-001.yaml | 70 ++++ evals/data/web-task-agent/scroll-001.yaml | 61 ++++ evals/data/web-task-agent/scroll-002.yaml | 65 ++++ evals/data/web-task-agent/scroll-003.yaml | 61 ++++ 
evals/data/web-task-agent/scroll-004.yaml | 61 ++++ evals/data/web-task-agent/scroll-005.yaml | 73 ++++ evals/data/web-task-agent/search-001.yaml | 41 +++ evals/data/web-task-agent/social-001.yaml | 60 ++++ .../web-task-agent-booking-001.yaml | 45 +++ .../web-task-agent-ecommerce-001.yaml | 53 +++ .../web-task-agent-error-001.yaml | 45 +++ .../web-task-agent-extract-001.yaml | 60 ++++ .../web-task-agent-finance-001.yaml | 68 ++++ .../web-task-agent-flight-001.yaml | 45 +++ .../web-task-agent-food-001.yaml | 68 ++++ .../web-task-agent-iframe-001.yaml | 83 +++++ .../web-task-agent-jobs-001.yaml | 68 ++++ .../web-task-agent-learning-001.yaml | 69 ++++ .../web-task-agent-nav-001.yaml | 46 +++ .../web-task-agent-news-001.yaml | 64 ++++ .../web-task-agent-realestate-001.yaml | 70 ++++ .../web-task-agent-scroll-001.yaml | 61 ++++ .../web-task-agent-scroll-002.yaml | 65 ++++ .../web-task-agent-scroll-003.yaml | 61 ++++ .../web-task-agent-scroll-004.yaml | 61 ++++ .../web-task-agent-scroll-005.yaml | 73 ++++ .../web-task-agent-search-001.yaml | 41 +++ .../web-task-agent-social-001.yaml | 60 ++++ evals/lib/__init__.py | 19 + evals/lib/api_client.py | 207 +++++++++++ evals/lib/config_loader.py | 198 +++++++++++ evals/lib/eval_loader.py | 274 ++++++++++++++ evals/lib/judge.py | 244 +++++++++++++ evals/pyproject.toml | 38 ++ evals/requirements.txt | 13 + evals/run_action_agent.py | 333 ++++++++++++++++++ 164 files changed, 9693 insertions(+), 51 deletions(-) delete mode 100755 build-local.sh create mode 100644 evals/.env.example create mode 100644 evals/.gitignore create mode 100644 evals/.python-version create mode 100644 evals/README.md create mode 100644 evals/config.yml create mode 100644 evals/data/action-agent/a11y-001.yaml create mode 100644 evals/data/action-agent/accordion-001.yaml create mode 100644 evals/data/action-agent/action-agent-a11y-001.yaml create mode 100644 evals/data/action-agent/action-agent-accordion-001.yaml create mode 100644 evals/data/action-agent/action-agent-autocomplete-001.yaml create mode 100644 evals/data/action-agent/action-agent-checkbox-001.yaml create mode 100644 evals/data/action-agent/action-agent-checkbox-002.yaml create mode 100644 evals/data/action-agent/action-agent-click-001.yaml create mode 100644 evals/data/action-agent/action-agent-context-001.yaml create mode 100644 evals/data/action-agent/action-agent-datepicker-001.yaml create mode 100644 evals/data/action-agent/action-agent-daterange-001.yaml create mode 100644 evals/data/action-agent/action-agent-dropdown-001.yaml create mode 100644 evals/data/action-agent/action-agent-dynamic-001.yaml create mode 100644 evals/data/action-agent/action-agent-ecommerce-001.yaml create mode 100644 evals/data/action-agent/action-agent-error-001.yaml create mode 100644 evals/data/action-agent/action-agent-filter-001.yaml create mode 100644 evals/data/action-agent/action-agent-form-001.yaml create mode 100644 evals/data/action-agent/action-agent-hover-001.yaml create mode 100644 evals/data/action-agent/action-agent-keyboard-001.yaml create mode 100644 evals/data/action-agent/action-agent-login-001.yaml create mode 100644 evals/data/action-agent/action-agent-modal-001.yaml create mode 100644 evals/data/action-agent/action-agent-multiselect-001.yaml create mode 100644 evals/data/action-agent/action-agent-multistep-001.yaml create mode 100644 evals/data/action-agent/action-agent-nav-001.yaml create mode 100644 evals/data/action-agent/action-agent-radio-001.yaml create mode 100644 
evals/data/action-agent/action-agent-slider-001.yaml create mode 100644 evals/data/action-agent/action-agent-tableselect-001.yaml create mode 100644 evals/data/action-agent/action-agent-tablesort-001.yaml create mode 100644 evals/data/action-agent/action-agent-tabs-001.yaml create mode 100644 evals/data/action-agent/action-agent-timepicker-001.yaml create mode 100644 evals/data/action-agent/action-agent-upload-001.yaml create mode 100644 evals/data/action-agent/action-agent-video-001.yaml create mode 100644 evals/data/action-agent/action-agent-video-002.yaml create mode 100644 evals/data/action-agent/autocomplete-001.yaml create mode 100644 evals/data/action-agent/checkbox-001.yaml create mode 100644 evals/data/action-agent/checkbox-002.yaml create mode 100644 evals/data/action-agent/click-001.yaml create mode 100644 evals/data/action-agent/context-001.yaml create mode 100644 evals/data/action-agent/datepicker-001.yaml create mode 100644 evals/data/action-agent/daterange-001.yaml create mode 100644 evals/data/action-agent/dropdown-001.yaml create mode 100644 evals/data/action-agent/dynamic-001.yaml create mode 100644 evals/data/action-agent/ecommerce-001.yaml create mode 100644 evals/data/action-agent/error-001.yaml create mode 100644 evals/data/action-agent/filter-001.yaml create mode 100644 evals/data/action-agent/form-001.yaml create mode 100644 evals/data/action-agent/hover-001.yaml create mode 100644 evals/data/action-agent/keyboard-001.yaml create mode 100644 evals/data/action-agent/login-001.yaml create mode 100644 evals/data/action-agent/modal-001.yaml create mode 100644 evals/data/action-agent/multiselect-001.yaml create mode 100644 evals/data/action-agent/multistep-001.yaml create mode 100644 evals/data/action-agent/nav-001.yaml create mode 100644 evals/data/action-agent/radio-001.yaml create mode 100644 evals/data/action-agent/slider-001.yaml create mode 100644 evals/data/action-agent/tableselect-001.yaml create mode 100644 evals/data/action-agent/tablesort-001.yaml create mode 100644 evals/data/action-agent/tabs-001.yaml create mode 100644 evals/data/action-agent/timepicker-001.yaml create mode 100644 evals/data/action-agent/upload-001.yaml create mode 100644 evals/data/action-agent/video-001.yaml create mode 100644 evals/data/action-agent/video-002.yaml create mode 100644 evals/data/config.yaml create mode 100644 evals/data/end-to-end/b-vitamins-research-001.yaml create mode 100644 evals/data/end-to-end/investment-research-001.yaml create mode 100644 evals/data/end-to-end/product-comparison-001.yaml create mode 100644 evals/data/end-to-end/recipe-nutrition-001.yaml create mode 100644 evals/data/end-to-end/travel-planning-001.yaml create mode 100644 evals/data/research-agent/basic-001.yaml create mode 100644 evals/data/research-agent/business-001.yaml create mode 100644 evals/data/research-agent/comparison-001.yaml create mode 100644 evals/data/research-agent/current-001.yaml create mode 100644 evals/data/research-agent/edge-001.yaml create mode 100644 evals/data/research-agent/research-agent-basic-001.yaml create mode 100644 evals/data/research-agent/research-agent-business-001.yaml create mode 100644 evals/data/research-agent/research-agent-comparison-001.yaml create mode 100644 evals/data/research-agent/research-agent-current-001.yaml create mode 100644 evals/data/research-agent/research-agent-edge-001.yaml create mode 100644 evals/data/research-agent/research-agent-technical-001.yaml create mode 100644 evals/data/research-agent/research-agent-tools-001.yaml create mode 
100644 evals/data/research-agent/technical-001.yaml create mode 100644 evals/data/research-agent/tools-001.yaml create mode 100644 evals/data/schema-extractor/amazon-product-001.yaml create mode 100644 evals/data/schema-extractor/bbc-news-001.yaml create mode 100644 evals/data/schema-extractor/bing-search-001.yaml create mode 100644 evals/data/schema-extractor/github-repo-001-streamlined.yaml create mode 100644 evals/data/schema-extractor/github-repo-001.yaml create mode 100644 evals/data/schema-extractor/google-flights-001.yaml create mode 100644 evals/data/schema-extractor/google-search-001.yaml create mode 100644 evals/data/schema-extractor/homedepot-001.yaml create mode 100644 evals/data/schema-extractor/macys-001.yaml create mode 100644 evals/data/schema-extractor/wikipedia-search-001.yaml create mode 100644 evals/data/screenshot-verification/dynamic-content-verification-001.yaml create mode 100644 evals/data/screenshot-verification/screenshot-error-handling-001.yaml create mode 100644 evals/data/screenshot-verification/screenshot-fullpage-001.yaml create mode 100644 evals/data/screenshot-verification/screenshot-viewport-001.yaml create mode 100644 evals/data/screenshot-verification/visual-comparison-001.yaml create mode 100644 evals/data/streamlined-schema-extractor/amazon-product-001.yaml create mode 100644 evals/data/streamlined-schema-extractor/bbc-news-001.yaml create mode 100644 evals/data/streamlined-schema-extractor/bing-search-001.yaml create mode 100644 evals/data/streamlined-schema-extractor/github-repo-001.yaml create mode 100644 evals/data/streamlined-schema-extractor/google-flights-001.yaml create mode 100644 evals/data/streamlined-schema-extractor/google-search-001.yaml create mode 100644 evals/data/streamlined-schema-extractor/homedepot-001.yaml create mode 100644 evals/data/streamlined-schema-extractor/macys-001.yaml create mode 100644 evals/data/streamlined-schema-extractor/wikipedia-001.yaml create mode 100644 evals/data/streamlined-schema-extractor/wikipedia-search-001.yaml create mode 100644 evals/data/web-task-agent/booking-001.yaml create mode 100644 evals/data/web-task-agent/ecommerce-001.yaml create mode 100644 evals/data/web-task-agent/error-001.yaml create mode 100644 evals/data/web-task-agent/extract-001.yaml create mode 100644 evals/data/web-task-agent/finance-001.yaml create mode 100644 evals/data/web-task-agent/flight-001.yaml create mode 100644 evals/data/web-task-agent/food-001.yaml create mode 100644 evals/data/web-task-agent/iframe-001.yaml create mode 100644 evals/data/web-task-agent/jobs-001.yaml create mode 100644 evals/data/web-task-agent/learning-001.yaml create mode 100644 evals/data/web-task-agent/nav-001.yaml create mode 100644 evals/data/web-task-agent/news-001.yaml create mode 100644 evals/data/web-task-agent/realestate-001.yaml create mode 100644 evals/data/web-task-agent/scroll-001.yaml create mode 100644 evals/data/web-task-agent/scroll-002.yaml create mode 100644 evals/data/web-task-agent/scroll-003.yaml create mode 100644 evals/data/web-task-agent/scroll-004.yaml create mode 100644 evals/data/web-task-agent/scroll-005.yaml create mode 100644 evals/data/web-task-agent/search-001.yaml create mode 100644 evals/data/web-task-agent/social-001.yaml create mode 100644 evals/data/web-task-agent/web-task-agent-booking-001.yaml create mode 100644 evals/data/web-task-agent/web-task-agent-ecommerce-001.yaml create mode 100644 evals/data/web-task-agent/web-task-agent-error-001.yaml create mode 100644 
evals/data/web-task-agent/web-task-agent-extract-001.yaml create mode 100644 evals/data/web-task-agent/web-task-agent-finance-001.yaml create mode 100644 evals/data/web-task-agent/web-task-agent-flight-001.yaml create mode 100644 evals/data/web-task-agent/web-task-agent-food-001.yaml create mode 100644 evals/data/web-task-agent/web-task-agent-iframe-001.yaml create mode 100644 evals/data/web-task-agent/web-task-agent-jobs-001.yaml create mode 100644 evals/data/web-task-agent/web-task-agent-learning-001.yaml create mode 100644 evals/data/web-task-agent/web-task-agent-nav-001.yaml create mode 100644 evals/data/web-task-agent/web-task-agent-news-001.yaml create mode 100644 evals/data/web-task-agent/web-task-agent-realestate-001.yaml create mode 100644 evals/data/web-task-agent/web-task-agent-scroll-001.yaml create mode 100644 evals/data/web-task-agent/web-task-agent-scroll-002.yaml create mode 100644 evals/data/web-task-agent/web-task-agent-scroll-003.yaml create mode 100644 evals/data/web-task-agent/web-task-agent-scroll-004.yaml create mode 100644 evals/data/web-task-agent/web-task-agent-scroll-005.yaml create mode 100644 evals/data/web-task-agent/web-task-agent-search-001.yaml create mode 100644 evals/data/web-task-agent/web-task-agent-social-001.yaml create mode 100644 evals/lib/__init__.py create mode 100644 evals/lib/api_client.py create mode 100644 evals/lib/config_loader.py create mode 100644 evals/lib/eval_loader.py create mode 100644 evals/lib/judge.py create mode 100644 evals/pyproject.toml create mode 100644 evals/requirements.txt create mode 100755 evals/run_action_agent.py diff --git a/Makefile b/Makefile index 7508284..df5e2ef 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Makefile for kernel-browser local development # Using kernel-images native build system -.PHONY: help build run stop logs clean dev status shell test +.PHONY: help build rebuild run stop logs clean dev status shell test # Default target help: ## Show this help message @@ -57,11 +57,24 @@ rebuild-devtools-full: ## Force complete rebuild from scratch (slow, rarely need docker build -f Dockerfile.devtools --no-cache --target devtools-server -t browser-operator-devtools:latest . @echo "โœ… DevTools completely rebuilt" -build: init build-devtools ## Build extended image with DevTools frontend +build: init ## Build extended image with DevTools frontend (smart: only builds DevTools if needed) @echo "๐Ÿ”จ Building extended kernel-browser with DevTools frontend..." + @if ! docker images | grep -q "browser-operator-devtools.*latest"; then \ + echo "๐Ÿ“ฆ DevTools image not found, building it first..."; \ + echo " This is a one-time operation and will take ~30 minutes..."; \ + $(MAKE) --no-print-directory build-devtools; \ + else \ + echo "โœ… Using existing DevTools image"; \ + fi docker build -f Dockerfile.local -t kernel-browser:extended . @echo "โœ… Extended build complete" +rebuild: init ## Force complete rebuild (including DevTools) + @echo "๐Ÿ”„ Force rebuilding everything from scratch..." + $(MAKE) --no-print-directory build-devtools + docker build -f Dockerfile.local -t kernel-browser:extended . + @echo "โœ… Complete rebuild finished" + run: ## Run extended container with DevTools (interactive) @echo "๐Ÿš€ Starting extended kernel-browser with DevTools..." 
@if [ -n "$(URLS)" ]; then echo "๐Ÿ“„ Opening URLs: $(URLS)"; fi diff --git a/build-local.sh b/build-local.sh deleted file mode 100755 index 1c48324..0000000 --- a/build-local.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env bash - -# Extended local build wrapper for kernel-browser with DevTools -set -e -o pipefail - -echo "๐Ÿ”จ Building extended kernel-browser with DevTools frontend..." - -# Ensure we're in the right directory -SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) -cd "$SCRIPT_DIR" - -# Initialize submodules if needed -if [ ! -d "kernel-images/.git" ]; then - echo "๐Ÿ“ฆ Initializing kernel-images submodule..." - git submodule update --init --depth 1 kernel-images -fi - -if [ ! -d "browser-operator-core/.git" ]; then - echo "๐Ÿ“ฆ Initializing browser-operator-core submodule..." - git submodule update --init --depth 1 browser-operator-core -fi - -# Check if DevTools image exists -if ! docker images | grep -q "browser-operator-devtools.*latest"; then - echo "๐Ÿ“ฆ DevTools image not found, building it first..." - echo " This is a one-time operation and will take ~30 minutes..." - make build-devtools -else - echo "โœ… Using existing DevTools image" -fi - -echo "๐Ÿš€ Starting extended build with Docker..." -echo " Using: Dockerfile.local" -echo " Target image: kernel-browser:extended" - -# Build using Docker with extended Dockerfile -docker build -f Dockerfile.local -t kernel-browser:extended . - -echo "โœ… Extended build completed successfully!" -echo " Image built: kernel-browser:extended" -echo " Includes: Chromium + DevTools frontend + WebRTC" -echo "" -echo "๐Ÿƒ To run locally, use: ./run-local.sh" \ No newline at end of file diff --git a/eval-server/nodejs/src/api-server.js b/eval-server/nodejs/src/api-server.js index 1d83522..08be5a4 100644 --- a/eval-server/nodejs/src/api-server.js +++ b/eval-server/nodejs/src/api-server.js @@ -362,12 +362,18 @@ class APIServer { // Handle nested model configuration directly const nestedModelConfig = this.processNestedModelConfig(requestBody); + // Extract optional URL and wait timeout + const targetUrl = requestBody.url || 'about:blank'; + const waitTimeout = requestBody.wait_timeout || 5000; + const redact = (mk) => ({ ...mk, api_key: mk?.api_key ? `${String(mk.api_key).slice(0, 4)}...` : undefined }); logger.info('Processing responses request:', { input: requestBody.input, + url: targetUrl, + wait_timeout: targetUrl !== 'about:blank' ? 
waitTimeout : 0, modelConfig: { main_model: redact(nestedModelConfig.main_model), mini_model: redact(nestedModelConfig.mini_model), @@ -378,10 +384,10 @@ class APIServer { // Find a client with existing tabs (not the dummy client) const baseClientId = this.findClientWithTabs(); - // Open a new tab for this request - logger.info('Opening new tab for responses request', { baseClientId }); + // Open a new tab for this request at the specified URL + logger.info('Opening new tab for responses request', { baseClientId, url: targetUrl }); const tabResult = await this.evaluationServer.openTab(baseClientId, { - url: 'about:blank', + url: targetUrl, background: false }); @@ -393,6 +399,12 @@ class APIServer { // Wait for the new tab's DevTools to connect const tabClient = await this.waitForClientConnection(tabResult.compositeClientId); + // Wait for page to load if a custom URL was provided + if (targetUrl !== 'about:blank') { + logger.info('Waiting for page to load', { waitTimeout }); + await new Promise(resolve => setTimeout(resolve, waitTimeout)); + } + // Create a dynamic evaluation for this request const evaluation = this.createDynamicEvaluationNested(requestBody.input, nestedModelConfig); @@ -484,6 +496,7 @@ class APIServer { findClientWithTabs() { const clients = this.evaluationServer.getClientManager().getAllClients(); + // First, try to find a client with existing tabs for (const client of clients) { const tabs = this.evaluationServer.getClientManager().getClientTabs(client.id); if (tabs.length > 0) { @@ -492,7 +505,13 @@ class APIServer { } } - throw new Error('No client with existing tabs found. Please ensure at least one DevTools client with a tab is connected.'); + // If no client with tabs, use the first available client (even with 0 tabs) + if (clients.length > 0) { + logger.info('No clients with tabs found, using first available client', { clientId: clients[0].id }); + return clients[0].id; + } + + throw new Error('No clients found. 
Please ensure at least one DevTools client is registered.'); } /** @@ -540,7 +559,7 @@ class APIServer { description: 'Dynamic evaluation created from API request', enabled: true, tool: 'chat', - timeout: 1500000, // 25 minutes + timeout: 7200000, // 2 hours (increased for slow custom API) input: { message: input }, diff --git a/eval-server/nodejs/src/config.js b/eval-server/nodejs/src/config.js index 4bde4e5..3992715 100644 --- a/eval-server/nodejs/src/config.js +++ b/eval-server/nodejs/src/config.js @@ -45,7 +45,7 @@ export const CONFIG = { }, rpc: { - timeout: parseInt(process.env.RPC_TIMEOUT) || 1500000, // 25 minutes default + timeout: parseInt(process.env.RPC_TIMEOUT) || 7200000, // 2 hours default (increased for slow custom API) maxConcurrentEvaluations: parseInt(process.env.MAX_CONCURRENT_EVALUATIONS) || 10 }, diff --git a/evals/.env.example b/evals/.env.example new file mode 100644 index 0000000..65e41e8 --- /dev/null +++ b/evals/.env.example @@ -0,0 +1,15 @@ +# Evaluation Framework Environment Variables +# Copy this file to .env and fill in your actual API keys + +# Required: OpenAI API key for LLM judge and main model +OPENAI_API_KEY=sk-your-openai-api-key-here + +# Optional: Groq API key (if using Groq models) +GROQ_API_KEY=gsk-your-groq-api-key-here + +# Optional: OpenRouter API key (if using OpenRouter) +OPENROUTER_API_KEY=your-openrouter-api-key-here + +# Optional: LiteLLM configuration (if using LiteLLM) +LITELLM_API_KEY=your-litellm-api-key-here +LITELLM_ENDPOINT=http://localhost:8000 diff --git a/evals/.gitignore b/evals/.gitignore new file mode 100644 index 0000000..1837a46 --- /dev/null +++ b/evals/.gitignore @@ -0,0 +1,38 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +*.egg-info/ +dist/ +build/ +*.egg + +# Virtual environments +.venv/ +venv/ +ENV/ +env/ + +# uv +.uv/ +uv.lock + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# Reports +reports/*.csv + +# Environment variables +.env +.env.local + +# OS +.DS_Store +Thumbs.db diff --git a/evals/.python-version b/evals/.python-version new file mode 100644 index 0000000..bd28b9c --- /dev/null +++ b/evals/.python-version @@ -0,0 +1 @@ +3.9 diff --git a/evals/README.md b/evals/README.md new file mode 100644 index 0000000..8f86cd0 --- /dev/null +++ b/evals/README.md @@ -0,0 +1,328 @@ +# Evaluation Framework + +Modular evaluation framework for testing browser automation agents using LLM-as-a-judge. + +## Overview + +This framework provides: +- **Shared Configuration**: Single `config.yml` for all evaluation runners +- **Modular Runner Scripts**: Separate scripts for different evaluation categories +- **LLM Judge**: Uses GPT-4 to assess response quality against criteria +- **Automatic Reporting**: Timestamped CSV reports with detailed results + +## Directory Structure + +``` +evals/ +โ”œโ”€โ”€ config.yml # Shared configuration +โ”œโ”€โ”€ data/ # Evaluation definitions (YAML) +โ”‚ โ”œโ”€โ”€ action-agent/ +โ”‚ โ”œโ”€โ”€ research-agent/ +โ”‚ โ”œโ”€โ”€ schema-extractor/ +โ”‚ โ””โ”€โ”€ ... +โ”œโ”€โ”€ lib/ # Shared library code +โ”‚ โ”œโ”€โ”€ config_loader.py +โ”‚ โ”œโ”€โ”€ eval_loader.py +โ”‚ โ”œโ”€โ”€ api_client.py +โ”‚ โ””โ”€โ”€ judge.py +โ”œโ”€โ”€ reports/ # Generated CSV reports +โ”œโ”€โ”€ run_action_agent.py # Action agent runner +โ”œโ”€โ”€ pyproject.toml # Project configuration and dependencies +โ””โ”€โ”€ requirements.txt # Legacy pip requirements (optional) +``` + +## Setup + +### 1. 
Install uv (if not already installed) + +```bash +# macOS/Linux +curl -LsSf https://astral.sh/uv/install.sh | sh + +# Or using pip +pip install uv +``` + +### 2. Install Dependencies + +```bash +cd evals + +# Install dependencies using uv +uv pip install -e . + +# Or use uv sync for development +uv sync +``` + +**Alternative (using pip):** +```bash +pip install -r requirements.txt +``` + +### 3. Configure Environment + +You have two options for setting API keys: + +#### Option A: Using .env file (Recommended) + +```bash +# Copy the example file +cp .env.example .env + +# Edit .env and add your API keys +# The file will be automatically loaded when running evaluations +``` + +Example `.env` file: +```bash +OPENAI_API_KEY=sk-your-actual-key-here +GROQ_API_KEY=gsk-your-actual-key-here # Optional +``` + +#### Option B: Using shell environment variables + +```bash +export OPENAI_API_KEY="sk-..." # Required for LLM judge +export GROQ_API_KEY="gsk-..." # Optional, if using Groq models +``` + +### 4. Configure Models + +Edit `config.yml` to set your model preferences: + +```yaml +main_model: + provider: "openai" + model_name: "gpt-4" + api_key: "${OPENAI_API_KEY}" + +judge_model: + provider: "openai" + model_name: "gpt-4" + api_key: "${OPENAI_API_KEY}" +``` + +The config supports environment variable substitution using `${VAR_NAME}` syntax. + +### 5. Start Evaluation Server + +Ensure the evaluation server is running: + +```bash +# From the project root +make compose-dev +# OR +docker run -d --name kernel-browser-extended ... kernel-browser:extended +``` + +The server should be accessible at `http://localhost:8080` (or the URL specified in `config.yml`). + +## Usage + +### Running Action Agent Evaluations + +```bash +# Run all enabled action-agent evaluations (up to default limit) +./run_action_agent.py + +# Run first 10 evaluations +./run_action_agent.py --limit 10 + +# Run specific evaluations by ID +./run_action_agent.py --eval-ids action-agent-click-001 action-agent-form-001 + +# Use custom config file +./run_action_agent.py --config /path/to/config.yml +``` + +### Command-Line Options + +``` +--limit N Maximum number of evaluations to run +--eval-ids ID... Specific evaluation IDs to run +--config PATH Path to config file (default: evals/config.yml) +``` + +## How It Works + +### 1. Load Configuration + +The runner automatically: +- Loads environment variables from `.env` file (if present) +- Loads model configurations from `config.yml`, including: + - API endpoint for the evaluation server + - Model tiers (main, mini, nano) for agent requests + - Judge model for evaluation assessment + - Execution settings (timeouts, delays, etc.) +- Substitutes environment variables using `${VAR_NAME}` syntax + +### 2. Load Evaluations + +Evaluation definitions are loaded from YAML files in `data/`: + +```yaml +id: "action-agent-click-001" +name: "Search with Text Entry and Click" +tool: "action_agent" +input: + objective: "Type 'DevTools automation' in search box and click search button" +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + criteria: + - "Successfully located the search input field" + - "Entered text correctly" + - "Search was executed and results loaded" +``` + +### 3. Execute Evaluations + +For each evaluation: + +1. **Extract input** from the YAML definition +2. **Send API request** to `/v1/responses` with model config +3. **Receive response** from the agent +4. **Judge response** using LLM against validation criteria +5. 
**Record result** (pass/fail, score, reasoning) + +### 4. Generate Reports + +Results are saved to `reports/` as timestamped CSV files: + +``` +reports/action-agent_2025-01-17_14-30-45.csv +``` + +CSV columns: +- `timestamp`: When the evaluation was run +- `eval_id`: Evaluation identifier +- `eval_name`: Human-readable name +- `category`: Evaluation category +- `status`: PASS or FAIL +- `score`: Numerical score (0-1) +- `judge_reasoning`: LLM judge's explanation +- `execution_time_ms`: API request duration +- `error`: Error message (if any) + +## Creating New Runners + +To create a runner for a different category (e.g., `research-agent`): + +1. Copy `run_action_agent.py` to `run_research_agent.py` +2. Update the category parameter in `run_evaluations()`: + ```python + runner.run_evaluations( + category='research-agent', # Change this + limit=limit, + eval_ids=args.eval_ids + ) + ``` +3. Update the script description and help text +4. Make it executable: `chmod +x run_research_agent.py` + +All runners share the same configuration and library code. + +## Adding New Evaluations + +To add new evaluation definitions: + +1. Create a YAML file in the appropriate `data/` subdirectory +2. Follow the existing evaluation format: + ```yaml + id: "unique-eval-id" + name: "Human-readable name" + enabled: true + tool: "action_agent" # or chat, research_agent, etc. + input: + objective: "Task description" + validation: + type: "llm-judge" + llm_judge: + criteria: + - "Criterion 1" + - "Criterion 2" + ``` +3. The evaluation will be automatically discovered and loaded + +## Configuration Reference + +### Model Configuration + +```yaml +main_model: + provider: "openai" # Provider: openai, groq, etc. + model_name: "gpt-4" # Model identifier + api_key: "${ENV_VAR}" # API key (supports env vars) +``` + +### Execution Settings + +```yaml +execution: + default_limit: 20 # Default number of evals to run + timeout: 300 # API request timeout (seconds) + concurrent_requests: 1 # Concurrent execution (future) + request_delay: 1 # Delay between requests (seconds) +``` + +### Reporting Settings + +```yaml +reporting: + reports_dir: "reports" # Where to save CSV reports + format: "csv" # Report format + include_reasoning: true # Include judge reasoning +``` + +## Troubleshooting + +### API Server Connection Failed + +``` +ERROR: Cannot connect to API server at http://localhost:8080 +``` + +**Solution**: Ensure the evaluation server is running and accessible: +```bash +curl http://localhost:8080/status +``` + +### Environment Variable Not Found + +``` +ValueError: Environment variable ${OPENAI_API_KEY} not found +``` + +**Solution**: Set the required environment variable using one of these methods: + +1. **Using .env file (recommended)**: + ```bash + cp .env.example .env + # Edit .env and add: OPENAI_API_KEY=sk-your-actual-key + ``` + +2. **Using shell export**: + ```bash + export OPENAI_API_KEY="sk-..." + ``` + +### No Evaluations Found + +``` +No evaluations found in category: action-agent +``` + +**Solution**: Verify that: +1. The `data/action-agent/` directory exists +2. It contains `.yaml` files +3. 
Evaluations have `enabled: true` + +## Future Enhancements + +- Additional runner scripts for other categories +- Parallel evaluation execution +- Web UI for viewing reports +- Integration with CI/CD pipelines +- Support for additional judge providers (Claude, local models) diff --git a/evals/config.yml b/evals/config.yml new file mode 100644 index 0000000..5de80b5 --- /dev/null +++ b/evals/config.yml @@ -0,0 +1,75 @@ +# Evaluation Framework Configuration +# This configuration is shared across all evaluation runner scripts + +# API endpoint for the evaluation server +api_endpoint: "http://localhost:8080" + +# Model configurations for running evaluations +# These models are sent to the agent for processing requests + +main_model: + provider: "openai" + model_name: "gpt-5" + api_key: "${OPENAI_API_KEY}" + +mini_model: + provider: "openai" + model_name: "gpt-5-mini" + api_key: "${OPENAI_API_KEY}" + +nano_model: + provider: "openai" + model_name: "gpt-5-nano" + api_key: "${OPENAI_API_KEY}" + +# main_model: +# provider: "openrouter" +# model_name: "openai/gpt-oss-20b:free" +# # model_name: "tngtech/deepseek-r1t2-chimera:free" +# api_key: "${OPENROUTER_API_KEY}" + +# mini_model: +# provider: "openrouter" +# model_name: "x-ai/grok-4-fast:free" +# api_key: "${OPENROUTER_API_KEY}" + +# nano_model: +# provider: "openrouter" +# model_name: "x-ai/grok-4-fast:free" +# api_key: "${OPENROUTER_API_KEY}" + +# Model configuration for judging evaluation responses +# This model is used locally to assess the quality of agent responses + +judge_model: + provider: "openai" + model_name: "gpt-5" + api_key: "${OPENAI_API_KEY}" + # temperature: 0.1 # GPT-5 doesn't support custom temperature + +# Execution settings + +execution: + # Default number of evaluations to run per script execution + default_limit: 20 + + # Timeout for API requests (seconds) - set to max for slow custom API + timeout: 3600 + + # Number of concurrent evaluation requests + concurrent_requests: 1 + + # Delay between requests (seconds) + request_delay: 1 + +# Reporting settings + +reporting: + # Directory for storing evaluation reports + reports_dir: "reports" + + # Report format + format: "csv" + + # Include detailed judge reasoning in reports + include_reasoning: true diff --git a/evals/data/action-agent/a11y-001.yaml b/evals/data/action-agent/a11y-001.yaml new file mode 100644 index 0000000..7c7947a --- /dev/null +++ b/evals/data/action-agent/a11y-001.yaml @@ -0,0 +1,46 @@ +# Accessibility action test +id: "a11y-001" +name: "Click Using ARIA Label" +description: "Test clicking an element identified primarily by ARIA attributes" +enabled: true + +target: + url: "https://www.w3.org/WAI/ARIA/apg/patterns/button/examples/button/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click the button with aria-label \"Print Page\"" + reasoning: "Testing action selection using accessibility attributes" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Used accessibility tree to find elements" + - "Correctly identified element by ARIA label" + - "Successfully clicked the target button" + - "Demonstrated understanding of accessibility attributes" + - "No reliance on visual appearance alone" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the Print Page button was successfully clicked" + - "Check if any print dialog or print preview appeared" + - "Confirm the button showed visual 
feedback (pressed state)" + - "Ensure the action was performed on the correct accessibility-labeled element" + +metadata: + tags: ["action", "accessibility", "aria", "click", "a11y"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/accordion-001.yaml b/evals/data/action-agent/accordion-001.yaml new file mode 100644 index 0000000..dae142d --- /dev/null +++ b/evals/data/action-agent/accordion-001.yaml @@ -0,0 +1,46 @@ +# Accordion expansion test +id: "accordion-001" +name: "Expand Accordion Section" +description: "Test clicking to expand an accordion panel" +enabled: true + +target: + url: "https://jqueryui.com/accordion/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click to expand the \"Section 2\" accordion panel" + reasoning: "Testing accordion expand/collapse interaction" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the Section 2 accordion header" + - "Successfully clicked to expand the section" + - "Section 2 content became visible" + - "Other sections collapsed appropriately" + - "Accordion animation completed smoothly" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify Section 2 is now expanded and content visible" + - "Check if other accordion sections collapsed" + - "Confirm the expansion animation completed" + - "Ensure Section 2 header shows expanded state" + +metadata: + tags: ["action", "accordion", "expand", "collapse", "ui"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-a11y-001.yaml b/evals/data/action-agent/action-agent-a11y-001.yaml new file mode 100644 index 0000000..9526551 --- /dev/null +++ b/evals/data/action-agent/action-agent-a11y-001.yaml @@ -0,0 +1,46 @@ +# Accessibility action test +id: "action-agent-a11y-001" +name: "Click Using ARIA Label" +description: "Test clicking an element identified primarily by ARIA attributes" +enabled: true + +target: + url: "https://www.w3.org/WAI/ARIA/apg/patterns/button/examples/button/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click the button with aria-label \"Print Page\"" + reasoning: "Testing action selection using accessibility attributes" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Used accessibility tree to find elements" + - "Correctly identified element by ARIA label" + - "Successfully clicked the target button" + - "Demonstrated understanding of accessibility attributes" + - "No reliance on visual appearance alone" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the Print Page button was successfully clicked" + - "Check if any print dialog or print preview appeared" + - "Confirm the button showed visual feedback (pressed state)" + - "Ensure the action was performed on the correct accessibility-labeled element" + +metadata: + tags: ["action", "accessibility", "aria", "click", "a11y"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-accordion-001.yaml b/evals/data/action-agent/action-agent-accordion-001.yaml 
new file mode 100644 index 0000000..f2df343 --- /dev/null +++ b/evals/data/action-agent/action-agent-accordion-001.yaml @@ -0,0 +1,46 @@ +# Accordion expansion test +id: "action-agent-accordion-001" +name: "Expand Accordion Section" +description: "Test clicking to expand an accordion panel" +enabled: true + +target: + url: "https://jqueryui.com/accordion/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click to expand the \"Section 2\" accordion panel" + reasoning: "Testing accordion expand/collapse interaction" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the Section 2 accordion header" + - "Successfully clicked to expand the section" + - "Section 2 content became visible" + - "Other sections collapsed appropriately" + - "Accordion animation completed smoothly" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify Section 2 is now expanded and content visible" + - "Check if other accordion sections collapsed" + - "Confirm the expansion animation completed" + - "Ensure Section 2 header shows expanded state" + +metadata: + tags: ["action", "accordion", "expand", "collapse", "ui"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-autocomplete-001.yaml b/evals/data/action-agent/action-agent-autocomplete-001.yaml new file mode 100644 index 0000000..c22bfc7 --- /dev/null +++ b/evals/data/action-agent/action-agent-autocomplete-001.yaml @@ -0,0 +1,46 @@ +# Autocomplete search test +id: "action-agent-autocomplete-001" +name: "Use Autocomplete Search" +description: "Test typing in autocomplete field and selecting from suggestions" +enabled: true + +target: + url: "https://jqueryui.com/autocomplete/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Type \"Java\" in the autocomplete field and select \"JavaScript\" from suggestions" + reasoning: "Testing autocomplete/typeahead interaction patterns" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the autocomplete input field" + - "Typed \"Java\" to trigger suggestions" + - "Autocomplete dropdown appeared with suggestions" + - "Selected \"JavaScript\" from the suggestion list" + - "Input field shows the selected value" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify \"JavaScript\" appears in the input field" + - "Check if autocomplete suggestions appeared" + - "Confirm the correct suggestion was selected" + - "Ensure dropdown closed after selection" + +metadata: + tags: ["action", "autocomplete", "typeahead", "search", "suggestions"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-checkbox-001.yaml b/evals/data/action-agent/action-agent-checkbox-001.yaml new file mode 100644 index 0000000..b76f307 --- /dev/null +++ b/evals/data/action-agent/action-agent-checkbox-001.yaml @@ -0,0 +1,46 @@ +# Checkbox/radio button test +id: "action-agent-checkbox-001" +name: "Toggle Newsletter Checkbox" +description: "Test clicking checkbox elements for form options" +enabled: true + +target: + url: "https://www.w3schools.com/html/tryit.asp?filename=tryhtml_checkbox" + 
wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 45000 + +input: + objective: "Click the checkbox labeled \"I have a bike\" to check it" + reasoning: "Testing interaction with checkbox form elements" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Identified the correct checkbox among multiple options" + - "Used click action on the checkbox element" + - "Checkbox state changed from unchecked to checked" + - "Handled the iframe structure if present" + - "No errors with form element interaction" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare screenshots to verify the checkbox state changed from unchecked to checked" + - "Confirm the \"I have a bike\" checkbox now shows a checkmark" + - "Verify the checkbox visual indicator (checkmark) is clearly visible" + - "Ensure no other checkboxes were accidentally modified" + +metadata: + tags: ["action", "checkbox", "form", "w3schools", "input"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-checkbox-002.yaml b/evals/data/action-agent/action-agent-checkbox-002.yaml new file mode 100644 index 0000000..0b25fa8 --- /dev/null +++ b/evals/data/action-agent/action-agent-checkbox-002.yaml @@ -0,0 +1,47 @@ +# Toggle checkbox test - using HTML form test site +id: "action-agent-checkbox-002" +name: "Check Extra Cheese Checkbox" +description: "Test checking a specific checkbox using the check method" +enabled: true + +target: + url: "https://httpbin.org/forms/post" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 45000 + +input: + objective: "Find and check the \"Extra Cheese\" checkbox in the Pizza Toppings section" + reasoning: "Testing checkbox interaction functionality using check method" + hint: "Look for the Extra Cheese checkbox and use the check method to select it" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the Extra Cheese checkbox in the Pizza Toppings section" + - "Used the check method instead of click for better reliability" + - "Checkbox became checked (if it wasn't already)" + - "No errors occurred during checkbox interaction" + - "Form maintained its structure after checkbox selection" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the Extra Cheese checkbox is now checked (shows checkmark)" + - "Check that the checkbox shows proper visual feedback for checked state" + - "Confirm the form structure remained intact" + - "Ensure the checkbox for Extra Cheese was specifically targeted and checked" + +metadata: + tags: ["action", "checkbox", "check", "form", "httpbin"] + priority: "high" + timeout: 45000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-click-001.yaml b/evals/data/action-agent/action-agent-click-001.yaml new file mode 100644 index 0000000..e9af6cf --- /dev/null +++ b/evals/data/action-agent/action-agent-click-001.yaml @@ -0,0 +1,47 @@ +# Basic search interaction test +id: "action-agent-click-001" +name: "Search with Text Entry and Click" +description: "Test entering text in search field and clicking search button" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: 
"action_agent" +timeout: 90000 + +input: + objective: "Type \"DevTools automation\" in the search box and then click the \"Google Search\" button" + reasoning: "Testing multi-step interaction: text input followed by button click" + hint: "First fill the search input field, then find and click the search button" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Successfully located the search input field" + - "Entered \"DevTools automation\" text in the search box" + - "Located the Google Search button after entering text" + - "Successfully clicked the search button" + - "Search was executed and results page loaded" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify text \"DevTools automation\" was entered in the search field" + - "Check if search results page loaded with relevant results" + - "Confirm the search was executed (URL changed to results page)" + - "Ensure search results are related to \"DevTools automation\"" + +metadata: + tags: ["action", "multi-step", "search", "form-fill", "click", "google", "basic"] + priority: "high" + timeout: 90000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-context-001.yaml b/evals/data/action-agent/action-agent-context-001.yaml new file mode 100644 index 0000000..6162697 --- /dev/null +++ b/evals/data/action-agent/action-agent-context-001.yaml @@ -0,0 +1,46 @@ +# Right click context menu test +id: "action-agent-context-001" +name: "Right Click Context Menu" +description: "Test right-clicking to open context menu" +enabled: true + +target: + url: "https://the-internet.herokuapp.com/context_menu" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Right-click on the context menu area to open the context menu" + reasoning: "Testing right-click context menu interaction" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the designated context menu area" + - "Performed right-click action correctly" + - "Context menu appeared with options" + - "Successfully triggered the right-click event" + - "Alert or confirmation appeared as expected" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify right-click was performed on correct area" + - "Check if context menu or alert appeared" + - "Confirm right-click event was properly triggered" + - "Ensure the expected response occurred" + +metadata: + tags: ["action", "context-menu", "right-click", "mouse", "menu"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-datepicker-001.yaml b/evals/data/action-agent/action-agent-datepicker-001.yaml new file mode 100644 index 0000000..f4abbf7 --- /dev/null +++ b/evals/data/action-agent/action-agent-datepicker-001.yaml @@ -0,0 +1,46 @@ +# Date picker test +id: "action-agent-datepicker-001" +name: "Select Date from Calendar" +description: "Test clicking date input and selecting a specific date from calendar popup" +enabled: true + +target: + url: "https://jqueryui.com/datepicker/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click the date input field and select March 15, 2024 from the calendar picker" + reasoning: "Testing 
interaction with calendar popup widgets" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located and clicked the date input field" + - "Calendar popup opened successfully" + - "Navigated to correct month/year if needed" + - "Selected the specific date (March 15, 2024)" + - "Date input field shows the selected date" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the date input field contains the selected date" + - "Check if the calendar widget opened and closed properly" + - "Confirm the correct date was highlighted and selected" + - "Ensure the date format matches expected output" + +metadata: + tags: ["action", "datepicker", "calendar", "form", "popup"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-daterange-001.yaml b/evals/data/action-agent/action-agent-daterange-001.yaml new file mode 100644 index 0000000..4581a47 --- /dev/null +++ b/evals/data/action-agent/action-agent-daterange-001.yaml @@ -0,0 +1,46 @@ +# Date range picker test +id: "action-agent-daterange-001" +name: "Select Date Range" +description: "Test selecting a date range with start and end dates" +enabled: true + +target: + url: "https://www.daterangepicker.com/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Select a date range from February 1, 2024 to February 28, 2024" + reasoning: "Testing complex date range selection with start and end dates" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Opened the date range picker interface" + - "Selected the start date (February 1, 2024)" + - "Selected the end date (February 28, 2024)" + - "Date range was properly applied" + - "Input field shows the complete date range" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify both start and end dates are displayed in the input" + - "Check if the date range picker shows the selected range" + - "Confirm the format matches expected date range display" + - "Ensure both dates were selected in sequence" + +metadata: + tags: ["action", "daterange", "date-picker", "form", "complex"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-dropdown-001.yaml b/evals/data/action-agent/action-agent-dropdown-001.yaml new file mode 100644 index 0000000..b37b91c --- /dev/null +++ b/evals/data/action-agent/action-agent-dropdown-001.yaml @@ -0,0 +1,46 @@ +# Dropdown selection test +id: "action-agent-dropdown-001" +name: "Select Dropdown Option" +description: "Test selecting an option from a dropdown menu" +enabled: true + +target: + url: "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_select" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 45000 + +input: + objective: "Select \"Audi\" from the car brands dropdown menu" + reasoning: "Testing dropdown selection interaction" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the dropdown/select element" + - "Identified the correct option to select" + - "Successfully selected the Audi option" + - "Dropdown value changed to the selected option" + - "Handled select element 
interaction properly" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare screenshots to verify the dropdown selection changed" + - "Confirm \"Audi\" is now displayed as the selected option" + - "Check if the dropdown is closed after selection" + - "Verify no other form elements were affected by the selection" + +metadata: + tags: ["action", "dropdown", "select", "form", "w3schools"] + priority: "high" + timeout: 45000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-dynamic-001.yaml b/evals/data/action-agent/action-agent-dynamic-001.yaml new file mode 100644 index 0000000..a4380f3 --- /dev/null +++ b/evals/data/action-agent/action-agent-dynamic-001.yaml @@ -0,0 +1,46 @@ +# Dynamic content interaction test +id: "action-agent-dynamic-001" +name: "Click Dynamic Load Button" +description: "Test clicking a button that loads dynamic content" +enabled: true + +target: + url: "https://the-internet.herokuapp.com/dynamic_loading/1" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 90000 + +input: + objective: "Click the \"Start\" button to trigger dynamic content loading" + reasoning: "Testing interaction with dynamically loaded content" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Found and clicked the Start button" + - "Handled the dynamic loading process" + - "Recognized that content changes after clicking" + - "No timing issues with the dynamic content" + - "Successfully triggered the loading animation" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare screenshots to verify dynamic content loaded after clicking Start" + - "Check if loading animation or spinner was displayed" + - "Confirm new content appeared that was previously hidden" + - "Verify the Start button state changed or was replaced after clicking" + +metadata: + tags: ["action", "dynamic", "click", "ajax", "loading"] + priority: "high" + timeout: 90000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-ecommerce-001.yaml b/evals/data/action-agent/action-agent-ecommerce-001.yaml new file mode 100644 index 0000000..503c157 --- /dev/null +++ b/evals/data/action-agent/action-agent-ecommerce-001.yaml @@ -0,0 +1,46 @@ +# E-commerce action test +id: "action-agent-ecommerce-001" +name: "Add Product to Cart" +description: "Test clicking \"Add to Cart\" button on an e-commerce product page" +enabled: true + +target: + url: "https://www.homedepot.com/p/Husky-20-Gal-Professional-Duty-Waterproof-Storage-Container-with-Hinged-Lid-in-Red-249160/313799634" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 180000 + +input: + objective: "Click the \"Add to Cart\" button for this storage container" + reasoning: "Testing e-commerce interaction with product cart functionality" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the Add to Cart button on the product page" + - "Successfully clicked the button" + - "Handled any popups or confirmations that appeared" + - "Verified the item was added (cart count changed or confirmation shown)" + - "Dealt with page dynamics after clicking" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare 
screenshots to verify the Add to Cart button was clicked" + - "Check if cart count indicator increased or shows the item was added" + - "Look for any confirmation popup or notification about the item being added" + - "Verify the button state changed (e.g., to \"Added to Cart\" or disabled)" + +metadata: + tags: ["action", "ecommerce", "click", "homedepot", "cart"] + priority: "high" + timeout: 180000 + retries: 3 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-error-001.yaml b/evals/data/action-agent/action-agent-error-001.yaml new file mode 100644 index 0000000..43c95e6 --- /dev/null +++ b/evals/data/action-agent/action-agent-error-001.yaml @@ -0,0 +1,47 @@ +# Error recovery test +id: "action-agent-error-001" +name: "Handle Missing Element" +description: "Test agent behavior when target element is not found" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click the \"Sign Up\" button" + reasoning: "Testing error handling when element does not exist" + hint: "There is no Sign Up button on Google homepage - agent should handle gracefully" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Attempted to find the requested element" + - "Recognized that the element does not exist" + - "Provided clear error message or explanation" + - "Did not crash or produce confusing output" + - "Suggested alternatives or explained the issue" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the page remains in a stable state despite the missing element" + - "Confirm no error dialogs or broken UI elements appeared" + - "Check that the agent handled the missing element gracefully" + - "Ensure the page was properly analyzed even though the target was not found" + +metadata: + tags: ["action", "error-handling", "missing-element", "recovery", "edge-case"] + priority: "high" + timeout: 60000 + retries: 1 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-filter-001.yaml b/evals/data/action-agent/action-agent-filter-001.yaml new file mode 100644 index 0000000..7782999 --- /dev/null +++ b/evals/data/action-agent/action-agent-filter-001.yaml @@ -0,0 +1,46 @@ +# Search filter application test +id: "action-agent-filter-001" +name: "Apply Search Filters" +description: "Test applying search filters to modify results" +enabled: true + +target: + url: "https://www.w3schools.com/howto/howto_js_filter_lists.asp" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Type \"Anna\" in the search filter to filter the list" + reasoning: "Testing search filter application" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the search filter input" + - "Typed \"Anna\" in the filter field" + - "List items filtered to show only matching results" + - "Non-matching items were hidden or removed from view" + - "Filter functionality worked as expected" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify search input contains \"Anna\"" + - "Check if list shows only items containing \"Anna\"" + - "Confirm non-matching items are not visible" + - "Ensure filter functionality reduced the visible list 
items" + +metadata: + tags: ["action", "filter", "search", "list", "dynamic"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-form-001.yaml b/evals/data/action-agent/action-agent-form-001.yaml new file mode 100644 index 0000000..61d036f --- /dev/null +++ b/evals/data/action-agent/action-agent-form-001.yaml @@ -0,0 +1,46 @@ +# Form fill action test +id: "action-agent-form-001" +name: "Fill Search Query" +description: "Test filling a search input field with specific text" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 45000 + +input: + objective: "Fill the search box with \"Chrome DevTools automation testing\"" + reasoning: "Testing form input capability with a specific search query" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Successfully identified the search input field" + - "Used perform_action with fill method" + - "Correctly filled the field with the specified text" + - "Verified the field accepted the input" + - "No formatting or encoding issues with the text" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare screenshots to confirm text was entered in the search field" + - "Verify the exact text \"Chrome DevTools automation testing\" is visible" + - "Check if search suggestions or autocomplete dropdown appeared" + - "Ensure no input validation errors are shown" + +metadata: + tags: ["action", "form-fill", "input", "google", "basic"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-hover-001.yaml b/evals/data/action-agent/action-agent-hover-001.yaml new file mode 100644 index 0000000..ed98fbf --- /dev/null +++ b/evals/data/action-agent/action-agent-hover-001.yaml @@ -0,0 +1,46 @@ +# Hover action test +id: "action-agent-hover-001" +name: "Hover to Reveal Menu" +description: "Test hovering over an element to reveal hidden content" +enabled: true + +target: + url: "https://the-internet.herokuapp.com/hovers" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Hover over the first user avatar image to reveal the hidden caption" + reasoning: "Testing hover interaction to reveal dynamic content" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the first user avatar image" + - "Used appropriate hover action method" + - "Successfully triggered the hover state" + - "Hidden caption became visible after hover" + - "Handled mouse interaction correctly" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare screenshots to verify hover revealed hidden content" + - "Check that caption or overlay appeared over the first avatar" + - "Confirm the hover state is visually active on the image" + - "Verify user information or caption text is now visible" + +metadata: + tags: ["action", "hover", "mouse", "dynamic", "reveal"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-keyboard-001.yaml b/evals/data/action-agent/action-agent-keyboard-001.yaml new file mode 100644 index 
0000000..6bfceac --- /dev/null +++ b/evals/data/action-agent/action-agent-keyboard-001.yaml @@ -0,0 +1,46 @@ +# Keyboard tab navigation test +id: "action-agent-keyboard-001" +name: "Keyboard Tab Navigation" +description: "Test using keyboard navigation to move between elements" +enabled: true + +target: + url: "https://www.w3.org/WAI/ARIA/apg/patterns/menubar/examples/menubar-navigation/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Use Tab key to navigate between menu items and Enter to activate" + reasoning: "Testing keyboard-only navigation patterns" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Successfully used keyboard navigation" + - "Tab key moved focus between menu items" + - "Focus indicators were visible during navigation" + - "Enter key activated the focused menu item" + - "Keyboard navigation followed accessibility standards" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify focus indicators are visible on menu items" + - "Check if keyboard navigation moved focus correctly" + - "Confirm Enter key activated the focused item" + - "Ensure accessibility navigation patterns worked" + +metadata: + tags: ["action", "keyboard", "navigation", "accessibility", "focus"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-login-001.yaml b/evals/data/action-agent/action-agent-login-001.yaml new file mode 100644 index 0000000..1b705ce --- /dev/null +++ b/evals/data/action-agent/action-agent-login-001.yaml @@ -0,0 +1,47 @@ +# Login form test +id: "action-agent-login-001" +name: "Fill Login Credentials" +description: "Test filling username and password fields in a login form" +enabled: true + +target: + url: "https://the-internet.herokuapp.com/login" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Fill the username field with \"tomsmith\" and password field with \"SuperSecretPassword!\"" + reasoning: "Testing form fill with multiple fields including password type" + input_data: "tomsmithSuperSecretPassword!" 
+ +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Identified both username and password fields" + - "Filled username field with correct value" + - "Filled password field with correct value" + - "Handled password field type appropriately" + - "Used the provided input_data XML format correctly" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the username field shows \"tomsmith\" entered" + - "Confirm the password field has dots/asterisks indicating password entry" + - "Check that both fields are properly filled before submission" + - "Ensure no validation errors are shown for the filled fields" + +metadata: + tags: ["action", "login", "form-fill", "authentication", "multi-field"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-modal-001.yaml b/evals/data/action-agent/action-agent-modal-001.yaml new file mode 100644 index 0000000..1324fee --- /dev/null +++ b/evals/data/action-agent/action-agent-modal-001.yaml @@ -0,0 +1,46 @@ +# Modal dialog test +id: "action-agent-modal-001" +name: "Open and Close Modal" +description: "Test opening modal dialog and closing it with X button" +enabled: true + +target: + url: "https://getbootstrap.com/docs/5.0/components/modal/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click to open the modal dialog, then close it using the X button" + reasoning: "Testing modal dialog interaction patterns" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located and clicked the modal trigger button" + - "Modal dialog opened successfully" + - "Modal content was visible and accessible" + - "Found and clicked the close (X) button" + - "Modal closed and page returned to normal state" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify modal opened with visible content" + - "Check if modal overlay appeared correctly" + - "Confirm modal was closed after clicking X" + - "Ensure page background is accessible again" + +metadata: + tags: ["action", "modal", "dialog", "popup", "overlay"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-multiselect-001.yaml b/evals/data/action-agent/action-agent-multiselect-001.yaml new file mode 100644 index 0000000..fed3f78 --- /dev/null +++ b/evals/data/action-agent/action-agent-multiselect-001.yaml @@ -0,0 +1,46 @@ +# Multi-select dropdown test +id: "action-agent-multiselect-001" +name: "Select Multiple Options" +description: "Test selecting multiple options from a multi-select dropdown" +enabled: true + +target: + url: "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_select_multiple" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Select both \"Volvo\" and \"Audi\" from the multi-select dropdown" + reasoning: "Testing multiple selection in select elements" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the multi-select dropdown element" + - "Successfully selected Volvo option" + - "Successfully selected Audi option" + - "Both options remain selected simultaneously" + - "Used 
appropriate multi-select interaction method" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify both Volvo and Audi appear selected" + - "Check if both options are highlighted/marked" + - "Confirm multi-select functionality worked correctly" + - "Ensure no other options were accidentally selected" + +metadata: + tags: ["action", "multi-select", "dropdown", "form", "multiple"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-multistep-001.yaml b/evals/data/action-agent/action-agent-multistep-001.yaml new file mode 100644 index 0000000..31514dd --- /dev/null +++ b/evals/data/action-agent/action-agent-multistep-001.yaml @@ -0,0 +1,47 @@ +# Multi-step form test +id: "action-agent-multistep-001" +name: "Complete Search and Submit" +description: "Test filling a search form and then clicking the submit button" +enabled: true + +target: + url: "https://www.bing.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Fill the search box with \"automated testing tools\" and then click the search button" + reasoning: "Testing multi-step form interaction combining fill and click actions" + hint: "This requires two actions: first fill the search field, then click the search button" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Recognized this requires multiple actions" + - "First filled the search input correctly" + - "Then located and clicked the search button" + - "Both actions completed successfully in sequence" + - "Search was initiated with the correct query" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the search input contains \"automated testing tools\" text" + - "Confirm the search was submitted and results page loaded" + - "Check that search results are related to the query" + - "Ensure the multi-step action completed fully with both fill and click" + +metadata: + tags: ["action", "multi-step", "form-fill", "click", "bing", "search"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-nav-001.yaml b/evals/data/action-agent/action-agent-nav-001.yaml new file mode 100644 index 0000000..f49a0cf --- /dev/null +++ b/evals/data/action-agent/action-agent-nav-001.yaml @@ -0,0 +1,46 @@ +# Complex navigation test +id: "action-agent-nav-001" +name: "Navigate via Menu Click" +description: "Test clicking navigation menu items to navigate between pages" +enabled: true + +target: + url: "https://www.wikipedia.org" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click on the \"English\" language link to navigate to English Wikipedia" + reasoning: "Testing navigation through link clicks on a multilingual site" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Identified the correct language link among many options" + - "Successfully clicked the English link" + - "Navigation occurred to the English Wikipedia" + - "Used appropriate tools to verify navigation success" + - "Handled the multilingual page structure correctly" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare 
screenshots to verify navigation from Wikipedia homepage to English Wikipedia" + - "Check if the page language and content changed to English" + - "Verify the URL changed to en.wikipedia.org" + - "Confirm the English Wikipedia main page is displayed" + +metadata: + tags: ["action", "navigation", "click", "wikipedia", "multilingual"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-radio-001.yaml b/evals/data/action-agent/action-agent-radio-001.yaml new file mode 100644 index 0000000..07d6ef8 --- /dev/null +++ b/evals/data/action-agent/action-agent-radio-001.yaml @@ -0,0 +1,47 @@ +# Radio button selection test +id: "action-agent-radio-001" +name: "Select Radio Button Option" +description: "Test selecting a specific radio button option using click method" +enabled: true + +target: + url: "https://httpbin.org/forms/post" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 45000 + +input: + objective: "Select the \"Medium\" pizza size from the Pizza Size radio button group" + reasoning: "Testing radio button selection functionality" + hint: "Look for the Medium radio button in the Pizza Size section and click it to select" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the Medium radio button in the Pizza Size section" + - "Successfully clicked the Medium radio button" + - "Radio button became selected (checked state)" + - "Other radio buttons in the same group became unselected" + - "Form maintained its structure after radio button selection" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the Medium radio button is now selected (shows filled circle)" + - "Check that other pizza size options (Small, Large) are no longer selected" + - "Confirm the form structure remained intact" + - "Ensure the Medium pizza size radio button was specifically targeted" + +metadata: + tags: ["action", "radio", "click", "form", "httpbin"] + priority: "high" + timeout: 45000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-slider-001.yaml b/evals/data/action-agent/action-agent-slider-001.yaml new file mode 100644 index 0000000..c370658 --- /dev/null +++ b/evals/data/action-agent/action-agent-slider-001.yaml @@ -0,0 +1,46 @@ +# Range slider test +id: "action-agent-slider-001" +name: "Adjust Range Slider" +description: "Test moving slider to set a specific value" +enabled: true + +target: + url: "https://jqueryui.com/slider/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Move the slider to set the value to 75" + reasoning: "Testing slider/range input manipulation" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the slider control element" + - "Successfully moved the slider handle" + - "Set the slider value to approximately 75" + - "Slider position reflects the target value" + - "Any associated display shows the correct value" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify slider handle moved to represent value 75" + - "Check if value display shows 75 or close to it" + - "Confirm slider position visually matches target" + - "Ensure slider interaction was smooth and successful" 
+ +metadata: + tags: ["action", "slider", "range", "form", "drag"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-tableselect-001.yaml b/evals/data/action-agent/action-agent-tableselect-001.yaml new file mode 100644 index 0000000..d78e66c --- /dev/null +++ b/evals/data/action-agent/action-agent-tableselect-001.yaml @@ -0,0 +1,46 @@ +# Table row selection test +id: "action-agent-tableselect-001" +name: "Select Table Row" +description: "Test clicking to select a table row" +enabled: true + +target: + url: "https://datatables.net/examples/api/select_single_row.html" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click on the first row to select it" + reasoning: "Testing table row selection patterns" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the first table row" + - "Successfully clicked the row" + - "Row became highlighted/selected" + - "Selection state is visually apparent" + - "Only one row is selected at a time" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the first row is now highlighted/selected" + - "Check if row selection visual feedback is clear" + - "Confirm only the clicked row is selected" + - "Ensure row selection styling is properly applied" + +metadata: + tags: ["action", "table", "select", "row", "highlight"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-tablesort-001.yaml b/evals/data/action-agent/action-agent-tablesort-001.yaml new file mode 100644 index 0000000..e3e3176 --- /dev/null +++ b/evals/data/action-agent/action-agent-tablesort-001.yaml @@ -0,0 +1,46 @@ +# Table column sorting test +id: "action-agent-tablesort-001" +name: "Sort Table Column" +description: "Test clicking table column header to sort data" +enabled: true + +target: + url: "https://datatables.net/examples/basic_init/zero_configuration.html" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click on the \"Name\" column header to sort the table by name" + reasoning: "Testing table column sorting interaction" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the Name column header" + - "Successfully clicked the column header" + - "Table data reordered by name alphabetically" + - "Sort indicator appeared on the Name column" + - "Table sorting completed without errors" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify table rows are now sorted alphabetically by name" + - "Check if sort arrow/indicator appears on Name column" + - "Confirm the data order changed from before to after" + - "Ensure table structure remained intact after sorting" + +metadata: + tags: ["action", "table", "sort", "column", "data"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-tabs-001.yaml b/evals/data/action-agent/action-agent-tabs-001.yaml new file mode 100644 index 0000000..22db60c --- /dev/null +++ b/evals/data/action-agent/action-agent-tabs-001.yaml @@ -0,0 +1,46 @@ +# Tab panel navigation test 
+id: "action-agent-tabs-001" +name: "Navigate Tab Panels" +description: "Test clicking tab to switch between tab panels" +enabled: true + +target: + url: "https://jqueryui.com/tabs/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click on the \"Nunc tincidunt\" tab to switch to that panel" + reasoning: "Testing tab panel navigation" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the \"Nunc tincidunt\" tab button" + - "Successfully clicked the tab" + - "Tab panel content switched to the selected tab" + - "Active tab visual state changed appropriately" + - "Content area updated to show the new panel" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the \"Nunc tincidunt\" tab is now active/highlighted" + - "Check if the content panel changed to show new content" + - "Confirm the tab switching animation completed" + - "Ensure the correct tab content is visible" + +metadata: + tags: ["action", "tabs", "navigation", "panels", "ui"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-timepicker-001.yaml b/evals/data/action-agent/action-agent-timepicker-001.yaml new file mode 100644 index 0000000..056fbe9 --- /dev/null +++ b/evals/data/action-agent/action-agent-timepicker-001.yaml @@ -0,0 +1,46 @@ +# Time picker test +id: "action-agent-timepicker-001" +name: "Select Time from Picker" +description: "Test setting time using time picker controls" +enabled: true + +target: + url: "https://timepicker.co/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Set the time to 2:30 PM using the time picker controls" + reasoning: "Testing time selection with hour/minute controls" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the time picker interface" + - "Set the hour to 2 (14 for 24-hour format)" + - "Set the minutes to 30" + - "Selected PM or appropriate time format" + - "Time input shows 2:30 PM or equivalent" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the time input displays 2:30 PM or 14:30" + - "Check if hour and minute were set correctly" + - "Confirm AM/PM selection if applicable" + - "Ensure the time picker interface was properly used" + +metadata: + tags: ["action", "timepicker", "time", "form", "clock"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-upload-001.yaml b/evals/data/action-agent/action-agent-upload-001.yaml new file mode 100644 index 0000000..518515d --- /dev/null +++ b/evals/data/action-agent/action-agent-upload-001.yaml @@ -0,0 +1,46 @@ +# File upload test +id: "action-agent-upload-001" +name: "Upload File via Input" +description: "Test clicking file input and uploading a test file" +enabled: true + +target: + url: "https://the-internet.herokuapp.com/upload" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click the file input and upload a test file" + reasoning: "Testing file upload interaction through input elements" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + 
criteria: + - "Located the file input element" + - "Triggered file selection dialog" + - "Selected a file for upload" + - "File name appears in the input field" + - "Upload process initiated successfully" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify file name appears in the upload input field" + - "Check if file selection was successful" + - "Confirm upload button is available or file is ready" + - "Ensure no upload errors are displayed" + +metadata: + tags: ["action", "upload", "file", "input", "form"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-video-001.yaml b/evals/data/action-agent/action-agent-video-001.yaml new file mode 100644 index 0000000..ba21b28 --- /dev/null +++ b/evals/data/action-agent/action-agent-video-001.yaml @@ -0,0 +1,47 @@ +# Video playback controls test +id: "action-agent-video-001" +name: "Control Video Playback" +description: "Test starting video playback using click + spacebar" +enabled: true + +target: + url: "https://www.w3schools.com/html/html5_video.asp" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 90000 + +input: + objective: "Click the video element to focus it, then press spacebar to start playback" + reasoning: "Testing video control using standard keyboard interaction (click to focus + spacebar to play)" + hint: "First click the Video element to focus it, then use keyboard input to press the spacebar key to start playback" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the Video element in the accessibility tree" + - "Successfully clicked the Video element to focus it" + - "Used keyboard input to press spacebar" + - "Video playback started after spacebar press" + - "No errors occurred during the interaction sequence" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify video player is visible on the page" + - "Check if the play button was clicked (may show pause button after)" + - "Look for visual indicators that video started playing" + - "Ensure no error messages appeared during video interaction" + +metadata: + tags: ["action", "video", "media", "controls", "playback"] + priority: "high" + timeout: 90000 + retries: 3 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-video-002.yaml b/evals/data/action-agent/action-agent-video-002.yaml new file mode 100644 index 0000000..d7188ec --- /dev/null +++ b/evals/data/action-agent/action-agent-video-002.yaml @@ -0,0 +1,47 @@ +# Video play button specific targeting test +id: "action-agent-video-002" +name: "Click Video Play Button Specifically" +description: "Test clicking the specific play button (not the video element)" +enabled: true + +target: + url: "https://www.w3schools.com/html/html5_video.asp" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Find and click the button that has name=\"play\" (not the Video element itself)" + reasoning: "Testing specific targeting of the play button element" + hint: "Target the button element with text or label \"play\", do not click the Video element" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Found a button element (not Video element) with 
\"play\" in the name" + - "Successfully clicked the play button specifically" + - "Did not click on the Video element itself" + - "Play button click was executed correctly" + - "Video responded to the button click" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the play button (not video element) was clicked" + - "Check if video started playing after button click" + - "Confirm the target was the button, not the video container" + - "Look for changes in video player state" + +metadata: + tags: ["action", "video", "button", "specific-targeting"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/autocomplete-001.yaml b/evals/data/action-agent/autocomplete-001.yaml new file mode 100644 index 0000000..4bd4aa8 --- /dev/null +++ b/evals/data/action-agent/autocomplete-001.yaml @@ -0,0 +1,46 @@ +# Autocomplete search test +id: "autocomplete-001" +name: "Use Autocomplete Search" +description: "Test typing in autocomplete field and selecting from suggestions" +enabled: true + +target: + url: "https://jqueryui.com/autocomplete/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Type \"Java\" in the autocomplete field and select \"JavaScript\" from suggestions" + reasoning: "Testing autocomplete/typeahead interaction patterns" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the autocomplete input field" + - "Typed \"Java\" to trigger suggestions" + - "Autocomplete dropdown appeared with suggestions" + - "Selected \"JavaScript\" from the suggestion list" + - "Input field shows the selected value" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify \"JavaScript\" appears in the input field" + - "Check if autocomplete suggestions appeared" + - "Confirm the correct suggestion was selected" + - "Ensure dropdown closed after selection" + +metadata: + tags: ["action", "autocomplete", "typeahead", "search", "suggestions"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/checkbox-001.yaml b/evals/data/action-agent/checkbox-001.yaml new file mode 100644 index 0000000..041f2f6 --- /dev/null +++ b/evals/data/action-agent/checkbox-001.yaml @@ -0,0 +1,46 @@ +# Checkbox/radio button test +id: "checkbox-001" +name: "Toggle Newsletter Checkbox" +description: "Test clicking checkbox elements for form options" +enabled: true + +target: + url: "https://www.w3schools.com/html/tryit.asp?filename=tryhtml_checkbox" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 45000 + +input: + objective: "Click the checkbox labeled \"I have a bike\" to check it" + reasoning: "Testing interaction with checkbox form elements" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Identified the correct checkbox among multiple options" + - "Used click action on the checkbox element" + - "Checkbox state changed from unchecked to checked" + - "Handled the iframe structure if present" + - "No errors with form element interaction" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare screenshots to verify the checkbox state changed from unchecked to checked" + - 
"Confirm the \"I have a bike\" checkbox now shows a checkmark" + - "Verify the checkbox visual indicator (checkmark) is clearly visible" + - "Ensure no other checkboxes were accidentally modified" + +metadata: + tags: ["action", "checkbox", "form", "w3schools", "input"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/checkbox-002.yaml b/evals/data/action-agent/checkbox-002.yaml new file mode 100644 index 0000000..036f388 --- /dev/null +++ b/evals/data/action-agent/checkbox-002.yaml @@ -0,0 +1,47 @@ +# Toggle checkbox test - using HTML form test site +id: "checkbox-002" +name: "Check Extra Cheese Checkbox" +description: "Test checking a specific checkbox using the check method" +enabled: true + +target: + url: "https://httpbin.org/forms/post" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 45000 + +input: + objective: "Find and check the \"Extra Cheese\" checkbox in the Pizza Toppings section" + reasoning: "Testing checkbox interaction functionality using check method" + hint: "Look for the Extra Cheese checkbox and use the check method to select it" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the Extra Cheese checkbox in the Pizza Toppings section" + - "Used the check method instead of click for better reliability" + - "Checkbox became checked (if it wasn't already)" + - "No errors occurred during checkbox interaction" + - "Form maintained its structure after checkbox selection" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the Extra Cheese checkbox is now checked (shows checkmark)" + - "Check that the checkbox shows proper visual feedback for checked state" + - "Confirm the form structure remained intact" + - "Ensure the checkbox for Extra Cheese was specifically targeted and checked" + +metadata: + tags: ["action", "checkbox", "check", "form", "httpbin"] + priority: "high" + timeout: 45000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/click-001.yaml b/evals/data/action-agent/click-001.yaml new file mode 100644 index 0000000..e86c8fd --- /dev/null +++ b/evals/data/action-agent/click-001.yaml @@ -0,0 +1,47 @@ +# Basic search interaction test +id: "click-001" +name: "Search with Text Entry and Click" +description: "Test entering text in search field and clicking search button" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 90000 + +input: + objective: "Type \"DevTools automation\" in the search box and then click the \"Google Search\" button" + reasoning: "Testing multi-step interaction: text input followed by button click" + hint: "First fill the search input field, then find and click the search button" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Successfully located the search input field" + - "Entered \"DevTools automation\" text in the search box" + - "Located the Google Search button after entering text" + - "Successfully clicked the search button" + - "Search was executed and results page loaded" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify text \"DevTools automation\" was entered in the search field" + - "Check if search results 
page loaded with relevant results" + - "Confirm the search was executed (URL changed to results page)" + - "Ensure search results are related to \"DevTools automation\"" + +metadata: + tags: ["action", "multi-step", "search", "form-fill", "click", "google", "basic"] + priority: "high" + timeout: 90000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/context-001.yaml b/evals/data/action-agent/context-001.yaml new file mode 100644 index 0000000..0ca7c58 --- /dev/null +++ b/evals/data/action-agent/context-001.yaml @@ -0,0 +1,46 @@ +# Right click context menu test +id: "context-001" +name: "Right Click Context Menu" +description: "Test right-clicking to open context menu" +enabled: true + +target: + url: "https://the-internet.herokuapp.com/context_menu" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Right-click on the context menu area to open the context menu" + reasoning: "Testing right-click context menu interaction" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the designated context menu area" + - "Performed right-click action correctly" + - "Context menu appeared with options" + - "Successfully triggered the right-click event" + - "Alert or confirmation appeared as expected" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify right-click was performed on correct area" + - "Check if context menu or alert appeared" + - "Confirm right-click event was properly triggered" + - "Ensure the expected response occurred" + +metadata: + tags: ["action", "context-menu", "right-click", "mouse", "menu"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/datepicker-001.yaml b/evals/data/action-agent/datepicker-001.yaml new file mode 100644 index 0000000..9b6a9df --- /dev/null +++ b/evals/data/action-agent/datepicker-001.yaml @@ -0,0 +1,46 @@ +# Date picker test +id: "datepicker-001" +name: "Select Date from Calendar" +description: "Test clicking date input and selecting a specific date from calendar popup" +enabled: true + +target: + url: "https://jqueryui.com/datepicker/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click the date input field and select March 15, 2024 from the calendar picker" + reasoning: "Testing interaction with calendar popup widgets" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located and clicked the date input field" + - "Calendar popup opened successfully" + - "Navigated to correct month/year if needed" + - "Selected the specific date (March 15, 2024)" + - "Date input field shows the selected date" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the date input field contains the selected date" + - "Check if the calendar widget opened and closed properly" + - "Confirm the correct date was highlighted and selected" + - "Ensure the date format matches expected output" + +metadata: + tags: ["action", "datepicker", "calendar", "form", "popup"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/daterange-001.yaml 
b/evals/data/action-agent/daterange-001.yaml new file mode 100644 index 0000000..a9b202b --- /dev/null +++ b/evals/data/action-agent/daterange-001.yaml @@ -0,0 +1,46 @@ +# Date range picker test +id: "daterange-001" +name: "Select Date Range" +description: "Test selecting a date range with start and end dates" +enabled: true + +target: + url: "https://www.daterangepicker.com/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Select a date range from February 1, 2024 to February 28, 2024" + reasoning: "Testing complex date range selection with start and end dates" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Opened the date range picker interface" + - "Selected the start date (February 1, 2024)" + - "Selected the end date (February 28, 2024)" + - "Date range was properly applied" + - "Input field shows the complete date range" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify both start and end dates are displayed in the input" + - "Check if the date range picker shows the selected range" + - "Confirm the format matches expected date range display" + - "Ensure both dates were selected in sequence" + +metadata: + tags: ["action", "daterange", "date-picker", "form", "complex"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/dropdown-001.yaml b/evals/data/action-agent/dropdown-001.yaml new file mode 100644 index 0000000..a64edb0 --- /dev/null +++ b/evals/data/action-agent/dropdown-001.yaml @@ -0,0 +1,46 @@ +# Dropdown selection test +id: "dropdown-001" +name: "Select Dropdown Option" +description: "Test selecting an option from a dropdown menu" +enabled: true + +target: + url: "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_select" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 45000 + +input: + objective: "Select \"Audi\" from the car brands dropdown menu" + reasoning: "Testing dropdown selection interaction" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the dropdown/select element" + - "Identified the correct option to select" + - "Successfully selected the Audi option" + - "Dropdown value changed to the selected option" + - "Handled select element interaction properly" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare screenshots to verify the dropdown selection changed" + - "Confirm \"Audi\" is now displayed as the selected option" + - "Check if the dropdown is closed after selection" + - "Verify no other form elements were affected by the selection" + +metadata: + tags: ["action", "dropdown", "select", "form", "w3schools"] + priority: "high" + timeout: 45000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/dynamic-001.yaml b/evals/data/action-agent/dynamic-001.yaml new file mode 100644 index 0000000..fba60bd --- /dev/null +++ b/evals/data/action-agent/dynamic-001.yaml @@ -0,0 +1,46 @@ +# Dynamic content interaction test +id: "dynamic-001" +name: "Click Dynamic Load Button" +description: "Test clicking a button that loads dynamic content" +enabled: true + +target: + url: "https://the-internet.herokuapp.com/dynamic_loading/1" + wait_for: "networkidle" + wait_timeout: 
5000 + +tool: "action_agent" +timeout: 90000 + +input: + objective: "Click the \"Start\" button to trigger dynamic content loading" + reasoning: "Testing interaction with dynamically loaded content" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Found and clicked the Start button" + - "Handled the dynamic loading process" + - "Recognized that content changes after clicking" + - "No timing issues with the dynamic content" + - "Successfully triggered the loading animation" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare screenshots to verify dynamic content loaded after clicking Start" + - "Check if loading animation or spinner was displayed" + - "Confirm new content appeared that was previously hidden" + - "Verify the Start button state changed or was replaced after clicking" + +metadata: + tags: ["action", "dynamic", "click", "ajax", "loading"] + priority: "high" + timeout: 90000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/ecommerce-001.yaml b/evals/data/action-agent/ecommerce-001.yaml new file mode 100644 index 0000000..ae573de --- /dev/null +++ b/evals/data/action-agent/ecommerce-001.yaml @@ -0,0 +1,46 @@ +# E-commerce action test +id: "ecommerce-001" +name: "Add Product to Cart" +description: "Test clicking \"Add to Cart\" button on an e-commerce product page" +enabled: true + +target: + url: "https://www.homedepot.com/p/Husky-20-Gal-Professional-Duty-Waterproof-Storage-Container-with-Hinged-Lid-in-Red-249160/313799634" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 180000 + +input: + objective: "Click the \"Add to Cart\" button for this storage container" + reasoning: "Testing e-commerce interaction with product cart functionality" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the Add to Cart button on the product page" + - "Successfully clicked the button" + - "Handled any popups or confirmations that appeared" + - "Verified the item was added (cart count changed or confirmation shown)" + - "Dealt with page dynamics after clicking" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare screenshots to verify the Add to Cart button was clicked" + - "Check if cart count indicator increased or shows the item was added" + - "Look for any confirmation popup or notification about the item being added" + - "Verify the button state changed (e.g., to \"Added to Cart\" or disabled)" + +metadata: + tags: ["action", "ecommerce", "click", "homedepot", "cart"] + priority: "high" + timeout: 180000 + retries: 3 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/error-001.yaml b/evals/data/action-agent/error-001.yaml new file mode 100644 index 0000000..a2b5646 --- /dev/null +++ b/evals/data/action-agent/error-001.yaml @@ -0,0 +1,47 @@ +# Error recovery test +id: "error-001" +name: "Handle Missing Element" +description: "Test agent behavior when target element is not found" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click the \"Sign Up\" button" + reasoning: "Testing error handling when element does not exist" + hint: "There is no Sign Up button on Google homepage - agent should 
handle gracefully" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Attempted to find the requested element" + - "Recognized that the element does not exist" + - "Provided clear error message or explanation" + - "Did not crash or produce confusing output" + - "Suggested alternatives or explained the issue" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the page remains in a stable state despite the missing element" + - "Confirm no error dialogs or broken UI elements appeared" + - "Check that the agent handled the missing element gracefully" + - "Ensure the page was properly analyzed even though the target was not found" + +metadata: + tags: ["action", "error-handling", "missing-element", "recovery", "edge-case"] + priority: "high" + timeout: 60000 + retries: 1 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/filter-001.yaml b/evals/data/action-agent/filter-001.yaml new file mode 100644 index 0000000..7efa8f1 --- /dev/null +++ b/evals/data/action-agent/filter-001.yaml @@ -0,0 +1,46 @@ +# Search filter application test +id: "filter-001" +name: "Apply Search Filters" +description: "Test applying search filters to modify results" +enabled: true + +target: + url: "https://www.w3schools.com/howto/howto_js_filter_lists.asp" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Type \"Anna\" in the search filter to filter the list" + reasoning: "Testing search filter application" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the search filter input" + - "Typed \"Anna\" in the filter field" + - "List items filtered to show only matching results" + - "Non-matching items were hidden or removed from view" + - "Filter functionality worked as expected" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify search input contains \"Anna\"" + - "Check if list shows only items containing \"Anna\"" + - "Confirm non-matching items are not visible" + - "Ensure filter functionality reduced the visible list items" + +metadata: + tags: ["action", "filter", "search", "list", "dynamic"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/form-001.yaml b/evals/data/action-agent/form-001.yaml new file mode 100644 index 0000000..c4f06da --- /dev/null +++ b/evals/data/action-agent/form-001.yaml @@ -0,0 +1,46 @@ +# Form fill action test +id: "form-001" +name: "Fill Search Query" +description: "Test filling a search input field with specific text" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 45000 + +input: + objective: "Fill the search box with \"Chrome DevTools automation testing\"" + reasoning: "Testing form input capability with a specific search query" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Successfully identified the search input field" + - "Used perform_action with fill method" + - "Correctly filled the field with the specified text" + - "Verified the field accepted the input" + - "No formatting or encoding issues with the text" + visual_verification: + enabled: true + capture_before: true + capture_after: 
true + prompts: + - "Compare screenshots to confirm text was entered in the search field" + - "Verify the exact text \"Chrome DevTools automation testing\" is visible" + - "Check if search suggestions or autocomplete dropdown appeared" + - "Ensure no input validation errors are shown" + +metadata: + tags: ["action", "form-fill", "input", "google", "basic"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/hover-001.yaml b/evals/data/action-agent/hover-001.yaml new file mode 100644 index 0000000..a58b225 --- /dev/null +++ b/evals/data/action-agent/hover-001.yaml @@ -0,0 +1,46 @@ +# Hover action test +id: "hover-001" +name: "Hover to Reveal Menu" +description: "Test hovering over an element to reveal hidden content" +enabled: true + +target: + url: "https://the-internet.herokuapp.com/hovers" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Hover over the first user avatar image to reveal the hidden caption" + reasoning: "Testing hover interaction to reveal dynamic content" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the first user avatar image" + - "Used appropriate hover action method" + - "Successfully triggered the hover state" + - "Hidden caption became visible after hover" + - "Handled mouse interaction correctly" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare screenshots to verify hover revealed hidden content" + - "Check that caption or overlay appeared over the first avatar" + - "Confirm the hover state is visually active on the image" + - "Verify user information or caption text is now visible" + +metadata: + tags: ["action", "hover", "mouse", "dynamic", "reveal"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/keyboard-001.yaml b/evals/data/action-agent/keyboard-001.yaml new file mode 100644 index 0000000..6a1ffd1 --- /dev/null +++ b/evals/data/action-agent/keyboard-001.yaml @@ -0,0 +1,46 @@ +# Keyboard tab navigation test +id: "keyboard-001" +name: "Keyboard Tab Navigation" +description: "Test using keyboard navigation to move between elements" +enabled: true + +target: + url: "https://www.w3.org/WAI/ARIA/apg/patterns/menubar/examples/menubar-navigation/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Use Tab key to navigate between menu items and Enter to activate" + reasoning: "Testing keyboard-only navigation patterns" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Successfully used keyboard navigation" + - "Tab key moved focus between menu items" + - "Focus indicators were visible during navigation" + - "Enter key activated the focused menu item" + - "Keyboard navigation followed accessibility standards" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify focus indicators are visible on menu items" + - "Check if keyboard navigation moved focus correctly" + - "Confirm Enter key activated the focused item" + - "Ensure accessibility navigation patterns worked" + +metadata: + tags: ["action", "keyboard", "navigation", "accessibility", "focus"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false 
+ owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/login-001.yaml b/evals/data/action-agent/login-001.yaml new file mode 100644 index 0000000..b56fbca --- /dev/null +++ b/evals/data/action-agent/login-001.yaml @@ -0,0 +1,47 @@ +# Login form test +id: "login-001" +name: "Fill Login Credentials" +description: "Test filling username and password fields in a login form" +enabled: true + +target: + url: "https://the-internet.herokuapp.com/login" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Fill the username field with \"tomsmith\" and password field with \"SuperSecretPassword!\"" + reasoning: "Testing form fill with multiple fields including password type" + input_data: "tomsmithSuperSecretPassword!" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Identified both username and password fields" + - "Filled username field with correct value" + - "Filled password field with correct value" + - "Handled password field type appropriately" + - "Used the provided input_data XML format correctly" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the username field shows \"tomsmith\" entered" + - "Confirm the password field has dots/asterisks indicating password entry" + - "Check that both fields are properly filled before submission" + - "Ensure no validation errors are shown for the filled fields" + +metadata: + tags: ["action", "login", "form-fill", "authentication", "multi-field"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/modal-001.yaml b/evals/data/action-agent/modal-001.yaml new file mode 100644 index 0000000..ef05d16 --- /dev/null +++ b/evals/data/action-agent/modal-001.yaml @@ -0,0 +1,46 @@ +# Modal dialog test +id: "modal-001" +name: "Open and Close Modal" +description: "Test opening modal dialog and closing it with X button" +enabled: true + +target: + url: "https://getbootstrap.com/docs/5.0/components/modal/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click to open the modal dialog, then close it using the X button" + reasoning: "Testing modal dialog interaction patterns" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located and clicked the modal trigger button" + - "Modal dialog opened successfully" + - "Modal content was visible and accessible" + - "Found and clicked the close (X) button" + - "Modal closed and page returned to normal state" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify modal opened with visible content" + - "Check if modal overlay appeared correctly" + - "Confirm modal was closed after clicking X" + - "Ensure page background is accessible again" + +metadata: + tags: ["action", "modal", "dialog", "popup", "overlay"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/multiselect-001.yaml b/evals/data/action-agent/multiselect-001.yaml new file mode 100644 index 0000000..a456c9b --- /dev/null +++ b/evals/data/action-agent/multiselect-001.yaml @@ -0,0 +1,46 @@ +# Multi-select dropdown test +id: "multiselect-001" +name: "Select Multiple Options" +description: "Test 
selecting multiple options from a multi-select dropdown" +enabled: true + +target: + url: "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_select_multiple" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Select both \"Volvo\" and \"Audi\" from the multi-select dropdown" + reasoning: "Testing multiple selection in select elements" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the multi-select dropdown element" + - "Successfully selected Volvo option" + - "Successfully selected Audi option" + - "Both options remain selected simultaneously" + - "Used appropriate multi-select interaction method" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify both Volvo and Audi appear selected" + - "Check if both options are highlighted/marked" + - "Confirm multi-select functionality worked correctly" + - "Ensure no other options were accidentally selected" + +metadata: + tags: ["action", "multi-select", "dropdown", "form", "multiple"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/multistep-001.yaml b/evals/data/action-agent/multistep-001.yaml new file mode 100644 index 0000000..14923a2 --- /dev/null +++ b/evals/data/action-agent/multistep-001.yaml @@ -0,0 +1,47 @@ +# Multi-step form test +id: "multistep-001" +name: "Complete Search and Submit" +description: "Test filling a search form and then clicking the submit button" +enabled: true + +target: + url: "https://www.bing.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Fill the search box with \"automated testing tools\" and then click the search button" + reasoning: "Testing multi-step form interaction combining fill and click actions" + hint: "This requires two actions: first fill the search field, then click the search button" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Recognized this requires multiple actions" + - "First filled the search input correctly" + - "Then located and clicked the search button" + - "Both actions completed successfully in sequence" + - "Search was initiated with the correct query" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the search input contains \"automated testing tools\" text" + - "Confirm the search was submitted and results page loaded" + - "Check that search results are related to the query" + - "Ensure the multi-step action completed fully with both fill and click" + +metadata: + tags: ["action", "multi-step", "form-fill", "click", "bing", "search"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/nav-001.yaml b/evals/data/action-agent/nav-001.yaml new file mode 100644 index 0000000..e1ef610 --- /dev/null +++ b/evals/data/action-agent/nav-001.yaml @@ -0,0 +1,46 @@ +# Complex navigation test +id: "nav-001" +name: "Navigate via Menu Click" +description: "Test clicking navigation menu items to navigate between pages" +enabled: true + +target: + url: "https://www.wikipedia.org" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click on the \"English\" language link 
to navigate to English Wikipedia" + reasoning: "Testing navigation through link clicks on a multilingual site" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Identified the correct language link among many options" + - "Successfully clicked the English link" + - "Navigation occurred to the English Wikipedia" + - "Used appropriate tools to verify navigation success" + - "Handled the multilingual page structure correctly" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare screenshots to verify navigation from Wikipedia homepage to English Wikipedia" + - "Check if the page language and content changed to English" + - "Verify the URL changed to en.wikipedia.org" + - "Confirm the English Wikipedia main page is displayed" + +metadata: + tags: ["action", "navigation", "click", "wikipedia", "multilingual"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/radio-001.yaml b/evals/data/action-agent/radio-001.yaml new file mode 100644 index 0000000..a136e1e --- /dev/null +++ b/evals/data/action-agent/radio-001.yaml @@ -0,0 +1,47 @@ +# Radio button selection test +id: "radio-001" +name: "Select Radio Button Option" +description: "Test selecting a specific radio button option using click method" +enabled: true + +target: + url: "https://httpbin.org/forms/post" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 45000 + +input: + objective: "Select the \"Medium\" pizza size from the Pizza Size radio button group" + reasoning: "Testing radio button selection functionality" + hint: "Look for the Medium radio button in the Pizza Size section and click it to select" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the Medium radio button in the Pizza Size section" + - "Successfully clicked the Medium radio button" + - "Radio button became selected (checked state)" + - "Other radio buttons in the same group became unselected" + - "Form maintained its structure after radio button selection" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the Medium radio button is now selected (shows filled circle)" + - "Check that other pizza size options (Small, Large) are no longer selected" + - "Confirm the form structure remained intact" + - "Ensure the Medium pizza size radio button was specifically targeted" + +metadata: + tags: ["action", "radio", "click", "form", "httpbin"] + priority: "high" + timeout: 45000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/slider-001.yaml b/evals/data/action-agent/slider-001.yaml new file mode 100644 index 0000000..9369671 --- /dev/null +++ b/evals/data/action-agent/slider-001.yaml @@ -0,0 +1,46 @@ +# Range slider test +id: "slider-001" +name: "Adjust Range Slider" +description: "Test moving slider to set a specific value" +enabled: true + +target: + url: "https://jqueryui.com/slider/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Move the slider to set the value to 75" + reasoning: "Testing slider/range input manipulation" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the slider control element" + - "Successfully 
moved the slider handle" + - "Set the slider value to approximately 75" + - "Slider position reflects the target value" + - "Any associated display shows the correct value" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify slider handle moved to represent value 75" + - "Check if value display shows 75 or close to it" + - "Confirm slider position visually matches target" + - "Ensure slider interaction was smooth and successful" + +metadata: + tags: ["action", "slider", "range", "form", "drag"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/tableselect-001.yaml b/evals/data/action-agent/tableselect-001.yaml new file mode 100644 index 0000000..b38341e --- /dev/null +++ b/evals/data/action-agent/tableselect-001.yaml @@ -0,0 +1,46 @@ +# Table row selection test +id: "tableselect-001" +name: "Select Table Row" +description: "Test clicking to select a table row" +enabled: true + +target: + url: "https://datatables.net/examples/api/select_single_row.html" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click on the first row to select it" + reasoning: "Testing table row selection patterns" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the first table row" + - "Successfully clicked the row" + - "Row became highlighted/selected" + - "Selection state is visually apparent" + - "Only one row is selected at a time" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the first row is now highlighted/selected" + - "Check if row selection visual feedback is clear" + - "Confirm only the clicked row is selected" + - "Ensure row selection styling is properly applied" + +metadata: + tags: ["action", "table", "select", "row", "highlight"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/tablesort-001.yaml b/evals/data/action-agent/tablesort-001.yaml new file mode 100644 index 0000000..32695c7 --- /dev/null +++ b/evals/data/action-agent/tablesort-001.yaml @@ -0,0 +1,46 @@ +# Table column sorting test +id: "tablesort-001" +name: "Sort Table Column" +description: "Test clicking table column header to sort data" +enabled: true + +target: + url: "https://datatables.net/examples/basic_init/zero_configuration.html" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click on the \"Name\" column header to sort the table by name" + reasoning: "Testing table column sorting interaction" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the Name column header" + - "Successfully clicked the column header" + - "Table data reordered by name alphabetically" + - "Sort indicator appeared on the Name column" + - "Table sorting completed without errors" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify table rows are now sorted alphabetically by name" + - "Check if sort arrow/indicator appears on Name column" + - "Confirm the data order changed from before to after" + - "Ensure table structure remained intact after sorting" + +metadata: + tags: ["action", "table", "sort", "column", "data"] + priority: 
"high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/tabs-001.yaml b/evals/data/action-agent/tabs-001.yaml new file mode 100644 index 0000000..1079266 --- /dev/null +++ b/evals/data/action-agent/tabs-001.yaml @@ -0,0 +1,46 @@ +# Tab panel navigation test +id: "tabs-001" +name: "Navigate Tab Panels" +description: "Test clicking tab to switch between tab panels" +enabled: true + +target: + url: "https://jqueryui.com/tabs/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click on the \"Nunc tincidunt\" tab to switch to that panel" + reasoning: "Testing tab panel navigation" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the \"Nunc tincidunt\" tab button" + - "Successfully clicked the tab" + - "Tab panel content switched to the selected tab" + - "Active tab visual state changed appropriately" + - "Content area updated to show the new panel" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the \"Nunc tincidunt\" tab is now active/highlighted" + - "Check if the content panel changed to show new content" + - "Confirm the tab switching animation completed" + - "Ensure the correct tab content is visible" + +metadata: + tags: ["action", "tabs", "navigation", "panels", "ui"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/timepicker-001.yaml b/evals/data/action-agent/timepicker-001.yaml new file mode 100644 index 0000000..cbc5742 --- /dev/null +++ b/evals/data/action-agent/timepicker-001.yaml @@ -0,0 +1,46 @@ +# Time picker test +id: "timepicker-001" +name: "Select Time from Picker" +description: "Test setting time using time picker controls" +enabled: true + +target: + url: "https://timepicker.co/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Set the time to 2:30 PM using the time picker controls" + reasoning: "Testing time selection with hour/minute controls" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the time picker interface" + - "Set the hour to 2 (14 for 24-hour format)" + - "Set the minutes to 30" + - "Selected PM or appropriate time format" + - "Time input shows 2:30 PM or equivalent" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the time input displays 2:30 PM or 14:30" + - "Check if hour and minute were set correctly" + - "Confirm AM/PM selection if applicable" + - "Ensure the time picker interface was properly used" + +metadata: + tags: ["action", "timepicker", "time", "form", "clock"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/upload-001.yaml b/evals/data/action-agent/upload-001.yaml new file mode 100644 index 0000000..d5c276c --- /dev/null +++ b/evals/data/action-agent/upload-001.yaml @@ -0,0 +1,46 @@ +# File upload test +id: "upload-001" +name: "Upload File via Input" +description: "Test clicking file input and uploading a test file" +enabled: true + +target: + url: "https://the-internet.herokuapp.com/upload" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: 
+ objective: "Click the file input and upload a test file" + reasoning: "Testing file upload interaction through input elements" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the file input element" + - "Triggered file selection dialog" + - "Selected a file for upload" + - "File name appears in the input field" + - "Upload process initiated successfully" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify file name appears in the upload input field" + - "Check if file selection was successful" + - "Confirm upload button is available or file is ready" + - "Ensure no upload errors are displayed" + +metadata: + tags: ["action", "upload", "file", "input", "form"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/video-001.yaml b/evals/data/action-agent/video-001.yaml new file mode 100644 index 0000000..17c76be --- /dev/null +++ b/evals/data/action-agent/video-001.yaml @@ -0,0 +1,47 @@ +# Video playback controls test +id: "video-001" +name: "Control Video Playback" +description: "Test starting video playback using click + spacebar" +enabled: true + +target: + url: "https://www.w3schools.com/html/html5_video.asp" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 90000 + +input: + objective: "Click the video element to focus it, then press spacebar to start playback" + reasoning: "Testing video control using standard keyboard interaction (click to focus + spacebar to play)" + hint: "First click the Video element to focus it, then use keyboard input to press the spacebar key to start playback" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the Video element in the accessibility tree" + - "Successfully clicked the Video element to focus it" + - "Used keyboard input to press spacebar" + - "Video playback started after spacebar press" + - "No errors occurred during the interaction sequence" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify video player is visible on the page" + - "Check if the play button was clicked (may show pause button after)" + - "Look for visual indicators that video started playing" + - "Ensure no error messages appeared during video interaction" + +metadata: + tags: ["action", "video", "media", "controls", "playback"] + priority: "high" + timeout: 90000 + retries: 3 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/action-agent/video-002.yaml b/evals/data/action-agent/video-002.yaml new file mode 100644 index 0000000..b20014c --- /dev/null +++ b/evals/data/action-agent/video-002.yaml @@ -0,0 +1,47 @@ +# Video play button specific targeting test +id: "video-002" +name: "Click Video Play Button Specifically" +description: "Test clicking the specific play button (not the video element)" +enabled: true + +target: + url: "https://www.w3schools.com/html/html5_video.asp" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Find and click the button that has name=\"play\" (not the Video element itself)" + reasoning: "Testing specific targeting of the play button element" + hint: "Target the button element with text or label \"play\", do not click the Video element" + +validation: + type: "llm-judge" + 
llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Found a button element (not Video element) with \"play\" in the name" + - "Successfully clicked the play button specifically" + - "Did not click on the Video element itself" + - "Play button click was executed correctly" + - "Video responded to the button click" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the play button (not video element) was clicked" + - "Check if video started playing after button click" + - "Confirm the target was the button, not the video container" + - "Look for changes in video player state" + +metadata: + tags: ["action", "video", "button", "specific-targeting"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/config.yaml b/evals/data/config.yaml new file mode 100644 index 0000000..3968421 --- /dev/null +++ b/evals/data/config.yaml @@ -0,0 +1,11 @@ +# model: +# main_model: "deepseek-r1:14b" +# mini_model: "deepseek-r1:14b" +# nano_model: "deepseek-r1:14b" +# provider: "litellm" + +model: + main_model: "gpt-4.1" + mini_model: "gpt-4.1-mini" + nano_model: "gpt-4.1-nano" + provider: "openai" \ No newline at end of file diff --git a/evals/data/end-to-end/b-vitamins-research-001.yaml b/evals/data/end-to-end/b-vitamins-research-001.yaml new file mode 100644 index 0000000..746ead6 --- /dev/null +++ b/evals/data/end-to-end/b-vitamins-research-001.yaml @@ -0,0 +1,35 @@ +# B-Vitamins Research - End-to-End Test +id: "b-vitamins-research-001" +name: "B-Vitamins Supplementation Research" +description: "End-to-end test for comprehensive B-vitamins research using chat interface" +enabled: true + +tool: "chat" +timeout: 600000 + +input: + message: "Research everything on the supplementation of B-vitamins for adults. I need: types of vitamins, available forms and their advantages, dosage and safety" + reasoning: "End-to-end test validating complete user workflow with dynamic tool usage for health research" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Covers all B-vitamin types (B1, B2, B3, B5, B6, B7, B9, B12) comprehensively" + - "Explains different forms of each vitamin and their advantages" + - "Provides appropriate dosage recommendations for adults" + - "Discusses safety considerations and potential side effects" + - "Information is accurate and from reliable health sources" + - "Response is well-organized and easy to understand" + - "Demonstrates intelligent tool selection for health research" + - "Shows complete workflow from request to comprehensive result" + +metadata: + tags: ["end-to-end", "chat", "health", "vitamins", "research", "user-workflow"] + priority: "medium" + timeout: 300000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/end-to-end/investment-research-001.yaml b/evals/data/end-to-end/investment-research-001.yaml new file mode 100644 index 0000000..72014df --- /dev/null +++ b/evals/data/end-to-end/investment-research-001.yaml @@ -0,0 +1,35 @@ +# Renewable Energy Stocks Research - End-to-End Test +id: "investment-research-001" +name: "Renewable Energy Stocks Research" +description: "End-to-end test for investment research using chat interface" +enabled: true + +tool: "chat" +timeout: 600000 + +input: + message: "Research renewable energy stocks for potential investment. 
Focus on solar and wind companies with market cap over $1B." + reasoning: "End-to-end test validating financial research workflow with dynamic tool usage" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Identifies specific solar and wind energy companies" + - "Confirms companies have market cap over $1 billion" + - "Provides relevant financial metrics and data" + - "Includes business descriptions and growth prospects" + - "Discusses investment considerations and risks" + - "Information appears current and from reliable sources" + - "Demonstrates intelligent financial research tool usage" + - "Shows complete workflow from request to investment analysis" + +metadata: + tags: ["end-to-end", "chat", "investment", "stocks", "renewable-energy", "financial", "user-workflow"] + priority: "medium" + timeout: 300000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/end-to-end/product-comparison-001.yaml b/evals/data/end-to-end/product-comparison-001.yaml new file mode 100644 index 0000000..1363a09 --- /dev/null +++ b/evals/data/end-to-end/product-comparison-001.yaml @@ -0,0 +1,40 @@ +# Headphones Comparison - End-to-End Test +id: "product-comparison-001" +name: "Noise-Canceling Headphones Comparison" +description: "End-to-end test for product research and comparison using chat interface" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "chat" +timeout: 300000 + +input: + message: "Compare the top 3 noise-canceling headphones under $300. Include features, pros/cons, and where to buy them." + reasoning: "End-to-end test validating product comparison workflow with dynamic tool usage" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Identifies 3 specific noise-canceling headphones under $300" + - "Provides detailed feature comparison for each model" + - "Lists pros and cons for each headphone clearly" + - "Includes pricing information and purchase locations" + - "Comparison is fair and based on objective criteria" + - "Information appears current and accurate" + - "Demonstrates intelligent research and extraction tool usage" + - "Shows complete workflow from request to actionable buying guide" + +metadata: + tags: ["end-to-end", "chat", "product", "comparison", "headphones", "shopping", "user-workflow"] + priority: "medium" + timeout: 300000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/end-to-end/recipe-nutrition-001.yaml b/evals/data/end-to-end/recipe-nutrition-001.yaml new file mode 100644 index 0000000..ef8b0f0 --- /dev/null +++ b/evals/data/end-to-end/recipe-nutrition-001.yaml @@ -0,0 +1,40 @@ +# Healthy Recipe Search - End-to-End Test +id: "recipe-nutrition-001" +name: "Healthy Family Dinner Recipes" +description: "End-to-end test for recipe search with nutrition criteria using chat interface" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "chat" +timeout: 300000 + +input: + message: "Find me 3 healthy dinner recipes for a family of 4 that are under 500 calories per serving and take less than 30 minutes to prepare." 
+ reasoning: "End-to-end test validating recipe search workflow with specific nutritional and time criteria" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Provides exactly 3 dinner recipes suitable for family of 4" + - "Each recipe is under 500 calories per serving" + - "All recipes can be prepared in under 30 minutes" + - "Includes ingredient lists and cooking instructions" + - "Nutritional information is provided or estimated" + - "Recipes are practical and family-friendly" + - "Demonstrates intelligent recipe search and analysis" + - "Shows complete workflow from request to actionable meal plan" + +metadata: + tags: ["end-to-end", "chat", "recipes", "nutrition", "healthy", "family", "user-workflow"] + priority: "medium" + timeout: 300000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/end-to-end/travel-planning-001.yaml b/evals/data/end-to-end/travel-planning-001.yaml new file mode 100644 index 0000000..401f8b1 --- /dev/null +++ b/evals/data/end-to-end/travel-planning-001.yaml @@ -0,0 +1,40 @@ +# Barcelona Travel Planning - End-to-End Test +id: "travel-planning-001" +name: "Barcelona Trip Planning" +description: "End-to-end test for comprehensive travel planning using chat interface" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "chat" +timeout: 300000 + +input: + message: "Help me plan a 3-day trip to Barcelona. I need flight options from New York, hotel recommendations in the city center, and top 5 attractions to visit." + reasoning: "End-to-end test validating complete travel planning workflow with dynamic tool usage" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Provides realistic flight options from New York to Barcelona" + - "Recommends hotels in Barcelona city center with details" + - "Lists top 5 attractions in Barcelona with descriptions" + - "Information is current and practically useful for trip planning" + - "Includes relevant details like prices, locations, or booking info" + - "Response is well-organized into clear sections" + - "Demonstrates multi-tool usage for comprehensive planning" + - "Shows complete workflow from request to actionable itinerary" + +metadata: + tags: ["end-to-end", "chat", "travel", "planning", "barcelona", "user-workflow"] + priority: "medium" + timeout: 300000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/research-agent/basic-001.yaml b/evals/data/research-agent/basic-001.yaml new file mode 100644 index 0000000..fcd0086 --- /dev/null +++ b/evals/data/research-agent/basic-001.yaml @@ -0,0 +1,39 @@ +# Basic research test - stable topic with clear sources +id: "basic-001" +name: "Research Chrome DevTools History" +description: "Research the history and development of Chrome DevTools" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "research_agent" +timeout: 180000 + +input: + query: "History and development of Chrome DevTools browser developer tools" + reasoning: "Testing basic research capabilities on a well-documented technical topic" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0 + criteria: + - "Research covers the origins and early development of Chrome DevTools" + - "Information includes key milestones and major feature additions" + - 
"Sources include official documentation or reliable technical sources" + - "At least 3-5 different sources were consulted" + - "Information is factually accurate and up-to-date" + - "Research demonstrates understanding of the topic evolution" + - "Handoff to content_writer_agent occurred with comprehensive data" + +metadata: + tags: ["basic", "technical", "stable", "documentation"] + priority: "high" + timeout: 180000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/research-agent/business-001.yaml b/evals/data/research-agent/business-001.yaml new file mode 100644 index 0000000..7558120 --- /dev/null +++ b/evals/data/research-agent/business-001.yaml @@ -0,0 +1,39 @@ +# Business research test +id: "business-001" +name: "Research Remote Work Productivity" +description: "Research remote work impact on productivity and business outcomes" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "research_agent" +timeout: 240000 + +input: + query: "Remote work productivity statistics impact business outcomes 2024 studies" + reasoning: "Testing business research requiring statistical data and multiple perspectives" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Research includes statistical data and survey results" + - "Covers multiple perspectives (employee, employer, industry)" + - "Sources include business publications, research studies, and reports" + - "Information addresses both positive and negative impacts" + - "Data is recent and relevant to current work trends" + - "Research demonstrates understanding of business implications" + - "Statistics and claims are properly sourced" + +metadata: + tags: ["business", "statistics", "workplace", "comprehensive"] + priority: "high" + timeout: 240000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/research-agent/comparison-001.yaml b/evals/data/research-agent/comparison-001.yaml new file mode 100644 index 0000000..a9aa22b --- /dev/null +++ b/evals/data/research-agent/comparison-001.yaml @@ -0,0 +1,39 @@ +# Comparative research test +id: "comparison-001" +name: "Compare JavaScript vs TypeScript" +description: "Research and compare JavaScript and TypeScript for web development" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "research_agent" +timeout: 200000 + +input: + query: "JavaScript vs TypeScript comparison web development pros cons differences" + reasoning: "Testing comparative research requiring balanced analysis of multiple options" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Research covers both JavaScript and TypeScript comprehensively" + - "Includes clear comparison points (syntax, features, ecosystem)" + - "Presents advantages and disadvantages of each language" + - "Sources include technical documentation and developer resources" + - "Information is balanced and objective, not biased toward one option" + - "Demonstrates understanding of use cases for each language" + - "Research data is well-organized for comparative analysis" + +metadata: + tags: ["comparison", "technical", "programming", "balanced"] + priority: "high" + timeout: 200000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/research-agent/current-001.yaml 
b/evals/data/research-agent/current-001.yaml new file mode 100644 index 0000000..6878868 --- /dev/null +++ b/evals/data/research-agent/current-001.yaml @@ -0,0 +1,40 @@ +# Current events research test +id: "current-001" +name: "Research Latest AI Development Trends" +description: "Research recent developments in AI and machine learning (last 6 months)" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "research_agent" +timeout: 240000 + +input: + query: "Latest AI artificial intelligence developments breakthroughs 2024 2025" + reasoning: "Testing research on current events and rapidly evolving topics" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + include_url: true + criteria: + - "Research focuses on recent developments (within last 6 months)" + - "Covers multiple aspects of AI development (models, applications, research)" + - "Sources are current and from reputable news or research outlets" + - "Information includes specific examples or case studies" + - "Demonstrates ability to identify current trends vs older information" + - "Successfully gathered information from diverse source types" + - "Data is properly organized for content writer handoff" + +metadata: + tags: ["current-events", "ai", "dynamic", "trends"] + priority: "high" + timeout: 240000 + retries: 1 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/research-agent/edge-001.yaml b/evals/data/research-agent/edge-001.yaml new file mode 100644 index 0000000..d75c2bf --- /dev/null +++ b/evals/data/research-agent/edge-001.yaml @@ -0,0 +1,39 @@ +# No-results edge case test +id: "edge-001" +name: "Research Obscure Fictional Topic" +description: "Test handling of queries with very limited or no reliable sources" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "research_agent" +timeout: 180000 + +input: + query: "quantum bluetooth watermelon encryption algorithm 2024" + reasoning: "Testing edge case handling when query yields no meaningful results" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Agent recognizes when query yields limited or unreliable results" + - "Demonstrates appropriate search strategy modification" + - "Does not fabricate information when sources are unavailable" + - "Gracefully handles lack of substantive results" + - "Still attempts handoff to content writer with available information" + - "Maintains professional approach despite limited data" + - "Shows appropriate uncertainty when information is sparse" + +metadata: + tags: ["edge-case", "no-results", "error-handling", "fictional"] + priority: "high" + timeout: 180000 + retries: 1 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/research-agent/research-agent-basic-001.yaml b/evals/data/research-agent/research-agent-basic-001.yaml new file mode 100644 index 0000000..85743d5 --- /dev/null +++ b/evals/data/research-agent/research-agent-basic-001.yaml @@ -0,0 +1,39 @@ +# Basic research test - stable topic with clear sources +id: "research-agent-basic-001" +name: "Research Chrome DevTools History" +description: "Research the history and development of Chrome DevTools" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "research_agent" +timeout: 180000 + +input: + query: "History and 
development of Chrome DevTools browser developer tools" + reasoning: "Testing basic research capabilities on a well-documented technical topic" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0 + criteria: + - "Research covers the origins and early development of Chrome DevTools" + - "Information includes key milestones and major feature additions" + - "Sources include official documentation or reliable technical sources" + - "At least 3-5 different sources were consulted" + - "Information is factually accurate and up-to-date" + - "Research demonstrates understanding of the topic evolution" + - "Handoff to content_writer_agent occurred with comprehensive data" + +metadata: + tags: ["basic", "technical", "stable", "documentation"] + priority: "high" + timeout: 180000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/research-agent/research-agent-business-001.yaml b/evals/data/research-agent/research-agent-business-001.yaml new file mode 100644 index 0000000..defeed1 --- /dev/null +++ b/evals/data/research-agent/research-agent-business-001.yaml @@ -0,0 +1,39 @@ +# Business research test +id: "research-agent-business-001" +name: "Research Remote Work Productivity" +description: "Research remote work impact on productivity and business outcomes" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "research_agent" +timeout: 240000 + +input: + query: "Remote work productivity statistics impact business outcomes 2024 studies" + reasoning: "Testing business research requiring statistical data and multiple perspectives" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Research includes statistical data and survey results" + - "Covers multiple perspectives (employee, employer, industry)" + - "Sources include business publications, research studies, and reports" + - "Information addresses both positive and negative impacts" + - "Data is recent and relevant to current work trends" + - "Research demonstrates understanding of business implications" + - "Statistics and claims are properly sourced" + +metadata: + tags: ["business", "statistics", "workplace", "comprehensive"] + priority: "high" + timeout: 240000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/research-agent/research-agent-comparison-001.yaml b/evals/data/research-agent/research-agent-comparison-001.yaml new file mode 100644 index 0000000..a433a58 --- /dev/null +++ b/evals/data/research-agent/research-agent-comparison-001.yaml @@ -0,0 +1,39 @@ +# Comparative research test +id: "research-agent-comparison-001" +name: "Compare JavaScript vs TypeScript" +description: "Research and compare JavaScript and TypeScript for web development" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "research_agent" +timeout: 200000 + +input: + query: "JavaScript vs TypeScript comparison web development pros cons differences" + reasoning: "Testing comparative research requiring balanced analysis of multiple options" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Research covers both JavaScript and TypeScript comprehensively" + - "Includes clear comparison points (syntax, features, ecosystem)" + - "Presents advantages and disadvantages of each language" + - "Sources 
include technical documentation and developer resources" + - "Information is balanced and objective, not biased toward one option" + - "Demonstrates understanding of use cases for each language" + - "Research data is well-organized for comparative analysis" + +metadata: + tags: ["comparison", "technical", "programming", "balanced"] + priority: "high" + timeout: 200000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/research-agent/research-agent-current-001.yaml b/evals/data/research-agent/research-agent-current-001.yaml new file mode 100644 index 0000000..198c981 --- /dev/null +++ b/evals/data/research-agent/research-agent-current-001.yaml @@ -0,0 +1,40 @@ +# Current events research test +id: "research-agent-current-001" +name: "Research Latest AI Development Trends" +description: "Research recent developments in AI and machine learning (last 6 months)" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "research_agent" +timeout: 240000 + +input: + query: "Latest AI artificial intelligence developments breakthroughs 2024 2025" + reasoning: "Testing research on current events and rapidly evolving topics" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + include_url: true + criteria: + - "Research focuses on recent developments (within last 6 months)" + - "Covers multiple aspects of AI development (models, applications, research)" + - "Sources are current and from reputable news or research outlets" + - "Information includes specific examples or case studies" + - "Demonstrates ability to identify current trends vs older information" + - "Successfully gathered information from diverse source types" + - "Data is properly organized for content writer handoff" + +metadata: + tags: ["current-events", "ai", "dynamic", "trends"] + priority: "high" + timeout: 240000 + retries: 1 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/research-agent/research-agent-edge-001.yaml b/evals/data/research-agent/research-agent-edge-001.yaml new file mode 100644 index 0000000..234c832 --- /dev/null +++ b/evals/data/research-agent/research-agent-edge-001.yaml @@ -0,0 +1,39 @@ +# No-results edge case test +id: "research-agent-edge-001" +name: "Research Obscure Fictional Topic" +description: "Test handling of queries with very limited or no reliable sources" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "research_agent" +timeout: 180000 + +input: + query: "quantum bluetooth watermelon encryption algorithm 2024" + reasoning: "Testing edge case handling when query yields no meaningful results" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Agent recognizes when query yields limited or unreliable results" + - "Demonstrates appropriate search strategy modification" + - "Does not fabricate information when sources are unavailable" + - "Gracefully handles lack of substantive results" + - "Still attempts handoff to content writer with available information" + - "Maintains professional approach despite limited data" + - "Shows appropriate uncertainty when information is sparse" + +metadata: + tags: ["edge-case", "no-results", "error-handling", "fictional"] + priority: "high" + timeout: 180000 + retries: 1 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git 
a/evals/data/research-agent/research-agent-technical-001.yaml b/evals/data/research-agent/research-agent-technical-001.yaml new file mode 100644 index 0000000..c5e2540 --- /dev/null +++ b/evals/data/research-agent/research-agent-technical-001.yaml @@ -0,0 +1,39 @@ +# Deep technical research test +id: "research-agent-technical-001" +name: "Research WebAssembly Performance" +description: "Deep dive research into WebAssembly performance characteristics and use cases" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "research_agent" +timeout: 900000 + +input: + query: "WebAssembly WASM performance benchmarks use cases implementation details" + reasoning: "Testing deep technical research requiring specialized knowledge synthesis" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Research covers technical details of WebAssembly architecture" + - "Includes performance benchmarks and comparison data" + - "Discusses practical use cases and implementation scenarios" + - "Sources include technical specifications, benchmarks, and expert analysis" + - "Information demonstrates deep understanding of the technology" + - "Research addresses both benefits and limitations" + - "Technical accuracy is maintained throughout" + +metadata: + tags: ["technical", "deep-dive", "performance", "webassembly"] + priority: "high" + timeout: 900000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/research-agent/research-agent-tools-001.yaml b/evals/data/research-agent/research-agent-tools-001.yaml new file mode 100644 index 0000000..44da108 --- /dev/null +++ b/evals/data/research-agent/research-agent-tools-001.yaml @@ -0,0 +1,40 @@ +# Tool orchestration test - focuses on how well the agent uses available tools +id: "research-agent-tools-001" +name: "Research Python Framework Comparison" +description: "Research comparing Django vs Flask Python frameworks with focus on tool usage" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "research_agent" +timeout: 240000 + +input: + query: "Django vs Flask Python web framework comparison features performance" + reasoning: "Testing effective orchestration of navigation, extraction, and fetching tools" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Agent effectively used navigate_url to access search engines" + - "Schema-based extraction was used to gather structured search results" + - "Fetcher tool was used to collect content from multiple URLs" + - "Navigation strategy was logical and systematic" + - "Tool usage demonstrated purposeful research progression" + - "Information from different tools was effectively synthesized" + - "At least 3-5 different sources were accessed and processed" + - "Final handoff included comprehensive data from all tools" + +metadata: + tags: ["tool-orchestration", "systematic", "python", "frameworks"] + priority: "high" + timeout: 240000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/research-agent/technical-001.yaml b/evals/data/research-agent/technical-001.yaml new file mode 100644 index 0000000..f434081 --- /dev/null +++ b/evals/data/research-agent/technical-001.yaml @@ -0,0 +1,39 @@ +# Deep technical research test +id: "technical-001" +name: "Research WebAssembly Performance" +description: 
"Deep dive research into WebAssembly performance characteristics and use cases" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "research_agent" +timeout: 900000 + +input: + query: "WebAssembly WASM performance benchmarks use cases implementation details" + reasoning: "Testing deep technical research requiring specialized knowledge synthesis" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Research covers technical details of WebAssembly architecture" + - "Includes performance benchmarks and comparison data" + - "Discusses practical use cases and implementation scenarios" + - "Sources include technical specifications, benchmarks, and expert analysis" + - "Information demonstrates deep understanding of the technology" + - "Research addresses both benefits and limitations" + - "Technical accuracy is maintained throughout" + +metadata: + tags: ["technical", "deep-dive", "performance", "webassembly"] + priority: "high" + timeout: 900000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/research-agent/tools-001.yaml b/evals/data/research-agent/tools-001.yaml new file mode 100644 index 0000000..ae97430 --- /dev/null +++ b/evals/data/research-agent/tools-001.yaml @@ -0,0 +1,40 @@ +# Tool orchestration test - focuses on how well the agent uses available tools +id: "tools-001" +name: "Research Python Framework Comparison" +description: "Research comparing Django vs Flask Python frameworks with focus on tool usage" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "research_agent" +timeout: 240000 + +input: + query: "Django vs Flask Python web framework comparison features performance" + reasoning: "Testing effective orchestration of navigation, extraction, and fetching tools" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Agent effectively used navigate_url to access search engines" + - "Schema-based extraction was used to gather structured search results" + - "Fetcher tool was used to collect content from multiple URLs" + - "Navigation strategy was logical and systematic" + - "Tool usage demonstrated purposeful research progression" + - "Information from different tools was effectively synthesized" + - "At least 3-5 different sources were accessed and processed" + - "Final handoff included comprehensive data from all tools" + +metadata: + tags: ["tool-orchestration", "systematic", "python", "frameworks"] + priority: "high" + timeout: 240000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/schema-extractor/amazon-product-001.yaml b/evals/data/schema-extractor/amazon-product-001.yaml new file mode 100644 index 0000000..42e4738 --- /dev/null +++ b/evals/data/schema-extractor/amazon-product-001.yaml @@ -0,0 +1,78 @@ +# E-commerce product extraction test +id: "amazon-product-001" +name: "Extract Amazon Product Details" +description: "Extract product information from an Amazon product page" +enabled: true + +target: + url: "https://www.amazon.com/Obelisk-Climbing-Rustproof-Trellises-Clematis/dp/B0B4SBY6QD/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_data" +timeout: 60000 + +input: + schema: + type: "object" + properties: + product: + type: "object" + properties: + title: + type: "string" + brand: + type: "string" + price: + 
type: "object" + properties: + current: + type: "number" + currency: + type: "string" + rating: + type: "object" + properties: + average: + type: "number" + count: + type: "number" + images: + type: "array" + items: + type: "string" + format: "url" + features: + type: "array" + items: + type: "string" + required: + - "title" + - "price" + availability: + type: "string" + required: + - "product" + instruction: "Extract comprehensive product information including pricing, ratings, and key features" + reasoning: "Testing extraction from a dynamic e-commerce page with complex structure" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Product title is accurate and complete" + - "Price information is current and properly formatted" + - "Rating data includes both average and review count" + - "Image URLs are valid and accessible" + - "Key product features are captured" + - "All URLs are properly resolved (not node IDs)" + +metadata: + tags: ["ecommerce", "amazon", "product", "dynamic"] + priority: "high" + timeout: 60000 + retries: 3 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/schema-extractor/bbc-news-001.yaml b/evals/data/schema-extractor/bbc-news-001.yaml new file mode 100644 index 0000000..6843147 --- /dev/null +++ b/evals/data/schema-extractor/bbc-news-001.yaml @@ -0,0 +1,69 @@ +# News article extraction test +id: "bbc-news-001" +name: "Extract BBC News Article" +description: "Extract article content and metadata from a BBC News page" +enabled: true + +target: + url: "https://www.bbc.com/news/technology" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_data" +timeout: 30000 + +input: + schema: + type: "object" + properties: + headlines: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + summary: + type: "string" + url: + type: "string" + format: "url" + category: + type: "string" + required: + - "title" + mainStory: + type: "object" + properties: + headline: + type: "string" + summary: + type: "string" + url: + type: "string" + format: "url" + required: + - "headlines" + instruction: "Extract the main headlines and featured stories from the BBC Technology news section" + reasoning: "Testing extraction from a news aggregation page with multiple articles" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + include_url: true + criteria: + - "Headlines are current and relevant to technology news" + - "Article summaries provide meaningful context" + - "URLs link to valid BBC news articles" + - "Main story is properly identified" + - "All extracted content is in English" + +metadata: + tags: ["news", "bbc", "aggregation", "dynamic"] + priority: "high" + timeout: 30000 + retries: 2 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/schema-extractor/bing-search-001.yaml b/evals/data/schema-extractor/bing-search-001.yaml new file mode 100644 index 0000000..7e7d674 --- /dev/null +++ b/evals/data/schema-extractor/bing-search-001.yaml @@ -0,0 +1,70 @@ +# Bing Search results extraction test +id: "bing-search-001" +name: "Extract Bing Search Results" +description: "Extract search results from Bing search page" +enabled: true + +target: + url: "https://www.bing.com/search?q=web+scraping+best+practices" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_data" +timeout: 45000 + +input: + schema: + type: "object" + properties: + query: + type: "string" + 
searchResults: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + url: + type: "string" + format: "url" + snippet: + type: "string" + datePublished: + type: "string" + required: + - "title" + - "url" + - "snippet" + sidebarInfo: + type: "object" + properties: + title: + type: "string" + description: + type: "string" + source: + type: "string" + required: + - "searchResults" + instruction: "Extract search results including titles, URLs, snippets, and any sidebar information from Bing" + reasoning: "Testing extraction from Bing search results with different layout than Google" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Search results match the query intent" + - "Results include valid URLs and meaningful snippets" + - "Sidebar information is extracted when present" + - "No duplicate results in the list" + +metadata: + tags: ["search", "bing", "serp", "dynamic"] + priority: "high" + timeout: 45000 + retries: 2 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/schema-extractor/github-repo-001-streamlined.yaml b/evals/data/schema-extractor/github-repo-001-streamlined.yaml new file mode 100644 index 0000000..07532e7 --- /dev/null +++ b/evals/data/schema-extractor/github-repo-001-streamlined.yaml @@ -0,0 +1,66 @@ +# Simple structured data test (Streamlined version) +id: "github-repo-001-streamlined" +name: "Extract GitHub Repository Info (Streamlined)" +description: "Extract basic repository information from a GitHub page using streamlined extractor" +enabled: true + +target: + url: "https://github.com/microsoft/TypeScript" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_streamlined" +timeout: 30000 + +input: + schema: + type: "object" + properties: + name: + type: "string" + description: + type: "string" + language: + type: "string" + stars: + type: "number" + forks: + type: "number" + topics: + type: "array" + items: + type: "string" + readme: + type: "object" + properties: + summary: + type: "string" + required: + - "name" + - "description" + instruction: "Extract repository metadata and basic statistics" + reasoning: "Testing extraction from a well-structured GitHub repository page" + +validation: + type: "hybrid" + snapshot: + exclude_paths: + - "stars" + - "forks" + structure_only: false + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Repository name matches the GitHub page" + - "Description accurately reflects the project purpose" + - "Programming language is correctly identified" + - "Topic tags are relevant to the project" + +metadata: + tags: ["github", "repository", "structured", "streamlined"] + priority: "high" + timeout: 30000 + retries: 1 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/schema-extractor/github-repo-001.yaml b/evals/data/schema-extractor/github-repo-001.yaml new file mode 100644 index 0000000..6693577 --- /dev/null +++ b/evals/data/schema-extractor/github-repo-001.yaml @@ -0,0 +1,66 @@ +# Simple structured data test +id: "github-repo-001" +name: "Extract GitHub Repository Info" +description: "Extract basic repository information from a GitHub page" +enabled: true + +target: + url: "https://github.com/microsoft/TypeScript" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_data" +timeout: 30000 + +input: + schema: + type: "object" + properties: + name: + type: "string" + description: + type: "string" + language: + type: 
"string" + stars: + type: "number" + forks: + type: "number" + topics: + type: "array" + items: + type: "string" + readme: + type: "object" + properties: + summary: + type: "string" + required: + - "name" + - "description" + instruction: "Extract repository metadata and basic statistics" + reasoning: "Testing extraction from a well-structured GitHub repository page" + +validation: + type: "hybrid" + snapshot: + exclude_paths: + - "stars" + - "forks" + structure_only: false + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Repository name matches the GitHub page" + - "Description accurately reflects the project purpose" + - "Programming language is correctly identified" + - "Topic tags are relevant to the project" + +metadata: + tags: ["github", "repository", "structured"] + priority: "high" + timeout: 30000 + retries: 1 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/schema-extractor/google-flights-001.yaml b/evals/data/schema-extractor/google-flights-001.yaml new file mode 100644 index 0000000..ab2e53c --- /dev/null +++ b/evals/data/schema-extractor/google-flights-001.yaml @@ -0,0 +1,106 @@ +# Google Flights search extraction test +id: "google-flights-001" +name: "Extract Google Flights Search Results" +description: "Extract flight options from Google Flights search" +enabled: true + +target: + url: "https://www.google.com/travel/flights/search?tfs=CBwQAhojEgoyMDI1LTEyLTI0agwIAhIIL20vMGQ5anJyBwgBEgNTRk8aIxIKMjAyNS0xMi0zMWoHCAESA1NGT3IMCAISCC9tLzBkOWpyQAFIAXABggELCP___________wGYAQE" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_data" +timeout: 60000 + +input: + schema: + type: "object" + properties: + searchCriteria: + type: "object" + properties: + origin: + type: "string" + destination: + type: "string" + departureDate: + type: "string" + returnDate: + type: "string" + tripType: + type: "string" + passengers: + type: "number" + flights: + type: "array" + items: + type: "object" + properties: + airline: + type: "string" + flightNumber: + type: "string" + departureTime: + type: "string" + arrivalTime: + type: "string" + duration: + type: "string" + stops: + type: "number" + price: + type: "object" + properties: + amount: + type: "number" + currency: + type: "string" + cabin: + type: "string" + bookingUrl: + type: "string" + format: "url" + legroom: + type: "string" + amenities: + type: "array" + items: + type: "string" + required: + - "airline" + - "departureTime" + - "arrivalTime" + - "price" + priceInsights: + type: "object" + properties: + trend: + type: "string" + recommendation: + type: "string" + averagePrice: + type: "number" + required: + - "flights" + instruction: "Extract flight options including airlines, times, prices, and amenities from Google Flights results" + reasoning: "Testing extraction from complex travel search interface with dynamic pricing" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Flight times are in proper format" + - "Prices are numeric values with currency" + - "Airlines and flight numbers are accurate" + - "Stop information is correctly identified" + - "Duration is in readable format" + +metadata: + tags: ["travel", "flights", "google", "booking"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/schema-extractor/google-search-001.yaml b/evals/data/schema-extractor/google-search-001.yaml new file mode 100644 index 
0000000..5763ba8 --- /dev/null +++ b/evals/data/schema-extractor/google-search-001.yaml @@ -0,0 +1,76 @@ +# Google Search results extraction test +id: "google-search-001" +name: "Extract Google Search Results" +description: "Extract search results from Google search page" +enabled: true + +target: + url: "https://www.google.com/search?q=chrome+devtools+tutorial" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_data" +timeout: 45000 + +input: + schema: + type: "object" + properties: + query: + type: "string" + searchResults: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + url: + type: "string" + format: "url" + snippet: + type: "string" + domain: + type: "string" + required: + - "title" + - "url" + - "snippet" + featuredSnippet: + type: "object" + properties: + content: + type: "string" + source: + type: "string" + url: + type: "string" + format: "url" + relatedSearches: + type: "array" + items: + type: "string" + required: + - "searchResults" + instruction: "Extract the top 10 search results with titles, URLs, and snippets. Also extract featured snippet if present and related searches" + reasoning: "Testing extraction from Google search results page with various result types" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Search results are relevant to the query" + - "Each result has a valid title, URL, and snippet" + - "URLs are properly resolved and not node IDs" + - "Related searches are extracted if present" + - "Featured snippet is captured when available" + +metadata: + tags: ["search", "google", "serp", "dynamic"] + priority: "high" + timeout: 45000 + retries: 2 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/schema-extractor/homedepot-001.yaml b/evals/data/schema-extractor/homedepot-001.yaml new file mode 100644 index 0000000..2eb4883 --- /dev/null +++ b/evals/data/schema-extractor/homedepot-001.yaml @@ -0,0 +1,92 @@ +# Home Depot product search extraction test +id: "homedepot-001" +name: "Extract Home Depot Product Search" +description: "Extract product listings from Home Depot search results" +enabled: true + +target: + url: "https://www.homedepot.com/s/power%2520drill" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_data" +timeout: 60000 + +input: + schema: + type: "object" + properties: + searchQuery: + type: "string" + totalResults: + type: "number" + products: + type: "array" + items: + type: "object" + properties: + name: + type: "string" + brand: + type: "string" + price: + type: "number" + originalPrice: + type: "number" + savings: + type: "number" + rating: + type: "number" + reviewCount: + type: "number" + productUrl: + type: "string" + format: "url" + imageUrl: + type: "string" + format: "url" + availability: + type: "string" + features: + type: "array" + items: + type: "string" + required: + - "name" + - "price" + - "productUrl" + filters: + type: "object" + properties: + brands: + type: "array" + items: + type: "string" + priceRanges: + type: "array" + items: + type: "string" + required: + - "products" + instruction: "Extract product listings from Home Depot search results including prices, ratings, and availability" + reasoning: "Testing extraction from e-commerce search results with product cards and filters" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Products are relevant to the search query" + - "Prices are numeric values in USD" 
+ - "Product URLs link to Home Depot product pages" + - "Ratings are on a 5-star scale" + - "Key product features are captured" + +metadata: + tags: ["ecommerce", "homedepot", "products", "search"] + priority: "high" + timeout: 60000 + retries: 3 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/schema-extractor/macys-001.yaml b/evals/data/schema-extractor/macys-001.yaml new file mode 100644 index 0000000..81e05f9 --- /dev/null +++ b/evals/data/schema-extractor/macys-001.yaml @@ -0,0 +1,106 @@ +# Macy's product listing extraction test +id: "macys-001" +name: "Extract Macy's Product Listings" +description: "Extract fashion products from Macy's category page" +enabled: true + +target: + url: "https://www.macys.com/shop/womens-clothing/womens-dresses" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_data" +timeout: 60000 + +input: + schema: + type: "object" + properties: + category: + type: "string" + totalProducts: + type: "number" + products: + type: "array" + items: + type: "object" + properties: + name: + type: "string" + brand: + type: "string" + currentPrice: + type: "number" + originalPrice: + type: "number" + discount: + type: "string" + colors: + type: "array" + items: + type: "string" + sizes: + type: "array" + items: + type: "string" + rating: + type: "number" + reviewCount: + type: "number" + productUrl: + type: "string" + format: "url" + imageUrl: + type: "string" + format: "url" + promotions: + type: "array" + items: + type: "string" + required: + - "name" + - "brand" + - "currentPrice" + refinements: + type: "object" + properties: + brands: + type: "array" + items: + type: "string" + sizes: + type: "array" + items: + type: "string" + colors: + type: "array" + items: + type: "string" + priceRanges: + type: "array" + items: + type: "string" + required: + - "products" + instruction: "Extract fashion products including prices, sizes, colors, and promotional offers from Macy's" + reasoning: "Testing extraction from fashion e-commerce with complex product attributes" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Products are from the correct category" + - "Prices reflect current and sale prices" + - "Color and size options are captured" + - "Brand names are accurately extracted" + - "Promotional text is included when present" + +metadata: + tags: ["ecommerce", "macys", "fashion", "products"] + priority: "high" + timeout: 60000 + retries: 3 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/schema-extractor/wikipedia-search-001.yaml b/evals/data/schema-extractor/wikipedia-search-001.yaml new file mode 100644 index 0000000..616f0d6 --- /dev/null +++ b/evals/data/schema-extractor/wikipedia-search-001.yaml @@ -0,0 +1,77 @@ +# Wikipedia search results extraction test +id: "wikipedia-search-001" +name: "Extract Wikipedia Search Results" +description: "Extract search results from Wikipedia search" +enabled: true + +target: + url: "https://en.wikipedia.org/w/index.php?search=artificial+intelligence&title=Special:Search" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_data" +timeout: 30000 + +input: + schema: + type: "object" + properties: + searchTerm: + type: "string" + resultCount: + type: "number" + searchResults: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + url: + type: "string" + format: "url" + snippet: + type: "string" + category: + type: "string" + wordCount: + type: 
"number" + lastEdited: + type: "string" + required: + - "title" + - "url" + - "snippet" + suggestedArticles: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + url: + type: "string" + format: "url" + required: + - "searchResults" + instruction: "Extract Wikipedia search results including article titles, URLs, snippets, and metadata like word count or last edit date" + reasoning: "Testing extraction from Wikipedia's internal search with rich metadata" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Search results are Wikipedia articles" + - "Each result has a valid Wikipedia URL" + - "Snippets contain relevant content highlights" + - "Metadata like word count is extracted when available" + +metadata: + tags: ["search", "wikipedia", "encyclopedia"] + priority: "high" + timeout: 30000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/screenshot-verification/dynamic-content-verification-001.yaml b/evals/data/screenshot-verification/dynamic-content-verification-001.yaml new file mode 100644 index 0000000..6ec53c4 --- /dev/null +++ b/evals/data/screenshot-verification/dynamic-content-verification-001.yaml @@ -0,0 +1,45 @@ +# Dynamic content visual verification test +id: "dynamic-content-verification-001" +name: "Dynamic Content Visual Verification" +description: "Test visual verification of dynamic content loading using screenshots" +enabled: true + +target: + url: "https://the-internet.herokuapp.com/dynamic_loading/1" + +tool: "action_agent" +timeout: 90000 + +input: + objective: "Take a screenshot, click the Start button, wait for content to load, then take another screenshot to verify the dynamic content appeared" + reasoning: "Testing visual verification of dynamic content changes using screenshot comparison" + hint: "Use take_screenshot before clicking Start, then again after the dynamic content loads" + + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4o" + criteria: + - "Initial screenshot captured the page before dynamic loading" + - "Start button was successfully clicked" + - "Agent waited for dynamic content to fully load" + - "Final screenshot shows the revealed dynamic content" + - "Visual comparison demonstrates successful content loading verification" + - "Screenshots show clear before/after difference in content visibility" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare screenshots to verify dynamic content loading" + - "Confirm the first screenshot shows hidden content area" + - "Verify the second screenshot shows the revealed 'Hello World!' 
text" + - "Check that the loading animation or process is properly captured" + +metadata: + tags: ["screenshot", "dynamic-content", "visual-verification", "loading"] + priority: "high" + timeout: 90000 + retries: 2 + flaky: true \ No newline at end of file diff --git a/evals/data/screenshot-verification/screenshot-error-handling-001.yaml b/evals/data/screenshot-verification/screenshot-error-handling-001.yaml new file mode 100644 index 0000000..6d31c50 --- /dev/null +++ b/evals/data/screenshot-verification/screenshot-error-handling-001.yaml @@ -0,0 +1,42 @@ +# Screenshot error handling test +id: "screenshot-error-handling-001" +name: "Screenshot Error Handling" +description: "Test screenshot tool error handling and recovery" +enabled: true + +target: + url: "https://httpstat.us/500" + +tool: "take_screenshot" +timeout: 30000 + +input: + fullPage: false + + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4o" + criteria: + - "Screenshot tool handled the error page gracefully" + - "Either successfully captured the error page or reported appropriate error" + - "No crashes or undefined behavior occurred" + - "Tool response is meaningful regardless of page loading issues" + - "Error handling demonstrates robustness of screenshot functionality" + visual_verification: + enabled: true + capture_before: false + capture_after: true + prompts: + - "If screenshot was taken, verify it shows the error page content" + - "Check that the tool handled the HTTP 500 error appropriately" + - "Confirm no blank or corrupted screenshots were produced" + - "Ensure error scenarios are handled professionally" + +metadata: + tags: ["screenshot", "error-handling", "robustness", "edge-case"] + priority: "normal" + timeout: 30000 + retries: 1 + flaky: true \ No newline at end of file diff --git a/evals/data/screenshot-verification/screenshot-fullpage-001.yaml b/evals/data/screenshot-verification/screenshot-fullpage-001.yaml new file mode 100644 index 0000000..a1c71f9 --- /dev/null +++ b/evals/data/screenshot-verification/screenshot-fullpage-001.yaml @@ -0,0 +1,43 @@ +# Full page screenshot verification test +id: "screenshot-fullpage-001" +name: "Take Full Page Screenshot" +description: "Test taking full page screenshot and verify functionality" +enabled: true + +target: + url: "https://en.wikipedia.org/wiki/Chrome_DevTools" + +tool: "take_screenshot" +timeout: 45000 + +input: + fullPage: true + + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4o" + criteria: + - "Full page screenshot was successfully captured" + - "Data URL contains valid image data" + - "Screenshot captures the entire page content including areas below the fold" + - "Image size is larger than viewport-only screenshot would be" + - "No errors occurred during full page capture" + - "Screenshot includes both header and footer content" + visual_verification: + enabled: true + capture_before: false + capture_after: true + prompts: + - "Verify the screenshot shows the complete Wikipedia article page" + - "Check that content above and below the fold is captured" + - "Confirm the image is taller than a typical viewport" + - "Ensure no content is cut off at the bottom" + +metadata: + tags: ["screenshot", "fullpage", "visual", "verification", "wikipedia"] + priority: "high" + timeout: 45000 + retries: 2 + flaky: false \ No newline at end of file diff --git a/evals/data/screenshot-verification/screenshot-viewport-001.yaml b/evals/data/screenshot-verification/screenshot-viewport-001.yaml new file mode 100644 index 0000000..69531ee --- 
/dev/null +++ b/evals/data/screenshot-verification/screenshot-viewport-001.yaml @@ -0,0 +1,42 @@ +# Viewport screenshot verification test +id: "screenshot-viewport-001" +name: "Take Viewport Screenshot" +description: "Test taking viewport screenshot and verify functionality" +enabled: true + +target: + url: "https://www.google.com" + +tool: "take_screenshot" +timeout: 30000 + +input: + fullPage: false + + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4o" + criteria: + - "Screenshot was successfully captured" + - "Data URL is properly formatted and contains image data" + - "Screenshot shows the viewport content correctly" + - "No errors occurred during screenshot capture" + - "Image data length indicates a valid screenshot was taken" + visual_verification: + enabled: true + capture_before: false + capture_after: true + prompts: + - "Verify the screenshot shows the Google homepage" + - "Check that the screenshot is not empty or corrupted" + - "Confirm the image quality is appropriate for verification" + - "Ensure the screenshot captures the current viewport accurately" + +metadata: + tags: ["screenshot", "viewport", "visual", "verification"] + priority: "high" + timeout: 30000 + retries: 2 + flaky: false \ No newline at end of file diff --git a/evals/data/screenshot-verification/visual-comparison-001.yaml b/evals/data/screenshot-verification/visual-comparison-001.yaml new file mode 100644 index 0000000..7434a93 --- /dev/null +++ b/evals/data/screenshot-verification/visual-comparison-001.yaml @@ -0,0 +1,45 @@ +# Visual comparison verification test +id: "visual-comparison-001" +name: "Visual Comparison Before and After Action" +description: "Test visual verification by comparing screenshots before and after an action" +enabled: true + +target: + url: "https://www.google.com" + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Take a screenshot, then type 'DevTools testing' in the search box, and take another screenshot to compare" + reasoning: "Testing visual verification workflow with before/after screenshot comparison" + hint: "Use take_screenshot tool before and after performing the search input action" + + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4o" + criteria: + - "Initial screenshot was taken before performing any actions" + - "Search text was successfully entered into the search field" + - "Second screenshot was taken after the text input" + - "Visual comparison shows the difference between before and after states" + - "Search field contains the entered text in the final screenshot" + - "Screenshots demonstrate successful action verification workflow" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare the before and after screenshots" + - "Verify the search field is empty in the first screenshot" + - "Confirm the search field contains 'DevTools testing' in the second screenshot" + - "Check that the visual changes accurately reflect the performed action" + +metadata: + tags: ["screenshot", "visual-comparison", "action-verification", "before-after"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false \ No newline at end of file diff --git a/evals/data/streamlined-schema-extractor/amazon-product-001.yaml b/evals/data/streamlined-schema-extractor/amazon-product-001.yaml new file mode 100644 index 0000000..b154454 --- /dev/null +++ b/evals/data/streamlined-schema-extractor/amazon-product-001.yaml @@ -0,0 +1,78 @@ +# E-commerce product extraction test (Streamlined) +id: 
"amazon-product-001" +name: "Extract Amazon Product Details" +description: "Extract product information from an Amazon product page" +enabled: true + +target: + url: "https://www.amazon.com/Obelisk-Climbing-Rustproof-Trellises-Clematis/dp/B0B4SBY6QD/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_streamlined" +timeout: 60000 + +input: + schema: + type: "object" + properties: + product: + type: "object" + properties: + title: + type: "string" + brand: + type: "string" + price: + type: "object" + properties: + current: + type: "number" + currency: + type: "string" + rating: + type: "object" + properties: + average: + type: "number" + count: + type: "number" + images: + type: "array" + items: + type: "string" + format: "url" + features: + type: "array" + items: + type: "string" + required: + - "title" + - "price" + availability: + type: "string" + required: + - "product" + instruction: "Extract comprehensive product information including pricing, ratings, and key features" + reasoning: "Testing extraction from a dynamic e-commerce page with complex structure" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Product title is accurate and complete" + - "Price information is current and properly formatted" + - "Rating data includes both average and review count" + - "Image URLs are valid and accessible" + - "Key product features are captured" + - "All URLs are properly resolved (not node IDs)" + +metadata: + tags: ["ecommerce", "amazon", "product", "dynamic"] + priority: "high" + timeout: 60000 + retries: 3 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/streamlined-schema-extractor/bbc-news-001.yaml b/evals/data/streamlined-schema-extractor/bbc-news-001.yaml new file mode 100644 index 0000000..31ef288 --- /dev/null +++ b/evals/data/streamlined-schema-extractor/bbc-news-001.yaml @@ -0,0 +1,69 @@ +# News article extraction test (Streamlined) +id: "bbc-news-001" +name: "Extract BBC News Article" +description: "Extract article content and metadata from a BBC News page" +enabled: true + +target: + url: "https://www.bbc.com/news/technology" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_streamlined" +timeout: 30000 + +input: + schema: + type: "object" + properties: + headlines: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + summary: + type: "string" + url: + type: "string" + format: "url" + category: + type: "string" + required: + - "title" + mainStory: + type: "object" + properties: + headline: + type: "string" + summary: + type: "string" + url: + type: "string" + format: "url" + required: + - "headlines" + instruction: "Extract the main headlines and featured stories from the BBC Technology news section" + reasoning: "Testing extraction from a news aggregation page with multiple articles" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + include_url: true + criteria: + - "Headlines are current and relevant to technology news" + - "Article summaries provide meaningful context" + - "URLs link to valid BBC news articles" + - "Main story is properly identified" + - "All extracted content is in English" + +metadata: + tags: ["news", "bbc", "aggregation", "dynamic"] + priority: "high" + timeout: 30000 + retries: 2 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/streamlined-schema-extractor/bing-search-001.yaml 
b/evals/data/streamlined-schema-extractor/bing-search-001.yaml new file mode 100644 index 0000000..e9f3b6e --- /dev/null +++ b/evals/data/streamlined-schema-extractor/bing-search-001.yaml @@ -0,0 +1,70 @@ +# Bing Search results extraction test +id: "bing-search-001" +name: "Extract Bing Search Results" +description: "Extract search results from Bing search page" +enabled: true + +target: + url: "https://www.bing.com/search?q=web+scraping+best+practices" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_streamlined" +timeout: 45000 + +input: + schema: + type: "object" + properties: + query: + type: "string" + searchResults: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + url: + type: "string" + format: "url" + snippet: + type: "string" + datePublished: + type: "string" + required: + - "title" + - "url" + - "snippet" + sidebarInfo: + type: "object" + properties: + title: + type: "string" + description: + type: "string" + source: + type: "string" + required: + - "searchResults" + instruction: "Extract search results including titles, URLs, snippets, and any sidebar information from Bing" + reasoning: "Testing extraction from Bing search results with different layout than Google" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Search results match the query intent" + - "Results include valid URLs and meaningful snippets" + - "Sidebar information is extracted when present" + - "No duplicate results in the list" + +metadata: + tags: ["search", "bing", "serp", "dynamic"] + priority: "high" + timeout: 45000 + retries: 2 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/streamlined-schema-extractor/github-repo-001.yaml b/evals/data/streamlined-schema-extractor/github-repo-001.yaml new file mode 100644 index 0000000..5c496c5 --- /dev/null +++ b/evals/data/streamlined-schema-extractor/github-repo-001.yaml @@ -0,0 +1,66 @@ +# Simple structured data test (Streamlined) +id: "github-repo-001" +name: "Extract GitHub Repository Info" +description: "Extract basic repository information from a GitHub page" +enabled: true + +target: + url: "https://github.com/microsoft/TypeScript" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_streamlined" +timeout: 30000 + +input: + schema: + type: "object" + properties: + name: + type: "string" + description: + type: "string" + language: + type: "string" + stars: + type: "number" + forks: + type: "number" + topics: + type: "array" + items: + type: "string" + readme: + type: "object" + properties: + summary: + type: "string" + required: + - "name" + - "description" + instruction: "Extract repository metadata and basic statistics" + reasoning: "Testing extraction from a well-structured GitHub repository page" + +validation: + type: "hybrid" + snapshot: + exclude_paths: + - "stars" + - "forks" + structure_only: false + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Repository name matches the GitHub page" + - "Description accurately reflects the project purpose" + - "Programming language is correctly identified" + - "Topic tags are relevant to the project" + +metadata: + tags: ["github", "repository", "structured"] + priority: "high" + timeout: 30000 + retries: 1 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/streamlined-schema-extractor/google-flights-001.yaml b/evals/data/streamlined-schema-extractor/google-flights-001.yaml 
new file mode 100644 index 0000000..981ccbd --- /dev/null +++ b/evals/data/streamlined-schema-extractor/google-flights-001.yaml @@ -0,0 +1,106 @@ +# Google Flights search extraction test +id: "google-flights-001" +name: "Extract Google Flights Search Results" +description: "Extract flight options from Google Flights search" +enabled: true + +target: + url: "https://www.google.com/travel/flights/search?tfs=CBwQAhojEgoyMDI1LTEyLTI0agwIAhIIL20vMGQ5anJyBwgBEgNTRk8aIxIKMjAyNS0xMi0zMWoHCAESA1NGT3IMCAISCC9tLzBkOWpyQAFIAXABggELCP___________wGYAQE" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_streamlined" +timeout: 60000 + +input: + schema: + type: "object" + properties: + searchCriteria: + type: "object" + properties: + origin: + type: "string" + destination: + type: "string" + departureDate: + type: "string" + returnDate: + type: "string" + tripType: + type: "string" + passengers: + type: "number" + flights: + type: "array" + items: + type: "object" + properties: + airline: + type: "string" + flightNumber: + type: "string" + departureTime: + type: "string" + arrivalTime: + type: "string" + duration: + type: "string" + stops: + type: "number" + price: + type: "object" + properties: + amount: + type: "number" + currency: + type: "string" + cabin: + type: "string" + bookingUrl: + type: "string" + format: "url" + legroom: + type: "string" + amenities: + type: "array" + items: + type: "string" + required: + - "airline" + - "departureTime" + - "arrivalTime" + - "price" + priceInsights: + type: "object" + properties: + trend: + type: "string" + recommendation: + type: "string" + averagePrice: + type: "number" + required: + - "flights" + instruction: "Extract flight options including airlines, times, prices, and amenities from Google Flights results" + reasoning: "Testing extraction from complex travel search interface with dynamic pricing" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Flight times are in proper format" + - "Prices are numeric values with currency" + - "Airlines and flight numbers are accurate" + - "Stop information is correctly identified" + - "Duration is in readable format" + +metadata: + tags: ["travel", "flights", "google", "booking"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/streamlined-schema-extractor/google-search-001.yaml b/evals/data/streamlined-schema-extractor/google-search-001.yaml new file mode 100644 index 0000000..c1725d4 --- /dev/null +++ b/evals/data/streamlined-schema-extractor/google-search-001.yaml @@ -0,0 +1,76 @@ +# Google Search results extraction test +id: "google-search-001" +name: "Extract Google Search Results" +description: "Extract search results from Google search page" +enabled: true + +target: + url: "https://www.google.com/search?q=chrome+devtools+tutorial" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_streamlined" +timeout: 45000 + +input: + schema: + type: "object" + properties: + query: + type: "string" + searchResults: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + url: + type: "string" + format: "url" + snippet: + type: "string" + domain: + type: "string" + required: + - "title" + - "url" + - "snippet" + featuredSnippet: + type: "object" + properties: + content: + type: "string" + source: + type: "string" + url: + type: "string" + format: "url" + relatedSearches: + type: "array" + items: + type: 
"string" + required: + - "searchResults" + instruction: "Extract the top 10 search results with titles, URLs, and snippets. Also extract featured snippet if present and related searches" + reasoning: "Testing extraction from Google search results page with various result types" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Search results are relevant to the query" + - "Each result has a valid title, URL, and snippet" + - "URLs are properly resolved and not node IDs" + - "Related searches are extracted if present" + - "Featured snippet is captured when available" + +metadata: + tags: ["search", "google", "serp", "dynamic"] + priority: "high" + timeout: 45000 + retries: 2 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/streamlined-schema-extractor/homedepot-001.yaml b/evals/data/streamlined-schema-extractor/homedepot-001.yaml new file mode 100644 index 0000000..1d26848 --- /dev/null +++ b/evals/data/streamlined-schema-extractor/homedepot-001.yaml @@ -0,0 +1,92 @@ +# Home Depot product search extraction test +id: "homedepot-001" +name: "Extract Home Depot Product Search" +description: "Extract product listings from Home Depot search results" +enabled: true + +target: + url: "https://www.homedepot.com/s/power%2520drill" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_streamlined" +timeout: 60000 + +input: + schema: + type: "object" + properties: + searchQuery: + type: "string" + totalResults: + type: "number" + products: + type: "array" + items: + type: "object" + properties: + name: + type: "string" + brand: + type: "string" + price: + type: "number" + originalPrice: + type: "number" + savings: + type: "number" + rating: + type: "number" + reviewCount: + type: "number" + productUrl: + type: "string" + format: "url" + imageUrl: + type: "string" + format: "url" + availability: + type: "string" + features: + type: "array" + items: + type: "string" + required: + - "name" + - "price" + - "productUrl" + filters: + type: "object" + properties: + brands: + type: "array" + items: + type: "string" + priceRanges: + type: "array" + items: + type: "string" + required: + - "products" + instruction: "Extract product listings from Home Depot search results including prices, ratings, and availability" + reasoning: "Testing extraction from e-commerce search results with product cards and filters" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Products are relevant to the search query" + - "Prices are numeric values in USD" + - "Product URLs link to Home Depot product pages" + - "Ratings are on a 5-star scale" + - "Key product features are captured" + +metadata: + tags: ["ecommerce", "homedepot", "products", "search"] + priority: "high" + timeout: 60000 + retries: 3 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/streamlined-schema-extractor/macys-001.yaml b/evals/data/streamlined-schema-extractor/macys-001.yaml new file mode 100644 index 0000000..28a2c10 --- /dev/null +++ b/evals/data/streamlined-schema-extractor/macys-001.yaml @@ -0,0 +1,106 @@ +# Macy's product listing extraction test +id: "macys-001" +name: "Extract Macy's Product Listings" +description: "Extract fashion products from Macy's category page" +enabled: true + +target: + url: "https://www.macys.com/shop/womens-clothing/womens-dresses" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: 
"extract_schema_streamlined" +timeout: 60000 + +input: + schema: + type: "object" + properties: + category: + type: "string" + totalProducts: + type: "number" + products: + type: "array" + items: + type: "object" + properties: + name: + type: "string" + brand: + type: "string" + currentPrice: + type: "number" + originalPrice: + type: "number" + discount: + type: "string" + colors: + type: "array" + items: + type: "string" + sizes: + type: "array" + items: + type: "string" + rating: + type: "number" + reviewCount: + type: "number" + productUrl: + type: "string" + format: "url" + imageUrl: + type: "string" + format: "url" + promotions: + type: "array" + items: + type: "string" + required: + - "name" + - "brand" + - "currentPrice" + refinements: + type: "object" + properties: + brands: + type: "array" + items: + type: "string" + sizes: + type: "array" + items: + type: "string" + colors: + type: "array" + items: + type: "string" + priceRanges: + type: "array" + items: + type: "string" + required: + - "products" + instruction: "Extract fashion products including prices, sizes, colors, and promotional offers from Macy's" + reasoning: "Testing extraction from fashion e-commerce with complex product attributes" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Products are from the correct category" + - "Prices reflect current and sale prices" + - "Color and size options are captured" + - "Brand names are accurately extracted" + - "Promotional text is included when present" + +metadata: + tags: ["ecommerce", "macys", "fashion", "products"] + priority: "high" + timeout: 60000 + retries: 3 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/streamlined-schema-extractor/wikipedia-001.yaml b/evals/data/streamlined-schema-extractor/wikipedia-001.yaml new file mode 100644 index 0000000..88983bd --- /dev/null +++ b/evals/data/streamlined-schema-extractor/wikipedia-001.yaml @@ -0,0 +1,76 @@ +# Wikipedia article extraction test (Streamlined) +id: "wikipedia-chrome-devtools-001" +name: "Extract Chrome DevTools Wikipedia Article" +description: "Extract structured information from the Chrome DevTools Wikipedia page" +enabled: true + +target: + url: "https://en.wikipedia.org/wiki/Chrome_DevTools" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_streamlined" +timeout: 45000 + +input: + schema: + type: "object" + properties: + title: + type: "string" + summary: + type: "string" + tableOfContents: + type: "array" + items: + type: "string" + infobox: + type: "object" + properties: + developer: + type: "string" + initialRelease: + type: "string" + operatingSystem: + type: "string" + license: + type: "string" + externalLinks: + type: "array" + items: + type: "object" + properties: + text: + type: "string" + url: + type: "string" + format: "url" + required: + - "title" + - "summary" + instruction: "Extract the main article information including title, summary, table of contents, and infobox details" + reasoning: "Testing extraction from a stable, well-structured Wikipedia page" + +validation: + type: "hybrid" + snapshot: + exclude_paths: + - "externalLinks[*].url" + structure_only: false + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Article title matches the Wikipedia page title" + - "Summary captures the main description of Chrome DevTools" + - "Table of contents includes major sections" + - "Infobox contains key technical details" + - "External links are properly 
resolved URLs" + +metadata: + tags: ["wikipedia", "documentation", "stable"] + priority: "high" + timeout: 45000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/streamlined-schema-extractor/wikipedia-search-001.yaml b/evals/data/streamlined-schema-extractor/wikipedia-search-001.yaml new file mode 100644 index 0000000..c432c20 --- /dev/null +++ b/evals/data/streamlined-schema-extractor/wikipedia-search-001.yaml @@ -0,0 +1,77 @@ +# Wikipedia search results extraction test +id: "wikipedia-search-001" +name: "Extract Wikipedia Search Results" +description: "Extract search results from Wikipedia search" +enabled: true + +target: + url: "https://en.wikipedia.org/w/index.php?search=artificial+intelligence&title=Special:Search" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_streamlined" +timeout: 30000 + +input: + schema: + type: "object" + properties: + searchTerm: + type: "string" + resultCount: + type: "number" + searchResults: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + url: + type: "string" + format: "url" + snippet: + type: "string" + category: + type: "string" + wordCount: + type: "number" + lastEdited: + type: "string" + required: + - "title" + - "url" + - "snippet" + suggestedArticles: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + url: + type: "string" + format: "url" + required: + - "searchResults" + instruction: "Extract Wikipedia search results including article titles, URLs, snippets, and metadata like word count or last edit date" + reasoning: "Testing extraction from Wikipedia's internal search with rich metadata" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Search results are Wikipedia articles" + - "Each result has a valid Wikipedia URL" + - "Snippets contain relevant content highlights" + - "Metadata like word count is extracted when available" + +metadata: + tags: ["search", "wikipedia", "encyclopedia"] + priority: "high" + timeout: 30000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/booking-001.yaml b/evals/data/web-task-agent/booking-001.yaml new file mode 100644 index 0000000..8a99d17 --- /dev/null +++ b/evals/data/web-task-agent/booking-001.yaml @@ -0,0 +1,45 @@ +# Hotel Search Workflow - Web Task Agent +id: "booking-001" +name: "Hotel Search Workflow" +description: "Test web task agent orchestrating complex multi-step booking search" +enabled: true + +target: + url: "https://www.booking.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for hotels in San Francisco for 2 adults, check-in March 15, check-out March 17" + reasoning: "Customer is looking for travel booking" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully searched for hotels in San Francisco" + - "Results show hotels available for March 15-17 dates" + - "Guest count of 2 adults is reflected in the search results" + - "Returned multiple hotel options with relevant details" + - "Each hotel includes essential information (name, price, location)" + - "Results are presented in a clear, readable format" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify hotel search results are displayed for San 
Francisco" + - "Check that dates March 15-17 are correctly selected" + - "Confirm guest count shows 2 adults" + - "Ensure search results show hotels with availability for specified dates" + +metadata: + tags: ["web-task", "booking", "workflow", "multi-step", "travel", "complex"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/ecommerce-001.yaml b/evals/data/web-task-agent/ecommerce-001.yaml new file mode 100644 index 0000000..338f464 --- /dev/null +++ b/evals/data/web-task-agent/ecommerce-001.yaml @@ -0,0 +1,53 @@ +# E-commerce web task evaluation (matches DevTools test case) +id: "ecommerce-001" +name: "E-commerce Product Search" +description: "Test web task agent handling product search on shopping site" +enabled: true + +target: + url: "https://www.amazon.com" + +tool: "web_task_agent" +timeout: 90000 + +input: + task: "Search Amazon for \"wireless headphones\" and find products under $100" + reasoning: "Testing e-commerce search workflow with price filtering" + context: "User wants to find wireless headphones with specific price constraint" + extraction_schema: + type: "object" + properties: + products: + type: "array" + items: + type: "object" + properties: + name: + type: "string" + price: + type: "string" + rating: + type: "string" + url: + type: "string" + + +validation: + type: "hybrid" + llm_judge: + model: "gpt-4o" + criteria: + - "Successfully navigated to product search" + - "Applied appropriate filters correctly" + - "Extracted product details accurately" + - "Provided meaningful comparison of features" + - "Stayed within specified price range" + snapshot: + structure_only: true + exclude_paths: + - "timestamp" + - "sessionId" + +metadata: + tags: ["web-task", "multi-step", "ecommerce", "search"] + priority: "high" \ No newline at end of file diff --git a/evals/data/web-task-agent/error-001.yaml b/evals/data/web-task-agent/error-001.yaml new file mode 100644 index 0000000..1831a14 --- /dev/null +++ b/evals/data/web-task-agent/error-001.yaml @@ -0,0 +1,45 @@ +# Error Recovery Workflow - Web Task Agent +id: "error-001" +name: "Error Recovery Workflow" +description: "Test web task agent handling action_agent failures and retry logic" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for \"nonexistent test query 12345\" and handle any issues that arise" + reasoning: "Customer is asking for this response" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Attempted to search for the unusual query \"nonexistent test query 12345\"" + - "Either found some results OR provided clear explanation why no results were found" + - "Response handles the edge case gracefully without errors" + - "If no results found, suggested alternative actions or explanations" + - "Maintained professional tone despite unusual request" + - "Final output is coherent and helpful to the user" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Check if search was attempted despite unusual query" + - "Verify error handling did not break the page interaction" + - "Confirm agent attempted to complete the task or provided clear error info" + - "Ensure page is still functional after error recovery attempts" + +metadata: + tags: ["web-task", "error-recovery", "retry", "orchestration", "robustness"] + 
priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/extract-001.yaml b/evals/data/web-task-agent/extract-001.yaml new file mode 100644 index 0000000..e836aa0 --- /dev/null +++ b/evals/data/web-task-agent/extract-001.yaml @@ -0,0 +1,60 @@ +# Structured Data Extraction - Web Task Agent +id: "extract-001" +name: "Structured Data Extraction" +description: "Test web task agent extracting structured data from search results" +enabled: true + +target: + url: "https://news.ycombinator.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Extract the top 5 Hacker News stories with their titles, scores, and comment counts" + reasoning: "User is looking to understand the top stories on Hacker News" + extraction_schema: + type: "object" + properties: + stories: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + score: + type: "number" + comments: + type: "number" + url: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully returned exactly 5 Hacker News stories in structured text format" + - "Each story is numbered (1., 2., 3., 4., 5.) with title, score, comments, and URL" + - "Results are presented in readable text format similar to the example provided" + - "Response includes all required fields: title, score, comments count, URL" + - "Maintained proper orchestration pattern throughout the extraction process" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Hacker News homepage is loaded and displaying stories" + - "Check that top stories are visible with scores and comment counts" + - "Confirm story titles and metadata are clearly displayed" + - "Ensure page structure allows for data extraction" + +metadata: + tags: ["web-task", "data-extraction", "structured-data", "hackernews", "schema"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/finance-001.yaml b/evals/data/web-task-agent/finance-001.yaml new file mode 100644 index 0000000..2c661ed --- /dev/null +++ b/evals/data/web-task-agent/finance-001.yaml @@ -0,0 +1,68 @@ +# Stock Information Research - Web Task Agent +id: "finance-001" +name: "Stock Information Research" +description: "Test extracting stock prices and financial information" +enabled: true + +target: + url: "https://finance.yahoo.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for Apple (AAPL) stock information and extract current price, market cap, and recent performance" + reasoning: "Users need automated financial data collection for investment decisions" + extraction_schema: + type: "object" + properties: + stock_info: + type: "object" + properties: + symbol: + type: "string" + company_name: + type: "string" + current_price: + type: "string" + change: + type: "string" + change_percent: + type: "string" + market_cap: + type: "string" + pe_ratio: + type: "string" + volume: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully found Apple (AAPL) stock information" + - "Current stock price is clearly stated" + - "Market cap information is included" + - "Price change and percentage change are provided" + - "Additional metrics (PE ratio, volume) included 
when available" + - "Financial data is current and presented in readable text format (not JSON)" + - "Stock information is well-organized and easy to understand" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Yahoo Finance shows Apple (AAPL) stock page" + - "Check that current stock price and change are visible" + - "Confirm market cap and trading volume are displayed" + - "Ensure financial metrics and charts are shown" + +metadata: + tags: ["web-task", "finance", "stocks", "yahoo-finance", "investment", "popular"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/flight-001.yaml b/evals/data/web-task-agent/flight-001.yaml new file mode 100644 index 0000000..f74b255 --- /dev/null +++ b/evals/data/web-task-agent/flight-001.yaml @@ -0,0 +1,45 @@ +# Complex Flight Search - Web Task Agent +id: "flight-001" +name: "Complex Flight Search" +description: "Test web task agent handling complex flight search with multiple criteria" +enabled: true + +target: + url: "https://www.kayak.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for round-trip flights from Seattle (SEA) to Tokyo (NRT) departing March 20, returning March 30" + reasoning: "Customer is looking for finding the best flight options" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully found round-trip flights from Seattle (SEA) to Tokyo (NRT)" + - "Flight results show March 20 departure date" + - "Flight results show March 30 return date" + - "Returned multiple flight options with airlines and prices" + - "Each flight includes essential details (times, airlines, prices)" + - "Results clearly distinguish between outbound and return flights" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify flight search results are displayed" + - "Check SEA to NRT route is correctly selected" + - "Confirm dates March 20 departure and March 30 return" + - "Ensure flight options are showing with prices and airlines" + +metadata: + tags: ["web-task", "flight", "travel", "multi-step", "kayak", "round-trip"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/food-001.yaml b/evals/data/web-task-agent/food-001.yaml new file mode 100644 index 0000000..382b470 --- /dev/null +++ b/evals/data/web-task-agent/food-001.yaml @@ -0,0 +1,68 @@ +# Restaurant Search and Menu Extraction - Web Task Agent +id: "food-001" +name: "Restaurant Search and Menu Extraction" +description: "Test searching restaurants and extracting menu information" +enabled: true + +target: + url: "https://www.yelp.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for \"Italian restaurants near me\" in San Francisco and extract restaurant details" + reasoning: "Users want to quickly compare restaurants, menus, and reviews" + extraction_schema: + type: "object" + properties: + restaurants: + type: "array" + items: + type: "object" + properties: + name: + type: "string" + rating: + type: "string" + price_range: + type: "string" + cuisine: + type: "string" + address: + type: "string" + phone: + type: "string" + hours: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + 
temperature: 0.3 + criteria: + - "Successfully found Italian restaurants in San Francisco" + - "Each restaurant includes name, rating, and price range" + - "Location/address information is provided for each restaurant" + - "Contact details (phone/hours) included when available" + - "All restaurants listed serve Italian cuisine" + - "Results are presented in clear, structured text format (not JSON)" + - "Restaurants are numbered or organized clearly for easy comparison" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Yelp search results for Italian restaurants" + - "Check that restaurants show ratings and price ranges" + - "Confirm location filter shows San Francisco results" + - "Ensure restaurant listings include contact information" + +metadata: + tags: ["web-task", "restaurants", "yelp", "food", "local-search", "popular"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/iframe-001.yaml b/evals/data/web-task-agent/iframe-001.yaml new file mode 100644 index 0000000..a9234e5 --- /dev/null +++ b/evals/data/web-task-agent/iframe-001.yaml @@ -0,0 +1,83 @@ +# ANA Airlines Iframe Content Extraction - Web Task Agent +id: "iframe-001" +name: "ANA Airlines Iframe Content Extraction" +description: "Test web task agent handling iframe-heavy airline booking sites like ANA Airlines" +enabled: true + +target: + url: "https://aswbe.ana.co.jp/webapps/reservation/flight-search?CONNECTION_KIND=SEA&LANG=en&hiddenSearchMode=ROUND_TRIP&departureDate:field=20260320&returnDate:field=20260330&departureAirportCode:field=SEA&arrivalAirportCode:field=NRT&adultCount=1&youngAdultCount=0&childCount=0&infantCount=0&boardingClass=INTY001&searchFlag=1" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Navigate the ANA Airlines flight search page and extract available flight options from Seattle (SEA) to Tokyo Narita (NRT) for March 20-30, 2026. Handle any iframe content and booking interface elements." 
+ reasoning: "Testing iframe content extraction and complex airline booking site navigation" + extraction_schema: + type: "object" + properties: + flights: + type: "array" + items: + type: "object" + properties: + flight_number: + type: "string" + airline: + type: "string" + departure_time: + type: "string" + arrival_time: + type: "string" + departure_date: + type: "string" + arrival_date: + type: "string" + duration: + type: "string" + aircraft: + type: "string" + price: + type: "string" + cabin_class: + type: "string" + stops: + type: "string" + booking_interface_status: + type: "string" + iframe_content_found: + type: "boolean" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully navigated ANA Airlines booking interface" + - "Handled iframe content correctly (iframe_content_found should be true if iframes detected)" + - "Extracted flight information from ANA flight search results" + - "Flight details include ANA flight numbers and accurate route (SEA to NRT)" + - "Extracted pricing information in appropriate currency" + - "Handled any booking interface elements, popups, or navigation flows" + - "Results show flights for the correct dates (March 20-30, 2026)" + - "Successfully demonstrated iframe content extraction capabilities" + - "Booking interface status indicates successful page interaction" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify ANA Airlines flight search page loaded correctly" + - "Check that search parameters show SEA to NRT route" + - "Confirm flight results are displayed (may be in iframes)" + - "Ensure booking interface elements are functional" + - "Verify flight information is accessible and extractable" + +metadata: + tags: ["web-task", "iframe", "ana-airlines", "complex-booking", "international-flight", "airline-specific"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/jobs-001.yaml b/evals/data/web-task-agent/jobs-001.yaml new file mode 100644 index 0000000..7a6caa8 --- /dev/null +++ b/evals/data/web-task-agent/jobs-001.yaml @@ -0,0 +1,68 @@ +# Job Search Workflow - Web Task Agent +id: "jobs-001" +name: "Job Search Workflow" +description: "Test web task agent orchestrating job search on LinkedIn" +enabled: true + +target: + url: "https://www.linkedin.com/jobs" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for \"Software Engineer\" jobs in \"San Francisco\" and extract details for the first 5 results" + reasoning: "User wants to find job opportunities in tech industry" + extraction_schema: + type: "object" + properties: + jobs: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + company: + type: "string" + location: + type: "string" + salary: + type: "string" + description: + type: "string" + url: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Either used construct_direct_url for LinkedIn job search OR used traditional form interaction" + - "If using direct URL: constructed proper LinkedIn job search URL with keywords and location" + - "If using forms: delegated keyword and location input to action_agent" + - "Extracted job listings using extract_data" + - "Returned structured job data in readable text format (not JSON)" + - "Each job listing includes title, company, 
location, and other relevant fields" + - "Results are numbered or organized clearly for easy reading" + - "Demonstrated proper workflow orchestration for job search" + - "Never used direct browser interaction tools" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify LinkedIn job search results are displayed" + - "Check that search shows Software Engineer jobs in San Francisco" + - "Confirm job listings include company names and titles" + - "Ensure at least 5 job results are visible" + +metadata: + tags: ["web-task", "jobs", "linkedin", "search", "career", "popular"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/learning-001.yaml b/evals/data/web-task-agent/learning-001.yaml new file mode 100644 index 0000000..1e4c761 --- /dev/null +++ b/evals/data/web-task-agent/learning-001.yaml @@ -0,0 +1,69 @@ +# Online Course Search - Web Task Agent +id: "learning-001" +name: "Online Course Search" +description: "Test searching and extracting course information from learning platforms" +enabled: true + +target: + url: "https://www.coursera.org" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for \"Machine Learning\" courses and extract details for top 5 results" + reasoning: "Users want to compare courses across platforms for learning decisions" + extraction_schema: + type: "object" + properties: + courses: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + instructor: + type: "string" + university: + type: "string" + rating: + type: "string" + duration: + type: "string" + price: + type: "string" + description: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully found Machine Learning courses on Coursera" + - "Returned details for top 5 courses as requested" + - "Each course includes title, instructor, university, and rating" + - "Duration and pricing information included for each course" + - "Course descriptions or key topics are provided" + - "Results are presented in structured text format (not JSON)" + - "Courses are numbered (1-5) and well-organized for easy comparison" + - "Each course entry is clearly formatted and readable" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Coursera search results for Machine Learning" + - "Check that courses show titles, instructors, and ratings" + - "Confirm course details include duration and pricing" + - "Ensure search results are relevant to Machine Learning" + +metadata: + tags: ["web-task", "education", "coursera", "courses", "learning", "popular"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/nav-001.yaml b/evals/data/web-task-agent/nav-001.yaml new file mode 100644 index 0000000..bff519f --- /dev/null +++ b/evals/data/web-task-agent/nav-001.yaml @@ -0,0 +1,46 @@ +# Site Navigation Workflow - Web Task Agent +id: "nav-001" +name: "Site Navigation Workflow" +description: "Test web task agent orchestrating navigation between different sections of a site" +enabled: true + +target: + url: "https://www.wikipedia.org" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 90000 + +input: + task: "Navigate to the Wikipedia homepage, search for \"artificial 
intelligence\", and find information about machine learning" + reasoning: "User is looking to explore Wikipedia content through structured navigation" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Orchestrated Wikipedia search via action_agent calls" + - "Navigated to artificial intelligence article through action_agent" + - "Located machine learning section via action_agent coordination" + - "Extracted relevant information about machine learning" + - "Demonstrated multi-step navigation workflow" + - "Maintained orchestration pattern throughout navigation" + - "Provided structured summary of found information" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify navigation reached artificial intelligence Wikipedia page" + - "Check that machine learning section or content is visible" + - "Confirm successful navigation through multiple page sections" + - "Ensure content related to machine learning is displayed" + +metadata: + tags: ["web-task", "navigation", "multi-step", "wikipedia", "content-exploration"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/news-001.yaml b/evals/data/web-task-agent/news-001.yaml new file mode 100644 index 0000000..4c29aed --- /dev/null +++ b/evals/data/web-task-agent/news-001.yaml @@ -0,0 +1,64 @@ +# News Article Aggregation - Web Task Agent +id: "news-001" +name: "News Article Aggregation" +description: "Test aggregating news headlines and summaries from news sites" +enabled: true + +target: + url: "https://news.ycombinator.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Extract the top 10 Hacker News stories with titles, scores, and first few comments" + reasoning: "Users want automated news monitoring for research and awareness" + extraction_schema: + type: "object" + properties: + articles: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + score: + type: "number" + comments_count: + type: "number" + url: + type: "string" + top_comment: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully extracted 10 Hacker News stories as requested" + - "Each story includes title, score, and comment count" + - "URLs are provided for each story" + - "Stories appear to be from the current top/front page" + - "Results are presented in clear, numbered text format (1-10), not JSON" + - "All required fields are present and properly formatted in readable text" + - "Each story is clearly separated and easy to read" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Hacker News stories are visible with scores" + - "Check that story titles and comment counts are shown" + - "Confirm top stories section is properly displayed" + - "Ensure story metadata is accessible for extraction" + +metadata: + tags: ["web-task", "news", "hackernews", "aggregation", "popular"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/realestate-001.yaml b/evals/data/web-task-agent/realestate-001.yaml new file mode 100644 index 0000000..5fd824e --- /dev/null +++ b/evals/data/web-task-agent/realestate-001.yaml @@ -0,0 +1,70 @@ +# Real Estate Property Search - Web Task Agent +id: 
"realestate-001" +name: "Real Estate Property Search" +description: "Test property search workflow on real estate platforms" +enabled: true + +target: + url: "https://www.zillow.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for houses for sale in Austin, Texas under $500k and extract property details" + reasoning: "User wants to find affordable housing options in a specific location" + extraction_schema: + type: "object" + properties: + properties: + type: "array" + items: + type: "object" + properties: + address: + type: "string" + price: + type: "string" + bedrooms: + type: "number" + bathrooms: + type: "number" + sqft: + type: "string" + lot_size: + type: "string" + year_built: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Orchestrated location search via action_agent" + - "Delegated price filter setting to action_agent" + - "Coordinated property type selection through action_agent" + - "Applied search filters through proper action_agent calls" + - "Extracted property listings with extract_data" + - "Returned structured property data in readable text format (not JSON)" + - "Each property includes address, price, bedrooms, bathrooms, and other key details" + - "Properties are clearly numbered or organized for easy comparison" + - "Demonstrated complex real estate search workflow orchestration" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Zillow search results for Austin, Texas properties" + - "Check that properties shown are under $500k" + - "Confirm property listings show price, beds, baths info" + - "Ensure search results match the specified criteria" + +metadata: + tags: ["web-task", "real-estate", "zillow", "property-search", "popular"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/scroll-001.yaml b/evals/data/web-task-agent/scroll-001.yaml new file mode 100644 index 0000000..12a986f --- /dev/null +++ b/evals/data/web-task-agent/scroll-001.yaml @@ -0,0 +1,61 @@ +# Infinite Scroll Content Loading - Web Task Agent +id: "scroll-001" +name: "Infinite Scroll Content Loading" +description: "Test web task agent handling infinite scroll pages to load more content" +enabled: true + +target: + url: "https://twitter.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Scroll down the Twitter feed to load at least 20 tweets and extract their content" + reasoning: "Testing infinite scroll functionality for dynamic content loading" + extraction_schema: + type: "object" + properties: + tweets: + type: "array" + items: + type: "object" + properties: + author: + type: "string" + content: + type: "string" + likes: + type: "string" + retweets: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully used scroll_page tool to scroll down the page" + - "Loaded additional content through scrolling actions" + - "Extracted at least 20 tweets from the feed" + - "Each tweet includes author and content information" + - "Demonstrated proper handling of dynamically loaded content" + - "Results are presented in clear, numbered text format" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify 
initial Twitter feed is loaded" + - "Check that scrolling action loaded additional tweets" + - "Confirm at least 20 tweets are visible after scrolling" + - "Ensure page scrolled down significantly from initial position" + +metadata: + tags: ["web-task", "scrolling", "infinite-scroll", "dynamic-content", "twitter"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/scroll-002.yaml b/evals/data/web-task-agent/scroll-002.yaml new file mode 100644 index 0000000..dce0156 --- /dev/null +++ b/evals/data/web-task-agent/scroll-002.yaml @@ -0,0 +1,65 @@ +# Product Review Scrolling - Web Task Agent +id: "scroll-002" +name: "Product Review Scrolling" +description: "Test scrolling to load more product reviews on e-commerce sites" +enabled: true + +target: + url: "https://www.amazon.com/dp/B09B8V1LZ3" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Scroll down to the reviews section and load more reviews by scrolling, then extract review details" + reasoning: "Users need to see multiple reviews beyond initial visible ones" + extraction_schema: + type: "object" + properties: + reviews: + type: "array" + items: + type: "object" + properties: + rating: + type: "string" + title: + type: "string" + author: + type: "string" + date: + type: "string" + verified: + type: "boolean" + content: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Used scroll_page tool to navigate to reviews section" + - "Scrolled within reviews area to load additional reviews" + - "Extracted multiple product reviews with ratings" + - "Each review includes rating, author, and content" + - "Successfully handled lazy-loaded review content" + - "Presented reviews in structured, readable format" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Amazon product page is loaded" + - "Check that page scrolled to reviews section" + - "Confirm additional reviews loaded after scrolling" + - "Ensure review content is fully visible" + +metadata: + tags: ["web-task", "scrolling", "reviews", "amazon", "e-commerce"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/scroll-003.yaml b/evals/data/web-task-agent/scroll-003.yaml new file mode 100644 index 0000000..df7eaba --- /dev/null +++ b/evals/data/web-task-agent/scroll-003.yaml @@ -0,0 +1,61 @@ +# News Article Progressive Loading - Web Task Agent +id: "scroll-003" +name: "News Article Progressive Loading" +description: "Test scrolling through news sites that load articles progressively" +enabled: true + +target: + url: "https://medium.com/topic/technology" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Scroll down to load more technology articles and extract titles and authors for at least 15 articles" + reasoning: "Testing progressive content loading on news/blog platforms" + extraction_schema: + type: "object" + properties: + articles: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + author: + type: "string" + reading_time: + type: "string" + preview: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Used scroll_page tool multiple times to load content" + - "Successfully loaded at least 15 
articles through scrolling" + - "Extracted article titles and author information" + - "Handled Medium's progressive loading mechanism" + - "Articles are from technology topic as requested" + - "Results presented in clear, numbered format" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Medium technology page is loaded" + - "Check that initial articles are visible" + - "Confirm scrolling loaded additional articles" + - "Ensure at least 15 articles are visible after scrolling" + +metadata: + tags: ["web-task", "scrolling", "progressive-loading", "medium", "articles"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/scroll-004.yaml b/evals/data/web-task-agent/scroll-004.yaml new file mode 100644 index 0000000..e9b3534 --- /dev/null +++ b/evals/data/web-task-agent/scroll-004.yaml @@ -0,0 +1,61 @@ +# Search Results Infinite Scroll - Web Task Agent +id: "scroll-004" +name: "Search Results Infinite Scroll" +description: "Test handling search results that use infinite scroll instead of pagination" +enabled: true + +target: + url: "https://www.pinterest.com/search/pins/?q=web%20design" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for \"web design\" pins and scroll to load at least 30 results, then extract pin details" + reasoning: "Testing infinite scroll on visual search platforms" + extraction_schema: + type: "object" + properties: + pins: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + description: + type: "string" + saves: + type: "string" + source: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully performed search for \"web design\" pins" + - "Used scroll_page tool to trigger infinite scroll loading" + - "Loaded at least 30 pins through scrolling actions" + - "Extracted pin titles and metadata" + - "Handled Pinterest's masonry layout and lazy loading" + - "Results are well-organized and readable" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Pinterest search results for web design" + - "Check initial pins are displayed" + - "Confirm scrolling loaded many more pins" + - "Ensure grid layout shows 30+ pins after scrolling" + +metadata: + tags: ["web-task", "scrolling", "infinite-scroll", "pinterest", "visual-search"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/scroll-005.yaml b/evals/data/web-task-agent/scroll-005.yaml new file mode 100644 index 0000000..47c8769 --- /dev/null +++ b/evals/data/web-task-agent/scroll-005.yaml @@ -0,0 +1,73 @@ +# Google Flights Scroll and Show More - Web Task Agent +id: "scroll-005" +name: "Google Flights Scroll and Show More" +description: "Test scrolling and clicking \"Show more flights\" button on Google Flights to load additional flight options" +enabled: true + +target: + url: "https://www.google.com/travel/flights?sca_esv=646eedf97dcc8cf2&source=flun&uitype=cuAA&hl=en&gl=us&curr=USD&tfs=CAEQAhoeEgoyMDI2LTAzLTIwagcIARIDU0VBcgcIARIDTlJUGh4SCjIwMjYtMDMtMzBqBwgBEgNOUlRyBwgBEgNTRUF6aENqUklhVFJJTVVwVlZVOXpNakJCUTJodGVFRkNSeTB0TFMwdExTMHRjR3BpYjI4eE0wRkJRVUZCUjJoc1lsWlZRV2RYUlZsQkVnTmpTMFVhQ3dqUXNnVVFBaG9EVlZORU9EQncwTElG" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: 
"web_task_agent" +timeout: 180000 + +input: + task: "Extract the initial flight results, then scroll down and click \"Show more flights\" button to load additional flights. Extract at least 20 total flight options from Seattle to Tokyo." + reasoning: "Testing combination of scrolling and button clicking to load more flight results on Google Flights" + extraction_schema: + type: "object" + properties: + flights: + type: "array" + items: + type: "object" + properties: + airline: + type: "string" + departure_time: + type: "string" + arrival_time: + type: "string" + duration: + type: "string" + stops: + type: "string" + price: + type: "string" + aircraft: + type: "string" + total_flights_found: + type: "number" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully extracted initial flight results from Google Flights" + - "Used scroll_page tool to scroll down the flight results list" + - "Located and clicked \"Show more flights\" button using action_agent" + - "Loaded additional flight options beyond the initial set" + - "Extracted at least 20 total flights from Seattle (SEA) to Tokyo (NRT)" + - "Each flight includes airline, times, duration, stops, and price" + - "Flights are for the correct dates (March 20-30, 2026)" + - "Results are presented in clear, numbered format" + - "Successfully combined scrolling and clicking actions to load more content" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Google Flights page shows SEA to NRT flights" + - "Check that initial flight results are displayed" + - "Confirm scrolling occurred and \"Show more flights\" button was visible" + - "Ensure additional flights loaded after clicking the button" + - "Verify at least 20 flight options are now visible" + +metadata: + tags: ["web-task", "scrolling", "google-flights", "click-action", "load-more", "travel"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/search-001.yaml b/evals/data/web-task-agent/search-001.yaml new file mode 100644 index 0000000..da3a4eb --- /dev/null +++ b/evals/data/web-task-agent/search-001.yaml @@ -0,0 +1,41 @@ +# Basic web task search evaluation (matches DevTools test case) +id: "search-001" +name: "Site-Specific Search Task" +description: "Test web task agent orchestrating a search workflow on a specific site" +enabled: true + +target: + url: "chrome://new-tab-page" + +tool: "web_task_agent" +timeout: 60000 + +input: + task: "Search Google for \"Chrome DevTools automation\" and extract the top 3 search results" + reasoning: "Testing basic site-specific search workflow orchestration" + context: "Need to demonstrate web_task_agent can coordinate multiple action_agent calls for a complete search workflow" + + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4o" + criteria: + - "Successfully returned exactly 3 search results in structured text format" + - "Each result is numbered (1., 2., 3.) 
and contains a title related to \"Chrome DevTools automation\"" + - "Each result includes a URL in the format \"URL: [link]\"" + - "Results are presented in a clear, readable text format (not JSON)" + - "Response includes a brief summary or conclusion statement" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify search was completed and results page is showing" + - "Check that search results are related to \"Chrome DevTools automation\"" + - "Confirm at least 3 search results are visible on the page" + - "Ensure the search workflow was completed successfully" + +metadata: + tags: ["web-task", "orchestration", "search", "workflow", "google", "basic"] + priority: "normal" \ No newline at end of file diff --git a/evals/data/web-task-agent/social-001.yaml b/evals/data/web-task-agent/social-001.yaml new file mode 100644 index 0000000..a35ebfd --- /dev/null +++ b/evals/data/web-task-agent/social-001.yaml @@ -0,0 +1,60 @@ +# Social Media Content Extraction - Web Task Agent +id: "social-001" +name: "Social Media Content Extraction" +description: "Test extracting trending topics and posts from social media" +enabled: true + +target: + url: "https://twitter.com/explore" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Extract the top 5 trending topics from Twitter/X explore page" + reasoning: "User wants to stay updated on current trends" + extraction_schema: + type: "object" + properties: + trends: + type: "array" + items: + type: "object" + properties: + topic: + type: "string" + posts_count: + type: "string" + category: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully accessed Twitter/X explore page and found trending topics" + - "Returned exactly 5 trending topics as requested" + - "Each topic includes the trend name/hashtag" + - "Post counts or metrics are included when available" + - "Topics are current/recent trends (not outdated)" + - "Results are presented in clear, numbered text format (not JSON)" + - "Each trend is properly numbered (1., 2., 3., etc.) 
for readability" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Twitter/X explore page is loaded" + - "Check that trending topics section is visible" + - "Confirm trending topics show names and post counts" + - "Ensure page shows current trending content" + +metadata: + tags: ["web-task", "social-media", "twitter", "trends", "extraction", "popular"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/web-task-agent-booking-001.yaml b/evals/data/web-task-agent/web-task-agent-booking-001.yaml new file mode 100644 index 0000000..a2842b6 --- /dev/null +++ b/evals/data/web-task-agent/web-task-agent-booking-001.yaml @@ -0,0 +1,45 @@ +# Hotel Search Workflow - Web Task Agent +id: "web-task-agent-booking-001" +name: "Hotel Search Workflow" +description: "Test web task agent orchestrating complex multi-step booking search" +enabled: true + +target: + url: "https://www.booking.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for hotels in San Francisco for 2 adults, check-in March 15, check-out March 17" + reasoning: "Customer is looking for travel booking" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully searched for hotels in San Francisco" + - "Results show hotels available for March 15-17 dates" + - "Guest count of 2 adults is reflected in the search results" + - "Returned multiple hotel options with relevant details" + - "Each hotel includes essential information (name, price, location)" + - "Results are presented in a clear, readable format" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify hotel search results are displayed for San Francisco" + - "Check that dates March 15-17 are correctly selected" + - "Confirm guest count shows 2 adults" + - "Ensure search results show hotels with availability for specified dates" + +metadata: + tags: ["web-task", "booking", "workflow", "multi-step", "travel", "complex"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/web-task-agent-ecommerce-001.yaml b/evals/data/web-task-agent/web-task-agent-ecommerce-001.yaml new file mode 100644 index 0000000..a6b9735 --- /dev/null +++ b/evals/data/web-task-agent/web-task-agent-ecommerce-001.yaml @@ -0,0 +1,53 @@ +# E-commerce web task evaluation (matches DevTools test case) +id: "web-task-agent-ecommerce-001" +name: "E-commerce Product Search" +description: "Test web task agent handling product search on shopping site" +enabled: true + +target: + url: "https://www.amazon.com" + +tool: "web_task_agent" +timeout: 90000 + +input: + task: "Search Amazon for \"wireless headphones\" and find products under $100" + reasoning: "Testing e-commerce search workflow with price filtering" + context: "User wants to find wireless headphones with specific price constraint" + extraction_schema: + type: "object" + properties: + products: + type: "array" + items: + type: "object" + properties: + name: + type: "string" + price: + type: "string" + rating: + type: "string" + url: + type: "string" + + +validation: + type: "hybrid" + llm_judge: + model: "gpt-4o" + criteria: + - "Successfully navigated to product search" + - "Applied appropriate filters correctly" + - "Extracted product details accurately" + - 
"Provided meaningful comparison of features" + - "Stayed within specified price range" + snapshot: + structure_only: true + exclude_paths: + - "timestamp" + - "sessionId" + +metadata: + tags: ["web-task", "multi-step", "ecommerce", "search"] + priority: "high" \ No newline at end of file diff --git a/evals/data/web-task-agent/web-task-agent-error-001.yaml b/evals/data/web-task-agent/web-task-agent-error-001.yaml new file mode 100644 index 0000000..cc5c7df --- /dev/null +++ b/evals/data/web-task-agent/web-task-agent-error-001.yaml @@ -0,0 +1,45 @@ +# Error Recovery Workflow - Web Task Agent +id: "web-task-agent-error-001" +name: "Error Recovery Workflow" +description: "Test web task agent handling action_agent failures and retry logic" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for \"nonexistent test query 12345\" and handle any issues that arise" + reasoning: "Customer is asking for this response" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Attempted to search for the unusual query \"nonexistent test query 12345\"" + - "Either found some results OR provided clear explanation why no results were found" + - "Response handles the edge case gracefully without errors" + - "If no results found, suggested alternative actions or explanations" + - "Maintained professional tone despite unusual request" + - "Final output is coherent and helpful to the user" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Check if search was attempted despite unusual query" + - "Verify error handling did not break the page interaction" + - "Confirm agent attempted to complete the task or provided clear error info" + - "Ensure page is still functional after error recovery attempts" + +metadata: + tags: ["web-task", "error-recovery", "retry", "orchestration", "robustness"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/web-task-agent-extract-001.yaml b/evals/data/web-task-agent/web-task-agent-extract-001.yaml new file mode 100644 index 0000000..14eadcb --- /dev/null +++ b/evals/data/web-task-agent/web-task-agent-extract-001.yaml @@ -0,0 +1,60 @@ +# Structured Data Extraction - Web Task Agent +id: "web-task-agent-extract-001" +name: "Structured Data Extraction" +description: "Test web task agent extracting structured data from search results" +enabled: true + +target: + url: "https://news.ycombinator.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Extract the top 5 Hacker News stories with their titles, scores, and comment counts" + reasoning: "User is looking to understand the top stories on Hacker News" + extraction_schema: + type: "object" + properties: + stories: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + score: + type: "number" + comments: + type: "number" + url: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully returned exactly 5 Hacker News stories in structured text format" + - "Each story is numbered (1., 2., 3., 4., 5.) 
with title, score, comments, and URL" + - "Results are presented in readable text format similar to the example provided" + - "Response includes all required fields: title, score, comments count, URL" + - "Maintained proper orchestration pattern throughout the extraction process" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Hacker News homepage is loaded and displaying stories" + - "Check that top stories are visible with scores and comment counts" + - "Confirm story titles and metadata are clearly displayed" + - "Ensure page structure allows for data extraction" + +metadata: + tags: ["web-task", "data-extraction", "structured-data", "hackernews", "schema"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/web-task-agent-finance-001.yaml b/evals/data/web-task-agent/web-task-agent-finance-001.yaml new file mode 100644 index 0000000..8f7a2b0 --- /dev/null +++ b/evals/data/web-task-agent/web-task-agent-finance-001.yaml @@ -0,0 +1,68 @@ +# Stock Information Research - Web Task Agent +id: "web-task-agent-finance-001" +name: "Stock Information Research" +description: "Test extracting stock prices and financial information" +enabled: true + +target: + url: "https://finance.yahoo.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for Apple (AAPL) stock information and extract current price, market cap, and recent performance" + reasoning: "Users need automated financial data collection for investment decisions" + extraction_schema: + type: "object" + properties: + stock_info: + type: "object" + properties: + symbol: + type: "string" + company_name: + type: "string" + current_price: + type: "string" + change: + type: "string" + change_percent: + type: "string" + market_cap: + type: "string" + pe_ratio: + type: "string" + volume: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully found Apple (AAPL) stock information" + - "Current stock price is clearly stated" + - "Market cap information is included" + - "Price change and percentage change are provided" + - "Additional metrics (PE ratio, volume) included when available" + - "Financial data is current and presented in readable text format (not JSON)" + - "Stock information is well-organized and easy to understand" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Yahoo Finance shows Apple (AAPL) stock page" + - "Check that current stock price and change are visible" + - "Confirm market cap and trading volume are displayed" + - "Ensure financial metrics and charts are shown" + +metadata: + tags: ["web-task", "finance", "stocks", "yahoo-finance", "investment", "popular"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/web-task-agent-flight-001.yaml b/evals/data/web-task-agent/web-task-agent-flight-001.yaml new file mode 100644 index 0000000..a17883f --- /dev/null +++ b/evals/data/web-task-agent/web-task-agent-flight-001.yaml @@ -0,0 +1,45 @@ +# Complex Flight Search - Web Task Agent +id: "web-task-agent-flight-001" +name: "Complex Flight Search" +description: "Test web task agent handling complex flight search with multiple criteria" +enabled: true + +target: + url: "https://www.kayak.com" + wait_for: 
"networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for round-trip flights from Seattle (SEA) to Tokyo (NRT) departing March 20, returning March 30" + reasoning: "Customer is looking for finding the best flight options" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully found round-trip flights from Seattle (SEA) to Tokyo (NRT)" + - "Flight results show March 20 departure date" + - "Flight results show March 30 return date" + - "Returned multiple flight options with airlines and prices" + - "Each flight includes essential details (times, airlines, prices)" + - "Results clearly distinguish between outbound and return flights" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify flight search results are displayed" + - "Check SEA to NRT route is correctly selected" + - "Confirm dates March 20 departure and March 30 return" + - "Ensure flight options are showing with prices and airlines" + +metadata: + tags: ["web-task", "flight", "travel", "multi-step", "kayak", "round-trip"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/web-task-agent-food-001.yaml b/evals/data/web-task-agent/web-task-agent-food-001.yaml new file mode 100644 index 0000000..32ee646 --- /dev/null +++ b/evals/data/web-task-agent/web-task-agent-food-001.yaml @@ -0,0 +1,68 @@ +# Restaurant Search and Menu Extraction - Web Task Agent +id: "web-task-agent-food-001" +name: "Restaurant Search and Menu Extraction" +description: "Test searching restaurants and extracting menu information" +enabled: true + +target: + url: "https://www.yelp.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for \"Italian restaurants near me\" in San Francisco and extract restaurant details" + reasoning: "Users want to quickly compare restaurants, menus, and reviews" + extraction_schema: + type: "object" + properties: + restaurants: + type: "array" + items: + type: "object" + properties: + name: + type: "string" + rating: + type: "string" + price_range: + type: "string" + cuisine: + type: "string" + address: + type: "string" + phone: + type: "string" + hours: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully found Italian restaurants in San Francisco" + - "Each restaurant includes name, rating, and price range" + - "Location/address information is provided for each restaurant" + - "Contact details (phone/hours) included when available" + - "All restaurants listed serve Italian cuisine" + - "Results are presented in clear, structured text format (not JSON)" + - "Restaurants are numbered or organized clearly for easy comparison" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Yelp search results for Italian restaurants" + - "Check that restaurants show ratings and price ranges" + - "Confirm location filter shows San Francisco results" + - "Ensure restaurant listings include contact information" + +metadata: + tags: ["web-task", "restaurants", "yelp", "food", "local-search", "popular"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/web-task-agent-iframe-001.yaml 
b/evals/data/web-task-agent/web-task-agent-iframe-001.yaml new file mode 100644 index 0000000..30b0eac --- /dev/null +++ b/evals/data/web-task-agent/web-task-agent-iframe-001.yaml @@ -0,0 +1,83 @@ +# ANA Airlines Iframe Content Extraction - Web Task Agent +id: "web-task-agent-iframe-001" +name: "ANA Airlines Iframe Content Extraction" +description: "Test web task agent handling iframe-heavy airline booking sites like ANA Airlines" +enabled: true + +target: + url: "https://aswbe.ana.co.jp/webapps/reservation/flight-search?CONNECTION_KIND=SEA&LANG=en&hiddenSearchMode=ROUND_TRIP&departureDate:field=20260320&returnDate:field=20260330&departureAirportCode:field=SEA&arrivalAirportCode:field=NRT&adultCount=1&youngAdultCount=0&childCount=0&infantCount=0&boardingClass=INTY001&searchFlag=1" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Navigate the ANA Airlines flight search page and extract available flight options from Seattle (SEA) to Tokyo Narita (NRT) for March 20-30, 2026. Handle any iframe content and booking interface elements." + reasoning: "Testing iframe content extraction and complex airline booking site navigation" + extraction_schema: + type: "object" + properties: + flights: + type: "array" + items: + type: "object" + properties: + flight_number: + type: "string" + airline: + type: "string" + departure_time: + type: "string" + arrival_time: + type: "string" + departure_date: + type: "string" + arrival_date: + type: "string" + duration: + type: "string" + aircraft: + type: "string" + price: + type: "string" + cabin_class: + type: "string" + stops: + type: "string" + booking_interface_status: + type: "string" + iframe_content_found: + type: "boolean" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully navigated ANA Airlines booking interface" + - "Handled iframe content correctly (iframe_content_found should be true if iframes detected)" + - "Extracted flight information from ANA flight search results" + - "Flight details include ANA flight numbers and accurate route (SEA to NRT)" + - "Extracted pricing information in appropriate currency" + - "Handled any booking interface elements, popups, or navigation flows" + - "Results show flights for the correct dates (March 20-30, 2026)" + - "Successfully demonstrated iframe content extraction capabilities" + - "Booking interface status indicates successful page interaction" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify ANA Airlines flight search page loaded correctly" + - "Check that search parameters show SEA to NRT route" + - "Confirm flight results are displayed (may be in iframes)" + - "Ensure booking interface elements are functional" + - "Verify flight information is accessible and extractable" + +metadata: + tags: ["web-task", "iframe", "ana-airlines", "complex-booking", "international-flight", "airline-specific"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/web-task-agent-jobs-001.yaml b/evals/data/web-task-agent/web-task-agent-jobs-001.yaml new file mode 100644 index 0000000..2c72df3 --- /dev/null +++ b/evals/data/web-task-agent/web-task-agent-jobs-001.yaml @@ -0,0 +1,68 @@ +# Job Search Workflow - Web Task Agent +id: "web-task-agent-jobs-001" +name: "Job Search Workflow" +description: "Test web task agent orchestrating job search on LinkedIn" +enabled: 
true + +target: + url: "https://www.linkedin.com/jobs" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for \"Software Engineer\" jobs in \"San Francisco\" and extract details for the first 5 results" + reasoning: "User wants to find job opportunities in tech industry" + extraction_schema: + type: "object" + properties: + jobs: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + company: + type: "string" + location: + type: "string" + salary: + type: "string" + description: + type: "string" + url: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Either used construct_direct_url for LinkedIn job search OR used traditional form interaction" + - "If using direct URL: constructed proper LinkedIn job search URL with keywords and location" + - "If using forms: delegated keyword and location input to action_agent" + - "Extracted job listings using extract_data" + - "Returned structured job data in readable text format (not JSON)" + - "Each job listing includes title, company, location, and other relevant fields" + - "Results are numbered or organized clearly for easy reading" + - "Demonstrated proper workflow orchestration for job search" + - "Never used direct browser interaction tools" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify LinkedIn job search results are displayed" + - "Check that search shows Software Engineer jobs in San Francisco" + - "Confirm job listings include company names and titles" + - "Ensure at least 5 job results are visible" + +metadata: + tags: ["web-task", "jobs", "linkedin", "search", "career", "popular"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/web-task-agent-learning-001.yaml b/evals/data/web-task-agent/web-task-agent-learning-001.yaml new file mode 100644 index 0000000..8dcdc7d --- /dev/null +++ b/evals/data/web-task-agent/web-task-agent-learning-001.yaml @@ -0,0 +1,69 @@ +# Online Course Search - Web Task Agent +id: "web-task-agent-learning-001" +name: "Online Course Search" +description: "Test searching and extracting course information from learning platforms" +enabled: true + +target: + url: "https://www.coursera.org" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for \"Machine Learning\" courses and extract details for top 5 results" + reasoning: "Users want to compare courses across platforms for learning decisions" + extraction_schema: + type: "object" + properties: + courses: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + instructor: + type: "string" + university: + type: "string" + rating: + type: "string" + duration: + type: "string" + price: + type: "string" + description: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully found Machine Learning courses on Coursera" + - "Returned details for top 5 courses as requested" + - "Each course includes title, instructor, university, and rating" + - "Duration and pricing information included for each course" + - "Course descriptions or key topics are provided" + - "Results are presented in structured text format (not JSON)" + - "Courses are numbered (1-5) and well-organized for easy comparison" + 
- "Each course entry is clearly formatted and readable" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Coursera search results for Machine Learning" + - "Check that courses show titles, instructors, and ratings" + - "Confirm course details include duration and pricing" + - "Ensure search results are relevant to Machine Learning" + +metadata: + tags: ["web-task", "education", "coursera", "courses", "learning", "popular"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/web-task-agent-nav-001.yaml b/evals/data/web-task-agent/web-task-agent-nav-001.yaml new file mode 100644 index 0000000..fdee2f4 --- /dev/null +++ b/evals/data/web-task-agent/web-task-agent-nav-001.yaml @@ -0,0 +1,46 @@ +# Site Navigation Workflow - Web Task Agent +id: "web-task-agent-nav-001" +name: "Site Navigation Workflow" +description: "Test web task agent orchestrating navigation between different sections of a site" +enabled: true + +target: + url: "https://www.wikipedia.org" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 90000 + +input: + task: "Navigate to the Wikipedia homepage, search for \"artificial intelligence\", and find information about machine learning" + reasoning: "User is looking to explore Wikipedia content through structured navigation" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Orchestrated Wikipedia search via action_agent calls" + - "Navigated to artificial intelligence article through action_agent" + - "Located machine learning section via action_agent coordination" + - "Extracted relevant information about machine learning" + - "Demonstrated multi-step navigation workflow" + - "Maintained orchestration pattern throughout navigation" + - "Provided structured summary of found information" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify navigation reached artificial intelligence Wikipedia page" + - "Check that machine learning section or content is visible" + - "Confirm successful navigation through multiple page sections" + - "Ensure content related to machine learning is displayed" + +metadata: + tags: ["web-task", "navigation", "multi-step", "wikipedia", "content-exploration"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/web-task-agent-news-001.yaml b/evals/data/web-task-agent/web-task-agent-news-001.yaml new file mode 100644 index 0000000..d9e1934 --- /dev/null +++ b/evals/data/web-task-agent/web-task-agent-news-001.yaml @@ -0,0 +1,64 @@ +# News Article Aggregation - Web Task Agent +id: "web-task-agent-news-001" +name: "News Article Aggregation" +description: "Test aggregating news headlines and summaries from news sites" +enabled: true + +target: + url: "https://news.ycombinator.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Extract the top 10 Hacker News stories with titles, scores, and first few comments" + reasoning: "Users want automated news monitoring for research and awareness" + extraction_schema: + type: "object" + properties: + articles: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + score: + type: "number" + comments_count: + type: "number" + url: + type: "string" + top_comment: + type: 
"string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully extracted 10 Hacker News stories as requested" + - "Each story includes title, score, and comment count" + - "URLs are provided for each story" + - "Stories appear to be from the current top/front page" + - "Results are presented in clear, numbered text format (1-10), not JSON" + - "All required fields are present and properly formatted in readable text" + - "Each story is clearly separated and easy to read" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Hacker News stories are visible with scores" + - "Check that story titles and comment counts are shown" + - "Confirm top stories section is properly displayed" + - "Ensure story metadata is accessible for extraction" + +metadata: + tags: ["web-task", "news", "hackernews", "aggregation", "popular"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/web-task-agent-realestate-001.yaml b/evals/data/web-task-agent/web-task-agent-realestate-001.yaml new file mode 100644 index 0000000..f22bc13 --- /dev/null +++ b/evals/data/web-task-agent/web-task-agent-realestate-001.yaml @@ -0,0 +1,70 @@ +# Real Estate Property Search - Web Task Agent +id: "web-task-agent-realestate-001" +name: "Real Estate Property Search" +description: "Test property search workflow on real estate platforms" +enabled: true + +target: + url: "https://www.zillow.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for houses for sale in Austin, Texas under $500k and extract property details" + reasoning: "User wants to find affordable housing options in a specific location" + extraction_schema: + type: "object" + properties: + properties: + type: "array" + items: + type: "object" + properties: + address: + type: "string" + price: + type: "string" + bedrooms: + type: "number" + bathrooms: + type: "number" + sqft: + type: "string" + lot_size: + type: "string" + year_built: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Orchestrated location search via action_agent" + - "Delegated price filter setting to action_agent" + - "Coordinated property type selection through action_agent" + - "Applied search filters through proper action_agent calls" + - "Extracted property listings with extract_data" + - "Returned structured property data in readable text format (not JSON)" + - "Each property includes address, price, bedrooms, bathrooms, and other key details" + - "Properties are clearly numbered or organized for easy comparison" + - "Demonstrated complex real estate search workflow orchestration" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Zillow search results for Austin, Texas properties" + - "Check that properties shown are under $500k" + - "Confirm property listings show price, beds, baths info" + - "Ensure search results match the specified criteria" + +metadata: + tags: ["web-task", "real-estate", "zillow", "property-search", "popular"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/web-task-agent-scroll-001.yaml b/evals/data/web-task-agent/web-task-agent-scroll-001.yaml new file mode 100644 index 0000000..6fd0f6e 
--- /dev/null +++ b/evals/data/web-task-agent/web-task-agent-scroll-001.yaml @@ -0,0 +1,61 @@ +# Infinite Scroll Content Loading - Web Task Agent +id: "web-task-agent-scroll-001" +name: "Infinite Scroll Content Loading" +description: "Test web task agent handling infinite scroll pages to load more content" +enabled: true + +target: + url: "https://twitter.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Scroll down the Twitter feed to load at least 20 tweets and extract their content" + reasoning: "Testing infinite scroll functionality for dynamic content loading" + extraction_schema: + type: "object" + properties: + tweets: + type: "array" + items: + type: "object" + properties: + author: + type: "string" + content: + type: "string" + likes: + type: "string" + retweets: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully used scroll_page tool to scroll down the page" + - "Loaded additional content through scrolling actions" + - "Extracted at least 20 tweets from the feed" + - "Each tweet includes author and content information" + - "Demonstrated proper handling of dynamically loaded content" + - "Results are presented in clear, numbered text format" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify initial Twitter feed is loaded" + - "Check that scrolling action loaded additional tweets" + - "Confirm at least 20 tweets are visible after scrolling" + - "Ensure page scrolled down significantly from initial position" + +metadata: + tags: ["web-task", "scrolling", "infinite-scroll", "dynamic-content", "twitter"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/web-task-agent-scroll-002.yaml b/evals/data/web-task-agent/web-task-agent-scroll-002.yaml new file mode 100644 index 0000000..d5d060a --- /dev/null +++ b/evals/data/web-task-agent/web-task-agent-scroll-002.yaml @@ -0,0 +1,65 @@ +# Product Review Scrolling - Web Task Agent +id: "web-task-agent-scroll-002" +name: "Product Review Scrolling" +description: "Test scrolling to load more product reviews on e-commerce sites" +enabled: true + +target: + url: "https://www.amazon.com/dp/B08N5WRWNW" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Scroll down to the reviews section and load more reviews by scrolling, then extract review details" + reasoning: "Users need to see multiple reviews beyond initial visible ones" + extraction_schema: + type: "object" + properties: + reviews: + type: "array" + items: + type: "object" + properties: + rating: + type: "string" + title: + type: "string" + author: + type: "string" + date: + type: "string" + verified: + type: "boolean" + content: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Used scroll_page tool to navigate to reviews section" + - "Scrolled within reviews area to load additional reviews" + - "Extracted multiple product reviews with ratings" + - "Each review includes rating, author, and content" + - "Successfully handled lazy-loaded review content" + - "Presented reviews in structured, readable format" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Amazon product page is loaded" + - 
"Check that page scrolled to reviews section" + - "Confirm additional reviews loaded after scrolling" + - "Ensure review content is fully visible" + +metadata: + tags: ["web-task", "scrolling", "reviews", "amazon", "e-commerce"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/web-task-agent-scroll-003.yaml b/evals/data/web-task-agent/web-task-agent-scroll-003.yaml new file mode 100644 index 0000000..f435017 --- /dev/null +++ b/evals/data/web-task-agent/web-task-agent-scroll-003.yaml @@ -0,0 +1,61 @@ +# News Article Progressive Loading - Web Task Agent +id: "web-task-agent-scroll-003" +name: "News Article Progressive Loading" +description: "Test scrolling through news sites that load articles progressively" +enabled: true + +target: + url: "https://medium.com/topic/technology" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Scroll down to load more technology articles and extract titles and authors for at least 15 articles" + reasoning: "Testing progressive content loading on news/blog platforms" + extraction_schema: + type: "object" + properties: + articles: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + author: + type: "string" + reading_time: + type: "string" + preview: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Used scroll_page tool multiple times to load content" + - "Successfully loaded at least 15 articles through scrolling" + - "Extracted article titles and author information" + - "Handled Medium's progressive loading mechanism" + - "Articles are from technology topic as requested" + - "Results presented in clear, numbered format" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Medium technology page is loaded" + - "Check that initial articles are visible" + - "Confirm scrolling loaded additional articles" + - "Ensure at least 15 articles are visible after scrolling" + +metadata: + tags: ["web-task", "scrolling", "progressive-loading", "medium", "articles"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/web-task-agent-scroll-004.yaml b/evals/data/web-task-agent/web-task-agent-scroll-004.yaml new file mode 100644 index 0000000..5970947 --- /dev/null +++ b/evals/data/web-task-agent/web-task-agent-scroll-004.yaml @@ -0,0 +1,61 @@ +# Search Results Infinite Scroll - Web Task Agent +id: "web-task-agent-scroll-004" +name: "Search Results Infinite Scroll" +description: "Test handling search results that use infinite scroll instead of pagination" +enabled: true + +target: + url: "https://www.pinterest.com/search/pins/?q=web%20design" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for \"web design\" pins and scroll to load at least 30 results, then extract pin details" + reasoning: "Testing infinite scroll on visual search platforms" + extraction_schema: + type: "object" + properties: + pins: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + description: + type: "string" + saves: + type: "string" + source: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully performed search for \"web design\" pins" + - "Used 
scroll_page tool to trigger infinite scroll loading" + - "Loaded at least 30 pins through scrolling actions" + - "Extracted pin titles and metadata" + - "Handled Pinterest's masonry layout and lazy loading" + - "Results are well-organized and readable" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Pinterest search results for web design" + - "Check initial pins are displayed" + - "Confirm scrolling loaded many more pins" + - "Ensure grid layout shows 30+ pins after scrolling" + +metadata: + tags: ["web-task", "scrolling", "infinite-scroll", "pinterest", "visual-search"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/web-task-agent-scroll-005.yaml b/evals/data/web-task-agent/web-task-agent-scroll-005.yaml new file mode 100644 index 0000000..e603ff7 --- /dev/null +++ b/evals/data/web-task-agent/web-task-agent-scroll-005.yaml @@ -0,0 +1,73 @@ +# Google Flights Scroll and Show More - Web Task Agent +id: "web-task-agent-scroll-005" +name: "Google Flights Scroll and Show More" +description: "Test scrolling and clicking \"Show more flights\" button on Google Flights to load additional flight options" +enabled: true + +target: + url: "https://www.google.com/travel/flights?sca_esv=646eedf97dcc8cf2&source=flun&uitype=cuAA&hl=en&gl=us&curr=USD&tfs=CAEQAhoeEgoyMDI2LTAzLTIwagcIARIDU0VBcgcIARIDTlJUGh4SCjIwMjYtMDMtMzBqBwgBEgNOUlRyBwgBEgNTRUF6aENqUklhVFJJTVVwVlZVOXpNakJCUTJodGVFRkNSeTB0TFMwdExTMHRjR3BpYjI4eE0wRkJRVUZCUjJoc1lsWlZRV2RYUlZsQkVnTmpTMFVhQ3dqUXNnVVFBaG9EVlZORU9EQncwTElG" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Extract the initial flight results, then scroll down and click \"Show more flights\" button to load additional flights. Extract at least 20 total flight options from Seattle to Tokyo." 
+ reasoning: "Testing combination of scrolling and button clicking to load more flight results on Google Flights" + extraction_schema: + type: "object" + properties: + flights: + type: "array" + items: + type: "object" + properties: + airline: + type: "string" + departure_time: + type: "string" + arrival_time: + type: "string" + duration: + type: "string" + stops: + type: "string" + price: + type: "string" + aircraft: + type: "string" + total_flights_found: + type: "number" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully extracted initial flight results from Google Flights" + - "Used scroll_page tool to scroll down the flight results list" + - "Located and clicked \"Show more flights\" button using action_agent" + - "Loaded additional flight options beyond the initial set" + - "Extracted at least 20 total flights from Seattle (SEA) to Tokyo (NRT)" + - "Each flight includes airline, times, duration, stops, and price" + - "Flights are for the correct dates (March 20-30, 2026)" + - "Results are presented in clear, numbered format" + - "Successfully combined scrolling and clicking actions to load more content" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Google Flights page shows SEA to NRT flights" + - "Check that initial flight results are displayed" + - "Confirm scrolling occurred and \"Show more flights\" button was visible" + - "Ensure additional flights loaded after clicking the button" + - "Verify at least 20 flight options are now visible" + +metadata: + tags: ["web-task", "scrolling", "google-flights", "click-action", "load-more", "travel"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/data/web-task-agent/web-task-agent-search-001.yaml b/evals/data/web-task-agent/web-task-agent-search-001.yaml new file mode 100644 index 0000000..50dc920 --- /dev/null +++ b/evals/data/web-task-agent/web-task-agent-search-001.yaml @@ -0,0 +1,41 @@ +# Basic web task search evaluation (matches DevTools test case) +id: "web-task-agent-search-001" +name: "Site-Specific Search Task" +description: "Test web task agent orchestrating a search workflow on a specific site" +enabled: true + +target: + url: "chrome://new-tab-page" + +tool: "web_task_agent" +timeout: 60000 + +input: + task: "Search Google for \"Chrome DevTools automation\" and extract the top 3 search results" + reasoning: "Testing basic site-specific search workflow orchestration" + context: "Need to demonstrate web_task_agent can coordinate multiple action_agent calls for a complete search workflow" + + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4o" + criteria: + - "Successfully returned exactly 3 search results in structured text format" + - "Each result is numbered (1., 2., 3.) 
and contains a title related to \"Chrome DevTools automation\"" + - "Each result includes a URL in the format \"URL: [link]\"" + - "Results are presented in a clear, readable text format (not JSON)" + - "Response includes a brief summary or conclusion statement" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify search was completed and results page is showing" + - "Check that search results are related to \"Chrome DevTools automation\"" + - "Confirm at least 3 search results are visible on the page" + - "Ensure the search workflow was completed successfully" + +metadata: + tags: ["web-task", "orchestration", "search", "workflow", "google", "basic"] + priority: "normal" \ No newline at end of file diff --git a/evals/data/web-task-agent/web-task-agent-social-001.yaml b/evals/data/web-task-agent/web-task-agent-social-001.yaml new file mode 100644 index 0000000..f1f969e --- /dev/null +++ b/evals/data/web-task-agent/web-task-agent-social-001.yaml @@ -0,0 +1,60 @@ +# Social Media Content Extraction - Web Task Agent +id: "web-task-agent-social-001" +name: "Social Media Content Extraction" +description: "Test extracting trending topics and posts from social media" +enabled: true + +target: + url: "https://twitter.com/explore" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Extract the top 5 trending topics from Twitter/X explore page" + reasoning: "User wants to stay updated on current trends" + extraction_schema: + type: "object" + properties: + trends: + type: "array" + items: + type: "object" + properties: + topic: + type: "string" + posts_count: + type: "string" + category: + type: "string" + + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully accessed Twitter/X explore page and found trending topics" + - "Returned exactly 5 trending topics as requested" + - "Each topic includes the trend name/hashtag" + - "Post counts or metrics are included when available" + - "Topics are current/recent trends (not outdated)" + - "Results are presented in clear, numbered text format (not JSON)" + - "Each trend is properly numbered (1., 2., 3., etc.) for readability" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Twitter/X explore page is loaded" + - "Check that trending topics section is visible" + - "Confirm trending topics show names and post counts" + - "Ensure page shows current trending content" + +metadata: + tags: ["web-task", "social-media", "twitter", "trends", "extraction", "popular"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/evals/lib/__init__.py b/evals/lib/__init__.py new file mode 100644 index 0000000..a6b245b --- /dev/null +++ b/evals/lib/__init__.py @@ -0,0 +1,19 @@ +""" +Evaluation framework library. 
+""" + +from .config_loader import ConfigLoader, get_config +from .eval_loader import EvalLoader, Evaluation +from .api_client import APIClient +from .judge import LLMJudge, SimpleJudge, JudgeResult + +__all__ = [ + 'ConfigLoader', + 'get_config', + 'EvalLoader', + 'Evaluation', + 'APIClient', + 'LLMJudge', + 'SimpleJudge', + 'JudgeResult' +] diff --git a/evals/lib/api_client.py b/evals/lib/api_client.py new file mode 100644 index 0000000..2214710 --- /dev/null +++ b/evals/lib/api_client.py @@ -0,0 +1,207 @@ +""" +API client for communicating with the evaluation server /v1/responses endpoint. +""" + +import requests +import time +from typing import Dict, Any, Optional + + +class APIClient: + """Client for interacting with /v1/responses API.""" + + def __init__(self, base_url: str, timeout: int = 300): + """ + Initialize API client. + + Args: + base_url: Base URL of the evaluation server (e.g., http://localhost:8080) + timeout: Request timeout in seconds + """ + self.base_url = base_url.rstrip('/') + self.timeout = timeout + + def send_request( + self, + input_message: str, + model_config: Optional[Dict[str, Dict[str, str]]] = None, + url: Optional[str] = None, + wait_timeout: Optional[int] = None + ) -> Dict[str, Any]: + """ + Send a request to /v1/responses API. + + Args: + input_message: The input prompt/question for the agent + model_config: Optional nested model configuration in format: + { + "main_model": {"provider": "...", "model": "...", "api_key": "..."}, + "mini_model": {"provider": "...", "model": "...", "api_key": "..."}, + "nano_model": {"provider": "...", "model": "...", "api_key": "..."} + } + url: Optional target URL to open the tab at (defaults to about:blank) + wait_timeout: Optional timeout in milliseconds to wait for page load (defaults to 5000) + + Returns: + Response dictionary with: + - success: bool + - response: str (extracted response text) + - raw_response: list (raw API response) + - execution_time_ms: int + - error: str (if any) + + Raises: + requests.exceptions.RequestException: On API request failures + """ + api_url = f"{self.base_url}/v1/responses" + + # Build request payload + payload = { + "input": input_message + } + + if model_config: + payload["model"] = model_config + + if url: + payload["url"] = url + + if wait_timeout is not None: + payload["wait_timeout"] = wait_timeout + + # Track execution time + start_time = time.time() + + try: + # Send POST request + response = requests.post( + api_url, + json=payload, + timeout=self.timeout, + headers={"Content-Type": "application/json"} + ) + + execution_time_ms = int((time.time() - start_time) * 1000) + + # Check for HTTP errors + response.raise_for_status() + + # Parse response + response_data = response.json() + + # Extract text from OpenAI Responses API format + response_text = self._extract_response_text(response_data) + + return { + "success": True, + "response": response_text, + "raw_response": response_data, + "execution_time_ms": execution_time_ms, + "error": None + } + + except requests.exceptions.Timeout: + execution_time_ms = int((time.time() - start_time) * 1000) + return { + "success": False, + "response": None, + "raw_response": None, + "execution_time_ms": execution_time_ms, + "error": f"Request timed out after {self.timeout} seconds" + } + + except requests.exceptions.HTTPError as e: + execution_time_ms = int((time.time() - start_time) * 1000) + error_msg = f"HTTP error: {e.response.status_code}" + try: + error_details = e.response.json() + error_msg += f" - {error_details.get('error', 
str(error_details))}" + except: + error_msg += f" - {e.response.text[:200]}" + + return { + "success": False, + "response": None, + "raw_response": None, + "execution_time_ms": execution_time_ms, + "error": error_msg + } + + except requests.exceptions.RequestException as e: + execution_time_ms = int((time.time() - start_time) * 1000) + return { + "success": False, + "response": None, + "raw_response": None, + "execution_time_ms": execution_time_ms, + "error": f"Request failed: {str(e)}" + } + + except Exception as e: + execution_time_ms = int((time.time() - start_time) * 1000) + return { + "success": False, + "response": None, + "raw_response": None, + "execution_time_ms": execution_time_ms, + "error": f"Unexpected error: {str(e)}" + } + + def _extract_response_text(self, response_data: Any) -> str: + """ + Extract response text from OpenAI Responses API format. + + Expected format: + [ + { + "id": "msg_...", + "type": "message", + "role": "assistant", + "content": [ + { + "type": "output_text", + "text": "Response text here", + "annotations": [] + } + ] + } + ] + + Args: + response_data: Raw API response + + Returns: + Extracted response text + """ + try: + if isinstance(response_data, list) and len(response_data) > 0: + message = response_data[0] + content = message.get('content', []) + + if isinstance(content, list) and len(content) > 0: + for item in content: + if item.get('type') == 'output_text': + return item.get('text', '') + + # Fallback: return first content item text + return content[0].get('text', '') + + # Fallback: convert to string + return str(response_data) + + except Exception as e: + return f"[Error extracting response: {e}]" + + def check_health(self) -> bool: + """ + Check if the API server is healthy. + + Returns: + True if server is reachable, False otherwise + """ + try: + url = f"{self.base_url}/status" + response = requests.get(url, timeout=5) + return response.status_code == 200 + except: + return False diff --git a/evals/lib/config_loader.py b/evals/lib/config_loader.py new file mode 100644 index 0000000..da43dfd --- /dev/null +++ b/evals/lib/config_loader.py @@ -0,0 +1,198 @@ +""" +Configuration loader for evaluation framework. +Loads config.yml and performs environment variable substitution. +""" + +import os +import re +import yaml +from pathlib import Path +from typing import Any, Dict + +try: + from dotenv import load_dotenv + DOTENV_AVAILABLE = True +except ImportError: + DOTENV_AVAILABLE = False + + +class ConfigLoader: + """Loads and manages evaluation framework configuration.""" + + def __init__(self, config_path: str = None): + """ + Initialize config loader. + + Args: + config_path: Path to config.yml. 
If None, looks in evals/config.yml + """ + # Load .env file if it exists + if DOTENV_AVAILABLE: + script_dir = Path(__file__).parent.parent + env_file = script_dir / ".env" + if env_file.exists(): + load_dotenv(env_file) + + if config_path is None: + # Default to config.yml in evals directory + script_dir = Path(__file__).parent.parent + config_path = script_dir / "config.yml" + + self.config_path = Path(config_path) + self.config = self._load_config() + + def _load_config(self) -> Dict[str, Any]: + """Load configuration from YAML file.""" + if not self.config_path.exists(): + raise FileNotFoundError(f"Config file not found: {self.config_path}") + + with open(self.config_path, 'r') as f: + config = yaml.safe_load(f) + + # Substitute environment variables + config = self._substitute_env_vars(config) + + return config + + def _substitute_env_vars(self, obj: Any) -> Any: + """ + Recursively substitute environment variables in config. + Supports ${VAR_NAME} and ${VAR_NAME:-default} syntax. + + Args: + obj: Config object (dict, list, str, etc.) + + Returns: + Object with environment variables substituted + """ + if isinstance(obj, dict): + return {k: self._substitute_env_vars(v) for k, v in obj.items()} + elif isinstance(obj, list): + return [self._substitute_env_vars(item) for item in obj] + elif isinstance(obj, str): + return self._substitute_env_var_in_string(obj) + else: + return obj + + def _substitute_env_var_in_string(self, value: str) -> str: + """ + Substitute environment variables in a string. + + Supports: + - ${VAR_NAME} - Required variable + - ${VAR_NAME:-default} - Variable with default value + + Args: + value: String that may contain env var references + + Returns: + String with variables substituted + """ + # Pattern for ${VAR_NAME} or ${VAR_NAME:-default} + pattern = r'\$\{([A-Z_][A-Z0-9_]*)(:-([^}]*))?\}' + + def replace_match(match): + var_name = match.group(1) + default_value = match.group(3) if match.group(3) is not None else None + + # Get environment variable + env_value = os.getenv(var_name) + + if env_value is not None: + return env_value + elif default_value is not None: + return default_value + else: + raise ValueError( + f"Environment variable ${{{var_name}}} not found and no default provided" + ) + + return re.sub(pattern, replace_match, value) + + def get_api_endpoint(self) -> str: + """Get API endpoint URL.""" + return self.config.get('api_endpoint', 'http://localhost:8080') + + def get_model_config(self, model_tier: str) -> Dict[str, str]: + """ + Get model configuration for a specific tier. + + Args: + model_tier: One of 'main_model', 'mini_model', 'nano_model', 'judge_model' + + Returns: + Dictionary with provider, model_name, api_key + """ + if model_tier not in self.config: + raise ValueError(f"Unknown model tier: {model_tier}") + + return self.config[model_tier] + + def get_nested_model_config(self) -> Dict[str, Dict[str, str]]: + """ + Get nested model configuration for API requests. 
+ + Returns: + Dictionary in format expected by /v1/responses API: + { + "main_model": {"provider": "...", "model": "...", "api_key": "..."}, + "mini_model": {"provider": "...", "model": "...", "api_key": "..."}, + "nano_model": {"provider": "...", "model": "...", "api_key": "..."} + } + """ + result = {} + + for tier in ['main_model', 'mini_model', 'nano_model']: + if tier in self.config: + model_config = self.config[tier] + result[tier] = { + 'provider': model_config['provider'], + 'model': model_config['model_name'], + 'api_key': model_config['api_key'] + } + + return result + + def get_judge_config(self) -> Dict[str, Any]: + """Get judge model configuration.""" + return self.config.get('judge_model', {}) + + def get_execution_config(self) -> Dict[str, Any]: + """Get execution settings.""" + return self.config.get('execution', {}) + + def get_reporting_config(self) -> Dict[str, Any]: + """Get reporting settings.""" + return self.config.get('reporting', {}) + + def get_default_limit(self) -> int: + """Get default limit for number of evaluations to run.""" + return self.config.get('execution', {}).get('default_limit', 20) + + def get_timeout(self) -> int: + """Get timeout for API requests in seconds.""" + return self.config.get('execution', {}).get('timeout', 300) + + def get_reports_dir(self) -> Path: + """Get reports directory path.""" + reports_dir = self.config.get('reporting', {}).get('reports_dir', 'reports') + # Make path relative to config file location + config_dir = self.config_path.parent + return config_dir / reports_dir + + +# Singleton instance +_config_loader = None + + +def get_config() -> ConfigLoader: + """ + Get global config loader instance. + + Returns: + ConfigLoader instance + """ + global _config_loader + if _config_loader is None: + _config_loader = ConfigLoader() + return _config_loader diff --git a/evals/lib/eval_loader.py b/evals/lib/eval_loader.py new file mode 100644 index 0000000..e1bc555 --- /dev/null +++ b/evals/lib/eval_loader.py @@ -0,0 +1,274 @@ +""" +Evaluation loader for discovering and loading YAML evaluation definitions. +""" + +import yaml +from pathlib import Path +from typing import List, Dict, Any, Optional + + +class Evaluation: + """Represents a single evaluation definition.""" + + def __init__(self, file_path: Path, data: Dict[str, Any]): + """ + Initialize evaluation. 
+ + Args: + file_path: Path to the YAML file + data: Parsed YAML data + """ + self.file_path = file_path + self.data = data + + # Extract key fields + self.id = data.get('id', file_path.stem) + self.name = data.get('name', self.id) + self.description = data.get('description', '') + self.enabled = data.get('enabled', True) + self.tool = data.get('tool', 'unknown') + self.timeout = data.get('timeout', 60000) + + # Target configuration + self.target = data.get('target', {}) + self.url = self.target.get('url', '') + + # Input configuration + self.input = data.get('input', {}) + + # Validation configuration + self.validation = data.get('validation', {}) + self.validation_type = self.validation.get('type', 'llm-judge') + + # Metadata + self.metadata = data.get('metadata', {}) + self.tags = self.metadata.get('tags', []) + self.priority = self.metadata.get('priority', 'medium') + + # Determine category from file path + self.category = self._determine_category() + + def _determine_category(self) -> str: + """Determine evaluation category from file path.""" + # Get parent directory name + parent_dir = self.file_path.parent.name + if parent_dir and parent_dir != 'data': + return parent_dir + return 'unknown' + + def get_input_message(self) -> str: + """ + Extract the input message for the evaluation. + + Returns: + Input message/prompt for the agent + """ + # For chat tool, extract message + if self.tool == 'chat': + return self.input.get('message', '') + + # For action_agent tool, extract objective + if self.tool == 'action_agent': + return self.input.get('objective', '') + + # For research_agent tool, extract query + if self.tool == 'research_agent': + return self.input.get('query', '') + + # For web_task_agent, extract task + if self.tool == 'web_task_agent': + return self.input.get('task', '') + + # For extract_data tools, return instruction + if self.tool in ['extract_data', 'extract_schema_streamlined']: + return self.input.get('instruction', 'Extract data according to schema') + + # For take_screenshot tool, describe the action + if self.tool == 'take_screenshot': + full_page = self.input.get('fullPage', False) + return f"Take {'full page' if full_page else 'viewport'} screenshot of {self.url}" + + # Fallback: return description + return self.description + + def get_validation_criteria(self) -> List[str]: + """ + Get validation criteria for LLM judge. + + Returns: + List of criteria strings + """ + llm_judge = self.validation.get('llm_judge', {}) + return llm_judge.get('criteria', []) + + def get_judge_model(self) -> str: + """Get the model specified for judging this evaluation.""" + llm_judge = self.validation.get('llm_judge', {}) + return llm_judge.get('model', 'gpt-4.1-mini') + + def get_target_url(self) -> Optional[str]: + """ + Get the target URL for this evaluation. + + Returns: + Target URL or None if not specified + """ + url = self.target.get('url', '') + return url if url else None + + def get_wait_timeout(self) -> Optional[int]: + """ + Get the wait timeout for page load. + + Returns: + Wait timeout in milliseconds or None if not specified + """ + return self.target.get('wait_timeout') + + def is_enabled(self) -> bool: + """Check if evaluation is enabled.""" + return self.enabled + + def __repr__(self): + return f"Evaluation(id={self.id}, name={self.name}, tool={self.tool}, category={self.category})" + + +class EvalLoader: + """Loads evaluation definitions from YAML files.""" + + def __init__(self, data_dir: str = None): + """ + Initialize eval loader. 
+ + Args: + data_dir: Path to data directory containing evaluation YAML files. + If None, uses evals/data/ + """ + if data_dir is None: + # Default to data/ in evals directory + script_dir = Path(__file__).parent.parent + data_dir = script_dir / "data" + + self.data_dir = Path(data_dir) + + if not self.data_dir.exists(): + raise FileNotFoundError(f"Data directory not found: {self.data_dir}") + + def load_from_directory( + self, + category: Optional[str] = None, + enabled_only: bool = True + ) -> List[Evaluation]: + """ + Load evaluations from directory. + + Args: + category: Optional category filter (subdirectory name). + If None, loads from all categories. + enabled_only: If True, only return enabled evaluations. + + Returns: + List of Evaluation objects + """ + evaluations = [] + + if category: + # Load from specific category directory + category_dir = self.data_dir / category + if not category_dir.exists(): + raise FileNotFoundError(f"Category directory not found: {category_dir}") + yaml_files = sorted(category_dir.glob("*.yaml")) + else: + # Load from all subdirectories + yaml_files = sorted(self.data_dir.glob("*/*.yaml")) + + for yaml_file in yaml_files: + try: + # Skip config.yaml files + if yaml_file.name == 'config.yaml': + continue + + with open(yaml_file, 'r') as f: + data = yaml.safe_load(f) + + if data is None: + continue + + evaluation = Evaluation(yaml_file, data) + + # Filter by enabled status + if enabled_only and not evaluation.is_enabled(): + continue + + evaluations.append(evaluation) + + except Exception as e: + print(f"Warning: Failed to load {yaml_file}: {e}") + continue + + return evaluations + + def load_by_id(self, eval_id: str) -> Optional[Evaluation]: + """ + Load a specific evaluation by ID. + + Args: + eval_id: Evaluation ID to load + + Returns: + Evaluation object or None if not found + """ + # Search all YAML files for matching ID + for yaml_file in self.data_dir.glob("*/*.yaml"): + try: + if yaml_file.name == 'config.yaml': + continue + + with open(yaml_file, 'r') as f: + data = yaml.safe_load(f) + + if data and data.get('id') == eval_id: + return Evaluation(yaml_file, data) + + except Exception: + continue + + return None + + def get_categories(self) -> List[str]: + """ + Get list of available evaluation categories. + + Returns: + List of category names (subdirectory names) + """ + categories = [] + for item in self.data_dir.iterdir(): + if item.is_dir() and not item.name.startswith('.'): + categories.append(item.name) + return sorted(categories) + + def count_evaluations(self, category: Optional[str] = None) -> int: + """ + Count evaluations in a category or all categories. + + Args: + category: Optional category filter + + Returns: + Number of evaluation files + """ + if category: + category_dir = self.data_dir / category + if not category_dir.exists(): + return 0 + return len(list(category_dir.glob("*.yaml"))) - \ + len(list(category_dir.glob("config.yaml"))) + else: + total = 0 + for subdir in self.data_dir.iterdir(): + if subdir.is_dir(): + yaml_files = list(subdir.glob("*.yaml")) + config_files = list(subdir.glob("config.yaml")) + total += len(yaml_files) - len(config_files) + return total diff --git a/evals/lib/judge.py b/evals/lib/judge.py new file mode 100644 index 0000000..0878c17 --- /dev/null +++ b/evals/lib/judge.py @@ -0,0 +1,244 @@ +""" +LLM-as-a-judge implementation for evaluating agent responses. 
+""" + +import json +from typing import Dict, Any, List +from openai import OpenAI + + +class JudgeResult: + """Result of judging an evaluation.""" + + def __init__( + self, + passed: bool, + score: float, + reasoning: str, + criteria_results: Dict[str, bool] = None + ): + """ + Initialize judge result. + + Args: + passed: Whether the evaluation passed + score: Numerical score (0-1) + reasoning: Explanation of the judgment + criteria_results: Dict mapping criterion to pass/fail + """ + self.passed = passed + self.score = score + self.reasoning = reasoning + self.criteria_results = criteria_results or {} + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary representation.""" + return { + "passed": self.passed, + "score": self.score, + "reasoning": self.reasoning, + "criteria_results": self.criteria_results + } + + +class LLMJudge: + """LLM-based judge for evaluating agent responses.""" + + def __init__( + self, + provider: str, + model_name: str, + api_key: str, + temperature: float = None + ): + """ + Initialize LLM judge. + + Args: + provider: Provider name (currently only "openai" supported) + model_name: Model name (e.g., "gpt-4") + api_key: API key for the provider + temperature: Sampling temperature (optional, None uses model default) + """ + self.provider = provider + self.model_name = model_name + self.api_key = api_key + self.temperature = temperature + + if provider == "openai": + self.client = OpenAI(api_key=api_key) + else: + raise ValueError(f"Unsupported judge provider: {provider}") + + def judge( + self, + input_prompt: str, + response: str, + criteria: List[str] + ) -> JudgeResult: + """ + Judge a response against evaluation criteria. + + Args: + input_prompt: The original input/prompt sent to the agent + response: The agent's response to evaluate + criteria: List of criteria strings to evaluate against + + Returns: + JudgeResult with pass/fail, score, and reasoning + """ + # Build judgment prompt + judge_prompt = self._build_judge_prompt(input_prompt, response, criteria) + + try: + # Build API call parameters + call_params = { + "model": self.model_name, + "messages": [ + { + "role": "system", + "content": "You are an expert evaluator assessing AI agent responses. " + "Provide objective, detailed assessments based on the given criteria." + }, + { + "role": "user", + "content": judge_prompt + } + ], + "response_format": {"type": "json_object"} + } + + # Only add temperature if it's specified + if self.temperature is not None: + call_params["temperature"] = self.temperature + + # Call LLM to judge + completion = self.client.chat.completions.create(**call_params) + + # Parse response + result_text = completion.choices[0].message.content + result_data = json.loads(result_text) + + # Extract fields + passed = result_data.get("passed", False) + score = result_data.get("score", 0.0) + reasoning = result_data.get("reasoning", "") + criteria_results = result_data.get("criteria_results", {}) + + return JudgeResult( + passed=passed, + score=score, + reasoning=reasoning, + criteria_results=criteria_results + ) + + except Exception as e: + # Return failure result on error + return JudgeResult( + passed=False, + score=0.0, + reasoning=f"Judge evaluation failed: {str(e)}", + criteria_results={} + ) + + def _build_judge_prompt( + self, + input_prompt: str, + response: str, + criteria: List[str] + ) -> str: + """ + Build the judgment prompt for the LLM. 
+ + Args: + input_prompt: Original input + response: Agent's response + criteria: List of evaluation criteria + + Returns: + Formatted prompt string + """ + criteria_list = "\n".join([f"{i+1}. {c}" for i, c in enumerate(criteria)]) + + prompt = f"""Evaluate the following AI agent response against the specified criteria. + +## Original Input/Task +{input_prompt} + +## Agent's Response +{response} + +## Evaluation Criteria +{criteria_list} + +## Your Task +Evaluate whether the agent's response satisfies each criterion. Provide your assessment in JSON format with the following structure: + +{{ + "passed": true/false, // Overall pass/fail + "score": 0.0-1.0, // Numerical score (0=complete failure, 1=perfect) + "reasoning": "Detailed explanation of your assessment", + "criteria_results": {{ + "Criterion 1 text": true/false, + "Criterion 2 text": true/false, + ... + }} +}} + +Be strict but fair in your evaluation. A response should only pass if it genuinely satisfies the criteria. +""" + return prompt + + +class SimpleJudge: + """Simple keyword-based judge for basic evaluations (fallback).""" + + def judge( + self, + input_prompt: str, + response: str, + criteria: List[str] + ) -> JudgeResult: + """ + Simple keyword-based judgment. + + Args: + input_prompt: Original input + response: Agent's response + criteria: List of criteria (used as keywords) + + Returns: + JudgeResult + """ + if not response: + return JudgeResult( + passed=False, + score=0.0, + reasoning="No response provided", + criteria_results={} + ) + + # Check if response contains keywords from criteria + response_lower = response.lower() + matches = 0 + total = len(criteria) + + criteria_results = {} + for criterion in criteria: + # Extract key terms from criterion + words = criterion.lower().split() + # Check if any significant words appear in response + matched = any(word in response_lower for word in words if len(word) > 4) + criteria_results[criterion] = matched + if matched: + matches += 1 + + score = matches / total if total > 0 else 0.0 + passed = score >= 0.7 # 70% threshold + + return JudgeResult( + passed=passed, + score=score, + reasoning=f"Keyword matching: {matches}/{total} criteria matched", + criteria_results=criteria_results + ) diff --git a/evals/pyproject.toml b/evals/pyproject.toml new file mode 100644 index 0000000..c1c2e99 --- /dev/null +++ b/evals/pyproject.toml @@ -0,0 +1,38 @@ +[project] +name = "browser-agent-evals" +version = "0.1.0" +description = "Evaluation framework for browser automation agents using LLM-as-a-judge" +readme = "README.md" +requires-python = ">=3.9" +dependencies = [ + "PyYAML>=6.0", + "requests>=2.31.0", + "openai>=1.0.0", + "python-dotenv>=1.0.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.0.0", + "black>=23.0.0", + "ruff>=0.1.0", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["lib"] + +[tool.ruff] +line-length = 100 +target-version = "py39" + +[tool.ruff.lint] +select = ["E", "F", "I", "N", "W"] +ignore = [] + +[tool.black] +line-length = 100 +target-version = ["py39"] diff --git a/evals/requirements.txt b/evals/requirements.txt new file mode 100644 index 0000000..fc099f9 --- /dev/null +++ b/evals/requirements.txt @@ -0,0 +1,13 @@ +# Evaluation Framework Dependencies + +# YAML parsing +PyYAML>=6.0 + +# HTTP client +requests>=2.31.0 + +# OpenAI API client for LLM judge +openai>=1.0.0 + +# Environment variable management +python-dotenv>=1.0.0 diff --git a/evals/run_action_agent.py 
b/evals/run_action_agent.py new file mode 100755 index 0000000..52858e5 --- /dev/null +++ b/evals/run_action_agent.py @@ -0,0 +1,333 @@ +#!/usr/bin/env python3 +""" +Action Agent Evaluation Runner + +Runs evaluations for action-agent category and generates reports. +""" + +import argparse +import csv +import sys +import time +from datetime import datetime +from pathlib import Path +from typing import List + +# Add lib directory to path +sys.path.insert(0, str(Path(__file__).parent)) + +from lib import ( + ConfigLoader, + EvalLoader, + APIClient, + LLMJudge, + Evaluation, + JudgeResult +) + + +class EvaluationRunner: + """Manages evaluation execution and reporting.""" + + def __init__(self, config: ConfigLoader): + """ + Initialize evaluation runner. + + Args: + config: Configuration loader + """ + self.config = config + + # Initialize components + self.eval_loader = EvalLoader() + self.api_client = APIClient( + base_url=config.get_api_endpoint(), + timeout=config.get_timeout() + ) + + # Initialize judge + judge_config = config.get_judge_config() + self.judge = LLMJudge( + provider=judge_config['provider'], + model_name=judge_config['model_name'], + api_key=judge_config['api_key'], + temperature=judge_config.get('temperature', 0.1) + ) + + # Get nested model config for API requests + self.model_config = config.get_nested_model_config() + + # Results tracking + self.results = [] + + def run_evaluations( + self, + category: str, + limit: int = None, + eval_ids: List[str] = None + ): + """ + Run evaluations for a specific category. + + Args: + category: Category name (e.g., 'action-agent') + limit: Maximum number of evaluations to run + eval_ids: Optional list of specific evaluation IDs to run + """ + print(f"\n{'='*70}") + print(f"Running {category} Evaluations") + print(f"{'='*70}\n") + + # Check API server health + print("Checking API server connection...") + if not self.api_client.check_health(): + print("ERROR: Cannot connect to API server at", self.config.get_api_endpoint()) + print("Please ensure the evaluation server is running.") + sys.exit(1) + print("โœ“ API server is reachable\n") + + # Load evaluations + print(f"Loading evaluations from {category}...") + evaluations = self.eval_loader.load_from_directory( + category=category, + enabled_only=True + ) + + # Filter by eval_ids if specified + if eval_ids: + evaluations = [e for e in evaluations if e.id in eval_ids] + + # Apply limit + if limit: + evaluations = evaluations[:limit] + + if not evaluations: + print(f"No evaluations found in category: {category}") + return + + print(f"Found {len(evaluations)} evaluations to run\n") + + # Run each evaluation + for i, evaluation in enumerate(evaluations, 1): + print(f"[{i}/{len(evaluations)}] Running: {evaluation.name}") + print(f" ID: {evaluation.id}") + + try: + result = self._run_single_evaluation(evaluation) + self.results.append(result) + + # Print result + status = "PASS" if result['passed'] else "FAIL" + print(f" Status: {status}") + print(f" Score: {result['score']:.2f}") + print(f" Time: {result['execution_time_ms']}ms") + print() + + # Add delay between requests + if i < len(evaluations): + delay = self.config.get_execution_config().get('request_delay', 1) + if delay > 0: + time.sleep(delay) + + except KeyboardInterrupt: + print("\n\nInterrupted by user. 
Saving partial results...") + break + except Exception as e: + print(f" ERROR: {str(e)}\n") + # Record failure + self.results.append({ + 'eval_id': evaluation.id, + 'eval_name': evaluation.name, + 'category': category, + 'passed': False, + 'score': 0.0, + 'reasoning': f"Execution error: {str(e)}", + 'execution_time_ms': 0, + 'error': str(e) + }) + + # Print summary + self._print_summary() + + # Save report + self._save_report(category) + + def _run_single_evaluation(self, evaluation: Evaluation) -> dict: + """ + Run a single evaluation. + + Args: + evaluation: Evaluation to run + + Returns: + Result dictionary + """ + # Get input message + input_message = evaluation.get_input_message() + + # Get target URL and wait timeout + target_url = evaluation.get_target_url() + wait_timeout = evaluation.get_wait_timeout() + + # Send API request + api_response = self.api_client.send_request( + input_message=input_message, + model_config=self.model_config, + url=target_url, + wait_timeout=wait_timeout + ) + + if not api_response['success']: + return { + 'eval_id': evaluation.id, + 'eval_name': evaluation.name, + 'category': evaluation.category, + 'passed': False, + 'score': 0.0, + 'reasoning': f"API request failed: {api_response['error']}", + 'execution_time_ms': api_response['execution_time_ms'], + 'error': api_response['error'] + } + + # Judge the response + criteria = evaluation.get_validation_criteria() + judge_result = self.judge.judge( + input_prompt=input_message, + response=api_response['response'], + criteria=criteria + ) + + return { + 'eval_id': evaluation.id, + 'eval_name': evaluation.name, + 'category': evaluation.category, + 'passed': judge_result.passed, + 'score': judge_result.score, + 'reasoning': judge_result.reasoning, + 'execution_time_ms': api_response['execution_time_ms'], + 'error': None + } + + def _print_summary(self): + """Print summary statistics.""" + if not self.results: + return + + total = len(self.results) + passed = sum(1 for r in self.results if r['passed']) + failed = total - passed + pass_rate = (passed / total) * 100 if total > 0 else 0 + avg_score = sum(r['score'] for r in self.results) / total if total > 0 else 0 + avg_time = sum(r['execution_time_ms'] for r in self.results) / total if total > 0 else 0 + + print(f"\n{'='*70}") + print("Summary") + print(f"{'='*70}") + print(f"Total: {total}") + print(f"Passed: {passed} ({pass_rate:.1f}%)") + print(f"Failed: {failed}") + print(f"Average Score: {avg_score:.2f}") + print(f"Average Time: {avg_time:.0f}ms") + print(f"{'='*70}\n") + + def _save_report(self, category: str): + """ + Save evaluation results to CSV report. 
+ + Args: + category: Category name for report filename + """ + if not self.results: + return + + # Create reports directory + reports_dir = self.config.get_reports_dir() + reports_dir.mkdir(parents=True, exist_ok=True) + + # Generate filename with timestamp + timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + filename = f"{category}_{timestamp}.csv" + filepath = reports_dir / filename + + # Write CSV + with open(filepath, 'w', newline='', encoding='utf-8') as f: + fieldnames = [ + 'timestamp', + 'eval_id', + 'eval_name', + 'category', + 'status', + 'score', + 'judge_reasoning', + 'execution_time_ms', + 'error' + ] + writer = csv.DictWriter(f, fieldnames=fieldnames) + + writer.writeheader() + for result in self.results: + writer.writerow({ + 'timestamp': datetime.now().isoformat(), + 'eval_id': result['eval_id'], + 'eval_name': result['eval_name'], + 'category': result['category'], + 'status': 'PASS' if result['passed'] else 'FAIL', + 'score': f"{result['score']:.2f}", + 'judge_reasoning': result['reasoning'], + 'execution_time_ms': result['execution_time_ms'], + 'error': result.get('error', '') + }) + + print(f"Report saved to: {filepath}") + + +def main(): + """Main entry point.""" + parser = argparse.ArgumentParser( + description="Run action-agent evaluations" + ) + parser.add_argument( + '--limit', + type=int, + default=None, + help='Maximum number of evaluations to run (default: all)' + ) + parser.add_argument( + '--eval-ids', + nargs='+', + help='Specific evaluation IDs to run' + ) + parser.add_argument( + '--config', + type=str, + default=None, + help='Path to config.yml (default: evals/config.yml)' + ) + + args = parser.parse_args() + + try: + # Load configuration + config = ConfigLoader(config_path=args.config) + + # Use limit from config if not specified + limit = args.limit if args.limit is not None else config.get_default_limit() + + # Create and run evaluation runner + runner = EvaluationRunner(config) + runner.run_evaluations( + category='action-agent', + limit=limit, + eval_ids=args.eval_ids + ) + + except KeyboardInterrupt: + print("\nInterrupted by user") + sys.exit(1) + except Exception as e: + print(f"ERROR: {e}") + sys.exit(1) + + +if __name__ == '__main__': + main() From 54af04e3b5796460c6c1f4116e42b79bfea3e6c0 Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Sat, 18 Oct 2025 21:32:54 -0500 Subject: [PATCH 09/24] Evals refactoring. Only simple test works. 
--- eval-server/nodejs/CLAUDE.md | 207 ++++++++++- .../nodejs/examples/with-http-wrapper.js | 6 +- eval-server/nodejs/src/api-server.js | 77 ++++ eval-server/nodejs/src/config.js | 5 + eval-server/nodejs/src/lib/EvalServer.js | 236 ++++++++++-- eval-server/nodejs/src/lib/judges/Judge.js | 80 ---- eval-server/nodejs/src/lib/judges/LLMJudge.js | 344 ------------------ evals/data/test-simple/math-001.yaml | 32 ++ evals/data/web-task-agent/ecommerce-001.yaml | 28 +- evals/data/web-task-agent/search-001.yaml | 6 +- evals/lib/api_client.py | 150 ++++++++ evals/lib/judge.py | 195 ++++++++++ evals/run_action_agent.py | 2 +- evals/run_test_simple.py | 333 +++++++++++++++++ evals/test_vision_judge.py | 90 +++++ 15 files changed, 1301 insertions(+), 490 deletions(-) delete mode 100644 eval-server/nodejs/src/lib/judges/Judge.js delete mode 100644 eval-server/nodejs/src/lib/judges/LLMJudge.js create mode 100644 evals/data/test-simple/math-001.yaml create mode 100755 evals/run_test_simple.py create mode 100644 evals/test_vision_judge.py diff --git a/eval-server/nodejs/CLAUDE.md b/eval-server/nodejs/CLAUDE.md index 7403353..eb968f0 100644 --- a/eval-server/nodejs/CLAUDE.md +++ b/eval-server/nodejs/CLAUDE.md @@ -4,7 +4,12 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Project Overview -bo-eval-server is a WebSocket-based evaluation server for LLM agents that implements an LLM-as-a-judge evaluation system. The server accepts connections from AI agents, sends them evaluation tasks via RPC calls, collects their responses, and uses an LLM to judge the quality of responses. +bo-eval-server is a thin WebSocket and REST API server for LLM agent evaluation. The server provides: +- WebSocket server for agent connections and RPC communication +- REST APIs for browser automation via Chrome DevTools Protocol (CDP) +- Screenshot capture and page content retrieval + +**Evaluation orchestration and LLM-as-a-judge logic lives in the separate `evals/` Python project**, which calls these APIs. ## Commands @@ -49,10 +54,11 @@ bo-eval-server is a WebSocket-based evaluation server for LLM agents that implem - Calls `Evaluate(request: String) -> String` method on connected agents - Supports `configure_llm` method for dynamic LLM provider configuration -**LLM Evaluator** (`src/evaluator.js`) -- Integrates with OpenAI API for LLM-as-a-judge functionality -- Evaluates agent responses on multiple criteria (correctness, completeness, clarity, relevance, helpfulness) -- Returns structured JSON evaluation with scores and reasoning +**CDP Integration** (`src/lib/EvalServer.js`) +- Direct Chrome DevTools Protocol communication +- Screenshot capture via `Page.captureScreenshot` +- Page content access via `Runtime.evaluate` +- Tab management via `Target.createTarget` / `Target.closeTarget` **Logger** (`src/logger.js`) - Structured logging using Winston @@ -62,12 +68,18 @@ bo-eval-server is a WebSocket-based evaluation server for LLM agents that implem ### Evaluation Flow +**WebSocket RPC Flow:** 1. Agent connects to WebSocket server 2. Agent sends "ready" signal 3. Server calls agent's `Evaluate` method with a task 4. Agent processes task and returns response -5. Server sends response to LLM judge for evaluation -6. Results are logged as JSON with scores and detailed feedback +5. Response is returned to caller (evaluation orchestration happens externally in `evals/`) + +**REST API Flow (for screenshot/content capture):** +1. 
External caller (e.g., Python evals runner) requests screenshot via `POST /page/screenshot` +2. Server uses CDP to capture screenshot +3. Returns base64-encoded image data +4. External caller uses screenshots for LLM-as-a-judge visual verification ### Project Structure @@ -86,13 +98,29 @@ logs/ # Log files (created automatically) โ””โ”€โ”€ evaluations.jsonl # Evaluation results in JSON Lines format ``` +### Architecture: Separation of Concerns + +**eval-server (Node.js)**: Thin API layer +- WebSocket server for agent connections +- JSON-RPC 2.0 bidirectional communication +- REST APIs for CDP operations (screenshots, page content, tab management) +- NO evaluation logic, NO judges, NO test orchestration + +**evals (Python)**: Evaluation orchestration and judging +- LLM judges (LLMJudge, VisionJudge) in `lib/judge.py` +- Evaluation runners that call eval-server APIs +- Test case definitions (YAML files in `data/`) +- Result reporting and analysis + +This separation keeps eval-server focused on infrastructure while evals/ handles business logic. + ### Key Features - **Bidirectional RPC**: Server can call methods on connected clients -- **Multi-Provider LLM Support**: Support for OpenAI, Groq, OpenRouter, and LiteLLM providers +- **Multi-Provider LLM Support**: Support for OpenAI, Groq, OpenRouter, and LiteLLM providers (configured by clients) - **Dynamic LLM Configuration**: Runtime configuration via `configure_llm` JSON-RPC method - **Per-Client Configuration**: Each connected client can have different LLM settings -- **LLM-as-a-Judge**: Automated evaluation of agent responses using configurable LLM providers +- **CDP Browser Automation**: Screenshot capture, page content access, tab management - **Concurrent Evaluations**: Support for multiple agents and parallel evaluations - **Structured Logging**: All interactions logged as JSON for analysis - **Interactive CLI**: Built-in CLI for testing and server management @@ -277,21 +305,83 @@ Response format: } ``` +**Get Page Content** +```bash +POST /page/content +Content-Type: application/json + +{ + "clientId": "baseClientId", + "tabId": "targetTabId", + "format": "html" // or "text" +} +``` + +Retrieves the HTML or text content of a specific tab. + +Response format: +```json +{ + "clientId": "baseClientId", + "tabId": "targetTabId", + "content": "...", + "format": "html", + "length": 12345, + "timestamp": 1234567890 +} +``` + +**Capture Screenshot** +```bash +POST /page/screenshot +Content-Type: application/json + +{ + "clientId": "baseClientId", + "tabId": "targetTabId", + "fullPage": false +} +``` + +Captures a screenshot of a specific tab. + +Response format: +```json +{ + "clientId": "baseClientId", + "tabId": "targetTabId", + "imageData": "data:image/png;base64,iVBORw0KG...", + "format": "png", + "fullPage": false, + "timestamp": 1234567890 +} +``` + #### Implementation Architecture **Direct CDP Approach (Current)** -Tab management is implemented using direct Chrome DevTools Protocol (CDP) communication: +Tab management and page content access are implemented using direct Chrome DevTools Protocol (CDP) communication: 1. Server discovers the CDP WebSocket endpoint via `http://localhost:9223/json/version` -2. For each command (open/close), a new WebSocket connection is established to the CDP endpoint +2. For each command, a new WebSocket connection is established to the CDP endpoint 3. Commands are sent using JSON-RPC 2.0 format: - - `Target.createTarget` - Opens new tab - - `Target.closeTarget` - Closes existing tab -4. 
WebSocket connection is closed after receiving the response + - **Browser-level operations** (use `sendCDPCommand`): + - `Target.createTarget` - Opens new tab + - `Target.closeTarget` - Closes existing tab + - **Tab-level operations** (use `sendCDPCommandToTarget`): + - `Runtime.evaluate` - Execute JavaScript to get page content + - `Page.captureScreenshot` - Capture screenshot of tab +4. For tab-level operations, the server first attaches to the target, executes the command, then detaches +5. WebSocket connection is closed after receiving the response Key implementation files: -- `src/lib/EvalServer.js` - Contains `sendCDPCommand()`, `openTab()`, and `closeTab()` methods +- `src/lib/EvalServer.js` - Contains CDP methods: + - `sendCDPCommand()` - Browser-level CDP commands + - `sendCDPCommandToTarget()` - Tab-level CDP commands (with attach/detach) + - `openTab()`, `closeTab()` - Tab management + - `getPageHTML()`, `getPageText()` - Page content access + - `captureScreenshot()` - Screenshot capture - `src/api-server.js` - REST API endpoints that delegate to EvalServer methods **Alternative Approach Considered** @@ -314,6 +404,81 @@ The CDP endpoint is accessible at: - HTTP: `http://localhost:9223/json/version` - WebSocket: `ws://localhost:9223/devtools/browser/{browserId}` +#### Usage Examples + +**Complete workflow: Open tab, get content, take screenshot, close tab** + +```bash +# 1. Get list of clients +curl -X GET http://localhost:8081/clients + +# 2. Open a new tab +curl -X POST http://localhost:8081/tabs/open \ + -H "Content-Type: application/json" \ + -d '{"clientId":"9907fd8d-92a8-4a6a-bce9-458ec8c57306","url":"https://example.com"}' + +# Response: {"tabId":"ABC123DEF456",...} + +# 3. Get page HTML content +curl -X POST http://localhost:8081/page/content \ + -H "Content-Type: application/json" \ + -d '{"clientId":"9907fd8d-92a8-4a6a-bce9-458ec8c57306","tabId":"ABC123DEF456","format":"html"}' + +# 4. Get page text content +curl -X POST http://localhost:8081/page/content \ + -H "Content-Type: application/json" \ + -d '{"clientId":"9907fd8d-92a8-4a6a-bce9-458ec8c57306","tabId":"ABC123DEF456","format":"text"}' + +# 5. Capture screenshot +curl -X POST http://localhost:8081/page/screenshot \ + -H "Content-Type: application/json" \ + -d '{"clientId":"9907fd8d-92a8-4a6a-bce9-458ec8c57306","tabId":"ABC123DEF456","fullPage":false}' + +# 6. Close the tab +curl -X POST http://localhost:8081/tabs/close \ + -H "Content-Type: application/json" \ + -d '{"clientId":"9907fd8d-92a8-4a6a-bce9-458ec8c57306","tabId":"ABC123DEF456"}' +``` + +**LLM-as-a-Judge evaluation pattern** + +This workflow replicates the DevTools evaluation pattern using the eval-server: + +```bash +# 1. Open tab and navigate to test URL +TAB_RESPONSE=$(curl -X POST http://localhost:8081/tabs/open \ + -H "Content-Type: application/json" \ + -d '{"clientId":"CLIENT_ID","url":"https://www.w3.org/WAI/ARIA/apg/patterns/button/examples/button/"}') + +TAB_ID=$(echo $TAB_RESPONSE | jq -r '.tabId') + +# 2. Capture BEFORE screenshot +BEFORE_SCREENSHOT=$(curl -X POST http://localhost:8081/page/screenshot \ + -H "Content-Type: application/json" \ + -d "{\"clientId\":\"CLIENT_ID\",\"tabId\":\"$TAB_ID\",\"fullPage\":false}") + +# 3. Execute agent action (via /v1/responses or custom endpoint) +# ... agent performs action ... + +# 4. 
Capture AFTER screenshot +AFTER_SCREENSHOT=$(curl -X POST http://localhost:8081/page/screenshot \ + -H "Content-Type: application/json" \ + -d "{\"clientId\":\"CLIENT_ID\",\"tabId\":\"$TAB_ID\",\"fullPage\":false}") + +# 5. Get page content for verification +PAGE_CONTENT=$(curl -X POST http://localhost:8081/page/content \ + -H "Content-Type: application/json" \ + -d "{\"clientId\":\"CLIENT_ID\",\"tabId\":\"$TAB_ID\",\"format\":\"text\"}") + +# 6. Send to LLM judge with screenshots and content +# (Use OpenAI Vision API or similar with before/after screenshots) + +# 7. Clean up +curl -X POST http://localhost:8081/tabs/close \ + -H "Content-Type: application/json" \ + -d "{\"clientId\":\"CLIENT_ID\",\"tabId\":\"$TAB_ID\"}" +``` + #### Current Limitations **โš ๏ธ Known Issue: WebSocket Timeout** @@ -333,6 +498,13 @@ The CDP endpoint is correctly discovered and accessible, but WebSocket messages **Workaround**: Until this issue is resolved, tab management via the API is not functional. Manual CDP testing is required to diagnose the root cause. +#### Features Implemented + +- โœ… Page HTML/text content access via CDP +- โœ… Screenshot capture via CDP +- โœ… Direct CDP communication for tab management +- โœ… Tab-level CDP command execution with attach/detach + #### Future Enhancements - Automatic tab registration in ClientManager when DevTools connects @@ -340,6 +512,11 @@ The CDP endpoint is correctly discovered and accessible, but WebSocket messages - Bulk tab operations - Tab metadata (title, URL, favicon) - Tab grouping and organization +- Additional CDP methods: + - JavaScript execution with custom expressions + - DOM tree access (`DOM.getDocument`) + - MHTML snapshots (`Page.captureSnapshot`) + - PDF generation (`Page.printToPDF`) ### Configuration diff --git a/eval-server/nodejs/examples/with-http-wrapper.js b/eval-server/nodejs/examples/with-http-wrapper.js index 78b09f4..a26b0c9 100644 --- a/eval-server/nodejs/examples/with-http-wrapper.js +++ b/eval-server/nodejs/examples/with-http-wrapper.js @@ -18,7 +18,7 @@ const evalServer = new EvalServer({ console.log('๐Ÿ”ง Creating HTTP wrapper...'); const httpWrapper = new HTTPWrapper(evalServer, { - port: 8080, + port: 8083, host: '0.0.0.0' }); @@ -29,11 +29,11 @@ console.log('โœ… EvalServer started on ws://127.0.0.1:8082'); console.log('๐Ÿ”ง Starting HTTP wrapper...'); await httpWrapper.start(); -console.log('โœ… HTTP API started on http://127.0.0.1:8080'); +console.log('โœ… HTTP API started on http://127.0.0.1:8083'); console.log('โณ Waiting for DevTools client to connect...'); console.log(' WebSocket URL: ws://127.0.0.1:8082'); -console.log(' HTTP API URL: http://127.0.0.1:8080'); +console.log(' HTTP API URL: http://127.0.0.1:8083'); console.log(' Auth: Disabled (automated mode)'); // Add periodic status check diff --git a/eval-server/nodejs/src/api-server.js b/eval-server/nodejs/src/api-server.js index 08be5a4..50b2be1 100644 --- a/eval-server/nodejs/src/api-server.js +++ b/eval-server/nodejs/src/api-server.js @@ -142,6 +142,22 @@ class APIServer { result = await this.handleResponsesRequest(JSON.parse(body)); break; + case '/page/content': + if (method !== 'POST') { + this.sendError(res, 405, 'Method not allowed'); + return; + } + result = await this.getPageContent(JSON.parse(body)); + break; + + case '/page/screenshot': + if (method !== 'POST') { + this.sendError(res, 405, 'Method not allowed'); + return; + } + result = await this.getScreenshot(JSON.parse(body)); + break; + default: this.sendError(res, 404, 'Not found'); return; @@ 
-349,6 +365,67 @@ class APIServer { }; } + async getPageContent(payload) { + const { clientId, tabId, format = 'html' } = payload; + + if (!clientId) { + throw new Error('Client ID is required'); + } + + if (!tabId) { + throw new Error('Tab ID is required'); + } + + if (!['html', 'text'].includes(format)) { + throw new Error('Format must be either "html" or "text"'); + } + + const baseClientId = clientId.split(':')[0]; + + logger.info('Getting page content', { baseClientId, tabId, format }); + + // Call appropriate method based on format + const result = format === 'html' + ? await this.evaluationServer.getPageHTML(tabId) + : await this.evaluationServer.getPageText(tabId); + + return { + clientId: baseClientId, + tabId: result.tabId, + content: result.content, + format: result.format, + length: result.length, + timestamp: Date.now() + }; + } + + async getScreenshot(payload) { + const { clientId, tabId, fullPage = false } = payload; + + if (!clientId) { + throw new Error('Client ID is required'); + } + + if (!tabId) { + throw new Error('Tab ID is required'); + } + + const baseClientId = clientId.split(':')[0]; + + logger.info('Capturing screenshot', { baseClientId, tabId, fullPage }); + + const result = await this.evaluationServer.captureScreenshot(tabId, { fullPage }); + + return { + clientId: baseClientId, + tabId: result.tabId, + imageData: result.imageData, + format: result.format, + fullPage: result.fullPage, + timestamp: Date.now() + }; + } + /** * Handle OpenAI Responses API compatible requests with nested model format */ diff --git a/eval-server/nodejs/src/config.js b/eval-server/nodejs/src/config.js index 3992715..b8945f9 100644 --- a/eval-server/nodejs/src/config.js +++ b/eval-server/nodejs/src/config.js @@ -59,6 +59,11 @@ export const CONFIG = { evals: { dir: process.env.EVALS_DIR || './evals' + }, + + cdp: { + host: process.env.CDP_HOST || 'localhost', + port: parseInt(process.env.CDP_PORT) || 9222 } }; diff --git a/eval-server/nodejs/src/lib/EvalServer.js b/eval-server/nodejs/src/lib/EvalServer.js index 208ec1f..34db421 100644 --- a/eval-server/nodejs/src/lib/EvalServer.js +++ b/eval-server/nodejs/src/lib/EvalServer.js @@ -773,12 +773,6 @@ export class EvalServer extends EventEmitter { evaluation.timeout || 45000 ); - // Validate response if needed and judge is available - let validationResult = null; - if (evaluation.validation && this.judge) { - validationResult = await this.validateResponse(response, evaluation); - } - // Update evaluation status this.clientManager.updateEvaluationStatus( connection.clientId, @@ -786,7 +780,6 @@ export class EvalServer extends EventEmitter { 'completed', { response, - validation: validationResult, duration: Date.now() - startTime } ); @@ -798,7 +791,6 @@ export class EvalServer extends EventEmitter { name: evaluation.name, tool: evaluation.tool, response, - validation: validationResult, timestamp: new Date().toISOString(), duration: Date.now() - startTime }); @@ -832,7 +824,9 @@ export class EvalServer extends EventEmitter { */ async getCDPBrowserEndpoint() { try { - const response = await fetch('http://localhost:9223/json/version'); + const cdpUrl = `http://${CONFIG.cdp.host}:${CONFIG.cdp.port}/json/version`; + logger.info('Attempting to connect to CDP', { cdpUrl }); + const response = await fetch(cdpUrl); const data = await response.json(); return data.webSocketDebuggerUrl; } catch (error) { @@ -920,6 +914,105 @@ export class EvalServer extends EventEmitter { }); } + /** + * Send a CDP command to a specific target (tab) + * This requires 
attaching to the target first, then detaching after + * @param {string} targetId - Target ID (tab ID) + * @param {string} method - CDP method name + * @param {Object} params - CDP method parameters + * @returns {Promise} CDP response + */ + async sendCDPCommandToTarget(targetId, method, params = {}) { + return new Promise(async (resolve, reject) => { + try { + const { default: WebSocket } = await import('ws'); + const cdpEndpoint = await this.getCDPBrowserEndpoint(); + const ws = new WebSocket(cdpEndpoint); + + let sessionId = null; + const attachId = Math.floor(Math.random() * 1000000); + const commandId = Math.floor(Math.random() * 1000000); + + const timeout = setTimeout(() => { + ws.close(); + reject(new Error(`CDP target command timeout: ${method} on ${targetId}`)); + }, 15000); + + ws.on('open', () => { + // First, attach to the target + const attachMessage = JSON.stringify({ + id: attachId, + method: 'Target.attachToTarget', + params: { + targetId, + flatten: true + } + }); + logger.info('CDP attaching to target', { targetId, method }); + ws.send(attachMessage); + }); + + ws.on('message', (data) => { + try { + const response = JSON.parse(data.toString()); + + // Handle attach response + if (response.id === attachId) { + if (response.error) { + clearTimeout(timeout); + ws.close(); + logger.error('CDP attach error', { targetId, error: response.error }); + reject(new Error(`CDP attach error: ${response.error.message}`)); + return; + } + + sessionId = response.result.sessionId; + logger.info('CDP attached to target, sending command', { sessionId, method }); + + // Now send the actual command with the session ID + const commandMessage = JSON.stringify({ + id: commandId, + method, + params, + sessionId + }); + ws.send(commandMessage); + } + + // Handle command response + else if (response.id === commandId) { + clearTimeout(timeout); + + if (response.error) { + logger.error('CDP target command error', { method, targetId, error: response.error }); + ws.close(); + reject(new Error(`CDP error: ${response.error.message}`)); + } else { + logger.info('CDP target command success', { method, targetId }); + ws.close(); + resolve(response.result); + } + } + // Ignore other messages (events, etc.) 
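+            // Note (assumption based on the flat CDP protocol): because the attach
+            // above uses flatten: true, events from the attached tab arrive on this
+            // same WebSocket tagged with sessionId; only the attach and command ids
+            // handled above are awaited, everything else is intentionally dropped.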
+ } catch (error) { + clearTimeout(timeout); + ws.close(); + logger.error('CDP message parse error', { error: error.message }); + reject(error); + } + }); + + ws.on('error', (error) => { + clearTimeout(timeout); + logger.error('CDP WebSocket error', { error: error.message }); + reject(error); + }); + } catch (error) { + reject(error); + } + }); + } + /** * Open a new tab using CDP directly * @param {string} baseClientId - Base client ID (or will be extracted from composite ID) @@ -1009,43 +1102,120 @@ export class EvalServer extends EventEmitter { } /** - * Validate response using configured judge + * Get page HTML content using CDP + * @param {string} tabId - Tab ID (target ID) + * @returns {Promise} Result with HTML content */ - async validateResponse(response, evaluation) { - if (!this.judge) { - logger.warn('Validation requested but no judge configured'); + async getPageHTML(tabId) { + try { + logger.info('Getting page HTML via CDP', { tabId }); + + // Use Runtime.evaluate to get document.documentElement.outerHTML + const result = await this.sendCDPCommandToTarget(tabId, 'Runtime.evaluate', { + expression: 'document.documentElement.outerHTML', + returnByValue: true + }); + + const html = result.result.value; + + logger.info('Page HTML retrieved successfully', { + tabId, + length: html.length + }); + return { - type: 'no-judge', - result: { message: 'No judge configured for validation' }, - passed: true // Assume passed if no judge + tabId, + content: html, + format: 'html', + length: html.length }; + } catch (error) { + logger.error('Failed to get page HTML via CDP', { + tabId, + error: error.message + }); + throw error; } + } - const validation = evaluation.validation; + /** + * Get page text content using CDP + * @param {string} tabId - Tab ID (target ID) + * @returns {Promise} Result with text content + */ + async getPageText(tabId) { + try { + logger.info('Getting page text via CDP', { tabId }); - if (validation.type === 'llm-judge' || validation.type === 'hybrid') { - const llmConfig = validation.llm_judge || validation.llmJudge; - const criteria = llmConfig?.criteria || []; - const task = `${evaluation.name} - ${evaluation.description || ''}`; + // Use Runtime.evaluate to get document.body.innerText + const result = await this.sendCDPCommandToTarget(tabId, 'Runtime.evaluate', { + expression: 'document.body.innerText', + returnByValue: true + }); - const judgeResult = await this.judge.evaluate( - task, - JSON.stringify(response.output || response), - { - criteria, - model: llmConfig?.model - } - ); + const text = result.result.value; + + logger.info('Page text retrieved successfully', { + tabId, + length: text.length + }); return { - type: 'llm-judge', - result: judgeResult, - passed: judgeResult.score >= 0.7 + tabId, + content: text, + format: 'text', + length: text.length }; + } catch (error) { + logger.error('Failed to get page text via CDP', { + tabId, + error: error.message + }); + throw error; } + } + + /** + * Capture page screenshot using CDP + * @param {string} tabId - Tab ID (target ID) + * @param {Object} options - Screenshot options + * @param {boolean} options.fullPage - Whether to capture full page (default: false) + * @returns {Promise} Result with screenshot data + */ + async captureScreenshot(tabId, options = {}) { + const { fullPage = false } = options; + + try { + logger.info('Capturing screenshot via CDP', { tabId, fullPage }); + + // Use Page.captureScreenshot + const result = await this.sendCDPCommandToTarget(tabId, 'Page.captureScreenshot', { + format: 
'png', + captureBeyondViewport: fullPage + }); + + const imageData = `data:image/png;base64,${result.data}`; + + logger.info('Screenshot captured successfully', { + tabId, + dataLength: result.data.length + }); - return null; + return { + tabId, + imageData, + format: 'png', + fullPage + }; + } catch (error) { + logger.error('Failed to capture screenshot via CDP', { + tabId, + error: error.message + }); + throw error; + } } + } /** diff --git a/eval-server/nodejs/src/lib/judges/Judge.js b/eval-server/nodejs/src/lib/judges/Judge.js deleted file mode 100644 index 83b0f53..0000000 --- a/eval-server/nodejs/src/lib/judges/Judge.js +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2025 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -/** - * Judge - Abstract interface for evaluation judges - * - * A Judge is responsible for evaluating the quality of responses from LLM agents. - * Different implementations can provide different evaluation strategies. - */ -export class Judge { - /** - * Evaluate an agent response against a task - * - * @param {string} task - The original task or prompt - * @param {string} agentResponse - The response from the agent - * @param {Object} options - Additional options for evaluation - * @returns {Promise} Evaluation result with scores and feedback - */ - async evaluate(task, agentResponse, options = {}) { - throw new Error('Judge.evaluate() must be implemented by subclass'); - } - - /** - * Get the name of this judge implementation - * @returns {string} The judge name - */ - getName() { - return this.constructor.name; - } - - /** - * Get configuration schema for this judge - * @returns {Object} Configuration schema - */ - getConfigSchema() { - return {}; - } - - /** - * Validate judge configuration - * @param {Object} config - Configuration to validate - * @returns {boolean} Whether configuration is valid - */ - validateConfig(config) { - return true; - } -} - -/** - * Default evaluation result structure - */ -export const DEFAULT_EVALUATION_RESULT = { - overall_score: null, - criteria_scores: {}, - reasoning: '', - strengths: [], - weaknesses: [], - suggestions: [], - metadata: { - judge: 'unknown', - timestamp: null, - duration: null - } -}; - -/** - * Utility function to create a standardized evaluation result - */ -export function createEvaluationResult(overrides = {}) { - return { - ...DEFAULT_EVALUATION_RESULT, - ...overrides, - metadata: { - ...DEFAULT_EVALUATION_RESULT.metadata, - ...overrides.metadata, - timestamp: new Date().toISOString() - } - }; -} \ No newline at end of file diff --git a/eval-server/nodejs/src/lib/judges/LLMJudge.js b/eval-server/nodejs/src/lib/judges/LLMJudge.js deleted file mode 100644 index 9e4c8a5..0000000 --- a/eval-server/nodejs/src/lib/judges/LLMJudge.js +++ /dev/null @@ -1,344 +0,0 @@ -// Copyright 2025 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -import OpenAI from 'openai'; -import { Judge, createEvaluationResult } from './Judge.js'; -import { CONFIG } from '../../config.js'; -import logger from '../../logger.js'; - -/** - * LLMJudge - Uses an LLM (like GPT-4) to evaluate agent responses - * - * This is a refactored version of the original LLMEvaluator class, - * now implementing the Judge interface for better modularity. 
- */ -export class LLMJudge extends Judge { - constructor(config = {}) { - super(); - - this.config = { - apiKey: config.apiKey || CONFIG.llm.apiKey, - model: config.model || CONFIG.llm.model, - temperature: config.temperature || CONFIG.llm.temperature, - maxTokens: config.maxTokens || 1000, - ...config - }; - - if (!this.config.apiKey) { - throw new Error('OpenAI API key is required for LLMJudge'); - } - - this.openai = new OpenAI({ - apiKey: this.config.apiKey - }); - } - - /** - * Evaluate an agent response using an LLM - */ - async evaluate(task, agentResponse, options = {}) { - const startTime = Date.now(); - - try { - // Merge options with default config - const evalConfig = { - criteria: [], - model: this.config.model, - temperature: this.config.temperature, - ...options - }; - - const prompt = this.buildEvaluationPrompt(task, agentResponse, evalConfig); - - const completion = await this.openai.chat.completions.create({ - model: evalConfig.model, - messages: [ - { - role: 'system', - content: 'You are an expert evaluator of AI agent responses. Provide objective, detailed evaluations in the requested JSON format.' - }, - { - role: 'user', - content: prompt - } - ], - temperature: evalConfig.temperature, - max_tokens: this.config.maxTokens - }); - - const evaluation = completion.choices[0].message.content; - const usage = completion.usage; - const duration = Date.now() - startTime; - - logger.info('LLMJudge: Evaluation completed', { - tokens_used: usage.total_tokens, - model: evalConfig.model, - duration - }); - - const result = this.parseEvaluation(evaluation); - - // Add metadata - result.metadata = { - judge: this.getName(), - model: evalConfig.model, - timestamp: new Date().toISOString(), - duration, - tokens_used: usage.total_tokens, - criteria: evalConfig.criteria - }; - - return result; - - } catch (error) { - logger.error('LLMJudge: Evaluation failed', { error: error.message }); - - return createEvaluationResult({ - overall_score: 0, - reasoning: `Evaluation failed: ${error.message}`, - metadata: { - judge: this.getName(), - timestamp: new Date().toISOString(), - duration: Date.now() - startTime, - error: error.message - } - }); - } - } - - /** - * Build the evaluation prompt - */ - buildEvaluationPrompt(task, agentResponse, config) { - const { criteria } = config; - - let prompt = `Please evaluate the following AI agent response to a given task. - -TASK: -${task} - -AGENT RESPONSE: -${agentResponse} - -Please evaluate the response on the following criteria and provide a JSON response: - -`; - - // Use custom criteria if provided, otherwise use default criteria - if (criteria && criteria.length > 0) { - criteria.forEach((criterion, index) => { - prompt += `${index + 1}. **${criterion}**: Evaluate how well the response meets this criterion\n`; - }); - } else { - prompt += `1. **Correctness**: Is the response factually accurate and correct? -2. **Completeness**: Does the response fully address the task? -3. **Clarity**: Is the response clear and well-structured? -4. **Relevance**: Is the response relevant to the task? -5. **Helpfulness**: How helpful is the response to the user? 
-`; - } - - prompt += ` -Provide your evaluation in the following JSON format: -{ - "overall_score": , - "criteria_scores": {`; - - if (criteria && criteria.length > 0) { - criteria.forEach((criterion, index) => { - const key = criterion.toLowerCase().replace(/[^a-z0-9]/g, '_'); - prompt += `\n "${key}": `; - if (index < criteria.length - 1) prompt += ','; - }); - } else { - prompt += ` - "correctness": , - "completeness": , - "clarity": , - "relevance": , - "helpfulness": `; - } - - prompt += ` - }, - "reasoning": "", - "strengths": [""], - "weaknesses": [""], - "suggestions": [""] -}`; - - return prompt; - } - - /** - * Parse the LLM evaluation response - */ - parseEvaluation(evaluationText) { - try { - // Try to extract JSON from the response - const jsonMatch = evaluationText.match(/\{[\s\S]*\}/); - if (jsonMatch) { - const parsedResult = JSON.parse(jsonMatch[0]); - - // Validate and normalize the result - return createEvaluationResult({ - overall_score: this.normalizeScore(parsedResult.overall_score), - criteria_scores: this.normalizeCriteriaScores(parsedResult.criteria_scores || {}), - reasoning: parsedResult.reasoning || '', - strengths: Array.isArray(parsedResult.strengths) ? parsedResult.strengths : [], - weaknesses: Array.isArray(parsedResult.weaknesses) ? parsedResult.weaknesses : [], - suggestions: Array.isArray(parsedResult.suggestions) ? parsedResult.suggestions : [], - raw_evaluation: evaluationText - }); - } - - // If no JSON found, return a structured response with the raw text - return createEvaluationResult({ - overall_score: null, - criteria_scores: {}, - reasoning: evaluationText, - strengths: [], - weaknesses: [], - suggestions: [], - raw_evaluation: evaluationText - }); - - } catch (error) { - logger.warn('LLMJudge: Failed to parse evaluation JSON', { error: error.message }); - - return createEvaluationResult({ - overall_score: null, - criteria_scores: {}, - reasoning: evaluationText, - strengths: [], - weaknesses: [], - suggestions: [], - raw_evaluation: evaluationText, - parse_error: error.message - }); - } - } - - /** - * Normalize score to be between 0 and 10 - */ - normalizeScore(score) { - if (typeof score !== 'number' || isNaN(score)) { - return null; - } - - // Clamp score between 0 and 10 - return Math.max(0, Math.min(10, score)); - } - - /** - * Normalize criteria scores - */ - normalizeCriteriaScores(scores) { - const normalized = {}; - - for (const [criterion, score] of Object.entries(scores)) { - normalized[criterion] = this.normalizeScore(score); - } - - return normalized; - } - - /** - * Get configuration schema - */ - getConfigSchema() { - return { - type: 'object', - properties: { - apiKey: { - type: 'string', - description: 'OpenAI API key' - }, - model: { - type: 'string', - description: 'OpenAI model to use for evaluation', - default: 'gpt-4' - }, - temperature: { - type: 'number', - description: 'Temperature for LLM generation', - minimum: 0, - maximum: 2, - default: 0.1 - }, - maxTokens: { - type: 'number', - description: 'Maximum tokens for evaluation response', - minimum: 100, - maximum: 4000, - default: 1000 - } - }, - required: ['apiKey'] - }; - } - - /** - * Validate configuration - */ - validateConfig(config) { - if (!config.apiKey) { - throw new Error('LLMJudge requires an API key'); - } - - if (config.temperature !== undefined) { - if (typeof config.temperature !== 'number' || config.temperature < 0 || config.temperature > 2) { - throw new Error('Temperature must be a number between 0 and 2'); - } - } - - if (config.maxTokens !== 
undefined) { - if (typeof config.maxTokens !== 'number' || config.maxTokens < 100 || config.maxTokens > 4000) { - throw new Error('maxTokens must be a number between 100 and 4000'); - } - } - - return true; - } - - /** - * Get available OpenAI models for evaluation - */ - async getAvailableModels() { - try { - const models = await this.openai.models.list(); - return models.data - .filter(model => model.id.includes('gpt')) - .map(model => model.id) - .sort(); - } catch (error) { - logger.error('LLMJudge: Failed to fetch available models', { error: error.message }); - return ['gpt-4', 'gpt-3.5-turbo']; // Fallback list - } - } - - /** - * Test the judge with a simple evaluation - */ - async test() { - const testTask = 'Summarize the main points of artificial intelligence'; - const testResponse = 'AI is a technology that enables machines to perform tasks that typically require human intelligence, such as learning, reasoning, and problem-solving.'; - - try { - const result = await this.evaluate(testTask, testResponse); - return { - success: true, - result, - message: 'LLMJudge test completed successfully' - }; - } catch (error) { - return { - success: false, - error: error.message, - message: 'LLMJudge test failed' - }; - } - } -} \ No newline at end of file diff --git a/evals/data/test-simple/math-001.yaml b/evals/data/test-simple/math-001.yaml new file mode 100644 index 0000000..c5096db --- /dev/null +++ b/evals/data/test-simple/math-001.yaml @@ -0,0 +1,32 @@ +# Simple math test +id: "math-001" +name: "Simple Math 5x7" +description: "Very simple test - calculate 5 times 7" +enabled: true + +target: + url: "about:blank" + wait_for: "networkidle" + wait_timeout: 1000 + +tool: "chat" +timeout: 10000 + +input: + message: "How much is 5x7? Just respond with the number." 
+ +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + criteria: + - "Response contains the number 35" + - "Response is mathematically correct" + +metadata: + tags: ["test", "simple", "math"] + priority: "high" + timeout: 10000 + retries: 0 + flaky: false + owner: "test" diff --git a/evals/data/web-task-agent/ecommerce-001.yaml b/evals/data/web-task-agent/ecommerce-001.yaml index 338f464..87d965b 100644 --- a/evals/data/web-task-agent/ecommerce-001.yaml +++ b/evals/data/web-task-agent/ecommerce-001.yaml @@ -33,20 +33,26 @@ input: validation: - type: "hybrid" + type: "llm-judge" llm_judge: model: "gpt-4o" criteria: - - "Successfully navigated to product search" - - "Applied appropriate filters correctly" - - "Extracted product details accurately" - - "Provided meaningful comparison of features" - - "Stayed within specified price range" - snapshot: - structure_only: true - exclude_paths: - - "timestamp" - - "sessionId" + - "Successfully found wireless headphones products on Amazon" + - "Returned products that are under $100 as requested" + - "Each product includes name, price, rating, and URL fields" + - "Results are presented in clear, structured text format (not JSON)" + - "All products listed are relevant to \"wireless headphones\"" + - "Price information is clearly stated for each product" + - "Products are numbered or bulleted for easy reading" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Amazon search results are showing wireless headphones" + - "Check that visible products are under $100 price range" + - "Confirm product listings include name, price, and rating information" + - "Ensure search and filtering workflow completed successfully" metadata: tags: ["web-task", "multi-step", "ecommerce", "search"] diff --git a/evals/data/web-task-agent/search-001.yaml b/evals/data/web-task-agent/search-001.yaml index da3a4eb..ee164b9 100644 --- a/evals/data/web-task-agent/search-001.yaml +++ b/evals/data/web-task-agent/search-001.yaml @@ -28,9 +28,9 @@ validation: - "Response includes a brief summary or conclusion statement" visual_verification: enabled: true - capture_before: true - capture_after: true - prompts: + capture_before_action: true + capture_after_action: true + verification_prompts: - "Verify search was completed and results page is showing" - "Check that search results are related to \"Chrome DevTools automation\"" - "Confirm at least 3 search results are visible on the page" diff --git a/evals/lib/api_client.py b/evals/lib/api_client.py index 2214710..af6bdf7 100644 --- a/evals/lib/api_client.py +++ b/evals/lib/api_client.py @@ -192,6 +192,156 @@ def _extract_response_text(self, response_data: Any) -> str: except Exception as e: return f"[Error extracting response: {e}]" + def capture_screenshot( + self, + client_id: str, + tab_id: str, + full_page: bool = False + ) -> Dict[str, Any]: + """ + Capture a screenshot of a specific tab. 
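+
+        Calls the eval-server's POST /page/screenshot endpoint, which captures
+        the tab via CDP (Page.captureScreenshot) and returns the image as a
+        base64 data URL (data:image/png;base64,...).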
+ + Args: + client_id: Base client ID + tab_id: Tab ID to capture + full_page: Whether to capture the full page (default: False) + + Returns: + Dict with: + - success: bool + - image_data: str (base64 data URL) if successful + - error: str (if any) + """ + api_url = f"{self.base_url}/page/screenshot" + + payload = { + "clientId": client_id, + "tabId": tab_id, + "fullPage": full_page + } + + try: + response = requests.post( + api_url, + json=payload, + timeout=self.timeout, + headers={"Content-Type": "application/json"} + ) + + response.raise_for_status() + result = response.json() + + return { + "success": True, + "image_data": result.get("imageData"), + "format": result.get("format", "png"), + "error": None + } + + except requests.exceptions.Timeout: + return { + "success": False, + "image_data": None, + "error": f"Screenshot request timed out after {self.timeout} seconds" + } + + except requests.exceptions.HTTPError as e: + error_msg = f"HTTP error: {e.response.status_code}" + try: + error_details = e.response.json() + error_msg += f" - {error_details.get('error', str(error_details))}" + except: + error_msg += f" - {e.response.text[:200]}" + + return { + "success": False, + "image_data": None, + "error": error_msg + } + + except Exception as e: + return { + "success": False, + "image_data": None, + "error": f"Screenshot failed: {str(e)}" + } + + def get_page_content( + self, + client_id: str, + tab_id: str, + format: str = "html" + ) -> Dict[str, Any]: + """ + Get page content (HTML or text) from a specific tab. + + Args: + client_id: Base client ID + tab_id: Tab ID to get content from + format: Content format - "html" or "text" (default: "html") + + Returns: + Dict with: + - success: bool + - content: str (page content) if successful + - format: str (content format) + - error: str (if any) + """ + api_url = f"{self.base_url}/page/content" + + payload = { + "clientId": client_id, + "tabId": tab_id, + "format": format + } + + try: + response = requests.post( + api_url, + json=payload, + timeout=self.timeout, + headers={"Content-Type": "application/json"} + ) + + response.raise_for_status() + result = response.json() + + return { + "success": True, + "content": result.get("content"), + "format": result.get("format", format), + "length": result.get("length", 0), + "error": None + } + + except requests.exceptions.Timeout: + return { + "success": False, + "content": None, + "error": f"Content request timed out after {self.timeout} seconds" + } + + except requests.exceptions.HTTPError as e: + error_msg = f"HTTP error: {e.response.status_code}" + try: + error_details = e.response.json() + error_msg += f" - {error_details.get('error', str(error_details))}" + except: + error_msg += f" - {e.response.text[:200]}" + + return { + "success": False, + "content": None, + "error": error_msg + } + + except Exception as e: + return { + "success": False, + "content": None, + "error": f"Content retrieval failed: {str(e)}" + } + def check_health(self) -> bool: """ Check if the API server is healthy. diff --git a/evals/lib/judge.py b/evals/lib/judge.py index 0878c17..400b07a 100644 --- a/evals/lib/judge.py +++ b/evals/lib/judge.py @@ -190,6 +190,201 @@ def _build_judge_prompt( return prompt +class VisionJudge: + """Vision-capable LLM judge for evaluating agent responses with screenshots.""" + + def __init__( + self, + provider: str, + model_name: str, + api_key: str, + temperature: float = None + ): + """ + Initialize Vision judge. 
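+
+        Temperature is optional: when left as None no temperature parameter is
+        sent and the model default is used, matching how the runners configure
+        judges for GPT-5 family models.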
+ + Args: + provider: Provider name (currently only "openai" supported) + model_name: Model name (e.g., "gpt-4o", "gpt-4-vision-preview") + api_key: API key for the provider + temperature: Sampling temperature (optional, None uses model default) + """ + self.provider = provider + self.model_name = model_name + self.api_key = api_key + self.temperature = temperature + + if provider == "openai": + self.client = OpenAI(api_key=api_key) + else: + raise ValueError(f"Unsupported judge provider: {provider}") + + def judge( + self, + input_prompt: str, + response: str, + criteria: List[str], + screenshots: Dict[str, str] = None, + verification_prompts: List[str] = None + ) -> JudgeResult: + """ + Judge a response against evaluation criteria with visual verification. + + Args: + input_prompt: The original input/prompt sent to the agent + response: The agent's response to evaluate + criteria: List of criteria strings to evaluate against + screenshots: Dict with 'before' and/or 'after' screenshot base64 data URLs + verification_prompts: Optional list of visual verification prompts + + Returns: + JudgeResult with pass/fail, score, and reasoning + """ + # Build judgment prompt + judge_prompt = self._build_judge_prompt( + input_prompt, + response, + criteria, + verification_prompts or [] + ) + + # Build message content with text and images + content = [{"type": "text", "text": judge_prompt}] + + # Add screenshots if provided + if screenshots: + if screenshots.get("before"): + content.append({ + "type": "image_url", + "image_url": {"url": screenshots["before"], "detail": "auto"} + }) + content.append({ + "type": "text", + "text": "BEFORE Screenshot: The page state before the agent action" + }) + + if screenshots.get("after"): + content.append({ + "type": "image_url", + "image_url": {"url": screenshots["after"], "detail": "auto"} + }) + content.append({ + "type": "text", + "text": "AFTER Screenshot: The page state after the agent action" + }) + + try: + # Build API call parameters + call_params = { + "model": self.model_name, + "messages": [ + { + "role": "system", + "content": "You are an expert evaluator assessing AI agent responses with visual verification capabilities. " + "Analyze both text responses and screenshots to provide objective, detailed assessments based on the given criteria." + }, + { + "role": "user", + "content": content + } + ], + "response_format": {"type": "json_object"} + } + + # Only add temperature if it's specified + if self.temperature is not None: + call_params["temperature"] = self.temperature + + # Call LLM to judge + completion = self.client.chat.completions.create(**call_params) + + # Parse response + result_text = completion.choices[0].message.content + result_data = json.loads(result_text) + + # Extract fields + passed = result_data.get("passed", False) + score = result_data.get("score", 0.0) + reasoning = result_data.get("reasoning", "") + criteria_results = result_data.get("criteria_results", {}) + + return JudgeResult( + passed=passed, + score=score, + reasoning=reasoning, + criteria_results=criteria_results + ) + + except Exception as e: + # Return failure result on error + return JudgeResult( + passed=False, + score=0.0, + reasoning=f"Vision judge evaluation failed: {str(e)}", + criteria_results={} + ) + + def _build_judge_prompt( + self, + input_prompt: str, + response: str, + criteria: List[str], + verification_prompts: List[str] + ) -> str: + """ + Build the judgment prompt for the vision LLM. 
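+
+        The prompt asks the model to reply with a JSON object containing
+        "passed", "score", "reasoning", and "criteria_results"; screenshots are
+        not embedded in the prompt text but attached by judge() as separate
+        image_url content parts.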
+ + Args: + input_prompt: Original input + response: Agent's response + criteria: List of evaluation criteria + verification_prompts: List of visual verification prompts + + Returns: + Formatted prompt string + """ + criteria_list = "\n".join([f"{i+1}. {c}" for i, c in enumerate(criteria)]) + + prompt = f"""Evaluate the following AI agent response against the specified criteria. + +## Original Input/Task +{input_prompt} + +## Agent's Response +{response} +""" + + # Add visual verification prompts if provided + if verification_prompts: + verification_list = "\n".join([f"{i+1}. {p}" for i, p in enumerate(verification_prompts)]) + prompt += f""" +## Visual Verification Prompts +{verification_list} +""" + + prompt += f""" +## Evaluation Criteria +{criteria_list} + +## Your Task +Evaluate whether the agent's response satisfies each criterion. Use the screenshots (if provided) to verify the visual state of the page before and after the agent's action. Provide your assessment in JSON format with the following structure: + +{{ + "passed": true/false, // Overall pass/fail + "score": 0.0-1.0, // Numerical score (0=complete failure, 1=perfect) + "reasoning": "Detailed explanation of your assessment including visual analysis", + "criteria_results": {{ + "Criterion 1 text": true/false, + "Criterion 2 text": true/false, + ... + }} +}} + +Be strict but fair in your evaluation. A response should only pass if it genuinely satisfies the criteria. +""" + return prompt + + class SimpleJudge: """Simple keyword-based judge for basic evaluations (fallback).""" diff --git a/evals/run_action_agent.py b/evals/run_action_agent.py index 52858e5..cbe115e 100755 --- a/evals/run_action_agent.py +++ b/evals/run_action_agent.py @@ -51,7 +51,7 @@ def __init__(self, config: ConfigLoader): provider=judge_config['provider'], model_name=judge_config['model_name'], api_key=judge_config['api_key'], - temperature=judge_config.get('temperature', 0.1) + temperature=judge_config.get('temperature') # None by default for GPT-5 compatibility ) # Get nested model config for API requests diff --git a/evals/run_test_simple.py b/evals/run_test_simple.py new file mode 100755 index 0000000..72dd37b --- /dev/null +++ b/evals/run_test_simple.py @@ -0,0 +1,333 @@ +#!/usr/bin/env python3 +""" +Test Simple Evaluation Runner + +Runs evaluations for test-simple category and generates reports. +""" + +import argparse +import csv +import sys +import time +from datetime import datetime +from pathlib import Path +from typing import List + +# Add lib directory to path +sys.path.insert(0, str(Path(__file__).parent)) + +from lib import ( + ConfigLoader, + EvalLoader, + APIClient, + LLMJudge, + Evaluation, + JudgeResult +) + + +class EvaluationRunner: + """Manages evaluation execution and reporting.""" + + def __init__(self, config: ConfigLoader): + """ + Initialize evaluation runner. 
+ + Args: + config: Configuration loader + """ + self.config = config + + # Initialize components + self.eval_loader = EvalLoader() + self.api_client = APIClient( + base_url=config.get_api_endpoint(), + timeout=config.get_timeout() + ) + + # Initialize judge + judge_config = config.get_judge_config() + self.judge = LLMJudge( + provider=judge_config['provider'], + model_name=judge_config['model_name'], + api_key=judge_config['api_key'], + temperature=judge_config.get('temperature') # None by default for GPT-5 compatibility + ) + + # Get nested model config for API requests + self.model_config = config.get_nested_model_config() + + # Results tracking + self.results = [] + + def run_evaluations( + self, + category: str, + limit: int = None, + eval_ids: List[str] = None + ): + """ + Run evaluations for a specific category. + + Args: + category: Category name (e.g., 'test-simple') + limit: Maximum number of evaluations to run + eval_ids: Optional list of specific evaluation IDs to run + """ + print(f"\n{'='*70}") + print(f"Running {category} Evaluations") + print(f"{'='*70}\n") + + # Check API server health + print("Checking API server connection...") + if not self.api_client.check_health(): + print("ERROR: Cannot connect to API server at", self.config.get_api_endpoint()) + print("Please ensure the evaluation server is running.") + sys.exit(1) + print("โœ“ API server is reachable\n") + + # Load evaluations + print(f"Loading evaluations from {category}...") + evaluations = self.eval_loader.load_from_directory( + category=category, + enabled_only=True + ) + + # Filter by eval_ids if specified + if eval_ids: + evaluations = [e for e in evaluations if e.id in eval_ids] + + # Apply limit + if limit: + evaluations = evaluations[:limit] + + if not evaluations: + print(f"No evaluations found in category: {category}") + return + + print(f"Found {len(evaluations)} evaluations to run\n") + + # Run each evaluation + for i, evaluation in enumerate(evaluations, 1): + print(f"[{i}/{len(evaluations)}] Running: {evaluation.name}") + print(f" ID: {evaluation.id}") + + try: + result = self._run_single_evaluation(evaluation) + self.results.append(result) + + # Print result + status = "PASS" if result['passed'] else "FAIL" + print(f" Status: {status}") + print(f" Score: {result['score']:.2f}") + print(f" Time: {result['execution_time_ms']}ms") + print() + + # Add delay between requests + if i < len(evaluations): + delay = self.config.get_execution_config().get('request_delay', 1) + if delay > 0: + time.sleep(delay) + + except KeyboardInterrupt: + print("\n\nInterrupted by user. Saving partial results...") + break + except Exception as e: + print(f" ERROR: {str(e)}\n") + # Record failure + self.results.append({ + 'eval_id': evaluation.id, + 'eval_name': evaluation.name, + 'category': category, + 'passed': False, + 'score': 0.0, + 'reasoning': f"Execution error: {str(e)}", + 'execution_time_ms': 0, + 'error': str(e) + }) + + # Print summary + self._print_summary() + + # Save report + self._save_report(category) + + def _run_single_evaluation(self, evaluation: Evaluation) -> dict: + """ + Run a single evaluation. 
+ + Args: + evaluation: Evaluation to run + + Returns: + Result dictionary + """ + # Get input message + input_message = evaluation.get_input_message() + + # Get target URL and wait timeout + target_url = evaluation.get_target_url() + wait_timeout = evaluation.get_wait_timeout() + + # Send API request + api_response = self.api_client.send_request( + input_message=input_message, + model_config=self.model_config, + url=target_url, + wait_timeout=wait_timeout + ) + + if not api_response['success']: + return { + 'eval_id': evaluation.id, + 'eval_name': evaluation.name, + 'category': evaluation.category, + 'passed': False, + 'score': 0.0, + 'reasoning': f"API request failed: {api_response['error']}", + 'execution_time_ms': api_response['execution_time_ms'], + 'error': api_response['error'] + } + + # Judge the response + criteria = evaluation.get_validation_criteria() + judge_result = self.judge.judge( + input_prompt=input_message, + response=api_response['response'], + criteria=criteria + ) + + return { + 'eval_id': evaluation.id, + 'eval_name': evaluation.name, + 'category': evaluation.category, + 'passed': judge_result.passed, + 'score': judge_result.score, + 'reasoning': judge_result.reasoning, + 'execution_time_ms': api_response['execution_time_ms'], + 'error': None + } + + def _print_summary(self): + """Print summary statistics.""" + if not self.results: + return + + total = len(self.results) + passed = sum(1 for r in self.results if r['passed']) + failed = total - passed + pass_rate = (passed / total) * 100 if total > 0 else 0 + avg_score = sum(r['score'] for r in self.results) / total if total > 0 else 0 + avg_time = sum(r['execution_time_ms'] for r in self.results) / total if total > 0 else 0 + + print(f"\n{'='*70}") + print("Summary") + print(f"{'='*70}") + print(f"Total: {total}") + print(f"Passed: {passed} ({pass_rate:.1f}%)") + print(f"Failed: {failed}") + print(f"Average Score: {avg_score:.2f}") + print(f"Average Time: {avg_time:.0f}ms") + print(f"{'='*70}\n") + + def _save_report(self, category: str): + """ + Save evaluation results to CSV report. 
+ + Args: + category: Category name for report filename + """ + if not self.results: + return + + # Create reports directory + reports_dir = self.config.get_reports_dir() + reports_dir.mkdir(parents=True, exist_ok=True) + + # Generate filename with timestamp + timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + filename = f"{category}_{timestamp}.csv" + filepath = reports_dir / filename + + # Write CSV + with open(filepath, 'w', newline='', encoding='utf-8') as f: + fieldnames = [ + 'timestamp', + 'eval_id', + 'eval_name', + 'category', + 'status', + 'score', + 'judge_reasoning', + 'execution_time_ms', + 'error' + ] + writer = csv.DictWriter(f, fieldnames=fieldnames) + + writer.writeheader() + for result in self.results: + writer.writerow({ + 'timestamp': datetime.now().isoformat(), + 'eval_id': result['eval_id'], + 'eval_name': result['eval_name'], + 'category': result['category'], + 'status': 'PASS' if result['passed'] else 'FAIL', + 'score': f"{result['score']:.2f}", + 'judge_reasoning': result['reasoning'], + 'execution_time_ms': result['execution_time_ms'], + 'error': result.get('error', '') + }) + + print(f"Report saved to: {filepath}") + + +def main(): + """Main entry point.""" + parser = argparse.ArgumentParser( + description="Run test-simple evaluations" + ) + parser.add_argument( + '--limit', + type=int, + default=None, + help='Maximum number of evaluations to run (default: all)' + ) + parser.add_argument( + '--eval-ids', + nargs='+', + help='Specific evaluation IDs to run' + ) + parser.add_argument( + '--config', + type=str, + default=None, + help='Path to config.yml (default: evals/config.yml)' + ) + + args = parser.parse_args() + + try: + # Load configuration + config = ConfigLoader(config_path=args.config) + + # Use limit from config if not specified + limit = args.limit if args.limit is not None else config.get_default_limit() + + # Create and run evaluation runner + runner = EvaluationRunner(config) + runner.run_evaluations( + category='test-simple', + limit=limit, + eval_ids=args.eval_ids + ) + + except KeyboardInterrupt: + print("\nInterrupted by user") + sys.exit(1) + except Exception as e: + print(f"ERROR: {e}") + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/evals/test_vision_judge.py b/evals/test_vision_judge.py new file mode 100644 index 0000000..290db07 --- /dev/null +++ b/evals/test_vision_judge.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 +""" +Test script for VisionJudge functionality. 
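+
+These are offline smoke checks: the judge is constructed with a dummy API key
+and only construction, method availability, and method signatures are verified,
+so no running eval-server or OpenAI access is required.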
+""" + +import os +from lib.judge import VisionJudge, JudgeResult +from lib.api_client import APIClient + +def test_vision_judge_creation(): + """Test that VisionJudge can be created.""" + print("Testing VisionJudge creation...") + + # Create judge with dummy API key (won't actually call API in this test) + judge = VisionJudge( + provider="openai", + model_name="gpt-4o", + api_key="test-key-12345" + ) + + print("โœ… VisionJudge created successfully") + print(f" Provider: {judge.provider}") + print(f" Model: {judge.model_name}") + return judge + +def test_api_client_methods(): + """Test that APIClient has screenshot methods.""" + print("\nTesting APIClient methods...") + + client = APIClient("http://localhost:8081") + + # Check methods exist + assert hasattr(client, 'capture_screenshot'), "Missing capture_screenshot method" + assert hasattr(client, 'get_page_content'), "Missing get_page_content method" + + print("โœ… APIClient has required methods:") + print(" - capture_screenshot(client_id, tab_id, full_page)") + print(" - get_page_content(client_id, tab_id, format)") + +def test_vision_judge_signature(): + """Test VisionJudge.judge() method signature.""" + print("\nTesting VisionJudge.judge() signature...") + + # Get method signature + import inspect + judge = VisionJudge(provider="openai", model_name="gpt-4o", api_key="test") + sig = inspect.signature(judge.judge) + + params = list(sig.parameters.keys()) + print(f"โœ… VisionJudge.judge() parameters: {params}") + + # Verify expected parameters + assert 'input_prompt' in params + assert 'response' in params + assert 'criteria' in params + assert 'screenshots' in params + assert 'verification_prompts' in params + + print(" All expected parameters present:") + print(" - input_prompt: str") + print(" - response: str") + print(" - criteria: List[str]") + print(" - screenshots: Dict[str, str] = None") + print(" - verification_prompts: List[str] = None") + +if __name__ == "__main__": + print("=" * 60) + print("Vision Judge Implementation Test") + print("=" * 60) + + try: + test_vision_judge_creation() + test_api_client_methods() + test_vision_judge_signature() + + print("\n" + "=" * 60) + print("โœ… All tests passed!") + print("=" * 60) + print("\nNext steps to test with real API:") + print("1. Start eval-server: cd ../eval-server/nodejs && npm start") + print("2. Start browser with CDP: chromium --remote-debugging-port=9223") + print("3. Connect an agent to eval-server") + print("4. Use APIClient.capture_screenshot() to get screenshots") + print("5. 
Use VisionJudge.judge() with screenshots for evaluation") + + except Exception as e: + print(f"\nโŒ Test failed: {e}") + import traceback + traceback.print_exc() + exit(1) From c94dd248d8868a048efe8aea1c45df0f6bb1ea8c Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Mon, 20 Oct 2025 19:53:14 -0500 Subject: [PATCH 10/24] Changed evals structure --- docker-compose.yml | 2 + .../nodejs/examples/with-http-wrapper.js | 6 +- eval-server/nodejs/src/api-server.js | 28 +- evals/config.yml | 8 +- evals/data/action-agent/accordion-001.yaml | 2 - evals/lib/__init__.py | 3 +- evals/lib/api_client.py | 24 + evals/lib/eval_loader.py | 40 ++ evals/run.py | 628 ++++++++++++++++++ evals/run_action_agent.py | 333 ---------- evals/run_test_simple.py | 333 ---------- 11 files changed, 725 insertions(+), 682 deletions(-) create mode 100755 evals/run.py delete mode 100755 evals/run_action_agent.py delete mode 100755 evals/run_test_simple.py diff --git a/docker-compose.yml b/docker-compose.yml index d54e756..d783215 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -39,6 +39,8 @@ services: - "./kernel-images/images/chromium-headful/.tmp/chromium/flags:/chromium/flags:ro" # Persist Chromium data across container restarts (set CHROMIUM_DATA_HOST env var to customize path) - "${CHROMIUM_DATA_HOST:-./chromium-data}:/data" + # Mount eval-server code for live updates during development + - "./eval-server/nodejs:/opt/eval-server" tmpfs: - /dev/shm:size=2g restart: unless-stopped diff --git a/eval-server/nodejs/examples/with-http-wrapper.js b/eval-server/nodejs/examples/with-http-wrapper.js index a26b0c9..688f532 100644 --- a/eval-server/nodejs/examples/with-http-wrapper.js +++ b/eval-server/nodejs/examples/with-http-wrapper.js @@ -18,7 +18,7 @@ const evalServer = new EvalServer({ console.log('๐Ÿ”ง Creating HTTP wrapper...'); const httpWrapper = new HTTPWrapper(evalServer, { - port: 8083, + port: 8080, host: '0.0.0.0' }); @@ -29,11 +29,11 @@ console.log('โœ… EvalServer started on ws://127.0.0.1:8082'); console.log('๐Ÿ”ง Starting HTTP wrapper...'); await httpWrapper.start(); -console.log('โœ… HTTP API started on http://127.0.0.1:8083'); +console.log('โœ… HTTP API started on http://0.0.0.0:8080'); console.log('โณ Waiting for DevTools client to connect...'); console.log(' WebSocket URL: ws://127.0.0.1:8082'); -console.log(' HTTP API URL: http://127.0.0.1:8083'); +console.log(' HTTP API URL: http://0.0.0.0:8080'); console.log(' Auth: Disabled (automated mode)'); // Add periodic status check diff --git a/eval-server/nodejs/src/api-server.js b/eval-server/nodejs/src/api-server.js index 50b2be1..fd3b13d 100644 --- a/eval-server/nodejs/src/api-server.js +++ b/eval-server/nodejs/src/api-server.js @@ -499,8 +499,8 @@ class APIServer { // Extract the response text from the result const responseText = this.extractResponseText(result); - // Format in OpenAI Responses API format - return this.formatOpenAIResponse(responseText); + // Format in OpenAI-compatible Responses API format with tab metadata + return this.formatResponse(responseText, tabResult.compositeClientId.split(':')[0], tabResult.tabId); } catch (error) { logger.error('Error handling responses request:', error); @@ -701,12 +701,15 @@ class APIServer { } /** - * Format response in OpenAI Responses API format + * Format response in OpenAI-compatible Responses API format */ - formatOpenAIResponse(responseText) { + formatResponse(responseText, clientId = null, tabId = null) { const messageId = `msg_${uuidv4().replace(/-/g, '')}`; - - return [ + + // Debug: log 
the parameters + logger.debug('formatResponse called with:', { clientId, tabId, hasClientId: !!clientId, hasTabId: !!tabId }); + + const response = [ { id: messageId, type: 'message', @@ -720,6 +723,19 @@ class APIServer { ] } ]; + + // Add metadata if clientId and tabId are provided + if (clientId && tabId) { + response[0].metadata = { + clientId, + tabId + }; + logger.debug('Metadata added to response:', response[0].metadata); + } else { + logger.debug('Metadata NOT added - clientId or tabId missing'); + } + + return response; } sendResponse(res, statusCode, data) { diff --git a/evals/config.yml b/evals/config.yml index 5de80b5..5b42f16 100644 --- a/evals/config.yml +++ b/evals/config.yml @@ -9,12 +9,12 @@ api_endpoint: "http://localhost:8080" main_model: provider: "openai" - model_name: "gpt-5" + model_name: "gpt-5-mini" api_key: "${OPENAI_API_KEY}" mini_model: provider: "openai" - model_name: "gpt-5-mini" + model_name: "gpt-5-nano" api_key: "${OPENAI_API_KEY}" nano_model: @@ -30,12 +30,12 @@ nano_model: # mini_model: # provider: "openrouter" -# model_name: "x-ai/grok-4-fast:free" +# model_name: "openai/gpt-oss-20b:free" # api_key: "${OPENROUTER_API_KEY}" # nano_model: # provider: "openrouter" -# model_name: "x-ai/grok-4-fast:free" +# model_name: "openai/gpt-oss-20b:free" # api_key: "${OPENROUTER_API_KEY}" # Model configuration for judging evaluation responses diff --git a/evals/data/action-agent/accordion-001.yaml b/evals/data/action-agent/accordion-001.yaml index dae142d..fc2fdbb 100644 --- a/evals/data/action-agent/accordion-001.yaml +++ b/evals/data/action-agent/accordion-001.yaml @@ -26,7 +26,6 @@ validation: - "Successfully clicked to expand the section" - "Section 2 content became visible" - "Other sections collapsed appropriately" - - "Accordion animation completed smoothly" visual_verification: enabled: true capture_before: true @@ -34,7 +33,6 @@ validation: prompts: - "Verify Section 2 is now expanded and content visible" - "Check if other accordion sections collapsed" - - "Confirm the expansion animation completed" - "Ensure Section 2 header shows expanded state" metadata: diff --git a/evals/lib/__init__.py b/evals/lib/__init__.py index a6b245b..db65bcc 100644 --- a/evals/lib/__init__.py +++ b/evals/lib/__init__.py @@ -5,7 +5,7 @@ from .config_loader import ConfigLoader, get_config from .eval_loader import EvalLoader, Evaluation from .api_client import APIClient -from .judge import LLMJudge, SimpleJudge, JudgeResult +from .judge import LLMJudge, SimpleJudge, VisionJudge, JudgeResult __all__ = [ 'ConfigLoader', @@ -15,5 +15,6 @@ 'APIClient', 'LLMJudge', 'SimpleJudge', + 'VisionJudge', 'JudgeResult' ] diff --git a/evals/lib/api_client.py b/evals/lib/api_client.py index af6bdf7..f2cf3bb 100644 --- a/evals/lib/api_client.py +++ b/evals/lib/api_client.py @@ -92,11 +92,16 @@ def send_request( # Extract text from OpenAI Responses API format response_text = self._extract_response_text(response_data) + # Extract client/tab IDs from metadata (if present) + client_id, tab_id = self._extract_metadata(response_data) + return { "success": True, "response": response_text, "raw_response": response_data, "execution_time_ms": execution_time_ms, + "client_id": client_id, + "tab_id": tab_id, "error": None } @@ -192,6 +197,25 @@ def _extract_response_text(self, response_data: Any) -> str: except Exception as e: return f"[Error extracting response: {e}]" + def _extract_metadata(self, response_data: Any) -> tuple[str | None, str | None]: + """ + Extract clientId and tabId from response metadata. 
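+
+        The eval-server attaches "metadata" to the first message object of the
+        Responses-style array when a tab was used, e.g.:
+
+            [{"id": "msg_...", "type": "message", ...,
+              "metadata": {"clientId": "...", "tabId": "..."}}]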
+ + Args: + response_data: Raw API response + + Returns: + Tuple of (client_id, tab_id) or (None, None) + """ + try: + if isinstance(response_data, list) and len(response_data) > 0: + message = response_data[0] + metadata = message.get('metadata', {}) + return metadata.get('clientId'), metadata.get('tabId') + except Exception: + pass + return None, None + def capture_screenshot( self, client_id: str, diff --git a/evals/lib/eval_loader.py b/evals/lib/eval_loader.py index e1bc555..f25303f 100644 --- a/evals/lib/eval_loader.py +++ b/evals/lib/eval_loader.py @@ -106,6 +106,46 @@ def get_judge_model(self) -> str: llm_judge = self.validation.get('llm_judge', {}) return llm_judge.get('model', 'gpt-4.1-mini') + def requires_vision_judge(self) -> bool: + """ + Check if this evaluation requires vision judge (visual verification). + + Returns: + True if visual verification is enabled, False otherwise + """ + if self.validation_type != 'llm-judge': + return False + + llm_judge = self.validation.get('llm_judge', {}) + visual_verification = llm_judge.get('visual_verification', {}) + return visual_verification.get('enabled', False) + + def get_visual_verification_config(self) -> Optional[Dict[str, Any]]: + """ + Get visual verification configuration. + + Returns: + Visual verification config dict or None if not enabled + """ + if not self.requires_vision_judge(): + return None + + llm_judge = self.validation.get('llm_judge', {}) + return llm_judge.get('visual_verification', {}) + + def get_verification_prompts(self) -> List[str]: + """ + Get visual verification prompts. + + Returns: + List of verification prompt strings for vision judge + """ + visual_config = self.get_visual_verification_config() + if not visual_config: + return [] + + return visual_config.get('prompts', []) + def get_target_url(self) -> Optional[str]: """ Get the target URL for this evaluation. diff --git a/evals/run.py b/evals/run.py new file mode 100755 index 0000000..2d1efff --- /dev/null +++ b/evals/run.py @@ -0,0 +1,628 @@ +#!/usr/bin/env python3 +""" +Universal Evaluation Runner + +Runs evaluations from YAML definitions with flexible execution modes: +- Run specific eval by path: --path action-agent/a11y-001.yaml +- Run all evals in category: --category action-agent +- Run all evals: --all +""" + +import argparse +import csv +import sys +import time +from datetime import datetime +from pathlib import Path +from typing import List, Optional + +# Add lib directory to path +sys.path.insert(0, str(Path(__file__).parent)) + +from lib import ( + ConfigLoader, + EvalLoader, + APIClient, + LLMJudge, + VisionJudge, + Evaluation, + JudgeResult +) + + +class EvaluationRunner: + """Manages evaluation execution and reporting.""" + + def __init__(self, config: ConfigLoader, verbose: bool = False): + """ + Initialize evaluation runner. 
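+
+        Constructs both a text LLMJudge and a VisionJudge from the same judge
+        config; the vision judge is intended for evaluations that enable
+        visual_verification in their llm_judge block (see
+        Evaluation.requires_vision_judge).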
+ + Args: + config: Configuration loader + verbose: Enable verbose output + """ + self.config = config + self.verbose = verbose + + # Initialize components + self.eval_loader = EvalLoader() + self.api_client = APIClient( + base_url=config.get_api_endpoint(), + timeout=config.get_timeout() + ) + + # Initialize judges + judge_config = config.get_judge_config() + self.judge = LLMJudge( + provider=judge_config['provider'], + model_name=judge_config['model_name'], + api_key=judge_config['api_key'], + temperature=judge_config.get('temperature') + ) + self.vision_judge = VisionJudge( + provider=judge_config['provider'], + model_name=judge_config['model_name'], + api_key=judge_config['api_key'], + temperature=judge_config.get('temperature') + ) + + # Get nested model config for API requests + self.model_config = config.get_nested_model_config() + + # Results tracking + self.results = [] + + # Create screenshots directory + self.screenshots_dir = Path(__file__).parent / 'screenshots' + self.screenshots_dir.mkdir(exist_ok=True) + + def run_from_path(self, eval_path: str): + """ + Run a specific evaluation from a file path. + + Args: + eval_path: Path to evaluation YAML file (relative to data/ or absolute) + """ + print(f"\n{'='*70}") + print(f"Running Evaluation from Path") + print(f"{'='*70}\n") + + # Check API server health + print("Checking API server connection...") + if not self.api_client.check_health(): + print("ERROR: Cannot connect to API server at", self.config.get_api_endpoint()) + print("Please ensure the evaluation server is running.") + sys.exit(1) + print("โœ“ API server is reachable\n") + + # Resolve path + eval_file = self._resolve_eval_path(eval_path) + if not eval_file.exists(): + print(f"ERROR: Evaluation file not found: {eval_file}") + sys.exit(1) + + # Load evaluation + print(f"Loading evaluation from {eval_path}...") + import yaml + with open(eval_file, 'r') as f: + data = yaml.safe_load(f) + + evaluation = Evaluation(eval_file, data) + + if not evaluation.enabled: + print(f"WARNING: Evaluation {evaluation.id} is disabled") + return + + print(f"Found: {evaluation.name} (ID: {evaluation.id})\n") + + # Run evaluation + try: + result = self._run_single_evaluation(evaluation) + self.results.append(result) + + # Print result + status = "PASS" if result['passed'] else "FAIL" + print(f"[1/1] Running: {evaluation.name}") + print(f" ID: {evaluation.id}") + print(f" Status: {status}") + print(f" Score: {result['score']:.2f}") + print(f" Time: {result['execution_time_ms']}ms") + print() + + except Exception as e: + print(f" ERROR: {str(e)}\n") + self.results.append({ + 'eval_id': evaluation.id, + 'eval_name': evaluation.name, + 'category': evaluation.category, + 'passed': False, + 'score': 0.0, + 'reasoning': f"Execution error: {str(e)}", + 'execution_time_ms': 0, + 'error': str(e) + }) + + # Print summary + self._print_summary() + + # Save report + self._save_report(evaluation.category) + + def run_evaluations( + self, + category: Optional[str] = None, + limit: Optional[int] = None, + eval_ids: Optional[List[str]] = None, + run_all: bool = False + ): + """ + Run evaluations for a specific category or all categories. 
+ + Args: + category: Category name (e.g., 'action-agent'), None for all + limit: Maximum number of evaluations to run + eval_ids: Optional list of specific evaluation IDs to run + run_all: Run all evaluations across all categories + """ + title = "All Evaluations" if run_all else f"{category} Evaluations" + print(f"\n{'='*70}") + print(f"Running {title}") + print(f"{'='*70}\n") + + # Check API server health + print("Checking API server connection...") + if not self.api_client.check_health(): + print("ERROR: Cannot connect to API server at", self.config.get_api_endpoint()) + print("Please ensure the evaluation server is running.") + sys.exit(1) + print("โœ“ API server is reachable\n") + + # Load evaluations + if run_all: + print("Loading all evaluations...") + data_dir = Path(__file__).parent / 'data' + categories = [d.name for d in data_dir.iterdir() if d.is_dir() and not d.name.startswith('.')] + all_evaluations = [] + for cat in categories: + evals = self.eval_loader.load_from_directory(category=cat, enabled_only=True) + all_evaluations.extend(evals) + evaluations = all_evaluations + else: + print(f"Loading evaluations from {category}...") + evaluations = self.eval_loader.load_from_directory( + category=category, + enabled_only=True + ) + + # Filter by eval_ids if specified + if eval_ids: + evaluations = [e for e in evaluations if e.id in eval_ids] + + # Apply limit + if limit: + evaluations = evaluations[:limit] + + if not evaluations: + msg = "all categories" if run_all else f"category: {category}" + print(f"No evaluations found in {msg}") + return + + print(f"Found {len(evaluations)} evaluations to run\n") + + # Run each evaluation + for i, evaluation in enumerate(evaluations, 1): + print(f"[{i}/{len(evaluations)}] Running: {evaluation.name}") + print(f" ID: {evaluation.id}") + + try: + result = self._run_single_evaluation(evaluation) + self.results.append(result) + + # Print result + status = "PASS" if result['passed'] else "FAIL" + print(f" Status: {status}") + print(f" Score: {result['score']:.2f}") + print(f" Time: {result['execution_time_ms']}ms") + print() + + # Add delay between requests + if i < len(evaluations): + delay = self.config.get_execution_config().get('request_delay', 1) + if delay > 0: + time.sleep(delay) + + except KeyboardInterrupt: + print("\n\nInterrupted by user. Saving partial results...") + break + except Exception as e: + print(f" ERROR: {str(e)}\n") + # Record failure + self.results.append({ + 'eval_id': evaluation.id, + 'eval_name': evaluation.name, + 'category': evaluation.category, + 'passed': False, + 'score': 0.0, + 'reasoning': f"Execution error: {str(e)}", + 'execution_time_ms': 0, + 'error': str(e) + }) + + # Print summary + self._print_summary() + + # Save report + report_category = 'all' if run_all else category + self._save_report(report_category) + + def _resolve_eval_path(self, eval_path: str) -> Path: + """ + Resolve evaluation path to absolute path. 
+ + Args: + eval_path: Relative or absolute path to eval file + + Returns: + Absolute Path object + """ + path = Path(eval_path) + + # If absolute and exists, use it + if path.is_absolute() and path.exists(): + return path + + # Try relative to data directory + data_dir = Path(__file__).parent / 'data' + candidate = data_dir / eval_path + if candidate.exists(): + return candidate + + # Try as-is (relative to current directory) + if path.exists(): + return path.resolve() + + # Return the data_dir candidate (will fail with proper error message) + return candidate + + def _run_single_evaluation(self, evaluation: Evaluation) -> dict: + """ + Run a single evaluation. + + Args: + evaluation: Evaluation to run + + Returns: + Result dictionary + """ + # Get input message + input_message = evaluation.get_input_message() + + # Verbose: print input + if self.verbose: + print(f"\n Input: {input_message}") + + # Get target URL and wait timeout + target_url = evaluation.get_target_url() + wait_timeout = evaluation.get_wait_timeout() + + # Send API request + api_response = self.api_client.send_request( + input_message=input_message, + model_config=self.model_config, + url=target_url, + wait_timeout=wait_timeout + ) + + if not api_response['success']: + return { + 'eval_id': evaluation.id, + 'eval_name': evaluation.name, + 'category': evaluation.category, + 'passed': False, + 'score': 0.0, + 'reasoning': f"API request failed: {api_response['error']}", + 'execution_time_ms': api_response['execution_time_ms'], + 'error': api_response['error'], + 'screenshot_path': None + } + + # Verbose: print response + if self.verbose: + print(f" Response: {api_response['response'][:200]}{'...' if len(api_response['response']) > 200 else ''}") + + # Capture screenshot if client/tab IDs are available + screenshot_path = None + if api_response.get('client_id') and api_response.get('tab_id'): + screenshot_path = self._capture_screenshot( + evaluation.id, + api_response['client_id'], + api_response['tab_id'] + ) + + # Judge the response + criteria = evaluation.get_validation_criteria() + + # Check if visual verification is required + if evaluation.requires_vision_judge() and screenshot_path: + # Use VisionJudge with screenshot + screenshot_data_url = self._load_screenshot_as_data_url(screenshot_path) + verification_prompts = evaluation.get_verification_prompts() + + if self.verbose: + print(f" Using Vision Judge with screenshot") + + judge_result = self.vision_judge.judge( + input_prompt=input_message, + response=api_response['response'], + criteria=criteria, + screenshots={"after": screenshot_data_url} if screenshot_data_url else None, + verification_prompts=verification_prompts if verification_prompts else None + ) + else: + # Use standard LLMJudge + judge_result = self.judge.judge( + input_prompt=input_message, + response=api_response['response'], + criteria=criteria + ) + + # Verbose: print reasoning + if self.verbose: + print(f" Judge Reasoning: {judge_result.reasoning}") + if screenshot_path: + print(f" Screenshot: {screenshot_path}") + + return { + 'eval_id': evaluation.id, + 'eval_name': evaluation.name, + 'category': evaluation.category, + 'passed': judge_result.passed, + 'score': judge_result.score, + 'reasoning': judge_result.reasoning, + 'execution_time_ms': api_response['execution_time_ms'], + 'error': None, + 'screenshot_path': screenshot_path + } + + def _capture_screenshot(self, eval_id: str, client_id: str, tab_id: str) -> str | None: + """ + Capture screenshot of the page after evaluation. 
+ + Args: + eval_id: Evaluation ID for filename + client_id: Client ID + tab_id: Tab ID + + Returns: + Path to saved screenshot or None if failed + """ + try: + from datetime import datetime + import base64 + + result = self.api_client.capture_screenshot(client_id, tab_id, full_page=False) + + if result['success'] and result.get('image_data'): + # Generate filename with timestamp + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + filename = f"{eval_id}_{timestamp}.png" + filepath = self.screenshots_dir / filename + + # Extract base64 data (remove data:image/png;base64, prefix if present) + image_data = result['image_data'] + if image_data.startswith('data:image'): + image_data = image_data.split(',', 1)[1] + + # Save screenshot + with open(filepath, 'wb') as f: + f.write(base64.b64decode(image_data)) + + return str(filepath) + + except Exception as e: + if self.verbose: + print(f" Screenshot capture failed: {e}") + + return None + + def _load_screenshot_as_data_url(self, screenshot_path: str) -> str | None: + """ + Load a screenshot file and convert it to a base64 data URL. + + Args: + screenshot_path: Path to the screenshot file + + Returns: + Data URL string (data:image/png;base64,...) or None if failed + """ + try: + import base64 + + with open(screenshot_path, 'rb') as f: + image_data = base64.b64encode(f.read()).decode('utf-8') + return f"data:image/png;base64,{image_data}" + + except Exception as e: + if self.verbose: + print(f" Screenshot load failed: {e}") + return None + + def _print_summary(self): + """Print summary statistics.""" + if not self.results: + return + + total = len(self.results) + passed = sum(1 for r in self.results if r['passed']) + failed = total - passed + pass_rate = (passed / total) * 100 if total > 0 else 0 + avg_score = sum(r['score'] for r in self.results) / total if total > 0 else 0 + avg_time = sum(r['execution_time_ms'] for r in self.results) / total if total > 0 else 0 + + print(f"\n{'='*70}") + print("Summary") + print(f"{'='*70}") + print(f"Total: {total}") + print(f"Passed: {passed} ({pass_rate:.1f}%)") + print(f"Failed: {failed}") + print(f"Average Score: {avg_score:.2f}") + print(f"Average Time: {avg_time:.0f}ms") + print(f"{'='*70}\n") + + def _save_report(self, category: str): + """ + Save evaluation results to CSV report. 
+ + Args: + category: Category name for report filename + """ + if not self.results: + return + + # Create reports directory + reports_dir = self.config.get_reports_dir() + reports_dir.mkdir(parents=True, exist_ok=True) + + # Generate filename with timestamp + timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + filename = f"{category}_{timestamp}.csv" + filepath = reports_dir / filename + + # Write CSV + with open(filepath, 'w', newline='', encoding='utf-8') as f: + fieldnames = [ + 'timestamp', + 'eval_id', + 'eval_name', + 'category', + 'status', + 'score', + 'judge_reasoning', + 'execution_time_ms', + 'error' + ] + writer = csv.DictWriter(f, fieldnames=fieldnames) + + writer.writeheader() + for result in self.results: + writer.writerow({ + 'timestamp': datetime.now().isoformat(), + 'eval_id': result['eval_id'], + 'eval_name': result['eval_name'], + 'category': result['category'], + 'status': 'PASS' if result['passed'] else 'FAIL', + 'score': f"{result['score']:.2f}", + 'judge_reasoning': result['reasoning'], + 'execution_time_ms': result['execution_time_ms'], + 'error': result.get('error', '') + }) + + print(f"Report saved to: {filepath}") + + +def main(): + """Main entry point.""" + parser = argparse.ArgumentParser( + description="Universal evaluation runner for browser-agent evals", + epilog=""" +Examples: + # Run specific eval by path + python3 run.py --path action-agent/a11y-001.yaml + + # Run all evals in a category + python3 run.py --category action-agent --limit 5 + + # Run specific evals by ID + python3 run.py --category action-agent --eval-ids a11y-001 a11y-002 + + # Run all evals across all categories + python3 run.py --all + """, + formatter_class=argparse.RawDescriptionHelpFormatter + ) + + # Execution mode (mutually exclusive) + mode_group = parser.add_mutually_exclusive_group(required=True) + mode_group.add_argument( + '--path', + type=str, + help='Path to specific evaluation YAML file (e.g., action-agent/a11y-001.yaml)' + ) + mode_group.add_argument( + '--category', + type=str, + help='Run all evaluations in a specific category (e.g., action-agent)' + ) + mode_group.add_argument( + '--all', + action='store_true', + help='Run all evaluations across all categories' + ) + + # Filtering options (only for category/all modes) + parser.add_argument( + '--limit', + type=int, + default=None, + help='Maximum number of evaluations to run (default: all)' + ) + parser.add_argument( + '--eval-ids', + nargs='+', + help='Specific evaluation IDs to run (only with --category)' + ) + parser.add_argument( + '--config', + type=str, + default=None, + help='Path to config.yml (default: evals/config.yml)' + ) + parser.add_argument( + '--verbose', + action='store_true', + help='Enable verbose output (show input, response, reasoning, screenshots)' + ) + + args = parser.parse_args() + + # Validate argument combinations + if args.eval_ids and not args.category: + parser.error("--eval-ids can only be used with --category") + + try: + # Load configuration + config = ConfigLoader(config_path=args.config) + + # Create evaluation runner with verbose flag + runner = EvaluationRunner(config, verbose=args.verbose) + + # Execute based on mode + if args.path: + runner.run_from_path(args.path) + elif args.category: + # Use limit from config if not specified + limit = args.limit if args.limit is not None else config.get_default_limit() + runner.run_evaluations( + category=args.category, + limit=limit, + eval_ids=args.eval_ids + ) + elif args.all: + limit = args.limit if args.limit is not None else 
config.get_default_limit() + runner.run_evaluations( + limit=limit, + run_all=True + ) + + except KeyboardInterrupt: + print("\nInterrupted by user") + sys.exit(1) + except Exception as e: + print(f"ERROR: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/evals/run_action_agent.py b/evals/run_action_agent.py deleted file mode 100755 index cbe115e..0000000 --- a/evals/run_action_agent.py +++ /dev/null @@ -1,333 +0,0 @@ -#!/usr/bin/env python3 -""" -Action Agent Evaluation Runner - -Runs evaluations for action-agent category and generates reports. -""" - -import argparse -import csv -import sys -import time -from datetime import datetime -from pathlib import Path -from typing import List - -# Add lib directory to path -sys.path.insert(0, str(Path(__file__).parent)) - -from lib import ( - ConfigLoader, - EvalLoader, - APIClient, - LLMJudge, - Evaluation, - JudgeResult -) - - -class EvaluationRunner: - """Manages evaluation execution and reporting.""" - - def __init__(self, config: ConfigLoader): - """ - Initialize evaluation runner. - - Args: - config: Configuration loader - """ - self.config = config - - # Initialize components - self.eval_loader = EvalLoader() - self.api_client = APIClient( - base_url=config.get_api_endpoint(), - timeout=config.get_timeout() - ) - - # Initialize judge - judge_config = config.get_judge_config() - self.judge = LLMJudge( - provider=judge_config['provider'], - model_name=judge_config['model_name'], - api_key=judge_config['api_key'], - temperature=judge_config.get('temperature') # None by default for GPT-5 compatibility - ) - - # Get nested model config for API requests - self.model_config = config.get_nested_model_config() - - # Results tracking - self.results = [] - - def run_evaluations( - self, - category: str, - limit: int = None, - eval_ids: List[str] = None - ): - """ - Run evaluations for a specific category. 
- - Args: - category: Category name (e.g., 'action-agent') - limit: Maximum number of evaluations to run - eval_ids: Optional list of specific evaluation IDs to run - """ - print(f"\n{'='*70}") - print(f"Running {category} Evaluations") - print(f"{'='*70}\n") - - # Check API server health - print("Checking API server connection...") - if not self.api_client.check_health(): - print("ERROR: Cannot connect to API server at", self.config.get_api_endpoint()) - print("Please ensure the evaluation server is running.") - sys.exit(1) - print("โœ“ API server is reachable\n") - - # Load evaluations - print(f"Loading evaluations from {category}...") - evaluations = self.eval_loader.load_from_directory( - category=category, - enabled_only=True - ) - - # Filter by eval_ids if specified - if eval_ids: - evaluations = [e for e in evaluations if e.id in eval_ids] - - # Apply limit - if limit: - evaluations = evaluations[:limit] - - if not evaluations: - print(f"No evaluations found in category: {category}") - return - - print(f"Found {len(evaluations)} evaluations to run\n") - - # Run each evaluation - for i, evaluation in enumerate(evaluations, 1): - print(f"[{i}/{len(evaluations)}] Running: {evaluation.name}") - print(f" ID: {evaluation.id}") - - try: - result = self._run_single_evaluation(evaluation) - self.results.append(result) - - # Print result - status = "PASS" if result['passed'] else "FAIL" - print(f" Status: {status}") - print(f" Score: {result['score']:.2f}") - print(f" Time: {result['execution_time_ms']}ms") - print() - - # Add delay between requests - if i < len(evaluations): - delay = self.config.get_execution_config().get('request_delay', 1) - if delay > 0: - time.sleep(delay) - - except KeyboardInterrupt: - print("\n\nInterrupted by user. Saving partial results...") - break - except Exception as e: - print(f" ERROR: {str(e)}\n") - # Record failure - self.results.append({ - 'eval_id': evaluation.id, - 'eval_name': evaluation.name, - 'category': category, - 'passed': False, - 'score': 0.0, - 'reasoning': f"Execution error: {str(e)}", - 'execution_time_ms': 0, - 'error': str(e) - }) - - # Print summary - self._print_summary() - - # Save report - self._save_report(category) - - def _run_single_evaluation(self, evaluation: Evaluation) -> dict: - """ - Run a single evaluation. 
- - Args: - evaluation: Evaluation to run - - Returns: - Result dictionary - """ - # Get input message - input_message = evaluation.get_input_message() - - # Get target URL and wait timeout - target_url = evaluation.get_target_url() - wait_timeout = evaluation.get_wait_timeout() - - # Send API request - api_response = self.api_client.send_request( - input_message=input_message, - model_config=self.model_config, - url=target_url, - wait_timeout=wait_timeout - ) - - if not api_response['success']: - return { - 'eval_id': evaluation.id, - 'eval_name': evaluation.name, - 'category': evaluation.category, - 'passed': False, - 'score': 0.0, - 'reasoning': f"API request failed: {api_response['error']}", - 'execution_time_ms': api_response['execution_time_ms'], - 'error': api_response['error'] - } - - # Judge the response - criteria = evaluation.get_validation_criteria() - judge_result = self.judge.judge( - input_prompt=input_message, - response=api_response['response'], - criteria=criteria - ) - - return { - 'eval_id': evaluation.id, - 'eval_name': evaluation.name, - 'category': evaluation.category, - 'passed': judge_result.passed, - 'score': judge_result.score, - 'reasoning': judge_result.reasoning, - 'execution_time_ms': api_response['execution_time_ms'], - 'error': None - } - - def _print_summary(self): - """Print summary statistics.""" - if not self.results: - return - - total = len(self.results) - passed = sum(1 for r in self.results if r['passed']) - failed = total - passed - pass_rate = (passed / total) * 100 if total > 0 else 0 - avg_score = sum(r['score'] for r in self.results) / total if total > 0 else 0 - avg_time = sum(r['execution_time_ms'] for r in self.results) / total if total > 0 else 0 - - print(f"\n{'='*70}") - print("Summary") - print(f"{'='*70}") - print(f"Total: {total}") - print(f"Passed: {passed} ({pass_rate:.1f}%)") - print(f"Failed: {failed}") - print(f"Average Score: {avg_score:.2f}") - print(f"Average Time: {avg_time:.0f}ms") - print(f"{'='*70}\n") - - def _save_report(self, category: str): - """ - Save evaluation results to CSV report. 
- - Args: - category: Category name for report filename - """ - if not self.results: - return - - # Create reports directory - reports_dir = self.config.get_reports_dir() - reports_dir.mkdir(parents=True, exist_ok=True) - - # Generate filename with timestamp - timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - filename = f"{category}_{timestamp}.csv" - filepath = reports_dir / filename - - # Write CSV - with open(filepath, 'w', newline='', encoding='utf-8') as f: - fieldnames = [ - 'timestamp', - 'eval_id', - 'eval_name', - 'category', - 'status', - 'score', - 'judge_reasoning', - 'execution_time_ms', - 'error' - ] - writer = csv.DictWriter(f, fieldnames=fieldnames) - - writer.writeheader() - for result in self.results: - writer.writerow({ - 'timestamp': datetime.now().isoformat(), - 'eval_id': result['eval_id'], - 'eval_name': result['eval_name'], - 'category': result['category'], - 'status': 'PASS' if result['passed'] else 'FAIL', - 'score': f"{result['score']:.2f}", - 'judge_reasoning': result['reasoning'], - 'execution_time_ms': result['execution_time_ms'], - 'error': result.get('error', '') - }) - - print(f"Report saved to: {filepath}") - - -def main(): - """Main entry point.""" - parser = argparse.ArgumentParser( - description="Run action-agent evaluations" - ) - parser.add_argument( - '--limit', - type=int, - default=None, - help='Maximum number of evaluations to run (default: all)' - ) - parser.add_argument( - '--eval-ids', - nargs='+', - help='Specific evaluation IDs to run' - ) - parser.add_argument( - '--config', - type=str, - default=None, - help='Path to config.yml (default: evals/config.yml)' - ) - - args = parser.parse_args() - - try: - # Load configuration - config = ConfigLoader(config_path=args.config) - - # Use limit from config if not specified - limit = args.limit if args.limit is not None else config.get_default_limit() - - # Create and run evaluation runner - runner = EvaluationRunner(config) - runner.run_evaluations( - category='action-agent', - limit=limit, - eval_ids=args.eval_ids - ) - - except KeyboardInterrupt: - print("\nInterrupted by user") - sys.exit(1) - except Exception as e: - print(f"ERROR: {e}") - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/evals/run_test_simple.py b/evals/run_test_simple.py deleted file mode 100755 index 72dd37b..0000000 --- a/evals/run_test_simple.py +++ /dev/null @@ -1,333 +0,0 @@ -#!/usr/bin/env python3 -""" -Test Simple Evaluation Runner - -Runs evaluations for test-simple category and generates reports. -""" - -import argparse -import csv -import sys -import time -from datetime import datetime -from pathlib import Path -from typing import List - -# Add lib directory to path -sys.path.insert(0, str(Path(__file__).parent)) - -from lib import ( - ConfigLoader, - EvalLoader, - APIClient, - LLMJudge, - Evaluation, - JudgeResult -) - - -class EvaluationRunner: - """Manages evaluation execution and reporting.""" - - def __init__(self, config: ConfigLoader): - """ - Initialize evaluation runner. 
- - Args: - config: Configuration loader - """ - self.config = config - - # Initialize components - self.eval_loader = EvalLoader() - self.api_client = APIClient( - base_url=config.get_api_endpoint(), - timeout=config.get_timeout() - ) - - # Initialize judge - judge_config = config.get_judge_config() - self.judge = LLMJudge( - provider=judge_config['provider'], - model_name=judge_config['model_name'], - api_key=judge_config['api_key'], - temperature=judge_config.get('temperature') # None by default for GPT-5 compatibility - ) - - # Get nested model config for API requests - self.model_config = config.get_nested_model_config() - - # Results tracking - self.results = [] - - def run_evaluations( - self, - category: str, - limit: int = None, - eval_ids: List[str] = None - ): - """ - Run evaluations for a specific category. - - Args: - category: Category name (e.g., 'test-simple') - limit: Maximum number of evaluations to run - eval_ids: Optional list of specific evaluation IDs to run - """ - print(f"\n{'='*70}") - print(f"Running {category} Evaluations") - print(f"{'='*70}\n") - - # Check API server health - print("Checking API server connection...") - if not self.api_client.check_health(): - print("ERROR: Cannot connect to API server at", self.config.get_api_endpoint()) - print("Please ensure the evaluation server is running.") - sys.exit(1) - print("โœ“ API server is reachable\n") - - # Load evaluations - print(f"Loading evaluations from {category}...") - evaluations = self.eval_loader.load_from_directory( - category=category, - enabled_only=True - ) - - # Filter by eval_ids if specified - if eval_ids: - evaluations = [e for e in evaluations if e.id in eval_ids] - - # Apply limit - if limit: - evaluations = evaluations[:limit] - - if not evaluations: - print(f"No evaluations found in category: {category}") - return - - print(f"Found {len(evaluations)} evaluations to run\n") - - # Run each evaluation - for i, evaluation in enumerate(evaluations, 1): - print(f"[{i}/{len(evaluations)}] Running: {evaluation.name}") - print(f" ID: {evaluation.id}") - - try: - result = self._run_single_evaluation(evaluation) - self.results.append(result) - - # Print result - status = "PASS" if result['passed'] else "FAIL" - print(f" Status: {status}") - print(f" Score: {result['score']:.2f}") - print(f" Time: {result['execution_time_ms']}ms") - print() - - # Add delay between requests - if i < len(evaluations): - delay = self.config.get_execution_config().get('request_delay', 1) - if delay > 0: - time.sleep(delay) - - except KeyboardInterrupt: - print("\n\nInterrupted by user. Saving partial results...") - break - except Exception as e: - print(f" ERROR: {str(e)}\n") - # Record failure - self.results.append({ - 'eval_id': evaluation.id, - 'eval_name': evaluation.name, - 'category': category, - 'passed': False, - 'score': 0.0, - 'reasoning': f"Execution error: {str(e)}", - 'execution_time_ms': 0, - 'error': str(e) - }) - - # Print summary - self._print_summary() - - # Save report - self._save_report(category) - - def _run_single_evaluation(self, evaluation: Evaluation) -> dict: - """ - Run a single evaluation. 
- - Args: - evaluation: Evaluation to run - - Returns: - Result dictionary - """ - # Get input message - input_message = evaluation.get_input_message() - - # Get target URL and wait timeout - target_url = evaluation.get_target_url() - wait_timeout = evaluation.get_wait_timeout() - - # Send API request - api_response = self.api_client.send_request( - input_message=input_message, - model_config=self.model_config, - url=target_url, - wait_timeout=wait_timeout - ) - - if not api_response['success']: - return { - 'eval_id': evaluation.id, - 'eval_name': evaluation.name, - 'category': evaluation.category, - 'passed': False, - 'score': 0.0, - 'reasoning': f"API request failed: {api_response['error']}", - 'execution_time_ms': api_response['execution_time_ms'], - 'error': api_response['error'] - } - - # Judge the response - criteria = evaluation.get_validation_criteria() - judge_result = self.judge.judge( - input_prompt=input_message, - response=api_response['response'], - criteria=criteria - ) - - return { - 'eval_id': evaluation.id, - 'eval_name': evaluation.name, - 'category': evaluation.category, - 'passed': judge_result.passed, - 'score': judge_result.score, - 'reasoning': judge_result.reasoning, - 'execution_time_ms': api_response['execution_time_ms'], - 'error': None - } - - def _print_summary(self): - """Print summary statistics.""" - if not self.results: - return - - total = len(self.results) - passed = sum(1 for r in self.results if r['passed']) - failed = total - passed - pass_rate = (passed / total) * 100 if total > 0 else 0 - avg_score = sum(r['score'] for r in self.results) / total if total > 0 else 0 - avg_time = sum(r['execution_time_ms'] for r in self.results) / total if total > 0 else 0 - - print(f"\n{'='*70}") - print("Summary") - print(f"{'='*70}") - print(f"Total: {total}") - print(f"Passed: {passed} ({pass_rate:.1f}%)") - print(f"Failed: {failed}") - print(f"Average Score: {avg_score:.2f}") - print(f"Average Time: {avg_time:.0f}ms") - print(f"{'='*70}\n") - - def _save_report(self, category: str): - """ - Save evaluation results to CSV report. 
- - Args: - category: Category name for report filename - """ - if not self.results: - return - - # Create reports directory - reports_dir = self.config.get_reports_dir() - reports_dir.mkdir(parents=True, exist_ok=True) - - # Generate filename with timestamp - timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - filename = f"{category}_{timestamp}.csv" - filepath = reports_dir / filename - - # Write CSV - with open(filepath, 'w', newline='', encoding='utf-8') as f: - fieldnames = [ - 'timestamp', - 'eval_id', - 'eval_name', - 'category', - 'status', - 'score', - 'judge_reasoning', - 'execution_time_ms', - 'error' - ] - writer = csv.DictWriter(f, fieldnames=fieldnames) - - writer.writeheader() - for result in self.results: - writer.writerow({ - 'timestamp': datetime.now().isoformat(), - 'eval_id': result['eval_id'], - 'eval_name': result['eval_name'], - 'category': result['category'], - 'status': 'PASS' if result['passed'] else 'FAIL', - 'score': f"{result['score']:.2f}", - 'judge_reasoning': result['reasoning'], - 'execution_time_ms': result['execution_time_ms'], - 'error': result.get('error', '') - }) - - print(f"Report saved to: {filepath}") - - -def main(): - """Main entry point.""" - parser = argparse.ArgumentParser( - description="Run test-simple evaluations" - ) - parser.add_argument( - '--limit', - type=int, - default=None, - help='Maximum number of evaluations to run (default: all)' - ) - parser.add_argument( - '--eval-ids', - nargs='+', - help='Specific evaluation IDs to run' - ) - parser.add_argument( - '--config', - type=str, - default=None, - help='Path to config.yml (default: evals/config.yml)' - ) - - args = parser.parse_args() - - try: - # Load configuration - config = ConfigLoader(config_path=args.config) - - # Use limit from config if not specified - limit = args.limit if args.limit is not None else config.get_default_limit() - - # Create and run evaluation runner - runner = EvaluationRunner(config) - runner.run_evaluations( - category='test-simple', - limit=limit, - eval_ids=args.eval_ids - ) - - except KeyboardInterrupt: - print("\nInterrupted by user") - sys.exit(1) - except Exception as e: - print(f"ERROR: {e}") - sys.exit(1) - - -if __name__ == '__main__': - main() From 0a23ba6d321dd161579c43a9e15590ee5d620a89 Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Mon, 20 Oct 2025 20:39:30 -0500 Subject: [PATCH 11/24] Cleanup --- eval-server/README.md | 542 ++++--- eval-server/nodejs/CLAUDE.md | 630 +++----- .../1233ae25-9f9e-4f77-924d-865f7d615cef.yaml | 12 - eval-server/nodejs/docs/CLIENT_SETUP.md | 445 ------ eval-server/nodejs/docs/PROTOCOL.md | 310 ---- .../nodejs/docs/TRIGGERING_EVALUATIONS.md | 306 ---- eval-server/nodejs/docs/YAML_SCHEMA.md | 315 ---- .../nodejs/evals/action-agent/a11y-001.yaml | 46 - .../evals/action-agent/accordion-001.yaml | 46 - .../action-agent/action-agent-a11y-001.yaml | 46 - .../action-agent-accordion-001.yaml | 46 - .../action-agent-autocomplete-001.yaml | 46 - .../action-agent-checkbox-001.yaml | 46 - .../action-agent-checkbox-002.yaml | 47 - .../action-agent/action-agent-click-001.yaml | 47 - .../action-agent-context-001.yaml | 46 - .../action-agent-datepicker-001.yaml | 46 - .../action-agent-daterange-001.yaml | 46 - .../action-agent-dropdown-001.yaml | 46 - .../action-agent-dynamic-001.yaml | 46 - .../action-agent-ecommerce-001.yaml | 46 - .../action-agent/action-agent-error-001.yaml | 47 - .../action-agent/action-agent-filter-001.yaml | 46 - .../action-agent/action-agent-form-001.yaml | 46 - 
.../action-agent/action-agent-hover-001.yaml | 46 - .../action-agent-keyboard-001.yaml | 46 - .../action-agent/action-agent-login-001.yaml | 47 - .../action-agent/action-agent-modal-001.yaml | 46 - .../action-agent-multiselect-001.yaml | 46 - .../action-agent-multistep-001.yaml | 47 - .../action-agent/action-agent-nav-001.yaml | 46 - .../action-agent/action-agent-radio-001.yaml | 47 - .../action-agent/action-agent-slider-001.yaml | 46 - .../action-agent-tableselect-001.yaml | 46 - .../action-agent-tablesort-001.yaml | 46 - .../action-agent/action-agent-tabs-001.yaml | 46 - .../action-agent-timepicker-001.yaml | 46 - .../action-agent/action-agent-upload-001.yaml | 46 - .../action-agent/action-agent-video-001.yaml | 47 - .../action-agent/action-agent-video-002.yaml | 47 - .../evals/action-agent/autocomplete-001.yaml | 46 - .../evals/action-agent/checkbox-001.yaml | 46 - .../evals/action-agent/checkbox-002.yaml | 47 - .../nodejs/evals/action-agent/click-001.yaml | 47 - .../evals/action-agent/context-001.yaml | 46 - .../evals/action-agent/datepicker-001.yaml | 46 - .../evals/action-agent/daterange-001.yaml | 46 - .../evals/action-agent/dropdown-001.yaml | 46 - .../evals/action-agent/dynamic-001.yaml | 46 - .../evals/action-agent/ecommerce-001.yaml | 46 - .../nodejs/evals/action-agent/error-001.yaml | 47 - .../nodejs/evals/action-agent/filter-001.yaml | 46 - .../nodejs/evals/action-agent/form-001.yaml | 46 - .../nodejs/evals/action-agent/hover-001.yaml | 46 - .../evals/action-agent/keyboard-001.yaml | 46 - .../nodejs/evals/action-agent/login-001.yaml | 47 - .../nodejs/evals/action-agent/modal-001.yaml | 46 - .../evals/action-agent/multiselect-001.yaml | 46 - .../evals/action-agent/multistep-001.yaml | 47 - .../nodejs/evals/action-agent/nav-001.yaml | 46 - .../nodejs/evals/action-agent/radio-001.yaml | 47 - .../nodejs/evals/action-agent/slider-001.yaml | 46 - .../evals/action-agent/tableselect-001.yaml | 46 - .../evals/action-agent/tablesort-001.yaml | 46 - .../nodejs/evals/action-agent/tabs-001.yaml | 46 - .../evals/action-agent/timepicker-001.yaml | 46 - .../nodejs/evals/action-agent/upload-001.yaml | 46 - .../nodejs/evals/action-agent/video-001.yaml | 47 - .../nodejs/evals/action-agent/video-002.yaml | 47 - eval-server/nodejs/evals/config.yaml | 11 - .../end-to-end/b-vitamins-research-001.yaml | 35 - .../end-to-end/investment-research-001.yaml | 35 - .../end-to-end/product-comparison-001.yaml | 40 - .../end-to-end/recipe-nutrition-001.yaml | 40 - .../evals/end-to-end/travel-planning-001.yaml | 40 - .../evals/research-agent/basic-001.yaml | 39 - .../evals/research-agent/business-001.yaml | 39 - .../evals/research-agent/comparison-001.yaml | 39 - .../evals/research-agent/current-001.yaml | 40 - .../nodejs/evals/research-agent/edge-001.yaml | 39 - .../research-agent-basic-001.yaml | 39 - .../research-agent-business-001.yaml | 39 - .../research-agent-comparison-001.yaml | 39 - .../research-agent-current-001.yaml | 40 - .../research-agent-edge-001.yaml | 39 - .../research-agent-technical-001.yaml | 39 - .../research-agent-tools-001.yaml | 40 - .../evals/research-agent/technical-001.yaml | 39 - .../evals/research-agent/tools-001.yaml | 40 - .../schema-extractor/amazon-product-001.yaml | 78 - .../evals/schema-extractor/bbc-news-001.yaml | 69 - .../schema-extractor/bing-search-001.yaml | 70 - .../github-repo-001-streamlined.yaml | 66 - .../schema-extractor/github-repo-001.yaml | 66 - .../schema-extractor/google-flights-001.yaml | 106 -- .../schema-extractor/google-search-001.yaml | 76 - 
.../evals/schema-extractor/homedepot-001.yaml | 92 -- .../evals/schema-extractor/macys-001.yaml | 106 -- .../wikipedia-search-001.yaml | 77 - .../dynamic-content-verification-001.yaml | 45 - .../screenshot-error-handling-001.yaml | 42 - .../screenshot-fullpage-001.yaml | 43 - .../screenshot-viewport-001.yaml | 42 - .../visual-comparison-001.yaml | 45 - .../amazon-product-001.yaml | 78 - .../bbc-news-001.yaml | 69 - .../bing-search-001.yaml | 70 - .../github-repo-001.yaml | 66 - .../google-flights-001.yaml | 106 -- .../google-search-001.yaml | 76 - .../homedepot-001.yaml | 92 -- .../macys-001.yaml | 106 -- .../wikipedia-001.yaml | 76 - .../wikipedia-search-001.yaml | 77 - .../evals/web-task-agent/booking-001.yaml | 45 - .../evals/web-task-agent/ecommerce-001.yaml | 53 - .../evals/web-task-agent/error-001.yaml | 45 - .../evals/web-task-agent/extract-001.yaml | 60 - .../evals/web-task-agent/finance-001.yaml | 68 - .../evals/web-task-agent/flight-001.yaml | 45 - .../nodejs/evals/web-task-agent/food-001.yaml | 68 - .../evals/web-task-agent/iframe-001.yaml | 83 -- .../nodejs/evals/web-task-agent/jobs-001.yaml | 68 - .../evals/web-task-agent/learning-001.yaml | 69 - .../nodejs/evals/web-task-agent/nav-001.yaml | 46 - .../nodejs/evals/web-task-agent/news-001.yaml | 64 - .../evals/web-task-agent/realestate-001.yaml | 70 - .../evals/web-task-agent/scroll-001.yaml | 61 - .../evals/web-task-agent/scroll-002.yaml | 65 - .../evals/web-task-agent/scroll-003.yaml | 61 - .../evals/web-task-agent/scroll-004.yaml | 61 - .../evals/web-task-agent/scroll-005.yaml | 73 - .../evals/web-task-agent/search-001.yaml | 41 - .../evals/web-task-agent/social-001.yaml | 60 - .../web-task-agent-booking-001.yaml | 45 - .../web-task-agent-ecommerce-001.yaml | 53 - .../web-task-agent-error-001.yaml | 45 - .../web-task-agent-extract-001.yaml | 60 - .../web-task-agent-finance-001.yaml | 68 - .../web-task-agent-flight-001.yaml | 45 - .../web-task-agent-food-001.yaml | 68 - .../web-task-agent-iframe-001.yaml | 83 -- .../web-task-agent-jobs-001.yaml | 68 - .../web-task-agent-learning-001.yaml | 69 - .../web-task-agent-nav-001.yaml | 46 - .../web-task-agent-news-001.yaml | 64 - .../web-task-agent-realestate-001.yaml | 70 - .../web-task-agent-scroll-001.yaml | 61 - .../web-task-agent-scroll-002.yaml | 65 - .../web-task-agent-scroll-003.yaml | 61 - .../web-task-agent-scroll-004.yaml | 61 - .../web-task-agent-scroll-005.yaml | 73 - .../web-task-agent-search-001.yaml | 41 - .../web-task-agent-social-001.yaml | 60 - .../1233ae25-9f9e-4f77-924d-865f7d615cef.yaml | 12 - eval-server/nodejs/examples/library-usage.js | 250 ---- eval-server/nodejs/examples/logs/.gitignore | 3 - eval-server/nodejs/examples/multiple-evals.js | 167 --- .../nodejs/examples/with-http-wrapper.js | 45 - eval-server/nodejs/logs/.gitignore | 2 - eval-server/nodejs/package.json | 25 +- eval-server/nodejs/schemas/client.schema.json | 299 ---- eval-server/nodejs/src/cli/CLI.js | 518 ------- eval-server/nodejs/src/cli/index.js | 23 - eval-server/nodejs/src/evaluator.js | 117 -- .../nodejs/src/lib/EvaluationLoader.js | 448 ------ eval-server/nodejs/src/lib/EvaluationStack.js | 85 -- .../nodejs/templates/default-client.yaml | 56 - eval-server/python/README.md | 368 ----- eval-server/python/UV_COMMANDS.md | 188 --- eval-server/python/evals/README.md | 195 --- .../python/evals/browsecomp_dataset.py | 252 ---- .../python/evals/browsecomp_eval_server.py | 836 ----------- eval-server/python/evals/browsecomp_scorer.py | 328 ----- .../evals/run_browsecomp_eval_server.sh | 12 - 
eval-server/python/examples/__init__.py | 10 - eval-server/python/examples/basic_server.py | 100 -- eval-server/python/examples/logs/.gitignore | 2 - .../python/examples/programmatic_evals.py | 428 ------ eval-server/python/examples/with_stack.py | 201 --- eval-server/python/logs/.gitignore | 2 - eval-server/python/pyproject.toml | 84 -- eval-server/python/quick_test.py | 38 - eval-server/python/requirements.txt | 10 - eval-server/python/run.py | 100 -- eval-server/python/scripts.py | 68 - .../python/src/bo_eval_server/__init__.py | 29 - .../src/bo_eval_server/client_manager.py | 401 ----- .../python/src/bo_eval_server/config.py | 75 - .../python/src/bo_eval_server/eval_server.py | 292 ---- .../src/bo_eval_server/evaluation_stack.py | 102 -- .../python/src/bo_eval_server/logger.py | 180 --- .../python/src/bo_eval_server/rpc_client.py | 229 --- eval-server/python/test_client.py | 190 --- eval-server/python/uv.lock | 1306 ----------------- evals/README.md | 935 +++++++++--- 196 files changed, 1320 insertions(+), 18121 deletions(-) delete mode 100644 eval-server/nodejs/clients/1233ae25-9f9e-4f77-924d-865f7d615cef.yaml delete mode 100644 eval-server/nodejs/docs/CLIENT_SETUP.md delete mode 100644 eval-server/nodejs/docs/PROTOCOL.md delete mode 100644 eval-server/nodejs/docs/TRIGGERING_EVALUATIONS.md delete mode 100644 eval-server/nodejs/docs/YAML_SCHEMA.md delete mode 100644 eval-server/nodejs/evals/action-agent/a11y-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/accordion-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-a11y-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-accordion-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-autocomplete-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-checkbox-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-checkbox-002.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-click-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-context-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-datepicker-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-daterange-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-dropdown-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-dynamic-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-ecommerce-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-error-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-filter-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-form-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-hover-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-keyboard-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-login-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-modal-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-multiselect-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-multistep-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-nav-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-radio-001.yaml delete mode 100644 
eval-server/nodejs/evals/action-agent/action-agent-slider-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-tableselect-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-tablesort-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-tabs-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-timepicker-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-upload-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-video-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-video-002.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/autocomplete-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/checkbox-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/checkbox-002.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/click-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/context-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/datepicker-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/daterange-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/dropdown-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/dynamic-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/ecommerce-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/error-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/filter-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/form-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/hover-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/keyboard-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/login-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/modal-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/multiselect-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/multistep-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/nav-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/radio-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/slider-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/tableselect-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/tablesort-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/tabs-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/timepicker-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/upload-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/video-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/video-002.yaml delete mode 100644 eval-server/nodejs/evals/config.yaml delete mode 100644 eval-server/nodejs/evals/end-to-end/b-vitamins-research-001.yaml delete mode 100644 eval-server/nodejs/evals/end-to-end/investment-research-001.yaml delete mode 100644 eval-server/nodejs/evals/end-to-end/product-comparison-001.yaml delete mode 100644 eval-server/nodejs/evals/end-to-end/recipe-nutrition-001.yaml delete mode 100644 eval-server/nodejs/evals/end-to-end/travel-planning-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/basic-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/business-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/comparison-001.yaml delete mode 100644 
eval-server/nodejs/evals/research-agent/current-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/edge-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/research-agent-basic-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/research-agent-business-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/research-agent-comparison-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/research-agent-current-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/research-agent-edge-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/research-agent-technical-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/research-agent-tools-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/technical-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/tools-001.yaml delete mode 100644 eval-server/nodejs/evals/schema-extractor/amazon-product-001.yaml delete mode 100644 eval-server/nodejs/evals/schema-extractor/bbc-news-001.yaml delete mode 100644 eval-server/nodejs/evals/schema-extractor/bing-search-001.yaml delete mode 100644 eval-server/nodejs/evals/schema-extractor/github-repo-001-streamlined.yaml delete mode 100644 eval-server/nodejs/evals/schema-extractor/github-repo-001.yaml delete mode 100644 eval-server/nodejs/evals/schema-extractor/google-flights-001.yaml delete mode 100644 eval-server/nodejs/evals/schema-extractor/google-search-001.yaml delete mode 100644 eval-server/nodejs/evals/schema-extractor/homedepot-001.yaml delete mode 100644 eval-server/nodejs/evals/schema-extractor/macys-001.yaml delete mode 100644 eval-server/nodejs/evals/schema-extractor/wikipedia-search-001.yaml delete mode 100644 eval-server/nodejs/evals/screenshot-verification/dynamic-content-verification-001.yaml delete mode 100644 eval-server/nodejs/evals/screenshot-verification/screenshot-error-handling-001.yaml delete mode 100644 eval-server/nodejs/evals/screenshot-verification/screenshot-fullpage-001.yaml delete mode 100644 eval-server/nodejs/evals/screenshot-verification/screenshot-viewport-001.yaml delete mode 100644 eval-server/nodejs/evals/screenshot-verification/visual-comparison-001.yaml delete mode 100644 eval-server/nodejs/evals/streamlined-schema-extractor/amazon-product-001.yaml delete mode 100644 eval-server/nodejs/evals/streamlined-schema-extractor/bbc-news-001.yaml delete mode 100644 eval-server/nodejs/evals/streamlined-schema-extractor/bing-search-001.yaml delete mode 100644 eval-server/nodejs/evals/streamlined-schema-extractor/github-repo-001.yaml delete mode 100644 eval-server/nodejs/evals/streamlined-schema-extractor/google-flights-001.yaml delete mode 100644 eval-server/nodejs/evals/streamlined-schema-extractor/google-search-001.yaml delete mode 100644 eval-server/nodejs/evals/streamlined-schema-extractor/homedepot-001.yaml delete mode 100644 eval-server/nodejs/evals/streamlined-schema-extractor/macys-001.yaml delete mode 100644 eval-server/nodejs/evals/streamlined-schema-extractor/wikipedia-001.yaml delete mode 100644 eval-server/nodejs/evals/streamlined-schema-extractor/wikipedia-search-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/booking-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/ecommerce-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/error-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/extract-001.yaml delete mode 100644 
eval-server/nodejs/evals/web-task-agent/finance-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/flight-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/food-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/iframe-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/jobs-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/learning-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/nav-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/news-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/realestate-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/scroll-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/scroll-002.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/scroll-003.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/scroll-004.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/scroll-005.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/search-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/social-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-booking-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-ecommerce-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-error-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-extract-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-finance-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-flight-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-food-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-iframe-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-jobs-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-learning-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-nav-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-news-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-realestate-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-002.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-003.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-004.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-005.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-search-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-social-001.yaml delete mode 100644 eval-server/nodejs/examples/clients/1233ae25-9f9e-4f77-924d-865f7d615cef.yaml delete mode 100644 eval-server/nodejs/examples/library-usage.js delete mode 100644 eval-server/nodejs/examples/logs/.gitignore delete mode 100755 eval-server/nodejs/examples/multiple-evals.js delete mode 100644 eval-server/nodejs/examples/with-http-wrapper.js delete mode 100644 eval-server/nodejs/logs/.gitignore delete mode 100644 eval-server/nodejs/schemas/client.schema.json delete mode 100644 eval-server/nodejs/src/cli/CLI.js delete mode 100644 
eval-server/nodejs/src/cli/index.js delete mode 100644 eval-server/nodejs/src/evaluator.js delete mode 100644 eval-server/nodejs/src/lib/EvaluationLoader.js delete mode 100644 eval-server/nodejs/src/lib/EvaluationStack.js delete mode 100644 eval-server/nodejs/templates/default-client.yaml delete mode 100644 eval-server/python/README.md delete mode 100644 eval-server/python/UV_COMMANDS.md delete mode 100644 eval-server/python/evals/README.md delete mode 100644 eval-server/python/evals/browsecomp_dataset.py delete mode 100755 eval-server/python/evals/browsecomp_eval_server.py delete mode 100644 eval-server/python/evals/browsecomp_scorer.py delete mode 100755 eval-server/python/evals/run_browsecomp_eval_server.sh delete mode 100644 eval-server/python/examples/__init__.py delete mode 100644 eval-server/python/examples/basic_server.py delete mode 100644 eval-server/python/examples/logs/.gitignore delete mode 100644 eval-server/python/examples/programmatic_evals.py delete mode 100644 eval-server/python/examples/with_stack.py delete mode 100644 eval-server/python/logs/.gitignore delete mode 100644 eval-server/python/pyproject.toml delete mode 100644 eval-server/python/quick_test.py delete mode 100644 eval-server/python/requirements.txt delete mode 100644 eval-server/python/run.py delete mode 100644 eval-server/python/scripts.py delete mode 100644 eval-server/python/src/bo_eval_server/__init__.py delete mode 100644 eval-server/python/src/bo_eval_server/client_manager.py delete mode 100644 eval-server/python/src/bo_eval_server/config.py delete mode 100644 eval-server/python/src/bo_eval_server/eval_server.py delete mode 100644 eval-server/python/src/bo_eval_server/evaluation_stack.py delete mode 100644 eval-server/python/src/bo_eval_server/logger.py delete mode 100644 eval-server/python/src/bo_eval_server/rpc_client.py delete mode 100644 eval-server/python/test_client.py delete mode 100644 eval-server/python/uv.lock diff --git a/eval-server/README.md b/eval-server/README.md index 88c852f..01af8cd 100644 --- a/eval-server/README.md +++ b/eval-server/README.md @@ -1,239 +1,415 @@ # Eval-Server -A WebSocket-based evaluation server for LLM agents with multiple language implementations. +HTTP API wrapper for Browser Operator - provides WebSocket server with Chrome DevTools Protocol (CDP) integration for browser automation. ## Overview -This directory contains two functionally equivalent implementations of the bo-eval-server: +The eval-server exposes browser automation capabilities via HTTP API endpoints. It manages WebSocket connections to browser agents and provides REST APIs for: +- Sending tasks to agents (`/v1/responses`) +- Capturing screenshots via CDP +- Retrieving page content +- Managing browser tabs -- **NodeJS** (`nodejs/`) - Full-featured implementation with YAML evaluations, HTTP API, CLI, and judge system -- **Python** (`python/`) - Minimal library focused on core WebSocket functionality and programmatic evaluation creation +**Note:** Evaluation orchestration and LLM-as-a-judge logic lives in the separate `evals/` Python project, which calls these APIs. 
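
For orientation, here is a minimal, hypothetical sketch of how a caller such as the `evals/` runner might drive these APIs end to end: send a task via `POST /v1/responses`, then use the returned `metadata.clientId` / `metadata.tabId` to capture a screenshot via `POST /page/screenshot`. The helper names, the `requests` dependency, and the base URL are illustrative assumptions and not part of this patch; the endpoint paths and payload shapes follow the examples documented in this README.

```python
# Illustrative sketch only -- not part of the eval-server codebase.
# Assumes the HTTP API is reachable on API_PORT (8081 in the sample .env).
import base64
import requests

BASE_URL = "http://localhost:8081"  # assumption: matches API_PORT

def send_task(task: str, url: str, model: dict) -> tuple[str, dict]:
    """POST /v1/responses; return (response text, metadata dict)."""
    payload = {"input": task, "url": url, "model": model}
    resp = requests.post(f"{BASE_URL}/v1/responses", json=payload, timeout=300)
    resp.raise_for_status()
    message = resp.json()[0]                  # first message object in the array
    text = message["content"][0]["text"]      # the output_text block
    return text, message.get("metadata", {})  # clientId / tabId, if present

def capture_screenshot(client_id: str, tab_id: str, path: str) -> None:
    """POST /page/screenshot; decode the data URL and save the PNG."""
    payload = {"clientId": client_id, "tabId": tab_id, "fullPage": False}
    resp = requests.post(f"{BASE_URL}/page/screenshot", json=payload, timeout=60)
    resp.raise_for_status()
    data_url = resp.json()["imageData"]       # "data:image/png;base64,..."
    _, b64 = data_url.split(",", 1)
    with open(path, "wb") as f:
        f.write(base64.b64decode(b64))

if __name__ == "__main__":
    # Model configuration elided for brevity; see the /v1/responses example below.
    text, meta = send_task("Click the submit button", "https://example.com", model={})
    if meta.get("clientId") and meta.get("tabId"):
        capture_screenshot(meta["clientId"], meta["tabId"], "after.png")
    print(text)
```
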
-Both implementations provide: -- ๐Ÿ”Œ **WebSocket Server** - Real-time agent connections -- ๐Ÿค– **Bidirectional RPC** - JSON-RPC 2.0 for calling agent methods -- ๐Ÿ“š **Programmatic API** - Create and manage evaluations in code -- โšก **Concurrent Support** - Handle multiple agents simultaneously -- ๐Ÿ“Š **Structured Logging** - Comprehensive evaluation tracking +## Architecture -## Quick Start +``` +eval-server/ +โ””โ”€โ”€ nodejs/ + โ”œโ”€โ”€ src/ + โ”‚ โ”œโ”€โ”€ api-server.js # HTTP REST API endpoints + โ”‚ โ”œโ”€โ”€ client-manager.js # WebSocket client management + โ”‚ โ”œโ”€โ”€ rpc-client.js # JSON-RPC 2.0 communication + โ”‚ โ”œโ”€โ”€ config.js # Configuration management + โ”‚ โ”œโ”€โ”€ logger.js # Winston logging + โ”‚ โ””โ”€โ”€ lib/ + โ”‚ โ”œโ”€โ”€ EvalServer.js # Core server + CDP integration + โ”‚ โ””โ”€โ”€ HTTPWrapper.js # HTTP API wrapper + โ””โ”€โ”€ package.json +``` -### NodeJS (Full Featured) +## Quick Start -The NodeJS implementation includes YAML evaluation loading, HTTP API wrapper, CLI tools, and LLM-as-a-judge functionality. +### Installation ```bash -cd nodejs/ +cd eval-server/nodejs npm install -npm start ``` -**Key Features:** -- YAML evaluation file loading -- HTTP API wrapper for REST integration -- Interactive CLI for management -- LLM judge system for response evaluation -- Comprehensive documentation and examples +### Configuration + +Copy `.env.example` to `.env` and configure: -See [`nodejs/README.md`](nodejs/README.md) for detailed usage. +```bash +# WebSocket server port +PORT=8080 + +# HTTP API server port +API_PORT=8081 + +# Authentication +AUTH_KEY=your-secret-key -### Python (Lightweight Library) +# Chrome DevTools Protocol endpoint +CDP_HOST=localhost +CDP_PORT=9223 +``` -The Python implementation focuses on core WebSocket functionality with programmatic evaluation creation. +### Start Server ```bash -cd python/ -pip install -e . -python examples/basic_server.py -``` - -**Key Features:** -- Minimal dependencies (websockets, loguru) -- Full async/await support -- Evaluation stack for LIFO queuing -- Type hints throughout -- Clean Pythonic API - -See [`python/README.md`](python/README.md) for detailed usage. - -## Architecture Comparison - -| Feature | NodeJS | Python | -|---------|--------|--------| -| **Core WebSocket Server** | โœ… | โœ… | -| **JSON-RPC 2.0** | โœ… | โœ… | -| **Client Management** | โœ… | โœ… | -| **Programmatic Evaluations** | โœ… | โœ… | -| **Evaluation Stack** | โœ… | โœ… | -| **Structured Logging** | โœ… (Winston) | โœ… (Loguru) | -| **YAML Evaluations** | โœ… | โŒ | -| **HTTP API Wrapper** | โœ… | โŒ | -| **CLI Interface** | โœ… | โŒ | -| **LLM Judge System** | โœ… | โŒ | -| **Type System** | TypeScript | Type Hints | - -## Choosing an Implementation - -**Choose NodeJS if you need:** -- YAML-based evaluation definitions -- HTTP REST API endpoints -- Interactive CLI for management -- LLM-as-a-judge evaluation -- Comprehensive feature set - -**Choose Python if you need:** -- Minimal dependencies -- Pure programmatic approach -- Integration with Python ML pipelines -- Modern async/await patterns -- Lightweight deployment - -## Agent Protocol - -Both implementations use the same WebSocket protocol: - -### 1. Connect to WebSocket -```javascript -// NodeJS -const ws = new WebSocket('ws://localhost:8080'); - -// Python -import websockets -ws = await websockets.connect('ws://localhost:8080') -``` - -### 2. 
Send Registration +npm start +``` + +The server will start: +- WebSocket server on `ws://localhost:8080` +- HTTP API server on `http://localhost:8081` + +## HTTP API Endpoints + +### Core Endpoint + +#### `POST /v1/responses` + +Send a task to a connected browser agent and get response. + +**Request:** ```json { - "type": "register", - "clientId": "your-client-id", - "secretKey": "your-secret-key", - "capabilities": ["chat", "action"] + "input": "Click the submit button", + "url": "https://example.com", + "wait_timeout": 5000, + "model": { + "main_model": { + "provider": "openai", + "model": "gpt-5-mini", + "api_key": "sk-..." + }, + "mini_model": { + "provider": "openai", + "model": "gpt-5-nano", + "api_key": "sk-..." + }, + "nano_model": { + "provider": "openai", + "model": "gpt-5-nano", + "api_key": "sk-..." + } + } } ``` -### 3. Send Ready Signal +**Response:** +```json +[ + { + "id": "msg_abc123", + "type": "message", + "role": "assistant", + "content": [ + { + "type": "output_text", + "text": "Done - clicked submit button", + "annotations": [] + } + ], + "metadata": { + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "482D56EE57B1931A3B9D1BFDAF935429" + } + } +] +``` + +The `metadata` field contains `clientId` and `tabId` which can be used for screenshot capture and other CDP operations. + +### CDP Endpoints + +#### `POST /page/screenshot` + +Capture screenshot of a specific browser tab. + +**Request:** +```json +{ + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "482D56EE57B1931A3B9D1BFDAF935429", + "fullPage": false +} +``` + +**Response:** ```json { - "type": "ready" + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "482D56EE57B1931A3B9D1BFDAF935429", + "imageData": "data:image/png;base64,iVBORw0KG...", + "format": "png", + "fullPage": false, + "timestamp": 1234567890 } ``` -### 4. Handle RPC Calls -Both implementations send JSON-RPC 2.0 requests with the `evaluate` method: +#### `POST /page/content` + +Get HTML or text content of a page. +**Request:** ```json { - "jsonrpc": "2.0", - "method": "evaluate", - "params": { - "id": "eval_001", - "name": "Test Evaluation", - "tool": "chat", - "input": {"message": "Hello world"} - }, - "id": "unique-call-id" + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "482D56EE57B1931A3B9D1BFDAF935429", + "format": "html" } ``` -Agents should respond with: +**Response:** ```json { - "jsonrpc": "2.0", - "id": "unique-call-id", - "result": { - "status": "completed", - "output": {"response": "Hello! 
How can I help you?"} - } + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "482D56EE57B1931A3B9D1BFDAF935429", + "content": "...", + "format": "html", + "length": 12345, + "timestamp": 1234567890 } ``` -## Examples - -### NodeJS Example -```javascript -import { EvalServer } from 'bo-eval-server'; - -const server = new EvalServer({ - authKey: 'secret', - port: 8080 -}); - -server.onConnect(async client => { - const result = await client.evaluate({ - id: "test", - name: "Hello World", - tool: "chat", - input: {message: "Hi there!"} - }); - console.log(result); -}); - -await server.start(); -``` - -### Python Example -```python -import asyncio -from bo_eval_server import EvalServer - -async def main(): - server = EvalServer( - auth_key='secret', - port=8080 - ) - - @server.on_connect - async def handle_client(client): - result = await client.evaluate({ - "id": "test", - "name": "Hello World", - "tool": "chat", - "input": {"message": "Hi there!"} - }) - print(result) - - await server.start() - await server.wait_closed() - -asyncio.run(main()) +#### `POST /tabs/open` + +Open a new browser tab. + +**Request:** +```json +{ + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "url": "https://example.com", + "background": false +} ``` -## Development +**Response:** +```json +{ + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "NEW_TAB_ID", + "compositeClientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306:NEW_TAB_ID", + "url": "https://example.com", + "status": "opened" +} +``` -Each implementation has its own development setup: +#### `POST /tabs/close` -**NodeJS:** -```bash -cd nodejs/ -npm install -npm run dev # Watch mode -npm test # Run tests -npm run cli # Interactive CLI +Close a browser tab. + +**Request:** +```json +{ + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "TAB_ID_TO_CLOSE" +} +``` + +**Response:** +```json +{ + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "TAB_ID_TO_CLOSE", + "status": "closed", + "success": true +} +``` + +### Status Endpoints + +#### `GET /status` + +Get server health and connected clients. + +**Response:** +```json +{ + "server": { + "running": true, + "uptime": 12345, + "connections": 1 + }, + "clients": [ + { + "id": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "name": "DevTools Client", + "connected": true, + "ready": true + } + ] +} +``` + +#### `GET /clients` + +List all connected clients with their tabs. + +**Response:** +```json +[ + { + "id": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "name": "DevTools Client", + "description": "Browser automation agent", + "tabCount": 3, + "tabs": [ + { + "tabId": "482D56EE57B1931A3B9D1BFDAF935429", + "compositeClientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306:482D56EE57B1931A3B9D1BFDAF935429", + "connected": true, + "ready": true, + "connectedAt": "2025-01-15T10:30:00.000Z", + "remoteAddress": "::ffff:172.18.0.1" + } + ] + } +] ``` -**Python:** +## WebSocket Protocol + +Browser agents connect to the WebSocket server and implement JSON-RPC 2.0 protocol. + +### Connection Flow + +1. **Connect to WebSocket** + ``` + ws://localhost:8080 + ``` + +2. **Send Registration** + ```json + { + "type": "register", + "clientId": "unique-client-id", + "secretKey": "your-auth-key", + "capabilities": ["browser-automation"] + } + ``` + +3. **Send Ready Signal** + ```json + { + "type": "ready" + } + ``` + +4. 
**Handle RPC Calls** + + Server sends JSON-RPC 2.0 requests: + ```json + { + "jsonrpc": "2.0", + "method": "evaluate", + "params": { + "tool": "action_agent", + "input": {"objective": "Click submit button"}, + "model": {...} + }, + "id": "request-id" + } + ``` + + Agent responds: + ```json + { + "jsonrpc": "2.0", + "id": "request-id", + "result": { + "status": "completed", + "output": "Task completed successfully" + } + } + ``` + +## Chrome DevTools Protocol Setup + +The browser must be started with remote debugging enabled: + ```bash -cd python/ -pip install -e ".[dev]" -pytest # Run tests -black . # Format code -mypy src/ # Type checking +chromium --remote-debugging-port=9223 ``` -## Contributing +The CDP endpoint is accessible at: +- HTTP: `http://localhost:9223/json/version` +- WebSocket: `ws://localhost:9223/devtools/browser/{browserId}` -When contributing to either implementation: +## Usage with Evals Framework -1. Maintain API compatibility between versions where possible -2. Update documentation for both implementations when adding shared features -3. Follow the existing code style and patterns -4. Add appropriate tests and examples +The eval-server is designed to work with the separate `evals/` Python project: -## License +1. **Start eval-server:** + ```bash + cd eval-server/nodejs + npm start + ``` + +2. **Run evaluations from evals/:** + ```bash + cd evals + python3 run.py --path action-agent/accordion-001.yaml --verbose + ``` + +The evals framework: +- Sends tasks to `/v1/responses` endpoint +- Extracts `clientId` and `tabId` from response metadata +- Captures screenshots via `/page/screenshot` +- Uses LLM judges (LLMJudge, VisionJudge) to evaluate results +- Generates reports and saves screenshots + +See `evals/README.md` for detailed evaluation framework documentation. + +## Dependencies -MIT License - see individual implementation directories for details. +Core dependencies: +- **ws** - WebSocket server +- **uuid** - ID generation +- **winston** - Structured logging +- **dotenv** - Environment variable management + +## Logging + +Logs are written to `logs/` directory (auto-created): +- `combined.log` - All log events +- `error.log` - Error events only +- `api.log` - API request/response logs + +## Docker Integration + +The eval-server runs inside the `kernel-browser-extended` Docker container. Volume mount for live development: + +```yaml +# docker-compose.yml +volumes: + - "./eval-server/nodejs:/opt/eval-server" +``` ---- +## Development + +```bash +# Install dependencies +npm install + +# Start server +npm start + +# Check status +curl http://localhost:8081/status + +# Test screenshot capture +curl -X POST http://localhost:8081/page/screenshot \ + -H "Content-Type: application/json" \ + -d '{"clientId":"CLIENT_ID","tabId":"TAB_ID","fullPage":false}' +``` + +## License -Both implementations provide robust, production-ready evaluation servers for LLM agents with different feature sets optimized for different use cases. \ No newline at end of file +MIT License diff --git a/eval-server/nodejs/CLAUDE.md b/eval-server/nodejs/CLAUDE.md index eb968f0..cb4ade4 100644 --- a/eval-server/nodejs/CLAUDE.md +++ b/eval-server/nodejs/CLAUDE.md @@ -4,101 +4,17 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Project Overview -bo-eval-server is a thin WebSocket and REST API server for LLM agent evaluation. 
The server provides: -- WebSocket server for agent connections and RPC communication +The eval-server is a **thin HTTP API wrapper for Browser Operator**. It provides: +- WebSocket server for browser agent connections - REST APIs for browser automation via Chrome DevTools Protocol (CDP) +- JSON-RPC 2.0 bidirectional communication - Screenshot capture and page content retrieval -**Evaluation orchestration and LLM-as-a-judge logic lives in the separate `evals/` Python project**, which calls these APIs. - -## Commands - -### Development -- `npm start` - Start the WebSocket server -- `npm run dev` - Start server with file watching for development -- `npm run cli` - Start interactive CLI for server management and testing -- `npm test` - Run example agent client for testing - -### Installation -- `npm install` - Install dependencies -- Copy `.env.example` to `.env` and configure environment variables - -### Required Environment Variables -- `OPENAI_API_KEY` - OpenAI API key for LLM judge functionality -- `PORT` - WebSocket server port (default: 8080) - -### LLM Provider Configuration (Optional) -- `GROQ_API_KEY` - Groq API key for Groq provider support -- `OPENROUTER_API_KEY` - OpenRouter API key for OpenRouter provider support -- `LITELLM_ENDPOINT` - LiteLLM server endpoint URL -- `LITELLM_API_KEY` - LiteLLM API key for LiteLLM provider support -- `DEFAULT_PROVIDER` - Default LLM provider (openai, groq, openrouter, litellm) -- `DEFAULT_MAIN_MODEL` - Default main model name -- `DEFAULT_MINI_MODEL` - Default mini model name -- `DEFAULT_NANO_MODEL` - Default nano model name +**Important:** Evaluation orchestration and LLM-as-a-judge logic lives in the separate `evals/` Python project, which calls these APIs. ## Architecture -### Core Components - -**WebSocket Server** (`src/server.js`) -- Accepts connections from LLM agents -- Manages agent lifecycle (connect, ready, disconnect) -- Orchestrates evaluation sessions -- Handles bidirectional RPC communication - -**RPC Client** (`src/rpc-client.js`) -- Implements JSON-RPC 2.0 protocol for bidirectional communication -- Manages request/response correlation with unique IDs -- Handles timeouts and error conditions -- Calls `Evaluate(request: String) -> String` method on connected agents -- Supports `configure_llm` method for dynamic LLM provider configuration - -**CDP Integration** (`src/lib/EvalServer.js`) -- Direct Chrome DevTools Protocol communication -- Screenshot capture via `Page.captureScreenshot` -- Page content access via `Runtime.evaluate` -- Tab management via `Target.createTarget` / `Target.closeTarget` - -**Logger** (`src/logger.js`) -- Structured logging using Winston -- Separate log files for different event types -- JSON format for easy parsing and analysis -- Logs all RPC calls, evaluations, and connection events - -### Evaluation Flow - -**WebSocket RPC Flow:** -1. Agent connects to WebSocket server -2. Agent sends "ready" signal -3. Server calls agent's `Evaluate` method with a task -4. Agent processes task and returns response -5. Response is returned to caller (evaluation orchestration happens externally in `evals/`) - -**REST API Flow (for screenshot/content capture):** -1. External caller (e.g., Python evals runner) requests screenshot via `POST /page/screenshot` -2. Server uses CDP to capture screenshot -3. Returns base64-encoded image data -4. 
External caller uses screenshots for LLM-as-a-judge visual verification - -### Project Structure - -``` -src/ -โ”œโ”€โ”€ server.js # Main WebSocket server and evaluation orchestration -โ”œโ”€โ”€ rpc-client.js # JSON-RPC client for calling agent methods -โ”œโ”€โ”€ evaluator.js # LLM judge integration (OpenAI) -โ”œโ”€โ”€ logger.js # Structured logging and result storage -โ”œโ”€โ”€ config.js # Configuration management -โ””โ”€โ”€ cli.js # Interactive CLI for testing and management - -logs/ # Log files (created automatically) -โ”œโ”€โ”€ combined.log # All log events -โ”œโ”€โ”€ error.log # Error events only -โ””โ”€โ”€ evaluations.jsonl # Evaluation results in JSON Lines format -``` - -### Architecture: Separation of Concerns +### Separation of Concerns **eval-server (Node.js)**: Thin API layer - WebSocket server for agent connections @@ -112,418 +28,316 @@ logs/ # Log files (created automatically) - Test case definitions (YAML files in `data/`) - Result reporting and analysis -This separation keeps eval-server focused on infrastructure while evals/ handles business logic. +## Core Components -### Key Features - -- **Bidirectional RPC**: Server can call methods on connected clients -- **Multi-Provider LLM Support**: Support for OpenAI, Groq, OpenRouter, and LiteLLM providers (configured by clients) -- **Dynamic LLM Configuration**: Runtime configuration via `configure_llm` JSON-RPC method -- **Per-Client Configuration**: Each connected client can have different LLM settings -- **CDP Browser Automation**: Screenshot capture, page content access, tab management -- **Concurrent Evaluations**: Support for multiple agents and parallel evaluations -- **Structured Logging**: All interactions logged as JSON for analysis -- **Interactive CLI**: Built-in CLI for testing and server management -- **Connection Management**: Robust handling of agent connections and disconnections -- **Timeout Handling**: Configurable timeouts for RPC calls and evaluations +### WebSocket Server (src/lib/EvalServer.js) +- Accepts connections from browser agents (DevTools clients) +- Manages agent lifecycle (connect, ready, disconnect) +- Handles bidirectional RPC communication +- Integrates directly with Chrome DevTools Protocol -### Agent Protocol +### HTTP API Server (src/api-server.js) +- Exposes REST endpoints for external callers (e.g., Python evals) +- Main endpoint: `POST /v1/responses` - Send task to agent +- CDP endpoints: screenshot, page content, tab management +- Returns metadata (clientId, tabId) for subsequent operations -Agents must implement: -- WebSocket connection to server -- JSON-RPC 2.0 protocol support -- `Evaluate(task: string) -> string` method -- "ready" message to signal availability for evaluations +### RPC Client (src/rpc-client.js) +- Implements JSON-RPC 2.0 protocol for bidirectional communication +- Manages request/response correlation with unique IDs +- Handles timeouts and error conditions +- Calls `evaluate(params)` method on connected agents -### Model Configuration Schema +### Client Manager (src/client-manager.js) +- Tracks WebSocket client connections +- Manages tab-level connections (composite clientId:tabId) +- Maintains client state (connected, ready) -The server uses a canonical nested model configuration format that allows per-tier provider and API key settings: +### CDP Integration (src/lib/EvalServer.js) +- Direct Chrome DevTools Protocol communication +- Screenshot capture via `Page.captureScreenshot` +- Page content access via `Runtime.evaluate` +- Tab management via 
`Target.createTarget` / `Target.closeTarget` -#### Model Configuration Structure +### Logger (src/logger.js) +- Structured logging using Winston +- Separate log files for different event types +- JSON format for easy parsing and analysis -```typescript -interface ModelTierConfig { - provider: string; // "openai" | "groq" | "openrouter" | "litellm" - model: string; // Model name (e.g., "gpt-4", "llama-3.1-8b-instant") - api_key: string; // API key for this tier -} +## Key API Endpoints -interface ModelConfig { - main_model: ModelTierConfig; // Primary model for complex tasks - mini_model: ModelTierConfig; // Secondary model for simpler tasks - nano_model: ModelTierConfig; // Tertiary model for basic tasks -} -``` +### POST /v1/responses -#### Example: Evaluation with Model Configuration +Primary endpoint for sending tasks to browser agents. +**Request:** ```json { - "jsonrpc": "2.0", - "method": "evaluate", - "params": { - "tool": "chat", - "input": {"message": "Hello"}, - "model": { - "main_model": { - "provider": "openai", - "model": "gpt-4", - "api_key": "sk-main-key" - }, - "mini_model": { - "provider": "openai", - "model": "gpt-4-mini", - "api_key": "sk-mini-key" - }, - "nano_model": { - "provider": "groq", - "model": "llama-3.1-8b-instant", - "api_key": "gsk-nano-key" - } - } + "input": "Click the submit button", + "url": "https://example.com", + "wait_timeout": 5000, + "model": { + "main_model": {"provider": "openai", "model": "gpt-5-mini", "api_key": "sk-..."}, + "mini_model": {"provider": "openai", "model": "gpt-5-nano", "api_key": "sk-..."}, + "nano_model": {"provider": "openai", "model": "gpt-5-nano", "api_key": "sk-..."} } } ``` -### Dynamic LLM Configuration - -The server supports runtime LLM configuration via the `configure_llm` JSON-RPC method: - -```json -{ - "jsonrpc": "2.0", - "method": "configure_llm", - "params": { - "provider": "openai|groq|openrouter|litellm", - "apiKey": "your-api-key", - "endpoint": "endpoint-url-for-litellm", - "models": { - "main": "main-model-name", - "mini": "mini-model-name", - "nano": "nano-model-name" - }, - "partial": false - }, - "id": "config-request-id" -} -``` - -### Tab Management - -The evaluation server supports managing browser tabs via REST API endpoints and Chrome DevTools Protocol (CDP). - -#### Tab Identification - -Each browser tab is identified by a **composite client ID** in the format: `baseClientId:tabId` - -- `baseClientId`: The persistent identifier for the DevTools client (e.g., `9907fd8d-92a8-4a6a-bce9-458ec8c57306`) -- `tabId`: The Chrome target ID for the specific tab (e.g., `482D56EE57B1931A3B9D1BFDAF935429`) - -#### API Endpoints - -**List All Clients and Tabs** -```bash -GET /clients -``` - -Returns all registered clients with their active tabs, connection status, and readiness state. 
- -Response format: +**Response (OpenAI-compatible format):** ```json [ { - "id": "baseClientId", - "name": "Client Name", - "description": "Client Description", - "tabCount": 3, - "tabs": [ - { - "tabId": "482D56EE57B1931A3B9D1BFDAF935429", - "compositeClientId": "baseClientId:tabId", - "connected": true, - "ready": true, - "connectedAt": "2025-01-15T10:30:00.000Z", - "remoteAddress": "::ffff:172.18.0.1" - } - ] + "id": "msg_abc123", + "type": "message", + "role": "assistant", + "content": [{"type": "output_text", "text": "Done", "annotations": []}], + "metadata": { + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "482D56EE57B1931A3B9D1BFDAF935429" + } } ] ``` -**List Tabs for Specific Client** -```bash -GET /clients/{clientId}/tabs -``` +**Important:** The `metadata` field contains `clientId` and `tabId` which are used by the evals framework for screenshot capture. -Returns all tabs for a specific client identified by `baseClientId`. +### POST /page/screenshot -**Open New Tab** -```bash -POST /tabs/open -Content-Type: application/json +Capture screenshot of a browser tab via CDP. +**Request:** +```json { - "clientId": "baseClientId:tabId", - "url": "https://example.com", - "background": false + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "482D56EE57B1931A3B9D1BFDAF935429", + "fullPage": false } ``` -Opens a new tab in the browser associated with the specified client. - -Response format: +**Response:** ```json { - "clientId": "baseClientId:tabId", - "tabId": "newTabId", - "compositeClientId": "baseClientId:newTabId", - "url": "https://example.com", - "status": "opened" + "clientId": "...", + "tabId": "...", + "imageData": "data:image/png;base64,iVBORw0KG...", + "format": "png", + "timestamp": 1234567890 } ``` -**Close Tab** -```bash -POST /tabs/close -Content-Type: application/json +### POST /page/content -{ - "clientId": "baseClientId:tabId", - "tabId": "targetTabId" -} -``` +Get HTML or text content of a page. -Closes the specified tab. - -Response format: +**Request:** ```json { - "clientId": "baseClientId:tabId", - "tabId": "targetTabId", - "status": "closed", - "success": true + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "482D56EE57B1931A3B9D1BFDAF935429", + "format": "html" } ``` -**Get Page Content** -```bash -POST /page/content -Content-Type: application/json +### POST /tabs/open, POST /tabs/close -{ - "clientId": "baseClientId", - "tabId": "targetTabId", - "format": "html" // or "text" -} -``` +Tab management via CDP. -Retrieves the HTML or text content of a specific tab. +## Configuration -Response format: -```json -{ - "clientId": "baseClientId", - "tabId": "targetTabId", - "content": "...", - "format": "html", - "length": 12345, - "timestamp": 1234567890 -} -``` +All configuration is managed through environment variables and `src/config.js`: -**Capture Screenshot** ```bash -POST /page/screenshot -Content-Type: application/json - -{ - "clientId": "baseClientId", - "tabId": "targetTabId", - "fullPage": false -} +# .env file +PORT=8080 # WebSocket server port +API_PORT=8081 # HTTP API server port +AUTH_KEY=hello # Authentication key +CDP_HOST=localhost # Chrome DevTools Protocol host +CDP_PORT=9223 # Chrome DevTools Protocol port ``` -Captures a screenshot of a specific tab. 
+## Model Configuration Schema -Response format: -```json -{ - "clientId": "baseClientId", - "tabId": "targetTabId", - "imageData": "data:image/png;base64,iVBORw0KG...", - "format": "png", - "fullPage": false, - "timestamp": 1234567890 -} -``` +The server uses a canonical nested model configuration format: -#### Implementation Architecture +```typescript +interface ModelTierConfig { + provider: string; // "openai" | "groq" | "openrouter" | "litellm" + model: string; // Model name (e.g., "gpt-5-mini") + api_key: string; // API key for this tier +} -**Direct CDP Approach (Current)** +interface ModelConfig { + main_model: ModelTierConfig; // Primary model + mini_model: ModelTierConfig; // Secondary model + nano_model: ModelTierConfig; // Tertiary model +} +``` -Tab management and page content access are implemented using direct Chrome DevTools Protocol (CDP) communication: +## Tab Management -1. Server discovers the CDP WebSocket endpoint via `http://localhost:9223/json/version` -2. For each command, a new WebSocket connection is established to the CDP endpoint -3. Commands are sent using JSON-RPC 2.0 format: - - **Browser-level operations** (use `sendCDPCommand`): - - `Target.createTarget` - Opens new tab - - `Target.closeTarget` - Closes existing tab - - **Tab-level operations** (use `sendCDPCommandToTarget`): - - `Runtime.evaluate` - Execute JavaScript to get page content - - `Page.captureScreenshot` - Capture screenshot of tab -4. For tab-level operations, the server first attaches to the target, executes the command, then detaches -5. WebSocket connection is closed after receiving the response +Each browser tab is identified by a **composite client ID**: `baseClientId:tabId` -Key implementation files: -- `src/lib/EvalServer.js` - Contains CDP methods: - - `sendCDPCommand()` - Browser-level CDP commands - - `sendCDPCommandToTarget()` - Tab-level CDP commands (with attach/detach) - - `openTab()`, `closeTab()` - Tab management - - `getPageHTML()`, `getPageText()` - Page content access - - `captureScreenshot()` - Screenshot capture -- `src/api-server.js` - REST API endpoints that delegate to EvalServer methods +Example: +- Base Client ID: `9907fd8d-92a8-4a6a-bce9-458ec8c57306` +- Tab ID: `482D56EE57B1931A3B9D1BFDAF935429` +- Composite: `9907fd8d-92a8-4a6a-bce9-458ec8c57306:482D56EE57B1931A3B9D1BFDAF935429` -**Alternative Approach Considered** +## CDP Communication Pattern -An RPC-based approach was initially considered where: -- API server sends JSON-RPC request to DevTools client via WebSocket -- DevTools client executes CDP commands locally -- Response is sent back via JSON-RPC +The server uses **direct CDP communication**: -This was rejected in favor of direct CDP communication for simplicity and reduced latency. +1. Discover CDP WebSocket endpoint via `http://localhost:9223/json/version` +2. For each command, establish WebSocket connection to CDP endpoint +3. Send commands using JSON-RPC 2.0: + - **Browser-level**: `Target.createTarget`, `Target.closeTarget` + - **Tab-level**: `Runtime.evaluate`, `Page.captureScreenshot` +4. For tab-level operations: attach โ†’ execute โ†’ detach +5. 
Close WebSocket after receiving response -#### Chrome Setup +## Integration with Evals Framework -The browser must be started with remote debugging enabled: -```bash -chromium --remote-debugging-port=9223 -``` +The eval-server is designed to work with the separate `evals/` Python project: -The CDP endpoint is accessible at: -- HTTP: `http://localhost:9223/json/version` -- WebSocket: `ws://localhost:9223/devtools/browser/{browserId}` +**Flow:** +1. Python evals runner sends request to `/v1/responses` +2. Eval-server forwards to DevTools agent via WebSocket/JSON-RPC +3. Agent performs browser automation task +4. Response includes `metadata.clientId` and `metadata.tabId` +5. Python evals uses these IDs to capture screenshot via `/page/screenshot` +6. Python evals uses VisionJudge to evaluate with screenshot +7. Python evals generates report and saves screenshot -#### Usage Examples +See `evals/README.md` for detailed evaluation framework documentation. -**Complete workflow: Open tab, get content, take screenshot, close tab** +## Development Commands ```bash -# 1. Get list of clients -curl -X GET http://localhost:8081/clients +# Install dependencies +npm install -# 2. Open a new tab -curl -X POST http://localhost:8081/tabs/open \ - -H "Content-Type: application/json" \ - -d '{"clientId":"9907fd8d-92a8-4a6a-bce9-458ec8c57306","url":"https://example.com"}' +# Start server +npm start -# Response: {"tabId":"ABC123DEF456",...} +# Check status +curl http://localhost:8081/status -# 3. Get page HTML content -curl -X POST http://localhost:8081/page/content \ +# Test screenshot +curl -X POST http://localhost:8081/page/screenshot \ -H "Content-Type: application/json" \ - -d '{"clientId":"9907fd8d-92a8-4a6a-bce9-458ec8c57306","tabId":"ABC123DEF456","format":"html"}' + -d '{"clientId":"CLIENT_ID","tabId":"TAB_ID","fullPage":false}' +``` -# 4. Get page text content -curl -X POST http://localhost:8081/page/content \ - -H "Content-Type: application/json" \ - -d '{"clientId":"9907fd8d-92a8-4a6a-bce9-458ec8c57306","tabId":"ABC123DEF456","format":"text"}' +## Chrome Setup -# 5. Capture screenshot -curl -X POST http://localhost:8081/page/screenshot \ - -H "Content-Type: application/json" \ - -d '{"clientId":"9907fd8d-92a8-4a6a-bce9-458ec8c57306","tabId":"ABC123DEF456","fullPage":false}' +The browser must be started with remote debugging enabled: -# 6. Close the tab -curl -X POST http://localhost:8081/tabs/close \ - -H "Content-Type: application/json" \ - -d '{"clientId":"9907fd8d-92a8-4a6a-bce9-458ec8c57306","tabId":"ABC123DEF456"}' +```bash +chromium --remote-debugging-port=9223 ``` -**LLM-as-a-Judge evaluation pattern** +CDP endpoint: `http://localhost:9223/json/version` -This workflow replicates the DevTools evaluation pattern using the eval-server: +## File Structure -```bash -# 1. 
Open tab and navigate to test URL -TAB_RESPONSE=$(curl -X POST http://localhost:8081/tabs/open \ - -H "Content-Type: application/json" \ - -d '{"clientId":"CLIENT_ID","url":"https://www.w3.org/WAI/ARIA/apg/patterns/button/examples/button/"}') +``` +nodejs/ +โ”œโ”€โ”€ package.json +โ”œโ”€โ”€ .env.example +โ””โ”€โ”€ src/ + โ”œโ”€โ”€ api-server.js # HTTP REST API endpoints + โ”œโ”€โ”€ client-manager.js # WebSocket client management + โ”œโ”€โ”€ rpc-client.js # JSON-RPC 2.0 communication + โ”œโ”€โ”€ config.js # Configuration management + โ”œโ”€โ”€ logger.js # Winston logging + โ””โ”€โ”€ lib/ + โ”œโ”€โ”€ EvalServer.js # Core server + CDP integration + โ””โ”€โ”€ HTTPWrapper.js # HTTP wrapper around EvalServer +``` -TAB_ID=$(echo $TAB_RESPONSE | jq -r '.tabId') +## Key Implementation Details -# 2. Capture BEFORE screenshot -BEFORE_SCREENSHOT=$(curl -X POST http://localhost:8081/page/screenshot \ - -H "Content-Type: application/json" \ - -d "{\"clientId\":\"CLIENT_ID\",\"tabId\":\"$TAB_ID\",\"fullPage\":false}") +### formatResponse() Method -# 3. Execute agent action (via /v1/responses or custom endpoint) -# ... agent performs action ... +Located in `src/api-server.js:706` -# 4. Capture AFTER screenshot -AFTER_SCREENSHOT=$(curl -X POST http://localhost:8081/page/screenshot \ - -H "Content-Type: application/json" \ - -d "{\"clientId\":\"CLIENT_ID\",\"tabId\":\"$TAB_ID\",\"fullPage\":false}") +Converts agent responses to OpenAI-compatible format and **adds metadata**: -# 5. Get page content for verification -PAGE_CONTENT=$(curl -X POST http://localhost:8081/page/content \ - -H "Content-Type: application/json" \ - -d "{\"clientId\":\"CLIENT_ID\",\"tabId\":\"$TAB_ID\",\"format\":\"text\"}") +```javascript +formatResponse(responseText, clientId = null, tabId = null) { + const messageId = `msg_${uuidv4().replace(/-/g, '')}`; -# 6. Send to LLM judge with screenshots and content -# (Use OpenAI Vision API or similar with before/after screenshots) + const response = [{ + id: messageId, + type: 'message', + role: 'assistant', + content: [{ type: 'output_text', text: responseText, annotations: [] }] + }]; -# 7. Clean up -curl -X POST http://localhost:8081/tabs/close \ - -H "Content-Type: application/json" \ - -d "{\"clientId\":\"CLIENT_ID\",\"tabId\":\"$TAB_ID\"}" + // Add metadata for screenshot capture + if (clientId && tabId) { + response[0].metadata = { clientId, tabId }; + } + + return response; +} ``` -#### Current Limitations +This metadata is critical for the evals framework to capture screenshots. -**โš ๏ธ Known Issue: WebSocket Timeout** +### Screenshot Capture Flow -Tab opening and closing functionality is currently experiencing a WebSocket timeout issue: +1. Evals calls `/v1/responses` with task +2. Server returns response with `metadata: {clientId, tabId}` +3. Evals extracts metadata from response +4. Evals calls `/page/screenshot` with extracted IDs +5. Server uses CDP to capture screenshot +6. Returns base64-encoded PNG +7. Evals saves screenshot and uses for VisionJudge evaluation -- Symptom: `sendCDPCommand()` times out after 10 seconds with no response -- Error: `CDP command timeout: Target.createTarget` -- Status: Under investigation -- Debugging approach: Added extensive logging to track WebSocket lifecycle events +## Logging -The CDP endpoint is correctly discovered and accessible, but WebSocket messages are not being received. 
This may be related to: -- WebSocket handshake issues -- CDP protocol version mismatch -- Network/proxy configuration -- Chrome process state +Logs are written to `logs/` directory (auto-created): +- `combined.log` - All log events +- `error.log` - Error events only +- `api.log` - API request/response logs + +## Docker Integration + +The eval-server runs inside `kernel-browser-extended` Docker container. + +Volume mount for live development: +```yaml +volumes: + - "./eval-server/nodejs:/opt/eval-server" +``` -**Workaround**: Until this issue is resolved, tab management via the API is not functional. Manual CDP testing is required to diagnose the root cause. +## Dependencies -#### Features Implemented +Core dependencies: +- `ws` - WebSocket server +- `uuid` - ID generation +- `winston` - Structured logging +- `dotenv` - Environment variable management -- โœ… Page HTML/text content access via CDP -- โœ… Screenshot capture via CDP -- โœ… Direct CDP communication for tab management -- โœ… Tab-level CDP command execution with attach/detach +Removed dependencies: +- ~~`openai`~~ - Not needed (evals handles judging) +- ~~`js-yaml`~~ - Not needed (evals handles YAML loading) -#### Future Enhancements +## What This Server Does NOT Do -- Automatic tab registration in ClientManager when DevTools connects -- Tab lifecycle events (opened, closed, navigated) -- Bulk tab operations -- Tab metadata (title, URL, favicon) -- Tab grouping and organization -- Additional CDP methods: - - JavaScript execution with custom expressions - - DOM tree access (`DOM.getDocument`) - - MHTML snapshots (`Page.captureSnapshot`) - - PDF generation (`Page.printToPDF`) +- โŒ Load YAML evaluation definitions (handled by evals/) +- โŒ LLM-as-a-judge evaluation (handled by evals/) +- โŒ Test orchestration (handled by evals/) +- โŒ Result reporting (handled by evals/) +- โŒ Screenshot analysis (handled by evals/) -### Configuration +## What This Server DOES Do -All configuration is managed through environment variables and `src/config.js`. Key settings: -- Server port and host -- OpenAI API configuration -- RPC timeouts -- Logging levels and directories -- Maximum concurrent evaluations -- CDP endpoint (default: localhost:9223) \ No newline at end of file +- โœ… WebSocket server for browser agent connections +- โœ… JSON-RPC 2.0 bidirectional communication +- โœ… HTTP REST API endpoints +- โœ… CDP screenshot capture +- โœ… CDP page content retrieval +- โœ… CDP tab management +- โœ… Return metadata (clientId, tabId) for screenshot capture diff --git a/eval-server/nodejs/clients/1233ae25-9f9e-4f77-924d-865f7d615cef.yaml b/eval-server/nodejs/clients/1233ae25-9f9e-4f77-924d-865f7d615cef.yaml deleted file mode 100644 index f5b865f..0000000 --- a/eval-server/nodejs/clients/1233ae25-9f9e-4f77-924d-865f7d615cef.yaml +++ /dev/null @@ -1,12 +0,0 @@ -client: - id: 1233ae25-9f9e-4f77-924d-865f7d615cef - name: DevTools Client 1233ae25 - secret_key: hello - description: Auto-generated DevTools evaluation client -settings: - max_concurrent_evaluations: 3 - default_timeout: 45000 - retry_policy: - max_retries: 2 - backoff_multiplier: 2 - initial_delay: 1000 diff --git a/eval-server/nodejs/docs/CLIENT_SETUP.md b/eval-server/nodejs/docs/CLIENT_SETUP.md deleted file mode 100644 index 53502ae..0000000 --- a/eval-server/nodejs/docs/CLIENT_SETUP.md +++ /dev/null @@ -1,445 +0,0 @@ -# Client Setup Guide - -## Overview - -This guide explains how to set up a new evaluation client to connect to the evaluation server. 
Clients can be any application that implements the WebSocket evaluation protocol, such as Chrome DevTools or custom test agents. - -## Prerequisites - -- WebSocket client library -- JSON-RPC 2.0 implementation -- UUID v4 generator -- Tools/agents to execute evaluations - -## Setup Steps - -### 1. Generate Client ID - -Generate a unique UUID v4 for your client: - -```javascript -// JavaScript example -import { v4 as uuidv4 } from 'uuid'; -const clientId = uuidv4(); // e.g., "550e8400-e29b-41d4-a716-446655440000" -``` - -Store this ID persistently - it will be used for all connections. - -### 2. Request YAML Configuration - -Contact the evaluation server administrator to: -1. Create a YAML evaluation file for your client ID -2. Optionally set up a secret key for authentication -3. Configure appropriate evaluations for your client - -Example request: -``` -Client ID: 550e8400-e29b-41d4-a716-446655440000 -Client Name: Chrome DevTools Production -Tools Available: extract_schema_data, research_agent, action_agent -Purpose: Automated regression testing -``` - -### 3. Implement WebSocket Connection - -```javascript -class EvaluationClient { - constructor(serverUrl, clientId, secretKey) { - this.serverUrl = serverUrl; - this.clientId = clientId; - this.secretKey = secretKey; - this.ws = null; - } - - connect() { - this.ws = new WebSocket(this.serverUrl); - - this.ws.onopen = () => { - console.log('Connected to evaluation server'); - }; - - this.ws.onmessage = (event) => { - this.handleMessage(JSON.parse(event.data)); - }; - - this.ws.onerror = (error) => { - console.error('WebSocket error:', error); - }; - } -} -``` - -### 4. Implement Protocol Messages - -#### Handle Welcome Message -```javascript -handleMessage(message) { - switch (message.type) { - case 'welcome': - // Server is ready, send registration - this.register(); - break; - - case 'registration_ack': - if (message.status === 'accepted') { - console.log(`Registered! ${message.evaluationsCount} evaluations assigned`); - this.sendReady(); - } else { - console.error('Registration rejected:', message.reason); - } - break; - - default: - // Handle other messages... - } -} -``` - -#### Send Registration -```javascript -register() { - this.send({ - type: 'register', - clientId: this.clientId, - secretKey: this.secretKey, // Optional - capabilities: { - tools: ['extract_schema_data', 'research_agent'], - maxConcurrency: 3, - version: '1.0.0' - } - }); -} -``` - -#### Send Ready Signal -```javascript -sendReady() { - this.send({ - type: 'ready', - timestamp: new Date().toISOString() - }); -} -``` - -### 5. Implement RPC Handler - -```javascript -handleMessage(message) { - // ... existing code ... - - // Handle JSON-RPC requests - if (message.jsonrpc === '2.0' && message.method) { - this.handleRpcRequest(message); - } -} - -async handleRpcRequest(request) { - if (request.method === 'evaluate') { - try { - const result = await this.executeEvaluation(request.params); - - this.send({ - jsonrpc: '2.0', - result: { - status: 'success', - output: result.output, - executionTime: result.duration, - toolCalls: result.toolCalls, - metadata: result.metadata - }, - id: request.id - }); - } catch (error) { - this.send({ - jsonrpc: '2.0', - error: { - code: -32000, - message: error.message, - data: { - tool: request.params.tool, - error: error.toString(), - timestamp: new Date().toISOString() - } - }, - id: request.id - }); - } - } -} -``` - -### 6. 
Implement Tool Execution - -```javascript -async executeEvaluation(params) { - const startTime = Date.now(); - - // Send status update - this.send({ - type: 'status', - evaluationId: params.evaluationId, - status: 'running', - progress: 0.1, - message: 'Starting evaluation...' - }); - - // Execute the appropriate tool - let result; - switch (params.tool) { - case 'extract_schema_data': - result = await this.extractSchema(params.url, params.input); - break; - - case 'research_agent': - result = await this.runResearchAgent(params.url, params.input); - break; - - default: - throw new Error(`Unknown tool: ${params.tool}`); - } - - const executionTime = Date.now() - startTime; - - return { - output: result, - duration: executionTime, - toolCalls: [{ - tool: params.tool, - timestamp: new Date().toISOString(), - duration: executionTime, - status: 'success' - }], - metadata: { - url: params.url, - toolVersion: '1.0.0' - } - }; -} -``` - -## Chrome DevTools Integration - -For Chrome DevTools specifically: - -### 1. Update EvaluationConfig - -```typescript -// In EvaluationConfig.ts -interface EvaluationConfiguration { - enabled: boolean; - endpoint: string; - secretKey?: string; - clientId?: string; // Add client ID field -} - -// Generate and store client ID -function ensureClientId(): string { - let clientId = localStorage.getItem('ai_chat_evaluation_client_id'); - if (!clientId) { - clientId = generateUUID(); - localStorage.setItem('ai_chat_evaluation_client_id', clientId); - } - return clientId; -} -``` - -### 2. Create Evaluation Agent - -```typescript -// EvaluationAgent.ts -import { WebSocketRPCClient } from '../common/WebSocketRPCClient.js'; -import { ToolRegistry } from '../agent_framework/ConfigurableAgentTool.js'; - -export class EvaluationAgent { - private client: WebSocketRPCClient; - private clientId: string; - - constructor(config: EvaluationConfiguration) { - this.clientId = config.clientId || ensureClientId(); - this.client = new WebSocketRPCClient({ - endpoint: config.endpoint, - secretKey: config.secretKey - }); - - this.setupHandlers(); - } - - private setupHandlers(): void { - this.client.on('connected', () => { - this.register(); - }); - - // Handle RPC requests - this.client.on('rpc-request', async (request) => { - if (request.method === 'evaluate') { - const result = await this.handleEvaluation(request.params); - return result; - } - }); - } - - private async handleEvaluation(params: any): Promise { - const tool = ToolRegistry.getRegisteredTool(params.tool); - if (!tool) { - throw new Error(`Tool not found: ${params.tool}`); - } - - // Execute tool with params.input - const result = await tool.execute(params.input); - - return { - status: 'success', - output: result, - executionTime: Date.now() - startTime - }; - } -} -``` - -## Testing Your Client - -### 1. Local Testing - -Use the example agent to test your server setup: - -```bash -# In bo-eval-server directory -npm test -``` - -### 2. Connection Test - -```javascript -// Quick connection test -const client = new EvaluationClient( - 'ws://localhost:8080', - 'your-client-id', - 'optional-secret' -); - -client.connect(); - -// Should see: -// Connected to evaluation server -// Registered! X evaluations assigned -``` - -### 3. Manual Evaluation Test - -You can trigger evaluations manually through the server's CLI: - -```bash -npm run cli -> run-evaluation your-client-id evaluation-id -``` - -## Troubleshooting - -### Connection Issues - -1. 
**Check server is running** - ```bash - curl -i -N -H "Connection: Upgrade" -H "Upgrade: websocket" http://localhost:8080 - ``` - -2. **Verify client ID exists** - - Check `clients/{your-client-id}.yaml` exists on server - - Ensure client ID format is valid UUID v4 - -3. **Authentication failures** - - Verify secret key matches server configuration - - Check for typos in client ID or secret - -### Evaluation Failures - -1. **Tool not found** - - Ensure tool name in YAML matches client capabilities - - Verify tool is registered in your client - -2. **Timeouts** - - Increase timeout in YAML configuration - - Check for infinite loops in tool execution - -3. **Invalid input** - - Validate input against expected schema - - Check for required fields - -## Security Best Practices - -1. **Store credentials securely** - - Never hardcode secret keys - - Use environment variables or secure storage - -2. **Validate inputs** - - Sanitize URLs before navigation - - Validate schemas before execution - -3. **Resource limits** - - Implement timeout handling - - Limit concurrent evaluations - -4. **Use WSS in production** - ```javascript - const client = new EvaluationClient( - 'wss://eval-server.example.com', // Use WSS - clientId, - secretKey - ); - ``` - -## Example: Minimal Client - -```javascript -// minimal-client.js -import WebSocket from 'ws'; - -const CLIENT_ID = 'your-uuid-here'; -const SECRET_KEY = 'your-secret-here'; - -const ws = new WebSocket('ws://localhost:8080'); - -ws.on('open', () => { - console.log('Connected'); -}); - -ws.on('message', async (data) => { - const msg = JSON.parse(data); - - if (msg.type === 'welcome') { - // Register - ws.send(JSON.stringify({ - type: 'register', - clientId: CLIENT_ID, - secretKey: SECRET_KEY, - capabilities: { - tools: ['extract_schema_data'], - maxConcurrency: 1, - version: '1.0.0' - } - })); - } - - if (msg.type === 'registration_ack' && msg.status === 'accepted') { - // Send ready - ws.send(JSON.stringify({ - type: 'ready', - timestamp: new Date().toISOString() - })); - } - - if (msg.jsonrpc && msg.method === 'evaluate') { - // Simple evaluation response - ws.send(JSON.stringify({ - jsonrpc: '2.0', - result: { - status: 'success', - output: { message: 'Evaluation completed' }, - executionTime: 1000 - }, - id: msg.id - })); - } -}); - -ws.on('error', console.error); -``` \ No newline at end of file diff --git a/eval-server/nodejs/docs/PROTOCOL.md b/eval-server/nodejs/docs/PROTOCOL.md deleted file mode 100644 index 694e58a..0000000 --- a/eval-server/nodejs/docs/PROTOCOL.md +++ /dev/null @@ -1,310 +0,0 @@ -# WebSocket Evaluation Protocol - -## Overview - -This document describes the WebSocket communication protocol between evaluation clients (e.g., Chrome DevTools) and the evaluation server. The protocol supports client registration, authentication, and bidirectional evaluation task execution using JSON-RPC 2.0. - -## Connection Flow - -``` -Client Server - | | - |------ WebSocket Connect ------>| - | | - |<----- Welcome Message ---------| - | | - |------ Register Message ------->| - | | - |<----- Registration ACK ---------| - | | - |------ Ready Signal ----------->| - | | - |<===== Evaluation Loop ========>| -``` - -## Message Types - -### 1. Client โ†’ Server Messages - -#### 1.1 Registration Message -Sent immediately after receiving the welcome message to register the client with the server. 
- -```json -{ - "type": "register", - "clientId": "550e8400-e29b-41d4-a716-446655440000", - "secretKey": "optional-secret-key", // Optional field for authentication - "capabilities": { - "tools": ["extract_schema_data", "research_agent", "action_agent"], - "maxConcurrency": 3, - "version": "1.0.0" - } -} -``` - -**Fields:** -- `type`: Must be "register" -- `clientId`: UUID v4 format, unique identifier for the client -- `secretKey`: Optional authentication key -- `capabilities`: Object describing client capabilities - - `tools`: Array of tool names the client can execute - - `maxConcurrency`: Maximum number of concurrent evaluations - - `version`: Client version string - -#### 1.2 Ready Signal -Indicates the client is ready to receive evaluation tasks. - -```json -{ - "type": "ready", - "timestamp": "2024-01-01T00:00:00Z" -} -``` - -#### 1.3 Status Update -Provides progress updates for running evaluations. - -```json -{ - "type": "status", - "evaluationId": "eval-123", - "status": "running" | "completed" | "failed", - "progress": 0.5, // Optional, value between 0 and 1 - "message": "Processing page content..." // Optional status message -} -``` - -#### 1.4 Heartbeat (Ping) -Keep-alive message to maintain connection. - -```json -{ - "type": "ping", - "timestamp": "2024-01-01T00:00:00Z" -} -``` - -### 2. Server โ†’ Client Messages - -#### 2.1 Welcome Message -Sent immediately after WebSocket connection is established. - -```json -{ - "type": "welcome", - "serverId": "server-001", - "version": "1.0.0", - "timestamp": "2024-01-01T00:00:00Z" -} -``` - -#### 2.2 Registration Acknowledgment -Response to client registration. - -```json -{ - "type": "registration_ack", - "clientId": "550e8400-e29b-41d4-a716-446655440000", - "status": "accepted" | "rejected", - "message": "Client registered successfully", - "evaluationsCount": 5, // Number of evaluations assigned to this client - "reason": "Invalid secret key" // Only present if status is "rejected" -} -``` - -#### 2.3 Heartbeat Response (Pong) -Response to client ping. - -```json -{ - "type": "pong", - "timestamp": "2024-01-01T00:00:00Z" -} -``` - -## JSON-RPC 2.0 Evaluation Protocol - -The evaluation tasks are sent using JSON-RPC 2.0 protocol over the WebSocket connection. - -### 3. Evaluation Request (Server โ†’ Client) - -#### 3.1 Evaluate Method -Requests the client to execute an evaluation task. - -```json -{ - "jsonrpc": "2.0", - "method": "evaluate", - "params": { - "evaluationId": "wikipedia-chrome-devtools-001", - "name": "Extract Chrome DevTools Wikipedia Article", - "url": "https://en.wikipedia.org/wiki/Chrome_DevTools", - "tool": "extract_schema_data", - "input": { - "schema": { - "type": "object", - "properties": { - "title": {"type": "string"}, - "summary": {"type": "string"}, - "tableOfContents": { - "type": "array", - "items": {"type": "string"} - } - } - } - }, - "timeout": 30000, // Timeout in milliseconds - "metadata": { - "tags": ["schema-extraction", "wikipedia"], - "retries": 2, - "priority": "normal" - } - }, - "id": "rpc-001" -} -``` - -**Parameters:** -- `evaluationId`: Unique identifier for this evaluation (from YAML definition) -- `name`: Human-readable name of the evaluation -- `url`: Target URL for the evaluation -- `tool`: Name of the tool to execute -- `input`: Tool-specific input parameters -- `timeout`: Maximum execution time in milliseconds -- `metadata`: Additional evaluation metadata - -### 4. Evaluation Response (Client โ†’ Server) - -#### 4.1 Success Response -Sent when evaluation completes successfully. 
- -```json -{ - "jsonrpc": "2.0", - "result": { - "status": "success", - "output": { - "title": "Chrome DevTools", - "summary": "Chrome DevTools is a set of web developer tools built directly into the Google Chrome browser.", - "tableOfContents": [ - "Overview", - "Features", - "History", - "Usage" - ] - }, - "executionTime": 2500, // Total execution time in milliseconds - "toolCalls": [ - { - "tool": "extract_schema_data", - "timestamp": "2024-01-01T00:00:00Z", - "duration": 2400, - "status": "success" - } - ], - "metadata": { - "pageLoadTime": 800, - "extractionTime": 1700, - "retryCount": 0 - } - }, - "id": "rpc-001" -} -``` - -#### 4.2 Error Response -Sent when evaluation fails. - -```json -{ - "jsonrpc": "2.0", - "error": { - "code": -32000, - "message": "Tool execution failed", - "data": { - "tool": "extract_schema_data", - "error": "Page load timeout after 30000ms", - "url": "https://en.wikipedia.org/wiki/Chrome_DevTools", - "timestamp": "2024-01-01T00:00:00Z", - "stackTrace": "Error: Timeout...\n at PageLoader.load..." // Optional - } - }, - "id": "rpc-001" -} -``` - -## Error Codes - -Standard JSON-RPC 2.0 error codes: -- `-32700`: Parse error - Invalid JSON was received -- `-32600`: Invalid request - JSON is not a valid request object -- `-32601`: Method not found - Method does not exist -- `-32602`: Invalid params - Invalid method parameters -- `-32603`: Internal error - Internal JSON-RPC error - -Custom error codes for evaluation: -- `-32000`: Tool execution error - Tool failed during execution -- `-32001`: Timeout error - Evaluation exceeded timeout -- `-32002`: Authentication error - Invalid or missing credentials -- `-32003`: Rate limit exceeded - Too many requests -- `-32004`: Invalid tool - Requested tool not available -- `-32005`: Resource error - Unable to access required resources - -## Connection Management - -### Reconnection -- Clients should implement automatic reconnection with exponential backoff -- On reconnection, clients must re-register with the same clientId -- Server maintains evaluation state across reconnections - -### Timeouts -- Default connection timeout: 60 seconds -- Ping interval: 30 seconds -- Evaluation timeout: Specified per evaluation in YAML - -### Rate Limiting -- Server may implement rate limiting per client -- Rate limit errors use code `-32003` -- Clients should respect rate limit headers in error responses - -## Security Considerations - -1. **Authentication**: Clients may use optional secret keys for authentication -2. **Transport Security**: Production deployments should use WSS (WebSocket Secure) -3. **Input Validation**: All inputs should be validated against schemas -4. **Resource Limits**: Enforce timeouts and memory limits for evaluations - -## Examples - -### Complete Flow Example - -1. **Client connects and registers:** -```json -// Client โ†’ Server -{"type": "register", "clientId": "550e8400-e29b-41d4-a716-446655440000", "capabilities": {"tools": ["extract_schema_data"], "maxConcurrency": 3, "version": "1.0.0"}} - -// Server โ†’ Client -{"type": "registration_ack", "clientId": "550e8400-e29b-41d4-a716-446655440000", "status": "accepted", "message": "Client registered successfully", "evaluationsCount": 2} -``` - -2. **Client signals ready:** -```json -// Client โ†’ Server -{"type": "ready", "timestamp": "2024-01-01T00:00:00Z"} -``` - -3. 
**Server sends evaluation:**
-```json
-// Server โ†’ Client
-{"jsonrpc": "2.0", "method": "evaluate", "params": {"evaluationId": "test-001", "url": "https://example.com", "tool": "extract_schema_data", "input": {"schema": {"type": "object", "properties": {"title": {"type": "string"}}}}, "timeout": 30000}, "id": "rpc-001"}
-```
-
-4. **Client returns result:**
-```json
-// Client โ†’ Server
-{"jsonrpc": "2.0", "result": {"status": "success", "output": {"title": "Example Domain"}, "executionTime": 1500}, "id": "rpc-001"}
-```
-
-## Version History
-
-- **1.0.0** (2024-01-01): Initial protocol version
\ No newline at end of file
diff --git a/eval-server/nodejs/docs/TRIGGERING_EVALUATIONS.md b/eval-server/nodejs/docs/TRIGGERING_EVALUATIONS.md
deleted file mode 100644
index 4dd0078..0000000
--- a/eval-server/nodejs/docs/TRIGGERING_EVALUATIONS.md
+++ /dev/null
@@ -1,306 +0,0 @@
-# How to Trigger Evaluations
-
-This guide explains all the different ways to trigger evaluations in the system.
-
-## Prerequisites
-
-1. **Server Running**: Make sure the evaluation server is running:
-   ```bash
-   npm start
-   ```
-
-2. **Client Connected**: A DevTools client must be connected and ready. You'll see logs like:
-   ```
-   [info]: Client registered successfully {"clientId":"550e8400...","capabilities":"extract_schema_data, research_agent"}
-   [info]: Client ready for evaluations {"clientId":"550e8400..."}
-   ```
-
-## Method 1: Interactive CLI
-
-Start the interactive CLI:
-```bash
-npm run cli
-```
-
-### Available Commands
-
-#### List Clients and Evaluations
-```bash
-eval-server> clients
-```
-This shows all registered clients and their available evaluations with current status.
-
-#### Run Specific Evaluation
-```bash
-eval-server> run <clientId> <evaluationId>
-```
-Example:
-```bash
-eval-server> run 550e8400-e29b-41d4-a716-446655440000 wikipedia-chrome-devtools-001
-```
-
-#### Run All Evaluations for a Client
-```bash
-eval-server> run-all <clientId>
-```
-Example:
-```bash
-eval-server> run-all 550e8400-e29b-41d4-a716-446655440000
-```
-
-#### Check Status
-```bash
-eval-server> status
-```
-Shows server status, connected clients, and active evaluations.
-
-#### Get Help
-```bash
-eval-server> help
-```
-
-## Method 2: HTTP API
-
-The server also exposes an HTTP API on port 8081.
-
-### Get Server Status
-```bash
-curl http://localhost:8081/status
-```
-
-### List All Clients
-```bash
-curl http://localhost:8081/clients
-```
-
-### Get Client Evaluations
-```bash
-curl "http://localhost:8081/clients/:id/evaluations?id=550e8400-e29b-41d4-a716-446655440000"
-```
-
-### Trigger Specific Evaluation
-```bash
-curl -X POST http://localhost:8081/evaluate \
-  -H "Content-Type: application/json" \
-  -d '{
-    "clientId": "550e8400-e29b-41d4-a716-446655440000",
-    "evaluationId": "wikipedia-chrome-devtools-001"
-  }'
-```
-
-### Trigger All Evaluations for a Client
-```bash
-curl -X POST http://localhost:8081/evaluate \
-  -H "Content-Type: application/json" \
-  -d '{
-    "clientId": "550e8400-e29b-41d4-a716-446655440000",
-    "runAll": true
-  }'
-```
-
-
-## Method 3: Programmatic Integration
-
-You can integrate the evaluation system into your own applications:
-
-### Node.js Example
-```javascript
-import { EvaluationServer } from './src/server.js';
-
-const server = new EvaluationServer();
-server.start();
-
-// Wait for client to connect
-setTimeout(async () => {
-  const clientId = '550e8400-e29b-41d4-a716-446655440000';
-  const evaluationId = 'wikipedia-chrome-devtools-001';
-
-  // Get client connection
-  const connection = server.connectedAgents.get(clientId);
-  if (connection && connection.ready) {
-    // Get evaluation
-    const evaluation = server.getClientManager()
-      .getClientEvaluations(clientId)
-      .find(e => e.id === evaluationId);
-
-    if (evaluation) {
-      // Execute evaluation
-      await server.executeEvaluation(connection, evaluation);
-      console.log('Evaluation completed!');
-    }
-  }
-}, 5000);
-```
-
-### Python Example (using HTTP API)
-```python
-import requests
-import json
-
-def trigger_evaluation(client_id, evaluation_id):
-    response = requests.post('http://localhost:8081/evaluate',
-        headers={'Content-Type': 'application/json'},
-        json={
-            'clientId': client_id,
-            'evaluationId': evaluation_id
-        })
-
-    if response.status_code == 200:
-        return response.json()
-    else:
-        raise Exception(f"Failed to trigger evaluation: {response.text}")
-
-# Example usage
-result = trigger_evaluation(
-    '550e8400-e29b-41d4-a716-446655440000',
-    'wikipedia-chrome-devtools-001'
-)
-print(json.dumps(result, indent=2))
-```
-
-## Method 4: Webhook Integration
-
-You can set up webhooks to trigger evaluations from external systems:
-
-### GitHub Actions Example
-```yaml
-name: Run Evaluations
-on:
-  schedule:
-    - cron: '0 9 * * *'  # Daily at 9 AM
-  workflow_dispatch:     # Manual trigger
-
-jobs:
-  evaluate:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Trigger Evaluation
-        run: |
-          curl -X POST ${{ secrets.EVAL_SERVER_URL }}/evaluate \
-            -H "Content-Type: application/json" \
-            -d '{
-              "clientId": "${{ secrets.CLIENT_ID }}",
-              "runAll": true
-            }'
-```
-
-### Slack Bot Example
-```javascript
-// Slack bot command: /eval wikipedia
-app.command('/eval', async ({ command, ack, respond }) => {
-  await ack();
-
-  const evaluationId = command.text.trim();
-  const clientId = process.env.DEFAULT_CLIENT_ID;
-
-  try {
-    const response = await fetch('http://localhost:8081/evaluate', {
-      method: 'POST',
-      headers: { 'Content-Type': 'application/json' },
-      body: JSON.stringify({ clientId, evaluationId })
-    });
-
-    const result = await response.json();
-    await respond(`โœ… Evaluation '${evaluationId}' completed successfully!`);
-  } catch (error) {
-    await respond(`โŒ Evaluation failed: ${error.message}`);
-  }
-});
-```
-
-## Monitoring Evaluation Results
-
-### Real-time Logs
-Monitor the server logs to see 
evaluation progress: -```bash -tail -f logs/combined.log -``` - -### Status Checking -Check evaluation status via API: -```bash -# Get all evaluations for a client -curl "http://localhost:8081/clients/:id/evaluations?id=CLIENT_ID" - -# Check server status -curl http://localhost:8081/status -``` - -### Log Files -Evaluation results are logged to: -- `logs/combined.log` - All logs -- `logs/error.log` - Error logs only - -## Troubleshooting - -### Client Not Connected -``` -โŒ Client 'CLIENT_ID' is not connected or not ready -``` -**Solutions:** -1. Make sure DevTools is running and connected -2. Check that the client ID matches -3. Verify the WebSocket connection is working - -### Evaluation Not Found -``` -โŒ Evaluation 'EVAL_ID' not found for client 'CLIENT_ID' -``` -**Solutions:** -1. Check the YAML file for the correct evaluation ID -2. Ensure the evaluation is enabled (`enabled: true`) -3. Reload the server if you changed the YAML file - -### Tool Not Available -``` -Tool execution failed: Tool not found: tool_name -``` -**Solutions:** -1. Verify the tool is registered in DevTools -2. Check that the tool name matches exactly -3. Ensure DevTools has the required capabilities - -### Connection Timeout -``` -WebSocket connection failed -``` -**Solutions:** -1. Check if the server is running on the correct port -2. Verify firewall settings -3. Check network connectivity - -## Best Practices - -1. **Start Simple**: Begin with manual evaluations before setting up automation -2. **Monitor Logs**: Always monitor logs when running evaluations -3. **Test Connections**: Use the `status` command to verify everything is connected -4. **Gradual Rollout**: Test individual evaluations before running batch operations -5. **Error Handling**: Implement proper error handling in automated systems -6. **Rate Limiting**: Don't run too many evaluations simultaneously - -## Example Workflow - -Here's a typical workflow for triggering evaluations: - -```bash -# 1. Start the server -npm start - -# 2. In another terminal, start the CLI -npm run cli - -# 3. Check status and clients -eval-server> status -eval-server> clients - -# 4. Run a specific evaluation -eval-server> run 550e8400-e29b-41d4-a716-446655440000 wikipedia-chrome-devtools-001 - -# 5. Check results in logs -# (Monitor the server logs for detailed results) - -# 6. Run all evaluations if needed -eval-server> run-all 550e8400-e29b-41d4-a716-446655440000 -``` - -This comprehensive guide covers all the ways to trigger and monitor evaluations in your system! \ No newline at end of file diff --git a/eval-server/nodejs/docs/YAML_SCHEMA.md b/eval-server/nodejs/docs/YAML_SCHEMA.md deleted file mode 100644 index ea15dcd..0000000 --- a/eval-server/nodejs/docs/YAML_SCHEMA.md +++ /dev/null @@ -1,315 +0,0 @@ -# YAML Evaluation Schema Documentation - -## Overview - -This document describes the YAML schema used to define evaluations for each client. Each client has a dedicated YAML file stored in the `clients/` directory, named after their client ID. - -## File Location - -``` -bo-eval-server/ -โ””โ”€โ”€ clients/ - โ”œโ”€โ”€ 550e8400-e29b-41d4-a716-446655440000.yaml - โ”œโ”€โ”€ 771f9500-f39c-52e5-b827-557766551111.yaml - โ””โ”€โ”€ ... 
-``` - -## Schema Structure - -### Root Level - -```yaml -# Client identification and authentication -client: - id: "550e8400-e29b-41d4-a716-446655440000" # Required: UUID v4 - name: "Chrome DevTools Agent" # Required: Human-readable name - secret_key: "optional-secret-key" # Optional: Authentication key - description: "Production DevTools instance" # Optional: Client description - -# Client-specific settings -settings: - max_concurrent_evaluations: 3 # Maximum parallel evaluations - default_timeout: 30000 # Default timeout in milliseconds - retry_policy: - max_retries: 2 # Maximum retry attempts - backoff_multiplier: 2 # Exponential backoff multiplier - initial_delay: 1000 # Initial retry delay in ms - -# List of evaluations assigned to this client -evaluations: - - id: "eval-001" - # ... evaluation definition - - id: "eval-002" - # ... evaluation definition -``` - -### Evaluation Definition - -Each evaluation in the `evaluations` array follows this structure: - -```yaml -- id: "wikipedia-chrome-devtools-001" # Required: Unique evaluation ID - name: "Extract Chrome DevTools Wikipedia" # Required: Display name - description: "Extract structured data" # Optional: Detailed description - enabled: true # Optional: Enable/disable (default: true) - - # Target configuration - target: - url: "https://en.wikipedia.org/wiki/Chrome_DevTools" # Required: Target URL - wait_for: "networkidle" # Optional: Wait condition (load|domcontentloaded|networkidle) - wait_timeout: 5000 # Optional: Wait timeout in ms - - # Tool configuration - tool: "extract_schema_data" # Required: Tool to execute - timeout: 30000 # Optional: Override default timeout - - # Tool-specific input - input: - schema: # For extract_schema_data tool - type: "object" - properties: - title: - type: "string" - summary: - type: "string" - - - # Validation configuration - validation: - type: "llm-judge" # llm-judge|snapshot|hybrid - - # For llm-judge validation - llm_judge: - model: "gpt-4o-mini" # LLM model to use - temperature: 0.3 # Model temperature - criteria: # Evaluation criteria - - "Title should be accurately extracted" - - "Summary should be comprehensive" - - "All required fields should be present" - - # Visual verification settings - visual_verification: - enabled: true - capture_before: true # Screenshot before tool execution - capture_after: true # Screenshot after tool execution - prompts: # Custom verification prompts - - "Verify the title matches the page header" - - # For snapshot validation - snapshot: - structure_only: false # Compare structure only - exclude_paths: # Paths to exclude from comparison - - "timestamp" - - "random_id" - sanitizers: # Value sanitization rules - - path: "date" - pattern: "\\d{4}-\\d{2}-\\d{2}" - replacement: "YYYY-MM-DD" - - # For hybrid validation (both llm-judge and snapshot) - hybrid: - weight_llm: 0.7 # Weight for LLM score - weight_snapshot: 0.3 # Weight for snapshot score - - # Metadata and tags - metadata: - tags: # Categorization tags - - "schema-extraction" - - "wikipedia" - - "regression" - priority: "normal" # low|normal|high - owner: "team-browser" # Responsible team/person - created: "2024-01-01" # Creation date - modified: "2024-01-15" # Last modification date -``` - -## Tool-Specific Input Schemas - -### extract_schema_data - -```yaml -input: - schema: # JSON Schema for extraction - type: "object" - properties: - title: - type: "string" - items: - type: "array" - items: - type: "object" - properties: - name: - type: "string" - price: - type: "number" -``` - -### research_agent - 
-```yaml -input: - query: "Research the latest AI developments" # Research query - max_iterations: 5 # Maximum agent iterations - include_sources: true # Include source URLs - depth: "comprehensive" # shallow|moderate|comprehensive -``` - -### action_agent - -```yaml -input: - task: "Fill out the contact form" # Task description - form_data: # Data to use - name: "Test User" - email: "test@example.com" - verify_completion: true # Verify task completion -``` - -### web_task_agent - -```yaml -input: - instructions: | # Multi-line instructions - 1. Navigate to the products page - 2. Search for "laptop" - 3. Filter by price < $1000 - 4. Extract the first 5 results - expected_outcome: "List of laptops under $1000" - max_steps: 10 # Maximum action steps -``` - -## Complete Example - -```yaml -client: - id: "550e8400-e29b-41d4-a716-446655440000" - name: "Chrome DevTools Production Agent" - secret_key: "sk-prod-abc123" - description: "Production DevTools instance for continuous evaluation" - -settings: - max_concurrent_evaluations: 5 - default_timeout: 45000 - retry_policy: - max_retries: 3 - backoff_multiplier: 2 - initial_delay: 2000 - -evaluations: - # Schema extraction evaluation - - id: "schema-extract-wiki-001" - name: "Wikipedia Chrome DevTools Schema Extraction" - description: "Test schema extraction on Wikipedia article" - enabled: true - - target: - url: "https://en.wikipedia.org/wiki/Chrome_DevTools" - wait_for: "networkidle" - wait_timeout: 5000 - - tool: "extract_schema_data" - timeout: 30000 - - input: - schema: - type: "object" - properties: - title: - type: "string" - summary: - type: "string" - features: - type: "array" - items: - type: "string" - lastModified: - type: "string" - - - validation: - type: "hybrid" - llm_judge: - model: "gpt-4o" - criteria: - - "All schema fields must be populated" - - "Summary should be at least 100 characters" - - "Features should contain at least 5 items" - snapshot: - exclude_paths: - - "lastModified" - hybrid: - weight_llm: 0.6 - weight_snapshot: 0.4 - - metadata: - tags: ["schema", "wikipedia", "daily"] - priority: "high" - owner: "qa-team" - - # Research agent evaluation - - id: "research-agent-news-001" - name: "Research Latest Tech News" - description: "Test research agent on current tech news" - enabled: true - - target: - url: "https://news.ycombinator.com" - - tool: "research_agent" - timeout: 60000 - - input: - query: "What are the top 3 technology stories today?" - max_iterations: 5 - include_sources: true - depth: "moderate" - - - validation: - type: "llm-judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Response includes 3 distinct technology stories" - - "Each story has a clear summary" - - "Sources are provided for each story" - - "Information is current (from today)" - - metadata: - tags: ["research", "news", "tech"] - priority: "normal" -``` - -## Validation Rules - -1. **Client ID**: Must be valid UUID v4 format -2. **Evaluation IDs**: Must be unique within the file -3. **Tool names**: Must match registered tools in the client -4. **URLs**: Must be valid HTTP/HTTPS URLs -5. **Timeouts**: Must be positive integers (milliseconds) - -## YAML Best Practices - -1. Use meaningful IDs that describe the evaluation -2. Group related evaluations together -3. Use tags consistently for categorization -4. Document complex input schemas with comments -5. Keep validation criteria specific and measurable -6. 
Use anchors and aliases for repeated configurations: - -```yaml -# Define anchor -defaults: &defaults - timeout: 30000 - retry_policy: - max_retries: 2 - -# Use alias -evaluations: - - id: "eval-001" - <<: *defaults # Inherits timeout and retry_policy - name: "Test 1" - # ... -``` \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/a11y-001.yaml b/eval-server/nodejs/evals/action-agent/a11y-001.yaml deleted file mode 100644 index 7c7947a..0000000 --- a/eval-server/nodejs/evals/action-agent/a11y-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Accessibility action test -id: "a11y-001" -name: "Click Using ARIA Label" -description: "Test clicking an element identified primarily by ARIA attributes" -enabled: true - -target: - url: "https://www.w3.org/WAI/ARIA/apg/patterns/button/examples/button/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click the button with aria-label \"Print Page\"" - reasoning: "Testing action selection using accessibility attributes" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Used accessibility tree to find elements" - - "Correctly identified element by ARIA label" - - "Successfully clicked the target button" - - "Demonstrated understanding of accessibility attributes" - - "No reliance on visual appearance alone" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the Print Page button was successfully clicked" - - "Check if any print dialog or print preview appeared" - - "Confirm the button showed visual feedback (pressed state)" - - "Ensure the action was performed on the correct accessibility-labeled element" - -metadata: - tags: ["action", "accessibility", "aria", "click", "a11y"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/accordion-001.yaml b/eval-server/nodejs/evals/action-agent/accordion-001.yaml deleted file mode 100644 index dae142d..0000000 --- a/eval-server/nodejs/evals/action-agent/accordion-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Accordion expansion test -id: "accordion-001" -name: "Expand Accordion Section" -description: "Test clicking to expand an accordion panel" -enabled: true - -target: - url: "https://jqueryui.com/accordion/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click to expand the \"Section 2\" accordion panel" - reasoning: "Testing accordion expand/collapse interaction" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Section 2 accordion header" - - "Successfully clicked to expand the section" - - "Section 2 content became visible" - - "Other sections collapsed appropriately" - - "Accordion animation completed smoothly" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify Section 2 is now expanded and content visible" - - "Check if other accordion sections collapsed" - - "Confirm the expansion animation completed" - - "Ensure Section 2 header shows expanded state" - -metadata: - tags: ["action", "accordion", "expand", "collapse", "ui"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git 
a/eval-server/nodejs/evals/action-agent/action-agent-a11y-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-a11y-001.yaml deleted file mode 100644 index 9526551..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-a11y-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Accessibility action test -id: "action-agent-a11y-001" -name: "Click Using ARIA Label" -description: "Test clicking an element identified primarily by ARIA attributes" -enabled: true - -target: - url: "https://www.w3.org/WAI/ARIA/apg/patterns/button/examples/button/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click the button with aria-label \"Print Page\"" - reasoning: "Testing action selection using accessibility attributes" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Used accessibility tree to find elements" - - "Correctly identified element by ARIA label" - - "Successfully clicked the target button" - - "Demonstrated understanding of accessibility attributes" - - "No reliance on visual appearance alone" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the Print Page button was successfully clicked" - - "Check if any print dialog or print preview appeared" - - "Confirm the button showed visual feedback (pressed state)" - - "Ensure the action was performed on the correct accessibility-labeled element" - -metadata: - tags: ["action", "accessibility", "aria", "click", "a11y"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-accordion-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-accordion-001.yaml deleted file mode 100644 index f2df343..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-accordion-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Accordion expansion test -id: "action-agent-accordion-001" -name: "Expand Accordion Section" -description: "Test clicking to expand an accordion panel" -enabled: true - -target: - url: "https://jqueryui.com/accordion/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click to expand the \"Section 2\" accordion panel" - reasoning: "Testing accordion expand/collapse interaction" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Section 2 accordion header" - - "Successfully clicked to expand the section" - - "Section 2 content became visible" - - "Other sections collapsed appropriately" - - "Accordion animation completed smoothly" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify Section 2 is now expanded and content visible" - - "Check if other accordion sections collapsed" - - "Confirm the expansion animation completed" - - "Ensure Section 2 header shows expanded state" - -metadata: - tags: ["action", "accordion", "expand", "collapse", "ui"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-autocomplete-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-autocomplete-001.yaml deleted file mode 100644 index c22bfc7..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-autocomplete-001.yaml +++ 
/dev/null @@ -1,46 +0,0 @@ -# Autocomplete search test -id: "action-agent-autocomplete-001" -name: "Use Autocomplete Search" -description: "Test typing in autocomplete field and selecting from suggestions" -enabled: true - -target: - url: "https://jqueryui.com/autocomplete/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Type \"Java\" in the autocomplete field and select \"JavaScript\" from suggestions" - reasoning: "Testing autocomplete/typeahead interaction patterns" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the autocomplete input field" - - "Typed \"Java\" to trigger suggestions" - - "Autocomplete dropdown appeared with suggestions" - - "Selected \"JavaScript\" from the suggestion list" - - "Input field shows the selected value" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify \"JavaScript\" appears in the input field" - - "Check if autocomplete suggestions appeared" - - "Confirm the correct suggestion was selected" - - "Ensure dropdown closed after selection" - -metadata: - tags: ["action", "autocomplete", "typeahead", "search", "suggestions"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-checkbox-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-checkbox-001.yaml deleted file mode 100644 index b76f307..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-checkbox-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Checkbox/radio button test -id: "action-agent-checkbox-001" -name: "Toggle Newsletter Checkbox" -description: "Test clicking checkbox elements for form options" -enabled: true - -target: - url: "https://www.w3schools.com/html/tryit.asp?filename=tryhtml_checkbox" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 45000 - -input: - objective: "Click the checkbox labeled \"I have a bike\" to check it" - reasoning: "Testing interaction with checkbox form elements" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Identified the correct checkbox among multiple options" - - "Used click action on the checkbox element" - - "Checkbox state changed from unchecked to checked" - - "Handled the iframe structure if present" - - "No errors with form element interaction" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify the checkbox state changed from unchecked to checked" - - "Confirm the \"I have a bike\" checkbox now shows a checkmark" - - "Verify the checkbox visual indicator (checkmark) is clearly visible" - - "Ensure no other checkboxes were accidentally modified" - -metadata: - tags: ["action", "checkbox", "form", "w3schools", "input"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-checkbox-002.yaml b/eval-server/nodejs/evals/action-agent/action-agent-checkbox-002.yaml deleted file mode 100644 index 0b25fa8..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-checkbox-002.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Toggle checkbox test - using HTML form test site -id: "action-agent-checkbox-002" -name: "Check Extra Cheese Checkbox" 
-description: "Test checking a specific checkbox using the check method" -enabled: true - -target: - url: "https://httpbin.org/forms/post" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 45000 - -input: - objective: "Find and check the \"Extra Cheese\" checkbox in the Pizza Toppings section" - reasoning: "Testing checkbox interaction functionality using check method" - hint: "Look for the Extra Cheese checkbox and use the check method to select it" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Extra Cheese checkbox in the Pizza Toppings section" - - "Used the check method instead of click for better reliability" - - "Checkbox became checked (if it wasn't already)" - - "No errors occurred during checkbox interaction" - - "Form maintained its structure after checkbox selection" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the Extra Cheese checkbox is now checked (shows checkmark)" - - "Check that the checkbox shows proper visual feedback for checked state" - - "Confirm the form structure remained intact" - - "Ensure the checkbox for Extra Cheese was specifically targeted and checked" - -metadata: - tags: ["action", "checkbox", "check", "form", "httpbin"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-click-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-click-001.yaml deleted file mode 100644 index e9af6cf..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-click-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Basic search interaction test -id: "action-agent-click-001" -name: "Search with Text Entry and Click" -description: "Test entering text in search field and clicking search button" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 90000 - -input: - objective: "Type \"DevTools automation\" in the search box and then click the \"Google Search\" button" - reasoning: "Testing multi-step interaction: text input followed by button click" - hint: "First fill the search input field, then find and click the search button" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Successfully located the search input field" - - "Entered \"DevTools automation\" text in the search box" - - "Located the Google Search button after entering text" - - "Successfully clicked the search button" - - "Search was executed and results page loaded" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify text \"DevTools automation\" was entered in the search field" - - "Check if search results page loaded with relevant results" - - "Confirm the search was executed (URL changed to results page)" - - "Ensure search results are related to \"DevTools automation\"" - -metadata: - tags: ["action", "multi-step", "search", "form-fill", "click", "google", "basic"] - priority: "high" - timeout: 90000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-context-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-context-001.yaml deleted file mode 100644 index 6162697..0000000 --- 
a/eval-server/nodejs/evals/action-agent/action-agent-context-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Right click context menu test -id: "action-agent-context-001" -name: "Right Click Context Menu" -description: "Test right-clicking to open context menu" -enabled: true - -target: - url: "https://the-internet.herokuapp.com/context_menu" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Right-click on the context menu area to open the context menu" - reasoning: "Testing right-click context menu interaction" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the designated context menu area" - - "Performed right-click action correctly" - - "Context menu appeared with options" - - "Successfully triggered the right-click event" - - "Alert or confirmation appeared as expected" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify right-click was performed on correct area" - - "Check if context menu or alert appeared" - - "Confirm right-click event was properly triggered" - - "Ensure the expected response occurred" - -metadata: - tags: ["action", "context-menu", "right-click", "mouse", "menu"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-datepicker-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-datepicker-001.yaml deleted file mode 100644 index f4abbf7..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-datepicker-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Date picker test -id: "action-agent-datepicker-001" -name: "Select Date from Calendar" -description: "Test clicking date input and selecting a specific date from calendar popup" -enabled: true - -target: - url: "https://jqueryui.com/datepicker/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click the date input field and select March 15, 2024 from the calendar picker" - reasoning: "Testing interaction with calendar popup widgets" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located and clicked the date input field" - - "Calendar popup opened successfully" - - "Navigated to correct month/year if needed" - - "Selected the specific date (March 15, 2024)" - - "Date input field shows the selected date" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the date input field contains the selected date" - - "Check if the calendar widget opened and closed properly" - - "Confirm the correct date was highlighted and selected" - - "Ensure the date format matches expected output" - -metadata: - tags: ["action", "datepicker", "calendar", "form", "popup"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-daterange-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-daterange-001.yaml deleted file mode 100644 index 4581a47..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-daterange-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Date range picker test -id: "action-agent-daterange-001" -name: "Select Date Range" -description: "Test selecting a date range with start and end dates" -enabled: 
true - -target: - url: "https://www.daterangepicker.com/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Select a date range from February 1, 2024 to February 28, 2024" - reasoning: "Testing complex date range selection with start and end dates" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Opened the date range picker interface" - - "Selected the start date (February 1, 2024)" - - "Selected the end date (February 28, 2024)" - - "Date range was properly applied" - - "Input field shows the complete date range" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify both start and end dates are displayed in the input" - - "Check if the date range picker shows the selected range" - - "Confirm the format matches expected date range display" - - "Ensure both dates were selected in sequence" - -metadata: - tags: ["action", "daterange", "date-picker", "form", "complex"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-dropdown-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-dropdown-001.yaml deleted file mode 100644 index b37b91c..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-dropdown-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Dropdown selection test -id: "action-agent-dropdown-001" -name: "Select Dropdown Option" -description: "Test selecting an option from a dropdown menu" -enabled: true - -target: - url: "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_select" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 45000 - -input: - objective: "Select \"Audi\" from the car brands dropdown menu" - reasoning: "Testing dropdown selection interaction" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the dropdown/select element" - - "Identified the correct option to select" - - "Successfully selected the Audi option" - - "Dropdown value changed to the selected option" - - "Handled select element interaction properly" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify the dropdown selection changed" - - "Confirm \"Audi\" is now displayed as the selected option" - - "Check if the dropdown is closed after selection" - - "Verify no other form elements were affected by the selection" - -metadata: - tags: ["action", "dropdown", "select", "form", "w3schools"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-dynamic-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-dynamic-001.yaml deleted file mode 100644 index a4380f3..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-dynamic-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Dynamic content interaction test -id: "action-agent-dynamic-001" -name: "Click Dynamic Load Button" -description: "Test clicking a button that loads dynamic content" -enabled: true - -target: - url: "https://the-internet.herokuapp.com/dynamic_loading/1" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 90000 - -input: - objective: "Click the \"Start\" button to trigger dynamic content 
loading" - reasoning: "Testing interaction with dynamically loaded content" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Found and clicked the Start button" - - "Handled the dynamic loading process" - - "Recognized that content changes after clicking" - - "No timing issues with the dynamic content" - - "Successfully triggered the loading animation" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify dynamic content loaded after clicking Start" - - "Check if loading animation or spinner was displayed" - - "Confirm new content appeared that was previously hidden" - - "Verify the Start button state changed or was replaced after clicking" - -metadata: - tags: ["action", "dynamic", "click", "ajax", "loading"] - priority: "high" - timeout: 90000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-ecommerce-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-ecommerce-001.yaml deleted file mode 100644 index 503c157..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-ecommerce-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# E-commerce action test -id: "action-agent-ecommerce-001" -name: "Add Product to Cart" -description: "Test clicking \"Add to Cart\" button on an e-commerce product page" -enabled: true - -target: - url: "https://www.homedepot.com/p/Husky-20-Gal-Professional-Duty-Waterproof-Storage-Container-with-Hinged-Lid-in-Red-249160/313799634" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 180000 - -input: - objective: "Click the \"Add to Cart\" button for this storage container" - reasoning: "Testing e-commerce interaction with product cart functionality" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Add to Cart button on the product page" - - "Successfully clicked the button" - - "Handled any popups or confirmations that appeared" - - "Verified the item was added (cart count changed or confirmation shown)" - - "Dealt with page dynamics after clicking" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify the Add to Cart button was clicked" - - "Check if cart count indicator increased or shows the item was added" - - "Look for any confirmation popup or notification about the item being added" - - "Verify the button state changed (e.g., to \"Added to Cart\" or disabled)" - -metadata: - tags: ["action", "ecommerce", "click", "homedepot", "cart"] - priority: "high" - timeout: 180000 - retries: 3 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-error-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-error-001.yaml deleted file mode 100644 index 43c95e6..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-error-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Error recovery test -id: "action-agent-error-001" -name: "Handle Missing Element" -description: "Test agent behavior when target element is not found" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click the \"Sign Up\" button" - reasoning: "Testing error handling when element does not exist" 
- hint: "There is no Sign Up button on Google homepage - agent should handle gracefully" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Attempted to find the requested element" - - "Recognized that the element does not exist" - - "Provided clear error message or explanation" - - "Did not crash or produce confusing output" - - "Suggested alternatives or explained the issue" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the page remains in a stable state despite the missing element" - - "Confirm no error dialogs or broken UI elements appeared" - - "Check that the agent handled the missing element gracefully" - - "Ensure the page was properly analyzed even though the target was not found" - -metadata: - tags: ["action", "error-handling", "missing-element", "recovery", "edge-case"] - priority: "high" - timeout: 60000 - retries: 1 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-filter-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-filter-001.yaml deleted file mode 100644 index 7782999..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-filter-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Search filter application test -id: "action-agent-filter-001" -name: "Apply Search Filters" -description: "Test applying search filters to modify results" -enabled: true - -target: - url: "https://www.w3schools.com/howto/howto_js_filter_lists.asp" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Type \"Anna\" in the search filter to filter the list" - reasoning: "Testing search filter application" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the search filter input" - - "Typed \"Anna\" in the filter field" - - "List items filtered to show only matching results" - - "Non-matching items were hidden or removed from view" - - "Filter functionality worked as expected" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify search input contains \"Anna\"" - - "Check if list shows only items containing \"Anna\"" - - "Confirm non-matching items are not visible" - - "Ensure filter functionality reduced the visible list items" - -metadata: - tags: ["action", "filter", "search", "list", "dynamic"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-form-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-form-001.yaml deleted file mode 100644 index 61d036f..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-form-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Form fill action test -id: "action-agent-form-001" -name: "Fill Search Query" -description: "Test filling a search input field with specific text" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 45000 - -input: - objective: "Fill the search box with \"Chrome DevTools automation testing\"" - reasoning: "Testing form input capability with a specific search query" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Successfully identified the search input field" - - "Used 
perform_action with fill method" - - "Correctly filled the field with the specified text" - - "Verified the field accepted the input" - - "No formatting or encoding issues with the text" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to confirm text was entered in the search field" - - "Verify the exact text \"Chrome DevTools automation testing\" is visible" - - "Check if search suggestions or autocomplete dropdown appeared" - - "Ensure no input validation errors are shown" - -metadata: - tags: ["action", "form-fill", "input", "google", "basic"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-hover-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-hover-001.yaml deleted file mode 100644 index ed98fbf..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-hover-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Hover action test -id: "action-agent-hover-001" -name: "Hover to Reveal Menu" -description: "Test hovering over an element to reveal hidden content" -enabled: true - -target: - url: "https://the-internet.herokuapp.com/hovers" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Hover over the first user avatar image to reveal the hidden caption" - reasoning: "Testing hover interaction to reveal dynamic content" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the first user avatar image" - - "Used appropriate hover action method" - - "Successfully triggered the hover state" - - "Hidden caption became visible after hover" - - "Handled mouse interaction correctly" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify hover revealed hidden content" - - "Check that caption or overlay appeared over the first avatar" - - "Confirm the hover state is visually active on the image" - - "Verify user information or caption text is now visible" - -metadata: - tags: ["action", "hover", "mouse", "dynamic", "reveal"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-keyboard-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-keyboard-001.yaml deleted file mode 100644 index 6bfceac..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-keyboard-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Keyboard tab navigation test -id: "action-agent-keyboard-001" -name: "Keyboard Tab Navigation" -description: "Test using keyboard navigation to move between elements" -enabled: true - -target: - url: "https://www.w3.org/WAI/ARIA/apg/patterns/menubar/examples/menubar-navigation/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Use Tab key to navigate between menu items and Enter to activate" - reasoning: "Testing keyboard-only navigation patterns" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Successfully used keyboard navigation" - - "Tab key moved focus between menu items" - - "Focus indicators were visible during navigation" - - "Enter key activated the focused menu item" - - "Keyboard navigation followed accessibility standards" 
- visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify focus indicators are visible on menu items" - - "Check if keyboard navigation moved focus correctly" - - "Confirm Enter key activated the focused item" - - "Ensure accessibility navigation patterns worked" - -metadata: - tags: ["action", "keyboard", "navigation", "accessibility", "focus"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-login-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-login-001.yaml deleted file mode 100644 index 1b705ce..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-login-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Login form test -id: "action-agent-login-001" -name: "Fill Login Credentials" -description: "Test filling username and password fields in a login form" -enabled: true - -target: - url: "https://the-internet.herokuapp.com/login" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Fill the username field with \"tomsmith\" and password field with \"SuperSecretPassword!\"" - reasoning: "Testing form fill with multiple fields including password type" - input_data: "tomsmithSuperSecretPassword!" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Identified both username and password fields" - - "Filled username field with correct value" - - "Filled password field with correct value" - - "Handled password field type appropriately" - - "Used the provided input_data XML format correctly" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the username field shows \"tomsmith\" entered" - - "Confirm the password field has dots/asterisks indicating password entry" - - "Check that both fields are properly filled before submission" - - "Ensure no validation errors are shown for the filled fields" - -metadata: - tags: ["action", "login", "form-fill", "authentication", "multi-field"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-modal-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-modal-001.yaml deleted file mode 100644 index 1324fee..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-modal-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Modal dialog test -id: "action-agent-modal-001" -name: "Open and Close Modal" -description: "Test opening modal dialog and closing it with X button" -enabled: true - -target: - url: "https://getbootstrap.com/docs/5.0/components/modal/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click to open the modal dialog, then close it using the X button" - reasoning: "Testing modal dialog interaction patterns" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located and clicked the modal trigger button" - - "Modal dialog opened successfully" - - "Modal content was visible and accessible" - - "Found and clicked the close (X) button" - - "Modal closed and page returned to normal state" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify modal opened with visible content" - - "Check if 
modal overlay appeared correctly" - - "Confirm modal was closed after clicking X" - - "Ensure page background is accessible again" - -metadata: - tags: ["action", "modal", "dialog", "popup", "overlay"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-multiselect-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-multiselect-001.yaml deleted file mode 100644 index fed3f78..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-multiselect-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Multi-select dropdown test -id: "action-agent-multiselect-001" -name: "Select Multiple Options" -description: "Test selecting multiple options from a multi-select dropdown" -enabled: true - -target: - url: "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_select_multiple" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Select both \"Volvo\" and \"Audi\" from the multi-select dropdown" - reasoning: "Testing multiple selection in select elements" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the multi-select dropdown element" - - "Successfully selected Volvo option" - - "Successfully selected Audi option" - - "Both options remain selected simultaneously" - - "Used appropriate multi-select interaction method" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify both Volvo and Audi appear selected" - - "Check if both options are highlighted/marked" - - "Confirm multi-select functionality worked correctly" - - "Ensure no other options were accidentally selected" - -metadata: - tags: ["action", "multi-select", "dropdown", "form", "multiple"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-multistep-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-multistep-001.yaml deleted file mode 100644 index 31514dd..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-multistep-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Multi-step form test -id: "action-agent-multistep-001" -name: "Complete Search and Submit" -description: "Test filling a search form and then clicking the submit button" -enabled: true - -target: - url: "https://www.bing.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Fill the search box with \"automated testing tools\" and then click the search button" - reasoning: "Testing multi-step form interaction combining fill and click actions" - hint: "This requires two actions: first fill the search field, then click the search button" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Recognized this requires multiple actions" - - "First filled the search input correctly" - - "Then located and clicked the search button" - - "Both actions completed successfully in sequence" - - "Search was initiated with the correct query" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the search input contains \"automated testing tools\" text" - - "Confirm the search was submitted and results page loaded" - - "Check that search results are related to the 
query" - - "Ensure the multi-step action completed fully with both fill and click" - -metadata: - tags: ["action", "multi-step", "form-fill", "click", "bing", "search"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-nav-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-nav-001.yaml deleted file mode 100644 index f49a0cf..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-nav-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Complex navigation test -id: "action-agent-nav-001" -name: "Navigate via Menu Click" -description: "Test clicking navigation menu items to navigate between pages" -enabled: true - -target: - url: "https://www.wikipedia.org" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click on the \"English\" language link to navigate to English Wikipedia" - reasoning: "Testing navigation through link clicks on a multilingual site" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Identified the correct language link among many options" - - "Successfully clicked the English link" - - "Navigation occurred to the English Wikipedia" - - "Used appropriate tools to verify navigation success" - - "Handled the multilingual page structure correctly" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify navigation from Wikipedia homepage to English Wikipedia" - - "Check if the page language and content changed to English" - - "Verify the URL changed to en.wikipedia.org" - - "Confirm the English Wikipedia main page is displayed" - -metadata: - tags: ["action", "navigation", "click", "wikipedia", "multilingual"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-radio-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-radio-001.yaml deleted file mode 100644 index 07d6ef8..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-radio-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Radio button selection test -id: "action-agent-radio-001" -name: "Select Radio Button Option" -description: "Test selecting a specific radio button option using click method" -enabled: true - -target: - url: "https://httpbin.org/forms/post" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 45000 - -input: - objective: "Select the \"Medium\" pizza size from the Pizza Size radio button group" - reasoning: "Testing radio button selection functionality" - hint: "Look for the Medium radio button in the Pizza Size section and click it to select" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Medium radio button in the Pizza Size section" - - "Successfully clicked the Medium radio button" - - "Radio button became selected (checked state)" - - "Other radio buttons in the same group became unselected" - - "Form maintained its structure after radio button selection" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the Medium radio button is now selected (shows filled circle)" - - "Check that other pizza size options (Small, Large) are no longer selected" - - "Confirm the form 
structure remained intact" - - "Ensure the Medium pizza size radio button was specifically targeted" - -metadata: - tags: ["action", "radio", "click", "form", "httpbin"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-slider-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-slider-001.yaml deleted file mode 100644 index c370658..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-slider-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Range slider test -id: "action-agent-slider-001" -name: "Adjust Range Slider" -description: "Test moving slider to set a specific value" -enabled: true - -target: - url: "https://jqueryui.com/slider/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Move the slider to set the value to 75" - reasoning: "Testing slider/range input manipulation" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the slider control element" - - "Successfully moved the slider handle" - - "Set the slider value to approximately 75" - - "Slider position reflects the target value" - - "Any associated display shows the correct value" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify slider handle moved to represent value 75" - - "Check if value display shows 75 or close to it" - - "Confirm slider position visually matches target" - - "Ensure slider interaction was smooth and successful" - -metadata: - tags: ["action", "slider", "range", "form", "drag"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-tableselect-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-tableselect-001.yaml deleted file mode 100644 index d78e66c..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-tableselect-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Table row selection test -id: "action-agent-tableselect-001" -name: "Select Table Row" -description: "Test clicking to select a table row" -enabled: true - -target: - url: "https://datatables.net/examples/api/select_single_row.html" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click on the first row to select it" - reasoning: "Testing table row selection patterns" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the first table row" - - "Successfully clicked the row" - - "Row became highlighted/selected" - - "Selection state is visually apparent" - - "Only one row is selected at a time" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the first row is now highlighted/selected" - - "Check if row selection visual feedback is clear" - - "Confirm only the clicked row is selected" - - "Ensure row selection styling is properly applied" - -metadata: - tags: ["action", "table", "select", "row", "highlight"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-tablesort-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-tablesort-001.yaml deleted file mode 100644 
index e3e3176..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-tablesort-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Table column sorting test -id: "action-agent-tablesort-001" -name: "Sort Table Column" -description: "Test clicking table column header to sort data" -enabled: true - -target: - url: "https://datatables.net/examples/basic_init/zero_configuration.html" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click on the \"Name\" column header to sort the table by name" - reasoning: "Testing table column sorting interaction" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Name column header" - - "Successfully clicked the column header" - - "Table data reordered by name alphabetically" - - "Sort indicator appeared on the Name column" - - "Table sorting completed without errors" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify table rows are now sorted alphabetically by name" - - "Check if sort arrow/indicator appears on Name column" - - "Confirm the data order changed from before to after" - - "Ensure table structure remained intact after sorting" - -metadata: - tags: ["action", "table", "sort", "column", "data"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-tabs-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-tabs-001.yaml deleted file mode 100644 index 22db60c..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-tabs-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Tab panel navigation test -id: "action-agent-tabs-001" -name: "Navigate Tab Panels" -description: "Test clicking tab to switch between tab panels" -enabled: true - -target: - url: "https://jqueryui.com/tabs/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click on the \"Nunc tincidunt\" tab to switch to that panel" - reasoning: "Testing tab panel navigation" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the \"Nunc tincidunt\" tab button" - - "Successfully clicked the tab" - - "Tab panel content switched to the selected tab" - - "Active tab visual state changed appropriately" - - "Content area updated to show the new panel" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the \"Nunc tincidunt\" tab is now active/highlighted" - - "Check if the content panel changed to show new content" - - "Confirm the tab switching animation completed" - - "Ensure the correct tab content is visible" - -metadata: - tags: ["action", "tabs", "navigation", "panels", "ui"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-timepicker-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-timepicker-001.yaml deleted file mode 100644 index 056fbe9..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-timepicker-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Time picker test -id: "action-agent-timepicker-001" -name: "Select Time from Picker" -description: "Test setting time using time picker controls" -enabled: true - -target: - url: "https://timepicker.co/" - 
wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Set the time to 2:30 PM using the time picker controls" - reasoning: "Testing time selection with hour/minute controls" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the time picker interface" - - "Set the hour to 2 (14 for 24-hour format)" - - "Set the minutes to 30" - - "Selected PM or appropriate time format" - - "Time input shows 2:30 PM or equivalent" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the time input displays 2:30 PM or 14:30" - - "Check if hour and minute were set correctly" - - "Confirm AM/PM selection if applicable" - - "Ensure the time picker interface was properly used" - -metadata: - tags: ["action", "timepicker", "time", "form", "clock"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-upload-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-upload-001.yaml deleted file mode 100644 index 518515d..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-upload-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# File upload test -id: "action-agent-upload-001" -name: "Upload File via Input" -description: "Test clicking file input and uploading a test file" -enabled: true - -target: - url: "https://the-internet.herokuapp.com/upload" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click the file input and upload a test file" - reasoning: "Testing file upload interaction through input elements" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the file input element" - - "Triggered file selection dialog" - - "Selected a file for upload" - - "File name appears in the input field" - - "Upload process initiated successfully" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify file name appears in the upload input field" - - "Check if file selection was successful" - - "Confirm upload button is available or file is ready" - - "Ensure no upload errors are displayed" - -metadata: - tags: ["action", "upload", "file", "input", "form"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-video-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-video-001.yaml deleted file mode 100644 index ba21b28..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-video-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Video playback controls test -id: "action-agent-video-001" -name: "Control Video Playback" -description: "Test starting video playback using click + spacebar" -enabled: true - -target: - url: "https://www.w3schools.com/html/html5_video.asp" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 90000 - -input: - objective: "Click the video element to focus it, then press spacebar to start playback" - reasoning: "Testing video control using standard keyboard interaction (click to focus + spacebar to play)" - hint: "First click the Video element to focus it, then use keyboard input to press the spacebar key to start playback" - -validation: - type: "llm-judge" 
- llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Video element in the accessibility tree" - - "Successfully clicked the Video element to focus it" - - "Used keyboard input to press spacebar" - - "Video playback started after spacebar press" - - "No errors occurred during the interaction sequence" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify video player is visible on the page" - - "Check if the play button was clicked (may show pause button after)" - - "Look for visual indicators that video started playing" - - "Ensure no error messages appeared during video interaction" - -metadata: - tags: ["action", "video", "media", "controls", "playback"] - priority: "high" - timeout: 90000 - retries: 3 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-video-002.yaml b/eval-server/nodejs/evals/action-agent/action-agent-video-002.yaml deleted file mode 100644 index d7188ec..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-video-002.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Video play button specific targeting test -id: "action-agent-video-002" -name: "Click Video Play Button Specifically" -description: "Test clicking the specific play button (not the video element)" -enabled: true - -target: - url: "https://www.w3schools.com/html/html5_video.asp" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Find and click the button that has name=\"play\" (not the Video element itself)" - reasoning: "Testing specific targeting of the play button element" - hint: "Target the button element with text or label \"play\", do not click the Video element" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Found a button element (not Video element) with \"play\" in the name" - - "Successfully clicked the play button specifically" - - "Did not click on the Video element itself" - - "Play button click was executed correctly" - - "Video responded to the button click" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the play button (not video element) was clicked" - - "Check if video started playing after button click" - - "Confirm the target was the button, not the video container" - - "Look for changes in video player state" - -metadata: - tags: ["action", "video", "button", "specific-targeting"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/autocomplete-001.yaml b/eval-server/nodejs/evals/action-agent/autocomplete-001.yaml deleted file mode 100644 index 4bd4aa8..0000000 --- a/eval-server/nodejs/evals/action-agent/autocomplete-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Autocomplete search test -id: "autocomplete-001" -name: "Use Autocomplete Search" -description: "Test typing in autocomplete field and selecting from suggestions" -enabled: true - -target: - url: "https://jqueryui.com/autocomplete/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Type \"Java\" in the autocomplete field and select \"JavaScript\" from suggestions" - reasoning: "Testing autocomplete/typeahead interaction patterns" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 
- criteria: - - "Located the autocomplete input field" - - "Typed \"Java\" to trigger suggestions" - - "Autocomplete dropdown appeared with suggestions" - - "Selected \"JavaScript\" from the suggestion list" - - "Input field shows the selected value" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify \"JavaScript\" appears in the input field" - - "Check if autocomplete suggestions appeared" - - "Confirm the correct suggestion was selected" - - "Ensure dropdown closed after selection" - -metadata: - tags: ["action", "autocomplete", "typeahead", "search", "suggestions"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/checkbox-001.yaml b/eval-server/nodejs/evals/action-agent/checkbox-001.yaml deleted file mode 100644 index 041f2f6..0000000 --- a/eval-server/nodejs/evals/action-agent/checkbox-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Checkbox/radio button test -id: "checkbox-001" -name: "Toggle Newsletter Checkbox" -description: "Test clicking checkbox elements for form options" -enabled: true - -target: - url: "https://www.w3schools.com/html/tryit.asp?filename=tryhtml_checkbox" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 45000 - -input: - objective: "Click the checkbox labeled \"I have a bike\" to check it" - reasoning: "Testing interaction with checkbox form elements" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Identified the correct checkbox among multiple options" - - "Used click action on the checkbox element" - - "Checkbox state changed from unchecked to checked" - - "Handled the iframe structure if present" - - "No errors with form element interaction" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify the checkbox state changed from unchecked to checked" - - "Confirm the \"I have a bike\" checkbox now shows a checkmark" - - "Verify the checkbox visual indicator (checkmark) is clearly visible" - - "Ensure no other checkboxes were accidentally modified" - -metadata: - tags: ["action", "checkbox", "form", "w3schools", "input"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/checkbox-002.yaml b/eval-server/nodejs/evals/action-agent/checkbox-002.yaml deleted file mode 100644 index 036f388..0000000 --- a/eval-server/nodejs/evals/action-agent/checkbox-002.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Toggle checkbox test - using HTML form test site -id: "checkbox-002" -name: "Check Extra Cheese Checkbox" -description: "Test checking a specific checkbox using the check method" -enabled: true - -target: - url: "https://httpbin.org/forms/post" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 45000 - -input: - objective: "Find and check the \"Extra Cheese\" checkbox in the Pizza Toppings section" - reasoning: "Testing checkbox interaction functionality using check method" - hint: "Look for the Extra Cheese checkbox and use the check method to select it" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Extra Cheese checkbox in the Pizza Toppings section" - - "Used the check method instead of click for better reliability" - - "Checkbox 
became checked (if it wasn't already)" - - "No errors occurred during checkbox interaction" - - "Form maintained its structure after checkbox selection" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the Extra Cheese checkbox is now checked (shows checkmark)" - - "Check that the checkbox shows proper visual feedback for checked state" - - "Confirm the form structure remained intact" - - "Ensure the checkbox for Extra Cheese was specifically targeted and checked" - -metadata: - tags: ["action", "checkbox", "check", "form", "httpbin"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/click-001.yaml b/eval-server/nodejs/evals/action-agent/click-001.yaml deleted file mode 100644 index e86c8fd..0000000 --- a/eval-server/nodejs/evals/action-agent/click-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Basic search interaction test -id: "click-001" -name: "Search with Text Entry and Click" -description: "Test entering text in search field and clicking search button" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 90000 - -input: - objective: "Type \"DevTools automation\" in the search box and then click the \"Google Search\" button" - reasoning: "Testing multi-step interaction: text input followed by button click" - hint: "First fill the search input field, then find and click the search button" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Successfully located the search input field" - - "Entered \"DevTools automation\" text in the search box" - - "Located the Google Search button after entering text" - - "Successfully clicked the search button" - - "Search was executed and results page loaded" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify text \"DevTools automation\" was entered in the search field" - - "Check if search results page loaded with relevant results" - - "Confirm the search was executed (URL changed to results page)" - - "Ensure search results are related to \"DevTools automation\"" - -metadata: - tags: ["action", "multi-step", "search", "form-fill", "click", "google", "basic"] - priority: "high" - timeout: 90000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/context-001.yaml b/eval-server/nodejs/evals/action-agent/context-001.yaml deleted file mode 100644 index 0ca7c58..0000000 --- a/eval-server/nodejs/evals/action-agent/context-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Right click context menu test -id: "context-001" -name: "Right Click Context Menu" -description: "Test right-clicking to open context menu" -enabled: true - -target: - url: "https://the-internet.herokuapp.com/context_menu" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Right-click on the context menu area to open the context menu" - reasoning: "Testing right-click context menu interaction" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the designated context menu area" - - "Performed right-click action correctly" - - "Context menu appeared with options" - - "Successfully triggered the right-click event" - - "Alert or 
confirmation appeared as expected" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify right-click was performed on correct area" - - "Check if context menu or alert appeared" - - "Confirm right-click event was properly triggered" - - "Ensure the expected response occurred" - -metadata: - tags: ["action", "context-menu", "right-click", "mouse", "menu"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/datepicker-001.yaml b/eval-server/nodejs/evals/action-agent/datepicker-001.yaml deleted file mode 100644 index 9b6a9df..0000000 --- a/eval-server/nodejs/evals/action-agent/datepicker-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Date picker test -id: "datepicker-001" -name: "Select Date from Calendar" -description: "Test clicking date input and selecting a specific date from calendar popup" -enabled: true - -target: - url: "https://jqueryui.com/datepicker/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click the date input field and select March 15, 2024 from the calendar picker" - reasoning: "Testing interaction with calendar popup widgets" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located and clicked the date input field" - - "Calendar popup opened successfully" - - "Navigated to correct month/year if needed" - - "Selected the specific date (March 15, 2024)" - - "Date input field shows the selected date" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the date input field contains the selected date" - - "Check if the calendar widget opened and closed properly" - - "Confirm the correct date was highlighted and selected" - - "Ensure the date format matches expected output" - -metadata: - tags: ["action", "datepicker", "calendar", "form", "popup"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/daterange-001.yaml b/eval-server/nodejs/evals/action-agent/daterange-001.yaml deleted file mode 100644 index a9b202b..0000000 --- a/eval-server/nodejs/evals/action-agent/daterange-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Date range picker test -id: "daterange-001" -name: "Select Date Range" -description: "Test selecting a date range with start and end dates" -enabled: true - -target: - url: "https://www.daterangepicker.com/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Select a date range from February 1, 2024 to February 28, 2024" - reasoning: "Testing complex date range selection with start and end dates" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Opened the date range picker interface" - - "Selected the start date (February 1, 2024)" - - "Selected the end date (February 28, 2024)" - - "Date range was properly applied" - - "Input field shows the complete date range" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify both start and end dates are displayed in the input" - - "Check if the date range picker shows the selected range" - - "Confirm the format matches expected date range display" - - "Ensure both dates were selected in sequence" - 
-metadata: - tags: ["action", "daterange", "date-picker", "form", "complex"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/dropdown-001.yaml b/eval-server/nodejs/evals/action-agent/dropdown-001.yaml deleted file mode 100644 index a64edb0..0000000 --- a/eval-server/nodejs/evals/action-agent/dropdown-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Dropdown selection test -id: "dropdown-001" -name: "Select Dropdown Option" -description: "Test selecting an option from a dropdown menu" -enabled: true - -target: - url: "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_select" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 45000 - -input: - objective: "Select \"Audi\" from the car brands dropdown menu" - reasoning: "Testing dropdown selection interaction" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the dropdown/select element" - - "Identified the correct option to select" - - "Successfully selected the Audi option" - - "Dropdown value changed to the selected option" - - "Handled select element interaction properly" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify the dropdown selection changed" - - "Confirm \"Audi\" is now displayed as the selected option" - - "Check if the dropdown is closed after selection" - - "Verify no other form elements were affected by the selection" - -metadata: - tags: ["action", "dropdown", "select", "form", "w3schools"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/dynamic-001.yaml b/eval-server/nodejs/evals/action-agent/dynamic-001.yaml deleted file mode 100644 index fba60bd..0000000 --- a/eval-server/nodejs/evals/action-agent/dynamic-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Dynamic content interaction test -id: "dynamic-001" -name: "Click Dynamic Load Button" -description: "Test clicking a button that loads dynamic content" -enabled: true - -target: - url: "https://the-internet.herokuapp.com/dynamic_loading/1" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 90000 - -input: - objective: "Click the \"Start\" button to trigger dynamic content loading" - reasoning: "Testing interaction with dynamically loaded content" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Found and clicked the Start button" - - "Handled the dynamic loading process" - - "Recognized that content changes after clicking" - - "No timing issues with the dynamic content" - - "Successfully triggered the loading animation" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify dynamic content loaded after clicking Start" - - "Check if loading animation or spinner was displayed" - - "Confirm new content appeared that was previously hidden" - - "Verify the Start button state changed or was replaced after clicking" - -metadata: - tags: ["action", "dynamic", "click", "ajax", "loading"] - priority: "high" - timeout: 90000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/ecommerce-001.yaml 
b/eval-server/nodejs/evals/action-agent/ecommerce-001.yaml deleted file mode 100644 index ae573de..0000000 --- a/eval-server/nodejs/evals/action-agent/ecommerce-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# E-commerce action test -id: "ecommerce-001" -name: "Add Product to Cart" -description: "Test clicking \"Add to Cart\" button on an e-commerce product page" -enabled: true - -target: - url: "https://www.homedepot.com/p/Husky-20-Gal-Professional-Duty-Waterproof-Storage-Container-with-Hinged-Lid-in-Red-249160/313799634" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 180000 - -input: - objective: "Click the \"Add to Cart\" button for this storage container" - reasoning: "Testing e-commerce interaction with product cart functionality" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Add to Cart button on the product page" - - "Successfully clicked the button" - - "Handled any popups or confirmations that appeared" - - "Verified the item was added (cart count changed or confirmation shown)" - - "Dealt with page dynamics after clicking" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify the Add to Cart button was clicked" - - "Check if cart count indicator increased or shows the item was added" - - "Look for any confirmation popup or notification about the item being added" - - "Verify the button state changed (e.g., to \"Added to Cart\" or disabled)" - -metadata: - tags: ["action", "ecommerce", "click", "homedepot", "cart"] - priority: "high" - timeout: 180000 - retries: 3 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/error-001.yaml b/eval-server/nodejs/evals/action-agent/error-001.yaml deleted file mode 100644 index a2b5646..0000000 --- a/eval-server/nodejs/evals/action-agent/error-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Error recovery test -id: "error-001" -name: "Handle Missing Element" -description: "Test agent behavior when target element is not found" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click the \"Sign Up\" button" - reasoning: "Testing error handling when element does not exist" - hint: "There is no Sign Up button on Google homepage - agent should handle gracefully" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Attempted to find the requested element" - - "Recognized that the element does not exist" - - "Provided clear error message or explanation" - - "Did not crash or produce confusing output" - - "Suggested alternatives or explained the issue" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the page remains in a stable state despite the missing element" - - "Confirm no error dialogs or broken UI elements appeared" - - "Check that the agent handled the missing element gracefully" - - "Ensure the page was properly analyzed even though the target was not found" - -metadata: - tags: ["action", "error-handling", "missing-element", "recovery", "edge-case"] - priority: "high" - timeout: 60000 - retries: 1 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/filter-001.yaml 
b/eval-server/nodejs/evals/action-agent/filter-001.yaml deleted file mode 100644 index 7efa8f1..0000000 --- a/eval-server/nodejs/evals/action-agent/filter-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Search filter application test -id: "filter-001" -name: "Apply Search Filters" -description: "Test applying search filters to modify results" -enabled: true - -target: - url: "https://www.w3schools.com/howto/howto_js_filter_lists.asp" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Type \"Anna\" in the search filter to filter the list" - reasoning: "Testing search filter application" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the search filter input" - - "Typed \"Anna\" in the filter field" - - "List items filtered to show only matching results" - - "Non-matching items were hidden or removed from view" - - "Filter functionality worked as expected" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify search input contains \"Anna\"" - - "Check if list shows only items containing \"Anna\"" - - "Confirm non-matching items are not visible" - - "Ensure filter functionality reduced the visible list items" - -metadata: - tags: ["action", "filter", "search", "list", "dynamic"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/form-001.yaml b/eval-server/nodejs/evals/action-agent/form-001.yaml deleted file mode 100644 index c4f06da..0000000 --- a/eval-server/nodejs/evals/action-agent/form-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Form fill action test -id: "form-001" -name: "Fill Search Query" -description: "Test filling a search input field with specific text" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 45000 - -input: - objective: "Fill the search box with \"Chrome DevTools automation testing\"" - reasoning: "Testing form input capability with a specific search query" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Successfully identified the search input field" - - "Used perform_action with fill method" - - "Correctly filled the field with the specified text" - - "Verified the field accepted the input" - - "No formatting or encoding issues with the text" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to confirm text was entered in the search field" - - "Verify the exact text \"Chrome DevTools automation testing\" is visible" - - "Check if search suggestions or autocomplete dropdown appeared" - - "Ensure no input validation errors are shown" - -metadata: - tags: ["action", "form-fill", "input", "google", "basic"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/hover-001.yaml b/eval-server/nodejs/evals/action-agent/hover-001.yaml deleted file mode 100644 index a58b225..0000000 --- a/eval-server/nodejs/evals/action-agent/hover-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Hover action test -id: "hover-001" -name: "Hover to Reveal Menu" -description: "Test hovering over an element to reveal hidden content" -enabled: true - -target: - url: 
"https://the-internet.herokuapp.com/hovers" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Hover over the first user avatar image to reveal the hidden caption" - reasoning: "Testing hover interaction to reveal dynamic content" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the first user avatar image" - - "Used appropriate hover action method" - - "Successfully triggered the hover state" - - "Hidden caption became visible after hover" - - "Handled mouse interaction correctly" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify hover revealed hidden content" - - "Check that caption or overlay appeared over the first avatar" - - "Confirm the hover state is visually active on the image" - - "Verify user information or caption text is now visible" - -metadata: - tags: ["action", "hover", "mouse", "dynamic", "reveal"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/keyboard-001.yaml b/eval-server/nodejs/evals/action-agent/keyboard-001.yaml deleted file mode 100644 index 6a1ffd1..0000000 --- a/eval-server/nodejs/evals/action-agent/keyboard-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Keyboard tab navigation test -id: "keyboard-001" -name: "Keyboard Tab Navigation" -description: "Test using keyboard navigation to move between elements" -enabled: true - -target: - url: "https://www.w3.org/WAI/ARIA/apg/patterns/menubar/examples/menubar-navigation/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Use Tab key to navigate between menu items and Enter to activate" - reasoning: "Testing keyboard-only navigation patterns" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Successfully used keyboard navigation" - - "Tab key moved focus between menu items" - - "Focus indicators were visible during navigation" - - "Enter key activated the focused menu item" - - "Keyboard navigation followed accessibility standards" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify focus indicators are visible on menu items" - - "Check if keyboard navigation moved focus correctly" - - "Confirm Enter key activated the focused item" - - "Ensure accessibility navigation patterns worked" - -metadata: - tags: ["action", "keyboard", "navigation", "accessibility", "focus"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/login-001.yaml b/eval-server/nodejs/evals/action-agent/login-001.yaml deleted file mode 100644 index b56fbca..0000000 --- a/eval-server/nodejs/evals/action-agent/login-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Login form test -id: "login-001" -name: "Fill Login Credentials" -description: "Test filling username and password fields in a login form" -enabled: true - -target: - url: "https://the-internet.herokuapp.com/login" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Fill the username field with \"tomsmith\" and password field with \"SuperSecretPassword!\"" - reasoning: "Testing form fill with multiple fields including password type" - 
input_data: "tomsmithSuperSecretPassword!" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Identified both username and password fields" - - "Filled username field with correct value" - - "Filled password field with correct value" - - "Handled password field type appropriately" - - "Used the provided input_data XML format correctly" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the username field shows \"tomsmith\" entered" - - "Confirm the password field has dots/asterisks indicating password entry" - - "Check that both fields are properly filled before submission" - - "Ensure no validation errors are shown for the filled fields" - -metadata: - tags: ["action", "login", "form-fill", "authentication", "multi-field"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/modal-001.yaml b/eval-server/nodejs/evals/action-agent/modal-001.yaml deleted file mode 100644 index ef05d16..0000000 --- a/eval-server/nodejs/evals/action-agent/modal-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Modal dialog test -id: "modal-001" -name: "Open and Close Modal" -description: "Test opening modal dialog and closing it with X button" -enabled: true - -target: - url: "https://getbootstrap.com/docs/5.0/components/modal/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click to open the modal dialog, then close it using the X button" - reasoning: "Testing modal dialog interaction patterns" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located and clicked the modal trigger button" - - "Modal dialog opened successfully" - - "Modal content was visible and accessible" - - "Found and clicked the close (X) button" - - "Modal closed and page returned to normal state" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify modal opened with visible content" - - "Check if modal overlay appeared correctly" - - "Confirm modal was closed after clicking X" - - "Ensure page background is accessible again" - -metadata: - tags: ["action", "modal", "dialog", "popup", "overlay"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/multiselect-001.yaml b/eval-server/nodejs/evals/action-agent/multiselect-001.yaml deleted file mode 100644 index a456c9b..0000000 --- a/eval-server/nodejs/evals/action-agent/multiselect-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Multi-select dropdown test -id: "multiselect-001" -name: "Select Multiple Options" -description: "Test selecting multiple options from a multi-select dropdown" -enabled: true - -target: - url: "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_select_multiple" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Select both \"Volvo\" and \"Audi\" from the multi-select dropdown" - reasoning: "Testing multiple selection in select elements" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the multi-select dropdown element" - - "Successfully selected Volvo option" - - "Successfully selected Audi option" - - "Both options remain selected 
simultaneously" - - "Used appropriate multi-select interaction method" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify both Volvo and Audi appear selected" - - "Check if both options are highlighted/marked" - - "Confirm multi-select functionality worked correctly" - - "Ensure no other options were accidentally selected" - -metadata: - tags: ["action", "multi-select", "dropdown", "form", "multiple"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/multistep-001.yaml b/eval-server/nodejs/evals/action-agent/multistep-001.yaml deleted file mode 100644 index 14923a2..0000000 --- a/eval-server/nodejs/evals/action-agent/multistep-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Multi-step form test -id: "multistep-001" -name: "Complete Search and Submit" -description: "Test filling a search form and then clicking the submit button" -enabled: true - -target: - url: "https://www.bing.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Fill the search box with \"automated testing tools\" and then click the search button" - reasoning: "Testing multi-step form interaction combining fill and click actions" - hint: "This requires two actions: first fill the search field, then click the search button" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Recognized this requires multiple actions" - - "First filled the search input correctly" - - "Then located and clicked the search button" - - "Both actions completed successfully in sequence" - - "Search was initiated with the correct query" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the search input contains \"automated testing tools\" text" - - "Confirm the search was submitted and results page loaded" - - "Check that search results are related to the query" - - "Ensure the multi-step action completed fully with both fill and click" - -metadata: - tags: ["action", "multi-step", "form-fill", "click", "bing", "search"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/nav-001.yaml b/eval-server/nodejs/evals/action-agent/nav-001.yaml deleted file mode 100644 index e1ef610..0000000 --- a/eval-server/nodejs/evals/action-agent/nav-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Complex navigation test -id: "nav-001" -name: "Navigate via Menu Click" -description: "Test clicking navigation menu items to navigate between pages" -enabled: true - -target: - url: "https://www.wikipedia.org" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click on the \"English\" language link to navigate to English Wikipedia" - reasoning: "Testing navigation through link clicks on a multilingual site" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Identified the correct language link among many options" - - "Successfully clicked the English link" - - "Navigation occurred to the English Wikipedia" - - "Used appropriate tools to verify navigation success" - - "Handled the multilingual page structure correctly" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: 
- - "Compare screenshots to verify navigation from Wikipedia homepage to English Wikipedia" - - "Check if the page language and content changed to English" - - "Verify the URL changed to en.wikipedia.org" - - "Confirm the English Wikipedia main page is displayed" - -metadata: - tags: ["action", "navigation", "click", "wikipedia", "multilingual"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/radio-001.yaml b/eval-server/nodejs/evals/action-agent/radio-001.yaml deleted file mode 100644 index a136e1e..0000000 --- a/eval-server/nodejs/evals/action-agent/radio-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Radio button selection test -id: "radio-001" -name: "Select Radio Button Option" -description: "Test selecting a specific radio button option using click method" -enabled: true - -target: - url: "https://httpbin.org/forms/post" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 45000 - -input: - objective: "Select the \"Medium\" pizza size from the Pizza Size radio button group" - reasoning: "Testing radio button selection functionality" - hint: "Look for the Medium radio button in the Pizza Size section and click it to select" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Medium radio button in the Pizza Size section" - - "Successfully clicked the Medium radio button" - - "Radio button became selected (checked state)" - - "Other radio buttons in the same group became unselected" - - "Form maintained its structure after radio button selection" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the Medium radio button is now selected (shows filled circle)" - - "Check that other pizza size options (Small, Large) are no longer selected" - - "Confirm the form structure remained intact" - - "Ensure the Medium pizza size radio button was specifically targeted" - -metadata: - tags: ["action", "radio", "click", "form", "httpbin"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/slider-001.yaml b/eval-server/nodejs/evals/action-agent/slider-001.yaml deleted file mode 100644 index 9369671..0000000 --- a/eval-server/nodejs/evals/action-agent/slider-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Range slider test -id: "slider-001" -name: "Adjust Range Slider" -description: "Test moving slider to set a specific value" -enabled: true - -target: - url: "https://jqueryui.com/slider/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Move the slider to set the value to 75" - reasoning: "Testing slider/range input manipulation" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the slider control element" - - "Successfully moved the slider handle" - - "Set the slider value to approximately 75" - - "Slider position reflects the target value" - - "Any associated display shows the correct value" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify slider handle moved to represent value 75" - - "Check if value display shows 75 or close to it" - - "Confirm slider position visually matches target" - - "Ensure slider interaction was smooth and successful" 
- -metadata: - tags: ["action", "slider", "range", "form", "drag"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/tableselect-001.yaml b/eval-server/nodejs/evals/action-agent/tableselect-001.yaml deleted file mode 100644 index b38341e..0000000 --- a/eval-server/nodejs/evals/action-agent/tableselect-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Table row selection test -id: "tableselect-001" -name: "Select Table Row" -description: "Test clicking to select a table row" -enabled: true - -target: - url: "https://datatables.net/examples/api/select_single_row.html" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click on the first row to select it" - reasoning: "Testing table row selection patterns" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the first table row" - - "Successfully clicked the row" - - "Row became highlighted/selected" - - "Selection state is visually apparent" - - "Only one row is selected at a time" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the first row is now highlighted/selected" - - "Check if row selection visual feedback is clear" - - "Confirm only the clicked row is selected" - - "Ensure row selection styling is properly applied" - -metadata: - tags: ["action", "table", "select", "row", "highlight"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/tablesort-001.yaml b/eval-server/nodejs/evals/action-agent/tablesort-001.yaml deleted file mode 100644 index 32695c7..0000000 --- a/eval-server/nodejs/evals/action-agent/tablesort-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Table column sorting test -id: "tablesort-001" -name: "Sort Table Column" -description: "Test clicking table column header to sort data" -enabled: true - -target: - url: "https://datatables.net/examples/basic_init/zero_configuration.html" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click on the \"Name\" column header to sort the table by name" - reasoning: "Testing table column sorting interaction" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Name column header" - - "Successfully clicked the column header" - - "Table data reordered by name alphabetically" - - "Sort indicator appeared on the Name column" - - "Table sorting completed without errors" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify table rows are now sorted alphabetically by name" - - "Check if sort arrow/indicator appears on Name column" - - "Confirm the data order changed from before to after" - - "Ensure table structure remained intact after sorting" - -metadata: - tags: ["action", "table", "sort", "column", "data"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/tabs-001.yaml b/eval-server/nodejs/evals/action-agent/tabs-001.yaml deleted file mode 100644 index 1079266..0000000 --- a/eval-server/nodejs/evals/action-agent/tabs-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Tab panel navigation test -id: 
"tabs-001" -name: "Navigate Tab Panels" -description: "Test clicking tab to switch between tab panels" -enabled: true - -target: - url: "https://jqueryui.com/tabs/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click on the \"Nunc tincidunt\" tab to switch to that panel" - reasoning: "Testing tab panel navigation" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the \"Nunc tincidunt\" tab button" - - "Successfully clicked the tab" - - "Tab panel content switched to the selected tab" - - "Active tab visual state changed appropriately" - - "Content area updated to show the new panel" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the \"Nunc tincidunt\" tab is now active/highlighted" - - "Check if the content panel changed to show new content" - - "Confirm the tab switching animation completed" - - "Ensure the correct tab content is visible" - -metadata: - tags: ["action", "tabs", "navigation", "panels", "ui"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/timepicker-001.yaml b/eval-server/nodejs/evals/action-agent/timepicker-001.yaml deleted file mode 100644 index cbc5742..0000000 --- a/eval-server/nodejs/evals/action-agent/timepicker-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Time picker test -id: "timepicker-001" -name: "Select Time from Picker" -description: "Test setting time using time picker controls" -enabled: true - -target: - url: "https://timepicker.co/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Set the time to 2:30 PM using the time picker controls" - reasoning: "Testing time selection with hour/minute controls" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the time picker interface" - - "Set the hour to 2 (14 for 24-hour format)" - - "Set the minutes to 30" - - "Selected PM or appropriate time format" - - "Time input shows 2:30 PM or equivalent" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the time input displays 2:30 PM or 14:30" - - "Check if hour and minute were set correctly" - - "Confirm AM/PM selection if applicable" - - "Ensure the time picker interface was properly used" - -metadata: - tags: ["action", "timepicker", "time", "form", "clock"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/upload-001.yaml b/eval-server/nodejs/evals/action-agent/upload-001.yaml deleted file mode 100644 index d5c276c..0000000 --- a/eval-server/nodejs/evals/action-agent/upload-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# File upload test -id: "upload-001" -name: "Upload File via Input" -description: "Test clicking file input and uploading a test file" -enabled: true - -target: - url: "https://the-internet.herokuapp.com/upload" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click the file input and upload a test file" - reasoning: "Testing file upload interaction through input elements" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the file 
input element" - - "Triggered file selection dialog" - - "Selected a file for upload" - - "File name appears in the input field" - - "Upload process initiated successfully" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify file name appears in the upload input field" - - "Check if file selection was successful" - - "Confirm upload button is available or file is ready" - - "Ensure no upload errors are displayed" - -metadata: - tags: ["action", "upload", "file", "input", "form"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/video-001.yaml b/eval-server/nodejs/evals/action-agent/video-001.yaml deleted file mode 100644 index 17c76be..0000000 --- a/eval-server/nodejs/evals/action-agent/video-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Video playback controls test -id: "video-001" -name: "Control Video Playback" -description: "Test starting video playback using click + spacebar" -enabled: true - -target: - url: "https://www.w3schools.com/html/html5_video.asp" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 90000 - -input: - objective: "Click the video element to focus it, then press spacebar to start playback" - reasoning: "Testing video control using standard keyboard interaction (click to focus + spacebar to play)" - hint: "First click the Video element to focus it, then use keyboard input to press the spacebar key to start playback" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Video element in the accessibility tree" - - "Successfully clicked the Video element to focus it" - - "Used keyboard input to press spacebar" - - "Video playback started after spacebar press" - - "No errors occurred during the interaction sequence" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify video player is visible on the page" - - "Check if the play button was clicked (may show pause button after)" - - "Look for visual indicators that video started playing" - - "Ensure no error messages appeared during video interaction" - -metadata: - tags: ["action", "video", "media", "controls", "playback"] - priority: "high" - timeout: 90000 - retries: 3 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/video-002.yaml b/eval-server/nodejs/evals/action-agent/video-002.yaml deleted file mode 100644 index b20014c..0000000 --- a/eval-server/nodejs/evals/action-agent/video-002.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Video play button specific targeting test -id: "video-002" -name: "Click Video Play Button Specifically" -description: "Test clicking the specific play button (not the video element)" -enabled: true - -target: - url: "https://www.w3schools.com/html/html5_video.asp" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Find and click the button that has name=\"play\" (not the Video element itself)" - reasoning: "Testing specific targeting of the play button element" - hint: "Target the button element with text or label \"play\", do not click the Video element" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Found a button element (not Video element) with \"play\" in the name" - - "Successfully clicked 
the play button specifically" - - "Did not click on the Video element itself" - - "Play button click was executed correctly" - - "Video responded to the button click" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the play button (not video element) was clicked" - - "Check if video started playing after button click" - - "Confirm the target was the button, not the video container" - - "Look for changes in video player state" - -metadata: - tags: ["action", "video", "button", "specific-targeting"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/config.yaml b/eval-server/nodejs/evals/config.yaml deleted file mode 100644 index 3968421..0000000 --- a/eval-server/nodejs/evals/config.yaml +++ /dev/null @@ -1,11 +0,0 @@ -# model: -# main_model: "deepseek-r1:14b" -# mini_model: "deepseek-r1:14b" -# nano_model: "deepseek-r1:14b" -# provider: "litellm" - -model: - main_model: "gpt-4.1" - mini_model: "gpt-4.1-mini" - nano_model: "gpt-4.1-nano" - provider: "openai" \ No newline at end of file diff --git a/eval-server/nodejs/evals/end-to-end/b-vitamins-research-001.yaml b/eval-server/nodejs/evals/end-to-end/b-vitamins-research-001.yaml deleted file mode 100644 index 746ead6..0000000 --- a/eval-server/nodejs/evals/end-to-end/b-vitamins-research-001.yaml +++ /dev/null @@ -1,35 +0,0 @@ -# B-Vitamins Research - End-to-End Test -id: "vitamins-research-001" -name: "B-Vitamins Supplementation Research" -description: "End-to-end test for comprehensive B-vitamins research using chat interface" -enabled: true - -tool: "chat" -timeout: 600000 - -input: - message: "Research everything on the supplementation of B-vitamins for adults. I need: types of vitamins, available forms and their advantages, dosage and safety" - reasoning: "End-to-end test validating complete user workflow with dynamic tool usage for health research" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Covers all B-vitamin types (B1, B2, B3, B5, B6, B7, B9, B12) comprehensively" - - "Explains different forms of each vitamin and their advantages" - - "Provides appropriate dosage recommendations for adults" - - "Discusses safety considerations and potential side effects" - - "Information is accurate and from reliable health sources" - - "Response is well-organized and easy to understand" - - "Demonstrates intelligent tool selection for health research" - - "Shows complete workflow from request to comprehensive result" - -metadata: - tags: ["end-to-end", "chat", "health", "vitamins", "research", "user-workflow"] - priority: "medium" - timeout: 300000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/end-to-end/investment-research-001.yaml b/eval-server/nodejs/evals/end-to-end/investment-research-001.yaml deleted file mode 100644 index 72014df..0000000 --- a/eval-server/nodejs/evals/end-to-end/investment-research-001.yaml +++ /dev/null @@ -1,35 +0,0 @@ -# Renewable Energy Stocks Research - End-to-End Test -id: "investment-research-001" -name: "Renewable Energy Stocks Research" -description: "End-to-end test for investment research using chat interface" -enabled: true - -tool: "chat" -timeout: 600000 - -input: - message: "Research renewable energy stocks for potential investment. Focus on solar and wind companies with market cap over $1B." 
- reasoning: "End-to-end test validating financial research workflow with dynamic tool usage" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Identifies specific solar and wind energy companies" - - "Confirms companies have market cap over $1 billion" - - "Provides relevant financial metrics and data" - - "Includes business descriptions and growth prospects" - - "Discusses investment considerations and risks" - - "Information appears current and from reliable sources" - - "Demonstrates intelligent financial research tool usage" - - "Shows complete workflow from request to investment analysis" - -metadata: - tags: ["end-to-end", "chat", "investment", "stocks", "renewable-energy", "financial", "user-workflow"] - priority: "medium" - timeout: 300000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/end-to-end/product-comparison-001.yaml b/eval-server/nodejs/evals/end-to-end/product-comparison-001.yaml deleted file mode 100644 index 1363a09..0000000 --- a/eval-server/nodejs/evals/end-to-end/product-comparison-001.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# Headphones Comparison - End-to-End Test -id: "product-comparison-001" -name: "Noise-Canceling Headphones Comparison" -description: "End-to-end test for product research and comparison using chat interface" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "chat" -timeout: 300000 - -input: - message: "Compare the top 3 noise-canceling headphones under $300. Include features, pros/cons, and where to buy them." - reasoning: "End-to-end test validating product comparison workflow with dynamic tool usage" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Identifies 3 specific noise-canceling headphones under $300" - - "Provides detailed feature comparison for each model" - - "Lists pros and cons for each headphone clearly" - - "Includes pricing information and purchase locations" - - "Comparison is fair and based on objective criteria" - - "Information appears current and accurate" - - "Demonstrates intelligent research and extraction tool usage" - - "Shows complete workflow from request to actionable buying guide" - -metadata: - tags: ["end-to-end", "chat", "product", "comparison", "headphones", "shopping", "user-workflow"] - priority: "medium" - timeout: 300000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/end-to-end/recipe-nutrition-001.yaml b/eval-server/nodejs/evals/end-to-end/recipe-nutrition-001.yaml deleted file mode 100644 index ef8b0f0..0000000 --- a/eval-server/nodejs/evals/end-to-end/recipe-nutrition-001.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# Healthy Recipe Search - End-to-End Test -id: "recipe-nutrition-001" -name: "Healthy Family Dinner Recipes" -description: "End-to-end test for recipe search with nutrition criteria using chat interface" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "chat" -timeout: 300000 - -input: - message: "Find me 3 healthy dinner recipes for a family of 4 that are under 500 calories per serving and take less than 30 minutes to prepare." 
- reasoning: "End-to-end test validating recipe search workflow with specific nutritional and time criteria" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Provides exactly 3 dinner recipes suitable for family of 4" - - "Each recipe is under 500 calories per serving" - - "All recipes can be prepared in under 30 minutes" - - "Includes ingredient lists and cooking instructions" - - "Nutritional information is provided or estimated" - - "Recipes are practical and family-friendly" - - "Demonstrates intelligent recipe search and analysis" - - "Shows complete workflow from request to actionable meal plan" - -metadata: - tags: ["end-to-end", "chat", "recipes", "nutrition", "healthy", "family", "user-workflow"] - priority: "medium" - timeout: 300000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/end-to-end/travel-planning-001.yaml b/eval-server/nodejs/evals/end-to-end/travel-planning-001.yaml deleted file mode 100644 index 401f8b1..0000000 --- a/eval-server/nodejs/evals/end-to-end/travel-planning-001.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# Barcelona Travel Planning - End-to-End Test -id: "travel-planning-001" -name: "Barcelona Trip Planning" -description: "End-to-end test for comprehensive travel planning using chat interface" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "chat" -timeout: 300000 - -input: - message: "Help me plan a 3-day trip to Barcelona. I need flight options from New York, hotel recommendations in the city center, and top 5 attractions to visit." - reasoning: "End-to-end test validating complete travel planning workflow with dynamic tool usage" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Provides realistic flight options from New York to Barcelona" - - "Recommends hotels in Barcelona city center with details" - - "Lists top 5 attractions in Barcelona with descriptions" - - "Information is current and practically useful for trip planning" - - "Includes relevant details like prices, locations, or booking info" - - "Response is well-organized into clear sections" - - "Demonstrates multi-tool usage for comprehensive planning" - - "Shows complete workflow from request to actionable itinerary" - -metadata: - tags: ["end-to-end", "chat", "travel", "planning", "barcelona", "user-workflow"] - priority: "medium" - timeout: 300000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/basic-001.yaml b/eval-server/nodejs/evals/research-agent/basic-001.yaml deleted file mode 100644 index fcd0086..0000000 --- a/eval-server/nodejs/evals/research-agent/basic-001.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# Basic research test - stable topic with clear sources -id: "basic-001" -name: "Research Chrome DevTools History" -description: "Research the history and development of Chrome DevTools" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 180000 - -input: - query: "History and development of Chrome DevTools browser developer tools" - reasoning: "Testing basic research capabilities on a well-documented technical topic" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0 - criteria: - - "Research covers the origins and early development of 
Chrome DevTools" - - "Information includes key milestones and major feature additions" - - "Sources include official documentation or reliable technical sources" - - "At least 3-5 different sources were consulted" - - "Information is factually accurate and up-to-date" - - "Research demonstrates understanding of the topic evolution" - - "Handoff to content_writer_agent occurred with comprehensive data" - -metadata: - tags: ["basic", "technical", "stable", "documentation"] - priority: "high" - timeout: 180000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/business-001.yaml b/eval-server/nodejs/evals/research-agent/business-001.yaml deleted file mode 100644 index 7558120..0000000 --- a/eval-server/nodejs/evals/research-agent/business-001.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# Business research test -id: "business-001" -name: "Research Remote Work Productivity" -description: "Research remote work impact on productivity and business outcomes" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 240000 - -input: - query: "Remote work productivity statistics impact business outcomes 2024 studies" - reasoning: "Testing business research requiring statistical data and multiple perspectives" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Research includes statistical data and survey results" - - "Covers multiple perspectives (employee, employer, industry)" - - "Sources include business publications, research studies, and reports" - - "Information addresses both positive and negative impacts" - - "Data is recent and relevant to current work trends" - - "Research demonstrates understanding of business implications" - - "Statistics and claims are properly sourced" - -metadata: - tags: ["business", "statistics", "workplace", "comprehensive"] - priority: "high" - timeout: 240000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/comparison-001.yaml b/eval-server/nodejs/evals/research-agent/comparison-001.yaml deleted file mode 100644 index a9aa22b..0000000 --- a/eval-server/nodejs/evals/research-agent/comparison-001.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# Comparative research test -id: "comparison-001" -name: "Compare JavaScript vs TypeScript" -description: "Research and compare JavaScript and TypeScript for web development" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 200000 - -input: - query: "JavaScript vs TypeScript comparison web development pros cons differences" - reasoning: "Testing comparative research requiring balanced analysis of multiple options" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Research covers both JavaScript and TypeScript comprehensively" - - "Includes clear comparison points (syntax, features, ecosystem)" - - "Presents advantages and disadvantages of each language" - - "Sources include technical documentation and developer resources" - - "Information is balanced and objective, not biased toward one option" - - "Demonstrates understanding of use cases for each language" - - "Research data is well-organized for comparative analysis" - -metadata: - tags: ["comparison", "technical", "programming", "balanced"] - 
priority: "high" - timeout: 200000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/current-001.yaml b/eval-server/nodejs/evals/research-agent/current-001.yaml deleted file mode 100644 index 6878868..0000000 --- a/eval-server/nodejs/evals/research-agent/current-001.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# Current events research test -id: "current-001" -name: "Research Latest AI Development Trends" -description: "Research recent developments in AI and machine learning (last 6 months)" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 240000 - -input: - query: "Latest AI artificial intelligence developments breakthroughs 2024 2025" - reasoning: "Testing research on current events and rapidly evolving topics" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - include_url: true - criteria: - - "Research focuses on recent developments (within last 6 months)" - - "Covers multiple aspects of AI development (models, applications, research)" - - "Sources are current and from reputable news or research outlets" - - "Information includes specific examples or case studies" - - "Demonstrates ability to identify current trends vs older information" - - "Successfully gathered information from diverse source types" - - "Data is properly organized for content writer handoff" - -metadata: - tags: ["current-events", "ai", "dynamic", "trends"] - priority: "high" - timeout: 240000 - retries: 1 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/edge-001.yaml b/eval-server/nodejs/evals/research-agent/edge-001.yaml deleted file mode 100644 index d75c2bf..0000000 --- a/eval-server/nodejs/evals/research-agent/edge-001.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# No-results edge case test -id: "edge-001" -name: "Research Obscure Fictional Topic" -description: "Test handling of queries with very limited or no reliable sources" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 180000 - -input: - query: "quantum bluetooth watermelon encryption algorithm 2024" - reasoning: "Testing edge case handling when query yields no meaningful results" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Agent recognizes when query yields limited or unreliable results" - - "Demonstrates appropriate search strategy modification" - - "Does not fabricate information when sources are unavailable" - - "Gracefully handles lack of substantive results" - - "Still attempts handoff to content writer with available information" - - "Maintains professional approach despite limited data" - - "Shows appropriate uncertainty when information is sparse" - -metadata: - tags: ["edge-case", "no-results", "error-handling", "fictional"] - priority: "high" - timeout: 180000 - retries: 1 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/research-agent-basic-001.yaml b/eval-server/nodejs/evals/research-agent/research-agent-basic-001.yaml deleted file mode 100644 index 85743d5..0000000 --- a/eval-server/nodejs/evals/research-agent/research-agent-basic-001.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# Basic research test - stable topic with clear sources -id: 
"research-agent-basic-001" -name: "Research Chrome DevTools History" -description: "Research the history and development of Chrome DevTools" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 180000 - -input: - query: "History and development of Chrome DevTools browser developer tools" - reasoning: "Testing basic research capabilities on a well-documented technical topic" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0 - criteria: - - "Research covers the origins and early development of Chrome DevTools" - - "Information includes key milestones and major feature additions" - - "Sources include official documentation or reliable technical sources" - - "At least 3-5 different sources were consulted" - - "Information is factually accurate and up-to-date" - - "Research demonstrates understanding of the topic evolution" - - "Handoff to content_writer_agent occurred with comprehensive data" - -metadata: - tags: ["basic", "technical", "stable", "documentation"] - priority: "high" - timeout: 180000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/research-agent-business-001.yaml b/eval-server/nodejs/evals/research-agent/research-agent-business-001.yaml deleted file mode 100644 index defeed1..0000000 --- a/eval-server/nodejs/evals/research-agent/research-agent-business-001.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# Business research test -id: "research-agent-business-001" -name: "Research Remote Work Productivity" -description: "Research remote work impact on productivity and business outcomes" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 240000 - -input: - query: "Remote work productivity statistics impact business outcomes 2024 studies" - reasoning: "Testing business research requiring statistical data and multiple perspectives" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Research includes statistical data and survey results" - - "Covers multiple perspectives (employee, employer, industry)" - - "Sources include business publications, research studies, and reports" - - "Information addresses both positive and negative impacts" - - "Data is recent and relevant to current work trends" - - "Research demonstrates understanding of business implications" - - "Statistics and claims are properly sourced" - -metadata: - tags: ["business", "statistics", "workplace", "comprehensive"] - priority: "high" - timeout: 240000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/research-agent-comparison-001.yaml b/eval-server/nodejs/evals/research-agent/research-agent-comparison-001.yaml deleted file mode 100644 index a433a58..0000000 --- a/eval-server/nodejs/evals/research-agent/research-agent-comparison-001.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# Comparative research test -id: "research-agent-comparison-001" -name: "Compare JavaScript vs TypeScript" -description: "Research and compare JavaScript and TypeScript for web development" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 200000 - -input: - query: "JavaScript vs TypeScript comparison web development pros cons differences" 
- reasoning: "Testing comparative research requiring balanced analysis of multiple options" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Research covers both JavaScript and TypeScript comprehensively" - - "Includes clear comparison points (syntax, features, ecosystem)" - - "Presents advantages and disadvantages of each language" - - "Sources include technical documentation and developer resources" - - "Information is balanced and objective, not biased toward one option" - - "Demonstrates understanding of use cases for each language" - - "Research data is well-organized for comparative analysis" - -metadata: - tags: ["comparison", "technical", "programming", "balanced"] - priority: "high" - timeout: 200000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/research-agent-current-001.yaml b/eval-server/nodejs/evals/research-agent/research-agent-current-001.yaml deleted file mode 100644 index 198c981..0000000 --- a/eval-server/nodejs/evals/research-agent/research-agent-current-001.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# Current events research test -id: "research-agent-current-001" -name: "Research Latest AI Development Trends" -description: "Research recent developments in AI and machine learning (last 6 months)" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 240000 - -input: - query: "Latest AI artificial intelligence developments breakthroughs 2024 2025" - reasoning: "Testing research on current events and rapidly evolving topics" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - include_url: true - criteria: - - "Research focuses on recent developments (within last 6 months)" - - "Covers multiple aspects of AI development (models, applications, research)" - - "Sources are current and from reputable news or research outlets" - - "Information includes specific examples or case studies" - - "Demonstrates ability to identify current trends vs older information" - - "Successfully gathered information from diverse source types" - - "Data is properly organized for content writer handoff" - -metadata: - tags: ["current-events", "ai", "dynamic", "trends"] - priority: "high" - timeout: 240000 - retries: 1 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/research-agent-edge-001.yaml b/eval-server/nodejs/evals/research-agent/research-agent-edge-001.yaml deleted file mode 100644 index 234c832..0000000 --- a/eval-server/nodejs/evals/research-agent/research-agent-edge-001.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# No-results edge case test -id: "research-agent-edge-001" -name: "Research Obscure Fictional Topic" -description: "Test handling of queries with very limited or no reliable sources" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 180000 - -input: - query: "quantum bluetooth watermelon encryption algorithm 2024" - reasoning: "Testing edge case handling when query yields no meaningful results" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Agent recognizes when query yields limited or unreliable results" - - "Demonstrates appropriate search strategy modification" - - "Does not fabricate information when 
sources are unavailable" - - "Gracefully handles lack of substantive results" - - "Still attempts handoff to content writer with available information" - - "Maintains professional approach despite limited data" - - "Shows appropriate uncertainty when information is sparse" - -metadata: - tags: ["edge-case", "no-results", "error-handling", "fictional"] - priority: "high" - timeout: 180000 - retries: 1 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/research-agent-technical-001.yaml b/eval-server/nodejs/evals/research-agent/research-agent-technical-001.yaml deleted file mode 100644 index c5e2540..0000000 --- a/eval-server/nodejs/evals/research-agent/research-agent-technical-001.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# Deep technical research test -id: "research-agent-technical-001" -name: "Research WebAssembly Performance" -description: "Deep dive research into WebAssembly performance characteristics and use cases" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 900000 - -input: - query: "WebAssembly WASM performance benchmarks use cases implementation details" - reasoning: "Testing deep technical research requiring specialized knowledge synthesis" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Research covers technical details of WebAssembly architecture" - - "Includes performance benchmarks and comparison data" - - "Discusses practical use cases and implementation scenarios" - - "Sources include technical specifications, benchmarks, and expert analysis" - - "Information demonstrates deep understanding of the technology" - - "Research addresses both benefits and limitations" - - "Technical accuracy is maintained throughout" - -metadata: - tags: ["technical", "deep-dive", "performance", "webassembly"] - priority: "high" - timeout: 900000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/research-agent-tools-001.yaml b/eval-server/nodejs/evals/research-agent/research-agent-tools-001.yaml deleted file mode 100644 index 44da108..0000000 --- a/eval-server/nodejs/evals/research-agent/research-agent-tools-001.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# Tool orchestration test - focuses on how well the agent uses available tools -id: "research-agent-tools-001" -name: "Research Python Framework Comparison" -description: "Research comparing Django vs Flask Python frameworks with focus on tool usage" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 240000 - -input: - query: "Django vs Flask Python web framework comparison features performance" - reasoning: "Testing effective orchestration of navigation, extraction, and fetching tools" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Agent effectively used navigate_url to access search engines" - - "Schema-based extraction was used to gather structured search results" - - "Fetcher tool was used to collect content from multiple URLs" - - "Navigation strategy was logical and systematic" - - "Tool usage demonstrated purposeful research progression" - - "Information from different tools was effectively synthesized" - - "At least 3-5 different sources were accessed and processed" - - "Final handoff included 
comprehensive data from all tools" - -metadata: - tags: ["tool-orchestration", "systematic", "python", "frameworks"] - priority: "high" - timeout: 240000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/technical-001.yaml b/eval-server/nodejs/evals/research-agent/technical-001.yaml deleted file mode 100644 index f434081..0000000 --- a/eval-server/nodejs/evals/research-agent/technical-001.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# Deep technical research test -id: "technical-001" -name: "Research WebAssembly Performance" -description: "Deep dive research into WebAssembly performance characteristics and use cases" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 900000 - -input: - query: "WebAssembly WASM performance benchmarks use cases implementation details" - reasoning: "Testing deep technical research requiring specialized knowledge synthesis" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Research covers technical details of WebAssembly architecture" - - "Includes performance benchmarks and comparison data" - - "Discusses practical use cases and implementation scenarios" - - "Sources include technical specifications, benchmarks, and expert analysis" - - "Information demonstrates deep understanding of the technology" - - "Research addresses both benefits and limitations" - - "Technical accuracy is maintained throughout" - -metadata: - tags: ["technical", "deep-dive", "performance", "webassembly"] - priority: "high" - timeout: 900000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/tools-001.yaml b/eval-server/nodejs/evals/research-agent/tools-001.yaml deleted file mode 100644 index ae97430..0000000 --- a/eval-server/nodejs/evals/research-agent/tools-001.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# Tool orchestration test - focuses on how well the agent uses available tools -id: "tools-001" -name: "Research Python Framework Comparison" -description: "Research comparing Django vs Flask Python frameworks with focus on tool usage" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 240000 - -input: - query: "Django vs Flask Python web framework comparison features performance" - reasoning: "Testing effective orchestration of navigation, extraction, and fetching tools" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Agent effectively used navigate_url to access search engines" - - "Schema-based extraction was used to gather structured search results" - - "Fetcher tool was used to collect content from multiple URLs" - - "Navigation strategy was logical and systematic" - - "Tool usage demonstrated purposeful research progression" - - "Information from different tools was effectively synthesized" - - "At least 3-5 different sources were accessed and processed" - - "Final handoff included comprehensive data from all tools" - -metadata: - tags: ["tool-orchestration", "systematic", "python", "frameworks"] - priority: "high" - timeout: 240000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/schema-extractor/amazon-product-001.yaml 
b/eval-server/nodejs/evals/schema-extractor/amazon-product-001.yaml deleted file mode 100644 index 42e4738..0000000 --- a/eval-server/nodejs/evals/schema-extractor/amazon-product-001.yaml +++ /dev/null @@ -1,78 +0,0 @@ -# E-commerce product extraction test -id: "amazon-product-001" -name: "Extract Amazon Product Details" -description: "Extract product information from an Amazon product page" -enabled: true - -target: - url: "https://www.amazon.com/Obelisk-Climbing-Rustproof-Trellises-Clematis/dp/B0B4SBY6QD/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_data" -timeout: 60000 - -input: - schema: - type: "object" - properties: - product: - type: "object" - properties: - title: - type: "string" - brand: - type: "string" - price: - type: "object" - properties: - current: - type: "number" - currency: - type: "string" - rating: - type: "object" - properties: - average: - type: "number" - count: - type: "number" - images: - type: "array" - items: - type: "string" - format: "url" - features: - type: "array" - items: - type: "string" - required: - - "title" - - "price" - availability: - type: "string" - required: - - "product" - instruction: "Extract comprehensive product information including pricing, ratings, and key features" - reasoning: "Testing extraction from a dynamic e-commerce page with complex structure" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Product title is accurate and complete" - - "Price information is current and properly formatted" - - "Rating data includes both average and review count" - - "Image URLs are valid and accessible" - - "Key product features are captured" - - "All URLs are properly resolved (not node IDs)" - -metadata: - tags: ["ecommerce", "amazon", "product", "dynamic"] - priority: "high" - timeout: 60000 - retries: 3 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/schema-extractor/bbc-news-001.yaml b/eval-server/nodejs/evals/schema-extractor/bbc-news-001.yaml deleted file mode 100644 index 6843147..0000000 --- a/eval-server/nodejs/evals/schema-extractor/bbc-news-001.yaml +++ /dev/null @@ -1,69 +0,0 @@ -# News article extraction test -id: "bbc-news-001" -name: "Extract BBC News Article" -description: "Extract article content and metadata from a BBC News page" -enabled: true - -target: - url: "https://www.bbc.com/news/technology" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_data" -timeout: 30000 - -input: - schema: - type: "object" - properties: - headlines: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - summary: - type: "string" - url: - type: "string" - format: "url" - category: - type: "string" - required: - - "title" - mainStory: - type: "object" - properties: - headline: - type: "string" - summary: - type: "string" - url: - type: "string" - format: "url" - required: - - "headlines" - instruction: "Extract the main headlines and featured stories from the BBC Technology news section" - reasoning: "Testing extraction from a news aggregation page with multiple articles" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - include_url: true - criteria: - - "Headlines are current and relevant to technology news" - - "Article summaries provide meaningful context" - - "URLs link to valid BBC news articles" - - "Main story is properly identified" - - "All extracted content is in English" - -metadata: - tags: ["news", "bbc", 
"aggregation", "dynamic"] - priority: "high" - timeout: 30000 - retries: 2 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/schema-extractor/bing-search-001.yaml b/eval-server/nodejs/evals/schema-extractor/bing-search-001.yaml deleted file mode 100644 index 7e7d674..0000000 --- a/eval-server/nodejs/evals/schema-extractor/bing-search-001.yaml +++ /dev/null @@ -1,70 +0,0 @@ -# Bing Search results extraction test -id: "bing-search-001" -name: "Extract Bing Search Results" -description: "Extract search results from Bing search page" -enabled: true - -target: - url: "https://www.bing.com/search?q=web+scraping+best+practices" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_data" -timeout: 45000 - -input: - schema: - type: "object" - properties: - query: - type: "string" - searchResults: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - url: - type: "string" - format: "url" - snippet: - type: "string" - datePublished: - type: "string" - required: - - "title" - - "url" - - "snippet" - sidebarInfo: - type: "object" - properties: - title: - type: "string" - description: - type: "string" - source: - type: "string" - required: - - "searchResults" - instruction: "Extract search results including titles, URLs, snippets, and any sidebar information from Bing" - reasoning: "Testing extraction from Bing search results with different layout than Google" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Search results match the query intent" - - "Results include valid URLs and meaningful snippets" - - "Sidebar information is extracted when present" - - "No duplicate results in the list" - -metadata: - tags: ["search", "bing", "serp", "dynamic"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/schema-extractor/github-repo-001-streamlined.yaml b/eval-server/nodejs/evals/schema-extractor/github-repo-001-streamlined.yaml deleted file mode 100644 index 07532e7..0000000 --- a/eval-server/nodejs/evals/schema-extractor/github-repo-001-streamlined.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# Simple structured data test (Streamlined version) -id: "github-repo-001-streamlined" -name: "Extract GitHub Repository Info (Streamlined)" -description: "Extract basic repository information from a GitHub page using streamlined extractor" -enabled: true - -target: - url: "https://github.com/microsoft/TypeScript" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 30000 - -input: - schema: - type: "object" - properties: - name: - type: "string" - description: - type: "string" - language: - type: "string" - stars: - type: "number" - forks: - type: "number" - topics: - type: "array" - items: - type: "string" - readme: - type: "object" - properties: - summary: - type: "string" - required: - - "name" - - "description" - instruction: "Extract repository metadata and basic statistics" - reasoning: "Testing extraction from a well-structured GitHub repository page" - -validation: - type: "hybrid" - snapshot: - exclude_paths: - - "stars" - - "forks" - structure_only: false - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Repository name matches the GitHub page" - - "Description accurately reflects the project purpose" - - "Programming language is correctly identified" - - "Topic tags are relevant to the project" - 
-metadata: - tags: ["github", "repository", "structured", "streamlined"] - priority: "high" - timeout: 30000 - retries: 1 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/schema-extractor/github-repo-001.yaml b/eval-server/nodejs/evals/schema-extractor/github-repo-001.yaml deleted file mode 100644 index 6693577..0000000 --- a/eval-server/nodejs/evals/schema-extractor/github-repo-001.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# Simple structured data test -id: "github-repo-001" -name: "Extract GitHub Repository Info" -description: "Extract basic repository information from a GitHub page" -enabled: true - -target: - url: "https://github.com/microsoft/TypeScript" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_data" -timeout: 30000 - -input: - schema: - type: "object" - properties: - name: - type: "string" - description: - type: "string" - language: - type: "string" - stars: - type: "number" - forks: - type: "number" - topics: - type: "array" - items: - type: "string" - readme: - type: "object" - properties: - summary: - type: "string" - required: - - "name" - - "description" - instruction: "Extract repository metadata and basic statistics" - reasoning: "Testing extraction from a well-structured GitHub repository page" - -validation: - type: "hybrid" - snapshot: - exclude_paths: - - "stars" - - "forks" - structure_only: false - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Repository name matches the GitHub page" - - "Description accurately reflects the project purpose" - - "Programming language is correctly identified" - - "Topic tags are relevant to the project" - -metadata: - tags: ["github", "repository", "structured"] - priority: "high" - timeout: 30000 - retries: 1 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/schema-extractor/google-flights-001.yaml b/eval-server/nodejs/evals/schema-extractor/google-flights-001.yaml deleted file mode 100644 index ab2e53c..0000000 --- a/eval-server/nodejs/evals/schema-extractor/google-flights-001.yaml +++ /dev/null @@ -1,106 +0,0 @@ -# Google Flights search extraction test -id: "google-flights-001" -name: "Extract Google Flights Search Results" -description: "Extract flight options from Google Flights search" -enabled: true - -target: - url: "https://www.google.com/travel/flights/search?tfs=CBwQAhojEgoyMDI1LTEyLTI0agwIAhIIL20vMGQ5anJyBwgBEgNTRk8aIxIKMjAyNS0xMi0zMWoHCAESA1NGT3IMCAISCC9tLzBkOWpyQAFIAXABggELCP___________wGYAQE" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_data" -timeout: 60000 - -input: - schema: - type: "object" - properties: - searchCriteria: - type: "object" - properties: - origin: - type: "string" - destination: - type: "string" - departureDate: - type: "string" - returnDate: - type: "string" - tripType: - type: "string" - passengers: - type: "number" - flights: - type: "array" - items: - type: "object" - properties: - airline: - type: "string" - flightNumber: - type: "string" - departureTime: - type: "string" - arrivalTime: - type: "string" - duration: - type: "string" - stops: - type: "number" - price: - type: "object" - properties: - amount: - type: "number" - currency: - type: "string" - cabin: - type: "string" - bookingUrl: - type: "string" - format: "url" - legroom: - type: "string" - amenities: - type: "array" - items: - type: "string" - required: - - "airline" - - "departureTime" - - "arrivalTime" - - "price" - priceInsights: - type: "object" - properties: - 
trend: - type: "string" - recommendation: - type: "string" - averagePrice: - type: "number" - required: - - "flights" - instruction: "Extract flight options including airlines, times, prices, and amenities from Google Flights results" - reasoning: "Testing extraction from complex travel search interface with dynamic pricing" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Flight times are in proper format" - - "Prices are numeric values with currency" - - "Airlines and flight numbers are accurate" - - "Stop information is correctly identified" - - "Duration is in readable format" - -metadata: - tags: ["travel", "flights", "google", "booking"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/schema-extractor/google-search-001.yaml b/eval-server/nodejs/evals/schema-extractor/google-search-001.yaml deleted file mode 100644 index 5763ba8..0000000 --- a/eval-server/nodejs/evals/schema-extractor/google-search-001.yaml +++ /dev/null @@ -1,76 +0,0 @@ -# Google Search results extraction test -id: "google-search-001" -name: "Extract Google Search Results" -description: "Extract search results from Google search page" -enabled: true - -target: - url: "https://www.google.com/search?q=chrome+devtools+tutorial" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_data" -timeout: 45000 - -input: - schema: - type: "object" - properties: - query: - type: "string" - searchResults: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - url: - type: "string" - format: "url" - snippet: - type: "string" - domain: - type: "string" - required: - - "title" - - "url" - - "snippet" - featuredSnippet: - type: "object" - properties: - content: - type: "string" - source: - type: "string" - url: - type: "string" - format: "url" - relatedSearches: - type: "array" - items: - type: "string" - required: - - "searchResults" - instruction: "Extract the top 10 search results with titles, URLs, and snippets. 
Also extract featured snippet if present and related searches" - reasoning: "Testing extraction from Google search results page with various result types" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Search results are relevant to the query" - - "Each result has a valid title, URL, and snippet" - - "URLs are properly resolved and not node IDs" - - "Related searches are extracted if present" - - "Featured snippet is captured when available" - -metadata: - tags: ["search", "google", "serp", "dynamic"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/schema-extractor/homedepot-001.yaml b/eval-server/nodejs/evals/schema-extractor/homedepot-001.yaml deleted file mode 100644 index 2eb4883..0000000 --- a/eval-server/nodejs/evals/schema-extractor/homedepot-001.yaml +++ /dev/null @@ -1,92 +0,0 @@ -# Home Depot product search extraction test -id: "homedepot-001" -name: "Extract Home Depot Product Search" -description: "Extract product listings from Home Depot search results" -enabled: true - -target: - url: "https://www.homedepot.com/s/power%2520drill" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_data" -timeout: 60000 - -input: - schema: - type: "object" - properties: - searchQuery: - type: "string" - totalResults: - type: "number" - products: - type: "array" - items: - type: "object" - properties: - name: - type: "string" - brand: - type: "string" - price: - type: "number" - originalPrice: - type: "number" - savings: - type: "number" - rating: - type: "number" - reviewCount: - type: "number" - productUrl: - type: "string" - format: "url" - imageUrl: - type: "string" - format: "url" - availability: - type: "string" - features: - type: "array" - items: - type: "string" - required: - - "name" - - "price" - - "productUrl" - filters: - type: "object" - properties: - brands: - type: "array" - items: - type: "string" - priceRanges: - type: "array" - items: - type: "string" - required: - - "products" - instruction: "Extract product listings from Home Depot search results including prices, ratings, and availability" - reasoning: "Testing extraction from e-commerce search results with product cards and filters" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Products are relevant to the search query" - - "Prices are numeric values in USD" - - "Product URLs link to Home Depot product pages" - - "Ratings are on a 5-star scale" - - "Key product features are captured" - -metadata: - tags: ["ecommerce", "homedepot", "products", "search"] - priority: "high" - timeout: 60000 - retries: 3 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/schema-extractor/macys-001.yaml b/eval-server/nodejs/evals/schema-extractor/macys-001.yaml deleted file mode 100644 index 81e05f9..0000000 --- a/eval-server/nodejs/evals/schema-extractor/macys-001.yaml +++ /dev/null @@ -1,106 +0,0 @@ -# Macy's product listing extraction test -id: "macys-001" -name: "Extract Macy's Product Listings" -description: "Extract fashion products from Macy's category page" -enabled: true - -target: - url: "https://www.macys.com/shop/womens-clothing/womens-dresses" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_data" -timeout: 60000 - -input: - schema: - type: "object" - properties: - category: - type: "string" - totalProducts: - type: "number" 
- products: - type: "array" - items: - type: "object" - properties: - name: - type: "string" - brand: - type: "string" - currentPrice: - type: "number" - originalPrice: - type: "number" - discount: - type: "string" - colors: - type: "array" - items: - type: "string" - sizes: - type: "array" - items: - type: "string" - rating: - type: "number" - reviewCount: - type: "number" - productUrl: - type: "string" - format: "url" - imageUrl: - type: "string" - format: "url" - promotions: - type: "array" - items: - type: "string" - required: - - "name" - - "brand" - - "currentPrice" - refinements: - type: "object" - properties: - brands: - type: "array" - items: - type: "string" - sizes: - type: "array" - items: - type: "string" - colors: - type: "array" - items: - type: "string" - priceRanges: - type: "array" - items: - type: "string" - required: - - "products" - instruction: "Extract fashion products including prices, sizes, colors, and promotional offers from Macy's" - reasoning: "Testing extraction from fashion e-commerce with complex product attributes" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Products are from the correct category" - - "Prices reflect current and sale prices" - - "Color and size options are captured" - - "Brand names are accurately extracted" - - "Promotional text is included when present" - -metadata: - tags: ["ecommerce", "macys", "fashion", "products"] - priority: "high" - timeout: 60000 - retries: 3 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/schema-extractor/wikipedia-search-001.yaml b/eval-server/nodejs/evals/schema-extractor/wikipedia-search-001.yaml deleted file mode 100644 index 616f0d6..0000000 --- a/eval-server/nodejs/evals/schema-extractor/wikipedia-search-001.yaml +++ /dev/null @@ -1,77 +0,0 @@ -# Wikipedia search results extraction test -id: "wikipedia-search-001" -name: "Extract Wikipedia Search Results" -description: "Extract search results from Wikipedia search" -enabled: true - -target: - url: "https://en.wikipedia.org/w/index.php?search=artificial+intelligence&title=Special:Search" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_data" -timeout: 30000 - -input: - schema: - type: "object" - properties: - searchTerm: - type: "string" - resultCount: - type: "number" - searchResults: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - url: - type: "string" - format: "url" - snippet: - type: "string" - category: - type: "string" - wordCount: - type: "number" - lastEdited: - type: "string" - required: - - "title" - - "url" - - "snippet" - suggestedArticles: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - url: - type: "string" - format: "url" - required: - - "searchResults" - instruction: "Extract Wikipedia search results including article titles, URLs, snippets, and metadata like word count or last edit date" - reasoning: "Testing extraction from Wikipedia's internal search with rich metadata" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Search results are Wikipedia articles" - - "Each result has a valid Wikipedia URL" - - "Snippets contain relevant content highlights" - - "Metadata like word count is extracted when available" - -metadata: - tags: ["search", "wikipedia", "encyclopedia"] - priority: "high" - timeout: 30000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end 
of file diff --git a/eval-server/nodejs/evals/screenshot-verification/dynamic-content-verification-001.yaml b/eval-server/nodejs/evals/screenshot-verification/dynamic-content-verification-001.yaml deleted file mode 100644 index 6ec53c4..0000000 --- a/eval-server/nodejs/evals/screenshot-verification/dynamic-content-verification-001.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Dynamic content visual verification test -id: "dynamic-content-verification-001" -name: "Dynamic Content Visual Verification" -description: "Test visual verification of dynamic content loading using screenshots" -enabled: true - -target: - url: "https://the-internet.herokuapp.com/dynamic_loading/1" - -tool: "action_agent" -timeout: 90000 - -input: - objective: "Take a screenshot, click the Start button, wait for content to load, then take another screenshot to verify the dynamic content appeared" - reasoning: "Testing visual verification of dynamic content changes using screenshot comparison" - hint: "Use take_screenshot before clicking Start, then again after the dynamic content loads" - - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4o" - criteria: - - "Initial screenshot captured the page before dynamic loading" - - "Start button was successfully clicked" - - "Agent waited for dynamic content to fully load" - - "Final screenshot shows the revealed dynamic content" - - "Visual comparison demonstrates successful content loading verification" - - "Screenshots show clear before/after difference in content visibility" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify dynamic content loading" - - "Confirm the first screenshot shows hidden content area" - - "Verify the second screenshot shows the revealed 'Hello World!' 
text" - - "Check that the loading animation or process is properly captured" - -metadata: - tags: ["screenshot", "dynamic-content", "visual-verification", "loading"] - priority: "high" - timeout: 90000 - retries: 2 - flaky: true \ No newline at end of file diff --git a/eval-server/nodejs/evals/screenshot-verification/screenshot-error-handling-001.yaml b/eval-server/nodejs/evals/screenshot-verification/screenshot-error-handling-001.yaml deleted file mode 100644 index 6d31c50..0000000 --- a/eval-server/nodejs/evals/screenshot-verification/screenshot-error-handling-001.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# Screenshot error handling test -id: "screenshot-error-handling-001" -name: "Screenshot Error Handling" -description: "Test screenshot tool error handling and recovery" -enabled: true - -target: - url: "https://httpstat.us/500" - -tool: "take_screenshot" -timeout: 30000 - -input: - fullPage: false - - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4o" - criteria: - - "Screenshot tool handled the error page gracefully" - - "Either successfully captured the error page or reported appropriate error" - - "No crashes or undefined behavior occurred" - - "Tool response is meaningful regardless of page loading issues" - - "Error handling demonstrates robustness of screenshot functionality" - visual_verification: - enabled: true - capture_before: false - capture_after: true - prompts: - - "If screenshot was taken, verify it shows the error page content" - - "Check that the tool handled the HTTP 500 error appropriately" - - "Confirm no blank or corrupted screenshots were produced" - - "Ensure error scenarios are handled professionally" - -metadata: - tags: ["screenshot", "error-handling", "robustness", "edge-case"] - priority: "normal" - timeout: 30000 - retries: 1 - flaky: true \ No newline at end of file diff --git a/eval-server/nodejs/evals/screenshot-verification/screenshot-fullpage-001.yaml b/eval-server/nodejs/evals/screenshot-verification/screenshot-fullpage-001.yaml deleted file mode 100644 index a1c71f9..0000000 --- a/eval-server/nodejs/evals/screenshot-verification/screenshot-fullpage-001.yaml +++ /dev/null @@ -1,43 +0,0 @@ -# Full page screenshot verification test -id: "screenshot-fullpage-001" -name: "Take Full Page Screenshot" -description: "Test taking full page screenshot and verify functionality" -enabled: true - -target: - url: "https://en.wikipedia.org/wiki/Chrome_DevTools" - -tool: "take_screenshot" -timeout: 45000 - -input: - fullPage: true - - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4o" - criteria: - - "Full page screenshot was successfully captured" - - "Data URL contains valid image data" - - "Screenshot captures the entire page content including areas below the fold" - - "Image size is larger than viewport-only screenshot would be" - - "No errors occurred during full page capture" - - "Screenshot includes both header and footer content" - visual_verification: - enabled: true - capture_before: false - capture_after: true - prompts: - - "Verify the screenshot shows the complete Wikipedia article page" - - "Check that content above and below the fold is captured" - - "Confirm the image is taller than a typical viewport" - - "Ensure no content is cut off at the bottom" - -metadata: - tags: ["screenshot", "fullpage", "visual", "verification", "wikipedia"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: false \ No newline at end of file diff --git a/eval-server/nodejs/evals/screenshot-verification/screenshot-viewport-001.yaml 
b/eval-server/nodejs/evals/screenshot-verification/screenshot-viewport-001.yaml deleted file mode 100644 index 69531ee..0000000 --- a/eval-server/nodejs/evals/screenshot-verification/screenshot-viewport-001.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# Viewport screenshot verification test -id: "screenshot-viewport-001" -name: "Take Viewport Screenshot" -description: "Test taking viewport screenshot and verify functionality" -enabled: true - -target: - url: "https://www.google.com" - -tool: "take_screenshot" -timeout: 30000 - -input: - fullPage: false - - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4o" - criteria: - - "Screenshot was successfully captured" - - "Data URL is properly formatted and contains image data" - - "Screenshot shows the viewport content correctly" - - "No errors occurred during screenshot capture" - - "Image data length indicates a valid screenshot was taken" - visual_verification: - enabled: true - capture_before: false - capture_after: true - prompts: - - "Verify the screenshot shows the Google homepage" - - "Check that the screenshot is not empty or corrupted" - - "Confirm the image quality is appropriate for verification" - - "Ensure the screenshot captures the current viewport accurately" - -metadata: - tags: ["screenshot", "viewport", "visual", "verification"] - priority: "high" - timeout: 30000 - retries: 2 - flaky: false \ No newline at end of file diff --git a/eval-server/nodejs/evals/screenshot-verification/visual-comparison-001.yaml b/eval-server/nodejs/evals/screenshot-verification/visual-comparison-001.yaml deleted file mode 100644 index 7434a93..0000000 --- a/eval-server/nodejs/evals/screenshot-verification/visual-comparison-001.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Visual comparison verification test -id: "visual-comparison-001" -name: "Visual Comparison Before and After Action" -description: "Test visual verification by comparing screenshots before and after an action" -enabled: true - -target: - url: "https://www.google.com" - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Take a screenshot, then type 'DevTools testing' in the search box, and take another screenshot to compare" - reasoning: "Testing visual verification workflow with before/after screenshot comparison" - hint: "Use take_screenshot tool before and after performing the search input action" - - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4o" - criteria: - - "Initial screenshot was taken before performing any actions" - - "Search text was successfully entered into the search field" - - "Second screenshot was taken after the text input" - - "Visual comparison shows the difference between before and after states" - - "Search field contains the entered text in the final screenshot" - - "Screenshots demonstrate successful action verification workflow" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare the before and after screenshots" - - "Verify the search field is empty in the first screenshot" - - "Confirm the search field contains 'DevTools testing' in the second screenshot" - - "Check that the visual changes accurately reflect the performed action" - -metadata: - tags: ["screenshot", "visual-comparison", "action-verification", "before-after"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false \ No newline at end of file diff --git a/eval-server/nodejs/evals/streamlined-schema-extractor/amazon-product-001.yaml 
b/eval-server/nodejs/evals/streamlined-schema-extractor/amazon-product-001.yaml deleted file mode 100644 index b154454..0000000 --- a/eval-server/nodejs/evals/streamlined-schema-extractor/amazon-product-001.yaml +++ /dev/null @@ -1,78 +0,0 @@ -# E-commerce product extraction test (Streamlined) -id: "amazon-product-001" -name: "Extract Amazon Product Details" -description: "Extract product information from an Amazon product page" -enabled: true - -target: - url: "https://www.amazon.com/Obelisk-Climbing-Rustproof-Trellises-Clematis/dp/B0B4SBY6QD/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 60000 - -input: - schema: - type: "object" - properties: - product: - type: "object" - properties: - title: - type: "string" - brand: - type: "string" - price: - type: "object" - properties: - current: - type: "number" - currency: - type: "string" - rating: - type: "object" - properties: - average: - type: "number" - count: - type: "number" - images: - type: "array" - items: - type: "string" - format: "url" - features: - type: "array" - items: - type: "string" - required: - - "title" - - "price" - availability: - type: "string" - required: - - "product" - instruction: "Extract comprehensive product information including pricing, ratings, and key features" - reasoning: "Testing extraction from a dynamic e-commerce page with complex structure" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Product title is accurate and complete" - - "Price information is current and properly formatted" - - "Rating data includes both average and review count" - - "Image URLs are valid and accessible" - - "Key product features are captured" - - "All URLs are properly resolved (not node IDs)" - -metadata: - tags: ["ecommerce", "amazon", "product", "dynamic"] - priority: "high" - timeout: 60000 - retries: 3 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/streamlined-schema-extractor/bbc-news-001.yaml b/eval-server/nodejs/evals/streamlined-schema-extractor/bbc-news-001.yaml deleted file mode 100644 index 31ef288..0000000 --- a/eval-server/nodejs/evals/streamlined-schema-extractor/bbc-news-001.yaml +++ /dev/null @@ -1,69 +0,0 @@ -# News article extraction test (Streamlined) -id: "bbc-news-001" -name: "Extract BBC News Article" -description: "Extract article content and metadata from a BBC News page" -enabled: true - -target: - url: "https://www.bbc.com/news/technology" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 30000 - -input: - schema: - type: "object" - properties: - headlines: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - summary: - type: "string" - url: - type: "string" - format: "url" - category: - type: "string" - required: - - "title" - mainStory: - type: "object" - properties: - headline: - type: "string" - summary: - type: "string" - url: - type: "string" - format: "url" - required: - - "headlines" - instruction: "Extract the main headlines and featured stories from the BBC Technology news section" - reasoning: "Testing extraction from a news aggregation page with multiple articles" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - include_url: true - criteria: - - "Headlines are current and relevant to technology news" - - "Article summaries provide meaningful context" - - "URLs link to valid BBC news articles" - - "Main 
story is properly identified" - - "All extracted content is in English" - -metadata: - tags: ["news", "bbc", "aggregation", "dynamic"] - priority: "high" - timeout: 30000 - retries: 2 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/streamlined-schema-extractor/bing-search-001.yaml b/eval-server/nodejs/evals/streamlined-schema-extractor/bing-search-001.yaml deleted file mode 100644 index e9f3b6e..0000000 --- a/eval-server/nodejs/evals/streamlined-schema-extractor/bing-search-001.yaml +++ /dev/null @@ -1,70 +0,0 @@ -# Bing Search results extraction test -id: "bing-search-001" -name: "Extract Bing Search Results" -description: "Extract search results from Bing search page" -enabled: true - -target: - url: "https://www.bing.com/search?q=web+scraping+best+practices" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 45000 - -input: - schema: - type: "object" - properties: - query: - type: "string" - searchResults: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - url: - type: "string" - format: "url" - snippet: - type: "string" - datePublished: - type: "string" - required: - - "title" - - "url" - - "snippet" - sidebarInfo: - type: "object" - properties: - title: - type: "string" - description: - type: "string" - source: - type: "string" - required: - - "searchResults" - instruction: "Extract search results including titles, URLs, snippets, and any sidebar information from Bing" - reasoning: "Testing extraction from Bing search results with different layout than Google" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Search results match the query intent" - - "Results include valid URLs and meaningful snippets" - - "Sidebar information is extracted when present" - - "No duplicate results in the list" - -metadata: - tags: ["search", "bing", "serp", "dynamic"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/streamlined-schema-extractor/github-repo-001.yaml b/eval-server/nodejs/evals/streamlined-schema-extractor/github-repo-001.yaml deleted file mode 100644 index 5c496c5..0000000 --- a/eval-server/nodejs/evals/streamlined-schema-extractor/github-repo-001.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# Simple structured data test (Streamlined) -id: "github-repo-001" -name: "Extract GitHub Repository Info" -description: "Extract basic repository information from a GitHub page" -enabled: true - -target: - url: "https://github.com/microsoft/TypeScript" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 30000 - -input: - schema: - type: "object" - properties: - name: - type: "string" - description: - type: "string" - language: - type: "string" - stars: - type: "number" - forks: - type: "number" - topics: - type: "array" - items: - type: "string" - readme: - type: "object" - properties: - summary: - type: "string" - required: - - "name" - - "description" - instruction: "Extract repository metadata and basic statistics" - reasoning: "Testing extraction from a well-structured GitHub repository page" - -validation: - type: "hybrid" - snapshot: - exclude_paths: - - "stars" - - "forks" - structure_only: false - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Repository name matches the GitHub page" - - "Description accurately reflects the project purpose" - 
- "Programming language is correctly identified" - - "Topic tags are relevant to the project" - -metadata: - tags: ["github", "repository", "structured"] - priority: "high" - timeout: 30000 - retries: 1 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/streamlined-schema-extractor/google-flights-001.yaml b/eval-server/nodejs/evals/streamlined-schema-extractor/google-flights-001.yaml deleted file mode 100644 index 981ccbd..0000000 --- a/eval-server/nodejs/evals/streamlined-schema-extractor/google-flights-001.yaml +++ /dev/null @@ -1,106 +0,0 @@ -# Google Flights search extraction test -id: "google-flights-001" -name: "Extract Google Flights Search Results" -description: "Extract flight options from Google Flights search" -enabled: true - -target: - url: "https://www.google.com/travel/flights/search?tfs=CBwQAhojEgoyMDI1LTEyLTI0agwIAhIIL20vMGQ5anJyBwgBEgNTRk8aIxIKMjAyNS0xMi0zMWoHCAESA1NGT3IMCAISCC9tLzBkOWpyQAFIAXABggELCP___________wGYAQE" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 60000 - -input: - schema: - type: "object" - properties: - searchCriteria: - type: "object" - properties: - origin: - type: "string" - destination: - type: "string" - departureDate: - type: "string" - returnDate: - type: "string" - tripType: - type: "string" - passengers: - type: "number" - flights: - type: "array" - items: - type: "object" - properties: - airline: - type: "string" - flightNumber: - type: "string" - departureTime: - type: "string" - arrivalTime: - type: "string" - duration: - type: "string" - stops: - type: "number" - price: - type: "object" - properties: - amount: - type: "number" - currency: - type: "string" - cabin: - type: "string" - bookingUrl: - type: "string" - format: "url" - legroom: - type: "string" - amenities: - type: "array" - items: - type: "string" - required: - - "airline" - - "departureTime" - - "arrivalTime" - - "price" - priceInsights: - type: "object" - properties: - trend: - type: "string" - recommendation: - type: "string" - averagePrice: - type: "number" - required: - - "flights" - instruction: "Extract flight options including airlines, times, prices, and amenities from Google Flights results" - reasoning: "Testing extraction from complex travel search interface with dynamic pricing" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Flight times are in proper format" - - "Prices are numeric values with currency" - - "Airlines and flight numbers are accurate" - - "Stop information is correctly identified" - - "Duration is in readable format" - -metadata: - tags: ["travel", "flights", "google", "booking"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/streamlined-schema-extractor/google-search-001.yaml b/eval-server/nodejs/evals/streamlined-schema-extractor/google-search-001.yaml deleted file mode 100644 index c1725d4..0000000 --- a/eval-server/nodejs/evals/streamlined-schema-extractor/google-search-001.yaml +++ /dev/null @@ -1,76 +0,0 @@ -# Google Search results extraction test -id: "google-search-001" -name: "Extract Google Search Results" -description: "Extract search results from Google search page" -enabled: true - -target: - url: "https://www.google.com/search?q=chrome+devtools+tutorial" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 45000 - -input: - 
schema: - type: "object" - properties: - query: - type: "string" - searchResults: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - url: - type: "string" - format: "url" - snippet: - type: "string" - domain: - type: "string" - required: - - "title" - - "url" - - "snippet" - featuredSnippet: - type: "object" - properties: - content: - type: "string" - source: - type: "string" - url: - type: "string" - format: "url" - relatedSearches: - type: "array" - items: - type: "string" - required: - - "searchResults" - instruction: "Extract the top 10 search results with titles, URLs, and snippets. Also extract featured snippet if present and related searches" - reasoning: "Testing extraction from Google search results page with various result types" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Search results are relevant to the query" - - "Each result has a valid title, URL, and snippet" - - "URLs are properly resolved and not node IDs" - - "Related searches are extracted if present" - - "Featured snippet is captured when available" - -metadata: - tags: ["search", "google", "serp", "dynamic"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/streamlined-schema-extractor/homedepot-001.yaml b/eval-server/nodejs/evals/streamlined-schema-extractor/homedepot-001.yaml deleted file mode 100644 index 1d26848..0000000 --- a/eval-server/nodejs/evals/streamlined-schema-extractor/homedepot-001.yaml +++ /dev/null @@ -1,92 +0,0 @@ -# Home Depot product search extraction test -id: "homedepot-001" -name: "Extract Home Depot Product Search" -description: "Extract product listings from Home Depot search results" -enabled: true - -target: - url: "https://www.homedepot.com/s/power%2520drill" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 60000 - -input: - schema: - type: "object" - properties: - searchQuery: - type: "string" - totalResults: - type: "number" - products: - type: "array" - items: - type: "object" - properties: - name: - type: "string" - brand: - type: "string" - price: - type: "number" - originalPrice: - type: "number" - savings: - type: "number" - rating: - type: "number" - reviewCount: - type: "number" - productUrl: - type: "string" - format: "url" - imageUrl: - type: "string" - format: "url" - availability: - type: "string" - features: - type: "array" - items: - type: "string" - required: - - "name" - - "price" - - "productUrl" - filters: - type: "object" - properties: - brands: - type: "array" - items: - type: "string" - priceRanges: - type: "array" - items: - type: "string" - required: - - "products" - instruction: "Extract product listings from Home Depot search results including prices, ratings, and availability" - reasoning: "Testing extraction from e-commerce search results with product cards and filters" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Products are relevant to the search query" - - "Prices are numeric values in USD" - - "Product URLs link to Home Depot product pages" - - "Ratings are on a 5-star scale" - - "Key product features are captured" - -metadata: - tags: ["ecommerce", "homedepot", "products", "search"] - priority: "high" - timeout: 60000 - retries: 3 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git 
a/eval-server/nodejs/evals/streamlined-schema-extractor/macys-001.yaml b/eval-server/nodejs/evals/streamlined-schema-extractor/macys-001.yaml deleted file mode 100644 index 28a2c10..0000000 --- a/eval-server/nodejs/evals/streamlined-schema-extractor/macys-001.yaml +++ /dev/null @@ -1,106 +0,0 @@ -# Macy's product listing extraction test -id: "macys-001" -name: "Extract Macy's Product Listings" -description: "Extract fashion products from Macy's category page" -enabled: true - -target: - url: "https://www.macys.com/shop/womens-clothing/womens-dresses" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 60000 - -input: - schema: - type: "object" - properties: - category: - type: "string" - totalProducts: - type: "number" - products: - type: "array" - items: - type: "object" - properties: - name: - type: "string" - brand: - type: "string" - currentPrice: - type: "number" - originalPrice: - type: "number" - discount: - type: "string" - colors: - type: "array" - items: - type: "string" - sizes: - type: "array" - items: - type: "string" - rating: - type: "number" - reviewCount: - type: "number" - productUrl: - type: "string" - format: "url" - imageUrl: - type: "string" - format: "url" - promotions: - type: "array" - items: - type: "string" - required: - - "name" - - "brand" - - "currentPrice" - refinements: - type: "object" - properties: - brands: - type: "array" - items: - type: "string" - sizes: - type: "array" - items: - type: "string" - colors: - type: "array" - items: - type: "string" - priceRanges: - type: "array" - items: - type: "string" - required: - - "products" - instruction: "Extract fashion products including prices, sizes, colors, and promotional offers from Macy's" - reasoning: "Testing extraction from fashion e-commerce with complex product attributes" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Products are from the correct category" - - "Prices reflect current and sale prices" - - "Color and size options are captured" - - "Brand names are accurately extracted" - - "Promotional text is included when present" - -metadata: - tags: ["ecommerce", "macys", "fashion", "products"] - priority: "high" - timeout: 60000 - retries: 3 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/streamlined-schema-extractor/wikipedia-001.yaml b/eval-server/nodejs/evals/streamlined-schema-extractor/wikipedia-001.yaml deleted file mode 100644 index 88983bd..0000000 --- a/eval-server/nodejs/evals/streamlined-schema-extractor/wikipedia-001.yaml +++ /dev/null @@ -1,76 +0,0 @@ -# Wikipedia article extraction test (Streamlined) -id: "wikipedia-chrome-devtools-001" -name: "Extract Chrome DevTools Wikipedia Article" -description: "Extract structured information from the Chrome DevTools Wikipedia page" -enabled: true - -target: - url: "https://en.wikipedia.org/wiki/Chrome_DevTools" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 45000 - -input: - schema: - type: "object" - properties: - title: - type: "string" - summary: - type: "string" - tableOfContents: - type: "array" - items: - type: "string" - infobox: - type: "object" - properties: - developer: - type: "string" - initialRelease: - type: "string" - operatingSystem: - type: "string" - license: - type: "string" - externalLinks: - type: "array" - items: - type: "object" - properties: - text: - type: "string" - url: - type: "string" - format: "url" - 
required: - - "title" - - "summary" - instruction: "Extract the main article information including title, summary, table of contents, and infobox details" - reasoning: "Testing extraction from a stable, well-structured Wikipedia page" - -validation: - type: "hybrid" - snapshot: - exclude_paths: - - "externalLinks[*].url" - structure_only: false - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Article title matches the Wikipedia page title" - - "Summary captures the main description of Chrome DevTools" - - "Table of contents includes major sections" - - "Infobox contains key technical details" - - "External links are properly resolved URLs" - -metadata: - tags: ["wikipedia", "documentation", "stable"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/streamlined-schema-extractor/wikipedia-search-001.yaml b/eval-server/nodejs/evals/streamlined-schema-extractor/wikipedia-search-001.yaml deleted file mode 100644 index c432c20..0000000 --- a/eval-server/nodejs/evals/streamlined-schema-extractor/wikipedia-search-001.yaml +++ /dev/null @@ -1,77 +0,0 @@ -# Wikipedia search results extraction test -id: "wikipedia-search-001" -name: "Extract Wikipedia Search Results" -description: "Extract search results from Wikipedia search" -enabled: true - -target: - url: "https://en.wikipedia.org/w/index.php?search=artificial+intelligence&title=Special:Search" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 30000 - -input: - schema: - type: "object" - properties: - searchTerm: - type: "string" - resultCount: - type: "number" - searchResults: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - url: - type: "string" - format: "url" - snippet: - type: "string" - category: - type: "string" - wordCount: - type: "number" - lastEdited: - type: "string" - required: - - "title" - - "url" - - "snippet" - suggestedArticles: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - url: - type: "string" - format: "url" - required: - - "searchResults" - instruction: "Extract Wikipedia search results including article titles, URLs, snippets, and metadata like word count or last edit date" - reasoning: "Testing extraction from Wikipedia's internal search with rich metadata" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Search results are Wikipedia articles" - - "Each result has a valid Wikipedia URL" - - "Snippets contain relevant content highlights" - - "Metadata like word count is extracted when available" - -metadata: - tags: ["search", "wikipedia", "encyclopedia"] - priority: "high" - timeout: 30000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/booking-001.yaml b/eval-server/nodejs/evals/web-task-agent/booking-001.yaml deleted file mode 100644 index 8a99d17..0000000 --- a/eval-server/nodejs/evals/web-task-agent/booking-001.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Hotel Search Workflow - Web Task Agent -id: "booking-001" -name: "Hotel Search Workflow" -description: "Test web task agent orchestrating complex multi-step booking search" -enabled: true - -target: - url: "https://www.booking.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for hotels in San Francisco for 2 adults, 
check-in March 15, check-out March 17" - reasoning: "Customer is looking for travel booking" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully searched for hotels in San Francisco" - - "Results show hotels available for March 15-17 dates" - - "Guest count of 2 adults is reflected in the search results" - - "Returned multiple hotel options with relevant details" - - "Each hotel includes essential information (name, price, location)" - - "Results are presented in a clear, readable format" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify hotel search results are displayed for San Francisco" - - "Check that dates March 15-17 are correctly selected" - - "Confirm guest count shows 2 adults" - - "Ensure search results show hotels with availability for specified dates" - -metadata: - tags: ["web-task", "booking", "workflow", "multi-step", "travel", "complex"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/ecommerce-001.yaml b/eval-server/nodejs/evals/web-task-agent/ecommerce-001.yaml deleted file mode 100644 index 338f464..0000000 --- a/eval-server/nodejs/evals/web-task-agent/ecommerce-001.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# E-commerce web task evaluation (matches DevTools test case) -id: "ecommerce-001" -name: "E-commerce Product Search" -description: "Test web task agent handling product search on shopping site" -enabled: true - -target: - url: "https://www.amazon.com" - -tool: "web_task_agent" -timeout: 90000 - -input: - task: "Search Amazon for \"wireless headphones\" and find products under $100" - reasoning: "Testing e-commerce search workflow with price filtering" - context: "User wants to find wireless headphones with specific price constraint" - extraction_schema: - type: "object" - properties: - products: - type: "array" - items: - type: "object" - properties: - name: - type: "string" - price: - type: "string" - rating: - type: "string" - url: - type: "string" - - -validation: - type: "hybrid" - llm_judge: - model: "gpt-4o" - criteria: - - "Successfully navigated to product search" - - "Applied appropriate filters correctly" - - "Extracted product details accurately" - - "Provided meaningful comparison of features" - - "Stayed within specified price range" - snapshot: - structure_only: true - exclude_paths: - - "timestamp" - - "sessionId" - -metadata: - tags: ["web-task", "multi-step", "ecommerce", "search"] - priority: "high" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/error-001.yaml b/eval-server/nodejs/evals/web-task-agent/error-001.yaml deleted file mode 100644 index 1831a14..0000000 --- a/eval-server/nodejs/evals/web-task-agent/error-001.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Error Recovery Workflow - Web Task Agent -id: "error-001" -name: "Error Recovery Workflow" -description: "Test web task agent handling action_agent failures and retry logic" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for \"nonexistent test query 12345\" and handle any issues that arise" - reasoning: "Customer is asking for this response" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Attempted to search for the unusual query \"nonexistent test query 
12345\"" - - "Either found some results OR provided clear explanation why no results were found" - - "Response handles the edge case gracefully without errors" - - "If no results found, suggested alternative actions or explanations" - - "Maintained professional tone despite unusual request" - - "Final output is coherent and helpful to the user" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Check if search was attempted despite unusual query" - - "Verify error handling did not break the page interaction" - - "Confirm agent attempted to complete the task or provided clear error info" - - "Ensure page is still functional after error recovery attempts" - -metadata: - tags: ["web-task", "error-recovery", "retry", "orchestration", "robustness"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/extract-001.yaml b/eval-server/nodejs/evals/web-task-agent/extract-001.yaml deleted file mode 100644 index e836aa0..0000000 --- a/eval-server/nodejs/evals/web-task-agent/extract-001.yaml +++ /dev/null @@ -1,60 +0,0 @@ -# Structured Data Extraction - Web Task Agent -id: "extract-001" -name: "Structured Data Extraction" -description: "Test web task agent extracting structured data from search results" -enabled: true - -target: - url: "https://news.ycombinator.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Extract the top 5 Hacker News stories with their titles, scores, and comment counts" - reasoning: "User is looking to understand the top stories on Hacker News" - extraction_schema: - type: "object" - properties: - stories: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - score: - type: "number" - comments: - type: "number" - url: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully returned exactly 5 Hacker News stories in structured text format" - - "Each story is numbered (1., 2., 3., 4., 5.) 
with title, score, comments, and URL" - - "Results are presented in readable text format similar to the example provided" - - "Response includes all required fields: title, score, comments count, URL" - - "Maintained proper orchestration pattern throughout the extraction process" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Hacker News homepage is loaded and displaying stories" - - "Check that top stories are visible with scores and comment counts" - - "Confirm story titles and metadata are clearly displayed" - - "Ensure page structure allows for data extraction" - -metadata: - tags: ["web-task", "data-extraction", "structured-data", "hackernews", "schema"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/finance-001.yaml b/eval-server/nodejs/evals/web-task-agent/finance-001.yaml deleted file mode 100644 index 2c661ed..0000000 --- a/eval-server/nodejs/evals/web-task-agent/finance-001.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# Stock Information Research - Web Task Agent -id: "finance-001" -name: "Stock Information Research" -description: "Test extracting stock prices and financial information" -enabled: true - -target: - url: "https://finance.yahoo.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for Apple (AAPL) stock information and extract current price, market cap, and recent performance" - reasoning: "Users need automated financial data collection for investment decisions" - extraction_schema: - type: "object" - properties: - stock_info: - type: "object" - properties: - symbol: - type: "string" - company_name: - type: "string" - current_price: - type: "string" - change: - type: "string" - change_percent: - type: "string" - market_cap: - type: "string" - pe_ratio: - type: "string" - volume: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully found Apple (AAPL) stock information" - - "Current stock price is clearly stated" - - "Market cap information is included" - - "Price change and percentage change are provided" - - "Additional metrics (PE ratio, volume) included when available" - - "Financial data is current and presented in readable text format (not JSON)" - - "Stock information is well-organized and easy to understand" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Yahoo Finance shows Apple (AAPL) stock page" - - "Check that current stock price and change are visible" - - "Confirm market cap and trading volume are displayed" - - "Ensure financial metrics and charts are shown" - -metadata: - tags: ["web-task", "finance", "stocks", "yahoo-finance", "investment", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/flight-001.yaml b/eval-server/nodejs/evals/web-task-agent/flight-001.yaml deleted file mode 100644 index f74b255..0000000 --- a/eval-server/nodejs/evals/web-task-agent/flight-001.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Complex Flight Search - Web Task Agent -id: "flight-001" -name: "Complex Flight Search" -description: "Test web task agent handling complex flight search with multiple criteria" -enabled: true - -target: - url: "https://www.kayak.com" - wait_for: "networkidle" - wait_timeout: 
5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for round-trip flights from Seattle (SEA) to Tokyo (NRT) departing March 20, returning March 30" - reasoning: "Customer is looking for finding the best flight options" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully found round-trip flights from Seattle (SEA) to Tokyo (NRT)" - - "Flight results show March 20 departure date" - - "Flight results show March 30 return date" - - "Returned multiple flight options with airlines and prices" - - "Each flight includes essential details (times, airlines, prices)" - - "Results clearly distinguish between outbound and return flights" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify flight search results are displayed" - - "Check SEA to NRT route is correctly selected" - - "Confirm dates March 20 departure and March 30 return" - - "Ensure flight options are showing with prices and airlines" - -metadata: - tags: ["web-task", "flight", "travel", "multi-step", "kayak", "round-trip"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/food-001.yaml b/eval-server/nodejs/evals/web-task-agent/food-001.yaml deleted file mode 100644 index 382b470..0000000 --- a/eval-server/nodejs/evals/web-task-agent/food-001.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# Restaurant Search and Menu Extraction - Web Task Agent -id: "food-001" -name: "Restaurant Search and Menu Extraction" -description: "Test searching restaurants and extracting menu information" -enabled: true - -target: - url: "https://www.yelp.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for \"Italian restaurants near me\" in San Francisco and extract restaurant details" - reasoning: "Users want to quickly compare restaurants, menus, and reviews" - extraction_schema: - type: "object" - properties: - restaurants: - type: "array" - items: - type: "object" - properties: - name: - type: "string" - rating: - type: "string" - price_range: - type: "string" - cuisine: - type: "string" - address: - type: "string" - phone: - type: "string" - hours: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully found Italian restaurants in San Francisco" - - "Each restaurant includes name, rating, and price range" - - "Location/address information is provided for each restaurant" - - "Contact details (phone/hours) included when available" - - "All restaurants listed serve Italian cuisine" - - "Results are presented in clear, structured text format (not JSON)" - - "Restaurants are numbered or organized clearly for easy comparison" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Yelp search results for Italian restaurants" - - "Check that restaurants show ratings and price ranges" - - "Confirm location filter shows San Francisco results" - - "Ensure restaurant listings include contact information" - -metadata: - tags: ["web-task", "restaurants", "yelp", "food", "local-search", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/iframe-001.yaml b/eval-server/nodejs/evals/web-task-agent/iframe-001.yaml deleted file 
mode 100644 index a9234e5..0000000 --- a/eval-server/nodejs/evals/web-task-agent/iframe-001.yaml +++ /dev/null @@ -1,83 +0,0 @@ -# ANA Airlines Iframe Content Extraction - Web Task Agent -id: "iframe-001" -name: "ANA Airlines Iframe Content Extraction" -description: "Test web task agent handling iframe-heavy airline booking sites like ANA Airlines" -enabled: true - -target: - url: "https://aswbe.ana.co.jp/webapps/reservation/flight-search?CONNECTION_KIND=SEA&LANG=en&hiddenSearchMode=ROUND_TRIP&departureDate:field=20260320&returnDate:field=20260330&departureAirportCode:field=SEA&arrivalAirportCode:field=NRT&adultCount=1&youngAdultCount=0&childCount=0&infantCount=0&boardingClass=INTY001&searchFlag=1" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Navigate the ANA Airlines flight search page and extract available flight options from Seattle (SEA) to Tokyo Narita (NRT) for March 20-30, 2026. Handle any iframe content and booking interface elements." - reasoning: "Testing iframe content extraction and complex airline booking site navigation" - extraction_schema: - type: "object" - properties: - flights: - type: "array" - items: - type: "object" - properties: - flight_number: - type: "string" - airline: - type: "string" - departure_time: - type: "string" - arrival_time: - type: "string" - departure_date: - type: "string" - arrival_date: - type: "string" - duration: - type: "string" - aircraft: - type: "string" - price: - type: "string" - cabin_class: - type: "string" - stops: - type: "string" - booking_interface_status: - type: "string" - iframe_content_found: - type: "boolean" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully navigated ANA Airlines booking interface" - - "Handled iframe content correctly (iframe_content_found should be true if iframes detected)" - - "Extracted flight information from ANA flight search results" - - "Flight details include ANA flight numbers and accurate route (SEA to NRT)" - - "Extracted pricing information in appropriate currency" - - "Handled any booking interface elements, popups, or navigation flows" - - "Results show flights for the correct dates (March 20-30, 2026)" - - "Successfully demonstrated iframe content extraction capabilities" - - "Booking interface status indicates successful page interaction" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify ANA Airlines flight search page loaded correctly" - - "Check that search parameters show SEA to NRT route" - - "Confirm flight results are displayed (may be in iframes)" - - "Ensure booking interface elements are functional" - - "Verify flight information is accessible and extractable" - -metadata: - tags: ["web-task", "iframe", "ana-airlines", "complex-booking", "international-flight", "airline-specific"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/jobs-001.yaml b/eval-server/nodejs/evals/web-task-agent/jobs-001.yaml deleted file mode 100644 index 7a6caa8..0000000 --- a/eval-server/nodejs/evals/web-task-agent/jobs-001.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# Job Search Workflow - Web Task Agent -id: "jobs-001" -name: "Job Search Workflow" -description: "Test web task agent orchestrating job search on LinkedIn" -enabled: true - -target: - url: "https://www.linkedin.com/jobs" - wait_for: "networkidle" - wait_timeout: 
5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for \"Software Engineer\" jobs in \"San Francisco\" and extract details for the first 5 results" - reasoning: "User wants to find job opportunities in tech industry" - extraction_schema: - type: "object" - properties: - jobs: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - company: - type: "string" - location: - type: "string" - salary: - type: "string" - description: - type: "string" - url: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Either used construct_direct_url for LinkedIn job search OR used traditional form interaction" - - "If using direct URL: constructed proper LinkedIn job search URL with keywords and location" - - "If using forms: delegated keyword and location input to action_agent" - - "Extracted job listings using extract_data" - - "Returned structured job data in readable text format (not JSON)" - - "Each job listing includes title, company, location, and other relevant fields" - - "Results are numbered or organized clearly for easy reading" - - "Demonstrated proper workflow orchestration for job search" - - "Never used direct browser interaction tools" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify LinkedIn job search results are displayed" - - "Check that search shows Software Engineer jobs in San Francisco" - - "Confirm job listings include company names and titles" - - "Ensure at least 5 job results are visible" - -metadata: - tags: ["web-task", "jobs", "linkedin", "search", "career", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/learning-001.yaml b/eval-server/nodejs/evals/web-task-agent/learning-001.yaml deleted file mode 100644 index 1e4c761..0000000 --- a/eval-server/nodejs/evals/web-task-agent/learning-001.yaml +++ /dev/null @@ -1,69 +0,0 @@ -# Online Course Search - Web Task Agent -id: "learning-001" -name: "Online Course Search" -description: "Test searching and extracting course information from learning platforms" -enabled: true - -target: - url: "https://www.coursera.org" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for \"Machine Learning\" courses and extract details for top 5 results" - reasoning: "Users want to compare courses across platforms for learning decisions" - extraction_schema: - type: "object" - properties: - courses: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - instructor: - type: "string" - university: - type: "string" - rating: - type: "string" - duration: - type: "string" - price: - type: "string" - description: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully found Machine Learning courses on Coursera" - - "Returned details for top 5 courses as requested" - - "Each course includes title, instructor, university, and rating" - - "Duration and pricing information included for each course" - - "Course descriptions or key topics are provided" - - "Results are presented in structured text format (not JSON)" - - "Courses are numbered (1-5) and well-organized for easy comparison" - - "Each course entry is clearly formatted and readable" - visual_verification: - enabled: true - 
capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Coursera search results for Machine Learning" - - "Check that courses show titles, instructors, and ratings" - - "Confirm course details include duration and pricing" - - "Ensure search results are relevant to Machine Learning" - -metadata: - tags: ["web-task", "education", "coursera", "courses", "learning", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/nav-001.yaml b/eval-server/nodejs/evals/web-task-agent/nav-001.yaml deleted file mode 100644 index bff519f..0000000 --- a/eval-server/nodejs/evals/web-task-agent/nav-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Site Navigation Workflow - Web Task Agent -id: "nav-001" -name: "Site Navigation Workflow" -description: "Test web task agent orchestrating navigation between different sections of a site" -enabled: true - -target: - url: "https://www.wikipedia.org" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 90000 - -input: - task: "Navigate to the Wikipedia homepage, search for \"artificial intelligence\", and find information about machine learning" - reasoning: "User is looking to explore Wikipedia content through structured navigation" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Orchestrated Wikipedia search via action_agent calls" - - "Navigated to artificial intelligence article through action_agent" - - "Located machine learning section via action_agent coordination" - - "Extracted relevant information about machine learning" - - "Demonstrated multi-step navigation workflow" - - "Maintained orchestration pattern throughout navigation" - - "Provided structured summary of found information" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify navigation reached artificial intelligence Wikipedia page" - - "Check that machine learning section or content is visible" - - "Confirm successful navigation through multiple page sections" - - "Ensure content related to machine learning is displayed" - -metadata: - tags: ["web-task", "navigation", "multi-step", "wikipedia", "content-exploration"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/news-001.yaml b/eval-server/nodejs/evals/web-task-agent/news-001.yaml deleted file mode 100644 index 4c29aed..0000000 --- a/eval-server/nodejs/evals/web-task-agent/news-001.yaml +++ /dev/null @@ -1,64 +0,0 @@ -# News Article Aggregation - Web Task Agent -id: "news-001" -name: "News Article Aggregation" -description: "Test aggregating news headlines and summaries from news sites" -enabled: true - -target: - url: "https://news.ycombinator.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Extract the top 10 Hacker News stories with titles, scores, and first few comments" - reasoning: "Users want automated news monitoring for research and awareness" - extraction_schema: - type: "object" - properties: - articles: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - score: - type: "number" - comments_count: - type: "number" - url: - type: "string" - top_comment: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - 
"Successfully extracted 10 Hacker News stories as requested" - - "Each story includes title, score, and comment count" - - "URLs are provided for each story" - - "Stories appear to be from the current top/front page" - - "Results are presented in clear, numbered text format (1-10), not JSON" - - "All required fields are present and properly formatted in readable text" - - "Each story is clearly separated and easy to read" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Hacker News stories are visible with scores" - - "Check that story titles and comment counts are shown" - - "Confirm top stories section is properly displayed" - - "Ensure story metadata is accessible for extraction" - -metadata: - tags: ["web-task", "news", "hackernews", "aggregation", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/realestate-001.yaml b/eval-server/nodejs/evals/web-task-agent/realestate-001.yaml deleted file mode 100644 index 5fd824e..0000000 --- a/eval-server/nodejs/evals/web-task-agent/realestate-001.yaml +++ /dev/null @@ -1,70 +0,0 @@ -# Real Estate Property Search - Web Task Agent -id: "realestate-001" -name: "Real Estate Property Search" -description: "Test property search workflow on real estate platforms" -enabled: true - -target: - url: "https://www.zillow.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for houses for sale in Austin, Texas under $500k and extract property details" - reasoning: "User wants to find affordable housing options in a specific location" - extraction_schema: - type: "object" - properties: - properties: - type: "array" - items: - type: "object" - properties: - address: - type: "string" - price: - type: "string" - bedrooms: - type: "number" - bathrooms: - type: "number" - sqft: - type: "string" - lot_size: - type: "string" - year_built: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Orchestrated location search via action_agent" - - "Delegated price filter setting to action_agent" - - "Coordinated property type selection through action_agent" - - "Applied search filters through proper action_agent calls" - - "Extracted property listings with extract_data" - - "Returned structured property data in readable text format (not JSON)" - - "Each property includes address, price, bedrooms, bathrooms, and other key details" - - "Properties are clearly numbered or organized for easy comparison" - - "Demonstrated complex real estate search workflow orchestration" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Zillow search results for Austin, Texas properties" - - "Check that properties shown are under $500k" - - "Confirm property listings show price, beds, baths info" - - "Ensure search results match the specified criteria" - -metadata: - tags: ["web-task", "real-estate", "zillow", "property-search", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/scroll-001.yaml b/eval-server/nodejs/evals/web-task-agent/scroll-001.yaml deleted file mode 100644 index 12a986f..0000000 --- a/eval-server/nodejs/evals/web-task-agent/scroll-001.yaml +++ /dev/null @@ -1,61 +0,0 @@ -# Infinite Scroll Content Loading 
- Web Task Agent -id: "scroll-001" -name: "Infinite Scroll Content Loading" -description: "Test web task agent handling infinite scroll pages to load more content" -enabled: true - -target: - url: "https://twitter.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Scroll down the Twitter feed to load at least 20 tweets and extract their content" - reasoning: "Testing infinite scroll functionality for dynamic content loading" - extraction_schema: - type: "object" - properties: - tweets: - type: "array" - items: - type: "object" - properties: - author: - type: "string" - content: - type: "string" - likes: - type: "string" - retweets: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully used scroll_page tool to scroll down the page" - - "Loaded additional content through scrolling actions" - - "Extracted at least 20 tweets from the feed" - - "Each tweet includes author and content information" - - "Demonstrated proper handling of dynamically loaded content" - - "Results are presented in clear, numbered text format" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify initial Twitter feed is loaded" - - "Check that scrolling action loaded additional tweets" - - "Confirm at least 20 tweets are visible after scrolling" - - "Ensure page scrolled down significantly from initial position" - -metadata: - tags: ["web-task", "scrolling", "infinite-scroll", "dynamic-content", "twitter"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/scroll-002.yaml b/eval-server/nodejs/evals/web-task-agent/scroll-002.yaml deleted file mode 100644 index dce0156..0000000 --- a/eval-server/nodejs/evals/web-task-agent/scroll-002.yaml +++ /dev/null @@ -1,65 +0,0 @@ -# Product Review Scrolling - Web Task Agent -id: "scroll-002" -name: "Product Review Scrolling" -description: "Test scrolling to load more product reviews on e-commerce sites" -enabled: true - -target: - url: "https://www.amazon.com/dp/B09B8V1LZ3" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Scroll down to the reviews section and load more reviews by scrolling, then extract review details" - reasoning: "Users need to see multiple reviews beyond initial visible ones" - extraction_schema: - type: "object" - properties: - reviews: - type: "array" - items: - type: "object" - properties: - rating: - type: "string" - title: - type: "string" - author: - type: "string" - date: - type: "string" - verified: - type: "boolean" - content: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Used scroll_page tool to navigate to reviews section" - - "Scrolled within reviews area to load additional reviews" - - "Extracted multiple product reviews with ratings" - - "Each review includes rating, author, and content" - - "Successfully handled lazy-loaded review content" - - "Presented reviews in structured, readable format" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Amazon product page is loaded" - - "Check that page scrolled to reviews section" - - "Confirm additional reviews loaded after scrolling" - - "Ensure review content is fully visible" - 
-metadata: - tags: ["web-task", "scrolling", "reviews", "amazon", "e-commerce"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/scroll-003.yaml b/eval-server/nodejs/evals/web-task-agent/scroll-003.yaml deleted file mode 100644 index df7eaba..0000000 --- a/eval-server/nodejs/evals/web-task-agent/scroll-003.yaml +++ /dev/null @@ -1,61 +0,0 @@ -# News Article Progressive Loading - Web Task Agent -id: "scroll-003" -name: "News Article Progressive Loading" -description: "Test scrolling through news sites that load articles progressively" -enabled: true - -target: - url: "https://medium.com/topic/technology" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Scroll down to load more technology articles and extract titles and authors for at least 15 articles" - reasoning: "Testing progressive content loading on news/blog platforms" - extraction_schema: - type: "object" - properties: - articles: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - author: - type: "string" - reading_time: - type: "string" - preview: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Used scroll_page tool multiple times to load content" - - "Successfully loaded at least 15 articles through scrolling" - - "Extracted article titles and author information" - - "Handled Medium's progressive loading mechanism" - - "Articles are from technology topic as requested" - - "Results presented in clear, numbered format" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Medium technology page is loaded" - - "Check that initial articles are visible" - - "Confirm scrolling loaded additional articles" - - "Ensure at least 15 articles are visible after scrolling" - -metadata: - tags: ["web-task", "scrolling", "progressive-loading", "medium", "articles"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/scroll-004.yaml b/eval-server/nodejs/evals/web-task-agent/scroll-004.yaml deleted file mode 100644 index e9b3534..0000000 --- a/eval-server/nodejs/evals/web-task-agent/scroll-004.yaml +++ /dev/null @@ -1,61 +0,0 @@ -# Search Results Infinite Scroll - Web Task Agent -id: "scroll-004" -name: "Search Results Infinite Scroll" -description: "Test handling search results that use infinite scroll instead of pagination" -enabled: true - -target: - url: "https://www.pinterest.com/search/pins/?q=web%20design" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for \"web design\" pins and scroll to load at least 30 results, then extract pin details" - reasoning: "Testing infinite scroll on visual search platforms" - extraction_schema: - type: "object" - properties: - pins: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - description: - type: "string" - saves: - type: "string" - source: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully performed search for \"web design\" pins" - - "Used scroll_page tool to trigger infinite scroll loading" - - "Loaded at least 30 pins through scrolling actions" - - "Extracted pin titles and metadata" - - "Handled Pinterest's masonry 
layout and lazy loading" - - "Results are well-organized and readable" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Pinterest search results for web design" - - "Check initial pins are displayed" - - "Confirm scrolling loaded many more pins" - - "Ensure grid layout shows 30+ pins after scrolling" - -metadata: - tags: ["web-task", "scrolling", "infinite-scroll", "pinterest", "visual-search"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/scroll-005.yaml b/eval-server/nodejs/evals/web-task-agent/scroll-005.yaml deleted file mode 100644 index 47c8769..0000000 --- a/eval-server/nodejs/evals/web-task-agent/scroll-005.yaml +++ /dev/null @@ -1,73 +0,0 @@ -# Google Flights Scroll and Show More - Web Task Agent -id: "scroll-005" -name: "Google Flights Scroll and Show More" -description: "Test scrolling and clicking \"Show more flights\" button on Google Flights to load additional flight options" -enabled: true - -target: - url: "https://www.google.com/travel/flights?sca_esv=646eedf97dcc8cf2&source=flun&uitype=cuAA&hl=en&gl=us&curr=USD&tfs=CAEQAhoeEgoyMDI2LTAzLTIwagcIARIDU0VBcgcIARIDTlJUGh4SCjIwMjYtMDMtMzBqBwgBEgNOUlRyBwgBEgNTRUF6aENqUklhVFJJTVVwVlZVOXpNakJCUTJodGVFRkNSeTB0TFMwdExTMHRjR3BpYjI4eE0wRkJRVUZCUjJoc1lsWlZRV2RYUlZsQkVnTmpTMFVhQ3dqUXNnVVFBaG9EVlZORU9EQncwTElG" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Extract the initial flight results, then scroll down and click \"Show more flights\" button to load additional flights. Extract at least 20 total flight options from Seattle to Tokyo." - reasoning: "Testing combination of scrolling and button clicking to load more flight results on Google Flights" - extraction_schema: - type: "object" - properties: - flights: - type: "array" - items: - type: "object" - properties: - airline: - type: "string" - departure_time: - type: "string" - arrival_time: - type: "string" - duration: - type: "string" - stops: - type: "string" - price: - type: "string" - aircraft: - type: "string" - total_flights_found: - type: "number" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully extracted initial flight results from Google Flights" - - "Used scroll_page tool to scroll down the flight results list" - - "Located and clicked \"Show more flights\" button using action_agent" - - "Loaded additional flight options beyond the initial set" - - "Extracted at least 20 total flights from Seattle (SEA) to Tokyo (NRT)" - - "Each flight includes airline, times, duration, stops, and price" - - "Flights are for the correct dates (March 20-30, 2026)" - - "Results are presented in clear, numbered format" - - "Successfully combined scrolling and clicking actions to load more content" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Google Flights page shows SEA to NRT flights" - - "Check that initial flight results are displayed" - - "Confirm scrolling occurred and \"Show more flights\" button was visible" - - "Ensure additional flights loaded after clicking the button" - - "Verify at least 20 flight options are now visible" - -metadata: - tags: ["web-task", "scrolling", "google-flights", "click-action", "load-more", "travel"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff 
--git a/eval-server/nodejs/evals/web-task-agent/search-001.yaml b/eval-server/nodejs/evals/web-task-agent/search-001.yaml deleted file mode 100644 index da3a4eb..0000000 --- a/eval-server/nodejs/evals/web-task-agent/search-001.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# Basic web task search evaluation (matches DevTools test case) -id: "search-001" -name: "Site-Specific Search Task" -description: "Test web task agent orchestrating a search workflow on a specific site" -enabled: true - -target: - url: "chrome://new-tab-page" - -tool: "web_task_agent" -timeout: 60000 - -input: - task: "Search Google for \"Chrome DevTools automation\" and extract the top 3 search results" - reasoning: "Testing basic site-specific search workflow orchestration" - context: "Need to demonstrate web_task_agent can coordinate multiple action_agent calls for a complete search workflow" - - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4o" - criteria: - - "Successfully returned exactly 3 search results in structured text format" - - "Each result is numbered (1., 2., 3.) and contains a title related to \"Chrome DevTools automation\"" - - "Each result includes a URL in the format \"URL: [link]\"" - - "Results are presented in a clear, readable text format (not JSON)" - - "Response includes a brief summary or conclusion statement" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify search was completed and results page is showing" - - "Check that search results are related to \"Chrome DevTools automation\"" - - "Confirm at least 3 search results are visible on the page" - - "Ensure the search workflow was completed successfully" - -metadata: - tags: ["web-task", "orchestration", "search", "workflow", "google", "basic"] - priority: "normal" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/social-001.yaml b/eval-server/nodejs/evals/web-task-agent/social-001.yaml deleted file mode 100644 index a35ebfd..0000000 --- a/eval-server/nodejs/evals/web-task-agent/social-001.yaml +++ /dev/null @@ -1,60 +0,0 @@ -# Social Media Content Extraction - Web Task Agent -id: "social-001" -name: "Social Media Content Extraction" -description: "Test extracting trending topics and posts from social media" -enabled: true - -target: - url: "https://twitter.com/explore" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Extract the top 5 trending topics from Twitter/X explore page" - reasoning: "User wants to stay updated on current trends" - extraction_schema: - type: "object" - properties: - trends: - type: "array" - items: - type: "object" - properties: - topic: - type: "string" - posts_count: - type: "string" - category: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully accessed Twitter/X explore page and found trending topics" - - "Returned exactly 5 trending topics as requested" - - "Each topic includes the trend name/hashtag" - - "Post counts or metrics are included when available" - - "Topics are current/recent trends (not outdated)" - - "Results are presented in clear, numbered text format (not JSON)" - - "Each trend is properly numbered (1., 2., 3., etc.) 
for readability" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Twitter/X explore page is loaded" - - "Check that trending topics section is visible" - - "Confirm trending topics show names and post counts" - - "Ensure page shows current trending content" - -metadata: - tags: ["web-task", "social-media", "twitter", "trends", "extraction", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-booking-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-booking-001.yaml deleted file mode 100644 index a2842b6..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-booking-001.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Hotel Search Workflow - Web Task Agent -id: "web-task-agent-booking-001" -name: "Hotel Search Workflow" -description: "Test web task agent orchestrating complex multi-step booking search" -enabled: true - -target: - url: "https://www.booking.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for hotels in San Francisco for 2 adults, check-in March 15, check-out March 17" - reasoning: "Customer is looking for travel booking" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully searched for hotels in San Francisco" - - "Results show hotels available for March 15-17 dates" - - "Guest count of 2 adults is reflected in the search results" - - "Returned multiple hotel options with relevant details" - - "Each hotel includes essential information (name, price, location)" - - "Results are presented in a clear, readable format" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify hotel search results are displayed for San Francisco" - - "Check that dates March 15-17 are correctly selected" - - "Confirm guest count shows 2 adults" - - "Ensure search results show hotels with availability for specified dates" - -metadata: - tags: ["web-task", "booking", "workflow", "multi-step", "travel", "complex"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-ecommerce-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-ecommerce-001.yaml deleted file mode 100644 index a6b9735..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-ecommerce-001.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# E-commerce web task evaluation (matches DevTools test case) -id: "web-task-agent-ecommerce-001" -name: "E-commerce Product Search" -description: "Test web task agent handling product search on shopping site" -enabled: true - -target: - url: "https://www.amazon.com" - -tool: "web_task_agent" -timeout: 90000 - -input: - task: "Search Amazon for \"wireless headphones\" and find products under $100" - reasoning: "Testing e-commerce search workflow with price filtering" - context: "User wants to find wireless headphones with specific price constraint" - extraction_schema: - type: "object" - properties: - products: - type: "array" - items: - type: "object" - properties: - name: - type: "string" - price: - type: "string" - rating: - type: "string" - url: - type: "string" - - -validation: - type: "hybrid" - llm_judge: - model: "gpt-4o" - criteria: - - "Successfully navigated to product search" - 
- "Applied appropriate filters correctly" - - "Extracted product details accurately" - - "Provided meaningful comparison of features" - - "Stayed within specified price range" - snapshot: - structure_only: true - exclude_paths: - - "timestamp" - - "sessionId" - -metadata: - tags: ["web-task", "multi-step", "ecommerce", "search"] - priority: "high" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-error-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-error-001.yaml deleted file mode 100644 index cc5c7df..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-error-001.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Error Recovery Workflow - Web Task Agent -id: "web-task-agent-error-001" -name: "Error Recovery Workflow" -description: "Test web task agent handling action_agent failures and retry logic" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for \"nonexistent test query 12345\" and handle any issues that arise" - reasoning: "Customer is asking for this response" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Attempted to search for the unusual query \"nonexistent test query 12345\"" - - "Either found some results OR provided clear explanation why no results were found" - - "Response handles the edge case gracefully without errors" - - "If no results found, suggested alternative actions or explanations" - - "Maintained professional tone despite unusual request" - - "Final output is coherent and helpful to the user" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Check if search was attempted despite unusual query" - - "Verify error handling did not break the page interaction" - - "Confirm agent attempted to complete the task or provided clear error info" - - "Ensure page is still functional after error recovery attempts" - -metadata: - tags: ["web-task", "error-recovery", "retry", "orchestration", "robustness"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-extract-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-extract-001.yaml deleted file mode 100644 index 14eadcb..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-extract-001.yaml +++ /dev/null @@ -1,60 +0,0 @@ -# Structured Data Extraction - Web Task Agent -id: "web-task-agent-extract-001" -name: "Structured Data Extraction" -description: "Test web task agent extracting structured data from search results" -enabled: true - -target: - url: "https://news.ycombinator.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Extract the top 5 Hacker News stories with their titles, scores, and comment counts" - reasoning: "User is looking to understand the top stories on Hacker News" - extraction_schema: - type: "object" - properties: - stories: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - score: - type: "number" - comments: - type: "number" - url: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully returned exactly 5 Hacker News stories in structured text format" - - "Each story is numbered (1., 2., 
3., 4., 5.) with title, score, comments, and URL" - - "Results are presented in readable text format similar to the example provided" - - "Response includes all required fields: title, score, comments count, URL" - - "Maintained proper orchestration pattern throughout the extraction process" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Hacker News homepage is loaded and displaying stories" - - "Check that top stories are visible with scores and comment counts" - - "Confirm story titles and metadata are clearly displayed" - - "Ensure page structure allows for data extraction" - -metadata: - tags: ["web-task", "data-extraction", "structured-data", "hackernews", "schema"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-finance-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-finance-001.yaml deleted file mode 100644 index 8f7a2b0..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-finance-001.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# Stock Information Research - Web Task Agent -id: "web-task-agent-finance-001" -name: "Stock Information Research" -description: "Test extracting stock prices and financial information" -enabled: true - -target: - url: "https://finance.yahoo.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for Apple (AAPL) stock information and extract current price, market cap, and recent performance" - reasoning: "Users need automated financial data collection for investment decisions" - extraction_schema: - type: "object" - properties: - stock_info: - type: "object" - properties: - symbol: - type: "string" - company_name: - type: "string" - current_price: - type: "string" - change: - type: "string" - change_percent: - type: "string" - market_cap: - type: "string" - pe_ratio: - type: "string" - volume: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully found Apple (AAPL) stock information" - - "Current stock price is clearly stated" - - "Market cap information is included" - - "Price change and percentage change are provided" - - "Additional metrics (PE ratio, volume) included when available" - - "Financial data is current and presented in readable text format (not JSON)" - - "Stock information is well-organized and easy to understand" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Yahoo Finance shows Apple (AAPL) stock page" - - "Check that current stock price and change are visible" - - "Confirm market cap and trading volume are displayed" - - "Ensure financial metrics and charts are shown" - -metadata: - tags: ["web-task", "finance", "stocks", "yahoo-finance", "investment", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-flight-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-flight-001.yaml deleted file mode 100644 index a17883f..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-flight-001.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Complex Flight Search - Web Task Agent -id: "web-task-agent-flight-001" -name: "Complex Flight Search" -description: "Test web task agent handling complex flight 
search with multiple criteria" -enabled: true - -target: - url: "https://www.kayak.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for round-trip flights from Seattle (SEA) to Tokyo (NRT) departing March 20, returning March 30" - reasoning: "Customer is looking for finding the best flight options" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully found round-trip flights from Seattle (SEA) to Tokyo (NRT)" - - "Flight results show March 20 departure date" - - "Flight results show March 30 return date" - - "Returned multiple flight options with airlines and prices" - - "Each flight includes essential details (times, airlines, prices)" - - "Results clearly distinguish between outbound and return flights" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify flight search results are displayed" - - "Check SEA to NRT route is correctly selected" - - "Confirm dates March 20 departure and March 30 return" - - "Ensure flight options are showing with prices and airlines" - -metadata: - tags: ["web-task", "flight", "travel", "multi-step", "kayak", "round-trip"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-food-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-food-001.yaml deleted file mode 100644 index 32ee646..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-food-001.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# Restaurant Search and Menu Extraction - Web Task Agent -id: "web-task-agent-food-001" -name: "Restaurant Search and Menu Extraction" -description: "Test searching restaurants and extracting menu information" -enabled: true - -target: - url: "https://www.yelp.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for \"Italian restaurants near me\" in San Francisco and extract restaurant details" - reasoning: "Users want to quickly compare restaurants, menus, and reviews" - extraction_schema: - type: "object" - properties: - restaurants: - type: "array" - items: - type: "object" - properties: - name: - type: "string" - rating: - type: "string" - price_range: - type: "string" - cuisine: - type: "string" - address: - type: "string" - phone: - type: "string" - hours: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully found Italian restaurants in San Francisco" - - "Each restaurant includes name, rating, and price range" - - "Location/address information is provided for each restaurant" - - "Contact details (phone/hours) included when available" - - "All restaurants listed serve Italian cuisine" - - "Results are presented in clear, structured text format (not JSON)" - - "Restaurants are numbered or organized clearly for easy comparison" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Yelp search results for Italian restaurants" - - "Check that restaurants show ratings and price ranges" - - "Confirm location filter shows San Francisco results" - - "Ensure restaurant listings include contact information" - -metadata: - tags: ["web-task", "restaurants", "yelp", "food", "local-search", "popular"] - priority: "high" - 
owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-iframe-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-iframe-001.yaml deleted file mode 100644 index 30b0eac..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-iframe-001.yaml +++ /dev/null @@ -1,83 +0,0 @@ -# ANA Airlines Iframe Content Extraction - Web Task Agent -id: "web-task-agent-iframe-001" -name: "ANA Airlines Iframe Content Extraction" -description: "Test web task agent handling iframe-heavy airline booking sites like ANA Airlines" -enabled: true - -target: - url: "https://aswbe.ana.co.jp/webapps/reservation/flight-search?CONNECTION_KIND=SEA&LANG=en&hiddenSearchMode=ROUND_TRIP&departureDate:field=20260320&returnDate:field=20260330&departureAirportCode:field=SEA&arrivalAirportCode:field=NRT&adultCount=1&youngAdultCount=0&childCount=0&infantCount=0&boardingClass=INTY001&searchFlag=1" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Navigate the ANA Airlines flight search page and extract available flight options from Seattle (SEA) to Tokyo Narita (NRT) for March 20-30, 2026. Handle any iframe content and booking interface elements." - reasoning: "Testing iframe content extraction and complex airline booking site navigation" - extraction_schema: - type: "object" - properties: - flights: - type: "array" - items: - type: "object" - properties: - flight_number: - type: "string" - airline: - type: "string" - departure_time: - type: "string" - arrival_time: - type: "string" - departure_date: - type: "string" - arrival_date: - type: "string" - duration: - type: "string" - aircraft: - type: "string" - price: - type: "string" - cabin_class: - type: "string" - stops: - type: "string" - booking_interface_status: - type: "string" - iframe_content_found: - type: "boolean" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully navigated ANA Airlines booking interface" - - "Handled iframe content correctly (iframe_content_found should be true if iframes detected)" - - "Extracted flight information from ANA flight search results" - - "Flight details include ANA flight numbers and accurate route (SEA to NRT)" - - "Extracted pricing information in appropriate currency" - - "Handled any booking interface elements, popups, or navigation flows" - - "Results show flights for the correct dates (March 20-30, 2026)" - - "Successfully demonstrated iframe content extraction capabilities" - - "Booking interface status indicates successful page interaction" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify ANA Airlines flight search page loaded correctly" - - "Check that search parameters show SEA to NRT route" - - "Confirm flight results are displayed (may be in iframes)" - - "Ensure booking interface elements are functional" - - "Verify flight information is accessible and extractable" - -metadata: - tags: ["web-task", "iframe", "ana-airlines", "complex-booking", "international-flight", "airline-specific"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-jobs-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-jobs-001.yaml deleted file mode 100644 index 2c72df3..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-jobs-001.yaml 
+++ /dev/null @@ -1,68 +0,0 @@ -# Job Search Workflow - Web Task Agent -id: "web-task-agent-jobs-001" -name: "Job Search Workflow" -description: "Test web task agent orchestrating job search on LinkedIn" -enabled: true - -target: - url: "https://www.linkedin.com/jobs" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for \"Software Engineer\" jobs in \"San Francisco\" and extract details for the first 5 results" - reasoning: "User wants to find job opportunities in tech industry" - extraction_schema: - type: "object" - properties: - jobs: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - company: - type: "string" - location: - type: "string" - salary: - type: "string" - description: - type: "string" - url: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Either used construct_direct_url for LinkedIn job search OR used traditional form interaction" - - "If using direct URL: constructed proper LinkedIn job search URL with keywords and location" - - "If using forms: delegated keyword and location input to action_agent" - - "Extracted job listings using extract_data" - - "Returned structured job data in readable text format (not JSON)" - - "Each job listing includes title, company, location, and other relevant fields" - - "Results are numbered or organized clearly for easy reading" - - "Demonstrated proper workflow orchestration for job search" - - "Never used direct browser interaction tools" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify LinkedIn job search results are displayed" - - "Check that search shows Software Engineer jobs in San Francisco" - - "Confirm job listings include company names and titles" - - "Ensure at least 5 job results are visible" - -metadata: - tags: ["web-task", "jobs", "linkedin", "search", "career", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-learning-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-learning-001.yaml deleted file mode 100644 index 8dcdc7d..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-learning-001.yaml +++ /dev/null @@ -1,69 +0,0 @@ -# Online Course Search - Web Task Agent -id: "web-task-agent-learning-001" -name: "Online Course Search" -description: "Test searching and extracting course information from learning platforms" -enabled: true - -target: - url: "https://www.coursera.org" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for \"Machine Learning\" courses and extract details for top 5 results" - reasoning: "Users want to compare courses across platforms for learning decisions" - extraction_schema: - type: "object" - properties: - courses: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - instructor: - type: "string" - university: - type: "string" - rating: - type: "string" - duration: - type: "string" - price: - type: "string" - description: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully found Machine Learning courses on Coursera" - - "Returned details for top 5 courses as requested" - - "Each course includes title, instructor, university, and 
rating" - - "Duration and pricing information included for each course" - - "Course descriptions or key topics are provided" - - "Results are presented in structured text format (not JSON)" - - "Courses are numbered (1-5) and well-organized for easy comparison" - - "Each course entry is clearly formatted and readable" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Coursera search results for Machine Learning" - - "Check that courses show titles, instructors, and ratings" - - "Confirm course details include duration and pricing" - - "Ensure search results are relevant to Machine Learning" - -metadata: - tags: ["web-task", "education", "coursera", "courses", "learning", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-nav-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-nav-001.yaml deleted file mode 100644 index fdee2f4..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-nav-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Site Navigation Workflow - Web Task Agent -id: "web-task-agent-nav-001" -name: "Site Navigation Workflow" -description: "Test web task agent orchestrating navigation between different sections of a site" -enabled: true - -target: - url: "https://www.wikipedia.org" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 90000 - -input: - task: "Navigate to the Wikipedia homepage, search for \"artificial intelligence\", and find information about machine learning" - reasoning: "User is looking to explore Wikipedia content through structured navigation" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Orchestrated Wikipedia search via action_agent calls" - - "Navigated to artificial intelligence article through action_agent" - - "Located machine learning section via action_agent coordination" - - "Extracted relevant information about machine learning" - - "Demonstrated multi-step navigation workflow" - - "Maintained orchestration pattern throughout navigation" - - "Provided structured summary of found information" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify navigation reached artificial intelligence Wikipedia page" - - "Check that machine learning section or content is visible" - - "Confirm successful navigation through multiple page sections" - - "Ensure content related to machine learning is displayed" - -metadata: - tags: ["web-task", "navigation", "multi-step", "wikipedia", "content-exploration"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-news-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-news-001.yaml deleted file mode 100644 index d9e1934..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-news-001.yaml +++ /dev/null @@ -1,64 +0,0 @@ -# News Article Aggregation - Web Task Agent -id: "web-task-agent-news-001" -name: "News Article Aggregation" -description: "Test aggregating news headlines and summaries from news sites" -enabled: true - -target: - url: "https://news.ycombinator.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Extract the top 10 Hacker News stories with titles, scores, 
and first few comments" - reasoning: "Users want automated news monitoring for research and awareness" - extraction_schema: - type: "object" - properties: - articles: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - score: - type: "number" - comments_count: - type: "number" - url: - type: "string" - top_comment: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully extracted 10 Hacker News stories as requested" - - "Each story includes title, score, and comment count" - - "URLs are provided for each story" - - "Stories appear to be from the current top/front page" - - "Results are presented in clear, numbered text format (1-10), not JSON" - - "All required fields are present and properly formatted in readable text" - - "Each story is clearly separated and easy to read" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Hacker News stories are visible with scores" - - "Check that story titles and comment counts are shown" - - "Confirm top stories section is properly displayed" - - "Ensure story metadata is accessible for extraction" - -metadata: - tags: ["web-task", "news", "hackernews", "aggregation", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-realestate-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-realestate-001.yaml deleted file mode 100644 index f22bc13..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-realestate-001.yaml +++ /dev/null @@ -1,70 +0,0 @@ -# Real Estate Property Search - Web Task Agent -id: "web-task-agent-realestate-001" -name: "Real Estate Property Search" -description: "Test property search workflow on real estate platforms" -enabled: true - -target: - url: "https://www.zillow.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for houses for sale in Austin, Texas under $500k and extract property details" - reasoning: "User wants to find affordable housing options in a specific location" - extraction_schema: - type: "object" - properties: - properties: - type: "array" - items: - type: "object" - properties: - address: - type: "string" - price: - type: "string" - bedrooms: - type: "number" - bathrooms: - type: "number" - sqft: - type: "string" - lot_size: - type: "string" - year_built: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Orchestrated location search via action_agent" - - "Delegated price filter setting to action_agent" - - "Coordinated property type selection through action_agent" - - "Applied search filters through proper action_agent calls" - - "Extracted property listings with extract_data" - - "Returned structured property data in readable text format (not JSON)" - - "Each property includes address, price, bedrooms, bathrooms, and other key details" - - "Properties are clearly numbered or organized for easy comparison" - - "Demonstrated complex real estate search workflow orchestration" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Zillow search results for Austin, Texas properties" - - "Check that properties shown are under $500k" - - "Confirm property listings show price, beds, 
baths info" - - "Ensure search results match the specified criteria" - -metadata: - tags: ["web-task", "real-estate", "zillow", "property-search", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-001.yaml deleted file mode 100644 index 6fd0f6e..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-001.yaml +++ /dev/null @@ -1,61 +0,0 @@ -# Infinite Scroll Content Loading - Web Task Agent -id: "web-task-agent-scroll-001" -name: "Infinite Scroll Content Loading" -description: "Test web task agent handling infinite scroll pages to load more content" -enabled: true - -target: - url: "https://twitter.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Scroll down the Twitter feed to load at least 20 tweets and extract their content" - reasoning: "Testing infinite scroll functionality for dynamic content loading" - extraction_schema: - type: "object" - properties: - tweets: - type: "array" - items: - type: "object" - properties: - author: - type: "string" - content: - type: "string" - likes: - type: "string" - retweets: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully used scroll_page tool to scroll down the page" - - "Loaded additional content through scrolling actions" - - "Extracted at least 20 tweets from the feed" - - "Each tweet includes author and content information" - - "Demonstrated proper handling of dynamically loaded content" - - "Results are presented in clear, numbered text format" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify initial Twitter feed is loaded" - - "Check that scrolling action loaded additional tweets" - - "Confirm at least 20 tweets are visible after scrolling" - - "Ensure page scrolled down significantly from initial position" - -metadata: - tags: ["web-task", "scrolling", "infinite-scroll", "dynamic-content", "twitter"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-002.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-002.yaml deleted file mode 100644 index d5d060a..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-002.yaml +++ /dev/null @@ -1,65 +0,0 @@ -# Product Review Scrolling - Web Task Agent -id: "web-task-agent-scroll-002" -name: "Product Review Scrolling" -description: "Test scrolling to load more product reviews on e-commerce sites" -enabled: true - -target: - url: "https://www.amazon.com/dp/B08N5WRWNW" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Scroll down to the reviews section and load more reviews by scrolling, then extract review details" - reasoning: "Users need to see multiple reviews beyond initial visible ones" - extraction_schema: - type: "object" - properties: - reviews: - type: "array" - items: - type: "object" - properties: - rating: - type: "string" - title: - type: "string" - author: - type: "string" - date: - type: "string" - verified: - type: "boolean" - content: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Used 
scroll_page tool to navigate to reviews section" - - "Scrolled within reviews area to load additional reviews" - - "Extracted multiple product reviews with ratings" - - "Each review includes rating, author, and content" - - "Successfully handled lazy-loaded review content" - - "Presented reviews in structured, readable format" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Amazon product page is loaded" - - "Check that page scrolled to reviews section" - - "Confirm additional reviews loaded after scrolling" - - "Ensure review content is fully visible" - -metadata: - tags: ["web-task", "scrolling", "reviews", "amazon", "e-commerce"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-003.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-003.yaml deleted file mode 100644 index f435017..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-003.yaml +++ /dev/null @@ -1,61 +0,0 @@ -# News Article Progressive Loading - Web Task Agent -id: "web-task-agent-scroll-003" -name: "News Article Progressive Loading" -description: "Test scrolling through news sites that load articles progressively" -enabled: true - -target: - url: "https://medium.com/topic/technology" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Scroll down to load more technology articles and extract titles and authors for at least 15 articles" - reasoning: "Testing progressive content loading on news/blog platforms" - extraction_schema: - type: "object" - properties: - articles: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - author: - type: "string" - reading_time: - type: "string" - preview: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Used scroll_page tool multiple times to load content" - - "Successfully loaded at least 15 articles through scrolling" - - "Extracted article titles and author information" - - "Handled Medium's progressive loading mechanism" - - "Articles are from technology topic as requested" - - "Results presented in clear, numbered format" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Medium technology page is loaded" - - "Check that initial articles are visible" - - "Confirm scrolling loaded additional articles" - - "Ensure at least 15 articles are visible after scrolling" - -metadata: - tags: ["web-task", "scrolling", "progressive-loading", "medium", "articles"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-004.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-004.yaml deleted file mode 100644 index 5970947..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-004.yaml +++ /dev/null @@ -1,61 +0,0 @@ -# Search Results Infinite Scroll - Web Task Agent -id: "web-task-agent-scroll-004" -name: "Search Results Infinite Scroll" -description: "Test handling search results that use infinite scroll instead of pagination" -enabled: true - -target: - url: "https://www.pinterest.com/search/pins/?q=web%20design" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - 
-input: - task: "Search for \"web design\" pins and scroll to load at least 30 results, then extract pin details" - reasoning: "Testing infinite scroll on visual search platforms" - extraction_schema: - type: "object" - properties: - pins: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - description: - type: "string" - saves: - type: "string" - source: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully performed search for \"web design\" pins" - - "Used scroll_page tool to trigger infinite scroll loading" - - "Loaded at least 30 pins through scrolling actions" - - "Extracted pin titles and metadata" - - "Handled Pinterest's masonry layout and lazy loading" - - "Results are well-organized and readable" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Pinterest search results for web design" - - "Check initial pins are displayed" - - "Confirm scrolling loaded many more pins" - - "Ensure grid layout shows 30+ pins after scrolling" - -metadata: - tags: ["web-task", "scrolling", "infinite-scroll", "pinterest", "visual-search"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-005.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-005.yaml deleted file mode 100644 index e603ff7..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-005.yaml +++ /dev/null @@ -1,73 +0,0 @@ -# Google Flights Scroll and Show More - Web Task Agent -id: "web-task-agent-scroll-005" -name: "Google Flights Scroll and Show More" -description: "Test scrolling and clicking \"Show more flights\" button on Google Flights to load additional flight options" -enabled: true - -target: - url: "https://www.google.com/travel/flights?sca_esv=646eedf97dcc8cf2&source=flun&uitype=cuAA&hl=en&gl=us&curr=USD&tfs=CAEQAhoeEgoyMDI2LTAzLTIwagcIARIDU0VBcgcIARIDTlJUGh4SCjIwMjYtMDMtMzBqBwgBEgNOUlRyBwgBEgNTRUF6aENqUklhVFJJTVVwVlZVOXpNakJCUTJodGVFRkNSeTB0TFMwdExTMHRjR3BpYjI4eE0wRkJRVUZCUjJoc1lsWlZRV2RYUlZsQkVnTmpTMFVhQ3dqUXNnVVFBaG9EVlZORU9EQncwTElG" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Extract the initial flight results, then scroll down and click \"Show more flights\" button to load additional flights. Extract at least 20 total flight options from Seattle to Tokyo." 
- reasoning: "Testing combination of scrolling and button clicking to load more flight results on Google Flights" - extraction_schema: - type: "object" - properties: - flights: - type: "array" - items: - type: "object" - properties: - airline: - type: "string" - departure_time: - type: "string" - arrival_time: - type: "string" - duration: - type: "string" - stops: - type: "string" - price: - type: "string" - aircraft: - type: "string" - total_flights_found: - type: "number" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully extracted initial flight results from Google Flights" - - "Used scroll_page tool to scroll down the flight results list" - - "Located and clicked \"Show more flights\" button using action_agent" - - "Loaded additional flight options beyond the initial set" - - "Extracted at least 20 total flights from Seattle (SEA) to Tokyo (NRT)" - - "Each flight includes airline, times, duration, stops, and price" - - "Flights are for the correct dates (March 20-30, 2026)" - - "Results are presented in clear, numbered format" - - "Successfully combined scrolling and clicking actions to load more content" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Google Flights page shows SEA to NRT flights" - - "Check that initial flight results are displayed" - - "Confirm scrolling occurred and \"Show more flights\" button was visible" - - "Ensure additional flights loaded after clicking the button" - - "Verify at least 20 flight options are now visible" - -metadata: - tags: ["web-task", "scrolling", "google-flights", "click-action", "load-more", "travel"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-search-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-search-001.yaml deleted file mode 100644 index 50dc920..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-search-001.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# Basic web task search evaluation (matches DevTools test case) -id: "web-task-agent-search-001" -name: "Site-Specific Search Task" -description: "Test web task agent orchestrating a search workflow on a specific site" -enabled: true - -target: - url: "chrome://new-tab-page" - -tool: "web_task_agent" -timeout: 60000 - -input: - task: "Search Google for \"Chrome DevTools automation\" and extract the top 3 search results" - reasoning: "Testing basic site-specific search workflow orchestration" - context: "Need to demonstrate web_task_agent can coordinate multiple action_agent calls for a complete search workflow" - - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4o" - criteria: - - "Successfully returned exactly 3 search results in structured text format" - - "Each result is numbered (1., 2., 3.) 
and contains a title related to \"Chrome DevTools automation\"" - - "Each result includes a URL in the format \"URL: [link]\"" - - "Results are presented in a clear, readable text format (not JSON)" - - "Response includes a brief summary or conclusion statement" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify search was completed and results page is showing" - - "Check that search results are related to \"Chrome DevTools automation\"" - - "Confirm at least 3 search results are visible on the page" - - "Ensure the search workflow was completed successfully" - -metadata: - tags: ["web-task", "orchestration", "search", "workflow", "google", "basic"] - priority: "normal" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-social-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-social-001.yaml deleted file mode 100644 index f1f969e..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-social-001.yaml +++ /dev/null @@ -1,60 +0,0 @@ -# Social Media Content Extraction - Web Task Agent -id: "web-task-agent-social-001" -name: "Social Media Content Extraction" -description: "Test extracting trending topics and posts from social media" -enabled: true - -target: - url: "https://twitter.com/explore" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Extract the top 5 trending topics from Twitter/X explore page" - reasoning: "User wants to stay updated on current trends" - extraction_schema: - type: "object" - properties: - trends: - type: "array" - items: - type: "object" - properties: - topic: - type: "string" - posts_count: - type: "string" - category: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully accessed Twitter/X explore page and found trending topics" - - "Returned exactly 5 trending topics as requested" - - "Each topic includes the trend name/hashtag" - - "Post counts or metrics are included when available" - - "Topics are current/recent trends (not outdated)" - - "Results are presented in clear, numbered text format (not JSON)" - - "Each trend is properly numbered (1., 2., 3., etc.) 
for readability" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Twitter/X explore page is loaded" - - "Check that trending topics section is visible" - - "Confirm trending topics show names and post counts" - - "Ensure page shows current trending content" - -metadata: - tags: ["web-task", "social-media", "twitter", "trends", "extraction", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/examples/clients/1233ae25-9f9e-4f77-924d-865f7d615cef.yaml b/eval-server/nodejs/examples/clients/1233ae25-9f9e-4f77-924d-865f7d615cef.yaml deleted file mode 100644 index f5b865f..0000000 --- a/eval-server/nodejs/examples/clients/1233ae25-9f9e-4f77-924d-865f7d615cef.yaml +++ /dev/null @@ -1,12 +0,0 @@ -client: - id: 1233ae25-9f9e-4f77-924d-865f7d615cef - name: DevTools Client 1233ae25 - secret_key: hello - description: Auto-generated DevTools evaluation client -settings: - max_concurrent_evaluations: 3 - default_timeout: 45000 - retry_policy: - max_retries: 2 - backoff_multiplier: 2 - initial_delay: 1000 diff --git a/eval-server/nodejs/examples/library-usage.js b/eval-server/nodejs/examples/library-usage.js deleted file mode 100644 index cfb3ffd..0000000 --- a/eval-server/nodejs/examples/library-usage.js +++ /dev/null @@ -1,250 +0,0 @@ -#!/usr/bin/env node - -// Copyright 2025 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Simple example demonstrating the programmatic API usage - -import { EvalServer } from '../src/lib/EvalServer.js'; -import { CONFIG } from '../src/config.js'; - -console.log('๐Ÿ”ง Creating server...'); -const server = new EvalServer({ - authKey: 'hello', - host: '127.0.0.1', - port: 8080 -}); - -console.log('๐Ÿ”ง Setting up event handlers...'); - -server.on('started', (info) => { - console.log('โœ… Server started event fired:', info); -}); - -server.on('error', (error) => { - console.log('โŒ Server error:', error); -}); - -server.onConnect(async client => { - console.log('๐ŸŽ‰ CLIENT CONNECTED!'); - console.log(' - Client ID:', client.id); - console.log(' - Client tabId:', client.tabId); - console.log(' - Client info:', client.getInfo()); - - // Check available LLM providers - console.log('\n๐Ÿ”‘ Available LLM Providers:'); - const availableProviders = []; - if (CONFIG.providers.openai.apiKey) { - availableProviders.push('openai'); - console.log(' โœ… OpenAI configured'); - } - if (CONFIG.providers.groq.apiKey) { - availableProviders.push('groq'); - console.log(' โœ… Groq configured'); - } - if (CONFIG.providers.openrouter.apiKey) { - availableProviders.push('openrouter'); - console.log(' โœ… OpenRouter configured'); - } - if (CONFIG.providers.litellm.apiKey && CONFIG.providers.litellm.endpoint) { - availableProviders.push('litellm'); - console.log(' โœ… LiteLLM configured'); - } - - if (availableProviders.length === 0) { - console.log(' โŒ No providers configured. Add API keys to .env file.'); - console.log(' โ„น๏ธ Example: OPENAI_API_KEY=sk-your-key-here'); - } - - try { - // Demonstrate basic evaluation first - console.log('\n๐Ÿ”„ Starting basic evaluation...'); - let response = await client.evaluate({ - id: "basic_eval", - name: "Capital of France", - description: "Basic test evaluation", - tool: "chat", - input: { - message: "What is the capital of France?" 
- } - }); - - console.log('โœ… Basic evaluation completed!'); - console.log('๐Ÿ“Š Response:', JSON.stringify(response, null, 2)); - - // Demonstrate explicit model selection if OpenAI is available - if (CONFIG.providers.openai.apiKey) { - await demonstrateModelSelection(client); - } - - // Demonstrate LLM configuration if providers are available - if (availableProviders.length > 0) { - await demonstrateLLMConfiguration(client, availableProviders); - } - - } catch (error) { - console.log('โŒ Evaluation failed:', error.message); - } -}); - -server.onDisconnect(clientInfo => { - console.log('๐Ÿ‘‹ CLIENT DISCONNECTED:', clientInfo); -}); - -// Function to demonstrate explicit model selection within OpenAI -async function demonstrateModelSelection(client) { - console.log('\n๐Ÿค– Demonstrating Model Selection (OpenAI)...'); - - const modelTests = [ - { - model: 'gpt-4', - task: 'Complex reasoning', - message: 'Solve this step by step: If a train travels 60 mph for 2.5 hours, how far does it go?' - }, - { - model: 'gpt-4-mini', - task: 'Simple question', - message: 'What is 2 + 2?' - }, - { - model: 'gpt-3.5-turbo', - task: 'Creative writing', - message: 'Write a one-sentence story about a cat.' - } - ]; - - for (const test of modelTests) { - console.log(`\n๐Ÿ”ง Testing ${test.model} for ${test.task}...`); - - try { - const response = await client.evaluate({ - id: `model_test_${test.model.replace(/[^a-z0-9]/g, '_')}`, - name: `${test.model} ${test.task}`, - tool: "chat", - input: { - message: test.message - }, - model: { - main_model: { - provider: "openai", - model: test.model, - api_key: CONFIG.providers.openai.apiKey - } - } - }); - - console.log(` โœ… ${test.model} completed successfully`); - console.log(` ๐Ÿ“Š Response: ${JSON.stringify(response.output).substring(0, 100)}...`); - - // Wait between tests - await new Promise(resolve => setTimeout(resolve, 1500)); - - } catch (error) { - console.log(` โŒ ${test.model} failed: ${error.message}`); - } - } - - console.log('\nโœจ Model selection demonstration completed!'); -} - -// Function to demonstrate LLM configuration -async function demonstrateLLMConfiguration(client, availableProviders) { - console.log('\n๐Ÿงช Demonstrating LLM Configuration...'); - - for (const provider of availableProviders.slice(0, 2)) { // Test up to 2 providers - console.log(`\n๐Ÿ”ง Configuring ${provider.toUpperCase()} provider...`); - - try { - // Configure different models based on provider - let models; - switch (provider) { - case 'openai': - models = { - main: 'gpt-4', - mini: 'gpt-4-mini', - nano: 'gpt-3.5-turbo' - }; - break; - case 'groq': - models = { - main: 'llama-3.1-8b-instant', - mini: 'llama-3.1-8b-instant', - nano: 'llama-3.1-8b-instant' - }; - break; - case 'openrouter': - models = { - main: 'anthropic/claude-3-sonnet', - mini: 'anthropic/claude-3-haiku', - nano: 'anthropic/claude-3-haiku' - }; - break; - case 'litellm': - models = { - main: 'claude-3-sonnet-20240229', - mini: 'claude-3-haiku-20240307', - nano: 'claude-3-haiku-20240307' - }; - break; - } - - console.log(` ๐Ÿ“ฆ Models: main=${models.main}, mini=${models.mini}, nano=${models.nano}`); - - // Run evaluation with specific provider configuration - const response = await client.evaluate({ - id: `${provider}_config_eval`, - name: `${provider.toUpperCase()} Configuration Test`, - description: `Test evaluation using ${provider} provider`, - tool: "chat", - input: { - message: `Hello! This is a test using the ${provider} provider. 
Please respond with a brief confirmation.` - }, - model: { - main_model: { - provider: provider, - model: models.main, - api_key: CONFIG.providers[provider].apiKey, - endpoint: CONFIG.providers[provider].endpoint - }, - mini_model: { - provider: provider, - model: models.mini, - api_key: CONFIG.providers[provider].apiKey, - endpoint: CONFIG.providers[provider].endpoint - }, - nano_model: { - provider: provider, - model: models.nano, - api_key: CONFIG.providers[provider].apiKey, - endpoint: CONFIG.providers[provider].endpoint - } - } - }); - - console.log(` โœ… ${provider.toUpperCase()} evaluation completed successfully`); - console.log(` ๐Ÿ“Š Response preview: ${JSON.stringify(response.output).substring(0, 100)}...`); - - // Wait between provider tests - await new Promise(resolve => setTimeout(resolve, 2000)); - - } catch (error) { - console.log(` โŒ ${provider.toUpperCase()} configuration test failed:`, error.message); - } - } - - console.log('\nโœจ LLM configuration demonstration completed!'); -} - -console.log('๐Ÿ”ง Starting server...'); -await server.start(); -console.log('โœ… Server started successfully on ws://127.0.0.1:8080'); -console.log('โณ Waiting for DevTools client to connect...'); -console.log(' WebSocket URL: ws://127.0.0.1:8080'); -console.log(' Auth Key: hello'); - -// Add periodic status check -setInterval(() => { - const status = server.getStatus(); - console.log(`๐Ÿ“Š Status: ${status.connectedClients} clients, ${status.readyClients} ready`); -}, 10000); \ No newline at end of file diff --git a/eval-server/nodejs/examples/logs/.gitignore b/eval-server/nodejs/examples/logs/.gitignore deleted file mode 100644 index 9309608..0000000 --- a/eval-server/nodejs/examples/logs/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -combined.log -error.log -evaluations.jsonl \ No newline at end of file diff --git a/eval-server/nodejs/examples/multiple-evals.js b/eval-server/nodejs/examples/multiple-evals.js deleted file mode 100755 index b65522f..0000000 --- a/eval-server/nodejs/examples/multiple-evals.js +++ /dev/null @@ -1,167 +0,0 @@ -#!/usr/bin/env node - -// Copyright 2025 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Example demonstrating multiple evaluations using a stack-based approach -// Each connecting client receives a different evaluation from the stack - -import { EvalServer } from '../src/lib/EvalServer.js'; -import { EvaluationStack } from '../src/lib/EvaluationStack.js'; -import { CONFIG } from '../src/config.js'; - -console.log('๐Ÿ”ง Creating evaluation stack...'); -const evalStack = new EvaluationStack(); - -// Create multiple diverse evaluations for the stack with different LLM configurations -const evaluations = [ - { - id: "math_eval", - name: "Basic Math Problem", - description: "Simple arithmetic evaluation", - tool: "chat", - input: { - message: "What is 15 * 7 + 23? Please show your calculation steps." - }, - // Use OpenAI if available, otherwise default - model: CONFIG.providers.openai.apiKey ? { - main_model: { - provider: 'openai', - model: 'gpt-4', - api_key: CONFIG.providers.openai.apiKey - } - } : {} - }, - { - id: "geography_eval", - name: "Capital of France", - description: "Geography knowledge test", - tool: "chat", - input: { - message: "What is the capital of France?" - }, - // Use Groq if available, otherwise default - model: CONFIG.providers.groq.apiKey ? 
{ - main_model: { - provider: 'groq', - model: 'llama-3.1-8b-instant', - api_key: CONFIG.providers.groq.apiKey - } - } : {} - }, - { - id: "creative_eval", - name: "Creative Writing", - description: "Short creative writing task", - tool: "chat", - input: { - message: "Write a two-sentence story about a robot discovering friendship." - }, - // Use OpenRouter if available, otherwise default - model: CONFIG.providers.openrouter.apiKey ? { - main_model: { - provider: 'openrouter', - model: 'anthropic/claude-3-sonnet', - api_key: CONFIG.providers.openrouter.apiKey - } - } : {} - }, - { - id: "tech_eval", - name: "Technology Knowledge", - description: "Basic technology concepts", - tool: "chat", - input: { - message: "Explain what HTTP stands for and what it's used for in simple terms." - }, - // Use LiteLLM if available, otherwise default - model: (CONFIG.providers.litellm.apiKey && CONFIG.providers.litellm.endpoint) ? { - main_model: { - provider: 'litellm', - model: 'claude-3-haiku-20240307', - api_key: CONFIG.providers.litellm.apiKey, - endpoint: CONFIG.providers.litellm.endpoint - } - } : {} - } -]; - -// Push evaluations to stack (they will be popped in reverse order) -console.log('๐Ÿ“š Adding evaluations to stack...'); -evaluations.forEach((evaluation, index) => { - evalStack.push(evaluation); - const providerInfo = evaluation.model?.main_model?.provider ? ` [${evaluation.model.main_model.provider}]` : ' [default]'; - console.log(` ${index + 1}. ${evaluation.name} (${evaluation.id})${providerInfo}`); -}); - -console.log(`โœ… Stack initialized with ${evalStack.size()} evaluations`); - -console.log('๐Ÿ”ง Creating server...'); -const server = new EvalServer({ - authKey: 'hello', - host: '127.0.0.1', - port: 8080 -}); - -console.log('๐Ÿ”ง Setting up event handlers...'); - -server.on('started', (info) => { - console.log('โœ… Server started event fired:', info); -}); - -server.on('error', (error) => { - console.log('โŒ Server error:', error); -}); - -server.onConnect(async client => { - console.log('๐ŸŽ‰ CLIENT CONNECTED!'); - console.log(' - Client ID:', client.id); - console.log(' - Client tabId:', client.tabId); - console.log(' - Client info:', client.getInfo()); - - // Check if we have evaluations left in the stack - if (evalStack.isEmpty()) { - console.log('โš ๏ธ No more evaluations in stack for this client'); - console.log(' Consider refilling the stack or handling this scenario'); - return; - } - - // Pop the next evaluation from the stack - const evaluation = evalStack.pop(); - const providerInfo = evaluation.model?.main_model?.provider ? 
` using ${evaluation.model.main_model.provider}` : ' using default provider';
-  console.log(`๐Ÿ“‹ Assigning evaluation: "${evaluation.name}" (${evaluation.id})${providerInfo}`);
-  console.log(`๐Ÿ“Š Remaining evaluations in stack: ${evalStack.size()}`);
-
-  try {
-    console.log('๐Ÿ”„ Starting evaluation...');
-    if (evaluation.model?.main_model?.provider) {
-      console.log(`๐Ÿ”ง Using LLM provider: ${evaluation.model.main_model.provider} with model: ${evaluation.model.main_model.model}`);
-    }
-
-    let response = await client.evaluate(evaluation);
-
-    console.log('โœ… Evaluation completed!');
-    console.log(`๐Ÿ“Š Response for "${evaluation.name}":`, JSON.stringify(response, null, 2));
-  } catch (error) {
-    console.log(`โŒ Evaluation "${evaluation.name}" failed:`, error.message);
-  }
-});
-
-server.onDisconnect(clientInfo => {
-  console.log('๐Ÿ‘‹ CLIENT DISCONNECTED:', clientInfo);
-});
-
-console.log('๐Ÿ”ง Starting server...');
-await server.start();
-console.log('โœ… Server started successfully on ws://127.0.0.1:8080');
-console.log('โณ Waiting for DevTools clients to connect...');
-console.log('   WebSocket URL: ws://127.0.0.1:8080');
-console.log('   Auth Key: hello');
-console.log(`๐Ÿ“š Stack contains ${evalStack.size()} evaluations ready to be distributed`);
-
-// Add periodic status check
-setInterval(() => {
-  const status = server.getStatus();
-  console.log(`๐Ÿ“Š Status: ${status.connectedClients} clients, ${status.readyClients} ready, ${evalStack.size()} evals remaining`);
-}, 10000);
\ No newline at end of file
diff --git a/eval-server/nodejs/examples/with-http-wrapper.js b/eval-server/nodejs/examples/with-http-wrapper.js
deleted file mode 100644
index 688f532..0000000
--- a/eval-server/nodejs/examples/with-http-wrapper.js
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env node
-
-// Copyright 2025 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Example demonstrating how to use EvalServer with optional HTTP API wrapper
-
-import { EvalServer } from '../src/lib/EvalServer.js';
-import { HTTPWrapper } from '../src/lib/HTTPWrapper.js';
-
-console.log('๐Ÿ”ง Creating EvalServer...');
-const evalServer = new EvalServer({
-  // No authKey - authentication disabled for automated mode
-  host: '127.0.0.1',
-  port: 8082
-});
-
-console.log('๐Ÿ”ง Creating HTTP wrapper...');
-const httpWrapper = new HTTPWrapper(evalServer, {
-  port: 8080,
-  host: '0.0.0.0'
-});
-
-
-console.log('๐Ÿ”ง Starting EvalServer...');
-await evalServer.start();
-console.log('โœ… EvalServer started on ws://127.0.0.1:8082');
-
-console.log('๐Ÿ”ง Starting HTTP wrapper...');
-await httpWrapper.start();
-console.log('โœ… HTTP API started on http://0.0.0.0:8080');
-
-console.log('โณ Waiting for DevTools client to connect...');
-console.log('   WebSocket URL: ws://127.0.0.1:8082');
-console.log('   HTTP API URL: http://0.0.0.0:8080');
-console.log('   Auth: Disabled (automated mode)');
-
-// Add periodic status check
-setInterval(() => {
-  const evalServerStatus = evalServer.getStatus();
-  const httpWrapperStatus = httpWrapper.getStatus();
-  console.log(`๐Ÿ“Š EvalServer: ${evalServerStatus.connectedClients} clients, ${evalServerStatus.readyClients} ready`);
-  console.log(`๐Ÿ“Š HTTP API: ${httpWrapperStatus.isRunning ? 'running' : 'stopped'} on ${httpWrapperStatus.url}`);
-}, 15000);
\ No newline at end of file
diff --git a/eval-server/nodejs/logs/.gitignore b/eval-server/nodejs/logs/.gitignore
deleted file mode 100644
index 326f777..0000000
--- a/eval-server/nodejs/logs/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-*.log
-*.jsonl
\ No newline at end of file
diff --git a/eval-server/nodejs/package.json b/eval-server/nodejs/package.json
index 6f92a73..c6315fa 100644
--- a/eval-server/nodejs/package.json
+++ b/eval-server/nodejs/package.json
@@ -1,38 +1,25 @@
 {
   "name": "bo-eval-server",
   "version": "1.0.0",
-  "description": "WebSocket server for evaluating LLM agents with LLM-as-a-judge",
+  "description": "HTTP API wrapper for Browser Operator - WebSocket server with CDP integration",
   "main": "src/lib/EvalServer.js",
   "type": "module",
   "exports": {
     ".": "./src/lib/EvalServer.js",
     "./EvalServer": "./src/lib/EvalServer.js",
-    "./EvaluationLoader": "./src/lib/EvaluationLoader.js",
-    "./HTTPWrapper": "./src/lib/HTTPWrapper.js",
-    "./judges/Judge": "./src/lib/judges/Judge.js",
-    "./judges/LLMJudge": "./src/lib/judges/LLMJudge.js",
-    "./CLI": "./src/cli/CLI.js"
-  },
-  "bin": {
-    "eval-server": "./src/cli/index.js"
+    "./HTTPWrapper": "./src/lib/HTTPWrapper.js"
   },
   "scripts": {
-    "start": "node examples/with-http-wrapper.js",
-    "dev": "node --watch examples/with-http-wrapper.js",
-    "cli": "node src/cli/index.js",
-    "lib:example": "node examples/library-usage.js",
-    "lib:example:http": "node examples/with-http-wrapper.js"
+    "start": "node src/lib/EvalServer.js"
   },
-  "keywords": ["websocket", "llm", "evaluation", "rpc", "library", "programmatic"],
+  "keywords": ["websocket", "browser-automation", "cdp", "http-api", "rpc"],
   "author": "",
   "license": "MIT",
   "dependencies": {
     "ws": "^8.16.0",
     "uuid": "^9.0.1",
     "winston": "^3.11.0",
-    "dotenv": "^16.3.1",
-    "openai": "^4.24.1",
-    "js-yaml": "^4.1.0"
+    "dotenv": "^16.3.1"
   },
   "devDependencies": {
     "@types/ws": "^8.5.10"
@@ -40,4 +27,4 @@
   "engines": {
     "node": ">=18.0.0"
   }
-}
\ No newline at end of file
+}
diff --git a/eval-server/nodejs/schemas/client.schema.json b/eval-server/nodejs/schemas/client.schema.json
deleted file mode 100644
index 8dfdd3b..0000000
--- a/eval-server/nodejs/schemas/client.schema.json
+++ /dev/null
@@ -1,299 +0,0 @@
-{
-  "$schema": "http://json-schema.org/draft-07/schema#",
-  "type": "object",
-  "title": "Client Configuration Schema",
-  "description": "Schema for validating client YAML configuration files",
-  "required": ["client", "settings", "evaluations"],
-  "properties": {
-    "client": {
-      "type": "object",
-      "required": ["id", "name"],
-      "properties": {
-        "id": {
-          "type": "string",
-          "pattern": "^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$",
-          "description": "UUID v4 format client identifier"
-        },
-        "name": {
-          "type": "string",
-          "minLength": 1,
-          "maxLength": 100,
-          "description": "Human-readable client name"
-        },
-        "secret_key": {
-          "type": "string",
-          "description": "Optional authentication secret key"
-        },
-        "description": {
-          "type": "string",
-          "description": "Optional client description"
-        }
-      }
-    },
-    "settings": {
-      "type": "object",
-      "properties": {
-        "max_concurrent_evaluations": {
-          "type": "integer",
-          "minimum": 1,
-          "maximum": 10,
-          "default": 3
-        },
-        "default_timeout": {
-          "type": "integer",
-          "minimum": 5000,
-          "maximum": 300000,
-          "default": 30000,
-          "description": "Default timeout in milliseconds"
-        },
-        "retry_policy": {
-          "type": "object",
-          "properties": {
-            "max_retries": {
-              "type": "integer",
"minimum": 0, - "maximum": 5, - "default": 2 - }, - "backoff_multiplier": { - "type": "number", - "minimum": 1, - "maximum": 5, - "default": 2 - }, - "initial_delay": { - "type": "integer", - "minimum": 100, - "maximum": 10000, - "default": 1000, - "description": "Initial delay in milliseconds" - } - } - } - } - }, - "evaluations": { - "type": "array", - "items": { - "$ref": "#/definitions/evaluation" - } - } - }, - "definitions": { - "evaluation": { - "type": "object", - "required": ["id", "name", "tool", "input"], - "properties": { - "id": { - "type": "string", - "pattern": "^[a-zA-Z0-9-_]+$", - "minLength": 1, - "maxLength": 100, - "description": "Unique evaluation identifier" - }, - "name": { - "type": "string", - "minLength": 1, - "maxLength": 200, - "description": "Human-readable evaluation name" - }, - "description": { - "type": "string", - "description": "Optional evaluation description" - }, - "enabled": { - "type": "boolean", - "default": true, - "description": "Whether this evaluation is enabled" - }, - "target": { - "type": "object", - "properties": { - "url": { - "type": "string", - "format": "uri", - "description": "Target URL for the evaluation" - }, - "wait_for": { - "type": "string", - "enum": ["load", "domcontentloaded", "networkidle"], - "default": "networkidle" - }, - "wait_timeout": { - "type": "integer", - "minimum": 1000, - "maximum": 30000, - "default": 5000 - } - } - }, - "tool": { - "type": "string", - "enum": [ - "extract_data", - "extract_schema_streamlined", - "research_agent", - "action_agent", - "web_task_agent" - ], - "description": "Tool to execute for this evaluation" - }, - "timeout": { - "type": "integer", - "minimum": 5000, - "maximum": 300000, - "description": "Evaluation timeout in milliseconds" - }, - "input": { - "type": "object", - "description": "Tool-specific input parameters" - }, - "validation": { - "type": "object", - "required": ["type"], - "properties": { - "type": { - "type": "string", - "enum": ["llm-judge", "snapshot", "hybrid"] - }, - "llm_judge": { - "$ref": "#/definitions/llm_judge_config" - }, - "snapshot": { - "$ref": "#/definitions/snapshot_config" - }, - "hybrid": { - "type": "object", - "properties": { - "weight_llm": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "weight_snapshot": { - "type": "number", - "minimum": 0, - "maximum": 1 - } - } - } - } - }, - "metadata": { - "type": "object", - "properties": { - "tags": { - "type": "array", - "items": { - "type": "string" - } - }, - "priority": { - "type": "string", - "enum": ["low", "normal", "high"], - "default": "normal" - }, - "owner": { - "type": "string", - "description": "Responsible team or person" - }, - "created": { - "type": "string", - "format": "date" - }, - "modified": { - "type": "string", - "format": "date" - } - } - } - } - }, - "llm_judge_config": { - "type": "object", - "required": ["criteria"], - "properties": { - "model": { - "type": "string", - "default": "gpt-4o-mini", - "description": "LLM model to use for evaluation" - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2, - "default": 0.3 - }, - "criteria": { - "type": "array", - "items": { - "type": "string" - }, - "minItems": 1, - "description": "List of evaluation criteria" - }, - "visual_verification": { - "type": "object", - "properties": { - "enabled": { - "type": "boolean", - "default": false - }, - "capture_before": { - "type": "boolean", - "default": true - }, - "capture_after": { - "type": "boolean", - "default": true - }, - "prompts": { - "type": "array", - 
"items": { - "type": "string" - } - } - } - } - } - }, - "snapshot_config": { - "type": "object", - "properties": { - "structure_only": { - "type": "boolean", - "default": false, - "description": "Compare only structure, not values" - }, - "exclude_paths": { - "type": "array", - "items": { - "type": "string" - }, - "description": "JSONPath expressions for fields to exclude" - }, - "sanitizers": { - "type": "array", - "items": { - "type": "object", - "required": ["path"], - "properties": { - "path": { - "type": "string", - "description": "JSONPath to the field" - }, - "pattern": { - "type": "string", - "description": "Regex pattern to match" - }, - "replacement": { - "type": "string", - "description": "Replacement string" - } - } - } - } - } - } - } -} \ No newline at end of file diff --git a/eval-server/nodejs/src/cli/CLI.js b/eval-server/nodejs/src/cli/CLI.js deleted file mode 100644 index 240e66b..0000000 --- a/eval-server/nodejs/src/cli/CLI.js +++ /dev/null @@ -1,518 +0,0 @@ -#!/usr/bin/env node - -// Copyright 2025 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -import readline from 'readline'; -import { EvalServer } from '../lib/EvalServer.js'; - -/** - * EvaluationCLI - Command line interface for the evaluation server - * - * Refactored to use the new EvalServer library instead of directly - * instantiating the old EvaluationServer class. - */ -export class EvaluationCLI { - constructor(options = {}) { - this.server = new EvalServer(options); - this.rl = readline.createInterface({ - input: process.stdin, - output: process.stdout - }); - - // Keep track of connected clients for CLI operations - this.connectedClients = new Map(); - - // Set up event handlers - this.setupEventHandlers(); - } - - /** - * Set up event handlers for the server - */ - setupEventHandlers() { - this.server.onConnect(client => { - this.connectedClients.set(client.id, client); - console.log(`โœ… Client connected: ${client.id}`); - }); - - this.server.onDisconnect(clientInfo => { - this.connectedClients.delete(clientInfo.clientId); - console.log(`โŒ Client disconnected: ${clientInfo.clientId}`); - }); - - this.server.on('error', error => { - console.error(`๐Ÿšจ Server error: ${error.message}`); - }); - } - - async start() { - console.log('๐Ÿš€ Starting Evaluation Server CLI'); - console.log('===================================='); - - // Start the server - try { - await this.server.start(); - } catch (error) { - console.error(`โŒ Failed to start server: ${error.message}`); - process.exit(1); - } - - // Wait a moment for server to start - await new Promise(resolve => setTimeout(resolve, 1000)); - - this.showHelp(); - this.startInteractiveMode(); - } - - showHelp() { - console.log('\\nAvailable commands:'); - console.log(' status - Show server status'); - console.log(' clients - List all clients and their evaluations'); - console.log(' clients-connected - List connected clients'); - console.log(' list-tabs [client-id] - List active tabs (all clients or specific client)'); - console.log(' run - Run specific evaluation for a client'); - console.log(' run-all - Run all evaluations for a client'); - console.log(' run-tab - Run evaluation on specific tab'); - console.log(' eval - Run specific evaluation on all connected clients'); - console.log(' eval all - Run all pending evaluations on all clients'); - console.log(' load-evals [directory] - Load evaluations from directory'); - console.log(' list-evals [category] - 
List available evaluations'); - console.log(' help - Show this help'); - console.log(' quit - Exit the CLI'); - console.log(''); - } - - startInteractiveMode() { - this.rl.question('eval-server> ', (input) => { - this.handleCommand(input.trim()); - }); - } - - async handleCommand(input) { - const [command, ...args] = input.split(' '); - - try { - switch (command) { - case 'status': - this.showStatus(); - break; - case 'clients': - this.listClients(); - break; - case 'run': - if (args.length < 2) { - console.log('Usage: run '); - } else { - await this.runSpecificEvaluation(args[0], args[1]); - } - break; - case 'run-all': - if (args.length < 1) { - console.log('Usage: run-all '); - } else { - await this.runAllEvaluations(args[0]); - } - break; - case 'eval': - if (args.length === 0) { - console.log('Usage: eval OR eval all'); - } else { - await this.runEvaluation(args.join(' ')); - } - break; - case 'clients-connected': - this.listConnectedClients(); - break; - case 'list-tabs': - this.listTabs(args[0]); - break; - case 'run-tab': - if (args.length < 3) { - console.log('Usage: run-tab '); - } else { - await this.runTabEvaluation(args[0], args[1], args[2]); - } - break; - case 'load-evals': - await this.loadEvaluations(args[0]); - break; - case 'list-evals': - this.listEvaluations(args[0]); - break; - case 'help': - this.showHelp(); - break; - case 'quit': - case 'exit': - this.quit(); - return; - case '': - break; - default: - console.log(`Unknown command: ${command}. Type 'help' for available commands.`); - } - } catch (error) { - console.error('Error:', error.message); - } - - this.startInteractiveMode(); - } - - showStatus() { - const status = this.server.getStatus(); - console.log('\\n๐Ÿ“Š Server Status:'); - console.log(` Running: ${status.isRunning ? 'Yes' : 'No'}`); - console.log(` Host: ${status.host}:${status.port}`); - console.log(` Connected clients: ${status.connectedClients}`); - console.log(` Unique base clients: ${status.uniqueBaseClients}`); - console.log(` Total tabs: ${status.totalTabs}`); - console.log(` Ready clients: ${status.readyClients}`); - console.log(''); - } - - listConnectedClients() { - console.log('\\n๐Ÿ‘ฅ Connected Clients:'); - - if (this.connectedClients.size === 0) { - console.log(' No clients connected'); - } else { - for (const [clientId, client] of this.connectedClients) { - const info = client.getInfo(); - console.log(` Client ID: ${info.id}`); - console.log(` Base Client: ${info.baseClientId}`); - console.log(` Tab ID: ${info.tabId || 'default'}`); - console.log(` Connected: ${info.connectedAt}`); - console.log(` Address: ${info.remoteAddress}`); - console.log(''); - } - } - } - - listClients() { - const clients = this.server.clientManager.getAllClients(); - console.log('\\n๐Ÿ‘ฅ Registered Clients:'); - - if (clients.length === 0) { - console.log(' No clients registered'); - return; - } - - clients.forEach(client => { - console.log(`\\n ๐Ÿ“‹ ${client.name} (${client.id})`); - console.log(` Description: ${client.description || 'N/A'}`); - console.log(` Secret Key: ${client.secretKey ? 
'***' : 'None'}`); - - const evaluations = this.server.clientManager.getClientEvaluations(client.id); - console.log(` Evaluations: ${evaluations.length}`); - - // Group evaluations by category - const evaluationsByCategory = {}; - evaluations.forEach(evaluation => { - const category = evaluation.category || 'uncategorized'; - if (!evaluationsByCategory[category]) { - evaluationsByCategory[category] = []; - } - evaluationsByCategory[category].push(evaluation); - }); - - // Display evaluations grouped by category - Object.keys(evaluationsByCategory).sort().forEach(category => { - const categoryEvals = evaluationsByCategory[category]; - console.log(`\\n ๐Ÿ“ ${category} (${categoryEvals.length})`); - categoryEvals.forEach(evaluation => { - const status = evaluation.status || 'pending'; - const statusIcon = status === 'completed' ? 'โœ…' : status === 'running' ? '๐Ÿ”„' : status === 'failed' ? 'โŒ' : 'โณ'; - console.log(` ${statusIcon} ${evaluation.id}: ${evaluation.name}`); - }); - }); - }); - console.log(''); - } - - async loadEvaluations(directory) { - try { - const evalsDir = directory || './evals'; - console.log(`\\n๐Ÿ“‚ Loading evaluations from ${evalsDir}...`); - - const result = await this.server.loadEvaluations(evalsDir); - console.log(`โœ… Loaded ${result.totalEvaluations} evaluations from ${result.categories} categories`); - - } catch (error) { - console.log(`โŒ Failed to load evaluations: ${error.message}`); - } - } - - listEvaluations(category) { - const evaluations = category - ? this.server.evaluationLoader.getEvaluationsByCategory(category) - : this.server.evaluationLoader.getAllEvaluations(); - - console.log(`\\n๐Ÿ“‹ ${category ? `Evaluations in category '${category}'` : 'All Evaluations'}:`); - - if (evaluations.length === 0) { - console.log(' No evaluations found'); - return; - } - - // Group by category if showing all - if (!category) { - const byCategory = {}; - evaluations.forEach(evaluation => { - const cat = evaluation.category || 'uncategorized'; - if (!byCategory[cat]) byCategory[cat] = []; - byCategory[cat].push(evaluation); - }); - - Object.keys(byCategory).sort().forEach(cat => { - console.log(`\\n ๐Ÿ“ ${cat}:`); - byCategory[cat].forEach(evaluation => { - const enabledIcon = evaluation.enabled !== false ? 'โœ…' : 'โŒ'; - console.log(` ${enabledIcon} ${evaluation.id}: ${evaluation.name} (${evaluation.tool})`); - }); - }); - } else { - evaluations.forEach(evaluation => { - const enabledIcon = evaluation.enabled !== false ? 
'โœ…' : 'โŒ'; - console.log(` ${enabledIcon} ${evaluation.id}: ${evaluation.name} (${evaluation.tool})`); - if (evaluation.description) { - console.log(` ${evaluation.description}`); - } - }); - } - console.log(''); - } - - async runSpecificEvaluation(clientId, evaluationId) { - console.log(`\\n๐ŸŽฏ Running evaluation '${evaluationId}' for client '${clientId}'...`); - - try { - const client = this.connectedClients.get(clientId); - if (!client) { - console.log(`โŒ Client '${clientId}' is not connected`); - return; - } - - // Get the evaluation - const evaluation = this.server.evaluationLoader.getEvaluationById(evaluationId); - if (!evaluation) { - console.log(`โŒ Evaluation '${evaluationId}' not found`); - return; - } - - // Execute the evaluation - const result = await client.evaluate(evaluation); - - console.log(`โœ… Evaluation '${evaluationId}' completed successfully`); - console.log(`Result: ${JSON.stringify(result, null, 2)}`); - - } catch (error) { - console.log(`โŒ Evaluation failed: ${error.message}`); - } - } - - async runAllEvaluations(clientId) { - console.log(`\\n๐Ÿš€ Running all evaluations for client '${clientId}'...`); - - try { - const client = this.connectedClients.get(clientId); - if (!client) { - console.log(`โŒ Client '${clientId}' is not connected`); - return; - } - - // Get all evaluations - const evaluations = this.server.evaluationLoader.getAllEvaluations(); - - if (evaluations.length === 0) { - console.log(`โŒ No evaluations found`); - return; - } - - console.log(`Found ${evaluations.length} evaluations to run...`); - - let completed = 0; - let failed = 0; - - for (const evaluation of evaluations) { - if (evaluation.enabled === false) { - console.log(`โญ๏ธ Skipping disabled: ${evaluation.name}`); - continue; - } - - console.log(`\\n๐Ÿ”„ Running: ${evaluation.name} (${evaluation.id})`); - - try { - await client.evaluate(evaluation); - console.log(` โœ… Completed: ${evaluation.name}`); - completed++; - } catch (error) { - console.log(` โŒ Failed: ${evaluation.name} - ${error.message}`); - failed++; - } - - // Add a small delay between evaluations - await new Promise(resolve => setTimeout(resolve, 2000)); - } - - console.log(`\\n๐Ÿ“Š Results: ${completed} completed, ${failed} failed`); - - } catch (error) { - console.log(`โŒ Batch evaluation failed: ${error.message}`); - } - } - - async runEvaluation(task) { - console.log(`\\n๐Ÿ” Running evaluation: "${task}"`); - console.log('====================================='); - - try { - if (this.connectedClients.size === 0) { - console.log('โŒ No clients connected'); - return; - } - - const clients = Array.from(this.connectedClients.values()); - console.log(`Running on ${clients.length} connected clients...`); - - const results = []; - - for (const client of clients) { - try { - let evaluation; - - if (task === 'all') { - // Run all evaluations for this client - const allEvals = this.server.evaluationLoader.getAllEvaluations() - .filter(e => e.enabled !== false); - - for (const evaluation of allEvals) { - const result = await client.evaluate(evaluation); - results.push({ - clientId: client.id, - evaluationId: evaluation.id, - success: true, - result - }); - } - } else { - // Run specific evaluation - evaluation = this.server.evaluationLoader.getEvaluationById(task); - if (!evaluation) { - results.push({ - clientId: client.id, - evaluationId: task, - success: false, - error: `Evaluation '${task}' not found` - }); - continue; - } - - const result = await client.evaluate(evaluation); - results.push({ - clientId: 
client.id, - evaluationId: evaluation.id, - success: true, - result - }); - } - } catch (error) { - results.push({ - clientId: client.id, - success: false, - error: error.message - }); - } - } - - // Display results - console.log('\\n๐Ÿ“‹ Evaluation Results:'); - results.forEach((result, index) => { - console.log(`\\n Client ${index + 1} (${result.clientId}):`); - - if (result.success) { - console.log(` โœ… Success`); - if (result.evaluationId) { - console.log(` Evaluation ID: ${result.evaluationId}`); - } - } else { - console.log(` โŒ Error: ${result.error}`); - } - }); - - console.log('\\nโœ… Evaluation completed'); - - } catch (error) { - console.log(`\\nโŒ Evaluation failed: ${error.message}`); - } - } - - listTabs(clientId = null) { - console.log('\\n๐Ÿ“ฑ Active Tabs:'); - - if (clientId) { - // List tabs for specific client - const client = this.connectedClients.get(clientId); - if (!client) { - console.log(` Client '${clientId}' not found`); - return; - } - - const info = client.getInfo(); - console.log(`\\n Client: ${info.baseClientId}`); - console.log(` ๐Ÿ“„ Tab ID: ${info.tabId || 'default'}`); - console.log(` Connected: ${info.connectedAt}`); - console.log(` Address: ${info.remoteAddress || 'unknown'}`); - } else { - // List tabs for all clients - if (this.connectedClients.size === 0) { - console.log(' No active tabs'); - return; - } - - for (const [clientId, client] of this.connectedClients) { - const info = client.getInfo(); - console.log(`\\n ๐Ÿ“‹ Client: ${info.baseClientId}`); - console.log(` ๐Ÿ“„ Tab ID: ${info.tabId || 'default'}`); - console.log(` Composite ID: ${info.id}`); - console.log(` Connected: ${info.connectedAt}`); - console.log(` Address: ${info.remoteAddress || 'unknown'}`); - } - } - console.log(''); - } - - async runTabEvaluation(clientId, tabId, evaluationId) { - const compositeClientId = `${clientId}:${tabId}`; - console.log(`\\n๐ŸŽฏ Running evaluation '${evaluationId}' on tab '${tabId}' of client '${clientId}'...`); - - try { - const client = this.connectedClients.get(compositeClientId); - if (!client) { - console.log(`โŒ Tab '${tabId}' of client '${clientId}' is not connected`); - return; - } - - const evaluation = this.server.evaluationLoader.getEvaluationById(evaluationId); - if (!evaluation) { - console.log(`โŒ Evaluation '${evaluationId}' not found`); - return; - } - - const result = await client.evaluate(evaluation); - console.log(`โœ… Evaluation '${evaluationId}' completed successfully on tab '${tabId}'`); - console.log(`Result: ${JSON.stringify(result, null, 2)}`); - - } catch (error) { - console.log(`โŒ Tab evaluation failed: ${error.message}`); - } - } - - quit() { - console.log('\\n๐Ÿ‘‹ Shutting down...'); - this.server.stop(); - this.rl.close(); - process.exit(0); - } -} \ No newline at end of file diff --git a/eval-server/nodejs/src/cli/index.js b/eval-server/nodejs/src/cli/index.js deleted file mode 100644 index f9d5c41..0000000 --- a/eval-server/nodejs/src/cli/index.js +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env node - -// Copyright 2025 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
- -import { EvaluationCLI } from './CLI.js'; - -// Start CLI if this file is run directly -if (import.meta.url === `file://${process.argv[1]}`) { - const cli = new EvaluationCLI(); - - process.on('SIGINT', () => { - cli.quit(); - }); - - cli.start().catch(error => { - console.error('Failed to start CLI:', error.message); - process.exit(1); - }); -} - -export { EvaluationCLI }; \ No newline at end of file diff --git a/eval-server/nodejs/src/evaluator.js b/eval-server/nodejs/src/evaluator.js deleted file mode 100644 index 95ac14a..0000000 --- a/eval-server/nodejs/src/evaluator.js +++ /dev/null @@ -1,117 +0,0 @@ -import OpenAI from 'openai'; -import { CONFIG } from './config.js'; -import logger from './logger.js'; - -export class LLMEvaluator { - constructor() { - if (!CONFIG.llm.apiKey) { - throw new Error('OpenAI API key is required'); - } - - this.openai = new OpenAI({ - apiKey: CONFIG.llm.apiKey - }); - } - - async evaluate(task, agentResponse) { - try { - const prompt = this.buildEvaluationPrompt(task, agentResponse); - - const completion = await this.openai.chat.completions.create({ - model: CONFIG.llm.model, - messages: [ - { - role: 'system', - content: 'You are an expert evaluator of AI agent responses. Provide objective, detailed evaluations.' - }, - { - role: 'user', - content: prompt - } - ], - temperature: CONFIG.llm.temperature, - max_tokens: 1000 - }); - - const evaluation = completion.choices[0].message.content; - const usage = completion.usage; - - logger.info('LLM evaluation completed', { - tokens_used: usage.total_tokens, - model: CONFIG.llm.model - }); - - return this.parseEvaluation(evaluation); - } catch (error) { - logger.error('LLM evaluation failed', { error: error.message }); - throw error; - } - } - - buildEvaluationPrompt(task, agentResponse) { - return `Please evaluate the following AI agent response to a given task. - -TASK: -${task} - -AGENT RESPONSE: -${agentResponse} - -Please evaluate the response on the following criteria and provide a JSON response: - -1. **Correctness**: Is the response factually accurate and correct? -2. **Completeness**: Does the response fully address the task? -3. **Clarity**: Is the response clear and well-structured? -4. **Relevance**: Is the response relevant to the task? -5. **Helpfulness**: How helpful is the response to the user? 
- -Provide your evaluation in the following JSON format: -{ - "overall_score": , - "criteria_scores": { - "correctness": , - "completeness": , - "clarity": , - "relevance": , - "helpfulness": - }, - "reasoning": "", - "strengths": [""], - "weaknesses": [""], - "suggestions": [""] -}`; - } - - parseEvaluation(evaluationText) { - try { - // Try to extract JSON from the response - const jsonMatch = evaluationText.match(/\{[\s\S]*\}/); - if (jsonMatch) { - return JSON.parse(jsonMatch[0]); - } - - // If no JSON found, return a structured response with the raw text - return { - overall_score: null, - criteria_scores: {}, - reasoning: evaluationText, - strengths: [], - weaknesses: [], - suggestions: [], - raw_evaluation: evaluationText - }; - } catch (error) { - logger.warn('Failed to parse evaluation JSON', { error: error.message }); - return { - overall_score: null, - criteria_scores: {}, - reasoning: evaluationText, - strengths: [], - weaknesses: [], - suggestions: [], - raw_evaluation: evaluationText, - parse_error: error.message - }; - } - } -} \ No newline at end of file diff --git a/eval-server/nodejs/src/lib/EvaluationLoader.js b/eval-server/nodejs/src/lib/EvaluationLoader.js deleted file mode 100644 index 8f85459..0000000 --- a/eval-server/nodejs/src/lib/EvaluationLoader.js +++ /dev/null @@ -1,448 +0,0 @@ -// Copyright 2025 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -import fs from 'fs'; -import path from 'path'; -import yaml from 'js-yaml'; -import logger from '../logger.js'; - -/** - * EvaluationLoader - Handles loading and managing evaluations from YAML files - * - * Example usage: - * ```js - * const loader = new EvaluationLoader('./evals'); - * await loader.loadFromDirectory('./evals'); - * - * const evaluations = loader.getAllEvaluations(); - * const filtered = loader.getEvaluationsByCategory('action-agent'); - * const specific = loader.getEvaluationById('a11y-001'); - * ``` - */ -export class EvaluationLoader { - constructor(evalsDir = './evals') { - this.evalsDir = path.resolve(evalsDir); - this.evaluations = new Map(); // evaluationId -> evaluation - this.categories = new Map(); // category -> evaluations[] - this.configDefaults = null; - - // Ensure directory exists - if (!fs.existsSync(this.evalsDir)) { - fs.mkdirSync(this.evalsDir, { recursive: true }); - } - - this.loadConfigDefaults(); - } - - /** - * Load default model configuration from config.yaml - */ - loadConfigDefaults() { - try { - const configPath = path.resolve(this.evalsDir, 'config.yaml'); - if (fs.existsSync(configPath)) { - const configContent = fs.readFileSync(configPath, 'utf8'); - this.configDefaults = yaml.load(configContent); - logger.info('EvaluationLoader: Loaded config.yaml defaults', this.configDefaults); - } else { - // Don't warn about missing config.yaml - it's optional - this.configDefaults = null; - } - } catch (error) { - logger.error('EvaluationLoader: Failed to load config.yaml:', error); - this.configDefaults = null; - } - } - - /** - * Apply model precedence logic - * API calls OR test YAML models override config.yaml fallback - */ - applyModelPrecedence(evaluation, apiModelOverride = null) { - if (apiModelOverride) { - return { - ...(this.configDefaults?.model || {}), - ...apiModelOverride - }; - } - - const testModel = evaluation.model; - if (testModel && Object.keys(testModel).length > 0) { - return { - ...(this.configDefaults?.model || {}), - ...testModel - }; - } - - return 
this.configDefaults?.model || {}; - } - - /** - * Load all evaluations from the specified directory - */ - async loadFromDirectory(evalsDir = this.evalsDir) { - try { - this.evalsDir = path.resolve(evalsDir); - - // Clear existing evaluations - this.evaluations.clear(); - this.categories.clear(); - - // Reload config defaults - this.loadConfigDefaults(); - - // Find all category directories - const categories = fs.readdirSync(this.evalsDir) - .filter(dir => { - const fullPath = path.join(this.evalsDir, dir); - return fs.statSync(fullPath).isDirectory(); - }); - - let totalEvaluations = 0; - - for (const category of categories) { - const categoryDir = path.join(this.evalsDir, category); - const evalFiles = fs.readdirSync(categoryDir) - .filter(f => f.endsWith('.yaml') || f.endsWith('.yml')); - - const categoryEvaluations = []; - - for (const file of evalFiles) { - try { - const evalPath = path.join(categoryDir, file); - const evaluation = await this.loadEvaluationFile(evalPath, category); - - if (evaluation && evaluation.enabled !== false) { - this.evaluations.set(evaluation.id, evaluation); - categoryEvaluations.push(evaluation); - totalEvaluations++; - } - } catch (error) { - logger.error(`EvaluationLoader: Failed to load evaluation ${file}:`, error); - } - } - - if (categoryEvaluations.length > 0) { - this.categories.set(category, categoryEvaluations); - } - } - - logger.info(`EvaluationLoader: Loaded ${totalEvaluations} evaluations from ${categories.length} categories`); - return { totalEvaluations, categories: categories.length }; - - } catch (error) { - logger.error('EvaluationLoader: Failed to load evaluations:', error); - throw error; - } - } - - /** - * Load a specific evaluation file - */ - async loadEvaluationFile(filePath, category) { - try { - const yamlContent = fs.readFileSync(filePath, 'utf8'); - const evaluation = yaml.load(yamlContent); - - if (!evaluation || !evaluation.id) { - throw new Error('Evaluation must have an id field'); - } - - // Apply model precedence - const resolvedModel = this.applyModelPrecedence(evaluation); - - // Enhance evaluation with metadata - const enhancedEvaluation = { - ...evaluation, - model: resolvedModel, - category, - filePath, - status: 'pending', - loadedAt: new Date().toISOString() - }; - - // Validate required fields - this.validateEvaluation(enhancedEvaluation); - - return enhancedEvaluation; - - } catch (error) { - logger.error(`EvaluationLoader: Failed to load evaluation file ${filePath}:`, error); - throw error; - } - } - - /** - * Validate evaluation structure - */ - validateEvaluation(evaluation) { - const required = ['id', 'name', 'tool']; - - for (const field of required) { - if (!evaluation[field]) { - throw new Error(`Evaluation missing required field: ${field}`); - } - } - - // Validate tool is supported - const supportedTools = [ - 'action_agent', - 'research_agent', - 'schema_extractor', - 'streamlined_schema_extractor', - 'screenshot_verification', - 'web_task_agent', - 'chat' - ]; - - if (!supportedTools.includes(evaluation.tool)) { - logger.warn(`EvaluationLoader: Unknown tool type: ${evaluation.tool}`); - } - - return true; - } - - /** - * Get all loaded evaluations - */ - getAllEvaluations() { - return Array.from(this.evaluations.values()); - } - - /** - * Get evaluations by category - */ - getEvaluationsByCategory(category) { - return this.categories.get(category) || []; - } - - /** - * Get all available categories - */ - getCategories() { - return Array.from(this.categories.keys()); - } - - /** - * Get evaluation by 
ID - */ - getEvaluationById(evaluationId) { - return this.evaluations.get(evaluationId); - } - - /** - * Filter evaluations by criteria - */ - filterEvaluations(criteria = {}) { - let evaluations = this.getAllEvaluations(); - - // Filter by category - if (criteria.category) { - evaluations = evaluations.filter(e => e.category === criteria.category); - } - - // Filter by tool - if (criteria.tool) { - evaluations = evaluations.filter(e => e.tool === criteria.tool); - } - - // Filter by tags - if (criteria.tags && criteria.tags.length > 0) { - evaluations = evaluations.filter(e => { - const evalTags = e.metadata?.tags || []; - return criteria.tags.some(tag => evalTags.includes(tag)); - }); - } - - // Filter by enabled status - if (criteria.enabled !== undefined) { - evaluations = evaluations.filter(e => e.enabled === criteria.enabled); - } - - // Filter by priority - if (criteria.priority) { - evaluations = evaluations.filter(e => e.metadata?.priority === criteria.priority); - } - - return evaluations; - } - - /** - * Get evaluation statistics - */ - getStatistics() { - const evaluations = this.getAllEvaluations(); - const stats = { - total: evaluations.length, - byCategory: {}, - byTool: {}, - byStatus: {}, - enabled: 0, - disabled: 0 - }; - - for (const evaluation of evaluations) { - // Count by category - const category = evaluation.category; - stats.byCategory[category] = (stats.byCategory[category] || 0) + 1; - - // Count by tool - const tool = evaluation.tool; - stats.byTool[tool] = (stats.byTool[tool] || 0) + 1; - - // Count by status - const status = evaluation.status || 'pending'; - stats.byStatus[status] = (stats.byStatus[status] || 0) + 1; - - // Count enabled/disabled - if (evaluation.enabled !== false) { - stats.enabled++; - } else { - stats.disabled++; - } - } - - return stats; - } - - /** - * Reload evaluations from disk - */ - async reload() { - return this.loadFromDirectory(this.evalsDir); - } - - /** - * Create a new evaluation programmatically - */ - createEvaluation(evaluationData) { - const evaluation = { - id: evaluationData.id || `eval-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`, - name: evaluationData.name || 'Untitled Evaluation', - description: evaluationData.description || '', - enabled: evaluationData.enabled !== false, - tool: evaluationData.tool || 'chat', - timeout: evaluationData.timeout || 45000, - input: evaluationData.input || {}, - model: this.applyModelPrecedence(evaluationData, evaluationData.model), - validation: evaluationData.validation || { type: 'none' }, - metadata: { - tags: ['programmatic'], - priority: 'medium', - ...evaluationData.metadata - }, - category: evaluationData.category || 'programmatic', - status: 'pending', - loadedAt: new Date().toISOString(), - ...evaluationData - }; - - // Validate the evaluation - this.validateEvaluation(evaluation); - - // Store the evaluation - this.evaluations.set(evaluation.id, evaluation); - - // Add to category - const category = evaluation.category; - if (!this.categories.has(category)) { - this.categories.set(category, []); - } - this.categories.get(category).push(evaluation); - - logger.info(`EvaluationLoader: Created evaluation ${evaluation.id} in category ${category}`); - return evaluation; - } - - /** - * Remove an evaluation - */ - removeEvaluation(evaluationId) { - const evaluation = this.evaluations.get(evaluationId); - if (!evaluation) { - return false; - } - - // Remove from main map - this.evaluations.delete(evaluationId); - - // Remove from category - const category = 
evaluation.category; - if (this.categories.has(category)) { - const categoryEvals = this.categories.get(category); - const index = categoryEvals.findIndex(e => e.id === evaluationId); - if (index !== -1) { - categoryEvals.splice(index, 1); - - // Remove category if empty - if (categoryEvals.length === 0) { - this.categories.delete(category); - } - } - } - - logger.info(`EvaluationLoader: Removed evaluation ${evaluationId}`); - return true; - } - - /** - * Update an existing evaluation - */ - updateEvaluation(evaluationId, updates) { - const evaluation = this.evaluations.get(evaluationId); - if (!evaluation) { - throw new Error(`Evaluation ${evaluationId} not found`); - } - - // Apply updates - const updatedEvaluation = { - ...evaluation, - ...updates, - id: evaluationId, // Ensure ID doesn't change - updatedAt: new Date().toISOString() - }; - - // Validate updated evaluation - this.validateEvaluation(updatedEvaluation); - - // Update in storage - this.evaluations.set(evaluationId, updatedEvaluation); - - // Update in category if category changed - if (updates.category && updates.category !== evaluation.category) { - // Remove from old category - const oldCategory = evaluation.category; - if (this.categories.has(oldCategory)) { - const oldCategoryEvals = this.categories.get(oldCategory); - const index = oldCategoryEvals.findIndex(e => e.id === evaluationId); - if (index !== -1) { - oldCategoryEvals.splice(index, 1); - if (oldCategoryEvals.length === 0) { - this.categories.delete(oldCategory); - } - } - } - - // Add to new category - const newCategory = updates.category; - if (!this.categories.has(newCategory)) { - this.categories.set(newCategory, []); - } - this.categories.get(newCategory).push(updatedEvaluation); - } else { - // Update existing entry in category - const category = evaluation.category; - if (this.categories.has(category)) { - const categoryEvals = this.categories.get(category); - const index = categoryEvals.findIndex(e => e.id === evaluationId); - if (index !== -1) { - categoryEvals[index] = updatedEvaluation; - } - } - } - - logger.info(`EvaluationLoader: Updated evaluation ${evaluationId}`); - return updatedEvaluation; - } -} \ No newline at end of file diff --git a/eval-server/nodejs/src/lib/EvaluationStack.js b/eval-server/nodejs/src/lib/EvaluationStack.js deleted file mode 100644 index 04d7b36..0000000 --- a/eval-server/nodejs/src/lib/EvaluationStack.js +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2025 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -/** - * EvaluationStack - A simple stack-like structure for managing evaluations - * - * Provides LIFO (Last In, First Out) access to evaluation objects. - * Useful for distributing different evaluations across multiple client connections. 
- */ -export class EvaluationStack { - constructor() { - this.evaluations = []; - } - - /** - * Add an evaluation to the top of the stack - * @param {Object} evaluation - The evaluation object to add - */ - push(evaluation) { - if (!evaluation || typeof evaluation !== 'object') { - throw new Error('Evaluation must be a valid object'); - } - - // Validate required fields - const requiredFields = ['id', 'name', 'tool', 'input']; - for (const field of requiredFields) { - if (!evaluation[field]) { - throw new Error(`Evaluation missing required field: ${field}`); - } - } - - this.evaluations.push(evaluation); - } - - /** - * Remove and return the evaluation from the top of the stack - * @returns {Object|null} The evaluation object, or null if stack is empty - */ - pop() { - return this.evaluations.pop() || null; - } - - /** - * Check if the stack is empty - * @returns {boolean} True if stack has no evaluations - */ - isEmpty() { - return this.evaluations.length === 0; - } - - /** - * Get the number of evaluations in the stack - * @returns {number} The stack size - */ - size() { - return this.evaluations.length; - } - - /** - * Peek at the top evaluation without removing it - * @returns {Object|null} The top evaluation object, or null if stack is empty - */ - peek() { - if (this.isEmpty()) { - return null; - } - return this.evaluations[this.evaluations.length - 1]; - } - - /** - * Clear all evaluations from the stack - */ - clear() { - this.evaluations = []; - } - - /** - * Get a copy of all evaluations in the stack (top to bottom) - * @returns {Array} Array of evaluation objects - */ - toArray() { - return [...this.evaluations].reverse(); - } -} \ No newline at end of file diff --git a/eval-server/nodejs/templates/default-client.yaml b/eval-server/nodejs/templates/default-client.yaml deleted file mode 100644 index 6ada130..0000000 --- a/eval-server/nodejs/templates/default-client.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# Default client configuration template -# This file is used as a template when creating new clients - -client: - id: "{CLIENT_ID}" - name: "{CLIENT_NAME}" - secret_key: "{SECRET_KEY}" # Optional - description: "Auto-generated client configuration" - -settings: - max_concurrent_evaluations: 3 - default_timeout: 30000 - retry_policy: - max_retries: 2 - backoff_multiplier: 2 - initial_delay: 1000 - -evaluations: - # Example evaluation - disabled by default - - id: "example-schema-extraction" - name: "Example Schema Extraction" - description: "A sample evaluation for schema extraction" - enabled: false - - target: - url: "https://example.com" - wait_for: "networkidle" - wait_timeout: 5000 - - tool: "extract_data" - timeout: 30000 - - input: - schema: - type: "object" - properties: - title: - type: "string" - description: "Page title" - content: - type: "string" - description: "Main content" - - - validation: - type: "llm-judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Title should be extracted correctly" - - "Content should be meaningful and not empty" - - metadata: - tags: ["example", "schema-extraction"] - priority: "normal" \ No newline at end of file diff --git a/eval-server/python/README.md b/eval-server/python/README.md deleted file mode 100644 index f167b48..0000000 --- a/eval-server/python/README.md +++ /dev/null @@ -1,368 +0,0 @@ -# bo-eval-server (Python) - -A minimal Python library for creating WebSocket-based evaluation servers for LLM agents. 
- -## Features - -- ๐Ÿ”Œ **WebSocket Server**: Real-time agent connections with asyncio -- ๐Ÿค– **Bidirectional RPC**: JSON-RPC 2.0 for calling methods on connected agents -- ๐Ÿ“š **Programmatic API**: Create and manage evaluations in Python code -- ๐Ÿ“Š **Evaluation Stack**: LIFO stack for managing evaluation queues -- โšก **Concurrent Support**: Full async/await support for multiple agents -- ๐Ÿ” **Enhanced Logging**: Structured logging with loguru -- โœจ **Minimal Dependencies**: Only websockets and loguru required - -## Quick Start - -### Basic WebSocket Server - -```python -import asyncio -from bo_eval_server import EvalServer - -async def main(): - server = EvalServer( - auth_key='hello', - host='127.0.0.1', - port=8080 - ) - - @server.on_connect - async def handle_client(client): - print(f'Client connected: {client.id}') - - response = await client.evaluate({ - "id": "test_eval", - "name": "Capital of France", - "tool": "chat", - "input": {"message": "What is the capital of France?"} - }) - - print(f'Response: {response}') - - await server.start() - print('Server running on ws://127.0.0.1:8080') - - # Keep server running - await server.wait_closed() - -if __name__ == "__main__": - asyncio.run(main()) -``` - -### Using Evaluation Stack - -```python -import asyncio -from bo_eval_server import EvalServer, EvaluationStack - -async def main(): - server = EvalServer(auth_key='secret', port=8080) - stack = EvaluationStack() - - # Add evaluations to stack - stack.push({ - "id": "eval_001", - "name": "Math Question", - "tool": "chat", - "input": {"message": "What is 2 + 2?"} - }) - - stack.push({ - "id": "eval_002", - "name": "Science Question", - "tool": "chat", - "input": {"message": "What is the speed of light?"} - }) - - @server.on_connect - async def handle_client(client): - print(f'Client connected: {client.id}') - - # Process evaluations from stack - while not stack.is_empty(): - evaluation = stack.pop() - try: - result = await client.evaluate(evaluation) - print(f'โœ… {evaluation["name"]}: {result["status"]}') - except Exception as e: - print(f'โŒ {evaluation["name"]}: {e}') - - await server.start() - await server.wait_closed() - -if __name__ == "__main__": - asyncio.run(main()) -``` - -## Installation - -### Using uv (Recommended) - -```bash -# Install uv package manager (if not already installed) -curl -LsSf https://astral.sh/uv/install.sh | sh - -# Install dependencies and create virtual environment -uv sync - -# Run examples using the convenient runner -python run.py basic # Basic server example -python run.py stack # Evaluation stack example -python run.py prog # Programmatic evaluations example -python run.py all # Show all available examples - -# Or run examples directly with uv -uv run python examples/basic_server.py -uv run python examples/with_stack.py -uv run python examples/programmatic_evals.py -``` - -### Using pip (Alternative) - -```bash -# Install the package -pip install -e . 
- -# Or install with development dependencies -pip install -e ".[dev]" - -# Or install from requirements.txt -pip install -r requirements.txt -``` - -## Library Usage - -### EvalServer API - -```python -from bo_eval_server import EvalServer - -# Create server instance -server = EvalServer( - auth_key='your-secret-key', # Required: client authentication - host='127.0.0.1', # Optional: default 'localhost' - port=8080, # Optional: default 8080 -) - -# Register event handlers -@server.on_connect -async def handle_connect(client): - # Called when client connects and is ready - pass - -@server.on_disconnect -async def handle_disconnect(client_info): - # Called when client disconnects - pass - -# Server lifecycle -await server.start() # Start the server -await server.stop() # Stop the server -await server.wait_closed() # Wait for server to close - -# Server status -status = server.get_status() -print(f"Server running: {status['running']}") -``` - -### Client Proxy API - -```python -@server.on_connect -async def handle_client(client): - # Client information - print(f'Client ID: {client.id}') - print(f'Tab ID: {client.tab_id}') - print(f'Base Client ID: {client.base_client_id}') - - # Execute evaluations - result = await client.evaluate({ - "id": "eval_001", - "name": "Test Evaluation", - "description": "Optional description", - "tool": "chat", - "input": {"message": "Your question here"}, - "timeout": 30.0, # Optional timeout in seconds - "metadata": {"tags": ["api", "test"]} - }) - - # Send custom messages - await client.send_message({ - "type": "custom", - "data": "Hello client!" - }) -``` - -### EvaluationStack API - -```python -from bo_eval_server import EvaluationStack - -stack = EvaluationStack() - -# Add evaluations (LIFO - Last In, First Out) -stack.push({ - "id": "eval_001", - "name": "Test", - "tool": "chat", - "input": {"message": "Hello"} -}) - -# Remove and get evaluation -evaluation = stack.pop() # Returns dict or None if empty - -# Stack operations -size = stack.size() # Get number of evaluations -is_empty = stack.is_empty() # Check if empty -top = stack.peek() # View top without removing -stack.clear() # Remove all evaluations -all_evals = stack.to_array() # Get copy as list -``` - -## Agent Protocol - -Your agent needs to implement the WebSocket protocol: - -### 1. Connect to WebSocket -```python -import websockets -import json - -ws = await websockets.connect('ws://localhost:8080') -``` - -### 2. Receive Authentication Challenge -The server sends an authentication challenge with the secret key: -```python -challenge = json.loads(await ws.recv()) -# Expected: {"type": "auth_challenge", "secretKey": "hello", "connectionId": "uuid"} -``` - -### 3. Send Registration Response -Client validates the secret key and responds: -```python -await ws.send(json.dumps({ - "type": "register", - "clientId": "your-client-id", - "acceptAuth": True, # True if secret key is acceptable - "connectionId": challenge["connectionId"], - "capabilities": ["chat", "action"] -})) -``` - -### 4. Receive Registration Confirmation -```python -confirmation = json.loads(await ws.recv()) -# Expected: {"type": "registered", "clientId": "your-client-id", "serverTime": 123456} -``` - -### 5. Send Ready Signal -```python -await ws.send(json.dumps({"type": "ready"})) -``` - -### 6. 
Handle RPC Calls -```python -async for message in ws: - data = json.loads(message) - - if data.get("jsonrpc") == "2.0" and data.get("method") == "evaluate": - # Handle evaluation request - result = await handle_evaluation(data["params"]) - - # Send response - await ws.send(json.dumps({ - "jsonrpc": "2.0", - "id": data["id"], - "result": result - })) -``` - -## Architecture - -``` -src/bo_eval_server/ -โ”œโ”€โ”€ __init__.py # Package exports -โ”œโ”€โ”€ eval_server.py # Main EvalServer class -โ”œโ”€โ”€ evaluation_stack.py # EvaluationStack implementation -โ”œโ”€โ”€ client_manager.py # Client connection management -โ”œโ”€โ”€ rpc_client.py # JSON-RPC client implementation -โ”œโ”€โ”€ config.py # Configuration management -โ””โ”€โ”€ logger.py # Enhanced logging setup -``` - -## Design Principles - -- **Async-First**: Built on asyncio for high concurrency -- **Minimal Dependencies**: Only essential packages required -- **Type Hints**: Full typing support for better development experience -- **Event-Driven**: React to client connections with decorators -- **Programmatic**: Full control through Python code -- **Clean API**: Simple, Pythonic interface - -## Examples - -See the `examples/` directory for complete working examples: - -- `basic_server.py` - Simple WebSocket server setup -- `with_stack.py` - Using evaluation stack for queuing -- `programmatic_evals.py` - Creating evaluations in code - -## Evaluation Scripts - -The `evals/` directory contains ready-to-use evaluation scripts for various benchmarks: - -- `browsecomp_eval_server.py` - Browsecomp benchmark server (1,266 web browsing questions) - - Run with: `./evals/run_browsecomp_eval_server.sh` - - See `evals/README.md` for detailed usage - -## Development - -### Using uv - -```bash -# Install with development dependencies -uv sync --dev - -# Run tests -uv run pytest - -# Format code -uv run black src/ examples/ - -# Type checking -uv run mypy src/ - -# Run all development commands -uv run pytest && uv run black src/ examples/ && uv run mypy src/ -``` - -### Using pip - -```bash -# Install in development mode -pip install -e ".[dev]" - -# Run tests -pytest - -# Format code -black src/ examples/ - -# Type checking -mypy src/ -``` - -## Environment Variables - -```bash -# Optional configuration -BO_EVAL_SERVER_HOST=localhost -BO_EVAL_SERVER_PORT=8080 -BO_EVAL_SERVER_LOG_LEVEL=INFO -``` - ---- - -This Python implementation provides the core WebSocket evaluation server functionality with a clean, async API for programmatic evaluation management. \ No newline at end of file diff --git a/eval-server/python/UV_COMMANDS.md b/eval-server/python/UV_COMMANDS.md deleted file mode 100644 index ea79fcb..0000000 --- a/eval-server/python/UV_COMMANDS.md +++ /dev/null @@ -1,188 +0,0 @@ -# UV Commands Reference - -Quick reference for using uv with bo-eval-server Python implementation. 
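
As a companion to the agent-protocol walkthrough in the library README above (auth challenge, register, ready, then JSON-RPC `evaluate` calls), the steps can be condensed into one small client. This is an illustrative sketch only: the server URL, client id, capabilities list, and the echo-style `handle_evaluation` helper are assumptions for the example, not part of the library.

```python
import asyncio
import json

import websockets


async def handle_evaluation(params: dict) -> dict:
    # Placeholder: a real agent would run the requested tool here.
    message = params.get("input", {}).get("message", "")
    return {"status": "completed", "output": f"Echo: {message}"}


async def run_agent(url: str = "ws://localhost:8080",
                    client_id: str = "example-agent") -> None:
    async with websockets.connect(url) as ws:
        # 1. Receive the authentication challenge from the server.
        challenge = json.loads(await ws.recv())

        # 2. Register, accepting the offered secret key.
        await ws.send(json.dumps({
            "type": "register",
            "clientId": client_id,
            "acceptAuth": True,
            "connectionId": challenge["connectionId"],
            "capabilities": ["chat"],
        }))

        # 3. Wait for the registration confirmation, then signal readiness.
        json.loads(await ws.recv())
        await ws.send(json.dumps({"type": "ready"}))

        # 4. Answer JSON-RPC "evaluate" calls until the server disconnects.
        async for message in ws:
            data = json.loads(message)
            if data.get("jsonrpc") == "2.0" and data.get("method") == "evaluate":
                result = await handle_evaluation(data["params"])
                await ws.send(json.dumps({
                    "jsonrpc": "2.0",
                    "id": data["id"],
                    "result": result,
                }))


if __name__ == "__main__":
    asyncio.run(run_agent())
```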
- -## Installation & Setup - -```bash -# Install uv (if not already installed) -curl -LsSf https://astral.sh/uv/install.sh | sh - -# Install project dependencies -uv sync - -# Install with development dependencies -uv sync --dev -``` - -## Running Examples - -### Using the convenience runner (Recommended) - -```bash -# Basic WebSocket server -python run.py basic - -# Evaluation stack example -python run.py stack - -# Programmatic evaluations with analytics -python run.py prog - -# Show all available examples -python run.py all -``` - -### Direct uv execution - -```bash -# Run examples directly -uv run python examples/basic_server.py -uv run python examples/with_stack.py -uv run python examples/programmatic_evals.py - -# Run with custom arguments or environment variables -uv run --env BO_EVAL_SERVER_PORT=8081 python examples/basic_server.py -``` - -## Development Commands - -```bash -# Run tests -uv run pytest - -# Run tests with coverage -uv run pytest --cov=src/bo_eval_server - -# Format code -uv run black . -uv run black src/ examples/ - -# Type checking -uv run mypy src/ - -# Run all checks -uv run pytest && uv run black . && uv run mypy src/ -``` - -## Package Management - -```bash -# Add new dependencies -uv add requests -uv add --dev pytest-cov - -# Remove dependencies -uv remove requests - -# Update dependencies -uv sync --upgrade - -# Show installed packages -uv tree - -# Show project info -uv show -``` - -## Virtual Environment - -```bash -# Activate virtual environment -source .venv/bin/activate # Unix/macOS -# or -.venv\Scripts\activate # Windows - -# Check Python version in venv -uv run python --version - -# Run any command in the virtual environment -uv run -``` - -## Project Scripts - -The project includes entry point scripts defined in `pyproject.toml`: - -```bash -# After installation, these commands become available: -bo-eval-basic # Run basic server example -bo-eval-stack # Run evaluation stack example -bo-eval-programmatic # Run programmatic evaluations example -``` - -## Useful UV Options - -```bash -# Run with specific Python version -uv run --python 3.11 python examples/basic_server.py - -# Run with environment variables -uv run --env DEBUG=1 python examples/basic_server.py - -# Run in isolated environment (no local packages) -uv run --isolated python examples/basic_server.py - -# Show verbose output -uv sync --verbose - -# Force reinstall -uv sync --reinstall -``` - -## Integration with IDEs - -For VS Code and other IDEs, point to the uv-created virtual environment: - -```bash -# Show virtual environment path -echo $PWD/.venv/bin/python - -# Or use uv to find it -uv run which python -``` - -Then configure your IDE to use this Python interpreter for the project. 
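
Once the IDE is configured, one quick sanity check (a small sketch, not part of the project) is to run a snippet from the IDE's integrated terminal or debugger and confirm it reports the uv-managed `.venv` interpreter:

```python
# Confirms which interpreter the IDE is actually using for this project.
import sys

print("Interpreter:", sys.executable)  # should point into .venv/bin (or .venv\Scripts on Windows)
print("Prefix:", sys.prefix)
```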
- -## Common Workflows - -### Quick Start Development - -```bash -git clone -cd eval-server/python -uv sync --dev -python run.py basic -``` - -### Running Tests in CI - -```bash -uv sync --dev --frozen -uv run pytest --cov=src/bo_eval_server --cov-report=xml -``` - -### Building and Publishing - -```bash -uv build -uv publish # If publishing to PyPI -``` - -## Troubleshooting - -```bash -# Clear uv cache -uv cache clean - -# Reinstall everything -rm -rf .venv uv.lock -uv sync - -# Check uv version -uv --version - -# Get help -uv --help -uv run --help -``` \ No newline at end of file diff --git a/eval-server/python/evals/README.md b/eval-server/python/evals/README.md deleted file mode 100644 index 6d3b082..0000000 --- a/eval-server/python/evals/README.md +++ /dev/null @@ -1,195 +0,0 @@ -# Python Evaluation Scripts - -This directory contains evaluation scripts for running various benchmark datasets using the Python eval-server. - -## Available Scripts - -### Browsecomp Evaluation Server - -**Script**: `browsecomp_eval_server.py` -**Wrapper**: `run_browsecomp_eval_server.sh` - -The browsecomp eval server loads questions from the [Browsecomp benchmark](https://github.com/openai/simple-evals) and distributes them to connected BrowserOperator clients via WebSocket connections. - -#### Features - -- Loads and decrypts 1,266 browsecomp questions automatically -- Distributes exactly one question per client connection -- Stack-based LIFO distribution -- **Automatic scoring**: Compares responses against true answers -- **Structured response parsing**: Handles BrowserOperator's message format -- **Comprehensive logging**: Structured logs saved to timestamped files -- Real-time progress tracking with accuracy metrics -- Confidence score extraction and analysis -- Results saved to JSON file for later analysis -- Configurable timeout (default: 60 minutes) -- Configurable server settings - -#### Usage - -```bash -# Use the wrapper script for proper dependencies -./run_browsecomp_eval_server.sh --help - -# List available questions -./run_browsecomp_eval_server.sh --list --list-limit 10 - -# Start server with first 5 questions -./run_browsecomp_eval_server.sh --limit 5 - -# Start server with specific questions -./run_browsecomp_eval_server.sh --questions 1 5 10 25 - -# Start server with a range of questions (questions 10-15) -./run_browsecomp_eval_server.sh --start 10 --end 15 - -# Start server from question 100 to the end -./run_browsecomp_eval_server.sh --start 100 - -# Start server with questions 1-50 -./run_browsecomp_eval_server.sh --end 50 - -# Start server with all 1,266 questions -./run_browsecomp_eval_server.sh - -# Custom configuration -./run_browsecomp_eval_server.sh --limit 20 --port 8081 --auth-key my-key - -# Save results to JSON file -./run_browsecomp_eval_server.sh --limit 10 --save-results -``` - -#### How It Works - -1. **Load Questions**: The server loads browsecomp questions from the dataset -2. **Stack Distribution**: Questions are placed in a LIFO stack -3. **Client Connection**: When a BrowserOperator connects, it receives one question -4. **Processing**: The client processes the question and returns results -5. **Automatic Scoring**: Server compares responses against true answers -6. **Tracking**: Server tracks completion, accuracy, and confidence statistics -7. 
**Results**: Optionally saves detailed results to JSON file - -#### Example Workflow - -```bash -# Terminal 1: Start the eval server -cd /path/to/eval-server/python/evals -./run_browsecomp_eval_server.sh --limit 10 --save-results - -# Terminal 2+: Connect BrowserOperator clients -# Each client will automatically receive and process one question -``` - -#### Scoring Output - -When evaluations complete, you'll see automatic scoring results: - -``` -โœ… Evaluation completed! -๐Ÿ“Š Response structure: 12 messages, 3 tool calls, gpt-4 model, 45230ms - -๐ŸŽฏ Scoring Results: - - True Answer: 1988-96 - - Extracted Answer: 1988-96 - - Correct: โœ… YES - - Confidence: 85% - -๐Ÿ“Š Current Statistics: - โœ… Completed: 5/10 - โŒ Failed: 0/10 - ๐Ÿ“š Remaining: 5/10 - -๐ŸŽฏ Scoring Statistics: - ๐Ÿ“Š Accuracy: 80.0% (4/5 correct) - ๐Ÿ’ก Average Confidence: 78.5% -``` - -#### Results JSON Format - -When using `--save-results`, evaluations are saved to `browsecomp_eval_results_[timestamp].json`: - -```json -{ - "timestamp": "20240115_143022", - "total_questions": 10, - "completed": 10, - "failed": 0, - "accuracy": 80.0, - "average_confidence": 78.5, - "evaluations": [ - { - "client_id": "abc123...", - "question_id": 1, - "result": "Explanation: ... Exact Answer: 1988-96 Confidence Score: 85%", - "scoring": { - "is_correct": true, - "true_answer": "1988-96", - "extracted_answer": "1988-96", - "confidence": 85 - } - } - ] -} -``` - -#### Logging - -The server creates comprehensive logs in the `./logs/` directory: - -- **Console Output**: Real-time progress with emojis and summaries -- **Structured Logs**: Timestamped log file `browsecomp_eval_server_YYYYMMDD_HHMMSS.log` - -**Structured Log Events**: -``` -EVENT: {"timestamp": "2024-01-15T14:30:22", "event_type": "client_connected", "client_id": "abc123", "stack_remaining": 10} -EVENT: {"timestamp": "2024-01-15T14:30:25", "event_type": "evaluation_assigned", "evaluation_id": "browsecomp_q1", "question_id": 1} -EVENT: {"timestamp": "2024-01-15T14:32:10", "event_type": "evaluation_completed", "is_correct": true, "confidence": 85, "model_used": "gpt-4"} -EVENT: {"timestamp": "2024-01-15T14:35:00", "event_type": "session_completed", "accuracy": 80.0, "total_questions": 10} -``` - -**Log Files Location**: -- `./logs/browsecomp_eval_server_YYYYMMDD_HHMMSS.log` - Main server log -- `./logs/` - Directory also used by eval-server's internal logging - -## Dependencies - -The evaluation scripts require additional dependencies beyond the base eval-server: -- `pandas` - For dataset loading and manipulation -- `requests` - For downloading datasets - -These are automatically installed when you run `uv sync` in the eval-server/python directory. - -## Adding New Evaluation Scripts - -To add a new evaluation script: - -1. Create your script in this directory -2. Import the eval-server modules: - ```python - import sys - from pathlib import Path - sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - from bo_eval_server import EvalServer, EvaluationStack - ``` - -3. Create a wrapper script for easy execution: - ```bash - #!/bin/bash - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - cd "$SCRIPT_DIR/.." - uv run python evals/your_script.py "$@" - ``` - -4. 
Make the wrapper executable: `chmod +x your_wrapper.sh` - -## Dataset Files - -- `browsecomp_dataset.py` - Dataset loader for browsecomp questions with automatic decryption support -- `browsecomp_scorer.py` - Scoring logic that extracts answers and compares against ground truth - -## Notes - -- Always use the wrapper scripts (`.sh` files) to ensure proper dependencies are loaded -- The eval server runs on WebSocket protocol (ws://localhost:8080 by default) -- Each connected client receives exactly one evaluation from the stack -- Progress and statistics are shown in real-time during execution \ No newline at end of file diff --git a/eval-server/python/evals/browsecomp_dataset.py b/eval-server/python/evals/browsecomp_dataset.py deleted file mode 100644 index 387f713..0000000 --- a/eval-server/python/evals/browsecomp_dataset.py +++ /dev/null @@ -1,252 +0,0 @@ -#!/usr/bin/env python3 -""" -Browsecomp Dataset Loader -Handles downloading and decrypting the Browsecomp benchmark dataset -""" - -import base64 -import hashlib -import pandas as pd -from typing import List, Dict, Optional - - -DATASET_URL = "https://openaipublic.blob.core.windows.net/simple-evals/browse_comp_test_set.csv" - - -def derive_key(password: str, length: int) -> bytes: - """Derive a fixed-length key from the password using SHA256.""" - hasher = hashlib.sha256() - hasher.update(password.encode()) - key = hasher.digest() - return key * (length // len(key)) + key[: length % len(key)] - - -def decrypt(ciphertext_b64: str, password: str) -> str: - """Decrypt base64-encoded ciphertext with XOR.""" - encrypted = base64.b64decode(ciphertext_b64) - key = derive_key(password, len(encrypted)) - decrypted = bytes(a ^ b for a, b in zip(encrypted, key)) - return decrypted.decode() - - -class BrowsecompDataset: - def __init__(self, password: Optional[str] = None): - """ - Initialize the Browsecomp dataset loader. - - Args: - password: Decryption password for the dataset. If None, will try - to read from BROWSECOMP_PASSWORD environment variable. 
- """ - self.password = password - if not self.password: - import os - self.password = os.environ.get('BROWSECOMP_PASSWORD', '') - - self.dataset = None - - def load_dataset(self) -> pd.DataFrame: - """Load and decrypt the Browsecomp dataset.""" - print("Loading Browsecomp dataset...") - - try: - # Download the dataset - df = pd.read_csv(DATASET_URL) - print(f"Downloaded {len(df)} encrypted questions") - - # Add index column for reference - df = df.reset_index(drop=True) - df['question_id'] = df.index + 1 - - # Check for canary field in dataset (automatic password) - canary_password = None - if 'canary' in df.columns and len(df) > 0: - canary_password = df.iloc[0]['canary'] - print(f"Found canary password in dataset") - - # Use provided password or canary from dataset - decryption_password = self.password or canary_password - - if decryption_password: - print("Decrypting questions...") - decrypted_rows = [] - - for idx, row in df.iterrows(): - try: - # Use the canary from the row or the provided password - row_canary = row.get('canary', decryption_password) - - # Decrypt the problem and answer columns - row_dict = row.to_dict() - - if 'problem' in row and pd.notna(row['problem']): - row_dict['problem_decrypted'] = decrypt(row['problem'], row_canary) - row_dict['problem_encrypted'] = row['problem'] - else: - row_dict['problem_decrypted'] = "[No problem field]" - - if 'answer' in row and pd.notna(row['answer']): - row_dict['answer_decrypted'] = decrypt(row['answer'], row_canary) - row_dict['answer_encrypted'] = row['answer'] - else: - row_dict['answer_decrypted'] = "" - - decrypted_rows.append(row_dict) - - except Exception as e: - print(f"Error decrypting row {idx}: {e}") - row_dict = row.to_dict() - row_dict['problem_decrypted'] = f"[Decryption failed: {str(e)}]" - row_dict['answer_decrypted'] = "" - decrypted_rows.append(row_dict) - - df = pd.DataFrame(decrypted_rows) - print(f"Successfully decrypted {len(df)} questions") - else: - print("Warning: No password provided and no canary found, questions remain encrypted") - df['problem_decrypted'] = df.get('problem', '') - df['answer_decrypted'] = df.get('answer', '') - - # Normalize column names for consistency - df = self._normalize_columns(df) - - # Add difficulty level (all Browsecomp questions are considered level 1) - df['task'] = 1 - - self.dataset = df - return df - - except Exception as e: - print(f"Error loading dataset: {e}") - raise - - def _normalize_columns(self, df: pd.DataFrame) -> pd.DataFrame: - """Normalize column names to match expected format.""" - # Map Browsecomp columns to standard format - column_mapping = { - 'problem_decrypted': 'question', - 'problem': 'question_encrypted', - 'answer_decrypted': 'true_answer', - 'answer': 'true_answer_encrypted', - 'question_id': 'question_id' - } - - # Apply renaming - for old_col, new_col in column_mapping.items(): - if old_col in df.columns: - df = df.rename(columns={old_col: new_col}) - - # Ensure required columns exist - if 'question' not in df.columns: - if 'problem_decrypted' in df.columns: - df['question'] = df['problem_decrypted'] - else: - raise ValueError("No question column found in dataset") - - if 'true_answer' not in df.columns: - if 'answer_decrypted' in df.columns: - df['true_answer'] = df['answer_decrypted'] - elif 'answer' in df.columns: - df['true_answer'] = df['answer'] - else: - print("Warning: No answer column found, setting empty answers") - df['true_answer'] = '' - - return df - - def get_questions(self, - indices: Optional[List[int]] = None, - limit: 
Optional[int] = None) -> pd.DataFrame: - """ - Get specific questions from the dataset. - - Args: - indices: List of question numbers (1-based) to retrieve - limit: Maximum number of questions to return - - Returns: - DataFrame with selected questions - """ - if self.dataset is None: - self.load_dataset() - - df = self.dataset.copy() - - # Filter by specific indices if provided - if indices: - # Convert to 0-based indexing - zero_based_indices = [i - 1 for i in indices if i > 0] - valid_indices = [i for i in zero_based_indices if i < len(df)] - - if not valid_indices: - print(f"No valid question indices found. Available range: 1-{len(df)}") - return pd.DataFrame() - - df = df.iloc[valid_indices] - - # Apply limit if specified - if limit and not indices: - df = df.head(limit) - - return df - - def list_questions(self, limit: int = 20) -> None: - """Display available questions.""" - if self.dataset is None: - self.load_dataset() - - print(f"\nAvailable Browsecomp questions (showing first {limit}):") - print("=" * 80) - - for idx in range(min(limit, len(self.dataset))): - row = self.dataset.iloc[idx] - question = row.get('question', row.get('problem_decrypted', '[Encrypted]')) - - # Truncate long questions - if isinstance(question, str): - question_preview = question[:60] + "..." if len(question) > 60 else question - else: - question_preview = "[No question text]" - - print(f"#{idx + 1:3d} {question_preview}") - - if len(self.dataset) > limit: - print(f"\n... and {len(self.dataset) - limit} more questions") - - print(f"\nTotal: {len(self.dataset)} questions") - - # Check if questions are actually decrypted - if len(self.dataset) > 0: - first_question = self.dataset.iloc[0].get('question', '') - if not first_question or first_question.startswith('['): - print("โš ๏ธ Questions are encrypted. Set BROWSECOMP_PASSWORD to decrypt.") - else: - print("โœ“ Questions are decrypted and ready to use") - - -def test_dataset_loading(): - """Test the dataset loading functionality.""" - dataset = BrowsecompDataset() - - try: - df = dataset.load_dataset() - print(f"\nโœ“ Loaded {len(df)} questions") - print(f"Columns: {list(df.columns)}") - - # Show first question - if len(df) > 0: - first = df.iloc[0] - print(f"\nFirst question (truncated):") - question_text = str(first.get('question', '')) - print(f" Question: {question_text[:100]}...") - print(f" Answer: {first.get('true_answer', 'N/A')}") - - except Exception as e: - print(f"โœ— Error: {e}") - return False - - return True - - -if __name__ == "__main__": - test_dataset_loading() \ No newline at end of file diff --git a/eval-server/python/evals/browsecomp_eval_server.py b/eval-server/python/evals/browsecomp_eval_server.py deleted file mode 100755 index 753e7cf..0000000 --- a/eval-server/python/evals/browsecomp_eval_server.py +++ /dev/null @@ -1,836 +0,0 @@ -#!/usr/bin/env python3 -""" -Browsecomp Evaluation Server - -Command-line controlled eval processing server that loads browsecomp questions -into a stack and distributes them one per client connection. 
-""" - -import argparse -import asyncio -import json -import logging -import sys -import time -from datetime import datetime -from pathlib import Path -from typing import List, Dict, Any, Optional - -# Add eval-server src to path -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -# Add current directory (evals) to path for browsecomp_dataset import -sys.path.insert(0, str(Path(__file__).parent)) - -from bo_eval_server import EvalServer, EvaluationStack -from browsecomp_dataset import BrowsecompDataset -from browsecomp_scorer import question_scorer, extract_answer, extract_confidence - - -def log_evaluation_event(logger: logging.Logger, event_type: str, data: Dict[str, Any]) -> None: - """ - Log a structured evaluation event. - - Args: - logger: Logger instance - event_type: Type of event (client_connect, evaluation_start, evaluation_complete, etc.) - data: Event data to log - """ - log_entry = { - "timestamp": datetime.now().isoformat(), - "event_type": event_type, - **data - } - logger.info(f"EVENT: {json.dumps(log_entry)}") - - -def setup_logging(log_dir: str = "./logs") -> logging.Logger: - """ - Set up logging to both console and file. - - Args: - log_dir: Directory to save log files - - Returns: - Configured logger - """ - # Ensure logs directory exists - Path(log_dir).mkdir(exist_ok=True) - - # Create timestamp for log file - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - log_file = Path(log_dir) / f"browsecomp_eval_server_{timestamp}.log" - - # Create logger - logger = logging.getLogger('browsecomp_eval_server') - logger.setLevel(logging.INFO) - - # Clear any existing handlers - logger.handlers.clear() - - # Create formatter - formatter = logging.Formatter( - '%(asctime)s | %(levelname)-8s | %(name)s | %(message)s', - datefmt='%Y-%m-%d %H:%M:%S' - ) - - # Console handler (for immediate feedback) - console_handler = logging.StreamHandler(sys.stdout) - console_handler.setLevel(logging.INFO) - console_handler.setFormatter(formatter) - logger.addHandler(console_handler) - - # File handler (for persistent logging) - file_handler = logging.FileHandler(log_file) - file_handler.setLevel(logging.INFO) - file_handler.setFormatter(formatter) - logger.addHandler(file_handler) - - logger.info(f"Logging initialized - saving to {log_file}") - return logger - - -def extract_response_text(result: Any) -> str: - """ - Extract the actual response text from BrowserOperator's structured response format. 
- - Args: - result: The response from BrowserOperator (could be string, dict, or structured format) - - Returns: - The text content that should be scored - """ - # Handle partial results with errors first - if isinstance(result, dict) and result.get('partial') and result.get('error'): - # This is our error structure, fallback to string representation - return str(result) - - # Handle structured BrowserOperator response - if isinstance(result, dict): - # Look for messages array (main response structure) - if 'messages' in result and isinstance(result['messages'], list): - response_parts = [] - - for message in result['messages']: - if isinstance(message, dict): - # Model responses with answers - if message.get('entity') == 'model' and message.get('answer'): - response_parts.append(message['answer']) - # Tool results - elif message.get('entity') == 'tool_result' and message.get('resultText'): - response_parts.append(message['resultText']) - # User messages - elif message.get('entity') == 'user' and message.get('text'): - response_parts.append(message['text']) - - if response_parts: - return '\n'.join(response_parts) - - # Fallback: look for common response fields - for field in ['answer', 'response', 'result', 'text', 'content']: - if field in result and result[field]: - return str(result[field]) - - # Fallback to string representation - return str(result) - - -def convert_question_to_evaluation(question_row: Dict[str, Any], question_id: int) -> Dict[str, Any]: - """ - Convert a browsecomp question to the evaluation format expected by eval-server. - - Args: - question_row: Row from the browsecomp dataset DataFrame - question_id: Question ID number (1-based) - - Returns: - Evaluation object compatible with eval-server - """ - question_text = question_row.get('question', question_row.get('problem_decrypted', '')) - true_answer = question_row.get('true_answer', question_row.get('answer_decrypted', '')) - - return { - "id": f"browsecomp_q{question_id}", - "name": f"Browsecomp Question {question_id}", - "description": f"Web browsing evaluation question from browsecomp dataset", - "tool": "chat", - "input": { - "message": f"{question_text}\n\nPlease provide your response in the following format:\n\nExplanation: [Step-by-step reasoning and information gathering]\n\nExact Answer: [The precise answer to the question]\n\nConfidence Score: [Confidence as a percentage, e.g., 85%]" - }, - # Store original data for later reference/scoring - "metadata": { - "question_id": question_id, - "true_answer": true_answer, - "original_question": question_text, - "dataset": "browsecomp" - } - } - - -def load_browsecomp_evaluations( - limit: Optional[int] = None, - questions: Optional[List[int]] = None, - start: Optional[int] = None, - end: Optional[int] = None, - password: Optional[str] = None -) -> List[Dict[str, Any]]: - """ - Load browsecomp questions and convert them to evaluation format. 
- - Args: - limit: Maximum number of questions to load - questions: Specific question numbers to load (1-based) - start: Start question number for range selection (1-based, inclusive) - end: End question number for range selection (1-based, inclusive) - password: Decryption password (optional, auto-detected from dataset) - - Returns: - List of evaluation objects - """ - print("๐Ÿ“š Loading Browsecomp dataset...") - - # Load dataset - dataset = BrowsecompDataset(password=password) - - try: - df = dataset.load_dataset() - print(f"โœ… Loaded {len(df)} questions from dataset") - except Exception as e: - print(f"โŒ Failed to load dataset: {e}") - return [] - - # Get specific questions, range, or apply limit - if questions: - print(f"๐Ÿ“‹ Filtering to specific questions: {questions}") - df_filtered = dataset.get_questions(indices=questions) - elif start is not None or end is not None: - # Handle range selection - if start is not None and end is not None: - if start > end: - print(f"โŒ Invalid range: start ({start}) cannot be greater than end ({end})") - return [] - if start < 1: - print(f"โŒ Invalid start: question numbers are 1-based, got {start}") - return [] - if end > len(df): - print(f"โš ๏ธ End question {end} exceeds dataset size ({len(df)}), using {len(df)} instead") - end = len(df) - - print(f"๐Ÿ“‹ Loading questions {start} to {end} (range of {end - start + 1} questions)") - # Convert to 0-based indexing for pandas - range_questions = list(range(start, end + 1)) - df_filtered = dataset.get_questions(indices=range_questions) - elif start is not None: - # Only start specified, go to end of dataset - if start < 1: - print(f"โŒ Invalid start: question numbers are 1-based, got {start}") - return [] - if start > len(df): - print(f"โŒ Start question {start} exceeds dataset size ({len(df)})") - return [] - - print(f"๐Ÿ“‹ Loading questions from {start} to end ({len(df) - start + 1} questions)") - range_questions = list(range(start, len(df) + 1)) - df_filtered = dataset.get_questions(indices=range_questions) - else: - # Only end specified, start from beginning - if end < 1: - print(f"โŒ Invalid end: question numbers are 1-based, got {end}") - return [] - if end > len(df): - print(f"โš ๏ธ End question {end} exceeds dataset size ({len(df)}), using {len(df)} instead") - end = len(df) - - print(f"๐Ÿ“‹ Loading questions 1 to {end} ({end} questions)") - range_questions = list(range(1, end + 1)) - df_filtered = dataset.get_questions(indices=range_questions) - elif limit: - print(f"๐Ÿ“‹ Limiting to first {limit} questions") - df_filtered = dataset.get_questions(limit=limit) - else: - print(f"๐Ÿ“‹ Loading all {len(df)} questions") - df_filtered = df - - if df_filtered.empty: - print("โŒ No questions found with the specified criteria") - return [] - - print(f"๐Ÿ”„ Converting {len(df_filtered)} questions to evaluation format...") - - # Convert to evaluation format - evaluations = [] - for idx, row in df_filtered.iterrows(): - question_id = row.get('question_id', idx + 1) - evaluation = convert_question_to_evaluation(row.to_dict(), question_id) - evaluations.append(evaluation) - - # Show preview of first few questions - if len(evaluations) <= 3: - question_preview = evaluation['input']['message'][:80] + "..." - print(f" โ€ข Q{question_id}: {question_preview}") - - if len(evaluations) > 3: - print(f" ... 
and {len(evaluations) - 3} more questions") - - print(f"โœ… Created {len(evaluations)} evaluation objects") - return evaluations - - -def main(): - """Main function for the browsecomp evaluation server.""" - return asyncio.run(async_main()) - -async def async_main(): - """Async main function for the browsecomp evaluation server.""" - parser = argparse.ArgumentParser(description="Browsecomp Evaluation Server") - parser.add_argument( - "--limit", - type=int, - help="Maximum number of questions to load (default: all 1,266 questions)" - ) - parser.add_argument( - "--questions", - type=int, - nargs="+", - help="Specific question numbers to load (1-based, e.g. --questions 1 5 10)" - ) - parser.add_argument( - "--start", - type=int, - help="Start question number for range selection (1-based, inclusive)" - ) - parser.add_argument( - "--end", - type=int, - help="End question number for range selection (1-based, inclusive)" - ) - parser.add_argument( - "--port", - type=int, - default=8080, - help="Server port (default: 8080)" - ) - parser.add_argument( - "--host", - type=str, - default="127.0.0.1", - help="Server host (default: 127.0.0.1)" - ) - parser.add_argument( - "--auth-key", - type=str, - default="browsecomp-eval", - help="Authentication key (default: browsecomp-eval)" - ) - parser.add_argument( - "--password", - type=str, - help="Dataset decryption password (optional, auto-detected from dataset)" - ) - parser.add_argument( - "--list", - action="store_true", - help="List available questions without starting server" - ) - parser.add_argument( - "--list-limit", - type=int, - default=20, - help="Number of questions to show when listing (default: 20)" - ) - parser.add_argument( - "--save-results", - action="store_true", - help="Save evaluation results to JSON file on completion" - ) - parser.add_argument( - "--timeout", - type=float, - default=3600.0, - help="Timeout for each evaluation in seconds (default: 3600s/60min)" - ) - - args = parser.parse_args() - - # Setup logging - logger = setup_logging("./logs") - - # Handle list mode - if args.list: - logger.info("๐Ÿ“‹ Listing available browsecomp questions...") - dataset = BrowsecompDataset(password=args.password) - - # Apply filtering for list mode if range or specific questions are specified - if args.questions or args.start is not None or args.end is not None: - # Load the full dataset first - df = dataset.load_dataset() - - # Apply the same filtering logic as the main function - if args.questions: - print(f"๐Ÿ“‹ Showing specific questions: {args.questions}") - df_filtered = dataset.get_questions(indices=args.questions) - elif args.start is not None or args.end is not None: - # Handle range selection (same logic as in load_browsecomp_evaluations) - if args.start is not None and args.end is not None: - if args.start > args.end: - print(f"โŒ Invalid range: start ({args.start}) cannot be greater than end ({args.end})") - return 1 - if args.start < 1: - print(f"โŒ Invalid start: question numbers are 1-based, got {args.start}") - return 1 - if args.end > len(df): - print(f"โš ๏ธ End question {args.end} exceeds dataset size ({len(df)}), using {len(df)} instead") - args.end = len(df) - - print(f"๐Ÿ“‹ Showing questions {args.start} to {args.end}") - range_questions = list(range(args.start, args.end + 1)) - df_filtered = dataset.get_questions(indices=range_questions) - elif args.start is not None: - if args.start < 1: - print(f"โŒ Invalid start: question numbers are 1-based, got {args.start}") - return 1 - if args.start > len(df): - print(f"โŒ 
Start question {args.start} exceeds dataset size ({len(df)})") - return 1 - - print(f"๐Ÿ“‹ Showing questions from {args.start} to end") - range_questions = list(range(args.start, len(df) + 1)) - df_filtered = dataset.get_questions(indices=range_questions) - else: # args.end is not None - if args.end < 1: - print(f"โŒ Invalid end: question numbers are 1-based, got {args.end}") - return 1 - if args.end > len(df): - print(f"โš ๏ธ End question {args.end} exceeds dataset size ({len(df)}), using {len(df)} instead") - args.end = len(df) - - print(f"๐Ÿ“‹ Showing questions 1 to {args.end}") - range_questions = list(range(1, args.end + 1)) - df_filtered = dataset.get_questions(indices=range_questions) - - # Display filtered results - if not df_filtered.empty: - print("=" * 80) - for idx, row in df_filtered.iterrows(): - question_id = row.get('question_id', idx + 1) - question = row.get('question', row.get('problem_decrypted', '[Encrypted]')) - - if isinstance(question, str): - question_preview = question[:60] + "..." if len(question) > 60 else question - else: - question_preview = str(question)[:60] + "..." - - print(f"#{question_id:3d} {question_preview}") - - print(f"\nShowing {len(df_filtered)} question(s)") - else: - print("โŒ No questions found with the specified criteria") - else: - # Standard list mode - dataset.list_questions(limit=args.list_limit) - - return - - logger.info("๐Ÿš€ Starting Browsecomp Evaluation Server") - logger.info("=" * 60) - - # Validate arguments - if args.questions and (args.start is not None or args.end is not None): - print("โŒ Cannot use --questions together with --start/--end. Choose one approach.") - return 1 - - if args.limit and (args.start is not None or args.end is not None): - print("โŒ Cannot use --limit together with --start/--end. Choose one approach.") - return 1 - - # Load evaluations - evaluations = load_browsecomp_evaluations( - limit=args.limit, - questions=args.questions, - start=args.start, - end=args.end, - password=args.password - ) - - if not evaluations: - print("โŒ No evaluations loaded. 
Exiting.") - return 1 - - # Create evaluation stack and populate it - stack = EvaluationStack() - - print(f"\n๐Ÿ“š Loading {len(evaluations)} evaluations into stack...") - for evaluation in evaluations: - stack.push(evaluation) - - print(f"โœ… Stack loaded with {stack.size()} evaluations") - print(f"๐Ÿ” Top evaluation: {stack.peek()['name'] if stack.peek() else 'None'}") - - # Create server - server = EvalServer( - auth_key=args.auth_key, - host=args.host, - port=args.port, - log_level='INFO', - log_dir='./logs', - rpc_timeout=args.timeout, - ) - - # Track processed evaluations - completed_evaluations = [] - failed_evaluations = [] - client_evaluation_map = {} # client_id -> evaluation_id mapping - - print(f"\n๐ŸŒ Server Configuration:") - print(f" Host: {args.host}") - print(f" Port: {args.port}") - print(f" Auth Key: {args.auth_key}") - print(f" Timeout: {args.timeout}s ({args.timeout/60:.1f} minutes)") - print(f" Total Evaluations: {stack.size()}") - - @server.on_connect - async def handle_client(client): - logger.info(f'๐ŸŽ‰ CLIENT CONNECTED!') - logger.info(f' - Client ID: {client.id}') - logger.info(f' - Client tabId: {client.tab_id}') - logger.info(f' - Client info: {client.get_info()}') - - # Log structured client connection event - log_evaluation_event(logger, "client_connected", { - "client_id": client.id, - "tab_id": client.tab_id, - "client_info": client.get_info(), - "stack_remaining": stack.size() - }) - - # Check if we have evaluations left in the stack - if stack.is_empty(): - print('โš ๏ธ No more evaluations in stack for this client') - print(' All browsecomp questions have been distributed') - await client.send_message({ - "type": "no_evaluations", - "message": "All browsecomp questions have been distributed" - }) - return - - # Pop the next evaluation from the stack (ONE evaluation per client!) - evaluation = stack.pop() - evaluation_id = evaluation['id'] - question_id = evaluation['metadata']['question_id'] - - print(f'๐Ÿ“‹ Assigning evaluation: "{evaluation["name"]}" (Question #{question_id})') - print(f'๐Ÿ“Š Remaining evaluations in stack: {stack.size()}') - - # Track which evaluation was sent to which client - client_evaluation_map[client.id] = evaluation_id - - # Log evaluation assignment - log_evaluation_event(logger, "evaluation_assigned", { - "client_id": client.id, - "evaluation_id": evaluation_id, - "question_id": question_id, - "evaluation_name": evaluation["name"], - "stack_remaining": stack.size(), - "true_answer": evaluation['metadata']['true_answer'] - }) - - try: - print(f'๐Ÿ”„ Starting evaluation... 
(timeout: {args.timeout}s)') - result = await client.evaluate(evaluation, timeout=args.timeout) - - print('โœ… Evaluation completed!') - - # Extract the true answer from evaluation metadata - true_answer = evaluation['metadata']['true_answer'] - - # Check if this is a partial result with errors - is_partial_result = (isinstance(result, dict) and - result.get('partial') and - result.get('error')) - - # Extract the actual response text from the structured format - response_text = extract_response_text(result) - - # Show structured response details if available - if isinstance(result, dict) and 'messages' in result: - message_count = len(result.get('messages', [])) - model_used = result.get('modelUsed', 'unknown') - execution_time = result.get('executionTime', 0) - tool_calls = len(result.get('toolCalls', [])) - print(f'๐Ÿ“Š Response structure: {message_count} messages, {tool_calls} tool calls, {model_used} model, {execution_time}ms') - else: - print(f'๐Ÿ“Š Response for "{evaluation["name"]}": {response_text[:100]}...') - - # Score the response - is_correct = question_scorer(response_text, true_answer) - extracted_answer = extract_answer(response_text) - confidence = extract_confidence(response_text) - - # Print scoring results - print(f'๐ŸŽฏ Scoring Results:') - print(f' - True Answer: {true_answer}') - print(f' - Extracted Answer: {extracted_answer}') - print(f' - Correct: {"โœ… YES" if is_correct else "โŒ NO"}') - print(f' - Confidence: {confidence}%') - - if is_partial_result: - print(f'โš ๏ธ Note: Result obtained after retries with errors:') - print(f' - Error: {result.get("error", "Unknown error")}') - print(f' - Attempts: {result.get("attempts", "Unknown")}') - print(f' - The BrowserOperator had issues but provided a response') - - # Log evaluation completion - log_evaluation_event(logger, "evaluation_completed", { - "client_id": client.id, - "evaluation_id": evaluation_id, - "question_id": question_id, - "evaluation_name": evaluation["name"], - "is_correct": is_correct, - "extracted_answer": extracted_answer, - "true_answer": true_answer, - "confidence": confidence, - "is_partial_result": is_partial_result, - "model_used": result.get('modelUsed') if isinstance(result, dict) else None, - "execution_time_ms": result.get('executionTime') if isinstance(result, dict) else None, - "tool_calls_count": len(result.get('toolCalls', [])) if isinstance(result, dict) else None - }) - - completed_evaluations.append({ - 'client_id': client.id, - 'evaluation': evaluation, - 'result': result, - 'question_id': question_id, - 'scoring': { - 'is_correct': is_correct, - 'true_answer': true_answer, - 'extracted_answer': extracted_answer, - 'confidence': confidence - }, - 'partial_result': is_partial_result, - 'execution_info': { - 'had_errors': is_partial_result, - 'error_message': result.get('error') if is_partial_result else None, - 'retry_attempts': result.get('attempts') if is_partial_result else 1, - 'model_used': result.get('modelUsed') if isinstance(result, dict) else None, - 'execution_time_ms': result.get('executionTime') if isinstance(result, dict) else None, - 'tool_calls_count': len(result.get('toolCalls', [])) if isinstance(result, dict) else None, - 'messages_count': len(result.get('messages', [])) if isinstance(result, dict) else None - } - }) - - except Exception as e: - error_msg = str(e) - print(f'โŒ Evaluation "{evaluation["name"]}" failed: {error_msg}') - - # Check if this is a tool execution error that might still be running - if "Tool execution failed" in error_msg or "-32000" in 
error_msg: - print(f'โš ๏ธ Note: BrowserOperator may still be processing this question') - print(f' The client reported an error but might continue execution') - print(f' Consider increasing timeout with --timeout parameter') - - # Log evaluation failure - log_evaluation_event(logger, "evaluation_failed", { - "client_id": client.id, - "evaluation_id": evaluation_id, - "question_id": question_id, - "evaluation_name": evaluation["name"], - "error_message": error_msg, - "is_tool_execution_error": "Tool execution failed" in error_msg or "-32000" in error_msg, - "true_answer": evaluation['metadata']['true_answer'] - }) - - failed_evaluations.append({ - 'client_id': client.id, - 'evaluation': evaluation, - 'error': error_msg, - 'question_id': question_id, - }) - - # Send completion message - try: - await client.send_message({ - "type": "evaluation_complete", - "evaluation_id": evaluation_id, - "evaluation_name": evaluation["name"], - "question_id": question_id, - "status": "completed" if evaluation_id not in [e['evaluation']['id'] for e in failed_evaluations] else "failed" - }) - except Exception as e: - print(f' โš ๏ธ Failed to send completion message: {e}') - - @server.on_disconnect - async def handle_disconnect(client_info): - client_id = client_info["id"] - print(f'\n๐Ÿ”Œ Client disconnected: {client_id}') - - # Show what evaluation this client was working on - evaluation_id = None - if client_id in client_evaluation_map: - evaluation_id = client_evaluation_map[client_id] - print(f' Was working on: {evaluation_id}') - - # Log client disconnect - log_evaluation_event(logger, "client_disconnected", { - "client_id": client_id, - "evaluation_id": evaluation_id, - "completed_count": len(completed_evaluations), - "failed_count": len(failed_evaluations), - "stack_remaining": stack.size() - }) - - # Show final statistics - total_completed = len(completed_evaluations) - total_failed = len(failed_evaluations) - remaining = stack.size() - total_original = len(evaluations) - - print(f'\n๐Ÿ“Š Current Statistics:') - print(f' โœ… Completed: {total_completed}/{total_original}') - print(f' โŒ Failed: {total_failed}/{total_original}') - print(f' ๐Ÿ“š Remaining: {remaining}/{total_original}') - print(f' ๐Ÿ”„ In Progress: {total_original - total_completed - total_failed - remaining}') - - # Calculate scoring statistics - if completed_evaluations: - correct_count = sum(1 for item in completed_evaluations if item.get('scoring', {}).get('is_correct', False)) - partial_count = sum(1 for item in completed_evaluations if item.get('partial_result', False)) - accuracy = correct_count / total_completed * 100 if total_completed > 0 else 0 - avg_confidence = sum(item.get('scoring', {}).get('confidence', 0) for item in completed_evaluations) / total_completed if total_completed > 0 else 0 - - print(f'\n๐ŸŽฏ Scoring Statistics:') - print(f' ๐Ÿ“Š Accuracy: {accuracy:.1f}% ({correct_count}/{total_completed} correct)') - print(f' ๐Ÿ’ก Average Confidence: {avg_confidence:.1f}%') - if partial_count > 0: - print(f' โš ๏ธ Partial Results: {partial_count}/{total_completed} had execution errors but recovered') - - if completed_evaluations: - print(f'\n๐ŸŽฏ Recently Completed Evaluations:') - for item in completed_evaluations[-3:]: # Show last 3 - eval_name = item['evaluation']['name'] - question_id = item['question_id'] - client_id_short = item['client_id'][:8] # Short client ID - is_correct = item.get('scoring', {}).get('is_correct', False) - confidence = item.get('scoring', {}).get('confidence', 0) - is_partial = 
item.get('partial_result', False) - status_emoji = 'โœ…' if is_correct else 'โŒ' - partial_indicator = 'โš ๏ธ' if is_partial else '' - print(f' โ€ข Q{question_id}: {eval_name} {status_emoji}{partial_indicator} (confidence: {confidence}%, client: {client_id_short})') - - if failed_evaluations: - print(f'\n๐Ÿ’ฅ Failed Evaluations:') - for item in failed_evaluations: - eval_name = item['evaluation']['name'] - question_id = item['question_id'] - error = item['error'] - print(f' โ€ข Q{question_id}: {eval_name} - {error}') - - # Start server - try: - print(f'\n๐Ÿš€ Starting server on ws://{server.config.host}:{server.config.port}') - print(' Connect your BrowserOperator to start processing browsecomp questions') - print(' Press Ctrl+C to stop the server') - print('=' * 60) - - await server.start() - - # Keep server running - await server.wait_closed() - - except KeyboardInterrupt: - print('\n๐Ÿ›‘ Received interrupt signal, stopping server...') - await server.stop() - print('โœ… Server stopped successfully') - - # Show final summary - total_completed = len(completed_evaluations) - total_failed = len(failed_evaluations) - total_processed = total_completed + total_failed - - if total_processed > 0: - print(f'\n๐Ÿ“ˆ Final Summary:') - print(f' Total processed: {total_processed}/{len(evaluations)}') - print(f' Success rate: {total_completed/total_processed*100:.1f}%') - print(f' Completed: {total_completed}') - print(f' Failed: {total_failed}') - - # Final scoring statistics - if completed_evaluations: - correct_count = sum(1 for item in completed_evaluations if item.get('scoring', {}).get('is_correct', False)) - accuracy = correct_count / total_completed * 100 if total_completed > 0 else 0 - avg_confidence = sum(item.get('scoring', {}).get('confidence', 0) for item in completed_evaluations) / total_completed if total_completed > 0 else 0 - - print(f'\n๐Ÿ† Final Scoring Results:') - print(f' ๐Ÿ“Š Overall Accuracy: {accuracy:.1f}% ({correct_count}/{total_completed} correct)') - print(f' ๐Ÿ’ก Average Confidence: {avg_confidence:.1f}%') - - # Show confidence correlation - correct_items = [item for item in completed_evaluations if item.get('scoring', {}).get('is_correct', False)] - incorrect_items = [item for item in completed_evaluations if not item.get('scoring', {}).get('is_correct', False)] - - if correct_items: - avg_conf_correct = sum(item.get('scoring', {}).get('confidence', 0) for item in correct_items) / len(correct_items) - print(f' โœ… Avg confidence when correct: {avg_conf_correct:.1f}%') - - if incorrect_items: - avg_conf_incorrect = sum(item.get('scoring', {}).get('confidence', 0) for item in incorrect_items) / len(incorrect_items) - print(f' โŒ Avg confidence when incorrect: {avg_conf_incorrect:.1f}%') - - # Save results to JSON file - if completed_evaluations and (args.save_results or total_completed == len(evaluations)): - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - results_file = f"browsecomp_eval_results_{timestamp}.json" - - results_data = { - "timestamp": timestamp, - "total_questions": len(evaluations), - "completed": total_completed, - "failed": total_failed, - "accuracy": accuracy, - "average_confidence": avg_confidence, - "evaluations": completed_evaluations - } - - with open(results_file, 'w') as f: - json.dump(results_data, f, indent=2) - - print(f'\n๐Ÿ’พ Results saved to: {results_file}') - - # Log final session summary - log_evaluation_event(logger, "session_completed", { - "total_questions": len(evaluations), - "completed": total_completed, - "failed": 
total_failed, - "accuracy": accuracy, - "average_confidence": avg_confidence, - "partial_results": partial_count, - "results_file": results_file if 'results_file' in locals() else None - }) - - except Exception as e: - logger.error(f'๐Ÿ’ฅ Server error: {e}') - log_evaluation_event(logger, "server_error", { - "error_message": str(e), - "completed_count": len(completed_evaluations), - "failed_count": len(failed_evaluations) - }) - - if server.is_running(): - await server.stop() - return 1 - - logger.info("โœ… Server session ended successfully") - return 0 - - -if __name__ == "__main__": - # Ensure logs directory exists - Path("./logs").mkdir(exist_ok=True) - - try: - exit_code = main() - sys.exit(exit_code) - except KeyboardInterrupt: - print('\n๐Ÿ‘‹ Goodbye!') - except Exception as e: - print(f'๐Ÿ’ฅ Fatal error: {e}') - sys.exit(1) \ No newline at end of file diff --git a/eval-server/python/evals/browsecomp_scorer.py b/eval-server/python/evals/browsecomp_scorer.py deleted file mode 100644 index 17561e6..0000000 --- a/eval-server/python/evals/browsecomp_scorer.py +++ /dev/null @@ -1,328 +0,0 @@ -#!/usr/bin/env python3 -""" -Browsecomp Benchmark Scorer -Implements the scoring logic for Browsecomp benchmark questions -""" - -import re -from typing import Union, Any, Optional, Dict - - -# Grader template based on the original Browsecomp evaluation -GRADER_TEMPLATE = """ -Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below. - -[question]: {question} - -[response]: {response} - -Your judgement must be in the format and criteria specified below: - -extracted_final_answer: The final exact answer extracted from the [response]. Put the answer as 'None' if there is no exact, final answer to extract. - -[correct_answer]: {correct_answer} - -reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. - -correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise. - -confidence: The extracted confidence score between 0% and 100% from [response]. Put 100 if no confidence score available. 
-""".strip() - - -def extract_answer(response: str) -> str: - """Extract the exact answer from a response.""" - # Look for "Exact Answer:" pattern - patterns = [ - r'[Ee]xact [Aa]nswer:\s*([^\n]+)', - r'[Ff]inal [Aa]nswer:\s*([^\n]+)', - r'[Aa]nswer:\s*([^\n]+)', - ] - - for pattern in patterns: - match = re.search(pattern, response) - if match: - return match.group(1).strip() - - # If no pattern found, try to extract from the end of response - lines = response.strip().split('\n') - if lines: - # Check last few lines for answer-like content - for line in reversed(lines[-3:]): - line = line.strip() - if line and not line.startswith('[') and not line.startswith('Confidence'): - return line - - return "" - - -def extract_confidence(response: str) -> float: - """Extract confidence score from response.""" - patterns = [ - r'[Cc]onfidence\s*[Ss]core:\s*(\d+)%', - r'[Cc]onfidence:\s*(\d+)%', - r'(\d+)%\s*confident', - r'I am (\d+)% confident', - r'(\d+)%\s*confidence', - ] - - for pattern in patterns: - match = re.search(pattern, response) - if match: - return float(match.group(1)) - - return 100.0 # Default to 100% if not specified - - -def normalize_answer(answer: str) -> str: - """Normalize answer for comparison.""" - if not isinstance(answer, str): - answer = str(answer) - - # Convert to lowercase - answer = answer.lower().strip() - - # Remove common punctuation at the end - answer = answer.rstrip('.,!?;:') - - # Normalize whitespace - answer = ' '.join(answer.split()) - - return answer - - -def extract_number(text: str) -> Union[float, None]: - """Extract a number from text.""" - # Remove common separators and convert to standard format - text = text.replace(',', '') - - # Try to find numbers with various patterns - patterns = [ - r'[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?', # Scientific notation - r'[-+]?\d+\.?\d*', # Regular numbers - r'[-+]?\d+', # Integers - ] - - for pattern in patterns: - matches = re.findall(pattern, text) - if matches: - try: - # Return the first valid number - return float(matches[0]) - except ValueError: - continue - - return None - - -def compare_numeric_answers(pred: str, true: str, tolerance: float = 0.01) -> bool: - """Compare numeric answers with tolerance.""" - pred_num = extract_number(pred) - true_num = extract_number(true) - - if pred_num is None or true_num is None: - return False - - # Check relative tolerance for non-zero values - if true_num != 0: - relative_error = abs(pred_num - true_num) / abs(true_num) - return relative_error <= tolerance - else: - # For zero values, use absolute tolerance - return abs(pred_num - true_num) <= tolerance - - -def question_scorer(prediction: str, true_answer: str) -> bool: - """ - Score a prediction against the true answer. - Returns True if the prediction is considered correct. - - This is a simplified scorer for quick evaluation. - For production use, consider using grade_with_llm for more accurate grading. 
- """ - if not prediction or not true_answer: - return False - - # Extract the answer part from the prediction - extracted_answer = extract_answer(prediction) - if not extracted_answer: - extracted_answer = prediction - - # Normalize both answers - pred_norm = normalize_answer(extracted_answer) - true_norm = normalize_answer(true_answer) - - # Exact match after normalization - if pred_norm == true_norm: - return True - - # Check if the true answer is contained in the prediction - if true_norm in pred_norm: - return True - - # Check numeric answers - if any(char.isdigit() for char in true_answer): - if compare_numeric_answers(extracted_answer, true_answer): - return True - - # Check for common variations - # Handle yes/no answers - if true_norm in ['yes', 'no']: - if true_norm == 'yes' and pred_norm in ['yes', 'true', 'correct', 'affirmative']: - return True - if true_norm == 'no' and pred_norm in ['no', 'false', 'incorrect', 'negative']: - return True - - return False - - -def grade_with_llm(question: str, correct_answer: str, response: str, - grader_function: Optional[callable] = None) -> Dict[str, Any]: - """ - Grade a response using an LLM grader. - - Args: - question: The original question - correct_answer: The correct answer - response: The model's response - grader_function: Optional function to call the grader LLM - - Returns: - Dictionary with grading results - """ - if not grader_function: - # If no grader function provided, use simple scoring - is_correct = question_scorer(response, correct_answer) - confidence = extract_confidence(response) - - return { - 'is_correct': is_correct, - 'confidence': confidence, - 'reasoning': 'Graded using rule-based scorer', - 'extracted_answer': extract_answer(response) - } - - # Format the grading prompt - grader_prompt = GRADER_TEMPLATE.format( - question=question, - correct_answer=correct_answer, - response=response, - ) - - # Call the grader - grading_response = grader_function(grader_prompt) - - # Parse the grading response - is_correct = False - confidence = 100.0 - reasoning = "" - extracted_answer = "" - - # Look for patterns in grading response - correct_match = re.search(r"correct:\s*(yes|no)", grading_response.lower()) - if correct_match: - is_correct = correct_match.group(1) == "yes" - - confidence_match = re.search(r"confidence:\s*(\d+)", grading_response) - if confidence_match: - confidence = float(confidence_match.group(1)) - - reasoning_match = re.search(r"reasoning:\s*([^\n]+)", grading_response, re.IGNORECASE) - if reasoning_match: - reasoning = reasoning_match.group(1).strip() - - answer_match = re.search(r"extracted_final_answer:\s*([^\n]+)", grading_response, re.IGNORECASE) - if answer_match: - extracted_answer = answer_match.group(1).strip() - - return { - 'is_correct': is_correct, - 'confidence': confidence, - 'reasoning': reasoning, - 'extracted_answer': extracted_answer, - 'grader_response': grading_response - } - - -def evaluate_predictions(predictions: list, true_answers: list) -> dict: - """ - Evaluate a list of predictions against true answers. - Returns statistics about the evaluation. 
- """ - if len(predictions) != len(true_answers): - raise ValueError("Predictions and true answers must have the same length") - - results = { - 'total': len(predictions), - 'correct': 0, - 'incorrect': 0, - 'details': [], - 'average_confidence': 0.0 - } - - total_confidence = 0.0 - - for pred, true in zip(predictions, true_answers): - is_correct = question_scorer(pred, true) - confidence = extract_confidence(pred) - - results['details'].append({ - 'prediction': pred, - 'true_answer': true, - 'correct': is_correct, - 'confidence': confidence, - 'extracted_answer': extract_answer(pred) - }) - - if is_correct: - results['correct'] += 1 - else: - results['incorrect'] += 1 - - total_confidence += confidence - - results['accuracy'] = results['correct'] / results['total'] if results['total'] > 0 else 0 - results['average_confidence'] = total_confidence / results['total'] if results['total'] > 0 else 0 - - return results - - -# Example usage and tests -if __name__ == "__main__": - # Test cases - test_cases = [ - ( - "Explanation: I found that...\nExact Answer: Paris\nConfidence Score: 95%", - "Paris", - True - ), - ( - "The answer is 42", - "42", - True - ), - ( - "Exact Answer: Yes\nConfidence: 80%", - "yes", - True - ), - ( - "After browsing, I found the answer is 3.14159", - "3.14", - True - ), - ( - "The result is 99", - "100", - False - ), - ] - - print("Testing Browsecomp scorer:") - for pred, true, expected in test_cases: - result = question_scorer(pred, true) - extracted = extract_answer(pred) - confidence = extract_confidence(pred) - status = "โœ“" if result == expected else "โœ—" - print(f"{status} Pred: '{pred[:50]}...' | True: '{true}' | Correct: {result}") - print(f" Extracted: '{extracted}' | Confidence: {confidence}%") \ No newline at end of file diff --git a/eval-server/python/evals/run_browsecomp_eval_server.sh b/eval-server/python/evals/run_browsecomp_eval_server.sh deleted file mode 100755 index e393dad..0000000 --- a/eval-server/python/evals/run_browsecomp_eval_server.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -# Wrapper script to run browsecomp eval server with proper dependencies - -# Get the directory of this script -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -EVAL_SERVER_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" - -# Change to eval-server python directory -cd "$EVAL_SERVER_DIR" - -# Run with uv, passing all arguments -uv run python evals/browsecomp_eval_server.py "$@" \ No newline at end of file diff --git a/eval-server/python/examples/__init__.py b/eval-server/python/examples/__init__.py deleted file mode 100644 index 4bb7da7..0000000 --- a/eval-server/python/examples/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -""" -Examples package for bo-eval-server. - -This package contains working examples demonstrating different uses of the evaluation server: -- basic_server: Simple WebSocket server setup -- with_stack: Using evaluation stack for queuing evaluations -- programmatic_evals: Advanced programmatic evaluation creation -""" - -__version__ = "1.0.0" \ No newline at end of file diff --git a/eval-server/python/examples/basic_server.py b/eval-server/python/examples/basic_server.py deleted file mode 100644 index 3a1f9b0..0000000 --- a/eval-server/python/examples/basic_server.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env python3 -""" -Basic EvalServer example - Simple WebSocket server setup. - -This example shows the minimal setup for a WebSocket evaluation server. 
-""" - -import asyncio -import sys -from pathlib import Path - -# Add src to path for local development -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from bo_eval_server import EvalServer - - -async def main(): - """Main example function for basic server setup.""" - # Create server with basic configuration - server = EvalServer( - auth_key='hello', - host='127.0.0.1', - port=8080, - log_level='DEBUG', - log_dir='./logs', # Optional: create logs directory - ) - - # Set up client connection handler - @server.on_connect - async def handle_client(client): - print(f'๐Ÿ”— Client connected: {client.id}') - print(f' Tab ID: {client.tab_id}') - print(f' Capabilities: {client.capabilities}') - - # Send EXACTLY the same evaluation as NodeJS library-usage.js - try: - print('๐Ÿ”„ Starting evaluation...') - response = await client.evaluate({ - "id": "test_eval", - "name": "Capital of France", - "description": "Simple test evaluation", - "tool": "chat", - "input": { - "message": "What is the capital of France?" - } - }) - - print('โœ… Evaluation completed!') - print(f'๐Ÿ“Š Response: {response}') - - except Exception as e: - print(f'โŒ Evaluation failed: {e}') - - # Send a custom message - try: - await client.send_message({ - "type": "info", - "message": "Evaluation completed successfully!" - }) - except Exception as e: - print(f'โš ๏ธ Failed to send message: {e}') - - # Set up client disconnection handler - @server.on_disconnect - async def handle_disconnect(client_info): - print(f'๐Ÿ”Œ Client disconnected: {client_info["id"]}') - print(f' Connection duration: {client_info.get("duration", "unknown")}s') - - # Start the server - try: - await server.start() - print(f'๐Ÿš€ Server running on ws://{server.config.host}:{server.config.port}') - print(' Press Ctrl+C to stop the server') - - # Keep server running - await server.wait_closed() - - except KeyboardInterrupt: - print('\n๐Ÿ›‘ Received interrupt signal, stopping server...') - await server.stop() - print('โœ… Server stopped successfully') - - except Exception as e: - print(f'๐Ÿ’ฅ Server error: {e}') - if server.is_running(): - await server.stop() - - -if __name__ == "__main__": - # Check if logs directory exists, create if needed - Path("./logs").mkdir(exist_ok=True) - - try: - asyncio.run(main()) - except KeyboardInterrupt: - print('\n๐Ÿ‘‹ Goodbye!') - except Exception as e: - print(f'๐Ÿ’ฅ Fatal error: {e}') - sys.exit(1) \ No newline at end of file diff --git a/eval-server/python/examples/logs/.gitignore b/eval-server/python/examples/logs/.gitignore deleted file mode 100644 index 326f777..0000000 --- a/eval-server/python/examples/logs/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -*.log -*.jsonl \ No newline at end of file diff --git a/eval-server/python/examples/programmatic_evals.py b/eval-server/python/examples/programmatic_evals.py deleted file mode 100644 index 47e579d..0000000 --- a/eval-server/python/examples/programmatic_evals.py +++ /dev/null @@ -1,428 +0,0 @@ -#!/usr/bin/env python3 -""" -Programmatic evaluation creation example. - -This example demonstrates creating and customizing evaluations programmatically -in Python code, including dynamic evaluation generation and conditional logic. 
-""" - -import asyncio -import random -import sys -import time -from pathlib import Path -from typing import Dict, Any, List - -# Add src to path for local development -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from bo_eval_server import EvalServer, EvaluationStack - - -class EvaluationGenerator: - """Helper class for generating evaluations programmatically.""" - - def __init__(self): - self.counter = 0 - - def create_evaluation( - self, - name: str, - tool: str, - input_data: Dict[str, Any], - description: str = "", - metadata: Dict[str, Any] = None, - timeout: float = 30.0, - ) -> Dict[str, Any]: - """Create a standardized evaluation object.""" - self.counter += 1 - - return { - "id": f"generated_{self.counter:03d}_{int(time.time())}", - "name": name, - "description": description or f"Programmatically generated evaluation: {name}", - "tool": tool, - "input": input_data, - "timeout": timeout, - "metadata": { - "generated": True, - "timestamp": time.time(), - "generator": "programmatic_evals.py", - **(metadata or {}) - } - } - - def create_chat_evaluation( - self, - message: str, - name: str = None, - **kwargs - ) -> Dict[str, Any]: - """Create a chat-based evaluation.""" - return self.create_evaluation( - name=name or f"Chat: {message[:30]}...", - tool="chat", - input_data={"message": message}, - **kwargs - ) - - def create_action_evaluation( - self, - objective: str, - url: str = None, - name: str = None, - **kwargs - ) -> Dict[str, Any]: - """Create an action-based evaluation.""" - input_data = {"objective": objective} - if url: - input_data["url"] = url - - return self.create_evaluation( - name=name or f"Action: {objective[:30]}...", - tool="action", - input_data=input_data, - **kwargs - ) - - def create_research_evaluation( - self, - query: str, - depth: str = "basic", - name: str = None, - **kwargs - ) -> Dict[str, Any]: - """Create a research-based evaluation.""" - return self.create_evaluation( - name=name or f"Research: {query[:30]}...", - tool="research", - input_data={ - "query": query, - "depth": depth, - }, - **kwargs - ) - - -def create_dynamic_evaluations(generator: EvaluationGenerator) -> List[Dict[str, Any]]: - """Create evaluations based on dynamic logic.""" - evaluations = [] - - # Math evaluations with increasing difficulty - for i in range(3): - if i == 0: - a, b = random.randint(1, 10), random.randint(1, 10) - op = "+" - difficulty = "easy" - elif i == 1: - a, b = random.randint(10, 50), random.randint(10, 50) - op = "*" - difficulty = "medium" - else: - a, b = random.randint(100, 1000), random.randint(2, 20) - op = "/" - difficulty = "hard" - - evaluation = generator.create_chat_evaluation( - message=f"Calculate: {a} {op} {b}", - name=f"Math {difficulty.title()} #{i+1}", - metadata={ - "category": "mathematics", - "difficulty": difficulty, - "numbers": [a, b], - "operation": op - } - ) - evaluations.append(evaluation) - - # Conditional evaluations based on current time - current_hour = time.localtime().tm_hour - if 6 <= current_hour < 12: - time_context = "morning" - questions = [ - "What's a good breakfast recipe?", - "How can I boost my energy in the morning?", - ] - elif 12 <= current_hour < 18: - time_context = "afternoon" - questions = [ - "What's a healthy lunch option?", - "How can I stay productive in the afternoon?", - ] - else: - time_context = "evening" - questions = [ - "What's a good dinner recipe?", - "How can I relax in the evening?", - ] - - for i, question in enumerate(questions): - evaluation = 
generator.create_chat_evaluation( - message=question, - name=f"{time_context.title()} Question #{i+1}", - metadata={ - "category": "lifestyle", - "time_context": time_context, - "hour": current_hour - } - ) - evaluations.append(evaluation) - - # Generate research evaluations for trending topics - trending_topics = [ - "artificial intelligence trends 2024", - "sustainable energy solutions", - "space exploration recent developments", - ] - - for topic in trending_topics: - evaluation = generator.create_research_evaluation( - query=topic, - depth="detailed", - name=f"Research: {topic.title()}", - metadata={ - "category": "research", - "topic": topic, - "priority": "high" - }, - timeout=60.0 # Longer timeout for research - ) - evaluations.append(evaluation) - - return evaluations - - -async def main(): - """Main example function for programmatic evaluation creation.""" - print("๐Ÿญ Programmatic Evaluation Generation Example") - print("=" * 50) - - # Create evaluation generator - generator = EvaluationGenerator() - - # Create evaluation stack - stack = EvaluationStack() - - # Generate static evaluations - print("\n๐Ÿ“ Creating static evaluations...") - static_evals = [ - generator.create_chat_evaluation( - message="Explain quantum computing in simple terms", - name="Quantum Computing Explanation", - metadata={"category": "science", "complexity": "advanced"} - ), - generator.create_action_evaluation( - objective="Find and click the search button", - url="https://www.google.com", - name="Google Search Action", - metadata={"category": "web_automation", "site": "google"} - ), - generator.create_chat_evaluation( - message="Write a haiku about programming", - name="Programming Haiku", - metadata={"category": "creative", "format": "poetry"} - ), - ] - - for eval_obj in static_evals: - stack.push(eval_obj) - print(f" โž• {eval_obj['name']}") - - # Generate dynamic evaluations - print("\n๐ŸŽฒ Creating dynamic evaluations...") - dynamic_evals = create_dynamic_evaluations(generator) - - for eval_obj in dynamic_evals: - stack.push(eval_obj) - print(f" โž• {eval_obj['name']} (category: {eval_obj['metadata']['category']})") - - print(f"\n๐Ÿ“Š Total evaluations created: {stack.size()}") - - # Create server - server = EvalServer( - auth_key='programmatic-demo', - host='127.0.0.1', - port=8080, - log_level='INFO', - log_dir='./logs', - max_concurrent_evaluations=5, # Allow more concurrent evaluations - ) - - # Track evaluation results with detailed analysis - results = { - 'completed': [], - 'failed': [], - 'by_category': {}, - 'by_difficulty': {}, - 'timing': [], - } - - @server.on_connect - async def handle_client(client): - print(f'\n๐Ÿ”— Client connected: {client.id}') - print(f' Processing {stack.size()} evaluations...') - - start_time = time.time() - processed = 0 - - while not stack.is_empty(): - evaluation = stack.pop() - if not evaluation: - break - - processed += 1 - eval_start = time.time() - - print(f'\n๐Ÿ“‹ [{processed}] {evaluation["name"]}') - print(f' Category: {evaluation["metadata"].get("category", "unknown")}') - print(f' Tool: {evaluation["tool"]}') - - try: - # Use concurrency-limited evaluation - result = await server.evaluate_with_concurrency_limit( - client, - evaluation, - timeout=evaluation.get("timeout", 30.0) - ) - - eval_duration = time.time() - eval_start - - # Record successful result - result_record = { - 'evaluation': evaluation, - 'result': result, - 'duration': eval_duration, - 'client_id': client.id, - 'timestamp': time.time(), - } - 
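The `server.evaluate_with_concurrency_limit(...)` call used in this example is defined later in this patch (eval_server.py) as a thin wrapper that acquires an `asyncio.Semaphore` sized by `max_concurrent_evaluations` before delegating to `client.evaluate`. A minimal sketch of that pattern, assuming the limit of 5 configured in this example:

```python
import asyncio

semaphore = asyncio.Semaphore(5)  # mirrors max_concurrent_evaluations=5

async def limited_evaluate(client, evaluation, timeout=30.0):
    # At most five evaluations run at once; additional callers wait here.
    async with semaphore:
        return await client.evaluate(evaluation, timeout=timeout)
```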
results['completed'].append(result_record) - - # Update category stats - category = evaluation["metadata"].get("category", "unknown") - if category not in results['by_category']: - results['by_category'][category] = {'completed': 0, 'failed': 0} - results['by_category'][category]['completed'] += 1 - - # Update difficulty stats - difficulty = evaluation["metadata"].get("difficulty", "unknown") - if difficulty not in results['by_difficulty']: - results['by_difficulty'][difficulty] = {'completed': 0, 'failed': 0} - results['by_difficulty'][difficulty]['completed'] += 1 - - # Record timing - results['timing'].append(eval_duration) - - print(f' โœ… Completed in {eval_duration:.2f}s') - - # Show preview of response - if "output" in result and "response" in result["output"]: - response = result["output"]["response"] - preview = response[:150] + "..." if len(response) > 150 else response - print(f' ๐Ÿ’ฌ "{preview}"') - - except Exception as e: - eval_duration = time.time() - eval_start - - # Record failed result - failure_record = { - 'evaluation': evaluation, - 'error': str(e), - 'duration': eval_duration, - 'client_id': client.id, - 'timestamp': time.time(), - } - results['failed'].append(failure_record) - - # Update stats - category = evaluation["metadata"].get("category", "unknown") - if category not in results['by_category']: - results['by_category'][category] = {'completed': 0, 'failed': 0} - results['by_category'][category]['failed'] += 1 - - difficulty = evaluation["metadata"].get("difficulty", "unknown") - if difficulty not in results['by_difficulty']: - results['by_difficulty'][difficulty] = {'completed': 0, 'failed': 0} - results['by_difficulty'][difficulty]['failed'] += 1 - - print(f' โŒ Failed after {eval_duration:.2f}s: {e}') - - total_duration = time.time() - start_time - print(f'\n๐Ÿ Batch completed in {total_duration:.2f}s') - print(f' Processed: {processed}') - print(f' Success rate: {len(results["completed"])/processed*100:.1f}%') - - # Send detailed completion message - await client.send_message({ - "type": "batch_analysis", - "total_processed": processed, - "completed": len(results['completed']), - "failed": len(results['failed']), - "duration": total_duration, - "average_eval_time": sum(results['timing']) / len(results['timing']) if results['timing'] else 0, - "categories": list(results['by_category'].keys()), - }) - - @server.on_disconnect - async def handle_disconnect(client_info): - print(f'\n๐Ÿ”Œ Client disconnected: {client_info["id"]}') - - # Show detailed analysis - total = len(results['completed']) + len(results['failed']) - if total > 0: - print(f'\n๐Ÿ“ˆ Final Analysis:') - print(f' Total evaluations: {total}') - print(f' Successful: {len(results["completed"])} ({len(results["completed"])/total*100:.1f}%)') - print(f' Failed: {len(results["failed"])} ({len(results["failed"])/total*100:.1f}%)') - - if results['timing']: - avg_time = sum(results['timing']) / len(results['timing']) - min_time = min(results['timing']) - max_time = max(results['timing']) - print(f' Average time: {avg_time:.2f}s (min: {min_time:.2f}s, max: {max_time:.2f}s)') - - print(f'\n๐Ÿ“Š By Category:') - for category, stats in results['by_category'].items(): - total_cat = stats['completed'] + stats['failed'] - success_rate = stats['completed'] / total_cat * 100 if total_cat > 0 else 0 - print(f' {category}: {total_cat} total, {success_rate:.1f}% success') - - if any(results['by_difficulty'].values()): - print(f'\n๐ŸŽฏ By Difficulty:') - for difficulty, stats in results['by_difficulty'].items(): - 
if difficulty != "unknown": - total_diff = stats['completed'] + stats['failed'] - success_rate = stats['completed'] / total_diff * 100 if total_diff > 0 else 0 - print(f' {difficulty}: {total_diff} total, {success_rate:.1f}% success') - - # Start server - try: - await server.start() - print(f'\n๐Ÿš€ Server running on ws://{server.config.host}:{server.config.port}') - print(' Connect your agent client to start processing evaluations') - print(' Press Ctrl+C to stop the server') - - # Keep server running - await server.wait_closed() - - except KeyboardInterrupt: - print('\n๐Ÿ›‘ Received interrupt signal, stopping server...') - await server.stop() - print('โœ… Server stopped successfully') - - except Exception as e: - print(f'๐Ÿ’ฅ Server error: {e}') - if server.is_running(): - await server.stop() - - -if __name__ == "__main__": - # Ensure logs directory exists - Path("./logs").mkdir(exist_ok=True) - - try: - asyncio.run(main()) - except KeyboardInterrupt: - print('\n๐Ÿ‘‹ Goodbye!') - except Exception as e: - print(f'๐Ÿ’ฅ Fatal error: {e}') - sys.exit(1) \ No newline at end of file diff --git a/eval-server/python/examples/with_stack.py b/eval-server/python/examples/with_stack.py deleted file mode 100644 index f4b5d20..0000000 --- a/eval-server/python/examples/with_stack.py +++ /dev/null @@ -1,201 +0,0 @@ -#!/usr/bin/env python3 -""" -EvalServer with EvaluationStack example. - -This example demonstrates using an EvaluationStack to queue evaluations -and distribute them across multiple client connections. -""" - -import asyncio -import sys -from pathlib import Path - -# Add src to path for local development -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from bo_eval_server import EvalServer, EvaluationStack - - -def create_sample_evaluations(): - """Create evaluations matching NodeJS multiple-evals.js exactly.""" - evaluations = [ - { - "id": "math_eval", - "name": "Basic Math Problem", - "description": "Simple arithmetic evaluation", - "tool": "chat", - "input": { - "message": "What is 15 * 7 + 23? Please show your calculation steps." - } - }, - { - "id": "geography_eval", - "name": "Capital of France", - "description": "Geography knowledge test", - "tool": "chat", - "input": { - "message": "What is the capital of France?" - } - }, - { - "id": "creative_eval", - "name": "Creative Writing", - "description": "Short creative writing task", - "tool": "chat", - "input": { - "message": "Write a two-sentence story about a robot discovering friendship." - } - }, - { - "id": "tech_eval", - "name": "Technology Knowledge", - "description": "Basic technology concepts", - "tool": "chat", - "input": { - "message": "Explain what HTTP stands for and what it's used for in simple terms." 
- } - } - ] - return evaluations - - -async def main(): - """Main example function for evaluation stack usage.""" - # Create evaluation stack and populate it - stack = EvaluationStack() - sample_evaluations = create_sample_evaluations() - - print(f"๐Ÿ“š Created {len(sample_evaluations)} sample evaluations") - - # Add evaluations to stack (LIFO order) - for evaluation in sample_evaluations: - stack.push(evaluation) - print(f" โž• Added: {evaluation['name']}") - - print(f"๐Ÿ“Š Stack size: {stack.size()}") - print(f"๐Ÿ” Top evaluation: {stack.peek()['name'] if stack.peek() else 'None'}") - - # Create server - server = EvalServer( - auth_key='stack-demo', - host='127.0.0.1', - port=8080, - log_level='INFO', - log_dir='./logs', - ) - - # Track processed evaluations - completed_evaluations = [] - failed_evaluations = [] - - @server.on_connect - async def handle_client(client): - print('๐ŸŽ‰ CLIENT CONNECTED!') - print(f' - Client ID: {client.id}') - print(f' - Client tabId: {client.tab_id}') - print(f' - Client info: {client.get_info()}') - - # Check if we have evaluations left in the stack - if stack.is_empty(): - print('โš ๏ธ No more evaluations in stack for this client') - print(' Consider refilling the stack or handling this scenario') - return - - # Pop the next evaluation from the stack (ONE evaluation per client!) - evaluation = stack.pop() - print(f'๐Ÿ“‹ Assigning evaluation: "{evaluation["name"]}" ({evaluation["id"]})') - print(f'๐Ÿ“Š Remaining evaluations in stack: {stack.size()}') - - try: - print('๐Ÿ”„ Starting evaluation...') - result = await client.evaluate(evaluation) - - print('โœ… Evaluation completed!') - print(f'๐Ÿ“Š Response for "{evaluation["name"]}": {result}') - - completed_evaluations.append({ - 'client_id': client.id, - 'evaluation': evaluation, - 'result': result, - }) - - except Exception as e: - print(f'โŒ Evaluation "{evaluation["name"]}" failed: {e}') - - failed_evaluations.append({ - 'client_id': client.id, - 'evaluation': evaluation, - 'error': str(e), - }) - - # Send completion message - try: - await client.send_message({ - "type": "evaluation_complete", - "evaluation_id": evaluation["id"], - "evaluation_name": evaluation["name"], - "status": "completed" if evaluation["id"] not in [e['evaluation']['id'] for e in failed_evaluations] else "failed" - }) - except Exception as e: - print(f' โš ๏ธ Failed to send completion message: {e}') - - @server.on_disconnect - async def handle_disconnect(client_info): - print(f'\n๐Ÿ”Œ Client disconnected: {client_info["id"]}') - - # Show final statistics - total_completed = len(completed_evaluations) - total_failed = len(failed_evaluations) - remaining = stack.size() - - print(f'\n๐Ÿ“Š Final Statistics:') - print(f' โœ… Completed: {total_completed}') - print(f' โŒ Failed: {total_failed}') - print(f' ๐Ÿ“š Remaining: {remaining}') - - if completed_evaluations: - print(f'\n๐ŸŽฏ Completed Evaluations:') - for item in completed_evaluations: - eval_name = item['evaluation']['name'] - client_id = item['client_id'][:8] # Short client ID - print(f' โ€ข {eval_name} (client: {client_id})') - - if failed_evaluations: - print(f'\n๐Ÿ’ฅ Failed Evaluations:') - for item in failed_evaluations: - eval_name = item['evaluation']['name'] - error = item['error'] - print(f' โ€ข {eval_name}: {error}') - - # Start server - try: - await server.start() - print(f'\n๐Ÿš€ Server running on ws://{server.config.host}:{server.config.port}') - print(' Connect your agent client to start processing evaluations') - print(' Press Ctrl+C to stop the server') - - # 
Keep server running - await server.wait_closed() - - except KeyboardInterrupt: - print('\n๐Ÿ›‘ Received interrupt signal, stopping server...') - await server.stop() - print('โœ… Server stopped successfully') - - except Exception as e: - print(f'๐Ÿ’ฅ Server error: {e}') - if server.is_running(): - await server.stop() - - -if __name__ == "__main__": - # Ensure logs directory exists - Path("./logs").mkdir(exist_ok=True) - - try: - asyncio.run(main()) - except KeyboardInterrupt: - print('\n๐Ÿ‘‹ Goodbye!') - except Exception as e: - print(f'๐Ÿ’ฅ Fatal error: {e}') - sys.exit(1) \ No newline at end of file diff --git a/eval-server/python/logs/.gitignore b/eval-server/python/logs/.gitignore deleted file mode 100644 index 326f777..0000000 --- a/eval-server/python/logs/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -*.log -*.jsonl \ No newline at end of file diff --git a/eval-server/python/pyproject.toml b/eval-server/python/pyproject.toml deleted file mode 100644 index 83d30ee..0000000 --- a/eval-server/python/pyproject.toml +++ /dev/null @@ -1,84 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0", "wheel"] -build-backend = "setuptools.build_meta" - -[project] -name = "bo-eval-server" -version = "1.0.0" -description = "WebSocket server for evaluating LLM agents - Python implementation" -readme = "README.md" -license = {text = "MIT"} -authors = [ - {name = "Browser Operator Team"} -] -classifiers = [ - "Development Status :: 4 - Beta", - "Intended Audience :: Developers", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Topic :: Software Development :: Libraries :: Python Modules", - "Topic :: System :: Networking", -] -keywords = ["websocket", "llm", "evaluation", "rpc", "library", "programmatic"] -requires-python = ">=3.8" -dependencies = [ - "websockets>=11.0.0", - "loguru>=0.7.0", - "pandas>=2.0.0", - "requests>=2.31.0", -] - -[project.optional-dependencies] -dev = [ - "pytest>=7.0.0", - "pytest-asyncio>=0.21.0", - "black>=23.0.0", - "mypy>=1.0.0", -] - -[project.urls] -Homepage = "https://github.com/chromium/devtools-frontend" -Repository = "https://github.com/chromium/devtools-frontend" -Issues = "https://github.com/chromium/devtools-frontend/issues" - -[project.scripts] -bo-eval-basic = "scripts:run_basic_server" -bo-eval-stack = "scripts:run_with_stack" -bo-eval-programmatic = "scripts:run_programmatic_evals" - -[tool.setuptools.packages.find] -where = ["src"] - -[tool.setuptools.package-data] -"*" = ["*.md", "*.txt", "*.yaml", "*.json"] - -[tool.black] -line-length = 88 -target-version = ['py38'] - -[tool.mypy] -python_version = "3.8" -warn_return_any = true -warn_unused_configs = true -disallow_untyped_defs = true - -[tool.pytest.ini_options] -asyncio_mode = "auto" -testpaths = ["tests"] -python_files = ["test_*.py"] -python_classes = ["Test*"] -python_functions = ["test_*"] - -[dependency-groups] -dev = [ - "black>=24.8.0", - "mypy>=1.14.1", - "pytest>=8.3.5", - "pytest-asyncio>=0.24.0", -] diff --git a/eval-server/python/quick_test.py b/eval-server/python/quick_test.py deleted file mode 100644 index 5bf5b9a..0000000 --- a/eval-server/python/quick_test.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python3 -"""Quick test to see what's happening with the server.""" - -import asyncio -import json 
-import websockets - -async def test_server(): - print("๐Ÿ”— Testing server connection...") - try: - async with websockets.connect('ws://127.0.0.1:8080') as ws: - print("โœ… Connected to server") - - # Wait for welcome message - print("โณ Waiting for welcome message...") - welcome = await asyncio.wait_for(ws.recv(), timeout=5.0) - print(f"๐Ÿ“ฅ Welcome: {welcome}") - - # Send registration - registration = { - "type": "register", - "clientId": "test-client-123", - "secretKey": "hello", - "capabilities": ["chat"] - } - print(f"๐Ÿ“ค Sending registration: {json.dumps(registration)}") - await ws.send(json.dumps(registration)) - - # Wait for ack - print("โณ Waiting for registration ack...") - ack = await asyncio.wait_for(ws.recv(), timeout=5.0) - print(f"๐Ÿ“ฅ Registration ack: {ack}") - - except Exception as e: - print(f"โŒ Error: {e}") - -if __name__ == "__main__": - asyncio.run(test_server()) \ No newline at end of file diff --git a/eval-server/python/requirements.txt b/eval-server/python/requirements.txt deleted file mode 100644 index e9fc8ca..0000000 --- a/eval-server/python/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ -# Core dependencies -websockets>=11.0.0 -loguru>=0.7.0 - -# Development dependencies (optional) -# Install with: pip install -e ".[dev]" -# pytest>=7.0.0 -# pytest-asyncio>=0.21.0 -# black>=23.0.0 -# mypy>=1.0.0 \ No newline at end of file diff --git a/eval-server/python/run.py b/eval-server/python/run.py deleted file mode 100644 index 407cd68..0000000 --- a/eval-server/python/run.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env python3 -""" -Simple script runner for bo-eval-server examples. - -Usage: - python run.py basic # Run basic server example - python run.py stack # Run evaluation stack example - python run.py prog # Run programmatic evaluations example - python run.py all # Show all available examples -""" - -import subprocess -import sys -from pathlib import Path - - -def run_with_uv(script_path: str, description: str): - """Run a Python script using uv.""" - print(f"๐Ÿš€ {description}") - print(f" Running: uv run python {script_path}") - print("-" * 50) - - try: - # Ensure logs directory exists - logs_dir = Path("logs") - logs_dir.mkdir(exist_ok=True) - - # Run the script with uv - result = subprocess.run([ - "uv", "run", "python", script_path - ], cwd=Path(__file__).parent) - - return result.returncode - - except KeyboardInterrupt: - print("\n๐Ÿ›‘ Interrupted by user") - return 130 - except FileNotFoundError: - print("โŒ Error: 'uv' command not found. 
Please install uv first:") - print(" curl -LsSf https://astral.sh/uv/install.sh | sh") - return 1 - except Exception as e: - print(f"๐Ÿ’ฅ Error running script: {e}") - return 1 - - -def show_examples(): - """Show all available examples.""" - print("๐Ÿ“š Available Examples:") - print() - print("๐Ÿ”ง basic - Basic WebSocket server setup") - print(" Simple server that connects to one client and runs a single evaluation") - print() - print("๐Ÿ“š stack - Evaluation stack usage") - print(" Demonstrates LIFO queue for managing multiple evaluations") - print() - print("๐Ÿญ prog - Programmatic evaluation creation") - print(" Advanced example with dynamic evaluation generation and analytics") - print() - print("Usage:") - print(" python run.py basic") - print(" python run.py stack") - print(" python run.py prog") - print() - print("Or with uv directly:") - print(" uv run python examples/basic_server.py") - print(" uv run python examples/with_stack.py") - print(" uv run python examples/programmatic_evals.py") - - -def main(): - """Main entry point.""" - if len(sys.argv) != 2: - print("Usage: python run.py [basic|stack|prog|all]") - print(" python run.py all # Show all examples") - sys.exit(1) - - command = sys.argv[1].lower() - - examples = { - "basic": ("examples/basic_server.py", "Basic WebSocket Server Example"), - "stack": ("examples/with_stack.py", "Evaluation Stack Example"), - "prog": ("examples/programmatic_evals.py", "Programmatic Evaluations Example"), - "programmatic": ("examples/programmatic_evals.py", "Programmatic Evaluations Example"), - } - - if command == "all": - show_examples() - return 0 - elif command in examples: - script_path, description = examples[command] - return run_with_uv(script_path, description) - else: - print(f"โŒ Unknown command: {command}") - print("Available commands: basic, stack, prog, all") - return 1 - - -if __name__ == "__main__": - sys.exit(main()) \ No newline at end of file diff --git a/eval-server/python/scripts.py b/eval-server/python/scripts.py deleted file mode 100644 index b57377d..0000000 --- a/eval-server/python/scripts.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python3 -""" -Script runner for bo-eval-server examples using uv. - -This module provides entry points for running examples with uv. 
-""" - -import asyncio -import sys -from pathlib import Path - -# Add the examples directory to path -examples_dir = Path(__file__).parent / "examples" -sys.path.insert(0, str(examples_dir)) - - -def run_basic_server(): - """Run the basic server example.""" - from examples.basic_server import main - try: - asyncio.run(main()) - except KeyboardInterrupt: - print('\n๐Ÿ‘‹ Goodbye!') - except Exception as e: - print(f'๐Ÿ’ฅ Error: {e}') - sys.exit(1) - - -def run_with_stack(): - """Run the evaluation stack example.""" - from examples.with_stack import main - try: - asyncio.run(main()) - except KeyboardInterrupt: - print('\n๐Ÿ‘‹ Goodbye!') - except Exception as e: - print(f'๐Ÿ’ฅ Error: {e}') - sys.exit(1) - - -def run_programmatic_evals(): - """Run the programmatic evaluations example.""" - from examples.programmatic_evals import main - try: - asyncio.run(main()) - except KeyboardInterrupt: - print('\n๐Ÿ‘‹ Goodbye!') - except Exception as e: - print(f'๐Ÿ’ฅ Error: {e}') - sys.exit(1) - - -if __name__ == "__main__": - if len(sys.argv) != 2: - print("Usage: python scripts.py [basic|stack|programmatic]") - sys.exit(1) - - script = sys.argv[1] - if script == "basic": - run_basic_server() - elif script == "stack": - run_with_stack() - elif script == "programmatic": - run_programmatic_evals() - else: - print(f"Unknown script: {script}") - print("Available scripts: basic, stack, programmatic") - sys.exit(1) \ No newline at end of file diff --git a/eval-server/python/src/bo_eval_server/__init__.py b/eval-server/python/src/bo_eval_server/__init__.py deleted file mode 100644 index 3a8b6aa..0000000 --- a/eval-server/python/src/bo_eval_server/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -""" -bo-eval-server: A minimal Python library for WebSocket-based LLM agent evaluation servers. - -This package provides core functionality for: -- WebSocket server for agent connections -- JSON-RPC 2.0 bidirectional communication -- Evaluation stack for managing evaluation queues -- Enhanced logging and client management -""" - -from .eval_server import EvalServer -from .evaluation_stack import EvaluationStack -from .client_manager import ClientManager, ClientProxy -from .rpc_client import RpcClient -from .config import Config -from .logger import setup_logger - -__version__ = "1.0.0" -__author__ = "Browser Operator Team" - -__all__ = [ - "EvalServer", - "EvaluationStack", - "ClientManager", - "ClientProxy", - "RpcClient", - "Config", - "setup_logger", -] \ No newline at end of file diff --git a/eval-server/python/src/bo_eval_server/client_manager.py b/eval-server/python/src/bo_eval_server/client_manager.py deleted file mode 100644 index 280f676..0000000 --- a/eval-server/python/src/bo_eval_server/client_manager.py +++ /dev/null @@ -1,401 +0,0 @@ -""" -Client management for WebSocket connections. - -Handles client registration, authentication, and provides a proxy interface -for interacting with connected agents. 
-""" - -import asyncio -import json -import time -import uuid -from typing import Dict, Any, Optional, List, Callable, Awaitable - -import websockets -from loguru import logger - -from .rpc_client import RpcClient, RpcError, RpcTimeoutError -from .logger import log_connection, log_evaluation - - -class ClientProxy: - """Proxy object for interacting with a connected agent.""" - - def __init__( - self, - client_id: str, - websocket: websockets.WebSocketServerProtocol, - rpc_client: RpcClient, - tab_id: Optional[str] = None, - base_client_id: Optional[str] = None, - capabilities: Optional[List[str]] = None, - ): - """ - Initialize client proxy. - - Args: - client_id: Unique client identifier - websocket: WebSocket connection - rpc_client: RPC client for method calls - tab_id: Browser tab ID (if applicable) - base_client_id: Base client ID for grouping - capabilities: List of agent capabilities - """ - self.id = client_id - self.tab_id = tab_id - self.base_client_id = base_client_id or client_id - self.capabilities = capabilities or [] - self._websocket = websocket - self._rpc_client = rpc_client - self._connected_at = time.time() - - async def evaluate( - self, - evaluation: Dict[str, Any], - timeout: Optional[float] = None, - ) -> Dict[str, Any]: - """ - Execute an evaluation on the connected agent. - - Args: - evaluation: Evaluation object with required fields - timeout: Optional timeout override - - Returns: - Evaluation result from the agent - - Raises: - ValueError: If evaluation is invalid - RpcError: If the RPC call fails - RpcTimeoutError: If the call times out - """ - # Validate evaluation object - required_fields = ['id', 'name', 'tool', 'input'] - for field in required_fields: - if field not in evaluation: - raise ValueError(f"Evaluation missing required field: {field}") - - evaluation_id = evaluation['id'] - start_time = time.time() - - try: - # Log evaluation start - log_evaluation( - evaluation_id=evaluation_id, - client_id=self.id, - status="started", - evaluation_name=evaluation.get('name'), - tool=evaluation.get('tool'), - ) - - # Make RPC call to agent - result = await self._rpc_client.call( - method="evaluate", - params=evaluation, - timeout=timeout, - client_id=self.id, - ) - - duration = time.time() - start_time - - # Log evaluation completion - log_evaluation( - evaluation_id=evaluation_id, - client_id=self.id, - status="completed", - duration=duration, - evaluation_name=evaluation.get('name'), - tool=evaluation.get('tool'), - ) - - return result - - except RpcTimeoutError: - duration = time.time() - start_time - log_evaluation( - evaluation_id=evaluation_id, - client_id=self.id, - status="timeout", - duration=duration, - evaluation_name=evaluation.get('name'), - tool=evaluation.get('tool'), - ) - raise - - except Exception as e: - duration = time.time() - start_time - log_evaluation( - evaluation_id=evaluation_id, - client_id=self.id, - status="failed", - duration=duration, - error=str(e), - evaluation_name=evaluation.get('name'), - tool=evaluation.get('tool'), - ) - raise - - async def send_message(self, message: Dict[str, Any]) -> None: - """ - Send a custom message to the connected agent. - - Args: - message: Message object to send - """ - try: - await self._websocket.send(json.dumps(message)) - except Exception as e: - logger.error(f"Failed to send message to client {self.id}: {e}") - raise - - def get_info(self) -> Dict[str, Any]: - """ - Get client information. 
- - Returns: - Dictionary with client details - """ - return { - 'id': self.id, - 'tab_id': self.tab_id, - 'base_client_id': self.base_client_id, - 'capabilities': self.capabilities, - 'connected_at': self._connected_at, - 'connected': self._rpc_client.is_connected(), - } - - def is_connected(self) -> bool: - """Check if the client is still connected.""" - return self._rpc_client.is_connected() - - def __repr__(self) -> str: - """String representation of the client proxy.""" - return f"ClientProxy(id={self.id}, connected={self.is_connected()})" - - -class ClientManager: - """Manages WebSocket client connections and authentication.""" - - def __init__(self, auth_key: str, rpc_timeout: float = 1500.0): - """ - Initialize client manager. - - Args: - auth_key: Required authentication key for clients - rpc_timeout: Default RPC timeout in seconds - """ - self.auth_key = auth_key - self.rpc_timeout = rpc_timeout - self._clients: Dict[str, ClientProxy] = {} - self._pending_connections: Dict[str, Dict[str, Any]] = {} - - # Event handlers - self._on_connect_handler: Optional[Callable[[ClientProxy], Awaitable[None]]] = None - self._on_disconnect_handler: Optional[Callable[[Dict[str, Any]], Awaitable[None]]] = None - - def on_connect(self, handler: Callable[[ClientProxy], Awaitable[None]]) -> None: - """Set the handler for client connections.""" - self._on_connect_handler = handler - - def on_disconnect(self, handler: Callable[[Dict[str, Any]], Awaitable[None]]) -> None: - """Set the handler for client disconnections.""" - self._on_disconnect_handler = handler - - async def handle_connection(self, websocket: websockets.WebSocketServerProtocol) -> None: - """ - Handle a new WebSocket connection - matches NodeJS EvalServer flow. - - Args: - websocket: WebSocket connection - """ - connection_id = str(uuid.uuid4()) - client_proxy: Optional[ClientProxy] = None - - try: - # Send welcome message immediately (like NodeJS) - welcome_message = { - 'type': 'welcome', - 'serverId': 'python-eval-server-001', - 'version': '1.0.0', - 'timestamp': time.time() - } - logger.debug(f"Sending welcome message to connection {connection_id}") - await websocket.send(json.dumps(welcome_message)) - - # Wait for registration message - client_proxy = await self._authenticate_client(websocket, connection_id) - - if client_proxy: - # Start RPC client - await client_proxy._rpc_client.start() - - # Add to active clients - self._clients[client_proxy.id] = client_proxy - - # Call connection handler - if self._on_connect_handler: - await self._on_connect_handler(client_proxy) - - # Keep connection alive until closed - await client_proxy._rpc_client._message_handler_task - - except websockets.exceptions.ConnectionClosed: - logger.debug(f"WebSocket connection closed: {connection_id}") - except Exception as e: - logger.error(f"Error handling connection {connection_id}: {e}") - finally: - # Clean up on disconnect - if client_proxy: - await self._handle_disconnect(client_proxy) - - async def _authenticate_client( - self, - websocket: websockets.WebSocketServerProtocol, - connection_id: str, - ) -> Optional[ClientProxy]: - """Authenticate and register a client connection - matches NodeJS implementation.""" - try: - logger.debug(f"Waiting for registration message from connection {connection_id}") - # Wait for registration message with timeout - message = await asyncio.wait_for(websocket.recv(), timeout=30.0) - logger.debug(f"Received message from {connection_id}: {message}") - data = json.loads(message) - - if data.get('type') != 'register': - 
logger.warning(f"Invalid first message from {connection_id}: expected 'register', got '{data.get('type')}'") - await websocket.send(json.dumps({ - 'type': 'registration_ack', - 'status': 'rejected', - 'message': 'First message must be registration' - })) - return None - - # Auto-accept clients like NodeJS does (NodeJS auto-creates client configs) - # For simplicity, we'll accept any client with the correct secret key or no secret key - if 'secretKey' in data: - if data.get('secretKey') != self.auth_key: - logger.warning(f"Invalid auth key from {connection_id}: expected '{self.auth_key}', got '{data.get('secretKey')}'") - await websocket.send(json.dumps({ - 'type': 'registration_ack', - 'clientId': data.get('clientId', str(uuid.uuid4())), - 'status': 'rejected', - 'message': 'Invalid authentication key' - })) - return None - else: - logger.debug(f"Valid secret key provided by {connection_id}") - else: - logger.debug(f"No secret key provided by {connection_id}, accepting anyway") - - client_id = data.get('clientId', str(uuid.uuid4())) - tab_id = data.get('tabId') - base_client_id = data.get('baseClientId') - capabilities = data.get('capabilities', []) - - logger.info(f"Registering client {client_id} from connection {connection_id}") - logger.debug(f"Client capabilities: {capabilities}") - - # Send registration acknowledgment - registration_response = { - 'type': 'registration_ack', - 'clientId': client_id, - 'status': 'accepted', - 'message': 'Client registered successfully' - } - logger.debug(f"Sending registration ack to {client_id}: {registration_response}") - await websocket.send(json.dumps(registration_response)) - - # Wait for ready signal - logger.debug(f"Waiting for ready signal from client {client_id}") - ready_message = await asyncio.wait_for(websocket.recv(), timeout=30.0) - logger.debug(f"Received ready message from {client_id}: {ready_message}") - ready_data = json.loads(ready_message) - - if ready_data.get('type') != 'ready': - logger.warning(f"Invalid ready message from {client_id}: expected 'ready', got '{ready_data.get('type')}'") - await websocket.send(json.dumps({ - 'type': 'error', - 'message': 'Expected ready signal after registration' - })) - return None - - logger.info(f"Client {client_id} is ready for evaluations") - - # Create RPC client and proxy - rpc_client = RpcClient(websocket, self.rpc_timeout) - client_proxy = ClientProxy( - client_id=client_id, - websocket=websocket, - rpc_client=rpc_client, - tab_id=tab_id, - base_client_id=base_client_id, - capabilities=capabilities, - ) - - # Log successful connection - log_connection( - event="connect", - client_id=client_id, - tab_id=tab_id, - base_client_id=base_client_id, - capabilities=capabilities, - ) - - return client_proxy - - except asyncio.TimeoutError: - logger.warning(f"Client registration timeout: {connection_id}") - return None - except json.JSONDecodeError: - logger.warning(f"Invalid JSON in registration: {connection_id}") - return None - except Exception as e: - logger.error(f"Error during client authentication: {e}") - return None - - async def _handle_disconnect(self, client_proxy: ClientProxy) -> None: - """Handle client disconnection cleanup.""" - client_id = client_proxy.id - - # Remove from active clients - self._clients.pop(client_id, None) - - # Stop RPC client - await client_proxy._rpc_client.stop() - - # Get client info for disconnect handler - client_info = client_proxy.get_info() - - # Log disconnection - log_connection( - event="disconnect", - client_id=client_id, - tab_id=client_proxy.tab_id, 
- base_client_id=client_proxy.base_client_id, - ) - - # Call disconnect handler - if self._on_disconnect_handler: - try: - await self._on_disconnect_handler(client_info) - except Exception as e: - logger.error(f"Error in disconnect handler: {e}") - - def get_clients(self) -> List[ClientProxy]: - """Get list of connected clients.""" - return list(self._clients.values()) - - def get_client(self, client_id: str) -> Optional[ClientProxy]: - """Get a specific client by ID.""" - return self._clients.get(client_id) - - def get_status(self) -> Dict[str, Any]: - """Get client manager status.""" - return { - 'connected_clients': len(self._clients), - 'client_ids': list(self._clients.keys()), - } - - def __repr__(self) -> str: - """String representation of the client manager.""" - return f"ClientManager(clients={len(self._clients)})" \ No newline at end of file diff --git a/eval-server/python/src/bo_eval_server/config.py b/eval-server/python/src/bo_eval_server/config.py deleted file mode 100644 index 46e72b9..0000000 --- a/eval-server/python/src/bo_eval_server/config.py +++ /dev/null @@ -1,75 +0,0 @@ -""" -Configuration management for bo-eval-server. - -Handles server configuration with environment variable support. -""" - -import os -from typing import Optional - - -class Config: - """Configuration class for EvalServer with environment variable support.""" - - def __init__( - self, - host: Optional[str] = None, - port: Optional[int] = None, - auth_key: Optional[str] = None, - log_level: Optional[str] = None, - rpc_timeout: Optional[float] = None, - max_concurrent_evaluations: Optional[int] = None, - ): - """ - Initialize configuration with optional overrides. - - Args: - host: Server host (default: localhost) - port: Server port (default: 8080) - auth_key: Authentication key for clients - log_level: Logging level (default: INFO) - rpc_timeout: RPC call timeout in seconds (default: 1500.0) - max_concurrent_evaluations: Max concurrent evaluations (default: 10) - """ - self.host = host or os.getenv('BO_EVAL_SERVER_HOST', 'localhost') - self.port = int(port or os.getenv('BO_EVAL_SERVER_PORT', '8080')) - self.auth_key = auth_key or os.getenv('BO_EVAL_SERVER_AUTH_KEY') - self.log_level = log_level or os.getenv('BO_EVAL_SERVER_LOG_LEVEL', 'INFO') - self.rpc_timeout = float( - rpc_timeout or os.getenv('BO_EVAL_SERVER_RPC_TIMEOUT', '1500.0') - ) - self.max_concurrent_evaluations = int( - max_concurrent_evaluations or - os.getenv('BO_EVAL_SERVER_MAX_CONCURRENT', '10') - ) - - def validate(self) -> None: - """Validate configuration parameters.""" - if not self.auth_key: - raise ValueError("auth_key is required for server authentication") - - if not isinstance(self.port, int) or self.port <= 0 or self.port > 65535: - raise ValueError(f"Invalid port: {self.port}") - - if self.rpc_timeout <= 0: - raise ValueError(f"Invalid RPC timeout: {self.rpc_timeout}") - - if self.max_concurrent_evaluations <= 0: - raise ValueError( - f"Invalid max_concurrent_evaluations: {self.max_concurrent_evaluations}" - ) - - def to_dict(self) -> dict: - """Convert configuration to dictionary.""" - return { - 'host': self.host, - 'port': self.port, - 'auth_key': '***' if self.auth_key else None, # Hide sensitive data - 'log_level': self.log_level, - 'rpc_timeout': self.rpc_timeout, - 'max_concurrent_evaluations': self.max_concurrent_evaluations, - } - - def __repr__(self) -> str: - """String representation of configuration.""" - return f"Config({self.to_dict()})" \ No newline at end of file diff --git 
a/eval-server/python/src/bo_eval_server/eval_server.py b/eval-server/python/src/bo_eval_server/eval_server.py deleted file mode 100644 index 9f6ccb7..0000000 --- a/eval-server/python/src/bo_eval_server/eval_server.py +++ /dev/null @@ -1,292 +0,0 @@ -""" -EvalServer - Main WebSocket server for LLM agent evaluations. - -A library-first evaluation server that accepts connections from AI agents, -sends them evaluation tasks via RPC calls, and collects their responses. -""" - -import asyncio -from typing import Dict, Any, Optional, Callable, Awaitable, List - -import websockets -from loguru import logger - -from .config import Config -from .client_manager import ClientManager, ClientProxy -from .logger import setup_logger, log_server_event - - -class EvalServer: - """ - Main evaluation server class for managing WebSocket connections and evaluations. - - Example usage: - ```python - server = EvalServer( - auth_key='your-secret-key', - host='127.0.0.1', - port=8080 - ) - - @server.on_connect - async def handle_client(client): - print(f'Client connected: {client.id}') - - result = await client.evaluate({ - "id": "test_eval", - "name": "Test Evaluation", - "tool": "chat", - "input": {"message": "Hello world"} - }) - - print(f'Response: {result}') - - await server.start() - await server.wait_closed() - ``` - """ - - def __init__( - self, - auth_key: str, - host: str = 'localhost', - port: int = 8080, - rpc_timeout: float = 1500.0, - log_level: str = 'INFO', - log_dir: Optional[str] = None, - max_concurrent_evaluations: int = 10, - ): - """ - Initialize the evaluation server. - - Args: - auth_key: Required authentication key for client connections - host: Server host address - port: Server port number - rpc_timeout: Default timeout for RPC calls in seconds - log_level: Logging level (DEBUG, INFO, WARNING, ERROR) - log_dir: Directory for log files (optional) - max_concurrent_evaluations: Maximum concurrent evaluations - """ - # Create and validate configuration - self.config = Config( - host=host, - port=port, - auth_key=auth_key, - log_level=log_level, - rpc_timeout=rpc_timeout, - max_concurrent_evaluations=max_concurrent_evaluations, - ) - self.config.validate() - - # Setup logging - setup_logger( - log_level=self.config.log_level, - log_dir=log_dir, - ) - - # Initialize client manager - self.client_manager = ClientManager( - auth_key=self.config.auth_key, - rpc_timeout=self.config.rpc_timeout, - ) - - # Server state - self._server: Optional[websockets.WebSocketServer] = None - self._running = False - self._start_time: Optional[float] = None - - # Evaluation concurrency control - self._evaluation_semaphore = asyncio.Semaphore( - self.config.max_concurrent_evaluations - ) - - def on_connect(self, handler: Callable[[ClientProxy], Awaitable[None]]) -> Callable: - """ - Decorator to set the client connection handler. - - Args: - handler: Async function to call when a client connects - - Returns: - The handler function (for decorator use) - """ - self.client_manager.on_connect(handler) - return handler - - def on_disconnect(self, handler: Callable[[Dict[str, Any]], Awaitable[None]]) -> Callable: - """ - Decorator to set the client disconnection handler. - - Args: - handler: Async function to call when a client disconnects - - Returns: - The handler function (for decorator use) - """ - self.client_manager.on_disconnect(handler) - return handler - - async def start(self) -> None: - """ - Start the WebSocket server. 
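Most of the constructor arguments above have environment-variable fallbacks in Config (config.py, earlier in this patch). Note that EvalServer passes explicit defaults for host and port, so those particular variables only take effect when Config is built directly; a sketch:

```python
import os
from bo_eval_server import Config

os.environ["BO_EVAL_SERVER_PORT"] = "9090"
os.environ["BO_EVAL_SERVER_AUTH_KEY"] = "secret"

config = Config()        # unset arguments fall back to BO_EVAL_SERVER_* variables
config.validate()        # raises if auth_key is missing or values are out of range
print(config.to_dict())  # auth_key is masked as '***'
```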
- - Raises: - RuntimeError: If server is already running - OSError: If unable to bind to the specified host/port - """ - if self._running: - raise RuntimeError("Server is already running") - - try: - logger.info(f"Starting EvalServer on {self.config.host}:{self.config.port}") - - # Start WebSocket server - self._server = await websockets.serve( - self.client_manager.handle_connection, - self.config.host, - self.config.port, - ping_interval=20, - ping_timeout=20, - close_timeout=10, - ) - - self._running = True - self._start_time = asyncio.get_event_loop().time() - - log_server_event( - event="start", - host=self.config.host, - port=self.config.port, - config=self.config.to_dict(), - ) - - logger.info(f"EvalServer started successfully on ws://{self.config.host}:{self.config.port}") - - except Exception as e: - logger.error(f"Failed to start server: {e}") - log_server_event(event="start_failed", error=str(e)) - raise - - async def stop(self) -> None: - """ - Stop the WebSocket server. - - Raises: - RuntimeError: If server is not running - """ - if not self._running: - raise RuntimeError("Server is not running") - - try: - logger.info("Stopping EvalServer...") - - if self._server: - self._server.close() - await self._server.wait_closed() - - self._running = False - self._start_time = None - - log_server_event(event="stop") - logger.info("EvalServer stopped successfully") - - except Exception as e: - logger.error(f"Error stopping server: {e}") - log_server_event(event="stop_failed", error=str(e)) - raise - - async def wait_closed(self) -> None: - """ - Wait for the server to be closed. - - This method blocks until the server is stopped, useful for keeping - the server running in the main program. - """ - if not self._running or not self._server: - return - - try: - await self._server.wait_closed() - except Exception as e: - logger.error(f"Error waiting for server closure: {e}") - - def get_status(self) -> Dict[str, Any]: - """ - Get server status information. - - Returns: - Dictionary with server status details - """ - uptime = None - if self._running and self._start_time: - uptime = asyncio.get_event_loop().time() - self._start_time - - return { - 'running': self._running, - 'host': self.config.host, - 'port': self.config.port, - 'uptime': uptime, - 'config': self.config.to_dict(), - 'clients': self.client_manager.get_status(), - } - - def get_clients(self) -> List[ClientProxy]: - """ - Get list of connected clients. - - Returns: - List of ClientProxy objects - """ - return self.client_manager.get_clients() - - def get_client(self, client_id: str) -> Optional[ClientProxy]: - """ - Get a specific client by ID. - - Args: - client_id: Client identifier - - Returns: - ClientProxy object or None if not found - """ - return self.client_manager.get_client(client_id) - - async def evaluate_with_concurrency_limit( - self, - client: ClientProxy, - evaluation: Dict[str, Any], - timeout: Optional[float] = None, - ) -> Dict[str, Any]: - """ - Execute an evaluation with concurrency limiting. 
- - Args: - client: Client to execute evaluation on - evaluation: Evaluation object - timeout: Optional timeout override - - Returns: - Evaluation result - """ - async with self._evaluation_semaphore: - return await client.evaluate(evaluation, timeout) - - def is_running(self) -> bool: - """Check if the server is currently running.""" - return self._running - - def __repr__(self) -> str: - """String representation of the server.""" - status = "running" if self._running else "stopped" - return f"EvalServer(status={status}, host={self.config.host}, port={self.config.port})" - - async def __aenter__(self): - """Async context manager entry.""" - await self.start() - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - """Async context manager exit.""" - if self._running: - await self.stop() \ No newline at end of file diff --git a/eval-server/python/src/bo_eval_server/evaluation_stack.py b/eval-server/python/src/bo_eval_server/evaluation_stack.py deleted file mode 100644 index 1ad5078..0000000 --- a/eval-server/python/src/bo_eval_server/evaluation_stack.py +++ /dev/null @@ -1,102 +0,0 @@ -""" -EvaluationStack - A simple stack-like structure for managing evaluations. - -Provides LIFO (Last In, First Out) access to evaluation objects. -Useful for distributing different evaluations across multiple client connections. -""" - -from typing import Dict, Any, List, Optional - - -class EvaluationStack: - """A LIFO stack for managing evaluation objects.""" - - def __init__(self) -> None: - """Initialize an empty evaluation stack.""" - self._evaluations: List[Dict[str, Any]] = [] - - def push(self, evaluation: Dict[str, Any]) -> None: - """ - Add an evaluation to the top of the stack. - - Args: - evaluation: The evaluation object to add - - Raises: - ValueError: If evaluation is invalid or missing required fields - """ - if not evaluation or not isinstance(evaluation, dict): - raise ValueError('Evaluation must be a valid dictionary') - - # Validate required fields - required_fields = ['id', 'name', 'tool', 'input'] - for field in required_fields: - if field not in evaluation or not evaluation[field]: - raise ValueError(f'Evaluation missing required field: {field}') - - self._evaluations.append(evaluation) - - def pop(self) -> Optional[Dict[str, Any]]: - """ - Remove and return the evaluation from the top of the stack. - - Returns: - The evaluation object, or None if stack is empty - """ - if self._evaluations: - return self._evaluations.pop() - return None - - def is_empty(self) -> bool: - """ - Check if the stack is empty. - - Returns: - True if stack has no evaluations - """ - return len(self._evaluations) == 0 - - def size(self) -> int: - """ - Get the number of evaluations in the stack. - - Returns: - The stack size - """ - return len(self._evaluations) - - def peek(self) -> Optional[Dict[str, Any]]: - """ - Peek at the top evaluation without removing it. - - Returns: - The top evaluation object, or None if stack is empty - """ - if self.is_empty(): - return None - return self._evaluations[-1] - - def clear(self) -> None: - """Clear all evaluations from the stack.""" - self._evaluations.clear() - - def to_array(self) -> List[Dict[str, Any]]: - """ - Get a copy of all evaluations in the stack (top to bottom). 
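Because EvalServer (above) also defines __aenter__/__aexit__, the server can be scoped with `async with`, which stops it automatically when the block exits; a small sketch:

```python
import asyncio
from bo_eval_server import EvalServer

async def main() -> None:
    async with EvalServer(auth_key="secret", host="127.0.0.1", port=8080) as server:
        print(server.get_status())
        await server.wait_closed()  # blocks until the server is stopped

asyncio.run(main())
```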
- - Returns: - List of evaluation objects from top to bottom - """ - return list(reversed(self._evaluations)) - - def __len__(self) -> int: - """Return the number of evaluations in the stack.""" - return len(self._evaluations) - - def __bool__(self) -> bool: - """Return True if stack has evaluations.""" - return not self.is_empty() - - def __repr__(self) -> str: - """String representation of the stack.""" - return f"EvaluationStack(size={self.size()})" \ No newline at end of file diff --git a/eval-server/python/src/bo_eval_server/logger.py b/eval-server/python/src/bo_eval_server/logger.py deleted file mode 100644 index 8f6e3c5..0000000 --- a/eval-server/python/src/bo_eval_server/logger.py +++ /dev/null @@ -1,180 +0,0 @@ -""" -Enhanced logging setup for bo-eval-server using loguru. - -Provides structured logging with JSON formatting and multiple log levels. -""" - -import sys -from pathlib import Path -from typing import Optional, Dict, Any - -from loguru import logger - - -def setup_logger( - log_level: str = "INFO", - log_dir: Optional[str] = None, - enable_json: bool = True, -) -> None: - """ - Setup enhanced logging with loguru. - - Args: - log_level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) - log_dir: Directory for log files (default: ./logs) - enable_json: Whether to use JSON formatting for structured logs - """ - # Remove default handler - logger.remove() - - # Console handler with colored output - logger.add( - sys.stdout, - level=log_level, - format="{time:YYYY-MM-DD HH:mm:ss} | " - "{level: <8} | " - "{name}:{function}:{line} - " - "{message}", - colorize=True, - ) - - # File handlers if log_dir is specified - if log_dir: - log_path = Path(log_dir) - log_path.mkdir(exist_ok=True) - - # Combined log file - logger.add( - log_path / "combined.log", - level="DEBUG", - format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}", - rotation="10 MB", - retention="7 days", - ) - - # Error log file - logger.add( - log_path / "error.log", - level="ERROR", - format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}", - rotation="10 MB", - retention="30 days", - ) - - # Structured JSON log for evaluations - if enable_json: - logger.add( - log_path / "evaluations.jsonl", - level="INFO", - format="{message}", - filter=lambda record: record["extra"].get("event_type") == "evaluation", - rotation="10 MB", - retention="30 days", - ) - - -def log_connection(event: str, client_id: str, **kwargs) -> None: - """ - Log connection events with structured data. - - Args: - event: Connection event type (connect, disconnect, ready) - client_id: Client identifier - **kwargs: Additional event data - """ - logger.bind(event_type="connection").info( - f"Connection {event}: {client_id}", - extra={ - "event_type": "connection", - "connection_event": event, - "client_id": client_id, - **kwargs, - } - ) - - -def log_evaluation( - evaluation_id: str, - client_id: str, - status: str, - duration: Optional[float] = None, - **kwargs -) -> None: - """ - Log evaluation events with structured data. 
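A sketch of wiring up the helpers above: when log_dir is given, setup_logger adds rotating combined and error logs plus an evaluations.jsonl stream that keeps only records bound with event_type == "evaluation" (import paths assume the module layout in this patch):

```python
from bo_eval_server import setup_logger
from bo_eval_server.logger import log_connection, log_evaluation

setup_logger(log_level="DEBUG", log_dir="./logs")

log_connection("connect", client_id="client-123", capabilities=["chat"])
log_evaluation("eval-001", "client-123", "completed", duration=2.3)
```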
- - Args: - evaluation_id: Unique evaluation identifier - client_id: Client that handled the evaluation - status: Evaluation status (started, completed, failed, timeout) - duration: Evaluation duration in seconds - **kwargs: Additional evaluation data - """ - message = f"Evaluation {status}: {evaluation_id} (client: {client_id})" - if duration is not None: - message += f" ({duration:.2f}s)" - - log_data = { - "event_type": "evaluation", - "evaluation_id": evaluation_id, - "client_id": client_id, - "status": status, - "duration": duration, - **kwargs, - } - - logger.bind(event_type="evaluation").info(message, extra=log_data) - - -def log_rpc_call( - method: str, - client_id: str, - call_id: str, - status: str, - duration: Optional[float] = None, - **kwargs -) -> None: - """ - Log RPC call events with structured data. - - Args: - method: RPC method name - client_id: Target client identifier - call_id: RPC call identifier - status: Call status (sent, completed, failed, timeout) - duration: Call duration in seconds - **kwargs: Additional call data - """ - message = f"RPC {status}: {method} -> {client_id} (id: {call_id})" - if duration is not None: - message += f" ({duration:.2f}s)" - - log_data = { - "event_type": "rpc", - "method": method, - "client_id": client_id, - "call_id": call_id, - "status": status, - "duration": duration, - **kwargs, - } - - logger.bind(event_type="rpc").info(message, extra=log_data) - - -def log_server_event(event: str, **kwargs) -> None: - """ - Log server lifecycle events. - - Args: - event: Server event type (start, stop, error) - **kwargs: Additional event data - """ - logger.bind(event_type="server").info( - f"Server {event}", - extra={ - "event_type": "server", - "server_event": event, - **kwargs, - } - ) \ No newline at end of file diff --git a/eval-server/python/src/bo_eval_server/rpc_client.py b/eval-server/python/src/bo_eval_server/rpc_client.py deleted file mode 100644 index 8fc024b..0000000 --- a/eval-server/python/src/bo_eval_server/rpc_client.py +++ /dev/null @@ -1,229 +0,0 @@ -""" -JSON-RPC 2.0 client implementation for calling methods on connected agents. - -Handles request/response correlation, timeouts, and error conditions. -""" - -import asyncio -import json -import time -import uuid -from typing import Dict, Any, Optional, Callable, Awaitable - -import websockets -from loguru import logger - -from .logger import log_rpc_call - - -class RpcError(Exception): - """Exception raised for RPC-related errors.""" - pass - - -class RpcTimeoutError(RpcError): - """Exception raised when RPC call times out.""" - pass - - -class RpcClient: - """JSON-RPC 2.0 client for bidirectional communication with agents.""" - - def __init__(self, websocket: websockets.WebSocketServerProtocol, timeout: float = 1500.0): - """ - Initialize RPC client for a WebSocket connection. 
- - Args: - websocket: WebSocket connection to the agent - timeout: Default timeout for RPC calls in seconds - """ - self.websocket = websocket - self.timeout = timeout - self._pending_calls: Dict[str, asyncio.Future] = {} - self._message_handler_task: Optional[asyncio.Task] = None - self._closed = False - - async def start(self) -> None: - """Start the RPC client message handler.""" - if self._message_handler_task is None: - self._message_handler_task = asyncio.create_task(self._handle_messages()) - - async def stop(self) -> None: - """Stop the RPC client and cancel pending calls.""" - self._closed = True - - # Cancel message handler - if self._message_handler_task: - self._message_handler_task.cancel() - try: - await self._message_handler_task - except asyncio.CancelledError: - pass - - # Cancel all pending calls - for future in self._pending_calls.values(): - if not future.done(): - future.cancel() - self._pending_calls.clear() - - async def call( - self, - method: str, - params: Optional[Dict[str, Any]] = None, - timeout: Optional[float] = None, - client_id: Optional[str] = None, - ) -> Any: - """ - Make an RPC call to the connected agent. - - Args: - method: RPC method name to call - params: Parameters to pass to the method - timeout: Timeout for this call (uses default if None) - client_id: Client ID for logging purposes - - Returns: - The result returned by the agent - - Raises: - RpcError: If the call fails or returns an error - RpcTimeoutError: If the call times out - ConnectionError: If the WebSocket connection is closed - """ - if self._closed: - raise ConnectionError("RPC client is closed") - - call_id = str(uuid.uuid4()) - call_timeout = timeout or self.timeout - - # Create JSON-RPC 2.0 request - request = { - "jsonrpc": "2.0", - "method": method, - "params": params or {}, - "id": call_id, - } - - # Create future for response - future: asyncio.Future = asyncio.Future() - self._pending_calls[call_id] = future - - start_time = time.time() - - try: - # Log RPC call start - log_rpc_call( - method=method, - client_id=client_id or "unknown", - call_id=call_id, - status="sent", - params=params, - ) - - # Send request - await self.websocket.send(json.dumps(request)) - - # Wait for response with timeout - try: - result = await asyncio.wait_for(future, timeout=call_timeout) - duration = time.time() - start_time - - # Log successful completion - log_rpc_call( - method=method, - client_id=client_id or "unknown", - call_id=call_id, - status="completed", - duration=duration, - ) - - return result - - except asyncio.TimeoutError: - duration = time.time() - start_time - - # Log timeout - log_rpc_call( - method=method, - client_id=client_id or "unknown", - call_id=call_id, - status="timeout", - duration=duration, - ) - - raise RpcTimeoutError(f"RPC call '{method}' timed out after {call_timeout}s") - - except Exception as e: - duration = time.time() - start_time - - # Log failure - log_rpc_call( - method=method, - client_id=client_id or "unknown", - call_id=call_id, - status="failed", - duration=duration, - error=str(e), - ) - - raise - - finally: - # Clean up pending call - self._pending_calls.pop(call_id, None) - - async def _handle_messages(self) -> None: - """Handle incoming WebSocket messages and route RPC responses.""" - try: - async for message in self.websocket: - if self._closed: - break - - try: - await self._process_message(message) - except Exception as e: - logger.error(f"Error processing RPC message: {e}") - - except websockets.exceptions.ConnectionClosed: - 
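On the wire, call() produces a standard JSON-RPC 2.0 exchange over the same WebSocket: the request is built as in call() above, and responses are matched back to the pending call by id in _process_message below. The response shapes here follow the mock client at the end of this patch and are otherwise illustrative:

```python
request = {"jsonrpc": "2.0", "method": "evaluate", "id": "call-uuid",
           "params": {"id": "demo_eval", "name": "Demo", "tool": "chat",
                      "input": {"message": "Hello"}}}

success = {"jsonrpc": "2.0", "id": "call-uuid",
           "result": {"status": "completed",
                      "output": {"response": "Hi there"}}}

failure = {"jsonrpc": "2.0", "id": "call-uuid",
           "error": {"code": -32000, "message": "evaluation failed"}}
```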
logger.debug("WebSocket connection closed in RPC message handler") - except Exception as e: - logger.error(f"Error in RPC message handler: {e}") - finally: - await self.stop() - - async def _process_message(self, message: str) -> None: - """Process a single WebSocket message.""" - try: - data = json.loads(message) - except json.JSONDecodeError as e: - logger.warning(f"Invalid JSON in RPC message: {e}") - return - - # Handle JSON-RPC 2.0 responses - if isinstance(data, dict) and "jsonrpc" in data and "id" in data: - call_id = data["id"] - future = self._pending_calls.get(call_id) - - if future and not future.done(): - if "result" in data: - # Successful response - future.set_result(data["result"]) - elif "error" in data: - # Error response - error = data["error"] - error_msg = f"RPC error {error.get('code', 'unknown')}: {error.get('message', 'Unknown error')}" - future.set_exception(RpcError(error_msg)) - else: - # Invalid response format - future.set_exception(RpcError("Invalid RPC response format")) - else: - logger.warning(f"Received response for unknown or completed call: {call_id}") - - def is_connected(self) -> bool: - """Check if the RPC client is still active.""" - return not self._closed - - def __repr__(self) -> str: - """String representation of the RPC client.""" - status = "connected" if self.is_connected() else "closed" - return f"RpcClient(status={status}, pending_calls={len(self._pending_calls)})" \ No newline at end of file diff --git a/eval-server/python/test_client.py b/eval-server/python/test_client.py deleted file mode 100644 index 37f2520..0000000 --- a/eval-server/python/test_client.py +++ /dev/null @@ -1,190 +0,0 @@ -#!/usr/bin/env python3 -""" -Test client for debugging connection issues with bo-eval-server. - -This client helps test the WebSocket connection and protocol implementation. -""" - -import asyncio -import json -import sys -import uuid -from pathlib import Path - -# Add src to path for development -sys.path.insert(0, str(Path(__file__).parent / "src")) - -try: - import websockets -except ImportError: - print("โŒ websockets not installed. 
Run: uv add websockets") - sys.exit(1) - - -class TestClient: - """Simple test client for debugging server connections.""" - - def __init__(self, server_url: str = "ws://127.0.0.1:8080", auth_key: str = "hello"): - self.server_url = server_url - self.auth_key = auth_key - self.client_id = str(uuid.uuid4()) - self.websocket = None - - async def connect_and_test(self): - """Connect to server and test the NodeJS-compatible protocol.""" - print(f"๐Ÿ”— Connecting to {self.server_url}") - print(f" Client ID: {self.client_id}") - print(f" Auth Key: {self.auth_key}") - - try: - # Connect to WebSocket - self.websocket = await websockets.connect( - self.server_url, - ping_interval=20, - ping_timeout=20, - close_timeout=10, - ) - print("โœ… WebSocket connection established") - - # Send registration message (NodeJS style) - registration = { - "type": "register", - "clientId": self.client_id, - "secretKey": self.auth_key, - "capabilities": ["chat", "action", "research"] - } - - print("๐Ÿ“ค Sending registration message:") - print(f" {json.dumps(registration, indent=2)}") - - await self.websocket.send(json.dumps(registration)) - - # Wait for registration acknowledgment - print("โณ Waiting for registration acknowledgment...") - response = await asyncio.wait_for(self.websocket.recv(), timeout=10.0) - response_data = json.loads(response) - - print("๐Ÿ“ฅ Received registration acknowledgment:") - print(f" {json.dumps(response_data, indent=2)}") - - if response_data.get("type") == "registration_ack" and response_data.get("status") == "accepted": - print("โœ… Registration successful!") - - # Send ready signal - ready_message = {"type": "ready"} - print("๐Ÿ“ค Sending ready signal:") - print(f" {json.dumps(ready_message, indent=2)}") - - await self.websocket.send(json.dumps(ready_message)) - print("โœ… Ready signal sent") - - # Listen for RPC calls - print("๐Ÿ‘‚ Listening for RPC calls...") - await self.listen_for_calls() - - elif response_data.get("type") == "error": - print(f"โŒ Registration failed: {response_data.get('message')}") - return False - else: - print(f"โ“ Unexpected response: {response_data}") - return False - - except asyncio.TimeoutError: - print("โฐ Timeout waiting for server response") - return False - except websockets.exceptions.ConnectionClosed as e: - print(f"๐Ÿ”Œ Connection closed: {e}") - return False - except Exception as e: - print(f"๐Ÿ’ฅ Error during connection: {e}") - return False - finally: - if self.websocket: - await self.websocket.close() - - return True - - async def listen_for_calls(self): - """Listen for RPC calls from the server.""" - try: - async for message in self.websocket: - print(f"\n๐Ÿ“ฅ Received message: {message}") - - try: - data = json.loads(message) - - if data.get("jsonrpc") == "2.0" and data.get("method") == "evaluate": - print("๐ŸŽฏ Received RPC evaluation request") - print(f" ID: {data.get('id')}") - print(f" Params: {json.dumps(data.get('params', {}), indent=2)}") - - # Send mock response - response = { - "jsonrpc": "2.0", - "id": data["id"], - "result": { - "status": "completed", - "output": { - "response": f"Mock response for evaluation {data['params'].get('name', 'unknown')}" - }, - "metadata": { - "client_id": self.client_id, - "test_client": True - } - } - } - - print("๐Ÿ“ค Sending mock response:") - print(f" {json.dumps(response, indent=2)}") - - await self.websocket.send(json.dumps(response)) - print("โœ… Mock response sent") - else: - print(f"โ“ Unknown message type: {data}") - - except json.JSONDecodeError as e: - print(f"โŒ Invalid JSON 
received: {e}") - - except websockets.exceptions.ConnectionClosed: - print("๐Ÿ”Œ Connection closed by server") - except Exception as e: - print(f"๐Ÿ’ฅ Error listening for calls: {e}") - - -async def main(): - """Main test function.""" - print("๐Ÿงช Test Client for bo-eval-server") - print("=" * 40) - - if len(sys.argv) > 1: - server_url = sys.argv[1] - else: - server_url = "ws://127.0.0.1:8080" - - if len(sys.argv) > 2: - auth_key = sys.argv[2] - else: - auth_key = "hello" # Default from examples - - client = TestClient(server_url, auth_key) - - try: - success = await client.connect_and_test() - if success: - print("\nโœ… Test completed successfully!") - else: - print("\nโŒ Test failed!") - sys.exit(1) - except KeyboardInterrupt: - print("\n๐Ÿ›‘ Test interrupted by user") - except Exception as e: - print(f"\n๐Ÿ’ฅ Test failed with error: {e}") - sys.exit(1) - - -if __name__ == "__main__": - print("Usage: python test_client.py [ws://server:port] [auth_key]") - print("Example: python test_client.py ws://127.0.0.1:8080 hello") - print() - - asyncio.run(main()) \ No newline at end of file diff --git a/eval-server/python/uv.lock b/eval-server/python/uv.lock deleted file mode 100644 index 2da9568..0000000 --- a/eval-server/python/uv.lock +++ /dev/null @@ -1,1306 +0,0 @@ -version = 1 -revision = 2 -requires-python = ">=3.8" -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", - "python_full_version < '3.9'", -] - -[[package]] -name = "backports-asyncio-runner" -version = "1.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/8e/ff/70dca7d7cb1cbc0edb2c6cc0c38b65cba36cccc491eca64cabd5fe7f8670/backports_asyncio_runner-1.2.0.tar.gz", hash = "sha256:a5aa7b2b7d8f8bfcaa2b57313f70792df84e32a2a746f585213373f900b42162", size = 69893, upload-time = "2025-07-02T02:27:15.685Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a0/59/76ab57e3fe74484f48a53f8e337171b4a2349e506eabe136d7e01d059086/backports_asyncio_runner-1.2.0-py3-none-any.whl", hash = "sha256:0da0a936a8aeb554eccb426dc55af3ba63bcdc69fa1a600b5bb305413a4477b5", size = 12313, upload-time = "2025-07-02T02:27:14.263Z" }, -] - -[[package]] -name = "black" -version = "24.8.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "mypy-extensions", marker = "python_full_version < '3.9'" }, - { name = "packaging", marker = "python_full_version < '3.9'" }, - { name = "pathspec", marker = "python_full_version < '3.9'" }, - { name = "platformdirs", version = "4.3.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "tomli", marker = "python_full_version < '3.9'" }, - { name = "typing-extensions", version = "4.13.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/04/b0/46fb0d4e00372f4a86a6f8efa3cb193c9f64863615e39010b1477e010578/black-24.8.0.tar.gz", hash = "sha256:2500945420b6784c38b9ee885af039f5e7471ef284ab03fa35ecdde4688cd83f", size = 644810, upload-time = "2024-08-02T17:43:18.405Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/47/6e/74e29edf1fba3887ed7066930a87f698ffdcd52c5dbc263eabb06061672d/black-24.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:09cdeb74d494ec023ded657f7092ba518e8cf78fa8386155e4a03fdcc44679e6", size = 1632092, upload-time = "2024-08-02T17:47:26.911Z" }, - { url = "https://files.pythonhosted.org/packages/ab/49/575cb6c3faee690b05c9d11ee2e8dba8fbd6d6c134496e644c1feb1b47da/black-24.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:81c6742da39f33b08e791da38410f32e27d632260e599df7245cccee2064afeb", size = 1457529, upload-time = "2024-08-02T17:47:29.109Z" }, - { url = "https://files.pythonhosted.org/packages/7a/b4/d34099e95c437b53d01c4aa37cf93944b233066eb034ccf7897fa4e5f286/black-24.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:707a1ca89221bc8a1a64fb5e15ef39cd755633daa672a9db7498d1c19de66a42", size = 1757443, upload-time = "2024-08-02T17:46:20.306Z" }, - { url = "https://files.pythonhosted.org/packages/87/a0/6d2e4175ef364b8c4b64f8441ba041ed65c63ea1db2720d61494ac711c15/black-24.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:d6417535d99c37cee4091a2f24eb2b6d5ec42b144d50f1f2e436d9fe1916fe1a", size = 1418012, upload-time = "2024-08-02T17:47:20.33Z" }, - { url = "https://files.pythonhosted.org/packages/08/a6/0a3aa89de9c283556146dc6dbda20cd63a9c94160a6fbdebaf0918e4a3e1/black-24.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fb6e2c0b86bbd43dee042e48059c9ad7830abd5c94b0bc518c0eeec57c3eddc1", size = 1615080, upload-time = "2024-08-02T17:48:05.467Z" }, - { url = "https://files.pythonhosted.org/packages/db/94/b803d810e14588bb297e565821a947c108390a079e21dbdcb9ab6956cd7a/black-24.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:837fd281f1908d0076844bc2b801ad2d369c78c45cf800cad7b61686051041af", size = 1438143, upload-time = "2024-08-02T17:47:30.247Z" }, - { url = "https://files.pythonhosted.org/packages/a5/b5/f485e1bbe31f768e2e5210f52ea3f432256201289fd1a3c0afda693776b0/black-24.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:62e8730977f0b77998029da7971fa896ceefa2c4c4933fcd593fa599ecbf97a4", size = 1738774, upload-time = "2024-08-02T17:46:17.837Z" }, - { url = "https://files.pythonhosted.org/packages/a8/69/a000fc3736f89d1bdc7f4a879f8aaf516fb03613bb51a0154070383d95d9/black-24.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:72901b4913cbac8972ad911dc4098d5753704d1f3c56e44ae8dce99eecb0e3af", size = 1427503, upload-time = "2024-08-02T17:46:22.654Z" }, - { url = "https://files.pythonhosted.org/packages/a2/a8/05fb14195cfef32b7c8d4585a44b7499c2a4b205e1662c427b941ed87054/black-24.8.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:7c046c1d1eeb7aea9335da62472481d3bbf3fd986e093cffd35f4385c94ae368", size = 1646132, upload-time = "2024-08-02T17:49:52.843Z" }, - { url = "https://files.pythonhosted.org/packages/41/77/8d9ce42673e5cb9988f6df73c1c5c1d4e9e788053cccd7f5fb14ef100982/black-24.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:649f6d84ccbae73ab767e206772cc2d7a393a001070a4c814a546afd0d423aed", size = 1448665, upload-time = "2024-08-02T17:47:54.479Z" }, - { url = "https://files.pythonhosted.org/packages/cc/94/eff1ddad2ce1d3cc26c162b3693043c6b6b575f538f602f26fe846dfdc75/black-24.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2b59b250fdba5f9a9cd9d0ece6e6d993d91ce877d121d161e4698af3eb9c1018", size = 1762458, upload-time = "2024-08-02T17:46:19.384Z" }, - { url = 
"https://files.pythonhosted.org/packages/28/ea/18b8d86a9ca19a6942e4e16759b2fa5fc02bbc0eb33c1b866fcd387640ab/black-24.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:6e55d30d44bed36593c3163b9bc63bf58b3b30e4611e4d88a0c3c239930ed5b2", size = 1436109, upload-time = "2024-08-02T17:46:52.97Z" }, - { url = "https://files.pythonhosted.org/packages/9f/d4/ae03761ddecc1a37d7e743b89cccbcf3317479ff4b88cfd8818079f890d0/black-24.8.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:505289f17ceda596658ae81b61ebbe2d9b25aa78067035184ed0a9d855d18afd", size = 1617322, upload-time = "2024-08-02T17:51:20.203Z" }, - { url = "https://files.pythonhosted.org/packages/14/4b/4dfe67eed7f9b1ddca2ec8e4418ea74f0d1dc84d36ea874d618ffa1af7d4/black-24.8.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b19c9ad992c7883ad84c9b22aaa73562a16b819c1d8db7a1a1a49fb7ec13c7d2", size = 1442108, upload-time = "2024-08-02T17:50:40.824Z" }, - { url = "https://files.pythonhosted.org/packages/97/14/95b3f91f857034686cae0e73006b8391d76a8142d339b42970eaaf0416ea/black-24.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f13f7f386f86f8121d76599114bb8c17b69d962137fc70efe56137727c7047e", size = 1745786, upload-time = "2024-08-02T17:46:02.939Z" }, - { url = "https://files.pythonhosted.org/packages/95/54/68b8883c8aa258a6dde958cd5bdfada8382bec47c5162f4a01e66d839af1/black-24.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:f490dbd59680d809ca31efdae20e634f3fae27fba3ce0ba3208333b713bc3920", size = 1426754, upload-time = "2024-08-02T17:46:38.603Z" }, - { url = "https://files.pythonhosted.org/packages/13/b2/b3f24fdbb46f0e7ef6238e131f13572ee8279b70f237f221dd168a9dba1a/black-24.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:eab4dd44ce80dea27dc69db40dab62d4ca96112f87996bca68cd75639aeb2e4c", size = 1631706, upload-time = "2024-08-02T17:49:57.606Z" }, - { url = "https://files.pythonhosted.org/packages/d9/35/31010981e4a05202a84a3116423970fd1a59d2eda4ac0b3570fbb7029ddc/black-24.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3c4285573d4897a7610054af5a890bde7c65cb466040c5f0c8b732812d7f0e5e", size = 1457429, upload-time = "2024-08-02T17:49:12.764Z" }, - { url = "https://files.pythonhosted.org/packages/27/25/3f706b4f044dd569a20a4835c3b733dedea38d83d2ee0beb8178a6d44945/black-24.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e84e33b37be070ba135176c123ae52a51f82306def9f7d063ee302ecab2cf47", size = 1756488, upload-time = "2024-08-02T17:46:08.067Z" }, - { url = "https://files.pythonhosted.org/packages/63/72/79375cd8277cbf1c5670914e6bd4c1b15dea2c8f8e906dc21c448d0535f0/black-24.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:73bbf84ed136e45d451a260c6b73ed674652f90a2b3211d6a35e78054563a9bb", size = 1417721, upload-time = "2024-08-02T17:46:42.637Z" }, - { url = "https://files.pythonhosted.org/packages/27/1e/83fa8a787180e1632c3d831f7e58994d7aaf23a0961320d21e84f922f919/black-24.8.0-py3-none-any.whl", hash = "sha256:972085c618ee94f402da1af548a4f218c754ea7e5dc70acb168bfaca4c2542ed", size = 206504, upload-time = "2024-08-02T17:43:15.747Z" }, -] - -[[package]] -name = "black" -version = "25.1.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] -dependencies = [ - { name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { 
name = "click", version = "8.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "mypy-extensions", marker = "python_full_version >= '3.9'" }, - { name = "packaging", marker = "python_full_version >= '3.9'" }, - { name = "pathspec", marker = "python_full_version >= '3.9'" }, - { name = "platformdirs", version = "4.3.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "tomli", marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, - { name = "typing-extensions", version = "4.14.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/94/49/26a7b0f3f35da4b5a65f081943b7bcd22d7002f5f0fb8098ec1ff21cb6ef/black-25.1.0.tar.gz", hash = "sha256:33496d5cd1222ad73391352b4ae8da15253c5de89b93a80b3e2c8d9a19ec2666", size = 649449, upload-time = "2025-01-29T04:15:40.373Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4d/3b/4ba3f93ac8d90410423fdd31d7541ada9bcee1df32fb90d26de41ed40e1d/black-25.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:759e7ec1e050a15f89b770cefbf91ebee8917aac5c20483bc2d80a6c3a04df32", size = 1629419, upload-time = "2025-01-29T05:37:06.642Z" }, - { url = "https://files.pythonhosted.org/packages/b4/02/0bde0485146a8a5e694daed47561785e8b77a0466ccc1f3e485d5ef2925e/black-25.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0e519ecf93120f34243e6b0054db49c00a35f84f195d5bce7e9f5cfc578fc2da", size = 1461080, upload-time = "2025-01-29T05:37:09.321Z" }, - { url = "https://files.pythonhosted.org/packages/52/0e/abdf75183c830eaca7589144ff96d49bce73d7ec6ad12ef62185cc0f79a2/black-25.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:055e59b198df7ac0b7efca5ad7ff2516bca343276c466be72eb04a3bcc1f82d7", size = 1766886, upload-time = "2025-01-29T04:18:24.432Z" }, - { url = "https://files.pythonhosted.org/packages/dc/a6/97d8bb65b1d8a41f8a6736222ba0a334db7b7b77b8023ab4568288f23973/black-25.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:db8ea9917d6f8fc62abd90d944920d95e73c83a5ee3383493e35d271aca872e9", size = 1419404, upload-time = "2025-01-29T04:19:04.296Z" }, - { url = "https://files.pythonhosted.org/packages/7e/4f/87f596aca05c3ce5b94b8663dbfe242a12843caaa82dd3f85f1ffdc3f177/black-25.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a39337598244de4bae26475f77dda852ea00a93bd4c728e09eacd827ec929df0", size = 1614372, upload-time = "2025-01-29T05:37:11.71Z" }, - { url = "https://files.pythonhosted.org/packages/e7/d0/2c34c36190b741c59c901e56ab7f6e54dad8df05a6272a9747ecef7c6036/black-25.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:96c1c7cd856bba8e20094e36e0f948718dc688dba4a9d78c3adde52b9e6c2299", size = 1442865, upload-time = "2025-01-29T05:37:14.309Z" }, - { url = "https://files.pythonhosted.org/packages/21/d4/7518c72262468430ead45cf22bd86c883a6448b9eb43672765d69a8f1248/black-25.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bce2e264d59c91e52d8000d507eb20a9aca4a778731a08cfff7e5ac4a4bb7096", size = 1749699, upload-time = "2025-01-29T04:18:17.688Z" }, - { url = "https://files.pythonhosted.org/packages/58/db/4f5beb989b547f79096e035c4981ceb36ac2b552d0ac5f2620e941501c99/black-25.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:172b1dbff09f86ce6f4eb8edf9dede08b1fce58ba194c87d7a4f1a5aa2f5b3c2", size = 
1428028, upload-time = "2025-01-29T04:18:51.711Z" }, - { url = "https://files.pythonhosted.org/packages/83/71/3fe4741df7adf015ad8dfa082dd36c94ca86bb21f25608eb247b4afb15b2/black-25.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4b60580e829091e6f9238c848ea6750efed72140b91b048770b64e74fe04908b", size = 1650988, upload-time = "2025-01-29T05:37:16.707Z" }, - { url = "https://files.pythonhosted.org/packages/13/f3/89aac8a83d73937ccd39bbe8fc6ac8860c11cfa0af5b1c96d081facac844/black-25.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1e2978f6df243b155ef5fa7e558a43037c3079093ed5d10fd84c43900f2d8ecc", size = 1453985, upload-time = "2025-01-29T05:37:18.273Z" }, - { url = "https://files.pythonhosted.org/packages/6f/22/b99efca33f1f3a1d2552c714b1e1b5ae92efac6c43e790ad539a163d1754/black-25.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b48735872ec535027d979e8dcb20bf4f70b5ac75a8ea99f127c106a7d7aba9f", size = 1783816, upload-time = "2025-01-29T04:18:33.823Z" }, - { url = "https://files.pythonhosted.org/packages/18/7e/a27c3ad3822b6f2e0e00d63d58ff6299a99a5b3aee69fa77cd4b0076b261/black-25.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:ea0213189960bda9cf99be5b8c8ce66bb054af5e9e861249cd23471bd7b0b3ba", size = 1440860, upload-time = "2025-01-29T04:19:12.944Z" }, - { url = "https://files.pythonhosted.org/packages/98/87/0edf98916640efa5d0696e1abb0a8357b52e69e82322628f25bf14d263d1/black-25.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8f0b18a02996a836cc9c9c78e5babec10930862827b1b724ddfe98ccf2f2fe4f", size = 1650673, upload-time = "2025-01-29T05:37:20.574Z" }, - { url = "https://files.pythonhosted.org/packages/52/e5/f7bf17207cf87fa6e9b676576749c6b6ed0d70f179a3d812c997870291c3/black-25.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:afebb7098bfbc70037a053b91ae8437c3857482d3a690fefc03e9ff7aa9a5fd3", size = 1453190, upload-time = "2025-01-29T05:37:22.106Z" }, - { url = "https://files.pythonhosted.org/packages/e3/ee/adda3d46d4a9120772fae6de454c8495603c37c4c3b9c60f25b1ab6401fe/black-25.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:030b9759066a4ee5e5aca28c3c77f9c64789cdd4de8ac1df642c40b708be6171", size = 1782926, upload-time = "2025-01-29T04:18:58.564Z" }, - { url = "https://files.pythonhosted.org/packages/cc/64/94eb5f45dcb997d2082f097a3944cfc7fe87e071907f677e80788a2d7b7a/black-25.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:a22f402b410566e2d1c950708c77ebf5ebd5d0d88a6a2e87c86d9fb48afa0d18", size = 1442613, upload-time = "2025-01-29T04:19:27.63Z" }, - { url = "https://files.pythonhosted.org/packages/d3/b6/ae7507470a4830dbbfe875c701e84a4a5fb9183d1497834871a715716a92/black-25.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a1ee0a0c330f7b5130ce0caed9936a904793576ef4d2b98c40835d6a65afa6a0", size = 1628593, upload-time = "2025-01-29T05:37:23.672Z" }, - { url = "https://files.pythonhosted.org/packages/24/c1/ae36fa59a59f9363017ed397750a0cd79a470490860bc7713967d89cdd31/black-25.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f3df5f1bf91d36002b0a75389ca8663510cf0531cca8aa5c1ef695b46d98655f", size = 1460000, upload-time = "2025-01-29T05:37:25.829Z" }, - { url = "https://files.pythonhosted.org/packages/ac/b6/98f832e7a6c49aa3a464760c67c7856363aa644f2f3c74cf7d624168607e/black-25.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d9e6827d563a2c820772b32ce8a42828dc6790f095f441beef18f96aa6f8294e", size = 1765963, upload-time = 
"2025-01-29T04:18:38.116Z" }, - { url = "https://files.pythonhosted.org/packages/ce/e9/2cb0a017eb7024f70e0d2e9bdb8c5a5b078c5740c7f8816065d06f04c557/black-25.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:bacabb307dca5ebaf9c118d2d2f6903da0d62c9faa82bd21a33eecc319559355", size = 1419419, upload-time = "2025-01-29T04:18:30.191Z" }, - { url = "https://files.pythonhosted.org/packages/09/71/54e999902aed72baf26bca0d50781b01838251a462612966e9fc4891eadd/black-25.1.0-py3-none-any.whl", hash = "sha256:95e8176dae143ba9097f351d174fdaf0ccd29efb414b362ae3fd72bf0f710717", size = 207646, upload-time = "2025-01-29T04:15:38.082Z" }, -] - -[[package]] -name = "bo-eval-server" -version = "1.0.0" -source = { editable = "." } -dependencies = [ - { name = "loguru" }, - { name = "pandas", version = "2.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "pandas", version = "2.3.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "requests" }, - { name = "websockets", version = "13.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "websockets", version = "15.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, -] - -[package.optional-dependencies] -dev = [ - { name = "black", version = "24.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "black", version = "25.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "mypy", version = "1.14.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "mypy", version = "1.17.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "pytest", version = "8.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "pytest", version = "8.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "pytest-asyncio", version = "0.24.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "pytest-asyncio", version = "1.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, -] - -[package.dev-dependencies] -dev = [ - { name = "black", version = "24.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "black", version = "25.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "mypy", version = "1.14.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "mypy", version = "1.17.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "pytest", version = "8.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "pytest", version = "8.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "pytest-asyncio", version = "0.24.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "pytest-asyncio", version = "1.1.0", source = { registry = "https://pypi.org/simple" }, marker = 
"python_full_version >= '3.9'" }, -] - -[package.metadata] -requires-dist = [ - { name = "black", marker = "extra == 'dev'", specifier = ">=23.0.0" }, - { name = "loguru", specifier = ">=0.7.0" }, - { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.0.0" }, - { name = "pandas", specifier = ">=2.0.0" }, - { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0.0" }, - { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.21.0" }, - { name = "requests", specifier = ">=2.31.0" }, - { name = "websockets", specifier = ">=11.0.0" }, -] -provides-extras = ["dev"] - -[package.metadata.requires-dev] -dev = [ - { name = "black", specifier = ">=24.8.0" }, - { name = "mypy", specifier = ">=1.14.1" }, - { name = "pytest", specifier = ">=8.3.5" }, - { name = "pytest-asyncio", specifier = ">=0.24.0" }, -] - -[[package]] -name = "certifi" -version = "2025.8.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/dc/67/960ebe6bf230a96cda2e0abcf73af550ec4f090005363542f0765df162e0/certifi-2025.8.3.tar.gz", hash = "sha256:e564105f78ded564e3ae7c923924435e1daa7463faeab5bb932bc53ffae63407", size = 162386, upload-time = "2025-08-03T03:07:47.08Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e5/48/1549795ba7742c948d2ad169c1c8cdbae65bc450d6cd753d124b17c8cd32/certifi-2025.8.3-py3-none-any.whl", hash = "sha256:f6c12493cfb1b06ba2ff328595af9350c65d6644968e5d3a2ffd78699af217a5", size = 161216, upload-time = "2025-08-03T03:07:45.777Z" }, -] - -[[package]] -name = "charset-normalizer" -version = "3.4.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e4/33/89c2ced2b67d1c2a61c19c6751aa8902d46ce3dacb23600a283619f5a12d/charset_normalizer-3.4.2.tar.gz", hash = "sha256:5baececa9ecba31eff645232d59845c07aa030f0c81ee70184a90d35099a0e63", size = 126367, upload-time = "2025-05-02T08:34:42.01Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/95/28/9901804da60055b406e1a1c5ba7aac1276fb77f1dde635aabfc7fd84b8ab/charset_normalizer-3.4.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c48ed483eb946e6c04ccbe02c6b4d1d48e51944b6db70f697e089c193404941", size = 201818, upload-time = "2025-05-02T08:31:46.725Z" }, - { url = "https://files.pythonhosted.org/packages/d9/9b/892a8c8af9110935e5adcbb06d9c6fe741b6bb02608c6513983048ba1a18/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2d318c11350e10662026ad0eb71bb51c7812fc8590825304ae0bdd4ac283acd", size = 144649, upload-time = "2025-05-02T08:31:48.889Z" }, - { url = "https://files.pythonhosted.org/packages/7b/a5/4179abd063ff6414223575e008593861d62abfc22455b5d1a44995b7c101/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9cbfacf36cb0ec2897ce0ebc5d08ca44213af24265bd56eca54bee7923c48fd6", size = 155045, upload-time = "2025-05-02T08:31:50.757Z" }, - { url = "https://files.pythonhosted.org/packages/3b/95/bc08c7dfeddd26b4be8c8287b9bb055716f31077c8b0ea1cd09553794665/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18dd2e350387c87dabe711b86f83c9c78af772c748904d372ade190b5c7c9d4d", size = 147356, upload-time = "2025-05-02T08:31:52.634Z" }, - { url = "https://files.pythonhosted.org/packages/a8/2d/7a5b635aa65284bf3eab7653e8b4151ab420ecbae918d3e359d1947b4d61/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:8075c35cd58273fee266c58c0c9b670947c19df5fb98e7b66710e04ad4e9ff86", size = 149471, upload-time = "2025-05-02T08:31:56.207Z" }, - { url = "https://files.pythonhosted.org/packages/ae/38/51fc6ac74251fd331a8cfdb7ec57beba8c23fd5493f1050f71c87ef77ed0/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5bf4545e3b962767e5c06fe1738f951f77d27967cb2caa64c28be7c4563e162c", size = 151317, upload-time = "2025-05-02T08:31:57.613Z" }, - { url = "https://files.pythonhosted.org/packages/b7/17/edee1e32215ee6e9e46c3e482645b46575a44a2d72c7dfd49e49f60ce6bf/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:7a6ab32f7210554a96cd9e33abe3ddd86732beeafc7a28e9955cdf22ffadbab0", size = 146368, upload-time = "2025-05-02T08:31:59.468Z" }, - { url = "https://files.pythonhosted.org/packages/26/2c/ea3e66f2b5f21fd00b2825c94cafb8c326ea6240cd80a91eb09e4a285830/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:b33de11b92e9f75a2b545d6e9b6f37e398d86c3e9e9653c4864eb7e89c5773ef", size = 154491, upload-time = "2025-05-02T08:32:01.219Z" }, - { url = "https://files.pythonhosted.org/packages/52/47/7be7fa972422ad062e909fd62460d45c3ef4c141805b7078dbab15904ff7/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8755483f3c00d6c9a77f490c17e6ab0c8729e39e6390328e42521ef175380ae6", size = 157695, upload-time = "2025-05-02T08:32:03.045Z" }, - { url = "https://files.pythonhosted.org/packages/2f/42/9f02c194da282b2b340f28e5fb60762de1151387a36842a92b533685c61e/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:68a328e5f55ec37c57f19ebb1fdc56a248db2e3e9ad769919a58672958e8f366", size = 154849, upload-time = "2025-05-02T08:32:04.651Z" }, - { url = "https://files.pythonhosted.org/packages/67/44/89cacd6628f31fb0b63201a618049be4be2a7435a31b55b5eb1c3674547a/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:21b2899062867b0e1fde9b724f8aecb1af14f2778d69aacd1a5a1853a597a5db", size = 150091, upload-time = "2025-05-02T08:32:06.719Z" }, - { url = "https://files.pythonhosted.org/packages/1f/79/4b8da9f712bc079c0f16b6d67b099b0b8d808c2292c937f267d816ec5ecc/charset_normalizer-3.4.2-cp310-cp310-win32.whl", hash = "sha256:e8082b26888e2f8b36a042a58307d5b917ef2b1cacab921ad3323ef91901c71a", size = 98445, upload-time = "2025-05-02T08:32:08.66Z" }, - { url = "https://files.pythonhosted.org/packages/7d/d7/96970afb4fb66497a40761cdf7bd4f6fca0fc7bafde3a84f836c1f57a926/charset_normalizer-3.4.2-cp310-cp310-win_amd64.whl", hash = "sha256:f69a27e45c43520f5487f27627059b64aaf160415589230992cec34c5e18a509", size = 105782, upload-time = "2025-05-02T08:32:10.46Z" }, - { url = "https://files.pythonhosted.org/packages/05/85/4c40d00dcc6284a1c1ad5de5e0996b06f39d8232f1031cd23c2f5c07ee86/charset_normalizer-3.4.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:be1e352acbe3c78727a16a455126d9ff83ea2dfdcbc83148d2982305a04714c2", size = 198794, upload-time = "2025-05-02T08:32:11.945Z" }, - { url = "https://files.pythonhosted.org/packages/41/d9/7a6c0b9db952598e97e93cbdfcb91bacd89b9b88c7c983250a77c008703c/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa88ca0b1932e93f2d961bf3addbb2db902198dca337d88c89e1559e066e7645", size = 142846, upload-time = "2025-05-02T08:32:13.946Z" }, - { url = 
"https://files.pythonhosted.org/packages/66/82/a37989cda2ace7e37f36c1a8ed16c58cf48965a79c2142713244bf945c89/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d524ba3f1581b35c03cb42beebab4a13e6cdad7b36246bd22541fa585a56cccd", size = 153350, upload-time = "2025-05-02T08:32:15.873Z" }, - { url = "https://files.pythonhosted.org/packages/df/68/a576b31b694d07b53807269d05ec3f6f1093e9545e8607121995ba7a8313/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28a1005facc94196e1fb3e82a3d442a9d9110b8434fc1ded7a24a2983c9888d8", size = 145657, upload-time = "2025-05-02T08:32:17.283Z" }, - { url = "https://files.pythonhosted.org/packages/92/9b/ad67f03d74554bed3aefd56fe836e1623a50780f7c998d00ca128924a499/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdb20a30fe1175ecabed17cbf7812f7b804b8a315a25f24678bcdf120a90077f", size = 147260, upload-time = "2025-05-02T08:32:18.807Z" }, - { url = "https://files.pythonhosted.org/packages/a6/e6/8aebae25e328160b20e31a7e9929b1578bbdc7f42e66f46595a432f8539e/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0f5d9ed7f254402c9e7d35d2f5972c9bbea9040e99cd2861bd77dc68263277c7", size = 149164, upload-time = "2025-05-02T08:32:20.333Z" }, - { url = "https://files.pythonhosted.org/packages/8b/f2/b3c2f07dbcc248805f10e67a0262c93308cfa149a4cd3d1fe01f593e5fd2/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:efd387a49825780ff861998cd959767800d54f8308936b21025326de4b5a42b9", size = 144571, upload-time = "2025-05-02T08:32:21.86Z" }, - { url = "https://files.pythonhosted.org/packages/60/5b/c3f3a94bc345bc211622ea59b4bed9ae63c00920e2e8f11824aa5708e8b7/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f0aa37f3c979cf2546b73e8222bbfa3dc07a641585340179d768068e3455e544", size = 151952, upload-time = "2025-05-02T08:32:23.434Z" }, - { url = "https://files.pythonhosted.org/packages/e2/4d/ff460c8b474122334c2fa394a3f99a04cf11c646da895f81402ae54f5c42/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e70e990b2137b29dc5564715de1e12701815dacc1d056308e2b17e9095372a82", size = 155959, upload-time = "2025-05-02T08:32:24.993Z" }, - { url = "https://files.pythonhosted.org/packages/a2/2b/b964c6a2fda88611a1fe3d4c400d39c66a42d6c169c924818c848f922415/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:0c8c57f84ccfc871a48a47321cfa49ae1df56cd1d965a09abe84066f6853b9c0", size = 153030, upload-time = "2025-05-02T08:32:26.435Z" }, - { url = "https://files.pythonhosted.org/packages/59/2e/d3b9811db26a5ebf444bc0fa4f4be5aa6d76fc6e1c0fd537b16c14e849b6/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6b66f92b17849b85cad91259efc341dce9c1af48e2173bf38a85c6329f1033e5", size = 148015, upload-time = "2025-05-02T08:32:28.376Z" }, - { url = "https://files.pythonhosted.org/packages/90/07/c5fd7c11eafd561bb51220d600a788f1c8d77c5eef37ee49454cc5c35575/charset_normalizer-3.4.2-cp311-cp311-win32.whl", hash = "sha256:daac4765328a919a805fa5e2720f3e94767abd632ae410a9062dff5412bae65a", size = 98106, upload-time = "2025-05-02T08:32:30.281Z" }, - { url = "https://files.pythonhosted.org/packages/a8/05/5e33dbef7e2f773d672b6d79f10ec633d4a71cd96db6673625838a4fd532/charset_normalizer-3.4.2-cp311-cp311-win_amd64.whl", hash = 
"sha256:e53efc7c7cee4c1e70661e2e112ca46a575f90ed9ae3fef200f2a25e954f4b28", size = 105402, upload-time = "2025-05-02T08:32:32.191Z" }, - { url = "https://files.pythonhosted.org/packages/d7/a4/37f4d6035c89cac7930395a35cc0f1b872e652eaafb76a6075943754f095/charset_normalizer-3.4.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0c29de6a1a95f24b9a1aa7aefd27d2487263f00dfd55a77719b530788f75cff7", size = 199936, upload-time = "2025-05-02T08:32:33.712Z" }, - { url = "https://files.pythonhosted.org/packages/ee/8a/1a5e33b73e0d9287274f899d967907cd0bf9c343e651755d9307e0dbf2b3/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cddf7bd982eaa998934a91f69d182aec997c6c468898efe6679af88283b498d3", size = 143790, upload-time = "2025-05-02T08:32:35.768Z" }, - { url = "https://files.pythonhosted.org/packages/66/52/59521f1d8e6ab1482164fa21409c5ef44da3e9f653c13ba71becdd98dec3/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcbe676a55d7445b22c10967bceaaf0ee69407fbe0ece4d032b6eb8d4565982a", size = 153924, upload-time = "2025-05-02T08:32:37.284Z" }, - { url = "https://files.pythonhosted.org/packages/86/2d/fb55fdf41964ec782febbf33cb64be480a6b8f16ded2dbe8db27a405c09f/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d41c4d287cfc69060fa91cae9683eacffad989f1a10811995fa309df656ec214", size = 146626, upload-time = "2025-05-02T08:32:38.803Z" }, - { url = "https://files.pythonhosted.org/packages/8c/73/6ede2ec59bce19b3edf4209d70004253ec5f4e319f9a2e3f2f15601ed5f7/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e594135de17ab3866138f496755f302b72157d115086d100c3f19370839dd3a", size = 148567, upload-time = "2025-05-02T08:32:40.251Z" }, - { url = "https://files.pythonhosted.org/packages/09/14/957d03c6dc343c04904530b6bef4e5efae5ec7d7990a7cbb868e4595ee30/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cf713fe9a71ef6fd5adf7a79670135081cd4431c2943864757f0fa3a65b1fafd", size = 150957, upload-time = "2025-05-02T08:32:41.705Z" }, - { url = "https://files.pythonhosted.org/packages/0d/c8/8174d0e5c10ccebdcb1b53cc959591c4c722a3ad92461a273e86b9f5a302/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a370b3e078e418187da8c3674eddb9d983ec09445c99a3a263c2011993522981", size = 145408, upload-time = "2025-05-02T08:32:43.709Z" }, - { url = "https://files.pythonhosted.org/packages/58/aa/8904b84bc8084ac19dc52feb4f5952c6df03ffb460a887b42615ee1382e8/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:a955b438e62efdf7e0b7b52a64dc5c3396e2634baa62471768a64bc2adb73d5c", size = 153399, upload-time = "2025-05-02T08:32:46.197Z" }, - { url = "https://files.pythonhosted.org/packages/c2/26/89ee1f0e264d201cb65cf054aca6038c03b1a0c6b4ae998070392a3ce605/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:7222ffd5e4de8e57e03ce2cef95a4c43c98fcb72ad86909abdfc2c17d227fc1b", size = 156815, upload-time = "2025-05-02T08:32:48.105Z" }, - { url = "https://files.pythonhosted.org/packages/fd/07/68e95b4b345bad3dbbd3a8681737b4338ff2c9df29856a6d6d23ac4c73cb/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:bee093bf902e1d8fc0ac143c88902c3dfc8941f7ea1d6a8dd2bcb786d33db03d", size = 154537, upload-time = "2025-05-02T08:32:49.719Z" }, - { url = 
"https://files.pythonhosted.org/packages/77/1a/5eefc0ce04affb98af07bc05f3bac9094513c0e23b0562d64af46a06aae4/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:dedb8adb91d11846ee08bec4c8236c8549ac721c245678282dcb06b221aab59f", size = 149565, upload-time = "2025-05-02T08:32:51.404Z" }, - { url = "https://files.pythonhosted.org/packages/37/a0/2410e5e6032a174c95e0806b1a6585eb21e12f445ebe239fac441995226a/charset_normalizer-3.4.2-cp312-cp312-win32.whl", hash = "sha256:db4c7bf0e07fc3b7d89ac2a5880a6a8062056801b83ff56d8464b70f65482b6c", size = 98357, upload-time = "2025-05-02T08:32:53.079Z" }, - { url = "https://files.pythonhosted.org/packages/6c/4f/c02d5c493967af3eda9c771ad4d2bbc8df6f99ddbeb37ceea6e8716a32bc/charset_normalizer-3.4.2-cp312-cp312-win_amd64.whl", hash = "sha256:5a9979887252a82fefd3d3ed2a8e3b937a7a809f65dcb1e068b090e165bbe99e", size = 105776, upload-time = "2025-05-02T08:32:54.573Z" }, - { url = "https://files.pythonhosted.org/packages/ea/12/a93df3366ed32db1d907d7593a94f1fe6293903e3e92967bebd6950ed12c/charset_normalizer-3.4.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:926ca93accd5d36ccdabd803392ddc3e03e6d4cd1cf17deff3b989ab8e9dbcf0", size = 199622, upload-time = "2025-05-02T08:32:56.363Z" }, - { url = "https://files.pythonhosted.org/packages/04/93/bf204e6f344c39d9937d3c13c8cd5bbfc266472e51fc8c07cb7f64fcd2de/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eba9904b0f38a143592d9fc0e19e2df0fa2e41c3c3745554761c5f6447eedabf", size = 143435, upload-time = "2025-05-02T08:32:58.551Z" }, - { url = "https://files.pythonhosted.org/packages/22/2a/ea8a2095b0bafa6c5b5a55ffdc2f924455233ee7b91c69b7edfcc9e02284/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3fddb7e2c84ac87ac3a947cb4e66d143ca5863ef48e4a5ecb83bd48619e4634e", size = 153653, upload-time = "2025-05-02T08:33:00.342Z" }, - { url = "https://files.pythonhosted.org/packages/b6/57/1b090ff183d13cef485dfbe272e2fe57622a76694061353c59da52c9a659/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98f862da73774290f251b9df8d11161b6cf25b599a66baf087c1ffe340e9bfd1", size = 146231, upload-time = "2025-05-02T08:33:02.081Z" }, - { url = "https://files.pythonhosted.org/packages/e2/28/ffc026b26f441fc67bd21ab7f03b313ab3fe46714a14b516f931abe1a2d8/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c9379d65defcab82d07b2a9dfbfc2e95bc8fe0ebb1b176a3190230a3ef0e07c", size = 148243, upload-time = "2025-05-02T08:33:04.063Z" }, - { url = "https://files.pythonhosted.org/packages/c0/0f/9abe9bd191629c33e69e47c6ef45ef99773320e9ad8e9cb08b8ab4a8d4cb/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e635b87f01ebc977342e2697d05b56632f5f879a4f15955dfe8cef2448b51691", size = 150442, upload-time = "2025-05-02T08:33:06.418Z" }, - { url = "https://files.pythonhosted.org/packages/67/7c/a123bbcedca91d5916c056407f89a7f5e8fdfce12ba825d7d6b9954a1a3c/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1c95a1e2902a8b722868587c0e1184ad5c55631de5afc0eb96bc4b0d738092c0", size = 145147, upload-time = "2025-05-02T08:33:08.183Z" }, - { url = "https://files.pythonhosted.org/packages/ec/fe/1ac556fa4899d967b83e9893788e86b6af4d83e4726511eaaad035e36595/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_i686.whl", hash = 
"sha256:ef8de666d6179b009dce7bcb2ad4c4a779f113f12caf8dc77f0162c29d20490b", size = 153057, upload-time = "2025-05-02T08:33:09.986Z" }, - { url = "https://files.pythonhosted.org/packages/2b/ff/acfc0b0a70b19e3e54febdd5301a98b72fa07635e56f24f60502e954c461/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:32fc0341d72e0f73f80acb0a2c94216bd704f4f0bce10aedea38f30502b271ff", size = 156454, upload-time = "2025-05-02T08:33:11.814Z" }, - { url = "https://files.pythonhosted.org/packages/92/08/95b458ce9c740d0645feb0e96cea1f5ec946ea9c580a94adfe0b617f3573/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:289200a18fa698949d2b39c671c2cc7a24d44096784e76614899a7ccf2574b7b", size = 154174, upload-time = "2025-05-02T08:33:13.707Z" }, - { url = "https://files.pythonhosted.org/packages/78/be/8392efc43487ac051eee6c36d5fbd63032d78f7728cb37aebcc98191f1ff/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4a476b06fbcf359ad25d34a057b7219281286ae2477cc5ff5e3f70a246971148", size = 149166, upload-time = "2025-05-02T08:33:15.458Z" }, - { url = "https://files.pythonhosted.org/packages/44/96/392abd49b094d30b91d9fbda6a69519e95802250b777841cf3bda8fe136c/charset_normalizer-3.4.2-cp313-cp313-win32.whl", hash = "sha256:aaeeb6a479c7667fbe1099af9617c83aaca22182d6cf8c53966491a0f1b7ffb7", size = 98064, upload-time = "2025-05-02T08:33:17.06Z" }, - { url = "https://files.pythonhosted.org/packages/e9/b0/0200da600134e001d91851ddc797809e2fe0ea72de90e09bec5a2fbdaccb/charset_normalizer-3.4.2-cp313-cp313-win_amd64.whl", hash = "sha256:aa6af9e7d59f9c12b33ae4e9450619cf2488e2bbe9b44030905877f0b2324980", size = 105641, upload-time = "2025-05-02T08:33:18.753Z" }, - { url = "https://files.pythonhosted.org/packages/4c/fd/f700cfd4ad876def96d2c769d8a32d808b12d1010b6003dc6639157f99ee/charset_normalizer-3.4.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:76af085e67e56c8816c3ccf256ebd136def2ed9654525348cfa744b6802b69eb", size = 198257, upload-time = "2025-05-02T08:33:45.511Z" }, - { url = "https://files.pythonhosted.org/packages/3a/95/6eec4cbbbd119e6a402e3bfd16246785cc52ce64cf21af2ecdf7b3a08e91/charset_normalizer-3.4.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e45ba65510e2647721e35323d6ef54c7974959f6081b58d4ef5d87c60c84919a", size = 143453, upload-time = "2025-05-02T08:33:47.463Z" }, - { url = "https://files.pythonhosted.org/packages/b6/b3/d4f913660383b3d93dbe6f687a312ea9f7e89879ae883c4e8942048174d4/charset_normalizer-3.4.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:046595208aae0120559a67693ecc65dd75d46f7bf687f159127046628178dc45", size = 153130, upload-time = "2025-05-02T08:33:50.568Z" }, - { url = "https://files.pythonhosted.org/packages/e5/69/7540141529eabc55bf19cc05cd9b61c2078bebfcdbd3e799af99b777fc28/charset_normalizer-3.4.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:75d10d37a47afee94919c4fab4c22b9bc2a8bf7d4f46f87363bcf0573f3ff4f5", size = 145688, upload-time = "2025-05-02T08:33:52.828Z" }, - { url = "https://files.pythonhosted.org/packages/2e/bb/d76d3d6e340fb0967c43c564101e28a78c9a363ea62f736a68af59ee3683/charset_normalizer-3.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6333b3aa5a12c26b2a4d4e7335a28f1475e0e5e17d69d55141ee3cab736f66d1", size = 147418, upload-time = "2025-05-02T08:33:54.718Z" }, - { url = 
"https://files.pythonhosted.org/packages/3e/ef/b7c1f39c0dc3808160c8b72e0209c2479393966313bfebc833533cfff9cc/charset_normalizer-3.4.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e8323a9b031aa0393768b87f04b4164a40037fb2a3c11ac06a03ffecd3618027", size = 150066, upload-time = "2025-05-02T08:33:56.597Z" }, - { url = "https://files.pythonhosted.org/packages/20/26/4e47cc23d2a4a5eb6ed7d6f0f8cda87d753e2f8abc936d5cf5ad2aae8518/charset_normalizer-3.4.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:24498ba8ed6c2e0b56d4acbf83f2d989720a93b41d712ebd4f4979660db4417b", size = 144499, upload-time = "2025-05-02T08:33:58.637Z" }, - { url = "https://files.pythonhosted.org/packages/d7/9c/efdf59dd46593cecad0548d36a702683a0bdc056793398a9cd1e1546ad21/charset_normalizer-3.4.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:844da2b5728b5ce0e32d863af26f32b5ce61bc4273a9c720a9f3aa9df73b1455", size = 152954, upload-time = "2025-05-02T08:34:00.552Z" }, - { url = "https://files.pythonhosted.org/packages/59/b3/4e8b73f7299d9aaabd7cd26db4a765f741b8e57df97b034bb8de15609002/charset_normalizer-3.4.2-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:65c981bdbd3f57670af8b59777cbfae75364b483fa8a9f420f08094531d54a01", size = 155876, upload-time = "2025-05-02T08:34:02.527Z" }, - { url = "https://files.pythonhosted.org/packages/53/cb/6fa0ccf941a069adce3edb8a1e430bc80e4929f4d43b5140fdf8628bdf7d/charset_normalizer-3.4.2-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:3c21d4fca343c805a52c0c78edc01e3477f6dd1ad7c47653241cf2a206d4fc58", size = 153186, upload-time = "2025-05-02T08:34:04.481Z" }, - { url = "https://files.pythonhosted.org/packages/ac/c6/80b93fabc626b75b1665ffe405e28c3cef0aae9237c5c05f15955af4edd8/charset_normalizer-3.4.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:dc7039885fa1baf9be153a0626e337aa7ec8bf96b0128605fb0d77788ddc1681", size = 148007, upload-time = "2025-05-02T08:34:06.888Z" }, - { url = "https://files.pythonhosted.org/packages/41/eb/c7367ac326a2628e4f05b5c737c86fe4a8eb3ecc597a4243fc65720b3eeb/charset_normalizer-3.4.2-cp38-cp38-win32.whl", hash = "sha256:8272b73e1c5603666618805fe821edba66892e2870058c94c53147602eab29c7", size = 97923, upload-time = "2025-05-02T08:34:08.792Z" }, - { url = "https://files.pythonhosted.org/packages/7c/02/1c82646582ccf2c757fa6af69b1a3ea88744b8d2b4ab93b7686b2533e023/charset_normalizer-3.4.2-cp38-cp38-win_amd64.whl", hash = "sha256:70f7172939fdf8790425ba31915bfbe8335030f05b9913d7ae00a87d4395620a", size = 105020, upload-time = "2025-05-02T08:34:10.6Z" }, - { url = "https://files.pythonhosted.org/packages/28/f8/dfb01ff6cc9af38552c69c9027501ff5a5117c4cc18dcd27cb5259fa1888/charset_normalizer-3.4.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:005fa3432484527f9732ebd315da8da8001593e2cf46a3d817669f062c3d9ed4", size = 201671, upload-time = "2025-05-02T08:34:12.696Z" }, - { url = "https://files.pythonhosted.org/packages/32/fb/74e26ee556a9dbfe3bd264289b67be1e6d616329403036f6507bb9f3f29c/charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e92fca20c46e9f5e1bb485887d074918b13543b1c2a1185e69bb8d17ab6236a7", size = 144744, upload-time = "2025-05-02T08:34:14.665Z" }, - { url = "https://files.pythonhosted.org/packages/ad/06/8499ee5aa7addc6f6d72e068691826ff093329fe59891e83b092ae4c851c/charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:50bf98d5e563b83cc29471fa114366e6806bc06bc7a25fd59641e41445327836", size = 154993, 
upload-time = "2025-05-02T08:34:17.134Z" }, - { url = "https://files.pythonhosted.org/packages/f1/a2/5e4c187680728219254ef107a6949c60ee0e9a916a5dadb148c7ae82459c/charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:721c76e84fe669be19c5791da68232ca2e05ba5185575086e384352e2c309597", size = 147382, upload-time = "2025-05-02T08:34:19.081Z" }, - { url = "https://files.pythonhosted.org/packages/4c/fe/56aca740dda674f0cc1ba1418c4d84534be51f639b5f98f538b332dc9a95/charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82d8fd25b7f4675d0c47cf95b594d4e7b158aca33b76aa63d07186e13c0e0ab7", size = 149536, upload-time = "2025-05-02T08:34:21.073Z" }, - { url = "https://files.pythonhosted.org/packages/53/13/db2e7779f892386b589173dd689c1b1e304621c5792046edd8a978cbf9e0/charset_normalizer-3.4.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3daeac64d5b371dea99714f08ffc2c208522ec6b06fbc7866a450dd446f5c0f", size = 151349, upload-time = "2025-05-02T08:34:23.193Z" }, - { url = "https://files.pythonhosted.org/packages/69/35/e52ab9a276186f729bce7a0638585d2982f50402046e4b0faa5d2c3ef2da/charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:dccab8d5fa1ef9bfba0590ecf4d46df048d18ffe3eec01eeb73a42e0d9e7a8ba", size = 146365, upload-time = "2025-05-02T08:34:25.187Z" }, - { url = "https://files.pythonhosted.org/packages/a6/d8/af7333f732fc2e7635867d56cb7c349c28c7094910c72267586947561b4b/charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:aaf27faa992bfee0264dc1f03f4c75e9fcdda66a519db6b957a3f826e285cf12", size = 154499, upload-time = "2025-05-02T08:34:27.359Z" }, - { url = "https://files.pythonhosted.org/packages/7a/3d/a5b2e48acef264d71e036ff30bcc49e51bde80219bb628ba3e00cf59baac/charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:eb30abc20df9ab0814b5a2524f23d75dcf83cde762c161917a2b4b7b55b1e518", size = 157735, upload-time = "2025-05-02T08:34:29.798Z" }, - { url = "https://files.pythonhosted.org/packages/85/d8/23e2c112532a29f3eef374375a8684a4f3b8e784f62b01da931186f43494/charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:c72fbbe68c6f32f251bdc08b8611c7b3060612236e960ef848e0a517ddbe76c5", size = 154786, upload-time = "2025-05-02T08:34:31.858Z" }, - { url = "https://files.pythonhosted.org/packages/c7/57/93e0169f08ecc20fe82d12254a200dfaceddc1c12a4077bf454ecc597e33/charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:982bb1e8b4ffda883b3d0a521e23abcd6fd17418f6d2c4118d257a10199c0ce3", size = 150203, upload-time = "2025-05-02T08:34:33.88Z" }, - { url = "https://files.pythonhosted.org/packages/2c/9d/9bf2b005138e7e060d7ebdec7503d0ef3240141587651f4b445bdf7286c2/charset_normalizer-3.4.2-cp39-cp39-win32.whl", hash = "sha256:43e0933a0eff183ee85833f341ec567c0980dae57c464d8a508e1b2ceb336471", size = 98436, upload-time = "2025-05-02T08:34:35.907Z" }, - { url = "https://files.pythonhosted.org/packages/6d/24/5849d46cf4311bbf21b424c443b09b459f5b436b1558c04e45dbb7cc478b/charset_normalizer-3.4.2-cp39-cp39-win_amd64.whl", hash = "sha256:d11b54acf878eef558599658b0ffca78138c8c3655cf4f3a4a673c437e67732e", size = 105772, upload-time = "2025-05-02T08:34:37.935Z" }, - { url = "https://files.pythonhosted.org/packages/20/94/c5790835a017658cbfabd07f3bfb549140c3ac458cfc196323996b10095a/charset_normalizer-3.4.2-py3-none-any.whl", hash = 
"sha256:7f56930ab0abd1c45cd15be65cc741c28b1c9a34876ce8c17a2fa107810c0af0", size = 52626, upload-time = "2025-05-02T08:34:40.053Z" }, -] - -[[package]] -name = "click" -version = "8.1.8" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version == '3.9.*'", - "python_full_version < '3.9'", -] -dependencies = [ - { name = "colorama", marker = "python_full_version < '3.10' and sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593, upload-time = "2024-12-21T18:38:44.339Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188, upload-time = "2024-12-21T18:38:41.666Z" }, -] - -[[package]] -name = "click" -version = "8.2.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] -dependencies = [ - { name = "colorama", marker = "python_full_version >= '3.10' and sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/60/6c/8ca2efa64cf75a977a0d7fac081354553ebe483345c734fb6b6515d96bbc/click-8.2.1.tar.gz", hash = "sha256:27c491cc05d968d271d5a1db13e3b5a184636d9d930f148c50b038f0d0646202", size = 286342, upload-time = "2025-05-20T23:19:49.832Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl", hash = "sha256:61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b", size = 102215, upload-time = "2025-05-20T23:19:47.796Z" }, -] - -[[package]] -name = "colorama" -version = "0.4.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, -] - -[[package]] -name = "exceptiongroup" -version = "1.3.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions", version = "4.13.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "typing-extensions", version = "4.14.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/36/f4/c6e662dade71f56cd2f3735141b265c3c79293c109549c1e6933b0651ffc/exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10", size = 16674, upload-time = "2025-05-10T17:42:49.33Z" }, -] - -[[package]] -name = "idna" -version = "3.10" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload-time = "2024-09-15T18:07:39.745Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" }, -] - -[[package]] -name = "iniconfig" -version = "2.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793, upload-time = "2025-03-19T20:09:59.721Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050, upload-time = "2025-03-19T20:10:01.071Z" }, -] - -[[package]] -name = "loguru" -version = "0.7.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, - { name = "win32-setctime", marker = "sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/3a/05/a1dae3dffd1116099471c643b8924f5aa6524411dc6c63fdae648c4f1aca/loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6", size = 63559, upload-time = "2024-12-06T11:20:56.608Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c", size = 61595, upload-time = "2024-12-06T11:20:54.538Z" }, -] - -[[package]] -name = "mypy" -version = "1.14.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "mypy-extensions", marker = "python_full_version < '3.9'" }, - { name = "tomli", marker = "python_full_version < '3.9'" }, - { name = "typing-extensions", version = "4.13.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b9/eb/2c92d8ea1e684440f54fa49ac5d9a5f19967b7b472a281f419e69a8d228e/mypy-1.14.1.tar.gz", hash = "sha256:7ec88144fe9b510e8475ec2f5f251992690fcf89ccb4500b214b4226abcd32d6", size = 3216051, upload-time = "2024-12-30T16:39:07.335Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9b/7a/87ae2adb31d68402da6da1e5f30c07ea6063e9f09b5e7cfc9dfa44075e74/mypy-1.14.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:52686e37cf13d559f668aa398dd7ddf1f92c5d613e4f8cb262be2fb4fedb0fcb", size = 11211002, upload-time 
= "2024-12-30T16:37:22.435Z" }, - { url = "https://files.pythonhosted.org/packages/e1/23/eada4c38608b444618a132be0d199b280049ded278b24cbb9d3fc59658e4/mypy-1.14.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1fb545ca340537d4b45d3eecdb3def05e913299ca72c290326be19b3804b39c0", size = 10358400, upload-time = "2024-12-30T16:37:53.526Z" }, - { url = "https://files.pythonhosted.org/packages/43/c9/d6785c6f66241c62fd2992b05057f404237deaad1566545e9f144ced07f5/mypy-1.14.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:90716d8b2d1f4cd503309788e51366f07c56635a3309b0f6a32547eaaa36a64d", size = 12095172, upload-time = "2024-12-30T16:37:50.332Z" }, - { url = "https://files.pythonhosted.org/packages/c3/62/daa7e787770c83c52ce2aaf1a111eae5893de9e004743f51bfcad9e487ec/mypy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2ae753f5c9fef278bcf12e1a564351764f2a6da579d4a81347e1d5a15819997b", size = 12828732, upload-time = "2024-12-30T16:37:29.96Z" }, - { url = "https://files.pythonhosted.org/packages/1b/a2/5fb18318a3637f29f16f4e41340b795da14f4751ef4f51c99ff39ab62e52/mypy-1.14.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e0fe0f5feaafcb04505bcf439e991c6d8f1bf8b15f12b05feeed96e9e7bf1427", size = 13012197, upload-time = "2024-12-30T16:38:05.037Z" }, - { url = "https://files.pythonhosted.org/packages/28/99/e153ce39105d164b5f02c06c35c7ba958aaff50a2babba7d080988b03fe7/mypy-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:7d54bd85b925e501c555a3227f3ec0cfc54ee8b6930bd6141ec872d1c572f81f", size = 9780836, upload-time = "2024-12-30T16:37:19.726Z" }, - { url = "https://files.pythonhosted.org/packages/da/11/a9422850fd506edbcdc7f6090682ecceaf1f87b9dd847f9df79942da8506/mypy-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f995e511de847791c3b11ed90084a7a0aafdc074ab88c5a9711622fe4751138c", size = 11120432, upload-time = "2024-12-30T16:37:11.533Z" }, - { url = "https://files.pythonhosted.org/packages/b6/9e/47e450fd39078d9c02d620545b2cb37993a8a8bdf7db3652ace2f80521ca/mypy-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d64169ec3b8461311f8ce2fd2eb5d33e2d0f2c7b49116259c51d0d96edee48d1", size = 10279515, upload-time = "2024-12-30T16:37:40.724Z" }, - { url = "https://files.pythonhosted.org/packages/01/b5/6c8d33bd0f851a7692a8bfe4ee75eb82b6983a3cf39e5e32a5d2a723f0c1/mypy-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ba24549de7b89b6381b91fbc068d798192b1b5201987070319889e93038967a8", size = 12025791, upload-time = "2024-12-30T16:36:58.73Z" }, - { url = "https://files.pythonhosted.org/packages/f0/4c/e10e2c46ea37cab5c471d0ddaaa9a434dc1d28650078ac1b56c2d7b9b2e4/mypy-1.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:183cf0a45457d28ff9d758730cd0210419ac27d4d3f285beda038c9083363b1f", size = 12749203, upload-time = "2024-12-30T16:37:03.741Z" }, - { url = "https://files.pythonhosted.org/packages/88/55/beacb0c69beab2153a0f57671ec07861d27d735a0faff135a494cd4f5020/mypy-1.14.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f2a0ecc86378f45347f586e4163d1769dd81c5a223d577fe351f26b179e148b1", size = 12885900, upload-time = "2024-12-30T16:37:57.948Z" }, - { url = "https://files.pythonhosted.org/packages/a2/75/8c93ff7f315c4d086a2dfcde02f713004357d70a163eddb6c56a6a5eff40/mypy-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:ad3301ebebec9e8ee7135d8e3109ca76c23752bac1e717bc84cd3836b4bf3eae", 
size = 9777869, upload-time = "2024-12-30T16:37:33.428Z" }, - { url = "https://files.pythonhosted.org/packages/43/1b/b38c079609bb4627905b74fc6a49849835acf68547ac33d8ceb707de5f52/mypy-1.14.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:30ff5ef8519bbc2e18b3b54521ec319513a26f1bba19a7582e7b1f58a6e69f14", size = 11266668, upload-time = "2024-12-30T16:38:02.211Z" }, - { url = "https://files.pythonhosted.org/packages/6b/75/2ed0d2964c1ffc9971c729f7a544e9cd34b2cdabbe2d11afd148d7838aa2/mypy-1.14.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:cb9f255c18052343c70234907e2e532bc7e55a62565d64536dbc7706a20b78b9", size = 10254060, upload-time = "2024-12-30T16:37:46.131Z" }, - { url = "https://files.pythonhosted.org/packages/a1/5f/7b8051552d4da3c51bbe8fcafffd76a6823779101a2b198d80886cd8f08e/mypy-1.14.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b4e3413e0bddea671012b063e27591b953d653209e7a4fa5e48759cda77ca11", size = 11933167, upload-time = "2024-12-30T16:37:43.534Z" }, - { url = "https://files.pythonhosted.org/packages/04/90/f53971d3ac39d8b68bbaab9a4c6c58c8caa4d5fd3d587d16f5927eeeabe1/mypy-1.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:553c293b1fbdebb6c3c4030589dab9fafb6dfa768995a453d8a5d3b23784af2e", size = 12864341, upload-time = "2024-12-30T16:37:36.249Z" }, - { url = "https://files.pythonhosted.org/packages/03/d2/8bc0aeaaf2e88c977db41583559319f1821c069e943ada2701e86d0430b7/mypy-1.14.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fad79bfe3b65fe6a1efaed97b445c3d37f7be9fdc348bdb2d7cac75579607c89", size = 12972991, upload-time = "2024-12-30T16:37:06.743Z" }, - { url = "https://files.pythonhosted.org/packages/6f/17/07815114b903b49b0f2cf7499f1c130e5aa459411596668267535fe9243c/mypy-1.14.1-cp312-cp312-win_amd64.whl", hash = "sha256:8fa2220e54d2946e94ab6dbb3ba0a992795bd68b16dc852db33028df2b00191b", size = 9879016, upload-time = "2024-12-30T16:37:15.02Z" }, - { url = "https://files.pythonhosted.org/packages/9e/15/bb6a686901f59222275ab228453de741185f9d54fecbaacec041679496c6/mypy-1.14.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:92c3ed5afb06c3a8e188cb5da4984cab9ec9a77ba956ee419c68a388b4595255", size = 11252097, upload-time = "2024-12-30T16:37:25.144Z" }, - { url = "https://files.pythonhosted.org/packages/f8/b3/8b0f74dfd072c802b7fa368829defdf3ee1566ba74c32a2cb2403f68024c/mypy-1.14.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:dbec574648b3e25f43d23577309b16534431db4ddc09fda50841f1e34e64ed34", size = 10239728, upload-time = "2024-12-30T16:38:08.634Z" }, - { url = "https://files.pythonhosted.org/packages/c5/9b/4fd95ab20c52bb5b8c03cc49169be5905d931de17edfe4d9d2986800b52e/mypy-1.14.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8c6d94b16d62eb3e947281aa7347d78236688e21081f11de976376cf010eb31a", size = 11924965, upload-time = "2024-12-30T16:38:12.132Z" }, - { url = "https://files.pythonhosted.org/packages/56/9d/4a236b9c57f5d8f08ed346914b3f091a62dd7e19336b2b2a0d85485f82ff/mypy-1.14.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d4b19b03fdf54f3c5b2fa474c56b4c13c9dbfb9a2db4370ede7ec11a2c5927d9", size = 12867660, upload-time = "2024-12-30T16:38:17.342Z" }, - { url = "https://files.pythonhosted.org/packages/40/88/a61a5497e2f68d9027de2bb139c7bb9abaeb1be1584649fa9d807f80a338/mypy-1.14.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:0c911fde686394753fff899c409fd4e16e9b294c24bfd5e1ea4675deae1ac6fd", size = 12969198, upload-time = "2024-12-30T16:38:32.839Z" }, - { url = "https://files.pythonhosted.org/packages/54/da/3d6fc5d92d324701b0c23fb413c853892bfe0e1dbe06c9138037d459756b/mypy-1.14.1-cp313-cp313-win_amd64.whl", hash = "sha256:8b21525cb51671219f5307be85f7e646a153e5acc656e5cebf64bfa076c50107", size = 9885276, upload-time = "2024-12-30T16:38:20.828Z" }, - { url = "https://files.pythonhosted.org/packages/39/02/1817328c1372be57c16148ce7d2bfcfa4a796bedaed897381b1aad9b267c/mypy-1.14.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7084fb8f1128c76cd9cf68fe5971b37072598e7c31b2f9f95586b65c741a9d31", size = 11143050, upload-time = "2024-12-30T16:38:29.743Z" }, - { url = "https://files.pythonhosted.org/packages/b9/07/99db9a95ece5e58eee1dd87ca456a7e7b5ced6798fd78182c59c35a7587b/mypy-1.14.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8f845a00b4f420f693f870eaee5f3e2692fa84cc8514496114649cfa8fd5e2c6", size = 10321087, upload-time = "2024-12-30T16:38:14.739Z" }, - { url = "https://files.pythonhosted.org/packages/9a/eb/85ea6086227b84bce79b3baf7f465b4732e0785830726ce4a51528173b71/mypy-1.14.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:44bf464499f0e3a2d14d58b54674dee25c031703b2ffc35064bd0df2e0fac319", size = 12066766, upload-time = "2024-12-30T16:38:47.038Z" }, - { url = "https://files.pythonhosted.org/packages/4b/bb/f01bebf76811475d66359c259eabe40766d2f8ac8b8250d4e224bb6df379/mypy-1.14.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c99f27732c0b7dc847adb21c9d47ce57eb48fa33a17bc6d7d5c5e9f9e7ae5bac", size = 12787111, upload-time = "2024-12-30T16:39:02.444Z" }, - { url = "https://files.pythonhosted.org/packages/2f/c9/84837ff891edcb6dcc3c27d85ea52aab0c4a34740ff5f0ccc0eb87c56139/mypy-1.14.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:bce23c7377b43602baa0bd22ea3265c49b9ff0b76eb315d6c34721af4cdf1d9b", size = 12974331, upload-time = "2024-12-30T16:38:23.849Z" }, - { url = "https://files.pythonhosted.org/packages/84/5f/901e18464e6a13f8949b4909535be3fa7f823291b8ab4e4b36cfe57d6769/mypy-1.14.1-cp38-cp38-win_amd64.whl", hash = "sha256:8edc07eeade7ebc771ff9cf6b211b9a7d93687ff892150cb5692e4f4272b0837", size = 9763210, upload-time = "2024-12-30T16:38:36.299Z" }, - { url = "https://files.pythonhosted.org/packages/ca/1f/186d133ae2514633f8558e78cd658070ba686c0e9275c5a5c24a1e1f0d67/mypy-1.14.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3888a1816d69f7ab92092f785a462944b3ca16d7c470d564165fe703b0970c35", size = 11200493, upload-time = "2024-12-30T16:38:26.935Z" }, - { url = "https://files.pythonhosted.org/packages/af/fc/4842485d034e38a4646cccd1369f6b1ccd7bc86989c52770d75d719a9941/mypy-1.14.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:46c756a444117c43ee984bd055db99e498bc613a70bbbc120272bd13ca579fbc", size = 10357702, upload-time = "2024-12-30T16:38:50.623Z" }, - { url = "https://files.pythonhosted.org/packages/b4/e6/457b83f2d701e23869cfec013a48a12638f75b9d37612a9ddf99072c1051/mypy-1.14.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:27fc248022907e72abfd8e22ab1f10e903915ff69961174784a3900a8cba9ad9", size = 12091104, upload-time = "2024-12-30T16:38:53.735Z" }, - { url = "https://files.pythonhosted.org/packages/f1/bf/76a569158db678fee59f4fd30b8e7a0d75bcbaeef49edd882a0d63af6d66/mypy-1.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", 
hash = "sha256:499d6a72fb7e5de92218db961f1a66d5f11783f9ae549d214617edab5d4dbdbb", size = 12830167, upload-time = "2024-12-30T16:38:56.437Z" }, - { url = "https://files.pythonhosted.org/packages/43/bc/0bc6b694b3103de9fed61867f1c8bd33336b913d16831431e7cb48ef1c92/mypy-1.14.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:57961db9795eb566dc1d1b4e9139ebc4c6b0cb6e7254ecde69d1552bf7613f60", size = 13013834, upload-time = "2024-12-30T16:38:59.204Z" }, - { url = "https://files.pythonhosted.org/packages/b0/79/5f5ec47849b6df1e6943d5fd8e6632fbfc04b4fd4acfa5a5a9535d11b4e2/mypy-1.14.1-cp39-cp39-win_amd64.whl", hash = "sha256:07ba89fdcc9451f2ebb02853deb6aaaa3d2239a236669a63ab3801bbf923ef5c", size = 9781231, upload-time = "2024-12-30T16:39:05.124Z" }, - { url = "https://files.pythonhosted.org/packages/a0/b5/32dd67b69a16d088e533962e5044e51004176a9952419de0370cdaead0f8/mypy-1.14.1-py3-none-any.whl", hash = "sha256:b66a60cc4073aeb8ae00057f9c1f64d49e90f918fbcef9a977eb121da8b8f1d1", size = 2752905, upload-time = "2024-12-30T16:38:42.021Z" }, -] - -[[package]] -name = "mypy" -version = "1.17.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] -dependencies = [ - { name = "mypy-extensions", marker = "python_full_version >= '3.9'" }, - { name = "pathspec", marker = "python_full_version >= '3.9'" }, - { name = "tomli", marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, - { name = "typing-extensions", version = "4.14.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/8e/22/ea637422dedf0bf36f3ef238eab4e455e2a0dcc3082b5cc067615347ab8e/mypy-1.17.1.tar.gz", hash = "sha256:25e01ec741ab5bb3eec8ba9cdb0f769230368a22c959c4937360efb89b7e9f01", size = 3352570, upload-time = "2025-07-31T07:54:19.204Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/77/a9/3d7aa83955617cdf02f94e50aab5c830d205cfa4320cf124ff64acce3a8e/mypy-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3fbe6d5555bf608c47203baa3e72dbc6ec9965b3d7c318aa9a4ca76f465bd972", size = 11003299, upload-time = "2025-07-31T07:54:06.425Z" }, - { url = "https://files.pythonhosted.org/packages/83/e8/72e62ff837dd5caaac2b4a5c07ce769c8e808a00a65e5d8f94ea9c6f20ab/mypy-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:80ef5c058b7bce08c83cac668158cb7edea692e458d21098c7d3bce35a5d43e7", size = 10125451, upload-time = "2025-07-31T07:53:52.974Z" }, - { url = "https://files.pythonhosted.org/packages/7d/10/f3f3543f6448db11881776f26a0ed079865926b0c841818ee22de2c6bbab/mypy-1.17.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c4a580f8a70c69e4a75587bd925d298434057fe2a428faaf927ffe6e4b9a98df", size = 11916211, upload-time = "2025-07-31T07:53:18.879Z" }, - { url = "https://files.pythonhosted.org/packages/06/bf/63e83ed551282d67bb3f7fea2cd5561b08d2bb6eb287c096539feb5ddbc5/mypy-1.17.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dd86bb649299f09d987a2eebb4d52d10603224500792e1bee18303bbcc1ce390", size = 12652687, upload-time = "2025-07-31T07:53:30.544Z" }, - { url = "https://files.pythonhosted.org/packages/69/66/68f2eeef11facf597143e85b694a161868b3b006a5fbad50e09ea117ef24/mypy-1.17.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = 
"sha256:a76906f26bd8d51ea9504966a9c25419f2e668f012e0bdf3da4ea1526c534d94", size = 12896322, upload-time = "2025-07-31T07:53:50.74Z" }, - { url = "https://files.pythonhosted.org/packages/a3/87/8e3e9c2c8bd0d7e071a89c71be28ad088aaecbadf0454f46a540bda7bca6/mypy-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:e79311f2d904ccb59787477b7bd5d26f3347789c06fcd7656fa500875290264b", size = 9507962, upload-time = "2025-07-31T07:53:08.431Z" }, - { url = "https://files.pythonhosted.org/packages/46/cf/eadc80c4e0a70db1c08921dcc220357ba8ab2faecb4392e3cebeb10edbfa/mypy-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ad37544be07c5d7fba814eb370e006df58fed8ad1ef33ed1649cb1889ba6ff58", size = 10921009, upload-time = "2025-07-31T07:53:23.037Z" }, - { url = "https://files.pythonhosted.org/packages/5d/c1/c869d8c067829ad30d9bdae051046561552516cfb3a14f7f0347b7d973ee/mypy-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:064e2ff508e5464b4bd807a7c1625bc5047c5022b85c70f030680e18f37273a5", size = 10047482, upload-time = "2025-07-31T07:53:26.151Z" }, - { url = "https://files.pythonhosted.org/packages/98/b9/803672bab3fe03cee2e14786ca056efda4bb511ea02dadcedde6176d06d0/mypy-1.17.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:70401bbabd2fa1aa7c43bb358f54037baf0586f41e83b0ae67dd0534fc64edfd", size = 11832883, upload-time = "2025-07-31T07:53:47.948Z" }, - { url = "https://files.pythonhosted.org/packages/88/fb/fcdac695beca66800918c18697b48833a9a6701de288452b6715a98cfee1/mypy-1.17.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e92bdc656b7757c438660f775f872a669b8ff374edc4d18277d86b63edba6b8b", size = 12566215, upload-time = "2025-07-31T07:54:04.031Z" }, - { url = "https://files.pythonhosted.org/packages/7f/37/a932da3d3dace99ee8eb2043b6ab03b6768c36eb29a02f98f46c18c0da0e/mypy-1.17.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c1fdf4abb29ed1cb091cf432979e162c208a5ac676ce35010373ff29247bcad5", size = 12751956, upload-time = "2025-07-31T07:53:36.263Z" }, - { url = "https://files.pythonhosted.org/packages/8c/cf/6438a429e0f2f5cab8bc83e53dbebfa666476f40ee322e13cac5e64b79e7/mypy-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:ff2933428516ab63f961644bc49bc4cbe42bbffb2cd3b71cc7277c07d16b1a8b", size = 9507307, upload-time = "2025-07-31T07:53:59.734Z" }, - { url = "https://files.pythonhosted.org/packages/17/a2/7034d0d61af8098ec47902108553122baa0f438df8a713be860f7407c9e6/mypy-1.17.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:69e83ea6553a3ba79c08c6e15dbd9bfa912ec1e493bf75489ef93beb65209aeb", size = 11086295, upload-time = "2025-07-31T07:53:28.124Z" }, - { url = "https://files.pythonhosted.org/packages/14/1f/19e7e44b594d4b12f6ba8064dbe136505cec813549ca3e5191e40b1d3cc2/mypy-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1b16708a66d38abb1e6b5702f5c2c87e133289da36f6a1d15f6a5221085c6403", size = 10112355, upload-time = "2025-07-31T07:53:21.121Z" }, - { url = "https://files.pythonhosted.org/packages/5b/69/baa33927e29e6b4c55d798a9d44db5d394072eef2bdc18c3e2048c9ed1e9/mypy-1.17.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:89e972c0035e9e05823907ad5398c5a73b9f47a002b22359b177d40bdaee7056", size = 11875285, upload-time = "2025-07-31T07:53:55.293Z" }, - { url = 
"https://files.pythonhosted.org/packages/90/13/f3a89c76b0a41e19490b01e7069713a30949d9a6c147289ee1521bcea245/mypy-1.17.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:03b6d0ed2b188e35ee6d5c36b5580cffd6da23319991c49ab5556c023ccf1341", size = 12737895, upload-time = "2025-07-31T07:53:43.623Z" }, - { url = "https://files.pythonhosted.org/packages/23/a1/c4ee79ac484241301564072e6476c5a5be2590bc2e7bfd28220033d2ef8f/mypy-1.17.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c837b896b37cd103570d776bda106eabb8737aa6dd4f248451aecf53030cdbeb", size = 12931025, upload-time = "2025-07-31T07:54:17.125Z" }, - { url = "https://files.pythonhosted.org/packages/89/b8/7409477be7919a0608900e6320b155c72caab4fef46427c5cc75f85edadd/mypy-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:665afab0963a4b39dff7c1fa563cc8b11ecff7910206db4b2e64dd1ba25aed19", size = 9584664, upload-time = "2025-07-31T07:54:12.842Z" }, - { url = "https://files.pythonhosted.org/packages/5b/82/aec2fc9b9b149f372850291827537a508d6c4d3664b1750a324b91f71355/mypy-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:93378d3203a5c0800c6b6d850ad2f19f7a3cdf1a3701d3416dbf128805c6a6a7", size = 11075338, upload-time = "2025-07-31T07:53:38.873Z" }, - { url = "https://files.pythonhosted.org/packages/07/ac/ee93fbde9d2242657128af8c86f5d917cd2887584cf948a8e3663d0cd737/mypy-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:15d54056f7fe7a826d897789f53dd6377ec2ea8ba6f776dc83c2902b899fee81", size = 10113066, upload-time = "2025-07-31T07:54:14.707Z" }, - { url = "https://files.pythonhosted.org/packages/5a/68/946a1e0be93f17f7caa56c45844ec691ca153ee8b62f21eddda336a2d203/mypy-1.17.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:209a58fed9987eccc20f2ca94afe7257a8f46eb5df1fb69958650973230f91e6", size = 11875473, upload-time = "2025-07-31T07:53:14.504Z" }, - { url = "https://files.pythonhosted.org/packages/9f/0f/478b4dce1cb4f43cf0f0d00fba3030b21ca04a01b74d1cd272a528cf446f/mypy-1.17.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:099b9a5da47de9e2cb5165e581f158e854d9e19d2e96b6698c0d64de911dd849", size = 12744296, upload-time = "2025-07-31T07:53:03.896Z" }, - { url = "https://files.pythonhosted.org/packages/ca/70/afa5850176379d1b303f992a828de95fc14487429a7139a4e0bdd17a8279/mypy-1.17.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fa6ffadfbe6994d724c5a1bb6123a7d27dd68fc9c059561cd33b664a79578e14", size = 12914657, upload-time = "2025-07-31T07:54:08.576Z" }, - { url = "https://files.pythonhosted.org/packages/53/f9/4a83e1c856a3d9c8f6edaa4749a4864ee98486e9b9dbfbc93842891029c2/mypy-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:9a2b7d9180aed171f033c9f2fc6c204c1245cf60b0cb61cf2e7acc24eea78e0a", size = 9593320, upload-time = "2025-07-31T07:53:01.341Z" }, - { url = "https://files.pythonhosted.org/packages/38/56/79c2fac86da57c7d8c48622a05873eaab40b905096c33597462713f5af90/mypy-1.17.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:15a83369400454c41ed3a118e0cc58bd8123921a602f385cb6d6ea5df050c733", size = 11040037, upload-time = "2025-07-31T07:54:10.942Z" }, - { url = "https://files.pythonhosted.org/packages/4d/c3/adabe6ff53638e3cad19e3547268482408323b1e68bf082c9119000cd049/mypy-1.17.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:55b918670f692fc9fba55c3298d8a3beae295c5cded0a55dccdc5bbead814acd", size = 10131550, upload-time = "2025-07-31T07:53:41.307Z" }, - { url = 
"https://files.pythonhosted.org/packages/b8/c5/2e234c22c3bdeb23a7817af57a58865a39753bde52c74e2c661ee0cfc640/mypy-1.17.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:62761474061feef6f720149d7ba876122007ddc64adff5ba6f374fda35a018a0", size = 11872963, upload-time = "2025-07-31T07:53:16.878Z" }, - { url = "https://files.pythonhosted.org/packages/ab/26/c13c130f35ca8caa5f2ceab68a247775648fdcd6c9a18f158825f2bc2410/mypy-1.17.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c49562d3d908fd49ed0938e5423daed8d407774a479b595b143a3d7f87cdae6a", size = 12710189, upload-time = "2025-07-31T07:54:01.962Z" }, - { url = "https://files.pythonhosted.org/packages/82/df/c7d79d09f6de8383fe800521d066d877e54d30b4fb94281c262be2df84ef/mypy-1.17.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:397fba5d7616a5bc60b45c7ed204717eaddc38f826e3645402c426057ead9a91", size = 12900322, upload-time = "2025-07-31T07:53:10.551Z" }, - { url = "https://files.pythonhosted.org/packages/b8/98/3d5a48978b4f708c55ae832619addc66d677f6dc59f3ebad71bae8285ca6/mypy-1.17.1-cp314-cp314-win_amd64.whl", hash = "sha256:9d6b20b97d373f41617bd0708fd46aa656059af57f2ef72aa8c7d6a2b73b74ed", size = 9751879, upload-time = "2025-07-31T07:52:56.683Z" }, - { url = "https://files.pythonhosted.org/packages/29/cb/673e3d34e5d8de60b3a61f44f80150a738bff568cd6b7efb55742a605e98/mypy-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5d1092694f166a7e56c805caaf794e0585cabdbf1df36911c414e4e9abb62ae9", size = 10992466, upload-time = "2025-07-31T07:53:57.574Z" }, - { url = "https://files.pythonhosted.org/packages/0c/d0/fe1895836eea3a33ab801561987a10569df92f2d3d4715abf2cfeaa29cb2/mypy-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:79d44f9bfb004941ebb0abe8eff6504223a9c1ac51ef967d1263c6572bbebc99", size = 10117638, upload-time = "2025-07-31T07:53:34.256Z" }, - { url = "https://files.pythonhosted.org/packages/97/f3/514aa5532303aafb95b9ca400a31054a2bd9489de166558c2baaeea9c522/mypy-1.17.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b01586eed696ec905e61bd2568f48740f7ac4a45b3a468e6423a03d3788a51a8", size = 11915673, upload-time = "2025-07-31T07:52:59.361Z" }, - { url = "https://files.pythonhosted.org/packages/ab/c3/c0805f0edec96fe8e2c048b03769a6291523d509be8ee7f56ae922fa3882/mypy-1.17.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:43808d9476c36b927fbcd0b0255ce75efe1b68a080154a38ae68a7e62de8f0f8", size = 12649022, upload-time = "2025-07-31T07:53:45.92Z" }, - { url = "https://files.pythonhosted.org/packages/45/3e/d646b5a298ada21a8512fa7e5531f664535a495efa672601702398cea2b4/mypy-1.17.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:feb8cc32d319edd5859da2cc084493b3e2ce5e49a946377663cc90f6c15fb259", size = 12895536, upload-time = "2025-07-31T07:53:06.17Z" }, - { url = "https://files.pythonhosted.org/packages/14/55/e13d0dcd276975927d1f4e9e2ec4fd409e199f01bdc671717e673cc63a22/mypy-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d7598cf74c3e16539d4e2f0b8d8c318e00041553d83d4861f87c7a72e95ac24d", size = 9512564, upload-time = "2025-07-31T07:53:12.346Z" }, - { url = "https://files.pythonhosted.org/packages/1d/f3/8fcd2af0f5b806f6cf463efaffd3c9548a28f84220493ecd38d127b6b66d/mypy-1.17.1-py3-none-any.whl", hash = "sha256:a9f52c0351c21fe24c21d8c0eb1f62967b262d6729393397b6f443c3b773c3b9", size = 2283411, upload-time = "2025-07-31T07:53:24.664Z" }, -] - 
-[[package]] -name = "mypy-extensions" -version = "1.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a2/6e/371856a3fb9d31ca8dac321cda606860fa4548858c0cc45d9d1d4ca2628b/mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558", size = 6343, upload-time = "2025-04-22T14:54:24.164Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963, upload-time = "2025-04-22T14:54:22.983Z" }, -] - -[[package]] -name = "numpy" -version = "1.24.4" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/a4/9b/027bec52c633f6556dba6b722d9a0befb40498b9ceddd29cbe67a45a127c/numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463", size = 10911229, upload-time = "2023-06-26T13:39:33.218Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6b/80/6cdfb3e275d95155a34659163b83c09e3a3ff9f1456880bec6cc63d71083/numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64", size = 19789140, upload-time = "2023-06-26T13:22:33.184Z" }, - { url = "https://files.pythonhosted.org/packages/64/5f/3f01d753e2175cfade1013eea08db99ba1ee4bdb147ebcf3623b75d12aa7/numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1", size = 13854297, upload-time = "2023-06-26T13:22:59.541Z" }, - { url = "https://files.pythonhosted.org/packages/5a/b3/2f9c21d799fa07053ffa151faccdceeb69beec5a010576b8991f614021f7/numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4", size = 13995611, upload-time = "2023-06-26T13:23:22.167Z" }, - { url = "https://files.pythonhosted.org/packages/10/be/ae5bf4737cb79ba437879915791f6f26d92583c738d7d960ad94e5c36adf/numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6", size = 17282357, upload-time = "2023-06-26T13:23:51.446Z" }, - { url = "https://files.pythonhosted.org/packages/c0/64/908c1087be6285f40e4b3e79454552a701664a079321cff519d8c7051d06/numpy-1.24.4-cp310-cp310-win32.whl", hash = "sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc", size = 12429222, upload-time = "2023-06-26T13:24:13.849Z" }, - { url = "https://files.pythonhosted.org/packages/22/55/3d5a7c1142e0d9329ad27cece17933b0e2ab4e54ddc5c1861fbfeb3f7693/numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e", size = 14841514, upload-time = "2023-06-26T13:24:38.129Z" }, - { url = "https://files.pythonhosted.org/packages/a9/cc/5ed2280a27e5dab12994c884f1f4d8c3bd4d885d02ae9e52a9d213a6a5e2/numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810", size = 19775508, upload-time = "2023-06-26T13:25:08.882Z" }, - { url = "https://files.pythonhosted.org/packages/c0/bc/77635c657a3668cf652806210b8662e1aff84b818a55ba88257abf6637a8/numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash 
= "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254", size = 13840033, upload-time = "2023-06-26T13:25:33.417Z" }, - { url = "https://files.pythonhosted.org/packages/a7/4c/96cdaa34f54c05e97c1c50f39f98d608f96f0677a6589e64e53104e22904/numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7", size = 13991951, upload-time = "2023-06-26T13:25:55.725Z" }, - { url = "https://files.pythonhosted.org/packages/22/97/dfb1a31bb46686f09e68ea6ac5c63fdee0d22d7b23b8f3f7ea07712869ef/numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5", size = 17278923, upload-time = "2023-06-26T13:26:25.658Z" }, - { url = "https://files.pythonhosted.org/packages/35/e2/76a11e54139654a324d107da1d98f99e7aa2a7ef97cfd7c631fba7dbde71/numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d", size = 12422446, upload-time = "2023-06-26T13:26:49.302Z" }, - { url = "https://files.pythonhosted.org/packages/d8/ec/ebef2f7d7c28503f958f0f8b992e7ce606fb74f9e891199329d5f5f87404/numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694", size = 14834466, upload-time = "2023-06-26T13:27:16.029Z" }, - { url = "https://files.pythonhosted.org/packages/11/10/943cfb579f1a02909ff96464c69893b1d25be3731b5d3652c2e0cf1281ea/numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61", size = 19780722, upload-time = "2023-06-26T13:27:49.573Z" }, - { url = "https://files.pythonhosted.org/packages/a7/ae/f53b7b265fdc701e663fbb322a8e9d4b14d9cb7b2385f45ddfabfc4327e4/numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f", size = 13843102, upload-time = "2023-06-26T13:28:12.288Z" }, - { url = "https://files.pythonhosted.org/packages/25/6f/2586a50ad72e8dbb1d8381f837008a0321a3516dfd7cb57fc8cf7e4bb06b/numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e", size = 14039616, upload-time = "2023-06-26T13:28:35.659Z" }, - { url = "https://files.pythonhosted.org/packages/98/5d/5738903efe0ecb73e51eb44feafba32bdba2081263d40c5043568ff60faf/numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc", size = 17316263, upload-time = "2023-06-26T13:29:09.272Z" }, - { url = "https://files.pythonhosted.org/packages/d1/57/8d328f0b91c733aa9aa7ee540dbc49b58796c862b4fbcb1146c701e888da/numpy-1.24.4-cp38-cp38-win32.whl", hash = "sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2", size = 12455660, upload-time = "2023-06-26T13:29:33.434Z" }, - { url = "https://files.pythonhosted.org/packages/69/65/0d47953afa0ad569d12de5f65d964321c208492064c38fe3b0b9744f8d44/numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706", size = 14868112, upload-time = "2023-06-26T13:29:58.385Z" }, - { url = "https://files.pythonhosted.org/packages/9a/cd/d5b0402b801c8a8b56b04c1e85c6165efab298d2f0ab741c2406516ede3a/numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400", size = 
19816549, upload-time = "2023-06-26T13:30:36.976Z" }, - { url = "https://files.pythonhosted.org/packages/14/27/638aaa446f39113a3ed38b37a66243e21b38110d021bfcb940c383e120f2/numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f", size = 13879950, upload-time = "2023-06-26T13:31:01.787Z" }, - { url = "https://files.pythonhosted.org/packages/8f/27/91894916e50627476cff1a4e4363ab6179d01077d71b9afed41d9e1f18bf/numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9", size = 14030228, upload-time = "2023-06-26T13:31:26.696Z" }, - { url = "https://files.pythonhosted.org/packages/7a/7c/d7b2a0417af6428440c0ad7cb9799073e507b1a465f827d058b826236964/numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d", size = 17311170, upload-time = "2023-06-26T13:31:56.615Z" }, - { url = "https://files.pythonhosted.org/packages/18/9d/e02ace5d7dfccee796c37b995c63322674daf88ae2f4a4724c5dd0afcc91/numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835", size = 12454918, upload-time = "2023-06-26T13:32:16.8Z" }, - { url = "https://files.pythonhosted.org/packages/63/38/6cc19d6b8bfa1d1a459daf2b3fe325453153ca7019976274b6f33d8b5663/numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8", size = 14867441, upload-time = "2023-06-26T13:32:40.521Z" }, - { url = "https://files.pythonhosted.org/packages/a4/fd/8dff40e25e937c94257455c237b9b6bf5a30d42dd1cc11555533be099492/numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef", size = 19156590, upload-time = "2023-06-26T13:33:10.36Z" }, - { url = "https://files.pythonhosted.org/packages/42/e7/4bf953c6e05df90c6d351af69966384fed8e988d0e8c54dad7103b59f3ba/numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a", size = 16705744, upload-time = "2023-06-26T13:33:36.703Z" }, - { url = "https://files.pythonhosted.org/packages/fc/dd/9106005eb477d022b60b3817ed5937a43dad8fd1f20b0610ea8a32fcb407/numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2", size = 14734290, upload-time = "2023-06-26T13:34:05.409Z" }, -] - -[[package]] -name = "numpy" -version = "2.0.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version == '3.9.*'", -] -sdist = { url = "https://files.pythonhosted.org/packages/a9/75/10dd1f8116a8b796cb2c737b674e02d02e80454bda953fa7e65d8c12b016/numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78", size = 18902015, upload-time = "2024-08-26T20:19:40.945Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/21/91/3495b3237510f79f5d81f2508f9f13fea78ebfdf07538fc7444badda173d/numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece", size = 21165245, upload-time = "2024-08-26T20:04:14.625Z" }, - { url = "https://files.pythonhosted.org/packages/05/33/26178c7d437a87082d11019292dce6d3fe6f0e9026b7b2309cbf3e489b1d/numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04", size = 13738540, upload-time = "2024-08-26T20:04:36.784Z" }, - { url = "https://files.pythonhosted.org/packages/ec/31/cc46e13bf07644efc7a4bf68df2df5fb2a1a88d0cd0da9ddc84dc0033e51/numpy-2.0.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8c5713284ce4e282544c68d1c3b2c7161d38c256d2eefc93c1d683cf47683e66", size = 5300623, upload-time = "2024-08-26T20:04:46.491Z" }, - { url = "https://files.pythonhosted.org/packages/6e/16/7bfcebf27bb4f9d7ec67332ffebee4d1bf085c84246552d52dbb548600e7/numpy-2.0.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:becfae3ddd30736fe1889a37f1f580e245ba79a5855bff5f2a29cb3ccc22dd7b", size = 6901774, upload-time = "2024-08-26T20:04:58.173Z" }, - { url = "https://files.pythonhosted.org/packages/f9/a3/561c531c0e8bf082c5bef509d00d56f82e0ea7e1e3e3a7fc8fa78742a6e5/numpy-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2da5960c3cf0df7eafefd806d4e612c5e19358de82cb3c343631188991566ccd", size = 13907081, upload-time = "2024-08-26T20:05:19.098Z" }, - { url = "https://files.pythonhosted.org/packages/fa/66/f7177ab331876200ac7563a580140643d1179c8b4b6a6b0fc9838de2a9b8/numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:496f71341824ed9f3d2fd36cf3ac57ae2e0165c143b55c3a035ee219413f3318", size = 19523451, upload-time = "2024-08-26T20:05:47.479Z" }, - { url = "https://files.pythonhosted.org/packages/25/7f/0b209498009ad6453e4efc2c65bcdf0ae08a182b2b7877d7ab38a92dc542/numpy-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a61ec659f68ae254e4d237816e33171497e978140353c0c2038d46e63282d0c8", size = 19927572, upload-time = "2024-08-26T20:06:17.137Z" }, - { url = "https://files.pythonhosted.org/packages/3e/df/2619393b1e1b565cd2d4c4403bdd979621e2c4dea1f8532754b2598ed63b/numpy-2.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d731a1c6116ba289c1e9ee714b08a8ff882944d4ad631fd411106a30f083c326", size = 14400722, upload-time = "2024-08-26T20:06:39.16Z" }, - { url = "https://files.pythonhosted.org/packages/22/ad/77e921b9f256d5da36424ffb711ae79ca3f451ff8489eeca544d0701d74a/numpy-2.0.2-cp310-cp310-win32.whl", hash = "sha256:984d96121c9f9616cd33fbd0618b7f08e0cfc9600a7ee1d6fd9b239186d19d97", size = 6472170, upload-time = "2024-08-26T20:06:50.361Z" }, - { url = "https://files.pythonhosted.org/packages/10/05/3442317535028bc29cf0c0dd4c191a4481e8376e9f0db6bcf29703cadae6/numpy-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:c7b0be4ef08607dd04da4092faee0b86607f111d5ae68036f16cc787e250a131", size = 15905558, upload-time = "2024-08-26T20:07:13.881Z" }, - { url = "https://files.pythonhosted.org/packages/8b/cf/034500fb83041aa0286e0fb16e7c76e5c8b67c0711bb6e9e9737a717d5fe/numpy-2.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:49ca4decb342d66018b01932139c0961a8f9ddc7589611158cb3c27cbcf76448", size = 21169137, upload-time = "2024-08-26T20:07:45.345Z" }, - { url = "https://files.pythonhosted.org/packages/4a/d9/32de45561811a4b87fbdee23b5797394e3d1504b4a7cf40c10199848893e/numpy-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:11a76c372d1d37437857280aa142086476136a8c0f373b2e648ab2c8f18fb195", size = 13703552, upload-time = "2024-08-26T20:08:06.666Z" }, - { url = "https://files.pythonhosted.org/packages/c1/ca/2f384720020c7b244d22508cb7ab23d95f179fcfff33c31a6eeba8d6c512/numpy-2.0.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:807ec44583fd708a21d4a11d94aedf2f4f3c3719035c76a2bbe1fe8e217bdc57", size = 5298957, upload-time = 
"2024-08-26T20:08:15.83Z" }, - { url = "https://files.pythonhosted.org/packages/0e/78/a3e4f9fb6aa4e6fdca0c5428e8ba039408514388cf62d89651aade838269/numpy-2.0.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8cafab480740e22f8d833acefed5cc87ce276f4ece12fdaa2e8903db2f82897a", size = 6905573, upload-time = "2024-08-26T20:08:27.185Z" }, - { url = "https://files.pythonhosted.org/packages/a0/72/cfc3a1beb2caf4efc9d0b38a15fe34025230da27e1c08cc2eb9bfb1c7231/numpy-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15f476a45e6e5a3a79d8a14e62161d27ad897381fecfa4a09ed5322f2085669", size = 13914330, upload-time = "2024-08-26T20:08:48.058Z" }, - { url = "https://files.pythonhosted.org/packages/ba/a8/c17acf65a931ce551fee11b72e8de63bf7e8a6f0e21add4c937c83563538/numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13e689d772146140a252c3a28501da66dfecd77490b498b168b501835041f951", size = 19534895, upload-time = "2024-08-26T20:09:16.536Z" }, - { url = "https://files.pythonhosted.org/packages/ba/86/8767f3d54f6ae0165749f84648da9dcc8cd78ab65d415494962c86fac80f/numpy-2.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9ea91dfb7c3d1c56a0e55657c0afb38cf1eeae4544c208dc465c3c9f3a7c09f9", size = 19937253, upload-time = "2024-08-26T20:09:46.263Z" }, - { url = "https://files.pythonhosted.org/packages/df/87/f76450e6e1c14e5bb1eae6836478b1028e096fd02e85c1c37674606ab752/numpy-2.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c1c9307701fec8f3f7a1e6711f9089c06e6284b3afbbcd259f7791282d660a15", size = 14414074, upload-time = "2024-08-26T20:10:08.483Z" }, - { url = "https://files.pythonhosted.org/packages/5c/ca/0f0f328e1e59f73754f06e1adfb909de43726d4f24c6a3f8805f34f2b0fa/numpy-2.0.2-cp311-cp311-win32.whl", hash = "sha256:a392a68bd329eafac5817e5aefeb39038c48b671afd242710b451e76090e81f4", size = 6470640, upload-time = "2024-08-26T20:10:19.732Z" }, - { url = "https://files.pythonhosted.org/packages/eb/57/3a3f14d3a759dcf9bf6e9eda905794726b758819df4663f217d658a58695/numpy-2.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:286cd40ce2b7d652a6f22efdfc6d1edf879440e53e76a75955bc0c826c7e64dc", size = 15910230, upload-time = "2024-08-26T20:10:43.413Z" }, - { url = "https://files.pythonhosted.org/packages/45/40/2e117be60ec50d98fa08c2f8c48e09b3edea93cfcabd5a9ff6925d54b1c2/numpy-2.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:df55d490dea7934f330006d0f81e8551ba6010a5bf035a249ef61a94f21c500b", size = 20895803, upload-time = "2024-08-26T20:11:13.916Z" }, - { url = "https://files.pythonhosted.org/packages/46/92/1b8b8dee833f53cef3e0a3f69b2374467789e0bb7399689582314df02651/numpy-2.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8df823f570d9adf0978347d1f926b2a867d5608f434a7cff7f7908c6570dcf5e", size = 13471835, upload-time = "2024-08-26T20:11:34.779Z" }, - { url = "https://files.pythonhosted.org/packages/7f/19/e2793bde475f1edaea6945be141aef6c8b4c669b90c90a300a8954d08f0a/numpy-2.0.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9a92ae5c14811e390f3767053ff54eaee3bf84576d99a2456391401323f4ec2c", size = 5038499, upload-time = "2024-08-26T20:11:43.902Z" }, - { url = "https://files.pythonhosted.org/packages/e3/ff/ddf6dac2ff0dd50a7327bcdba45cb0264d0e96bb44d33324853f781a8f3c/numpy-2.0.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:a842d573724391493a97a62ebbb8e731f8a5dcc5d285dfc99141ca15a3302d0c", size = 6633497, upload-time = "2024-08-26T20:11:55.09Z" }, - { url = 
"https://files.pythonhosted.org/packages/72/21/67f36eac8e2d2cd652a2e69595a54128297cdcb1ff3931cfc87838874bd4/numpy-2.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05e238064fc0610c840d1cf6a13bf63d7e391717d247f1bf0318172e759e692", size = 13621158, upload-time = "2024-08-26T20:12:14.95Z" }, - { url = "https://files.pythonhosted.org/packages/39/68/e9f1126d757653496dbc096cb429014347a36b228f5a991dae2c6b6cfd40/numpy-2.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0123ffdaa88fa4ab64835dcbde75dcdf89c453c922f18dced6e27c90d1d0ec5a", size = 19236173, upload-time = "2024-08-26T20:12:44.049Z" }, - { url = "https://files.pythonhosted.org/packages/d1/e9/1f5333281e4ebf483ba1c888b1d61ba7e78d7e910fdd8e6499667041cc35/numpy-2.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:96a55f64139912d61de9137f11bf39a55ec8faec288c75a54f93dfd39f7eb40c", size = 19634174, upload-time = "2024-08-26T20:13:13.634Z" }, - { url = "https://files.pythonhosted.org/packages/71/af/a469674070c8d8408384e3012e064299f7a2de540738a8e414dcfd639996/numpy-2.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec9852fb39354b5a45a80bdab5ac02dd02b15f44b3804e9f00c556bf24b4bded", size = 14099701, upload-time = "2024-08-26T20:13:34.851Z" }, - { url = "https://files.pythonhosted.org/packages/d0/3d/08ea9f239d0e0e939b6ca52ad403c84a2bce1bde301a8eb4888c1c1543f1/numpy-2.0.2-cp312-cp312-win32.whl", hash = "sha256:671bec6496f83202ed2d3c8fdc486a8fc86942f2e69ff0e986140339a63bcbe5", size = 6174313, upload-time = "2024-08-26T20:13:45.653Z" }, - { url = "https://files.pythonhosted.org/packages/b2/b5/4ac39baebf1fdb2e72585c8352c56d063b6126be9fc95bd2bb5ef5770c20/numpy-2.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:cfd41e13fdc257aa5778496b8caa5e856dc4896d4ccf01841daee1d96465467a", size = 15606179, upload-time = "2024-08-26T20:14:08.786Z" }, - { url = "https://files.pythonhosted.org/packages/43/c1/41c8f6df3162b0c6ffd4437d729115704bd43363de0090c7f913cfbc2d89/numpy-2.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9059e10581ce4093f735ed23f3b9d283b9d517ff46009ddd485f1747eb22653c", size = 21169942, upload-time = "2024-08-26T20:14:40.108Z" }, - { url = "https://files.pythonhosted.org/packages/39/bc/fd298f308dcd232b56a4031fd6ddf11c43f9917fbc937e53762f7b5a3bb1/numpy-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:423e89b23490805d2a5a96fe40ec507407b8ee786d66f7328be214f9679df6dd", size = 13711512, upload-time = "2024-08-26T20:15:00.985Z" }, - { url = "https://files.pythonhosted.org/packages/96/ff/06d1aa3eeb1c614eda245c1ba4fb88c483bee6520d361641331872ac4b82/numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2b2955fa6f11907cf7a70dab0d0755159bca87755e831e47932367fc8f2f2d0b", size = 5306976, upload-time = "2024-08-26T20:15:10.876Z" }, - { url = "https://files.pythonhosted.org/packages/2d/98/121996dcfb10a6087a05e54453e28e58694a7db62c5a5a29cee14c6e047b/numpy-2.0.2-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:97032a27bd9d8988b9a97a8c4d2c9f2c15a81f61e2f21404d7e8ef00cb5be729", size = 6906494, upload-time = "2024-08-26T20:15:22.055Z" }, - { url = "https://files.pythonhosted.org/packages/15/31/9dffc70da6b9bbf7968f6551967fc21156207366272c2a40b4ed6008dc9b/numpy-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e795a8be3ddbac43274f18588329c72939870a16cae810c2b73461c40718ab1", size = 13912596, upload-time = "2024-08-26T20:15:42.452Z" }, - { url = 
"https://files.pythonhosted.org/packages/b9/14/78635daab4b07c0930c919d451b8bf8c164774e6a3413aed04a6d95758ce/numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b258c385842546006213344c50655ff1555a9338e2e5e02a0756dc3e803dd", size = 19526099, upload-time = "2024-08-26T20:16:11.048Z" }, - { url = "https://files.pythonhosted.org/packages/26/4c/0eeca4614003077f68bfe7aac8b7496f04221865b3a5e7cb230c9d055afd/numpy-2.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5fec9451a7789926bcf7c2b8d187292c9f93ea30284802a0ab3f5be8ab36865d", size = 19932823, upload-time = "2024-08-26T20:16:40.171Z" }, - { url = "https://files.pythonhosted.org/packages/f1/46/ea25b98b13dccaebddf1a803f8c748680d972e00507cd9bc6dcdb5aa2ac1/numpy-2.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9189427407d88ff25ecf8f12469d4d39d35bee1db5d39fc5c168c6f088a6956d", size = 14404424, upload-time = "2024-08-26T20:17:02.604Z" }, - { url = "https://files.pythonhosted.org/packages/c8/a6/177dd88d95ecf07e722d21008b1b40e681a929eb9e329684d449c36586b2/numpy-2.0.2-cp39-cp39-win32.whl", hash = "sha256:905d16e0c60200656500c95b6b8dca5d109e23cb24abc701d41c02d74c6b3afa", size = 6476809, upload-time = "2024-08-26T20:17:13.553Z" }, - { url = "https://files.pythonhosted.org/packages/ea/2b/7fc9f4e7ae5b507c1a3a21f0f15ed03e794c1242ea8a242ac158beb56034/numpy-2.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:a3f4ab0caa7f053f6797fcd4e1e25caee367db3112ef2b6ef82d749530768c73", size = 15911314, upload-time = "2024-08-26T20:17:36.72Z" }, - { url = "https://files.pythonhosted.org/packages/8f/3b/df5a870ac6a3be3a86856ce195ef42eec7ae50d2a202be1f5a4b3b340e14/numpy-2.0.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7f0a0c6f12e07fa94133c8a67404322845220c06a9e80e85999afe727f7438b8", size = 21025288, upload-time = "2024-08-26T20:18:07.732Z" }, - { url = "https://files.pythonhosted.org/packages/2c/97/51af92f18d6f6f2d9ad8b482a99fb74e142d71372da5d834b3a2747a446e/numpy-2.0.2-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:312950fdd060354350ed123c0e25a71327d3711584beaef30cdaa93320c392d4", size = 6762793, upload-time = "2024-08-26T20:18:19.125Z" }, - { url = "https://files.pythonhosted.org/packages/12/46/de1fbd0c1b5ccaa7f9a005b66761533e2f6a3e560096682683a223631fe9/numpy-2.0.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26df23238872200f63518dd2aa984cfca675d82469535dc7162dc2ee52d9dd5c", size = 19334885, upload-time = "2024-08-26T20:18:47.237Z" }, - { url = "https://files.pythonhosted.org/packages/cc/dc/d330a6faefd92b446ec0f0dfea4c3207bb1fef3c4771d19cf4543efd2c78/numpy-2.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a46288ec55ebbd58947d31d72be2c63cbf839f0a63b49cb755022310792a3385", size = 15828784, upload-time = "2024-08-26T20:19:11.19Z" }, -] - -[[package]] -name = "numpy" -version = "2.2.6" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version == '3.10.*'", -] -sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9a/3e/ed6db5be21ce87955c0cbd3009f2803f59fa08df21b5df06862e2d8e2bdd/numpy-2.2.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b412caa66f72040e6d268491a59f2c43bf03eb6c96dd8f0307829feb7fa2b6fb", size = 21165245, upload-time 
= "2025-05-17T21:27:58.555Z" }, - { url = "https://files.pythonhosted.org/packages/22/c2/4b9221495b2a132cc9d2eb862e21d42a009f5a60e45fc44b00118c174bff/numpy-2.2.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8e41fd67c52b86603a91c1a505ebaef50b3314de0213461c7a6e99c9a3beff90", size = 14360048, upload-time = "2025-05-17T21:28:21.406Z" }, - { url = "https://files.pythonhosted.org/packages/fd/77/dc2fcfc66943c6410e2bf598062f5959372735ffda175b39906d54f02349/numpy-2.2.6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:37e990a01ae6ec7fe7fa1c26c55ecb672dd98b19c3d0e1d1f326fa13cb38d163", size = 5340542, upload-time = "2025-05-17T21:28:30.931Z" }, - { url = "https://files.pythonhosted.org/packages/7a/4f/1cb5fdc353a5f5cc7feb692db9b8ec2c3d6405453f982435efc52561df58/numpy-2.2.6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:5a6429d4be8ca66d889b7cf70f536a397dc45ba6faeb5f8c5427935d9592e9cf", size = 6878301, upload-time = "2025-05-17T21:28:41.613Z" }, - { url = "https://files.pythonhosted.org/packages/eb/17/96a3acd228cec142fcb8723bd3cc39c2a474f7dcf0a5d16731980bcafa95/numpy-2.2.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:efd28d4e9cd7d7a8d39074a4d44c63eda73401580c5c76acda2ce969e0a38e83", size = 14297320, upload-time = "2025-05-17T21:29:02.78Z" }, - { url = "https://files.pythonhosted.org/packages/b4/63/3de6a34ad7ad6646ac7d2f55ebc6ad439dbbf9c4370017c50cf403fb19b5/numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc7b73d02efb0e18c000e9ad8b83480dfcd5dfd11065997ed4c6747470ae8915", size = 16801050, upload-time = "2025-05-17T21:29:27.675Z" }, - { url = "https://files.pythonhosted.org/packages/07/b6/89d837eddef52b3d0cec5c6ba0456c1bf1b9ef6a6672fc2b7873c3ec4e2e/numpy-2.2.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:74d4531beb257d2c3f4b261bfb0fc09e0f9ebb8842d82a7b4209415896adc680", size = 15807034, upload-time = "2025-05-17T21:29:51.102Z" }, - { url = "https://files.pythonhosted.org/packages/01/c8/dc6ae86e3c61cfec1f178e5c9f7858584049b6093f843bca541f94120920/numpy-2.2.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8fc377d995680230e83241d8a96def29f204b5782f371c532579b4f20607a289", size = 18614185, upload-time = "2025-05-17T21:30:18.703Z" }, - { url = "https://files.pythonhosted.org/packages/5b/c5/0064b1b7e7c89137b471ccec1fd2282fceaae0ab3a9550f2568782d80357/numpy-2.2.6-cp310-cp310-win32.whl", hash = "sha256:b093dd74e50a8cba3e873868d9e93a85b78e0daf2e98c6797566ad8044e8363d", size = 6527149, upload-time = "2025-05-17T21:30:29.788Z" }, - { url = "https://files.pythonhosted.org/packages/a3/dd/4b822569d6b96c39d1215dbae0582fd99954dcbcf0c1a13c61783feaca3f/numpy-2.2.6-cp310-cp310-win_amd64.whl", hash = "sha256:f0fd6321b839904e15c46e0d257fdd101dd7f530fe03fd6359c1ea63738703f3", size = 12904620, upload-time = "2025-05-17T21:30:48.994Z" }, - { url = "https://files.pythonhosted.org/packages/da/a8/4f83e2aa666a9fbf56d6118faaaf5f1974d456b1823fda0a176eff722839/numpy-2.2.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f9f1adb22318e121c5c69a09142811a201ef17ab257a1e66ca3025065b7f53ae", size = 21176963, upload-time = "2025-05-17T21:31:19.36Z" }, - { url = "https://files.pythonhosted.org/packages/b3/2b/64e1affc7972decb74c9e29e5649fac940514910960ba25cd9af4488b66c/numpy-2.2.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c820a93b0255bc360f53eca31a0e676fd1101f673dda8da93454a12e23fc5f7a", size = 14406743, upload-time = "2025-05-17T21:31:41.087Z" }, - { url = 
"https://files.pythonhosted.org/packages/4a/9f/0121e375000b5e50ffdd8b25bf78d8e1a5aa4cca3f185d41265198c7b834/numpy-2.2.6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3d70692235e759f260c3d837193090014aebdf026dfd167834bcba43e30c2a42", size = 5352616, upload-time = "2025-05-17T21:31:50.072Z" }, - { url = "https://files.pythonhosted.org/packages/31/0d/b48c405c91693635fbe2dcd7bc84a33a602add5f63286e024d3b6741411c/numpy-2.2.6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:481b49095335f8eed42e39e8041327c05b0f6f4780488f61286ed3c01368d491", size = 6889579, upload-time = "2025-05-17T21:32:01.712Z" }, - { url = "https://files.pythonhosted.org/packages/52/b8/7f0554d49b565d0171eab6e99001846882000883998e7b7d9f0d98b1f934/numpy-2.2.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b64d8d4d17135e00c8e346e0a738deb17e754230d7e0810ac5012750bbd85a5a", size = 14312005, upload-time = "2025-05-17T21:32:23.332Z" }, - { url = "https://files.pythonhosted.org/packages/b3/dd/2238b898e51bd6d389b7389ffb20d7f4c10066d80351187ec8e303a5a475/numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba10f8411898fc418a521833e014a77d3ca01c15b0c6cdcce6a0d2897e6dbbdf", size = 16821570, upload-time = "2025-05-17T21:32:47.991Z" }, - { url = "https://files.pythonhosted.org/packages/83/6c/44d0325722cf644f191042bf47eedad61c1e6df2432ed65cbe28509d404e/numpy-2.2.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bd48227a919f1bafbdda0583705e547892342c26fb127219d60a5c36882609d1", size = 15818548, upload-time = "2025-05-17T21:33:11.728Z" }, - { url = "https://files.pythonhosted.org/packages/ae/9d/81e8216030ce66be25279098789b665d49ff19eef08bfa8cb96d4957f422/numpy-2.2.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9551a499bf125c1d4f9e250377c1ee2eddd02e01eac6644c080162c0c51778ab", size = 18620521, upload-time = "2025-05-17T21:33:39.139Z" }, - { url = "https://files.pythonhosted.org/packages/6a/fd/e19617b9530b031db51b0926eed5345ce8ddc669bb3bc0044b23e275ebe8/numpy-2.2.6-cp311-cp311-win32.whl", hash = "sha256:0678000bb9ac1475cd454c6b8c799206af8107e310843532b04d49649c717a47", size = 6525866, upload-time = "2025-05-17T21:33:50.273Z" }, - { url = "https://files.pythonhosted.org/packages/31/0a/f354fb7176b81747d870f7991dc763e157a934c717b67b58456bc63da3df/numpy-2.2.6-cp311-cp311-win_amd64.whl", hash = "sha256:e8213002e427c69c45a52bbd94163084025f533a55a59d6f9c5b820774ef3303", size = 12907455, upload-time = "2025-05-17T21:34:09.135Z" }, - { url = "https://files.pythonhosted.org/packages/82/5d/c00588b6cf18e1da539b45d3598d3557084990dcc4331960c15ee776ee41/numpy-2.2.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41c5a21f4a04fa86436124d388f6ed60a9343a6f767fced1a8a71c3fbca038ff", size = 20875348, upload-time = "2025-05-17T21:34:39.648Z" }, - { url = "https://files.pythonhosted.org/packages/66/ee/560deadcdde6c2f90200450d5938f63a34b37e27ebff162810f716f6a230/numpy-2.2.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:de749064336d37e340f640b05f24e9e3dd678c57318c7289d222a8a2f543e90c", size = 14119362, upload-time = "2025-05-17T21:35:01.241Z" }, - { url = "https://files.pythonhosted.org/packages/3c/65/4baa99f1c53b30adf0acd9a5519078871ddde8d2339dc5a7fde80d9d87da/numpy-2.2.6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:894b3a42502226a1cac872f840030665f33326fc3dac8e57c607905773cdcde3", size = 5084103, upload-time = "2025-05-17T21:35:10.622Z" }, - { url = 
"https://files.pythonhosted.org/packages/cc/89/e5a34c071a0570cc40c9a54eb472d113eea6d002e9ae12bb3a8407fb912e/numpy-2.2.6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:71594f7c51a18e728451bb50cc60a3ce4e6538822731b2933209a1f3614e9282", size = 6625382, upload-time = "2025-05-17T21:35:21.414Z" }, - { url = "https://files.pythonhosted.org/packages/f8/35/8c80729f1ff76b3921d5c9487c7ac3de9b2a103b1cd05e905b3090513510/numpy-2.2.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2618db89be1b4e05f7a1a847a9c1c0abd63e63a1607d892dd54668dd92faf87", size = 14018462, upload-time = "2025-05-17T21:35:42.174Z" }, - { url = "https://files.pythonhosted.org/packages/8c/3d/1e1db36cfd41f895d266b103df00ca5b3cbe965184df824dec5c08c6b803/numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd83c01228a688733f1ded5201c678f0c53ecc1006ffbc404db9f7a899ac6249", size = 16527618, upload-time = "2025-05-17T21:36:06.711Z" }, - { url = "https://files.pythonhosted.org/packages/61/c6/03ed30992602c85aa3cd95b9070a514f8b3c33e31124694438d88809ae36/numpy-2.2.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:37c0ca431f82cd5fa716eca9506aefcabc247fb27ba69c5062a6d3ade8cf8f49", size = 15505511, upload-time = "2025-05-17T21:36:29.965Z" }, - { url = "https://files.pythonhosted.org/packages/b7/25/5761d832a81df431e260719ec45de696414266613c9ee268394dd5ad8236/numpy-2.2.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fe27749d33bb772c80dcd84ae7e8df2adc920ae8297400dabec45f0dedb3f6de", size = 18313783, upload-time = "2025-05-17T21:36:56.883Z" }, - { url = "https://files.pythonhosted.org/packages/57/0a/72d5a3527c5ebffcd47bde9162c39fae1f90138c961e5296491ce778e682/numpy-2.2.6-cp312-cp312-win32.whl", hash = "sha256:4eeaae00d789f66c7a25ac5f34b71a7035bb474e679f410e5e1a94deb24cf2d4", size = 6246506, upload-time = "2025-05-17T21:37:07.368Z" }, - { url = "https://files.pythonhosted.org/packages/36/fa/8c9210162ca1b88529ab76b41ba02d433fd54fecaf6feb70ef9f124683f1/numpy-2.2.6-cp312-cp312-win_amd64.whl", hash = "sha256:c1f9540be57940698ed329904db803cf7a402f3fc200bfe599334c9bd84a40b2", size = 12614190, upload-time = "2025-05-17T21:37:26.213Z" }, - { url = "https://files.pythonhosted.org/packages/f9/5c/6657823f4f594f72b5471f1db1ab12e26e890bb2e41897522d134d2a3e81/numpy-2.2.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0811bb762109d9708cca4d0b13c4f67146e3c3b7cf8d34018c722adb2d957c84", size = 20867828, upload-time = "2025-05-17T21:37:56.699Z" }, - { url = "https://files.pythonhosted.org/packages/dc/9e/14520dc3dadf3c803473bd07e9b2bd1b69bc583cb2497b47000fed2fa92f/numpy-2.2.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:287cc3162b6f01463ccd86be154f284d0893d2b3ed7292439ea97eafa8170e0b", size = 14143006, upload-time = "2025-05-17T21:38:18.291Z" }, - { url = "https://files.pythonhosted.org/packages/4f/06/7e96c57d90bebdce9918412087fc22ca9851cceaf5567a45c1f404480e9e/numpy-2.2.6-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:f1372f041402e37e5e633e586f62aa53de2eac8d98cbfb822806ce4bbefcb74d", size = 5076765, upload-time = "2025-05-17T21:38:27.319Z" }, - { url = "https://files.pythonhosted.org/packages/73/ed/63d920c23b4289fdac96ddbdd6132e9427790977d5457cd132f18e76eae0/numpy-2.2.6-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:55a4d33fa519660d69614a9fad433be87e5252f4b03850642f88993f7b2ca566", size = 6617736, upload-time = "2025-05-17T21:38:38.141Z" }, - { url = 
"https://files.pythonhosted.org/packages/85/c5/e19c8f99d83fd377ec8c7e0cf627a8049746da54afc24ef0a0cb73d5dfb5/numpy-2.2.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f92729c95468a2f4f15e9bb94c432a9229d0d50de67304399627a943201baa2f", size = 14010719, upload-time = "2025-05-17T21:38:58.433Z" }, - { url = "https://files.pythonhosted.org/packages/19/49/4df9123aafa7b539317bf6d342cb6d227e49f7a35b99c287a6109b13dd93/numpy-2.2.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1bc23a79bfabc5d056d106f9befb8d50c31ced2fbc70eedb8155aec74a45798f", size = 16526072, upload-time = "2025-05-17T21:39:22.638Z" }, - { url = "https://files.pythonhosted.org/packages/b2/6c/04b5f47f4f32f7c2b0e7260442a8cbcf8168b0e1a41ff1495da42f42a14f/numpy-2.2.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e3143e4451880bed956e706a3220b4e5cf6172ef05fcc397f6f36a550b1dd868", size = 15503213, upload-time = "2025-05-17T21:39:45.865Z" }, - { url = "https://files.pythonhosted.org/packages/17/0a/5cd92e352c1307640d5b6fec1b2ffb06cd0dabe7d7b8227f97933d378422/numpy-2.2.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b4f13750ce79751586ae2eb824ba7e1e8dba64784086c98cdbbcc6a42112ce0d", size = 18316632, upload-time = "2025-05-17T21:40:13.331Z" }, - { url = "https://files.pythonhosted.org/packages/f0/3b/5cba2b1d88760ef86596ad0f3d484b1cbff7c115ae2429678465057c5155/numpy-2.2.6-cp313-cp313-win32.whl", hash = "sha256:5beb72339d9d4fa36522fc63802f469b13cdbe4fdab4a288f0c441b74272ebfd", size = 6244532, upload-time = "2025-05-17T21:43:46.099Z" }, - { url = "https://files.pythonhosted.org/packages/cb/3b/d58c12eafcb298d4e6d0d40216866ab15f59e55d148a5658bb3132311fcf/numpy-2.2.6-cp313-cp313-win_amd64.whl", hash = "sha256:b0544343a702fa80c95ad5d3d608ea3599dd54d4632df855e4c8d24eb6ecfa1c", size = 12610885, upload-time = "2025-05-17T21:44:05.145Z" }, - { url = "https://files.pythonhosted.org/packages/6b/9e/4bf918b818e516322db999ac25d00c75788ddfd2d2ade4fa66f1f38097e1/numpy-2.2.6-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0bca768cd85ae743b2affdc762d617eddf3bcf8724435498a1e80132d04879e6", size = 20963467, upload-time = "2025-05-17T21:40:44Z" }, - { url = "https://files.pythonhosted.org/packages/61/66/d2de6b291507517ff2e438e13ff7b1e2cdbdb7cb40b3ed475377aece69f9/numpy-2.2.6-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:fc0c5673685c508a142ca65209b4e79ed6740a4ed6b2267dbba90f34b0b3cfda", size = 14225144, upload-time = "2025-05-17T21:41:05.695Z" }, - { url = "https://files.pythonhosted.org/packages/e4/25/480387655407ead912e28ba3a820bc69af9adf13bcbe40b299d454ec011f/numpy-2.2.6-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:5bd4fc3ac8926b3819797a7c0e2631eb889b4118a9898c84f585a54d475b7e40", size = 5200217, upload-time = "2025-05-17T21:41:15.903Z" }, - { url = "https://files.pythonhosted.org/packages/aa/4a/6e313b5108f53dcbf3aca0c0f3e9c92f4c10ce57a0a721851f9785872895/numpy-2.2.6-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:fee4236c876c4e8369388054d02d0e9bb84821feb1a64dd59e137e6511a551f8", size = 6712014, upload-time = "2025-05-17T21:41:27.321Z" }, - { url = "https://files.pythonhosted.org/packages/b7/30/172c2d5c4be71fdf476e9de553443cf8e25feddbe185e0bd88b096915bcc/numpy-2.2.6-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e1dda9c7e08dc141e0247a5b8f49cf05984955246a327d4c48bda16821947b2f", size = 14077935, upload-time = "2025-05-17T21:41:49.738Z" }, - { url = 
"https://files.pythonhosted.org/packages/12/fb/9e743f8d4e4d3c710902cf87af3512082ae3d43b945d5d16563f26ec251d/numpy-2.2.6-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f447e6acb680fd307f40d3da4852208af94afdfab89cf850986c3ca00562f4fa", size = 16600122, upload-time = "2025-05-17T21:42:14.046Z" }, - { url = "https://files.pythonhosted.org/packages/12/75/ee20da0e58d3a66f204f38916757e01e33a9737d0b22373b3eb5a27358f9/numpy-2.2.6-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:389d771b1623ec92636b0786bc4ae56abafad4a4c513d36a55dce14bd9ce8571", size = 15586143, upload-time = "2025-05-17T21:42:37.464Z" }, - { url = "https://files.pythonhosted.org/packages/76/95/bef5b37f29fc5e739947e9ce5179ad402875633308504a52d188302319c8/numpy-2.2.6-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8e9ace4a37db23421249ed236fdcdd457d671e25146786dfc96835cd951aa7c1", size = 18385260, upload-time = "2025-05-17T21:43:05.189Z" }, - { url = "https://files.pythonhosted.org/packages/09/04/f2f83279d287407cf36a7a8053a5abe7be3622a4363337338f2585e4afda/numpy-2.2.6-cp313-cp313t-win32.whl", hash = "sha256:038613e9fb8c72b0a41f025a7e4c3f0b7a1b5d768ece4796b674c8f3fe13efff", size = 6377225, upload-time = "2025-05-17T21:43:16.254Z" }, - { url = "https://files.pythonhosted.org/packages/67/0e/35082d13c09c02c011cf21570543d202ad929d961c02a147493cb0c2bdf5/numpy-2.2.6-cp313-cp313t-win_amd64.whl", hash = "sha256:6031dd6dfecc0cf9f668681a37648373bddd6421fff6c66ec1624eed0180ee06", size = 12771374, upload-time = "2025-05-17T21:43:35.479Z" }, - { url = "https://files.pythonhosted.org/packages/9e/3b/d94a75f4dbf1ef5d321523ecac21ef23a3cd2ac8b78ae2aac40873590229/numpy-2.2.6-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0b605b275d7bd0c640cad4e5d30fa701a8d59302e127e5f79138ad62762c3e3d", size = 21040391, upload-time = "2025-05-17T21:44:35.948Z" }, - { url = "https://files.pythonhosted.org/packages/17/f4/09b2fa1b58f0fb4f7c7963a1649c64c4d315752240377ed74d9cd878f7b5/numpy-2.2.6-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:7befc596a7dc9da8a337f79802ee8adb30a552a94f792b9c9d18c840055907db", size = 6786754, upload-time = "2025-05-17T21:44:47.446Z" }, - { url = "https://files.pythonhosted.org/packages/af/30/feba75f143bdc868a1cc3f44ccfa6c4b9ec522b36458e738cd00f67b573f/numpy-2.2.6-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce47521a4754c8f4593837384bd3424880629f718d87c5d44f8ed763edd63543", size = 16643476, upload-time = "2025-05-17T21:45:11.871Z" }, - { url = "https://files.pythonhosted.org/packages/37/48/ac2a9584402fb6c0cd5b5d1a91dcf176b15760130dd386bbafdbfe3640bf/numpy-2.2.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d042d24c90c41b54fd506da306759e06e568864df8ec17ccc17e9e884634fd00", size = 12812666, upload-time = "2025-05-17T21:45:31.426Z" }, -] - -[[package]] -name = "numpy" -version = "2.3.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", -] -sdist = { url = "https://files.pythonhosted.org/packages/37/7d/3fec4199c5ffb892bed55cff901e4f39a58c81df9c44c280499e92cad264/numpy-2.3.2.tar.gz", hash = "sha256:e0486a11ec30cdecb53f184d496d1c6a20786c81e55e41640270130056f8ee48", size = 20489306, upload-time = "2025-07-24T21:32:07.553Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/96/26/1320083986108998bd487e2931eed2aeedf914b6e8905431487543ec911d/numpy-2.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:852ae5bed3478b92f093e30f785c98e0cb62fa0a939ed057c31716e18a7a22b9", size = 21259016, upload-time = "2025-07-24T20:24:35.214Z" }, - { url = "https://files.pythonhosted.org/packages/c4/2b/792b341463fa93fc7e55abbdbe87dac316c5b8cb5e94fb7a59fb6fa0cda5/numpy-2.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7a0e27186e781a69959d0230dd9909b5e26024f8da10683bd6344baea1885168", size = 14451158, upload-time = "2025-07-24T20:24:58.397Z" }, - { url = "https://files.pythonhosted.org/packages/b7/13/e792d7209261afb0c9f4759ffef6135b35c77c6349a151f488f531d13595/numpy-2.3.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:f0a1a8476ad77a228e41619af2fa9505cf69df928e9aaa165746584ea17fed2b", size = 5379817, upload-time = "2025-07-24T20:25:07.746Z" }, - { url = "https://files.pythonhosted.org/packages/49/ce/055274fcba4107c022b2113a213c7287346563f48d62e8d2a5176ad93217/numpy-2.3.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:cbc95b3813920145032412f7e33d12080f11dc776262df1712e1638207dde9e8", size = 6913606, upload-time = "2025-07-24T20:25:18.84Z" }, - { url = "https://files.pythonhosted.org/packages/17/f2/e4d72e6bc5ff01e2ab613dc198d560714971900c03674b41947e38606502/numpy-2.3.2-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f75018be4980a7324edc5930fe39aa391d5734531b1926968605416ff58c332d", size = 14589652, upload-time = "2025-07-24T20:25:40.356Z" }, - { url = "https://files.pythonhosted.org/packages/c8/b0/fbeee3000a51ebf7222016e2939b5c5ecf8000a19555d04a18f1e02521b8/numpy-2.3.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:20b8200721840f5621b7bd03f8dcd78de33ec522fc40dc2641aa09537df010c3", size = 16938816, upload-time = "2025-07-24T20:26:05.721Z" }, - { url = "https://files.pythonhosted.org/packages/a9/ec/2f6c45c3484cc159621ea8fc000ac5a86f1575f090cac78ac27193ce82cd/numpy-2.3.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1f91e5c028504660d606340a084db4b216567ded1056ea2b4be4f9d10b67197f", size = 16370512, upload-time = "2025-07-24T20:26:30.545Z" }, - { url = "https://files.pythonhosted.org/packages/b5/01/dd67cf511850bd7aefd6347aaae0956ed415abea741ae107834aae7d6d4e/numpy-2.3.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:fb1752a3bb9a3ad2d6b090b88a9a0ae1cd6f004ef95f75825e2f382c183b2097", size = 18884947, upload-time = "2025-07-24T20:26:58.24Z" }, - { url = "https://files.pythonhosted.org/packages/a7/17/2cf60fd3e6a61d006778735edf67a222787a8c1a7842aed43ef96d777446/numpy-2.3.2-cp311-cp311-win32.whl", hash = "sha256:4ae6863868aaee2f57503c7a5052b3a2807cf7a3914475e637a0ecd366ced220", size = 6599494, upload-time = "2025-07-24T20:27:09.786Z" }, - { url = "https://files.pythonhosted.org/packages/d5/03/0eade211c504bda872a594f045f98ddcc6caef2b7c63610946845e304d3f/numpy-2.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:240259d6564f1c65424bcd10f435145a7644a65a6811cfc3201c4a429ba79170", size = 13087889, upload-time = "2025-07-24T20:27:29.558Z" }, - { url = "https://files.pythonhosted.org/packages/13/32/2c7979d39dafb2a25087e12310fc7f3b9d3c7d960df4f4bc97955ae0ce1d/numpy-2.3.2-cp311-cp311-win_arm64.whl", hash = "sha256:4209f874d45f921bde2cff1ffcd8a3695f545ad2ffbef6d3d3c6768162efab89", size = 10459560, upload-time = "2025-07-24T20:27:46.803Z" }, - { url = "https://files.pythonhosted.org/packages/00/6d/745dd1c1c5c284d17725e5c802ca4d45cfc6803519d777f087b71c9f4069/numpy-2.3.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bc3186bea41fae9d8e90c2b4fb5f0a1f5a690682da79b92574d63f56b529080b", size = 20956420, upload-time = 
"2025-07-24T20:28:18.002Z" }, - { url = "https://files.pythonhosted.org/packages/bc/96/e7b533ea5740641dd62b07a790af5d9d8fec36000b8e2d0472bd7574105f/numpy-2.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2f4f0215edb189048a3c03bd5b19345bdfa7b45a7a6f72ae5945d2a28272727f", size = 14184660, upload-time = "2025-07-24T20:28:39.522Z" }, - { url = "https://files.pythonhosted.org/packages/2b/53/102c6122db45a62aa20d1b18c9986f67e6b97e0d6fbc1ae13e3e4c84430c/numpy-2.3.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8b1224a734cd509f70816455c3cffe13a4f599b1bf7130f913ba0e2c0b2006c0", size = 5113382, upload-time = "2025-07-24T20:28:48.544Z" }, - { url = "https://files.pythonhosted.org/packages/2b/21/376257efcbf63e624250717e82b4fae93d60178f09eb03ed766dbb48ec9c/numpy-2.3.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:3dcf02866b977a38ba3ec10215220609ab9667378a9e2150615673f3ffd6c73b", size = 6647258, upload-time = "2025-07-24T20:28:59.104Z" }, - { url = "https://files.pythonhosted.org/packages/91/ba/f4ebf257f08affa464fe6036e13f2bf9d4642a40228781dc1235da81be9f/numpy-2.3.2-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:572d5512df5470f50ada8d1972c5f1082d9a0b7aa5944db8084077570cf98370", size = 14281409, upload-time = "2025-07-24T20:40:30.298Z" }, - { url = "https://files.pythonhosted.org/packages/59/ef/f96536f1df42c668cbacb727a8c6da7afc9c05ece6d558927fb1722693e1/numpy-2.3.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8145dd6d10df13c559d1e4314df29695613575183fa2e2d11fac4c208c8a1f73", size = 16641317, upload-time = "2025-07-24T20:40:56.625Z" }, - { url = "https://files.pythonhosted.org/packages/f6/a7/af813a7b4f9a42f498dde8a4c6fcbff8100eed00182cc91dbaf095645f38/numpy-2.3.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:103ea7063fa624af04a791c39f97070bf93b96d7af7eb23530cd087dc8dbe9dc", size = 16056262, upload-time = "2025-07-24T20:41:20.797Z" }, - { url = "https://files.pythonhosted.org/packages/8b/5d/41c4ef8404caaa7f05ed1cfb06afe16a25895260eacbd29b4d84dff2920b/numpy-2.3.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fc927d7f289d14f5e037be917539620603294454130b6de200091e23d27dc9be", size = 18579342, upload-time = "2025-07-24T20:41:50.753Z" }, - { url = "https://files.pythonhosted.org/packages/a1/4f/9950e44c5a11636f4a3af6e825ec23003475cc9a466edb7a759ed3ea63bd/numpy-2.3.2-cp312-cp312-win32.whl", hash = "sha256:d95f59afe7f808c103be692175008bab926b59309ade3e6d25009e9a171f7036", size = 6320610, upload-time = "2025-07-24T20:42:01.551Z" }, - { url = "https://files.pythonhosted.org/packages/7c/2f/244643a5ce54a94f0a9a2ab578189c061e4a87c002e037b0829dd77293b6/numpy-2.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:9e196ade2400c0c737d93465327d1ae7c06c7cb8a1756121ebf54b06ca183c7f", size = 12786292, upload-time = "2025-07-24T20:42:20.738Z" }, - { url = "https://files.pythonhosted.org/packages/54/cd/7b5f49d5d78db7badab22d8323c1b6ae458fbf86c4fdfa194ab3cd4eb39b/numpy-2.3.2-cp312-cp312-win_arm64.whl", hash = "sha256:ee807923782faaf60d0d7331f5e86da7d5e3079e28b291973c545476c2b00d07", size = 10194071, upload-time = "2025-07-24T20:42:36.657Z" }, - { url = "https://files.pythonhosted.org/packages/1c/c0/c6bb172c916b00700ed3bf71cb56175fd1f7dbecebf8353545d0b5519f6c/numpy-2.3.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c8d9727f5316a256425892b043736d63e89ed15bbfe6556c5ff4d9d4448ff3b3", size = 20949074, upload-time = "2025-07-24T20:43:07.813Z" }, - { url = 
"https://files.pythonhosted.org/packages/20/4e/c116466d22acaf4573e58421c956c6076dc526e24a6be0903219775d862e/numpy-2.3.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:efc81393f25f14d11c9d161e46e6ee348637c0a1e8a54bf9dedc472a3fae993b", size = 14177311, upload-time = "2025-07-24T20:43:29.335Z" }, - { url = "https://files.pythonhosted.org/packages/78/45/d4698c182895af189c463fc91d70805d455a227261d950e4e0f1310c2550/numpy-2.3.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:dd937f088a2df683cbb79dda9a772b62a3e5a8a7e76690612c2737f38c6ef1b6", size = 5106022, upload-time = "2025-07-24T20:43:37.999Z" }, - { url = "https://files.pythonhosted.org/packages/9f/76/3e6880fef4420179309dba72a8c11f6166c431cf6dee54c577af8906f914/numpy-2.3.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:11e58218c0c46c80509186e460d79fbdc9ca1eb8d8aee39d8f2dc768eb781089", size = 6640135, upload-time = "2025-07-24T20:43:49.28Z" }, - { url = "https://files.pythonhosted.org/packages/34/fa/87ff7f25b3c4ce9085a62554460b7db686fef1e0207e8977795c7b7d7ba1/numpy-2.3.2-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5ad4ebcb683a1f99f4f392cc522ee20a18b2bb12a2c1c42c3d48d5a1adc9d3d2", size = 14278147, upload-time = "2025-07-24T20:44:10.328Z" }, - { url = "https://files.pythonhosted.org/packages/1d/0f/571b2c7a3833ae419fe69ff7b479a78d313581785203cc70a8db90121b9a/numpy-2.3.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:938065908d1d869c7d75d8ec45f735a034771c6ea07088867f713d1cd3bbbe4f", size = 16635989, upload-time = "2025-07-24T20:44:34.88Z" }, - { url = "https://files.pythonhosted.org/packages/24/5a/84ae8dca9c9a4c592fe11340b36a86ffa9fd3e40513198daf8a97839345c/numpy-2.3.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:66459dccc65d8ec98cc7df61307b64bf9e08101f9598755d42d8ae65d9a7a6ee", size = 16053052, upload-time = "2025-07-24T20:44:58.872Z" }, - { url = "https://files.pythonhosted.org/packages/57/7c/e5725d99a9133b9813fcf148d3f858df98511686e853169dbaf63aec6097/numpy-2.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a7af9ed2aa9ec5950daf05bb11abc4076a108bd3c7db9aa7251d5f107079b6a6", size = 18577955, upload-time = "2025-07-24T20:45:26.714Z" }, - { url = "https://files.pythonhosted.org/packages/ae/11/7c546fcf42145f29b71e4d6f429e96d8d68e5a7ba1830b2e68d7418f0bbd/numpy-2.3.2-cp313-cp313-win32.whl", hash = "sha256:906a30249315f9c8e17b085cc5f87d3f369b35fedd0051d4a84686967bdbbd0b", size = 6311843, upload-time = "2025-07-24T20:49:24.444Z" }, - { url = "https://files.pythonhosted.org/packages/aa/6f/a428fd1cb7ed39b4280d057720fed5121b0d7754fd2a9768640160f5517b/numpy-2.3.2-cp313-cp313-win_amd64.whl", hash = "sha256:c63d95dc9d67b676e9108fe0d2182987ccb0f11933c1e8959f42fa0da8d4fa56", size = 12782876, upload-time = "2025-07-24T20:49:43.227Z" }, - { url = "https://files.pythonhosted.org/packages/65/85/4ea455c9040a12595fb6c43f2c217257c7b52dd0ba332c6a6c1d28b289fe/numpy-2.3.2-cp313-cp313-win_arm64.whl", hash = "sha256:b05a89f2fb84d21235f93de47129dd4f11c16f64c87c33f5e284e6a3a54e43f2", size = 10192786, upload-time = "2025-07-24T20:49:59.443Z" }, - { url = "https://files.pythonhosted.org/packages/80/23/8278f40282d10c3f258ec3ff1b103d4994bcad78b0cba9208317f6bb73da/numpy-2.3.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4e6ecfeddfa83b02318f4d84acf15fbdbf9ded18e46989a15a8b6995dfbf85ab", size = 21047395, upload-time = "2025-07-24T20:45:58.821Z" }, - { url = 
"https://files.pythonhosted.org/packages/1f/2d/624f2ce4a5df52628b4ccd16a4f9437b37c35f4f8a50d00e962aae6efd7a/numpy-2.3.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:508b0eada3eded10a3b55725b40806a4b855961040180028f52580c4729916a2", size = 14300374, upload-time = "2025-07-24T20:46:20.207Z" }, - { url = "https://files.pythonhosted.org/packages/f6/62/ff1e512cdbb829b80a6bd08318a58698867bca0ca2499d101b4af063ee97/numpy-2.3.2-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:754d6755d9a7588bdc6ac47dc4ee97867271b17cee39cb87aef079574366db0a", size = 5228864, upload-time = "2025-07-24T20:46:30.58Z" }, - { url = "https://files.pythonhosted.org/packages/7d/8e/74bc18078fff03192d4032cfa99d5a5ca937807136d6f5790ce07ca53515/numpy-2.3.2-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:a9f66e7d2b2d7712410d3bc5684149040ef5f19856f20277cd17ea83e5006286", size = 6737533, upload-time = "2025-07-24T20:46:46.111Z" }, - { url = "https://files.pythonhosted.org/packages/19/ea/0731efe2c9073ccca5698ef6a8c3667c4cf4eea53fcdcd0b50140aba03bc/numpy-2.3.2-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:de6ea4e5a65d5a90c7d286ddff2b87f3f4ad61faa3db8dabe936b34c2275b6f8", size = 14352007, upload-time = "2025-07-24T20:47:07.1Z" }, - { url = "https://files.pythonhosted.org/packages/cf/90/36be0865f16dfed20f4bc7f75235b963d5939707d4b591f086777412ff7b/numpy-2.3.2-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a3ef07ec8cbc8fc9e369c8dcd52019510c12da4de81367d8b20bc692aa07573a", size = 16701914, upload-time = "2025-07-24T20:47:32.459Z" }, - { url = "https://files.pythonhosted.org/packages/94/30/06cd055e24cb6c38e5989a9e747042b4e723535758e6153f11afea88c01b/numpy-2.3.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:27c9f90e7481275c7800dc9c24b7cc40ace3fdb970ae4d21eaff983a32f70c91", size = 16132708, upload-time = "2025-07-24T20:47:58.129Z" }, - { url = "https://files.pythonhosted.org/packages/9a/14/ecede608ea73e58267fd7cb78f42341b3b37ba576e778a1a06baffbe585c/numpy-2.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:07b62978075b67eee4065b166d000d457c82a1efe726cce608b9db9dd66a73a5", size = 18651678, upload-time = "2025-07-24T20:48:25.402Z" }, - { url = "https://files.pythonhosted.org/packages/40/f3/2fe6066b8d07c3685509bc24d56386534c008b462a488b7f503ba82b8923/numpy-2.3.2-cp313-cp313t-win32.whl", hash = "sha256:c771cfac34a4f2c0de8e8c97312d07d64fd8f8ed45bc9f5726a7e947270152b5", size = 6441832, upload-time = "2025-07-24T20:48:37.181Z" }, - { url = "https://files.pythonhosted.org/packages/0b/ba/0937d66d05204d8f28630c9c60bc3eda68824abde4cf756c4d6aad03b0c6/numpy-2.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:72dbebb2dcc8305c431b2836bcc66af967df91be793d63a24e3d9b741374c450", size = 12927049, upload-time = "2025-07-24T20:48:56.24Z" }, - { url = "https://files.pythonhosted.org/packages/e9/ed/13542dd59c104d5e654dfa2ac282c199ba64846a74c2c4bcdbc3a0f75df1/numpy-2.3.2-cp313-cp313t-win_arm64.whl", hash = "sha256:72c6df2267e926a6d5286b0a6d556ebe49eae261062059317837fda12ddf0c1a", size = 10262935, upload-time = "2025-07-24T20:49:13.136Z" }, - { url = "https://files.pythonhosted.org/packages/c9/7c/7659048aaf498f7611b783e000c7268fcc4dcf0ce21cd10aad7b2e8f9591/numpy-2.3.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:448a66d052d0cf14ce9865d159bfc403282c9bc7bb2a31b03cc18b651eca8b1a", size = 20950906, upload-time = "2025-07-24T20:50:30.346Z" }, - { url = 
"https://files.pythonhosted.org/packages/80/db/984bea9d4ddf7112a04cfdfb22b1050af5757864cfffe8e09e44b7f11a10/numpy-2.3.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:546aaf78e81b4081b2eba1d105c3b34064783027a06b3ab20b6eba21fb64132b", size = 14185607, upload-time = "2025-07-24T20:50:51.923Z" }, - { url = "https://files.pythonhosted.org/packages/e4/76/b3d6f414f4eca568f469ac112a3b510938d892bc5a6c190cb883af080b77/numpy-2.3.2-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:87c930d52f45df092f7578889711a0768094debf73cfcde105e2d66954358125", size = 5114110, upload-time = "2025-07-24T20:51:01.041Z" }, - { url = "https://files.pythonhosted.org/packages/9e/d2/6f5e6826abd6bca52392ed88fe44a4b52aacb60567ac3bc86c67834c3a56/numpy-2.3.2-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:8dc082ea901a62edb8f59713c6a7e28a85daddcb67454c839de57656478f5b19", size = 6642050, upload-time = "2025-07-24T20:51:11.64Z" }, - { url = "https://files.pythonhosted.org/packages/c4/43/f12b2ade99199e39c73ad182f103f9d9791f48d885c600c8e05927865baf/numpy-2.3.2-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:af58de8745f7fa9ca1c0c7c943616c6fe28e75d0c81f5c295810e3c83b5be92f", size = 14296292, upload-time = "2025-07-24T20:51:33.488Z" }, - { url = "https://files.pythonhosted.org/packages/5d/f9/77c07d94bf110a916b17210fac38680ed8734c236bfed9982fd8524a7b47/numpy-2.3.2-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed5527c4cf10f16c6d0b6bee1f89958bccb0ad2522c8cadc2efd318bcd545f5", size = 16638913, upload-time = "2025-07-24T20:51:58.517Z" }, - { url = "https://files.pythonhosted.org/packages/9b/d1/9d9f2c8ea399cc05cfff8a7437453bd4e7d894373a93cdc46361bbb49a7d/numpy-2.3.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:095737ed986e00393ec18ec0b21b47c22889ae4b0cd2d5e88342e08b01141f58", size = 16071180, upload-time = "2025-07-24T20:52:22.827Z" }, - { url = "https://files.pythonhosted.org/packages/4c/41/82e2c68aff2a0c9bf315e47d61951099fed65d8cb2c8d9dc388cb87e947e/numpy-2.3.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b5e40e80299607f597e1a8a247ff8d71d79c5b52baa11cc1cce30aa92d2da6e0", size = 18576809, upload-time = "2025-07-24T20:52:51.015Z" }, - { url = "https://files.pythonhosted.org/packages/14/14/4b4fd3efb0837ed252d0f583c5c35a75121038a8c4e065f2c259be06d2d8/numpy-2.3.2-cp314-cp314-win32.whl", hash = "sha256:7d6e390423cc1f76e1b8108c9b6889d20a7a1f59d9a60cac4a050fa734d6c1e2", size = 6366410, upload-time = "2025-07-24T20:56:44.949Z" }, - { url = "https://files.pythonhosted.org/packages/11/9e/b4c24a6b8467b61aced5c8dc7dcfce23621baa2e17f661edb2444a418040/numpy-2.3.2-cp314-cp314-win_amd64.whl", hash = "sha256:b9d0878b21e3918d76d2209c924ebb272340da1fb51abc00f986c258cd5e957b", size = 12918821, upload-time = "2025-07-24T20:57:06.479Z" }, - { url = "https://files.pythonhosted.org/packages/0e/0f/0dc44007c70b1007c1cef86b06986a3812dd7106d8f946c09cfa75782556/numpy-2.3.2-cp314-cp314-win_arm64.whl", hash = "sha256:2738534837c6a1d0c39340a190177d7d66fdf432894f469728da901f8f6dc910", size = 10477303, upload-time = "2025-07-24T20:57:22.879Z" }, - { url = "https://files.pythonhosted.org/packages/8b/3e/075752b79140b78ddfc9c0a1634d234cfdbc6f9bbbfa6b7504e445ad7d19/numpy-2.3.2-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:4d002ecf7c9b53240be3bb69d80f86ddbd34078bae04d87be81c1f58466f264e", size = 21047524, upload-time = "2025-07-24T20:53:22.086Z" }, - { url = 
"https://files.pythonhosted.org/packages/fe/6d/60e8247564a72426570d0e0ea1151b95ce5bd2f1597bb878a18d32aec855/numpy-2.3.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:293b2192c6bcce487dbc6326de5853787f870aeb6c43f8f9c6496db5b1781e45", size = 14300519, upload-time = "2025-07-24T20:53:44.053Z" }, - { url = "https://files.pythonhosted.org/packages/4d/73/d8326c442cd428d47a067070c3ac6cc3b651a6e53613a1668342a12d4479/numpy-2.3.2-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:0a4f2021a6da53a0d580d6ef5db29947025ae8b35b3250141805ea9a32bbe86b", size = 5228972, upload-time = "2025-07-24T20:53:53.81Z" }, - { url = "https://files.pythonhosted.org/packages/34/2e/e71b2d6dad075271e7079db776196829019b90ce3ece5c69639e4f6fdc44/numpy-2.3.2-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:9c144440db4bf3bb6372d2c3e49834cc0ff7bb4c24975ab33e01199e645416f2", size = 6737439, upload-time = "2025-07-24T20:54:04.742Z" }, - { url = "https://files.pythonhosted.org/packages/15/b0/d004bcd56c2c5e0500ffc65385eb6d569ffd3363cb5e593ae742749b2daa/numpy-2.3.2-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f92d6c2a8535dc4fe4419562294ff957f83a16ebdec66df0805e473ffaad8bd0", size = 14352479, upload-time = "2025-07-24T20:54:25.819Z" }, - { url = "https://files.pythonhosted.org/packages/11/e3/285142fcff8721e0c99b51686426165059874c150ea9ab898e12a492e291/numpy-2.3.2-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cefc2219baa48e468e3db7e706305fcd0c095534a192a08f31e98d83a7d45fb0", size = 16702805, upload-time = "2025-07-24T20:54:50.814Z" }, - { url = "https://files.pythonhosted.org/packages/33/c3/33b56b0e47e604af2c7cd065edca892d180f5899599b76830652875249a3/numpy-2.3.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:76c3e9501ceb50b2ff3824c3589d5d1ab4ac857b0ee3f8f49629d0de55ecf7c2", size = 16133830, upload-time = "2025-07-24T20:55:17.306Z" }, - { url = "https://files.pythonhosted.org/packages/6e/ae/7b1476a1f4d6a48bc669b8deb09939c56dd2a439db1ab03017844374fb67/numpy-2.3.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:122bf5ed9a0221b3419672493878ba4967121514b1d7d4656a7580cd11dddcbf", size = 18652665, upload-time = "2025-07-24T20:55:46.665Z" }, - { url = "https://files.pythonhosted.org/packages/14/ba/5b5c9978c4bb161034148ade2de9db44ec316fab89ce8c400db0e0c81f86/numpy-2.3.2-cp314-cp314t-win32.whl", hash = "sha256:6f1ae3dcb840edccc45af496f312528c15b1f79ac318169d094e85e4bb35fdf1", size = 6514777, upload-time = "2025-07-24T20:55:57.66Z" }, - { url = "https://files.pythonhosted.org/packages/eb/46/3dbaf0ae7c17cdc46b9f662c56da2054887b8d9e737c1476f335c83d33db/numpy-2.3.2-cp314-cp314t-win_amd64.whl", hash = "sha256:087ffc25890d89a43536f75c5fe8770922008758e8eeeef61733957041ed2f9b", size = 13111856, upload-time = "2025-07-24T20:56:17.318Z" }, - { url = "https://files.pythonhosted.org/packages/c1/9e/1652778bce745a67b5fe05adde60ed362d38eb17d919a540e813d30f6874/numpy-2.3.2-cp314-cp314t-win_arm64.whl", hash = "sha256:092aeb3449833ea9c0bf0089d70c29ae480685dd2377ec9cdbbb620257f84631", size = 10544226, upload-time = "2025-07-24T20:56:34.509Z" }, - { url = "https://files.pythonhosted.org/packages/cf/ea/50ebc91d28b275b23b7128ef25c3d08152bc4068f42742867e07a870a42a/numpy-2.3.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:14a91ebac98813a49bc6aa1a0dfc09513dcec1d97eaf31ca21a87221a1cdcb15", size = 21130338, upload-time = "2025-07-24T20:57:54.37Z" }, - { url = 
"https://files.pythonhosted.org/packages/9f/57/cdd5eac00dd5f137277355c318a955c0d8fb8aa486020c22afd305f8b88f/numpy-2.3.2-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:71669b5daae692189540cffc4c439468d35a3f84f0c88b078ecd94337f6cb0ec", size = 14375776, upload-time = "2025-07-24T20:58:16.303Z" }, - { url = "https://files.pythonhosted.org/packages/83/85/27280c7f34fcd305c2209c0cdca4d70775e4859a9eaa92f850087f8dea50/numpy-2.3.2-pp311-pypy311_pp73-macosx_14_0_arm64.whl", hash = "sha256:69779198d9caee6e547adb933941ed7520f896fd9656834c300bdf4dd8642712", size = 5304882, upload-time = "2025-07-24T20:58:26.199Z" }, - { url = "https://files.pythonhosted.org/packages/48/b4/6500b24d278e15dd796f43824e69939d00981d37d9779e32499e823aa0aa/numpy-2.3.2-pp311-pypy311_pp73-macosx_14_0_x86_64.whl", hash = "sha256:2c3271cc4097beb5a60f010bcc1cc204b300bb3eafb4399376418a83a1c6373c", size = 6818405, upload-time = "2025-07-24T20:58:37.341Z" }, - { url = "https://files.pythonhosted.org/packages/9b/c9/142c1e03f199d202da8e980c2496213509291b6024fd2735ad28ae7065c7/numpy-2.3.2-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8446acd11fe3dc1830568c941d44449fd5cb83068e5c70bd5a470d323d448296", size = 14419651, upload-time = "2025-07-24T20:58:59.048Z" }, - { url = "https://files.pythonhosted.org/packages/8b/95/8023e87cbea31a750a6c00ff9427d65ebc5fef104a136bfa69f76266d614/numpy-2.3.2-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:aa098a5ab53fa407fded5870865c6275a5cd4101cfdef8d6fafc48286a96e981", size = 16760166, upload-time = "2025-07-24T21:28:56.38Z" }, - { url = "https://files.pythonhosted.org/packages/78/e3/6690b3f85a05506733c7e90b577e4762517404ea78bab2ca3a5cb1aeb78d/numpy-2.3.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:6936aff90dda378c09bea075af0d9c675fe3a977a9d2402f95a87f440f59f619", size = 12977811, upload-time = "2025-07-24T21:29:18.234Z" }, -] - -[[package]] -name = "packaging" -version = "25.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a1/d4/1fc4078c65507b51b96ca8f8c3ba19e6a61c8253c72794544580a7b6c24d/packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f", size = 165727, upload-time = "2025-04-19T11:48:59.673Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" }, -] - -[[package]] -name = "pandas" -version = "2.0.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "numpy", version = "1.24.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "python-dateutil", marker = "python_full_version < '3.9'" }, - { name = "pytz", marker = "python_full_version < '3.9'" }, - { name = "tzdata", marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b1/a7/824332581e258b5aa4f3763ecb2a797e5f9a54269044ba2e50ac19936b32/pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c", size = 5284455, upload-time = "2023-06-28T23:19:33.371Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/3c/b2/0d4a5729ce1ce11630c4fc5d5522a33b967b3ca146c210f58efde7c40e99/pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8", size = 11760908, upload-time = "2023-06-28T23:15:57.001Z" }, - { url = "https://files.pythonhosted.org/packages/4a/f6/f620ca62365d83e663a255a41b08d2fc2eaf304e0b8b21bb6d62a7390fe3/pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f", size = 10823486, upload-time = "2023-06-28T23:16:06.863Z" }, - { url = "https://files.pythonhosted.org/packages/c2/59/cb4234bc9b968c57e81861b306b10cd8170272c57b098b724d3de5eda124/pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183", size = 11571897, upload-time = "2023-06-28T23:16:14.208Z" }, - { url = "https://files.pythonhosted.org/packages/e3/59/35a2892bf09ded9c1bf3804461efe772836a5261ef5dfb4e264ce813ff99/pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0", size = 12306421, upload-time = "2023-06-28T23:16:23.26Z" }, - { url = "https://files.pythonhosted.org/packages/94/71/3a0c25433c54bb29b48e3155b959ac78f4c4f2f06f94d8318aac612cb80f/pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210", size = 9540792, upload-time = "2023-06-28T23:16:30.876Z" }, - { url = "https://files.pythonhosted.org/packages/ed/30/b97456e7063edac0e5a405128065f0cd2033adfe3716fb2256c186bd41d0/pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e", size = 10664333, upload-time = "2023-06-28T23:16:39.209Z" }, - { url = "https://files.pythonhosted.org/packages/b3/92/a5e5133421b49e901a12e02a6a7ef3a0130e10d13db8cb657fdd0cba3b90/pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8", size = 11645672, upload-time = "2023-06-28T23:16:47.601Z" }, - { url = "https://files.pythonhosted.org/packages/8f/bb/aea1fbeed5b474cb8634364718abe9030d7cc7a30bf51f40bd494bbc89a2/pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26", size = 10693229, upload-time = "2023-06-28T23:16:56.397Z" }, - { url = "https://files.pythonhosted.org/packages/d6/90/e7d387f1a416b14e59290baa7a454a90d719baebbf77433ff1bdcc727800/pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d", size = 11581591, upload-time = "2023-06-28T23:17:04.234Z" }, - { url = "https://files.pythonhosted.org/packages/d0/28/88b81881c056376254618fad622a5e94b5126db8c61157ea1910cd1c040a/pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df", size = 12219370, upload-time = "2023-06-28T23:17:11.783Z" }, - { url = "https://files.pythonhosted.org/packages/e4/a5/212b9039e25bf8ebb97e417a96660e3dc925dacd3f8653d531b8f7fd9be4/pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd", size = 9482935, upload-time = "2023-06-28T23:17:21.376Z" }, - { url = 
"https://files.pythonhosted.org/packages/9e/71/756a1be6bee0209d8c0d8c5e3b9fc72c00373f384a4017095ec404aec3ad/pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b", size = 10607692, upload-time = "2023-06-28T23:17:28.824Z" }, - { url = "https://files.pythonhosted.org/packages/78/a8/07dd10f90ca915ed914853cd57f79bfc22e1ef4384ab56cb4336d2fc1f2a/pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061", size = 11653303, upload-time = "2023-06-28T23:17:36.329Z" }, - { url = "https://files.pythonhosted.org/packages/53/c3/f8e87361f7fdf42012def602bfa2a593423c729f5cb7c97aed7f51be66ac/pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5", size = 10710932, upload-time = "2023-06-28T23:17:49.875Z" }, - { url = "https://files.pythonhosted.org/packages/a7/87/828d50c81ce0f434163bf70b925a0eec6076808e0bca312a79322b141f66/pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089", size = 11684018, upload-time = "2023-06-28T23:18:05.845Z" }, - { url = "https://files.pythonhosted.org/packages/f8/7f/5b047effafbdd34e52c9e2d7e44f729a0655efafb22198c45cf692cdc157/pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0", size = 12353723, upload-time = "2023-06-28T23:18:17.631Z" }, - { url = "https://files.pythonhosted.org/packages/ea/ae/26a2eda7fa581347d69e51f93892493b2074ef3352ac71033c9f32c52389/pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02", size = 9646403, upload-time = "2023-06-28T23:18:24.328Z" }, - { url = "https://files.pythonhosted.org/packages/c3/6c/ea362eef61f05553aaf1a24b3e96b2d0603f5dc71a3bd35688a24ed88843/pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78", size = 10777638, upload-time = "2023-06-28T23:18:30.947Z" }, - { url = "https://files.pythonhosted.org/packages/f8/c7/cfef920b7b457dff6928e824896cb82367650ea127d048ee0b820026db4f/pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b", size = 11834160, upload-time = "2023-06-28T23:18:40.332Z" }, - { url = "https://files.pythonhosted.org/packages/6c/1c/689c9d99bc4e5d366a5fd871f0bcdee98a6581e240f96b78d2d08f103774/pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e", size = 10862752, upload-time = "2023-06-28T23:18:50.016Z" }, - { url = "https://files.pythonhosted.org/packages/cc/b8/4d082f41c27c95bf90485d1447b647cc7e5680fea75e315669dc6e4cb398/pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b", size = 11715852, upload-time = "2023-06-28T23:19:00.594Z" }, - { url = "https://files.pythonhosted.org/packages/9e/0d/91a9fd2c202f2b1d97a38ab591890f86480ecbb596cbc56d035f6f23fdcc/pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641", size = 12398496, upload-time = "2023-06-28T23:19:11.78Z" }, - { url = 
"https://files.pythonhosted.org/packages/26/7d/d8aa0a2c4f3f5f8ea59fb946c8eafe8f508090ca73e2b08a9af853c1103e/pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682", size = 9630766, upload-time = "2023-06-28T23:19:18.182Z" }, - { url = "https://files.pythonhosted.org/packages/9a/f2/0ad053856debbe90c83de1b4f05915f85fd2146f20faf9daa3b320d36df3/pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc", size = 10755902, upload-time = "2023-06-28T23:19:25.151Z" }, -] - -[[package]] -name = "pandas" -version = "2.3.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] -dependencies = [ - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, - { name = "numpy", version = "2.3.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "python-dateutil", marker = "python_full_version >= '3.9'" }, - { name = "pytz", marker = "python_full_version >= '3.9'" }, - { name = "tzdata", marker = "python_full_version >= '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/d1/6f/75aa71f8a14267117adeeed5d21b204770189c0a0025acbdc03c337b28fc/pandas-2.3.1.tar.gz", hash = "sha256:0a95b9ac964fe83ce317827f80304d37388ea77616b1425f0ae41c9d2d0d7bb2", size = 4487493, upload-time = "2025-07-07T19:20:04.079Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c4/ca/aa97b47287221fa37a49634532e520300088e290b20d690b21ce3e448143/pandas-2.3.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:22c2e866f7209ebc3a8f08d75766566aae02bcc91d196935a1d9e59c7b990ac9", size = 11542731, upload-time = "2025-07-07T19:18:12.619Z" }, - { url = "https://files.pythonhosted.org/packages/80/bf/7938dddc5f01e18e573dcfb0f1b8c9357d9b5fa6ffdee6e605b92efbdff2/pandas-2.3.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3583d348546201aff730c8c47e49bc159833f971c2899d6097bce68b9112a4f1", size = 10790031, upload-time = "2025-07-07T19:18:16.611Z" }, - { url = "https://files.pythonhosted.org/packages/ee/2f/9af748366763b2a494fed477f88051dbf06f56053d5c00eba652697e3f94/pandas-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f951fbb702dacd390561e0ea45cdd8ecfa7fb56935eb3dd78e306c19104b9b0", size = 11724083, upload-time = "2025-07-07T19:18:20.512Z" }, - { url = "https://files.pythonhosted.org/packages/2c/95/79ab37aa4c25d1e7df953dde407bb9c3e4ae47d154bc0dd1692f3a6dcf8c/pandas-2.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd05b72ec02ebfb993569b4931b2e16fbb4d6ad6ce80224a3ee838387d83a191", size = 12342360, upload-time = "2025-07-07T19:18:23.194Z" }, - { url = "https://files.pythonhosted.org/packages/75/a7/d65e5d8665c12c3c6ff5edd9709d5836ec9b6f80071b7f4a718c6106e86e/pandas-2.3.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:1b916a627919a247d865aed068eb65eb91a344b13f5b57ab9f610b7716c92de1", size = 13202098, upload-time = "2025-07-07T19:18:25.558Z" }, - { url = 
"https://files.pythonhosted.org/packages/65/f3/4c1dbd754dbaa79dbf8b537800cb2fa1a6e534764fef50ab1f7533226c5c/pandas-2.3.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:fe67dc676818c186d5a3d5425250e40f179c2a89145df477dd82945eaea89e97", size = 13837228, upload-time = "2025-07-07T19:18:28.344Z" }, - { url = "https://files.pythonhosted.org/packages/3f/d6/d7f5777162aa9b48ec3910bca5a58c9b5927cfd9cfde3aa64322f5ba4b9f/pandas-2.3.1-cp310-cp310-win_amd64.whl", hash = "sha256:2eb789ae0274672acbd3c575b0598d213345660120a257b47b5dafdc618aec83", size = 11336561, upload-time = "2025-07-07T19:18:31.211Z" }, - { url = "https://files.pythonhosted.org/packages/76/1c/ccf70029e927e473a4476c00e0d5b32e623bff27f0402d0a92b7fc29bb9f/pandas-2.3.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2b0540963d83431f5ce8870ea02a7430adca100cec8a050f0811f8e31035541b", size = 11566608, upload-time = "2025-07-07T19:18:33.86Z" }, - { url = "https://files.pythonhosted.org/packages/ec/d3/3c37cb724d76a841f14b8f5fe57e5e3645207cc67370e4f84717e8bb7657/pandas-2.3.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fe7317f578c6a153912bd2292f02e40c1d8f253e93c599e82620c7f69755c74f", size = 10823181, upload-time = "2025-07-07T19:18:36.151Z" }, - { url = "https://files.pythonhosted.org/packages/8a/4c/367c98854a1251940edf54a4df0826dcacfb987f9068abf3e3064081a382/pandas-2.3.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e6723a27ad7b244c0c79d8e7007092d7c8f0f11305770e2f4cd778b3ad5f9f85", size = 11793570, upload-time = "2025-07-07T19:18:38.385Z" }, - { url = "https://files.pythonhosted.org/packages/07/5f/63760ff107bcf5146eee41b38b3985f9055e710a72fdd637b791dea3495c/pandas-2.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3462c3735fe19f2638f2c3a40bd94ec2dc5ba13abbb032dd2fa1f540a075509d", size = 12378887, upload-time = "2025-07-07T19:18:41.284Z" }, - { url = "https://files.pythonhosted.org/packages/15/53/f31a9b4dfe73fe4711c3a609bd8e60238022f48eacedc257cd13ae9327a7/pandas-2.3.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:98bcc8b5bf7afed22cc753a28bc4d9e26e078e777066bc53fac7904ddef9a678", size = 13230957, upload-time = "2025-07-07T19:18:44.187Z" }, - { url = "https://files.pythonhosted.org/packages/e0/94/6fce6bf85b5056d065e0a7933cba2616dcb48596f7ba3c6341ec4bcc529d/pandas-2.3.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4d544806b485ddf29e52d75b1f559142514e60ef58a832f74fb38e48d757b299", size = 13883883, upload-time = "2025-07-07T19:18:46.498Z" }, - { url = "https://files.pythonhosted.org/packages/c8/7b/bdcb1ed8fccb63d04bdb7635161d0ec26596d92c9d7a6cce964e7876b6c1/pandas-2.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:b3cd4273d3cb3707b6fffd217204c52ed92859533e31dc03b7c5008aa933aaab", size = 11340212, upload-time = "2025-07-07T19:18:49.293Z" }, - { url = "https://files.pythonhosted.org/packages/46/de/b8445e0f5d217a99fe0eeb2f4988070908979bec3587c0633e5428ab596c/pandas-2.3.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:689968e841136f9e542020698ee1c4fbe9caa2ed2213ae2388dc7b81721510d3", size = 11588172, upload-time = "2025-07-07T19:18:52.054Z" }, - { url = "https://files.pythonhosted.org/packages/1e/e0/801cdb3564e65a5ac041ab99ea6f1d802a6c325bb6e58c79c06a3f1cd010/pandas-2.3.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:025e92411c16cbe5bb2a4abc99732a6b132f439b8aab23a59fa593eb00704232", size = 10717365, upload-time = "2025-07-07T19:18:54.785Z" }, - { url = 
"https://files.pythonhosted.org/packages/51/a5/c76a8311833c24ae61a376dbf360eb1b1c9247a5d9c1e8b356563b31b80c/pandas-2.3.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b7ff55f31c4fcb3e316e8f7fa194566b286d6ac430afec0d461163312c5841e", size = 11280411, upload-time = "2025-07-07T19:18:57.045Z" }, - { url = "https://files.pythonhosted.org/packages/da/01/e383018feba0a1ead6cf5fe8728e5d767fee02f06a3d800e82c489e5daaf/pandas-2.3.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7dcb79bf373a47d2a40cf7232928eb7540155abbc460925c2c96d2d30b006eb4", size = 11988013, upload-time = "2025-07-07T19:18:59.771Z" }, - { url = "https://files.pythonhosted.org/packages/5b/14/cec7760d7c9507f11c97d64f29022e12a6cc4fc03ac694535e89f88ad2ec/pandas-2.3.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:56a342b231e8862c96bdb6ab97170e203ce511f4d0429589c8ede1ee8ece48b8", size = 12767210, upload-time = "2025-07-07T19:19:02.944Z" }, - { url = "https://files.pythonhosted.org/packages/50/b9/6e2d2c6728ed29fb3d4d4d302504fb66f1a543e37eb2e43f352a86365cdf/pandas-2.3.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ca7ed14832bce68baef331f4d7f294411bed8efd032f8109d690df45e00c4679", size = 13440571, upload-time = "2025-07-07T19:19:06.82Z" }, - { url = "https://files.pythonhosted.org/packages/80/a5/3a92893e7399a691bad7664d977cb5e7c81cf666c81f89ea76ba2bff483d/pandas-2.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:ac942bfd0aca577bef61f2bc8da8147c4ef6879965ef883d8e8d5d2dc3e744b8", size = 10987601, upload-time = "2025-07-07T19:19:09.589Z" }, - { url = "https://files.pythonhosted.org/packages/32/ed/ff0a67a2c5505e1854e6715586ac6693dd860fbf52ef9f81edee200266e7/pandas-2.3.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9026bd4a80108fac2239294a15ef9003c4ee191a0f64b90f170b40cfb7cf2d22", size = 11531393, upload-time = "2025-07-07T19:19:12.245Z" }, - { url = "https://files.pythonhosted.org/packages/c7/db/d8f24a7cc9fb0972adab0cc80b6817e8bef888cfd0024eeb5a21c0bb5c4a/pandas-2.3.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6de8547d4fdb12421e2d047a2c446c623ff4c11f47fddb6b9169eb98ffba485a", size = 10668750, upload-time = "2025-07-07T19:19:14.612Z" }, - { url = "https://files.pythonhosted.org/packages/0f/b0/80f6ec783313f1e2356b28b4fd8d2148c378370045da918c73145e6aab50/pandas-2.3.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:782647ddc63c83133b2506912cc6b108140a38a37292102aaa19c81c83db2928", size = 11342004, upload-time = "2025-07-07T19:19:16.857Z" }, - { url = "https://files.pythonhosted.org/packages/e9/e2/20a317688435470872885e7fc8f95109ae9683dec7c50be29b56911515a5/pandas-2.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ba6aff74075311fc88504b1db890187a3cd0f887a5b10f5525f8e2ef55bfdb9", size = 12050869, upload-time = "2025-07-07T19:19:19.265Z" }, - { url = "https://files.pythonhosted.org/packages/55/79/20d746b0a96c67203a5bee5fb4e00ac49c3e8009a39e1f78de264ecc5729/pandas-2.3.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e5635178b387bd2ba4ac040f82bc2ef6e6b500483975c4ebacd34bec945fda12", size = 12750218, upload-time = "2025-07-07T19:19:21.547Z" }, - { url = "https://files.pythonhosted.org/packages/7c/0f/145c8b41e48dbf03dd18fdd7f24f8ba95b8254a97a3379048378f33e7838/pandas-2.3.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6f3bf5ec947526106399a9e1d26d40ee2b259c66422efdf4de63c848492d91bb", size = 13416763, upload-time = "2025-07-07T19:19:23.939Z" }, - { url = 
"https://files.pythonhosted.org/packages/b2/c0/54415af59db5cdd86a3d3bf79863e8cc3fa9ed265f0745254061ac09d5f2/pandas-2.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:1c78cf43c8fde236342a1cb2c34bcff89564a7bfed7e474ed2fffa6aed03a956", size = 10987482, upload-time = "2025-07-07T19:19:42.699Z" }, - { url = "https://files.pythonhosted.org/packages/48/64/2fd2e400073a1230e13b8cd604c9bc95d9e3b962e5d44088ead2e8f0cfec/pandas-2.3.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:8dfc17328e8da77be3cf9f47509e5637ba8f137148ed0e9b5241e1baf526e20a", size = 12029159, upload-time = "2025-07-07T19:19:26.362Z" }, - { url = "https://files.pythonhosted.org/packages/d8/0a/d84fd79b0293b7ef88c760d7dca69828d867c89b6d9bc52d6a27e4d87316/pandas-2.3.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:ec6c851509364c59a5344458ab935e6451b31b818be467eb24b0fe89bd05b6b9", size = 11393287, upload-time = "2025-07-07T19:19:29.157Z" }, - { url = "https://files.pythonhosted.org/packages/50/ae/ff885d2b6e88f3c7520bb74ba319268b42f05d7e583b5dded9837da2723f/pandas-2.3.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:911580460fc4884d9b05254b38a6bfadddfcc6aaef856fb5859e7ca202e45275", size = 11309381, upload-time = "2025-07-07T19:19:31.436Z" }, - { url = "https://files.pythonhosted.org/packages/85/86/1fa345fc17caf5d7780d2699985c03dbe186c68fee00b526813939062bb0/pandas-2.3.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f4d6feeba91744872a600e6edbbd5b033005b431d5ae8379abee5bcfa479fab", size = 11883998, upload-time = "2025-07-07T19:19:34.267Z" }, - { url = "https://files.pythonhosted.org/packages/81/aa/e58541a49b5e6310d89474333e994ee57fea97c8aaa8fc7f00b873059bbf/pandas-2.3.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:fe37e757f462d31a9cd7580236a82f353f5713a80e059a29753cf938c6775d96", size = 12704705, upload-time = "2025-07-07T19:19:36.856Z" }, - { url = "https://files.pythonhosted.org/packages/d5/f9/07086f5b0f2a19872554abeea7658200824f5835c58a106fa8f2ae96a46c/pandas-2.3.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5db9637dbc24b631ff3707269ae4559bce4b7fd75c1c4d7e13f40edc42df4444", size = 13189044, upload-time = "2025-07-07T19:19:39.999Z" }, - { url = "https://files.pythonhosted.org/packages/6e/21/ecf2df680982616459409b09962a8c2065330c7151dc6538069f3b634acf/pandas-2.3.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4645f770f98d656f11c69e81aeb21c6fca076a44bed3dcbb9396a4311bc7f6d8", size = 11567275, upload-time = "2025-07-07T19:19:45.152Z" }, - { url = "https://files.pythonhosted.org/packages/1e/1a/dcb50e44b75419e96b276c9fb023b0f147b3c411be1cd517492aa2a184d4/pandas-2.3.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:342e59589cc454aaff7484d75b816a433350b3d7964d7847327edda4d532a2e3", size = 10811488, upload-time = "2025-07-07T19:19:47.797Z" }, - { url = "https://files.pythonhosted.org/packages/2d/55/66cd2b679f6a27398380eac7574bc24746128f74626a3c02b978ea00e5ce/pandas-2.3.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d12f618d80379fde6af007f65f0c25bd3e40251dbd1636480dfffce2cf1e6da", size = 11763000, upload-time = "2025-07-07T19:19:50.83Z" }, - { url = "https://files.pythonhosted.org/packages/ae/1c/5b9b263c80fd5e231b77df6f78cd7426d1d4ad3a4e858e85b7b3d93d0e9c/pandas-2.3.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd71c47a911da120d72ef173aeac0bf5241423f9bfea57320110a978457e069e", size = 12361395, upload-time = "2025-07-07T19:19:53.714Z" }, - { url = 
"https://files.pythonhosted.org/packages/f7/74/7e817b31413fbb96366ea327d43d1926a9c48c58074e27e094e2839a0e36/pandas-2.3.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:09e3b1587f0f3b0913e21e8b32c3119174551deb4a4eba4a89bc7377947977e7", size = 13225086, upload-time = "2025-07-07T19:19:56.378Z" }, - { url = "https://files.pythonhosted.org/packages/1f/0f/bc0a44b47eba2f22ae4235719a573d552ef7ad76ed3ea39ae62d554e040b/pandas-2.3.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:2323294c73ed50f612f67e2bf3ae45aea04dce5690778e08a09391897f35ff88", size = 13871698, upload-time = "2025-07-07T19:19:58.854Z" }, - { url = "https://files.pythonhosted.org/packages/fa/cb/6c32f8fadefa4314b740fbe8f74f6a02423bd1549e7c930826df35ac3c1b/pandas-2.3.1-cp39-cp39-win_amd64.whl", hash = "sha256:b4b0de34dc8499c2db34000ef8baad684cfa4cbd836ecee05f323ebfba348c7d", size = 11357186, upload-time = "2025-07-07T19:20:01.475Z" }, -] - -[[package]] -name = "pathspec" -version = "0.12.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ca/bc/f35b8446f4531a7cb215605d100cd88b7ac6f44ab3fc94870c120ab3adbf/pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712", size = 51043, upload-time = "2023-12-10T22:30:45Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191, upload-time = "2023-12-10T22:30:43.14Z" }, -] - -[[package]] -name = "platformdirs" -version = "4.3.6" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/13/fc/128cc9cb8f03208bdbf93d3aa862e16d376844a14f9a0ce5cf4507372de4/platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907", size = 21302, upload-time = "2024-09-17T19:06:50.688Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3c/a6/bc1012356d8ece4d66dd75c4b9fc6c1f6650ddd5991e421177d9f8f671be/platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb", size = 18439, upload-time = "2024-09-17T19:06:49.212Z" }, -] - -[[package]] -name = "platformdirs" -version = "4.3.8" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] -sdist = { url = "https://files.pythonhosted.org/packages/fe/8b/3c73abc9c759ecd3f1f7ceff6685840859e8070c4d947c93fae71f6a0bf2/platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc", size = 21362, upload-time = "2025-05-07T22:47:42.121Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4", size = 18567, upload-time = "2025-05-07T22:47:40.376Z" }, -] - -[[package]] -name = "pluggy" -version = "1.5.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = 
"https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955, upload-time = "2024-04-20T21:34:42.531Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556, upload-time = "2024-04-20T21:34:40.434Z" }, -] - -[[package]] -name = "pluggy" -version = "1.6.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] -sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, -] - -[[package]] -name = "pygments" -version = "2.19.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, -] - -[[package]] -name = "pytest" -version = "8.3.5" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "colorama", marker = "python_full_version < '3.9' and sys_platform == 'win32'" }, - { name = "exceptiongroup", marker = "python_full_version < '3.9'" }, - { name = "iniconfig", marker = "python_full_version < '3.9'" }, - { name = "packaging", marker = "python_full_version < '3.9'" }, - { name = "pluggy", version = "1.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "tomli", marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/ae/3c/c9d525a414d506893f0cd8a8d0de7706446213181570cdbd766691164e40/pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845", size = 1450891, upload-time = "2025-03-02T12:54:54.503Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/30/3d/64ad57c803f1fa1e963a7946b6e0fea4a70df53c1a7fed304586539c2bac/pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820", size = 343634, upload-time = "2025-03-02T12:54:52.069Z" }, -] - -[[package]] -name = "pytest" -version = "8.4.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - 
"python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] -dependencies = [ - { name = "colorama", marker = "python_full_version >= '3.9' and sys_platform == 'win32'" }, - { name = "exceptiongroup", marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, - { name = "iniconfig", marker = "python_full_version >= '3.9'" }, - { name = "packaging", marker = "python_full_version >= '3.9'" }, - { name = "pluggy", version = "1.6.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "pygments", marker = "python_full_version >= '3.9'" }, - { name = "tomli", marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/08/ba/45911d754e8eba3d5a841a5ce61a65a685ff1798421ac054f85aa8747dfb/pytest-8.4.1.tar.gz", hash = "sha256:7c67fd69174877359ed9371ec3af8a3d2b04741818c51e5e99cc1742251fa93c", size = 1517714, upload-time = "2025-06-18T05:48:06.109Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/29/16/c8a903f4c4dffe7a12843191437d7cd8e32751d5de349d45d3fe69544e87/pytest-8.4.1-py3-none-any.whl", hash = "sha256:539c70ba6fcead8e78eebbf1115e8b589e7565830d7d006a8723f19ac8a0afb7", size = 365474, upload-time = "2025-06-18T05:48:03.955Z" }, -] - -[[package]] -name = "pytest-asyncio" -version = "0.24.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "pytest", version = "8.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/52/6d/c6cf50ce320cf8611df7a1254d86233b3df7cc07f9b5f5cbcb82e08aa534/pytest_asyncio-0.24.0.tar.gz", hash = "sha256:d081d828e576d85f875399194281e92bf8a68d60d72d1a2faf2feddb6c46b276", size = 49855, upload-time = "2024-08-22T08:03:18.145Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/96/31/6607dab48616902f76885dfcf62c08d929796fc3b2d2318faf9fd54dbed9/pytest_asyncio-0.24.0-py3-none-any.whl", hash = "sha256:a811296ed596b69bf0b6f3dc40f83bcaf341b155a269052d82efa2b25ac7037b", size = 18024, upload-time = "2024-08-22T08:03:15.536Z" }, -] - -[[package]] -name = "pytest-asyncio" -version = "1.1.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] -dependencies = [ - { name = "backports-asyncio-runner", marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, - { name = "pytest", version = "8.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "typing-extensions", version = "4.14.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/4e/51/f8794af39eeb870e87a8c8068642fc07bce0c854d6865d7dd0f2a9d338c2/pytest_asyncio-1.1.0.tar.gz", hash = "sha256:796aa822981e01b68c12e4827b8697108f7205020f24b5793b3c41555dab68ea", size = 46652, upload-time = "2025-07-16T04:29:26.393Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c7/9d/bf86eddabf8c6c9cb1ea9a869d6873b46f105a5d292d3a6f7071f5b07935/pytest_asyncio-1.1.0-py3-none-any.whl", hash = 
"sha256:5fe2d69607b0bd75c656d1211f969cadba035030156745ee09e7d71740e58ecf", size = 15157, upload-time = "2025-07-16T04:29:24.929Z" }, -] - -[[package]] -name = "python-dateutil" -version = "2.9.0.post0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "six" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, -] - -[[package]] -name = "pytz" -version = "2025.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = "2025-03-25T02:25:00.538Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" }, -] - -[[package]] -name = "requests" -version = "2.32.4" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "certifi" }, - { name = "charset-normalizer" }, - { name = "idna" }, - { name = "urllib3", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "urllib3", version = "2.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/e1/0a/929373653770d8a0d7ea76c37de6e41f11eb07559b103b1c02cafb3f7cf8/requests-2.32.4.tar.gz", hash = "sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422", size = 135258, upload-time = "2025-06-09T16:43:07.34Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c", size = 64847, upload-time = "2025-06-09T16:43:05.728Z" }, -] - -[[package]] -name = "six" -version = "1.17.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, -] - -[[package]] -name = "tomli" -version = "2.2.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/18/87/302344fed471e44a87289cf4967697d07e532f2421fdaf868a303cbae4ff/tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff", size = 17175, upload-time = "2024-11-27T22:38:36.873Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/43/ca/75707e6efa2b37c77dadb324ae7d9571cb424e61ea73fad7c56c2d14527f/tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249", size = 131077, upload-time = "2024-11-27T22:37:54.956Z" }, - { url = "https://files.pythonhosted.org/packages/c7/16/51ae563a8615d472fdbffc43a3f3d46588c264ac4f024f63f01283becfbb/tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6", size = 123429, upload-time = "2024-11-27T22:37:56.698Z" }, - { url = "https://files.pythonhosted.org/packages/f1/dd/4f6cd1e7b160041db83c694abc78e100473c15d54620083dbd5aae7b990e/tomli-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a", size = 226067, upload-time = "2024-11-27T22:37:57.63Z" }, - { url = "https://files.pythonhosted.org/packages/a9/6b/c54ede5dc70d648cc6361eaf429304b02f2871a345bbdd51e993d6cdf550/tomli-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee", size = 236030, upload-time = "2024-11-27T22:37:59.344Z" }, - { url = "https://files.pythonhosted.org/packages/1f/47/999514fa49cfaf7a92c805a86c3c43f4215621855d151b61c602abb38091/tomli-2.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e", size = 240898, upload-time = "2024-11-27T22:38:00.429Z" }, - { url = "https://files.pythonhosted.org/packages/73/41/0a01279a7ae09ee1573b423318e7934674ce06eb33f50936655071d81a24/tomli-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4", size = 229894, upload-time = "2024-11-27T22:38:02.094Z" }, - { url = "https://files.pythonhosted.org/packages/55/18/5d8bc5b0a0362311ce4d18830a5d28943667599a60d20118074ea1b01bb7/tomli-2.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106", size = 245319, upload-time = "2024-11-27T22:38:03.206Z" }, - { url = "https://files.pythonhosted.org/packages/92/a3/7ade0576d17f3cdf5ff44d61390d4b3febb8a9fc2b480c75c47ea048c646/tomli-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8", size = 238273, upload-time = "2024-11-27T22:38:04.217Z" }, - { url = "https://files.pythonhosted.org/packages/72/6f/fa64ef058ac1446a1e51110c375339b3ec6be245af9d14c87c4a6412dd32/tomli-2.2.1-cp311-cp311-win32.whl", hash = "sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff", size = 98310, upload-time = "2024-11-27T22:38:05.908Z" }, - { url = "https://files.pythonhosted.org/packages/6a/1c/4a2dcde4a51b81be3530565e92eda625d94dafb46dbeb15069df4caffc34/tomli-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b", size = 108309, upload-time = "2024-11-27T22:38:06.812Z" }, - { url = 
"https://files.pythonhosted.org/packages/52/e1/f8af4c2fcde17500422858155aeb0d7e93477a0d59a98e56cbfe75070fd0/tomli-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea", size = 132762, upload-time = "2024-11-27T22:38:07.731Z" }, - { url = "https://files.pythonhosted.org/packages/03/b8/152c68bb84fc00396b83e7bbddd5ec0bd3dd409db4195e2a9b3e398ad2e3/tomli-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8", size = 123453, upload-time = "2024-11-27T22:38:09.384Z" }, - { url = "https://files.pythonhosted.org/packages/c8/d6/fc9267af9166f79ac528ff7e8c55c8181ded34eb4b0e93daa767b8841573/tomli-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192", size = 233486, upload-time = "2024-11-27T22:38:10.329Z" }, - { url = "https://files.pythonhosted.org/packages/5c/51/51c3f2884d7bab89af25f678447ea7d297b53b5a3b5730a7cb2ef6069f07/tomli-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222", size = 242349, upload-time = "2024-11-27T22:38:11.443Z" }, - { url = "https://files.pythonhosted.org/packages/ab/df/bfa89627d13a5cc22402e441e8a931ef2108403db390ff3345c05253935e/tomli-2.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77", size = 252159, upload-time = "2024-11-27T22:38:13.099Z" }, - { url = "https://files.pythonhosted.org/packages/9e/6e/fa2b916dced65763a5168c6ccb91066f7639bdc88b48adda990db10c8c0b/tomli-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6", size = 237243, upload-time = "2024-11-27T22:38:14.766Z" }, - { url = "https://files.pythonhosted.org/packages/b4/04/885d3b1f650e1153cbb93a6a9782c58a972b94ea4483ae4ac5cedd5e4a09/tomli-2.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd", size = 259645, upload-time = "2024-11-27T22:38:15.843Z" }, - { url = "https://files.pythonhosted.org/packages/9c/de/6b432d66e986e501586da298e28ebeefd3edc2c780f3ad73d22566034239/tomli-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e", size = 244584, upload-time = "2024-11-27T22:38:17.645Z" }, - { url = "https://files.pythonhosted.org/packages/1c/9a/47c0449b98e6e7d1be6cbac02f93dd79003234ddc4aaab6ba07a9a7482e2/tomli-2.2.1-cp312-cp312-win32.whl", hash = "sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98", size = 98875, upload-time = "2024-11-27T22:38:19.159Z" }, - { url = "https://files.pythonhosted.org/packages/ef/60/9b9638f081c6f1261e2688bd487625cd1e660d0a85bd469e91d8db969734/tomli-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4", size = 109418, upload-time = "2024-11-27T22:38:20.064Z" }, - { url = "https://files.pythonhosted.org/packages/04/90/2ee5f2e0362cb8a0b6499dc44f4d7d48f8fff06d28ba46e6f1eaa61a1388/tomli-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7", size = 132708, upload-time = "2024-11-27T22:38:21.659Z" }, - { url = 
"https://files.pythonhosted.org/packages/c0/ec/46b4108816de6b385141f082ba99e315501ccd0a2ea23db4a100dd3990ea/tomli-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c", size = 123582, upload-time = "2024-11-27T22:38:22.693Z" }, - { url = "https://files.pythonhosted.org/packages/a0/bd/b470466d0137b37b68d24556c38a0cc819e8febe392d5b199dcd7f578365/tomli-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13", size = 232543, upload-time = "2024-11-27T22:38:24.367Z" }, - { url = "https://files.pythonhosted.org/packages/d9/e5/82e80ff3b751373f7cead2815bcbe2d51c895b3c990686741a8e56ec42ab/tomli-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281", size = 241691, upload-time = "2024-11-27T22:38:26.081Z" }, - { url = "https://files.pythonhosted.org/packages/05/7e/2a110bc2713557d6a1bfb06af23dd01e7dde52b6ee7dadc589868f9abfac/tomli-2.2.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272", size = 251170, upload-time = "2024-11-27T22:38:27.921Z" }, - { url = "https://files.pythonhosted.org/packages/64/7b/22d713946efe00e0adbcdfd6d1aa119ae03fd0b60ebed51ebb3fa9f5a2e5/tomli-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140", size = 236530, upload-time = "2024-11-27T22:38:29.591Z" }, - { url = "https://files.pythonhosted.org/packages/38/31/3a76f67da4b0cf37b742ca76beaf819dca0ebef26d78fc794a576e08accf/tomli-2.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2", size = 258666, upload-time = "2024-11-27T22:38:30.639Z" }, - { url = "https://files.pythonhosted.org/packages/07/10/5af1293da642aded87e8a988753945d0cf7e00a9452d3911dd3bb354c9e2/tomli-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744", size = 243954, upload-time = "2024-11-27T22:38:31.702Z" }, - { url = "https://files.pythonhosted.org/packages/5b/b9/1ed31d167be802da0fc95020d04cd27b7d7065cc6fbefdd2f9186f60d7bd/tomli-2.2.1-cp313-cp313-win32.whl", hash = "sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec", size = 98724, upload-time = "2024-11-27T22:38:32.837Z" }, - { url = "https://files.pythonhosted.org/packages/c7/32/b0963458706accd9afcfeb867c0f9175a741bf7b19cd424230714d722198/tomli-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69", size = 109383, upload-time = "2024-11-27T22:38:34.455Z" }, - { url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257, upload-time = "2024-11-27T22:38:35.385Z" }, -] - -[[package]] -name = "typing-extensions" -version = "4.13.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/f6/37/23083fcd6e35492953e8d2aaaa68b860eb422b34627b13f2ce3eb6106061/typing_extensions-4.13.2.tar.gz", hash = "sha256:e6c81219bd689f51865d9e372991c540bda33a0379d5573cddb9a3a23f7caaef", 
size = 106967, upload-time = "2025-04-10T14:19:05.416Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8b/54/b1ae86c0973cc6f0210b53d508ca3641fb6d0c56823f288d108bc7ab3cc8/typing_extensions-4.13.2-py3-none-any.whl", hash = "sha256:a439e7c04b49fec3e5d3e2beaa21755cadbbdc391694e28ccdd36ca4a1408f8c", size = 45806, upload-time = "2025-04-10T14:19:03.967Z" }, -] - -[[package]] -name = "typing-extensions" -version = "4.14.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] -sdist = { url = "https://files.pythonhosted.org/packages/98/5a/da40306b885cc8c09109dc2e1abd358d5684b1425678151cdaed4731c822/typing_extensions-4.14.1.tar.gz", hash = "sha256:38b39f4aeeab64884ce9f74c94263ef78f3c22467c8724005483154c26648d36", size = 107673, upload-time = "2025-07-04T13:28:34.16Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl", hash = "sha256:d1e1e3b58374dc93031d6eda2420a48ea44a36c2b4766a4fdeb3710755731d76", size = 43906, upload-time = "2025-07-04T13:28:32.743Z" }, -] - -[[package]] -name = "tzdata" -version = "2025.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be760d75c2c42e2780dc0873fe382da3e98a2e1e48361e5/tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9", size = 196380, upload-time = "2025-03-23T13:54:43.652Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" }, -] - -[[package]] -name = "urllib3" -version = "2.2.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/ed/63/22ba4ebfe7430b76388e7cd448d5478814d3032121827c12a2cc287e2260/urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9", size = 300677, upload-time = "2024-09-12T10:52:18.401Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ce/d9/5f4c13cecde62396b0d3fe530a50ccea91e7dfc1ccf0e09c228841bb5ba8/urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac", size = 126338, upload-time = "2024-09-12T10:52:16.589Z" }, -] - -[[package]] -name = "urllib3" -version = "2.5.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] -sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185, upload-time = "2025-06-18T14:07:41.644Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 
129795, upload-time = "2025-06-18T14:07:40.39Z" }, -] - -[[package]] -name = "websockets" -version = "13.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/e2/73/9223dbc7be3dcaf2a7bbf756c351ec8da04b1fa573edaf545b95f6b0c7fd/websockets-13.1.tar.gz", hash = "sha256:a3b3366087c1bc0a2795111edcadddb8b3b59509d5db5d7ea3fdd69f954a8878", size = 158549, upload-time = "2024-09-21T17:34:21.54Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0a/94/d15dbfc6a5eb636dbc754303fba18208f2e88cf97e733e1d64fb9cb5c89e/websockets-13.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f48c749857f8fb598fb890a75f540e3221d0976ed0bf879cf3c7eef34151acee", size = 157815, upload-time = "2024-09-21T17:32:27.107Z" }, - { url = "https://files.pythonhosted.org/packages/30/02/c04af33f4663945a26f5e8cf561eb140c35452b50af47a83c3fbcfe62ae1/websockets-13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c7e72ce6bda6fb9409cc1e8164dd41d7c91466fb599eb047cfda72fe758a34a7", size = 155466, upload-time = "2024-09-21T17:32:28.428Z" }, - { url = "https://files.pythonhosted.org/packages/35/e8/719f08d12303ea643655e52d9e9851b2dadbb1991d4926d9ce8862efa2f5/websockets-13.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f779498eeec470295a2b1a5d97aa1bc9814ecd25e1eb637bd9d1c73a327387f6", size = 155716, upload-time = "2024-09-21T17:32:29.905Z" }, - { url = "https://files.pythonhosted.org/packages/91/e1/14963ae0252a8925f7434065d25dcd4701d5e281a0b4b460a3b5963d2594/websockets-13.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4676df3fe46956fbb0437d8800cd5f2b6d41143b6e7e842e60554398432cf29b", size = 164806, upload-time = "2024-09-21T17:32:31.384Z" }, - { url = "https://files.pythonhosted.org/packages/ec/fa/ab28441bae5e682a0f7ddf3d03440c0c352f930da419301f4a717f675ef3/websockets-13.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a7affedeb43a70351bb811dadf49493c9cfd1ed94c9c70095fd177e9cc1541fa", size = 163810, upload-time = "2024-09-21T17:32:32.384Z" }, - { url = "https://files.pythonhosted.org/packages/44/77/dea187bd9d16d4b91566a2832be31f99a40d0f5bfa55eeb638eb2c3bc33d/websockets-13.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1971e62d2caa443e57588e1d82d15f663b29ff9dfe7446d9964a4b6f12c1e700", size = 164125, upload-time = "2024-09-21T17:32:33.398Z" }, - { url = "https://files.pythonhosted.org/packages/cf/d9/3af14544e83f1437eb684b399e6ba0fa769438e869bf5d83d74bc197fae8/websockets-13.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5f2e75431f8dc4a47f31565a6e1355fb4f2ecaa99d6b89737527ea917066e26c", size = 164532, upload-time = "2024-09-21T17:32:35.109Z" }, - { url = "https://files.pythonhosted.org/packages/1c/8a/6d332eabe7d59dfefe4b8ba6f46c8c5fabb15b71c8a8bc3d2b65de19a7b6/websockets-13.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:58cf7e75dbf7e566088b07e36ea2e3e2bd5676e22216e4cad108d4df4a7402a0", size = 163948, upload-time = "2024-09-21T17:32:36.214Z" }, - { url = "https://files.pythonhosted.org/packages/1a/91/a0aeadbaf3017467a1ee03f8fb67accdae233fe2d5ad4b038c0a84e357b0/websockets-13.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c90d6dec6be2c7d03378a574de87af9b1efea77d0c52a8301dd831ece938452f", size = 163898, upload-time = "2024-09-21T17:32:37.277Z" }, - { url = 
"https://files.pythonhosted.org/packages/71/31/a90fb47c63e0ae605be914b0b969d7c6e6ffe2038cd744798e4b3fbce53b/websockets-13.1-cp310-cp310-win32.whl", hash = "sha256:730f42125ccb14602f455155084f978bd9e8e57e89b569b4d7f0f0c17a448ffe", size = 158706, upload-time = "2024-09-21T17:32:38.755Z" }, - { url = "https://files.pythonhosted.org/packages/93/ca/9540a9ba80da04dc7f36d790c30cae4252589dbd52ccdc92e75b0be22437/websockets-13.1-cp310-cp310-win_amd64.whl", hash = "sha256:5993260f483d05a9737073be197371940c01b257cc45ae3f1d5d7adb371b266a", size = 159141, upload-time = "2024-09-21T17:32:40.495Z" }, - { url = "https://files.pythonhosted.org/packages/b2/f0/cf0b8a30d86b49e267ac84addbebbc7a48a6e7bb7c19db80f62411452311/websockets-13.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:61fc0dfcda609cda0fc9fe7977694c0c59cf9d749fbb17f4e9483929e3c48a19", size = 157813, upload-time = "2024-09-21T17:32:42.188Z" }, - { url = "https://files.pythonhosted.org/packages/bf/e7/22285852502e33071a8cf0ac814f8988480ec6db4754e067b8b9d0e92498/websockets-13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ceec59f59d092c5007e815def4ebb80c2de330e9588e101cf8bd94c143ec78a5", size = 155469, upload-time = "2024-09-21T17:32:43.858Z" }, - { url = "https://files.pythonhosted.org/packages/68/d4/c8c7c1e5b40ee03c5cc235955b0fb1ec90e7e37685a5f69229ad4708dcde/websockets-13.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c1dca61c6db1166c48b95198c0b7d9c990b30c756fc2923cc66f68d17dc558fd", size = 155717, upload-time = "2024-09-21T17:32:44.914Z" }, - { url = "https://files.pythonhosted.org/packages/c9/e4/c50999b9b848b1332b07c7fd8886179ac395cb766fda62725d1539e7bc6c/websockets-13.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:308e20f22c2c77f3f39caca508e765f8725020b84aa963474e18c59accbf4c02", size = 165379, upload-time = "2024-09-21T17:32:45.933Z" }, - { url = "https://files.pythonhosted.org/packages/bc/49/4a4ad8c072f18fd79ab127650e47b160571aacfc30b110ee305ba25fffc9/websockets-13.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62d516c325e6540e8a57b94abefc3459d7dab8ce52ac75c96cad5549e187e3a7", size = 164376, upload-time = "2024-09-21T17:32:46.987Z" }, - { url = "https://files.pythonhosted.org/packages/af/9b/8c06d425a1d5a74fd764dd793edd02be18cf6fc3b1ccd1f29244ba132dc0/websockets-13.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87c6e35319b46b99e168eb98472d6c7d8634ee37750d7693656dc766395df096", size = 164753, upload-time = "2024-09-21T17:32:48.046Z" }, - { url = "https://files.pythonhosted.org/packages/d5/5b/0acb5815095ff800b579ffc38b13ab1b915b317915023748812d24e0c1ac/websockets-13.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5f9fee94ebafbc3117c30be1844ed01a3b177bb6e39088bc6b2fa1dc15572084", size = 165051, upload-time = "2024-09-21T17:32:49.271Z" }, - { url = "https://files.pythonhosted.org/packages/30/93/c3891c20114eacb1af09dedfcc620c65c397f4fd80a7009cd12d9457f7f5/websockets-13.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:7c1e90228c2f5cdde263253fa5db63e6653f1c00e7ec64108065a0b9713fa1b3", size = 164489, upload-time = "2024-09-21T17:32:50.392Z" }, - { url = "https://files.pythonhosted.org/packages/28/09/af9e19885539759efa2e2cd29b8b3f9eecef7ecefea40d46612f12138b36/websockets-13.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6548f29b0e401eea2b967b2fdc1c7c7b5ebb3eeb470ed23a54cd45ef078a0db9", size = 164438, upload-time = "2024-09-21T17:32:52.223Z" }, - 
{ url = "https://files.pythonhosted.org/packages/b6/08/6f38b8e625b3d93de731f1d248cc1493327f16cb45b9645b3e791782cff0/websockets-13.1-cp311-cp311-win32.whl", hash = "sha256:c11d4d16e133f6df8916cc5b7e3e96ee4c44c936717d684a94f48f82edb7c92f", size = 158710, upload-time = "2024-09-21T17:32:53.244Z" }, - { url = "https://files.pythonhosted.org/packages/fb/39/ec8832ecb9bb04a8d318149005ed8cee0ba4e0205835da99e0aa497a091f/websockets-13.1-cp311-cp311-win_amd64.whl", hash = "sha256:d04f13a1d75cb2b8382bdc16ae6fa58c97337253826dfe136195b7f89f661557", size = 159137, upload-time = "2024-09-21T17:32:54.721Z" }, - { url = "https://files.pythonhosted.org/packages/df/46/c426282f543b3c0296cf964aa5a7bb17e984f58dde23460c3d39b3148fcf/websockets-13.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:9d75baf00138f80b48f1eac72ad1535aac0b6461265a0bcad391fc5aba875cfc", size = 157821, upload-time = "2024-09-21T17:32:56.442Z" }, - { url = "https://files.pythonhosted.org/packages/aa/85/22529867010baac258da7c45848f9415e6cf37fef00a43856627806ffd04/websockets-13.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9b6f347deb3dcfbfde1c20baa21c2ac0751afaa73e64e5b693bb2b848efeaa49", size = 155480, upload-time = "2024-09-21T17:32:57.698Z" }, - { url = "https://files.pythonhosted.org/packages/29/2c/bdb339bfbde0119a6e84af43ebf6275278698a2241c2719afc0d8b0bdbf2/websockets-13.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:de58647e3f9c42f13f90ac7e5f58900c80a39019848c5547bc691693098ae1bd", size = 155715, upload-time = "2024-09-21T17:32:59.429Z" }, - { url = "https://files.pythonhosted.org/packages/9f/d0/8612029ea04c5c22bf7af2fd3d63876c4eaeef9b97e86c11972a43aa0e6c/websockets-13.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1b54689e38d1279a51d11e3467dd2f3a50f5f2e879012ce8f2d6943f00e83f0", size = 165647, upload-time = "2024-09-21T17:33:00.495Z" }, - { url = "https://files.pythonhosted.org/packages/56/04/1681ed516fa19ca9083f26d3f3a302257e0911ba75009533ed60fbb7b8d1/websockets-13.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cf1781ef73c073e6b0f90af841aaf98501f975d306bbf6221683dd594ccc52b6", size = 164592, upload-time = "2024-09-21T17:33:02.223Z" }, - { url = "https://files.pythonhosted.org/packages/38/6f/a96417a49c0ed132bb6087e8e39a37db851c70974f5c724a4b2a70066996/websockets-13.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d23b88b9388ed85c6faf0e74d8dec4f4d3baf3ecf20a65a47b836d56260d4b9", size = 165012, upload-time = "2024-09-21T17:33:03.288Z" }, - { url = "https://files.pythonhosted.org/packages/40/8b/fccf294919a1b37d190e86042e1a907b8f66cff2b61e9befdbce03783e25/websockets-13.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3c78383585f47ccb0fcf186dcb8a43f5438bd7d8f47d69e0b56f71bf431a0a68", size = 165311, upload-time = "2024-09-21T17:33:04.728Z" }, - { url = "https://files.pythonhosted.org/packages/c1/61/f8615cf7ce5fe538476ab6b4defff52beb7262ff8a73d5ef386322d9761d/websockets-13.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:d6d300f8ec35c24025ceb9b9019ae9040c1ab2f01cddc2bcc0b518af31c75c14", size = 164692, upload-time = "2024-09-21T17:33:05.829Z" }, - { url = "https://files.pythonhosted.org/packages/5c/f1/a29dd6046d3a722d26f182b783a7997d25298873a14028c4760347974ea3/websockets-13.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a9dcaf8b0cc72a392760bb8755922c03e17a5a54e08cca58e8b74f6902b433cf", size = 164686, upload-time = 
"2024-09-21T17:33:06.823Z" }, - { url = "https://files.pythonhosted.org/packages/0f/99/ab1cdb282f7e595391226f03f9b498f52109d25a2ba03832e21614967dfa/websockets-13.1-cp312-cp312-win32.whl", hash = "sha256:2f85cf4f2a1ba8f602298a853cec8526c2ca42a9a4b947ec236eaedb8f2dc80c", size = 158712, upload-time = "2024-09-21T17:33:07.877Z" }, - { url = "https://files.pythonhosted.org/packages/46/93/e19160db48b5581feac8468330aa11b7292880a94a37d7030478596cc14e/websockets-13.1-cp312-cp312-win_amd64.whl", hash = "sha256:38377f8b0cdeee97c552d20cf1865695fcd56aba155ad1b4ca8779a5b6ef4ac3", size = 159145, upload-time = "2024-09-21T17:33:09.202Z" }, - { url = "https://files.pythonhosted.org/packages/51/20/2b99ca918e1cbd33c53db2cace5f0c0cd8296fc77558e1908799c712e1cd/websockets-13.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a9ab1e71d3d2e54a0aa646ab6d4eebfaa5f416fe78dfe4da2839525dc5d765c6", size = 157828, upload-time = "2024-09-21T17:33:10.987Z" }, - { url = "https://files.pythonhosted.org/packages/b8/47/0932a71d3d9c0e9483174f60713c84cee58d62839a143f21a2bcdbd2d205/websockets-13.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b9d7439d7fab4dce00570bb906875734df13d9faa4b48e261c440a5fec6d9708", size = 155487, upload-time = "2024-09-21T17:33:12.153Z" }, - { url = "https://files.pythonhosted.org/packages/a9/60/f1711eb59ac7a6c5e98e5637fef5302f45b6f76a2c9d64fd83bbb341377a/websockets-13.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:327b74e915cf13c5931334c61e1a41040e365d380f812513a255aa804b183418", size = 155721, upload-time = "2024-09-21T17:33:13.909Z" }, - { url = "https://files.pythonhosted.org/packages/6a/e6/ba9a8db7f9d9b0e5f829cf626ff32677f39824968317223605a6b419d445/websockets-13.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:325b1ccdbf5e5725fdcb1b0e9ad4d2545056479d0eee392c291c1bf76206435a", size = 165609, upload-time = "2024-09-21T17:33:14.967Z" }, - { url = "https://files.pythonhosted.org/packages/c1/22/4ec80f1b9c27a0aebd84ccd857252eda8418ab9681eb571b37ca4c5e1305/websockets-13.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:346bee67a65f189e0e33f520f253d5147ab76ae42493804319b5716e46dddf0f", size = 164556, upload-time = "2024-09-21T17:33:17.113Z" }, - { url = "https://files.pythonhosted.org/packages/27/ac/35f423cb6bb15600438db80755609d27eda36d4c0b3c9d745ea12766c45e/websockets-13.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:91a0fa841646320ec0d3accdff5b757b06e2e5c86ba32af2e0815c96c7a603c5", size = 164993, upload-time = "2024-09-21T17:33:18.168Z" }, - { url = "https://files.pythonhosted.org/packages/31/4e/98db4fd267f8be9e52e86b6ee4e9aa7c42b83452ea0ea0672f176224b977/websockets-13.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:18503d2c5f3943e93819238bf20df71982d193f73dcecd26c94514f417f6b135", size = 165360, upload-time = "2024-09-21T17:33:19.233Z" }, - { url = "https://files.pythonhosted.org/packages/3f/15/3f0de7cda70ffc94b7e7024544072bc5b26e2c1eb36545291abb755d8cdb/websockets-13.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:a9cd1af7e18e5221d2878378fbc287a14cd527fdd5939ed56a18df8a31136bb2", size = 164745, upload-time = "2024-09-21T17:33:20.361Z" }, - { url = "https://files.pythonhosted.org/packages/a1/6e/66b6b756aebbd680b934c8bdbb6dcb9ce45aad72cde5f8a7208dbb00dd36/websockets-13.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:70c5be9f416aa72aab7a2a76c90ae0a4fe2755c1816c153c1a2bcc3333ce4ce6", size = 164732, 
upload-time = "2024-09-21T17:33:23.103Z" }, - { url = "https://files.pythonhosted.org/packages/35/c6/12e3aab52c11aeb289e3dbbc05929e7a9d90d7a9173958477d3ef4f8ce2d/websockets-13.1-cp313-cp313-win32.whl", hash = "sha256:624459daabeb310d3815b276c1adef475b3e6804abaf2d9d2c061c319f7f187d", size = 158709, upload-time = "2024-09-21T17:33:24.196Z" }, - { url = "https://files.pythonhosted.org/packages/41/d8/63d6194aae711d7263df4498200c690a9c39fb437ede10f3e157a6343e0d/websockets-13.1-cp313-cp313-win_amd64.whl", hash = "sha256:c518e84bb59c2baae725accd355c8dc517b4a3ed8db88b4bc93c78dae2974bf2", size = 159144, upload-time = "2024-09-21T17:33:25.96Z" }, - { url = "https://files.pythonhosted.org/packages/83/69/59872420e5bce60db166d6fba39ee24c719d339fb0ae48cb2ce580129882/websockets-13.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c7934fd0e920e70468e676fe7f1b7261c1efa0d6c037c6722278ca0228ad9d0d", size = 157811, upload-time = "2024-09-21T17:33:27.379Z" }, - { url = "https://files.pythonhosted.org/packages/bb/f7/0610032e0d3981758fdd6ee7c68cc02ebf668a762c5178d3d91748228849/websockets-13.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:149e622dc48c10ccc3d2760e5f36753db9cacf3ad7bc7bbbfd7d9c819e286f23", size = 155471, upload-time = "2024-09-21T17:33:28.473Z" }, - { url = "https://files.pythonhosted.org/packages/55/2f/c43173a72ea395263a427a36d25bce2675f41c809424466a13c61a9a2d61/websockets-13.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a569eb1b05d72f9bce2ebd28a1ce2054311b66677fcd46cf36204ad23acead8c", size = 155713, upload-time = "2024-09-21T17:33:29.795Z" }, - { url = "https://files.pythonhosted.org/packages/92/7e/8fa930c6426a56c47910792717787640329e4a0e37cdfda20cf89da67126/websockets-13.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:95df24ca1e1bd93bbca51d94dd049a984609687cb2fb08a7f2c56ac84e9816ea", size = 164995, upload-time = "2024-09-21T17:33:30.802Z" }, - { url = "https://files.pythonhosted.org/packages/27/29/50ed4c68a3f606565a2db4b13948ae7b6f6c53aa9f8f258d92be6698d276/websockets-13.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d8dbb1bf0c0a4ae8b40bdc9be7f644e2f3fb4e8a9aca7145bfa510d4a374eeb7", size = 164057, upload-time = "2024-09-21T17:33:31.862Z" }, - { url = "https://files.pythonhosted.org/packages/3c/0e/60da63b1c53c47f389f79312b3356cb305600ffad1274d7ec473128d4e6b/websockets-13.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:035233b7531fb92a76beefcbf479504db8c72eb3bff41da55aecce3a0f729e54", size = 164340, upload-time = "2024-09-21T17:33:33.022Z" }, - { url = "https://files.pythonhosted.org/packages/20/ef/d87c5fc0aa7fafad1d584b6459ddfe062edf0d0dd64800a02e67e5de048b/websockets-13.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:e4450fc83a3df53dec45922b576e91e94f5578d06436871dce3a6be38e40f5db", size = 164222, upload-time = "2024-09-21T17:33:34.423Z" }, - { url = "https://files.pythonhosted.org/packages/f2/c4/7916e1f6b5252d3dcb9121b67d7fdbb2d9bf5067a6d8c88885ba27a9e69c/websockets-13.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:463e1c6ec853202dd3657f156123d6b4dad0c546ea2e2e38be2b3f7c5b8e7295", size = 163647, upload-time = "2024-09-21T17:33:35.841Z" }, - { url = "https://files.pythonhosted.org/packages/de/df/2ebebb807f10993c35c10cbd3628a7944b66bd5fb6632a561f8666f3a68e/websockets-13.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:6d6855bbe70119872c05107e38fbc7f96b1d8cb047d95c2c50869a46c65a8e96", size = 163590, upload-time 
= "2024-09-21T17:33:37.61Z" }, - { url = "https://files.pythonhosted.org/packages/b5/82/d48911f56bb993c11099a1ff1d4041d9d1481d50271100e8ee62bc28f365/websockets-13.1-cp38-cp38-win32.whl", hash = "sha256:204e5107f43095012b00f1451374693267adbb832d29966a01ecc4ce1db26faf", size = 158701, upload-time = "2024-09-21T17:33:38.695Z" }, - { url = "https://files.pythonhosted.org/packages/8b/b3/945aacb21fc89ad150403cbaa974c9e846f098f16d9f39a3dd6094f9beb1/websockets-13.1-cp38-cp38-win_amd64.whl", hash = "sha256:485307243237328c022bc908b90e4457d0daa8b5cf4b3723fd3c4a8012fce4c6", size = 159146, upload-time = "2024-09-21T17:33:39.855Z" }, - { url = "https://files.pythonhosted.org/packages/61/26/5f7a7fb03efedb4f90ed61968338bfe7c389863b0ceda239b94ae61c5ae4/websockets-13.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9b37c184f8b976f0c0a231a5f3d6efe10807d41ccbe4488df8c74174805eea7d", size = 157810, upload-time = "2024-09-21T17:33:40.94Z" }, - { url = "https://files.pythonhosted.org/packages/0e/d4/9b4814a07dffaa7a79d71b4944d10836f9adbd527a113f6675734ef3abed/websockets-13.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:163e7277e1a0bd9fb3c8842a71661ad19c6aa7bb3d6678dc7f89b17fbcc4aeb7", size = 155467, upload-time = "2024-09-21T17:33:42.075Z" }, - { url = "https://files.pythonhosted.org/packages/1a/1a/2abdc7ce3b56429ae39d6bfb48d8c791f5a26bbcb6f44aabcf71ffc3fda2/websockets-13.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4b889dbd1342820cc210ba44307cf75ae5f2f96226c0038094455a96e64fb07a", size = 155714, upload-time = "2024-09-21T17:33:43.128Z" }, - { url = "https://files.pythonhosted.org/packages/2a/98/189d7cf232753a719b2726ec55e7922522632248d5d830adf078e3f612be/websockets-13.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:586a356928692c1fed0eca68b4d1c2cbbd1ca2acf2ac7e7ebd3b9052582deefa", size = 164587, upload-time = "2024-09-21T17:33:44.27Z" }, - { url = "https://files.pythonhosted.org/packages/a5/2b/fb77cedf3f9f55ef8605238c801eef6b9a5269b01a396875a86896aea3a6/websockets-13.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7bd6abf1e070a6b72bfeb71049d6ad286852e285f146682bf30d0296f5fbadfa", size = 163588, upload-time = "2024-09-21T17:33:45.38Z" }, - { url = "https://files.pythonhosted.org/packages/a3/b7/070481b83d2d5ac0f19233d9f364294e224e6478b0762f07fa7f060e0619/websockets-13.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2aad13a200e5934f5a6767492fb07151e1de1d6079c003ab31e1823733ae79", size = 163894, upload-time = "2024-09-21T17:33:46.651Z" }, - { url = "https://files.pythonhosted.org/packages/eb/be/d6e1cff7d441cfe5eafaacc5935463e5f14c8b1c0d39cb8afde82709b55a/websockets-13.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:df01aea34b6e9e33572c35cd16bae5a47785e7d5c8cb2b54b2acdb9678315a17", size = 164315, upload-time = "2024-09-21T17:33:48.432Z" }, - { url = "https://files.pythonhosted.org/packages/8b/5e/ffa234473e46ab2d3f9fd9858163d5db3ecea1439e4cb52966d78906424b/websockets-13.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e54affdeb21026329fb0744ad187cf812f7d3c2aa702a5edb562b325191fcab6", size = 163714, upload-time = "2024-09-21T17:33:49.548Z" }, - { url = "https://files.pythonhosted.org/packages/cc/92/cea9eb9d381ca57065a5eb4ec2ce7a291bd96c85ce742915c3c9ffc1069f/websockets-13.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:9ef8aa8bdbac47f4968a5d66462a2a0935d044bf35c0e5a8af152d58516dbeb5", size = 163673, upload-time = 
"2024-09-21T17:33:51.056Z" }, - { url = "https://files.pythonhosted.org/packages/a4/f1/279104fff239bfd04c12b1e58afea227d72fd1acf431e3eed3f6ac2c96d2/websockets-13.1-cp39-cp39-win32.whl", hash = "sha256:deeb929efe52bed518f6eb2ddc00cc496366a14c726005726ad62c2dd9017a3c", size = 158702, upload-time = "2024-09-21T17:33:52.584Z" }, - { url = "https://files.pythonhosted.org/packages/25/0b/b87370ff141375c41f7dd67941728e4b3682ebb45882591516c792a2ebee/websockets-13.1-cp39-cp39-win_amd64.whl", hash = "sha256:7c65ffa900e7cc958cd088b9a9157a8141c991f8c53d11087e6fb7277a03f81d", size = 159146, upload-time = "2024-09-21T17:33:53.781Z" }, - { url = "https://files.pythonhosted.org/packages/2d/75/6da22cb3ad5b8c606963f9a5f9f88656256fecc29d420b4b2bf9e0c7d56f/websockets-13.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5dd6da9bec02735931fccec99d97c29f47cc61f644264eb995ad6c0c27667238", size = 155499, upload-time = "2024-09-21T17:33:54.917Z" }, - { url = "https://files.pythonhosted.org/packages/c0/ba/22833d58629088fcb2ccccedfae725ac0bbcd713319629e97125b52ac681/websockets-13.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:2510c09d8e8df777177ee3d40cd35450dc169a81e747455cc4197e63f7e7bfe5", size = 155737, upload-time = "2024-09-21T17:33:56.052Z" }, - { url = "https://files.pythonhosted.org/packages/95/54/61684fe22bdb831e9e1843d972adadf359cf04ab8613285282baea6a24bb/websockets-13.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1c3cf67185543730888b20682fb186fc8d0fa6f07ccc3ef4390831ab4b388d9", size = 157095, upload-time = "2024-09-21T17:33:57.21Z" }, - { url = "https://files.pythonhosted.org/packages/fc/f5/6652fb82440813822022a9301a30afde85e5ff3fb2aebb77f34aabe2b4e8/websockets-13.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bcc03c8b72267e97b49149e4863d57c2d77f13fae12066622dc78fe322490fe6", size = 156701, upload-time = "2024-09-21T17:33:59.061Z" }, - { url = "https://files.pythonhosted.org/packages/67/33/ae82a7b860fa8a08aba68818bdf7ff61f04598aa5ab96df4cd5a3e418ca4/websockets-13.1-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:004280a140f220c812e65f36944a9ca92d766b6cc4560be652a0a3883a79ed8a", size = 156654, upload-time = "2024-09-21T17:34:00.944Z" }, - { url = "https://files.pythonhosted.org/packages/63/0b/a1b528d36934f833e20f6da1032b995bf093d55cb416b9f2266f229fb237/websockets-13.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:e2620453c075abeb0daa949a292e19f56de518988e079c36478bacf9546ced23", size = 159192, upload-time = "2024-09-21T17:34:02.656Z" }, - { url = "https://files.pythonhosted.org/packages/5e/a1/5ae6d0ef2e61e2b77b3b4678949a634756544186620a728799acdf5c3482/websockets-13.1-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:9156c45750b37337f7b0b00e6248991a047be4aa44554c9886fe6bdd605aab3b", size = 155433, upload-time = "2024-09-21T17:34:03.88Z" }, - { url = "https://files.pythonhosted.org/packages/0d/2f/addd33f85600d210a445f817ff0d79d2b4d0eb6f3c95b9f35531ebf8f57c/websockets-13.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:80c421e07973a89fbdd93e6f2003c17d20b69010458d3a8e37fb47874bd67d51", size = 155733, upload-time = "2024-09-21T17:34:05.173Z" }, - { url = "https://files.pythonhosted.org/packages/74/0b/f8ec74ac3b14a983289a1b42dc2c518a0e2030b486d0549d4f51ca11e7c9/websockets-13.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:82d0ba76371769d6a4e56f7e83bb8e81846d17a6190971e38b5de108bde9b0d7", size = 157093, upload-time = "2024-09-21T17:34:06.398Z" }, - { url = "https://files.pythonhosted.org/packages/ad/4c/aa5cc2f718ee4d797411202f332c8281f04c42d15f55b02f7713320f7a03/websockets-13.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e9875a0143f07d74dc5e1ded1c4581f0d9f7ab86c78994e2ed9e95050073c94d", size = 156701, upload-time = "2024-09-21T17:34:07.582Z" }, - { url = "https://files.pythonhosted.org/packages/1f/4b/7c5b2d0d0f0f1a54f27c60107cf1f201bee1f88c5508f87408b470d09a9c/websockets-13.1-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a11e38ad8922c7961447f35c7b17bffa15de4d17c70abd07bfbe12d6faa3e027", size = 156648, upload-time = "2024-09-21T17:34:08.734Z" }, - { url = "https://files.pythonhosted.org/packages/f3/63/35f3fb073884a9fd1ce5413b2dcdf0d9198b03dac6274197111259cbde06/websockets-13.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:4059f790b6ae8768471cddb65d3c4fe4792b0ab48e154c9f0a04cefaabcd5978", size = 159188, upload-time = "2024-09-21T17:34:10.018Z" }, - { url = "https://files.pythonhosted.org/packages/59/fd/e4bf9a7159dba6a16c59ae9e670e3e8ad9dcb6791bc0599eb86de32d50a9/websockets-13.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:25c35bf84bf7c7369d247f0b8cfa157f989862c49104c5cf85cb5436a641d93e", size = 155499, upload-time = "2024-09-21T17:34:11.3Z" }, - { url = "https://files.pythonhosted.org/packages/74/42/d48ede93cfe0c343f3b552af08efc60778d234989227b16882eed1b8b189/websockets-13.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:83f91d8a9bb404b8c2c41a707ac7f7f75b9442a0a876df295de27251a856ad09", size = 155731, upload-time = "2024-09-21T17:34:13.151Z" }, - { url = "https://files.pythonhosted.org/packages/f6/f2/2ef6bff1c90a43b80622a17c0852b48c09d3954ab169266ad7b15e17cdcb/websockets-13.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7a43cfdcddd07f4ca2b1afb459824dd3c6d53a51410636a2c7fc97b9a8cf4842", size = 157093, upload-time = "2024-09-21T17:34:14.52Z" }, - { url = "https://files.pythonhosted.org/packages/d1/14/6f20bbaeeb350f155edf599aad949c554216f90e5d4ae7373d1f2e5931fb/websockets-13.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:48a2ef1381632a2f0cb4efeff34efa97901c9fbc118e01951ad7cfc10601a9bb", size = 156701, upload-time = "2024-09-21T17:34:15.692Z" }, - { url = "https://files.pythonhosted.org/packages/c7/86/38279dfefecd035e22b79c38722d4f87c4b6196f1556b7a631d0a3095ca7/websockets-13.1-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:459bf774c754c35dbb487360b12c5727adab887f1622b8aed5755880a21c4a20", size = 156649, upload-time = "2024-09-21T17:34:17.335Z" }, - { url = "https://files.pythonhosted.org/packages/f6/c5/12c6859a2eaa8c53f59a647617a27f1835a226cd7106c601067c53251d98/websockets-13.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:95858ca14a9f6fa8413d29e0a585b31b278388aa775b8a81fa24830123874678", size = 159187, upload-time = "2024-09-21T17:34:18.538Z" }, - { url = "https://files.pythonhosted.org/packages/56/27/96a5cd2626d11c8280656c6c71d8ab50fe006490ef9971ccd154e0c42cd2/websockets-13.1-py3-none-any.whl", hash = "sha256:a9a396a6ad26130cdae92ae10c36af09d9bfe6cafe69670fd3b6da9b07b4044f", size = 152134, upload-time = "2024-09-21T17:34:19.904Z" }, -] - -[[package]] -name = 
"websockets" -version = "15.0.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] -sdist = { url = "https://files.pythonhosted.org/packages/21/e6/26d09fab466b7ca9c7737474c52be4f76a40301b08362eb2dbc19dcc16c1/websockets-15.0.1.tar.gz", hash = "sha256:82544de02076bafba038ce055ee6412d68da13ab47f0c60cab827346de828dee", size = 177016, upload-time = "2025-03-05T20:03:41.606Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1e/da/6462a9f510c0c49837bbc9345aca92d767a56c1fb2939e1579df1e1cdcf7/websockets-15.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d63efaa0cd96cf0c5fe4d581521d9fa87744540d4bc999ae6e08595a1014b45b", size = 175423, upload-time = "2025-03-05T20:01:35.363Z" }, - { url = "https://files.pythonhosted.org/packages/1c/9f/9d11c1a4eb046a9e106483b9ff69bce7ac880443f00e5ce64261b47b07e7/websockets-15.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ac60e3b188ec7574cb761b08d50fcedf9d77f1530352db4eef1707fe9dee7205", size = 173080, upload-time = "2025-03-05T20:01:37.304Z" }, - { url = "https://files.pythonhosted.org/packages/d5/4f/b462242432d93ea45f297b6179c7333dd0402b855a912a04e7fc61c0d71f/websockets-15.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5756779642579d902eed757b21b0164cd6fe338506a8083eb58af5c372e39d9a", size = 173329, upload-time = "2025-03-05T20:01:39.668Z" }, - { url = "https://files.pythonhosted.org/packages/6e/0c/6afa1f4644d7ed50284ac59cc70ef8abd44ccf7d45850d989ea7310538d0/websockets-15.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fdfe3e2a29e4db3659dbd5bbf04560cea53dd9610273917799f1cde46aa725e", size = 182312, upload-time = "2025-03-05T20:01:41.815Z" }, - { url = "https://files.pythonhosted.org/packages/dd/d4/ffc8bd1350b229ca7a4db2a3e1c482cf87cea1baccd0ef3e72bc720caeec/websockets-15.0.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c2529b320eb9e35af0fa3016c187dffb84a3ecc572bcee7c3ce302bfeba52bf", size = 181319, upload-time = "2025-03-05T20:01:43.967Z" }, - { url = "https://files.pythonhosted.org/packages/97/3a/5323a6bb94917af13bbb34009fac01e55c51dfde354f63692bf2533ffbc2/websockets-15.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac1e5c9054fe23226fb11e05a6e630837f074174c4c2f0fe442996112a6de4fb", size = 181631, upload-time = "2025-03-05T20:01:46.104Z" }, - { url = "https://files.pythonhosted.org/packages/a6/cc/1aeb0f7cee59ef065724041bb7ed667b6ab1eeffe5141696cccec2687b66/websockets-15.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5df592cd503496351d6dc14f7cdad49f268d8e618f80dce0cd5a36b93c3fc08d", size = 182016, upload-time = "2025-03-05T20:01:47.603Z" }, - { url = "https://files.pythonhosted.org/packages/79/f9/c86f8f7af208e4161a7f7e02774e9d0a81c632ae76db2ff22549e1718a51/websockets-15.0.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0a34631031a8f05657e8e90903e656959234f3a04552259458aac0b0f9ae6fd9", size = 181426, upload-time = "2025-03-05T20:01:48.949Z" }, - { url = "https://files.pythonhosted.org/packages/c7/b9/828b0bc6753db905b91df6ae477c0b14a141090df64fb17f8a9d7e3516cf/websockets-15.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3d00075aa65772e7ce9e990cab3ff1de702aa09be3940d1dc88d5abf1ab8a09c", size = 181360, upload-time = "2025-03-05T20:01:50.938Z" }, - { url = 
"https://files.pythonhosted.org/packages/89/fb/250f5533ec468ba6327055b7d98b9df056fb1ce623b8b6aaafb30b55d02e/websockets-15.0.1-cp310-cp310-win32.whl", hash = "sha256:1234d4ef35db82f5446dca8e35a7da7964d02c127b095e172e54397fb6a6c256", size = 176388, upload-time = "2025-03-05T20:01:52.213Z" }, - { url = "https://files.pythonhosted.org/packages/1c/46/aca7082012768bb98e5608f01658ff3ac8437e563eca41cf068bd5849a5e/websockets-15.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:39c1fec2c11dc8d89bba6b2bf1556af381611a173ac2b511cf7231622058af41", size = 176830, upload-time = "2025-03-05T20:01:53.922Z" }, - { url = "https://files.pythonhosted.org/packages/9f/32/18fcd5919c293a398db67443acd33fde142f283853076049824fc58e6f75/websockets-15.0.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:823c248b690b2fd9303ba00c4f66cd5e2d8c3ba4aa968b2779be9532a4dad431", size = 175423, upload-time = "2025-03-05T20:01:56.276Z" }, - { url = "https://files.pythonhosted.org/packages/76/70/ba1ad96b07869275ef42e2ce21f07a5b0148936688c2baf7e4a1f60d5058/websockets-15.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678999709e68425ae2593acf2e3ebcbcf2e69885a5ee78f9eb80e6e371f1bf57", size = 173082, upload-time = "2025-03-05T20:01:57.563Z" }, - { url = "https://files.pythonhosted.org/packages/86/f2/10b55821dd40eb696ce4704a87d57774696f9451108cff0d2824c97e0f97/websockets-15.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d50fd1ee42388dcfb2b3676132c78116490976f1300da28eb629272d5d93e905", size = 173330, upload-time = "2025-03-05T20:01:59.063Z" }, - { url = "https://files.pythonhosted.org/packages/a5/90/1c37ae8b8a113d3daf1065222b6af61cc44102da95388ac0018fcb7d93d9/websockets-15.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d99e5546bf73dbad5bf3547174cd6cb8ba7273062a23808ffea025ecb1cf8562", size = 182878, upload-time = "2025-03-05T20:02:00.305Z" }, - { url = "https://files.pythonhosted.org/packages/8e/8d/96e8e288b2a41dffafb78e8904ea7367ee4f891dafc2ab8d87e2124cb3d3/websockets-15.0.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:66dd88c918e3287efc22409d426c8f729688d89a0c587c88971a0faa2c2f3792", size = 181883, upload-time = "2025-03-05T20:02:03.148Z" }, - { url = "https://files.pythonhosted.org/packages/93/1f/5d6dbf551766308f6f50f8baf8e9860be6182911e8106da7a7f73785f4c4/websockets-15.0.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8dd8327c795b3e3f219760fa603dcae1dcc148172290a8ab15158cf85a953413", size = 182252, upload-time = "2025-03-05T20:02:05.29Z" }, - { url = "https://files.pythonhosted.org/packages/d4/78/2d4fed9123e6620cbf1706c0de8a1632e1a28e7774d94346d7de1bba2ca3/websockets-15.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8fdc51055e6ff4adeb88d58a11042ec9a5eae317a0a53d12c062c8a8865909e8", size = 182521, upload-time = "2025-03-05T20:02:07.458Z" }, - { url = "https://files.pythonhosted.org/packages/e7/3b/66d4c1b444dd1a9823c4a81f50231b921bab54eee2f69e70319b4e21f1ca/websockets-15.0.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:693f0192126df6c2327cce3baa7c06f2a117575e32ab2308f7f8216c29d9e2e3", size = 181958, upload-time = "2025-03-05T20:02:09.842Z" }, - { url = "https://files.pythonhosted.org/packages/08/ff/e9eed2ee5fed6f76fdd6032ca5cd38c57ca9661430bb3d5fb2872dc8703c/websockets-15.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:54479983bd5fb469c38f2f5c7e3a24f9a4e70594cd68cd1fa6b9340dadaff7cf", size = 181918, upload-time = 
"2025-03-05T20:02:11.968Z" }, - { url = "https://files.pythonhosted.org/packages/d8/75/994634a49b7e12532be6a42103597b71098fd25900f7437d6055ed39930a/websockets-15.0.1-cp311-cp311-win32.whl", hash = "sha256:16b6c1b3e57799b9d38427dda63edcbe4926352c47cf88588c0be4ace18dac85", size = 176388, upload-time = "2025-03-05T20:02:13.32Z" }, - { url = "https://files.pythonhosted.org/packages/98/93/e36c73f78400a65f5e236cd376713c34182e6663f6889cd45a4a04d8f203/websockets-15.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:27ccee0071a0e75d22cb35849b1db43f2ecd3e161041ac1ee9d2352ddf72f065", size = 176828, upload-time = "2025-03-05T20:02:14.585Z" }, - { url = "https://files.pythonhosted.org/packages/51/6b/4545a0d843594f5d0771e86463606a3988b5a09ca5123136f8a76580dd63/websockets-15.0.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:3e90baa811a5d73f3ca0bcbf32064d663ed81318ab225ee4f427ad4e26e5aff3", size = 175437, upload-time = "2025-03-05T20:02:16.706Z" }, - { url = "https://files.pythonhosted.org/packages/f4/71/809a0f5f6a06522af902e0f2ea2757f71ead94610010cf570ab5c98e99ed/websockets-15.0.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:592f1a9fe869c778694f0aa806ba0374e97648ab57936f092fd9d87f8bc03665", size = 173096, upload-time = "2025-03-05T20:02:18.832Z" }, - { url = "https://files.pythonhosted.org/packages/3d/69/1a681dd6f02180916f116894181eab8b2e25b31e484c5d0eae637ec01f7c/websockets-15.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0701bc3cfcb9164d04a14b149fd74be7347a530ad3bbf15ab2c678a2cd3dd9a2", size = 173332, upload-time = "2025-03-05T20:02:20.187Z" }, - { url = "https://files.pythonhosted.org/packages/a6/02/0073b3952f5bce97eafbb35757f8d0d54812b6174ed8dd952aa08429bcc3/websockets-15.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8b56bdcdb4505c8078cb6c7157d9811a85790f2f2b3632c7d1462ab5783d215", size = 183152, upload-time = "2025-03-05T20:02:22.286Z" }, - { url = "https://files.pythonhosted.org/packages/74/45/c205c8480eafd114b428284840da0b1be9ffd0e4f87338dc95dc6ff961a1/websockets-15.0.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0af68c55afbd5f07986df82831c7bff04846928ea8d1fd7f30052638788bc9b5", size = 182096, upload-time = "2025-03-05T20:02:24.368Z" }, - { url = "https://files.pythonhosted.org/packages/14/8f/aa61f528fba38578ec553c145857a181384c72b98156f858ca5c8e82d9d3/websockets-15.0.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dee438fed052b52e4f98f76c5790513235efaa1ef7f3f2192c392cd7c91b65", size = 182523, upload-time = "2025-03-05T20:02:25.669Z" }, - { url = "https://files.pythonhosted.org/packages/ec/6d/0267396610add5bc0d0d3e77f546d4cd287200804fe02323797de77dbce9/websockets-15.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d5f6b181bb38171a8ad1d6aa58a67a6aa9d4b38d0f8c5f496b9e42561dfc62fe", size = 182790, upload-time = "2025-03-05T20:02:26.99Z" }, - { url = "https://files.pythonhosted.org/packages/02/05/c68c5adbf679cf610ae2f74a9b871ae84564462955d991178f95a1ddb7dd/websockets-15.0.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5d54b09eba2bada6011aea5375542a157637b91029687eb4fdb2dab11059c1b4", size = 182165, upload-time = "2025-03-05T20:02:30.291Z" }, - { url = "https://files.pythonhosted.org/packages/29/93/bb672df7b2f5faac89761cb5fa34f5cec45a4026c383a4b5761c6cea5c16/websockets-15.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3be571a8b5afed347da347bfcf27ba12b069d9d7f42cb8c7028b5e98bbb12597", 
size = 182160, upload-time = "2025-03-05T20:02:31.634Z" }, - { url = "https://files.pythonhosted.org/packages/ff/83/de1f7709376dc3ca9b7eeb4b9a07b4526b14876b6d372a4dc62312bebee0/websockets-15.0.1-cp312-cp312-win32.whl", hash = "sha256:c338ffa0520bdb12fbc527265235639fb76e7bc7faafbb93f6ba80d9c06578a9", size = 176395, upload-time = "2025-03-05T20:02:33.017Z" }, - { url = "https://files.pythonhosted.org/packages/7d/71/abf2ebc3bbfa40f391ce1428c7168fb20582d0ff57019b69ea20fa698043/websockets-15.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:fcd5cf9e305d7b8338754470cf69cf81f420459dbae8a3b40cee57417f4614a7", size = 176841, upload-time = "2025-03-05T20:02:34.498Z" }, - { url = "https://files.pythonhosted.org/packages/cb/9f/51f0cf64471a9d2b4d0fc6c534f323b664e7095640c34562f5182e5a7195/websockets-15.0.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ee443ef070bb3b6ed74514f5efaa37a252af57c90eb33b956d35c8e9c10a1931", size = 175440, upload-time = "2025-03-05T20:02:36.695Z" }, - { url = "https://files.pythonhosted.org/packages/8a/05/aa116ec9943c718905997412c5989f7ed671bc0188ee2ba89520e8765d7b/websockets-15.0.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5a939de6b7b4e18ca683218320fc67ea886038265fd1ed30173f5ce3f8e85675", size = 173098, upload-time = "2025-03-05T20:02:37.985Z" }, - { url = "https://files.pythonhosted.org/packages/ff/0b/33cef55ff24f2d92924923c99926dcce78e7bd922d649467f0eda8368923/websockets-15.0.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:746ee8dba912cd6fc889a8147168991d50ed70447bf18bcda7039f7d2e3d9151", size = 173329, upload-time = "2025-03-05T20:02:39.298Z" }, - { url = "https://files.pythonhosted.org/packages/31/1d/063b25dcc01faa8fada1469bdf769de3768b7044eac9d41f734fd7b6ad6d/websockets-15.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:595b6c3969023ecf9041b2936ac3827e4623bfa3ccf007575f04c5a6aa318c22", size = 183111, upload-time = "2025-03-05T20:02:40.595Z" }, - { url = "https://files.pythonhosted.org/packages/93/53/9a87ee494a51bf63e4ec9241c1ccc4f7c2f45fff85d5bde2ff74fcb68b9e/websockets-15.0.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c714d2fc58b5ca3e285461a4cc0c9a66bd0e24c5da9911e30158286c9b5be7f", size = 182054, upload-time = "2025-03-05T20:02:41.926Z" }, - { url = "https://files.pythonhosted.org/packages/ff/b2/83a6ddf56cdcbad4e3d841fcc55d6ba7d19aeb89c50f24dd7e859ec0805f/websockets-15.0.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f3c1e2ab208db911594ae5b4f79addeb3501604a165019dd221c0bdcabe4db8", size = 182496, upload-time = "2025-03-05T20:02:43.304Z" }, - { url = "https://files.pythonhosted.org/packages/98/41/e7038944ed0abf34c45aa4635ba28136f06052e08fc2168520bb8b25149f/websockets-15.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:229cf1d3ca6c1804400b0a9790dc66528e08a6a1feec0d5040e8b9eb14422375", size = 182829, upload-time = "2025-03-05T20:02:48.812Z" }, - { url = "https://files.pythonhosted.org/packages/e0/17/de15b6158680c7623c6ef0db361da965ab25d813ae54fcfeae2e5b9ef910/websockets-15.0.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:756c56e867a90fb00177d530dca4b097dd753cde348448a1012ed6c5131f8b7d", size = 182217, upload-time = "2025-03-05T20:02:50.14Z" }, - { url = "https://files.pythonhosted.org/packages/33/2b/1f168cb6041853eef0362fb9554c3824367c5560cbdaad89ac40f8c2edfc/websockets-15.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:558d023b3df0bffe50a04e710bc87742de35060580a293c2a984299ed83bc4e4", size = 182195, upload-time = "2025-03-05T20:02:51.561Z" }, - { url = "https://files.pythonhosted.org/packages/86/eb/20b6cdf273913d0ad05a6a14aed4b9a85591c18a987a3d47f20fa13dcc47/websockets-15.0.1-cp313-cp313-win32.whl", hash = "sha256:ba9e56e8ceeeedb2e080147ba85ffcd5cd0711b89576b83784d8605a7df455fa", size = 176393, upload-time = "2025-03-05T20:02:53.814Z" }, - { url = "https://files.pythonhosted.org/packages/1b/6c/c65773d6cab416a64d191d6ee8a8b1c68a09970ea6909d16965d26bfed1e/websockets-15.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:e09473f095a819042ecb2ab9465aee615bd9c2028e4ef7d933600a8401c79561", size = 176837, upload-time = "2025-03-05T20:02:55.237Z" }, - { url = "https://files.pythonhosted.org/packages/36/db/3fff0bcbe339a6fa6a3b9e3fbc2bfb321ec2f4cd233692272c5a8d6cf801/websockets-15.0.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5f4c04ead5aed67c8a1a20491d54cdfba5884507a48dd798ecaf13c74c4489f5", size = 175424, upload-time = "2025-03-05T20:02:56.505Z" }, - { url = "https://files.pythonhosted.org/packages/46/e6/519054c2f477def4165b0ec060ad664ed174e140b0d1cbb9fafa4a54f6db/websockets-15.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:abdc0c6c8c648b4805c5eacd131910d2a7f6455dfd3becab248ef108e89ab16a", size = 173077, upload-time = "2025-03-05T20:02:58.37Z" }, - { url = "https://files.pythonhosted.org/packages/1a/21/c0712e382df64c93a0d16449ecbf87b647163485ca1cc3f6cbadb36d2b03/websockets-15.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a625e06551975f4b7ea7102bc43895b90742746797e2e14b70ed61c43a90f09b", size = 173324, upload-time = "2025-03-05T20:02:59.773Z" }, - { url = "https://files.pythonhosted.org/packages/1c/cb/51ba82e59b3a664df54beed8ad95517c1b4dc1a913730e7a7db778f21291/websockets-15.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d591f8de75824cbb7acad4e05d2d710484f15f29d4a915092675ad3456f11770", size = 182094, upload-time = "2025-03-05T20:03:01.827Z" }, - { url = "https://files.pythonhosted.org/packages/fb/0f/bf3788c03fec679bcdaef787518dbe60d12fe5615a544a6d4cf82f045193/websockets-15.0.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:47819cea040f31d670cc8d324bb6435c6f133b8c7a19ec3d61634e62f8d8f9eb", size = 181094, upload-time = "2025-03-05T20:03:03.123Z" }, - { url = "https://files.pythonhosted.org/packages/5e/da/9fb8c21edbc719b66763a571afbaf206cb6d3736d28255a46fc2fe20f902/websockets-15.0.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac017dd64572e5c3bd01939121e4d16cf30e5d7e110a119399cf3133b63ad054", size = 181397, upload-time = "2025-03-05T20:03:04.443Z" }, - { url = "https://files.pythonhosted.org/packages/2e/65/65f379525a2719e91d9d90c38fe8b8bc62bd3c702ac651b7278609b696c4/websockets-15.0.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4a9fac8e469d04ce6c25bb2610dc535235bd4aa14996b4e6dbebf5e007eba5ee", size = 181794, upload-time = "2025-03-05T20:03:06.708Z" }, - { url = "https://files.pythonhosted.org/packages/d9/26/31ac2d08f8e9304d81a1a7ed2851c0300f636019a57cbaa91342015c72cc/websockets-15.0.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:363c6f671b761efcb30608d24925a382497c12c506b51661883c3e22337265ed", size = 181194, upload-time = "2025-03-05T20:03:08.844Z" }, - { url = 
"https://files.pythonhosted.org/packages/98/72/1090de20d6c91994cd4b357c3f75a4f25ee231b63e03adea89671cc12a3f/websockets-15.0.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:2034693ad3097d5355bfdacfffcbd3ef5694f9718ab7f29c29689a9eae841880", size = 181164, upload-time = "2025-03-05T20:03:10.242Z" }, - { url = "https://files.pythonhosted.org/packages/2d/37/098f2e1c103ae8ed79b0e77f08d83b0ec0b241cf4b7f2f10edd0126472e1/websockets-15.0.1-cp39-cp39-win32.whl", hash = "sha256:3b1ac0d3e594bf121308112697cf4b32be538fb1444468fb0a6ae4feebc83411", size = 176381, upload-time = "2025-03-05T20:03:12.77Z" }, - { url = "https://files.pythonhosted.org/packages/75/8b/a32978a3ab42cebb2ebdd5b05df0696a09f4d436ce69def11893afa301f0/websockets-15.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:b7643a03db5c95c799b89b31c036d5f27eeb4d259c798e878d6937d71832b1e4", size = 176841, upload-time = "2025-03-05T20:03:14.367Z" }, - { url = "https://files.pythonhosted.org/packages/02/9e/d40f779fa16f74d3468357197af8d6ad07e7c5a27ea1ca74ceb38986f77a/websockets-15.0.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0c9e74d766f2818bb95f84c25be4dea09841ac0f734d1966f415e4edfc4ef1c3", size = 173109, upload-time = "2025-03-05T20:03:17.769Z" }, - { url = "https://files.pythonhosted.org/packages/bc/cd/5b887b8585a593073fd92f7c23ecd3985cd2c3175025a91b0d69b0551372/websockets-15.0.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1009ee0c7739c08a0cd59de430d6de452a55e42d6b522de7aa15e6f67db0b8e1", size = 173343, upload-time = "2025-03-05T20:03:19.094Z" }, - { url = "https://files.pythonhosted.org/packages/fe/ae/d34f7556890341e900a95acf4886833646306269f899d58ad62f588bf410/websockets-15.0.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76d1f20b1c7a2fa82367e04982e708723ba0e7b8d43aa643d3dcd404d74f1475", size = 174599, upload-time = "2025-03-05T20:03:21.1Z" }, - { url = "https://files.pythonhosted.org/packages/71/e6/5fd43993a87db364ec60fc1d608273a1a465c0caba69176dd160e197ce42/websockets-15.0.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f29d80eb9a9263b8d109135351caf568cc3f80b9928bccde535c235de55c22d9", size = 174207, upload-time = "2025-03-05T20:03:23.221Z" }, - { url = "https://files.pythonhosted.org/packages/2b/fb/c492d6daa5ec067c2988ac80c61359ace5c4c674c532985ac5a123436cec/websockets-15.0.1-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b359ed09954d7c18bbc1680f380c7301f92c60bf924171629c5db97febb12f04", size = 174155, upload-time = "2025-03-05T20:03:25.321Z" }, - { url = "https://files.pythonhosted.org/packages/68/a1/dcb68430b1d00b698ae7a7e0194433bce4f07ded185f0ee5fb21e2a2e91e/websockets-15.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:cad21560da69f4ce7658ca2cb83138fb4cf695a2ba3e475e0559e05991aa8122", size = 176884, upload-time = "2025-03-05T20:03:27.934Z" }, - { url = "https://files.pythonhosted.org/packages/b7/48/4b67623bac4d79beb3a6bb27b803ba75c1bdedc06bd827e465803690a4b2/websockets-15.0.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7f493881579c90fc262d9cdbaa05a6b54b3811c2f300766748db79f098db9940", size = 173106, upload-time = "2025-03-05T20:03:29.404Z" }, - { url = "https://files.pythonhosted.org/packages/ed/f0/adb07514a49fe5728192764e04295be78859e4a537ab8fcc518a3dbb3281/websockets-15.0.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:47b099e1f4fbc95b701b6e85768e1fcdaf1630f3cbe4765fa216596f12310e2e", size = 
173339, upload-time = "2025-03-05T20:03:30.755Z" }, - { url = "https://files.pythonhosted.org/packages/87/28/bd23c6344b18fb43df40d0700f6d3fffcd7cef14a6995b4f976978b52e62/websockets-15.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:67f2b6de947f8c757db2db9c71527933ad0019737ec374a8a6be9a956786aaf9", size = 174597, upload-time = "2025-03-05T20:03:32.247Z" }, - { url = "https://files.pythonhosted.org/packages/6d/79/ca288495863d0f23a60f546f0905ae8f3ed467ad87f8b6aceb65f4c013e4/websockets-15.0.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d08eb4c2b7d6c41da6ca0600c077e93f5adcfd979cd777d747e9ee624556da4b", size = 174205, upload-time = "2025-03-05T20:03:33.731Z" }, - { url = "https://files.pythonhosted.org/packages/04/e4/120ff3180b0872b1fe6637f6f995bcb009fb5c87d597c1fc21456f50c848/websockets-15.0.1-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b826973a4a2ae47ba357e4e82fa44a463b8f168e1ca775ac64521442b19e87f", size = 174150, upload-time = "2025-03-05T20:03:35.757Z" }, - { url = "https://files.pythonhosted.org/packages/cb/c3/30e2f9c539b8da8b1d76f64012f3b19253271a63413b2d3adb94b143407f/websockets-15.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:21c1fa28a6a7e3cbdc171c694398b6df4744613ce9b36b1a498e816787e28123", size = 176877, upload-time = "2025-03-05T20:03:37.199Z" }, - { url = "https://files.pythonhosted.org/packages/fa/a8/5b41e0da817d64113292ab1f8247140aac61cbf6cfd085d6a0fa77f4984f/websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f", size = 169743, upload-time = "2025-03-05T20:03:39.41Z" }, -] - -[[package]] -name = "win32-setctime" -version = "1.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b3/8f/705086c9d734d3b663af0e9bb3d4de6578d08f46b1b101c2442fd9aecaa2/win32_setctime-1.2.0.tar.gz", hash = "sha256:ae1fdf948f5640aae05c511ade119313fb6a30d7eabe25fef9764dca5873c4c0", size = 4867, upload-time = "2024-12-07T15:28:28.314Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083, upload-time = "2024-12-07T15:28:26.465Z" }, -] diff --git a/evals/README.md b/evals/README.md index 8f86cd0..6082a01 100644 --- a/evals/README.md +++ b/evals/README.md @@ -1,328 +1,859 @@ # Evaluation Framework -Modular evaluation framework for testing browser automation agents using LLM-as-a-judge. +Comprehensive evaluation framework for testing browser automation agents using LLM-as-a-judge with visual verification support. 
+ +## Table of Contents + +- [Overview](#overview) +- [Quick Start](#quick-start) +- [Directory Structure](#directory-structure) +- [Running Evaluations](#running-evaluations) +- [YAML Configuration](#yaml-configuration) +- [Judge Types](#judge-types) +- [Visual Verification](#visual-verification) +- [Results and Reports](#results-and-reports) +- [Configuration Reference](#configuration-reference) +- [Troubleshooting](#troubleshooting) + +--- ## Overview This framework provides: -- **Shared Configuration**: Single `config.yml` for all evaluation runners -- **Modular Runner Scripts**: Separate scripts for different evaluation categories -- **LLM Judge**: Uses GPT-4 to assess response quality against criteria +- **Universal Runner**: Single `run.py` script for all evaluation types +- **Visual Verification**: VisionJudge with screenshot analysis for UI tests +- **LLM Judge**: Text-based evaluation using GPT models +- **Modular Architecture**: Shared configuration and library code - **Automatic Reporting**: Timestamped CSV reports with detailed results +- **Screenshot Capture**: Automatic screenshot capture via Chrome DevTools Protocol -## Directory Structure +--- -``` -evals/ -โ”œโ”€โ”€ config.yml # Shared configuration -โ”œโ”€โ”€ data/ # Evaluation definitions (YAML) -โ”‚ โ”œโ”€โ”€ action-agent/ -โ”‚ โ”œโ”€โ”€ research-agent/ -โ”‚ โ”œโ”€โ”€ schema-extractor/ -โ”‚ โ””โ”€โ”€ ... -โ”œโ”€โ”€ lib/ # Shared library code -โ”‚ โ”œโ”€โ”€ config_loader.py -โ”‚ โ”œโ”€โ”€ eval_loader.py -โ”‚ โ”œโ”€โ”€ api_client.py -โ”‚ โ””โ”€โ”€ judge.py -โ”œโ”€โ”€ reports/ # Generated CSV reports -โ”œโ”€โ”€ run_action_agent.py # Action agent runner -โ”œโ”€โ”€ pyproject.toml # Project configuration and dependencies -โ””โ”€โ”€ requirements.txt # Legacy pip requirements (optional) -``` - -## Setup - -### 1. Install uv (if not already installed) +## Quick Start + +### 1. Install Dependencies ```bash -# macOS/Linux -curl -LsSf https://astral.sh/uv/install.sh | sh +cd evals + +# Using uv (recommended) +uv pip install -e . # Or using pip -pip install uv +pip install -r requirements.txt ``` -### 2. Install Dependencies +### 2. Configure Environment ```bash -cd evals +# Copy example environment file +cp .env.example .env -# Install dependencies using uv -uv pip install -e . +# Edit .env and add your API keys +# OPENAI_API_KEY=sk-... +``` + +### 3. Configure Models -# Or use uv sync for development -uv sync +Edit `config.yml` to set your model preferences: + +```yaml +main_model: + provider: "openai" + model_name: "gpt-5-mini" + api_key: "${OPENAI_API_KEY}" + +judge_model: + provider: "openai" + model_name: "gpt-5" + api_key: "${OPENAI_API_KEY}" ``` -**Alternative (using pip):** +### 4. Start Evaluation Server + ```bash -pip install -r requirements.txt +# From project root +make compose-dev ``` -### 3. Configure Environment +Verify server is running at `http://localhost:8080` + +### 5. 
Run Your First Evaluation -You have two options for setting API keys: +```bash +# Simple math test +python3 run.py --path test-simple/math-001.yaml -#### Option A: Using .env file (Recommended) +# With verbose output +python3 run.py --path test-simple/math-001.yaml --verbose + +# UI test with visual verification +python3 run.py --path action-agent/accordion-001.yaml --verbose +``` + +--- + +## Directory Structure + +``` +evals/ +โ”œโ”€โ”€ run.py # Universal evaluation runner +โ”œโ”€โ”€ config.yml # Global configuration +โ”œโ”€โ”€ .env # API keys and secrets +โ”œโ”€โ”€ .env.example # Example environment file +โ”œโ”€โ”€ requirements.txt # Python dependencies +โ”œโ”€โ”€ pyproject.toml # Project configuration +โ”‚ +โ”œโ”€โ”€ data/ # Evaluation definitions (YAML) +โ”‚ โ”œโ”€โ”€ test-simple/ # Simple sanity tests +โ”‚ โ”œโ”€โ”€ action-agent/ # UI interaction tests +โ”‚ โ”œโ”€โ”€ web-task-agent/ # Multi-step web tasks +โ”‚ โ”œโ”€โ”€ research-agent/ # Research and information gathering +โ”‚ โ”œโ”€โ”€ schema-extractor/ # Data extraction tests +โ”‚ โ”œโ”€โ”€ screenshot-verification/ # Visual verification tests +โ”‚ โ””โ”€โ”€ end-to-end/ # Complex multi-step scenarios +โ”‚ +โ”œโ”€โ”€ lib/ # Framework library +โ”‚ โ”œโ”€โ”€ __init__.py # Library exports +โ”‚ โ”œโ”€โ”€ config_loader.py # Configuration management +โ”‚ โ”œโ”€โ”€ eval_loader.py # YAML evaluation loader +โ”‚ โ”œโ”€โ”€ api_client.py # API client for eval-server +โ”‚ โ””โ”€โ”€ judge.py # LLMJudge and VisionJudge +โ”‚ +โ”œโ”€โ”€ screenshots/ # Captured screenshots (auto-generated) +โ””โ”€โ”€ reports/ # CSV evaluation reports (auto-generated) +``` + +--- + +## Running Evaluations + +### Command-Line Interface + +The `run.py` script provides three execution modes: + +#### 1. Run Specific Evaluation ```bash -# Copy the example file -cp .env.example .env +python3 run.py --path +``` -# Edit .env and add your API keys -# The file will be automatically loaded when running evaluations +Examples: +```bash +# Relative to data/ directory +python3 run.py --path test-simple/math-001.yaml +python3 run.py --path action-agent/accordion-001.yaml + +# Absolute path also supported +python3 run.py --path /absolute/path/to/eval.yaml +``` + +#### 2. Run All Evals in a Category + +```bash +python3 run.py --category ``` -Example `.env` file: +Examples: ```bash -OPENAI_API_KEY=sk-your-actual-key-here -GROQ_API_KEY=gsk-your-actual-key-here # Optional +python3 run.py --category action-agent +python3 run.py --category test-simple +python3 run.py --category web-task-agent ``` -#### Option B: Using shell environment variables +#### 3. Run All Evaluations ```bash -export OPENAI_API_KEY="sk-..." # Required for LLM judge -export GROQ_API_KEY="gsk-..." # Optional, if using Groq models +python3 run.py --all ``` -### 4. Configure Models +### Verbose Mode -Edit `config.yml` to set your model preferences: +Add `--verbose` flag for detailed execution information: -```yaml -main_model: - provider: "openai" - model_name: "gpt-4" - api_key: "${OPENAI_API_KEY}" +```bash +python3 run.py --path action-agent/accordion-001.yaml --verbose +``` -judge_model: - provider: "openai" - model_name: "gpt-4" - api_key: "${OPENAI_API_KEY}" +Verbose output includes: +- Input prompt sent to the agent +- Response received from the agent +- Whether VisionJudge is being used +- Judge reasoning and criteria evaluation +- Screenshot paths (if captured) + +Example verbose output: +``` + Input: Click to expand the "Section 2" accordion panel + Response: Done โ€” I expanded "Section 2" for you. 
+ Using Vision Judge with screenshot + Judge Reasoning: The AFTER screenshot shows Section 2 highlighted in blue... + Screenshot: /path/to/screenshots/accordion-001_20251020_170412.png +[1/1] Running: Expand Accordion Section + ID: accordion-001 + Status: PASS + Score: 0.80 + Time: 167222ms ``` -The config supports environment variable substitution using `${VAR_NAME}` syntax. +--- -### 5. Start Evaluation Server +## YAML Configuration -Ensure the evaluation server is running: +### Basic Structure -```bash -# From the project root -make compose-dev -# OR -docker run -d --name kernel-browser-extended ... kernel-browser:extended +Every evaluation YAML file follows this structure: + +```yaml +# Unique identifier +id: "example-001" + +# Human-readable name +name: "Example Test" + +# Description of what this test does +description: "Test description here" + +# Enable/disable the test +enabled: true + +# Target configuration +target: + url: "https://example.com" + wait_for: "networkidle" # or "domcontentloaded", "load" + wait_timeout: 5000 + +# Tool/agent type to use +tool: "action_agent" + +# Timeout for the entire evaluation (milliseconds) +timeout: 60000 + +# Input for the agent (varies by tool) +input: + objective: "Task description" # For action_agent + # OR + message: "Prompt" # For chat + # OR + task: "Multi-step task" # For web_task_agent + +# Validation configuration +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 # Optional + criteria: + - "Criterion 1" + - "Criterion 2" + visual_verification: + enabled: false # Set to true for UI tests + +# Metadata +metadata: + tags: ["tag1", "tag2"] + priority: "high" # high, medium, low + owner: "team-name" ``` -The server should be accessible at `http://localhost:8080` (or the URL specified in `config.yml`). +### Tool Types + +Different tools have different input fields: -## Usage +**chat** - Simple text response: +```yaml +tool: "chat" +input: + message: "Your question or prompt here" +``` -### Running Action Agent Evaluations +**action_agent** - UI interactions: +```yaml +tool: "action_agent" +input: + objective: "Click the submit button" + reasoning: "Testing form submission" # Optional +``` -```bash -# Run all enabled action-agent evaluations (up to default limit) -./run_action_agent.py +**web_task_agent** - Multi-step web tasks: +```yaml +tool: "web_task_agent" +input: + task: "Search for flights from NYC to LAX on June 15" +``` + +**research_agent** - Research tasks: +```yaml +tool: "research_agent" +input: + query: "What are the latest developments in quantum computing?" 
+``` -# Run first 10 evaluations -./run_action_agent.py --limit 10 +### Target Configuration -# Run specific evaluations by ID -./run_action_agent.py --eval-ids action-agent-click-001 action-agent-form-001 +```yaml +target: + url: "https://example.com" # URL to navigate to + wait_for: "networkidle" # Wait condition + wait_timeout: 5000 # Timeout in milliseconds +``` -# Use custom config file -./run_action_agent.py --config /path/to/config.yml +For simple tests without web navigation: +```yaml +target: + url: "about:blank" + wait_timeout: 1000 ``` -### Command-Line Options +--- +## Judge Types + +### LLM Judge (Text-only) + +Standard text-based evaluation: + +```yaml +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" # or "gpt-5", "gpt-4.1" + temperature: 0.3 # Optional + criteria: + - "Response is accurate" + - "Response is complete" + - "Response follows instructions" ``` ---limit N Maximum number of evaluations to run ---eval-ids ID... Specific evaluation IDs to run ---config PATH Path to config file (default: evals/config.yml) + +The LLM judge: +- Evaluates text responses against criteria +- Provides pass/fail result +- Returns score (0.0 to 1.0) +- Includes reasoning for the judgment + +### Vision Judge (Visual Verification) + +For UI interaction tests requiring visual validation: + +```yaml +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + criteria: + - "Located the correct UI element" + - "Successfully clicked the element" + - "UI state changed as expected" + visual_verification: + enabled: true + capture_before: true # Future feature + capture_after: true # Currently implemented + prompts: + - "Verify the button is now in active state" + - "Check if the modal dialog is visible" ``` -## How It Works +The Vision judge: +- Automatically captures screenshots after agent actions +- Uses vision-capable models (GPT-4o, GPT-4.1, etc.) +- Analyzes visual state of the page +- Provides more accurate scoring for UI tests + +**When to use Vision Judge:** +- Testing UI interactions (clicks, form fills, navigation) +- Verifying visual state changes +- Checking element visibility and styling +- Validating layout and positioning + +--- -### 1. Load Configuration +## Visual Verification -The runner automatically: -- Loads environment variables from `.env` file (if present) -- Loads model configurations from `config.yml`, including: - - API endpoint for the evaluation server - - Model tiers (main, mini, nano) for agent requests - - Judge model for evaluation assessment - - Execution settings (timeouts, delays, etc.) -- Substitutes environment variables using `${VAR_NAME}` syntax +### How It Works -### 2. Load Evaluations +1. **Screenshot Capture**: Framework automatically captures screenshot after agent completes action +2. **Metadata Extraction**: API client extracts `client_id` and `tab_id` from response +3. **CDP Screenshot**: Uses Chrome DevTools Protocol to capture viewport screenshot +4. **Image Loading**: Screenshot is loaded and converted to base64 data URL +5. **Vision Model**: Screenshot sent to vision-capable LLM with criteria +6. 
**Visual Analysis**: Model analyzes visual state and evaluates criteria -Evaluation definitions are loaded from YAML files in `data/`: +### Configuration ```yaml -id: "action-agent-click-001" -name: "Search with Text Entry and Click" +visual_verification: + enabled: true # Enable visual verification + capture_before: true # Future: before action screenshot + capture_after: true # Capture after action + prompts: # Guide vision model + - "Verify Section 2 is expanded" + - "Check if other sections are collapsed" + - "Ensure proper visual styling is applied" +``` + +### Screenshots Directory + +Screenshots are saved to `screenshots/` with naming pattern: +``` +_.png +``` + +Example: `screenshots/accordion-001_20251020_170412.png` + +### Example: Accordion Test with Vision + +```yaml +id: "accordion-001" +name: "Expand Accordion Section" tool: "action_agent" + +target: + url: "https://jqueryui.com/accordion/" + wait_for: "networkidle" + wait_timeout: 5000 + input: - objective: "Type 'DevTools automation' in search box and click search button" + objective: "Click to expand the \"Section 2\" accordion panel" + validation: type: "llm-judge" llm_judge: model: "gpt-4.1-mini" criteria: - - "Successfully located the search input field" - - "Entered text correctly" - - "Search was executed and results loaded" + - "Located the Section 2 accordion header" + - "Successfully clicked to expand the section" + - "Section 2 content became visible" + - "Other sections collapsed appropriately" + visual_verification: + enabled: true + prompts: + - "Verify Section 2 is now expanded and content visible" + - "Check if other accordion sections collapsed" ``` -### 3. Execute Evaluations +**Result:** +- Score: 0.80 (4/5 criteria met) +- Vision model confirms Section 2 is expanded (blue highlight) +- Vision model verifies content is visible +- Vision model checks other sections are collapsed +- Animation smoothness cannot be verified from static screenshot -For each evaluation: +--- -1. **Extract input** from the YAML definition -2. **Send API request** to `/v1/responses` with model config -3. **Receive response** from the agent -4. **Judge response** using LLM against validation criteria -5. **Record result** (pass/fail, score, reasoning) +## Results and Reports -### 4. Generate Reports - -Results are saved to `reports/` as timestamped CSV files: +### Console Output +Standard output: ``` -reports/action-agent_2025-01-17_14-30-45.csv +[1/1] Running: Expand Accordion Section + ID: accordion-001 + Status: PASS + Score: 0.80 + Time: 167222ms ``` -CSV columns: -- `timestamp`: When the evaluation was run -- `eval_id`: Evaluation identifier -- `eval_name`: Human-readable name -- `category`: Evaluation category -- `status`: PASS or FAIL -- `score`: Numerical score (0-1) -- `judge_reasoning`: LLM judge's explanation -- `execution_time_ms`: API request duration -- `error`: Error message (if any) - -## Creating New Runners - -To create a runner for a different category (e.g., `research-agent`): - -1. Copy `run_action_agent.py` to `run_research_agent.py` -2. Update the category parameter in `run_evaluations()`: - ```python - runner.run_evaluations( - category='research-agent', # Change this - limit=limit, - eval_ids=args.eval_ids - ) - ``` -3. Update the script description and help text -4. Make it executable: `chmod +x run_research_agent.py` +### Summary Statistics -All runners share the same configuration and library code. 
+``` +====================================================================== +Summary +====================================================================== +Total: 10 +Passed: 8 (80.0%) +Failed: 2 +Average Score: 0.85 +Average Time: 145234ms +====================================================================== +``` -## Adding New Evaluations +### CSV Reports -To add new evaluation definitions: +Reports saved to `reports/` directory: +``` +reports/action-agent_2025-10-20_17-04-43.csv +``` -1. Create a YAML file in the appropriate `data/` subdirectory -2. Follow the existing evaluation format: - ```yaml - id: "unique-eval-id" - name: "Human-readable name" - enabled: true - tool: "action_agent" # or chat, research_agent, etc. - input: - objective: "Task description" - validation: - type: "llm-judge" - llm_judge: - criteria: - - "Criterion 1" - - "Criterion 2" - ``` -3. The evaluation will be automatically discovered and loaded +CSV columns: +- `eval_id` - Evaluation identifier +- `eval_name` - Human-readable name +- `category` - Category/subdirectory +- `passed` - Boolean pass/fail +- `score` - Numeric score 0.0-1.0 +- `reasoning` - Judge's detailed reasoning +- `execution_time_ms` - Time taken in milliseconds +- `error` - Error message if failed +- `screenshot_path` - Path to captured screenshot (if any) + +--- ## Configuration Reference -### Model Configuration +### config.yml Structure ```yaml +# API endpoint for eval-server +api_endpoint: "http://localhost:8080" + +# Models for the agent under test main_model: - provider: "openai" # Provider: openai, groq, etc. - model_name: "gpt-4" # Model identifier - api_key: "${ENV_VAR}" # API key (supports env vars) -``` + provider: "openai" + model_name: "gpt-5-mini" + api_key: "${OPENAI_API_KEY}" + +mini_model: + provider: "openai" + model_name: "gpt-5-nano" + api_key: "${OPENAI_API_KEY}" -### Execution Settings +nano_model: + provider: "openai" + model_name: "gpt-5-nano" + api_key: "${OPENAI_API_KEY}" -```yaml +# Model for judging evaluations +judge_model: + provider: "openai" + model_name: "gpt-5" + api_key: "${OPENAI_API_KEY}" + +# Execution settings execution: default_limit: 20 # Default number of evals to run - timeout: 300 # API request timeout (seconds) - concurrent_requests: 1 # Concurrent execution (future) + timeout: 3600 # API request timeout (seconds) + concurrent_requests: 1 # Concurrent execution request_delay: 1 # Delay between requests (seconds) + +# Reporting settings +reporting: + reports_dir: "reports" + format: "csv" + include_reasoning: true ``` -### Reporting Settings +### Environment Variables (.env) -```yaml -reporting: - reports_dir: "reports" # Where to save CSV reports - format: "csv" # Report format - include_reasoning: true # Include judge reasoning +```bash +# Required: OpenAI API key +OPENAI_API_KEY=sk-... + +# Optional: Alternative providers +OPENROUTER_API_KEY=sk-or-... +ANTHROPIC_API_KEY=sk-ant-... 
``` -## Troubleshooting +--- -### API Server Connection Failed +## Framework Architecture +### Components + +**run.py (EvaluationRunner)** +- Entry point for running evaluations +- Handles CLI arguments and execution flow +- Coordinates between components +- Manages screenshot capture + +**lib/eval_loader.py (EvalLoader, Evaluation)** +- Loads YAML evaluation definitions +- Parses configuration and metadata +- Provides evaluation objects to runner +- Handles visual verification config + +**lib/api_client.py (APIClient)** +- Communicates with eval-server API +- Sends requests with model configs +- Captures screenshots via Chrome DevTools Protocol +- Extracts metadata (client_id, tab_id) from responses + +**lib/judge.py (LLMJudge, VisionJudge)** +- LLMJudge: Text-based evaluation +- VisionJudge: Visual verification with screenshots +- Returns JudgeResult with pass/fail, score, reasoning + +**lib/config_loader.py (ConfigLoader)** +- Loads global configuration from config.yml +- Handles environment variable substitution +- Provides model configs to components + +### Data Flow + +``` +1. User: python3 run.py --path test.yaml --verbose + +2. EvaluationRunner: + - Loads config.yml + - Initializes APIClient, LLMJudge, VisionJudge + +3. EvalLoader: + - Loads YAML evaluation + - Parses configuration + - Checks if visual_verification.enabled + +4. APIClient: + - Sends request to eval-server with input + models + - Receives response + metadata (client_id, tab_id) + +5. Screenshot Capture: + - Uses client_id/tab_id to capture screenshot via CDP + - Saves PNG to screenshots/ directory + +6. Judging: + - If visual_verification.enabled: + - Use VisionJudge + - Load screenshot as base64 data URL + - Send to vision model with criteria and image + - Else: + - Use LLMJudge for text-only evaluation + +7. Results: + - Print to console (with verbose details) + - Save CSV report + - Include screenshot path ``` -ERROR: Cannot connect to API server at http://localhost:8080 + +--- + +## Examples + +### Simple Math Test + +**File:** `data/test-simple/math-001.yaml` + +```yaml +id: "math-001" +name: "Simple Math 5x7" +enabled: true + +target: + url: "about:blank" + wait_timeout: 1000 + +tool: "chat" +timeout: 10000 + +input: + message: "How much is 5x7? Just respond with the number." 
+ +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + criteria: + - "Response contains the number 35" + - "Response is mathematically correct" + +metadata: + tags: ["test", "simple", "math"] + priority: "high" ``` -**Solution**: Ensure the evaluation server is running and accessible: +**Run:** ```bash -curl http://localhost:8080/status +python3 run.py --path test-simple/math-001.yaml --verbose ``` -### Environment Variable Not Found +### UI Interaction Test with Vision + +**File:** `data/action-agent/accordion-001.yaml` + +```yaml +id: "accordion-001" +name: "Expand Accordion Section" +enabled: true + +target: + url: "https://jqueryui.com/accordion/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 +input: + objective: "Click to expand the \"Section 2\" accordion panel" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + criteria: + - "Located the Section 2 accordion header" + - "Successfully clicked to expand the section" + - "Section 2 content became visible" + - "Other sections collapsed appropriately" + visual_verification: + enabled: true + prompts: + - "Verify Section 2 is now expanded and content visible" + - "Check if other accordion sections collapsed" + +metadata: + tags: ["action", "accordion", "ui"] + priority: "high" ``` -ValueError: Environment variable ${OPENAI_API_KEY} not found + +**Run:** +```bash +python3 run.py --path action-agent/accordion-001.yaml --verbose ``` -**Solution**: Set the required environment variable using one of these methods: +--- + +## Troubleshooting + +### API Server Connection Failed + +**Error:** `API server is not reachable` -1. **Using .env file (recommended)**: +**Solution:** +1. Check if eval-server is running: ```bash - cp .env.example .env - # Edit .env and add: OPENAI_API_KEY=sk-your-actual-key + docker-compose ps ``` - -2. **Using shell export**: +2. Verify API endpoint in `config.yml` +3. Test connection: ```bash - export OPENAI_API_KEY="sk-..." + curl http://localhost:8080/health + ``` + +### API Key Not Configured + +**Error:** `API key not configured for provider: openai` + +**Solution:** +1. Check `.env` file has `OPENAI_API_KEY=sk-...` +2. Verify `config.yml` references: `api_key: "${OPENAI_API_KEY}"` +3. Ensure `.env` file is in the `evals/` directory + +### Screenshot Not Captured + +**Issue:** Visual verification enabled but no screenshot + +**Solution:** +1. Check that agent returned `client_id` and `tab_id` metadata +2. Verify eval-server is running with volume mount: + ```yaml + volumes: + - "./eval-server/nodejs:/opt/eval-server" ``` +3. Enable verbose mode to see capture attempts +4. Check `screenshots/` directory permissions + +### Low Scores -### No Evaluations Found +**Issue:** Evaluations scoring lower than expected +**Solution:** +1. Review criteria - make them specific and measurable +2. Use verbose mode to see judge reasoning +3. Enable visual verification for UI tests +4. Check screenshots to verify agent behavior +5. Adjust criteria based on actual capabilities + +### Environment Variable Not Found + +**Error:** `Environment variable ${OPENAI_API_KEY} not found` + +**Solution:** +1. Create `.env` file from `.env.example` +2. Add API key: `OPENAI_API_KEY=sk-your-key` +3. 
Or export in shell: `export OPENAI_API_KEY="sk-..."`
+
+---
+
+## Best Practices
+
+### Writing Criteria
+
+**Good criteria:**
+- Specific and measurable
+- Focus on observable outcomes
+- One assertion per criterion
+- Clear pass/fail conditions
+
+Example:
+```yaml
+criteria:
+  - "Response contains the number 35"
+  - "Section 2 accordion is expanded"
+  - "Search results show at least 5 flights"
```
+
+**Bad criteria:**
+- Vague or subjective ("Response is good")
+- Multiple assertions ("Located and clicked button")
+- Impossible to verify ("Animation was smooth")
+
+### Using Visual Verification
+
+**When to enable:**
+- UI interaction tests
+- Visual state verification
+- Layout/styling checks
+
+**When NOT to enable:**
+- Simple text responses
+- Logic/computation tests
+- Research/information gathering
+
+---
+
+## Quick Reference
+
+### Command Cheat Sheet
+
+```bash
+# Run single eval
+python3 run.py --path <category>/<eval-id>.yaml
+
+# Run with verbose output
+python3 run.py --path <category>/<eval-id>.yaml --verbose
+
+# Run all in category
+python3 run.py --category <category>
+
+# Run all evaluations
+python3 run.py --all
+
+# View results
+cat reports/<category>_<timestamp>.csv
+
+# View screenshot
+open screenshots/<eval_id>_<timestamp>.png
```
-**Solution**: Verify that:
-1. The `data/action-agent/` directory exists
-2. It contains `.yaml` files
-3. Evaluations have `enabled: true`
+### Evaluation Categories
+
+- `test-simple/` - Simple sanity tests
+- `action-agent/` - UI interaction tests
+- `web-task-agent/` - Multi-step web tasks
+- `research-agent/` - Research tasks
+- `schema-extractor/` - Data extraction
+- `screenshot-verification/` - Visual tests
+- `end-to-end/` - Complex scenarios
+
+---
+
+## Contributing
+
+To add new evaluations:
+
+1. Create YAML file in appropriate `data/` category
+2. Follow existing evaluation format
+3. Test with verbose mode first
+4. Adjust criteria based on results
+
+To add new categories:
+
+1. Create new directory under `data/`
+2. Add YAML evaluation files
+3. 
Run with `--category ` + +--- -## Future Enhancements +## Version Information -- Additional runner scripts for other categories -- Parallel evaluation execution -- Web UI for viewing reports -- Integration with CI/CD pipelines -- Support for additional judge providers (Claude, local models) +- **Framework Version:** 2.0 +- **Features:** Universal runner, VisionJudge, screenshot capture +- **Last Updated:** October 2025 From 17d3bff504a93b0eb8bb63cb6c1c2dd8583e3324 Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Mon, 20 Oct 2025 19:53:14 -0500 Subject: [PATCH 12/24] Changed evals structure --- docker-compose.yml | 2 + eval-server/nodejs/src/api-server.js | 28 +- evals/config.yml | 8 +- evals/data/action-agent/accordion-001.yaml | 2 - evals/lib/__init__.py | 3 +- evals/lib/api_client.py | 174 ++++++ evals/lib/eval_loader.py | 40 ++ evals/run.py | 628 +++++++++++++++++++++ evals/run_action_agent.py | 333 ----------- 9 files changed, 872 insertions(+), 346 deletions(-) create mode 100755 evals/run.py delete mode 100755 evals/run_action_agent.py diff --git a/docker-compose.yml b/docker-compose.yml index d54e756..d783215 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -39,6 +39,8 @@ services: - "./kernel-images/images/chromium-headful/.tmp/chromium/flags:/chromium/flags:ro" # Persist Chromium data across container restarts (set CHROMIUM_DATA_HOST env var to customize path) - "${CHROMIUM_DATA_HOST:-./chromium-data}:/data" + # Mount eval-server code for live updates during development + - "./eval-server/nodejs:/opt/eval-server" tmpfs: - /dev/shm:size=2g restart: unless-stopped diff --git a/eval-server/nodejs/src/api-server.js b/eval-server/nodejs/src/api-server.js index 08be5a4..8dc43f2 100644 --- a/eval-server/nodejs/src/api-server.js +++ b/eval-server/nodejs/src/api-server.js @@ -422,8 +422,8 @@ class APIServer { // Extract the response text from the result const responseText = this.extractResponseText(result); - // Format in OpenAI Responses API format - return this.formatOpenAIResponse(responseText); + // Format in OpenAI-compatible Responses API format with tab metadata + return this.formatResponse(responseText, tabResult.compositeClientId.split(':')[0], tabResult.tabId); } catch (error) { logger.error('Error handling responses request:', error); @@ -624,12 +624,15 @@ class APIServer { } /** - * Format response in OpenAI Responses API format + * Format response in OpenAI-compatible Responses API format */ - formatOpenAIResponse(responseText) { + formatResponse(responseText, clientId = null, tabId = null) { const messageId = `msg_${uuidv4().replace(/-/g, '')}`; - - return [ + + // Debug: log the parameters + logger.debug('formatResponse called with:', { clientId, tabId, hasClientId: !!clientId, hasTabId: !!tabId }); + + const response = [ { id: messageId, type: 'message', @@ -643,6 +646,19 @@ class APIServer { ] } ]; + + // Add metadata if clientId and tabId are provided + if (clientId && tabId) { + response[0].metadata = { + clientId, + tabId + }; + logger.debug('Metadata added to response:', response[0].metadata); + } else { + logger.debug('Metadata NOT added - clientId or tabId missing'); + } + + return response; } sendResponse(res, statusCode, data) { diff --git a/evals/config.yml b/evals/config.yml index 5de80b5..5b42f16 100644 --- a/evals/config.yml +++ b/evals/config.yml @@ -9,12 +9,12 @@ api_endpoint: "http://localhost:8080" main_model: provider: "openai" - model_name: "gpt-5" + model_name: "gpt-5-mini" api_key: "${OPENAI_API_KEY}" mini_model: provider: "openai" - 
model_name: "gpt-5-mini" + model_name: "gpt-5-nano" api_key: "${OPENAI_API_KEY}" nano_model: @@ -30,12 +30,12 @@ nano_model: # mini_model: # provider: "openrouter" -# model_name: "x-ai/grok-4-fast:free" +# model_name: "openai/gpt-oss-20b:free" # api_key: "${OPENROUTER_API_KEY}" # nano_model: # provider: "openrouter" -# model_name: "x-ai/grok-4-fast:free" +# model_name: "openai/gpt-oss-20b:free" # api_key: "${OPENROUTER_API_KEY}" # Model configuration for judging evaluation responses diff --git a/evals/data/action-agent/accordion-001.yaml b/evals/data/action-agent/accordion-001.yaml index dae142d..fc2fdbb 100644 --- a/evals/data/action-agent/accordion-001.yaml +++ b/evals/data/action-agent/accordion-001.yaml @@ -26,7 +26,6 @@ validation: - "Successfully clicked to expand the section" - "Section 2 content became visible" - "Other sections collapsed appropriately" - - "Accordion animation completed smoothly" visual_verification: enabled: true capture_before: true @@ -34,7 +33,6 @@ validation: prompts: - "Verify Section 2 is now expanded and content visible" - "Check if other accordion sections collapsed" - - "Confirm the expansion animation completed" - "Ensure Section 2 header shows expanded state" metadata: diff --git a/evals/lib/__init__.py b/evals/lib/__init__.py index a6b245b..db65bcc 100644 --- a/evals/lib/__init__.py +++ b/evals/lib/__init__.py @@ -5,7 +5,7 @@ from .config_loader import ConfigLoader, get_config from .eval_loader import EvalLoader, Evaluation from .api_client import APIClient -from .judge import LLMJudge, SimpleJudge, JudgeResult +from .judge import LLMJudge, SimpleJudge, VisionJudge, JudgeResult __all__ = [ 'ConfigLoader', @@ -15,5 +15,6 @@ 'APIClient', 'LLMJudge', 'SimpleJudge', + 'VisionJudge', 'JudgeResult' ] diff --git a/evals/lib/api_client.py b/evals/lib/api_client.py index 2214710..f2cf3bb 100644 --- a/evals/lib/api_client.py +++ b/evals/lib/api_client.py @@ -92,11 +92,16 @@ def send_request( # Extract text from OpenAI Responses API format response_text = self._extract_response_text(response_data) + # Extract client/tab IDs from metadata (if present) + client_id, tab_id = self._extract_metadata(response_data) + return { "success": True, "response": response_text, "raw_response": response_data, "execution_time_ms": execution_time_ms, + "client_id": client_id, + "tab_id": tab_id, "error": None } @@ -192,6 +197,175 @@ def _extract_response_text(self, response_data: Any) -> str: except Exception as e: return f"[Error extracting response: {e}]" + def _extract_metadata(self, response_data: Any) -> tuple[str | None, str | None]: + """ + Extract clientId and tabId from response metadata. + + Args: + response_data: Raw API response + + Returns: + Tuple of (client_id, tab_id) or (None, None) + """ + try: + if isinstance(response_data, list) and len(response_data) > 0: + message = response_data[0] + metadata = message.get('metadata', {}) + return metadata.get('clientId'), metadata.get('tabId') + except Exception: + pass + return None, None + + def capture_screenshot( + self, + client_id: str, + tab_id: str, + full_page: bool = False + ) -> Dict[str, Any]: + """ + Capture a screenshot of a specific tab. 
+ + Args: + client_id: Base client ID + tab_id: Tab ID to capture + full_page: Whether to capture the full page (default: False) + + Returns: + Dict with: + - success: bool + - image_data: str (base64 data URL) if successful + - error: str (if any) + """ + api_url = f"{self.base_url}/page/screenshot" + + payload = { + "clientId": client_id, + "tabId": tab_id, + "fullPage": full_page + } + + try: + response = requests.post( + api_url, + json=payload, + timeout=self.timeout, + headers={"Content-Type": "application/json"} + ) + + response.raise_for_status() + result = response.json() + + return { + "success": True, + "image_data": result.get("imageData"), + "format": result.get("format", "png"), + "error": None + } + + except requests.exceptions.Timeout: + return { + "success": False, + "image_data": None, + "error": f"Screenshot request timed out after {self.timeout} seconds" + } + + except requests.exceptions.HTTPError as e: + error_msg = f"HTTP error: {e.response.status_code}" + try: + error_details = e.response.json() + error_msg += f" - {error_details.get('error', str(error_details))}" + except: + error_msg += f" - {e.response.text[:200]}" + + return { + "success": False, + "image_data": None, + "error": error_msg + } + + except Exception as e: + return { + "success": False, + "image_data": None, + "error": f"Screenshot failed: {str(e)}" + } + + def get_page_content( + self, + client_id: str, + tab_id: str, + format: str = "html" + ) -> Dict[str, Any]: + """ + Get page content (HTML or text) from a specific tab. + + Args: + client_id: Base client ID + tab_id: Tab ID to get content from + format: Content format - "html" or "text" (default: "html") + + Returns: + Dict with: + - success: bool + - content: str (page content) if successful + - format: str (content format) + - error: str (if any) + """ + api_url = f"{self.base_url}/page/content" + + payload = { + "clientId": client_id, + "tabId": tab_id, + "format": format + } + + try: + response = requests.post( + api_url, + json=payload, + timeout=self.timeout, + headers={"Content-Type": "application/json"} + ) + + response.raise_for_status() + result = response.json() + + return { + "success": True, + "content": result.get("content"), + "format": result.get("format", format), + "length": result.get("length", 0), + "error": None + } + + except requests.exceptions.Timeout: + return { + "success": False, + "content": None, + "error": f"Content request timed out after {self.timeout} seconds" + } + + except requests.exceptions.HTTPError as e: + error_msg = f"HTTP error: {e.response.status_code}" + try: + error_details = e.response.json() + error_msg += f" - {error_details.get('error', str(error_details))}" + except: + error_msg += f" - {e.response.text[:200]}" + + return { + "success": False, + "content": None, + "error": error_msg + } + + except Exception as e: + return { + "success": False, + "content": None, + "error": f"Content retrieval failed: {str(e)}" + } + def check_health(self) -> bool: """ Check if the API server is healthy. diff --git a/evals/lib/eval_loader.py b/evals/lib/eval_loader.py index e1bc555..f25303f 100644 --- a/evals/lib/eval_loader.py +++ b/evals/lib/eval_loader.py @@ -106,6 +106,46 @@ def get_judge_model(self) -> str: llm_judge = self.validation.get('llm_judge', {}) return llm_judge.get('model', 'gpt-4.1-mini') + def requires_vision_judge(self) -> bool: + """ + Check if this evaluation requires vision judge (visual verification). 
+ + Returns: + True if visual verification is enabled, False otherwise + """ + if self.validation_type != 'llm-judge': + return False + + llm_judge = self.validation.get('llm_judge', {}) + visual_verification = llm_judge.get('visual_verification', {}) + return visual_verification.get('enabled', False) + + def get_visual_verification_config(self) -> Optional[Dict[str, Any]]: + """ + Get visual verification configuration. + + Returns: + Visual verification config dict or None if not enabled + """ + if not self.requires_vision_judge(): + return None + + llm_judge = self.validation.get('llm_judge', {}) + return llm_judge.get('visual_verification', {}) + + def get_verification_prompts(self) -> List[str]: + """ + Get visual verification prompts. + + Returns: + List of verification prompt strings for vision judge + """ + visual_config = self.get_visual_verification_config() + if not visual_config: + return [] + + return visual_config.get('prompts', []) + def get_target_url(self) -> Optional[str]: """ Get the target URL for this evaluation. diff --git a/evals/run.py b/evals/run.py new file mode 100755 index 0000000..2d1efff --- /dev/null +++ b/evals/run.py @@ -0,0 +1,628 @@ +#!/usr/bin/env python3 +""" +Universal Evaluation Runner + +Runs evaluations from YAML definitions with flexible execution modes: +- Run specific eval by path: --path action-agent/a11y-001.yaml +- Run all evals in category: --category action-agent +- Run all evals: --all +""" + +import argparse +import csv +import sys +import time +from datetime import datetime +from pathlib import Path +from typing import List, Optional + +# Add lib directory to path +sys.path.insert(0, str(Path(__file__).parent)) + +from lib import ( + ConfigLoader, + EvalLoader, + APIClient, + LLMJudge, + VisionJudge, + Evaluation, + JudgeResult +) + + +class EvaluationRunner: + """Manages evaluation execution and reporting.""" + + def __init__(self, config: ConfigLoader, verbose: bool = False): + """ + Initialize evaluation runner. + + Args: + config: Configuration loader + verbose: Enable verbose output + """ + self.config = config + self.verbose = verbose + + # Initialize components + self.eval_loader = EvalLoader() + self.api_client = APIClient( + base_url=config.get_api_endpoint(), + timeout=config.get_timeout() + ) + + # Initialize judges + judge_config = config.get_judge_config() + self.judge = LLMJudge( + provider=judge_config['provider'], + model_name=judge_config['model_name'], + api_key=judge_config['api_key'], + temperature=judge_config.get('temperature') + ) + self.vision_judge = VisionJudge( + provider=judge_config['provider'], + model_name=judge_config['model_name'], + api_key=judge_config['api_key'], + temperature=judge_config.get('temperature') + ) + + # Get nested model config for API requests + self.model_config = config.get_nested_model_config() + + # Results tracking + self.results = [] + + # Create screenshots directory + self.screenshots_dir = Path(__file__).parent / 'screenshots' + self.screenshots_dir.mkdir(exist_ok=True) + + def run_from_path(self, eval_path: str): + """ + Run a specific evaluation from a file path. 
+ + Args: + eval_path: Path to evaluation YAML file (relative to data/ or absolute) + """ + print(f"\n{'='*70}") + print(f"Running Evaluation from Path") + print(f"{'='*70}\n") + + # Check API server health + print("Checking API server connection...") + if not self.api_client.check_health(): + print("ERROR: Cannot connect to API server at", self.config.get_api_endpoint()) + print("Please ensure the evaluation server is running.") + sys.exit(1) + print("โœ“ API server is reachable\n") + + # Resolve path + eval_file = self._resolve_eval_path(eval_path) + if not eval_file.exists(): + print(f"ERROR: Evaluation file not found: {eval_file}") + sys.exit(1) + + # Load evaluation + print(f"Loading evaluation from {eval_path}...") + import yaml + with open(eval_file, 'r') as f: + data = yaml.safe_load(f) + + evaluation = Evaluation(eval_file, data) + + if not evaluation.enabled: + print(f"WARNING: Evaluation {evaluation.id} is disabled") + return + + print(f"Found: {evaluation.name} (ID: {evaluation.id})\n") + + # Run evaluation + try: + result = self._run_single_evaluation(evaluation) + self.results.append(result) + + # Print result + status = "PASS" if result['passed'] else "FAIL" + print(f"[1/1] Running: {evaluation.name}") + print(f" ID: {evaluation.id}") + print(f" Status: {status}") + print(f" Score: {result['score']:.2f}") + print(f" Time: {result['execution_time_ms']}ms") + print() + + except Exception as e: + print(f" ERROR: {str(e)}\n") + self.results.append({ + 'eval_id': evaluation.id, + 'eval_name': evaluation.name, + 'category': evaluation.category, + 'passed': False, + 'score': 0.0, + 'reasoning': f"Execution error: {str(e)}", + 'execution_time_ms': 0, + 'error': str(e) + }) + + # Print summary + self._print_summary() + + # Save report + self._save_report(evaluation.category) + + def run_evaluations( + self, + category: Optional[str] = None, + limit: Optional[int] = None, + eval_ids: Optional[List[str]] = None, + run_all: bool = False + ): + """ + Run evaluations for a specific category or all categories. 
+ + Args: + category: Category name (e.g., 'action-agent'), None for all + limit: Maximum number of evaluations to run + eval_ids: Optional list of specific evaluation IDs to run + run_all: Run all evaluations across all categories + """ + title = "All Evaluations" if run_all else f"{category} Evaluations" + print(f"\n{'='*70}") + print(f"Running {title}") + print(f"{'='*70}\n") + + # Check API server health + print("Checking API server connection...") + if not self.api_client.check_health(): + print("ERROR: Cannot connect to API server at", self.config.get_api_endpoint()) + print("Please ensure the evaluation server is running.") + sys.exit(1) + print("โœ“ API server is reachable\n") + + # Load evaluations + if run_all: + print("Loading all evaluations...") + data_dir = Path(__file__).parent / 'data' + categories = [d.name for d in data_dir.iterdir() if d.is_dir() and not d.name.startswith('.')] + all_evaluations = [] + for cat in categories: + evals = self.eval_loader.load_from_directory(category=cat, enabled_only=True) + all_evaluations.extend(evals) + evaluations = all_evaluations + else: + print(f"Loading evaluations from {category}...") + evaluations = self.eval_loader.load_from_directory( + category=category, + enabled_only=True + ) + + # Filter by eval_ids if specified + if eval_ids: + evaluations = [e for e in evaluations if e.id in eval_ids] + + # Apply limit + if limit: + evaluations = evaluations[:limit] + + if not evaluations: + msg = "all categories" if run_all else f"category: {category}" + print(f"No evaluations found in {msg}") + return + + print(f"Found {len(evaluations)} evaluations to run\n") + + # Run each evaluation + for i, evaluation in enumerate(evaluations, 1): + print(f"[{i}/{len(evaluations)}] Running: {evaluation.name}") + print(f" ID: {evaluation.id}") + + try: + result = self._run_single_evaluation(evaluation) + self.results.append(result) + + # Print result + status = "PASS" if result['passed'] else "FAIL" + print(f" Status: {status}") + print(f" Score: {result['score']:.2f}") + print(f" Time: {result['execution_time_ms']}ms") + print() + + # Add delay between requests + if i < len(evaluations): + delay = self.config.get_execution_config().get('request_delay', 1) + if delay > 0: + time.sleep(delay) + + except KeyboardInterrupt: + print("\n\nInterrupted by user. Saving partial results...") + break + except Exception as e: + print(f" ERROR: {str(e)}\n") + # Record failure + self.results.append({ + 'eval_id': evaluation.id, + 'eval_name': evaluation.name, + 'category': evaluation.category, + 'passed': False, + 'score': 0.0, + 'reasoning': f"Execution error: {str(e)}", + 'execution_time_ms': 0, + 'error': str(e) + }) + + # Print summary + self._print_summary() + + # Save report + report_category = 'all' if run_all else category + self._save_report(report_category) + + def _resolve_eval_path(self, eval_path: str) -> Path: + """ + Resolve evaluation path to absolute path. 
+ + Args: + eval_path: Relative or absolute path to eval file + + Returns: + Absolute Path object + """ + path = Path(eval_path) + + # If absolute and exists, use it + if path.is_absolute() and path.exists(): + return path + + # Try relative to data directory + data_dir = Path(__file__).parent / 'data' + candidate = data_dir / eval_path + if candidate.exists(): + return candidate + + # Try as-is (relative to current directory) + if path.exists(): + return path.resolve() + + # Return the data_dir candidate (will fail with proper error message) + return candidate + + def _run_single_evaluation(self, evaluation: Evaluation) -> dict: + """ + Run a single evaluation. + + Args: + evaluation: Evaluation to run + + Returns: + Result dictionary + """ + # Get input message + input_message = evaluation.get_input_message() + + # Verbose: print input + if self.verbose: + print(f"\n Input: {input_message}") + + # Get target URL and wait timeout + target_url = evaluation.get_target_url() + wait_timeout = evaluation.get_wait_timeout() + + # Send API request + api_response = self.api_client.send_request( + input_message=input_message, + model_config=self.model_config, + url=target_url, + wait_timeout=wait_timeout + ) + + if not api_response['success']: + return { + 'eval_id': evaluation.id, + 'eval_name': evaluation.name, + 'category': evaluation.category, + 'passed': False, + 'score': 0.0, + 'reasoning': f"API request failed: {api_response['error']}", + 'execution_time_ms': api_response['execution_time_ms'], + 'error': api_response['error'], + 'screenshot_path': None + } + + # Verbose: print response + if self.verbose: + print(f" Response: {api_response['response'][:200]}{'...' if len(api_response['response']) > 200 else ''}") + + # Capture screenshot if client/tab IDs are available + screenshot_path = None + if api_response.get('client_id') and api_response.get('tab_id'): + screenshot_path = self._capture_screenshot( + evaluation.id, + api_response['client_id'], + api_response['tab_id'] + ) + + # Judge the response + criteria = evaluation.get_validation_criteria() + + # Check if visual verification is required + if evaluation.requires_vision_judge() and screenshot_path: + # Use VisionJudge with screenshot + screenshot_data_url = self._load_screenshot_as_data_url(screenshot_path) + verification_prompts = evaluation.get_verification_prompts() + + if self.verbose: + print(f" Using Vision Judge with screenshot") + + judge_result = self.vision_judge.judge( + input_prompt=input_message, + response=api_response['response'], + criteria=criteria, + screenshots={"after": screenshot_data_url} if screenshot_data_url else None, + verification_prompts=verification_prompts if verification_prompts else None + ) + else: + # Use standard LLMJudge + judge_result = self.judge.judge( + input_prompt=input_message, + response=api_response['response'], + criteria=criteria + ) + + # Verbose: print reasoning + if self.verbose: + print(f" Judge Reasoning: {judge_result.reasoning}") + if screenshot_path: + print(f" Screenshot: {screenshot_path}") + + return { + 'eval_id': evaluation.id, + 'eval_name': evaluation.name, + 'category': evaluation.category, + 'passed': judge_result.passed, + 'score': judge_result.score, + 'reasoning': judge_result.reasoning, + 'execution_time_ms': api_response['execution_time_ms'], + 'error': None, + 'screenshot_path': screenshot_path + } + + def _capture_screenshot(self, eval_id: str, client_id: str, tab_id: str) -> str | None: + """ + Capture screenshot of the page after evaluation. 
+ + Args: + eval_id: Evaluation ID for filename + client_id: Client ID + tab_id: Tab ID + + Returns: + Path to saved screenshot or None if failed + """ + try: + from datetime import datetime + import base64 + + result = self.api_client.capture_screenshot(client_id, tab_id, full_page=False) + + if result['success'] and result.get('image_data'): + # Generate filename with timestamp + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + filename = f"{eval_id}_{timestamp}.png" + filepath = self.screenshots_dir / filename + + # Extract base64 data (remove data:image/png;base64, prefix if present) + image_data = result['image_data'] + if image_data.startswith('data:image'): + image_data = image_data.split(',', 1)[1] + + # Save screenshot + with open(filepath, 'wb') as f: + f.write(base64.b64decode(image_data)) + + return str(filepath) + + except Exception as e: + if self.verbose: + print(f" Screenshot capture failed: {e}") + + return None + + def _load_screenshot_as_data_url(self, screenshot_path: str) -> str | None: + """ + Load a screenshot file and convert it to a base64 data URL. + + Args: + screenshot_path: Path to the screenshot file + + Returns: + Data URL string (data:image/png;base64,...) or None if failed + """ + try: + import base64 + + with open(screenshot_path, 'rb') as f: + image_data = base64.b64encode(f.read()).decode('utf-8') + return f"data:image/png;base64,{image_data}" + + except Exception as e: + if self.verbose: + print(f" Screenshot load failed: {e}") + return None + + def _print_summary(self): + """Print summary statistics.""" + if not self.results: + return + + total = len(self.results) + passed = sum(1 for r in self.results if r['passed']) + failed = total - passed + pass_rate = (passed / total) * 100 if total > 0 else 0 + avg_score = sum(r['score'] for r in self.results) / total if total > 0 else 0 + avg_time = sum(r['execution_time_ms'] for r in self.results) / total if total > 0 else 0 + + print(f"\n{'='*70}") + print("Summary") + print(f"{'='*70}") + print(f"Total: {total}") + print(f"Passed: {passed} ({pass_rate:.1f}%)") + print(f"Failed: {failed}") + print(f"Average Score: {avg_score:.2f}") + print(f"Average Time: {avg_time:.0f}ms") + print(f"{'='*70}\n") + + def _save_report(self, category: str): + """ + Save evaluation results to CSV report. 
+ + Args: + category: Category name for report filename + """ + if not self.results: + return + + # Create reports directory + reports_dir = self.config.get_reports_dir() + reports_dir.mkdir(parents=True, exist_ok=True) + + # Generate filename with timestamp + timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + filename = f"{category}_{timestamp}.csv" + filepath = reports_dir / filename + + # Write CSV + with open(filepath, 'w', newline='', encoding='utf-8') as f: + fieldnames = [ + 'timestamp', + 'eval_id', + 'eval_name', + 'category', + 'status', + 'score', + 'judge_reasoning', + 'execution_time_ms', + 'error' + ] + writer = csv.DictWriter(f, fieldnames=fieldnames) + + writer.writeheader() + for result in self.results: + writer.writerow({ + 'timestamp': datetime.now().isoformat(), + 'eval_id': result['eval_id'], + 'eval_name': result['eval_name'], + 'category': result['category'], + 'status': 'PASS' if result['passed'] else 'FAIL', + 'score': f"{result['score']:.2f}", + 'judge_reasoning': result['reasoning'], + 'execution_time_ms': result['execution_time_ms'], + 'error': result.get('error', '') + }) + + print(f"Report saved to: {filepath}") + + +def main(): + """Main entry point.""" + parser = argparse.ArgumentParser( + description="Universal evaluation runner for browser-agent evals", + epilog=""" +Examples: + # Run specific eval by path + python3 run.py --path action-agent/a11y-001.yaml + + # Run all evals in a category + python3 run.py --category action-agent --limit 5 + + # Run specific evals by ID + python3 run.py --category action-agent --eval-ids a11y-001 a11y-002 + + # Run all evals across all categories + python3 run.py --all + """, + formatter_class=argparse.RawDescriptionHelpFormatter + ) + + # Execution mode (mutually exclusive) + mode_group = parser.add_mutually_exclusive_group(required=True) + mode_group.add_argument( + '--path', + type=str, + help='Path to specific evaluation YAML file (e.g., action-agent/a11y-001.yaml)' + ) + mode_group.add_argument( + '--category', + type=str, + help='Run all evaluations in a specific category (e.g., action-agent)' + ) + mode_group.add_argument( + '--all', + action='store_true', + help='Run all evaluations across all categories' + ) + + # Filtering options (only for category/all modes) + parser.add_argument( + '--limit', + type=int, + default=None, + help='Maximum number of evaluations to run (default: all)' + ) + parser.add_argument( + '--eval-ids', + nargs='+', + help='Specific evaluation IDs to run (only with --category)' + ) + parser.add_argument( + '--config', + type=str, + default=None, + help='Path to config.yml (default: evals/config.yml)' + ) + parser.add_argument( + '--verbose', + action='store_true', + help='Enable verbose output (show input, response, reasoning, screenshots)' + ) + + args = parser.parse_args() + + # Validate argument combinations + if args.eval_ids and not args.category: + parser.error("--eval-ids can only be used with --category") + + try: + # Load configuration + config = ConfigLoader(config_path=args.config) + + # Create evaluation runner with verbose flag + runner = EvaluationRunner(config, verbose=args.verbose) + + # Execute based on mode + if args.path: + runner.run_from_path(args.path) + elif args.category: + # Use limit from config if not specified + limit = args.limit if args.limit is not None else config.get_default_limit() + runner.run_evaluations( + category=args.category, + limit=limit, + eval_ids=args.eval_ids + ) + elif args.all: + limit = args.limit if args.limit is not None else 
config.get_default_limit() + runner.run_evaluations( + limit=limit, + run_all=True + ) + + except KeyboardInterrupt: + print("\nInterrupted by user") + sys.exit(1) + except Exception as e: + print(f"ERROR: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/evals/run_action_agent.py b/evals/run_action_agent.py deleted file mode 100755 index 52858e5..0000000 --- a/evals/run_action_agent.py +++ /dev/null @@ -1,333 +0,0 @@ -#!/usr/bin/env python3 -""" -Action Agent Evaluation Runner - -Runs evaluations for action-agent category and generates reports. -""" - -import argparse -import csv -import sys -import time -from datetime import datetime -from pathlib import Path -from typing import List - -# Add lib directory to path -sys.path.insert(0, str(Path(__file__).parent)) - -from lib import ( - ConfigLoader, - EvalLoader, - APIClient, - LLMJudge, - Evaluation, - JudgeResult -) - - -class EvaluationRunner: - """Manages evaluation execution and reporting.""" - - def __init__(self, config: ConfigLoader): - """ - Initialize evaluation runner. - - Args: - config: Configuration loader - """ - self.config = config - - # Initialize components - self.eval_loader = EvalLoader() - self.api_client = APIClient( - base_url=config.get_api_endpoint(), - timeout=config.get_timeout() - ) - - # Initialize judge - judge_config = config.get_judge_config() - self.judge = LLMJudge( - provider=judge_config['provider'], - model_name=judge_config['model_name'], - api_key=judge_config['api_key'], - temperature=judge_config.get('temperature', 0.1) - ) - - # Get nested model config for API requests - self.model_config = config.get_nested_model_config() - - # Results tracking - self.results = [] - - def run_evaluations( - self, - category: str, - limit: int = None, - eval_ids: List[str] = None - ): - """ - Run evaluations for a specific category. 
- - Args: - category: Category name (e.g., 'action-agent') - limit: Maximum number of evaluations to run - eval_ids: Optional list of specific evaluation IDs to run - """ - print(f"\n{'='*70}") - print(f"Running {category} Evaluations") - print(f"{'='*70}\n") - - # Check API server health - print("Checking API server connection...") - if not self.api_client.check_health(): - print("ERROR: Cannot connect to API server at", self.config.get_api_endpoint()) - print("Please ensure the evaluation server is running.") - sys.exit(1) - print("โœ“ API server is reachable\n") - - # Load evaluations - print(f"Loading evaluations from {category}...") - evaluations = self.eval_loader.load_from_directory( - category=category, - enabled_only=True - ) - - # Filter by eval_ids if specified - if eval_ids: - evaluations = [e for e in evaluations if e.id in eval_ids] - - # Apply limit - if limit: - evaluations = evaluations[:limit] - - if not evaluations: - print(f"No evaluations found in category: {category}") - return - - print(f"Found {len(evaluations)} evaluations to run\n") - - # Run each evaluation - for i, evaluation in enumerate(evaluations, 1): - print(f"[{i}/{len(evaluations)}] Running: {evaluation.name}") - print(f" ID: {evaluation.id}") - - try: - result = self._run_single_evaluation(evaluation) - self.results.append(result) - - # Print result - status = "PASS" if result['passed'] else "FAIL" - print(f" Status: {status}") - print(f" Score: {result['score']:.2f}") - print(f" Time: {result['execution_time_ms']}ms") - print() - - # Add delay between requests - if i < len(evaluations): - delay = self.config.get_execution_config().get('request_delay', 1) - if delay > 0: - time.sleep(delay) - - except KeyboardInterrupt: - print("\n\nInterrupted by user. Saving partial results...") - break - except Exception as e: - print(f" ERROR: {str(e)}\n") - # Record failure - self.results.append({ - 'eval_id': evaluation.id, - 'eval_name': evaluation.name, - 'category': category, - 'passed': False, - 'score': 0.0, - 'reasoning': f"Execution error: {str(e)}", - 'execution_time_ms': 0, - 'error': str(e) - }) - - # Print summary - self._print_summary() - - # Save report - self._save_report(category) - - def _run_single_evaluation(self, evaluation: Evaluation) -> dict: - """ - Run a single evaluation. 
- - Args: - evaluation: Evaluation to run - - Returns: - Result dictionary - """ - # Get input message - input_message = evaluation.get_input_message() - - # Get target URL and wait timeout - target_url = evaluation.get_target_url() - wait_timeout = evaluation.get_wait_timeout() - - # Send API request - api_response = self.api_client.send_request( - input_message=input_message, - model_config=self.model_config, - url=target_url, - wait_timeout=wait_timeout - ) - - if not api_response['success']: - return { - 'eval_id': evaluation.id, - 'eval_name': evaluation.name, - 'category': evaluation.category, - 'passed': False, - 'score': 0.0, - 'reasoning': f"API request failed: {api_response['error']}", - 'execution_time_ms': api_response['execution_time_ms'], - 'error': api_response['error'] - } - - # Judge the response - criteria = evaluation.get_validation_criteria() - judge_result = self.judge.judge( - input_prompt=input_message, - response=api_response['response'], - criteria=criteria - ) - - return { - 'eval_id': evaluation.id, - 'eval_name': evaluation.name, - 'category': evaluation.category, - 'passed': judge_result.passed, - 'score': judge_result.score, - 'reasoning': judge_result.reasoning, - 'execution_time_ms': api_response['execution_time_ms'], - 'error': None - } - - def _print_summary(self): - """Print summary statistics.""" - if not self.results: - return - - total = len(self.results) - passed = sum(1 for r in self.results if r['passed']) - failed = total - passed - pass_rate = (passed / total) * 100 if total > 0 else 0 - avg_score = sum(r['score'] for r in self.results) / total if total > 0 else 0 - avg_time = sum(r['execution_time_ms'] for r in self.results) / total if total > 0 else 0 - - print(f"\n{'='*70}") - print("Summary") - print(f"{'='*70}") - print(f"Total: {total}") - print(f"Passed: {passed} ({pass_rate:.1f}%)") - print(f"Failed: {failed}") - print(f"Average Score: {avg_score:.2f}") - print(f"Average Time: {avg_time:.0f}ms") - print(f"{'='*70}\n") - - def _save_report(self, category: str): - """ - Save evaluation results to CSV report. 
- - Args: - category: Category name for report filename - """ - if not self.results: - return - - # Create reports directory - reports_dir = self.config.get_reports_dir() - reports_dir.mkdir(parents=True, exist_ok=True) - - # Generate filename with timestamp - timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - filename = f"{category}_{timestamp}.csv" - filepath = reports_dir / filename - - # Write CSV - with open(filepath, 'w', newline='', encoding='utf-8') as f: - fieldnames = [ - 'timestamp', - 'eval_id', - 'eval_name', - 'category', - 'status', - 'score', - 'judge_reasoning', - 'execution_time_ms', - 'error' - ] - writer = csv.DictWriter(f, fieldnames=fieldnames) - - writer.writeheader() - for result in self.results: - writer.writerow({ - 'timestamp': datetime.now().isoformat(), - 'eval_id': result['eval_id'], - 'eval_name': result['eval_name'], - 'category': result['category'], - 'status': 'PASS' if result['passed'] else 'FAIL', - 'score': f"{result['score']:.2f}", - 'judge_reasoning': result['reasoning'], - 'execution_time_ms': result['execution_time_ms'], - 'error': result.get('error', '') - }) - - print(f"Report saved to: {filepath}") - - -def main(): - """Main entry point.""" - parser = argparse.ArgumentParser( - description="Run action-agent evaluations" - ) - parser.add_argument( - '--limit', - type=int, - default=None, - help='Maximum number of evaluations to run (default: all)' - ) - parser.add_argument( - '--eval-ids', - nargs='+', - help='Specific evaluation IDs to run' - ) - parser.add_argument( - '--config', - type=str, - default=None, - help='Path to config.yml (default: evals/config.yml)' - ) - - args = parser.parse_args() - - try: - # Load configuration - config = ConfigLoader(config_path=args.config) - - # Use limit from config if not specified - limit = args.limit if args.limit is not None else config.get_default_limit() - - # Create and run evaluation runner - runner = EvaluationRunner(config) - runner.run_evaluations( - category='action-agent', - limit=limit, - eval_ids=args.eval_ids - ) - - except KeyboardInterrupt: - print("\nInterrupted by user") - sys.exit(1) - except Exception as e: - print(f"ERROR: {e}") - sys.exit(1) - - -if __name__ == '__main__': - main() From dcab9be1ebf010e71a2d38ece7bd919e2cbe5103 Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Mon, 20 Oct 2025 22:46:18 -0500 Subject: [PATCH 13/24] Restore missing eval functionality from c94dd24 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add VisionJudge class to evals/lib/judge.py for visual evaluation with screenshots - Add /page/content endpoint to api-server.js for retrieving page HTML/text - Add /page/screenshot endpoint to api-server.js for capturing screenshots - Fixes ImportError: cannot import name 'VisionJudge' These were lost during cleanup but are required for the eval runner to work. 
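Example usage of the restored screenshot flow, as a minimal sketch run from the
evals/ directory (the client and tab IDs below are placeholders; real values are
returned by a prior agent request through the API server):

    from lib import ConfigLoader, APIClient

    config = ConfigLoader()  # defaults to evals/config.yml
    client = APIClient(base_url=config.get_api_endpoint(),
                       timeout=config.get_timeout())
    # Presumably POSTs to the new /page/screenshot endpoint and returns
    # a dict with 'success' and base64 'image_data', as used by run.py.
    shot = client.capture_screenshot("client-123", "tab-456", full_page=False)
    if shot['success'] and shot.get('image_data'):
        print("screenshot captured,", len(shot['image_data']), "base64 chars")
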
๐Ÿค– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- eval-server/nodejs/src/api-server.js | 77 +++++++++++ evals/lib/judge.py | 195 +++++++++++++++++++++++++++ 2 files changed, 272 insertions(+) diff --git a/eval-server/nodejs/src/api-server.js b/eval-server/nodejs/src/api-server.js index 8dc43f2..fd3b13d 100644 --- a/eval-server/nodejs/src/api-server.js +++ b/eval-server/nodejs/src/api-server.js @@ -142,6 +142,22 @@ class APIServer { result = await this.handleResponsesRequest(JSON.parse(body)); break; + case '/page/content': + if (method !== 'POST') { + this.sendError(res, 405, 'Method not allowed'); + return; + } + result = await this.getPageContent(JSON.parse(body)); + break; + + case '/page/screenshot': + if (method !== 'POST') { + this.sendError(res, 405, 'Method not allowed'); + return; + } + result = await this.getScreenshot(JSON.parse(body)); + break; + default: this.sendError(res, 404, 'Not found'); return; @@ -349,6 +365,67 @@ class APIServer { }; } + async getPageContent(payload) { + const { clientId, tabId, format = 'html' } = payload; + + if (!clientId) { + throw new Error('Client ID is required'); + } + + if (!tabId) { + throw new Error('Tab ID is required'); + } + + if (!['html', 'text'].includes(format)) { + throw new Error('Format must be either "html" or "text"'); + } + + const baseClientId = clientId.split(':')[0]; + + logger.info('Getting page content', { baseClientId, tabId, format }); + + // Call appropriate method based on format + const result = format === 'html' + ? await this.evaluationServer.getPageHTML(tabId) + : await this.evaluationServer.getPageText(tabId); + + return { + clientId: baseClientId, + tabId: result.tabId, + content: result.content, + format: result.format, + length: result.length, + timestamp: Date.now() + }; + } + + async getScreenshot(payload) { + const { clientId, tabId, fullPage = false } = payload; + + if (!clientId) { + throw new Error('Client ID is required'); + } + + if (!tabId) { + throw new Error('Tab ID is required'); + } + + const baseClientId = clientId.split(':')[0]; + + logger.info('Capturing screenshot', { baseClientId, tabId, fullPage }); + + const result = await this.evaluationServer.captureScreenshot(tabId, { fullPage }); + + return { + clientId: baseClientId, + tabId: result.tabId, + imageData: result.imageData, + format: result.format, + fullPage: result.fullPage, + timestamp: Date.now() + }; + } + /** * Handle OpenAI Responses API compatible requests with nested model format */ diff --git a/evals/lib/judge.py b/evals/lib/judge.py index 0878c17..400b07a 100644 --- a/evals/lib/judge.py +++ b/evals/lib/judge.py @@ -190,6 +190,201 @@ def _build_judge_prompt( return prompt +class VisionJudge: + """Vision-capable LLM judge for evaluating agent responses with screenshots.""" + + def __init__( + self, + provider: str, + model_name: str, + api_key: str, + temperature: float = None + ): + """ + Initialize Vision judge. 
+ + Args: + provider: Provider name (currently only "openai" supported) + model_name: Model name (e.g., "gpt-4o", "gpt-4-vision-preview") + api_key: API key for the provider + temperature: Sampling temperature (optional, None uses model default) + """ + self.provider = provider + self.model_name = model_name + self.api_key = api_key + self.temperature = temperature + + if provider == "openai": + self.client = OpenAI(api_key=api_key) + else: + raise ValueError(f"Unsupported judge provider: {provider}") + + def judge( + self, + input_prompt: str, + response: str, + criteria: List[str], + screenshots: Dict[str, str] = None, + verification_prompts: List[str] = None + ) -> JudgeResult: + """ + Judge a response against evaluation criteria with visual verification. + + Args: + input_prompt: The original input/prompt sent to the agent + response: The agent's response to evaluate + criteria: List of criteria strings to evaluate against + screenshots: Dict with 'before' and/or 'after' screenshot base64 data URLs + verification_prompts: Optional list of visual verification prompts + + Returns: + JudgeResult with pass/fail, score, and reasoning + """ + # Build judgment prompt + judge_prompt = self._build_judge_prompt( + input_prompt, + response, + criteria, + verification_prompts or [] + ) + + # Build message content with text and images + content = [{"type": "text", "text": judge_prompt}] + + # Add screenshots if provided + if screenshots: + if screenshots.get("before"): + content.append({ + "type": "image_url", + "image_url": {"url": screenshots["before"], "detail": "auto"} + }) + content.append({ + "type": "text", + "text": "BEFORE Screenshot: The page state before the agent action" + }) + + if screenshots.get("after"): + content.append({ + "type": "image_url", + "image_url": {"url": screenshots["after"], "detail": "auto"} + }) + content.append({ + "type": "text", + "text": "AFTER Screenshot: The page state after the agent action" + }) + + try: + # Build API call parameters + call_params = { + "model": self.model_name, + "messages": [ + { + "role": "system", + "content": "You are an expert evaluator assessing AI agent responses with visual verification capabilities. " + "Analyze both text responses and screenshots to provide objective, detailed assessments based on the given criteria." + }, + { + "role": "user", + "content": content + } + ], + "response_format": {"type": "json_object"} + } + + # Only add temperature if it's specified + if self.temperature is not None: + call_params["temperature"] = self.temperature + + # Call LLM to judge + completion = self.client.chat.completions.create(**call_params) + + # Parse response + result_text = completion.choices[0].message.content + result_data = json.loads(result_text) + + # Extract fields + passed = result_data.get("passed", False) + score = result_data.get("score", 0.0) + reasoning = result_data.get("reasoning", "") + criteria_results = result_data.get("criteria_results", {}) + + return JudgeResult( + passed=passed, + score=score, + reasoning=reasoning, + criteria_results=criteria_results + ) + + except Exception as e: + # Return failure result on error + return JudgeResult( + passed=False, + score=0.0, + reasoning=f"Vision judge evaluation failed: {str(e)}", + criteria_results={} + ) + + def _build_judge_prompt( + self, + input_prompt: str, + response: str, + criteria: List[str], + verification_prompts: List[str] + ) -> str: + """ + Build the judgment prompt for the vision LLM. 
+ + Args: + input_prompt: Original input + response: Agent's response + criteria: List of evaluation criteria + verification_prompts: List of visual verification prompts + + Returns: + Formatted prompt string + """ + criteria_list = "\n".join([f"{i+1}. {c}" for i, c in enumerate(criteria)]) + + prompt = f"""Evaluate the following AI agent response against the specified criteria. + +## Original Input/Task +{input_prompt} + +## Agent's Response +{response} +""" + + # Add visual verification prompts if provided + if verification_prompts: + verification_list = "\n".join([f"{i+1}. {p}" for i, p in enumerate(verification_prompts)]) + prompt += f""" +## Visual Verification Prompts +{verification_list} +""" + + prompt += f""" +## Evaluation Criteria +{criteria_list} + +## Your Task +Evaluate whether the agent's response satisfies each criterion. Use the screenshots (if provided) to verify the visual state of the page before and after the agent's action. Provide your assessment in JSON format with the following structure: + +{{ + "passed": true/false, // Overall pass/fail + "score": 0.0-1.0, // Numerical score (0=complete failure, 1=perfect) + "reasoning": "Detailed explanation of your assessment including visual analysis", + "criteria_results": {{ + "Criterion 1 text": true/false, + "Criterion 2 text": true/false, + ... + }} +}} + +Be strict but fair in your evaluation. A response should only pass if it genuinely satisfies the criteria. +""" + return prompt + + class SimpleJudge: """Simple keyword-based judge for basic evaluations (fallback).""" From 1fb0bc17a7c7e0d6e2fd0441c998fc4fdb185014 Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Tue, 21 Oct 2025 09:28:38 -0500 Subject: [PATCH 14/24] Another attempt to cleanup --- .../nodejs/evals/action-agent/a11y-001.yaml | 46 - .../evals/action-agent/accordion-001.yaml | 46 - .../action-agent/action-agent-a11y-001.yaml | 46 - .../action-agent-accordion-001.yaml | 46 - .../action-agent-autocomplete-001.yaml | 46 - .../action-agent-checkbox-001.yaml | 46 - .../action-agent-checkbox-002.yaml | 47 - .../action-agent/action-agent-click-001.yaml | 47 - .../action-agent-context-001.yaml | 46 - .../action-agent-datepicker-001.yaml | 46 - .../action-agent-daterange-001.yaml | 46 - .../action-agent-dropdown-001.yaml | 46 - .../action-agent-dynamic-001.yaml | 46 - .../action-agent-ecommerce-001.yaml | 46 - .../action-agent/action-agent-error-001.yaml | 47 - .../action-agent/action-agent-filter-001.yaml | 46 - .../action-agent/action-agent-form-001.yaml | 46 - .../action-agent/action-agent-hover-001.yaml | 46 - .../action-agent-keyboard-001.yaml | 46 - .../action-agent/action-agent-login-001.yaml | 47 - .../action-agent/action-agent-modal-001.yaml | 46 - .../action-agent-multiselect-001.yaml | 46 - .../action-agent-multistep-001.yaml | 47 - .../action-agent/action-agent-nav-001.yaml | 46 - .../action-agent/action-agent-radio-001.yaml | 47 - .../action-agent/action-agent-slider-001.yaml | 46 - .../action-agent-tableselect-001.yaml | 46 - .../action-agent-tablesort-001.yaml | 46 - .../action-agent/action-agent-tabs-001.yaml | 46 - .../action-agent-timepicker-001.yaml | 46 - .../action-agent/action-agent-upload-001.yaml | 46 - .../action-agent/action-agent-video-001.yaml | 47 - .../action-agent/action-agent-video-002.yaml | 47 - .../evals/action-agent/autocomplete-001.yaml | 46 - .../evals/action-agent/checkbox-001.yaml | 46 - .../evals/action-agent/checkbox-002.yaml | 47 - .../nodejs/evals/action-agent/click-001.yaml | 47 - .../evals/action-agent/context-001.yaml | 
46 - .../evals/action-agent/datepicker-001.yaml | 46 - .../evals/action-agent/daterange-001.yaml | 46 - .../evals/action-agent/dropdown-001.yaml | 46 - .../evals/action-agent/dynamic-001.yaml | 46 - .../evals/action-agent/ecommerce-001.yaml | 46 - .../nodejs/evals/action-agent/error-001.yaml | 47 - .../nodejs/evals/action-agent/filter-001.yaml | 46 - .../nodejs/evals/action-agent/form-001.yaml | 46 - .../nodejs/evals/action-agent/hover-001.yaml | 46 - .../evals/action-agent/keyboard-001.yaml | 46 - .../nodejs/evals/action-agent/login-001.yaml | 47 - .../nodejs/evals/action-agent/modal-001.yaml | 46 - .../evals/action-agent/multiselect-001.yaml | 46 - .../evals/action-agent/multistep-001.yaml | 47 - .../nodejs/evals/action-agent/nav-001.yaml | 46 - .../nodejs/evals/action-agent/radio-001.yaml | 47 - .../nodejs/evals/action-agent/slider-001.yaml | 46 - .../evals/action-agent/tableselect-001.yaml | 46 - .../evals/action-agent/tablesort-001.yaml | 46 - .../nodejs/evals/action-agent/tabs-001.yaml | 46 - .../evals/action-agent/timepicker-001.yaml | 46 - .../nodejs/evals/action-agent/upload-001.yaml | 46 - .../nodejs/evals/action-agent/video-001.yaml | 47 - .../nodejs/evals/action-agent/video-002.yaml | 47 - eval-server/nodejs/evals/config.yaml | 11 - .../end-to-end/b-vitamins-research-001.yaml | 35 - .../end-to-end/investment-research-001.yaml | 35 - .../end-to-end/product-comparison-001.yaml | 40 - .../end-to-end/recipe-nutrition-001.yaml | 40 - .../evals/end-to-end/travel-planning-001.yaml | 40 - .../evals/research-agent/basic-001.yaml | 39 - .../evals/research-agent/business-001.yaml | 39 - .../evals/research-agent/comparison-001.yaml | 39 - .../evals/research-agent/current-001.yaml | 40 - .../nodejs/evals/research-agent/edge-001.yaml | 39 - .../research-agent-basic-001.yaml | 39 - .../research-agent-business-001.yaml | 39 - .../research-agent-comparison-001.yaml | 39 - .../research-agent-current-001.yaml | 40 - .../research-agent-edge-001.yaml | 39 - .../research-agent-technical-001.yaml | 39 - .../research-agent-tools-001.yaml | 40 - .../evals/research-agent/technical-001.yaml | 39 - .../evals/research-agent/tools-001.yaml | 40 - .../schema-extractor/amazon-product-001.yaml | 78 - .../evals/schema-extractor/bbc-news-001.yaml | 69 - .../schema-extractor/bing-search-001.yaml | 70 - .../github-repo-001-streamlined.yaml | 66 - .../schema-extractor/github-repo-001.yaml | 66 - .../schema-extractor/google-flights-001.yaml | 106 -- .../schema-extractor/google-search-001.yaml | 76 - .../evals/schema-extractor/homedepot-001.yaml | 92 -- .../evals/schema-extractor/macys-001.yaml | 106 -- .../wikipedia-search-001.yaml | 77 - .../dynamic-content-verification-001.yaml | 45 - .../screenshot-error-handling-001.yaml | 42 - .../screenshot-fullpage-001.yaml | 43 - .../screenshot-viewport-001.yaml | 42 - .../visual-comparison-001.yaml | 45 - .../amazon-product-001.yaml | 78 - .../bbc-news-001.yaml | 69 - .../bing-search-001.yaml | 70 - .../github-repo-001.yaml | 66 - .../google-flights-001.yaml | 106 -- .../google-search-001.yaml | 76 - .../homedepot-001.yaml | 92 -- .../macys-001.yaml | 106 -- .../wikipedia-001.yaml | 76 - .../wikipedia-search-001.yaml | 77 - .../evals/web-task-agent/booking-001.yaml | 45 - .../evals/web-task-agent/ecommerce-001.yaml | 53 - .../evals/web-task-agent/error-001.yaml | 45 - .../evals/web-task-agent/extract-001.yaml | 60 - .../evals/web-task-agent/finance-001.yaml | 68 - .../evals/web-task-agent/flight-001.yaml | 45 - .../nodejs/evals/web-task-agent/food-001.yaml | 68 - 
.../evals/web-task-agent/iframe-001.yaml | 83 -- .../nodejs/evals/web-task-agent/jobs-001.yaml | 68 - .../evals/web-task-agent/learning-001.yaml | 69 - .../nodejs/evals/web-task-agent/nav-001.yaml | 46 - .../nodejs/evals/web-task-agent/news-001.yaml | 64 - .../evals/web-task-agent/realestate-001.yaml | 70 - .../evals/web-task-agent/scroll-001.yaml | 61 - .../evals/web-task-agent/scroll-002.yaml | 65 - .../evals/web-task-agent/scroll-003.yaml | 61 - .../evals/web-task-agent/scroll-004.yaml | 61 - .../evals/web-task-agent/scroll-005.yaml | 73 - .../evals/web-task-agent/search-001.yaml | 41 - .../evals/web-task-agent/social-001.yaml | 60 - .../web-task-agent-booking-001.yaml | 45 - .../web-task-agent-ecommerce-001.yaml | 53 - .../web-task-agent-error-001.yaml | 45 - .../web-task-agent-extract-001.yaml | 60 - .../web-task-agent-finance-001.yaml | 68 - .../web-task-agent-flight-001.yaml | 45 - .../web-task-agent-food-001.yaml | 68 - .../web-task-agent-iframe-001.yaml | 83 -- .../web-task-agent-jobs-001.yaml | 68 - .../web-task-agent-learning-001.yaml | 69 - .../web-task-agent-nav-001.yaml | 46 - .../web-task-agent-news-001.yaml | 64 - .../web-task-agent-realestate-001.yaml | 70 - .../web-task-agent-scroll-001.yaml | 61 - .../web-task-agent-scroll-002.yaml | 65 - .../web-task-agent-scroll-003.yaml | 61 - .../web-task-agent-scroll-004.yaml | 61 - .../web-task-agent-scroll-005.yaml | 73 - .../web-task-agent-search-001.yaml | 41 - .../web-task-agent-social-001.yaml | 60 - eval-server/nodejs/start.js | 39 + eval-server/python/README.md | 368 ----- eval-server/python/UV_COMMANDS.md | 188 --- eval-server/python/evals/README.md | 195 --- .../python/evals/browsecomp_dataset.py | 252 ---- .../python/evals/browsecomp_eval_server.py | 836 ----------- eval-server/python/evals/browsecomp_scorer.py | 328 ----- .../evals/run_browsecomp_eval_server.sh | 12 - eval-server/python/examples/__init__.py | 10 - eval-server/python/examples/basic_server.py | 100 -- eval-server/python/examples/logs/.gitignore | 2 - .../python/examples/programmatic_evals.py | 428 ------ eval-server/python/examples/with_stack.py | 201 --- eval-server/python/logs/.gitignore | 2 - eval-server/python/pyproject.toml | 84 -- eval-server/python/quick_test.py | 38 - eval-server/python/requirements.txt | 10 - eval-server/python/run.py | 100 -- eval-server/python/scripts.py | 68 - .../python/src/bo_eval_server/__init__.py | 29 - .../src/bo_eval_server/client_manager.py | 401 ----- .../python/src/bo_eval_server/config.py | 75 - .../python/src/bo_eval_server/eval_server.py | 292 ---- .../src/bo_eval_server/evaluation_stack.py | 102 -- .../python/src/bo_eval_server/logger.py | 180 --- .../python/src/bo_eval_server/rpc_client.py | 229 --- eval-server/python/test_client.py | 190 --- eval-server/python/uv.lock | 1306 ----------------- evals/config.yml | 40 +- 176 files changed, 67 insertions(+), 13908 deletions(-) delete mode 100644 eval-server/nodejs/evals/action-agent/a11y-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/accordion-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-a11y-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-accordion-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-autocomplete-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-checkbox-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-checkbox-002.yaml delete mode 100644 
eval-server/nodejs/evals/action-agent/action-agent-click-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-context-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-datepicker-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-daterange-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-dropdown-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-dynamic-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-ecommerce-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-error-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-filter-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-form-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-hover-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-keyboard-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-login-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-modal-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-multiselect-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-multistep-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-nav-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-radio-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-slider-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-tableselect-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-tablesort-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-tabs-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-timepicker-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-upload-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-video-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/action-agent-video-002.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/autocomplete-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/checkbox-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/checkbox-002.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/click-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/context-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/datepicker-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/daterange-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/dropdown-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/dynamic-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/ecommerce-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/error-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/filter-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/form-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/hover-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/keyboard-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/login-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/modal-001.yaml delete mode 100644 
eval-server/nodejs/evals/action-agent/multiselect-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/multistep-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/nav-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/radio-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/slider-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/tableselect-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/tablesort-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/tabs-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/timepicker-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/upload-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/video-001.yaml delete mode 100644 eval-server/nodejs/evals/action-agent/video-002.yaml delete mode 100644 eval-server/nodejs/evals/config.yaml delete mode 100644 eval-server/nodejs/evals/end-to-end/b-vitamins-research-001.yaml delete mode 100644 eval-server/nodejs/evals/end-to-end/investment-research-001.yaml delete mode 100644 eval-server/nodejs/evals/end-to-end/product-comparison-001.yaml delete mode 100644 eval-server/nodejs/evals/end-to-end/recipe-nutrition-001.yaml delete mode 100644 eval-server/nodejs/evals/end-to-end/travel-planning-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/basic-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/business-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/comparison-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/current-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/edge-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/research-agent-basic-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/research-agent-business-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/research-agent-comparison-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/research-agent-current-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/research-agent-edge-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/research-agent-technical-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/research-agent-tools-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/technical-001.yaml delete mode 100644 eval-server/nodejs/evals/research-agent/tools-001.yaml delete mode 100644 eval-server/nodejs/evals/schema-extractor/amazon-product-001.yaml delete mode 100644 eval-server/nodejs/evals/schema-extractor/bbc-news-001.yaml delete mode 100644 eval-server/nodejs/evals/schema-extractor/bing-search-001.yaml delete mode 100644 eval-server/nodejs/evals/schema-extractor/github-repo-001-streamlined.yaml delete mode 100644 eval-server/nodejs/evals/schema-extractor/github-repo-001.yaml delete mode 100644 eval-server/nodejs/evals/schema-extractor/google-flights-001.yaml delete mode 100644 eval-server/nodejs/evals/schema-extractor/google-search-001.yaml delete mode 100644 eval-server/nodejs/evals/schema-extractor/homedepot-001.yaml delete mode 100644 eval-server/nodejs/evals/schema-extractor/macys-001.yaml delete mode 100644 eval-server/nodejs/evals/schema-extractor/wikipedia-search-001.yaml delete mode 100644 eval-server/nodejs/evals/screenshot-verification/dynamic-content-verification-001.yaml delete mode 100644 eval-server/nodejs/evals/screenshot-verification/screenshot-error-handling-001.yaml delete 
mode 100644 eval-server/nodejs/evals/screenshot-verification/screenshot-fullpage-001.yaml delete mode 100644 eval-server/nodejs/evals/screenshot-verification/screenshot-viewport-001.yaml delete mode 100644 eval-server/nodejs/evals/screenshot-verification/visual-comparison-001.yaml delete mode 100644 eval-server/nodejs/evals/streamlined-schema-extractor/amazon-product-001.yaml delete mode 100644 eval-server/nodejs/evals/streamlined-schema-extractor/bbc-news-001.yaml delete mode 100644 eval-server/nodejs/evals/streamlined-schema-extractor/bing-search-001.yaml delete mode 100644 eval-server/nodejs/evals/streamlined-schema-extractor/github-repo-001.yaml delete mode 100644 eval-server/nodejs/evals/streamlined-schema-extractor/google-flights-001.yaml delete mode 100644 eval-server/nodejs/evals/streamlined-schema-extractor/google-search-001.yaml delete mode 100644 eval-server/nodejs/evals/streamlined-schema-extractor/homedepot-001.yaml delete mode 100644 eval-server/nodejs/evals/streamlined-schema-extractor/macys-001.yaml delete mode 100644 eval-server/nodejs/evals/streamlined-schema-extractor/wikipedia-001.yaml delete mode 100644 eval-server/nodejs/evals/streamlined-schema-extractor/wikipedia-search-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/booking-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/ecommerce-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/error-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/extract-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/finance-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/flight-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/food-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/iframe-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/jobs-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/learning-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/nav-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/news-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/realestate-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/scroll-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/scroll-002.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/scroll-003.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/scroll-004.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/scroll-005.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/search-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/social-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-booking-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-ecommerce-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-error-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-extract-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-finance-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-flight-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-food-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-iframe-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-jobs-001.yaml delete mode 100644 
eval-server/nodejs/evals/web-task-agent/web-task-agent-learning-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-nav-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-news-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-realestate-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-002.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-003.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-004.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-005.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-search-001.yaml delete mode 100644 eval-server/nodejs/evals/web-task-agent/web-task-agent-social-001.yaml create mode 100644 eval-server/nodejs/start.js delete mode 100644 eval-server/python/README.md delete mode 100644 eval-server/python/UV_COMMANDS.md delete mode 100644 eval-server/python/evals/README.md delete mode 100644 eval-server/python/evals/browsecomp_dataset.py delete mode 100755 eval-server/python/evals/browsecomp_eval_server.py delete mode 100644 eval-server/python/evals/browsecomp_scorer.py delete mode 100755 eval-server/python/evals/run_browsecomp_eval_server.sh delete mode 100644 eval-server/python/examples/__init__.py delete mode 100644 eval-server/python/examples/basic_server.py delete mode 100644 eval-server/python/examples/logs/.gitignore delete mode 100644 eval-server/python/examples/programmatic_evals.py delete mode 100644 eval-server/python/examples/with_stack.py delete mode 100644 eval-server/python/logs/.gitignore delete mode 100644 eval-server/python/pyproject.toml delete mode 100644 eval-server/python/quick_test.py delete mode 100644 eval-server/python/requirements.txt delete mode 100644 eval-server/python/run.py delete mode 100644 eval-server/python/scripts.py delete mode 100644 eval-server/python/src/bo_eval_server/__init__.py delete mode 100644 eval-server/python/src/bo_eval_server/client_manager.py delete mode 100644 eval-server/python/src/bo_eval_server/config.py delete mode 100644 eval-server/python/src/bo_eval_server/eval_server.py delete mode 100644 eval-server/python/src/bo_eval_server/evaluation_stack.py delete mode 100644 eval-server/python/src/bo_eval_server/logger.py delete mode 100644 eval-server/python/src/bo_eval_server/rpc_client.py delete mode 100644 eval-server/python/test_client.py delete mode 100644 eval-server/python/uv.lock diff --git a/eval-server/nodejs/evals/action-agent/a11y-001.yaml b/eval-server/nodejs/evals/action-agent/a11y-001.yaml deleted file mode 100644 index 7c7947a..0000000 --- a/eval-server/nodejs/evals/action-agent/a11y-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Accessibility action test -id: "a11y-001" -name: "Click Using ARIA Label" -description: "Test clicking an element identified primarily by ARIA attributes" -enabled: true - -target: - url: "https://www.w3.org/WAI/ARIA/apg/patterns/button/examples/button/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click the button with aria-label \"Print Page\"" - reasoning: "Testing action selection using accessibility attributes" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Used accessibility tree to find 
elements" - - "Correctly identified element by ARIA label" - - "Successfully clicked the target button" - - "Demonstrated understanding of accessibility attributes" - - "No reliance on visual appearance alone" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the Print Page button was successfully clicked" - - "Check if any print dialog or print preview appeared" - - "Confirm the button showed visual feedback (pressed state)" - - "Ensure the action was performed on the correct accessibility-labeled element" - -metadata: - tags: ["action", "accessibility", "aria", "click", "a11y"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/accordion-001.yaml b/eval-server/nodejs/evals/action-agent/accordion-001.yaml deleted file mode 100644 index dae142d..0000000 --- a/eval-server/nodejs/evals/action-agent/accordion-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Accordion expansion test -id: "accordion-001" -name: "Expand Accordion Section" -description: "Test clicking to expand an accordion panel" -enabled: true - -target: - url: "https://jqueryui.com/accordion/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click to expand the \"Section 2\" accordion panel" - reasoning: "Testing accordion expand/collapse interaction" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Section 2 accordion header" - - "Successfully clicked to expand the section" - - "Section 2 content became visible" - - "Other sections collapsed appropriately" - - "Accordion animation completed smoothly" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify Section 2 is now expanded and content visible" - - "Check if other accordion sections collapsed" - - "Confirm the expansion animation completed" - - "Ensure Section 2 header shows expanded state" - -metadata: - tags: ["action", "accordion", "expand", "collapse", "ui"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-a11y-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-a11y-001.yaml deleted file mode 100644 index 9526551..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-a11y-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Accessibility action test -id: "action-agent-a11y-001" -name: "Click Using ARIA Label" -description: "Test clicking an element identified primarily by ARIA attributes" -enabled: true - -target: - url: "https://www.w3.org/WAI/ARIA/apg/patterns/button/examples/button/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click the button with aria-label \"Print Page\"" - reasoning: "Testing action selection using accessibility attributes" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Used accessibility tree to find elements" - - "Correctly identified element by ARIA label" - - "Successfully clicked the target button" - - "Demonstrated understanding of accessibility attributes" - - "No reliance on visual appearance alone" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the Print Page 
button was successfully clicked" - - "Check if any print dialog or print preview appeared" - - "Confirm the button showed visual feedback (pressed state)" - - "Ensure the action was performed on the correct accessibility-labeled element" - -metadata: - tags: ["action", "accessibility", "aria", "click", "a11y"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-accordion-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-accordion-001.yaml deleted file mode 100644 index f2df343..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-accordion-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Accordion expansion test -id: "action-agent-accordion-001" -name: "Expand Accordion Section" -description: "Test clicking to expand an accordion panel" -enabled: true - -target: - url: "https://jqueryui.com/accordion/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click to expand the \"Section 2\" accordion panel" - reasoning: "Testing accordion expand/collapse interaction" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Section 2 accordion header" - - "Successfully clicked to expand the section" - - "Section 2 content became visible" - - "Other sections collapsed appropriately" - - "Accordion animation completed smoothly" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify Section 2 is now expanded and content visible" - - "Check if other accordion sections collapsed" - - "Confirm the expansion animation completed" - - "Ensure Section 2 header shows expanded state" - -metadata: - tags: ["action", "accordion", "expand", "collapse", "ui"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-autocomplete-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-autocomplete-001.yaml deleted file mode 100644 index c22bfc7..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-autocomplete-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Autocomplete search test -id: "action-agent-autocomplete-001" -name: "Use Autocomplete Search" -description: "Test typing in autocomplete field and selecting from suggestions" -enabled: true - -target: - url: "https://jqueryui.com/autocomplete/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Type \"Java\" in the autocomplete field and select \"JavaScript\" from suggestions" - reasoning: "Testing autocomplete/typeahead interaction patterns" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the autocomplete input field" - - "Typed \"Java\" to trigger suggestions" - - "Autocomplete dropdown appeared with suggestions" - - "Selected \"JavaScript\" from the suggestion list" - - "Input field shows the selected value" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify \"JavaScript\" appears in the input field" - - "Check if autocomplete suggestions appeared" - - "Confirm the correct suggestion was selected" - - "Ensure dropdown closed after selection" - -metadata: - tags: ["action", "autocomplete", "typeahead", "search", "suggestions"] 
- priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-checkbox-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-checkbox-001.yaml deleted file mode 100644 index b76f307..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-checkbox-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Checkbox/radio button test -id: "action-agent-checkbox-001" -name: "Toggle Newsletter Checkbox" -description: "Test clicking checkbox elements for form options" -enabled: true - -target: - url: "https://www.w3schools.com/html/tryit.asp?filename=tryhtml_checkbox" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 45000 - -input: - objective: "Click the checkbox labeled \"I have a bike\" to check it" - reasoning: "Testing interaction with checkbox form elements" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Identified the correct checkbox among multiple options" - - "Used click action on the checkbox element" - - "Checkbox state changed from unchecked to checked" - - "Handled the iframe structure if present" - - "No errors with form element interaction" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify the checkbox state changed from unchecked to checked" - - "Confirm the \"I have a bike\" checkbox now shows a checkmark" - - "Verify the checkbox visual indicator (checkmark) is clearly visible" - - "Ensure no other checkboxes were accidentally modified" - -metadata: - tags: ["action", "checkbox", "form", "w3schools", "input"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-checkbox-002.yaml b/eval-server/nodejs/evals/action-agent/action-agent-checkbox-002.yaml deleted file mode 100644 index 0b25fa8..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-checkbox-002.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Toggle checkbox test - using HTML form test site -id: "action-agent-checkbox-002" -name: "Check Extra Cheese Checkbox" -description: "Test checking a specific checkbox using the check method" -enabled: true - -target: - url: "https://httpbin.org/forms/post" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 45000 - -input: - objective: "Find and check the \"Extra Cheese\" checkbox in the Pizza Toppings section" - reasoning: "Testing checkbox interaction functionality using check method" - hint: "Look for the Extra Cheese checkbox and use the check method to select it" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Extra Cheese checkbox in the Pizza Toppings section" - - "Used the check method instead of click for better reliability" - - "Checkbox became checked (if it wasn't already)" - - "No errors occurred during checkbox interaction" - - "Form maintained its structure after checkbox selection" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the Extra Cheese checkbox is now checked (shows checkmark)" - - "Check that the checkbox shows proper visual feedback for checked state" - - "Confirm the form structure remained intact" - - "Ensure the checkbox for Extra Cheese was specifically targeted and checked" 
- -metadata: - tags: ["action", "checkbox", "check", "form", "httpbin"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-click-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-click-001.yaml deleted file mode 100644 index e9af6cf..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-click-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Basic search interaction test -id: "action-agent-click-001" -name: "Search with Text Entry and Click" -description: "Test entering text in search field and clicking search button" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 90000 - -input: - objective: "Type \"DevTools automation\" in the search box and then click the \"Google Search\" button" - reasoning: "Testing multi-step interaction: text input followed by button click" - hint: "First fill the search input field, then find and click the search button" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Successfully located the search input field" - - "Entered \"DevTools automation\" text in the search box" - - "Located the Google Search button after entering text" - - "Successfully clicked the search button" - - "Search was executed and results page loaded" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify text \"DevTools automation\" was entered in the search field" - - "Check if search results page loaded with relevant results" - - "Confirm the search was executed (URL changed to results page)" - - "Ensure search results are related to \"DevTools automation\"" - -metadata: - tags: ["action", "multi-step", "search", "form-fill", "click", "google", "basic"] - priority: "high" - timeout: 90000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-context-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-context-001.yaml deleted file mode 100644 index 6162697..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-context-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Right click context menu test -id: "action-agent-context-001" -name: "Right Click Context Menu" -description: "Test right-clicking to open context menu" -enabled: true - -target: - url: "https://the-internet.herokuapp.com/context_menu" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Right-click on the context menu area to open the context menu" - reasoning: "Testing right-click context menu interaction" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the designated context menu area" - - "Performed right-click action correctly" - - "Context menu appeared with options" - - "Successfully triggered the right-click event" - - "Alert or confirmation appeared as expected" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify right-click was performed on correct area" - - "Check if context menu or alert appeared" - - "Confirm right-click event was properly triggered" - - "Ensure the expected response occurred" - -metadata: - tags: ["action", "context-menu", "right-click", "mouse", "menu"] - priority: "high" 
- timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-datepicker-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-datepicker-001.yaml deleted file mode 100644 index f4abbf7..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-datepicker-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Date picker test -id: "action-agent-datepicker-001" -name: "Select Date from Calendar" -description: "Test clicking date input and selecting a specific date from calendar popup" -enabled: true - -target: - url: "https://jqueryui.com/datepicker/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click the date input field and select March 15, 2024 from the calendar picker" - reasoning: "Testing interaction with calendar popup widgets" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located and clicked the date input field" - - "Calendar popup opened successfully" - - "Navigated to correct month/year if needed" - - "Selected the specific date (March 15, 2024)" - - "Date input field shows the selected date" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the date input field contains the selected date" - - "Check if the calendar widget opened and closed properly" - - "Confirm the correct date was highlighted and selected" - - "Ensure the date format matches expected output" - -metadata: - tags: ["action", "datepicker", "calendar", "form", "popup"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-daterange-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-daterange-001.yaml deleted file mode 100644 index 4581a47..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-daterange-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Date range picker test -id: "action-agent-daterange-001" -name: "Select Date Range" -description: "Test selecting a date range with start and end dates" -enabled: true - -target: - url: "https://www.daterangepicker.com/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Select a date range from February 1, 2024 to February 28, 2024" - reasoning: "Testing complex date range selection with start and end dates" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Opened the date range picker interface" - - "Selected the start date (February 1, 2024)" - - "Selected the end date (February 28, 2024)" - - "Date range was properly applied" - - "Input field shows the complete date range" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify both start and end dates are displayed in the input" - - "Check if the date range picker shows the selected range" - - "Confirm the format matches expected date range display" - - "Ensure both dates were selected in sequence" - -metadata: - tags: ["action", "daterange", "date-picker", "form", "complex"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-dropdown-001.yaml 
b/eval-server/nodejs/evals/action-agent/action-agent-dropdown-001.yaml deleted file mode 100644 index b37b91c..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-dropdown-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Dropdown selection test -id: "action-agent-dropdown-001" -name: "Select Dropdown Option" -description: "Test selecting an option from a dropdown menu" -enabled: true - -target: - url: "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_select" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 45000 - -input: - objective: "Select \"Audi\" from the car brands dropdown menu" - reasoning: "Testing dropdown selection interaction" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the dropdown/select element" - - "Identified the correct option to select" - - "Successfully selected the Audi option" - - "Dropdown value changed to the selected option" - - "Handled select element interaction properly" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify the dropdown selection changed" - - "Confirm \"Audi\" is now displayed as the selected option" - - "Check if the dropdown is closed after selection" - - "Verify no other form elements were affected by the selection" - -metadata: - tags: ["action", "dropdown", "select", "form", "w3schools"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-dynamic-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-dynamic-001.yaml deleted file mode 100644 index a4380f3..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-dynamic-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Dynamic content interaction test -id: "action-agent-dynamic-001" -name: "Click Dynamic Load Button" -description: "Test clicking a button that loads dynamic content" -enabled: true - -target: - url: "https://the-internet.herokuapp.com/dynamic_loading/1" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 90000 - -input: - objective: "Click the \"Start\" button to trigger dynamic content loading" - reasoning: "Testing interaction with dynamically loaded content" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Found and clicked the Start button" - - "Handled the dynamic loading process" - - "Recognized that content changes after clicking" - - "No timing issues with the dynamic content" - - "Successfully triggered the loading animation" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify dynamic content loaded after clicking Start" - - "Check if loading animation or spinner was displayed" - - "Confirm new content appeared that was previously hidden" - - "Verify the Start button state changed or was replaced after clicking" - -metadata: - tags: ["action", "dynamic", "click", "ajax", "loading"] - priority: "high" - timeout: 90000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-ecommerce-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-ecommerce-001.yaml deleted file mode 100644 index 503c157..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-ecommerce-001.yaml +++ 
/dev/null @@ -1,46 +0,0 @@ -# E-commerce action test -id: "action-agent-ecommerce-001" -name: "Add Product to Cart" -description: "Test clicking \"Add to Cart\" button on an e-commerce product page" -enabled: true - -target: - url: "https://www.homedepot.com/p/Husky-20-Gal-Professional-Duty-Waterproof-Storage-Container-with-Hinged-Lid-in-Red-249160/313799634" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 180000 - -input: - objective: "Click the \"Add to Cart\" button for this storage container" - reasoning: "Testing e-commerce interaction with product cart functionality" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Add to Cart button on the product page" - - "Successfully clicked the button" - - "Handled any popups or confirmations that appeared" - - "Verified the item was added (cart count changed or confirmation shown)" - - "Dealt with page dynamics after clicking" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify the Add to Cart button was clicked" - - "Check if cart count indicator increased or shows the item was added" - - "Look for any confirmation popup or notification about the item being added" - - "Verify the button state changed (e.g., to \"Added to Cart\" or disabled)" - -metadata: - tags: ["action", "ecommerce", "click", "homedepot", "cart"] - priority: "high" - timeout: 180000 - retries: 3 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-error-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-error-001.yaml deleted file mode 100644 index 43c95e6..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-error-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Error recovery test -id: "action-agent-error-001" -name: "Handle Missing Element" -description: "Test agent behavior when target element is not found" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click the \"Sign Up\" button" - reasoning: "Testing error handling when element does not exist" - hint: "There is no Sign Up button on Google homepage - agent should handle gracefully" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Attempted to find the requested element" - - "Recognized that the element does not exist" - - "Provided clear error message or explanation" - - "Did not crash or produce confusing output" - - "Suggested alternatives or explained the issue" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the page remains in a stable state despite the missing element" - - "Confirm no error dialogs or broken UI elements appeared" - - "Check that the agent handled the missing element gracefully" - - "Ensure the page was properly analyzed even though the target was not found" - -metadata: - tags: ["action", "error-handling", "missing-element", "recovery", "edge-case"] - priority: "high" - timeout: 60000 - retries: 1 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-filter-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-filter-001.yaml deleted file mode 100644 index 7782999..0000000 --- 
a/eval-server/nodejs/evals/action-agent/action-agent-filter-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Search filter application test -id: "action-agent-filter-001" -name: "Apply Search Filters" -description: "Test applying search filters to modify results" -enabled: true - -target: - url: "https://www.w3schools.com/howto/howto_js_filter_lists.asp" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Type \"Anna\" in the search filter to filter the list" - reasoning: "Testing search filter application" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the search filter input" - - "Typed \"Anna\" in the filter field" - - "List items filtered to show only matching results" - - "Non-matching items were hidden or removed from view" - - "Filter functionality worked as expected" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify search input contains \"Anna\"" - - "Check if list shows only items containing \"Anna\"" - - "Confirm non-matching items are not visible" - - "Ensure filter functionality reduced the visible list items" - -metadata: - tags: ["action", "filter", "search", "list", "dynamic"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-form-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-form-001.yaml deleted file mode 100644 index 61d036f..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-form-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Form fill action test -id: "action-agent-form-001" -name: "Fill Search Query" -description: "Test filling a search input field with specific text" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 45000 - -input: - objective: "Fill the search box with \"Chrome DevTools automation testing\"" - reasoning: "Testing form input capability with a specific search query" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Successfully identified the search input field" - - "Used perform_action with fill method" - - "Correctly filled the field with the specified text" - - "Verified the field accepted the input" - - "No formatting or encoding issues with the text" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to confirm text was entered in the search field" - - "Verify the exact text \"Chrome DevTools automation testing\" is visible" - - "Check if search suggestions or autocomplete dropdown appeared" - - "Ensure no input validation errors are shown" - -metadata: - tags: ["action", "form-fill", "input", "google", "basic"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-hover-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-hover-001.yaml deleted file mode 100644 index ed98fbf..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-hover-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Hover action test -id: "action-agent-hover-001" -name: "Hover to Reveal Menu" -description: "Test hovering over an element to reveal hidden content" -enabled: true - -target: - url: 
"https://the-internet.herokuapp.com/hovers" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Hover over the first user avatar image to reveal the hidden caption" - reasoning: "Testing hover interaction to reveal dynamic content" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the first user avatar image" - - "Used appropriate hover action method" - - "Successfully triggered the hover state" - - "Hidden caption became visible after hover" - - "Handled mouse interaction correctly" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify hover revealed hidden content" - - "Check that caption or overlay appeared over the first avatar" - - "Confirm the hover state is visually active on the image" - - "Verify user information or caption text is now visible" - -metadata: - tags: ["action", "hover", "mouse", "dynamic", "reveal"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-keyboard-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-keyboard-001.yaml deleted file mode 100644 index 6bfceac..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-keyboard-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Keyboard tab navigation test -id: "action-agent-keyboard-001" -name: "Keyboard Tab Navigation" -description: "Test using keyboard navigation to move between elements" -enabled: true - -target: - url: "https://www.w3.org/WAI/ARIA/apg/patterns/menubar/examples/menubar-navigation/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Use Tab key to navigate between menu items and Enter to activate" - reasoning: "Testing keyboard-only navigation patterns" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Successfully used keyboard navigation" - - "Tab key moved focus between menu items" - - "Focus indicators were visible during navigation" - - "Enter key activated the focused menu item" - - "Keyboard navigation followed accessibility standards" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify focus indicators are visible on menu items" - - "Check if keyboard navigation moved focus correctly" - - "Confirm Enter key activated the focused item" - - "Ensure accessibility navigation patterns worked" - -metadata: - tags: ["action", "keyboard", "navigation", "accessibility", "focus"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-login-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-login-001.yaml deleted file mode 100644 index 1b705ce..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-login-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Login form test -id: "action-agent-login-001" -name: "Fill Login Credentials" -description: "Test filling username and password fields in a login form" -enabled: true - -target: - url: "https://the-internet.herokuapp.com/login" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Fill the username field with \"tomsmith\" and password field with 
\"SuperSecretPassword!\"" - reasoning: "Testing form fill with multiple fields including password type" - input_data: "tomsmithSuperSecretPassword!" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Identified both username and password fields" - - "Filled username field with correct value" - - "Filled password field with correct value" - - "Handled password field type appropriately" - - "Used the provided input_data XML format correctly" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the username field shows \"tomsmith\" entered" - - "Confirm the password field has dots/asterisks indicating password entry" - - "Check that both fields are properly filled before submission" - - "Ensure no validation errors are shown for the filled fields" - -metadata: - tags: ["action", "login", "form-fill", "authentication", "multi-field"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-modal-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-modal-001.yaml deleted file mode 100644 index 1324fee..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-modal-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Modal dialog test -id: "action-agent-modal-001" -name: "Open and Close Modal" -description: "Test opening modal dialog and closing it with X button" -enabled: true - -target: - url: "https://getbootstrap.com/docs/5.0/components/modal/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click to open the modal dialog, then close it using the X button" - reasoning: "Testing modal dialog interaction patterns" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located and clicked the modal trigger button" - - "Modal dialog opened successfully" - - "Modal content was visible and accessible" - - "Found and clicked the close (X) button" - - "Modal closed and page returned to normal state" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify modal opened with visible content" - - "Check if modal overlay appeared correctly" - - "Confirm modal was closed after clicking X" - - "Ensure page background is accessible again" - -metadata: - tags: ["action", "modal", "dialog", "popup", "overlay"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-multiselect-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-multiselect-001.yaml deleted file mode 100644 index fed3f78..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-multiselect-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Multi-select dropdown test -id: "action-agent-multiselect-001" -name: "Select Multiple Options" -description: "Test selecting multiple options from a multi-select dropdown" -enabled: true - -target: - url: "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_select_multiple" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Select both \"Volvo\" and \"Audi\" from the multi-select dropdown" - reasoning: "Testing multiple selection in select elements" - -validation: - type: "llm-judge" - llm_judge: - model: 
"gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the multi-select dropdown element" - - "Successfully selected Volvo option" - - "Successfully selected Audi option" - - "Both options remain selected simultaneously" - - "Used appropriate multi-select interaction method" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify both Volvo and Audi appear selected" - - "Check if both options are highlighted/marked" - - "Confirm multi-select functionality worked correctly" - - "Ensure no other options were accidentally selected" - -metadata: - tags: ["action", "multi-select", "dropdown", "form", "multiple"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-multistep-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-multistep-001.yaml deleted file mode 100644 index 31514dd..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-multistep-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Multi-step form test -id: "action-agent-multistep-001" -name: "Complete Search and Submit" -description: "Test filling a search form and then clicking the submit button" -enabled: true - -target: - url: "https://www.bing.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Fill the search box with \"automated testing tools\" and then click the search button" - reasoning: "Testing multi-step form interaction combining fill and click actions" - hint: "This requires two actions: first fill the search field, then click the search button" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Recognized this requires multiple actions" - - "First filled the search input correctly" - - "Then located and clicked the search button" - - "Both actions completed successfully in sequence" - - "Search was initiated with the correct query" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the search input contains \"automated testing tools\" text" - - "Confirm the search was submitted and results page loaded" - - "Check that search results are related to the query" - - "Ensure the multi-step action completed fully with both fill and click" - -metadata: - tags: ["action", "multi-step", "form-fill", "click", "bing", "search"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-nav-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-nav-001.yaml deleted file mode 100644 index f49a0cf..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-nav-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Complex navigation test -id: "action-agent-nav-001" -name: "Navigate via Menu Click" -description: "Test clicking navigation menu items to navigate between pages" -enabled: true - -target: - url: "https://www.wikipedia.org" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click on the \"English\" language link to navigate to English Wikipedia" - reasoning: "Testing navigation through link clicks on a multilingual site" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Identified the correct language link among many 
options" - - "Successfully clicked the English link" - - "Navigation occurred to the English Wikipedia" - - "Used appropriate tools to verify navigation success" - - "Handled the multilingual page structure correctly" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify navigation from Wikipedia homepage to English Wikipedia" - - "Check if the page language and content changed to English" - - "Verify the URL changed to en.wikipedia.org" - - "Confirm the English Wikipedia main page is displayed" - -metadata: - tags: ["action", "navigation", "click", "wikipedia", "multilingual"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-radio-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-radio-001.yaml deleted file mode 100644 index 07d6ef8..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-radio-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Radio button selection test -id: "action-agent-radio-001" -name: "Select Radio Button Option" -description: "Test selecting a specific radio button option using click method" -enabled: true - -target: - url: "https://httpbin.org/forms/post" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 45000 - -input: - objective: "Select the \"Medium\" pizza size from the Pizza Size radio button group" - reasoning: "Testing radio button selection functionality" - hint: "Look for the Medium radio button in the Pizza Size section and click it to select" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Medium radio button in the Pizza Size section" - - "Successfully clicked the Medium radio button" - - "Radio button became selected (checked state)" - - "Other radio buttons in the same group became unselected" - - "Form maintained its structure after radio button selection" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the Medium radio button is now selected (shows filled circle)" - - "Check that other pizza size options (Small, Large) are no longer selected" - - "Confirm the form structure remained intact" - - "Ensure the Medium pizza size radio button was specifically targeted" - -metadata: - tags: ["action", "radio", "click", "form", "httpbin"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-slider-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-slider-001.yaml deleted file mode 100644 index c370658..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-slider-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Range slider test -id: "action-agent-slider-001" -name: "Adjust Range Slider" -description: "Test moving slider to set a specific value" -enabled: true - -target: - url: "https://jqueryui.com/slider/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Move the slider to set the value to 75" - reasoning: "Testing slider/range input manipulation" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the slider control element" - - "Successfully moved the slider handle" - - "Set the slider value to approximately 
75" - - "Slider position reflects the target value" - - "Any associated display shows the correct value" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify slider handle moved to represent value 75" - - "Check if value display shows 75 or close to it" - - "Confirm slider position visually matches target" - - "Ensure slider interaction was smooth and successful" - -metadata: - tags: ["action", "slider", "range", "form", "drag"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-tableselect-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-tableselect-001.yaml deleted file mode 100644 index d78e66c..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-tableselect-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Table row selection test -id: "action-agent-tableselect-001" -name: "Select Table Row" -description: "Test clicking to select a table row" -enabled: true - -target: - url: "https://datatables.net/examples/api/select_single_row.html" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click on the first row to select it" - reasoning: "Testing table row selection patterns" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the first table row" - - "Successfully clicked the row" - - "Row became highlighted/selected" - - "Selection state is visually apparent" - - "Only one row is selected at a time" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the first row is now highlighted/selected" - - "Check if row selection visual feedback is clear" - - "Confirm only the clicked row is selected" - - "Ensure row selection styling is properly applied" - -metadata: - tags: ["action", "table", "select", "row", "highlight"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-tablesort-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-tablesort-001.yaml deleted file mode 100644 index e3e3176..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-tablesort-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Table column sorting test -id: "action-agent-tablesort-001" -name: "Sort Table Column" -description: "Test clicking table column header to sort data" -enabled: true - -target: - url: "https://datatables.net/examples/basic_init/zero_configuration.html" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click on the \"Name\" column header to sort the table by name" - reasoning: "Testing table column sorting interaction" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Name column header" - - "Successfully clicked the column header" - - "Table data reordered by name alphabetically" - - "Sort indicator appeared on the Name column" - - "Table sorting completed without errors" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify table rows are now sorted alphabetically by name" - - "Check if sort arrow/indicator appears on Name column" - - "Confirm the data order changed from before to after" - - "Ensure 
table structure remained intact after sorting" - -metadata: - tags: ["action", "table", "sort", "column", "data"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-tabs-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-tabs-001.yaml deleted file mode 100644 index 22db60c..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-tabs-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Tab panel navigation test -id: "action-agent-tabs-001" -name: "Navigate Tab Panels" -description: "Test clicking tab to switch between tab panels" -enabled: true - -target: - url: "https://jqueryui.com/tabs/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click on the \"Nunc tincidunt\" tab to switch to that panel" - reasoning: "Testing tab panel navigation" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the \"Nunc tincidunt\" tab button" - - "Successfully clicked the tab" - - "Tab panel content switched to the selected tab" - - "Active tab visual state changed appropriately" - - "Content area updated to show the new panel" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the \"Nunc tincidunt\" tab is now active/highlighted" - - "Check if the content panel changed to show new content" - - "Confirm the tab switching animation completed" - - "Ensure the correct tab content is visible" - -metadata: - tags: ["action", "tabs", "navigation", "panels", "ui"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-timepicker-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-timepicker-001.yaml deleted file mode 100644 index 056fbe9..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-timepicker-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Time picker test -id: "action-agent-timepicker-001" -name: "Select Time from Picker" -description: "Test setting time using time picker controls" -enabled: true - -target: - url: "https://timepicker.co/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Set the time to 2:30 PM using the time picker controls" - reasoning: "Testing time selection with hour/minute controls" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the time picker interface" - - "Set the hour to 2 (14 for 24-hour format)" - - "Set the minutes to 30" - - "Selected PM or appropriate time format" - - "Time input shows 2:30 PM or equivalent" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the time input displays 2:30 PM or 14:30" - - "Check if hour and minute were set correctly" - - "Confirm AM/PM selection if applicable" - - "Ensure the time picker interface was properly used" - -metadata: - tags: ["action", "timepicker", "time", "form", "clock"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-upload-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-upload-001.yaml deleted file mode 100644 index 518515d..0000000 --- 
a/eval-server/nodejs/evals/action-agent/action-agent-upload-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# File upload test -id: "action-agent-upload-001" -name: "Upload File via Input" -description: "Test clicking file input and uploading a test file" -enabled: true - -target: - url: "https://the-internet.herokuapp.com/upload" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click the file input and upload a test file" - reasoning: "Testing file upload interaction through input elements" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the file input element" - - "Triggered file selection dialog" - - "Selected a file for upload" - - "File name appears in the input field" - - "Upload process initiated successfully" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify file name appears in the upload input field" - - "Check if file selection was successful" - - "Confirm upload button is available or file is ready" - - "Ensure no upload errors are displayed" - -metadata: - tags: ["action", "upload", "file", "input", "form"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-video-001.yaml b/eval-server/nodejs/evals/action-agent/action-agent-video-001.yaml deleted file mode 100644 index ba21b28..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-video-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Video playback controls test -id: "action-agent-video-001" -name: "Control Video Playback" -description: "Test starting video playback using click + spacebar" -enabled: true - -target: - url: "https://www.w3schools.com/html/html5_video.asp" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 90000 - -input: - objective: "Click the video element to focus it, then press spacebar to start playback" - reasoning: "Testing video control using standard keyboard interaction (click to focus + spacebar to play)" - hint: "First click the Video element to focus it, then use keyboard input to press the spacebar key to start playback" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Video element in the accessibility tree" - - "Successfully clicked the Video element to focus it" - - "Used keyboard input to press spacebar" - - "Video playback started after spacebar press" - - "No errors occurred during the interaction sequence" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify video player is visible on the page" - - "Check if the play button was clicked (may show pause button after)" - - "Look for visual indicators that video started playing" - - "Ensure no error messages appeared during video interaction" - -metadata: - tags: ["action", "video", "media", "controls", "playback"] - priority: "high" - timeout: 90000 - retries: 3 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/action-agent-video-002.yaml b/eval-server/nodejs/evals/action-agent/action-agent-video-002.yaml deleted file mode 100644 index d7188ec..0000000 --- a/eval-server/nodejs/evals/action-agent/action-agent-video-002.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Video play button specific targeting test -id: 
"action-agent-video-002" -name: "Click Video Play Button Specifically" -description: "Test clicking the specific play button (not the video element)" -enabled: true - -target: - url: "https://www.w3schools.com/html/html5_video.asp" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Find and click the button that has name=\"play\" (not the Video element itself)" - reasoning: "Testing specific targeting of the play button element" - hint: "Target the button element with text or label \"play\", do not click the Video element" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Found a button element (not Video element) with \"play\" in the name" - - "Successfully clicked the play button specifically" - - "Did not click on the Video element itself" - - "Play button click was executed correctly" - - "Video responded to the button click" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the play button (not video element) was clicked" - - "Check if video started playing after button click" - - "Confirm the target was the button, not the video container" - - "Look for changes in video player state" - -metadata: - tags: ["action", "video", "button", "specific-targeting"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/autocomplete-001.yaml b/eval-server/nodejs/evals/action-agent/autocomplete-001.yaml deleted file mode 100644 index 4bd4aa8..0000000 --- a/eval-server/nodejs/evals/action-agent/autocomplete-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Autocomplete search test -id: "autocomplete-001" -name: "Use Autocomplete Search" -description: "Test typing in autocomplete field and selecting from suggestions" -enabled: true - -target: - url: "https://jqueryui.com/autocomplete/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Type \"Java\" in the autocomplete field and select \"JavaScript\" from suggestions" - reasoning: "Testing autocomplete/typeahead interaction patterns" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the autocomplete input field" - - "Typed \"Java\" to trigger suggestions" - - "Autocomplete dropdown appeared with suggestions" - - "Selected \"JavaScript\" from the suggestion list" - - "Input field shows the selected value" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify \"JavaScript\" appears in the input field" - - "Check if autocomplete suggestions appeared" - - "Confirm the correct suggestion was selected" - - "Ensure dropdown closed after selection" - -metadata: - tags: ["action", "autocomplete", "typeahead", "search", "suggestions"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/checkbox-001.yaml b/eval-server/nodejs/evals/action-agent/checkbox-001.yaml deleted file mode 100644 index 041f2f6..0000000 --- a/eval-server/nodejs/evals/action-agent/checkbox-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Checkbox/radio button test -id: "checkbox-001" -name: "Toggle Newsletter Checkbox" -description: "Test clicking checkbox elements for form options" -enabled: true - -target: - url: 
"https://www.w3schools.com/html/tryit.asp?filename=tryhtml_checkbox" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 45000 - -input: - objective: "Click the checkbox labeled \"I have a bike\" to check it" - reasoning: "Testing interaction with checkbox form elements" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Identified the correct checkbox among multiple options" - - "Used click action on the checkbox element" - - "Checkbox state changed from unchecked to checked" - - "Handled the iframe structure if present" - - "No errors with form element interaction" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify the checkbox state changed from unchecked to checked" - - "Confirm the \"I have a bike\" checkbox now shows a checkmark" - - "Verify the checkbox visual indicator (checkmark) is clearly visible" - - "Ensure no other checkboxes were accidentally modified" - -metadata: - tags: ["action", "checkbox", "form", "w3schools", "input"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/checkbox-002.yaml b/eval-server/nodejs/evals/action-agent/checkbox-002.yaml deleted file mode 100644 index 036f388..0000000 --- a/eval-server/nodejs/evals/action-agent/checkbox-002.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Toggle checkbox test - using HTML form test site -id: "checkbox-002" -name: "Check Extra Cheese Checkbox" -description: "Test checking a specific checkbox using the check method" -enabled: true - -target: - url: "https://httpbin.org/forms/post" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 45000 - -input: - objective: "Find and check the \"Extra Cheese\" checkbox in the Pizza Toppings section" - reasoning: "Testing checkbox interaction functionality using check method" - hint: "Look for the Extra Cheese checkbox and use the check method to select it" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Extra Cheese checkbox in the Pizza Toppings section" - - "Used the check method instead of click for better reliability" - - "Checkbox became checked (if it wasn't already)" - - "No errors occurred during checkbox interaction" - - "Form maintained its structure after checkbox selection" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the Extra Cheese checkbox is now checked (shows checkmark)" - - "Check that the checkbox shows proper visual feedback for checked state" - - "Confirm the form structure remained intact" - - "Ensure the checkbox for Extra Cheese was specifically targeted and checked" - -metadata: - tags: ["action", "checkbox", "check", "form", "httpbin"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/click-001.yaml b/eval-server/nodejs/evals/action-agent/click-001.yaml deleted file mode 100644 index e86c8fd..0000000 --- a/eval-server/nodejs/evals/action-agent/click-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Basic search interaction test -id: "click-001" -name: "Search with Text Entry and Click" -description: "Test entering text in search field and clicking search button" -enabled: true - -target: - url: 
"https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 90000 - -input: - objective: "Type \"DevTools automation\" in the search box and then click the \"Google Search\" button" - reasoning: "Testing multi-step interaction: text input followed by button click" - hint: "First fill the search input field, then find and click the search button" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Successfully located the search input field" - - "Entered \"DevTools automation\" text in the search box" - - "Located the Google Search button after entering text" - - "Successfully clicked the search button" - - "Search was executed and results page loaded" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify text \"DevTools automation\" was entered in the search field" - - "Check if search results page loaded with relevant results" - - "Confirm the search was executed (URL changed to results page)" - - "Ensure search results are related to \"DevTools automation\"" - -metadata: - tags: ["action", "multi-step", "search", "form-fill", "click", "google", "basic"] - priority: "high" - timeout: 90000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/context-001.yaml b/eval-server/nodejs/evals/action-agent/context-001.yaml deleted file mode 100644 index 0ca7c58..0000000 --- a/eval-server/nodejs/evals/action-agent/context-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Right click context menu test -id: "context-001" -name: "Right Click Context Menu" -description: "Test right-clicking to open context menu" -enabled: true - -target: - url: "https://the-internet.herokuapp.com/context_menu" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Right-click on the context menu area to open the context menu" - reasoning: "Testing right-click context menu interaction" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the designated context menu area" - - "Performed right-click action correctly" - - "Context menu appeared with options" - - "Successfully triggered the right-click event" - - "Alert or confirmation appeared as expected" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify right-click was performed on correct area" - - "Check if context menu or alert appeared" - - "Confirm right-click event was properly triggered" - - "Ensure the expected response occurred" - -metadata: - tags: ["action", "context-menu", "right-click", "mouse", "menu"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/datepicker-001.yaml b/eval-server/nodejs/evals/action-agent/datepicker-001.yaml deleted file mode 100644 index 9b6a9df..0000000 --- a/eval-server/nodejs/evals/action-agent/datepicker-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Date picker test -id: "datepicker-001" -name: "Select Date from Calendar" -description: "Test clicking date input and selecting a specific date from calendar popup" -enabled: true - -target: - url: "https://jqueryui.com/datepicker/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click the date input field and 
select March 15, 2024 from the calendar picker" - reasoning: "Testing interaction with calendar popup widgets" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located and clicked the date input field" - - "Calendar popup opened successfully" - - "Navigated to correct month/year if needed" - - "Selected the specific date (March 15, 2024)" - - "Date input field shows the selected date" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the date input field contains the selected date" - - "Check if the calendar widget opened and closed properly" - - "Confirm the correct date was highlighted and selected" - - "Ensure the date format matches expected output" - -metadata: - tags: ["action", "datepicker", "calendar", "form", "popup"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/daterange-001.yaml b/eval-server/nodejs/evals/action-agent/daterange-001.yaml deleted file mode 100644 index a9b202b..0000000 --- a/eval-server/nodejs/evals/action-agent/daterange-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Date range picker test -id: "daterange-001" -name: "Select Date Range" -description: "Test selecting a date range with start and end dates" -enabled: true - -target: - url: "https://www.daterangepicker.com/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Select a date range from February 1, 2024 to February 28, 2024" - reasoning: "Testing complex date range selection with start and end dates" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Opened the date range picker interface" - - "Selected the start date (February 1, 2024)" - - "Selected the end date (February 28, 2024)" - - "Date range was properly applied" - - "Input field shows the complete date range" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify both start and end dates are displayed in the input" - - "Check if the date range picker shows the selected range" - - "Confirm the format matches expected date range display" - - "Ensure both dates were selected in sequence" - -metadata: - tags: ["action", "daterange", "date-picker", "form", "complex"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/dropdown-001.yaml b/eval-server/nodejs/evals/action-agent/dropdown-001.yaml deleted file mode 100644 index a64edb0..0000000 --- a/eval-server/nodejs/evals/action-agent/dropdown-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Dropdown selection test -id: "dropdown-001" -name: "Select Dropdown Option" -description: "Test selecting an option from a dropdown menu" -enabled: true - -target: - url: "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_select" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 45000 - -input: - objective: "Select \"Audi\" from the car brands dropdown menu" - reasoning: "Testing dropdown selection interaction" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the dropdown/select element" - - "Identified the correct option to select" - - "Successfully selected the Audi option" - - "Dropdown value changed to 
the selected option" - - "Handled select element interaction properly" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify the dropdown selection changed" - - "Confirm \"Audi\" is now displayed as the selected option" - - "Check if the dropdown is closed after selection" - - "Verify no other form elements were affected by the selection" - -metadata: - tags: ["action", "dropdown", "select", "form", "w3schools"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/dynamic-001.yaml b/eval-server/nodejs/evals/action-agent/dynamic-001.yaml deleted file mode 100644 index fba60bd..0000000 --- a/eval-server/nodejs/evals/action-agent/dynamic-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Dynamic content interaction test -id: "dynamic-001" -name: "Click Dynamic Load Button" -description: "Test clicking a button that loads dynamic content" -enabled: true - -target: - url: "https://the-internet.herokuapp.com/dynamic_loading/1" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 90000 - -input: - objective: "Click the \"Start\" button to trigger dynamic content loading" - reasoning: "Testing interaction with dynamically loaded content" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Found and clicked the Start button" - - "Handled the dynamic loading process" - - "Recognized that content changes after clicking" - - "No timing issues with the dynamic content" - - "Successfully triggered the loading animation" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify dynamic content loaded after clicking Start" - - "Check if loading animation or spinner was displayed" - - "Confirm new content appeared that was previously hidden" - - "Verify the Start button state changed or was replaced after clicking" - -metadata: - tags: ["action", "dynamic", "click", "ajax", "loading"] - priority: "high" - timeout: 90000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/ecommerce-001.yaml b/eval-server/nodejs/evals/action-agent/ecommerce-001.yaml deleted file mode 100644 index ae573de..0000000 --- a/eval-server/nodejs/evals/action-agent/ecommerce-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# E-commerce action test -id: "ecommerce-001" -name: "Add Product to Cart" -description: "Test clicking \"Add to Cart\" button on an e-commerce product page" -enabled: true - -target: - url: "https://www.homedepot.com/p/Husky-20-Gal-Professional-Duty-Waterproof-Storage-Container-with-Hinged-Lid-in-Red-249160/313799634" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 180000 - -input: - objective: "Click the \"Add to Cart\" button for this storage container" - reasoning: "Testing e-commerce interaction with product cart functionality" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Add to Cart button on the product page" - - "Successfully clicked the button" - - "Handled any popups or confirmations that appeared" - - "Verified the item was added (cart count changed or confirmation shown)" - - "Dealt with page dynamics after clicking" - visual_verification: - enabled: true - capture_before: true - capture_after: 
true - prompts: - - "Compare screenshots to verify the Add to Cart button was clicked" - - "Check if cart count indicator increased or shows the item was added" - - "Look for any confirmation popup or notification about the item being added" - - "Verify the button state changed (e.g., to \"Added to Cart\" or disabled)" - -metadata: - tags: ["action", "ecommerce", "click", "homedepot", "cart"] - priority: "high" - timeout: 180000 - retries: 3 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/error-001.yaml b/eval-server/nodejs/evals/action-agent/error-001.yaml deleted file mode 100644 index a2b5646..0000000 --- a/eval-server/nodejs/evals/action-agent/error-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Error recovery test -id: "error-001" -name: "Handle Missing Element" -description: "Test agent behavior when target element is not found" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click the \"Sign Up\" button" - reasoning: "Testing error handling when element does not exist" - hint: "There is no Sign Up button on Google homepage - agent should handle gracefully" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Attempted to find the requested element" - - "Recognized that the element does not exist" - - "Provided clear error message or explanation" - - "Did not crash or produce confusing output" - - "Suggested alternatives or explained the issue" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the page remains in a stable state despite the missing element" - - "Confirm no error dialogs or broken UI elements appeared" - - "Check that the agent handled the missing element gracefully" - - "Ensure the page was properly analyzed even though the target was not found" - -metadata: - tags: ["action", "error-handling", "missing-element", "recovery", "edge-case"] - priority: "high" - timeout: 60000 - retries: 1 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/filter-001.yaml b/eval-server/nodejs/evals/action-agent/filter-001.yaml deleted file mode 100644 index 7efa8f1..0000000 --- a/eval-server/nodejs/evals/action-agent/filter-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Search filter application test -id: "filter-001" -name: "Apply Search Filters" -description: "Test applying search filters to modify results" -enabled: true - -target: - url: "https://www.w3schools.com/howto/howto_js_filter_lists.asp" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Type \"Anna\" in the search filter to filter the list" - reasoning: "Testing search filter application" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the search filter input" - - "Typed \"Anna\" in the filter field" - - "List items filtered to show only matching results" - - "Non-matching items were hidden or removed from view" - - "Filter functionality worked as expected" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify search input contains \"Anna\"" - - "Check if list shows only items containing \"Anna\"" - - "Confirm non-matching items are not visible" - - "Ensure filter functionality reduced the 
visible list items" - -metadata: - tags: ["action", "filter", "search", "list", "dynamic"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/form-001.yaml b/eval-server/nodejs/evals/action-agent/form-001.yaml deleted file mode 100644 index c4f06da..0000000 --- a/eval-server/nodejs/evals/action-agent/form-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Form fill action test -id: "form-001" -name: "Fill Search Query" -description: "Test filling a search input field with specific text" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 45000 - -input: - objective: "Fill the search box with \"Chrome DevTools automation testing\"" - reasoning: "Testing form input capability with a specific search query" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Successfully identified the search input field" - - "Used perform_action with fill method" - - "Correctly filled the field with the specified text" - - "Verified the field accepted the input" - - "No formatting or encoding issues with the text" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to confirm text was entered in the search field" - - "Verify the exact text \"Chrome DevTools automation testing\" is visible" - - "Check if search suggestions or autocomplete dropdown appeared" - - "Ensure no input validation errors are shown" - -metadata: - tags: ["action", "form-fill", "input", "google", "basic"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/hover-001.yaml b/eval-server/nodejs/evals/action-agent/hover-001.yaml deleted file mode 100644 index a58b225..0000000 --- a/eval-server/nodejs/evals/action-agent/hover-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Hover action test -id: "hover-001" -name: "Hover to Reveal Menu" -description: "Test hovering over an element to reveal hidden content" -enabled: true - -target: - url: "https://the-internet.herokuapp.com/hovers" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Hover over the first user avatar image to reveal the hidden caption" - reasoning: "Testing hover interaction to reveal dynamic content" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the first user avatar image" - - "Used appropriate hover action method" - - "Successfully triggered the hover state" - - "Hidden caption became visible after hover" - - "Handled mouse interaction correctly" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify hover revealed hidden content" - - "Check that caption or overlay appeared over the first avatar" - - "Confirm the hover state is visually active on the image" - - "Verify user information or caption text is now visible" - -metadata: - tags: ["action", "hover", "mouse", "dynamic", "reveal"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/keyboard-001.yaml b/eval-server/nodejs/evals/action-agent/keyboard-001.yaml deleted file mode 100644 
index 6a1ffd1..0000000 --- a/eval-server/nodejs/evals/action-agent/keyboard-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Keyboard tab navigation test -id: "keyboard-001" -name: "Keyboard Tab Navigation" -description: "Test using keyboard navigation to move between elements" -enabled: true - -target: - url: "https://www.w3.org/WAI/ARIA/apg/patterns/menubar/examples/menubar-navigation/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Use Tab key to navigate between menu items and Enter to activate" - reasoning: "Testing keyboard-only navigation patterns" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Successfully used keyboard navigation" - - "Tab key moved focus between menu items" - - "Focus indicators were visible during navigation" - - "Enter key activated the focused menu item" - - "Keyboard navigation followed accessibility standards" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify focus indicators are visible on menu items" - - "Check if keyboard navigation moved focus correctly" - - "Confirm Enter key activated the focused item" - - "Ensure accessibility navigation patterns worked" - -metadata: - tags: ["action", "keyboard", "navigation", "accessibility", "focus"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/login-001.yaml b/eval-server/nodejs/evals/action-agent/login-001.yaml deleted file mode 100644 index b56fbca..0000000 --- a/eval-server/nodejs/evals/action-agent/login-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Login form test -id: "login-001" -name: "Fill Login Credentials" -description: "Test filling username and password fields in a login form" -enabled: true - -target: - url: "https://the-internet.herokuapp.com/login" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Fill the username field with \"tomsmith\" and password field with \"SuperSecretPassword!\"" - reasoning: "Testing form fill with multiple fields including password type" - input_data: "tomsmithSuperSecretPassword!"
- -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Identified both username and password fields" - - "Filled username field with correct value" - - "Filled password field with correct value" - - "Handled password field type appropriately" - - "Used the provided input_data XML format correctly" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the username field shows \"tomsmith\" entered" - - "Confirm the password field has dots/asterisks indicating password entry" - - "Check that both fields are properly filled before submission" - - "Ensure no validation errors are shown for the filled fields" - -metadata: - tags: ["action", "login", "form-fill", "authentication", "multi-field"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/modal-001.yaml b/eval-server/nodejs/evals/action-agent/modal-001.yaml deleted file mode 100644 index ef05d16..0000000 --- a/eval-server/nodejs/evals/action-agent/modal-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Modal dialog test -id: "modal-001" -name: "Open and Close Modal" -description: "Test opening modal dialog and closing it with X button" -enabled: true - -target: - url: "https://getbootstrap.com/docs/5.0/components/modal/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click to open the modal dialog, then close it using the X button" - reasoning: "Testing modal dialog interaction patterns" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located and clicked the modal trigger button" - - "Modal dialog opened successfully" - - "Modal content was visible and accessible" - - "Found and clicked the close (X) button" - - "Modal closed and page returned to normal state" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify modal opened with visible content" - - "Check if modal overlay appeared correctly" - - "Confirm modal was closed after clicking X" - - "Ensure page background is accessible again" - -metadata: - tags: ["action", "modal", "dialog", "popup", "overlay"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/multiselect-001.yaml b/eval-server/nodejs/evals/action-agent/multiselect-001.yaml deleted file mode 100644 index a456c9b..0000000 --- a/eval-server/nodejs/evals/action-agent/multiselect-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Multi-select dropdown test -id: "multiselect-001" -name: "Select Multiple Options" -description: "Test selecting multiple options from a multi-select dropdown" -enabled: true - -target: - url: "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_select_multiple" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Select both \"Volvo\" and \"Audi\" from the multi-select dropdown" - reasoning: "Testing multiple selection in select elements" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the multi-select dropdown element" - - "Successfully selected Volvo option" - - "Successfully selected Audi option" - - "Both options remain selected simultaneously" - - "Used appropriate 
multi-select interaction method" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify both Volvo and Audi appear selected" - - "Check if both options are highlighted/marked" - - "Confirm multi-select functionality worked correctly" - - "Ensure no other options were accidentally selected" - -metadata: - tags: ["action", "multi-select", "dropdown", "form", "multiple"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/multistep-001.yaml b/eval-server/nodejs/evals/action-agent/multistep-001.yaml deleted file mode 100644 index 14923a2..0000000 --- a/eval-server/nodejs/evals/action-agent/multistep-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Multi-step form test -id: "multistep-001" -name: "Complete Search and Submit" -description: "Test filling a search form and then clicking the submit button" -enabled: true - -target: - url: "https://www.bing.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Fill the search box with \"automated testing tools\" and then click the search button" - reasoning: "Testing multi-step form interaction combining fill and click actions" - hint: "This requires two actions: first fill the search field, then click the search button" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Recognized this requires multiple actions" - - "First filled the search input correctly" - - "Then located and clicked the search button" - - "Both actions completed successfully in sequence" - - "Search was initiated with the correct query" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the search input contains \"automated testing tools\" text" - - "Confirm the search was submitted and results page loaded" - - "Check that search results are related to the query" - - "Ensure the multi-step action completed fully with both fill and click" - -metadata: - tags: ["action", "multi-step", "form-fill", "click", "bing", "search"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/nav-001.yaml b/eval-server/nodejs/evals/action-agent/nav-001.yaml deleted file mode 100644 index e1ef610..0000000 --- a/eval-server/nodejs/evals/action-agent/nav-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Complex navigation test -id: "nav-001" -name: "Navigate via Menu Click" -description: "Test clicking navigation menu items to navigate between pages" -enabled: true - -target: - url: "https://www.wikipedia.org" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click on the \"English\" language link to navigate to English Wikipedia" - reasoning: "Testing navigation through link clicks on a multilingual site" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Identified the correct language link among many options" - - "Successfully clicked the English link" - - "Navigation occurred to the English Wikipedia" - - "Used appropriate tools to verify navigation success" - - "Handled the multilingual page structure correctly" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify 
navigation from Wikipedia homepage to English Wikipedia" - - "Check if the page language and content changed to English" - - "Verify the URL changed to en.wikipedia.org" - - "Confirm the English Wikipedia main page is displayed" - -metadata: - tags: ["action", "navigation", "click", "wikipedia", "multilingual"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/radio-001.yaml b/eval-server/nodejs/evals/action-agent/radio-001.yaml deleted file mode 100644 index a136e1e..0000000 --- a/eval-server/nodejs/evals/action-agent/radio-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Radio button selection test -id: "radio-001" -name: "Select Radio Button Option" -description: "Test selecting a specific radio button option using click method" -enabled: true - -target: - url: "https://httpbin.org/forms/post" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 45000 - -input: - objective: "Select the \"Medium\" pizza size from the Pizza Size radio button group" - reasoning: "Testing radio button selection functionality" - hint: "Look for the Medium radio button in the Pizza Size section and click it to select" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Medium radio button in the Pizza Size section" - - "Successfully clicked the Medium radio button" - - "Radio button became selected (checked state)" - - "Other radio buttons in the same group became unselected" - - "Form maintained its structure after radio button selection" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the Medium radio button is now selected (shows filled circle)" - - "Check that other pizza size options (Small, Large) are no longer selected" - - "Confirm the form structure remained intact" - - "Ensure the Medium pizza size radio button was specifically targeted" - -metadata: - tags: ["action", "radio", "click", "form", "httpbin"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/slider-001.yaml b/eval-server/nodejs/evals/action-agent/slider-001.yaml deleted file mode 100644 index 9369671..0000000 --- a/eval-server/nodejs/evals/action-agent/slider-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Range slider test -id: "slider-001" -name: "Adjust Range Slider" -description: "Test moving slider to set a specific value" -enabled: true - -target: - url: "https://jqueryui.com/slider/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Move the slider to set the value to 75" - reasoning: "Testing slider/range input manipulation" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the slider control element" - - "Successfully moved the slider handle" - - "Set the slider value to approximately 75" - - "Slider position reflects the target value" - - "Any associated display shows the correct value" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify slider handle moved to represent value 75" - - "Check if value display shows 75 or close to it" - - "Confirm slider position visually matches target" - - "Ensure slider interaction was smooth and successful" - -metadata: - tags: ["action", 
"slider", "range", "form", "drag"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/tableselect-001.yaml b/eval-server/nodejs/evals/action-agent/tableselect-001.yaml deleted file mode 100644 index b38341e..0000000 --- a/eval-server/nodejs/evals/action-agent/tableselect-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Table row selection test -id: "tableselect-001" -name: "Select Table Row" -description: "Test clicking to select a table row" -enabled: true - -target: - url: "https://datatables.net/examples/api/select_single_row.html" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click on the first row to select it" - reasoning: "Testing table row selection patterns" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the first table row" - - "Successfully clicked the row" - - "Row became highlighted/selected" - - "Selection state is visually apparent" - - "Only one row is selected at a time" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the first row is now highlighted/selected" - - "Check if row selection visual feedback is clear" - - "Confirm only the clicked row is selected" - - "Ensure row selection styling is properly applied" - -metadata: - tags: ["action", "table", "select", "row", "highlight"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/tablesort-001.yaml b/eval-server/nodejs/evals/action-agent/tablesort-001.yaml deleted file mode 100644 index 32695c7..0000000 --- a/eval-server/nodejs/evals/action-agent/tablesort-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Table column sorting test -id: "tablesort-001" -name: "Sort Table Column" -description: "Test clicking table column header to sort data" -enabled: true - -target: - url: "https://datatables.net/examples/basic_init/zero_configuration.html" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click on the \"Name\" column header to sort the table by name" - reasoning: "Testing table column sorting interaction" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Name column header" - - "Successfully clicked the column header" - - "Table data reordered by name alphabetically" - - "Sort indicator appeared on the Name column" - - "Table sorting completed without errors" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify table rows are now sorted alphabetically by name" - - "Check if sort arrow/indicator appears on Name column" - - "Confirm the data order changed from before to after" - - "Ensure table structure remained intact after sorting" - -metadata: - tags: ["action", "table", "sort", "column", "data"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/tabs-001.yaml b/eval-server/nodejs/evals/action-agent/tabs-001.yaml deleted file mode 100644 index 1079266..0000000 --- a/eval-server/nodejs/evals/action-agent/tabs-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Tab panel navigation test -id: "tabs-001" -name: "Navigate Tab 
Panels" -description: "Test clicking tab to switch between tab panels" -enabled: true - -target: - url: "https://jqueryui.com/tabs/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click on the \"Nunc tincidunt\" tab to switch to that panel" - reasoning: "Testing tab panel navigation" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the \"Nunc tincidunt\" tab button" - - "Successfully clicked the tab" - - "Tab panel content switched to the selected tab" - - "Active tab visual state changed appropriately" - - "Content area updated to show the new panel" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the \"Nunc tincidunt\" tab is now active/highlighted" - - "Check if the content panel changed to show new content" - - "Confirm the tab switching animation completed" - - "Ensure the correct tab content is visible" - -metadata: - tags: ["action", "tabs", "navigation", "panels", "ui"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/timepicker-001.yaml b/eval-server/nodejs/evals/action-agent/timepicker-001.yaml deleted file mode 100644 index cbc5742..0000000 --- a/eval-server/nodejs/evals/action-agent/timepicker-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Time picker test -id: "timepicker-001" -name: "Select Time from Picker" -description: "Test setting time using time picker controls" -enabled: true - -target: - url: "https://timepicker.co/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Set the time to 2:30 PM using the time picker controls" - reasoning: "Testing time selection with hour/minute controls" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the time picker interface" - - "Set the hour to 2 (14 for 24-hour format)" - - "Set the minutes to 30" - - "Selected PM or appropriate time format" - - "Time input shows 2:30 PM or equivalent" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the time input displays 2:30 PM or 14:30" - - "Check if hour and minute were set correctly" - - "Confirm AM/PM selection if applicable" - - "Ensure the time picker interface was properly used" - -metadata: - tags: ["action", "timepicker", "time", "form", "clock"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/upload-001.yaml b/eval-server/nodejs/evals/action-agent/upload-001.yaml deleted file mode 100644 index d5c276c..0000000 --- a/eval-server/nodejs/evals/action-agent/upload-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# File upload test -id: "upload-001" -name: "Upload File via Input" -description: "Test clicking file input and uploading a test file" -enabled: true - -target: - url: "https://the-internet.herokuapp.com/upload" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Click the file input and upload a test file" - reasoning: "Testing file upload interaction through input elements" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the file input element" - - "Triggered file 
selection dialog" - - "Selected a file for upload" - - "File name appears in the input field" - - "Upload process initiated successfully" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify file name appears in the upload input field" - - "Check if file selection was successful" - - "Confirm upload button is available or file is ready" - - "Ensure no upload errors are displayed" - -metadata: - tags: ["action", "upload", "file", "input", "form"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/video-001.yaml b/eval-server/nodejs/evals/action-agent/video-001.yaml deleted file mode 100644 index 17c76be..0000000 --- a/eval-server/nodejs/evals/action-agent/video-001.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Video playback controls test -id: "video-001" -name: "Control Video Playback" -description: "Test starting video playback using click + spacebar" -enabled: true - -target: - url: "https://www.w3schools.com/html/html5_video.asp" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 90000 - -input: - objective: "Click the video element to focus it, then press spacebar to start playback" - reasoning: "Testing video control using standard keyboard interaction (click to focus + spacebar to play)" - hint: "First click the Video element to focus it, then use keyboard input to press the spacebar key to start playback" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Located the Video element in the accessibility tree" - - "Successfully clicked the Video element to focus it" - - "Used keyboard input to press spacebar" - - "Video playback started after spacebar press" - - "No errors occurred during the interaction sequence" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify video player is visible on the page" - - "Check if the play button was clicked (may show pause button after)" - - "Look for visual indicators that video started playing" - - "Ensure no error messages appeared during video interaction" - -metadata: - tags: ["action", "video", "media", "controls", "playback"] - priority: "high" - timeout: 90000 - retries: 3 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/action-agent/video-002.yaml b/eval-server/nodejs/evals/action-agent/video-002.yaml deleted file mode 100644 index b20014c..0000000 --- a/eval-server/nodejs/evals/action-agent/video-002.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Video play button specific targeting test -id: "video-002" -name: "Click Video Play Button Specifically" -description: "Test clicking the specific play button (not the video element)" -enabled: true - -target: - url: "https://www.w3schools.com/html/html5_video.asp" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Find and click the button that has name=\"play\" (not the Video element itself)" - reasoning: "Testing specific targeting of the play button element" - hint: "Target the button element with text or label \"play\", do not click the Video element" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Found a button element (not Video element) with \"play\" in the name" - - "Successfully clicked the play button specifically" - - 
"Did not click on the Video element itself" - - "Play button click was executed correctly" - - "Video responded to the button click" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify the play button (not video element) was clicked" - - "Check if video started playing after button click" - - "Confirm the target was the button, not the video container" - - "Look for changes in video player state" - -metadata: - tags: ["action", "video", "button", "specific-targeting"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/config.yaml b/eval-server/nodejs/evals/config.yaml deleted file mode 100644 index 3968421..0000000 --- a/eval-server/nodejs/evals/config.yaml +++ /dev/null @@ -1,11 +0,0 @@ -# model: -# main_model: "deepseek-r1:14b" -# mini_model: "deepseek-r1:14b" -# nano_model: "deepseek-r1:14b" -# provider: "litellm" - -model: - main_model: "gpt-4.1" - mini_model: "gpt-4.1-mini" - nano_model: "gpt-4.1-nano" - provider: "openai" \ No newline at end of file diff --git a/eval-server/nodejs/evals/end-to-end/b-vitamins-research-001.yaml b/eval-server/nodejs/evals/end-to-end/b-vitamins-research-001.yaml deleted file mode 100644 index 746ead6..0000000 --- a/eval-server/nodejs/evals/end-to-end/b-vitamins-research-001.yaml +++ /dev/null @@ -1,35 +0,0 @@ -# B-Vitamins Research - End-to-End Test -id: "vitamins-research-001" -name: "B-Vitamins Supplementation Research" -description: "End-to-end test for comprehensive B-vitamins research using chat interface" -enabled: true - -tool: "chat" -timeout: 600000 - -input: - message: "Research everything on the supplementation of B-vitamins for adults. I need: types of vitamins, available forms and their advantages, dosage and safety" - reasoning: "End-to-end test validating complete user workflow with dynamic tool usage for health research" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Covers all B-vitamin types (B1, B2, B3, B5, B6, B7, B9, B12) comprehensively" - - "Explains different forms of each vitamin and their advantages" - - "Provides appropriate dosage recommendations for adults" - - "Discusses safety considerations and potential side effects" - - "Information is accurate and from reliable health sources" - - "Response is well-organized and easy to understand" - - "Demonstrates intelligent tool selection for health research" - - "Shows complete workflow from request to comprehensive result" - -metadata: - tags: ["end-to-end", "chat", "health", "vitamins", "research", "user-workflow"] - priority: "medium" - timeout: 300000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/end-to-end/investment-research-001.yaml b/eval-server/nodejs/evals/end-to-end/investment-research-001.yaml deleted file mode 100644 index 72014df..0000000 --- a/eval-server/nodejs/evals/end-to-end/investment-research-001.yaml +++ /dev/null @@ -1,35 +0,0 @@ -# Renewable Energy Stocks Research - End-to-End Test -id: "investment-research-001" -name: "Renewable Energy Stocks Research" -description: "End-to-end test for investment research using chat interface" -enabled: true - -tool: "chat" -timeout: 600000 - -input: - message: "Research renewable energy stocks for potential investment. Focus on solar and wind companies with market cap over $1B." 
- reasoning: "End-to-end test validating financial research workflow with dynamic tool usage" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Identifies specific solar and wind energy companies" - - "Confirms companies have market cap over $1 billion" - - "Provides relevant financial metrics and data" - - "Includes business descriptions and growth prospects" - - "Discusses investment considerations and risks" - - "Information appears current and from reliable sources" - - "Demonstrates intelligent financial research tool usage" - - "Shows complete workflow from request to investment analysis" - -metadata: - tags: ["end-to-end", "chat", "investment", "stocks", "renewable-energy", "financial", "user-workflow"] - priority: "medium" - timeout: 300000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/end-to-end/product-comparison-001.yaml b/eval-server/nodejs/evals/end-to-end/product-comparison-001.yaml deleted file mode 100644 index 1363a09..0000000 --- a/eval-server/nodejs/evals/end-to-end/product-comparison-001.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# Headphones Comparison - End-to-End Test -id: "product-comparison-001" -name: "Noise-Canceling Headphones Comparison" -description: "End-to-end test for product research and comparison using chat interface" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "chat" -timeout: 300000 - -input: - message: "Compare the top 3 noise-canceling headphones under $300. Include features, pros/cons, and where to buy them." - reasoning: "End-to-end test validating product comparison workflow with dynamic tool usage" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Identifies 3 specific noise-canceling headphones under $300" - - "Provides detailed feature comparison for each model" - - "Lists pros and cons for each headphone clearly" - - "Includes pricing information and purchase locations" - - "Comparison is fair and based on objective criteria" - - "Information appears current and accurate" - - "Demonstrates intelligent research and extraction tool usage" - - "Shows complete workflow from request to actionable buying guide" - -metadata: - tags: ["end-to-end", "chat", "product", "comparison", "headphones", "shopping", "user-workflow"] - priority: "medium" - timeout: 300000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/end-to-end/recipe-nutrition-001.yaml b/eval-server/nodejs/evals/end-to-end/recipe-nutrition-001.yaml deleted file mode 100644 index ef8b0f0..0000000 --- a/eval-server/nodejs/evals/end-to-end/recipe-nutrition-001.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# Healthy Recipe Search - End-to-End Test -id: "recipe-nutrition-001" -name: "Healthy Family Dinner Recipes" -description: "End-to-end test for recipe search with nutrition criteria using chat interface" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "chat" -timeout: 300000 - -input: - message: "Find me 3 healthy dinner recipes for a family of 4 that are under 500 calories per serving and take less than 30 minutes to prepare." 
- reasoning: "End-to-end test validating recipe search workflow with specific nutritional and time criteria" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Provides exactly 3 dinner recipes suitable for family of 4" - - "Each recipe is under 500 calories per serving" - - "All recipes can be prepared in under 30 minutes" - - "Includes ingredient lists and cooking instructions" - - "Nutritional information is provided or estimated" - - "Recipes are practical and family-friendly" - - "Demonstrates intelligent recipe search and analysis" - - "Shows complete workflow from request to actionable meal plan" - -metadata: - tags: ["end-to-end", "chat", "recipes", "nutrition", "healthy", "family", "user-workflow"] - priority: "medium" - timeout: 300000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/end-to-end/travel-planning-001.yaml b/eval-server/nodejs/evals/end-to-end/travel-planning-001.yaml deleted file mode 100644 index 401f8b1..0000000 --- a/eval-server/nodejs/evals/end-to-end/travel-planning-001.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# Barcelona Travel Planning - End-to-End Test -id: "travel-planning-001" -name: "Barcelona Trip Planning" -description: "End-to-end test for comprehensive travel planning using chat interface" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "chat" -timeout: 300000 - -input: - message: "Help me plan a 3-day trip to Barcelona. I need flight options from New York, hotel recommendations in the city center, and top 5 attractions to visit." - reasoning: "End-to-end test validating complete travel planning workflow with dynamic tool usage" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Provides realistic flight options from New York to Barcelona" - - "Recommends hotels in Barcelona city center with details" - - "Lists top 5 attractions in Barcelona with descriptions" - - "Information is current and practically useful for trip planning" - - "Includes relevant details like prices, locations, or booking info" - - "Response is well-organized into clear sections" - - "Demonstrates multi-tool usage for comprehensive planning" - - "Shows complete workflow from request to actionable itinerary" - -metadata: - tags: ["end-to-end", "chat", "travel", "planning", "barcelona", "user-workflow"] - priority: "medium" - timeout: 300000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/basic-001.yaml b/eval-server/nodejs/evals/research-agent/basic-001.yaml deleted file mode 100644 index fcd0086..0000000 --- a/eval-server/nodejs/evals/research-agent/basic-001.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# Basic research test - stable topic with clear sources -id: "basic-001" -name: "Research Chrome DevTools History" -description: "Research the history and development of Chrome DevTools" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 180000 - -input: - query: "History and development of Chrome DevTools browser developer tools" - reasoning: "Testing basic research capabilities on a well-documented technical topic" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0 - criteria: - - "Research covers the origins and early development of 
Chrome DevTools" - - "Information includes key milestones and major feature additions" - - "Sources include official documentation or reliable technical sources" - - "At least 3-5 different sources were consulted" - - "Information is factually accurate and up-to-date" - - "Research demonstrates understanding of the topic evolution" - - "Handoff to content_writer_agent occurred with comprehensive data" - -metadata: - tags: ["basic", "technical", "stable", "documentation"] - priority: "high" - timeout: 180000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/business-001.yaml b/eval-server/nodejs/evals/research-agent/business-001.yaml deleted file mode 100644 index 7558120..0000000 --- a/eval-server/nodejs/evals/research-agent/business-001.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# Business research test -id: "business-001" -name: "Research Remote Work Productivity" -description: "Research remote work impact on productivity and business outcomes" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 240000 - -input: - query: "Remote work productivity statistics impact business outcomes 2024 studies" - reasoning: "Testing business research requiring statistical data and multiple perspectives" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Research includes statistical data and survey results" - - "Covers multiple perspectives (employee, employer, industry)" - - "Sources include business publications, research studies, and reports" - - "Information addresses both positive and negative impacts" - - "Data is recent and relevant to current work trends" - - "Research demonstrates understanding of business implications" - - "Statistics and claims are properly sourced" - -metadata: - tags: ["business", "statistics", "workplace", "comprehensive"] - priority: "high" - timeout: 240000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/comparison-001.yaml b/eval-server/nodejs/evals/research-agent/comparison-001.yaml deleted file mode 100644 index a9aa22b..0000000 --- a/eval-server/nodejs/evals/research-agent/comparison-001.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# Comparative research test -id: "comparison-001" -name: "Compare JavaScript vs TypeScript" -description: "Research and compare JavaScript and TypeScript for web development" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 200000 - -input: - query: "JavaScript vs TypeScript comparison web development pros cons differences" - reasoning: "Testing comparative research requiring balanced analysis of multiple options" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Research covers both JavaScript and TypeScript comprehensively" - - "Includes clear comparison points (syntax, features, ecosystem)" - - "Presents advantages and disadvantages of each language" - - "Sources include technical documentation and developer resources" - - "Information is balanced and objective, not biased toward one option" - - "Demonstrates understanding of use cases for each language" - - "Research data is well-organized for comparative analysis" - -metadata: - tags: ["comparison", "technical", "programming", "balanced"] - 
priority: "high" - timeout: 200000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/current-001.yaml b/eval-server/nodejs/evals/research-agent/current-001.yaml deleted file mode 100644 index 6878868..0000000 --- a/eval-server/nodejs/evals/research-agent/current-001.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# Current events research test -id: "current-001" -name: "Research Latest AI Development Trends" -description: "Research recent developments in AI and machine learning (last 6 months)" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 240000 - -input: - query: "Latest AI artificial intelligence developments breakthroughs 2024 2025" - reasoning: "Testing research on current events and rapidly evolving topics" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - include_url: true - criteria: - - "Research focuses on recent developments (within last 6 months)" - - "Covers multiple aspects of AI development (models, applications, research)" - - "Sources are current and from reputable news or research outlets" - - "Information includes specific examples or case studies" - - "Demonstrates ability to identify current trends vs older information" - - "Successfully gathered information from diverse source types" - - "Data is properly organized for content writer handoff" - -metadata: - tags: ["current-events", "ai", "dynamic", "trends"] - priority: "high" - timeout: 240000 - retries: 1 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/edge-001.yaml b/eval-server/nodejs/evals/research-agent/edge-001.yaml deleted file mode 100644 index d75c2bf..0000000 --- a/eval-server/nodejs/evals/research-agent/edge-001.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# No-results edge case test -id: "edge-001" -name: "Research Obscure Fictional Topic" -description: "Test handling of queries with very limited or no reliable sources" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 180000 - -input: - query: "quantum bluetooth watermelon encryption algorithm 2024" - reasoning: "Testing edge case handling when query yields no meaningful results" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Agent recognizes when query yields limited or unreliable results" - - "Demonstrates appropriate search strategy modification" - - "Does not fabricate information when sources are unavailable" - - "Gracefully handles lack of substantive results" - - "Still attempts handoff to content writer with available information" - - "Maintains professional approach despite limited data" - - "Shows appropriate uncertainty when information is sparse" - -metadata: - tags: ["edge-case", "no-results", "error-handling", "fictional"] - priority: "high" - timeout: 180000 - retries: 1 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/research-agent-basic-001.yaml b/eval-server/nodejs/evals/research-agent/research-agent-basic-001.yaml deleted file mode 100644 index 85743d5..0000000 --- a/eval-server/nodejs/evals/research-agent/research-agent-basic-001.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# Basic research test - stable topic with clear sources -id: 
"research-agent-basic-001" -name: "Research Chrome DevTools History" -description: "Research the history and development of Chrome DevTools" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 180000 - -input: - query: "History and development of Chrome DevTools browser developer tools" - reasoning: "Testing basic research capabilities on a well-documented technical topic" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0 - criteria: - - "Research covers the origins and early development of Chrome DevTools" - - "Information includes key milestones and major feature additions" - - "Sources include official documentation or reliable technical sources" - - "At least 3-5 different sources were consulted" - - "Information is factually accurate and up-to-date" - - "Research demonstrates understanding of the topic evolution" - - "Handoff to content_writer_agent occurred with comprehensive data" - -metadata: - tags: ["basic", "technical", "stable", "documentation"] - priority: "high" - timeout: 180000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/research-agent-business-001.yaml b/eval-server/nodejs/evals/research-agent/research-agent-business-001.yaml deleted file mode 100644 index defeed1..0000000 --- a/eval-server/nodejs/evals/research-agent/research-agent-business-001.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# Business research test -id: "research-agent-business-001" -name: "Research Remote Work Productivity" -description: "Research remote work impact on productivity and business outcomes" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 240000 - -input: - query: "Remote work productivity statistics impact business outcomes 2024 studies" - reasoning: "Testing business research requiring statistical data and multiple perspectives" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Research includes statistical data and survey results" - - "Covers multiple perspectives (employee, employer, industry)" - - "Sources include business publications, research studies, and reports" - - "Information addresses both positive and negative impacts" - - "Data is recent and relevant to current work trends" - - "Research demonstrates understanding of business implications" - - "Statistics and claims are properly sourced" - -metadata: - tags: ["business", "statistics", "workplace", "comprehensive"] - priority: "high" - timeout: 240000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/research-agent-comparison-001.yaml b/eval-server/nodejs/evals/research-agent/research-agent-comparison-001.yaml deleted file mode 100644 index a433a58..0000000 --- a/eval-server/nodejs/evals/research-agent/research-agent-comparison-001.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# Comparative research test -id: "research-agent-comparison-001" -name: "Compare JavaScript vs TypeScript" -description: "Research and compare JavaScript and TypeScript for web development" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 200000 - -input: - query: "JavaScript vs TypeScript comparison web development pros cons differences" 
- reasoning: "Testing comparative research requiring balanced analysis of multiple options" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Research covers both JavaScript and TypeScript comprehensively" - - "Includes clear comparison points (syntax, features, ecosystem)" - - "Presents advantages and disadvantages of each language" - - "Sources include technical documentation and developer resources" - - "Information is balanced and objective, not biased toward one option" - - "Demonstrates understanding of use cases for each language" - - "Research data is well-organized for comparative analysis" - -metadata: - tags: ["comparison", "technical", "programming", "balanced"] - priority: "high" - timeout: 200000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/research-agent-current-001.yaml b/eval-server/nodejs/evals/research-agent/research-agent-current-001.yaml deleted file mode 100644 index 198c981..0000000 --- a/eval-server/nodejs/evals/research-agent/research-agent-current-001.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# Current events research test -id: "research-agent-current-001" -name: "Research Latest AI Development Trends" -description: "Research recent developments in AI and machine learning (last 6 months)" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 240000 - -input: - query: "Latest AI artificial intelligence developments breakthroughs 2024 2025" - reasoning: "Testing research on current events and rapidly evolving topics" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - include_url: true - criteria: - - "Research focuses on recent developments (within last 6 months)" - - "Covers multiple aspects of AI development (models, applications, research)" - - "Sources are current and from reputable news or research outlets" - - "Information includes specific examples or case studies" - - "Demonstrates ability to identify current trends vs older information" - - "Successfully gathered information from diverse source types" - - "Data is properly organized for content writer handoff" - -metadata: - tags: ["current-events", "ai", "dynamic", "trends"] - priority: "high" - timeout: 240000 - retries: 1 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/research-agent-edge-001.yaml b/eval-server/nodejs/evals/research-agent/research-agent-edge-001.yaml deleted file mode 100644 index 234c832..0000000 --- a/eval-server/nodejs/evals/research-agent/research-agent-edge-001.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# No-results edge case test -id: "research-agent-edge-001" -name: "Research Obscure Fictional Topic" -description: "Test handling of queries with very limited or no reliable sources" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 180000 - -input: - query: "quantum bluetooth watermelon encryption algorithm 2024" - reasoning: "Testing edge case handling when query yields no meaningful results" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Agent recognizes when query yields limited or unreliable results" - - "Demonstrates appropriate search strategy modification" - - "Does not fabricate information when 
sources are unavailable" - - "Gracefully handles lack of substantive results" - - "Still attempts handoff to content writer with available information" - - "Maintains professional approach despite limited data" - - "Shows appropriate uncertainty when information is sparse" - -metadata: - tags: ["edge-case", "no-results", "error-handling", "fictional"] - priority: "high" - timeout: 180000 - retries: 1 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/research-agent-technical-001.yaml b/eval-server/nodejs/evals/research-agent/research-agent-technical-001.yaml deleted file mode 100644 index c5e2540..0000000 --- a/eval-server/nodejs/evals/research-agent/research-agent-technical-001.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# Deep technical research test -id: "research-agent-technical-001" -name: "Research WebAssembly Performance" -description: "Deep dive research into WebAssembly performance characteristics and use cases" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 900000 - -input: - query: "WebAssembly WASM performance benchmarks use cases implementation details" - reasoning: "Testing deep technical research requiring specialized knowledge synthesis" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Research covers technical details of WebAssembly architecture" - - "Includes performance benchmarks and comparison data" - - "Discusses practical use cases and implementation scenarios" - - "Sources include technical specifications, benchmarks, and expert analysis" - - "Information demonstrates deep understanding of the technology" - - "Research addresses both benefits and limitations" - - "Technical accuracy is maintained throughout" - -metadata: - tags: ["technical", "deep-dive", "performance", "webassembly"] - priority: "high" - timeout: 900000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/research-agent-tools-001.yaml b/eval-server/nodejs/evals/research-agent/research-agent-tools-001.yaml deleted file mode 100644 index 44da108..0000000 --- a/eval-server/nodejs/evals/research-agent/research-agent-tools-001.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# Tool orchestration test - focuses on how well the agent uses available tools -id: "research-agent-tools-001" -name: "Research Python Framework Comparison" -description: "Research comparing Django vs Flask Python frameworks with focus on tool usage" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 240000 - -input: - query: "Django vs Flask Python web framework comparison features performance" - reasoning: "Testing effective orchestration of navigation, extraction, and fetching tools" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Agent effectively used navigate_url to access search engines" - - "Schema-based extraction was used to gather structured search results" - - "Fetcher tool was used to collect content from multiple URLs" - - "Navigation strategy was logical and systematic" - - "Tool usage demonstrated purposeful research progression" - - "Information from different tools was effectively synthesized" - - "At least 3-5 different sources were accessed and processed" - - "Final handoff included 
comprehensive data from all tools" - -metadata: - tags: ["tool-orchestration", "systematic", "python", "frameworks"] - priority: "high" - timeout: 240000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/technical-001.yaml b/eval-server/nodejs/evals/research-agent/technical-001.yaml deleted file mode 100644 index f434081..0000000 --- a/eval-server/nodejs/evals/research-agent/technical-001.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# Deep technical research test -id: "technical-001" -name: "Research WebAssembly Performance" -description: "Deep dive research into WebAssembly performance characteristics and use cases" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 900000 - -input: - query: "WebAssembly WASM performance benchmarks use cases implementation details" - reasoning: "Testing deep technical research requiring specialized knowledge synthesis" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Research covers technical details of WebAssembly architecture" - - "Includes performance benchmarks and comparison data" - - "Discusses practical use cases and implementation scenarios" - - "Sources include technical specifications, benchmarks, and expert analysis" - - "Information demonstrates deep understanding of the technology" - - "Research addresses both benefits and limitations" - - "Technical accuracy is maintained throughout" - -metadata: - tags: ["technical", "deep-dive", "performance", "webassembly"] - priority: "high" - timeout: 900000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/research-agent/tools-001.yaml b/eval-server/nodejs/evals/research-agent/tools-001.yaml deleted file mode 100644 index ae97430..0000000 --- a/eval-server/nodejs/evals/research-agent/tools-001.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# Tool orchestration test - focuses on how well the agent uses available tools -id: "tools-001" -name: "Research Python Framework Comparison" -description: "Research comparing Django vs Flask Python frameworks with focus on tool usage" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "research_agent" -timeout: 240000 - -input: - query: "Django vs Flask Python web framework comparison features performance" - reasoning: "Testing effective orchestration of navigation, extraction, and fetching tools" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Agent effectively used navigate_url to access search engines" - - "Schema-based extraction was used to gather structured search results" - - "Fetcher tool was used to collect content from multiple URLs" - - "Navigation strategy was logical and systematic" - - "Tool usage demonstrated purposeful research progression" - - "Information from different tools was effectively synthesized" - - "At least 3-5 different sources were accessed and processed" - - "Final handoff included comprehensive data from all tools" - -metadata: - tags: ["tool-orchestration", "systematic", "python", "frameworks"] - priority: "high" - timeout: 240000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/schema-extractor/amazon-product-001.yaml 
b/eval-server/nodejs/evals/schema-extractor/amazon-product-001.yaml deleted file mode 100644 index 42e4738..0000000 --- a/eval-server/nodejs/evals/schema-extractor/amazon-product-001.yaml +++ /dev/null @@ -1,78 +0,0 @@ -# E-commerce product extraction test -id: "amazon-product-001" -name: "Extract Amazon Product Details" -description: "Extract product information from an Amazon product page" -enabled: true - -target: - url: "https://www.amazon.com/Obelisk-Climbing-Rustproof-Trellises-Clematis/dp/B0B4SBY6QD/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_data" -timeout: 60000 - -input: - schema: - type: "object" - properties: - product: - type: "object" - properties: - title: - type: "string" - brand: - type: "string" - price: - type: "object" - properties: - current: - type: "number" - currency: - type: "string" - rating: - type: "object" - properties: - average: - type: "number" - count: - type: "number" - images: - type: "array" - items: - type: "string" - format: "url" - features: - type: "array" - items: - type: "string" - required: - - "title" - - "price" - availability: - type: "string" - required: - - "product" - instruction: "Extract comprehensive product information including pricing, ratings, and key features" - reasoning: "Testing extraction from a dynamic e-commerce page with complex structure" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Product title is accurate and complete" - - "Price information is current and properly formatted" - - "Rating data includes both average and review count" - - "Image URLs are valid and accessible" - - "Key product features are captured" - - "All URLs are properly resolved (not node IDs)" - -metadata: - tags: ["ecommerce", "amazon", "product", "dynamic"] - priority: "high" - timeout: 60000 - retries: 3 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/schema-extractor/bbc-news-001.yaml b/eval-server/nodejs/evals/schema-extractor/bbc-news-001.yaml deleted file mode 100644 index 6843147..0000000 --- a/eval-server/nodejs/evals/schema-extractor/bbc-news-001.yaml +++ /dev/null @@ -1,69 +0,0 @@ -# News article extraction test -id: "bbc-news-001" -name: "Extract BBC News Article" -description: "Extract article content and metadata from a BBC News page" -enabled: true - -target: - url: "https://www.bbc.com/news/technology" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_data" -timeout: 30000 - -input: - schema: - type: "object" - properties: - headlines: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - summary: - type: "string" - url: - type: "string" - format: "url" - category: - type: "string" - required: - - "title" - mainStory: - type: "object" - properties: - headline: - type: "string" - summary: - type: "string" - url: - type: "string" - format: "url" - required: - - "headlines" - instruction: "Extract the main headlines and featured stories from the BBC Technology news section" - reasoning: "Testing extraction from a news aggregation page with multiple articles" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - include_url: true - criteria: - - "Headlines are current and relevant to technology news" - - "Article summaries provide meaningful context" - - "URLs link to valid BBC news articles" - - "Main story is properly identified" - - "All extracted content is in English" - -metadata: - tags: ["news", "bbc", 
"aggregation", "dynamic"] - priority: "high" - timeout: 30000 - retries: 2 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/schema-extractor/bing-search-001.yaml b/eval-server/nodejs/evals/schema-extractor/bing-search-001.yaml deleted file mode 100644 index 7e7d674..0000000 --- a/eval-server/nodejs/evals/schema-extractor/bing-search-001.yaml +++ /dev/null @@ -1,70 +0,0 @@ -# Bing Search results extraction test -id: "bing-search-001" -name: "Extract Bing Search Results" -description: "Extract search results from Bing search page" -enabled: true - -target: - url: "https://www.bing.com/search?q=web+scraping+best+practices" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_data" -timeout: 45000 - -input: - schema: - type: "object" - properties: - query: - type: "string" - searchResults: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - url: - type: "string" - format: "url" - snippet: - type: "string" - datePublished: - type: "string" - required: - - "title" - - "url" - - "snippet" - sidebarInfo: - type: "object" - properties: - title: - type: "string" - description: - type: "string" - source: - type: "string" - required: - - "searchResults" - instruction: "Extract search results including titles, URLs, snippets, and any sidebar information from Bing" - reasoning: "Testing extraction from Bing search results with different layout than Google" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Search results match the query intent" - - "Results include valid URLs and meaningful snippets" - - "Sidebar information is extracted when present" - - "No duplicate results in the list" - -metadata: - tags: ["search", "bing", "serp", "dynamic"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/schema-extractor/github-repo-001-streamlined.yaml b/eval-server/nodejs/evals/schema-extractor/github-repo-001-streamlined.yaml deleted file mode 100644 index 07532e7..0000000 --- a/eval-server/nodejs/evals/schema-extractor/github-repo-001-streamlined.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# Simple structured data test (Streamlined version) -id: "github-repo-001-streamlined" -name: "Extract GitHub Repository Info (Streamlined)" -description: "Extract basic repository information from a GitHub page using streamlined extractor" -enabled: true - -target: - url: "https://github.com/microsoft/TypeScript" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 30000 - -input: - schema: - type: "object" - properties: - name: - type: "string" - description: - type: "string" - language: - type: "string" - stars: - type: "number" - forks: - type: "number" - topics: - type: "array" - items: - type: "string" - readme: - type: "object" - properties: - summary: - type: "string" - required: - - "name" - - "description" - instruction: "Extract repository metadata and basic statistics" - reasoning: "Testing extraction from a well-structured GitHub repository page" - -validation: - type: "hybrid" - snapshot: - exclude_paths: - - "stars" - - "forks" - structure_only: false - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Repository name matches the GitHub page" - - "Description accurately reflects the project purpose" - - "Programming language is correctly identified" - - "Topic tags are relevant to the project" - 
-metadata: - tags: ["github", "repository", "structured", "streamlined"] - priority: "high" - timeout: 30000 - retries: 1 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/schema-extractor/github-repo-001.yaml b/eval-server/nodejs/evals/schema-extractor/github-repo-001.yaml deleted file mode 100644 index 6693577..0000000 --- a/eval-server/nodejs/evals/schema-extractor/github-repo-001.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# Simple structured data test -id: "github-repo-001" -name: "Extract GitHub Repository Info" -description: "Extract basic repository information from a GitHub page" -enabled: true - -target: - url: "https://github.com/microsoft/TypeScript" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_data" -timeout: 30000 - -input: - schema: - type: "object" - properties: - name: - type: "string" - description: - type: "string" - language: - type: "string" - stars: - type: "number" - forks: - type: "number" - topics: - type: "array" - items: - type: "string" - readme: - type: "object" - properties: - summary: - type: "string" - required: - - "name" - - "description" - instruction: "Extract repository metadata and basic statistics" - reasoning: "Testing extraction from a well-structured GitHub repository page" - -validation: - type: "hybrid" - snapshot: - exclude_paths: - - "stars" - - "forks" - structure_only: false - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Repository name matches the GitHub page" - - "Description accurately reflects the project purpose" - - "Programming language is correctly identified" - - "Topic tags are relevant to the project" - -metadata: - tags: ["github", "repository", "structured"] - priority: "high" - timeout: 30000 - retries: 1 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/schema-extractor/google-flights-001.yaml b/eval-server/nodejs/evals/schema-extractor/google-flights-001.yaml deleted file mode 100644 index ab2e53c..0000000 --- a/eval-server/nodejs/evals/schema-extractor/google-flights-001.yaml +++ /dev/null @@ -1,106 +0,0 @@ -# Google Flights search extraction test -id: "google-flights-001" -name: "Extract Google Flights Search Results" -description: "Extract flight options from Google Flights search" -enabled: true - -target: - url: "https://www.google.com/travel/flights/search?tfs=CBwQAhojEgoyMDI1LTEyLTI0agwIAhIIL20vMGQ5anJyBwgBEgNTRk8aIxIKMjAyNS0xMi0zMWoHCAESA1NGT3IMCAISCC9tLzBkOWpyQAFIAXABggELCP___________wGYAQE" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_data" -timeout: 60000 - -input: - schema: - type: "object" - properties: - searchCriteria: - type: "object" - properties: - origin: - type: "string" - destination: - type: "string" - departureDate: - type: "string" - returnDate: - type: "string" - tripType: - type: "string" - passengers: - type: "number" - flights: - type: "array" - items: - type: "object" - properties: - airline: - type: "string" - flightNumber: - type: "string" - departureTime: - type: "string" - arrivalTime: - type: "string" - duration: - type: "string" - stops: - type: "number" - price: - type: "object" - properties: - amount: - type: "number" - currency: - type: "string" - cabin: - type: "string" - bookingUrl: - type: "string" - format: "url" - legroom: - type: "string" - amenities: - type: "array" - items: - type: "string" - required: - - "airline" - - "departureTime" - - "arrivalTime" - - "price" - priceInsights: - type: "object" - properties: - 
trend: - type: "string" - recommendation: - type: "string" - averagePrice: - type: "number" - required: - - "flights" - instruction: "Extract flight options including airlines, times, prices, and amenities from Google Flights results" - reasoning: "Testing extraction from complex travel search interface with dynamic pricing" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Flight times are in proper format" - - "Prices are numeric values with currency" - - "Airlines and flight numbers are accurate" - - "Stop information is correctly identified" - - "Duration is in readable format" - -metadata: - tags: ["travel", "flights", "google", "booking"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/schema-extractor/google-search-001.yaml b/eval-server/nodejs/evals/schema-extractor/google-search-001.yaml deleted file mode 100644 index 5763ba8..0000000 --- a/eval-server/nodejs/evals/schema-extractor/google-search-001.yaml +++ /dev/null @@ -1,76 +0,0 @@ -# Google Search results extraction test -id: "google-search-001" -name: "Extract Google Search Results" -description: "Extract search results from Google search page" -enabled: true - -target: - url: "https://www.google.com/search?q=chrome+devtools+tutorial" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_data" -timeout: 45000 - -input: - schema: - type: "object" - properties: - query: - type: "string" - searchResults: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - url: - type: "string" - format: "url" - snippet: - type: "string" - domain: - type: "string" - required: - - "title" - - "url" - - "snippet" - featuredSnippet: - type: "object" - properties: - content: - type: "string" - source: - type: "string" - url: - type: "string" - format: "url" - relatedSearches: - type: "array" - items: - type: "string" - required: - - "searchResults" - instruction: "Extract the top 10 search results with titles, URLs, and snippets. 
Also extract featured snippet if present and related searches" - reasoning: "Testing extraction from Google search results page with various result types" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Search results are relevant to the query" - - "Each result has a valid title, URL, and snippet" - - "URLs are properly resolved and not node IDs" - - "Related searches are extracted if present" - - "Featured snippet is captured when available" - -metadata: - tags: ["search", "google", "serp", "dynamic"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/schema-extractor/homedepot-001.yaml b/eval-server/nodejs/evals/schema-extractor/homedepot-001.yaml deleted file mode 100644 index 2eb4883..0000000 --- a/eval-server/nodejs/evals/schema-extractor/homedepot-001.yaml +++ /dev/null @@ -1,92 +0,0 @@ -# Home Depot product search extraction test -id: "homedepot-001" -name: "Extract Home Depot Product Search" -description: "Extract product listings from Home Depot search results" -enabled: true - -target: - url: "https://www.homedepot.com/s/power%2520drill" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_data" -timeout: 60000 - -input: - schema: - type: "object" - properties: - searchQuery: - type: "string" - totalResults: - type: "number" - products: - type: "array" - items: - type: "object" - properties: - name: - type: "string" - brand: - type: "string" - price: - type: "number" - originalPrice: - type: "number" - savings: - type: "number" - rating: - type: "number" - reviewCount: - type: "number" - productUrl: - type: "string" - format: "url" - imageUrl: - type: "string" - format: "url" - availability: - type: "string" - features: - type: "array" - items: - type: "string" - required: - - "name" - - "price" - - "productUrl" - filters: - type: "object" - properties: - brands: - type: "array" - items: - type: "string" - priceRanges: - type: "array" - items: - type: "string" - required: - - "products" - instruction: "Extract product listings from Home Depot search results including prices, ratings, and availability" - reasoning: "Testing extraction from e-commerce search results with product cards and filters" - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Products are relevant to the search query" - - "Prices are numeric values in USD" - - "Product URLs link to Home Depot product pages" - - "Ratings are on a 5-star scale" - - "Key product features are captured" - -metadata: - tags: ["ecommerce", "homedepot", "products", "search"] - priority: "high" - timeout: 60000 - retries: 3 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/schema-extractor/macys-001.yaml b/eval-server/nodejs/evals/schema-extractor/macys-001.yaml deleted file mode 100644 index 81e05f9..0000000 --- a/eval-server/nodejs/evals/schema-extractor/macys-001.yaml +++ /dev/null @@ -1,106 +0,0 @@ -# Macy's product listing extraction test -id: "macys-001" -name: "Extract Macy's Product Listings" -description: "Extract fashion products from Macy's category page" -enabled: true - -target: - url: "https://www.macys.com/shop/womens-clothing/womens-dresses" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_data" -timeout: 60000 - -input: - schema: - type: "object" - properties: - category: - type: "string" - totalProducts: - type: "number" 
- products: - type: "array" - items: - type: "object" - properties: - name: - type: "string" - brand: - type: "string" - currentPrice: - type: "number" - originalPrice: - type: "number" - discount: - type: "string" - colors: - type: "array" - items: - type: "string" - sizes: - type: "array" - items: - type: "string" - rating: - type: "number" - reviewCount: - type: "number" - productUrl: - type: "string" - format: "url" - imageUrl: - type: "string" - format: "url" - promotions: - type: "array" - items: - type: "string" - required: - - "name" - - "brand" - - "currentPrice" - refinements: - type: "object" - properties: - brands: - type: "array" - items: - type: "string" - sizes: - type: "array" - items: - type: "string" - colors: - type: "array" - items: - type: "string" - priceRanges: - type: "array" - items: - type: "string" - required: - - "products" - instruction: "Extract fashion products including prices, sizes, colors, and promotional offers from Macy's" - reasoning: "Testing extraction from fashion e-commerce with complex product attributes" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Products are from the correct category" - - "Prices reflect current and sale prices" - - "Color and size options are captured" - - "Brand names are accurately extracted" - - "Promotional text is included when present" - -metadata: - tags: ["ecommerce", "macys", "fashion", "products"] - priority: "high" - timeout: 60000 - retries: 3 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/schema-extractor/wikipedia-search-001.yaml b/eval-server/nodejs/evals/schema-extractor/wikipedia-search-001.yaml deleted file mode 100644 index 616f0d6..0000000 --- a/eval-server/nodejs/evals/schema-extractor/wikipedia-search-001.yaml +++ /dev/null @@ -1,77 +0,0 @@ -# Wikipedia search results extraction test -id: "wikipedia-search-001" -name: "Extract Wikipedia Search Results" -description: "Extract search results from Wikipedia search" -enabled: true - -target: - url: "https://en.wikipedia.org/w/index.php?search=artificial+intelligence&title=Special:Search" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_data" -timeout: 30000 - -input: - schema: - type: "object" - properties: - searchTerm: - type: "string" - resultCount: - type: "number" - searchResults: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - url: - type: "string" - format: "url" - snippet: - type: "string" - category: - type: "string" - wordCount: - type: "number" - lastEdited: - type: "string" - required: - - "title" - - "url" - - "snippet" - suggestedArticles: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - url: - type: "string" - format: "url" - required: - - "searchResults" - instruction: "Extract Wikipedia search results including article titles, URLs, snippets, and metadata like word count or last edit date" - reasoning: "Testing extraction from Wikipedia's internal search with rich metadata" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Search results are Wikipedia articles" - - "Each result has a valid Wikipedia URL" - - "Snippets contain relevant content highlights" - - "Metadata like word count is extracted when available" - -metadata: - tags: ["search", "wikipedia", "encyclopedia"] - priority: "high" - timeout: 30000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end 
of file diff --git a/eval-server/nodejs/evals/screenshot-verification/dynamic-content-verification-001.yaml b/eval-server/nodejs/evals/screenshot-verification/dynamic-content-verification-001.yaml deleted file mode 100644 index 6ec53c4..0000000 --- a/eval-server/nodejs/evals/screenshot-verification/dynamic-content-verification-001.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Dynamic content visual verification test -id: "dynamic-content-verification-001" -name: "Dynamic Content Visual Verification" -description: "Test visual verification of dynamic content loading using screenshots" -enabled: true - -target: - url: "https://the-internet.herokuapp.com/dynamic_loading/1" - -tool: "action_agent" -timeout: 90000 - -input: - objective: "Take a screenshot, click the Start button, wait for content to load, then take another screenshot to verify the dynamic content appeared" - reasoning: "Testing visual verification of dynamic content changes using screenshot comparison" - hint: "Use take_screenshot before clicking Start, then again after the dynamic content loads" - - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4o" - criteria: - - "Initial screenshot captured the page before dynamic loading" - - "Start button was successfully clicked" - - "Agent waited for dynamic content to fully load" - - "Final screenshot shows the revealed dynamic content" - - "Visual comparison demonstrates successful content loading verification" - - "Screenshots show clear before/after difference in content visibility" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare screenshots to verify dynamic content loading" - - "Confirm the first screenshot shows hidden content area" - - "Verify the second screenshot shows the revealed 'Hello World!' 
text" - - "Check that the loading animation or process is properly captured" - -metadata: - tags: ["screenshot", "dynamic-content", "visual-verification", "loading"] - priority: "high" - timeout: 90000 - retries: 2 - flaky: true \ No newline at end of file diff --git a/eval-server/nodejs/evals/screenshot-verification/screenshot-error-handling-001.yaml b/eval-server/nodejs/evals/screenshot-verification/screenshot-error-handling-001.yaml deleted file mode 100644 index 6d31c50..0000000 --- a/eval-server/nodejs/evals/screenshot-verification/screenshot-error-handling-001.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# Screenshot error handling test -id: "screenshot-error-handling-001" -name: "Screenshot Error Handling" -description: "Test screenshot tool error handling and recovery" -enabled: true - -target: - url: "https://httpstat.us/500" - -tool: "take_screenshot" -timeout: 30000 - -input: - fullPage: false - - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4o" - criteria: - - "Screenshot tool handled the error page gracefully" - - "Either successfully captured the error page or reported appropriate error" - - "No crashes or undefined behavior occurred" - - "Tool response is meaningful regardless of page loading issues" - - "Error handling demonstrates robustness of screenshot functionality" - visual_verification: - enabled: true - capture_before: false - capture_after: true - prompts: - - "If screenshot was taken, verify it shows the error page content" - - "Check that the tool handled the HTTP 500 error appropriately" - - "Confirm no blank or corrupted screenshots were produced" - - "Ensure error scenarios are handled professionally" - -metadata: - tags: ["screenshot", "error-handling", "robustness", "edge-case"] - priority: "normal" - timeout: 30000 - retries: 1 - flaky: true \ No newline at end of file diff --git a/eval-server/nodejs/evals/screenshot-verification/screenshot-fullpage-001.yaml b/eval-server/nodejs/evals/screenshot-verification/screenshot-fullpage-001.yaml deleted file mode 100644 index a1c71f9..0000000 --- a/eval-server/nodejs/evals/screenshot-verification/screenshot-fullpage-001.yaml +++ /dev/null @@ -1,43 +0,0 @@ -# Full page screenshot verification test -id: "screenshot-fullpage-001" -name: "Take Full Page Screenshot" -description: "Test taking full page screenshot and verify functionality" -enabled: true - -target: - url: "https://en.wikipedia.org/wiki/Chrome_DevTools" - -tool: "take_screenshot" -timeout: 45000 - -input: - fullPage: true - - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4o" - criteria: - - "Full page screenshot was successfully captured" - - "Data URL contains valid image data" - - "Screenshot captures the entire page content including areas below the fold" - - "Image size is larger than viewport-only screenshot would be" - - "No errors occurred during full page capture" - - "Screenshot includes both header and footer content" - visual_verification: - enabled: true - capture_before: false - capture_after: true - prompts: - - "Verify the screenshot shows the complete Wikipedia article page" - - "Check that content above and below the fold is captured" - - "Confirm the image is taller than a typical viewport" - - "Ensure no content is cut off at the bottom" - -metadata: - tags: ["screenshot", "fullpage", "visual", "verification", "wikipedia"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: false \ No newline at end of file diff --git a/eval-server/nodejs/evals/screenshot-verification/screenshot-viewport-001.yaml 
b/eval-server/nodejs/evals/screenshot-verification/screenshot-viewport-001.yaml deleted file mode 100644 index 69531ee..0000000 --- a/eval-server/nodejs/evals/screenshot-verification/screenshot-viewport-001.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# Viewport screenshot verification test -id: "screenshot-viewport-001" -name: "Take Viewport Screenshot" -description: "Test taking viewport screenshot and verify functionality" -enabled: true - -target: - url: "https://www.google.com" - -tool: "take_screenshot" -timeout: 30000 - -input: - fullPage: false - - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4o" - criteria: - - "Screenshot was successfully captured" - - "Data URL is properly formatted and contains image data" - - "Screenshot shows the viewport content correctly" - - "No errors occurred during screenshot capture" - - "Image data length indicates a valid screenshot was taken" - visual_verification: - enabled: true - capture_before: false - capture_after: true - prompts: - - "Verify the screenshot shows the Google homepage" - - "Check that the screenshot is not empty or corrupted" - - "Confirm the image quality is appropriate for verification" - - "Ensure the screenshot captures the current viewport accurately" - -metadata: - tags: ["screenshot", "viewport", "visual", "verification"] - priority: "high" - timeout: 30000 - retries: 2 - flaky: false \ No newline at end of file diff --git a/eval-server/nodejs/evals/screenshot-verification/visual-comparison-001.yaml b/eval-server/nodejs/evals/screenshot-verification/visual-comparison-001.yaml deleted file mode 100644 index 7434a93..0000000 --- a/eval-server/nodejs/evals/screenshot-verification/visual-comparison-001.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Visual comparison verification test -id: "visual-comparison-001" -name: "Visual Comparison Before and After Action" -description: "Test visual verification by comparing screenshots before and after an action" -enabled: true - -target: - url: "https://www.google.com" - -tool: "action_agent" -timeout: 60000 - -input: - objective: "Take a screenshot, then type 'DevTools testing' in the search box, and take another screenshot to compare" - reasoning: "Testing visual verification workflow with before/after screenshot comparison" - hint: "Use take_screenshot tool before and after performing the search input action" - - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4o" - criteria: - - "Initial screenshot was taken before performing any actions" - - "Search text was successfully entered into the search field" - - "Second screenshot was taken after the text input" - - "Visual comparison shows the difference between before and after states" - - "Search field contains the entered text in the final screenshot" - - "Screenshots demonstrate successful action verification workflow" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Compare the before and after screenshots" - - "Verify the search field is empty in the first screenshot" - - "Confirm the search field contains 'DevTools testing' in the second screenshot" - - "Check that the visual changes accurately reflect the performed action" - -metadata: - tags: ["screenshot", "visual-comparison", "action-verification", "before-after"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: false \ No newline at end of file diff --git a/eval-server/nodejs/evals/streamlined-schema-extractor/amazon-product-001.yaml 
b/eval-server/nodejs/evals/streamlined-schema-extractor/amazon-product-001.yaml deleted file mode 100644 index b154454..0000000 --- a/eval-server/nodejs/evals/streamlined-schema-extractor/amazon-product-001.yaml +++ /dev/null @@ -1,78 +0,0 @@ -# E-commerce product extraction test (Streamlined) -id: "amazon-product-001" -name: "Extract Amazon Product Details" -description: "Extract product information from an Amazon product page" -enabled: true - -target: - url: "https://www.amazon.com/Obelisk-Climbing-Rustproof-Trellises-Clematis/dp/B0B4SBY6QD/" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 60000 - -input: - schema: - type: "object" - properties: - product: - type: "object" - properties: - title: - type: "string" - brand: - type: "string" - price: - type: "object" - properties: - current: - type: "number" - currency: - type: "string" - rating: - type: "object" - properties: - average: - type: "number" - count: - type: "number" - images: - type: "array" - items: - type: "string" - format: "url" - features: - type: "array" - items: - type: "string" - required: - - "title" - - "price" - availability: - type: "string" - required: - - "product" - instruction: "Extract comprehensive product information including pricing, ratings, and key features" - reasoning: "Testing extraction from a dynamic e-commerce page with complex structure" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Product title is accurate and complete" - - "Price information is current and properly formatted" - - "Rating data includes both average and review count" - - "Image URLs are valid and accessible" - - "Key product features are captured" - - "All URLs are properly resolved (not node IDs)" - -metadata: - tags: ["ecommerce", "amazon", "product", "dynamic"] - priority: "high" - timeout: 60000 - retries: 3 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/streamlined-schema-extractor/bbc-news-001.yaml b/eval-server/nodejs/evals/streamlined-schema-extractor/bbc-news-001.yaml deleted file mode 100644 index 31ef288..0000000 --- a/eval-server/nodejs/evals/streamlined-schema-extractor/bbc-news-001.yaml +++ /dev/null @@ -1,69 +0,0 @@ -# News article extraction test (Streamlined) -id: "bbc-news-001" -name: "Extract BBC News Article" -description: "Extract article content and metadata from a BBC News page" -enabled: true - -target: - url: "https://www.bbc.com/news/technology" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 30000 - -input: - schema: - type: "object" - properties: - headlines: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - summary: - type: "string" - url: - type: "string" - format: "url" - category: - type: "string" - required: - - "title" - mainStory: - type: "object" - properties: - headline: - type: "string" - summary: - type: "string" - url: - type: "string" - format: "url" - required: - - "headlines" - instruction: "Extract the main headlines and featured stories from the BBC Technology news section" - reasoning: "Testing extraction from a news aggregation page with multiple articles" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - include_url: true - criteria: - - "Headlines are current and relevant to technology news" - - "Article summaries provide meaningful context" - - "URLs link to valid BBC news articles" - - "Main 
story is properly identified" - - "All extracted content is in English" - -metadata: - tags: ["news", "bbc", "aggregation", "dynamic"] - priority: "high" - timeout: 30000 - retries: 2 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/streamlined-schema-extractor/bing-search-001.yaml b/eval-server/nodejs/evals/streamlined-schema-extractor/bing-search-001.yaml deleted file mode 100644 index e9f3b6e..0000000 --- a/eval-server/nodejs/evals/streamlined-schema-extractor/bing-search-001.yaml +++ /dev/null @@ -1,70 +0,0 @@ -# Bing Search results extraction test -id: "bing-search-001" -name: "Extract Bing Search Results" -description: "Extract search results from Bing search page" -enabled: true - -target: - url: "https://www.bing.com/search?q=web+scraping+best+practices" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 45000 - -input: - schema: - type: "object" - properties: - query: - type: "string" - searchResults: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - url: - type: "string" - format: "url" - snippet: - type: "string" - datePublished: - type: "string" - required: - - "title" - - "url" - - "snippet" - sidebarInfo: - type: "object" - properties: - title: - type: "string" - description: - type: "string" - source: - type: "string" - required: - - "searchResults" - instruction: "Extract search results including titles, URLs, snippets, and any sidebar information from Bing" - reasoning: "Testing extraction from Bing search results with different layout than Google" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Search results match the query intent" - - "Results include valid URLs and meaningful snippets" - - "Sidebar information is extracted when present" - - "No duplicate results in the list" - -metadata: - tags: ["search", "bing", "serp", "dynamic"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/streamlined-schema-extractor/github-repo-001.yaml b/eval-server/nodejs/evals/streamlined-schema-extractor/github-repo-001.yaml deleted file mode 100644 index 5c496c5..0000000 --- a/eval-server/nodejs/evals/streamlined-schema-extractor/github-repo-001.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# Simple structured data test (Streamlined) -id: "github-repo-001" -name: "Extract GitHub Repository Info" -description: "Extract basic repository information from a GitHub page" -enabled: true - -target: - url: "https://github.com/microsoft/TypeScript" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 30000 - -input: - schema: - type: "object" - properties: - name: - type: "string" - description: - type: "string" - language: - type: "string" - stars: - type: "number" - forks: - type: "number" - topics: - type: "array" - items: - type: "string" - readme: - type: "object" - properties: - summary: - type: "string" - required: - - "name" - - "description" - instruction: "Extract repository metadata and basic statistics" - reasoning: "Testing extraction from a well-structured GitHub repository page" - -validation: - type: "hybrid" - snapshot: - exclude_paths: - - "stars" - - "forks" - structure_only: false - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Repository name matches the GitHub page" - - "Description accurately reflects the project purpose" - 
- "Programming language is correctly identified" - - "Topic tags are relevant to the project" - -metadata: - tags: ["github", "repository", "structured"] - priority: "high" - timeout: 30000 - retries: 1 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/streamlined-schema-extractor/google-flights-001.yaml b/eval-server/nodejs/evals/streamlined-schema-extractor/google-flights-001.yaml deleted file mode 100644 index 981ccbd..0000000 --- a/eval-server/nodejs/evals/streamlined-schema-extractor/google-flights-001.yaml +++ /dev/null @@ -1,106 +0,0 @@ -# Google Flights search extraction test -id: "google-flights-001" -name: "Extract Google Flights Search Results" -description: "Extract flight options from Google Flights search" -enabled: true - -target: - url: "https://www.google.com/travel/flights/search?tfs=CBwQAhojEgoyMDI1LTEyLTI0agwIAhIIL20vMGQ5anJyBwgBEgNTRk8aIxIKMjAyNS0xMi0zMWoHCAESA1NGT3IMCAISCC9tLzBkOWpyQAFIAXABggELCP___________wGYAQE" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 60000 - -input: - schema: - type: "object" - properties: - searchCriteria: - type: "object" - properties: - origin: - type: "string" - destination: - type: "string" - departureDate: - type: "string" - returnDate: - type: "string" - tripType: - type: "string" - passengers: - type: "number" - flights: - type: "array" - items: - type: "object" - properties: - airline: - type: "string" - flightNumber: - type: "string" - departureTime: - type: "string" - arrivalTime: - type: "string" - duration: - type: "string" - stops: - type: "number" - price: - type: "object" - properties: - amount: - type: "number" - currency: - type: "string" - cabin: - type: "string" - bookingUrl: - type: "string" - format: "url" - legroom: - type: "string" - amenities: - type: "array" - items: - type: "string" - required: - - "airline" - - "departureTime" - - "arrivalTime" - - "price" - priceInsights: - type: "object" - properties: - trend: - type: "string" - recommendation: - type: "string" - averagePrice: - type: "number" - required: - - "flights" - instruction: "Extract flight options including airlines, times, prices, and amenities from Google Flights results" - reasoning: "Testing extraction from complex travel search interface with dynamic pricing" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Flight times are in proper format" - - "Prices are numeric values with currency" - - "Airlines and flight numbers are accurate" - - "Stop information is correctly identified" - - "Duration is in readable format" - -metadata: - tags: ["travel", "flights", "google", "booking"] - priority: "high" - timeout: 60000 - retries: 2 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/streamlined-schema-extractor/google-search-001.yaml b/eval-server/nodejs/evals/streamlined-schema-extractor/google-search-001.yaml deleted file mode 100644 index c1725d4..0000000 --- a/eval-server/nodejs/evals/streamlined-schema-extractor/google-search-001.yaml +++ /dev/null @@ -1,76 +0,0 @@ -# Google Search results extraction test -id: "google-search-001" -name: "Extract Google Search Results" -description: "Extract search results from Google search page" -enabled: true - -target: - url: "https://www.google.com/search?q=chrome+devtools+tutorial" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 45000 - -input: - 
schema: - type: "object" - properties: - query: - type: "string" - searchResults: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - url: - type: "string" - format: "url" - snippet: - type: "string" - domain: - type: "string" - required: - - "title" - - "url" - - "snippet" - featuredSnippet: - type: "object" - properties: - content: - type: "string" - source: - type: "string" - url: - type: "string" - format: "url" - relatedSearches: - type: "array" - items: - type: "string" - required: - - "searchResults" - instruction: "Extract the top 10 search results with titles, URLs, and snippets. Also extract featured snippet if present and related searches" - reasoning: "Testing extraction from Google search results page with various result types" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Search results are relevant to the query" - - "Each result has a valid title, URL, and snippet" - - "URLs are properly resolved and not node IDs" - - "Related searches are extracted if present" - - "Featured snippet is captured when available" - -metadata: - tags: ["search", "google", "serp", "dynamic"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/streamlined-schema-extractor/homedepot-001.yaml b/eval-server/nodejs/evals/streamlined-schema-extractor/homedepot-001.yaml deleted file mode 100644 index 1d26848..0000000 --- a/eval-server/nodejs/evals/streamlined-schema-extractor/homedepot-001.yaml +++ /dev/null @@ -1,92 +0,0 @@ -# Home Depot product search extraction test -id: "homedepot-001" -name: "Extract Home Depot Product Search" -description: "Extract product listings from Home Depot search results" -enabled: true - -target: - url: "https://www.homedepot.com/s/power%2520drill" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 60000 - -input: - schema: - type: "object" - properties: - searchQuery: - type: "string" - totalResults: - type: "number" - products: - type: "array" - items: - type: "object" - properties: - name: - type: "string" - brand: - type: "string" - price: - type: "number" - originalPrice: - type: "number" - savings: - type: "number" - rating: - type: "number" - reviewCount: - type: "number" - productUrl: - type: "string" - format: "url" - imageUrl: - type: "string" - format: "url" - availability: - type: "string" - features: - type: "array" - items: - type: "string" - required: - - "name" - - "price" - - "productUrl" - filters: - type: "object" - properties: - brands: - type: "array" - items: - type: "string" - priceRanges: - type: "array" - items: - type: "string" - required: - - "products" - instruction: "Extract product listings from Home Depot search results including prices, ratings, and availability" - reasoning: "Testing extraction from e-commerce search results with product cards and filters" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Products are relevant to the search query" - - "Prices are numeric values in USD" - - "Product URLs link to Home Depot product pages" - - "Ratings are on a 5-star scale" - - "Key product features are captured" - -metadata: - tags: ["ecommerce", "homedepot", "products", "search"] - priority: "high" - timeout: 60000 - retries: 3 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git 
a/eval-server/nodejs/evals/streamlined-schema-extractor/macys-001.yaml b/eval-server/nodejs/evals/streamlined-schema-extractor/macys-001.yaml deleted file mode 100644 index 28a2c10..0000000 --- a/eval-server/nodejs/evals/streamlined-schema-extractor/macys-001.yaml +++ /dev/null @@ -1,106 +0,0 @@ -# Macy's product listing extraction test -id: "macys-001" -name: "Extract Macy's Product Listings" -description: "Extract fashion products from Macy's category page" -enabled: true - -target: - url: "https://www.macys.com/shop/womens-clothing/womens-dresses" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 60000 - -input: - schema: - type: "object" - properties: - category: - type: "string" - totalProducts: - type: "number" - products: - type: "array" - items: - type: "object" - properties: - name: - type: "string" - brand: - type: "string" - currentPrice: - type: "number" - originalPrice: - type: "number" - discount: - type: "string" - colors: - type: "array" - items: - type: "string" - sizes: - type: "array" - items: - type: "string" - rating: - type: "number" - reviewCount: - type: "number" - productUrl: - type: "string" - format: "url" - imageUrl: - type: "string" - format: "url" - promotions: - type: "array" - items: - type: "string" - required: - - "name" - - "brand" - - "currentPrice" - refinements: - type: "object" - properties: - brands: - type: "array" - items: - type: "string" - sizes: - type: "array" - items: - type: "string" - colors: - type: "array" - items: - type: "string" - priceRanges: - type: "array" - items: - type: "string" - required: - - "products" - instruction: "Extract fashion products including prices, sizes, colors, and promotional offers from Macy's" - reasoning: "Testing extraction from fashion e-commerce with complex product attributes" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Products are from the correct category" - - "Prices reflect current and sale prices" - - "Color and size options are captured" - - "Brand names are accurately extracted" - - "Promotional text is included when present" - -metadata: - tags: ["ecommerce", "macys", "fashion", "products"] - priority: "high" - timeout: 60000 - retries: 3 - flaky: true - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/streamlined-schema-extractor/wikipedia-001.yaml b/eval-server/nodejs/evals/streamlined-schema-extractor/wikipedia-001.yaml deleted file mode 100644 index 88983bd..0000000 --- a/eval-server/nodejs/evals/streamlined-schema-extractor/wikipedia-001.yaml +++ /dev/null @@ -1,76 +0,0 @@ -# Wikipedia article extraction test (Streamlined) -id: "wikipedia-chrome-devtools-001" -name: "Extract Chrome DevTools Wikipedia Article" -description: "Extract structured information from the Chrome DevTools Wikipedia page" -enabled: true - -target: - url: "https://en.wikipedia.org/wiki/Chrome_DevTools" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 45000 - -input: - schema: - type: "object" - properties: - title: - type: "string" - summary: - type: "string" - tableOfContents: - type: "array" - items: - type: "string" - infobox: - type: "object" - properties: - developer: - type: "string" - initialRelease: - type: "string" - operatingSystem: - type: "string" - license: - type: "string" - externalLinks: - type: "array" - items: - type: "object" - properties: - text: - type: "string" - url: - type: "string" - format: "url" - 
required: - - "title" - - "summary" - instruction: "Extract the main article information including title, summary, table of contents, and infobox details" - reasoning: "Testing extraction from a stable, well-structured Wikipedia page" - -validation: - type: "hybrid" - snapshot: - exclude_paths: - - "externalLinks[*].url" - structure_only: false - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Article title matches the Wikipedia page title" - - "Summary captures the main description of Chrome DevTools" - - "Table of contents includes major sections" - - "Infobox contains key technical details" - - "External links are properly resolved URLs" - -metadata: - tags: ["wikipedia", "documentation", "stable"] - priority: "high" - timeout: 45000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/streamlined-schema-extractor/wikipedia-search-001.yaml b/eval-server/nodejs/evals/streamlined-schema-extractor/wikipedia-search-001.yaml deleted file mode 100644 index c432c20..0000000 --- a/eval-server/nodejs/evals/streamlined-schema-extractor/wikipedia-search-001.yaml +++ /dev/null @@ -1,77 +0,0 @@ -# Wikipedia search results extraction test -id: "wikipedia-search-001" -name: "Extract Wikipedia Search Results" -description: "Extract search results from Wikipedia search" -enabled: true - -target: - url: "https://en.wikipedia.org/w/index.php?search=artificial+intelligence&title=Special:Search" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "extract_schema_streamlined" -timeout: 30000 - -input: - schema: - type: "object" - properties: - searchTerm: - type: "string" - resultCount: - type: "number" - searchResults: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - url: - type: "string" - format: "url" - snippet: - type: "string" - category: - type: "string" - wordCount: - type: "number" - lastEdited: - type: "string" - required: - - "title" - - "url" - - "snippet" - suggestedArticles: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - url: - type: "string" - format: "url" - required: - - "searchResults" - instruction: "Extract Wikipedia search results including article titles, URLs, snippets, and metadata like word count or last edit date" - reasoning: "Testing extraction from Wikipedia's internal search with rich metadata" - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4.1-mini" - temperature: 0.3 - criteria: - - "Search results are Wikipedia articles" - - "Each result has a valid Wikipedia URL" - - "Snippets contain relevant content highlights" - - "Metadata like word count is extracted when available" - -metadata: - tags: ["search", "wikipedia", "encyclopedia"] - priority: "high" - timeout: 30000 - retries: 2 - flaky: false - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/booking-001.yaml b/eval-server/nodejs/evals/web-task-agent/booking-001.yaml deleted file mode 100644 index 8a99d17..0000000 --- a/eval-server/nodejs/evals/web-task-agent/booking-001.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Hotel Search Workflow - Web Task Agent -id: "booking-001" -name: "Hotel Search Workflow" -description: "Test web task agent orchestrating complex multi-step booking search" -enabled: true - -target: - url: "https://www.booking.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for hotels in San Francisco for 2 adults, 
check-in March 15, check-out March 17" - reasoning: "Customer is looking for travel booking" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully searched for hotels in San Francisco" - - "Results show hotels available for March 15-17 dates" - - "Guest count of 2 adults is reflected in the search results" - - "Returned multiple hotel options with relevant details" - - "Each hotel includes essential information (name, price, location)" - - "Results are presented in a clear, readable format" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify hotel search results are displayed for San Francisco" - - "Check that dates March 15-17 are correctly selected" - - "Confirm guest count shows 2 adults" - - "Ensure search results show hotels with availability for specified dates" - -metadata: - tags: ["web-task", "booking", "workflow", "multi-step", "travel", "complex"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/ecommerce-001.yaml b/eval-server/nodejs/evals/web-task-agent/ecommerce-001.yaml deleted file mode 100644 index 338f464..0000000 --- a/eval-server/nodejs/evals/web-task-agent/ecommerce-001.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# E-commerce web task evaluation (matches DevTools test case) -id: "ecommerce-001" -name: "E-commerce Product Search" -description: "Test web task agent handling product search on shopping site" -enabled: true - -target: - url: "https://www.amazon.com" - -tool: "web_task_agent" -timeout: 90000 - -input: - task: "Search Amazon for \"wireless headphones\" and find products under $100" - reasoning: "Testing e-commerce search workflow with price filtering" - context: "User wants to find wireless headphones with specific price constraint" - extraction_schema: - type: "object" - properties: - products: - type: "array" - items: - type: "object" - properties: - name: - type: "string" - price: - type: "string" - rating: - type: "string" - url: - type: "string" - - -validation: - type: "hybrid" - llm_judge: - model: "gpt-4o" - criteria: - - "Successfully navigated to product search" - - "Applied appropriate filters correctly" - - "Extracted product details accurately" - - "Provided meaningful comparison of features" - - "Stayed within specified price range" - snapshot: - structure_only: true - exclude_paths: - - "timestamp" - - "sessionId" - -metadata: - tags: ["web-task", "multi-step", "ecommerce", "search"] - priority: "high" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/error-001.yaml b/eval-server/nodejs/evals/web-task-agent/error-001.yaml deleted file mode 100644 index 1831a14..0000000 --- a/eval-server/nodejs/evals/web-task-agent/error-001.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Error Recovery Workflow - Web Task Agent -id: "error-001" -name: "Error Recovery Workflow" -description: "Test web task agent handling action_agent failures and retry logic" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for \"nonexistent test query 12345\" and handle any issues that arise" - reasoning: "Customer is asking for this response" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Attempted to search for the unusual query \"nonexistent test query 
12345\"" - - "Either found some results OR provided clear explanation why no results were found" - - "Response handles the edge case gracefully without errors" - - "If no results found, suggested alternative actions or explanations" - - "Maintained professional tone despite unusual request" - - "Final output is coherent and helpful to the user" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Check if search was attempted despite unusual query" - - "Verify error handling did not break the page interaction" - - "Confirm agent attempted to complete the task or provided clear error info" - - "Ensure page is still functional after error recovery attempts" - -metadata: - tags: ["web-task", "error-recovery", "retry", "orchestration", "robustness"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/extract-001.yaml b/eval-server/nodejs/evals/web-task-agent/extract-001.yaml deleted file mode 100644 index e836aa0..0000000 --- a/eval-server/nodejs/evals/web-task-agent/extract-001.yaml +++ /dev/null @@ -1,60 +0,0 @@ -# Structured Data Extraction - Web Task Agent -id: "extract-001" -name: "Structured Data Extraction" -description: "Test web task agent extracting structured data from search results" -enabled: true - -target: - url: "https://news.ycombinator.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Extract the top 5 Hacker News stories with their titles, scores, and comment counts" - reasoning: "User is looking to understand the top stories on Hacker News" - extraction_schema: - type: "object" - properties: - stories: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - score: - type: "number" - comments: - type: "number" - url: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully returned exactly 5 Hacker News stories in structured text format" - - "Each story is numbered (1., 2., 3., 4., 5.) 
with title, score, comments, and URL" - - "Results are presented in readable text format similar to the example provided" - - "Response includes all required fields: title, score, comments count, URL" - - "Maintained proper orchestration pattern throughout the extraction process" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Hacker News homepage is loaded and displaying stories" - - "Check that top stories are visible with scores and comment counts" - - "Confirm story titles and metadata are clearly displayed" - - "Ensure page structure allows for data extraction" - -metadata: - tags: ["web-task", "data-extraction", "structured-data", "hackernews", "schema"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/finance-001.yaml b/eval-server/nodejs/evals/web-task-agent/finance-001.yaml deleted file mode 100644 index 2c661ed..0000000 --- a/eval-server/nodejs/evals/web-task-agent/finance-001.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# Stock Information Research - Web Task Agent -id: "finance-001" -name: "Stock Information Research" -description: "Test extracting stock prices and financial information" -enabled: true - -target: - url: "https://finance.yahoo.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for Apple (AAPL) stock information and extract current price, market cap, and recent performance" - reasoning: "Users need automated financial data collection for investment decisions" - extraction_schema: - type: "object" - properties: - stock_info: - type: "object" - properties: - symbol: - type: "string" - company_name: - type: "string" - current_price: - type: "string" - change: - type: "string" - change_percent: - type: "string" - market_cap: - type: "string" - pe_ratio: - type: "string" - volume: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully found Apple (AAPL) stock information" - - "Current stock price is clearly stated" - - "Market cap information is included" - - "Price change and percentage change are provided" - - "Additional metrics (PE ratio, volume) included when available" - - "Financial data is current and presented in readable text format (not JSON)" - - "Stock information is well-organized and easy to understand" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Yahoo Finance shows Apple (AAPL) stock page" - - "Check that current stock price and change are visible" - - "Confirm market cap and trading volume are displayed" - - "Ensure financial metrics and charts are shown" - -metadata: - tags: ["web-task", "finance", "stocks", "yahoo-finance", "investment", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/flight-001.yaml b/eval-server/nodejs/evals/web-task-agent/flight-001.yaml deleted file mode 100644 index f74b255..0000000 --- a/eval-server/nodejs/evals/web-task-agent/flight-001.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Complex Flight Search - Web Task Agent -id: "flight-001" -name: "Complex Flight Search" -description: "Test web task agent handling complex flight search with multiple criteria" -enabled: true - -target: - url: "https://www.kayak.com" - wait_for: "networkidle" - wait_timeout: 
5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for round-trip flights from Seattle (SEA) to Tokyo (NRT) departing March 20, returning March 30" - reasoning: "Customer is looking for finding the best flight options" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully found round-trip flights from Seattle (SEA) to Tokyo (NRT)" - - "Flight results show March 20 departure date" - - "Flight results show March 30 return date" - - "Returned multiple flight options with airlines and prices" - - "Each flight includes essential details (times, airlines, prices)" - - "Results clearly distinguish between outbound and return flights" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify flight search results are displayed" - - "Check SEA to NRT route is correctly selected" - - "Confirm dates March 20 departure and March 30 return" - - "Ensure flight options are showing with prices and airlines" - -metadata: - tags: ["web-task", "flight", "travel", "multi-step", "kayak", "round-trip"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/food-001.yaml b/eval-server/nodejs/evals/web-task-agent/food-001.yaml deleted file mode 100644 index 382b470..0000000 --- a/eval-server/nodejs/evals/web-task-agent/food-001.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# Restaurant Search and Menu Extraction - Web Task Agent -id: "food-001" -name: "Restaurant Search and Menu Extraction" -description: "Test searching restaurants and extracting menu information" -enabled: true - -target: - url: "https://www.yelp.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for \"Italian restaurants near me\" in San Francisco and extract restaurant details" - reasoning: "Users want to quickly compare restaurants, menus, and reviews" - extraction_schema: - type: "object" - properties: - restaurants: - type: "array" - items: - type: "object" - properties: - name: - type: "string" - rating: - type: "string" - price_range: - type: "string" - cuisine: - type: "string" - address: - type: "string" - phone: - type: "string" - hours: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully found Italian restaurants in San Francisco" - - "Each restaurant includes name, rating, and price range" - - "Location/address information is provided for each restaurant" - - "Contact details (phone/hours) included when available" - - "All restaurants listed serve Italian cuisine" - - "Results are presented in clear, structured text format (not JSON)" - - "Restaurants are numbered or organized clearly for easy comparison" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Yelp search results for Italian restaurants" - - "Check that restaurants show ratings and price ranges" - - "Confirm location filter shows San Francisco results" - - "Ensure restaurant listings include contact information" - -metadata: - tags: ["web-task", "restaurants", "yelp", "food", "local-search", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/iframe-001.yaml b/eval-server/nodejs/evals/web-task-agent/iframe-001.yaml deleted file 
mode 100644 index a9234e5..0000000 --- a/eval-server/nodejs/evals/web-task-agent/iframe-001.yaml +++ /dev/null @@ -1,83 +0,0 @@ -# ANA Airlines Iframe Content Extraction - Web Task Agent -id: "iframe-001" -name: "ANA Airlines Iframe Content Extraction" -description: "Test web task agent handling iframe-heavy airline booking sites like ANA Airlines" -enabled: true - -target: - url: "https://aswbe.ana.co.jp/webapps/reservation/flight-search?CONNECTION_KIND=SEA&LANG=en&hiddenSearchMode=ROUND_TRIP&departureDate:field=20260320&returnDate:field=20260330&departureAirportCode:field=SEA&arrivalAirportCode:field=NRT&adultCount=1&youngAdultCount=0&childCount=0&infantCount=0&boardingClass=INTY001&searchFlag=1" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Navigate the ANA Airlines flight search page and extract available flight options from Seattle (SEA) to Tokyo Narita (NRT) for March 20-30, 2026. Handle any iframe content and booking interface elements." - reasoning: "Testing iframe content extraction and complex airline booking site navigation" - extraction_schema: - type: "object" - properties: - flights: - type: "array" - items: - type: "object" - properties: - flight_number: - type: "string" - airline: - type: "string" - departure_time: - type: "string" - arrival_time: - type: "string" - departure_date: - type: "string" - arrival_date: - type: "string" - duration: - type: "string" - aircraft: - type: "string" - price: - type: "string" - cabin_class: - type: "string" - stops: - type: "string" - booking_interface_status: - type: "string" - iframe_content_found: - type: "boolean" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully navigated ANA Airlines booking interface" - - "Handled iframe content correctly (iframe_content_found should be true if iframes detected)" - - "Extracted flight information from ANA flight search results" - - "Flight details include ANA flight numbers and accurate route (SEA to NRT)" - - "Extracted pricing information in appropriate currency" - - "Handled any booking interface elements, popups, or navigation flows" - - "Results show flights for the correct dates (March 20-30, 2026)" - - "Successfully demonstrated iframe content extraction capabilities" - - "Booking interface status indicates successful page interaction" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify ANA Airlines flight search page loaded correctly" - - "Check that search parameters show SEA to NRT route" - - "Confirm flight results are displayed (may be in iframes)" - - "Ensure booking interface elements are functional" - - "Verify flight information is accessible and extractable" - -metadata: - tags: ["web-task", "iframe", "ana-airlines", "complex-booking", "international-flight", "airline-specific"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/jobs-001.yaml b/eval-server/nodejs/evals/web-task-agent/jobs-001.yaml deleted file mode 100644 index 7a6caa8..0000000 --- a/eval-server/nodejs/evals/web-task-agent/jobs-001.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# Job Search Workflow - Web Task Agent -id: "jobs-001" -name: "Job Search Workflow" -description: "Test web task agent orchestrating job search on LinkedIn" -enabled: true - -target: - url: "https://www.linkedin.com/jobs" - wait_for: "networkidle" - wait_timeout: 
5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for \"Software Engineer\" jobs in \"San Francisco\" and extract details for the first 5 results" - reasoning: "User wants to find job opportunities in tech industry" - extraction_schema: - type: "object" - properties: - jobs: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - company: - type: "string" - location: - type: "string" - salary: - type: "string" - description: - type: "string" - url: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Either used construct_direct_url for LinkedIn job search OR used traditional form interaction" - - "If using direct URL: constructed proper LinkedIn job search URL with keywords and location" - - "If using forms: delegated keyword and location input to action_agent" - - "Extracted job listings using extract_data" - - "Returned structured job data in readable text format (not JSON)" - - "Each job listing includes title, company, location, and other relevant fields" - - "Results are numbered or organized clearly for easy reading" - - "Demonstrated proper workflow orchestration for job search" - - "Never used direct browser interaction tools" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify LinkedIn job search results are displayed" - - "Check that search shows Software Engineer jobs in San Francisco" - - "Confirm job listings include company names and titles" - - "Ensure at least 5 job results are visible" - -metadata: - tags: ["web-task", "jobs", "linkedin", "search", "career", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/learning-001.yaml b/eval-server/nodejs/evals/web-task-agent/learning-001.yaml deleted file mode 100644 index 1e4c761..0000000 --- a/eval-server/nodejs/evals/web-task-agent/learning-001.yaml +++ /dev/null @@ -1,69 +0,0 @@ -# Online Course Search - Web Task Agent -id: "learning-001" -name: "Online Course Search" -description: "Test searching and extracting course information from learning platforms" -enabled: true - -target: - url: "https://www.coursera.org" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for \"Machine Learning\" courses and extract details for top 5 results" - reasoning: "Users want to compare courses across platforms for learning decisions" - extraction_schema: - type: "object" - properties: - courses: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - instructor: - type: "string" - university: - type: "string" - rating: - type: "string" - duration: - type: "string" - price: - type: "string" - description: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully found Machine Learning courses on Coursera" - - "Returned details for top 5 courses as requested" - - "Each course includes title, instructor, university, and rating" - - "Duration and pricing information included for each course" - - "Course descriptions or key topics are provided" - - "Results are presented in structured text format (not JSON)" - - "Courses are numbered (1-5) and well-organized for easy comparison" - - "Each course entry is clearly formatted and readable" - visual_verification: - enabled: true - 
capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Coursera search results for Machine Learning" - - "Check that courses show titles, instructors, and ratings" - - "Confirm course details include duration and pricing" - - "Ensure search results are relevant to Machine Learning" - -metadata: - tags: ["web-task", "education", "coursera", "courses", "learning", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/nav-001.yaml b/eval-server/nodejs/evals/web-task-agent/nav-001.yaml deleted file mode 100644 index bff519f..0000000 --- a/eval-server/nodejs/evals/web-task-agent/nav-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Site Navigation Workflow - Web Task Agent -id: "nav-001" -name: "Site Navigation Workflow" -description: "Test web task agent orchestrating navigation between different sections of a site" -enabled: true - -target: - url: "https://www.wikipedia.org" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 90000 - -input: - task: "Navigate to the Wikipedia homepage, search for \"artificial intelligence\", and find information about machine learning" - reasoning: "User is looking to explore Wikipedia content through structured navigation" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Orchestrated Wikipedia search via action_agent calls" - - "Navigated to artificial intelligence article through action_agent" - - "Located machine learning section via action_agent coordination" - - "Extracted relevant information about machine learning" - - "Demonstrated multi-step navigation workflow" - - "Maintained orchestration pattern throughout navigation" - - "Provided structured summary of found information" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify navigation reached artificial intelligence Wikipedia page" - - "Check that machine learning section or content is visible" - - "Confirm successful navigation through multiple page sections" - - "Ensure content related to machine learning is displayed" - -metadata: - tags: ["web-task", "navigation", "multi-step", "wikipedia", "content-exploration"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/news-001.yaml b/eval-server/nodejs/evals/web-task-agent/news-001.yaml deleted file mode 100644 index 4c29aed..0000000 --- a/eval-server/nodejs/evals/web-task-agent/news-001.yaml +++ /dev/null @@ -1,64 +0,0 @@ -# News Article Aggregation - Web Task Agent -id: "news-001" -name: "News Article Aggregation" -description: "Test aggregating news headlines and summaries from news sites" -enabled: true - -target: - url: "https://news.ycombinator.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Extract the top 10 Hacker News stories with titles, scores, and first few comments" - reasoning: "Users want automated news monitoring for research and awareness" - extraction_schema: - type: "object" - properties: - articles: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - score: - type: "number" - comments_count: - type: "number" - url: - type: "string" - top_comment: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - 
"Successfully extracted 10 Hacker News stories as requested" - - "Each story includes title, score, and comment count" - - "URLs are provided for each story" - - "Stories appear to be from the current top/front page" - - "Results are presented in clear, numbered text format (1-10), not JSON" - - "All required fields are present and properly formatted in readable text" - - "Each story is clearly separated and easy to read" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Hacker News stories are visible with scores" - - "Check that story titles and comment counts are shown" - - "Confirm top stories section is properly displayed" - - "Ensure story metadata is accessible for extraction" - -metadata: - tags: ["web-task", "news", "hackernews", "aggregation", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/realestate-001.yaml b/eval-server/nodejs/evals/web-task-agent/realestate-001.yaml deleted file mode 100644 index 5fd824e..0000000 --- a/eval-server/nodejs/evals/web-task-agent/realestate-001.yaml +++ /dev/null @@ -1,70 +0,0 @@ -# Real Estate Property Search - Web Task Agent -id: "realestate-001" -name: "Real Estate Property Search" -description: "Test property search workflow on real estate platforms" -enabled: true - -target: - url: "https://www.zillow.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for houses for sale in Austin, Texas under $500k and extract property details" - reasoning: "User wants to find affordable housing options in a specific location" - extraction_schema: - type: "object" - properties: - properties: - type: "array" - items: - type: "object" - properties: - address: - type: "string" - price: - type: "string" - bedrooms: - type: "number" - bathrooms: - type: "number" - sqft: - type: "string" - lot_size: - type: "string" - year_built: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Orchestrated location search via action_agent" - - "Delegated price filter setting to action_agent" - - "Coordinated property type selection through action_agent" - - "Applied search filters through proper action_agent calls" - - "Extracted property listings with extract_data" - - "Returned structured property data in readable text format (not JSON)" - - "Each property includes address, price, bedrooms, bathrooms, and other key details" - - "Properties are clearly numbered or organized for easy comparison" - - "Demonstrated complex real estate search workflow orchestration" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Zillow search results for Austin, Texas properties" - - "Check that properties shown are under $500k" - - "Confirm property listings show price, beds, baths info" - - "Ensure search results match the specified criteria" - -metadata: - tags: ["web-task", "real-estate", "zillow", "property-search", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/scroll-001.yaml b/eval-server/nodejs/evals/web-task-agent/scroll-001.yaml deleted file mode 100644 index 12a986f..0000000 --- a/eval-server/nodejs/evals/web-task-agent/scroll-001.yaml +++ /dev/null @@ -1,61 +0,0 @@ -# Infinite Scroll Content Loading 
- Web Task Agent -id: "scroll-001" -name: "Infinite Scroll Content Loading" -description: "Test web task agent handling infinite scroll pages to load more content" -enabled: true - -target: - url: "https://twitter.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Scroll down the Twitter feed to load at least 20 tweets and extract their content" - reasoning: "Testing infinite scroll functionality for dynamic content loading" - extraction_schema: - type: "object" - properties: - tweets: - type: "array" - items: - type: "object" - properties: - author: - type: "string" - content: - type: "string" - likes: - type: "string" - retweets: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully used scroll_page tool to scroll down the page" - - "Loaded additional content through scrolling actions" - - "Extracted at least 20 tweets from the feed" - - "Each tweet includes author and content information" - - "Demonstrated proper handling of dynamically loaded content" - - "Results are presented in clear, numbered text format" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify initial Twitter feed is loaded" - - "Check that scrolling action loaded additional tweets" - - "Confirm at least 20 tweets are visible after scrolling" - - "Ensure page scrolled down significantly from initial position" - -metadata: - tags: ["web-task", "scrolling", "infinite-scroll", "dynamic-content", "twitter"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/scroll-002.yaml b/eval-server/nodejs/evals/web-task-agent/scroll-002.yaml deleted file mode 100644 index dce0156..0000000 --- a/eval-server/nodejs/evals/web-task-agent/scroll-002.yaml +++ /dev/null @@ -1,65 +0,0 @@ -# Product Review Scrolling - Web Task Agent -id: "scroll-002" -name: "Product Review Scrolling" -description: "Test scrolling to load more product reviews on e-commerce sites" -enabled: true - -target: - url: "https://www.amazon.com/dp/B09B8V1LZ3" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Scroll down to the reviews section and load more reviews by scrolling, then extract review details" - reasoning: "Users need to see multiple reviews beyond initial visible ones" - extraction_schema: - type: "object" - properties: - reviews: - type: "array" - items: - type: "object" - properties: - rating: - type: "string" - title: - type: "string" - author: - type: "string" - date: - type: "string" - verified: - type: "boolean" - content: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Used scroll_page tool to navigate to reviews section" - - "Scrolled within reviews area to load additional reviews" - - "Extracted multiple product reviews with ratings" - - "Each review includes rating, author, and content" - - "Successfully handled lazy-loaded review content" - - "Presented reviews in structured, readable format" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Amazon product page is loaded" - - "Check that page scrolled to reviews section" - - "Confirm additional reviews loaded after scrolling" - - "Ensure review content is fully visible" - 
-metadata: - tags: ["web-task", "scrolling", "reviews", "amazon", "e-commerce"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/scroll-003.yaml b/eval-server/nodejs/evals/web-task-agent/scroll-003.yaml deleted file mode 100644 index df7eaba..0000000 --- a/eval-server/nodejs/evals/web-task-agent/scroll-003.yaml +++ /dev/null @@ -1,61 +0,0 @@ -# News Article Progressive Loading - Web Task Agent -id: "scroll-003" -name: "News Article Progressive Loading" -description: "Test scrolling through news sites that load articles progressively" -enabled: true - -target: - url: "https://medium.com/topic/technology" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Scroll down to load more technology articles and extract titles and authors for at least 15 articles" - reasoning: "Testing progressive content loading on news/blog platforms" - extraction_schema: - type: "object" - properties: - articles: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - author: - type: "string" - reading_time: - type: "string" - preview: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Used scroll_page tool multiple times to load content" - - "Successfully loaded at least 15 articles through scrolling" - - "Extracted article titles and author information" - - "Handled Medium's progressive loading mechanism" - - "Articles are from technology topic as requested" - - "Results presented in clear, numbered format" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Medium technology page is loaded" - - "Check that initial articles are visible" - - "Confirm scrolling loaded additional articles" - - "Ensure at least 15 articles are visible after scrolling" - -metadata: - tags: ["web-task", "scrolling", "progressive-loading", "medium", "articles"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/scroll-004.yaml b/eval-server/nodejs/evals/web-task-agent/scroll-004.yaml deleted file mode 100644 index e9b3534..0000000 --- a/eval-server/nodejs/evals/web-task-agent/scroll-004.yaml +++ /dev/null @@ -1,61 +0,0 @@ -# Search Results Infinite Scroll - Web Task Agent -id: "scroll-004" -name: "Search Results Infinite Scroll" -description: "Test handling search results that use infinite scroll instead of pagination" -enabled: true - -target: - url: "https://www.pinterest.com/search/pins/?q=web%20design" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for \"web design\" pins and scroll to load at least 30 results, then extract pin details" - reasoning: "Testing infinite scroll on visual search platforms" - extraction_schema: - type: "object" - properties: - pins: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - description: - type: "string" - saves: - type: "string" - source: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully performed search for \"web design\" pins" - - "Used scroll_page tool to trigger infinite scroll loading" - - "Loaded at least 30 pins through scrolling actions" - - "Extracted pin titles and metadata" - - "Handled Pinterest's masonry 
layout and lazy loading" - - "Results are well-organized and readable" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Pinterest search results for web design" - - "Check initial pins are displayed" - - "Confirm scrolling loaded many more pins" - - "Ensure grid layout shows 30+ pins after scrolling" - -metadata: - tags: ["web-task", "scrolling", "infinite-scroll", "pinterest", "visual-search"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/scroll-005.yaml b/eval-server/nodejs/evals/web-task-agent/scroll-005.yaml deleted file mode 100644 index 47c8769..0000000 --- a/eval-server/nodejs/evals/web-task-agent/scroll-005.yaml +++ /dev/null @@ -1,73 +0,0 @@ -# Google Flights Scroll and Show More - Web Task Agent -id: "scroll-005" -name: "Google Flights Scroll and Show More" -description: "Test scrolling and clicking \"Show more flights\" button on Google Flights to load additional flight options" -enabled: true - -target: - url: "https://www.google.com/travel/flights?sca_esv=646eedf97dcc8cf2&source=flun&uitype=cuAA&hl=en&gl=us&curr=USD&tfs=CAEQAhoeEgoyMDI2LTAzLTIwagcIARIDU0VBcgcIARIDTlJUGh4SCjIwMjYtMDMtMzBqBwgBEgNOUlRyBwgBEgNTRUF6aENqUklhVFJJTVVwVlZVOXpNakJCUTJodGVFRkNSeTB0TFMwdExTMHRjR3BpYjI4eE0wRkJRVUZCUjJoc1lsWlZRV2RYUlZsQkVnTmpTMFVhQ3dqUXNnVVFBaG9EVlZORU9EQncwTElG" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Extract the initial flight results, then scroll down and click \"Show more flights\" button to load additional flights. Extract at least 20 total flight options from Seattle to Tokyo." - reasoning: "Testing combination of scrolling and button clicking to load more flight results on Google Flights" - extraction_schema: - type: "object" - properties: - flights: - type: "array" - items: - type: "object" - properties: - airline: - type: "string" - departure_time: - type: "string" - arrival_time: - type: "string" - duration: - type: "string" - stops: - type: "string" - price: - type: "string" - aircraft: - type: "string" - total_flights_found: - type: "number" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully extracted initial flight results from Google Flights" - - "Used scroll_page tool to scroll down the flight results list" - - "Located and clicked \"Show more flights\" button using action_agent" - - "Loaded additional flight options beyond the initial set" - - "Extracted at least 20 total flights from Seattle (SEA) to Tokyo (NRT)" - - "Each flight includes airline, times, duration, stops, and price" - - "Flights are for the correct dates (March 20-30, 2026)" - - "Results are presented in clear, numbered format" - - "Successfully combined scrolling and clicking actions to load more content" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Google Flights page shows SEA to NRT flights" - - "Check that initial flight results are displayed" - - "Confirm scrolling occurred and \"Show more flights\" button was visible" - - "Ensure additional flights loaded after clicking the button" - - "Verify at least 20 flight options are now visible" - -metadata: - tags: ["web-task", "scrolling", "google-flights", "click-action", "load-more", "travel"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff 
--git a/eval-server/nodejs/evals/web-task-agent/search-001.yaml b/eval-server/nodejs/evals/web-task-agent/search-001.yaml deleted file mode 100644 index da3a4eb..0000000 --- a/eval-server/nodejs/evals/web-task-agent/search-001.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# Basic web task search evaluation (matches DevTools test case) -id: "search-001" -name: "Site-Specific Search Task" -description: "Test web task agent orchestrating a search workflow on a specific site" -enabled: true - -target: - url: "chrome://new-tab-page" - -tool: "web_task_agent" -timeout: 60000 - -input: - task: "Search Google for \"Chrome DevTools automation\" and extract the top 3 search results" - reasoning: "Testing basic site-specific search workflow orchestration" - context: "Need to demonstrate web_task_agent can coordinate multiple action_agent calls for a complete search workflow" - - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4o" - criteria: - - "Successfully returned exactly 3 search results in structured text format" - - "Each result is numbered (1., 2., 3.) and contains a title related to \"Chrome DevTools automation\"" - - "Each result includes a URL in the format \"URL: [link]\"" - - "Results are presented in a clear, readable text format (not JSON)" - - "Response includes a brief summary or conclusion statement" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify search was completed and results page is showing" - - "Check that search results are related to \"Chrome DevTools automation\"" - - "Confirm at least 3 search results are visible on the page" - - "Ensure the search workflow was completed successfully" - -metadata: - tags: ["web-task", "orchestration", "search", "workflow", "google", "basic"] - priority: "normal" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/social-001.yaml b/eval-server/nodejs/evals/web-task-agent/social-001.yaml deleted file mode 100644 index a35ebfd..0000000 --- a/eval-server/nodejs/evals/web-task-agent/social-001.yaml +++ /dev/null @@ -1,60 +0,0 @@ -# Social Media Content Extraction - Web Task Agent -id: "social-001" -name: "Social Media Content Extraction" -description: "Test extracting trending topics and posts from social media" -enabled: true - -target: - url: "https://twitter.com/explore" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Extract the top 5 trending topics from Twitter/X explore page" - reasoning: "User wants to stay updated on current trends" - extraction_schema: - type: "object" - properties: - trends: - type: "array" - items: - type: "object" - properties: - topic: - type: "string" - posts_count: - type: "string" - category: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully accessed Twitter/X explore page and found trending topics" - - "Returned exactly 5 trending topics as requested" - - "Each topic includes the trend name/hashtag" - - "Post counts or metrics are included when available" - - "Topics are current/recent trends (not outdated)" - - "Results are presented in clear, numbered text format (not JSON)" - - "Each trend is properly numbered (1., 2., 3., etc.) 
for readability" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Twitter/X explore page is loaded" - - "Check that trending topics section is visible" - - "Confirm trending topics show names and post counts" - - "Ensure page shows current trending content" - -metadata: - tags: ["web-task", "social-media", "twitter", "trends", "extraction", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-booking-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-booking-001.yaml deleted file mode 100644 index a2842b6..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-booking-001.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Hotel Search Workflow - Web Task Agent -id: "web-task-agent-booking-001" -name: "Hotel Search Workflow" -description: "Test web task agent orchestrating complex multi-step booking search" -enabled: true - -target: - url: "https://www.booking.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for hotels in San Francisco for 2 adults, check-in March 15, check-out March 17" - reasoning: "Customer is looking for travel booking" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully searched for hotels in San Francisco" - - "Results show hotels available for March 15-17 dates" - - "Guest count of 2 adults is reflected in the search results" - - "Returned multiple hotel options with relevant details" - - "Each hotel includes essential information (name, price, location)" - - "Results are presented in a clear, readable format" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify hotel search results are displayed for San Francisco" - - "Check that dates March 15-17 are correctly selected" - - "Confirm guest count shows 2 adults" - - "Ensure search results show hotels with availability for specified dates" - -metadata: - tags: ["web-task", "booking", "workflow", "multi-step", "travel", "complex"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-ecommerce-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-ecommerce-001.yaml deleted file mode 100644 index a6b9735..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-ecommerce-001.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# E-commerce web task evaluation (matches DevTools test case) -id: "web-task-agent-ecommerce-001" -name: "E-commerce Product Search" -description: "Test web task agent handling product search on shopping site" -enabled: true - -target: - url: "https://www.amazon.com" - -tool: "web_task_agent" -timeout: 90000 - -input: - task: "Search Amazon for \"wireless headphones\" and find products under $100" - reasoning: "Testing e-commerce search workflow with price filtering" - context: "User wants to find wireless headphones with specific price constraint" - extraction_schema: - type: "object" - properties: - products: - type: "array" - items: - type: "object" - properties: - name: - type: "string" - price: - type: "string" - rating: - type: "string" - url: - type: "string" - - -validation: - type: "hybrid" - llm_judge: - model: "gpt-4o" - criteria: - - "Successfully navigated to product search" - 
- "Applied appropriate filters correctly" - - "Extracted product details accurately" - - "Provided meaningful comparison of features" - - "Stayed within specified price range" - snapshot: - structure_only: true - exclude_paths: - - "timestamp" - - "sessionId" - -metadata: - tags: ["web-task", "multi-step", "ecommerce", "search"] - priority: "high" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-error-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-error-001.yaml deleted file mode 100644 index cc5c7df..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-error-001.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Error Recovery Workflow - Web Task Agent -id: "web-task-agent-error-001" -name: "Error Recovery Workflow" -description: "Test web task agent handling action_agent failures and retry logic" -enabled: true - -target: - url: "https://www.google.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for \"nonexistent test query 12345\" and handle any issues that arise" - reasoning: "Customer is asking for this response" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Attempted to search for the unusual query \"nonexistent test query 12345\"" - - "Either found some results OR provided clear explanation why no results were found" - - "Response handles the edge case gracefully without errors" - - "If no results found, suggested alternative actions or explanations" - - "Maintained professional tone despite unusual request" - - "Final output is coherent and helpful to the user" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Check if search was attempted despite unusual query" - - "Verify error handling did not break the page interaction" - - "Confirm agent attempted to complete the task or provided clear error info" - - "Ensure page is still functional after error recovery attempts" - -metadata: - tags: ["web-task", "error-recovery", "retry", "orchestration", "robustness"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-extract-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-extract-001.yaml deleted file mode 100644 index 14eadcb..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-extract-001.yaml +++ /dev/null @@ -1,60 +0,0 @@ -# Structured Data Extraction - Web Task Agent -id: "web-task-agent-extract-001" -name: "Structured Data Extraction" -description: "Test web task agent extracting structured data from search results" -enabled: true - -target: - url: "https://news.ycombinator.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Extract the top 5 Hacker News stories with their titles, scores, and comment counts" - reasoning: "User is looking to understand the top stories on Hacker News" - extraction_schema: - type: "object" - properties: - stories: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - score: - type: "number" - comments: - type: "number" - url: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully returned exactly 5 Hacker News stories in structured text format" - - "Each story is numbered (1., 2., 
3., 4., 5.) with title, score, comments, and URL" - - "Results are presented in readable text format similar to the example provided" - - "Response includes all required fields: title, score, comments count, URL" - - "Maintained proper orchestration pattern throughout the extraction process" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Hacker News homepage is loaded and displaying stories" - - "Check that top stories are visible with scores and comment counts" - - "Confirm story titles and metadata are clearly displayed" - - "Ensure page structure allows for data extraction" - -metadata: - tags: ["web-task", "data-extraction", "structured-data", "hackernews", "schema"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-finance-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-finance-001.yaml deleted file mode 100644 index 8f7a2b0..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-finance-001.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# Stock Information Research - Web Task Agent -id: "web-task-agent-finance-001" -name: "Stock Information Research" -description: "Test extracting stock prices and financial information" -enabled: true - -target: - url: "https://finance.yahoo.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for Apple (AAPL) stock information and extract current price, market cap, and recent performance" - reasoning: "Users need automated financial data collection for investment decisions" - extraction_schema: - type: "object" - properties: - stock_info: - type: "object" - properties: - symbol: - type: "string" - company_name: - type: "string" - current_price: - type: "string" - change: - type: "string" - change_percent: - type: "string" - market_cap: - type: "string" - pe_ratio: - type: "string" - volume: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully found Apple (AAPL) stock information" - - "Current stock price is clearly stated" - - "Market cap information is included" - - "Price change and percentage change are provided" - - "Additional metrics (PE ratio, volume) included when available" - - "Financial data is current and presented in readable text format (not JSON)" - - "Stock information is well-organized and easy to understand" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Yahoo Finance shows Apple (AAPL) stock page" - - "Check that current stock price and change are visible" - - "Confirm market cap and trading volume are displayed" - - "Ensure financial metrics and charts are shown" - -metadata: - tags: ["web-task", "finance", "stocks", "yahoo-finance", "investment", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-flight-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-flight-001.yaml deleted file mode 100644 index a17883f..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-flight-001.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Complex Flight Search - Web Task Agent -id: "web-task-agent-flight-001" -name: "Complex Flight Search" -description: "Test web task agent handling complex flight 
search with multiple criteria" -enabled: true - -target: - url: "https://www.kayak.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for round-trip flights from Seattle (SEA) to Tokyo (NRT) departing March 20, returning March 30" - reasoning: "Customer is looking for finding the best flight options" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully found round-trip flights from Seattle (SEA) to Tokyo (NRT)" - - "Flight results show March 20 departure date" - - "Flight results show March 30 return date" - - "Returned multiple flight options with airlines and prices" - - "Each flight includes essential details (times, airlines, prices)" - - "Results clearly distinguish between outbound and return flights" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify flight search results are displayed" - - "Check SEA to NRT route is correctly selected" - - "Confirm dates March 20 departure and March 30 return" - - "Ensure flight options are showing with prices and airlines" - -metadata: - tags: ["web-task", "flight", "travel", "multi-step", "kayak", "round-trip"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-food-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-food-001.yaml deleted file mode 100644 index 32ee646..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-food-001.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# Restaurant Search and Menu Extraction - Web Task Agent -id: "web-task-agent-food-001" -name: "Restaurant Search and Menu Extraction" -description: "Test searching restaurants and extracting menu information" -enabled: true - -target: - url: "https://www.yelp.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for \"Italian restaurants near me\" in San Francisco and extract restaurant details" - reasoning: "Users want to quickly compare restaurants, menus, and reviews" - extraction_schema: - type: "object" - properties: - restaurants: - type: "array" - items: - type: "object" - properties: - name: - type: "string" - rating: - type: "string" - price_range: - type: "string" - cuisine: - type: "string" - address: - type: "string" - phone: - type: "string" - hours: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully found Italian restaurants in San Francisco" - - "Each restaurant includes name, rating, and price range" - - "Location/address information is provided for each restaurant" - - "Contact details (phone/hours) included when available" - - "All restaurants listed serve Italian cuisine" - - "Results are presented in clear, structured text format (not JSON)" - - "Restaurants are numbered or organized clearly for easy comparison" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Yelp search results for Italian restaurants" - - "Check that restaurants show ratings and price ranges" - - "Confirm location filter shows San Francisco results" - - "Ensure restaurant listings include contact information" - -metadata: - tags: ["web-task", "restaurants", "yelp", "food", "local-search", "popular"] - priority: "high" - 
owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-iframe-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-iframe-001.yaml deleted file mode 100644 index 30b0eac..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-iframe-001.yaml +++ /dev/null @@ -1,83 +0,0 @@ -# ANA Airlines Iframe Content Extraction - Web Task Agent -id: "web-task-agent-iframe-001" -name: "ANA Airlines Iframe Content Extraction" -description: "Test web task agent handling iframe-heavy airline booking sites like ANA Airlines" -enabled: true - -target: - url: "https://aswbe.ana.co.jp/webapps/reservation/flight-search?CONNECTION_KIND=SEA&LANG=en&hiddenSearchMode=ROUND_TRIP&departureDate:field=20260320&returnDate:field=20260330&departureAirportCode:field=SEA&arrivalAirportCode:field=NRT&adultCount=1&youngAdultCount=0&childCount=0&infantCount=0&boardingClass=INTY001&searchFlag=1" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Navigate the ANA Airlines flight search page and extract available flight options from Seattle (SEA) to Tokyo Narita (NRT) for March 20-30, 2026. Handle any iframe content and booking interface elements." - reasoning: "Testing iframe content extraction and complex airline booking site navigation" - extraction_schema: - type: "object" - properties: - flights: - type: "array" - items: - type: "object" - properties: - flight_number: - type: "string" - airline: - type: "string" - departure_time: - type: "string" - arrival_time: - type: "string" - departure_date: - type: "string" - arrival_date: - type: "string" - duration: - type: "string" - aircraft: - type: "string" - price: - type: "string" - cabin_class: - type: "string" - stops: - type: "string" - booking_interface_status: - type: "string" - iframe_content_found: - type: "boolean" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully navigated ANA Airlines booking interface" - - "Handled iframe content correctly (iframe_content_found should be true if iframes detected)" - - "Extracted flight information from ANA flight search results" - - "Flight details include ANA flight numbers and accurate route (SEA to NRT)" - - "Extracted pricing information in appropriate currency" - - "Handled any booking interface elements, popups, or navigation flows" - - "Results show flights for the correct dates (March 20-30, 2026)" - - "Successfully demonstrated iframe content extraction capabilities" - - "Booking interface status indicates successful page interaction" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify ANA Airlines flight search page loaded correctly" - - "Check that search parameters show SEA to NRT route" - - "Confirm flight results are displayed (may be in iframes)" - - "Ensure booking interface elements are functional" - - "Verify flight information is accessible and extractable" - -metadata: - tags: ["web-task", "iframe", "ana-airlines", "complex-booking", "international-flight", "airline-specific"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-jobs-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-jobs-001.yaml deleted file mode 100644 index 2c72df3..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-jobs-001.yaml 
+++ /dev/null @@ -1,68 +0,0 @@ -# Job Search Workflow - Web Task Agent -id: "web-task-agent-jobs-001" -name: "Job Search Workflow" -description: "Test web task agent orchestrating job search on LinkedIn" -enabled: true - -target: - url: "https://www.linkedin.com/jobs" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for \"Software Engineer\" jobs in \"San Francisco\" and extract details for the first 5 results" - reasoning: "User wants to find job opportunities in tech industry" - extraction_schema: - type: "object" - properties: - jobs: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - company: - type: "string" - location: - type: "string" - salary: - type: "string" - description: - type: "string" - url: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Either used construct_direct_url for LinkedIn job search OR used traditional form interaction" - - "If using direct URL: constructed proper LinkedIn job search URL with keywords and location" - - "If using forms: delegated keyword and location input to action_agent" - - "Extracted job listings using extract_data" - - "Returned structured job data in readable text format (not JSON)" - - "Each job listing includes title, company, location, and other relevant fields" - - "Results are numbered or organized clearly for easy reading" - - "Demonstrated proper workflow orchestration for job search" - - "Never used direct browser interaction tools" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify LinkedIn job search results are displayed" - - "Check that search shows Software Engineer jobs in San Francisco" - - "Confirm job listings include company names and titles" - - "Ensure at least 5 job results are visible" - -metadata: - tags: ["web-task", "jobs", "linkedin", "search", "career", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-learning-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-learning-001.yaml deleted file mode 100644 index 8dcdc7d..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-learning-001.yaml +++ /dev/null @@ -1,69 +0,0 @@ -# Online Course Search - Web Task Agent -id: "web-task-agent-learning-001" -name: "Online Course Search" -description: "Test searching and extracting course information from learning platforms" -enabled: true - -target: - url: "https://www.coursera.org" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for \"Machine Learning\" courses and extract details for top 5 results" - reasoning: "Users want to compare courses across platforms for learning decisions" - extraction_schema: - type: "object" - properties: - courses: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - instructor: - type: "string" - university: - type: "string" - rating: - type: "string" - duration: - type: "string" - price: - type: "string" - description: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully found Machine Learning courses on Coursera" - - "Returned details for top 5 courses as requested" - - "Each course includes title, instructor, university, and 
rating" - - "Duration and pricing information included for each course" - - "Course descriptions or key topics are provided" - - "Results are presented in structured text format (not JSON)" - - "Courses are numbered (1-5) and well-organized for easy comparison" - - "Each course entry is clearly formatted and readable" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Coursera search results for Machine Learning" - - "Check that courses show titles, instructors, and ratings" - - "Confirm course details include duration and pricing" - - "Ensure search results are relevant to Machine Learning" - -metadata: - tags: ["web-task", "education", "coursera", "courses", "learning", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-nav-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-nav-001.yaml deleted file mode 100644 index fdee2f4..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-nav-001.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Site Navigation Workflow - Web Task Agent -id: "web-task-agent-nav-001" -name: "Site Navigation Workflow" -description: "Test web task agent orchestrating navigation between different sections of a site" -enabled: true - -target: - url: "https://www.wikipedia.org" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 90000 - -input: - task: "Navigate to the Wikipedia homepage, search for \"artificial intelligence\", and find information about machine learning" - reasoning: "User is looking to explore Wikipedia content through structured navigation" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Orchestrated Wikipedia search via action_agent calls" - - "Navigated to artificial intelligence article through action_agent" - - "Located machine learning section via action_agent coordination" - - "Extracted relevant information about machine learning" - - "Demonstrated multi-step navigation workflow" - - "Maintained orchestration pattern throughout navigation" - - "Provided structured summary of found information" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify navigation reached artificial intelligence Wikipedia page" - - "Check that machine learning section or content is visible" - - "Confirm successful navigation through multiple page sections" - - "Ensure content related to machine learning is displayed" - -metadata: - tags: ["web-task", "navigation", "multi-step", "wikipedia", "content-exploration"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-news-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-news-001.yaml deleted file mode 100644 index d9e1934..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-news-001.yaml +++ /dev/null @@ -1,64 +0,0 @@ -# News Article Aggregation - Web Task Agent -id: "web-task-agent-news-001" -name: "News Article Aggregation" -description: "Test aggregating news headlines and summaries from news sites" -enabled: true - -target: - url: "https://news.ycombinator.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Extract the top 10 Hacker News stories with titles, scores, 
and first few comments" - reasoning: "Users want automated news monitoring for research and awareness" - extraction_schema: - type: "object" - properties: - articles: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - score: - type: "number" - comments_count: - type: "number" - url: - type: "string" - top_comment: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully extracted 10 Hacker News stories as requested" - - "Each story includes title, score, and comment count" - - "URLs are provided for each story" - - "Stories appear to be from the current top/front page" - - "Results are presented in clear, numbered text format (1-10), not JSON" - - "All required fields are present and properly formatted in readable text" - - "Each story is clearly separated and easy to read" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Hacker News stories are visible with scores" - - "Check that story titles and comment counts are shown" - - "Confirm top stories section is properly displayed" - - "Ensure story metadata is accessible for extraction" - -metadata: - tags: ["web-task", "news", "hackernews", "aggregation", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-realestate-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-realestate-001.yaml deleted file mode 100644 index f22bc13..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-realestate-001.yaml +++ /dev/null @@ -1,70 +0,0 @@ -# Real Estate Property Search - Web Task Agent -id: "web-task-agent-realestate-001" -name: "Real Estate Property Search" -description: "Test property search workflow on real estate platforms" -enabled: true - -target: - url: "https://www.zillow.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Search for houses for sale in Austin, Texas under $500k and extract property details" - reasoning: "User wants to find affordable housing options in a specific location" - extraction_schema: - type: "object" - properties: - properties: - type: "array" - items: - type: "object" - properties: - address: - type: "string" - price: - type: "string" - bedrooms: - type: "number" - bathrooms: - type: "number" - sqft: - type: "string" - lot_size: - type: "string" - year_built: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Orchestrated location search via action_agent" - - "Delegated price filter setting to action_agent" - - "Coordinated property type selection through action_agent" - - "Applied search filters through proper action_agent calls" - - "Extracted property listings with extract_data" - - "Returned structured property data in readable text format (not JSON)" - - "Each property includes address, price, bedrooms, bathrooms, and other key details" - - "Properties are clearly numbered or organized for easy comparison" - - "Demonstrated complex real estate search workflow orchestration" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Zillow search results for Austin, Texas properties" - - "Check that properties shown are under $500k" - - "Confirm property listings show price, beds, 
baths info" - - "Ensure search results match the specified criteria" - -metadata: - tags: ["web-task", "real-estate", "zillow", "property-search", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-001.yaml deleted file mode 100644 index 6fd0f6e..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-001.yaml +++ /dev/null @@ -1,61 +0,0 @@ -# Infinite Scroll Content Loading - Web Task Agent -id: "web-task-agent-scroll-001" -name: "Infinite Scroll Content Loading" -description: "Test web task agent handling infinite scroll pages to load more content" -enabled: true - -target: - url: "https://twitter.com" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Scroll down the Twitter feed to load at least 20 tweets and extract their content" - reasoning: "Testing infinite scroll functionality for dynamic content loading" - extraction_schema: - type: "object" - properties: - tweets: - type: "array" - items: - type: "object" - properties: - author: - type: "string" - content: - type: "string" - likes: - type: "string" - retweets: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully used scroll_page tool to scroll down the page" - - "Loaded additional content through scrolling actions" - - "Extracted at least 20 tweets from the feed" - - "Each tweet includes author and content information" - - "Demonstrated proper handling of dynamically loaded content" - - "Results are presented in clear, numbered text format" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify initial Twitter feed is loaded" - - "Check that scrolling action loaded additional tweets" - - "Confirm at least 20 tweets are visible after scrolling" - - "Ensure page scrolled down significantly from initial position" - -metadata: - tags: ["web-task", "scrolling", "infinite-scroll", "dynamic-content", "twitter"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-002.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-002.yaml deleted file mode 100644 index d5d060a..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-002.yaml +++ /dev/null @@ -1,65 +0,0 @@ -# Product Review Scrolling - Web Task Agent -id: "web-task-agent-scroll-002" -name: "Product Review Scrolling" -description: "Test scrolling to load more product reviews on e-commerce sites" -enabled: true - -target: - url: "https://www.amazon.com/dp/B08N5WRWNW" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Scroll down to the reviews section and load more reviews by scrolling, then extract review details" - reasoning: "Users need to see multiple reviews beyond initial visible ones" - extraction_schema: - type: "object" - properties: - reviews: - type: "array" - items: - type: "object" - properties: - rating: - type: "string" - title: - type: "string" - author: - type: "string" - date: - type: "string" - verified: - type: "boolean" - content: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Used 
scroll_page tool to navigate to reviews section" - - "Scrolled within reviews area to load additional reviews" - - "Extracted multiple product reviews with ratings" - - "Each review includes rating, author, and content" - - "Successfully handled lazy-loaded review content" - - "Presented reviews in structured, readable format" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Amazon product page is loaded" - - "Check that page scrolled to reviews section" - - "Confirm additional reviews loaded after scrolling" - - "Ensure review content is fully visible" - -metadata: - tags: ["web-task", "scrolling", "reviews", "amazon", "e-commerce"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-003.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-003.yaml deleted file mode 100644 index f435017..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-003.yaml +++ /dev/null @@ -1,61 +0,0 @@ -# News Article Progressive Loading - Web Task Agent -id: "web-task-agent-scroll-003" -name: "News Article Progressive Loading" -description: "Test scrolling through news sites that load articles progressively" -enabled: true - -target: - url: "https://medium.com/topic/technology" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Scroll down to load more technology articles and extract titles and authors for at least 15 articles" - reasoning: "Testing progressive content loading on news/blog platforms" - extraction_schema: - type: "object" - properties: - articles: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - author: - type: "string" - reading_time: - type: "string" - preview: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Used scroll_page tool multiple times to load content" - - "Successfully loaded at least 15 articles through scrolling" - - "Extracted article titles and author information" - - "Handled Medium's progressive loading mechanism" - - "Articles are from technology topic as requested" - - "Results presented in clear, numbered format" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Medium technology page is loaded" - - "Check that initial articles are visible" - - "Confirm scrolling loaded additional articles" - - "Ensure at least 15 articles are visible after scrolling" - -metadata: - tags: ["web-task", "scrolling", "progressive-loading", "medium", "articles"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-004.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-004.yaml deleted file mode 100644 index 5970947..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-004.yaml +++ /dev/null @@ -1,61 +0,0 @@ -# Search Results Infinite Scroll - Web Task Agent -id: "web-task-agent-scroll-004" -name: "Search Results Infinite Scroll" -description: "Test handling search results that use infinite scroll instead of pagination" -enabled: true - -target: - url: "https://www.pinterest.com/search/pins/?q=web%20design" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - 
-input: - task: "Search for \"web design\" pins and scroll to load at least 30 results, then extract pin details" - reasoning: "Testing infinite scroll on visual search platforms" - extraction_schema: - type: "object" - properties: - pins: - type: "array" - items: - type: "object" - properties: - title: - type: "string" - description: - type: "string" - saves: - type: "string" - source: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully performed search for \"web design\" pins" - - "Used scroll_page tool to trigger infinite scroll loading" - - "Loaded at least 30 pins through scrolling actions" - - "Extracted pin titles and metadata" - - "Handled Pinterest's masonry layout and lazy loading" - - "Results are well-organized and readable" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Pinterest search results for web design" - - "Check initial pins are displayed" - - "Confirm scrolling loaded many more pins" - - "Ensure grid layout shows 30+ pins after scrolling" - -metadata: - tags: ["web-task", "scrolling", "infinite-scroll", "pinterest", "visual-search"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-005.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-005.yaml deleted file mode 100644 index e603ff7..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-scroll-005.yaml +++ /dev/null @@ -1,73 +0,0 @@ -# Google Flights Scroll and Show More - Web Task Agent -id: "web-task-agent-scroll-005" -name: "Google Flights Scroll and Show More" -description: "Test scrolling and clicking \"Show more flights\" button on Google Flights to load additional flight options" -enabled: true - -target: - url: "https://www.google.com/travel/flights?sca_esv=646eedf97dcc8cf2&source=flun&uitype=cuAA&hl=en&gl=us&curr=USD&tfs=CAEQAhoeEgoyMDI2LTAzLTIwagcIARIDU0VBcgcIARIDTlJUGh4SCjIwMjYtMDMtMzBqBwgBEgNOUlRyBwgBEgNTRUF6aENqUklhVFJJTVVwVlZVOXpNakJCUTJodGVFRkNSeTB0TFMwdExTMHRjR3BpYjI4eE0wRkJRVUZCUjJoc1lsWlZRV2RYUlZsQkVnTmpTMFVhQ3dqUXNnVVFBaG9EVlZORU9EQncwTElG" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Extract the initial flight results, then scroll down and click \"Show more flights\" button to load additional flights. Extract at least 20 total flight options from Seattle to Tokyo." 
- reasoning: "Testing combination of scrolling and button clicking to load more flight results on Google Flights" - extraction_schema: - type: "object" - properties: - flights: - type: "array" - items: - type: "object" - properties: - airline: - type: "string" - departure_time: - type: "string" - arrival_time: - type: "string" - duration: - type: "string" - stops: - type: "string" - price: - type: "string" - aircraft: - type: "string" - total_flights_found: - type: "number" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully extracted initial flight results from Google Flights" - - "Used scroll_page tool to scroll down the flight results list" - - "Located and clicked \"Show more flights\" button using action_agent" - - "Loaded additional flight options beyond the initial set" - - "Extracted at least 20 total flights from Seattle (SEA) to Tokyo (NRT)" - - "Each flight includes airline, times, duration, stops, and price" - - "Flights are for the correct dates (March 20-30, 2026)" - - "Results are presented in clear, numbered format" - - "Successfully combined scrolling and clicking actions to load more content" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Google Flights page shows SEA to NRT flights" - - "Check that initial flight results are displayed" - - "Confirm scrolling occurred and \"Show more flights\" button was visible" - - "Ensure additional flights loaded after clicking the button" - - "Verify at least 20 flight options are now visible" - -metadata: - tags: ["web-task", "scrolling", "google-flights", "click-action", "load-more", "travel"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-search-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-search-001.yaml deleted file mode 100644 index 50dc920..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-search-001.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# Basic web task search evaluation (matches DevTools test case) -id: "web-task-agent-search-001" -name: "Site-Specific Search Task" -description: "Test web task agent orchestrating a search workflow on a specific site" -enabled: true - -target: - url: "chrome://new-tab-page" - -tool: "web_task_agent" -timeout: 60000 - -input: - task: "Search Google for \"Chrome DevTools automation\" and extract the top 3 search results" - reasoning: "Testing basic site-specific search workflow orchestration" - context: "Need to demonstrate web_task_agent can coordinate multiple action_agent calls for a complete search workflow" - - -validation: - type: "llm-judge" - llm_judge: - model: "gpt-4o" - criteria: - - "Successfully returned exactly 3 search results in structured text format" - - "Each result is numbered (1., 2., 3.) 
and contains a title related to \"Chrome DevTools automation\"" - - "Each result includes a URL in the format \"URL: [link]\"" - - "Results are presented in a clear, readable text format (not JSON)" - - "Response includes a brief summary or conclusion statement" - visual_verification: - enabled: true - capture_before: true - capture_after: true - prompts: - - "Verify search was completed and results page is showing" - - "Check that search results are related to \"Chrome DevTools automation\"" - - "Confirm at least 3 search results are visible on the page" - - "Ensure the search workflow was completed successfully" - -metadata: - tags: ["web-task", "orchestration", "search", "workflow", "google", "basic"] - priority: "normal" \ No newline at end of file diff --git a/eval-server/nodejs/evals/web-task-agent/web-task-agent-social-001.yaml b/eval-server/nodejs/evals/web-task-agent/web-task-agent-social-001.yaml deleted file mode 100644 index f1f969e..0000000 --- a/eval-server/nodejs/evals/web-task-agent/web-task-agent-social-001.yaml +++ /dev/null @@ -1,60 +0,0 @@ -# Social Media Content Extraction - Web Task Agent -id: "web-task-agent-social-001" -name: "Social Media Content Extraction" -description: "Test extracting trending topics and posts from social media" -enabled: true - -target: - url: "https://twitter.com/explore" - wait_for: "networkidle" - wait_timeout: 5000 - -tool: "web_task_agent" -timeout: 180000 - -input: - task: "Extract the top 5 trending topics from Twitter/X explore page" - reasoning: "User wants to stay updated on current trends" - extraction_schema: - type: "object" - properties: - trends: - type: "array" - items: - type: "object" - properties: - topic: - type: "string" - posts_count: - type: "string" - category: - type: "string" - - -validation: - type: "llm_judge" - llm_judge: - model: "gpt-4o-mini" - temperature: 0.3 - criteria: - - "Successfully accessed Twitter/X explore page and found trending topics" - - "Returned exactly 5 trending topics as requested" - - "Each topic includes the trend name/hashtag" - - "Post counts or metrics are included when available" - - "Topics are current/recent trends (not outdated)" - - "Results are presented in clear, numbered text format (not JSON)" - - "Each trend is properly numbered (1., 2., 3., etc.) 
for readability" - visual_verification: - enabled: true - capture_before_action: true - capture_after_action: true - verification_prompts: - - "Verify Twitter/X explore page is loaded" - - "Check that trending topics section is visible" - - "Confirm trending topics show names and post counts" - - "Ensure page shows current trending content" - -metadata: - tags: ["web-task", "social-media", "twitter", "trends", "extraction", "popular"] - priority: "high" - owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/nodejs/start.js b/eval-server/nodejs/start.js new file mode 100644 index 0000000..a3d45bf --- /dev/null +++ b/eval-server/nodejs/start.js @@ -0,0 +1,39 @@ +import { EvalServer } from "./src/lib/EvalServer.js"; +import { HTTPWrapper } from "./src/lib/HTTPWrapper.js"; + +const WS_PORT = parseInt(process.env.PORT || "8082"); +const HTTP_PORT = parseInt(process.env.API_PORT || "8081"); +const HOST = process.env.HOST || "0.0.0.0"; + +console.log("๐Ÿ”ง Creating EvalServer..."); +const evalServer = new EvalServer({ + host: HOST, + port: WS_PORT +}); + +console.log("๐Ÿ”ง Creating HTTP wrapper..."); +const httpWrapper = new HTTPWrapper(evalServer, { + port: HTTP_PORT, + host: HOST +}); + +console.log("๐Ÿ”ง Starting EvalServer..."); +await evalServer.start(); +console.log(`โœ… EvalServer started on ws://${HOST}:${WS_PORT}`); + +console.log("๐Ÿ”ง Starting HTTP wrapper..."); +await httpWrapper.start(); +console.log(`โœ… HTTP API started on http://${HOST}:${HTTP_PORT}`); + +console.log("โณ Server ready for connections..."); + +// Keep process alive +process.on('SIGTERM', () => { + console.log('Received SIGTERM, shutting down gracefully...'); + process.exit(0); +}); + +process.on('SIGINT', () => { + console.log('Received SIGINT, shutting down gracefully...'); + process.exit(0); +}); diff --git a/eval-server/python/README.md b/eval-server/python/README.md deleted file mode 100644 index f167b48..0000000 --- a/eval-server/python/README.md +++ /dev/null @@ -1,368 +0,0 @@ -# bo-eval-server (Python) - -A minimal Python library for creating WebSocket-based evaluation servers for LLM agents. 
- -## Features - -- ๐Ÿ”Œ **WebSocket Server**: Real-time agent connections with asyncio -- ๐Ÿค– **Bidirectional RPC**: JSON-RPC 2.0 for calling methods on connected agents -- ๐Ÿ“š **Programmatic API**: Create and manage evaluations in Python code -- ๐Ÿ“Š **Evaluation Stack**: LIFO stack for managing evaluation queues -- โšก **Concurrent Support**: Full async/await support for multiple agents -- ๐Ÿ” **Enhanced Logging**: Structured logging with loguru -- โœจ **Minimal Dependencies**: Only websockets and loguru required - -## Quick Start - -### Basic WebSocket Server - -```python -import asyncio -from bo_eval_server import EvalServer - -async def main(): - server = EvalServer( - auth_key='hello', - host='127.0.0.1', - port=8080 - ) - - @server.on_connect - async def handle_client(client): - print(f'Client connected: {client.id}') - - response = await client.evaluate({ - "id": "test_eval", - "name": "Capital of France", - "tool": "chat", - "input": {"message": "What is the capital of France?"} - }) - - print(f'Response: {response}') - - await server.start() - print('Server running on ws://127.0.0.1:8080') - - # Keep server running - await server.wait_closed() - -if __name__ == "__main__": - asyncio.run(main()) -``` - -### Using Evaluation Stack - -```python -import asyncio -from bo_eval_server import EvalServer, EvaluationStack - -async def main(): - server = EvalServer(auth_key='secret', port=8080) - stack = EvaluationStack() - - # Add evaluations to stack - stack.push({ - "id": "eval_001", - "name": "Math Question", - "tool": "chat", - "input": {"message": "What is 2 + 2?"} - }) - - stack.push({ - "id": "eval_002", - "name": "Science Question", - "tool": "chat", - "input": {"message": "What is the speed of light?"} - }) - - @server.on_connect - async def handle_client(client): - print(f'Client connected: {client.id}') - - # Process evaluations from stack - while not stack.is_empty(): - evaluation = stack.pop() - try: - result = await client.evaluate(evaluation) - print(f'โœ… {evaluation["name"]}: {result["status"]}') - except Exception as e: - print(f'โŒ {evaluation["name"]}: {e}') - - await server.start() - await server.wait_closed() - -if __name__ == "__main__": - asyncio.run(main()) -``` - -## Installation - -### Using uv (Recommended) - -```bash -# Install uv package manager (if not already installed) -curl -LsSf https://astral.sh/uv/install.sh | sh - -# Install dependencies and create virtual environment -uv sync - -# Run examples using the convenient runner -python run.py basic # Basic server example -python run.py stack # Evaluation stack example -python run.py prog # Programmatic evaluations example -python run.py all # Show all available examples - -# Or run examples directly with uv -uv run python examples/basic_server.py -uv run python examples/with_stack.py -uv run python examples/programmatic_evals.py -``` - -### Using pip (Alternative) - -```bash -# Install the package -pip install -e . 
- -# Or install with development dependencies -pip install -e ".[dev]" - -# Or install from requirements.txt -pip install -r requirements.txt -``` - -## Library Usage - -### EvalServer API - -```python -from bo_eval_server import EvalServer - -# Create server instance -server = EvalServer( - auth_key='your-secret-key', # Required: client authentication - host='127.0.0.1', # Optional: default 'localhost' - port=8080, # Optional: default 8080 -) - -# Register event handlers -@server.on_connect -async def handle_connect(client): - # Called when client connects and is ready - pass - -@server.on_disconnect -async def handle_disconnect(client_info): - # Called when client disconnects - pass - -# Server lifecycle -await server.start() # Start the server -await server.stop() # Stop the server -await server.wait_closed() # Wait for server to close - -# Server status -status = server.get_status() -print(f"Server running: {status['running']}") -``` - -### Client Proxy API - -```python -@server.on_connect -async def handle_client(client): - # Client information - print(f'Client ID: {client.id}') - print(f'Tab ID: {client.tab_id}') - print(f'Base Client ID: {client.base_client_id}') - - # Execute evaluations - result = await client.evaluate({ - "id": "eval_001", - "name": "Test Evaluation", - "description": "Optional description", - "tool": "chat", - "input": {"message": "Your question here"}, - "timeout": 30.0, # Optional timeout in seconds - "metadata": {"tags": ["api", "test"]} - }) - - # Send custom messages - await client.send_message({ - "type": "custom", - "data": "Hello client!" - }) -``` - -### EvaluationStack API - -```python -from bo_eval_server import EvaluationStack - -stack = EvaluationStack() - -# Add evaluations (LIFO - Last In, First Out) -stack.push({ - "id": "eval_001", - "name": "Test", - "tool": "chat", - "input": {"message": "Hello"} -}) - -# Remove and get evaluation -evaluation = stack.pop() # Returns dict or None if empty - -# Stack operations -size = stack.size() # Get number of evaluations -is_empty = stack.is_empty() # Check if empty -top = stack.peek() # View top without removing -stack.clear() # Remove all evaluations -all_evals = stack.to_array() # Get copy as list -``` - -## Agent Protocol - -Your agent needs to implement the WebSocket protocol: - -### 1. Connect to WebSocket -```python -import websockets -import json - -ws = await websockets.connect('ws://localhost:8080') -``` - -### 2. Receive Authentication Challenge -The server sends an authentication challenge with the secret key: -```python -challenge = json.loads(await ws.recv()) -# Expected: {"type": "auth_challenge", "secretKey": "hello", "connectionId": "uuid"} -``` - -### 3. Send Registration Response -Client validates the secret key and responds: -```python -await ws.send(json.dumps({ - "type": "register", - "clientId": "your-client-id", - "acceptAuth": True, # True if secret key is acceptable - "connectionId": challenge["connectionId"], - "capabilities": ["chat", "action"] -})) -``` - -### 4. Receive Registration Confirmation -```python -confirmation = json.loads(await ws.recv()) -# Expected: {"type": "registered", "clientId": "your-client-id", "serverTime": 123456} -``` - -### 5. Send Ready Signal -```python -await ws.send(json.dumps({"type": "ready"})) -``` - -### 6. 
Handle RPC Calls -```python -async for message in ws: - data = json.loads(message) - - if data.get("jsonrpc") == "2.0" and data.get("method") == "evaluate": - # Handle evaluation request - result = await handle_evaluation(data["params"]) - - # Send response - await ws.send(json.dumps({ - "jsonrpc": "2.0", - "id": data["id"], - "result": result - })) -``` - -## Architecture - -``` -src/bo_eval_server/ -โ”œโ”€โ”€ __init__.py # Package exports -โ”œโ”€โ”€ eval_server.py # Main EvalServer class -โ”œโ”€โ”€ evaluation_stack.py # EvaluationStack implementation -โ”œโ”€โ”€ client_manager.py # Client connection management -โ”œโ”€โ”€ rpc_client.py # JSON-RPC client implementation -โ”œโ”€โ”€ config.py # Configuration management -โ””โ”€โ”€ logger.py # Enhanced logging setup -``` - -## Design Principles - -- **Async-First**: Built on asyncio for high concurrency -- **Minimal Dependencies**: Only essential packages required -- **Type Hints**: Full typing support for better development experience -- **Event-Driven**: React to client connections with decorators -- **Programmatic**: Full control through Python code -- **Clean API**: Simple, Pythonic interface - -## Examples - -See the `examples/` directory for complete working examples: - -- `basic_server.py` - Simple WebSocket server setup -- `with_stack.py` - Using evaluation stack for queuing -- `programmatic_evals.py` - Creating evaluations in code - -## Evaluation Scripts - -The `evals/` directory contains ready-to-use evaluation scripts for various benchmarks: - -- `browsecomp_eval_server.py` - Browsecomp benchmark server (1,266 web browsing questions) - - Run with: `./evals/run_browsecomp_eval_server.sh` - - See `evals/README.md` for detailed usage - -## Development - -### Using uv - -```bash -# Install with development dependencies -uv sync --dev - -# Run tests -uv run pytest - -# Format code -uv run black src/ examples/ - -# Type checking -uv run mypy src/ - -# Run all development commands -uv run pytest && uv run black src/ examples/ && uv run mypy src/ -``` - -### Using pip - -```bash -# Install in development mode -pip install -e ".[dev]" - -# Run tests -pytest - -# Format code -black src/ examples/ - -# Type checking -mypy src/ -``` - -## Environment Variables - -```bash -# Optional configuration -BO_EVAL_SERVER_HOST=localhost -BO_EVAL_SERVER_PORT=8080 -BO_EVAL_SERVER_LOG_LEVEL=INFO -``` - ---- - -This Python implementation provides the core WebSocket evaluation server functionality with a clean, async API for programmatic evaluation management. \ No newline at end of file diff --git a/eval-server/python/UV_COMMANDS.md b/eval-server/python/UV_COMMANDS.md deleted file mode 100644 index ea79fcb..0000000 --- a/eval-server/python/UV_COMMANDS.md +++ /dev/null @@ -1,188 +0,0 @@ -# UV Commands Reference - -Quick reference for using uv with bo-eval-server Python implementation. 
- -## Installation & Setup - -```bash -# Install uv (if not already installed) -curl -LsSf https://astral.sh/uv/install.sh | sh - -# Install project dependencies -uv sync - -# Install with development dependencies -uv sync --dev -``` - -## Running Examples - -### Using the convenience runner (Recommended) - -```bash -# Basic WebSocket server -python run.py basic - -# Evaluation stack example -python run.py stack - -# Programmatic evaluations with analytics -python run.py prog - -# Show all available examples -python run.py all -``` - -### Direct uv execution - -```bash -# Run examples directly -uv run python examples/basic_server.py -uv run python examples/with_stack.py -uv run python examples/programmatic_evals.py - -# Run with custom arguments or environment variables -uv run --env BO_EVAL_SERVER_PORT=8081 python examples/basic_server.py -``` - -## Development Commands - -```bash -# Run tests -uv run pytest - -# Run tests with coverage -uv run pytest --cov=src/bo_eval_server - -# Format code -uv run black . -uv run black src/ examples/ - -# Type checking -uv run mypy src/ - -# Run all checks -uv run pytest && uv run black . && uv run mypy src/ -``` - -## Package Management - -```bash -# Add new dependencies -uv add requests -uv add --dev pytest-cov - -# Remove dependencies -uv remove requests - -# Update dependencies -uv sync --upgrade - -# Show installed packages -uv tree - -# Show project info -uv show -``` - -## Virtual Environment - -```bash -# Activate virtual environment -source .venv/bin/activate # Unix/macOS -# or -.venv\Scripts\activate # Windows - -# Check Python version in venv -uv run python --version - -# Run any command in the virtual environment -uv run -``` - -## Project Scripts - -The project includes entry point scripts defined in `pyproject.toml`: - -```bash -# After installation, these commands become available: -bo-eval-basic # Run basic server example -bo-eval-stack # Run evaluation stack example -bo-eval-programmatic # Run programmatic evaluations example -``` - -## Useful UV Options - -```bash -# Run with specific Python version -uv run --python 3.11 python examples/basic_server.py - -# Run with environment variables -uv run --env DEBUG=1 python examples/basic_server.py - -# Run in isolated environment (no local packages) -uv run --isolated python examples/basic_server.py - -# Show verbose output -uv sync --verbose - -# Force reinstall -uv sync --reinstall -``` - -## Integration with IDEs - -For VS Code and other IDEs, point to the uv-created virtual environment: - -```bash -# Show virtual environment path -echo $PWD/.venv/bin/python - -# Or use uv to find it -uv run which python -``` - -Then configure your IDE to use this Python interpreter for the project. 
- -## Common Workflows - -### Quick Start Development - -```bash -git clone -cd eval-server/python -uv sync --dev -python run.py basic -``` - -### Running Tests in CI - -```bash -uv sync --dev --frozen -uv run pytest --cov=src/bo_eval_server --cov-report=xml -``` - -### Building and Publishing - -```bash -uv build -uv publish # If publishing to PyPI -``` - -## Troubleshooting - -```bash -# Clear uv cache -uv cache clean - -# Reinstall everything -rm -rf .venv uv.lock -uv sync - -# Check uv version -uv --version - -# Get help -uv --help -uv run --help -``` \ No newline at end of file diff --git a/eval-server/python/evals/README.md b/eval-server/python/evals/README.md deleted file mode 100644 index 6d3b082..0000000 --- a/eval-server/python/evals/README.md +++ /dev/null @@ -1,195 +0,0 @@ -# Python Evaluation Scripts - -This directory contains evaluation scripts for running various benchmark datasets using the Python eval-server. - -## Available Scripts - -### Browsecomp Evaluation Server - -**Script**: `browsecomp_eval_server.py` -**Wrapper**: `run_browsecomp_eval_server.sh` - -The browsecomp eval server loads questions from the [Browsecomp benchmark](https://github.com/openai/simple-evals) and distributes them to connected BrowserOperator clients via WebSocket connections. - -#### Features - -- Loads and decrypts 1,266 browsecomp questions automatically -- Distributes exactly one question per client connection -- Stack-based LIFO distribution -- **Automatic scoring**: Compares responses against true answers -- **Structured response parsing**: Handles BrowserOperator's message format -- **Comprehensive logging**: Structured logs saved to timestamped files -- Real-time progress tracking with accuracy metrics -- Confidence score extraction and analysis -- Results saved to JSON file for later analysis -- Configurable timeout (default: 60 minutes) -- Configurable server settings - -#### Usage - -```bash -# Use the wrapper script for proper dependencies -./run_browsecomp_eval_server.sh --help - -# List available questions -./run_browsecomp_eval_server.sh --list --list-limit 10 - -# Start server with first 5 questions -./run_browsecomp_eval_server.sh --limit 5 - -# Start server with specific questions -./run_browsecomp_eval_server.sh --questions 1 5 10 25 - -# Start server with a range of questions (questions 10-15) -./run_browsecomp_eval_server.sh --start 10 --end 15 - -# Start server from question 100 to the end -./run_browsecomp_eval_server.sh --start 100 - -# Start server with questions 1-50 -./run_browsecomp_eval_server.sh --end 50 - -# Start server with all 1,266 questions -./run_browsecomp_eval_server.sh - -# Custom configuration -./run_browsecomp_eval_server.sh --limit 20 --port 8081 --auth-key my-key - -# Save results to JSON file -./run_browsecomp_eval_server.sh --limit 10 --save-results -``` - -#### How It Works - -1. **Load Questions**: The server loads browsecomp questions from the dataset -2. **Stack Distribution**: Questions are placed in a LIFO stack -3. **Client Connection**: When a BrowserOperator connects, it receives one question -4. **Processing**: The client processes the question and returns results -5. **Automatic Scoring**: Server compares responses against true answers -6. **Tracking**: Server tracks completion, accuracy, and confidence statistics -7. 
**Results**: Optionally saves detailed results to JSON file - -#### Example Workflow - -```bash -# Terminal 1: Start the eval server -cd /path/to/eval-server/python/evals -./run_browsecomp_eval_server.sh --limit 10 --save-results - -# Terminal 2+: Connect BrowserOperator clients -# Each client will automatically receive and process one question -``` - -#### Scoring Output - -When evaluations complete, you'll see automatic scoring results: - -``` -โœ… Evaluation completed! -๐Ÿ“Š Response structure: 12 messages, 3 tool calls, gpt-4 model, 45230ms - -๐ŸŽฏ Scoring Results: - - True Answer: 1988-96 - - Extracted Answer: 1988-96 - - Correct: โœ… YES - - Confidence: 85% - -๐Ÿ“Š Current Statistics: - โœ… Completed: 5/10 - โŒ Failed: 0/10 - ๐Ÿ“š Remaining: 5/10 - -๐ŸŽฏ Scoring Statistics: - ๐Ÿ“Š Accuracy: 80.0% (4/5 correct) - ๐Ÿ’ก Average Confidence: 78.5% -``` - -#### Results JSON Format - -When using `--save-results`, evaluations are saved to `browsecomp_eval_results_[timestamp].json`: - -```json -{ - "timestamp": "20240115_143022", - "total_questions": 10, - "completed": 10, - "failed": 0, - "accuracy": 80.0, - "average_confidence": 78.5, - "evaluations": [ - { - "client_id": "abc123...", - "question_id": 1, - "result": "Explanation: ... Exact Answer: 1988-96 Confidence Score: 85%", - "scoring": { - "is_correct": true, - "true_answer": "1988-96", - "extracted_answer": "1988-96", - "confidence": 85 - } - } - ] -} -``` - -#### Logging - -The server creates comprehensive logs in the `./logs/` directory: - -- **Console Output**: Real-time progress with emojis and summaries -- **Structured Logs**: Timestamped log file `browsecomp_eval_server_YYYYMMDD_HHMMSS.log` - -**Structured Log Events**: -``` -EVENT: {"timestamp": "2024-01-15T14:30:22", "event_type": "client_connected", "client_id": "abc123", "stack_remaining": 10} -EVENT: {"timestamp": "2024-01-15T14:30:25", "event_type": "evaluation_assigned", "evaluation_id": "browsecomp_q1", "question_id": 1} -EVENT: {"timestamp": "2024-01-15T14:32:10", "event_type": "evaluation_completed", "is_correct": true, "confidence": 85, "model_used": "gpt-4"} -EVENT: {"timestamp": "2024-01-15T14:35:00", "event_type": "session_completed", "accuracy": 80.0, "total_questions": 10} -``` - -**Log Files Location**: -- `./logs/browsecomp_eval_server_YYYYMMDD_HHMMSS.log` - Main server log -- `./logs/` - Directory also used by eval-server's internal logging - -## Dependencies - -The evaluation scripts require additional dependencies beyond the base eval-server: -- `pandas` - For dataset loading and manipulation -- `requests` - For downloading datasets - -These are automatically installed when you run `uv sync` in the eval-server/python directory. - -## Adding New Evaluation Scripts - -To add a new evaluation script: - -1. Create your script in this directory -2. Import the eval-server modules: - ```python - import sys - from pathlib import Path - sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - from bo_eval_server import EvalServer, EvaluationStack - ``` - -3. Create a wrapper script for easy execution: - ```bash - #!/bin/bash - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - cd "$SCRIPT_DIR/.." - uv run python evals/your_script.py "$@" - ``` - -4. 
Make the wrapper executable: `chmod +x your_wrapper.sh` - -## Dataset Files - -- `browsecomp_dataset.py` - Dataset loader for browsecomp questions with automatic decryption support -- `browsecomp_scorer.py` - Scoring logic that extracts answers and compares against ground truth - -## Notes - -- Always use the wrapper scripts (`.sh` files) to ensure proper dependencies are loaded -- The eval server runs on WebSocket protocol (ws://localhost:8080 by default) -- Each connected client receives exactly one evaluation from the stack -- Progress and statistics are shown in real-time during execution \ No newline at end of file diff --git a/eval-server/python/evals/browsecomp_dataset.py b/eval-server/python/evals/browsecomp_dataset.py deleted file mode 100644 index 387f713..0000000 --- a/eval-server/python/evals/browsecomp_dataset.py +++ /dev/null @@ -1,252 +0,0 @@ -#!/usr/bin/env python3 -""" -Browsecomp Dataset Loader -Handles downloading and decrypting the Browsecomp benchmark dataset -""" - -import base64 -import hashlib -import pandas as pd -from typing import List, Dict, Optional - - -DATASET_URL = "https://openaipublic.blob.core.windows.net/simple-evals/browse_comp_test_set.csv" - - -def derive_key(password: str, length: int) -> bytes: - """Derive a fixed-length key from the password using SHA256.""" - hasher = hashlib.sha256() - hasher.update(password.encode()) - key = hasher.digest() - return key * (length // len(key)) + key[: length % len(key)] - - -def decrypt(ciphertext_b64: str, password: str) -> str: - """Decrypt base64-encoded ciphertext with XOR.""" - encrypted = base64.b64decode(ciphertext_b64) - key = derive_key(password, len(encrypted)) - decrypted = bytes(a ^ b for a, b in zip(encrypted, key)) - return decrypted.decode() - - -class BrowsecompDataset: - def __init__(self, password: Optional[str] = None): - """ - Initialize the Browsecomp dataset loader. - - Args: - password: Decryption password for the dataset. If None, will try - to read from BROWSECOMP_PASSWORD environment variable. 
- """ - self.password = password - if not self.password: - import os - self.password = os.environ.get('BROWSECOMP_PASSWORD', '') - - self.dataset = None - - def load_dataset(self) -> pd.DataFrame: - """Load and decrypt the Browsecomp dataset.""" - print("Loading Browsecomp dataset...") - - try: - # Download the dataset - df = pd.read_csv(DATASET_URL) - print(f"Downloaded {len(df)} encrypted questions") - - # Add index column for reference - df = df.reset_index(drop=True) - df['question_id'] = df.index + 1 - - # Check for canary field in dataset (automatic password) - canary_password = None - if 'canary' in df.columns and len(df) > 0: - canary_password = df.iloc[0]['canary'] - print(f"Found canary password in dataset") - - # Use provided password or canary from dataset - decryption_password = self.password or canary_password - - if decryption_password: - print("Decrypting questions...") - decrypted_rows = [] - - for idx, row in df.iterrows(): - try: - # Use the canary from the row or the provided password - row_canary = row.get('canary', decryption_password) - - # Decrypt the problem and answer columns - row_dict = row.to_dict() - - if 'problem' in row and pd.notna(row['problem']): - row_dict['problem_decrypted'] = decrypt(row['problem'], row_canary) - row_dict['problem_encrypted'] = row['problem'] - else: - row_dict['problem_decrypted'] = "[No problem field]" - - if 'answer' in row and pd.notna(row['answer']): - row_dict['answer_decrypted'] = decrypt(row['answer'], row_canary) - row_dict['answer_encrypted'] = row['answer'] - else: - row_dict['answer_decrypted'] = "" - - decrypted_rows.append(row_dict) - - except Exception as e: - print(f"Error decrypting row {idx}: {e}") - row_dict = row.to_dict() - row_dict['problem_decrypted'] = f"[Decryption failed: {str(e)}]" - row_dict['answer_decrypted'] = "" - decrypted_rows.append(row_dict) - - df = pd.DataFrame(decrypted_rows) - print(f"Successfully decrypted {len(df)} questions") - else: - print("Warning: No password provided and no canary found, questions remain encrypted") - df['problem_decrypted'] = df.get('problem', '') - df['answer_decrypted'] = df.get('answer', '') - - # Normalize column names for consistency - df = self._normalize_columns(df) - - # Add difficulty level (all Browsecomp questions are considered level 1) - df['task'] = 1 - - self.dataset = df - return df - - except Exception as e: - print(f"Error loading dataset: {e}") - raise - - def _normalize_columns(self, df: pd.DataFrame) -> pd.DataFrame: - """Normalize column names to match expected format.""" - # Map Browsecomp columns to standard format - column_mapping = { - 'problem_decrypted': 'question', - 'problem': 'question_encrypted', - 'answer_decrypted': 'true_answer', - 'answer': 'true_answer_encrypted', - 'question_id': 'question_id' - } - - # Apply renaming - for old_col, new_col in column_mapping.items(): - if old_col in df.columns: - df = df.rename(columns={old_col: new_col}) - - # Ensure required columns exist - if 'question' not in df.columns: - if 'problem_decrypted' in df.columns: - df['question'] = df['problem_decrypted'] - else: - raise ValueError("No question column found in dataset") - - if 'true_answer' not in df.columns: - if 'answer_decrypted' in df.columns: - df['true_answer'] = df['answer_decrypted'] - elif 'answer' in df.columns: - df['true_answer'] = df['answer'] - else: - print("Warning: No answer column found, setting empty answers") - df['true_answer'] = '' - - return df - - def get_questions(self, - indices: Optional[List[int]] = None, - limit: 
Optional[int] = None) -> pd.DataFrame: - """ - Get specific questions from the dataset. - - Args: - indices: List of question numbers (1-based) to retrieve - limit: Maximum number of questions to return - - Returns: - DataFrame with selected questions - """ - if self.dataset is None: - self.load_dataset() - - df = self.dataset.copy() - - # Filter by specific indices if provided - if indices: - # Convert to 0-based indexing - zero_based_indices = [i - 1 for i in indices if i > 0] - valid_indices = [i for i in zero_based_indices if i < len(df)] - - if not valid_indices: - print(f"No valid question indices found. Available range: 1-{len(df)}") - return pd.DataFrame() - - df = df.iloc[valid_indices] - - # Apply limit if specified - if limit and not indices: - df = df.head(limit) - - return df - - def list_questions(self, limit: int = 20) -> None: - """Display available questions.""" - if self.dataset is None: - self.load_dataset() - - print(f"\nAvailable Browsecomp questions (showing first {limit}):") - print("=" * 80) - - for idx in range(min(limit, len(self.dataset))): - row = self.dataset.iloc[idx] - question = row.get('question', row.get('problem_decrypted', '[Encrypted]')) - - # Truncate long questions - if isinstance(question, str): - question_preview = question[:60] + "..." if len(question) > 60 else question - else: - question_preview = "[No question text]" - - print(f"#{idx + 1:3d} {question_preview}") - - if len(self.dataset) > limit: - print(f"\n... and {len(self.dataset) - limit} more questions") - - print(f"\nTotal: {len(self.dataset)} questions") - - # Check if questions are actually decrypted - if len(self.dataset) > 0: - first_question = self.dataset.iloc[0].get('question', '') - if not first_question or first_question.startswith('['): - print("โš ๏ธ Questions are encrypted. Set BROWSECOMP_PASSWORD to decrypt.") - else: - print("โœ“ Questions are decrypted and ready to use") - - -def test_dataset_loading(): - """Test the dataset loading functionality.""" - dataset = BrowsecompDataset() - - try: - df = dataset.load_dataset() - print(f"\nโœ“ Loaded {len(df)} questions") - print(f"Columns: {list(df.columns)}") - - # Show first question - if len(df) > 0: - first = df.iloc[0] - print(f"\nFirst question (truncated):") - question_text = str(first.get('question', '')) - print(f" Question: {question_text[:100]}...") - print(f" Answer: {first.get('true_answer', 'N/A')}") - - except Exception as e: - print(f"โœ— Error: {e}") - return False - - return True - - -if __name__ == "__main__": - test_dataset_loading() \ No newline at end of file diff --git a/eval-server/python/evals/browsecomp_eval_server.py b/eval-server/python/evals/browsecomp_eval_server.py deleted file mode 100755 index 753e7cf..0000000 --- a/eval-server/python/evals/browsecomp_eval_server.py +++ /dev/null @@ -1,836 +0,0 @@ -#!/usr/bin/env python3 -""" -Browsecomp Evaluation Server - -Command-line controlled eval processing server that loads browsecomp questions -into a stack and distributes them one per client connection. 
-""" - -import argparse -import asyncio -import json -import logging -import sys -import time -from datetime import datetime -from pathlib import Path -from typing import List, Dict, Any, Optional - -# Add eval-server src to path -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -# Add current directory (evals) to path for browsecomp_dataset import -sys.path.insert(0, str(Path(__file__).parent)) - -from bo_eval_server import EvalServer, EvaluationStack -from browsecomp_dataset import BrowsecompDataset -from browsecomp_scorer import question_scorer, extract_answer, extract_confidence - - -def log_evaluation_event(logger: logging.Logger, event_type: str, data: Dict[str, Any]) -> None: - """ - Log a structured evaluation event. - - Args: - logger: Logger instance - event_type: Type of event (client_connect, evaluation_start, evaluation_complete, etc.) - data: Event data to log - """ - log_entry = { - "timestamp": datetime.now().isoformat(), - "event_type": event_type, - **data - } - logger.info(f"EVENT: {json.dumps(log_entry)}") - - -def setup_logging(log_dir: str = "./logs") -> logging.Logger: - """ - Set up logging to both console and file. - - Args: - log_dir: Directory to save log files - - Returns: - Configured logger - """ - # Ensure logs directory exists - Path(log_dir).mkdir(exist_ok=True) - - # Create timestamp for log file - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - log_file = Path(log_dir) / f"browsecomp_eval_server_{timestamp}.log" - - # Create logger - logger = logging.getLogger('browsecomp_eval_server') - logger.setLevel(logging.INFO) - - # Clear any existing handlers - logger.handlers.clear() - - # Create formatter - formatter = logging.Formatter( - '%(asctime)s | %(levelname)-8s | %(name)s | %(message)s', - datefmt='%Y-%m-%d %H:%M:%S' - ) - - # Console handler (for immediate feedback) - console_handler = logging.StreamHandler(sys.stdout) - console_handler.setLevel(logging.INFO) - console_handler.setFormatter(formatter) - logger.addHandler(console_handler) - - # File handler (for persistent logging) - file_handler = logging.FileHandler(log_file) - file_handler.setLevel(logging.INFO) - file_handler.setFormatter(formatter) - logger.addHandler(file_handler) - - logger.info(f"Logging initialized - saving to {log_file}") - return logger - - -def extract_response_text(result: Any) -> str: - """ - Extract the actual response text from BrowserOperator's structured response format. 
- - Args: - result: The response from BrowserOperator (could be string, dict, or structured format) - - Returns: - The text content that should be scored - """ - # Handle partial results with errors first - if isinstance(result, dict) and result.get('partial') and result.get('error'): - # This is our error structure, fallback to string representation - return str(result) - - # Handle structured BrowserOperator response - if isinstance(result, dict): - # Look for messages array (main response structure) - if 'messages' in result and isinstance(result['messages'], list): - response_parts = [] - - for message in result['messages']: - if isinstance(message, dict): - # Model responses with answers - if message.get('entity') == 'model' and message.get('answer'): - response_parts.append(message['answer']) - # Tool results - elif message.get('entity') == 'tool_result' and message.get('resultText'): - response_parts.append(message['resultText']) - # User messages - elif message.get('entity') == 'user' and message.get('text'): - response_parts.append(message['text']) - - if response_parts: - return '\n'.join(response_parts) - - # Fallback: look for common response fields - for field in ['answer', 'response', 'result', 'text', 'content']: - if field in result and result[field]: - return str(result[field]) - - # Fallback to string representation - return str(result) - - -def convert_question_to_evaluation(question_row: Dict[str, Any], question_id: int) -> Dict[str, Any]: - """ - Convert a browsecomp question to the evaluation format expected by eval-server. - - Args: - question_row: Row from the browsecomp dataset DataFrame - question_id: Question ID number (1-based) - - Returns: - Evaluation object compatible with eval-server - """ - question_text = question_row.get('question', question_row.get('problem_decrypted', '')) - true_answer = question_row.get('true_answer', question_row.get('answer_decrypted', '')) - - return { - "id": f"browsecomp_q{question_id}", - "name": f"Browsecomp Question {question_id}", - "description": f"Web browsing evaluation question from browsecomp dataset", - "tool": "chat", - "input": { - "message": f"{question_text}\n\nPlease provide your response in the following format:\n\nExplanation: [Step-by-step reasoning and information gathering]\n\nExact Answer: [The precise answer to the question]\n\nConfidence Score: [Confidence as a percentage, e.g., 85%]" - }, - # Store original data for later reference/scoring - "metadata": { - "question_id": question_id, - "true_answer": true_answer, - "original_question": question_text, - "dataset": "browsecomp" - } - } - - -def load_browsecomp_evaluations( - limit: Optional[int] = None, - questions: Optional[List[int]] = None, - start: Optional[int] = None, - end: Optional[int] = None, - password: Optional[str] = None -) -> List[Dict[str, Any]]: - """ - Load browsecomp questions and convert them to evaluation format. 
- - Args: - limit: Maximum number of questions to load - questions: Specific question numbers to load (1-based) - start: Start question number for range selection (1-based, inclusive) - end: End question number for range selection (1-based, inclusive) - password: Decryption password (optional, auto-detected from dataset) - - Returns: - List of evaluation objects - """ - print("๐Ÿ“š Loading Browsecomp dataset...") - - # Load dataset - dataset = BrowsecompDataset(password=password) - - try: - df = dataset.load_dataset() - print(f"โœ… Loaded {len(df)} questions from dataset") - except Exception as e: - print(f"โŒ Failed to load dataset: {e}") - return [] - - # Get specific questions, range, or apply limit - if questions: - print(f"๐Ÿ“‹ Filtering to specific questions: {questions}") - df_filtered = dataset.get_questions(indices=questions) - elif start is not None or end is not None: - # Handle range selection - if start is not None and end is not None: - if start > end: - print(f"โŒ Invalid range: start ({start}) cannot be greater than end ({end})") - return [] - if start < 1: - print(f"โŒ Invalid start: question numbers are 1-based, got {start}") - return [] - if end > len(df): - print(f"โš ๏ธ End question {end} exceeds dataset size ({len(df)}), using {len(df)} instead") - end = len(df) - - print(f"๐Ÿ“‹ Loading questions {start} to {end} (range of {end - start + 1} questions)") - # Convert to 0-based indexing for pandas - range_questions = list(range(start, end + 1)) - df_filtered = dataset.get_questions(indices=range_questions) - elif start is not None: - # Only start specified, go to end of dataset - if start < 1: - print(f"โŒ Invalid start: question numbers are 1-based, got {start}") - return [] - if start > len(df): - print(f"โŒ Start question {start} exceeds dataset size ({len(df)})") - return [] - - print(f"๐Ÿ“‹ Loading questions from {start} to end ({len(df) - start + 1} questions)") - range_questions = list(range(start, len(df) + 1)) - df_filtered = dataset.get_questions(indices=range_questions) - else: - # Only end specified, start from beginning - if end < 1: - print(f"โŒ Invalid end: question numbers are 1-based, got {end}") - return [] - if end > len(df): - print(f"โš ๏ธ End question {end} exceeds dataset size ({len(df)}), using {len(df)} instead") - end = len(df) - - print(f"๐Ÿ“‹ Loading questions 1 to {end} ({end} questions)") - range_questions = list(range(1, end + 1)) - df_filtered = dataset.get_questions(indices=range_questions) - elif limit: - print(f"๐Ÿ“‹ Limiting to first {limit} questions") - df_filtered = dataset.get_questions(limit=limit) - else: - print(f"๐Ÿ“‹ Loading all {len(df)} questions") - df_filtered = df - - if df_filtered.empty: - print("โŒ No questions found with the specified criteria") - return [] - - print(f"๐Ÿ”„ Converting {len(df_filtered)} questions to evaluation format...") - - # Convert to evaluation format - evaluations = [] - for idx, row in df_filtered.iterrows(): - question_id = row.get('question_id', idx + 1) - evaluation = convert_question_to_evaluation(row.to_dict(), question_id) - evaluations.append(evaluation) - - # Show preview of first few questions - if len(evaluations) <= 3: - question_preview = evaluation['input']['message'][:80] + "..." - print(f" โ€ข Q{question_id}: {question_preview}") - - if len(evaluations) > 3: - print(f" ... 
and {len(evaluations) - 3} more questions") - - print(f"โœ… Created {len(evaluations)} evaluation objects") - return evaluations - - -def main(): - """Main function for the browsecomp evaluation server.""" - return asyncio.run(async_main()) - -async def async_main(): - """Async main function for the browsecomp evaluation server.""" - parser = argparse.ArgumentParser(description="Browsecomp Evaluation Server") - parser.add_argument( - "--limit", - type=int, - help="Maximum number of questions to load (default: all 1,266 questions)" - ) - parser.add_argument( - "--questions", - type=int, - nargs="+", - help="Specific question numbers to load (1-based, e.g. --questions 1 5 10)" - ) - parser.add_argument( - "--start", - type=int, - help="Start question number for range selection (1-based, inclusive)" - ) - parser.add_argument( - "--end", - type=int, - help="End question number for range selection (1-based, inclusive)" - ) - parser.add_argument( - "--port", - type=int, - default=8080, - help="Server port (default: 8080)" - ) - parser.add_argument( - "--host", - type=str, - default="127.0.0.1", - help="Server host (default: 127.0.0.1)" - ) - parser.add_argument( - "--auth-key", - type=str, - default="browsecomp-eval", - help="Authentication key (default: browsecomp-eval)" - ) - parser.add_argument( - "--password", - type=str, - help="Dataset decryption password (optional, auto-detected from dataset)" - ) - parser.add_argument( - "--list", - action="store_true", - help="List available questions without starting server" - ) - parser.add_argument( - "--list-limit", - type=int, - default=20, - help="Number of questions to show when listing (default: 20)" - ) - parser.add_argument( - "--save-results", - action="store_true", - help="Save evaluation results to JSON file on completion" - ) - parser.add_argument( - "--timeout", - type=float, - default=3600.0, - help="Timeout for each evaluation in seconds (default: 3600s/60min)" - ) - - args = parser.parse_args() - - # Setup logging - logger = setup_logging("./logs") - - # Handle list mode - if args.list: - logger.info("๐Ÿ“‹ Listing available browsecomp questions...") - dataset = BrowsecompDataset(password=args.password) - - # Apply filtering for list mode if range or specific questions are specified - if args.questions or args.start is not None or args.end is not None: - # Load the full dataset first - df = dataset.load_dataset() - - # Apply the same filtering logic as the main function - if args.questions: - print(f"๐Ÿ“‹ Showing specific questions: {args.questions}") - df_filtered = dataset.get_questions(indices=args.questions) - elif args.start is not None or args.end is not None: - # Handle range selection (same logic as in load_browsecomp_evaluations) - if args.start is not None and args.end is not None: - if args.start > args.end: - print(f"โŒ Invalid range: start ({args.start}) cannot be greater than end ({args.end})") - return 1 - if args.start < 1: - print(f"โŒ Invalid start: question numbers are 1-based, got {args.start}") - return 1 - if args.end > len(df): - print(f"โš ๏ธ End question {args.end} exceeds dataset size ({len(df)}), using {len(df)} instead") - args.end = len(df) - - print(f"๐Ÿ“‹ Showing questions {args.start} to {args.end}") - range_questions = list(range(args.start, args.end + 1)) - df_filtered = dataset.get_questions(indices=range_questions) - elif args.start is not None: - if args.start < 1: - print(f"โŒ Invalid start: question numbers are 1-based, got {args.start}") - return 1 - if args.start > len(df): - print(f"โŒ 
Start question {args.start} exceeds dataset size ({len(df)})") - return 1 - - print(f"๐Ÿ“‹ Showing questions from {args.start} to end") - range_questions = list(range(args.start, len(df) + 1)) - df_filtered = dataset.get_questions(indices=range_questions) - else: # args.end is not None - if args.end < 1: - print(f"โŒ Invalid end: question numbers are 1-based, got {args.end}") - return 1 - if args.end > len(df): - print(f"โš ๏ธ End question {args.end} exceeds dataset size ({len(df)}), using {len(df)} instead") - args.end = len(df) - - print(f"๐Ÿ“‹ Showing questions 1 to {args.end}") - range_questions = list(range(1, args.end + 1)) - df_filtered = dataset.get_questions(indices=range_questions) - - # Display filtered results - if not df_filtered.empty: - print("=" * 80) - for idx, row in df_filtered.iterrows(): - question_id = row.get('question_id', idx + 1) - question = row.get('question', row.get('problem_decrypted', '[Encrypted]')) - - if isinstance(question, str): - question_preview = question[:60] + "..." if len(question) > 60 else question - else: - question_preview = str(question)[:60] + "..." - - print(f"#{question_id:3d} {question_preview}") - - print(f"\nShowing {len(df_filtered)} question(s)") - else: - print("โŒ No questions found with the specified criteria") - else: - # Standard list mode - dataset.list_questions(limit=args.list_limit) - - return - - logger.info("๐Ÿš€ Starting Browsecomp Evaluation Server") - logger.info("=" * 60) - - # Validate arguments - if args.questions and (args.start is not None or args.end is not None): - print("โŒ Cannot use --questions together with --start/--end. Choose one approach.") - return 1 - - if args.limit and (args.start is not None or args.end is not None): - print("โŒ Cannot use --limit together with --start/--end. Choose one approach.") - return 1 - - # Load evaluations - evaluations = load_browsecomp_evaluations( - limit=args.limit, - questions=args.questions, - start=args.start, - end=args.end, - password=args.password - ) - - if not evaluations: - print("โŒ No evaluations loaded. 
Exiting.") - return 1 - - # Create evaluation stack and populate it - stack = EvaluationStack() - - print(f"\n๐Ÿ“š Loading {len(evaluations)} evaluations into stack...") - for evaluation in evaluations: - stack.push(evaluation) - - print(f"โœ… Stack loaded with {stack.size()} evaluations") - print(f"๐Ÿ” Top evaluation: {stack.peek()['name'] if stack.peek() else 'None'}") - - # Create server - server = EvalServer( - auth_key=args.auth_key, - host=args.host, - port=args.port, - log_level='INFO', - log_dir='./logs', - rpc_timeout=args.timeout, - ) - - # Track processed evaluations - completed_evaluations = [] - failed_evaluations = [] - client_evaluation_map = {} # client_id -> evaluation_id mapping - - print(f"\n๐ŸŒ Server Configuration:") - print(f" Host: {args.host}") - print(f" Port: {args.port}") - print(f" Auth Key: {args.auth_key}") - print(f" Timeout: {args.timeout}s ({args.timeout/60:.1f} minutes)") - print(f" Total Evaluations: {stack.size()}") - - @server.on_connect - async def handle_client(client): - logger.info(f'๐ŸŽ‰ CLIENT CONNECTED!') - logger.info(f' - Client ID: {client.id}') - logger.info(f' - Client tabId: {client.tab_id}') - logger.info(f' - Client info: {client.get_info()}') - - # Log structured client connection event - log_evaluation_event(logger, "client_connected", { - "client_id": client.id, - "tab_id": client.tab_id, - "client_info": client.get_info(), - "stack_remaining": stack.size() - }) - - # Check if we have evaluations left in the stack - if stack.is_empty(): - print('โš ๏ธ No more evaluations in stack for this client') - print(' All browsecomp questions have been distributed') - await client.send_message({ - "type": "no_evaluations", - "message": "All browsecomp questions have been distributed" - }) - return - - # Pop the next evaluation from the stack (ONE evaluation per client!) - evaluation = stack.pop() - evaluation_id = evaluation['id'] - question_id = evaluation['metadata']['question_id'] - - print(f'๐Ÿ“‹ Assigning evaluation: "{evaluation["name"]}" (Question #{question_id})') - print(f'๐Ÿ“Š Remaining evaluations in stack: {stack.size()}') - - # Track which evaluation was sent to which client - client_evaluation_map[client.id] = evaluation_id - - # Log evaluation assignment - log_evaluation_event(logger, "evaluation_assigned", { - "client_id": client.id, - "evaluation_id": evaluation_id, - "question_id": question_id, - "evaluation_name": evaluation["name"], - "stack_remaining": stack.size(), - "true_answer": evaluation['metadata']['true_answer'] - }) - - try: - print(f'๐Ÿ”„ Starting evaluation... 
(timeout: {args.timeout}s)') - result = await client.evaluate(evaluation, timeout=args.timeout) - - print('โœ… Evaluation completed!') - - # Extract the true answer from evaluation metadata - true_answer = evaluation['metadata']['true_answer'] - - # Check if this is a partial result with errors - is_partial_result = (isinstance(result, dict) and - result.get('partial') and - result.get('error')) - - # Extract the actual response text from the structured format - response_text = extract_response_text(result) - - # Show structured response details if available - if isinstance(result, dict) and 'messages' in result: - message_count = len(result.get('messages', [])) - model_used = result.get('modelUsed', 'unknown') - execution_time = result.get('executionTime', 0) - tool_calls = len(result.get('toolCalls', [])) - print(f'๐Ÿ“Š Response structure: {message_count} messages, {tool_calls} tool calls, {model_used} model, {execution_time}ms') - else: - print(f'๐Ÿ“Š Response for "{evaluation["name"]}": {response_text[:100]}...') - - # Score the response - is_correct = question_scorer(response_text, true_answer) - extracted_answer = extract_answer(response_text) - confidence = extract_confidence(response_text) - - # Print scoring results - print(f'๐ŸŽฏ Scoring Results:') - print(f' - True Answer: {true_answer}') - print(f' - Extracted Answer: {extracted_answer}') - print(f' - Correct: {"โœ… YES" if is_correct else "โŒ NO"}') - print(f' - Confidence: {confidence}%') - - if is_partial_result: - print(f'โš ๏ธ Note: Result obtained after retries with errors:') - print(f' - Error: {result.get("error", "Unknown error")}') - print(f' - Attempts: {result.get("attempts", "Unknown")}') - print(f' - The BrowserOperator had issues but provided a response') - - # Log evaluation completion - log_evaluation_event(logger, "evaluation_completed", { - "client_id": client.id, - "evaluation_id": evaluation_id, - "question_id": question_id, - "evaluation_name": evaluation["name"], - "is_correct": is_correct, - "extracted_answer": extracted_answer, - "true_answer": true_answer, - "confidence": confidence, - "is_partial_result": is_partial_result, - "model_used": result.get('modelUsed') if isinstance(result, dict) else None, - "execution_time_ms": result.get('executionTime') if isinstance(result, dict) else None, - "tool_calls_count": len(result.get('toolCalls', [])) if isinstance(result, dict) else None - }) - - completed_evaluations.append({ - 'client_id': client.id, - 'evaluation': evaluation, - 'result': result, - 'question_id': question_id, - 'scoring': { - 'is_correct': is_correct, - 'true_answer': true_answer, - 'extracted_answer': extracted_answer, - 'confidence': confidence - }, - 'partial_result': is_partial_result, - 'execution_info': { - 'had_errors': is_partial_result, - 'error_message': result.get('error') if is_partial_result else None, - 'retry_attempts': result.get('attempts') if is_partial_result else 1, - 'model_used': result.get('modelUsed') if isinstance(result, dict) else None, - 'execution_time_ms': result.get('executionTime') if isinstance(result, dict) else None, - 'tool_calls_count': len(result.get('toolCalls', [])) if isinstance(result, dict) else None, - 'messages_count': len(result.get('messages', [])) if isinstance(result, dict) else None - } - }) - - except Exception as e: - error_msg = str(e) - print(f'โŒ Evaluation "{evaluation["name"]}" failed: {error_msg}') - - # Check if this is a tool execution error that might still be running - if "Tool execution failed" in error_msg or "-32000" in 
error_msg: - print(f'โš ๏ธ Note: BrowserOperator may still be processing this question') - print(f' The client reported an error but might continue execution') - print(f' Consider increasing timeout with --timeout parameter') - - # Log evaluation failure - log_evaluation_event(logger, "evaluation_failed", { - "client_id": client.id, - "evaluation_id": evaluation_id, - "question_id": question_id, - "evaluation_name": evaluation["name"], - "error_message": error_msg, - "is_tool_execution_error": "Tool execution failed" in error_msg or "-32000" in error_msg, - "true_answer": evaluation['metadata']['true_answer'] - }) - - failed_evaluations.append({ - 'client_id': client.id, - 'evaluation': evaluation, - 'error': error_msg, - 'question_id': question_id, - }) - - # Send completion message - try: - await client.send_message({ - "type": "evaluation_complete", - "evaluation_id": evaluation_id, - "evaluation_name": evaluation["name"], - "question_id": question_id, - "status": "completed" if evaluation_id not in [e['evaluation']['id'] for e in failed_evaluations] else "failed" - }) - except Exception as e: - print(f' โš ๏ธ Failed to send completion message: {e}') - - @server.on_disconnect - async def handle_disconnect(client_info): - client_id = client_info["id"] - print(f'\n๐Ÿ”Œ Client disconnected: {client_id}') - - # Show what evaluation this client was working on - evaluation_id = None - if client_id in client_evaluation_map: - evaluation_id = client_evaluation_map[client_id] - print(f' Was working on: {evaluation_id}') - - # Log client disconnect - log_evaluation_event(logger, "client_disconnected", { - "client_id": client_id, - "evaluation_id": evaluation_id, - "completed_count": len(completed_evaluations), - "failed_count": len(failed_evaluations), - "stack_remaining": stack.size() - }) - - # Show final statistics - total_completed = len(completed_evaluations) - total_failed = len(failed_evaluations) - remaining = stack.size() - total_original = len(evaluations) - - print(f'\n๐Ÿ“Š Current Statistics:') - print(f' โœ… Completed: {total_completed}/{total_original}') - print(f' โŒ Failed: {total_failed}/{total_original}') - print(f' ๐Ÿ“š Remaining: {remaining}/{total_original}') - print(f' ๐Ÿ”„ In Progress: {total_original - total_completed - total_failed - remaining}') - - # Calculate scoring statistics - if completed_evaluations: - correct_count = sum(1 for item in completed_evaluations if item.get('scoring', {}).get('is_correct', False)) - partial_count = sum(1 for item in completed_evaluations if item.get('partial_result', False)) - accuracy = correct_count / total_completed * 100 if total_completed > 0 else 0 - avg_confidence = sum(item.get('scoring', {}).get('confidence', 0) for item in completed_evaluations) / total_completed if total_completed > 0 else 0 - - print(f'\n๐ŸŽฏ Scoring Statistics:') - print(f' ๐Ÿ“Š Accuracy: {accuracy:.1f}% ({correct_count}/{total_completed} correct)') - print(f' ๐Ÿ’ก Average Confidence: {avg_confidence:.1f}%') - if partial_count > 0: - print(f' โš ๏ธ Partial Results: {partial_count}/{total_completed} had execution errors but recovered') - - if completed_evaluations: - print(f'\n๐ŸŽฏ Recently Completed Evaluations:') - for item in completed_evaluations[-3:]: # Show last 3 - eval_name = item['evaluation']['name'] - question_id = item['question_id'] - client_id_short = item['client_id'][:8] # Short client ID - is_correct = item.get('scoring', {}).get('is_correct', False) - confidence = item.get('scoring', {}).get('confidence', 0) - is_partial = 
item.get('partial_result', False) - status_emoji = 'โœ…' if is_correct else 'โŒ' - partial_indicator = 'โš ๏ธ' if is_partial else '' - print(f' โ€ข Q{question_id}: {eval_name} {status_emoji}{partial_indicator} (confidence: {confidence}%, client: {client_id_short})') - - if failed_evaluations: - print(f'\n๐Ÿ’ฅ Failed Evaluations:') - for item in failed_evaluations: - eval_name = item['evaluation']['name'] - question_id = item['question_id'] - error = item['error'] - print(f' โ€ข Q{question_id}: {eval_name} - {error}') - - # Start server - try: - print(f'\n๐Ÿš€ Starting server on ws://{server.config.host}:{server.config.port}') - print(' Connect your BrowserOperator to start processing browsecomp questions') - print(' Press Ctrl+C to stop the server') - print('=' * 60) - - await server.start() - - # Keep server running - await server.wait_closed() - - except KeyboardInterrupt: - print('\n๐Ÿ›‘ Received interrupt signal, stopping server...') - await server.stop() - print('โœ… Server stopped successfully') - - # Show final summary - total_completed = len(completed_evaluations) - total_failed = len(failed_evaluations) - total_processed = total_completed + total_failed - - if total_processed > 0: - print(f'\n๐Ÿ“ˆ Final Summary:') - print(f' Total processed: {total_processed}/{len(evaluations)}') - print(f' Success rate: {total_completed/total_processed*100:.1f}%') - print(f' Completed: {total_completed}') - print(f' Failed: {total_failed}') - - # Final scoring statistics - if completed_evaluations: - correct_count = sum(1 for item in completed_evaluations if item.get('scoring', {}).get('is_correct', False)) - accuracy = correct_count / total_completed * 100 if total_completed > 0 else 0 - avg_confidence = sum(item.get('scoring', {}).get('confidence', 0) for item in completed_evaluations) / total_completed if total_completed > 0 else 0 - - print(f'\n๐Ÿ† Final Scoring Results:') - print(f' ๐Ÿ“Š Overall Accuracy: {accuracy:.1f}% ({correct_count}/{total_completed} correct)') - print(f' ๐Ÿ’ก Average Confidence: {avg_confidence:.1f}%') - - # Show confidence correlation - correct_items = [item for item in completed_evaluations if item.get('scoring', {}).get('is_correct', False)] - incorrect_items = [item for item in completed_evaluations if not item.get('scoring', {}).get('is_correct', False)] - - if correct_items: - avg_conf_correct = sum(item.get('scoring', {}).get('confidence', 0) for item in correct_items) / len(correct_items) - print(f' โœ… Avg confidence when correct: {avg_conf_correct:.1f}%') - - if incorrect_items: - avg_conf_incorrect = sum(item.get('scoring', {}).get('confidence', 0) for item in incorrect_items) / len(incorrect_items) - print(f' โŒ Avg confidence when incorrect: {avg_conf_incorrect:.1f}%') - - # Save results to JSON file - if completed_evaluations and (args.save_results or total_completed == len(evaluations)): - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - results_file = f"browsecomp_eval_results_{timestamp}.json" - - results_data = { - "timestamp": timestamp, - "total_questions": len(evaluations), - "completed": total_completed, - "failed": total_failed, - "accuracy": accuracy, - "average_confidence": avg_confidence, - "evaluations": completed_evaluations - } - - with open(results_file, 'w') as f: - json.dump(results_data, f, indent=2) - - print(f'\n๐Ÿ’พ Results saved to: {results_file}') - - # Log final session summary - log_evaluation_event(logger, "session_completed", { - "total_questions": len(evaluations), - "completed": total_completed, - "failed": 
total_failed, - "accuracy": accuracy, - "average_confidence": avg_confidence, - "partial_results": partial_count, - "results_file": results_file if 'results_file' in locals() else None - }) - - except Exception as e: - logger.error(f'๐Ÿ’ฅ Server error: {e}') - log_evaluation_event(logger, "server_error", { - "error_message": str(e), - "completed_count": len(completed_evaluations), - "failed_count": len(failed_evaluations) - }) - - if server.is_running(): - await server.stop() - return 1 - - logger.info("โœ… Server session ended successfully") - return 0 - - -if __name__ == "__main__": - # Ensure logs directory exists - Path("./logs").mkdir(exist_ok=True) - - try: - exit_code = main() - sys.exit(exit_code) - except KeyboardInterrupt: - print('\n๐Ÿ‘‹ Goodbye!') - except Exception as e: - print(f'๐Ÿ’ฅ Fatal error: {e}') - sys.exit(1) \ No newline at end of file diff --git a/eval-server/python/evals/browsecomp_scorer.py b/eval-server/python/evals/browsecomp_scorer.py deleted file mode 100644 index 17561e6..0000000 --- a/eval-server/python/evals/browsecomp_scorer.py +++ /dev/null @@ -1,328 +0,0 @@ -#!/usr/bin/env python3 -""" -Browsecomp Benchmark Scorer -Implements the scoring logic for Browsecomp benchmark questions -""" - -import re -from typing import Union, Any, Optional, Dict - - -# Grader template based on the original Browsecomp evaluation -GRADER_TEMPLATE = """ -Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below. - -[question]: {question} - -[response]: {response} - -Your judgement must be in the format and criteria specified below: - -extracted_final_answer: The final exact answer extracted from the [response]. Put the answer as 'None' if there is no exact, final answer to extract. - -[correct_answer]: {correct_answer} - -reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. - -correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise. - -confidence: The extracted confidence score between 0% and 100% from [response]. Put 100 if no confidence score available. 
-""".strip() - - -def extract_answer(response: str) -> str: - """Extract the exact answer from a response.""" - # Look for "Exact Answer:" pattern - patterns = [ - r'[Ee]xact [Aa]nswer:\s*([^\n]+)', - r'[Ff]inal [Aa]nswer:\s*([^\n]+)', - r'[Aa]nswer:\s*([^\n]+)', - ] - - for pattern in patterns: - match = re.search(pattern, response) - if match: - return match.group(1).strip() - - # If no pattern found, try to extract from the end of response - lines = response.strip().split('\n') - if lines: - # Check last few lines for answer-like content - for line in reversed(lines[-3:]): - line = line.strip() - if line and not line.startswith('[') and not line.startswith('Confidence'): - return line - - return "" - - -def extract_confidence(response: str) -> float: - """Extract confidence score from response.""" - patterns = [ - r'[Cc]onfidence\s*[Ss]core:\s*(\d+)%', - r'[Cc]onfidence:\s*(\d+)%', - r'(\d+)%\s*confident', - r'I am (\d+)% confident', - r'(\d+)%\s*confidence', - ] - - for pattern in patterns: - match = re.search(pattern, response) - if match: - return float(match.group(1)) - - return 100.0 # Default to 100% if not specified - - -def normalize_answer(answer: str) -> str: - """Normalize answer for comparison.""" - if not isinstance(answer, str): - answer = str(answer) - - # Convert to lowercase - answer = answer.lower().strip() - - # Remove common punctuation at the end - answer = answer.rstrip('.,!?;:') - - # Normalize whitespace - answer = ' '.join(answer.split()) - - return answer - - -def extract_number(text: str) -> Union[float, None]: - """Extract a number from text.""" - # Remove common separators and convert to standard format - text = text.replace(',', '') - - # Try to find numbers with various patterns - patterns = [ - r'[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?', # Scientific notation - r'[-+]?\d+\.?\d*', # Regular numbers - r'[-+]?\d+', # Integers - ] - - for pattern in patterns: - matches = re.findall(pattern, text) - if matches: - try: - # Return the first valid number - return float(matches[0]) - except ValueError: - continue - - return None - - -def compare_numeric_answers(pred: str, true: str, tolerance: float = 0.01) -> bool: - """Compare numeric answers with tolerance.""" - pred_num = extract_number(pred) - true_num = extract_number(true) - - if pred_num is None or true_num is None: - return False - - # Check relative tolerance for non-zero values - if true_num != 0: - relative_error = abs(pred_num - true_num) / abs(true_num) - return relative_error <= tolerance - else: - # For zero values, use absolute tolerance - return abs(pred_num - true_num) <= tolerance - - -def question_scorer(prediction: str, true_answer: str) -> bool: - """ - Score a prediction against the true answer. - Returns True if the prediction is considered correct. - - This is a simplified scorer for quick evaluation. - For production use, consider using grade_with_llm for more accurate grading. 
- """ - if not prediction or not true_answer: - return False - - # Extract the answer part from the prediction - extracted_answer = extract_answer(prediction) - if not extracted_answer: - extracted_answer = prediction - - # Normalize both answers - pred_norm = normalize_answer(extracted_answer) - true_norm = normalize_answer(true_answer) - - # Exact match after normalization - if pred_norm == true_norm: - return True - - # Check if the true answer is contained in the prediction - if true_norm in pred_norm: - return True - - # Check numeric answers - if any(char.isdigit() for char in true_answer): - if compare_numeric_answers(extracted_answer, true_answer): - return True - - # Check for common variations - # Handle yes/no answers - if true_norm in ['yes', 'no']: - if true_norm == 'yes' and pred_norm in ['yes', 'true', 'correct', 'affirmative']: - return True - if true_norm == 'no' and pred_norm in ['no', 'false', 'incorrect', 'negative']: - return True - - return False - - -def grade_with_llm(question: str, correct_answer: str, response: str, - grader_function: Optional[callable] = None) -> Dict[str, Any]: - """ - Grade a response using an LLM grader. - - Args: - question: The original question - correct_answer: The correct answer - response: The model's response - grader_function: Optional function to call the grader LLM - - Returns: - Dictionary with grading results - """ - if not grader_function: - # If no grader function provided, use simple scoring - is_correct = question_scorer(response, correct_answer) - confidence = extract_confidence(response) - - return { - 'is_correct': is_correct, - 'confidence': confidence, - 'reasoning': 'Graded using rule-based scorer', - 'extracted_answer': extract_answer(response) - } - - # Format the grading prompt - grader_prompt = GRADER_TEMPLATE.format( - question=question, - correct_answer=correct_answer, - response=response, - ) - - # Call the grader - grading_response = grader_function(grader_prompt) - - # Parse the grading response - is_correct = False - confidence = 100.0 - reasoning = "" - extracted_answer = "" - - # Look for patterns in grading response - correct_match = re.search(r"correct:\s*(yes|no)", grading_response.lower()) - if correct_match: - is_correct = correct_match.group(1) == "yes" - - confidence_match = re.search(r"confidence:\s*(\d+)", grading_response) - if confidence_match: - confidence = float(confidence_match.group(1)) - - reasoning_match = re.search(r"reasoning:\s*([^\n]+)", grading_response, re.IGNORECASE) - if reasoning_match: - reasoning = reasoning_match.group(1).strip() - - answer_match = re.search(r"extracted_final_answer:\s*([^\n]+)", grading_response, re.IGNORECASE) - if answer_match: - extracted_answer = answer_match.group(1).strip() - - return { - 'is_correct': is_correct, - 'confidence': confidence, - 'reasoning': reasoning, - 'extracted_answer': extracted_answer, - 'grader_response': grading_response - } - - -def evaluate_predictions(predictions: list, true_answers: list) -> dict: - """ - Evaluate a list of predictions against true answers. - Returns statistics about the evaluation. 
- """ - if len(predictions) != len(true_answers): - raise ValueError("Predictions and true answers must have the same length") - - results = { - 'total': len(predictions), - 'correct': 0, - 'incorrect': 0, - 'details': [], - 'average_confidence': 0.0 - } - - total_confidence = 0.0 - - for pred, true in zip(predictions, true_answers): - is_correct = question_scorer(pred, true) - confidence = extract_confidence(pred) - - results['details'].append({ - 'prediction': pred, - 'true_answer': true, - 'correct': is_correct, - 'confidence': confidence, - 'extracted_answer': extract_answer(pred) - }) - - if is_correct: - results['correct'] += 1 - else: - results['incorrect'] += 1 - - total_confidence += confidence - - results['accuracy'] = results['correct'] / results['total'] if results['total'] > 0 else 0 - results['average_confidence'] = total_confidence / results['total'] if results['total'] > 0 else 0 - - return results - - -# Example usage and tests -if __name__ == "__main__": - # Test cases - test_cases = [ - ( - "Explanation: I found that...\nExact Answer: Paris\nConfidence Score: 95%", - "Paris", - True - ), - ( - "The answer is 42", - "42", - True - ), - ( - "Exact Answer: Yes\nConfidence: 80%", - "yes", - True - ), - ( - "After browsing, I found the answer is 3.14159", - "3.14", - True - ), - ( - "The result is 99", - "100", - False - ), - ] - - print("Testing Browsecomp scorer:") - for pred, true, expected in test_cases: - result = question_scorer(pred, true) - extracted = extract_answer(pred) - confidence = extract_confidence(pred) - status = "โœ“" if result == expected else "โœ—" - print(f"{status} Pred: '{pred[:50]}...' | True: '{true}' | Correct: {result}") - print(f" Extracted: '{extracted}' | Confidence: {confidence}%") \ No newline at end of file diff --git a/eval-server/python/evals/run_browsecomp_eval_server.sh b/eval-server/python/evals/run_browsecomp_eval_server.sh deleted file mode 100755 index e393dad..0000000 --- a/eval-server/python/evals/run_browsecomp_eval_server.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -# Wrapper script to run browsecomp eval server with proper dependencies - -# Get the directory of this script -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -EVAL_SERVER_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" - -# Change to eval-server python directory -cd "$EVAL_SERVER_DIR" - -# Run with uv, passing all arguments -uv run python evals/browsecomp_eval_server.py "$@" \ No newline at end of file diff --git a/eval-server/python/examples/__init__.py b/eval-server/python/examples/__init__.py deleted file mode 100644 index 4bb7da7..0000000 --- a/eval-server/python/examples/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -""" -Examples package for bo-eval-server. - -This package contains working examples demonstrating different uses of the evaluation server: -- basic_server: Simple WebSocket server setup -- with_stack: Using evaluation stack for queuing evaluations -- programmatic_evals: Advanced programmatic evaluation creation -""" - -__version__ = "1.0.0" \ No newline at end of file diff --git a/eval-server/python/examples/basic_server.py b/eval-server/python/examples/basic_server.py deleted file mode 100644 index 3a1f9b0..0000000 --- a/eval-server/python/examples/basic_server.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env python3 -""" -Basic EvalServer example - Simple WebSocket server setup. - -This example shows the minimal setup for a WebSocket evaluation server. 
-""" - -import asyncio -import sys -from pathlib import Path - -# Add src to path for local development -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from bo_eval_server import EvalServer - - -async def main(): - """Main example function for basic server setup.""" - # Create server with basic configuration - server = EvalServer( - auth_key='hello', - host='127.0.0.1', - port=8080, - log_level='DEBUG', - log_dir='./logs', # Optional: create logs directory - ) - - # Set up client connection handler - @server.on_connect - async def handle_client(client): - print(f'๐Ÿ”— Client connected: {client.id}') - print(f' Tab ID: {client.tab_id}') - print(f' Capabilities: {client.capabilities}') - - # Send EXACTLY the same evaluation as NodeJS library-usage.js - try: - print('๐Ÿ”„ Starting evaluation...') - response = await client.evaluate({ - "id": "test_eval", - "name": "Capital of France", - "description": "Simple test evaluation", - "tool": "chat", - "input": { - "message": "What is the capital of France?" - } - }) - - print('โœ… Evaluation completed!') - print(f'๐Ÿ“Š Response: {response}') - - except Exception as e: - print(f'โŒ Evaluation failed: {e}') - - # Send a custom message - try: - await client.send_message({ - "type": "info", - "message": "Evaluation completed successfully!" - }) - except Exception as e: - print(f'โš ๏ธ Failed to send message: {e}') - - # Set up client disconnection handler - @server.on_disconnect - async def handle_disconnect(client_info): - print(f'๐Ÿ”Œ Client disconnected: {client_info["id"]}') - print(f' Connection duration: {client_info.get("duration", "unknown")}s') - - # Start the server - try: - await server.start() - print(f'๐Ÿš€ Server running on ws://{server.config.host}:{server.config.port}') - print(' Press Ctrl+C to stop the server') - - # Keep server running - await server.wait_closed() - - except KeyboardInterrupt: - print('\n๐Ÿ›‘ Received interrupt signal, stopping server...') - await server.stop() - print('โœ… Server stopped successfully') - - except Exception as e: - print(f'๐Ÿ’ฅ Server error: {e}') - if server.is_running(): - await server.stop() - - -if __name__ == "__main__": - # Check if logs directory exists, create if needed - Path("./logs").mkdir(exist_ok=True) - - try: - asyncio.run(main()) - except KeyboardInterrupt: - print('\n๐Ÿ‘‹ Goodbye!') - except Exception as e: - print(f'๐Ÿ’ฅ Fatal error: {e}') - sys.exit(1) \ No newline at end of file diff --git a/eval-server/python/examples/logs/.gitignore b/eval-server/python/examples/logs/.gitignore deleted file mode 100644 index 326f777..0000000 --- a/eval-server/python/examples/logs/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -*.log -*.jsonl \ No newline at end of file diff --git a/eval-server/python/examples/programmatic_evals.py b/eval-server/python/examples/programmatic_evals.py deleted file mode 100644 index 47e579d..0000000 --- a/eval-server/python/examples/programmatic_evals.py +++ /dev/null @@ -1,428 +0,0 @@ -#!/usr/bin/env python3 -""" -Programmatic evaluation creation example. - -This example demonstrates creating and customizing evaluations programmatically -in Python code, including dynamic evaluation generation and conditional logic. 
-""" - -import asyncio -import random -import sys -import time -from pathlib import Path -from typing import Dict, Any, List - -# Add src to path for local development -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from bo_eval_server import EvalServer, EvaluationStack - - -class EvaluationGenerator: - """Helper class for generating evaluations programmatically.""" - - def __init__(self): - self.counter = 0 - - def create_evaluation( - self, - name: str, - tool: str, - input_data: Dict[str, Any], - description: str = "", - metadata: Dict[str, Any] = None, - timeout: float = 30.0, - ) -> Dict[str, Any]: - """Create a standardized evaluation object.""" - self.counter += 1 - - return { - "id": f"generated_{self.counter:03d}_{int(time.time())}", - "name": name, - "description": description or f"Programmatically generated evaluation: {name}", - "tool": tool, - "input": input_data, - "timeout": timeout, - "metadata": { - "generated": True, - "timestamp": time.time(), - "generator": "programmatic_evals.py", - **(metadata or {}) - } - } - - def create_chat_evaluation( - self, - message: str, - name: str = None, - **kwargs - ) -> Dict[str, Any]: - """Create a chat-based evaluation.""" - return self.create_evaluation( - name=name or f"Chat: {message[:30]}...", - tool="chat", - input_data={"message": message}, - **kwargs - ) - - def create_action_evaluation( - self, - objective: str, - url: str = None, - name: str = None, - **kwargs - ) -> Dict[str, Any]: - """Create an action-based evaluation.""" - input_data = {"objective": objective} - if url: - input_data["url"] = url - - return self.create_evaluation( - name=name or f"Action: {objective[:30]}...", - tool="action", - input_data=input_data, - **kwargs - ) - - def create_research_evaluation( - self, - query: str, - depth: str = "basic", - name: str = None, - **kwargs - ) -> Dict[str, Any]: - """Create a research-based evaluation.""" - return self.create_evaluation( - name=name or f"Research: {query[:30]}...", - tool="research", - input_data={ - "query": query, - "depth": depth, - }, - **kwargs - ) - - -def create_dynamic_evaluations(generator: EvaluationGenerator) -> List[Dict[str, Any]]: - """Create evaluations based on dynamic logic.""" - evaluations = [] - - # Math evaluations with increasing difficulty - for i in range(3): - if i == 0: - a, b = random.randint(1, 10), random.randint(1, 10) - op = "+" - difficulty = "easy" - elif i == 1: - a, b = random.randint(10, 50), random.randint(10, 50) - op = "*" - difficulty = "medium" - else: - a, b = random.randint(100, 1000), random.randint(2, 20) - op = "/" - difficulty = "hard" - - evaluation = generator.create_chat_evaluation( - message=f"Calculate: {a} {op} {b}", - name=f"Math {difficulty.title()} #{i+1}", - metadata={ - "category": "mathematics", - "difficulty": difficulty, - "numbers": [a, b], - "operation": op - } - ) - evaluations.append(evaluation) - - # Conditional evaluations based on current time - current_hour = time.localtime().tm_hour - if 6 <= current_hour < 12: - time_context = "morning" - questions = [ - "What's a good breakfast recipe?", - "How can I boost my energy in the morning?", - ] - elif 12 <= current_hour < 18: - time_context = "afternoon" - questions = [ - "What's a healthy lunch option?", - "How can I stay productive in the afternoon?", - ] - else: - time_context = "evening" - questions = [ - "What's a good dinner recipe?", - "How can I relax in the evening?", - ] - - for i, question in enumerate(questions): - evaluation = 
generator.create_chat_evaluation( - message=question, - name=f"{time_context.title()} Question #{i+1}", - metadata={ - "category": "lifestyle", - "time_context": time_context, - "hour": current_hour - } - ) - evaluations.append(evaluation) - - # Generate research evaluations for trending topics - trending_topics = [ - "artificial intelligence trends 2024", - "sustainable energy solutions", - "space exploration recent developments", - ] - - for topic in trending_topics: - evaluation = generator.create_research_evaluation( - query=topic, - depth="detailed", - name=f"Research: {topic.title()}", - metadata={ - "category": "research", - "topic": topic, - "priority": "high" - }, - timeout=60.0 # Longer timeout for research - ) - evaluations.append(evaluation) - - return evaluations - - -async def main(): - """Main example function for programmatic evaluation creation.""" - print("๐Ÿญ Programmatic Evaluation Generation Example") - print("=" * 50) - - # Create evaluation generator - generator = EvaluationGenerator() - - # Create evaluation stack - stack = EvaluationStack() - - # Generate static evaluations - print("\n๐Ÿ“ Creating static evaluations...") - static_evals = [ - generator.create_chat_evaluation( - message="Explain quantum computing in simple terms", - name="Quantum Computing Explanation", - metadata={"category": "science", "complexity": "advanced"} - ), - generator.create_action_evaluation( - objective="Find and click the search button", - url="https://www.google.com", - name="Google Search Action", - metadata={"category": "web_automation", "site": "google"} - ), - generator.create_chat_evaluation( - message="Write a haiku about programming", - name="Programming Haiku", - metadata={"category": "creative", "format": "poetry"} - ), - ] - - for eval_obj in static_evals: - stack.push(eval_obj) - print(f" โž• {eval_obj['name']}") - - # Generate dynamic evaluations - print("\n๐ŸŽฒ Creating dynamic evaluations...") - dynamic_evals = create_dynamic_evaluations(generator) - - for eval_obj in dynamic_evals: - stack.push(eval_obj) - print(f" โž• {eval_obj['name']} (category: {eval_obj['metadata']['category']})") - - print(f"\n๐Ÿ“Š Total evaluations created: {stack.size()}") - - # Create server - server = EvalServer( - auth_key='programmatic-demo', - host='127.0.0.1', - port=8080, - log_level='INFO', - log_dir='./logs', - max_concurrent_evaluations=5, # Allow more concurrent evaluations - ) - - # Track evaluation results with detailed analysis - results = { - 'completed': [], - 'failed': [], - 'by_category': {}, - 'by_difficulty': {}, - 'timing': [], - } - - @server.on_connect - async def handle_client(client): - print(f'\n๐Ÿ”— Client connected: {client.id}') - print(f' Processing {stack.size()} evaluations...') - - start_time = time.time() - processed = 0 - - while not stack.is_empty(): - evaluation = stack.pop() - if not evaluation: - break - - processed += 1 - eval_start = time.time() - - print(f'\n๐Ÿ“‹ [{processed}] {evaluation["name"]}') - print(f' Category: {evaluation["metadata"].get("category", "unknown")}') - print(f' Tool: {evaluation["tool"]}') - - try: - # Use concurrency-limited evaluation - result = await server.evaluate_with_concurrency_limit( - client, - evaluation, - timeout=evaluation.get("timeout", 30.0) - ) - - eval_duration = time.time() - eval_start - - # Record successful result - result_record = { - 'evaluation': evaluation, - 'result': result, - 'duration': eval_duration, - 'client_id': client.id, - 'timestamp': time.time(), - } - 
results['completed'].append(result_record) - - # Update category stats - category = evaluation["metadata"].get("category", "unknown") - if category not in results['by_category']: - results['by_category'][category] = {'completed': 0, 'failed': 0} - results['by_category'][category]['completed'] += 1 - - # Update difficulty stats - difficulty = evaluation["metadata"].get("difficulty", "unknown") - if difficulty not in results['by_difficulty']: - results['by_difficulty'][difficulty] = {'completed': 0, 'failed': 0} - results['by_difficulty'][difficulty]['completed'] += 1 - - # Record timing - results['timing'].append(eval_duration) - - print(f' โœ… Completed in {eval_duration:.2f}s') - - # Show preview of response - if "output" in result and "response" in result["output"]: - response = result["output"]["response"] - preview = response[:150] + "..." if len(response) > 150 else response - print(f' ๐Ÿ’ฌ "{preview}"') - - except Exception as e: - eval_duration = time.time() - eval_start - - # Record failed result - failure_record = { - 'evaluation': evaluation, - 'error': str(e), - 'duration': eval_duration, - 'client_id': client.id, - 'timestamp': time.time(), - } - results['failed'].append(failure_record) - - # Update stats - category = evaluation["metadata"].get("category", "unknown") - if category not in results['by_category']: - results['by_category'][category] = {'completed': 0, 'failed': 0} - results['by_category'][category]['failed'] += 1 - - difficulty = evaluation["metadata"].get("difficulty", "unknown") - if difficulty not in results['by_difficulty']: - results['by_difficulty'][difficulty] = {'completed': 0, 'failed': 0} - results['by_difficulty'][difficulty]['failed'] += 1 - - print(f' โŒ Failed after {eval_duration:.2f}s: {e}') - - total_duration = time.time() - start_time - print(f'\n๐Ÿ Batch completed in {total_duration:.2f}s') - print(f' Processed: {processed}') - print(f' Success rate: {len(results["completed"])/processed*100:.1f}%') - - # Send detailed completion message - await client.send_message({ - "type": "batch_analysis", - "total_processed": processed, - "completed": len(results['completed']), - "failed": len(results['failed']), - "duration": total_duration, - "average_eval_time": sum(results['timing']) / len(results['timing']) if results['timing'] else 0, - "categories": list(results['by_category'].keys()), - }) - - @server.on_disconnect - async def handle_disconnect(client_info): - print(f'\n๐Ÿ”Œ Client disconnected: {client_info["id"]}') - - # Show detailed analysis - total = len(results['completed']) + len(results['failed']) - if total > 0: - print(f'\n๐Ÿ“ˆ Final Analysis:') - print(f' Total evaluations: {total}') - print(f' Successful: {len(results["completed"])} ({len(results["completed"])/total*100:.1f}%)') - print(f' Failed: {len(results["failed"])} ({len(results["failed"])/total*100:.1f}%)') - - if results['timing']: - avg_time = sum(results['timing']) / len(results['timing']) - min_time = min(results['timing']) - max_time = max(results['timing']) - print(f' Average time: {avg_time:.2f}s (min: {min_time:.2f}s, max: {max_time:.2f}s)') - - print(f'\n๐Ÿ“Š By Category:') - for category, stats in results['by_category'].items(): - total_cat = stats['completed'] + stats['failed'] - success_rate = stats['completed'] / total_cat * 100 if total_cat > 0 else 0 - print(f' {category}: {total_cat} total, {success_rate:.1f}% success') - - if any(results['by_difficulty'].values()): - print(f'\n๐ŸŽฏ By Difficulty:') - for difficulty, stats in results['by_difficulty'].items(): - 
if difficulty != "unknown": - total_diff = stats['completed'] + stats['failed'] - success_rate = stats['completed'] / total_diff * 100 if total_diff > 0 else 0 - print(f' {difficulty}: {total_diff} total, {success_rate:.1f}% success') - - # Start server - try: - await server.start() - print(f'\n๐Ÿš€ Server running on ws://{server.config.host}:{server.config.port}') - print(' Connect your agent client to start processing evaluations') - print(' Press Ctrl+C to stop the server') - - # Keep server running - await server.wait_closed() - - except KeyboardInterrupt: - print('\n๐Ÿ›‘ Received interrupt signal, stopping server...') - await server.stop() - print('โœ… Server stopped successfully') - - except Exception as e: - print(f'๐Ÿ’ฅ Server error: {e}') - if server.is_running(): - await server.stop() - - -if __name__ == "__main__": - # Ensure logs directory exists - Path("./logs").mkdir(exist_ok=True) - - try: - asyncio.run(main()) - except KeyboardInterrupt: - print('\n๐Ÿ‘‹ Goodbye!') - except Exception as e: - print(f'๐Ÿ’ฅ Fatal error: {e}') - sys.exit(1) \ No newline at end of file diff --git a/eval-server/python/examples/with_stack.py b/eval-server/python/examples/with_stack.py deleted file mode 100644 index f4b5d20..0000000 --- a/eval-server/python/examples/with_stack.py +++ /dev/null @@ -1,201 +0,0 @@ -#!/usr/bin/env python3 -""" -EvalServer with EvaluationStack example. - -This example demonstrates using an EvaluationStack to queue evaluations -and distribute them across multiple client connections. -""" - -import asyncio -import sys -from pathlib import Path - -# Add src to path for local development -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from bo_eval_server import EvalServer, EvaluationStack - - -def create_sample_evaluations(): - """Create evaluations matching NodeJS multiple-evals.js exactly.""" - evaluations = [ - { - "id": "math_eval", - "name": "Basic Math Problem", - "description": "Simple arithmetic evaluation", - "tool": "chat", - "input": { - "message": "What is 15 * 7 + 23? Please show your calculation steps." - } - }, - { - "id": "geography_eval", - "name": "Capital of France", - "description": "Geography knowledge test", - "tool": "chat", - "input": { - "message": "What is the capital of France?" - } - }, - { - "id": "creative_eval", - "name": "Creative Writing", - "description": "Short creative writing task", - "tool": "chat", - "input": { - "message": "Write a two-sentence story about a robot discovering friendship." - } - }, - { - "id": "tech_eval", - "name": "Technology Knowledge", - "description": "Basic technology concepts", - "tool": "chat", - "input": { - "message": "Explain what HTTP stands for and what it's used for in simple terms." 
- } - } - ] - return evaluations - - -async def main(): - """Main example function for evaluation stack usage.""" - # Create evaluation stack and populate it - stack = EvaluationStack() - sample_evaluations = create_sample_evaluations() - - print(f"๐Ÿ“š Created {len(sample_evaluations)} sample evaluations") - - # Add evaluations to stack (LIFO order) - for evaluation in sample_evaluations: - stack.push(evaluation) - print(f" โž• Added: {evaluation['name']}") - - print(f"๐Ÿ“Š Stack size: {stack.size()}") - print(f"๐Ÿ” Top evaluation: {stack.peek()['name'] if stack.peek() else 'None'}") - - # Create server - server = EvalServer( - auth_key='stack-demo', - host='127.0.0.1', - port=8080, - log_level='INFO', - log_dir='./logs', - ) - - # Track processed evaluations - completed_evaluations = [] - failed_evaluations = [] - - @server.on_connect - async def handle_client(client): - print('๐ŸŽ‰ CLIENT CONNECTED!') - print(f' - Client ID: {client.id}') - print(f' - Client tabId: {client.tab_id}') - print(f' - Client info: {client.get_info()}') - - # Check if we have evaluations left in the stack - if stack.is_empty(): - print('โš ๏ธ No more evaluations in stack for this client') - print(' Consider refilling the stack or handling this scenario') - return - - # Pop the next evaluation from the stack (ONE evaluation per client!) - evaluation = stack.pop() - print(f'๐Ÿ“‹ Assigning evaluation: "{evaluation["name"]}" ({evaluation["id"]})') - print(f'๐Ÿ“Š Remaining evaluations in stack: {stack.size()}') - - try: - print('๐Ÿ”„ Starting evaluation...') - result = await client.evaluate(evaluation) - - print('โœ… Evaluation completed!') - print(f'๐Ÿ“Š Response for "{evaluation["name"]}": {result}') - - completed_evaluations.append({ - 'client_id': client.id, - 'evaluation': evaluation, - 'result': result, - }) - - except Exception as e: - print(f'โŒ Evaluation "{evaluation["name"]}" failed: {e}') - - failed_evaluations.append({ - 'client_id': client.id, - 'evaluation': evaluation, - 'error': str(e), - }) - - # Send completion message - try: - await client.send_message({ - "type": "evaluation_complete", - "evaluation_id": evaluation["id"], - "evaluation_name": evaluation["name"], - "status": "completed" if evaluation["id"] not in [e['evaluation']['id'] for e in failed_evaluations] else "failed" - }) - except Exception as e: - print(f' โš ๏ธ Failed to send completion message: {e}') - - @server.on_disconnect - async def handle_disconnect(client_info): - print(f'\n๐Ÿ”Œ Client disconnected: {client_info["id"]}') - - # Show final statistics - total_completed = len(completed_evaluations) - total_failed = len(failed_evaluations) - remaining = stack.size() - - print(f'\n๐Ÿ“Š Final Statistics:') - print(f' โœ… Completed: {total_completed}') - print(f' โŒ Failed: {total_failed}') - print(f' ๐Ÿ“š Remaining: {remaining}') - - if completed_evaluations: - print(f'\n๐ŸŽฏ Completed Evaluations:') - for item in completed_evaluations: - eval_name = item['evaluation']['name'] - client_id = item['client_id'][:8] # Short client ID - print(f' โ€ข {eval_name} (client: {client_id})') - - if failed_evaluations: - print(f'\n๐Ÿ’ฅ Failed Evaluations:') - for item in failed_evaluations: - eval_name = item['evaluation']['name'] - error = item['error'] - print(f' โ€ข {eval_name}: {error}') - - # Start server - try: - await server.start() - print(f'\n๐Ÿš€ Server running on ws://{server.config.host}:{server.config.port}') - print(' Connect your agent client to start processing evaluations') - print(' Press Ctrl+C to stop the server') - - # 
Keep server running - await server.wait_closed() - - except KeyboardInterrupt: - print('\n๐Ÿ›‘ Received interrupt signal, stopping server...') - await server.stop() - print('โœ… Server stopped successfully') - - except Exception as e: - print(f'๐Ÿ’ฅ Server error: {e}') - if server.is_running(): - await server.stop() - - -if __name__ == "__main__": - # Ensure logs directory exists - Path("./logs").mkdir(exist_ok=True) - - try: - asyncio.run(main()) - except KeyboardInterrupt: - print('\n๐Ÿ‘‹ Goodbye!') - except Exception as e: - print(f'๐Ÿ’ฅ Fatal error: {e}') - sys.exit(1) \ No newline at end of file diff --git a/eval-server/python/logs/.gitignore b/eval-server/python/logs/.gitignore deleted file mode 100644 index 326f777..0000000 --- a/eval-server/python/logs/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -*.log -*.jsonl \ No newline at end of file diff --git a/eval-server/python/pyproject.toml b/eval-server/python/pyproject.toml deleted file mode 100644 index 83d30ee..0000000 --- a/eval-server/python/pyproject.toml +++ /dev/null @@ -1,84 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0", "wheel"] -build-backend = "setuptools.build_meta" - -[project] -name = "bo-eval-server" -version = "1.0.0" -description = "WebSocket server for evaluating LLM agents - Python implementation" -readme = "README.md" -license = {text = "MIT"} -authors = [ - {name = "Browser Operator Team"} -] -classifiers = [ - "Development Status :: 4 - Beta", - "Intended Audience :: Developers", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Topic :: Software Development :: Libraries :: Python Modules", - "Topic :: System :: Networking", -] -keywords = ["websocket", "llm", "evaluation", "rpc", "library", "programmatic"] -requires-python = ">=3.8" -dependencies = [ - "websockets>=11.0.0", - "loguru>=0.7.0", - "pandas>=2.0.0", - "requests>=2.31.0", -] - -[project.optional-dependencies] -dev = [ - "pytest>=7.0.0", - "pytest-asyncio>=0.21.0", - "black>=23.0.0", - "mypy>=1.0.0", -] - -[project.urls] -Homepage = "https://github.com/chromium/devtools-frontend" -Repository = "https://github.com/chromium/devtools-frontend" -Issues = "https://github.com/chromium/devtools-frontend/issues" - -[project.scripts] -bo-eval-basic = "scripts:run_basic_server" -bo-eval-stack = "scripts:run_with_stack" -bo-eval-programmatic = "scripts:run_programmatic_evals" - -[tool.setuptools.packages.find] -where = ["src"] - -[tool.setuptools.package-data] -"*" = ["*.md", "*.txt", "*.yaml", "*.json"] - -[tool.black] -line-length = 88 -target-version = ['py38'] - -[tool.mypy] -python_version = "3.8" -warn_return_any = true -warn_unused_configs = true -disallow_untyped_defs = true - -[tool.pytest.ini_options] -asyncio_mode = "auto" -testpaths = ["tests"] -python_files = ["test_*.py"] -python_classes = ["Test*"] -python_functions = ["test_*"] - -[dependency-groups] -dev = [ - "black>=24.8.0", - "mypy>=1.14.1", - "pytest>=8.3.5", - "pytest-asyncio>=0.24.0", -] diff --git a/eval-server/python/quick_test.py b/eval-server/python/quick_test.py deleted file mode 100644 index 5bf5b9a..0000000 --- a/eval-server/python/quick_test.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python3 -"""Quick test to see what's happening with the server.""" - -import asyncio -import json 
-import websockets - -async def test_server(): - print("๐Ÿ”— Testing server connection...") - try: - async with websockets.connect('ws://127.0.0.1:8080') as ws: - print("โœ… Connected to server") - - # Wait for welcome message - print("โณ Waiting for welcome message...") - welcome = await asyncio.wait_for(ws.recv(), timeout=5.0) - print(f"๐Ÿ“ฅ Welcome: {welcome}") - - # Send registration - registration = { - "type": "register", - "clientId": "test-client-123", - "secretKey": "hello", - "capabilities": ["chat"] - } - print(f"๐Ÿ“ค Sending registration: {json.dumps(registration)}") - await ws.send(json.dumps(registration)) - - # Wait for ack - print("โณ Waiting for registration ack...") - ack = await asyncio.wait_for(ws.recv(), timeout=5.0) - print(f"๐Ÿ“ฅ Registration ack: {ack}") - - except Exception as e: - print(f"โŒ Error: {e}") - -if __name__ == "__main__": - asyncio.run(test_server()) \ No newline at end of file diff --git a/eval-server/python/requirements.txt b/eval-server/python/requirements.txt deleted file mode 100644 index e9fc8ca..0000000 --- a/eval-server/python/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ -# Core dependencies -websockets>=11.0.0 -loguru>=0.7.0 - -# Development dependencies (optional) -# Install with: pip install -e ".[dev]" -# pytest>=7.0.0 -# pytest-asyncio>=0.21.0 -# black>=23.0.0 -# mypy>=1.0.0 \ No newline at end of file diff --git a/eval-server/python/run.py b/eval-server/python/run.py deleted file mode 100644 index 407cd68..0000000 --- a/eval-server/python/run.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env python3 -""" -Simple script runner for bo-eval-server examples. - -Usage: - python run.py basic # Run basic server example - python run.py stack # Run evaluation stack example - python run.py prog # Run programmatic evaluations example - python run.py all # Show all available examples -""" - -import subprocess -import sys -from pathlib import Path - - -def run_with_uv(script_path: str, description: str): - """Run a Python script using uv.""" - print(f"๐Ÿš€ {description}") - print(f" Running: uv run python {script_path}") - print("-" * 50) - - try: - # Ensure logs directory exists - logs_dir = Path("logs") - logs_dir.mkdir(exist_ok=True) - - # Run the script with uv - result = subprocess.run([ - "uv", "run", "python", script_path - ], cwd=Path(__file__).parent) - - return result.returncode - - except KeyboardInterrupt: - print("\n๐Ÿ›‘ Interrupted by user") - return 130 - except FileNotFoundError: - print("โŒ Error: 'uv' command not found. 
Please install uv first:") - print(" curl -LsSf https://astral.sh/uv/install.sh | sh") - return 1 - except Exception as e: - print(f"๐Ÿ’ฅ Error running script: {e}") - return 1 - - -def show_examples(): - """Show all available examples.""" - print("๐Ÿ“š Available Examples:") - print() - print("๐Ÿ”ง basic - Basic WebSocket server setup") - print(" Simple server that connects to one client and runs a single evaluation") - print() - print("๐Ÿ“š stack - Evaluation stack usage") - print(" Demonstrates LIFO queue for managing multiple evaluations") - print() - print("๐Ÿญ prog - Programmatic evaluation creation") - print(" Advanced example with dynamic evaluation generation and analytics") - print() - print("Usage:") - print(" python run.py basic") - print(" python run.py stack") - print(" python run.py prog") - print() - print("Or with uv directly:") - print(" uv run python examples/basic_server.py") - print(" uv run python examples/with_stack.py") - print(" uv run python examples/programmatic_evals.py") - - -def main(): - """Main entry point.""" - if len(sys.argv) != 2: - print("Usage: python run.py [basic|stack|prog|all]") - print(" python run.py all # Show all examples") - sys.exit(1) - - command = sys.argv[1].lower() - - examples = { - "basic": ("examples/basic_server.py", "Basic WebSocket Server Example"), - "stack": ("examples/with_stack.py", "Evaluation Stack Example"), - "prog": ("examples/programmatic_evals.py", "Programmatic Evaluations Example"), - "programmatic": ("examples/programmatic_evals.py", "Programmatic Evaluations Example"), - } - - if command == "all": - show_examples() - return 0 - elif command in examples: - script_path, description = examples[command] - return run_with_uv(script_path, description) - else: - print(f"โŒ Unknown command: {command}") - print("Available commands: basic, stack, prog, all") - return 1 - - -if __name__ == "__main__": - sys.exit(main()) \ No newline at end of file diff --git a/eval-server/python/scripts.py b/eval-server/python/scripts.py deleted file mode 100644 index b57377d..0000000 --- a/eval-server/python/scripts.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python3 -""" -Script runner for bo-eval-server examples using uv. - -This module provides entry points for running examples with uv. 
-""" - -import asyncio -import sys -from pathlib import Path - -# Add the examples directory to path -examples_dir = Path(__file__).parent / "examples" -sys.path.insert(0, str(examples_dir)) - - -def run_basic_server(): - """Run the basic server example.""" - from examples.basic_server import main - try: - asyncio.run(main()) - except KeyboardInterrupt: - print('\n๐Ÿ‘‹ Goodbye!') - except Exception as e: - print(f'๐Ÿ’ฅ Error: {e}') - sys.exit(1) - - -def run_with_stack(): - """Run the evaluation stack example.""" - from examples.with_stack import main - try: - asyncio.run(main()) - except KeyboardInterrupt: - print('\n๐Ÿ‘‹ Goodbye!') - except Exception as e: - print(f'๐Ÿ’ฅ Error: {e}') - sys.exit(1) - - -def run_programmatic_evals(): - """Run the programmatic evaluations example.""" - from examples.programmatic_evals import main - try: - asyncio.run(main()) - except KeyboardInterrupt: - print('\n๐Ÿ‘‹ Goodbye!') - except Exception as e: - print(f'๐Ÿ’ฅ Error: {e}') - sys.exit(1) - - -if __name__ == "__main__": - if len(sys.argv) != 2: - print("Usage: python scripts.py [basic|stack|programmatic]") - sys.exit(1) - - script = sys.argv[1] - if script == "basic": - run_basic_server() - elif script == "stack": - run_with_stack() - elif script == "programmatic": - run_programmatic_evals() - else: - print(f"Unknown script: {script}") - print("Available scripts: basic, stack, programmatic") - sys.exit(1) \ No newline at end of file diff --git a/eval-server/python/src/bo_eval_server/__init__.py b/eval-server/python/src/bo_eval_server/__init__.py deleted file mode 100644 index 3a8b6aa..0000000 --- a/eval-server/python/src/bo_eval_server/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -""" -bo-eval-server: A minimal Python library for WebSocket-based LLM agent evaluation servers. - -This package provides core functionality for: -- WebSocket server for agent connections -- JSON-RPC 2.0 bidirectional communication -- Evaluation stack for managing evaluation queues -- Enhanced logging and client management -""" - -from .eval_server import EvalServer -from .evaluation_stack import EvaluationStack -from .client_manager import ClientManager, ClientProxy -from .rpc_client import RpcClient -from .config import Config -from .logger import setup_logger - -__version__ = "1.0.0" -__author__ = "Browser Operator Team" - -__all__ = [ - "EvalServer", - "EvaluationStack", - "ClientManager", - "ClientProxy", - "RpcClient", - "Config", - "setup_logger", -] \ No newline at end of file diff --git a/eval-server/python/src/bo_eval_server/client_manager.py b/eval-server/python/src/bo_eval_server/client_manager.py deleted file mode 100644 index 280f676..0000000 --- a/eval-server/python/src/bo_eval_server/client_manager.py +++ /dev/null @@ -1,401 +0,0 @@ -""" -Client management for WebSocket connections. - -Handles client registration, authentication, and provides a proxy interface -for interacting with connected agents. 
-""" - -import asyncio -import json -import time -import uuid -from typing import Dict, Any, Optional, List, Callable, Awaitable - -import websockets -from loguru import logger - -from .rpc_client import RpcClient, RpcError, RpcTimeoutError -from .logger import log_connection, log_evaluation - - -class ClientProxy: - """Proxy object for interacting with a connected agent.""" - - def __init__( - self, - client_id: str, - websocket: websockets.WebSocketServerProtocol, - rpc_client: RpcClient, - tab_id: Optional[str] = None, - base_client_id: Optional[str] = None, - capabilities: Optional[List[str]] = None, - ): - """ - Initialize client proxy. - - Args: - client_id: Unique client identifier - websocket: WebSocket connection - rpc_client: RPC client for method calls - tab_id: Browser tab ID (if applicable) - base_client_id: Base client ID for grouping - capabilities: List of agent capabilities - """ - self.id = client_id - self.tab_id = tab_id - self.base_client_id = base_client_id or client_id - self.capabilities = capabilities or [] - self._websocket = websocket - self._rpc_client = rpc_client - self._connected_at = time.time() - - async def evaluate( - self, - evaluation: Dict[str, Any], - timeout: Optional[float] = None, - ) -> Dict[str, Any]: - """ - Execute an evaluation on the connected agent. - - Args: - evaluation: Evaluation object with required fields - timeout: Optional timeout override - - Returns: - Evaluation result from the agent - - Raises: - ValueError: If evaluation is invalid - RpcError: If the RPC call fails - RpcTimeoutError: If the call times out - """ - # Validate evaluation object - required_fields = ['id', 'name', 'tool', 'input'] - for field in required_fields: - if field not in evaluation: - raise ValueError(f"Evaluation missing required field: {field}") - - evaluation_id = evaluation['id'] - start_time = time.time() - - try: - # Log evaluation start - log_evaluation( - evaluation_id=evaluation_id, - client_id=self.id, - status="started", - evaluation_name=evaluation.get('name'), - tool=evaluation.get('tool'), - ) - - # Make RPC call to agent - result = await self._rpc_client.call( - method="evaluate", - params=evaluation, - timeout=timeout, - client_id=self.id, - ) - - duration = time.time() - start_time - - # Log evaluation completion - log_evaluation( - evaluation_id=evaluation_id, - client_id=self.id, - status="completed", - duration=duration, - evaluation_name=evaluation.get('name'), - tool=evaluation.get('tool'), - ) - - return result - - except RpcTimeoutError: - duration = time.time() - start_time - log_evaluation( - evaluation_id=evaluation_id, - client_id=self.id, - status="timeout", - duration=duration, - evaluation_name=evaluation.get('name'), - tool=evaluation.get('tool'), - ) - raise - - except Exception as e: - duration = time.time() - start_time - log_evaluation( - evaluation_id=evaluation_id, - client_id=self.id, - status="failed", - duration=duration, - error=str(e), - evaluation_name=evaluation.get('name'), - tool=evaluation.get('tool'), - ) - raise - - async def send_message(self, message: Dict[str, Any]) -> None: - """ - Send a custom message to the connected agent. - - Args: - message: Message object to send - """ - try: - await self._websocket.send(json.dumps(message)) - except Exception as e: - logger.error(f"Failed to send message to client {self.id}: {e}") - raise - - def get_info(self) -> Dict[str, Any]: - """ - Get client information. 
- - Returns: - Dictionary with client details - """ - return { - 'id': self.id, - 'tab_id': self.tab_id, - 'base_client_id': self.base_client_id, - 'capabilities': self.capabilities, - 'connected_at': self._connected_at, - 'connected': self._rpc_client.is_connected(), - } - - def is_connected(self) -> bool: - """Check if the client is still connected.""" - return self._rpc_client.is_connected() - - def __repr__(self) -> str: - """String representation of the client proxy.""" - return f"ClientProxy(id={self.id}, connected={self.is_connected()})" - - -class ClientManager: - """Manages WebSocket client connections and authentication.""" - - def __init__(self, auth_key: str, rpc_timeout: float = 1500.0): - """ - Initialize client manager. - - Args: - auth_key: Required authentication key for clients - rpc_timeout: Default RPC timeout in seconds - """ - self.auth_key = auth_key - self.rpc_timeout = rpc_timeout - self._clients: Dict[str, ClientProxy] = {} - self._pending_connections: Dict[str, Dict[str, Any]] = {} - - # Event handlers - self._on_connect_handler: Optional[Callable[[ClientProxy], Awaitable[None]]] = None - self._on_disconnect_handler: Optional[Callable[[Dict[str, Any]], Awaitable[None]]] = None - - def on_connect(self, handler: Callable[[ClientProxy], Awaitable[None]]) -> None: - """Set the handler for client connections.""" - self._on_connect_handler = handler - - def on_disconnect(self, handler: Callable[[Dict[str, Any]], Awaitable[None]]) -> None: - """Set the handler for client disconnections.""" - self._on_disconnect_handler = handler - - async def handle_connection(self, websocket: websockets.WebSocketServerProtocol) -> None: - """ - Handle a new WebSocket connection - matches NodeJS EvalServer flow. - - Args: - websocket: WebSocket connection - """ - connection_id = str(uuid.uuid4()) - client_proxy: Optional[ClientProxy] = None - - try: - # Send welcome message immediately (like NodeJS) - welcome_message = { - 'type': 'welcome', - 'serverId': 'python-eval-server-001', - 'version': '1.0.0', - 'timestamp': time.time() - } - logger.debug(f"Sending welcome message to connection {connection_id}") - await websocket.send(json.dumps(welcome_message)) - - # Wait for registration message - client_proxy = await self._authenticate_client(websocket, connection_id) - - if client_proxy: - # Start RPC client - await client_proxy._rpc_client.start() - - # Add to active clients - self._clients[client_proxy.id] = client_proxy - - # Call connection handler - if self._on_connect_handler: - await self._on_connect_handler(client_proxy) - - # Keep connection alive until closed - await client_proxy._rpc_client._message_handler_task - - except websockets.exceptions.ConnectionClosed: - logger.debug(f"WebSocket connection closed: {connection_id}") - except Exception as e: - logger.error(f"Error handling connection {connection_id}: {e}") - finally: - # Clean up on disconnect - if client_proxy: - await self._handle_disconnect(client_proxy) - - async def _authenticate_client( - self, - websocket: websockets.WebSocketServerProtocol, - connection_id: str, - ) -> Optional[ClientProxy]: - """Authenticate and register a client connection - matches NodeJS implementation.""" - try: - logger.debug(f"Waiting for registration message from connection {connection_id}") - # Wait for registration message with timeout - message = await asyncio.wait_for(websocket.recv(), timeout=30.0) - logger.debug(f"Received message from {connection_id}: {message}") - data = json.loads(message) - - if data.get('type') != 'register': - 
logger.warning(f"Invalid first message from {connection_id}: expected 'register', got '{data.get('type')}'") - await websocket.send(json.dumps({ - 'type': 'registration_ack', - 'status': 'rejected', - 'message': 'First message must be registration' - })) - return None - - # Auto-accept clients like NodeJS does (NodeJS auto-creates client configs) - # For simplicity, we'll accept any client with the correct secret key or no secret key - if 'secretKey' in data: - if data.get('secretKey') != self.auth_key: - logger.warning(f"Invalid auth key from {connection_id}: expected '{self.auth_key}', got '{data.get('secretKey')}'") - await websocket.send(json.dumps({ - 'type': 'registration_ack', - 'clientId': data.get('clientId', str(uuid.uuid4())), - 'status': 'rejected', - 'message': 'Invalid authentication key' - })) - return None - else: - logger.debug(f"Valid secret key provided by {connection_id}") - else: - logger.debug(f"No secret key provided by {connection_id}, accepting anyway") - - client_id = data.get('clientId', str(uuid.uuid4())) - tab_id = data.get('tabId') - base_client_id = data.get('baseClientId') - capabilities = data.get('capabilities', []) - - logger.info(f"Registering client {client_id} from connection {connection_id}") - logger.debug(f"Client capabilities: {capabilities}") - - # Send registration acknowledgment - registration_response = { - 'type': 'registration_ack', - 'clientId': client_id, - 'status': 'accepted', - 'message': 'Client registered successfully' - } - logger.debug(f"Sending registration ack to {client_id}: {registration_response}") - await websocket.send(json.dumps(registration_response)) - - # Wait for ready signal - logger.debug(f"Waiting for ready signal from client {client_id}") - ready_message = await asyncio.wait_for(websocket.recv(), timeout=30.0) - logger.debug(f"Received ready message from {client_id}: {ready_message}") - ready_data = json.loads(ready_message) - - if ready_data.get('type') != 'ready': - logger.warning(f"Invalid ready message from {client_id}: expected 'ready', got '{ready_data.get('type')}'") - await websocket.send(json.dumps({ - 'type': 'error', - 'message': 'Expected ready signal after registration' - })) - return None - - logger.info(f"Client {client_id} is ready for evaluations") - - # Create RPC client and proxy - rpc_client = RpcClient(websocket, self.rpc_timeout) - client_proxy = ClientProxy( - client_id=client_id, - websocket=websocket, - rpc_client=rpc_client, - tab_id=tab_id, - base_client_id=base_client_id, - capabilities=capabilities, - ) - - # Log successful connection - log_connection( - event="connect", - client_id=client_id, - tab_id=tab_id, - base_client_id=base_client_id, - capabilities=capabilities, - ) - - return client_proxy - - except asyncio.TimeoutError: - logger.warning(f"Client registration timeout: {connection_id}") - return None - except json.JSONDecodeError: - logger.warning(f"Invalid JSON in registration: {connection_id}") - return None - except Exception as e: - logger.error(f"Error during client authentication: {e}") - return None - - async def _handle_disconnect(self, client_proxy: ClientProxy) -> None: - """Handle client disconnection cleanup.""" - client_id = client_proxy.id - - # Remove from active clients - self._clients.pop(client_id, None) - - # Stop RPC client - await client_proxy._rpc_client.stop() - - # Get client info for disconnect handler - client_info = client_proxy.get_info() - - # Log disconnection - log_connection( - event="disconnect", - client_id=client_id, - tab_id=client_proxy.tab_id, 
- base_client_id=client_proxy.base_client_id, - ) - - # Call disconnect handler - if self._on_disconnect_handler: - try: - await self._on_disconnect_handler(client_info) - except Exception as e: - logger.error(f"Error in disconnect handler: {e}") - - def get_clients(self) -> List[ClientProxy]: - """Get list of connected clients.""" - return list(self._clients.values()) - - def get_client(self, client_id: str) -> Optional[ClientProxy]: - """Get a specific client by ID.""" - return self._clients.get(client_id) - - def get_status(self) -> Dict[str, Any]: - """Get client manager status.""" - return { - 'connected_clients': len(self._clients), - 'client_ids': list(self._clients.keys()), - } - - def __repr__(self) -> str: - """String representation of the client manager.""" - return f"ClientManager(clients={len(self._clients)})" \ No newline at end of file diff --git a/eval-server/python/src/bo_eval_server/config.py b/eval-server/python/src/bo_eval_server/config.py deleted file mode 100644 index 46e72b9..0000000 --- a/eval-server/python/src/bo_eval_server/config.py +++ /dev/null @@ -1,75 +0,0 @@ -""" -Configuration management for bo-eval-server. - -Handles server configuration with environment variable support. -""" - -import os -from typing import Optional - - -class Config: - """Configuration class for EvalServer with environment variable support.""" - - def __init__( - self, - host: Optional[str] = None, - port: Optional[int] = None, - auth_key: Optional[str] = None, - log_level: Optional[str] = None, - rpc_timeout: Optional[float] = None, - max_concurrent_evaluations: Optional[int] = None, - ): - """ - Initialize configuration with optional overrides. - - Args: - host: Server host (default: localhost) - port: Server port (default: 8080) - auth_key: Authentication key for clients - log_level: Logging level (default: INFO) - rpc_timeout: RPC call timeout in seconds (default: 1500.0) - max_concurrent_evaluations: Max concurrent evaluations (default: 10) - """ - self.host = host or os.getenv('BO_EVAL_SERVER_HOST', 'localhost') - self.port = int(port or os.getenv('BO_EVAL_SERVER_PORT', '8080')) - self.auth_key = auth_key or os.getenv('BO_EVAL_SERVER_AUTH_KEY') - self.log_level = log_level or os.getenv('BO_EVAL_SERVER_LOG_LEVEL', 'INFO') - self.rpc_timeout = float( - rpc_timeout or os.getenv('BO_EVAL_SERVER_RPC_TIMEOUT', '1500.0') - ) - self.max_concurrent_evaluations = int( - max_concurrent_evaluations or - os.getenv('BO_EVAL_SERVER_MAX_CONCURRENT', '10') - ) - - def validate(self) -> None: - """Validate configuration parameters.""" - if not self.auth_key: - raise ValueError("auth_key is required for server authentication") - - if not isinstance(self.port, int) or self.port <= 0 or self.port > 65535: - raise ValueError(f"Invalid port: {self.port}") - - if self.rpc_timeout <= 0: - raise ValueError(f"Invalid RPC timeout: {self.rpc_timeout}") - - if self.max_concurrent_evaluations <= 0: - raise ValueError( - f"Invalid max_concurrent_evaluations: {self.max_concurrent_evaluations}" - ) - - def to_dict(self) -> dict: - """Convert configuration to dictionary.""" - return { - 'host': self.host, - 'port': self.port, - 'auth_key': '***' if self.auth_key else None, # Hide sensitive data - 'log_level': self.log_level, - 'rpc_timeout': self.rpc_timeout, - 'max_concurrent_evaluations': self.max_concurrent_evaluations, - } - - def __repr__(self) -> str: - """String representation of configuration.""" - return f"Config({self.to_dict()})" \ No newline at end of file diff --git 
a/eval-server/python/src/bo_eval_server/eval_server.py b/eval-server/python/src/bo_eval_server/eval_server.py deleted file mode 100644 index 9f6ccb7..0000000 --- a/eval-server/python/src/bo_eval_server/eval_server.py +++ /dev/null @@ -1,292 +0,0 @@ -""" -EvalServer - Main WebSocket server for LLM agent evaluations. - -A library-first evaluation server that accepts connections from AI agents, -sends them evaluation tasks via RPC calls, and collects their responses. -""" - -import asyncio -from typing import Dict, Any, Optional, Callable, Awaitable, List - -import websockets -from loguru import logger - -from .config import Config -from .client_manager import ClientManager, ClientProxy -from .logger import setup_logger, log_server_event - - -class EvalServer: - """ - Main evaluation server class for managing WebSocket connections and evaluations. - - Example usage: - ```python - server = EvalServer( - auth_key='your-secret-key', - host='127.0.0.1', - port=8080 - ) - - @server.on_connect - async def handle_client(client): - print(f'Client connected: {client.id}') - - result = await client.evaluate({ - "id": "test_eval", - "name": "Test Evaluation", - "tool": "chat", - "input": {"message": "Hello world"} - }) - - print(f'Response: {result}') - - await server.start() - await server.wait_closed() - ``` - """ - - def __init__( - self, - auth_key: str, - host: str = 'localhost', - port: int = 8080, - rpc_timeout: float = 1500.0, - log_level: str = 'INFO', - log_dir: Optional[str] = None, - max_concurrent_evaluations: int = 10, - ): - """ - Initialize the evaluation server. - - Args: - auth_key: Required authentication key for client connections - host: Server host address - port: Server port number - rpc_timeout: Default timeout for RPC calls in seconds - log_level: Logging level (DEBUG, INFO, WARNING, ERROR) - log_dir: Directory for log files (optional) - max_concurrent_evaluations: Maximum concurrent evaluations - """ - # Create and validate configuration - self.config = Config( - host=host, - port=port, - auth_key=auth_key, - log_level=log_level, - rpc_timeout=rpc_timeout, - max_concurrent_evaluations=max_concurrent_evaluations, - ) - self.config.validate() - - # Setup logging - setup_logger( - log_level=self.config.log_level, - log_dir=log_dir, - ) - - # Initialize client manager - self.client_manager = ClientManager( - auth_key=self.config.auth_key, - rpc_timeout=self.config.rpc_timeout, - ) - - # Server state - self._server: Optional[websockets.WebSocketServer] = None - self._running = False - self._start_time: Optional[float] = None - - # Evaluation concurrency control - self._evaluation_semaphore = asyncio.Semaphore( - self.config.max_concurrent_evaluations - ) - - def on_connect(self, handler: Callable[[ClientProxy], Awaitable[None]]) -> Callable: - """ - Decorator to set the client connection handler. - - Args: - handler: Async function to call when a client connects - - Returns: - The handler function (for decorator use) - """ - self.client_manager.on_connect(handler) - return handler - - def on_disconnect(self, handler: Callable[[Dict[str, Any]], Awaitable[None]]) -> Callable: - """ - Decorator to set the client disconnection handler. - - Args: - handler: Async function to call when a client disconnects - - Returns: - The handler function (for decorator use) - """ - self.client_manager.on_disconnect(handler) - return handler - - async def start(self) -> None: - """ - Start the WebSocket server. 
- - Raises: - RuntimeError: If server is already running - OSError: If unable to bind to the specified host/port - """ - if self._running: - raise RuntimeError("Server is already running") - - try: - logger.info(f"Starting EvalServer on {self.config.host}:{self.config.port}") - - # Start WebSocket server - self._server = await websockets.serve( - self.client_manager.handle_connection, - self.config.host, - self.config.port, - ping_interval=20, - ping_timeout=20, - close_timeout=10, - ) - - self._running = True - self._start_time = asyncio.get_event_loop().time() - - log_server_event( - event="start", - host=self.config.host, - port=self.config.port, - config=self.config.to_dict(), - ) - - logger.info(f"EvalServer started successfully on ws://{self.config.host}:{self.config.port}") - - except Exception as e: - logger.error(f"Failed to start server: {e}") - log_server_event(event="start_failed", error=str(e)) - raise - - async def stop(self) -> None: - """ - Stop the WebSocket server. - - Raises: - RuntimeError: If server is not running - """ - if not self._running: - raise RuntimeError("Server is not running") - - try: - logger.info("Stopping EvalServer...") - - if self._server: - self._server.close() - await self._server.wait_closed() - - self._running = False - self._start_time = None - - log_server_event(event="stop") - logger.info("EvalServer stopped successfully") - - except Exception as e: - logger.error(f"Error stopping server: {e}") - log_server_event(event="stop_failed", error=str(e)) - raise - - async def wait_closed(self) -> None: - """ - Wait for the server to be closed. - - This method blocks until the server is stopped, useful for keeping - the server running in the main program. - """ - if not self._running or not self._server: - return - - try: - await self._server.wait_closed() - except Exception as e: - logger.error(f"Error waiting for server closure: {e}") - - def get_status(self) -> Dict[str, Any]: - """ - Get server status information. - - Returns: - Dictionary with server status details - """ - uptime = None - if self._running and self._start_time: - uptime = asyncio.get_event_loop().time() - self._start_time - - return { - 'running': self._running, - 'host': self.config.host, - 'port': self.config.port, - 'uptime': uptime, - 'config': self.config.to_dict(), - 'clients': self.client_manager.get_status(), - } - - def get_clients(self) -> List[ClientProxy]: - """ - Get list of connected clients. - - Returns: - List of ClientProxy objects - """ - return self.client_manager.get_clients() - - def get_client(self, client_id: str) -> Optional[ClientProxy]: - """ - Get a specific client by ID. - - Args: - client_id: Client identifier - - Returns: - ClientProxy object or None if not found - """ - return self.client_manager.get_client(client_id) - - async def evaluate_with_concurrency_limit( - self, - client: ClientProxy, - evaluation: Dict[str, Any], - timeout: Optional[float] = None, - ) -> Dict[str, Any]: - """ - Execute an evaluation with concurrency limiting. 
- - Args: - client: Client to execute evaluation on - evaluation: Evaluation object - timeout: Optional timeout override - - Returns: - Evaluation result - """ - async with self._evaluation_semaphore: - return await client.evaluate(evaluation, timeout) - - def is_running(self) -> bool: - """Check if the server is currently running.""" - return self._running - - def __repr__(self) -> str: - """String representation of the server.""" - status = "running" if self._running else "stopped" - return f"EvalServer(status={status}, host={self.config.host}, port={self.config.port})" - - async def __aenter__(self): - """Async context manager entry.""" - await self.start() - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - """Async context manager exit.""" - if self._running: - await self.stop() \ No newline at end of file diff --git a/eval-server/python/src/bo_eval_server/evaluation_stack.py b/eval-server/python/src/bo_eval_server/evaluation_stack.py deleted file mode 100644 index 1ad5078..0000000 --- a/eval-server/python/src/bo_eval_server/evaluation_stack.py +++ /dev/null @@ -1,102 +0,0 @@ -""" -EvaluationStack - A simple stack-like structure for managing evaluations. - -Provides LIFO (Last In, First Out) access to evaluation objects. -Useful for distributing different evaluations across multiple client connections. -""" - -from typing import Dict, Any, List, Optional - - -class EvaluationStack: - """A LIFO stack for managing evaluation objects.""" - - def __init__(self) -> None: - """Initialize an empty evaluation stack.""" - self._evaluations: List[Dict[str, Any]] = [] - - def push(self, evaluation: Dict[str, Any]) -> None: - """ - Add an evaluation to the top of the stack. - - Args: - evaluation: The evaluation object to add - - Raises: - ValueError: If evaluation is invalid or missing required fields - """ - if not evaluation or not isinstance(evaluation, dict): - raise ValueError('Evaluation must be a valid dictionary') - - # Validate required fields - required_fields = ['id', 'name', 'tool', 'input'] - for field in required_fields: - if field not in evaluation or not evaluation[field]: - raise ValueError(f'Evaluation missing required field: {field}') - - self._evaluations.append(evaluation) - - def pop(self) -> Optional[Dict[str, Any]]: - """ - Remove and return the evaluation from the top of the stack. - - Returns: - The evaluation object, or None if stack is empty - """ - if self._evaluations: - return self._evaluations.pop() - return None - - def is_empty(self) -> bool: - """ - Check if the stack is empty. - - Returns: - True if stack has no evaluations - """ - return len(self._evaluations) == 0 - - def size(self) -> int: - """ - Get the number of evaluations in the stack. - - Returns: - The stack size - """ - return len(self._evaluations) - - def peek(self) -> Optional[Dict[str, Any]]: - """ - Peek at the top evaluation without removing it. - - Returns: - The top evaluation object, or None if stack is empty - """ - if self.is_empty(): - return None - return self._evaluations[-1] - - def clear(self) -> None: - """Clear all evaluations from the stack.""" - self._evaluations.clear() - - def to_array(self) -> List[Dict[str, Any]]: - """ - Get a copy of all evaluations in the stack (top to bottom). 
- - Returns: - List of evaluation objects from top to bottom - """ - return list(reversed(self._evaluations)) - - def __len__(self) -> int: - """Return the number of evaluations in the stack.""" - return len(self._evaluations) - - def __bool__(self) -> bool: - """Return True if stack has evaluations.""" - return not self.is_empty() - - def __repr__(self) -> str: - """String representation of the stack.""" - return f"EvaluationStack(size={self.size()})" \ No newline at end of file diff --git a/eval-server/python/src/bo_eval_server/logger.py b/eval-server/python/src/bo_eval_server/logger.py deleted file mode 100644 index 8f6e3c5..0000000 --- a/eval-server/python/src/bo_eval_server/logger.py +++ /dev/null @@ -1,180 +0,0 @@ -""" -Enhanced logging setup for bo-eval-server using loguru. - -Provides structured logging with JSON formatting and multiple log levels. -""" - -import sys -from pathlib import Path -from typing import Optional, Dict, Any - -from loguru import logger - - -def setup_logger( - log_level: str = "INFO", - log_dir: Optional[str] = None, - enable_json: bool = True, -) -> None: - """ - Setup enhanced logging with loguru. - - Args: - log_level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) - log_dir: Directory for log files (default: ./logs) - enable_json: Whether to use JSON formatting for structured logs - """ - # Remove default handler - logger.remove() - - # Console handler with colored output - logger.add( - sys.stdout, - level=log_level, - format="{time:YYYY-MM-DD HH:mm:ss} | " - "{level: <8} | " - "{name}:{function}:{line} - " - "{message}", - colorize=True, - ) - - # File handlers if log_dir is specified - if log_dir: - log_path = Path(log_dir) - log_path.mkdir(exist_ok=True) - - # Combined log file - logger.add( - log_path / "combined.log", - level="DEBUG", - format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}", - rotation="10 MB", - retention="7 days", - ) - - # Error log file - logger.add( - log_path / "error.log", - level="ERROR", - format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}", - rotation="10 MB", - retention="30 days", - ) - - # Structured JSON log for evaluations - if enable_json: - logger.add( - log_path / "evaluations.jsonl", - level="INFO", - format="{message}", - filter=lambda record: record["extra"].get("event_type") == "evaluation", - rotation="10 MB", - retention="30 days", - ) - - -def log_connection(event: str, client_id: str, **kwargs) -> None: - """ - Log connection events with structured data. - - Args: - event: Connection event type (connect, disconnect, ready) - client_id: Client identifier - **kwargs: Additional event data - """ - logger.bind(event_type="connection").info( - f"Connection {event}: {client_id}", - extra={ - "event_type": "connection", - "connection_event": event, - "client_id": client_id, - **kwargs, - } - ) - - -def log_evaluation( - evaluation_id: str, - client_id: str, - status: str, - duration: Optional[float] = None, - **kwargs -) -> None: - """ - Log evaluation events with structured data. 
- - Args: - evaluation_id: Unique evaluation identifier - client_id: Client that handled the evaluation - status: Evaluation status (started, completed, failed, timeout) - duration: Evaluation duration in seconds - **kwargs: Additional evaluation data - """ - message = f"Evaluation {status}: {evaluation_id} (client: {client_id})" - if duration is not None: - message += f" ({duration:.2f}s)" - - log_data = { - "event_type": "evaluation", - "evaluation_id": evaluation_id, - "client_id": client_id, - "status": status, - "duration": duration, - **kwargs, - } - - logger.bind(event_type="evaluation").info(message, extra=log_data) - - -def log_rpc_call( - method: str, - client_id: str, - call_id: str, - status: str, - duration: Optional[float] = None, - **kwargs -) -> None: - """ - Log RPC call events with structured data. - - Args: - method: RPC method name - client_id: Target client identifier - call_id: RPC call identifier - status: Call status (sent, completed, failed, timeout) - duration: Call duration in seconds - **kwargs: Additional call data - """ - message = f"RPC {status}: {method} -> {client_id} (id: {call_id})" - if duration is not None: - message += f" ({duration:.2f}s)" - - log_data = { - "event_type": "rpc", - "method": method, - "client_id": client_id, - "call_id": call_id, - "status": status, - "duration": duration, - **kwargs, - } - - logger.bind(event_type="rpc").info(message, extra=log_data) - - -def log_server_event(event: str, **kwargs) -> None: - """ - Log server lifecycle events. - - Args: - event: Server event type (start, stop, error) - **kwargs: Additional event data - """ - logger.bind(event_type="server").info( - f"Server {event}", - extra={ - "event_type": "server", - "server_event": event, - **kwargs, - } - ) \ No newline at end of file diff --git a/eval-server/python/src/bo_eval_server/rpc_client.py b/eval-server/python/src/bo_eval_server/rpc_client.py deleted file mode 100644 index 8fc024b..0000000 --- a/eval-server/python/src/bo_eval_server/rpc_client.py +++ /dev/null @@ -1,229 +0,0 @@ -""" -JSON-RPC 2.0 client implementation for calling methods on connected agents. - -Handles request/response correlation, timeouts, and error conditions. -""" - -import asyncio -import json -import time -import uuid -from typing import Dict, Any, Optional, Callable, Awaitable - -import websockets -from loguru import logger - -from .logger import log_rpc_call - - -class RpcError(Exception): - """Exception raised for RPC-related errors.""" - pass - - -class RpcTimeoutError(RpcError): - """Exception raised when RPC call times out.""" - pass - - -class RpcClient: - """JSON-RPC 2.0 client for bidirectional communication with agents.""" - - def __init__(self, websocket: websockets.WebSocketServerProtocol, timeout: float = 1500.0): - """ - Initialize RPC client for a WebSocket connection. 
- - Args: - websocket: WebSocket connection to the agent - timeout: Default timeout for RPC calls in seconds - """ - self.websocket = websocket - self.timeout = timeout - self._pending_calls: Dict[str, asyncio.Future] = {} - self._message_handler_task: Optional[asyncio.Task] = None - self._closed = False - - async def start(self) -> None: - """Start the RPC client message handler.""" - if self._message_handler_task is None: - self._message_handler_task = asyncio.create_task(self._handle_messages()) - - async def stop(self) -> None: - """Stop the RPC client and cancel pending calls.""" - self._closed = True - - # Cancel message handler - if self._message_handler_task: - self._message_handler_task.cancel() - try: - await self._message_handler_task - except asyncio.CancelledError: - pass - - # Cancel all pending calls - for future in self._pending_calls.values(): - if not future.done(): - future.cancel() - self._pending_calls.clear() - - async def call( - self, - method: str, - params: Optional[Dict[str, Any]] = None, - timeout: Optional[float] = None, - client_id: Optional[str] = None, - ) -> Any: - """ - Make an RPC call to the connected agent. - - Args: - method: RPC method name to call - params: Parameters to pass to the method - timeout: Timeout for this call (uses default if None) - client_id: Client ID for logging purposes - - Returns: - The result returned by the agent - - Raises: - RpcError: If the call fails or returns an error - RpcTimeoutError: If the call times out - ConnectionError: If the WebSocket connection is closed - """ - if self._closed: - raise ConnectionError("RPC client is closed") - - call_id = str(uuid.uuid4()) - call_timeout = timeout or self.timeout - - # Create JSON-RPC 2.0 request - request = { - "jsonrpc": "2.0", - "method": method, - "params": params or {}, - "id": call_id, - } - - # Create future for response - future: asyncio.Future = asyncio.Future() - self._pending_calls[call_id] = future - - start_time = time.time() - - try: - # Log RPC call start - log_rpc_call( - method=method, - client_id=client_id or "unknown", - call_id=call_id, - status="sent", - params=params, - ) - - # Send request - await self.websocket.send(json.dumps(request)) - - # Wait for response with timeout - try: - result = await asyncio.wait_for(future, timeout=call_timeout) - duration = time.time() - start_time - - # Log successful completion - log_rpc_call( - method=method, - client_id=client_id or "unknown", - call_id=call_id, - status="completed", - duration=duration, - ) - - return result - - except asyncio.TimeoutError: - duration = time.time() - start_time - - # Log timeout - log_rpc_call( - method=method, - client_id=client_id or "unknown", - call_id=call_id, - status="timeout", - duration=duration, - ) - - raise RpcTimeoutError(f"RPC call '{method}' timed out after {call_timeout}s") - - except Exception as e: - duration = time.time() - start_time - - # Log failure - log_rpc_call( - method=method, - client_id=client_id or "unknown", - call_id=call_id, - status="failed", - duration=duration, - error=str(e), - ) - - raise - - finally: - # Clean up pending call - self._pending_calls.pop(call_id, None) - - async def _handle_messages(self) -> None: - """Handle incoming WebSocket messages and route RPC responses.""" - try: - async for message in self.websocket: - if self._closed: - break - - try: - await self._process_message(message) - except Exception as e: - logger.error(f"Error processing RPC message: {e}") - - except websockets.exceptions.ConnectionClosed: - 
logger.debug("WebSocket connection closed in RPC message handler") - except Exception as e: - logger.error(f"Error in RPC message handler: {e}") - finally: - await self.stop() - - async def _process_message(self, message: str) -> None: - """Process a single WebSocket message.""" - try: - data = json.loads(message) - except json.JSONDecodeError as e: - logger.warning(f"Invalid JSON in RPC message: {e}") - return - - # Handle JSON-RPC 2.0 responses - if isinstance(data, dict) and "jsonrpc" in data and "id" in data: - call_id = data["id"] - future = self._pending_calls.get(call_id) - - if future and not future.done(): - if "result" in data: - # Successful response - future.set_result(data["result"]) - elif "error" in data: - # Error response - error = data["error"] - error_msg = f"RPC error {error.get('code', 'unknown')}: {error.get('message', 'Unknown error')}" - future.set_exception(RpcError(error_msg)) - else: - # Invalid response format - future.set_exception(RpcError("Invalid RPC response format")) - else: - logger.warning(f"Received response for unknown or completed call: {call_id}") - - def is_connected(self) -> bool: - """Check if the RPC client is still active.""" - return not self._closed - - def __repr__(self) -> str: - """String representation of the RPC client.""" - status = "connected" if self.is_connected() else "closed" - return f"RpcClient(status={status}, pending_calls={len(self._pending_calls)})" \ No newline at end of file diff --git a/eval-server/python/test_client.py b/eval-server/python/test_client.py deleted file mode 100644 index 37f2520..0000000 --- a/eval-server/python/test_client.py +++ /dev/null @@ -1,190 +0,0 @@ -#!/usr/bin/env python3 -""" -Test client for debugging connection issues with bo-eval-server. - -This client helps test the WebSocket connection and protocol implementation. -""" - -import asyncio -import json -import sys -import uuid -from pathlib import Path - -# Add src to path for development -sys.path.insert(0, str(Path(__file__).parent / "src")) - -try: - import websockets -except ImportError: - print("โŒ websockets not installed. 
Run: uv add websockets") - sys.exit(1) - - -class TestClient: - """Simple test client for debugging server connections.""" - - def __init__(self, server_url: str = "ws://127.0.0.1:8080", auth_key: str = "hello"): - self.server_url = server_url - self.auth_key = auth_key - self.client_id = str(uuid.uuid4()) - self.websocket = None - - async def connect_and_test(self): - """Connect to server and test the NodeJS-compatible protocol.""" - print(f"๐Ÿ”— Connecting to {self.server_url}") - print(f" Client ID: {self.client_id}") - print(f" Auth Key: {self.auth_key}") - - try: - # Connect to WebSocket - self.websocket = await websockets.connect( - self.server_url, - ping_interval=20, - ping_timeout=20, - close_timeout=10, - ) - print("โœ… WebSocket connection established") - - # Send registration message (NodeJS style) - registration = { - "type": "register", - "clientId": self.client_id, - "secretKey": self.auth_key, - "capabilities": ["chat", "action", "research"] - } - - print("๐Ÿ“ค Sending registration message:") - print(f" {json.dumps(registration, indent=2)}") - - await self.websocket.send(json.dumps(registration)) - - # Wait for registration acknowledgment - print("โณ Waiting for registration acknowledgment...") - response = await asyncio.wait_for(self.websocket.recv(), timeout=10.0) - response_data = json.loads(response) - - print("๐Ÿ“ฅ Received registration acknowledgment:") - print(f" {json.dumps(response_data, indent=2)}") - - if response_data.get("type") == "registration_ack" and response_data.get("status") == "accepted": - print("โœ… Registration successful!") - - # Send ready signal - ready_message = {"type": "ready"} - print("๐Ÿ“ค Sending ready signal:") - print(f" {json.dumps(ready_message, indent=2)}") - - await self.websocket.send(json.dumps(ready_message)) - print("โœ… Ready signal sent") - - # Listen for RPC calls - print("๐Ÿ‘‚ Listening for RPC calls...") - await self.listen_for_calls() - - elif response_data.get("type") == "error": - print(f"โŒ Registration failed: {response_data.get('message')}") - return False - else: - print(f"โ“ Unexpected response: {response_data}") - return False - - except asyncio.TimeoutError: - print("โฐ Timeout waiting for server response") - return False - except websockets.exceptions.ConnectionClosed as e: - print(f"๐Ÿ”Œ Connection closed: {e}") - return False - except Exception as e: - print(f"๐Ÿ’ฅ Error during connection: {e}") - return False - finally: - if self.websocket: - await self.websocket.close() - - return True - - async def listen_for_calls(self): - """Listen for RPC calls from the server.""" - try: - async for message in self.websocket: - print(f"\n๐Ÿ“ฅ Received message: {message}") - - try: - data = json.loads(message) - - if data.get("jsonrpc") == "2.0" and data.get("method") == "evaluate": - print("๐ŸŽฏ Received RPC evaluation request") - print(f" ID: {data.get('id')}") - print(f" Params: {json.dumps(data.get('params', {}), indent=2)}") - - # Send mock response - response = { - "jsonrpc": "2.0", - "id": data["id"], - "result": { - "status": "completed", - "output": { - "response": f"Mock response for evaluation {data['params'].get('name', 'unknown')}" - }, - "metadata": { - "client_id": self.client_id, - "test_client": True - } - } - } - - print("๐Ÿ“ค Sending mock response:") - print(f" {json.dumps(response, indent=2)}") - - await self.websocket.send(json.dumps(response)) - print("โœ… Mock response sent") - else: - print(f"โ“ Unknown message type: {data}") - - except json.JSONDecodeError as e: - print(f"โŒ Invalid JSON 
received: {e}") - - except websockets.exceptions.ConnectionClosed: - print("๐Ÿ”Œ Connection closed by server") - except Exception as e: - print(f"๐Ÿ’ฅ Error listening for calls: {e}") - - -async def main(): - """Main test function.""" - print("๐Ÿงช Test Client for bo-eval-server") - print("=" * 40) - - if len(sys.argv) > 1: - server_url = sys.argv[1] - else: - server_url = "ws://127.0.0.1:8080" - - if len(sys.argv) > 2: - auth_key = sys.argv[2] - else: - auth_key = "hello" # Default from examples - - client = TestClient(server_url, auth_key) - - try: - success = await client.connect_and_test() - if success: - print("\nโœ… Test completed successfully!") - else: - print("\nโŒ Test failed!") - sys.exit(1) - except KeyboardInterrupt: - print("\n๐Ÿ›‘ Test interrupted by user") - except Exception as e: - print(f"\n๐Ÿ’ฅ Test failed with error: {e}") - sys.exit(1) - - -if __name__ == "__main__": - print("Usage: python test_client.py [ws://server:port] [auth_key]") - print("Example: python test_client.py ws://127.0.0.1:8080 hello") - print() - - asyncio.run(main()) \ No newline at end of file diff --git a/eval-server/python/uv.lock b/eval-server/python/uv.lock deleted file mode 100644 index 2da9568..0000000 --- a/eval-server/python/uv.lock +++ /dev/null @@ -1,1306 +0,0 @@ -version = 1 -revision = 2 -requires-python = ">=3.8" -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", - "python_full_version < '3.9'", -] - -[[package]] -name = "backports-asyncio-runner" -version = "1.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/8e/ff/70dca7d7cb1cbc0edb2c6cc0c38b65cba36cccc491eca64cabd5fe7f8670/backports_asyncio_runner-1.2.0.tar.gz", hash = "sha256:a5aa7b2b7d8f8bfcaa2b57313f70792df84e32a2a746f585213373f900b42162", size = 69893, upload-time = "2025-07-02T02:27:15.685Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a0/59/76ab57e3fe74484f48a53f8e337171b4a2349e506eabe136d7e01d059086/backports_asyncio_runner-1.2.0-py3-none-any.whl", hash = "sha256:0da0a936a8aeb554eccb426dc55af3ba63bcdc69fa1a600b5bb305413a4477b5", size = 12313, upload-time = "2025-07-02T02:27:14.263Z" }, -] - -[[package]] -name = "black" -version = "24.8.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "mypy-extensions", marker = "python_full_version < '3.9'" }, - { name = "packaging", marker = "python_full_version < '3.9'" }, - { name = "pathspec", marker = "python_full_version < '3.9'" }, - { name = "platformdirs", version = "4.3.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "tomli", marker = "python_full_version < '3.9'" }, - { name = "typing-extensions", version = "4.13.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/04/b0/46fb0d4e00372f4a86a6f8efa3cb193c9f64863615e39010b1477e010578/black-24.8.0.tar.gz", hash = "sha256:2500945420b6784c38b9ee885af039f5e7471ef284ab03fa35ecdde4688cd83f", size = 644810, upload-time = "2024-08-02T17:43:18.405Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/47/6e/74e29edf1fba3887ed7066930a87f698ffdcd52c5dbc263eabb06061672d/black-24.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:09cdeb74d494ec023ded657f7092ba518e8cf78fa8386155e4a03fdcc44679e6", size = 1632092, upload-time = "2024-08-02T17:47:26.911Z" }, - { url = "https://files.pythonhosted.org/packages/ab/49/575cb6c3faee690b05c9d11ee2e8dba8fbd6d6c134496e644c1feb1b47da/black-24.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:81c6742da39f33b08e791da38410f32e27d632260e599df7245cccee2064afeb", size = 1457529, upload-time = "2024-08-02T17:47:29.109Z" }, - { url = "https://files.pythonhosted.org/packages/7a/b4/d34099e95c437b53d01c4aa37cf93944b233066eb034ccf7897fa4e5f286/black-24.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:707a1ca89221bc8a1a64fb5e15ef39cd755633daa672a9db7498d1c19de66a42", size = 1757443, upload-time = "2024-08-02T17:46:20.306Z" }, - { url = "https://files.pythonhosted.org/packages/87/a0/6d2e4175ef364b8c4b64f8441ba041ed65c63ea1db2720d61494ac711c15/black-24.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:d6417535d99c37cee4091a2f24eb2b6d5ec42b144d50f1f2e436d9fe1916fe1a", size = 1418012, upload-time = "2024-08-02T17:47:20.33Z" }, - { url = "https://files.pythonhosted.org/packages/08/a6/0a3aa89de9c283556146dc6dbda20cd63a9c94160a6fbdebaf0918e4a3e1/black-24.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fb6e2c0b86bbd43dee042e48059c9ad7830abd5c94b0bc518c0eeec57c3eddc1", size = 1615080, upload-time = "2024-08-02T17:48:05.467Z" }, - { url = "https://files.pythonhosted.org/packages/db/94/b803d810e14588bb297e565821a947c108390a079e21dbdcb9ab6956cd7a/black-24.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:837fd281f1908d0076844bc2b801ad2d369c78c45cf800cad7b61686051041af", size = 1438143, upload-time = "2024-08-02T17:47:30.247Z" }, - { url = "https://files.pythonhosted.org/packages/a5/b5/f485e1bbe31f768e2e5210f52ea3f432256201289fd1a3c0afda693776b0/black-24.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:62e8730977f0b77998029da7971fa896ceefa2c4c4933fcd593fa599ecbf97a4", size = 1738774, upload-time = "2024-08-02T17:46:17.837Z" }, - { url = "https://files.pythonhosted.org/packages/a8/69/a000fc3736f89d1bdc7f4a879f8aaf516fb03613bb51a0154070383d95d9/black-24.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:72901b4913cbac8972ad911dc4098d5753704d1f3c56e44ae8dce99eecb0e3af", size = 1427503, upload-time = "2024-08-02T17:46:22.654Z" }, - { url = "https://files.pythonhosted.org/packages/a2/a8/05fb14195cfef32b7c8d4585a44b7499c2a4b205e1662c427b941ed87054/black-24.8.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:7c046c1d1eeb7aea9335da62472481d3bbf3fd986e093cffd35f4385c94ae368", size = 1646132, upload-time = "2024-08-02T17:49:52.843Z" }, - { url = "https://files.pythonhosted.org/packages/41/77/8d9ce42673e5cb9988f6df73c1c5c1d4e9e788053cccd7f5fb14ef100982/black-24.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:649f6d84ccbae73ab767e206772cc2d7a393a001070a4c814a546afd0d423aed", size = 1448665, upload-time = "2024-08-02T17:47:54.479Z" }, - { url = "https://files.pythonhosted.org/packages/cc/94/eff1ddad2ce1d3cc26c162b3693043c6b6b575f538f602f26fe846dfdc75/black-24.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2b59b250fdba5f9a9cd9d0ece6e6d993d91ce877d121d161e4698af3eb9c1018", size = 1762458, upload-time = "2024-08-02T17:46:19.384Z" }, - { url = 
"https://files.pythonhosted.org/packages/28/ea/18b8d86a9ca19a6942e4e16759b2fa5fc02bbc0eb33c1b866fcd387640ab/black-24.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:6e55d30d44bed36593c3163b9bc63bf58b3b30e4611e4d88a0c3c239930ed5b2", size = 1436109, upload-time = "2024-08-02T17:46:52.97Z" }, - { url = "https://files.pythonhosted.org/packages/9f/d4/ae03761ddecc1a37d7e743b89cccbcf3317479ff4b88cfd8818079f890d0/black-24.8.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:505289f17ceda596658ae81b61ebbe2d9b25aa78067035184ed0a9d855d18afd", size = 1617322, upload-time = "2024-08-02T17:51:20.203Z" }, - { url = "https://files.pythonhosted.org/packages/14/4b/4dfe67eed7f9b1ddca2ec8e4418ea74f0d1dc84d36ea874d618ffa1af7d4/black-24.8.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b19c9ad992c7883ad84c9b22aaa73562a16b819c1d8db7a1a1a49fb7ec13c7d2", size = 1442108, upload-time = "2024-08-02T17:50:40.824Z" }, - { url = "https://files.pythonhosted.org/packages/97/14/95b3f91f857034686cae0e73006b8391d76a8142d339b42970eaaf0416ea/black-24.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f13f7f386f86f8121d76599114bb8c17b69d962137fc70efe56137727c7047e", size = 1745786, upload-time = "2024-08-02T17:46:02.939Z" }, - { url = "https://files.pythonhosted.org/packages/95/54/68b8883c8aa258a6dde958cd5bdfada8382bec47c5162f4a01e66d839af1/black-24.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:f490dbd59680d809ca31efdae20e634f3fae27fba3ce0ba3208333b713bc3920", size = 1426754, upload-time = "2024-08-02T17:46:38.603Z" }, - { url = "https://files.pythonhosted.org/packages/13/b2/b3f24fdbb46f0e7ef6238e131f13572ee8279b70f237f221dd168a9dba1a/black-24.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:eab4dd44ce80dea27dc69db40dab62d4ca96112f87996bca68cd75639aeb2e4c", size = 1631706, upload-time = "2024-08-02T17:49:57.606Z" }, - { url = "https://files.pythonhosted.org/packages/d9/35/31010981e4a05202a84a3116423970fd1a59d2eda4ac0b3570fbb7029ddc/black-24.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3c4285573d4897a7610054af5a890bde7c65cb466040c5f0c8b732812d7f0e5e", size = 1457429, upload-time = "2024-08-02T17:49:12.764Z" }, - { url = "https://files.pythonhosted.org/packages/27/25/3f706b4f044dd569a20a4835c3b733dedea38d83d2ee0beb8178a6d44945/black-24.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e84e33b37be070ba135176c123ae52a51f82306def9f7d063ee302ecab2cf47", size = 1756488, upload-time = "2024-08-02T17:46:08.067Z" }, - { url = "https://files.pythonhosted.org/packages/63/72/79375cd8277cbf1c5670914e6bd4c1b15dea2c8f8e906dc21c448d0535f0/black-24.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:73bbf84ed136e45d451a260c6b73ed674652f90a2b3211d6a35e78054563a9bb", size = 1417721, upload-time = "2024-08-02T17:46:42.637Z" }, - { url = "https://files.pythonhosted.org/packages/27/1e/83fa8a787180e1632c3d831f7e58994d7aaf23a0961320d21e84f922f919/black-24.8.0-py3-none-any.whl", hash = "sha256:972085c618ee94f402da1af548a4f218c754ea7e5dc70acb168bfaca4c2542ed", size = 206504, upload-time = "2024-08-02T17:43:15.747Z" }, -] - -[[package]] -name = "black" -version = "25.1.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] -dependencies = [ - { name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { 
name = "click", version = "8.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "mypy-extensions", marker = "python_full_version >= '3.9'" }, - { name = "packaging", marker = "python_full_version >= '3.9'" }, - { name = "pathspec", marker = "python_full_version >= '3.9'" }, - { name = "platformdirs", version = "4.3.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "tomli", marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, - { name = "typing-extensions", version = "4.14.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/94/49/26a7b0f3f35da4b5a65f081943b7bcd22d7002f5f0fb8098ec1ff21cb6ef/black-25.1.0.tar.gz", hash = "sha256:33496d5cd1222ad73391352b4ae8da15253c5de89b93a80b3e2c8d9a19ec2666", size = 649449, upload-time = "2025-01-29T04:15:40.373Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4d/3b/4ba3f93ac8d90410423fdd31d7541ada9bcee1df32fb90d26de41ed40e1d/black-25.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:759e7ec1e050a15f89b770cefbf91ebee8917aac5c20483bc2d80a6c3a04df32", size = 1629419, upload-time = "2025-01-29T05:37:06.642Z" }, - { url = "https://files.pythonhosted.org/packages/b4/02/0bde0485146a8a5e694daed47561785e8b77a0466ccc1f3e485d5ef2925e/black-25.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0e519ecf93120f34243e6b0054db49c00a35f84f195d5bce7e9f5cfc578fc2da", size = 1461080, upload-time = "2025-01-29T05:37:09.321Z" }, - { url = "https://files.pythonhosted.org/packages/52/0e/abdf75183c830eaca7589144ff96d49bce73d7ec6ad12ef62185cc0f79a2/black-25.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:055e59b198df7ac0b7efca5ad7ff2516bca343276c466be72eb04a3bcc1f82d7", size = 1766886, upload-time = "2025-01-29T04:18:24.432Z" }, - { url = "https://files.pythonhosted.org/packages/dc/a6/97d8bb65b1d8a41f8a6736222ba0a334db7b7b77b8023ab4568288f23973/black-25.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:db8ea9917d6f8fc62abd90d944920d95e73c83a5ee3383493e35d271aca872e9", size = 1419404, upload-time = "2025-01-29T04:19:04.296Z" }, - { url = "https://files.pythonhosted.org/packages/7e/4f/87f596aca05c3ce5b94b8663dbfe242a12843caaa82dd3f85f1ffdc3f177/black-25.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a39337598244de4bae26475f77dda852ea00a93bd4c728e09eacd827ec929df0", size = 1614372, upload-time = "2025-01-29T05:37:11.71Z" }, - { url = "https://files.pythonhosted.org/packages/e7/d0/2c34c36190b741c59c901e56ab7f6e54dad8df05a6272a9747ecef7c6036/black-25.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:96c1c7cd856bba8e20094e36e0f948718dc688dba4a9d78c3adde52b9e6c2299", size = 1442865, upload-time = "2025-01-29T05:37:14.309Z" }, - { url = "https://files.pythonhosted.org/packages/21/d4/7518c72262468430ead45cf22bd86c883a6448b9eb43672765d69a8f1248/black-25.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bce2e264d59c91e52d8000d507eb20a9aca4a778731a08cfff7e5ac4a4bb7096", size = 1749699, upload-time = "2025-01-29T04:18:17.688Z" }, - { url = "https://files.pythonhosted.org/packages/58/db/4f5beb989b547f79096e035c4981ceb36ac2b552d0ac5f2620e941501c99/black-25.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:172b1dbff09f86ce6f4eb8edf9dede08b1fce58ba194c87d7a4f1a5aa2f5b3c2", size = 
1428028, upload-time = "2025-01-29T04:18:51.711Z" }, - { url = "https://files.pythonhosted.org/packages/83/71/3fe4741df7adf015ad8dfa082dd36c94ca86bb21f25608eb247b4afb15b2/black-25.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4b60580e829091e6f9238c848ea6750efed72140b91b048770b64e74fe04908b", size = 1650988, upload-time = "2025-01-29T05:37:16.707Z" }, - { url = "https://files.pythonhosted.org/packages/13/f3/89aac8a83d73937ccd39bbe8fc6ac8860c11cfa0af5b1c96d081facac844/black-25.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1e2978f6df243b155ef5fa7e558a43037c3079093ed5d10fd84c43900f2d8ecc", size = 1453985, upload-time = "2025-01-29T05:37:18.273Z" }, - { url = "https://files.pythonhosted.org/packages/6f/22/b99efca33f1f3a1d2552c714b1e1b5ae92efac6c43e790ad539a163d1754/black-25.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b48735872ec535027d979e8dcb20bf4f70b5ac75a8ea99f127c106a7d7aba9f", size = 1783816, upload-time = "2025-01-29T04:18:33.823Z" }, - { url = "https://files.pythonhosted.org/packages/18/7e/a27c3ad3822b6f2e0e00d63d58ff6299a99a5b3aee69fa77cd4b0076b261/black-25.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:ea0213189960bda9cf99be5b8c8ce66bb054af5e9e861249cd23471bd7b0b3ba", size = 1440860, upload-time = "2025-01-29T04:19:12.944Z" }, - { url = "https://files.pythonhosted.org/packages/98/87/0edf98916640efa5d0696e1abb0a8357b52e69e82322628f25bf14d263d1/black-25.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8f0b18a02996a836cc9c9c78e5babec10930862827b1b724ddfe98ccf2f2fe4f", size = 1650673, upload-time = "2025-01-29T05:37:20.574Z" }, - { url = "https://files.pythonhosted.org/packages/52/e5/f7bf17207cf87fa6e9b676576749c6b6ed0d70f179a3d812c997870291c3/black-25.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:afebb7098bfbc70037a053b91ae8437c3857482d3a690fefc03e9ff7aa9a5fd3", size = 1453190, upload-time = "2025-01-29T05:37:22.106Z" }, - { url = "https://files.pythonhosted.org/packages/e3/ee/adda3d46d4a9120772fae6de454c8495603c37c4c3b9c60f25b1ab6401fe/black-25.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:030b9759066a4ee5e5aca28c3c77f9c64789cdd4de8ac1df642c40b708be6171", size = 1782926, upload-time = "2025-01-29T04:18:58.564Z" }, - { url = "https://files.pythonhosted.org/packages/cc/64/94eb5f45dcb997d2082f097a3944cfc7fe87e071907f677e80788a2d7b7a/black-25.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:a22f402b410566e2d1c950708c77ebf5ebd5d0d88a6a2e87c86d9fb48afa0d18", size = 1442613, upload-time = "2025-01-29T04:19:27.63Z" }, - { url = "https://files.pythonhosted.org/packages/d3/b6/ae7507470a4830dbbfe875c701e84a4a5fb9183d1497834871a715716a92/black-25.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a1ee0a0c330f7b5130ce0caed9936a904793576ef4d2b98c40835d6a65afa6a0", size = 1628593, upload-time = "2025-01-29T05:37:23.672Z" }, - { url = "https://files.pythonhosted.org/packages/24/c1/ae36fa59a59f9363017ed397750a0cd79a470490860bc7713967d89cdd31/black-25.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f3df5f1bf91d36002b0a75389ca8663510cf0531cca8aa5c1ef695b46d98655f", size = 1460000, upload-time = "2025-01-29T05:37:25.829Z" }, - { url = "https://files.pythonhosted.org/packages/ac/b6/98f832e7a6c49aa3a464760c67c7856363aa644f2f3c74cf7d624168607e/black-25.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d9e6827d563a2c820772b32ce8a42828dc6790f095f441beef18f96aa6f8294e", size = 1765963, upload-time = 
"2025-01-29T04:18:38.116Z" }, - { url = "https://files.pythonhosted.org/packages/ce/e9/2cb0a017eb7024f70e0d2e9bdb8c5a5b078c5740c7f8816065d06f04c557/black-25.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:bacabb307dca5ebaf9c118d2d2f6903da0d62c9faa82bd21a33eecc319559355", size = 1419419, upload-time = "2025-01-29T04:18:30.191Z" }, - { url = "https://files.pythonhosted.org/packages/09/71/54e999902aed72baf26bca0d50781b01838251a462612966e9fc4891eadd/black-25.1.0-py3-none-any.whl", hash = "sha256:95e8176dae143ba9097f351d174fdaf0ccd29efb414b362ae3fd72bf0f710717", size = 207646, upload-time = "2025-01-29T04:15:38.082Z" }, -] - -[[package]] -name = "bo-eval-server" -version = "1.0.0" -source = { editable = "." } -dependencies = [ - { name = "loguru" }, - { name = "pandas", version = "2.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "pandas", version = "2.3.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "requests" }, - { name = "websockets", version = "13.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "websockets", version = "15.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, -] - -[package.optional-dependencies] -dev = [ - { name = "black", version = "24.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "black", version = "25.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "mypy", version = "1.14.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "mypy", version = "1.17.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "pytest", version = "8.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "pytest", version = "8.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "pytest-asyncio", version = "0.24.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "pytest-asyncio", version = "1.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, -] - -[package.dev-dependencies] -dev = [ - { name = "black", version = "24.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "black", version = "25.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "mypy", version = "1.14.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "mypy", version = "1.17.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "pytest", version = "8.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "pytest", version = "8.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "pytest-asyncio", version = "0.24.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "pytest-asyncio", version = "1.1.0", source = { registry = "https://pypi.org/simple" }, marker = 
"python_full_version >= '3.9'" }, -] - -[package.metadata] -requires-dist = [ - { name = "black", marker = "extra == 'dev'", specifier = ">=23.0.0" }, - { name = "loguru", specifier = ">=0.7.0" }, - { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.0.0" }, - { name = "pandas", specifier = ">=2.0.0" }, - { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0.0" }, - { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.21.0" }, - { name = "requests", specifier = ">=2.31.0" }, - { name = "websockets", specifier = ">=11.0.0" }, -] -provides-extras = ["dev"] - -[package.metadata.requires-dev] -dev = [ - { name = "black", specifier = ">=24.8.0" }, - { name = "mypy", specifier = ">=1.14.1" }, - { name = "pytest", specifier = ">=8.3.5" }, - { name = "pytest-asyncio", specifier = ">=0.24.0" }, -] - -[[package]] -name = "certifi" -version = "2025.8.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/dc/67/960ebe6bf230a96cda2e0abcf73af550ec4f090005363542f0765df162e0/certifi-2025.8.3.tar.gz", hash = "sha256:e564105f78ded564e3ae7c923924435e1daa7463faeab5bb932bc53ffae63407", size = 162386, upload-time = "2025-08-03T03:07:47.08Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e5/48/1549795ba7742c948d2ad169c1c8cdbae65bc450d6cd753d124b17c8cd32/certifi-2025.8.3-py3-none-any.whl", hash = "sha256:f6c12493cfb1b06ba2ff328595af9350c65d6644968e5d3a2ffd78699af217a5", size = 161216, upload-time = "2025-08-03T03:07:45.777Z" }, -] - -[[package]] -name = "charset-normalizer" -version = "3.4.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e4/33/89c2ced2b67d1c2a61c19c6751aa8902d46ce3dacb23600a283619f5a12d/charset_normalizer-3.4.2.tar.gz", hash = "sha256:5baececa9ecba31eff645232d59845c07aa030f0c81ee70184a90d35099a0e63", size = 126367, upload-time = "2025-05-02T08:34:42.01Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/95/28/9901804da60055b406e1a1c5ba7aac1276fb77f1dde635aabfc7fd84b8ab/charset_normalizer-3.4.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c48ed483eb946e6c04ccbe02c6b4d1d48e51944b6db70f697e089c193404941", size = 201818, upload-time = "2025-05-02T08:31:46.725Z" }, - { url = "https://files.pythonhosted.org/packages/d9/9b/892a8c8af9110935e5adcbb06d9c6fe741b6bb02608c6513983048ba1a18/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2d318c11350e10662026ad0eb71bb51c7812fc8590825304ae0bdd4ac283acd", size = 144649, upload-time = "2025-05-02T08:31:48.889Z" }, - { url = "https://files.pythonhosted.org/packages/7b/a5/4179abd063ff6414223575e008593861d62abfc22455b5d1a44995b7c101/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9cbfacf36cb0ec2897ce0ebc5d08ca44213af24265bd56eca54bee7923c48fd6", size = 155045, upload-time = "2025-05-02T08:31:50.757Z" }, - { url = "https://files.pythonhosted.org/packages/3b/95/bc08c7dfeddd26b4be8c8287b9bb055716f31077c8b0ea1cd09553794665/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18dd2e350387c87dabe711b86f83c9c78af772c748904d372ade190b5c7c9d4d", size = 147356, upload-time = "2025-05-02T08:31:52.634Z" }, - { url = "https://files.pythonhosted.org/packages/a8/2d/7a5b635aa65284bf3eab7653e8b4151ab420ecbae918d3e359d1947b4d61/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:8075c35cd58273fee266c58c0c9b670947c19df5fb98e7b66710e04ad4e9ff86", size = 149471, upload-time = "2025-05-02T08:31:56.207Z" }, - { url = "https://files.pythonhosted.org/packages/ae/38/51fc6ac74251fd331a8cfdb7ec57beba8c23fd5493f1050f71c87ef77ed0/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5bf4545e3b962767e5c06fe1738f951f77d27967cb2caa64c28be7c4563e162c", size = 151317, upload-time = "2025-05-02T08:31:57.613Z" }, - { url = "https://files.pythonhosted.org/packages/b7/17/edee1e32215ee6e9e46c3e482645b46575a44a2d72c7dfd49e49f60ce6bf/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:7a6ab32f7210554a96cd9e33abe3ddd86732beeafc7a28e9955cdf22ffadbab0", size = 146368, upload-time = "2025-05-02T08:31:59.468Z" }, - { url = "https://files.pythonhosted.org/packages/26/2c/ea3e66f2b5f21fd00b2825c94cafb8c326ea6240cd80a91eb09e4a285830/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:b33de11b92e9f75a2b545d6e9b6f37e398d86c3e9e9653c4864eb7e89c5773ef", size = 154491, upload-time = "2025-05-02T08:32:01.219Z" }, - { url = "https://files.pythonhosted.org/packages/52/47/7be7fa972422ad062e909fd62460d45c3ef4c141805b7078dbab15904ff7/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8755483f3c00d6c9a77f490c17e6ab0c8729e39e6390328e42521ef175380ae6", size = 157695, upload-time = "2025-05-02T08:32:03.045Z" }, - { url = "https://files.pythonhosted.org/packages/2f/42/9f02c194da282b2b340f28e5fb60762de1151387a36842a92b533685c61e/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:68a328e5f55ec37c57f19ebb1fdc56a248db2e3e9ad769919a58672958e8f366", size = 154849, upload-time = "2025-05-02T08:32:04.651Z" }, - { url = "https://files.pythonhosted.org/packages/67/44/89cacd6628f31fb0b63201a618049be4be2a7435a31b55b5eb1c3674547a/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:21b2899062867b0e1fde9b724f8aecb1af14f2778d69aacd1a5a1853a597a5db", size = 150091, upload-time = "2025-05-02T08:32:06.719Z" }, - { url = "https://files.pythonhosted.org/packages/1f/79/4b8da9f712bc079c0f16b6d67b099b0b8d808c2292c937f267d816ec5ecc/charset_normalizer-3.4.2-cp310-cp310-win32.whl", hash = "sha256:e8082b26888e2f8b36a042a58307d5b917ef2b1cacab921ad3323ef91901c71a", size = 98445, upload-time = "2025-05-02T08:32:08.66Z" }, - { url = "https://files.pythonhosted.org/packages/7d/d7/96970afb4fb66497a40761cdf7bd4f6fca0fc7bafde3a84f836c1f57a926/charset_normalizer-3.4.2-cp310-cp310-win_amd64.whl", hash = "sha256:f69a27e45c43520f5487f27627059b64aaf160415589230992cec34c5e18a509", size = 105782, upload-time = "2025-05-02T08:32:10.46Z" }, - { url = "https://files.pythonhosted.org/packages/05/85/4c40d00dcc6284a1c1ad5de5e0996b06f39d8232f1031cd23c2f5c07ee86/charset_normalizer-3.4.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:be1e352acbe3c78727a16a455126d9ff83ea2dfdcbc83148d2982305a04714c2", size = 198794, upload-time = "2025-05-02T08:32:11.945Z" }, - { url = "https://files.pythonhosted.org/packages/41/d9/7a6c0b9db952598e97e93cbdfcb91bacd89b9b88c7c983250a77c008703c/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa88ca0b1932e93f2d961bf3addbb2db902198dca337d88c89e1559e066e7645", size = 142846, upload-time = "2025-05-02T08:32:13.946Z" }, - { url = 
"https://files.pythonhosted.org/packages/66/82/a37989cda2ace7e37f36c1a8ed16c58cf48965a79c2142713244bf945c89/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d524ba3f1581b35c03cb42beebab4a13e6cdad7b36246bd22541fa585a56cccd", size = 153350, upload-time = "2025-05-02T08:32:15.873Z" }, - { url = "https://files.pythonhosted.org/packages/df/68/a576b31b694d07b53807269d05ec3f6f1093e9545e8607121995ba7a8313/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28a1005facc94196e1fb3e82a3d442a9d9110b8434fc1ded7a24a2983c9888d8", size = 145657, upload-time = "2025-05-02T08:32:17.283Z" }, - { url = "https://files.pythonhosted.org/packages/92/9b/ad67f03d74554bed3aefd56fe836e1623a50780f7c998d00ca128924a499/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdb20a30fe1175ecabed17cbf7812f7b804b8a315a25f24678bcdf120a90077f", size = 147260, upload-time = "2025-05-02T08:32:18.807Z" }, - { url = "https://files.pythonhosted.org/packages/a6/e6/8aebae25e328160b20e31a7e9929b1578bbdc7f42e66f46595a432f8539e/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0f5d9ed7f254402c9e7d35d2f5972c9bbea9040e99cd2861bd77dc68263277c7", size = 149164, upload-time = "2025-05-02T08:32:20.333Z" }, - { url = "https://files.pythonhosted.org/packages/8b/f2/b3c2f07dbcc248805f10e67a0262c93308cfa149a4cd3d1fe01f593e5fd2/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:efd387a49825780ff861998cd959767800d54f8308936b21025326de4b5a42b9", size = 144571, upload-time = "2025-05-02T08:32:21.86Z" }, - { url = "https://files.pythonhosted.org/packages/60/5b/c3f3a94bc345bc211622ea59b4bed9ae63c00920e2e8f11824aa5708e8b7/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f0aa37f3c979cf2546b73e8222bbfa3dc07a641585340179d768068e3455e544", size = 151952, upload-time = "2025-05-02T08:32:23.434Z" }, - { url = "https://files.pythonhosted.org/packages/e2/4d/ff460c8b474122334c2fa394a3f99a04cf11c646da895f81402ae54f5c42/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e70e990b2137b29dc5564715de1e12701815dacc1d056308e2b17e9095372a82", size = 155959, upload-time = "2025-05-02T08:32:24.993Z" }, - { url = "https://files.pythonhosted.org/packages/a2/2b/b964c6a2fda88611a1fe3d4c400d39c66a42d6c169c924818c848f922415/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:0c8c57f84ccfc871a48a47321cfa49ae1df56cd1d965a09abe84066f6853b9c0", size = 153030, upload-time = "2025-05-02T08:32:26.435Z" }, - { url = "https://files.pythonhosted.org/packages/59/2e/d3b9811db26a5ebf444bc0fa4f4be5aa6d76fc6e1c0fd537b16c14e849b6/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6b66f92b17849b85cad91259efc341dce9c1af48e2173bf38a85c6329f1033e5", size = 148015, upload-time = "2025-05-02T08:32:28.376Z" }, - { url = "https://files.pythonhosted.org/packages/90/07/c5fd7c11eafd561bb51220d600a788f1c8d77c5eef37ee49454cc5c35575/charset_normalizer-3.4.2-cp311-cp311-win32.whl", hash = "sha256:daac4765328a919a805fa5e2720f3e94767abd632ae410a9062dff5412bae65a", size = 98106, upload-time = "2025-05-02T08:32:30.281Z" }, - { url = "https://files.pythonhosted.org/packages/a8/05/5e33dbef7e2f773d672b6d79f10ec633d4a71cd96db6673625838a4fd532/charset_normalizer-3.4.2-cp311-cp311-win_amd64.whl", hash = 
"sha256:e53efc7c7cee4c1e70661e2e112ca46a575f90ed9ae3fef200f2a25e954f4b28", size = 105402, upload-time = "2025-05-02T08:32:32.191Z" }, - { url = "https://files.pythonhosted.org/packages/d7/a4/37f4d6035c89cac7930395a35cc0f1b872e652eaafb76a6075943754f095/charset_normalizer-3.4.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0c29de6a1a95f24b9a1aa7aefd27d2487263f00dfd55a77719b530788f75cff7", size = 199936, upload-time = "2025-05-02T08:32:33.712Z" }, - { url = "https://files.pythonhosted.org/packages/ee/8a/1a5e33b73e0d9287274f899d967907cd0bf9c343e651755d9307e0dbf2b3/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cddf7bd982eaa998934a91f69d182aec997c6c468898efe6679af88283b498d3", size = 143790, upload-time = "2025-05-02T08:32:35.768Z" }, - { url = "https://files.pythonhosted.org/packages/66/52/59521f1d8e6ab1482164fa21409c5ef44da3e9f653c13ba71becdd98dec3/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcbe676a55d7445b22c10967bceaaf0ee69407fbe0ece4d032b6eb8d4565982a", size = 153924, upload-time = "2025-05-02T08:32:37.284Z" }, - { url = "https://files.pythonhosted.org/packages/86/2d/fb55fdf41964ec782febbf33cb64be480a6b8f16ded2dbe8db27a405c09f/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d41c4d287cfc69060fa91cae9683eacffad989f1a10811995fa309df656ec214", size = 146626, upload-time = "2025-05-02T08:32:38.803Z" }, - { url = "https://files.pythonhosted.org/packages/8c/73/6ede2ec59bce19b3edf4209d70004253ec5f4e319f9a2e3f2f15601ed5f7/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e594135de17ab3866138f496755f302b72157d115086d100c3f19370839dd3a", size = 148567, upload-time = "2025-05-02T08:32:40.251Z" }, - { url = "https://files.pythonhosted.org/packages/09/14/957d03c6dc343c04904530b6bef4e5efae5ec7d7990a7cbb868e4595ee30/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cf713fe9a71ef6fd5adf7a79670135081cd4431c2943864757f0fa3a65b1fafd", size = 150957, upload-time = "2025-05-02T08:32:41.705Z" }, - { url = "https://files.pythonhosted.org/packages/0d/c8/8174d0e5c10ccebdcb1b53cc959591c4c722a3ad92461a273e86b9f5a302/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a370b3e078e418187da8c3674eddb9d983ec09445c99a3a263c2011993522981", size = 145408, upload-time = "2025-05-02T08:32:43.709Z" }, - { url = "https://files.pythonhosted.org/packages/58/aa/8904b84bc8084ac19dc52feb4f5952c6df03ffb460a887b42615ee1382e8/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:a955b438e62efdf7e0b7b52a64dc5c3396e2634baa62471768a64bc2adb73d5c", size = 153399, upload-time = "2025-05-02T08:32:46.197Z" }, - { url = "https://files.pythonhosted.org/packages/c2/26/89ee1f0e264d201cb65cf054aca6038c03b1a0c6b4ae998070392a3ce605/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:7222ffd5e4de8e57e03ce2cef95a4c43c98fcb72ad86909abdfc2c17d227fc1b", size = 156815, upload-time = "2025-05-02T08:32:48.105Z" }, - { url = "https://files.pythonhosted.org/packages/fd/07/68e95b4b345bad3dbbd3a8681737b4338ff2c9df29856a6d6d23ac4c73cb/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:bee093bf902e1d8fc0ac143c88902c3dfc8941f7ea1d6a8dd2bcb786d33db03d", size = 154537, upload-time = "2025-05-02T08:32:49.719Z" }, - { url = 
"https://files.pythonhosted.org/packages/77/1a/5eefc0ce04affb98af07bc05f3bac9094513c0e23b0562d64af46a06aae4/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:dedb8adb91d11846ee08bec4c8236c8549ac721c245678282dcb06b221aab59f", size = 149565, upload-time = "2025-05-02T08:32:51.404Z" }, - { url = "https://files.pythonhosted.org/packages/37/a0/2410e5e6032a174c95e0806b1a6585eb21e12f445ebe239fac441995226a/charset_normalizer-3.4.2-cp312-cp312-win32.whl", hash = "sha256:db4c7bf0e07fc3b7d89ac2a5880a6a8062056801b83ff56d8464b70f65482b6c", size = 98357, upload-time = "2025-05-02T08:32:53.079Z" }, - { url = "https://files.pythonhosted.org/packages/6c/4f/c02d5c493967af3eda9c771ad4d2bbc8df6f99ddbeb37ceea6e8716a32bc/charset_normalizer-3.4.2-cp312-cp312-win_amd64.whl", hash = "sha256:5a9979887252a82fefd3d3ed2a8e3b937a7a809f65dcb1e068b090e165bbe99e", size = 105776, upload-time = "2025-05-02T08:32:54.573Z" }, - { url = "https://files.pythonhosted.org/packages/ea/12/a93df3366ed32db1d907d7593a94f1fe6293903e3e92967bebd6950ed12c/charset_normalizer-3.4.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:926ca93accd5d36ccdabd803392ddc3e03e6d4cd1cf17deff3b989ab8e9dbcf0", size = 199622, upload-time = "2025-05-02T08:32:56.363Z" }, - { url = "https://files.pythonhosted.org/packages/04/93/bf204e6f344c39d9937d3c13c8cd5bbfc266472e51fc8c07cb7f64fcd2de/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eba9904b0f38a143592d9fc0e19e2df0fa2e41c3c3745554761c5f6447eedabf", size = 143435, upload-time = "2025-05-02T08:32:58.551Z" }, - { url = "https://files.pythonhosted.org/packages/22/2a/ea8a2095b0bafa6c5b5a55ffdc2f924455233ee7b91c69b7edfcc9e02284/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3fddb7e2c84ac87ac3a947cb4e66d143ca5863ef48e4a5ecb83bd48619e4634e", size = 153653, upload-time = "2025-05-02T08:33:00.342Z" }, - { url = "https://files.pythonhosted.org/packages/b6/57/1b090ff183d13cef485dfbe272e2fe57622a76694061353c59da52c9a659/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98f862da73774290f251b9df8d11161b6cf25b599a66baf087c1ffe340e9bfd1", size = 146231, upload-time = "2025-05-02T08:33:02.081Z" }, - { url = "https://files.pythonhosted.org/packages/e2/28/ffc026b26f441fc67bd21ab7f03b313ab3fe46714a14b516f931abe1a2d8/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c9379d65defcab82d07b2a9dfbfc2e95bc8fe0ebb1b176a3190230a3ef0e07c", size = 148243, upload-time = "2025-05-02T08:33:04.063Z" }, - { url = "https://files.pythonhosted.org/packages/c0/0f/9abe9bd191629c33e69e47c6ef45ef99773320e9ad8e9cb08b8ab4a8d4cb/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e635b87f01ebc977342e2697d05b56632f5f879a4f15955dfe8cef2448b51691", size = 150442, upload-time = "2025-05-02T08:33:06.418Z" }, - { url = "https://files.pythonhosted.org/packages/67/7c/a123bbcedca91d5916c056407f89a7f5e8fdfce12ba825d7d6b9954a1a3c/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1c95a1e2902a8b722868587c0e1184ad5c55631de5afc0eb96bc4b0d738092c0", size = 145147, upload-time = "2025-05-02T08:33:08.183Z" }, - { url = "https://files.pythonhosted.org/packages/ec/fe/1ac556fa4899d967b83e9893788e86b6af4d83e4726511eaaad035e36595/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_i686.whl", hash = 
"sha256:ef8de666d6179b009dce7bcb2ad4c4a779f113f12caf8dc77f0162c29d20490b", size = 153057, upload-time = "2025-05-02T08:33:09.986Z" }, - { url = "https://files.pythonhosted.org/packages/2b/ff/acfc0b0a70b19e3e54febdd5301a98b72fa07635e56f24f60502e954c461/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:32fc0341d72e0f73f80acb0a2c94216bd704f4f0bce10aedea38f30502b271ff", size = 156454, upload-time = "2025-05-02T08:33:11.814Z" }, - { url = "https://files.pythonhosted.org/packages/92/08/95b458ce9c740d0645feb0e96cea1f5ec946ea9c580a94adfe0b617f3573/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:289200a18fa698949d2b39c671c2cc7a24d44096784e76614899a7ccf2574b7b", size = 154174, upload-time = "2025-05-02T08:33:13.707Z" }, - { url = "https://files.pythonhosted.org/packages/78/be/8392efc43487ac051eee6c36d5fbd63032d78f7728cb37aebcc98191f1ff/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4a476b06fbcf359ad25d34a057b7219281286ae2477cc5ff5e3f70a246971148", size = 149166, upload-time = "2025-05-02T08:33:15.458Z" }, - { url = "https://files.pythonhosted.org/packages/44/96/392abd49b094d30b91d9fbda6a69519e95802250b777841cf3bda8fe136c/charset_normalizer-3.4.2-cp313-cp313-win32.whl", hash = "sha256:aaeeb6a479c7667fbe1099af9617c83aaca22182d6cf8c53966491a0f1b7ffb7", size = 98064, upload-time = "2025-05-02T08:33:17.06Z" }, - { url = "https://files.pythonhosted.org/packages/e9/b0/0200da600134e001d91851ddc797809e2fe0ea72de90e09bec5a2fbdaccb/charset_normalizer-3.4.2-cp313-cp313-win_amd64.whl", hash = "sha256:aa6af9e7d59f9c12b33ae4e9450619cf2488e2bbe9b44030905877f0b2324980", size = 105641, upload-time = "2025-05-02T08:33:18.753Z" }, - { url = "https://files.pythonhosted.org/packages/4c/fd/f700cfd4ad876def96d2c769d8a32d808b12d1010b6003dc6639157f99ee/charset_normalizer-3.4.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:76af085e67e56c8816c3ccf256ebd136def2ed9654525348cfa744b6802b69eb", size = 198257, upload-time = "2025-05-02T08:33:45.511Z" }, - { url = "https://files.pythonhosted.org/packages/3a/95/6eec4cbbbd119e6a402e3bfd16246785cc52ce64cf21af2ecdf7b3a08e91/charset_normalizer-3.4.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e45ba65510e2647721e35323d6ef54c7974959f6081b58d4ef5d87c60c84919a", size = 143453, upload-time = "2025-05-02T08:33:47.463Z" }, - { url = "https://files.pythonhosted.org/packages/b6/b3/d4f913660383b3d93dbe6f687a312ea9f7e89879ae883c4e8942048174d4/charset_normalizer-3.4.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:046595208aae0120559a67693ecc65dd75d46f7bf687f159127046628178dc45", size = 153130, upload-time = "2025-05-02T08:33:50.568Z" }, - { url = "https://files.pythonhosted.org/packages/e5/69/7540141529eabc55bf19cc05cd9b61c2078bebfcdbd3e799af99b777fc28/charset_normalizer-3.4.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:75d10d37a47afee94919c4fab4c22b9bc2a8bf7d4f46f87363bcf0573f3ff4f5", size = 145688, upload-time = "2025-05-02T08:33:52.828Z" }, - { url = "https://files.pythonhosted.org/packages/2e/bb/d76d3d6e340fb0967c43c564101e28a78c9a363ea62f736a68af59ee3683/charset_normalizer-3.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6333b3aa5a12c26b2a4d4e7335a28f1475e0e5e17d69d55141ee3cab736f66d1", size = 147418, upload-time = "2025-05-02T08:33:54.718Z" }, - { url = 
"https://files.pythonhosted.org/packages/3e/ef/b7c1f39c0dc3808160c8b72e0209c2479393966313bfebc833533cfff9cc/charset_normalizer-3.4.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e8323a9b031aa0393768b87f04b4164a40037fb2a3c11ac06a03ffecd3618027", size = 150066, upload-time = "2025-05-02T08:33:56.597Z" }, - { url = "https://files.pythonhosted.org/packages/20/26/4e47cc23d2a4a5eb6ed7d6f0f8cda87d753e2f8abc936d5cf5ad2aae8518/charset_normalizer-3.4.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:24498ba8ed6c2e0b56d4acbf83f2d989720a93b41d712ebd4f4979660db4417b", size = 144499, upload-time = "2025-05-02T08:33:58.637Z" }, - { url = "https://files.pythonhosted.org/packages/d7/9c/efdf59dd46593cecad0548d36a702683a0bdc056793398a9cd1e1546ad21/charset_normalizer-3.4.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:844da2b5728b5ce0e32d863af26f32b5ce61bc4273a9c720a9f3aa9df73b1455", size = 152954, upload-time = "2025-05-02T08:34:00.552Z" }, - { url = "https://files.pythonhosted.org/packages/59/b3/4e8b73f7299d9aaabd7cd26db4a765f741b8e57df97b034bb8de15609002/charset_normalizer-3.4.2-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:65c981bdbd3f57670af8b59777cbfae75364b483fa8a9f420f08094531d54a01", size = 155876, upload-time = "2025-05-02T08:34:02.527Z" }, - { url = "https://files.pythonhosted.org/packages/53/cb/6fa0ccf941a069adce3edb8a1e430bc80e4929f4d43b5140fdf8628bdf7d/charset_normalizer-3.4.2-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:3c21d4fca343c805a52c0c78edc01e3477f6dd1ad7c47653241cf2a206d4fc58", size = 153186, upload-time = "2025-05-02T08:34:04.481Z" }, - { url = "https://files.pythonhosted.org/packages/ac/c6/80b93fabc626b75b1665ffe405e28c3cef0aae9237c5c05f15955af4edd8/charset_normalizer-3.4.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:dc7039885fa1baf9be153a0626e337aa7ec8bf96b0128605fb0d77788ddc1681", size = 148007, upload-time = "2025-05-02T08:34:06.888Z" }, - { url = "https://files.pythonhosted.org/packages/41/eb/c7367ac326a2628e4f05b5c737c86fe4a8eb3ecc597a4243fc65720b3eeb/charset_normalizer-3.4.2-cp38-cp38-win32.whl", hash = "sha256:8272b73e1c5603666618805fe821edba66892e2870058c94c53147602eab29c7", size = 97923, upload-time = "2025-05-02T08:34:08.792Z" }, - { url = "https://files.pythonhosted.org/packages/7c/02/1c82646582ccf2c757fa6af69b1a3ea88744b8d2b4ab93b7686b2533e023/charset_normalizer-3.4.2-cp38-cp38-win_amd64.whl", hash = "sha256:70f7172939fdf8790425ba31915bfbe8335030f05b9913d7ae00a87d4395620a", size = 105020, upload-time = "2025-05-02T08:34:10.6Z" }, - { url = "https://files.pythonhosted.org/packages/28/f8/dfb01ff6cc9af38552c69c9027501ff5a5117c4cc18dcd27cb5259fa1888/charset_normalizer-3.4.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:005fa3432484527f9732ebd315da8da8001593e2cf46a3d817669f062c3d9ed4", size = 201671, upload-time = "2025-05-02T08:34:12.696Z" }, - { url = "https://files.pythonhosted.org/packages/32/fb/74e26ee556a9dbfe3bd264289b67be1e6d616329403036f6507bb9f3f29c/charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e92fca20c46e9f5e1bb485887d074918b13543b1c2a1185e69bb8d17ab6236a7", size = 144744, upload-time = "2025-05-02T08:34:14.665Z" }, - { url = "https://files.pythonhosted.org/packages/ad/06/8499ee5aa7addc6f6d72e068691826ff093329fe59891e83b092ae4c851c/charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:50bf98d5e563b83cc29471fa114366e6806bc06bc7a25fd59641e41445327836", size = 154993, 
upload-time = "2025-05-02T08:34:17.134Z" }, - { url = "https://files.pythonhosted.org/packages/f1/a2/5e4c187680728219254ef107a6949c60ee0e9a916a5dadb148c7ae82459c/charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:721c76e84fe669be19c5791da68232ca2e05ba5185575086e384352e2c309597", size = 147382, upload-time = "2025-05-02T08:34:19.081Z" }, - { url = "https://files.pythonhosted.org/packages/4c/fe/56aca740dda674f0cc1ba1418c4d84534be51f639b5f98f538b332dc9a95/charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82d8fd25b7f4675d0c47cf95b594d4e7b158aca33b76aa63d07186e13c0e0ab7", size = 149536, upload-time = "2025-05-02T08:34:21.073Z" }, - { url = "https://files.pythonhosted.org/packages/53/13/db2e7779f892386b589173dd689c1b1e304621c5792046edd8a978cbf9e0/charset_normalizer-3.4.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3daeac64d5b371dea99714f08ffc2c208522ec6b06fbc7866a450dd446f5c0f", size = 151349, upload-time = "2025-05-02T08:34:23.193Z" }, - { url = "https://files.pythonhosted.org/packages/69/35/e52ab9a276186f729bce7a0638585d2982f50402046e4b0faa5d2c3ef2da/charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:dccab8d5fa1ef9bfba0590ecf4d46df048d18ffe3eec01eeb73a42e0d9e7a8ba", size = 146365, upload-time = "2025-05-02T08:34:25.187Z" }, - { url = "https://files.pythonhosted.org/packages/a6/d8/af7333f732fc2e7635867d56cb7c349c28c7094910c72267586947561b4b/charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:aaf27faa992bfee0264dc1f03f4c75e9fcdda66a519db6b957a3f826e285cf12", size = 154499, upload-time = "2025-05-02T08:34:27.359Z" }, - { url = "https://files.pythonhosted.org/packages/7a/3d/a5b2e48acef264d71e036ff30bcc49e51bde80219bb628ba3e00cf59baac/charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:eb30abc20df9ab0814b5a2524f23d75dcf83cde762c161917a2b4b7b55b1e518", size = 157735, upload-time = "2025-05-02T08:34:29.798Z" }, - { url = "https://files.pythonhosted.org/packages/85/d8/23e2c112532a29f3eef374375a8684a4f3b8e784f62b01da931186f43494/charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:c72fbbe68c6f32f251bdc08b8611c7b3060612236e960ef848e0a517ddbe76c5", size = 154786, upload-time = "2025-05-02T08:34:31.858Z" }, - { url = "https://files.pythonhosted.org/packages/c7/57/93e0169f08ecc20fe82d12254a200dfaceddc1c12a4077bf454ecc597e33/charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:982bb1e8b4ffda883b3d0a521e23abcd6fd17418f6d2c4118d257a10199c0ce3", size = 150203, upload-time = "2025-05-02T08:34:33.88Z" }, - { url = "https://files.pythonhosted.org/packages/2c/9d/9bf2b005138e7e060d7ebdec7503d0ef3240141587651f4b445bdf7286c2/charset_normalizer-3.4.2-cp39-cp39-win32.whl", hash = "sha256:43e0933a0eff183ee85833f341ec567c0980dae57c464d8a508e1b2ceb336471", size = 98436, upload-time = "2025-05-02T08:34:35.907Z" }, - { url = "https://files.pythonhosted.org/packages/6d/24/5849d46cf4311bbf21b424c443b09b459f5b436b1558c04e45dbb7cc478b/charset_normalizer-3.4.2-cp39-cp39-win_amd64.whl", hash = "sha256:d11b54acf878eef558599658b0ffca78138c8c3655cf4f3a4a673c437e67732e", size = 105772, upload-time = "2025-05-02T08:34:37.935Z" }, - { url = "https://files.pythonhosted.org/packages/20/94/c5790835a017658cbfabd07f3bfb549140c3ac458cfc196323996b10095a/charset_normalizer-3.4.2-py3-none-any.whl", hash = 
"sha256:7f56930ab0abd1c45cd15be65cc741c28b1c9a34876ce8c17a2fa107810c0af0", size = 52626, upload-time = "2025-05-02T08:34:40.053Z" }, -] - -[[package]] -name = "click" -version = "8.1.8" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version == '3.9.*'", - "python_full_version < '3.9'", -] -dependencies = [ - { name = "colorama", marker = "python_full_version < '3.10' and sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593, upload-time = "2024-12-21T18:38:44.339Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188, upload-time = "2024-12-21T18:38:41.666Z" }, -] - -[[package]] -name = "click" -version = "8.2.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] -dependencies = [ - { name = "colorama", marker = "python_full_version >= '3.10' and sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/60/6c/8ca2efa64cf75a977a0d7fac081354553ebe483345c734fb6b6515d96bbc/click-8.2.1.tar.gz", hash = "sha256:27c491cc05d968d271d5a1db13e3b5a184636d9d930f148c50b038f0d0646202", size = 286342, upload-time = "2025-05-20T23:19:49.832Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl", hash = "sha256:61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b", size = 102215, upload-time = "2025-05-20T23:19:47.796Z" }, -] - -[[package]] -name = "colorama" -version = "0.4.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, -] - -[[package]] -name = "exceptiongroup" -version = "1.3.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions", version = "4.13.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "typing-extensions", version = "4.14.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/36/f4/c6e662dade71f56cd2f3735141b265c3c79293c109549c1e6933b0651ffc/exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10", size = 16674, upload-time = "2025-05-10T17:42:49.33Z" }, -] - -[[package]] -name = "idna" -version = "3.10" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload-time = "2024-09-15T18:07:39.745Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" }, -] - -[[package]] -name = "iniconfig" -version = "2.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793, upload-time = "2025-03-19T20:09:59.721Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050, upload-time = "2025-03-19T20:10:01.071Z" }, -] - -[[package]] -name = "loguru" -version = "0.7.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, - { name = "win32-setctime", marker = "sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/3a/05/a1dae3dffd1116099471c643b8924f5aa6524411dc6c63fdae648c4f1aca/loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6", size = 63559, upload-time = "2024-12-06T11:20:56.608Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c", size = 61595, upload-time = "2024-12-06T11:20:54.538Z" }, -] - -[[package]] -name = "mypy" -version = "1.14.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "mypy-extensions", marker = "python_full_version < '3.9'" }, - { name = "tomli", marker = "python_full_version < '3.9'" }, - { name = "typing-extensions", version = "4.13.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b9/eb/2c92d8ea1e684440f54fa49ac5d9a5f19967b7b472a281f419e69a8d228e/mypy-1.14.1.tar.gz", hash = "sha256:7ec88144fe9b510e8475ec2f5f251992690fcf89ccb4500b214b4226abcd32d6", size = 3216051, upload-time = "2024-12-30T16:39:07.335Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9b/7a/87ae2adb31d68402da6da1e5f30c07ea6063e9f09b5e7cfc9dfa44075e74/mypy-1.14.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:52686e37cf13d559f668aa398dd7ddf1f92c5d613e4f8cb262be2fb4fedb0fcb", size = 11211002, upload-time 
= "2024-12-30T16:37:22.435Z" }, - { url = "https://files.pythonhosted.org/packages/e1/23/eada4c38608b444618a132be0d199b280049ded278b24cbb9d3fc59658e4/mypy-1.14.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1fb545ca340537d4b45d3eecdb3def05e913299ca72c290326be19b3804b39c0", size = 10358400, upload-time = "2024-12-30T16:37:53.526Z" }, - { url = "https://files.pythonhosted.org/packages/43/c9/d6785c6f66241c62fd2992b05057f404237deaad1566545e9f144ced07f5/mypy-1.14.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:90716d8b2d1f4cd503309788e51366f07c56635a3309b0f6a32547eaaa36a64d", size = 12095172, upload-time = "2024-12-30T16:37:50.332Z" }, - { url = "https://files.pythonhosted.org/packages/c3/62/daa7e787770c83c52ce2aaf1a111eae5893de9e004743f51bfcad9e487ec/mypy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2ae753f5c9fef278bcf12e1a564351764f2a6da579d4a81347e1d5a15819997b", size = 12828732, upload-time = "2024-12-30T16:37:29.96Z" }, - { url = "https://files.pythonhosted.org/packages/1b/a2/5fb18318a3637f29f16f4e41340b795da14f4751ef4f51c99ff39ab62e52/mypy-1.14.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e0fe0f5feaafcb04505bcf439e991c6d8f1bf8b15f12b05feeed96e9e7bf1427", size = 13012197, upload-time = "2024-12-30T16:38:05.037Z" }, - { url = "https://files.pythonhosted.org/packages/28/99/e153ce39105d164b5f02c06c35c7ba958aaff50a2babba7d080988b03fe7/mypy-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:7d54bd85b925e501c555a3227f3ec0cfc54ee8b6930bd6141ec872d1c572f81f", size = 9780836, upload-time = "2024-12-30T16:37:19.726Z" }, - { url = "https://files.pythonhosted.org/packages/da/11/a9422850fd506edbcdc7f6090682ecceaf1f87b9dd847f9df79942da8506/mypy-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f995e511de847791c3b11ed90084a7a0aafdc074ab88c5a9711622fe4751138c", size = 11120432, upload-time = "2024-12-30T16:37:11.533Z" }, - { url = "https://files.pythonhosted.org/packages/b6/9e/47e450fd39078d9c02d620545b2cb37993a8a8bdf7db3652ace2f80521ca/mypy-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d64169ec3b8461311f8ce2fd2eb5d33e2d0f2c7b49116259c51d0d96edee48d1", size = 10279515, upload-time = "2024-12-30T16:37:40.724Z" }, - { url = "https://files.pythonhosted.org/packages/01/b5/6c8d33bd0f851a7692a8bfe4ee75eb82b6983a3cf39e5e32a5d2a723f0c1/mypy-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ba24549de7b89b6381b91fbc068d798192b1b5201987070319889e93038967a8", size = 12025791, upload-time = "2024-12-30T16:36:58.73Z" }, - { url = "https://files.pythonhosted.org/packages/f0/4c/e10e2c46ea37cab5c471d0ddaaa9a434dc1d28650078ac1b56c2d7b9b2e4/mypy-1.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:183cf0a45457d28ff9d758730cd0210419ac27d4d3f285beda038c9083363b1f", size = 12749203, upload-time = "2024-12-30T16:37:03.741Z" }, - { url = "https://files.pythonhosted.org/packages/88/55/beacb0c69beab2153a0f57671ec07861d27d735a0faff135a494cd4f5020/mypy-1.14.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f2a0ecc86378f45347f586e4163d1769dd81c5a223d577fe351f26b179e148b1", size = 12885900, upload-time = "2024-12-30T16:37:57.948Z" }, - { url = "https://files.pythonhosted.org/packages/a2/75/8c93ff7f315c4d086a2dfcde02f713004357d70a163eddb6c56a6a5eff40/mypy-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:ad3301ebebec9e8ee7135d8e3109ca76c23752bac1e717bc84cd3836b4bf3eae", 
size = 9777869, upload-time = "2024-12-30T16:37:33.428Z" }, - { url = "https://files.pythonhosted.org/packages/43/1b/b38c079609bb4627905b74fc6a49849835acf68547ac33d8ceb707de5f52/mypy-1.14.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:30ff5ef8519bbc2e18b3b54521ec319513a26f1bba19a7582e7b1f58a6e69f14", size = 11266668, upload-time = "2024-12-30T16:38:02.211Z" }, - { url = "https://files.pythonhosted.org/packages/6b/75/2ed0d2964c1ffc9971c729f7a544e9cd34b2cdabbe2d11afd148d7838aa2/mypy-1.14.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:cb9f255c18052343c70234907e2e532bc7e55a62565d64536dbc7706a20b78b9", size = 10254060, upload-time = "2024-12-30T16:37:46.131Z" }, - { url = "https://files.pythonhosted.org/packages/a1/5f/7b8051552d4da3c51bbe8fcafffd76a6823779101a2b198d80886cd8f08e/mypy-1.14.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b4e3413e0bddea671012b063e27591b953d653209e7a4fa5e48759cda77ca11", size = 11933167, upload-time = "2024-12-30T16:37:43.534Z" }, - { url = "https://files.pythonhosted.org/packages/04/90/f53971d3ac39d8b68bbaab9a4c6c58c8caa4d5fd3d587d16f5927eeeabe1/mypy-1.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:553c293b1fbdebb6c3c4030589dab9fafb6dfa768995a453d8a5d3b23784af2e", size = 12864341, upload-time = "2024-12-30T16:37:36.249Z" }, - { url = "https://files.pythonhosted.org/packages/03/d2/8bc0aeaaf2e88c977db41583559319f1821c069e943ada2701e86d0430b7/mypy-1.14.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fad79bfe3b65fe6a1efaed97b445c3d37f7be9fdc348bdb2d7cac75579607c89", size = 12972991, upload-time = "2024-12-30T16:37:06.743Z" }, - { url = "https://files.pythonhosted.org/packages/6f/17/07815114b903b49b0f2cf7499f1c130e5aa459411596668267535fe9243c/mypy-1.14.1-cp312-cp312-win_amd64.whl", hash = "sha256:8fa2220e54d2946e94ab6dbb3ba0a992795bd68b16dc852db33028df2b00191b", size = 9879016, upload-time = "2024-12-30T16:37:15.02Z" }, - { url = "https://files.pythonhosted.org/packages/9e/15/bb6a686901f59222275ab228453de741185f9d54fecbaacec041679496c6/mypy-1.14.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:92c3ed5afb06c3a8e188cb5da4984cab9ec9a77ba956ee419c68a388b4595255", size = 11252097, upload-time = "2024-12-30T16:37:25.144Z" }, - { url = "https://files.pythonhosted.org/packages/f8/b3/8b0f74dfd072c802b7fa368829defdf3ee1566ba74c32a2cb2403f68024c/mypy-1.14.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:dbec574648b3e25f43d23577309b16534431db4ddc09fda50841f1e34e64ed34", size = 10239728, upload-time = "2024-12-30T16:38:08.634Z" }, - { url = "https://files.pythonhosted.org/packages/c5/9b/4fd95ab20c52bb5b8c03cc49169be5905d931de17edfe4d9d2986800b52e/mypy-1.14.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8c6d94b16d62eb3e947281aa7347d78236688e21081f11de976376cf010eb31a", size = 11924965, upload-time = "2024-12-30T16:38:12.132Z" }, - { url = "https://files.pythonhosted.org/packages/56/9d/4a236b9c57f5d8f08ed346914b3f091a62dd7e19336b2b2a0d85485f82ff/mypy-1.14.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d4b19b03fdf54f3c5b2fa474c56b4c13c9dbfb9a2db4370ede7ec11a2c5927d9", size = 12867660, upload-time = "2024-12-30T16:38:17.342Z" }, - { url = "https://files.pythonhosted.org/packages/40/88/a61a5497e2f68d9027de2bb139c7bb9abaeb1be1584649fa9d807f80a338/mypy-1.14.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:0c911fde686394753fff899c409fd4e16e9b294c24bfd5e1ea4675deae1ac6fd", size = 12969198, upload-time = "2024-12-30T16:38:32.839Z" }, - { url = "https://files.pythonhosted.org/packages/54/da/3d6fc5d92d324701b0c23fb413c853892bfe0e1dbe06c9138037d459756b/mypy-1.14.1-cp313-cp313-win_amd64.whl", hash = "sha256:8b21525cb51671219f5307be85f7e646a153e5acc656e5cebf64bfa076c50107", size = 9885276, upload-time = "2024-12-30T16:38:20.828Z" }, - { url = "https://files.pythonhosted.org/packages/39/02/1817328c1372be57c16148ce7d2bfcfa4a796bedaed897381b1aad9b267c/mypy-1.14.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7084fb8f1128c76cd9cf68fe5971b37072598e7c31b2f9f95586b65c741a9d31", size = 11143050, upload-time = "2024-12-30T16:38:29.743Z" }, - { url = "https://files.pythonhosted.org/packages/b9/07/99db9a95ece5e58eee1dd87ca456a7e7b5ced6798fd78182c59c35a7587b/mypy-1.14.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8f845a00b4f420f693f870eaee5f3e2692fa84cc8514496114649cfa8fd5e2c6", size = 10321087, upload-time = "2024-12-30T16:38:14.739Z" }, - { url = "https://files.pythonhosted.org/packages/9a/eb/85ea6086227b84bce79b3baf7f465b4732e0785830726ce4a51528173b71/mypy-1.14.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:44bf464499f0e3a2d14d58b54674dee25c031703b2ffc35064bd0df2e0fac319", size = 12066766, upload-time = "2024-12-30T16:38:47.038Z" }, - { url = "https://files.pythonhosted.org/packages/4b/bb/f01bebf76811475d66359c259eabe40766d2f8ac8b8250d4e224bb6df379/mypy-1.14.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c99f27732c0b7dc847adb21c9d47ce57eb48fa33a17bc6d7d5c5e9f9e7ae5bac", size = 12787111, upload-time = "2024-12-30T16:39:02.444Z" }, - { url = "https://files.pythonhosted.org/packages/2f/c9/84837ff891edcb6dcc3c27d85ea52aab0c4a34740ff5f0ccc0eb87c56139/mypy-1.14.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:bce23c7377b43602baa0bd22ea3265c49b9ff0b76eb315d6c34721af4cdf1d9b", size = 12974331, upload-time = "2024-12-30T16:38:23.849Z" }, - { url = "https://files.pythonhosted.org/packages/84/5f/901e18464e6a13f8949b4909535be3fa7f823291b8ab4e4b36cfe57d6769/mypy-1.14.1-cp38-cp38-win_amd64.whl", hash = "sha256:8edc07eeade7ebc771ff9cf6b211b9a7d93687ff892150cb5692e4f4272b0837", size = 9763210, upload-time = "2024-12-30T16:38:36.299Z" }, - { url = "https://files.pythonhosted.org/packages/ca/1f/186d133ae2514633f8558e78cd658070ba686c0e9275c5a5c24a1e1f0d67/mypy-1.14.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3888a1816d69f7ab92092f785a462944b3ca16d7c470d564165fe703b0970c35", size = 11200493, upload-time = "2024-12-30T16:38:26.935Z" }, - { url = "https://files.pythonhosted.org/packages/af/fc/4842485d034e38a4646cccd1369f6b1ccd7bc86989c52770d75d719a9941/mypy-1.14.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:46c756a444117c43ee984bd055db99e498bc613a70bbbc120272bd13ca579fbc", size = 10357702, upload-time = "2024-12-30T16:38:50.623Z" }, - { url = "https://files.pythonhosted.org/packages/b4/e6/457b83f2d701e23869cfec013a48a12638f75b9d37612a9ddf99072c1051/mypy-1.14.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:27fc248022907e72abfd8e22ab1f10e903915ff69961174784a3900a8cba9ad9", size = 12091104, upload-time = "2024-12-30T16:38:53.735Z" }, - { url = "https://files.pythonhosted.org/packages/f1/bf/76a569158db678fee59f4fd30b8e7a0d75bcbaeef49edd882a0d63af6d66/mypy-1.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", 
hash = "sha256:499d6a72fb7e5de92218db961f1a66d5f11783f9ae549d214617edab5d4dbdbb", size = 12830167, upload-time = "2024-12-30T16:38:56.437Z" }, - { url = "https://files.pythonhosted.org/packages/43/bc/0bc6b694b3103de9fed61867f1c8bd33336b913d16831431e7cb48ef1c92/mypy-1.14.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:57961db9795eb566dc1d1b4e9139ebc4c6b0cb6e7254ecde69d1552bf7613f60", size = 13013834, upload-time = "2024-12-30T16:38:59.204Z" }, - { url = "https://files.pythonhosted.org/packages/b0/79/5f5ec47849b6df1e6943d5fd8e6632fbfc04b4fd4acfa5a5a9535d11b4e2/mypy-1.14.1-cp39-cp39-win_amd64.whl", hash = "sha256:07ba89fdcc9451f2ebb02853deb6aaaa3d2239a236669a63ab3801bbf923ef5c", size = 9781231, upload-time = "2024-12-30T16:39:05.124Z" }, - { url = "https://files.pythonhosted.org/packages/a0/b5/32dd67b69a16d088e533962e5044e51004176a9952419de0370cdaead0f8/mypy-1.14.1-py3-none-any.whl", hash = "sha256:b66a60cc4073aeb8ae00057f9c1f64d49e90f918fbcef9a977eb121da8b8f1d1", size = 2752905, upload-time = "2024-12-30T16:38:42.021Z" }, -] - -[[package]] -name = "mypy" -version = "1.17.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] -dependencies = [ - { name = "mypy-extensions", marker = "python_full_version >= '3.9'" }, - { name = "pathspec", marker = "python_full_version >= '3.9'" }, - { name = "tomli", marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, - { name = "typing-extensions", version = "4.14.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/8e/22/ea637422dedf0bf36f3ef238eab4e455e2a0dcc3082b5cc067615347ab8e/mypy-1.17.1.tar.gz", hash = "sha256:25e01ec741ab5bb3eec8ba9cdb0f769230368a22c959c4937360efb89b7e9f01", size = 3352570, upload-time = "2025-07-31T07:54:19.204Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/77/a9/3d7aa83955617cdf02f94e50aab5c830d205cfa4320cf124ff64acce3a8e/mypy-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3fbe6d5555bf608c47203baa3e72dbc6ec9965b3d7c318aa9a4ca76f465bd972", size = 11003299, upload-time = "2025-07-31T07:54:06.425Z" }, - { url = "https://files.pythonhosted.org/packages/83/e8/72e62ff837dd5caaac2b4a5c07ce769c8e808a00a65e5d8f94ea9c6f20ab/mypy-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:80ef5c058b7bce08c83cac668158cb7edea692e458d21098c7d3bce35a5d43e7", size = 10125451, upload-time = "2025-07-31T07:53:52.974Z" }, - { url = "https://files.pythonhosted.org/packages/7d/10/f3f3543f6448db11881776f26a0ed079865926b0c841818ee22de2c6bbab/mypy-1.17.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c4a580f8a70c69e4a75587bd925d298434057fe2a428faaf927ffe6e4b9a98df", size = 11916211, upload-time = "2025-07-31T07:53:18.879Z" }, - { url = "https://files.pythonhosted.org/packages/06/bf/63e83ed551282d67bb3f7fea2cd5561b08d2bb6eb287c096539feb5ddbc5/mypy-1.17.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dd86bb649299f09d987a2eebb4d52d10603224500792e1bee18303bbcc1ce390", size = 12652687, upload-time = "2025-07-31T07:53:30.544Z" }, - { url = "https://files.pythonhosted.org/packages/69/66/68f2eeef11facf597143e85b694a161868b3b006a5fbad50e09ea117ef24/mypy-1.17.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = 
"sha256:a76906f26bd8d51ea9504966a9c25419f2e668f012e0bdf3da4ea1526c534d94", size = 12896322, upload-time = "2025-07-31T07:53:50.74Z" }, - { url = "https://files.pythonhosted.org/packages/a3/87/8e3e9c2c8bd0d7e071a89c71be28ad088aaecbadf0454f46a540bda7bca6/mypy-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:e79311f2d904ccb59787477b7bd5d26f3347789c06fcd7656fa500875290264b", size = 9507962, upload-time = "2025-07-31T07:53:08.431Z" }, - { url = "https://files.pythonhosted.org/packages/46/cf/eadc80c4e0a70db1c08921dcc220357ba8ab2faecb4392e3cebeb10edbfa/mypy-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ad37544be07c5d7fba814eb370e006df58fed8ad1ef33ed1649cb1889ba6ff58", size = 10921009, upload-time = "2025-07-31T07:53:23.037Z" }, - { url = "https://files.pythonhosted.org/packages/5d/c1/c869d8c067829ad30d9bdae051046561552516cfb3a14f7f0347b7d973ee/mypy-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:064e2ff508e5464b4bd807a7c1625bc5047c5022b85c70f030680e18f37273a5", size = 10047482, upload-time = "2025-07-31T07:53:26.151Z" }, - { url = "https://files.pythonhosted.org/packages/98/b9/803672bab3fe03cee2e14786ca056efda4bb511ea02dadcedde6176d06d0/mypy-1.17.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:70401bbabd2fa1aa7c43bb358f54037baf0586f41e83b0ae67dd0534fc64edfd", size = 11832883, upload-time = "2025-07-31T07:53:47.948Z" }, - { url = "https://files.pythonhosted.org/packages/88/fb/fcdac695beca66800918c18697b48833a9a6701de288452b6715a98cfee1/mypy-1.17.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e92bdc656b7757c438660f775f872a669b8ff374edc4d18277d86b63edba6b8b", size = 12566215, upload-time = "2025-07-31T07:54:04.031Z" }, - { url = "https://files.pythonhosted.org/packages/7f/37/a932da3d3dace99ee8eb2043b6ab03b6768c36eb29a02f98f46c18c0da0e/mypy-1.17.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c1fdf4abb29ed1cb091cf432979e162c208a5ac676ce35010373ff29247bcad5", size = 12751956, upload-time = "2025-07-31T07:53:36.263Z" }, - { url = "https://files.pythonhosted.org/packages/8c/cf/6438a429e0f2f5cab8bc83e53dbebfa666476f40ee322e13cac5e64b79e7/mypy-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:ff2933428516ab63f961644bc49bc4cbe42bbffb2cd3b71cc7277c07d16b1a8b", size = 9507307, upload-time = "2025-07-31T07:53:59.734Z" }, - { url = "https://files.pythonhosted.org/packages/17/a2/7034d0d61af8098ec47902108553122baa0f438df8a713be860f7407c9e6/mypy-1.17.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:69e83ea6553a3ba79c08c6e15dbd9bfa912ec1e493bf75489ef93beb65209aeb", size = 11086295, upload-time = "2025-07-31T07:53:28.124Z" }, - { url = "https://files.pythonhosted.org/packages/14/1f/19e7e44b594d4b12f6ba8064dbe136505cec813549ca3e5191e40b1d3cc2/mypy-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1b16708a66d38abb1e6b5702f5c2c87e133289da36f6a1d15f6a5221085c6403", size = 10112355, upload-time = "2025-07-31T07:53:21.121Z" }, - { url = "https://files.pythonhosted.org/packages/5b/69/baa33927e29e6b4c55d798a9d44db5d394072eef2bdc18c3e2048c9ed1e9/mypy-1.17.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:89e972c0035e9e05823907ad5398c5a73b9f47a002b22359b177d40bdaee7056", size = 11875285, upload-time = "2025-07-31T07:53:55.293Z" }, - { url = 
"https://files.pythonhosted.org/packages/90/13/f3a89c76b0a41e19490b01e7069713a30949d9a6c147289ee1521bcea245/mypy-1.17.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:03b6d0ed2b188e35ee6d5c36b5580cffd6da23319991c49ab5556c023ccf1341", size = 12737895, upload-time = "2025-07-31T07:53:43.623Z" }, - { url = "https://files.pythonhosted.org/packages/23/a1/c4ee79ac484241301564072e6476c5a5be2590bc2e7bfd28220033d2ef8f/mypy-1.17.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c837b896b37cd103570d776bda106eabb8737aa6dd4f248451aecf53030cdbeb", size = 12931025, upload-time = "2025-07-31T07:54:17.125Z" }, - { url = "https://files.pythonhosted.org/packages/89/b8/7409477be7919a0608900e6320b155c72caab4fef46427c5cc75f85edadd/mypy-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:665afab0963a4b39dff7c1fa563cc8b11ecff7910206db4b2e64dd1ba25aed19", size = 9584664, upload-time = "2025-07-31T07:54:12.842Z" }, - { url = "https://files.pythonhosted.org/packages/5b/82/aec2fc9b9b149f372850291827537a508d6c4d3664b1750a324b91f71355/mypy-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:93378d3203a5c0800c6b6d850ad2f19f7a3cdf1a3701d3416dbf128805c6a6a7", size = 11075338, upload-time = "2025-07-31T07:53:38.873Z" }, - { url = "https://files.pythonhosted.org/packages/07/ac/ee93fbde9d2242657128af8c86f5d917cd2887584cf948a8e3663d0cd737/mypy-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:15d54056f7fe7a826d897789f53dd6377ec2ea8ba6f776dc83c2902b899fee81", size = 10113066, upload-time = "2025-07-31T07:54:14.707Z" }, - { url = "https://files.pythonhosted.org/packages/5a/68/946a1e0be93f17f7caa56c45844ec691ca153ee8b62f21eddda336a2d203/mypy-1.17.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:209a58fed9987eccc20f2ca94afe7257a8f46eb5df1fb69958650973230f91e6", size = 11875473, upload-time = "2025-07-31T07:53:14.504Z" }, - { url = "https://files.pythonhosted.org/packages/9f/0f/478b4dce1cb4f43cf0f0d00fba3030b21ca04a01b74d1cd272a528cf446f/mypy-1.17.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:099b9a5da47de9e2cb5165e581f158e854d9e19d2e96b6698c0d64de911dd849", size = 12744296, upload-time = "2025-07-31T07:53:03.896Z" }, - { url = "https://files.pythonhosted.org/packages/ca/70/afa5850176379d1b303f992a828de95fc14487429a7139a4e0bdd17a8279/mypy-1.17.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fa6ffadfbe6994d724c5a1bb6123a7d27dd68fc9c059561cd33b664a79578e14", size = 12914657, upload-time = "2025-07-31T07:54:08.576Z" }, - { url = "https://files.pythonhosted.org/packages/53/f9/4a83e1c856a3d9c8f6edaa4749a4864ee98486e9b9dbfbc93842891029c2/mypy-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:9a2b7d9180aed171f033c9f2fc6c204c1245cf60b0cb61cf2e7acc24eea78e0a", size = 9593320, upload-time = "2025-07-31T07:53:01.341Z" }, - { url = "https://files.pythonhosted.org/packages/38/56/79c2fac86da57c7d8c48622a05873eaab40b905096c33597462713f5af90/mypy-1.17.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:15a83369400454c41ed3a118e0cc58bd8123921a602f385cb6d6ea5df050c733", size = 11040037, upload-time = "2025-07-31T07:54:10.942Z" }, - { url = "https://files.pythonhosted.org/packages/4d/c3/adabe6ff53638e3cad19e3547268482408323b1e68bf082c9119000cd049/mypy-1.17.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:55b918670f692fc9fba55c3298d8a3beae295c5cded0a55dccdc5bbead814acd", size = 10131550, upload-time = "2025-07-31T07:53:41.307Z" }, - { url = 
"https://files.pythonhosted.org/packages/b8/c5/2e234c22c3bdeb23a7817af57a58865a39753bde52c74e2c661ee0cfc640/mypy-1.17.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:62761474061feef6f720149d7ba876122007ddc64adff5ba6f374fda35a018a0", size = 11872963, upload-time = "2025-07-31T07:53:16.878Z" }, - { url = "https://files.pythonhosted.org/packages/ab/26/c13c130f35ca8caa5f2ceab68a247775648fdcd6c9a18f158825f2bc2410/mypy-1.17.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c49562d3d908fd49ed0938e5423daed8d407774a479b595b143a3d7f87cdae6a", size = 12710189, upload-time = "2025-07-31T07:54:01.962Z" }, - { url = "https://files.pythonhosted.org/packages/82/df/c7d79d09f6de8383fe800521d066d877e54d30b4fb94281c262be2df84ef/mypy-1.17.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:397fba5d7616a5bc60b45c7ed204717eaddc38f826e3645402c426057ead9a91", size = 12900322, upload-time = "2025-07-31T07:53:10.551Z" }, - { url = "https://files.pythonhosted.org/packages/b8/98/3d5a48978b4f708c55ae832619addc66d677f6dc59f3ebad71bae8285ca6/mypy-1.17.1-cp314-cp314-win_amd64.whl", hash = "sha256:9d6b20b97d373f41617bd0708fd46aa656059af57f2ef72aa8c7d6a2b73b74ed", size = 9751879, upload-time = "2025-07-31T07:52:56.683Z" }, - { url = "https://files.pythonhosted.org/packages/29/cb/673e3d34e5d8de60b3a61f44f80150a738bff568cd6b7efb55742a605e98/mypy-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5d1092694f166a7e56c805caaf794e0585cabdbf1df36911c414e4e9abb62ae9", size = 10992466, upload-time = "2025-07-31T07:53:57.574Z" }, - { url = "https://files.pythonhosted.org/packages/0c/d0/fe1895836eea3a33ab801561987a10569df92f2d3d4715abf2cfeaa29cb2/mypy-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:79d44f9bfb004941ebb0abe8eff6504223a9c1ac51ef967d1263c6572bbebc99", size = 10117638, upload-time = "2025-07-31T07:53:34.256Z" }, - { url = "https://files.pythonhosted.org/packages/97/f3/514aa5532303aafb95b9ca400a31054a2bd9489de166558c2baaeea9c522/mypy-1.17.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b01586eed696ec905e61bd2568f48740f7ac4a45b3a468e6423a03d3788a51a8", size = 11915673, upload-time = "2025-07-31T07:52:59.361Z" }, - { url = "https://files.pythonhosted.org/packages/ab/c3/c0805f0edec96fe8e2c048b03769a6291523d509be8ee7f56ae922fa3882/mypy-1.17.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:43808d9476c36b927fbcd0b0255ce75efe1b68a080154a38ae68a7e62de8f0f8", size = 12649022, upload-time = "2025-07-31T07:53:45.92Z" }, - { url = "https://files.pythonhosted.org/packages/45/3e/d646b5a298ada21a8512fa7e5531f664535a495efa672601702398cea2b4/mypy-1.17.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:feb8cc32d319edd5859da2cc084493b3e2ce5e49a946377663cc90f6c15fb259", size = 12895536, upload-time = "2025-07-31T07:53:06.17Z" }, - { url = "https://files.pythonhosted.org/packages/14/55/e13d0dcd276975927d1f4e9e2ec4fd409e199f01bdc671717e673cc63a22/mypy-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d7598cf74c3e16539d4e2f0b8d8c318e00041553d83d4861f87c7a72e95ac24d", size = 9512564, upload-time = "2025-07-31T07:53:12.346Z" }, - { url = "https://files.pythonhosted.org/packages/1d/f3/8fcd2af0f5b806f6cf463efaffd3c9548a28f84220493ecd38d127b6b66d/mypy-1.17.1-py3-none-any.whl", hash = "sha256:a9f52c0351c21fe24c21d8c0eb1f62967b262d6729393397b6f443c3b773c3b9", size = 2283411, upload-time = "2025-07-31T07:53:24.664Z" }, -] - 
-[[package]] -name = "mypy-extensions" -version = "1.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a2/6e/371856a3fb9d31ca8dac321cda606860fa4548858c0cc45d9d1d4ca2628b/mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558", size = 6343, upload-time = "2025-04-22T14:54:24.164Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963, upload-time = "2025-04-22T14:54:22.983Z" }, -] - -[[package]] -name = "numpy" -version = "1.24.4" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/a4/9b/027bec52c633f6556dba6b722d9a0befb40498b9ceddd29cbe67a45a127c/numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463", size = 10911229, upload-time = "2023-06-26T13:39:33.218Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6b/80/6cdfb3e275d95155a34659163b83c09e3a3ff9f1456880bec6cc63d71083/numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64", size = 19789140, upload-time = "2023-06-26T13:22:33.184Z" }, - { url = "https://files.pythonhosted.org/packages/64/5f/3f01d753e2175cfade1013eea08db99ba1ee4bdb147ebcf3623b75d12aa7/numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1", size = 13854297, upload-time = "2023-06-26T13:22:59.541Z" }, - { url = "https://files.pythonhosted.org/packages/5a/b3/2f9c21d799fa07053ffa151faccdceeb69beec5a010576b8991f614021f7/numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4", size = 13995611, upload-time = "2023-06-26T13:23:22.167Z" }, - { url = "https://files.pythonhosted.org/packages/10/be/ae5bf4737cb79ba437879915791f6f26d92583c738d7d960ad94e5c36adf/numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6", size = 17282357, upload-time = "2023-06-26T13:23:51.446Z" }, - { url = "https://files.pythonhosted.org/packages/c0/64/908c1087be6285f40e4b3e79454552a701664a079321cff519d8c7051d06/numpy-1.24.4-cp310-cp310-win32.whl", hash = "sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc", size = 12429222, upload-time = "2023-06-26T13:24:13.849Z" }, - { url = "https://files.pythonhosted.org/packages/22/55/3d5a7c1142e0d9329ad27cece17933b0e2ab4e54ddc5c1861fbfeb3f7693/numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e", size = 14841514, upload-time = "2023-06-26T13:24:38.129Z" }, - { url = "https://files.pythonhosted.org/packages/a9/cc/5ed2280a27e5dab12994c884f1f4d8c3bd4d885d02ae9e52a9d213a6a5e2/numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810", size = 19775508, upload-time = "2023-06-26T13:25:08.882Z" }, - { url = "https://files.pythonhosted.org/packages/c0/bc/77635c657a3668cf652806210b8662e1aff84b818a55ba88257abf6637a8/numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash 
= "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254", size = 13840033, upload-time = "2023-06-26T13:25:33.417Z" }, - { url = "https://files.pythonhosted.org/packages/a7/4c/96cdaa34f54c05e97c1c50f39f98d608f96f0677a6589e64e53104e22904/numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7", size = 13991951, upload-time = "2023-06-26T13:25:55.725Z" }, - { url = "https://files.pythonhosted.org/packages/22/97/dfb1a31bb46686f09e68ea6ac5c63fdee0d22d7b23b8f3f7ea07712869ef/numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5", size = 17278923, upload-time = "2023-06-26T13:26:25.658Z" }, - { url = "https://files.pythonhosted.org/packages/35/e2/76a11e54139654a324d107da1d98f99e7aa2a7ef97cfd7c631fba7dbde71/numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d", size = 12422446, upload-time = "2023-06-26T13:26:49.302Z" }, - { url = "https://files.pythonhosted.org/packages/d8/ec/ebef2f7d7c28503f958f0f8b992e7ce606fb74f9e891199329d5f5f87404/numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694", size = 14834466, upload-time = "2023-06-26T13:27:16.029Z" }, - { url = "https://files.pythonhosted.org/packages/11/10/943cfb579f1a02909ff96464c69893b1d25be3731b5d3652c2e0cf1281ea/numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61", size = 19780722, upload-time = "2023-06-26T13:27:49.573Z" }, - { url = "https://files.pythonhosted.org/packages/a7/ae/f53b7b265fdc701e663fbb322a8e9d4b14d9cb7b2385f45ddfabfc4327e4/numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f", size = 13843102, upload-time = "2023-06-26T13:28:12.288Z" }, - { url = "https://files.pythonhosted.org/packages/25/6f/2586a50ad72e8dbb1d8381f837008a0321a3516dfd7cb57fc8cf7e4bb06b/numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e", size = 14039616, upload-time = "2023-06-26T13:28:35.659Z" }, - { url = "https://files.pythonhosted.org/packages/98/5d/5738903efe0ecb73e51eb44feafba32bdba2081263d40c5043568ff60faf/numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc", size = 17316263, upload-time = "2023-06-26T13:29:09.272Z" }, - { url = "https://files.pythonhosted.org/packages/d1/57/8d328f0b91c733aa9aa7ee540dbc49b58796c862b4fbcb1146c701e888da/numpy-1.24.4-cp38-cp38-win32.whl", hash = "sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2", size = 12455660, upload-time = "2023-06-26T13:29:33.434Z" }, - { url = "https://files.pythonhosted.org/packages/69/65/0d47953afa0ad569d12de5f65d964321c208492064c38fe3b0b9744f8d44/numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706", size = 14868112, upload-time = "2023-06-26T13:29:58.385Z" }, - { url = "https://files.pythonhosted.org/packages/9a/cd/d5b0402b801c8a8b56b04c1e85c6165efab298d2f0ab741c2406516ede3a/numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400", size = 
19816549, upload-time = "2023-06-26T13:30:36.976Z" }, - { url = "https://files.pythonhosted.org/packages/14/27/638aaa446f39113a3ed38b37a66243e21b38110d021bfcb940c383e120f2/numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f", size = 13879950, upload-time = "2023-06-26T13:31:01.787Z" }, - { url = "https://files.pythonhosted.org/packages/8f/27/91894916e50627476cff1a4e4363ab6179d01077d71b9afed41d9e1f18bf/numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9", size = 14030228, upload-time = "2023-06-26T13:31:26.696Z" }, - { url = "https://files.pythonhosted.org/packages/7a/7c/d7b2a0417af6428440c0ad7cb9799073e507b1a465f827d058b826236964/numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d", size = 17311170, upload-time = "2023-06-26T13:31:56.615Z" }, - { url = "https://files.pythonhosted.org/packages/18/9d/e02ace5d7dfccee796c37b995c63322674daf88ae2f4a4724c5dd0afcc91/numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835", size = 12454918, upload-time = "2023-06-26T13:32:16.8Z" }, - { url = "https://files.pythonhosted.org/packages/63/38/6cc19d6b8bfa1d1a459daf2b3fe325453153ca7019976274b6f33d8b5663/numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8", size = 14867441, upload-time = "2023-06-26T13:32:40.521Z" }, - { url = "https://files.pythonhosted.org/packages/a4/fd/8dff40e25e937c94257455c237b9b6bf5a30d42dd1cc11555533be099492/numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef", size = 19156590, upload-time = "2023-06-26T13:33:10.36Z" }, - { url = "https://files.pythonhosted.org/packages/42/e7/4bf953c6e05df90c6d351af69966384fed8e988d0e8c54dad7103b59f3ba/numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a", size = 16705744, upload-time = "2023-06-26T13:33:36.703Z" }, - { url = "https://files.pythonhosted.org/packages/fc/dd/9106005eb477d022b60b3817ed5937a43dad8fd1f20b0610ea8a32fcb407/numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2", size = 14734290, upload-time = "2023-06-26T13:34:05.409Z" }, -] - -[[package]] -name = "numpy" -version = "2.0.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version == '3.9.*'", -] -sdist = { url = "https://files.pythonhosted.org/packages/a9/75/10dd1f8116a8b796cb2c737b674e02d02e80454bda953fa7e65d8c12b016/numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78", size = 18902015, upload-time = "2024-08-26T20:19:40.945Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/21/91/3495b3237510f79f5d81f2508f9f13fea78ebfdf07538fc7444badda173d/numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece", size = 21165245, upload-time = "2024-08-26T20:04:14.625Z" }, - { url = "https://files.pythonhosted.org/packages/05/33/26178c7d437a87082d11019292dce6d3fe6f0e9026b7b2309cbf3e489b1d/numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04", size = 13738540, upload-time = "2024-08-26T20:04:36.784Z" }, - { url = "https://files.pythonhosted.org/packages/ec/31/cc46e13bf07644efc7a4bf68df2df5fb2a1a88d0cd0da9ddc84dc0033e51/numpy-2.0.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8c5713284ce4e282544c68d1c3b2c7161d38c256d2eefc93c1d683cf47683e66", size = 5300623, upload-time = "2024-08-26T20:04:46.491Z" }, - { url = "https://files.pythonhosted.org/packages/6e/16/7bfcebf27bb4f9d7ec67332ffebee4d1bf085c84246552d52dbb548600e7/numpy-2.0.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:becfae3ddd30736fe1889a37f1f580e245ba79a5855bff5f2a29cb3ccc22dd7b", size = 6901774, upload-time = "2024-08-26T20:04:58.173Z" }, - { url = "https://files.pythonhosted.org/packages/f9/a3/561c531c0e8bf082c5bef509d00d56f82e0ea7e1e3e3a7fc8fa78742a6e5/numpy-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2da5960c3cf0df7eafefd806d4e612c5e19358de82cb3c343631188991566ccd", size = 13907081, upload-time = "2024-08-26T20:05:19.098Z" }, - { url = "https://files.pythonhosted.org/packages/fa/66/f7177ab331876200ac7563a580140643d1179c8b4b6a6b0fc9838de2a9b8/numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:496f71341824ed9f3d2fd36cf3ac57ae2e0165c143b55c3a035ee219413f3318", size = 19523451, upload-time = "2024-08-26T20:05:47.479Z" }, - { url = "https://files.pythonhosted.org/packages/25/7f/0b209498009ad6453e4efc2c65bcdf0ae08a182b2b7877d7ab38a92dc542/numpy-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a61ec659f68ae254e4d237816e33171497e978140353c0c2038d46e63282d0c8", size = 19927572, upload-time = "2024-08-26T20:06:17.137Z" }, - { url = "https://files.pythonhosted.org/packages/3e/df/2619393b1e1b565cd2d4c4403bdd979621e2c4dea1f8532754b2598ed63b/numpy-2.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d731a1c6116ba289c1e9ee714b08a8ff882944d4ad631fd411106a30f083c326", size = 14400722, upload-time = "2024-08-26T20:06:39.16Z" }, - { url = "https://files.pythonhosted.org/packages/22/ad/77e921b9f256d5da36424ffb711ae79ca3f451ff8489eeca544d0701d74a/numpy-2.0.2-cp310-cp310-win32.whl", hash = "sha256:984d96121c9f9616cd33fbd0618b7f08e0cfc9600a7ee1d6fd9b239186d19d97", size = 6472170, upload-time = "2024-08-26T20:06:50.361Z" }, - { url = "https://files.pythonhosted.org/packages/10/05/3442317535028bc29cf0c0dd4c191a4481e8376e9f0db6bcf29703cadae6/numpy-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:c7b0be4ef08607dd04da4092faee0b86607f111d5ae68036f16cc787e250a131", size = 15905558, upload-time = "2024-08-26T20:07:13.881Z" }, - { url = "https://files.pythonhosted.org/packages/8b/cf/034500fb83041aa0286e0fb16e7c76e5c8b67c0711bb6e9e9737a717d5fe/numpy-2.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:49ca4decb342d66018b01932139c0961a8f9ddc7589611158cb3c27cbcf76448", size = 21169137, upload-time = "2024-08-26T20:07:45.345Z" }, - { url = "https://files.pythonhosted.org/packages/4a/d9/32de45561811a4b87fbdee23b5797394e3d1504b4a7cf40c10199848893e/numpy-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:11a76c372d1d37437857280aa142086476136a8c0f373b2e648ab2c8f18fb195", size = 13703552, upload-time = "2024-08-26T20:08:06.666Z" }, - { url = "https://files.pythonhosted.org/packages/c1/ca/2f384720020c7b244d22508cb7ab23d95f179fcfff33c31a6eeba8d6c512/numpy-2.0.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:807ec44583fd708a21d4a11d94aedf2f4f3c3719035c76a2bbe1fe8e217bdc57", size = 5298957, upload-time = 
"2024-08-26T20:08:15.83Z" }, - { url = "https://files.pythonhosted.org/packages/0e/78/a3e4f9fb6aa4e6fdca0c5428e8ba039408514388cf62d89651aade838269/numpy-2.0.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8cafab480740e22f8d833acefed5cc87ce276f4ece12fdaa2e8903db2f82897a", size = 6905573, upload-time = "2024-08-26T20:08:27.185Z" }, - { url = "https://files.pythonhosted.org/packages/a0/72/cfc3a1beb2caf4efc9d0b38a15fe34025230da27e1c08cc2eb9bfb1c7231/numpy-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15f476a45e6e5a3a79d8a14e62161d27ad897381fecfa4a09ed5322f2085669", size = 13914330, upload-time = "2024-08-26T20:08:48.058Z" }, - { url = "https://files.pythonhosted.org/packages/ba/a8/c17acf65a931ce551fee11b72e8de63bf7e8a6f0e21add4c937c83563538/numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13e689d772146140a252c3a28501da66dfecd77490b498b168b501835041f951", size = 19534895, upload-time = "2024-08-26T20:09:16.536Z" }, - { url = "https://files.pythonhosted.org/packages/ba/86/8767f3d54f6ae0165749f84648da9dcc8cd78ab65d415494962c86fac80f/numpy-2.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9ea91dfb7c3d1c56a0e55657c0afb38cf1eeae4544c208dc465c3c9f3a7c09f9", size = 19937253, upload-time = "2024-08-26T20:09:46.263Z" }, - { url = "https://files.pythonhosted.org/packages/df/87/f76450e6e1c14e5bb1eae6836478b1028e096fd02e85c1c37674606ab752/numpy-2.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c1c9307701fec8f3f7a1e6711f9089c06e6284b3afbbcd259f7791282d660a15", size = 14414074, upload-time = "2024-08-26T20:10:08.483Z" }, - { url = "https://files.pythonhosted.org/packages/5c/ca/0f0f328e1e59f73754f06e1adfb909de43726d4f24c6a3f8805f34f2b0fa/numpy-2.0.2-cp311-cp311-win32.whl", hash = "sha256:a392a68bd329eafac5817e5aefeb39038c48b671afd242710b451e76090e81f4", size = 6470640, upload-time = "2024-08-26T20:10:19.732Z" }, - { url = "https://files.pythonhosted.org/packages/eb/57/3a3f14d3a759dcf9bf6e9eda905794726b758819df4663f217d658a58695/numpy-2.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:286cd40ce2b7d652a6f22efdfc6d1edf879440e53e76a75955bc0c826c7e64dc", size = 15910230, upload-time = "2024-08-26T20:10:43.413Z" }, - { url = "https://files.pythonhosted.org/packages/45/40/2e117be60ec50d98fa08c2f8c48e09b3edea93cfcabd5a9ff6925d54b1c2/numpy-2.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:df55d490dea7934f330006d0f81e8551ba6010a5bf035a249ef61a94f21c500b", size = 20895803, upload-time = "2024-08-26T20:11:13.916Z" }, - { url = "https://files.pythonhosted.org/packages/46/92/1b8b8dee833f53cef3e0a3f69b2374467789e0bb7399689582314df02651/numpy-2.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8df823f570d9adf0978347d1f926b2a867d5608f434a7cff7f7908c6570dcf5e", size = 13471835, upload-time = "2024-08-26T20:11:34.779Z" }, - { url = "https://files.pythonhosted.org/packages/7f/19/e2793bde475f1edaea6945be141aef6c8b4c669b90c90a300a8954d08f0a/numpy-2.0.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9a92ae5c14811e390f3767053ff54eaee3bf84576d99a2456391401323f4ec2c", size = 5038499, upload-time = "2024-08-26T20:11:43.902Z" }, - { url = "https://files.pythonhosted.org/packages/e3/ff/ddf6dac2ff0dd50a7327bcdba45cb0264d0e96bb44d33324853f781a8f3c/numpy-2.0.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:a842d573724391493a97a62ebbb8e731f8a5dcc5d285dfc99141ca15a3302d0c", size = 6633497, upload-time = "2024-08-26T20:11:55.09Z" }, - { url = 
"https://files.pythonhosted.org/packages/72/21/67f36eac8e2d2cd652a2e69595a54128297cdcb1ff3931cfc87838874bd4/numpy-2.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05e238064fc0610c840d1cf6a13bf63d7e391717d247f1bf0318172e759e692", size = 13621158, upload-time = "2024-08-26T20:12:14.95Z" }, - { url = "https://files.pythonhosted.org/packages/39/68/e9f1126d757653496dbc096cb429014347a36b228f5a991dae2c6b6cfd40/numpy-2.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0123ffdaa88fa4ab64835dcbde75dcdf89c453c922f18dced6e27c90d1d0ec5a", size = 19236173, upload-time = "2024-08-26T20:12:44.049Z" }, - { url = "https://files.pythonhosted.org/packages/d1/e9/1f5333281e4ebf483ba1c888b1d61ba7e78d7e910fdd8e6499667041cc35/numpy-2.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:96a55f64139912d61de9137f11bf39a55ec8faec288c75a54f93dfd39f7eb40c", size = 19634174, upload-time = "2024-08-26T20:13:13.634Z" }, - { url = "https://files.pythonhosted.org/packages/71/af/a469674070c8d8408384e3012e064299f7a2de540738a8e414dcfd639996/numpy-2.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec9852fb39354b5a45a80bdab5ac02dd02b15f44b3804e9f00c556bf24b4bded", size = 14099701, upload-time = "2024-08-26T20:13:34.851Z" }, - { url = "https://files.pythonhosted.org/packages/d0/3d/08ea9f239d0e0e939b6ca52ad403c84a2bce1bde301a8eb4888c1c1543f1/numpy-2.0.2-cp312-cp312-win32.whl", hash = "sha256:671bec6496f83202ed2d3c8fdc486a8fc86942f2e69ff0e986140339a63bcbe5", size = 6174313, upload-time = "2024-08-26T20:13:45.653Z" }, - { url = "https://files.pythonhosted.org/packages/b2/b5/4ac39baebf1fdb2e72585c8352c56d063b6126be9fc95bd2bb5ef5770c20/numpy-2.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:cfd41e13fdc257aa5778496b8caa5e856dc4896d4ccf01841daee1d96465467a", size = 15606179, upload-time = "2024-08-26T20:14:08.786Z" }, - { url = "https://files.pythonhosted.org/packages/43/c1/41c8f6df3162b0c6ffd4437d729115704bd43363de0090c7f913cfbc2d89/numpy-2.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9059e10581ce4093f735ed23f3b9d283b9d517ff46009ddd485f1747eb22653c", size = 21169942, upload-time = "2024-08-26T20:14:40.108Z" }, - { url = "https://files.pythonhosted.org/packages/39/bc/fd298f308dcd232b56a4031fd6ddf11c43f9917fbc937e53762f7b5a3bb1/numpy-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:423e89b23490805d2a5a96fe40ec507407b8ee786d66f7328be214f9679df6dd", size = 13711512, upload-time = "2024-08-26T20:15:00.985Z" }, - { url = "https://files.pythonhosted.org/packages/96/ff/06d1aa3eeb1c614eda245c1ba4fb88c483bee6520d361641331872ac4b82/numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2b2955fa6f11907cf7a70dab0d0755159bca87755e831e47932367fc8f2f2d0b", size = 5306976, upload-time = "2024-08-26T20:15:10.876Z" }, - { url = "https://files.pythonhosted.org/packages/2d/98/121996dcfb10a6087a05e54453e28e58694a7db62c5a5a29cee14c6e047b/numpy-2.0.2-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:97032a27bd9d8988b9a97a8c4d2c9f2c15a81f61e2f21404d7e8ef00cb5be729", size = 6906494, upload-time = "2024-08-26T20:15:22.055Z" }, - { url = "https://files.pythonhosted.org/packages/15/31/9dffc70da6b9bbf7968f6551967fc21156207366272c2a40b4ed6008dc9b/numpy-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e795a8be3ddbac43274f18588329c72939870a16cae810c2b73461c40718ab1", size = 13912596, upload-time = "2024-08-26T20:15:42.452Z" }, - { url = 
"https://files.pythonhosted.org/packages/b9/14/78635daab4b07c0930c919d451b8bf8c164774e6a3413aed04a6d95758ce/numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b258c385842546006213344c50655ff1555a9338e2e5e02a0756dc3e803dd", size = 19526099, upload-time = "2024-08-26T20:16:11.048Z" }, - { url = "https://files.pythonhosted.org/packages/26/4c/0eeca4614003077f68bfe7aac8b7496f04221865b3a5e7cb230c9d055afd/numpy-2.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5fec9451a7789926bcf7c2b8d187292c9f93ea30284802a0ab3f5be8ab36865d", size = 19932823, upload-time = "2024-08-26T20:16:40.171Z" }, - { url = "https://files.pythonhosted.org/packages/f1/46/ea25b98b13dccaebddf1a803f8c748680d972e00507cd9bc6dcdb5aa2ac1/numpy-2.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9189427407d88ff25ecf8f12469d4d39d35bee1db5d39fc5c168c6f088a6956d", size = 14404424, upload-time = "2024-08-26T20:17:02.604Z" }, - { url = "https://files.pythonhosted.org/packages/c8/a6/177dd88d95ecf07e722d21008b1b40e681a929eb9e329684d449c36586b2/numpy-2.0.2-cp39-cp39-win32.whl", hash = "sha256:905d16e0c60200656500c95b6b8dca5d109e23cb24abc701d41c02d74c6b3afa", size = 6476809, upload-time = "2024-08-26T20:17:13.553Z" }, - { url = "https://files.pythonhosted.org/packages/ea/2b/7fc9f4e7ae5b507c1a3a21f0f15ed03e794c1242ea8a242ac158beb56034/numpy-2.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:a3f4ab0caa7f053f6797fcd4e1e25caee367db3112ef2b6ef82d749530768c73", size = 15911314, upload-time = "2024-08-26T20:17:36.72Z" }, - { url = "https://files.pythonhosted.org/packages/8f/3b/df5a870ac6a3be3a86856ce195ef42eec7ae50d2a202be1f5a4b3b340e14/numpy-2.0.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7f0a0c6f12e07fa94133c8a67404322845220c06a9e80e85999afe727f7438b8", size = 21025288, upload-time = "2024-08-26T20:18:07.732Z" }, - { url = "https://files.pythonhosted.org/packages/2c/97/51af92f18d6f6f2d9ad8b482a99fb74e142d71372da5d834b3a2747a446e/numpy-2.0.2-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:312950fdd060354350ed123c0e25a71327d3711584beaef30cdaa93320c392d4", size = 6762793, upload-time = "2024-08-26T20:18:19.125Z" }, - { url = "https://files.pythonhosted.org/packages/12/46/de1fbd0c1b5ccaa7f9a005b66761533e2f6a3e560096682683a223631fe9/numpy-2.0.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26df23238872200f63518dd2aa984cfca675d82469535dc7162dc2ee52d9dd5c", size = 19334885, upload-time = "2024-08-26T20:18:47.237Z" }, - { url = "https://files.pythonhosted.org/packages/cc/dc/d330a6faefd92b446ec0f0dfea4c3207bb1fef3c4771d19cf4543efd2c78/numpy-2.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a46288ec55ebbd58947d31d72be2c63cbf839f0a63b49cb755022310792a3385", size = 15828784, upload-time = "2024-08-26T20:19:11.19Z" }, -] - -[[package]] -name = "numpy" -version = "2.2.6" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version == '3.10.*'", -] -sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9a/3e/ed6db5be21ce87955c0cbd3009f2803f59fa08df21b5df06862e2d8e2bdd/numpy-2.2.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b412caa66f72040e6d268491a59f2c43bf03eb6c96dd8f0307829feb7fa2b6fb", size = 21165245, upload-time 
= "2025-05-17T21:27:58.555Z" }, - { url = "https://files.pythonhosted.org/packages/22/c2/4b9221495b2a132cc9d2eb862e21d42a009f5a60e45fc44b00118c174bff/numpy-2.2.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8e41fd67c52b86603a91c1a505ebaef50b3314de0213461c7a6e99c9a3beff90", size = 14360048, upload-time = "2025-05-17T21:28:21.406Z" }, - { url = "https://files.pythonhosted.org/packages/fd/77/dc2fcfc66943c6410e2bf598062f5959372735ffda175b39906d54f02349/numpy-2.2.6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:37e990a01ae6ec7fe7fa1c26c55ecb672dd98b19c3d0e1d1f326fa13cb38d163", size = 5340542, upload-time = "2025-05-17T21:28:30.931Z" }, - { url = "https://files.pythonhosted.org/packages/7a/4f/1cb5fdc353a5f5cc7feb692db9b8ec2c3d6405453f982435efc52561df58/numpy-2.2.6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:5a6429d4be8ca66d889b7cf70f536a397dc45ba6faeb5f8c5427935d9592e9cf", size = 6878301, upload-time = "2025-05-17T21:28:41.613Z" }, - { url = "https://files.pythonhosted.org/packages/eb/17/96a3acd228cec142fcb8723bd3cc39c2a474f7dcf0a5d16731980bcafa95/numpy-2.2.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:efd28d4e9cd7d7a8d39074a4d44c63eda73401580c5c76acda2ce969e0a38e83", size = 14297320, upload-time = "2025-05-17T21:29:02.78Z" }, - { url = "https://files.pythonhosted.org/packages/b4/63/3de6a34ad7ad6646ac7d2f55ebc6ad439dbbf9c4370017c50cf403fb19b5/numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc7b73d02efb0e18c000e9ad8b83480dfcd5dfd11065997ed4c6747470ae8915", size = 16801050, upload-time = "2025-05-17T21:29:27.675Z" }, - { url = "https://files.pythonhosted.org/packages/07/b6/89d837eddef52b3d0cec5c6ba0456c1bf1b9ef6a6672fc2b7873c3ec4e2e/numpy-2.2.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:74d4531beb257d2c3f4b261bfb0fc09e0f9ebb8842d82a7b4209415896adc680", size = 15807034, upload-time = "2025-05-17T21:29:51.102Z" }, - { url = "https://files.pythonhosted.org/packages/01/c8/dc6ae86e3c61cfec1f178e5c9f7858584049b6093f843bca541f94120920/numpy-2.2.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8fc377d995680230e83241d8a96def29f204b5782f371c532579b4f20607a289", size = 18614185, upload-time = "2025-05-17T21:30:18.703Z" }, - { url = "https://files.pythonhosted.org/packages/5b/c5/0064b1b7e7c89137b471ccec1fd2282fceaae0ab3a9550f2568782d80357/numpy-2.2.6-cp310-cp310-win32.whl", hash = "sha256:b093dd74e50a8cba3e873868d9e93a85b78e0daf2e98c6797566ad8044e8363d", size = 6527149, upload-time = "2025-05-17T21:30:29.788Z" }, - { url = "https://files.pythonhosted.org/packages/a3/dd/4b822569d6b96c39d1215dbae0582fd99954dcbcf0c1a13c61783feaca3f/numpy-2.2.6-cp310-cp310-win_amd64.whl", hash = "sha256:f0fd6321b839904e15c46e0d257fdd101dd7f530fe03fd6359c1ea63738703f3", size = 12904620, upload-time = "2025-05-17T21:30:48.994Z" }, - { url = "https://files.pythonhosted.org/packages/da/a8/4f83e2aa666a9fbf56d6118faaaf5f1974d456b1823fda0a176eff722839/numpy-2.2.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f9f1adb22318e121c5c69a09142811a201ef17ab257a1e66ca3025065b7f53ae", size = 21176963, upload-time = "2025-05-17T21:31:19.36Z" }, - { url = "https://files.pythonhosted.org/packages/b3/2b/64e1affc7972decb74c9e29e5649fac940514910960ba25cd9af4488b66c/numpy-2.2.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c820a93b0255bc360f53eca31a0e676fd1101f673dda8da93454a12e23fc5f7a", size = 14406743, upload-time = "2025-05-17T21:31:41.087Z" }, - { url = 
"https://files.pythonhosted.org/packages/4a/9f/0121e375000b5e50ffdd8b25bf78d8e1a5aa4cca3f185d41265198c7b834/numpy-2.2.6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3d70692235e759f260c3d837193090014aebdf026dfd167834bcba43e30c2a42", size = 5352616, upload-time = "2025-05-17T21:31:50.072Z" }, - { url = "https://files.pythonhosted.org/packages/31/0d/b48c405c91693635fbe2dcd7bc84a33a602add5f63286e024d3b6741411c/numpy-2.2.6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:481b49095335f8eed42e39e8041327c05b0f6f4780488f61286ed3c01368d491", size = 6889579, upload-time = "2025-05-17T21:32:01.712Z" }, - { url = "https://files.pythonhosted.org/packages/52/b8/7f0554d49b565d0171eab6e99001846882000883998e7b7d9f0d98b1f934/numpy-2.2.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b64d8d4d17135e00c8e346e0a738deb17e754230d7e0810ac5012750bbd85a5a", size = 14312005, upload-time = "2025-05-17T21:32:23.332Z" }, - { url = "https://files.pythonhosted.org/packages/b3/dd/2238b898e51bd6d389b7389ffb20d7f4c10066d80351187ec8e303a5a475/numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba10f8411898fc418a521833e014a77d3ca01c15b0c6cdcce6a0d2897e6dbbdf", size = 16821570, upload-time = "2025-05-17T21:32:47.991Z" }, - { url = "https://files.pythonhosted.org/packages/83/6c/44d0325722cf644f191042bf47eedad61c1e6df2432ed65cbe28509d404e/numpy-2.2.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bd48227a919f1bafbdda0583705e547892342c26fb127219d60a5c36882609d1", size = 15818548, upload-time = "2025-05-17T21:33:11.728Z" }, - { url = "https://files.pythonhosted.org/packages/ae/9d/81e8216030ce66be25279098789b665d49ff19eef08bfa8cb96d4957f422/numpy-2.2.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9551a499bf125c1d4f9e250377c1ee2eddd02e01eac6644c080162c0c51778ab", size = 18620521, upload-time = "2025-05-17T21:33:39.139Z" }, - { url = "https://files.pythonhosted.org/packages/6a/fd/e19617b9530b031db51b0926eed5345ce8ddc669bb3bc0044b23e275ebe8/numpy-2.2.6-cp311-cp311-win32.whl", hash = "sha256:0678000bb9ac1475cd454c6b8c799206af8107e310843532b04d49649c717a47", size = 6525866, upload-time = "2025-05-17T21:33:50.273Z" }, - { url = "https://files.pythonhosted.org/packages/31/0a/f354fb7176b81747d870f7991dc763e157a934c717b67b58456bc63da3df/numpy-2.2.6-cp311-cp311-win_amd64.whl", hash = "sha256:e8213002e427c69c45a52bbd94163084025f533a55a59d6f9c5b820774ef3303", size = 12907455, upload-time = "2025-05-17T21:34:09.135Z" }, - { url = "https://files.pythonhosted.org/packages/82/5d/c00588b6cf18e1da539b45d3598d3557084990dcc4331960c15ee776ee41/numpy-2.2.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41c5a21f4a04fa86436124d388f6ed60a9343a6f767fced1a8a71c3fbca038ff", size = 20875348, upload-time = "2025-05-17T21:34:39.648Z" }, - { url = "https://files.pythonhosted.org/packages/66/ee/560deadcdde6c2f90200450d5938f63a34b37e27ebff162810f716f6a230/numpy-2.2.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:de749064336d37e340f640b05f24e9e3dd678c57318c7289d222a8a2f543e90c", size = 14119362, upload-time = "2025-05-17T21:35:01.241Z" }, - { url = "https://files.pythonhosted.org/packages/3c/65/4baa99f1c53b30adf0acd9a5519078871ddde8d2339dc5a7fde80d9d87da/numpy-2.2.6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:894b3a42502226a1cac872f840030665f33326fc3dac8e57c607905773cdcde3", size = 5084103, upload-time = "2025-05-17T21:35:10.622Z" }, - { url = 
"https://files.pythonhosted.org/packages/cc/89/e5a34c071a0570cc40c9a54eb472d113eea6d002e9ae12bb3a8407fb912e/numpy-2.2.6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:71594f7c51a18e728451bb50cc60a3ce4e6538822731b2933209a1f3614e9282", size = 6625382, upload-time = "2025-05-17T21:35:21.414Z" }, - { url = "https://files.pythonhosted.org/packages/f8/35/8c80729f1ff76b3921d5c9487c7ac3de9b2a103b1cd05e905b3090513510/numpy-2.2.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2618db89be1b4e05f7a1a847a9c1c0abd63e63a1607d892dd54668dd92faf87", size = 14018462, upload-time = "2025-05-17T21:35:42.174Z" }, - { url = "https://files.pythonhosted.org/packages/8c/3d/1e1db36cfd41f895d266b103df00ca5b3cbe965184df824dec5c08c6b803/numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd83c01228a688733f1ded5201c678f0c53ecc1006ffbc404db9f7a899ac6249", size = 16527618, upload-time = "2025-05-17T21:36:06.711Z" }, - { url = "https://files.pythonhosted.org/packages/61/c6/03ed30992602c85aa3cd95b9070a514f8b3c33e31124694438d88809ae36/numpy-2.2.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:37c0ca431f82cd5fa716eca9506aefcabc247fb27ba69c5062a6d3ade8cf8f49", size = 15505511, upload-time = "2025-05-17T21:36:29.965Z" }, - { url = "https://files.pythonhosted.org/packages/b7/25/5761d832a81df431e260719ec45de696414266613c9ee268394dd5ad8236/numpy-2.2.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fe27749d33bb772c80dcd84ae7e8df2adc920ae8297400dabec45f0dedb3f6de", size = 18313783, upload-time = "2025-05-17T21:36:56.883Z" }, - { url = "https://files.pythonhosted.org/packages/57/0a/72d5a3527c5ebffcd47bde9162c39fae1f90138c961e5296491ce778e682/numpy-2.2.6-cp312-cp312-win32.whl", hash = "sha256:4eeaae00d789f66c7a25ac5f34b71a7035bb474e679f410e5e1a94deb24cf2d4", size = 6246506, upload-time = "2025-05-17T21:37:07.368Z" }, - { url = "https://files.pythonhosted.org/packages/36/fa/8c9210162ca1b88529ab76b41ba02d433fd54fecaf6feb70ef9f124683f1/numpy-2.2.6-cp312-cp312-win_amd64.whl", hash = "sha256:c1f9540be57940698ed329904db803cf7a402f3fc200bfe599334c9bd84a40b2", size = 12614190, upload-time = "2025-05-17T21:37:26.213Z" }, - { url = "https://files.pythonhosted.org/packages/f9/5c/6657823f4f594f72b5471f1db1ab12e26e890bb2e41897522d134d2a3e81/numpy-2.2.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0811bb762109d9708cca4d0b13c4f67146e3c3b7cf8d34018c722adb2d957c84", size = 20867828, upload-time = "2025-05-17T21:37:56.699Z" }, - { url = "https://files.pythonhosted.org/packages/dc/9e/14520dc3dadf3c803473bd07e9b2bd1b69bc583cb2497b47000fed2fa92f/numpy-2.2.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:287cc3162b6f01463ccd86be154f284d0893d2b3ed7292439ea97eafa8170e0b", size = 14143006, upload-time = "2025-05-17T21:38:18.291Z" }, - { url = "https://files.pythonhosted.org/packages/4f/06/7e96c57d90bebdce9918412087fc22ca9851cceaf5567a45c1f404480e9e/numpy-2.2.6-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:f1372f041402e37e5e633e586f62aa53de2eac8d98cbfb822806ce4bbefcb74d", size = 5076765, upload-time = "2025-05-17T21:38:27.319Z" }, - { url = "https://files.pythonhosted.org/packages/73/ed/63d920c23b4289fdac96ddbdd6132e9427790977d5457cd132f18e76eae0/numpy-2.2.6-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:55a4d33fa519660d69614a9fad433be87e5252f4b03850642f88993f7b2ca566", size = 6617736, upload-time = "2025-05-17T21:38:38.141Z" }, - { url = 
"https://files.pythonhosted.org/packages/85/c5/e19c8f99d83fd377ec8c7e0cf627a8049746da54afc24ef0a0cb73d5dfb5/numpy-2.2.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f92729c95468a2f4f15e9bb94c432a9229d0d50de67304399627a943201baa2f", size = 14010719, upload-time = "2025-05-17T21:38:58.433Z" }, - { url = "https://files.pythonhosted.org/packages/19/49/4df9123aafa7b539317bf6d342cb6d227e49f7a35b99c287a6109b13dd93/numpy-2.2.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1bc23a79bfabc5d056d106f9befb8d50c31ced2fbc70eedb8155aec74a45798f", size = 16526072, upload-time = "2025-05-17T21:39:22.638Z" }, - { url = "https://files.pythonhosted.org/packages/b2/6c/04b5f47f4f32f7c2b0e7260442a8cbcf8168b0e1a41ff1495da42f42a14f/numpy-2.2.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e3143e4451880bed956e706a3220b4e5cf6172ef05fcc397f6f36a550b1dd868", size = 15503213, upload-time = "2025-05-17T21:39:45.865Z" }, - { url = "https://files.pythonhosted.org/packages/17/0a/5cd92e352c1307640d5b6fec1b2ffb06cd0dabe7d7b8227f97933d378422/numpy-2.2.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b4f13750ce79751586ae2eb824ba7e1e8dba64784086c98cdbbcc6a42112ce0d", size = 18316632, upload-time = "2025-05-17T21:40:13.331Z" }, - { url = "https://files.pythonhosted.org/packages/f0/3b/5cba2b1d88760ef86596ad0f3d484b1cbff7c115ae2429678465057c5155/numpy-2.2.6-cp313-cp313-win32.whl", hash = "sha256:5beb72339d9d4fa36522fc63802f469b13cdbe4fdab4a288f0c441b74272ebfd", size = 6244532, upload-time = "2025-05-17T21:43:46.099Z" }, - { url = "https://files.pythonhosted.org/packages/cb/3b/d58c12eafcb298d4e6d0d40216866ab15f59e55d148a5658bb3132311fcf/numpy-2.2.6-cp313-cp313-win_amd64.whl", hash = "sha256:b0544343a702fa80c95ad5d3d608ea3599dd54d4632df855e4c8d24eb6ecfa1c", size = 12610885, upload-time = "2025-05-17T21:44:05.145Z" }, - { url = "https://files.pythonhosted.org/packages/6b/9e/4bf918b818e516322db999ac25d00c75788ddfd2d2ade4fa66f1f38097e1/numpy-2.2.6-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0bca768cd85ae743b2affdc762d617eddf3bcf8724435498a1e80132d04879e6", size = 20963467, upload-time = "2025-05-17T21:40:44Z" }, - { url = "https://files.pythonhosted.org/packages/61/66/d2de6b291507517ff2e438e13ff7b1e2cdbdb7cb40b3ed475377aece69f9/numpy-2.2.6-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:fc0c5673685c508a142ca65209b4e79ed6740a4ed6b2267dbba90f34b0b3cfda", size = 14225144, upload-time = "2025-05-17T21:41:05.695Z" }, - { url = "https://files.pythonhosted.org/packages/e4/25/480387655407ead912e28ba3a820bc69af9adf13bcbe40b299d454ec011f/numpy-2.2.6-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:5bd4fc3ac8926b3819797a7c0e2631eb889b4118a9898c84f585a54d475b7e40", size = 5200217, upload-time = "2025-05-17T21:41:15.903Z" }, - { url = "https://files.pythonhosted.org/packages/aa/4a/6e313b5108f53dcbf3aca0c0f3e9c92f4c10ce57a0a721851f9785872895/numpy-2.2.6-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:fee4236c876c4e8369388054d02d0e9bb84821feb1a64dd59e137e6511a551f8", size = 6712014, upload-time = "2025-05-17T21:41:27.321Z" }, - { url = "https://files.pythonhosted.org/packages/b7/30/172c2d5c4be71fdf476e9de553443cf8e25feddbe185e0bd88b096915bcc/numpy-2.2.6-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e1dda9c7e08dc141e0247a5b8f49cf05984955246a327d4c48bda16821947b2f", size = 14077935, upload-time = "2025-05-17T21:41:49.738Z" }, - { url = 
"https://files.pythonhosted.org/packages/12/fb/9e743f8d4e4d3c710902cf87af3512082ae3d43b945d5d16563f26ec251d/numpy-2.2.6-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f447e6acb680fd307f40d3da4852208af94afdfab89cf850986c3ca00562f4fa", size = 16600122, upload-time = "2025-05-17T21:42:14.046Z" }, - { url = "https://files.pythonhosted.org/packages/12/75/ee20da0e58d3a66f204f38916757e01e33a9737d0b22373b3eb5a27358f9/numpy-2.2.6-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:389d771b1623ec92636b0786bc4ae56abafad4a4c513d36a55dce14bd9ce8571", size = 15586143, upload-time = "2025-05-17T21:42:37.464Z" }, - { url = "https://files.pythonhosted.org/packages/76/95/bef5b37f29fc5e739947e9ce5179ad402875633308504a52d188302319c8/numpy-2.2.6-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8e9ace4a37db23421249ed236fdcdd457d671e25146786dfc96835cd951aa7c1", size = 18385260, upload-time = "2025-05-17T21:43:05.189Z" }, - { url = "https://files.pythonhosted.org/packages/09/04/f2f83279d287407cf36a7a8053a5abe7be3622a4363337338f2585e4afda/numpy-2.2.6-cp313-cp313t-win32.whl", hash = "sha256:038613e9fb8c72b0a41f025a7e4c3f0b7a1b5d768ece4796b674c8f3fe13efff", size = 6377225, upload-time = "2025-05-17T21:43:16.254Z" }, - { url = "https://files.pythonhosted.org/packages/67/0e/35082d13c09c02c011cf21570543d202ad929d961c02a147493cb0c2bdf5/numpy-2.2.6-cp313-cp313t-win_amd64.whl", hash = "sha256:6031dd6dfecc0cf9f668681a37648373bddd6421fff6c66ec1624eed0180ee06", size = 12771374, upload-time = "2025-05-17T21:43:35.479Z" }, - { url = "https://files.pythonhosted.org/packages/9e/3b/d94a75f4dbf1ef5d321523ecac21ef23a3cd2ac8b78ae2aac40873590229/numpy-2.2.6-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0b605b275d7bd0c640cad4e5d30fa701a8d59302e127e5f79138ad62762c3e3d", size = 21040391, upload-time = "2025-05-17T21:44:35.948Z" }, - { url = "https://files.pythonhosted.org/packages/17/f4/09b2fa1b58f0fb4f7c7963a1649c64c4d315752240377ed74d9cd878f7b5/numpy-2.2.6-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:7befc596a7dc9da8a337f79802ee8adb30a552a94f792b9c9d18c840055907db", size = 6786754, upload-time = "2025-05-17T21:44:47.446Z" }, - { url = "https://files.pythonhosted.org/packages/af/30/feba75f143bdc868a1cc3f44ccfa6c4b9ec522b36458e738cd00f67b573f/numpy-2.2.6-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce47521a4754c8f4593837384bd3424880629f718d87c5d44f8ed763edd63543", size = 16643476, upload-time = "2025-05-17T21:45:11.871Z" }, - { url = "https://files.pythonhosted.org/packages/37/48/ac2a9584402fb6c0cd5b5d1a91dcf176b15760130dd386bbafdbfe3640bf/numpy-2.2.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d042d24c90c41b54fd506da306759e06e568864df8ec17ccc17e9e884634fd00", size = 12812666, upload-time = "2025-05-17T21:45:31.426Z" }, -] - -[[package]] -name = "numpy" -version = "2.3.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", -] -sdist = { url = "https://files.pythonhosted.org/packages/37/7d/3fec4199c5ffb892bed55cff901e4f39a58c81df9c44c280499e92cad264/numpy-2.3.2.tar.gz", hash = "sha256:e0486a11ec30cdecb53f184d496d1c6a20786c81e55e41640270130056f8ee48", size = 20489306, upload-time = "2025-07-24T21:32:07.553Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/96/26/1320083986108998bd487e2931eed2aeedf914b6e8905431487543ec911d/numpy-2.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:852ae5bed3478b92f093e30f785c98e0cb62fa0a939ed057c31716e18a7a22b9", size = 21259016, upload-time = "2025-07-24T20:24:35.214Z" }, - { url = "https://files.pythonhosted.org/packages/c4/2b/792b341463fa93fc7e55abbdbe87dac316c5b8cb5e94fb7a59fb6fa0cda5/numpy-2.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7a0e27186e781a69959d0230dd9909b5e26024f8da10683bd6344baea1885168", size = 14451158, upload-time = "2025-07-24T20:24:58.397Z" }, - { url = "https://files.pythonhosted.org/packages/b7/13/e792d7209261afb0c9f4759ffef6135b35c77c6349a151f488f531d13595/numpy-2.3.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:f0a1a8476ad77a228e41619af2fa9505cf69df928e9aaa165746584ea17fed2b", size = 5379817, upload-time = "2025-07-24T20:25:07.746Z" }, - { url = "https://files.pythonhosted.org/packages/49/ce/055274fcba4107c022b2113a213c7287346563f48d62e8d2a5176ad93217/numpy-2.3.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:cbc95b3813920145032412f7e33d12080f11dc776262df1712e1638207dde9e8", size = 6913606, upload-time = "2025-07-24T20:25:18.84Z" }, - { url = "https://files.pythonhosted.org/packages/17/f2/e4d72e6bc5ff01e2ab613dc198d560714971900c03674b41947e38606502/numpy-2.3.2-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f75018be4980a7324edc5930fe39aa391d5734531b1926968605416ff58c332d", size = 14589652, upload-time = "2025-07-24T20:25:40.356Z" }, - { url = "https://files.pythonhosted.org/packages/c8/b0/fbeee3000a51ebf7222016e2939b5c5ecf8000a19555d04a18f1e02521b8/numpy-2.3.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:20b8200721840f5621b7bd03f8dcd78de33ec522fc40dc2641aa09537df010c3", size = 16938816, upload-time = "2025-07-24T20:26:05.721Z" }, - { url = "https://files.pythonhosted.org/packages/a9/ec/2f6c45c3484cc159621ea8fc000ac5a86f1575f090cac78ac27193ce82cd/numpy-2.3.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1f91e5c028504660d606340a084db4b216567ded1056ea2b4be4f9d10b67197f", size = 16370512, upload-time = "2025-07-24T20:26:30.545Z" }, - { url = "https://files.pythonhosted.org/packages/b5/01/dd67cf511850bd7aefd6347aaae0956ed415abea741ae107834aae7d6d4e/numpy-2.3.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:fb1752a3bb9a3ad2d6b090b88a9a0ae1cd6f004ef95f75825e2f382c183b2097", size = 18884947, upload-time = "2025-07-24T20:26:58.24Z" }, - { url = "https://files.pythonhosted.org/packages/a7/17/2cf60fd3e6a61d006778735edf67a222787a8c1a7842aed43ef96d777446/numpy-2.3.2-cp311-cp311-win32.whl", hash = "sha256:4ae6863868aaee2f57503c7a5052b3a2807cf7a3914475e637a0ecd366ced220", size = 6599494, upload-time = "2025-07-24T20:27:09.786Z" }, - { url = "https://files.pythonhosted.org/packages/d5/03/0eade211c504bda872a594f045f98ddcc6caef2b7c63610946845e304d3f/numpy-2.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:240259d6564f1c65424bcd10f435145a7644a65a6811cfc3201c4a429ba79170", size = 13087889, upload-time = "2025-07-24T20:27:29.558Z" }, - { url = "https://files.pythonhosted.org/packages/13/32/2c7979d39dafb2a25087e12310fc7f3b9d3c7d960df4f4bc97955ae0ce1d/numpy-2.3.2-cp311-cp311-win_arm64.whl", hash = "sha256:4209f874d45f921bde2cff1ffcd8a3695f545ad2ffbef6d3d3c6768162efab89", size = 10459560, upload-time = "2025-07-24T20:27:46.803Z" }, - { url = "https://files.pythonhosted.org/packages/00/6d/745dd1c1c5c284d17725e5c802ca4d45cfc6803519d777f087b71c9f4069/numpy-2.3.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bc3186bea41fae9d8e90c2b4fb5f0a1f5a690682da79b92574d63f56b529080b", size = 20956420, upload-time = 
"2025-07-24T20:28:18.002Z" }, - { url = "https://files.pythonhosted.org/packages/bc/96/e7b533ea5740641dd62b07a790af5d9d8fec36000b8e2d0472bd7574105f/numpy-2.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2f4f0215edb189048a3c03bd5b19345bdfa7b45a7a6f72ae5945d2a28272727f", size = 14184660, upload-time = "2025-07-24T20:28:39.522Z" }, - { url = "https://files.pythonhosted.org/packages/2b/53/102c6122db45a62aa20d1b18c9986f67e6b97e0d6fbc1ae13e3e4c84430c/numpy-2.3.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8b1224a734cd509f70816455c3cffe13a4f599b1bf7130f913ba0e2c0b2006c0", size = 5113382, upload-time = "2025-07-24T20:28:48.544Z" }, - { url = "https://files.pythonhosted.org/packages/2b/21/376257efcbf63e624250717e82b4fae93d60178f09eb03ed766dbb48ec9c/numpy-2.3.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:3dcf02866b977a38ba3ec10215220609ab9667378a9e2150615673f3ffd6c73b", size = 6647258, upload-time = "2025-07-24T20:28:59.104Z" }, - { url = "https://files.pythonhosted.org/packages/91/ba/f4ebf257f08affa464fe6036e13f2bf9d4642a40228781dc1235da81be9f/numpy-2.3.2-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:572d5512df5470f50ada8d1972c5f1082d9a0b7aa5944db8084077570cf98370", size = 14281409, upload-time = "2025-07-24T20:40:30.298Z" }, - { url = "https://files.pythonhosted.org/packages/59/ef/f96536f1df42c668cbacb727a8c6da7afc9c05ece6d558927fb1722693e1/numpy-2.3.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8145dd6d10df13c559d1e4314df29695613575183fa2e2d11fac4c208c8a1f73", size = 16641317, upload-time = "2025-07-24T20:40:56.625Z" }, - { url = "https://files.pythonhosted.org/packages/f6/a7/af813a7b4f9a42f498dde8a4c6fcbff8100eed00182cc91dbaf095645f38/numpy-2.3.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:103ea7063fa624af04a791c39f97070bf93b96d7af7eb23530cd087dc8dbe9dc", size = 16056262, upload-time = "2025-07-24T20:41:20.797Z" }, - { url = "https://files.pythonhosted.org/packages/8b/5d/41c4ef8404caaa7f05ed1cfb06afe16a25895260eacbd29b4d84dff2920b/numpy-2.3.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fc927d7f289d14f5e037be917539620603294454130b6de200091e23d27dc9be", size = 18579342, upload-time = "2025-07-24T20:41:50.753Z" }, - { url = "https://files.pythonhosted.org/packages/a1/4f/9950e44c5a11636f4a3af6e825ec23003475cc9a466edb7a759ed3ea63bd/numpy-2.3.2-cp312-cp312-win32.whl", hash = "sha256:d95f59afe7f808c103be692175008bab926b59309ade3e6d25009e9a171f7036", size = 6320610, upload-time = "2025-07-24T20:42:01.551Z" }, - { url = "https://files.pythonhosted.org/packages/7c/2f/244643a5ce54a94f0a9a2ab578189c061e4a87c002e037b0829dd77293b6/numpy-2.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:9e196ade2400c0c737d93465327d1ae7c06c7cb8a1756121ebf54b06ca183c7f", size = 12786292, upload-time = "2025-07-24T20:42:20.738Z" }, - { url = "https://files.pythonhosted.org/packages/54/cd/7b5f49d5d78db7badab22d8323c1b6ae458fbf86c4fdfa194ab3cd4eb39b/numpy-2.3.2-cp312-cp312-win_arm64.whl", hash = "sha256:ee807923782faaf60d0d7331f5e86da7d5e3079e28b291973c545476c2b00d07", size = 10194071, upload-time = "2025-07-24T20:42:36.657Z" }, - { url = "https://files.pythonhosted.org/packages/1c/c0/c6bb172c916b00700ed3bf71cb56175fd1f7dbecebf8353545d0b5519f6c/numpy-2.3.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c8d9727f5316a256425892b043736d63e89ed15bbfe6556c5ff4d9d4448ff3b3", size = 20949074, upload-time = "2025-07-24T20:43:07.813Z" }, - { url = 
"https://files.pythonhosted.org/packages/20/4e/c116466d22acaf4573e58421c956c6076dc526e24a6be0903219775d862e/numpy-2.3.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:efc81393f25f14d11c9d161e46e6ee348637c0a1e8a54bf9dedc472a3fae993b", size = 14177311, upload-time = "2025-07-24T20:43:29.335Z" }, - { url = "https://files.pythonhosted.org/packages/78/45/d4698c182895af189c463fc91d70805d455a227261d950e4e0f1310c2550/numpy-2.3.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:dd937f088a2df683cbb79dda9a772b62a3e5a8a7e76690612c2737f38c6ef1b6", size = 5106022, upload-time = "2025-07-24T20:43:37.999Z" }, - { url = "https://files.pythonhosted.org/packages/9f/76/3e6880fef4420179309dba72a8c11f6166c431cf6dee54c577af8906f914/numpy-2.3.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:11e58218c0c46c80509186e460d79fbdc9ca1eb8d8aee39d8f2dc768eb781089", size = 6640135, upload-time = "2025-07-24T20:43:49.28Z" }, - { url = "https://files.pythonhosted.org/packages/34/fa/87ff7f25b3c4ce9085a62554460b7db686fef1e0207e8977795c7b7d7ba1/numpy-2.3.2-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5ad4ebcb683a1f99f4f392cc522ee20a18b2bb12a2c1c42c3d48d5a1adc9d3d2", size = 14278147, upload-time = "2025-07-24T20:44:10.328Z" }, - { url = "https://files.pythonhosted.org/packages/1d/0f/571b2c7a3833ae419fe69ff7b479a78d313581785203cc70a8db90121b9a/numpy-2.3.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:938065908d1d869c7d75d8ec45f735a034771c6ea07088867f713d1cd3bbbe4f", size = 16635989, upload-time = "2025-07-24T20:44:34.88Z" }, - { url = "https://files.pythonhosted.org/packages/24/5a/84ae8dca9c9a4c592fe11340b36a86ffa9fd3e40513198daf8a97839345c/numpy-2.3.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:66459dccc65d8ec98cc7df61307b64bf9e08101f9598755d42d8ae65d9a7a6ee", size = 16053052, upload-time = "2025-07-24T20:44:58.872Z" }, - { url = "https://files.pythonhosted.org/packages/57/7c/e5725d99a9133b9813fcf148d3f858df98511686e853169dbaf63aec6097/numpy-2.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a7af9ed2aa9ec5950daf05bb11abc4076a108bd3c7db9aa7251d5f107079b6a6", size = 18577955, upload-time = "2025-07-24T20:45:26.714Z" }, - { url = "https://files.pythonhosted.org/packages/ae/11/7c546fcf42145f29b71e4d6f429e96d8d68e5a7ba1830b2e68d7418f0bbd/numpy-2.3.2-cp313-cp313-win32.whl", hash = "sha256:906a30249315f9c8e17b085cc5f87d3f369b35fedd0051d4a84686967bdbbd0b", size = 6311843, upload-time = "2025-07-24T20:49:24.444Z" }, - { url = "https://files.pythonhosted.org/packages/aa/6f/a428fd1cb7ed39b4280d057720fed5121b0d7754fd2a9768640160f5517b/numpy-2.3.2-cp313-cp313-win_amd64.whl", hash = "sha256:c63d95dc9d67b676e9108fe0d2182987ccb0f11933c1e8959f42fa0da8d4fa56", size = 12782876, upload-time = "2025-07-24T20:49:43.227Z" }, - { url = "https://files.pythonhosted.org/packages/65/85/4ea455c9040a12595fb6c43f2c217257c7b52dd0ba332c6a6c1d28b289fe/numpy-2.3.2-cp313-cp313-win_arm64.whl", hash = "sha256:b05a89f2fb84d21235f93de47129dd4f11c16f64c87c33f5e284e6a3a54e43f2", size = 10192786, upload-time = "2025-07-24T20:49:59.443Z" }, - { url = "https://files.pythonhosted.org/packages/80/23/8278f40282d10c3f258ec3ff1b103d4994bcad78b0cba9208317f6bb73da/numpy-2.3.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4e6ecfeddfa83b02318f4d84acf15fbdbf9ded18e46989a15a8b6995dfbf85ab", size = 21047395, upload-time = "2025-07-24T20:45:58.821Z" }, - { url = 
"https://files.pythonhosted.org/packages/1f/2d/624f2ce4a5df52628b4ccd16a4f9437b37c35f4f8a50d00e962aae6efd7a/numpy-2.3.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:508b0eada3eded10a3b55725b40806a4b855961040180028f52580c4729916a2", size = 14300374, upload-time = "2025-07-24T20:46:20.207Z" }, - { url = "https://files.pythonhosted.org/packages/f6/62/ff1e512cdbb829b80a6bd08318a58698867bca0ca2499d101b4af063ee97/numpy-2.3.2-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:754d6755d9a7588bdc6ac47dc4ee97867271b17cee39cb87aef079574366db0a", size = 5228864, upload-time = "2025-07-24T20:46:30.58Z" }, - { url = "https://files.pythonhosted.org/packages/7d/8e/74bc18078fff03192d4032cfa99d5a5ca937807136d6f5790ce07ca53515/numpy-2.3.2-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:a9f66e7d2b2d7712410d3bc5684149040ef5f19856f20277cd17ea83e5006286", size = 6737533, upload-time = "2025-07-24T20:46:46.111Z" }, - { url = "https://files.pythonhosted.org/packages/19/ea/0731efe2c9073ccca5698ef6a8c3667c4cf4eea53fcdcd0b50140aba03bc/numpy-2.3.2-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:de6ea4e5a65d5a90c7d286ddff2b87f3f4ad61faa3db8dabe936b34c2275b6f8", size = 14352007, upload-time = "2025-07-24T20:47:07.1Z" }, - { url = "https://files.pythonhosted.org/packages/cf/90/36be0865f16dfed20f4bc7f75235b963d5939707d4b591f086777412ff7b/numpy-2.3.2-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a3ef07ec8cbc8fc9e369c8dcd52019510c12da4de81367d8b20bc692aa07573a", size = 16701914, upload-time = "2025-07-24T20:47:32.459Z" }, - { url = "https://files.pythonhosted.org/packages/94/30/06cd055e24cb6c38e5989a9e747042b4e723535758e6153f11afea88c01b/numpy-2.3.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:27c9f90e7481275c7800dc9c24b7cc40ace3fdb970ae4d21eaff983a32f70c91", size = 16132708, upload-time = "2025-07-24T20:47:58.129Z" }, - { url = "https://files.pythonhosted.org/packages/9a/14/ecede608ea73e58267fd7cb78f42341b3b37ba576e778a1a06baffbe585c/numpy-2.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:07b62978075b67eee4065b166d000d457c82a1efe726cce608b9db9dd66a73a5", size = 18651678, upload-time = "2025-07-24T20:48:25.402Z" }, - { url = "https://files.pythonhosted.org/packages/40/f3/2fe6066b8d07c3685509bc24d56386534c008b462a488b7f503ba82b8923/numpy-2.3.2-cp313-cp313t-win32.whl", hash = "sha256:c771cfac34a4f2c0de8e8c97312d07d64fd8f8ed45bc9f5726a7e947270152b5", size = 6441832, upload-time = "2025-07-24T20:48:37.181Z" }, - { url = "https://files.pythonhosted.org/packages/0b/ba/0937d66d05204d8f28630c9c60bc3eda68824abde4cf756c4d6aad03b0c6/numpy-2.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:72dbebb2dcc8305c431b2836bcc66af967df91be793d63a24e3d9b741374c450", size = 12927049, upload-time = "2025-07-24T20:48:56.24Z" }, - { url = "https://files.pythonhosted.org/packages/e9/ed/13542dd59c104d5e654dfa2ac282c199ba64846a74c2c4bcdbc3a0f75df1/numpy-2.3.2-cp313-cp313t-win_arm64.whl", hash = "sha256:72c6df2267e926a6d5286b0a6d556ebe49eae261062059317837fda12ddf0c1a", size = 10262935, upload-time = "2025-07-24T20:49:13.136Z" }, - { url = "https://files.pythonhosted.org/packages/c9/7c/7659048aaf498f7611b783e000c7268fcc4dcf0ce21cd10aad7b2e8f9591/numpy-2.3.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:448a66d052d0cf14ce9865d159bfc403282c9bc7bb2a31b03cc18b651eca8b1a", size = 20950906, upload-time = "2025-07-24T20:50:30.346Z" }, - { url = 
"https://files.pythonhosted.org/packages/80/db/984bea9d4ddf7112a04cfdfb22b1050af5757864cfffe8e09e44b7f11a10/numpy-2.3.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:546aaf78e81b4081b2eba1d105c3b34064783027a06b3ab20b6eba21fb64132b", size = 14185607, upload-time = "2025-07-24T20:50:51.923Z" }, - { url = "https://files.pythonhosted.org/packages/e4/76/b3d6f414f4eca568f469ac112a3b510938d892bc5a6c190cb883af080b77/numpy-2.3.2-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:87c930d52f45df092f7578889711a0768094debf73cfcde105e2d66954358125", size = 5114110, upload-time = "2025-07-24T20:51:01.041Z" }, - { url = "https://files.pythonhosted.org/packages/9e/d2/6f5e6826abd6bca52392ed88fe44a4b52aacb60567ac3bc86c67834c3a56/numpy-2.3.2-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:8dc082ea901a62edb8f59713c6a7e28a85daddcb67454c839de57656478f5b19", size = 6642050, upload-time = "2025-07-24T20:51:11.64Z" }, - { url = "https://files.pythonhosted.org/packages/c4/43/f12b2ade99199e39c73ad182f103f9d9791f48d885c600c8e05927865baf/numpy-2.3.2-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:af58de8745f7fa9ca1c0c7c943616c6fe28e75d0c81f5c295810e3c83b5be92f", size = 14296292, upload-time = "2025-07-24T20:51:33.488Z" }, - { url = "https://files.pythonhosted.org/packages/5d/f9/77c07d94bf110a916b17210fac38680ed8734c236bfed9982fd8524a7b47/numpy-2.3.2-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed5527c4cf10f16c6d0b6bee1f89958bccb0ad2522c8cadc2efd318bcd545f5", size = 16638913, upload-time = "2025-07-24T20:51:58.517Z" }, - { url = "https://files.pythonhosted.org/packages/9b/d1/9d9f2c8ea399cc05cfff8a7437453bd4e7d894373a93cdc46361bbb49a7d/numpy-2.3.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:095737ed986e00393ec18ec0b21b47c22889ae4b0cd2d5e88342e08b01141f58", size = 16071180, upload-time = "2025-07-24T20:52:22.827Z" }, - { url = "https://files.pythonhosted.org/packages/4c/41/82e2c68aff2a0c9bf315e47d61951099fed65d8cb2c8d9dc388cb87e947e/numpy-2.3.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b5e40e80299607f597e1a8a247ff8d71d79c5b52baa11cc1cce30aa92d2da6e0", size = 18576809, upload-time = "2025-07-24T20:52:51.015Z" }, - { url = "https://files.pythonhosted.org/packages/14/14/4b4fd3efb0837ed252d0f583c5c35a75121038a8c4e065f2c259be06d2d8/numpy-2.3.2-cp314-cp314-win32.whl", hash = "sha256:7d6e390423cc1f76e1b8108c9b6889d20a7a1f59d9a60cac4a050fa734d6c1e2", size = 6366410, upload-time = "2025-07-24T20:56:44.949Z" }, - { url = "https://files.pythonhosted.org/packages/11/9e/b4c24a6b8467b61aced5c8dc7dcfce23621baa2e17f661edb2444a418040/numpy-2.3.2-cp314-cp314-win_amd64.whl", hash = "sha256:b9d0878b21e3918d76d2209c924ebb272340da1fb51abc00f986c258cd5e957b", size = 12918821, upload-time = "2025-07-24T20:57:06.479Z" }, - { url = "https://files.pythonhosted.org/packages/0e/0f/0dc44007c70b1007c1cef86b06986a3812dd7106d8f946c09cfa75782556/numpy-2.3.2-cp314-cp314-win_arm64.whl", hash = "sha256:2738534837c6a1d0c39340a190177d7d66fdf432894f469728da901f8f6dc910", size = 10477303, upload-time = "2025-07-24T20:57:22.879Z" }, - { url = "https://files.pythonhosted.org/packages/8b/3e/075752b79140b78ddfc9c0a1634d234cfdbc6f9bbbfa6b7504e445ad7d19/numpy-2.3.2-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:4d002ecf7c9b53240be3bb69d80f86ddbd34078bae04d87be81c1f58466f264e", size = 21047524, upload-time = "2025-07-24T20:53:22.086Z" }, - { url = 
"https://files.pythonhosted.org/packages/fe/6d/60e8247564a72426570d0e0ea1151b95ce5bd2f1597bb878a18d32aec855/numpy-2.3.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:293b2192c6bcce487dbc6326de5853787f870aeb6c43f8f9c6496db5b1781e45", size = 14300519, upload-time = "2025-07-24T20:53:44.053Z" }, - { url = "https://files.pythonhosted.org/packages/4d/73/d8326c442cd428d47a067070c3ac6cc3b651a6e53613a1668342a12d4479/numpy-2.3.2-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:0a4f2021a6da53a0d580d6ef5db29947025ae8b35b3250141805ea9a32bbe86b", size = 5228972, upload-time = "2025-07-24T20:53:53.81Z" }, - { url = "https://files.pythonhosted.org/packages/34/2e/e71b2d6dad075271e7079db776196829019b90ce3ece5c69639e4f6fdc44/numpy-2.3.2-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:9c144440db4bf3bb6372d2c3e49834cc0ff7bb4c24975ab33e01199e645416f2", size = 6737439, upload-time = "2025-07-24T20:54:04.742Z" }, - { url = "https://files.pythonhosted.org/packages/15/b0/d004bcd56c2c5e0500ffc65385eb6d569ffd3363cb5e593ae742749b2daa/numpy-2.3.2-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f92d6c2a8535dc4fe4419562294ff957f83a16ebdec66df0805e473ffaad8bd0", size = 14352479, upload-time = "2025-07-24T20:54:25.819Z" }, - { url = "https://files.pythonhosted.org/packages/11/e3/285142fcff8721e0c99b51686426165059874c150ea9ab898e12a492e291/numpy-2.3.2-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cefc2219baa48e468e3db7e706305fcd0c095534a192a08f31e98d83a7d45fb0", size = 16702805, upload-time = "2025-07-24T20:54:50.814Z" }, - { url = "https://files.pythonhosted.org/packages/33/c3/33b56b0e47e604af2c7cd065edca892d180f5899599b76830652875249a3/numpy-2.3.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:76c3e9501ceb50b2ff3824c3589d5d1ab4ac857b0ee3f8f49629d0de55ecf7c2", size = 16133830, upload-time = "2025-07-24T20:55:17.306Z" }, - { url = "https://files.pythonhosted.org/packages/6e/ae/7b1476a1f4d6a48bc669b8deb09939c56dd2a439db1ab03017844374fb67/numpy-2.3.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:122bf5ed9a0221b3419672493878ba4967121514b1d7d4656a7580cd11dddcbf", size = 18652665, upload-time = "2025-07-24T20:55:46.665Z" }, - { url = "https://files.pythonhosted.org/packages/14/ba/5b5c9978c4bb161034148ade2de9db44ec316fab89ce8c400db0e0c81f86/numpy-2.3.2-cp314-cp314t-win32.whl", hash = "sha256:6f1ae3dcb840edccc45af496f312528c15b1f79ac318169d094e85e4bb35fdf1", size = 6514777, upload-time = "2025-07-24T20:55:57.66Z" }, - { url = "https://files.pythonhosted.org/packages/eb/46/3dbaf0ae7c17cdc46b9f662c56da2054887b8d9e737c1476f335c83d33db/numpy-2.3.2-cp314-cp314t-win_amd64.whl", hash = "sha256:087ffc25890d89a43536f75c5fe8770922008758e8eeeef61733957041ed2f9b", size = 13111856, upload-time = "2025-07-24T20:56:17.318Z" }, - { url = "https://files.pythonhosted.org/packages/c1/9e/1652778bce745a67b5fe05adde60ed362d38eb17d919a540e813d30f6874/numpy-2.3.2-cp314-cp314t-win_arm64.whl", hash = "sha256:092aeb3449833ea9c0bf0089d70c29ae480685dd2377ec9cdbbb620257f84631", size = 10544226, upload-time = "2025-07-24T20:56:34.509Z" }, - { url = "https://files.pythonhosted.org/packages/cf/ea/50ebc91d28b275b23b7128ef25c3d08152bc4068f42742867e07a870a42a/numpy-2.3.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:14a91ebac98813a49bc6aa1a0dfc09513dcec1d97eaf31ca21a87221a1cdcb15", size = 21130338, upload-time = "2025-07-24T20:57:54.37Z" }, - { url = 
"https://files.pythonhosted.org/packages/9f/57/cdd5eac00dd5f137277355c318a955c0d8fb8aa486020c22afd305f8b88f/numpy-2.3.2-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:71669b5daae692189540cffc4c439468d35a3f84f0c88b078ecd94337f6cb0ec", size = 14375776, upload-time = "2025-07-24T20:58:16.303Z" }, - { url = "https://files.pythonhosted.org/packages/83/85/27280c7f34fcd305c2209c0cdca4d70775e4859a9eaa92f850087f8dea50/numpy-2.3.2-pp311-pypy311_pp73-macosx_14_0_arm64.whl", hash = "sha256:69779198d9caee6e547adb933941ed7520f896fd9656834c300bdf4dd8642712", size = 5304882, upload-time = "2025-07-24T20:58:26.199Z" }, - { url = "https://files.pythonhosted.org/packages/48/b4/6500b24d278e15dd796f43824e69939d00981d37d9779e32499e823aa0aa/numpy-2.3.2-pp311-pypy311_pp73-macosx_14_0_x86_64.whl", hash = "sha256:2c3271cc4097beb5a60f010bcc1cc204b300bb3eafb4399376418a83a1c6373c", size = 6818405, upload-time = "2025-07-24T20:58:37.341Z" }, - { url = "https://files.pythonhosted.org/packages/9b/c9/142c1e03f199d202da8e980c2496213509291b6024fd2735ad28ae7065c7/numpy-2.3.2-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8446acd11fe3dc1830568c941d44449fd5cb83068e5c70bd5a470d323d448296", size = 14419651, upload-time = "2025-07-24T20:58:59.048Z" }, - { url = "https://files.pythonhosted.org/packages/8b/95/8023e87cbea31a750a6c00ff9427d65ebc5fef104a136bfa69f76266d614/numpy-2.3.2-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:aa098a5ab53fa407fded5870865c6275a5cd4101cfdef8d6fafc48286a96e981", size = 16760166, upload-time = "2025-07-24T21:28:56.38Z" }, - { url = "https://files.pythonhosted.org/packages/78/e3/6690b3f85a05506733c7e90b577e4762517404ea78bab2ca3a5cb1aeb78d/numpy-2.3.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:6936aff90dda378c09bea075af0d9c675fe3a977a9d2402f95a87f440f59f619", size = 12977811, upload-time = "2025-07-24T21:29:18.234Z" }, -] - -[[package]] -name = "packaging" -version = "25.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a1/d4/1fc4078c65507b51b96ca8f8c3ba19e6a61c8253c72794544580a7b6c24d/packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f", size = 165727, upload-time = "2025-04-19T11:48:59.673Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" }, -] - -[[package]] -name = "pandas" -version = "2.0.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "numpy", version = "1.24.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "python-dateutil", marker = "python_full_version < '3.9'" }, - { name = "pytz", marker = "python_full_version < '3.9'" }, - { name = "tzdata", marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b1/a7/824332581e258b5aa4f3763ecb2a797e5f9a54269044ba2e50ac19936b32/pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c", size = 5284455, upload-time = "2023-06-28T23:19:33.371Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/3c/b2/0d4a5729ce1ce11630c4fc5d5522a33b967b3ca146c210f58efde7c40e99/pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8", size = 11760908, upload-time = "2023-06-28T23:15:57.001Z" }, - { url = "https://files.pythonhosted.org/packages/4a/f6/f620ca62365d83e663a255a41b08d2fc2eaf304e0b8b21bb6d62a7390fe3/pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f", size = 10823486, upload-time = "2023-06-28T23:16:06.863Z" }, - { url = "https://files.pythonhosted.org/packages/c2/59/cb4234bc9b968c57e81861b306b10cd8170272c57b098b724d3de5eda124/pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183", size = 11571897, upload-time = "2023-06-28T23:16:14.208Z" }, - { url = "https://files.pythonhosted.org/packages/e3/59/35a2892bf09ded9c1bf3804461efe772836a5261ef5dfb4e264ce813ff99/pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0", size = 12306421, upload-time = "2023-06-28T23:16:23.26Z" }, - { url = "https://files.pythonhosted.org/packages/94/71/3a0c25433c54bb29b48e3155b959ac78f4c4f2f06f94d8318aac612cb80f/pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210", size = 9540792, upload-time = "2023-06-28T23:16:30.876Z" }, - { url = "https://files.pythonhosted.org/packages/ed/30/b97456e7063edac0e5a405128065f0cd2033adfe3716fb2256c186bd41d0/pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e", size = 10664333, upload-time = "2023-06-28T23:16:39.209Z" }, - { url = "https://files.pythonhosted.org/packages/b3/92/a5e5133421b49e901a12e02a6a7ef3a0130e10d13db8cb657fdd0cba3b90/pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8", size = 11645672, upload-time = "2023-06-28T23:16:47.601Z" }, - { url = "https://files.pythonhosted.org/packages/8f/bb/aea1fbeed5b474cb8634364718abe9030d7cc7a30bf51f40bd494bbc89a2/pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26", size = 10693229, upload-time = "2023-06-28T23:16:56.397Z" }, - { url = "https://files.pythonhosted.org/packages/d6/90/e7d387f1a416b14e59290baa7a454a90d719baebbf77433ff1bdcc727800/pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d", size = 11581591, upload-time = "2023-06-28T23:17:04.234Z" }, - { url = "https://files.pythonhosted.org/packages/d0/28/88b81881c056376254618fad622a5e94b5126db8c61157ea1910cd1c040a/pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df", size = 12219370, upload-time = "2023-06-28T23:17:11.783Z" }, - { url = "https://files.pythonhosted.org/packages/e4/a5/212b9039e25bf8ebb97e417a96660e3dc925dacd3f8653d531b8f7fd9be4/pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd", size = 9482935, upload-time = "2023-06-28T23:17:21.376Z" }, - { url = 
"https://files.pythonhosted.org/packages/9e/71/756a1be6bee0209d8c0d8c5e3b9fc72c00373f384a4017095ec404aec3ad/pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b", size = 10607692, upload-time = "2023-06-28T23:17:28.824Z" }, - { url = "https://files.pythonhosted.org/packages/78/a8/07dd10f90ca915ed914853cd57f79bfc22e1ef4384ab56cb4336d2fc1f2a/pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061", size = 11653303, upload-time = "2023-06-28T23:17:36.329Z" }, - { url = "https://files.pythonhosted.org/packages/53/c3/f8e87361f7fdf42012def602bfa2a593423c729f5cb7c97aed7f51be66ac/pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5", size = 10710932, upload-time = "2023-06-28T23:17:49.875Z" }, - { url = "https://files.pythonhosted.org/packages/a7/87/828d50c81ce0f434163bf70b925a0eec6076808e0bca312a79322b141f66/pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089", size = 11684018, upload-time = "2023-06-28T23:18:05.845Z" }, - { url = "https://files.pythonhosted.org/packages/f8/7f/5b047effafbdd34e52c9e2d7e44f729a0655efafb22198c45cf692cdc157/pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0", size = 12353723, upload-time = "2023-06-28T23:18:17.631Z" }, - { url = "https://files.pythonhosted.org/packages/ea/ae/26a2eda7fa581347d69e51f93892493b2074ef3352ac71033c9f32c52389/pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02", size = 9646403, upload-time = "2023-06-28T23:18:24.328Z" }, - { url = "https://files.pythonhosted.org/packages/c3/6c/ea362eef61f05553aaf1a24b3e96b2d0603f5dc71a3bd35688a24ed88843/pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78", size = 10777638, upload-time = "2023-06-28T23:18:30.947Z" }, - { url = "https://files.pythonhosted.org/packages/f8/c7/cfef920b7b457dff6928e824896cb82367650ea127d048ee0b820026db4f/pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b", size = 11834160, upload-time = "2023-06-28T23:18:40.332Z" }, - { url = "https://files.pythonhosted.org/packages/6c/1c/689c9d99bc4e5d366a5fd871f0bcdee98a6581e240f96b78d2d08f103774/pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e", size = 10862752, upload-time = "2023-06-28T23:18:50.016Z" }, - { url = "https://files.pythonhosted.org/packages/cc/b8/4d082f41c27c95bf90485d1447b647cc7e5680fea75e315669dc6e4cb398/pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b", size = 11715852, upload-time = "2023-06-28T23:19:00.594Z" }, - { url = "https://files.pythonhosted.org/packages/9e/0d/91a9fd2c202f2b1d97a38ab591890f86480ecbb596cbc56d035f6f23fdcc/pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641", size = 12398496, upload-time = "2023-06-28T23:19:11.78Z" }, - { url = 
"https://files.pythonhosted.org/packages/26/7d/d8aa0a2c4f3f5f8ea59fb946c8eafe8f508090ca73e2b08a9af853c1103e/pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682", size = 9630766, upload-time = "2023-06-28T23:19:18.182Z" }, - { url = "https://files.pythonhosted.org/packages/9a/f2/0ad053856debbe90c83de1b4f05915f85fd2146f20faf9daa3b320d36df3/pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc", size = 10755902, upload-time = "2023-06-28T23:19:25.151Z" }, -] - -[[package]] -name = "pandas" -version = "2.3.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] -dependencies = [ - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, - { name = "numpy", version = "2.3.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "python-dateutil", marker = "python_full_version >= '3.9'" }, - { name = "pytz", marker = "python_full_version >= '3.9'" }, - { name = "tzdata", marker = "python_full_version >= '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/d1/6f/75aa71f8a14267117adeeed5d21b204770189c0a0025acbdc03c337b28fc/pandas-2.3.1.tar.gz", hash = "sha256:0a95b9ac964fe83ce317827f80304d37388ea77616b1425f0ae41c9d2d0d7bb2", size = 4487493, upload-time = "2025-07-07T19:20:04.079Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c4/ca/aa97b47287221fa37a49634532e520300088e290b20d690b21ce3e448143/pandas-2.3.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:22c2e866f7209ebc3a8f08d75766566aae02bcc91d196935a1d9e59c7b990ac9", size = 11542731, upload-time = "2025-07-07T19:18:12.619Z" }, - { url = "https://files.pythonhosted.org/packages/80/bf/7938dddc5f01e18e573dcfb0f1b8c9357d9b5fa6ffdee6e605b92efbdff2/pandas-2.3.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3583d348546201aff730c8c47e49bc159833f971c2899d6097bce68b9112a4f1", size = 10790031, upload-time = "2025-07-07T19:18:16.611Z" }, - { url = "https://files.pythonhosted.org/packages/ee/2f/9af748366763b2a494fed477f88051dbf06f56053d5c00eba652697e3f94/pandas-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f951fbb702dacd390561e0ea45cdd8ecfa7fb56935eb3dd78e306c19104b9b0", size = 11724083, upload-time = "2025-07-07T19:18:20.512Z" }, - { url = "https://files.pythonhosted.org/packages/2c/95/79ab37aa4c25d1e7df953dde407bb9c3e4ae47d154bc0dd1692f3a6dcf8c/pandas-2.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd05b72ec02ebfb993569b4931b2e16fbb4d6ad6ce80224a3ee838387d83a191", size = 12342360, upload-time = "2025-07-07T19:18:23.194Z" }, - { url = "https://files.pythonhosted.org/packages/75/a7/d65e5d8665c12c3c6ff5edd9709d5836ec9b6f80071b7f4a718c6106e86e/pandas-2.3.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:1b916a627919a247d865aed068eb65eb91a344b13f5b57ab9f610b7716c92de1", size = 13202098, upload-time = "2025-07-07T19:18:25.558Z" }, - { url = 
"https://files.pythonhosted.org/packages/65/f3/4c1dbd754dbaa79dbf8b537800cb2fa1a6e534764fef50ab1f7533226c5c/pandas-2.3.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:fe67dc676818c186d5a3d5425250e40f179c2a89145df477dd82945eaea89e97", size = 13837228, upload-time = "2025-07-07T19:18:28.344Z" }, - { url = "https://files.pythonhosted.org/packages/3f/d6/d7f5777162aa9b48ec3910bca5a58c9b5927cfd9cfde3aa64322f5ba4b9f/pandas-2.3.1-cp310-cp310-win_amd64.whl", hash = "sha256:2eb789ae0274672acbd3c575b0598d213345660120a257b47b5dafdc618aec83", size = 11336561, upload-time = "2025-07-07T19:18:31.211Z" }, - { url = "https://files.pythonhosted.org/packages/76/1c/ccf70029e927e473a4476c00e0d5b32e623bff27f0402d0a92b7fc29bb9f/pandas-2.3.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2b0540963d83431f5ce8870ea02a7430adca100cec8a050f0811f8e31035541b", size = 11566608, upload-time = "2025-07-07T19:18:33.86Z" }, - { url = "https://files.pythonhosted.org/packages/ec/d3/3c37cb724d76a841f14b8f5fe57e5e3645207cc67370e4f84717e8bb7657/pandas-2.3.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fe7317f578c6a153912bd2292f02e40c1d8f253e93c599e82620c7f69755c74f", size = 10823181, upload-time = "2025-07-07T19:18:36.151Z" }, - { url = "https://files.pythonhosted.org/packages/8a/4c/367c98854a1251940edf54a4df0826dcacfb987f9068abf3e3064081a382/pandas-2.3.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e6723a27ad7b244c0c79d8e7007092d7c8f0f11305770e2f4cd778b3ad5f9f85", size = 11793570, upload-time = "2025-07-07T19:18:38.385Z" }, - { url = "https://files.pythonhosted.org/packages/07/5f/63760ff107bcf5146eee41b38b3985f9055e710a72fdd637b791dea3495c/pandas-2.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3462c3735fe19f2638f2c3a40bd94ec2dc5ba13abbb032dd2fa1f540a075509d", size = 12378887, upload-time = "2025-07-07T19:18:41.284Z" }, - { url = "https://files.pythonhosted.org/packages/15/53/f31a9b4dfe73fe4711c3a609bd8e60238022f48eacedc257cd13ae9327a7/pandas-2.3.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:98bcc8b5bf7afed22cc753a28bc4d9e26e078e777066bc53fac7904ddef9a678", size = 13230957, upload-time = "2025-07-07T19:18:44.187Z" }, - { url = "https://files.pythonhosted.org/packages/e0/94/6fce6bf85b5056d065e0a7933cba2616dcb48596f7ba3c6341ec4bcc529d/pandas-2.3.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4d544806b485ddf29e52d75b1f559142514e60ef58a832f74fb38e48d757b299", size = 13883883, upload-time = "2025-07-07T19:18:46.498Z" }, - { url = "https://files.pythonhosted.org/packages/c8/7b/bdcb1ed8fccb63d04bdb7635161d0ec26596d92c9d7a6cce964e7876b6c1/pandas-2.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:b3cd4273d3cb3707b6fffd217204c52ed92859533e31dc03b7c5008aa933aaab", size = 11340212, upload-time = "2025-07-07T19:18:49.293Z" }, - { url = "https://files.pythonhosted.org/packages/46/de/b8445e0f5d217a99fe0eeb2f4988070908979bec3587c0633e5428ab596c/pandas-2.3.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:689968e841136f9e542020698ee1c4fbe9caa2ed2213ae2388dc7b81721510d3", size = 11588172, upload-time = "2025-07-07T19:18:52.054Z" }, - { url = "https://files.pythonhosted.org/packages/1e/e0/801cdb3564e65a5ac041ab99ea6f1d802a6c325bb6e58c79c06a3f1cd010/pandas-2.3.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:025e92411c16cbe5bb2a4abc99732a6b132f439b8aab23a59fa593eb00704232", size = 10717365, upload-time = "2025-07-07T19:18:54.785Z" }, - { url = 
"https://files.pythonhosted.org/packages/51/a5/c76a8311833c24ae61a376dbf360eb1b1c9247a5d9c1e8b356563b31b80c/pandas-2.3.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b7ff55f31c4fcb3e316e8f7fa194566b286d6ac430afec0d461163312c5841e", size = 11280411, upload-time = "2025-07-07T19:18:57.045Z" }, - { url = "https://files.pythonhosted.org/packages/da/01/e383018feba0a1ead6cf5fe8728e5d767fee02f06a3d800e82c489e5daaf/pandas-2.3.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7dcb79bf373a47d2a40cf7232928eb7540155abbc460925c2c96d2d30b006eb4", size = 11988013, upload-time = "2025-07-07T19:18:59.771Z" }, - { url = "https://files.pythonhosted.org/packages/5b/14/cec7760d7c9507f11c97d64f29022e12a6cc4fc03ac694535e89f88ad2ec/pandas-2.3.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:56a342b231e8862c96bdb6ab97170e203ce511f4d0429589c8ede1ee8ece48b8", size = 12767210, upload-time = "2025-07-07T19:19:02.944Z" }, - { url = "https://files.pythonhosted.org/packages/50/b9/6e2d2c6728ed29fb3d4d4d302504fb66f1a543e37eb2e43f352a86365cdf/pandas-2.3.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ca7ed14832bce68baef331f4d7f294411bed8efd032f8109d690df45e00c4679", size = 13440571, upload-time = "2025-07-07T19:19:06.82Z" }, - { url = "https://files.pythonhosted.org/packages/80/a5/3a92893e7399a691bad7664d977cb5e7c81cf666c81f89ea76ba2bff483d/pandas-2.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:ac942bfd0aca577bef61f2bc8da8147c4ef6879965ef883d8e8d5d2dc3e744b8", size = 10987601, upload-time = "2025-07-07T19:19:09.589Z" }, - { url = "https://files.pythonhosted.org/packages/32/ed/ff0a67a2c5505e1854e6715586ac6693dd860fbf52ef9f81edee200266e7/pandas-2.3.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9026bd4a80108fac2239294a15ef9003c4ee191a0f64b90f170b40cfb7cf2d22", size = 11531393, upload-time = "2025-07-07T19:19:12.245Z" }, - { url = "https://files.pythonhosted.org/packages/c7/db/d8f24a7cc9fb0972adab0cc80b6817e8bef888cfd0024eeb5a21c0bb5c4a/pandas-2.3.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6de8547d4fdb12421e2d047a2c446c623ff4c11f47fddb6b9169eb98ffba485a", size = 10668750, upload-time = "2025-07-07T19:19:14.612Z" }, - { url = "https://files.pythonhosted.org/packages/0f/b0/80f6ec783313f1e2356b28b4fd8d2148c378370045da918c73145e6aab50/pandas-2.3.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:782647ddc63c83133b2506912cc6b108140a38a37292102aaa19c81c83db2928", size = 11342004, upload-time = "2025-07-07T19:19:16.857Z" }, - { url = "https://files.pythonhosted.org/packages/e9/e2/20a317688435470872885e7fc8f95109ae9683dec7c50be29b56911515a5/pandas-2.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ba6aff74075311fc88504b1db890187a3cd0f887a5b10f5525f8e2ef55bfdb9", size = 12050869, upload-time = "2025-07-07T19:19:19.265Z" }, - { url = "https://files.pythonhosted.org/packages/55/79/20d746b0a96c67203a5bee5fb4e00ac49c3e8009a39e1f78de264ecc5729/pandas-2.3.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e5635178b387bd2ba4ac040f82bc2ef6e6b500483975c4ebacd34bec945fda12", size = 12750218, upload-time = "2025-07-07T19:19:21.547Z" }, - { url = "https://files.pythonhosted.org/packages/7c/0f/145c8b41e48dbf03dd18fdd7f24f8ba95b8254a97a3379048378f33e7838/pandas-2.3.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6f3bf5ec947526106399a9e1d26d40ee2b259c66422efdf4de63c848492d91bb", size = 13416763, upload-time = "2025-07-07T19:19:23.939Z" }, - { url = 
"https://files.pythonhosted.org/packages/b2/c0/54415af59db5cdd86a3d3bf79863e8cc3fa9ed265f0745254061ac09d5f2/pandas-2.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:1c78cf43c8fde236342a1cb2c34bcff89564a7bfed7e474ed2fffa6aed03a956", size = 10987482, upload-time = "2025-07-07T19:19:42.699Z" }, - { url = "https://files.pythonhosted.org/packages/48/64/2fd2e400073a1230e13b8cd604c9bc95d9e3b962e5d44088ead2e8f0cfec/pandas-2.3.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:8dfc17328e8da77be3cf9f47509e5637ba8f137148ed0e9b5241e1baf526e20a", size = 12029159, upload-time = "2025-07-07T19:19:26.362Z" }, - { url = "https://files.pythonhosted.org/packages/d8/0a/d84fd79b0293b7ef88c760d7dca69828d867c89b6d9bc52d6a27e4d87316/pandas-2.3.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:ec6c851509364c59a5344458ab935e6451b31b818be467eb24b0fe89bd05b6b9", size = 11393287, upload-time = "2025-07-07T19:19:29.157Z" }, - { url = "https://files.pythonhosted.org/packages/50/ae/ff885d2b6e88f3c7520bb74ba319268b42f05d7e583b5dded9837da2723f/pandas-2.3.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:911580460fc4884d9b05254b38a6bfadddfcc6aaef856fb5859e7ca202e45275", size = 11309381, upload-time = "2025-07-07T19:19:31.436Z" }, - { url = "https://files.pythonhosted.org/packages/85/86/1fa345fc17caf5d7780d2699985c03dbe186c68fee00b526813939062bb0/pandas-2.3.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f4d6feeba91744872a600e6edbbd5b033005b431d5ae8379abee5bcfa479fab", size = 11883998, upload-time = "2025-07-07T19:19:34.267Z" }, - { url = "https://files.pythonhosted.org/packages/81/aa/e58541a49b5e6310d89474333e994ee57fea97c8aaa8fc7f00b873059bbf/pandas-2.3.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:fe37e757f462d31a9cd7580236a82f353f5713a80e059a29753cf938c6775d96", size = 12704705, upload-time = "2025-07-07T19:19:36.856Z" }, - { url = "https://files.pythonhosted.org/packages/d5/f9/07086f5b0f2a19872554abeea7658200824f5835c58a106fa8f2ae96a46c/pandas-2.3.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5db9637dbc24b631ff3707269ae4559bce4b7fd75c1c4d7e13f40edc42df4444", size = 13189044, upload-time = "2025-07-07T19:19:39.999Z" }, - { url = "https://files.pythonhosted.org/packages/6e/21/ecf2df680982616459409b09962a8c2065330c7151dc6538069f3b634acf/pandas-2.3.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4645f770f98d656f11c69e81aeb21c6fca076a44bed3dcbb9396a4311bc7f6d8", size = 11567275, upload-time = "2025-07-07T19:19:45.152Z" }, - { url = "https://files.pythonhosted.org/packages/1e/1a/dcb50e44b75419e96b276c9fb023b0f147b3c411be1cd517492aa2a184d4/pandas-2.3.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:342e59589cc454aaff7484d75b816a433350b3d7964d7847327edda4d532a2e3", size = 10811488, upload-time = "2025-07-07T19:19:47.797Z" }, - { url = "https://files.pythonhosted.org/packages/2d/55/66cd2b679f6a27398380eac7574bc24746128f74626a3c02b978ea00e5ce/pandas-2.3.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d12f618d80379fde6af007f65f0c25bd3e40251dbd1636480dfffce2cf1e6da", size = 11763000, upload-time = "2025-07-07T19:19:50.83Z" }, - { url = "https://files.pythonhosted.org/packages/ae/1c/5b9b263c80fd5e231b77df6f78cd7426d1d4ad3a4e858e85b7b3d93d0e9c/pandas-2.3.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd71c47a911da120d72ef173aeac0bf5241423f9bfea57320110a978457e069e", size = 12361395, upload-time = "2025-07-07T19:19:53.714Z" }, - { url = 
"https://files.pythonhosted.org/packages/f7/74/7e817b31413fbb96366ea327d43d1926a9c48c58074e27e094e2839a0e36/pandas-2.3.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:09e3b1587f0f3b0913e21e8b32c3119174551deb4a4eba4a89bc7377947977e7", size = 13225086, upload-time = "2025-07-07T19:19:56.378Z" }, - { url = "https://files.pythonhosted.org/packages/1f/0f/bc0a44b47eba2f22ae4235719a573d552ef7ad76ed3ea39ae62d554e040b/pandas-2.3.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:2323294c73ed50f612f67e2bf3ae45aea04dce5690778e08a09391897f35ff88", size = 13871698, upload-time = "2025-07-07T19:19:58.854Z" }, - { url = "https://files.pythonhosted.org/packages/fa/cb/6c32f8fadefa4314b740fbe8f74f6a02423bd1549e7c930826df35ac3c1b/pandas-2.3.1-cp39-cp39-win_amd64.whl", hash = "sha256:b4b0de34dc8499c2db34000ef8baad684cfa4cbd836ecee05f323ebfba348c7d", size = 11357186, upload-time = "2025-07-07T19:20:01.475Z" }, -] - -[[package]] -name = "pathspec" -version = "0.12.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ca/bc/f35b8446f4531a7cb215605d100cd88b7ac6f44ab3fc94870c120ab3adbf/pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712", size = 51043, upload-time = "2023-12-10T22:30:45Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191, upload-time = "2023-12-10T22:30:43.14Z" }, -] - -[[package]] -name = "platformdirs" -version = "4.3.6" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/13/fc/128cc9cb8f03208bdbf93d3aa862e16d376844a14f9a0ce5cf4507372de4/platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907", size = 21302, upload-time = "2024-09-17T19:06:50.688Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3c/a6/bc1012356d8ece4d66dd75c4b9fc6c1f6650ddd5991e421177d9f8f671be/platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb", size = 18439, upload-time = "2024-09-17T19:06:49.212Z" }, -] - -[[package]] -name = "platformdirs" -version = "4.3.8" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] -sdist = { url = "https://files.pythonhosted.org/packages/fe/8b/3c73abc9c759ecd3f1f7ceff6685840859e8070c4d947c93fae71f6a0bf2/platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc", size = 21362, upload-time = "2025-05-07T22:47:42.121Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4", size = 18567, upload-time = "2025-05-07T22:47:40.376Z" }, -] - -[[package]] -name = "pluggy" -version = "1.5.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = 
"https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955, upload-time = "2024-04-20T21:34:42.531Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556, upload-time = "2024-04-20T21:34:40.434Z" }, -] - -[[package]] -name = "pluggy" -version = "1.6.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] -sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, -] - -[[package]] -name = "pygments" -version = "2.19.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, -] - -[[package]] -name = "pytest" -version = "8.3.5" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "colorama", marker = "python_full_version < '3.9' and sys_platform == 'win32'" }, - { name = "exceptiongroup", marker = "python_full_version < '3.9'" }, - { name = "iniconfig", marker = "python_full_version < '3.9'" }, - { name = "packaging", marker = "python_full_version < '3.9'" }, - { name = "pluggy", version = "1.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "tomli", marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/ae/3c/c9d525a414d506893f0cd8a8d0de7706446213181570cdbd766691164e40/pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845", size = 1450891, upload-time = "2025-03-02T12:54:54.503Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/30/3d/64ad57c803f1fa1e963a7946b6e0fea4a70df53c1a7fed304586539c2bac/pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820", size = 343634, upload-time = "2025-03-02T12:54:52.069Z" }, -] - -[[package]] -name = "pytest" -version = "8.4.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - 
"python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] -dependencies = [ - { name = "colorama", marker = "python_full_version >= '3.9' and sys_platform == 'win32'" }, - { name = "exceptiongroup", marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, - { name = "iniconfig", marker = "python_full_version >= '3.9'" }, - { name = "packaging", marker = "python_full_version >= '3.9'" }, - { name = "pluggy", version = "1.6.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "pygments", marker = "python_full_version >= '3.9'" }, - { name = "tomli", marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/08/ba/45911d754e8eba3d5a841a5ce61a65a685ff1798421ac054f85aa8747dfb/pytest-8.4.1.tar.gz", hash = "sha256:7c67fd69174877359ed9371ec3af8a3d2b04741818c51e5e99cc1742251fa93c", size = 1517714, upload-time = "2025-06-18T05:48:06.109Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/29/16/c8a903f4c4dffe7a12843191437d7cd8e32751d5de349d45d3fe69544e87/pytest-8.4.1-py3-none-any.whl", hash = "sha256:539c70ba6fcead8e78eebbf1115e8b589e7565830d7d006a8723f19ac8a0afb7", size = 365474, upload-time = "2025-06-18T05:48:03.955Z" }, -] - -[[package]] -name = "pytest-asyncio" -version = "0.24.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "pytest", version = "8.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/52/6d/c6cf50ce320cf8611df7a1254d86233b3df7cc07f9b5f5cbcb82e08aa534/pytest_asyncio-0.24.0.tar.gz", hash = "sha256:d081d828e576d85f875399194281e92bf8a68d60d72d1a2faf2feddb6c46b276", size = 49855, upload-time = "2024-08-22T08:03:18.145Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/96/31/6607dab48616902f76885dfcf62c08d929796fc3b2d2318faf9fd54dbed9/pytest_asyncio-0.24.0-py3-none-any.whl", hash = "sha256:a811296ed596b69bf0b6f3dc40f83bcaf341b155a269052d82efa2b25ac7037b", size = 18024, upload-time = "2024-08-22T08:03:15.536Z" }, -] - -[[package]] -name = "pytest-asyncio" -version = "1.1.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] -dependencies = [ - { name = "backports-asyncio-runner", marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, - { name = "pytest", version = "8.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "typing-extensions", version = "4.14.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/4e/51/f8794af39eeb870e87a8c8068642fc07bce0c854d6865d7dd0f2a9d338c2/pytest_asyncio-1.1.0.tar.gz", hash = "sha256:796aa822981e01b68c12e4827b8697108f7205020f24b5793b3c41555dab68ea", size = 46652, upload-time = "2025-07-16T04:29:26.393Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c7/9d/bf86eddabf8c6c9cb1ea9a869d6873b46f105a5d292d3a6f7071f5b07935/pytest_asyncio-1.1.0-py3-none-any.whl", hash = 
"sha256:5fe2d69607b0bd75c656d1211f969cadba035030156745ee09e7d71740e58ecf", size = 15157, upload-time = "2025-07-16T04:29:24.929Z" }, -] - -[[package]] -name = "python-dateutil" -version = "2.9.0.post0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "six" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, -] - -[[package]] -name = "pytz" -version = "2025.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = "2025-03-25T02:25:00.538Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" }, -] - -[[package]] -name = "requests" -version = "2.32.4" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "certifi" }, - { name = "charset-normalizer" }, - { name = "idna" }, - { name = "urllib3", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "urllib3", version = "2.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/e1/0a/929373653770d8a0d7ea76c37de6e41f11eb07559b103b1c02cafb3f7cf8/requests-2.32.4.tar.gz", hash = "sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422", size = 135258, upload-time = "2025-06-09T16:43:07.34Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c", size = 64847, upload-time = "2025-06-09T16:43:05.728Z" }, -] - -[[package]] -name = "six" -version = "1.17.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, -] - -[[package]] -name = "tomli" -version = "2.2.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/18/87/302344fed471e44a87289cf4967697d07e532f2421fdaf868a303cbae4ff/tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff", size = 17175, upload-time = "2024-11-27T22:38:36.873Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/43/ca/75707e6efa2b37c77dadb324ae7d9571cb424e61ea73fad7c56c2d14527f/tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249", size = 131077, upload-time = "2024-11-27T22:37:54.956Z" }, - { url = "https://files.pythonhosted.org/packages/c7/16/51ae563a8615d472fdbffc43a3f3d46588c264ac4f024f63f01283becfbb/tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6", size = 123429, upload-time = "2024-11-27T22:37:56.698Z" }, - { url = "https://files.pythonhosted.org/packages/f1/dd/4f6cd1e7b160041db83c694abc78e100473c15d54620083dbd5aae7b990e/tomli-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a", size = 226067, upload-time = "2024-11-27T22:37:57.63Z" }, - { url = "https://files.pythonhosted.org/packages/a9/6b/c54ede5dc70d648cc6361eaf429304b02f2871a345bbdd51e993d6cdf550/tomli-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee", size = 236030, upload-time = "2024-11-27T22:37:59.344Z" }, - { url = "https://files.pythonhosted.org/packages/1f/47/999514fa49cfaf7a92c805a86c3c43f4215621855d151b61c602abb38091/tomli-2.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e", size = 240898, upload-time = "2024-11-27T22:38:00.429Z" }, - { url = "https://files.pythonhosted.org/packages/73/41/0a01279a7ae09ee1573b423318e7934674ce06eb33f50936655071d81a24/tomli-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4", size = 229894, upload-time = "2024-11-27T22:38:02.094Z" }, - { url = "https://files.pythonhosted.org/packages/55/18/5d8bc5b0a0362311ce4d18830a5d28943667599a60d20118074ea1b01bb7/tomli-2.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106", size = 245319, upload-time = "2024-11-27T22:38:03.206Z" }, - { url = "https://files.pythonhosted.org/packages/92/a3/7ade0576d17f3cdf5ff44d61390d4b3febb8a9fc2b480c75c47ea048c646/tomli-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8", size = 238273, upload-time = "2024-11-27T22:38:04.217Z" }, - { url = "https://files.pythonhosted.org/packages/72/6f/fa64ef058ac1446a1e51110c375339b3ec6be245af9d14c87c4a6412dd32/tomli-2.2.1-cp311-cp311-win32.whl", hash = "sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff", size = 98310, upload-time = "2024-11-27T22:38:05.908Z" }, - { url = "https://files.pythonhosted.org/packages/6a/1c/4a2dcde4a51b81be3530565e92eda625d94dafb46dbeb15069df4caffc34/tomli-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b", size = 108309, upload-time = "2024-11-27T22:38:06.812Z" }, - { url = 
"https://files.pythonhosted.org/packages/52/e1/f8af4c2fcde17500422858155aeb0d7e93477a0d59a98e56cbfe75070fd0/tomli-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea", size = 132762, upload-time = "2024-11-27T22:38:07.731Z" }, - { url = "https://files.pythonhosted.org/packages/03/b8/152c68bb84fc00396b83e7bbddd5ec0bd3dd409db4195e2a9b3e398ad2e3/tomli-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8", size = 123453, upload-time = "2024-11-27T22:38:09.384Z" }, - { url = "https://files.pythonhosted.org/packages/c8/d6/fc9267af9166f79ac528ff7e8c55c8181ded34eb4b0e93daa767b8841573/tomli-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192", size = 233486, upload-time = "2024-11-27T22:38:10.329Z" }, - { url = "https://files.pythonhosted.org/packages/5c/51/51c3f2884d7bab89af25f678447ea7d297b53b5a3b5730a7cb2ef6069f07/tomli-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222", size = 242349, upload-time = "2024-11-27T22:38:11.443Z" }, - { url = "https://files.pythonhosted.org/packages/ab/df/bfa89627d13a5cc22402e441e8a931ef2108403db390ff3345c05253935e/tomli-2.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77", size = 252159, upload-time = "2024-11-27T22:38:13.099Z" }, - { url = "https://files.pythonhosted.org/packages/9e/6e/fa2b916dced65763a5168c6ccb91066f7639bdc88b48adda990db10c8c0b/tomli-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6", size = 237243, upload-time = "2024-11-27T22:38:14.766Z" }, - { url = "https://files.pythonhosted.org/packages/b4/04/885d3b1f650e1153cbb93a6a9782c58a972b94ea4483ae4ac5cedd5e4a09/tomli-2.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd", size = 259645, upload-time = "2024-11-27T22:38:15.843Z" }, - { url = "https://files.pythonhosted.org/packages/9c/de/6b432d66e986e501586da298e28ebeefd3edc2c780f3ad73d22566034239/tomli-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e", size = 244584, upload-time = "2024-11-27T22:38:17.645Z" }, - { url = "https://files.pythonhosted.org/packages/1c/9a/47c0449b98e6e7d1be6cbac02f93dd79003234ddc4aaab6ba07a9a7482e2/tomli-2.2.1-cp312-cp312-win32.whl", hash = "sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98", size = 98875, upload-time = "2024-11-27T22:38:19.159Z" }, - { url = "https://files.pythonhosted.org/packages/ef/60/9b9638f081c6f1261e2688bd487625cd1e660d0a85bd469e91d8db969734/tomli-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4", size = 109418, upload-time = "2024-11-27T22:38:20.064Z" }, - { url = "https://files.pythonhosted.org/packages/04/90/2ee5f2e0362cb8a0b6499dc44f4d7d48f8fff06d28ba46e6f1eaa61a1388/tomli-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7", size = 132708, upload-time = "2024-11-27T22:38:21.659Z" }, - { url = 
"https://files.pythonhosted.org/packages/c0/ec/46b4108816de6b385141f082ba99e315501ccd0a2ea23db4a100dd3990ea/tomli-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c", size = 123582, upload-time = "2024-11-27T22:38:22.693Z" }, - { url = "https://files.pythonhosted.org/packages/a0/bd/b470466d0137b37b68d24556c38a0cc819e8febe392d5b199dcd7f578365/tomli-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13", size = 232543, upload-time = "2024-11-27T22:38:24.367Z" }, - { url = "https://files.pythonhosted.org/packages/d9/e5/82e80ff3b751373f7cead2815bcbe2d51c895b3c990686741a8e56ec42ab/tomli-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281", size = 241691, upload-time = "2024-11-27T22:38:26.081Z" }, - { url = "https://files.pythonhosted.org/packages/05/7e/2a110bc2713557d6a1bfb06af23dd01e7dde52b6ee7dadc589868f9abfac/tomli-2.2.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272", size = 251170, upload-time = "2024-11-27T22:38:27.921Z" }, - { url = "https://files.pythonhosted.org/packages/64/7b/22d713946efe00e0adbcdfd6d1aa119ae03fd0b60ebed51ebb3fa9f5a2e5/tomli-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140", size = 236530, upload-time = "2024-11-27T22:38:29.591Z" }, - { url = "https://files.pythonhosted.org/packages/38/31/3a76f67da4b0cf37b742ca76beaf819dca0ebef26d78fc794a576e08accf/tomli-2.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2", size = 258666, upload-time = "2024-11-27T22:38:30.639Z" }, - { url = "https://files.pythonhosted.org/packages/07/10/5af1293da642aded87e8a988753945d0cf7e00a9452d3911dd3bb354c9e2/tomli-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744", size = 243954, upload-time = "2024-11-27T22:38:31.702Z" }, - { url = "https://files.pythonhosted.org/packages/5b/b9/1ed31d167be802da0fc95020d04cd27b7d7065cc6fbefdd2f9186f60d7bd/tomli-2.2.1-cp313-cp313-win32.whl", hash = "sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec", size = 98724, upload-time = "2024-11-27T22:38:32.837Z" }, - { url = "https://files.pythonhosted.org/packages/c7/32/b0963458706accd9afcfeb867c0f9175a741bf7b19cd424230714d722198/tomli-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69", size = 109383, upload-time = "2024-11-27T22:38:34.455Z" }, - { url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257, upload-time = "2024-11-27T22:38:35.385Z" }, -] - -[[package]] -name = "typing-extensions" -version = "4.13.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/f6/37/23083fcd6e35492953e8d2aaaa68b860eb422b34627b13f2ce3eb6106061/typing_extensions-4.13.2.tar.gz", hash = "sha256:e6c81219bd689f51865d9e372991c540bda33a0379d5573cddb9a3a23f7caaef", 
size = 106967, upload-time = "2025-04-10T14:19:05.416Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8b/54/b1ae86c0973cc6f0210b53d508ca3641fb6d0c56823f288d108bc7ab3cc8/typing_extensions-4.13.2-py3-none-any.whl", hash = "sha256:a439e7c04b49fec3e5d3e2beaa21755cadbbdc391694e28ccdd36ca4a1408f8c", size = 45806, upload-time = "2025-04-10T14:19:03.967Z" }, -] - -[[package]] -name = "typing-extensions" -version = "4.14.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] -sdist = { url = "https://files.pythonhosted.org/packages/98/5a/da40306b885cc8c09109dc2e1abd358d5684b1425678151cdaed4731c822/typing_extensions-4.14.1.tar.gz", hash = "sha256:38b39f4aeeab64884ce9f74c94263ef78f3c22467c8724005483154c26648d36", size = 107673, upload-time = "2025-07-04T13:28:34.16Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl", hash = "sha256:d1e1e3b58374dc93031d6eda2420a48ea44a36c2b4766a4fdeb3710755731d76", size = 43906, upload-time = "2025-07-04T13:28:32.743Z" }, -] - -[[package]] -name = "tzdata" -version = "2025.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be760d75c2c42e2780dc0873fe382da3e98a2e1e48361e5/tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9", size = 196380, upload-time = "2025-03-23T13:54:43.652Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" }, -] - -[[package]] -name = "urllib3" -version = "2.2.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/ed/63/22ba4ebfe7430b76388e7cd448d5478814d3032121827c12a2cc287e2260/urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9", size = 300677, upload-time = "2024-09-12T10:52:18.401Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ce/d9/5f4c13cecde62396b0d3fe530a50ccea91e7dfc1ccf0e09c228841bb5ba8/urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac", size = 126338, upload-time = "2024-09-12T10:52:16.589Z" }, -] - -[[package]] -name = "urllib3" -version = "2.5.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] -sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185, upload-time = "2025-06-18T14:07:41.644Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 
129795, upload-time = "2025-06-18T14:07:40.39Z" }, -] - -[[package]] -name = "websockets" -version = "13.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/e2/73/9223dbc7be3dcaf2a7bbf756c351ec8da04b1fa573edaf545b95f6b0c7fd/websockets-13.1.tar.gz", hash = "sha256:a3b3366087c1bc0a2795111edcadddb8b3b59509d5db5d7ea3fdd69f954a8878", size = 158549, upload-time = "2024-09-21T17:34:21.54Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0a/94/d15dbfc6a5eb636dbc754303fba18208f2e88cf97e733e1d64fb9cb5c89e/websockets-13.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f48c749857f8fb598fb890a75f540e3221d0976ed0bf879cf3c7eef34151acee", size = 157815, upload-time = "2024-09-21T17:32:27.107Z" }, - { url = "https://files.pythonhosted.org/packages/30/02/c04af33f4663945a26f5e8cf561eb140c35452b50af47a83c3fbcfe62ae1/websockets-13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c7e72ce6bda6fb9409cc1e8164dd41d7c91466fb599eb047cfda72fe758a34a7", size = 155466, upload-time = "2024-09-21T17:32:28.428Z" }, - { url = "https://files.pythonhosted.org/packages/35/e8/719f08d12303ea643655e52d9e9851b2dadbb1991d4926d9ce8862efa2f5/websockets-13.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f779498eeec470295a2b1a5d97aa1bc9814ecd25e1eb637bd9d1c73a327387f6", size = 155716, upload-time = "2024-09-21T17:32:29.905Z" }, - { url = "https://files.pythonhosted.org/packages/91/e1/14963ae0252a8925f7434065d25dcd4701d5e281a0b4b460a3b5963d2594/websockets-13.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4676df3fe46956fbb0437d8800cd5f2b6d41143b6e7e842e60554398432cf29b", size = 164806, upload-time = "2024-09-21T17:32:31.384Z" }, - { url = "https://files.pythonhosted.org/packages/ec/fa/ab28441bae5e682a0f7ddf3d03440c0c352f930da419301f4a717f675ef3/websockets-13.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a7affedeb43a70351bb811dadf49493c9cfd1ed94c9c70095fd177e9cc1541fa", size = 163810, upload-time = "2024-09-21T17:32:32.384Z" }, - { url = "https://files.pythonhosted.org/packages/44/77/dea187bd9d16d4b91566a2832be31f99a40d0f5bfa55eeb638eb2c3bc33d/websockets-13.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1971e62d2caa443e57588e1d82d15f663b29ff9dfe7446d9964a4b6f12c1e700", size = 164125, upload-time = "2024-09-21T17:32:33.398Z" }, - { url = "https://files.pythonhosted.org/packages/cf/d9/3af14544e83f1437eb684b399e6ba0fa769438e869bf5d83d74bc197fae8/websockets-13.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5f2e75431f8dc4a47f31565a6e1355fb4f2ecaa99d6b89737527ea917066e26c", size = 164532, upload-time = "2024-09-21T17:32:35.109Z" }, - { url = "https://files.pythonhosted.org/packages/1c/8a/6d332eabe7d59dfefe4b8ba6f46c8c5fabb15b71c8a8bc3d2b65de19a7b6/websockets-13.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:58cf7e75dbf7e566088b07e36ea2e3e2bd5676e22216e4cad108d4df4a7402a0", size = 163948, upload-time = "2024-09-21T17:32:36.214Z" }, - { url = "https://files.pythonhosted.org/packages/1a/91/a0aeadbaf3017467a1ee03f8fb67accdae233fe2d5ad4b038c0a84e357b0/websockets-13.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c90d6dec6be2c7d03378a574de87af9b1efea77d0c52a8301dd831ece938452f", size = 163898, upload-time = "2024-09-21T17:32:37.277Z" }, - { url = 
"https://files.pythonhosted.org/packages/71/31/a90fb47c63e0ae605be914b0b969d7c6e6ffe2038cd744798e4b3fbce53b/websockets-13.1-cp310-cp310-win32.whl", hash = "sha256:730f42125ccb14602f455155084f978bd9e8e57e89b569b4d7f0f0c17a448ffe", size = 158706, upload-time = "2024-09-21T17:32:38.755Z" }, - { url = "https://files.pythonhosted.org/packages/93/ca/9540a9ba80da04dc7f36d790c30cae4252589dbd52ccdc92e75b0be22437/websockets-13.1-cp310-cp310-win_amd64.whl", hash = "sha256:5993260f483d05a9737073be197371940c01b257cc45ae3f1d5d7adb371b266a", size = 159141, upload-time = "2024-09-21T17:32:40.495Z" }, - { url = "https://files.pythonhosted.org/packages/b2/f0/cf0b8a30d86b49e267ac84addbebbc7a48a6e7bb7c19db80f62411452311/websockets-13.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:61fc0dfcda609cda0fc9fe7977694c0c59cf9d749fbb17f4e9483929e3c48a19", size = 157813, upload-time = "2024-09-21T17:32:42.188Z" }, - { url = "https://files.pythonhosted.org/packages/bf/e7/22285852502e33071a8cf0ac814f8988480ec6db4754e067b8b9d0e92498/websockets-13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ceec59f59d092c5007e815def4ebb80c2de330e9588e101cf8bd94c143ec78a5", size = 155469, upload-time = "2024-09-21T17:32:43.858Z" }, - { url = "https://files.pythonhosted.org/packages/68/d4/c8c7c1e5b40ee03c5cc235955b0fb1ec90e7e37685a5f69229ad4708dcde/websockets-13.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c1dca61c6db1166c48b95198c0b7d9c990b30c756fc2923cc66f68d17dc558fd", size = 155717, upload-time = "2024-09-21T17:32:44.914Z" }, - { url = "https://files.pythonhosted.org/packages/c9/e4/c50999b9b848b1332b07c7fd8886179ac395cb766fda62725d1539e7bc6c/websockets-13.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:308e20f22c2c77f3f39caca508e765f8725020b84aa963474e18c59accbf4c02", size = 165379, upload-time = "2024-09-21T17:32:45.933Z" }, - { url = "https://files.pythonhosted.org/packages/bc/49/4a4ad8c072f18fd79ab127650e47b160571aacfc30b110ee305ba25fffc9/websockets-13.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62d516c325e6540e8a57b94abefc3459d7dab8ce52ac75c96cad5549e187e3a7", size = 164376, upload-time = "2024-09-21T17:32:46.987Z" }, - { url = "https://files.pythonhosted.org/packages/af/9b/8c06d425a1d5a74fd764dd793edd02be18cf6fc3b1ccd1f29244ba132dc0/websockets-13.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87c6e35319b46b99e168eb98472d6c7d8634ee37750d7693656dc766395df096", size = 164753, upload-time = "2024-09-21T17:32:48.046Z" }, - { url = "https://files.pythonhosted.org/packages/d5/5b/0acb5815095ff800b579ffc38b13ab1b915b317915023748812d24e0c1ac/websockets-13.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5f9fee94ebafbc3117c30be1844ed01a3b177bb6e39088bc6b2fa1dc15572084", size = 165051, upload-time = "2024-09-21T17:32:49.271Z" }, - { url = "https://files.pythonhosted.org/packages/30/93/c3891c20114eacb1af09dedfcc620c65c397f4fd80a7009cd12d9457f7f5/websockets-13.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:7c1e90228c2f5cdde263253fa5db63e6653f1c00e7ec64108065a0b9713fa1b3", size = 164489, upload-time = "2024-09-21T17:32:50.392Z" }, - { url = "https://files.pythonhosted.org/packages/28/09/af9e19885539759efa2e2cd29b8b3f9eecef7ecefea40d46612f12138b36/websockets-13.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6548f29b0e401eea2b967b2fdc1c7c7b5ebb3eeb470ed23a54cd45ef078a0db9", size = 164438, upload-time = "2024-09-21T17:32:52.223Z" }, - 
{ url = "https://files.pythonhosted.org/packages/b6/08/6f38b8e625b3d93de731f1d248cc1493327f16cb45b9645b3e791782cff0/websockets-13.1-cp311-cp311-win32.whl", hash = "sha256:c11d4d16e133f6df8916cc5b7e3e96ee4c44c936717d684a94f48f82edb7c92f", size = 158710, upload-time = "2024-09-21T17:32:53.244Z" }, - { url = "https://files.pythonhosted.org/packages/fb/39/ec8832ecb9bb04a8d318149005ed8cee0ba4e0205835da99e0aa497a091f/websockets-13.1-cp311-cp311-win_amd64.whl", hash = "sha256:d04f13a1d75cb2b8382bdc16ae6fa58c97337253826dfe136195b7f89f661557", size = 159137, upload-time = "2024-09-21T17:32:54.721Z" }, - { url = "https://files.pythonhosted.org/packages/df/46/c426282f543b3c0296cf964aa5a7bb17e984f58dde23460c3d39b3148fcf/websockets-13.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:9d75baf00138f80b48f1eac72ad1535aac0b6461265a0bcad391fc5aba875cfc", size = 157821, upload-time = "2024-09-21T17:32:56.442Z" }, - { url = "https://files.pythonhosted.org/packages/aa/85/22529867010baac258da7c45848f9415e6cf37fef00a43856627806ffd04/websockets-13.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9b6f347deb3dcfbfde1c20baa21c2ac0751afaa73e64e5b693bb2b848efeaa49", size = 155480, upload-time = "2024-09-21T17:32:57.698Z" }, - { url = "https://files.pythonhosted.org/packages/29/2c/bdb339bfbde0119a6e84af43ebf6275278698a2241c2719afc0d8b0bdbf2/websockets-13.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:de58647e3f9c42f13f90ac7e5f58900c80a39019848c5547bc691693098ae1bd", size = 155715, upload-time = "2024-09-21T17:32:59.429Z" }, - { url = "https://files.pythonhosted.org/packages/9f/d0/8612029ea04c5c22bf7af2fd3d63876c4eaeef9b97e86c11972a43aa0e6c/websockets-13.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1b54689e38d1279a51d11e3467dd2f3a50f5f2e879012ce8f2d6943f00e83f0", size = 165647, upload-time = "2024-09-21T17:33:00.495Z" }, - { url = "https://files.pythonhosted.org/packages/56/04/1681ed516fa19ca9083f26d3f3a302257e0911ba75009533ed60fbb7b8d1/websockets-13.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cf1781ef73c073e6b0f90af841aaf98501f975d306bbf6221683dd594ccc52b6", size = 164592, upload-time = "2024-09-21T17:33:02.223Z" }, - { url = "https://files.pythonhosted.org/packages/38/6f/a96417a49c0ed132bb6087e8e39a37db851c70974f5c724a4b2a70066996/websockets-13.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d23b88b9388ed85c6faf0e74d8dec4f4d3baf3ecf20a65a47b836d56260d4b9", size = 165012, upload-time = "2024-09-21T17:33:03.288Z" }, - { url = "https://files.pythonhosted.org/packages/40/8b/fccf294919a1b37d190e86042e1a907b8f66cff2b61e9befdbce03783e25/websockets-13.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3c78383585f47ccb0fcf186dcb8a43f5438bd7d8f47d69e0b56f71bf431a0a68", size = 165311, upload-time = "2024-09-21T17:33:04.728Z" }, - { url = "https://files.pythonhosted.org/packages/c1/61/f8615cf7ce5fe538476ab6b4defff52beb7262ff8a73d5ef386322d9761d/websockets-13.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:d6d300f8ec35c24025ceb9b9019ae9040c1ab2f01cddc2bcc0b518af31c75c14", size = 164692, upload-time = "2024-09-21T17:33:05.829Z" }, - { url = "https://files.pythonhosted.org/packages/5c/f1/a29dd6046d3a722d26f182b783a7997d25298873a14028c4760347974ea3/websockets-13.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a9dcaf8b0cc72a392760bb8755922c03e17a5a54e08cca58e8b74f6902b433cf", size = 164686, upload-time = 
"2024-09-21T17:33:06.823Z" }, - { url = "https://files.pythonhosted.org/packages/0f/99/ab1cdb282f7e595391226f03f9b498f52109d25a2ba03832e21614967dfa/websockets-13.1-cp312-cp312-win32.whl", hash = "sha256:2f85cf4f2a1ba8f602298a853cec8526c2ca42a9a4b947ec236eaedb8f2dc80c", size = 158712, upload-time = "2024-09-21T17:33:07.877Z" }, - { url = "https://files.pythonhosted.org/packages/46/93/e19160db48b5581feac8468330aa11b7292880a94a37d7030478596cc14e/websockets-13.1-cp312-cp312-win_amd64.whl", hash = "sha256:38377f8b0cdeee97c552d20cf1865695fcd56aba155ad1b4ca8779a5b6ef4ac3", size = 159145, upload-time = "2024-09-21T17:33:09.202Z" }, - { url = "https://files.pythonhosted.org/packages/51/20/2b99ca918e1cbd33c53db2cace5f0c0cd8296fc77558e1908799c712e1cd/websockets-13.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a9ab1e71d3d2e54a0aa646ab6d4eebfaa5f416fe78dfe4da2839525dc5d765c6", size = 157828, upload-time = "2024-09-21T17:33:10.987Z" }, - { url = "https://files.pythonhosted.org/packages/b8/47/0932a71d3d9c0e9483174f60713c84cee58d62839a143f21a2bcdbd2d205/websockets-13.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b9d7439d7fab4dce00570bb906875734df13d9faa4b48e261c440a5fec6d9708", size = 155487, upload-time = "2024-09-21T17:33:12.153Z" }, - { url = "https://files.pythonhosted.org/packages/a9/60/f1711eb59ac7a6c5e98e5637fef5302f45b6f76a2c9d64fd83bbb341377a/websockets-13.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:327b74e915cf13c5931334c61e1a41040e365d380f812513a255aa804b183418", size = 155721, upload-time = "2024-09-21T17:33:13.909Z" }, - { url = "https://files.pythonhosted.org/packages/6a/e6/ba9a8db7f9d9b0e5f829cf626ff32677f39824968317223605a6b419d445/websockets-13.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:325b1ccdbf5e5725fdcb1b0e9ad4d2545056479d0eee392c291c1bf76206435a", size = 165609, upload-time = "2024-09-21T17:33:14.967Z" }, - { url = "https://files.pythonhosted.org/packages/c1/22/4ec80f1b9c27a0aebd84ccd857252eda8418ab9681eb571b37ca4c5e1305/websockets-13.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:346bee67a65f189e0e33f520f253d5147ab76ae42493804319b5716e46dddf0f", size = 164556, upload-time = "2024-09-21T17:33:17.113Z" }, - { url = "https://files.pythonhosted.org/packages/27/ac/35f423cb6bb15600438db80755609d27eda36d4c0b3c9d745ea12766c45e/websockets-13.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:91a0fa841646320ec0d3accdff5b757b06e2e5c86ba32af2e0815c96c7a603c5", size = 164993, upload-time = "2024-09-21T17:33:18.168Z" }, - { url = "https://files.pythonhosted.org/packages/31/4e/98db4fd267f8be9e52e86b6ee4e9aa7c42b83452ea0ea0672f176224b977/websockets-13.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:18503d2c5f3943e93819238bf20df71982d193f73dcecd26c94514f417f6b135", size = 165360, upload-time = "2024-09-21T17:33:19.233Z" }, - { url = "https://files.pythonhosted.org/packages/3f/15/3f0de7cda70ffc94b7e7024544072bc5b26e2c1eb36545291abb755d8cdb/websockets-13.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:a9cd1af7e18e5221d2878378fbc287a14cd527fdd5939ed56a18df8a31136bb2", size = 164745, upload-time = "2024-09-21T17:33:20.361Z" }, - { url = "https://files.pythonhosted.org/packages/a1/6e/66b6b756aebbd680b934c8bdbb6dcb9ce45aad72cde5f8a7208dbb00dd36/websockets-13.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:70c5be9f416aa72aab7a2a76c90ae0a4fe2755c1816c153c1a2bcc3333ce4ce6", size = 164732, 
upload-time = "2024-09-21T17:33:23.103Z" }, - { url = "https://files.pythonhosted.org/packages/35/c6/12e3aab52c11aeb289e3dbbc05929e7a9d90d7a9173958477d3ef4f8ce2d/websockets-13.1-cp313-cp313-win32.whl", hash = "sha256:624459daabeb310d3815b276c1adef475b3e6804abaf2d9d2c061c319f7f187d", size = 158709, upload-time = "2024-09-21T17:33:24.196Z" }, - { url = "https://files.pythonhosted.org/packages/41/d8/63d6194aae711d7263df4498200c690a9c39fb437ede10f3e157a6343e0d/websockets-13.1-cp313-cp313-win_amd64.whl", hash = "sha256:c518e84bb59c2baae725accd355c8dc517b4a3ed8db88b4bc93c78dae2974bf2", size = 159144, upload-time = "2024-09-21T17:33:25.96Z" }, - { url = "https://files.pythonhosted.org/packages/83/69/59872420e5bce60db166d6fba39ee24c719d339fb0ae48cb2ce580129882/websockets-13.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c7934fd0e920e70468e676fe7f1b7261c1efa0d6c037c6722278ca0228ad9d0d", size = 157811, upload-time = "2024-09-21T17:33:27.379Z" }, - { url = "https://files.pythonhosted.org/packages/bb/f7/0610032e0d3981758fdd6ee7c68cc02ebf668a762c5178d3d91748228849/websockets-13.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:149e622dc48c10ccc3d2760e5f36753db9cacf3ad7bc7bbbfd7d9c819e286f23", size = 155471, upload-time = "2024-09-21T17:33:28.473Z" }, - { url = "https://files.pythonhosted.org/packages/55/2f/c43173a72ea395263a427a36d25bce2675f41c809424466a13c61a9a2d61/websockets-13.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a569eb1b05d72f9bce2ebd28a1ce2054311b66677fcd46cf36204ad23acead8c", size = 155713, upload-time = "2024-09-21T17:33:29.795Z" }, - { url = "https://files.pythonhosted.org/packages/92/7e/8fa930c6426a56c47910792717787640329e4a0e37cdfda20cf89da67126/websockets-13.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:95df24ca1e1bd93bbca51d94dd049a984609687cb2fb08a7f2c56ac84e9816ea", size = 164995, upload-time = "2024-09-21T17:33:30.802Z" }, - { url = "https://files.pythonhosted.org/packages/27/29/50ed4c68a3f606565a2db4b13948ae7b6f6c53aa9f8f258d92be6698d276/websockets-13.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d8dbb1bf0c0a4ae8b40bdc9be7f644e2f3fb4e8a9aca7145bfa510d4a374eeb7", size = 164057, upload-time = "2024-09-21T17:33:31.862Z" }, - { url = "https://files.pythonhosted.org/packages/3c/0e/60da63b1c53c47f389f79312b3356cb305600ffad1274d7ec473128d4e6b/websockets-13.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:035233b7531fb92a76beefcbf479504db8c72eb3bff41da55aecce3a0f729e54", size = 164340, upload-time = "2024-09-21T17:33:33.022Z" }, - { url = "https://files.pythonhosted.org/packages/20/ef/d87c5fc0aa7fafad1d584b6459ddfe062edf0d0dd64800a02e67e5de048b/websockets-13.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:e4450fc83a3df53dec45922b576e91e94f5578d06436871dce3a6be38e40f5db", size = 164222, upload-time = "2024-09-21T17:33:34.423Z" }, - { url = "https://files.pythonhosted.org/packages/f2/c4/7916e1f6b5252d3dcb9121b67d7fdbb2d9bf5067a6d8c88885ba27a9e69c/websockets-13.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:463e1c6ec853202dd3657f156123d6b4dad0c546ea2e2e38be2b3f7c5b8e7295", size = 163647, upload-time = "2024-09-21T17:33:35.841Z" }, - { url = "https://files.pythonhosted.org/packages/de/df/2ebebb807f10993c35c10cbd3628a7944b66bd5fb6632a561f8666f3a68e/websockets-13.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:6d6855bbe70119872c05107e38fbc7f96b1d8cb047d95c2c50869a46c65a8e96", size = 163590, upload-time 
= "2024-09-21T17:33:37.61Z" }, - { url = "https://files.pythonhosted.org/packages/b5/82/d48911f56bb993c11099a1ff1d4041d9d1481d50271100e8ee62bc28f365/websockets-13.1-cp38-cp38-win32.whl", hash = "sha256:204e5107f43095012b00f1451374693267adbb832d29966a01ecc4ce1db26faf", size = 158701, upload-time = "2024-09-21T17:33:38.695Z" }, - { url = "https://files.pythonhosted.org/packages/8b/b3/945aacb21fc89ad150403cbaa974c9e846f098f16d9f39a3dd6094f9beb1/websockets-13.1-cp38-cp38-win_amd64.whl", hash = "sha256:485307243237328c022bc908b90e4457d0daa8b5cf4b3723fd3c4a8012fce4c6", size = 159146, upload-time = "2024-09-21T17:33:39.855Z" }, - { url = "https://files.pythonhosted.org/packages/61/26/5f7a7fb03efedb4f90ed61968338bfe7c389863b0ceda239b94ae61c5ae4/websockets-13.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9b37c184f8b976f0c0a231a5f3d6efe10807d41ccbe4488df8c74174805eea7d", size = 157810, upload-time = "2024-09-21T17:33:40.94Z" }, - { url = "https://files.pythonhosted.org/packages/0e/d4/9b4814a07dffaa7a79d71b4944d10836f9adbd527a113f6675734ef3abed/websockets-13.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:163e7277e1a0bd9fb3c8842a71661ad19c6aa7bb3d6678dc7f89b17fbcc4aeb7", size = 155467, upload-time = "2024-09-21T17:33:42.075Z" }, - { url = "https://files.pythonhosted.org/packages/1a/1a/2abdc7ce3b56429ae39d6bfb48d8c791f5a26bbcb6f44aabcf71ffc3fda2/websockets-13.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4b889dbd1342820cc210ba44307cf75ae5f2f96226c0038094455a96e64fb07a", size = 155714, upload-time = "2024-09-21T17:33:43.128Z" }, - { url = "https://files.pythonhosted.org/packages/2a/98/189d7cf232753a719b2726ec55e7922522632248d5d830adf078e3f612be/websockets-13.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:586a356928692c1fed0eca68b4d1c2cbbd1ca2acf2ac7e7ebd3b9052582deefa", size = 164587, upload-time = "2024-09-21T17:33:44.27Z" }, - { url = "https://files.pythonhosted.org/packages/a5/2b/fb77cedf3f9f55ef8605238c801eef6b9a5269b01a396875a86896aea3a6/websockets-13.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7bd6abf1e070a6b72bfeb71049d6ad286852e285f146682bf30d0296f5fbadfa", size = 163588, upload-time = "2024-09-21T17:33:45.38Z" }, - { url = "https://files.pythonhosted.org/packages/a3/b7/070481b83d2d5ac0f19233d9f364294e224e6478b0762f07fa7f060e0619/websockets-13.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2aad13a200e5934f5a6767492fb07151e1de1d6079c003ab31e1823733ae79", size = 163894, upload-time = "2024-09-21T17:33:46.651Z" }, - { url = "https://files.pythonhosted.org/packages/eb/be/d6e1cff7d441cfe5eafaacc5935463e5f14c8b1c0d39cb8afde82709b55a/websockets-13.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:df01aea34b6e9e33572c35cd16bae5a47785e7d5c8cb2b54b2acdb9678315a17", size = 164315, upload-time = "2024-09-21T17:33:48.432Z" }, - { url = "https://files.pythonhosted.org/packages/8b/5e/ffa234473e46ab2d3f9fd9858163d5db3ecea1439e4cb52966d78906424b/websockets-13.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e54affdeb21026329fb0744ad187cf812f7d3c2aa702a5edb562b325191fcab6", size = 163714, upload-time = "2024-09-21T17:33:49.548Z" }, - { url = "https://files.pythonhosted.org/packages/cc/92/cea9eb9d381ca57065a5eb4ec2ce7a291bd96c85ce742915c3c9ffc1069f/websockets-13.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:9ef8aa8bdbac47f4968a5d66462a2a0935d044bf35c0e5a8af152d58516dbeb5", size = 163673, upload-time = 
"2024-09-21T17:33:51.056Z" }, - { url = "https://files.pythonhosted.org/packages/a4/f1/279104fff239bfd04c12b1e58afea227d72fd1acf431e3eed3f6ac2c96d2/websockets-13.1-cp39-cp39-win32.whl", hash = "sha256:deeb929efe52bed518f6eb2ddc00cc496366a14c726005726ad62c2dd9017a3c", size = 158702, upload-time = "2024-09-21T17:33:52.584Z" }, - { url = "https://files.pythonhosted.org/packages/25/0b/b87370ff141375c41f7dd67941728e4b3682ebb45882591516c792a2ebee/websockets-13.1-cp39-cp39-win_amd64.whl", hash = "sha256:7c65ffa900e7cc958cd088b9a9157a8141c991f8c53d11087e6fb7277a03f81d", size = 159146, upload-time = "2024-09-21T17:33:53.781Z" }, - { url = "https://files.pythonhosted.org/packages/2d/75/6da22cb3ad5b8c606963f9a5f9f88656256fecc29d420b4b2bf9e0c7d56f/websockets-13.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5dd6da9bec02735931fccec99d97c29f47cc61f644264eb995ad6c0c27667238", size = 155499, upload-time = "2024-09-21T17:33:54.917Z" }, - { url = "https://files.pythonhosted.org/packages/c0/ba/22833d58629088fcb2ccccedfae725ac0bbcd713319629e97125b52ac681/websockets-13.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:2510c09d8e8df777177ee3d40cd35450dc169a81e747455cc4197e63f7e7bfe5", size = 155737, upload-time = "2024-09-21T17:33:56.052Z" }, - { url = "https://files.pythonhosted.org/packages/95/54/61684fe22bdb831e9e1843d972adadf359cf04ab8613285282baea6a24bb/websockets-13.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1c3cf67185543730888b20682fb186fc8d0fa6f07ccc3ef4390831ab4b388d9", size = 157095, upload-time = "2024-09-21T17:33:57.21Z" }, - { url = "https://files.pythonhosted.org/packages/fc/f5/6652fb82440813822022a9301a30afde85e5ff3fb2aebb77f34aabe2b4e8/websockets-13.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bcc03c8b72267e97b49149e4863d57c2d77f13fae12066622dc78fe322490fe6", size = 156701, upload-time = "2024-09-21T17:33:59.061Z" }, - { url = "https://files.pythonhosted.org/packages/67/33/ae82a7b860fa8a08aba68818bdf7ff61f04598aa5ab96df4cd5a3e418ca4/websockets-13.1-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:004280a140f220c812e65f36944a9ca92d766b6cc4560be652a0a3883a79ed8a", size = 156654, upload-time = "2024-09-21T17:34:00.944Z" }, - { url = "https://files.pythonhosted.org/packages/63/0b/a1b528d36934f833e20f6da1032b995bf093d55cb416b9f2266f229fb237/websockets-13.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:e2620453c075abeb0daa949a292e19f56de518988e079c36478bacf9546ced23", size = 159192, upload-time = "2024-09-21T17:34:02.656Z" }, - { url = "https://files.pythonhosted.org/packages/5e/a1/5ae6d0ef2e61e2b77b3b4678949a634756544186620a728799acdf5c3482/websockets-13.1-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:9156c45750b37337f7b0b00e6248991a047be4aa44554c9886fe6bdd605aab3b", size = 155433, upload-time = "2024-09-21T17:34:03.88Z" }, - { url = "https://files.pythonhosted.org/packages/0d/2f/addd33f85600d210a445f817ff0d79d2b4d0eb6f3c95b9f35531ebf8f57c/websockets-13.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:80c421e07973a89fbdd93e6f2003c17d20b69010458d3a8e37fb47874bd67d51", size = 155733, upload-time = "2024-09-21T17:34:05.173Z" }, - { url = "https://files.pythonhosted.org/packages/74/0b/f8ec74ac3b14a983289a1b42dc2c518a0e2030b486d0549d4f51ca11e7c9/websockets-13.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:82d0ba76371769d6a4e56f7e83bb8e81846d17a6190971e38b5de108bde9b0d7", size = 157093, upload-time = "2024-09-21T17:34:06.398Z" }, - { url = "https://files.pythonhosted.org/packages/ad/4c/aa5cc2f718ee4d797411202f332c8281f04c42d15f55b02f7713320f7a03/websockets-13.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e9875a0143f07d74dc5e1ded1c4581f0d9f7ab86c78994e2ed9e95050073c94d", size = 156701, upload-time = "2024-09-21T17:34:07.582Z" }, - { url = "https://files.pythonhosted.org/packages/1f/4b/7c5b2d0d0f0f1a54f27c60107cf1f201bee1f88c5508f87408b470d09a9c/websockets-13.1-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a11e38ad8922c7961447f35c7b17bffa15de4d17c70abd07bfbe12d6faa3e027", size = 156648, upload-time = "2024-09-21T17:34:08.734Z" }, - { url = "https://files.pythonhosted.org/packages/f3/63/35f3fb073884a9fd1ce5413b2dcdf0d9198b03dac6274197111259cbde06/websockets-13.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:4059f790b6ae8768471cddb65d3c4fe4792b0ab48e154c9f0a04cefaabcd5978", size = 159188, upload-time = "2024-09-21T17:34:10.018Z" }, - { url = "https://files.pythonhosted.org/packages/59/fd/e4bf9a7159dba6a16c59ae9e670e3e8ad9dcb6791bc0599eb86de32d50a9/websockets-13.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:25c35bf84bf7c7369d247f0b8cfa157f989862c49104c5cf85cb5436a641d93e", size = 155499, upload-time = "2024-09-21T17:34:11.3Z" }, - { url = "https://files.pythonhosted.org/packages/74/42/d48ede93cfe0c343f3b552af08efc60778d234989227b16882eed1b8b189/websockets-13.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:83f91d8a9bb404b8c2c41a707ac7f7f75b9442a0a876df295de27251a856ad09", size = 155731, upload-time = "2024-09-21T17:34:13.151Z" }, - { url = "https://files.pythonhosted.org/packages/f6/f2/2ef6bff1c90a43b80622a17c0852b48c09d3954ab169266ad7b15e17cdcb/websockets-13.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7a43cfdcddd07f4ca2b1afb459824dd3c6d53a51410636a2c7fc97b9a8cf4842", size = 157093, upload-time = "2024-09-21T17:34:14.52Z" }, - { url = "https://files.pythonhosted.org/packages/d1/14/6f20bbaeeb350f155edf599aad949c554216f90e5d4ae7373d1f2e5931fb/websockets-13.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:48a2ef1381632a2f0cb4efeff34efa97901c9fbc118e01951ad7cfc10601a9bb", size = 156701, upload-time = "2024-09-21T17:34:15.692Z" }, - { url = "https://files.pythonhosted.org/packages/c7/86/38279dfefecd035e22b79c38722d4f87c4b6196f1556b7a631d0a3095ca7/websockets-13.1-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:459bf774c754c35dbb487360b12c5727adab887f1622b8aed5755880a21c4a20", size = 156649, upload-time = "2024-09-21T17:34:17.335Z" }, - { url = "https://files.pythonhosted.org/packages/f6/c5/12c6859a2eaa8c53f59a647617a27f1835a226cd7106c601067c53251d98/websockets-13.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:95858ca14a9f6fa8413d29e0a585b31b278388aa775b8a81fa24830123874678", size = 159187, upload-time = "2024-09-21T17:34:18.538Z" }, - { url = "https://files.pythonhosted.org/packages/56/27/96a5cd2626d11c8280656c6c71d8ab50fe006490ef9971ccd154e0c42cd2/websockets-13.1-py3-none-any.whl", hash = "sha256:a9a396a6ad26130cdae92ae10c36af09d9bfe6cafe69670fd3b6da9b07b4044f", size = 152134, upload-time = "2024-09-21T17:34:19.904Z" }, -] - -[[package]] -name = 
"websockets" -version = "15.0.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] -sdist = { url = "https://files.pythonhosted.org/packages/21/e6/26d09fab466b7ca9c7737474c52be4f76a40301b08362eb2dbc19dcc16c1/websockets-15.0.1.tar.gz", hash = "sha256:82544de02076bafba038ce055ee6412d68da13ab47f0c60cab827346de828dee", size = 177016, upload-time = "2025-03-05T20:03:41.606Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1e/da/6462a9f510c0c49837bbc9345aca92d767a56c1fb2939e1579df1e1cdcf7/websockets-15.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d63efaa0cd96cf0c5fe4d581521d9fa87744540d4bc999ae6e08595a1014b45b", size = 175423, upload-time = "2025-03-05T20:01:35.363Z" }, - { url = "https://files.pythonhosted.org/packages/1c/9f/9d11c1a4eb046a9e106483b9ff69bce7ac880443f00e5ce64261b47b07e7/websockets-15.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ac60e3b188ec7574cb761b08d50fcedf9d77f1530352db4eef1707fe9dee7205", size = 173080, upload-time = "2025-03-05T20:01:37.304Z" }, - { url = "https://files.pythonhosted.org/packages/d5/4f/b462242432d93ea45f297b6179c7333dd0402b855a912a04e7fc61c0d71f/websockets-15.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5756779642579d902eed757b21b0164cd6fe338506a8083eb58af5c372e39d9a", size = 173329, upload-time = "2025-03-05T20:01:39.668Z" }, - { url = "https://files.pythonhosted.org/packages/6e/0c/6afa1f4644d7ed50284ac59cc70ef8abd44ccf7d45850d989ea7310538d0/websockets-15.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fdfe3e2a29e4db3659dbd5bbf04560cea53dd9610273917799f1cde46aa725e", size = 182312, upload-time = "2025-03-05T20:01:41.815Z" }, - { url = "https://files.pythonhosted.org/packages/dd/d4/ffc8bd1350b229ca7a4db2a3e1c482cf87cea1baccd0ef3e72bc720caeec/websockets-15.0.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c2529b320eb9e35af0fa3016c187dffb84a3ecc572bcee7c3ce302bfeba52bf", size = 181319, upload-time = "2025-03-05T20:01:43.967Z" }, - { url = "https://files.pythonhosted.org/packages/97/3a/5323a6bb94917af13bbb34009fac01e55c51dfde354f63692bf2533ffbc2/websockets-15.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac1e5c9054fe23226fb11e05a6e630837f074174c4c2f0fe442996112a6de4fb", size = 181631, upload-time = "2025-03-05T20:01:46.104Z" }, - { url = "https://files.pythonhosted.org/packages/a6/cc/1aeb0f7cee59ef065724041bb7ed667b6ab1eeffe5141696cccec2687b66/websockets-15.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5df592cd503496351d6dc14f7cdad49f268d8e618f80dce0cd5a36b93c3fc08d", size = 182016, upload-time = "2025-03-05T20:01:47.603Z" }, - { url = "https://files.pythonhosted.org/packages/79/f9/c86f8f7af208e4161a7f7e02774e9d0a81c632ae76db2ff22549e1718a51/websockets-15.0.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0a34631031a8f05657e8e90903e656959234f3a04552259458aac0b0f9ae6fd9", size = 181426, upload-time = "2025-03-05T20:01:48.949Z" }, - { url = "https://files.pythonhosted.org/packages/c7/b9/828b0bc6753db905b91df6ae477c0b14a141090df64fb17f8a9d7e3516cf/websockets-15.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3d00075aa65772e7ce9e990cab3ff1de702aa09be3940d1dc88d5abf1ab8a09c", size = 181360, upload-time = "2025-03-05T20:01:50.938Z" }, - { url = 
"https://files.pythonhosted.org/packages/89/fb/250f5533ec468ba6327055b7d98b9df056fb1ce623b8b6aaafb30b55d02e/websockets-15.0.1-cp310-cp310-win32.whl", hash = "sha256:1234d4ef35db82f5446dca8e35a7da7964d02c127b095e172e54397fb6a6c256", size = 176388, upload-time = "2025-03-05T20:01:52.213Z" }, - { url = "https://files.pythonhosted.org/packages/1c/46/aca7082012768bb98e5608f01658ff3ac8437e563eca41cf068bd5849a5e/websockets-15.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:39c1fec2c11dc8d89bba6b2bf1556af381611a173ac2b511cf7231622058af41", size = 176830, upload-time = "2025-03-05T20:01:53.922Z" }, - { url = "https://files.pythonhosted.org/packages/9f/32/18fcd5919c293a398db67443acd33fde142f283853076049824fc58e6f75/websockets-15.0.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:823c248b690b2fd9303ba00c4f66cd5e2d8c3ba4aa968b2779be9532a4dad431", size = 175423, upload-time = "2025-03-05T20:01:56.276Z" }, - { url = "https://files.pythonhosted.org/packages/76/70/ba1ad96b07869275ef42e2ce21f07a5b0148936688c2baf7e4a1f60d5058/websockets-15.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678999709e68425ae2593acf2e3ebcbcf2e69885a5ee78f9eb80e6e371f1bf57", size = 173082, upload-time = "2025-03-05T20:01:57.563Z" }, - { url = "https://files.pythonhosted.org/packages/86/f2/10b55821dd40eb696ce4704a87d57774696f9451108cff0d2824c97e0f97/websockets-15.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d50fd1ee42388dcfb2b3676132c78116490976f1300da28eb629272d5d93e905", size = 173330, upload-time = "2025-03-05T20:01:59.063Z" }, - { url = "https://files.pythonhosted.org/packages/a5/90/1c37ae8b8a113d3daf1065222b6af61cc44102da95388ac0018fcb7d93d9/websockets-15.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d99e5546bf73dbad5bf3547174cd6cb8ba7273062a23808ffea025ecb1cf8562", size = 182878, upload-time = "2025-03-05T20:02:00.305Z" }, - { url = "https://files.pythonhosted.org/packages/8e/8d/96e8e288b2a41dffafb78e8904ea7367ee4f891dafc2ab8d87e2124cb3d3/websockets-15.0.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:66dd88c918e3287efc22409d426c8f729688d89a0c587c88971a0faa2c2f3792", size = 181883, upload-time = "2025-03-05T20:02:03.148Z" }, - { url = "https://files.pythonhosted.org/packages/93/1f/5d6dbf551766308f6f50f8baf8e9860be6182911e8106da7a7f73785f4c4/websockets-15.0.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8dd8327c795b3e3f219760fa603dcae1dcc148172290a8ab15158cf85a953413", size = 182252, upload-time = "2025-03-05T20:02:05.29Z" }, - { url = "https://files.pythonhosted.org/packages/d4/78/2d4fed9123e6620cbf1706c0de8a1632e1a28e7774d94346d7de1bba2ca3/websockets-15.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8fdc51055e6ff4adeb88d58a11042ec9a5eae317a0a53d12c062c8a8865909e8", size = 182521, upload-time = "2025-03-05T20:02:07.458Z" }, - { url = "https://files.pythonhosted.org/packages/e7/3b/66d4c1b444dd1a9823c4a81f50231b921bab54eee2f69e70319b4e21f1ca/websockets-15.0.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:693f0192126df6c2327cce3baa7c06f2a117575e32ab2308f7f8216c29d9e2e3", size = 181958, upload-time = "2025-03-05T20:02:09.842Z" }, - { url = "https://files.pythonhosted.org/packages/08/ff/e9eed2ee5fed6f76fdd6032ca5cd38c57ca9661430bb3d5fb2872dc8703c/websockets-15.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:54479983bd5fb469c38f2f5c7e3a24f9a4e70594cd68cd1fa6b9340dadaff7cf", size = 181918, upload-time = 
"2025-03-05T20:02:11.968Z" }, - { url = "https://files.pythonhosted.org/packages/d8/75/994634a49b7e12532be6a42103597b71098fd25900f7437d6055ed39930a/websockets-15.0.1-cp311-cp311-win32.whl", hash = "sha256:16b6c1b3e57799b9d38427dda63edcbe4926352c47cf88588c0be4ace18dac85", size = 176388, upload-time = "2025-03-05T20:02:13.32Z" }, - { url = "https://files.pythonhosted.org/packages/98/93/e36c73f78400a65f5e236cd376713c34182e6663f6889cd45a4a04d8f203/websockets-15.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:27ccee0071a0e75d22cb35849b1db43f2ecd3e161041ac1ee9d2352ddf72f065", size = 176828, upload-time = "2025-03-05T20:02:14.585Z" }, - { url = "https://files.pythonhosted.org/packages/51/6b/4545a0d843594f5d0771e86463606a3988b5a09ca5123136f8a76580dd63/websockets-15.0.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:3e90baa811a5d73f3ca0bcbf32064d663ed81318ab225ee4f427ad4e26e5aff3", size = 175437, upload-time = "2025-03-05T20:02:16.706Z" }, - { url = "https://files.pythonhosted.org/packages/f4/71/809a0f5f6a06522af902e0f2ea2757f71ead94610010cf570ab5c98e99ed/websockets-15.0.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:592f1a9fe869c778694f0aa806ba0374e97648ab57936f092fd9d87f8bc03665", size = 173096, upload-time = "2025-03-05T20:02:18.832Z" }, - { url = "https://files.pythonhosted.org/packages/3d/69/1a681dd6f02180916f116894181eab8b2e25b31e484c5d0eae637ec01f7c/websockets-15.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0701bc3cfcb9164d04a14b149fd74be7347a530ad3bbf15ab2c678a2cd3dd9a2", size = 173332, upload-time = "2025-03-05T20:02:20.187Z" }, - { url = "https://files.pythonhosted.org/packages/a6/02/0073b3952f5bce97eafbb35757f8d0d54812b6174ed8dd952aa08429bcc3/websockets-15.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8b56bdcdb4505c8078cb6c7157d9811a85790f2f2b3632c7d1462ab5783d215", size = 183152, upload-time = "2025-03-05T20:02:22.286Z" }, - { url = "https://files.pythonhosted.org/packages/74/45/c205c8480eafd114b428284840da0b1be9ffd0e4f87338dc95dc6ff961a1/websockets-15.0.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0af68c55afbd5f07986df82831c7bff04846928ea8d1fd7f30052638788bc9b5", size = 182096, upload-time = "2025-03-05T20:02:24.368Z" }, - { url = "https://files.pythonhosted.org/packages/14/8f/aa61f528fba38578ec553c145857a181384c72b98156f858ca5c8e82d9d3/websockets-15.0.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dee438fed052b52e4f98f76c5790513235efaa1ef7f3f2192c392cd7c91b65", size = 182523, upload-time = "2025-03-05T20:02:25.669Z" }, - { url = "https://files.pythonhosted.org/packages/ec/6d/0267396610add5bc0d0d3e77f546d4cd287200804fe02323797de77dbce9/websockets-15.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d5f6b181bb38171a8ad1d6aa58a67a6aa9d4b38d0f8c5f496b9e42561dfc62fe", size = 182790, upload-time = "2025-03-05T20:02:26.99Z" }, - { url = "https://files.pythonhosted.org/packages/02/05/c68c5adbf679cf610ae2f74a9b871ae84564462955d991178f95a1ddb7dd/websockets-15.0.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5d54b09eba2bada6011aea5375542a157637b91029687eb4fdb2dab11059c1b4", size = 182165, upload-time = "2025-03-05T20:02:30.291Z" }, - { url = "https://files.pythonhosted.org/packages/29/93/bb672df7b2f5faac89761cb5fa34f5cec45a4026c383a4b5761c6cea5c16/websockets-15.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3be571a8b5afed347da347bfcf27ba12b069d9d7f42cb8c7028b5e98bbb12597", 
size = 182160, upload-time = "2025-03-05T20:02:31.634Z" }, - { url = "https://files.pythonhosted.org/packages/ff/83/de1f7709376dc3ca9b7eeb4b9a07b4526b14876b6d372a4dc62312bebee0/websockets-15.0.1-cp312-cp312-win32.whl", hash = "sha256:c338ffa0520bdb12fbc527265235639fb76e7bc7faafbb93f6ba80d9c06578a9", size = 176395, upload-time = "2025-03-05T20:02:33.017Z" }, - { url = "https://files.pythonhosted.org/packages/7d/71/abf2ebc3bbfa40f391ce1428c7168fb20582d0ff57019b69ea20fa698043/websockets-15.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:fcd5cf9e305d7b8338754470cf69cf81f420459dbae8a3b40cee57417f4614a7", size = 176841, upload-time = "2025-03-05T20:02:34.498Z" }, - { url = "https://files.pythonhosted.org/packages/cb/9f/51f0cf64471a9d2b4d0fc6c534f323b664e7095640c34562f5182e5a7195/websockets-15.0.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ee443ef070bb3b6ed74514f5efaa37a252af57c90eb33b956d35c8e9c10a1931", size = 175440, upload-time = "2025-03-05T20:02:36.695Z" }, - { url = "https://files.pythonhosted.org/packages/8a/05/aa116ec9943c718905997412c5989f7ed671bc0188ee2ba89520e8765d7b/websockets-15.0.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5a939de6b7b4e18ca683218320fc67ea886038265fd1ed30173f5ce3f8e85675", size = 173098, upload-time = "2025-03-05T20:02:37.985Z" }, - { url = "https://files.pythonhosted.org/packages/ff/0b/33cef55ff24f2d92924923c99926dcce78e7bd922d649467f0eda8368923/websockets-15.0.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:746ee8dba912cd6fc889a8147168991d50ed70447bf18bcda7039f7d2e3d9151", size = 173329, upload-time = "2025-03-05T20:02:39.298Z" }, - { url = "https://files.pythonhosted.org/packages/31/1d/063b25dcc01faa8fada1469bdf769de3768b7044eac9d41f734fd7b6ad6d/websockets-15.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:595b6c3969023ecf9041b2936ac3827e4623bfa3ccf007575f04c5a6aa318c22", size = 183111, upload-time = "2025-03-05T20:02:40.595Z" }, - { url = "https://files.pythonhosted.org/packages/93/53/9a87ee494a51bf63e4ec9241c1ccc4f7c2f45fff85d5bde2ff74fcb68b9e/websockets-15.0.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c714d2fc58b5ca3e285461a4cc0c9a66bd0e24c5da9911e30158286c9b5be7f", size = 182054, upload-time = "2025-03-05T20:02:41.926Z" }, - { url = "https://files.pythonhosted.org/packages/ff/b2/83a6ddf56cdcbad4e3d841fcc55d6ba7d19aeb89c50f24dd7e859ec0805f/websockets-15.0.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f3c1e2ab208db911594ae5b4f79addeb3501604a165019dd221c0bdcabe4db8", size = 182496, upload-time = "2025-03-05T20:02:43.304Z" }, - { url = "https://files.pythonhosted.org/packages/98/41/e7038944ed0abf34c45aa4635ba28136f06052e08fc2168520bb8b25149f/websockets-15.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:229cf1d3ca6c1804400b0a9790dc66528e08a6a1feec0d5040e8b9eb14422375", size = 182829, upload-time = "2025-03-05T20:02:48.812Z" }, - { url = "https://files.pythonhosted.org/packages/e0/17/de15b6158680c7623c6ef0db361da965ab25d813ae54fcfeae2e5b9ef910/websockets-15.0.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:756c56e867a90fb00177d530dca4b097dd753cde348448a1012ed6c5131f8b7d", size = 182217, upload-time = "2025-03-05T20:02:50.14Z" }, - { url = "https://files.pythonhosted.org/packages/33/2b/1f168cb6041853eef0362fb9554c3824367c5560cbdaad89ac40f8c2edfc/websockets-15.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:558d023b3df0bffe50a04e710bc87742de35060580a293c2a984299ed83bc4e4", size = 182195, upload-time = "2025-03-05T20:02:51.561Z" }, - { url = "https://files.pythonhosted.org/packages/86/eb/20b6cdf273913d0ad05a6a14aed4b9a85591c18a987a3d47f20fa13dcc47/websockets-15.0.1-cp313-cp313-win32.whl", hash = "sha256:ba9e56e8ceeeedb2e080147ba85ffcd5cd0711b89576b83784d8605a7df455fa", size = 176393, upload-time = "2025-03-05T20:02:53.814Z" }, - { url = "https://files.pythonhosted.org/packages/1b/6c/c65773d6cab416a64d191d6ee8a8b1c68a09970ea6909d16965d26bfed1e/websockets-15.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:e09473f095a819042ecb2ab9465aee615bd9c2028e4ef7d933600a8401c79561", size = 176837, upload-time = "2025-03-05T20:02:55.237Z" }, - { url = "https://files.pythonhosted.org/packages/36/db/3fff0bcbe339a6fa6a3b9e3fbc2bfb321ec2f4cd233692272c5a8d6cf801/websockets-15.0.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5f4c04ead5aed67c8a1a20491d54cdfba5884507a48dd798ecaf13c74c4489f5", size = 175424, upload-time = "2025-03-05T20:02:56.505Z" }, - { url = "https://files.pythonhosted.org/packages/46/e6/519054c2f477def4165b0ec060ad664ed174e140b0d1cbb9fafa4a54f6db/websockets-15.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:abdc0c6c8c648b4805c5eacd131910d2a7f6455dfd3becab248ef108e89ab16a", size = 173077, upload-time = "2025-03-05T20:02:58.37Z" }, - { url = "https://files.pythonhosted.org/packages/1a/21/c0712e382df64c93a0d16449ecbf87b647163485ca1cc3f6cbadb36d2b03/websockets-15.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a625e06551975f4b7ea7102bc43895b90742746797e2e14b70ed61c43a90f09b", size = 173324, upload-time = "2025-03-05T20:02:59.773Z" }, - { url = "https://files.pythonhosted.org/packages/1c/cb/51ba82e59b3a664df54beed8ad95517c1b4dc1a913730e7a7db778f21291/websockets-15.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d591f8de75824cbb7acad4e05d2d710484f15f29d4a915092675ad3456f11770", size = 182094, upload-time = "2025-03-05T20:03:01.827Z" }, - { url = "https://files.pythonhosted.org/packages/fb/0f/bf3788c03fec679bcdaef787518dbe60d12fe5615a544a6d4cf82f045193/websockets-15.0.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:47819cea040f31d670cc8d324bb6435c6f133b8c7a19ec3d61634e62f8d8f9eb", size = 181094, upload-time = "2025-03-05T20:03:03.123Z" }, - { url = "https://files.pythonhosted.org/packages/5e/da/9fb8c21edbc719b66763a571afbaf206cb6d3736d28255a46fc2fe20f902/websockets-15.0.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac017dd64572e5c3bd01939121e4d16cf30e5d7e110a119399cf3133b63ad054", size = 181397, upload-time = "2025-03-05T20:03:04.443Z" }, - { url = "https://files.pythonhosted.org/packages/2e/65/65f379525a2719e91d9d90c38fe8b8bc62bd3c702ac651b7278609b696c4/websockets-15.0.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4a9fac8e469d04ce6c25bb2610dc535235bd4aa14996b4e6dbebf5e007eba5ee", size = 181794, upload-time = "2025-03-05T20:03:06.708Z" }, - { url = "https://files.pythonhosted.org/packages/d9/26/31ac2d08f8e9304d81a1a7ed2851c0300f636019a57cbaa91342015c72cc/websockets-15.0.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:363c6f671b761efcb30608d24925a382497c12c506b51661883c3e22337265ed", size = 181194, upload-time = "2025-03-05T20:03:08.844Z" }, - { url = 
"https://files.pythonhosted.org/packages/98/72/1090de20d6c91994cd4b357c3f75a4f25ee231b63e03adea89671cc12a3f/websockets-15.0.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:2034693ad3097d5355bfdacfffcbd3ef5694f9718ab7f29c29689a9eae841880", size = 181164, upload-time = "2025-03-05T20:03:10.242Z" }, - { url = "https://files.pythonhosted.org/packages/2d/37/098f2e1c103ae8ed79b0e77f08d83b0ec0b241cf4b7f2f10edd0126472e1/websockets-15.0.1-cp39-cp39-win32.whl", hash = "sha256:3b1ac0d3e594bf121308112697cf4b32be538fb1444468fb0a6ae4feebc83411", size = 176381, upload-time = "2025-03-05T20:03:12.77Z" }, - { url = "https://files.pythonhosted.org/packages/75/8b/a32978a3ab42cebb2ebdd5b05df0696a09f4d436ce69def11893afa301f0/websockets-15.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:b7643a03db5c95c799b89b31c036d5f27eeb4d259c798e878d6937d71832b1e4", size = 176841, upload-time = "2025-03-05T20:03:14.367Z" }, - { url = "https://files.pythonhosted.org/packages/02/9e/d40f779fa16f74d3468357197af8d6ad07e7c5a27ea1ca74ceb38986f77a/websockets-15.0.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0c9e74d766f2818bb95f84c25be4dea09841ac0f734d1966f415e4edfc4ef1c3", size = 173109, upload-time = "2025-03-05T20:03:17.769Z" }, - { url = "https://files.pythonhosted.org/packages/bc/cd/5b887b8585a593073fd92f7c23ecd3985cd2c3175025a91b0d69b0551372/websockets-15.0.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1009ee0c7739c08a0cd59de430d6de452a55e42d6b522de7aa15e6f67db0b8e1", size = 173343, upload-time = "2025-03-05T20:03:19.094Z" }, - { url = "https://files.pythonhosted.org/packages/fe/ae/d34f7556890341e900a95acf4886833646306269f899d58ad62f588bf410/websockets-15.0.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76d1f20b1c7a2fa82367e04982e708723ba0e7b8d43aa643d3dcd404d74f1475", size = 174599, upload-time = "2025-03-05T20:03:21.1Z" }, - { url = "https://files.pythonhosted.org/packages/71/e6/5fd43993a87db364ec60fc1d608273a1a465c0caba69176dd160e197ce42/websockets-15.0.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f29d80eb9a9263b8d109135351caf568cc3f80b9928bccde535c235de55c22d9", size = 174207, upload-time = "2025-03-05T20:03:23.221Z" }, - { url = "https://files.pythonhosted.org/packages/2b/fb/c492d6daa5ec067c2988ac80c61359ace5c4c674c532985ac5a123436cec/websockets-15.0.1-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b359ed09954d7c18bbc1680f380c7301f92c60bf924171629c5db97febb12f04", size = 174155, upload-time = "2025-03-05T20:03:25.321Z" }, - { url = "https://files.pythonhosted.org/packages/68/a1/dcb68430b1d00b698ae7a7e0194433bce4f07ded185f0ee5fb21e2a2e91e/websockets-15.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:cad21560da69f4ce7658ca2cb83138fb4cf695a2ba3e475e0559e05991aa8122", size = 176884, upload-time = "2025-03-05T20:03:27.934Z" }, - { url = "https://files.pythonhosted.org/packages/b7/48/4b67623bac4d79beb3a6bb27b803ba75c1bdedc06bd827e465803690a4b2/websockets-15.0.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7f493881579c90fc262d9cdbaa05a6b54b3811c2f300766748db79f098db9940", size = 173106, upload-time = "2025-03-05T20:03:29.404Z" }, - { url = "https://files.pythonhosted.org/packages/ed/f0/adb07514a49fe5728192764e04295be78859e4a537ab8fcc518a3dbb3281/websockets-15.0.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:47b099e1f4fbc95b701b6e85768e1fcdaf1630f3cbe4765fa216596f12310e2e", size = 
173339, upload-time = "2025-03-05T20:03:30.755Z" }, - { url = "https://files.pythonhosted.org/packages/87/28/bd23c6344b18fb43df40d0700f6d3fffcd7cef14a6995b4f976978b52e62/websockets-15.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:67f2b6de947f8c757db2db9c71527933ad0019737ec374a8a6be9a956786aaf9", size = 174597, upload-time = "2025-03-05T20:03:32.247Z" }, - { url = "https://files.pythonhosted.org/packages/6d/79/ca288495863d0f23a60f546f0905ae8f3ed467ad87f8b6aceb65f4c013e4/websockets-15.0.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d08eb4c2b7d6c41da6ca0600c077e93f5adcfd979cd777d747e9ee624556da4b", size = 174205, upload-time = "2025-03-05T20:03:33.731Z" }, - { url = "https://files.pythonhosted.org/packages/04/e4/120ff3180b0872b1fe6637f6f995bcb009fb5c87d597c1fc21456f50c848/websockets-15.0.1-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b826973a4a2ae47ba357e4e82fa44a463b8f168e1ca775ac64521442b19e87f", size = 174150, upload-time = "2025-03-05T20:03:35.757Z" }, - { url = "https://files.pythonhosted.org/packages/cb/c3/30e2f9c539b8da8b1d76f64012f3b19253271a63413b2d3adb94b143407f/websockets-15.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:21c1fa28a6a7e3cbdc171c694398b6df4744613ce9b36b1a498e816787e28123", size = 176877, upload-time = "2025-03-05T20:03:37.199Z" }, - { url = "https://files.pythonhosted.org/packages/fa/a8/5b41e0da817d64113292ab1f8247140aac61cbf6cfd085d6a0fa77f4984f/websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f", size = 169743, upload-time = "2025-03-05T20:03:39.41Z" }, -] - -[[package]] -name = "win32-setctime" -version = "1.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b3/8f/705086c9d734d3b663af0e9bb3d4de6578d08f46b1b101c2442fd9aecaa2/win32_setctime-1.2.0.tar.gz", hash = "sha256:ae1fdf948f5640aae05c511ade119313fb6a30d7eabe25fef9764dca5873c4c0", size = 4867, upload-time = "2024-12-07T15:28:28.314Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083, upload-time = "2024-12-07T15:28:26.465Z" }, -] diff --git a/evals/config.yml b/evals/config.yml index 5b42f16..b1cab25 100644 --- a/evals/config.yml +++ b/evals/config.yml @@ -7,20 +7,20 @@ api_endpoint: "http://localhost:8080" # Model configurations for running evaluations # These models are sent to the agent for processing requests -main_model: - provider: "openai" - model_name: "gpt-5-mini" - api_key: "${OPENAI_API_KEY}" +# main_model: +# provider: "openai" +# model_name: "gpt-5-mini" +# api_key: "${OPENAI_API_KEY}" -mini_model: - provider: "openai" - model_name: "gpt-5-nano" - api_key: "${OPENAI_API_KEY}" +# mini_model: +# provider: "openai" +# model_name: "gpt-5-nano" +# api_key: "${OPENAI_API_KEY}" -nano_model: - provider: "openai" - model_name: "gpt-5-nano" - api_key: "${OPENAI_API_KEY}" +# nano_model: +# provider: "openai" +# model_name: "gpt-5-nano" +# api_key: "${OPENAI_API_KEY}" # main_model: # provider: "openrouter" @@ -38,6 +38,22 @@ nano_model: # model_name: "openai/gpt-oss-20b:free" # api_key: "${OPENROUTER_API_KEY}" +main_model: + provider: "openrouter" + model_name: 
"google/gemini-2.0-flash-exp:free" + # model_name: "tngtech/deepseek-r1t2-chimera:free" + api_key: "${OPENROUTER_API_KEY}" + +mini_model: + provider: "openrouter" + model_name: "google/gemini-2.0-flash-exp:free" + api_key: "${OPENROUTER_API_KEY}" + +nano_model: + provider: "openrouter" + model_name: "google/gemini-2.0-flash-exp:free" + api_key: "${OPENROUTER_API_KEY}" + # Model configuration for judging evaluation responses # This model is used locally to assess the quality of agent responses From 05ea2a6d3e7aed2bfd6e0078892e48f81b6e76f0 Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Tue, 21 Oct 2025 09:33:08 -0500 Subject: [PATCH 15/24] Refactor config.yml to use example files for different providers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created separate example config files for different model providers: - config.example.openai.yml: OpenAI models (now default) - config.example.openrouter-gemini.yml: OpenRouter with Gemini - config.example.openrouter-gpt.yml: OpenRouter with GPT Main config.yml now defaults to OpenAI provider for reliability. ๐Ÿค– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- evals/config.example.openai.yml | 60 ++++++++++++++++++++++ evals/config.example.openrouter-gemini.yml | 60 ++++++++++++++++++++++ evals/config.example.openrouter-gpt.yml | 60 ++++++++++++++++++++++ evals/config.yml | 51 ++++-------------- 4 files changed, 190 insertions(+), 41 deletions(-) create mode 100644 evals/config.example.openai.yml create mode 100644 evals/config.example.openrouter-gemini.yml create mode 100644 evals/config.example.openrouter-gpt.yml diff --git a/evals/config.example.openai.yml b/evals/config.example.openai.yml new file mode 100644 index 0000000..d2b0733 --- /dev/null +++ b/evals/config.example.openai.yml @@ -0,0 +1,60 @@ +# Evaluation Framework Configuration +# This configuration is shared across all evaluation runner scripts +# Example configuration for OpenAI models + +# API endpoint for the evaluation server +api_endpoint: "http://localhost:8080" + +# Model configurations for running evaluations +# These models are sent to the agent for processing requests + +main_model: + provider: "openai" + model_name: "gpt-5-mini" + api_key: "${OPENAI_API_KEY}" + +mini_model: + provider: "openai" + model_name: "gpt-5-nano" + api_key: "${OPENAI_API_KEY}" + +nano_model: + provider: "openai" + model_name: "gpt-5-nano" + api_key: "${OPENAI_API_KEY}" + +# Model configuration for judging evaluation responses +# This model is used locally to assess the quality of agent responses + +judge_model: + provider: "openai" + model_name: "gpt-5" + api_key: "${OPENAI_API_KEY}" + # temperature: 0.1 # GPT-5 doesn't support custom temperature + +# Execution settings + +execution: + # Default number of evaluations to run per script execution + default_limit: 20 + + # Timeout for API requests (seconds) - set to max for slow custom API + timeout: 3600 + + # Number of concurrent evaluation requests + concurrent_requests: 1 + + # Delay between requests (seconds) + request_delay: 1 + +# Reporting settings + +reporting: + # Directory for storing evaluation reports + reports_dir: "reports" + + # Report format + format: "csv" + + # Include detailed judge reasoning in reports + include_reasoning: true diff --git a/evals/config.example.openrouter-gemini.yml b/evals/config.example.openrouter-gemini.yml new file mode 100644 index 0000000..dd894ed --- /dev/null +++ b/evals/config.example.openrouter-gemini.yml @@ -0,0 
+1,60 @@ +# Evaluation Framework Configuration +# This configuration is shared across all evaluation runner scripts +# Example configuration for OpenRouter with Google Gemini models + +# API endpoint for the evaluation server +api_endpoint: "http://localhost:8080" + +# Model configurations for running evaluations +# These models are sent to the agent for processing requests + +main_model: + provider: "openrouter" + model_name: "google/gemini-2.0-flash-exp:free" + api_key: "${OPENROUTER_API_KEY}" + +mini_model: + provider: "openrouter" + model_name: "google/gemini-2.0-flash-exp:free" + api_key: "${OPENROUTER_API_KEY}" + +nano_model: + provider: "openrouter" + model_name: "google/gemini-2.0-flash-exp:free" + api_key: "${OPENROUTER_API_KEY}" + +# Model configuration for judging evaluation responses +# This model is used locally to assess the quality of agent responses + +judge_model: + provider: "openai" + model_name: "gpt-5" + api_key: "${OPENAI_API_KEY}" + # temperature: 0.1 # GPT-5 doesn't support custom temperature + +# Execution settings + +execution: + # Default number of evaluations to run per script execution + default_limit: 20 + + # Timeout for API requests (seconds) - set to max for slow custom API + timeout: 3600 + + # Number of concurrent evaluation requests + concurrent_requests: 1 + + # Delay between requests (seconds) + request_delay: 1 + +# Reporting settings + +reporting: + # Directory for storing evaluation reports + reports_dir: "reports" + + # Report format + format: "csv" + + # Include detailed judge reasoning in reports + include_reasoning: true diff --git a/evals/config.example.openrouter-gpt.yml b/evals/config.example.openrouter-gpt.yml new file mode 100644 index 0000000..286db1c --- /dev/null +++ b/evals/config.example.openrouter-gpt.yml @@ -0,0 +1,60 @@ +# Evaluation Framework Configuration +# This configuration is shared across all evaluation runner scripts +# Example configuration for OpenRouter with GPT models + +# API endpoint for the evaluation server +api_endpoint: "http://localhost:8080" + +# Model configurations for running evaluations +# These models are sent to the agent for processing requests + +main_model: + provider: "openrouter" + model_name: "openai/gpt-oss-20b:free" + api_key: "${OPENROUTER_API_KEY}" + +mini_model: + provider: "openrouter" + model_name: "openai/gpt-oss-20b:free" + api_key: "${OPENROUTER_API_KEY}" + +nano_model: + provider: "openrouter" + model_name: "openai/gpt-oss-20b:free" + api_key: "${OPENROUTER_API_KEY}" + +# Model configuration for judging evaluation responses +# This model is used locally to assess the quality of agent responses + +judge_model: + provider: "openai" + model_name: "gpt-5" + api_key: "${OPENAI_API_KEY}" + # temperature: 0.1 # GPT-5 doesn't support custom temperature + +# Execution settings + +execution: + # Default number of evaluations to run per script execution + default_limit: 20 + + # Timeout for API requests (seconds) - set to max for slow custom API + timeout: 3600 + + # Number of concurrent evaluation requests + concurrent_requests: 1 + + # Delay between requests (seconds) + request_delay: 1 + +# Reporting settings + +reporting: + # Directory for storing evaluation reports + reports_dir: "reports" + + # Report format + format: "csv" + + # Include detailed judge reasoning in reports + include_reasoning: true diff --git a/evals/config.yml b/evals/config.yml index b1cab25..c8582e6 100644 --- a/evals/config.yml +++ b/evals/config.yml @@ -6,53 +6,22 @@ api_endpoint: "http://localhost:8080" # Model 
configurations for running evaluations # These models are sent to the agent for processing requests - -# main_model: -# provider: "openai" -# model_name: "gpt-5-mini" -# api_key: "${OPENAI_API_KEY}" - -# mini_model: -# provider: "openai" -# model_name: "gpt-5-nano" -# api_key: "${OPENAI_API_KEY}" - -# nano_model: -# provider: "openai" -# model_name: "gpt-5-nano" -# api_key: "${OPENAI_API_KEY}" - -# main_model: -# provider: "openrouter" -# model_name: "openai/gpt-oss-20b:free" -# # model_name: "tngtech/deepseek-r1t2-chimera:free" -# api_key: "${OPENROUTER_API_KEY}" - -# mini_model: -# provider: "openrouter" -# model_name: "openai/gpt-oss-20b:free" -# api_key: "${OPENROUTER_API_KEY}" - -# nano_model: -# provider: "openrouter" -# model_name: "openai/gpt-oss-20b:free" -# api_key: "${OPENROUTER_API_KEY}" +# See config.example.*.yml files for other provider/model configurations main_model: - provider: "openrouter" - model_name: "google/gemini-2.0-flash-exp:free" - # model_name: "tngtech/deepseek-r1t2-chimera:free" - api_key: "${OPENROUTER_API_KEY}" + provider: "openai" + model_name: "gpt-5-mini" + api_key: "${OPENAI_API_KEY}" mini_model: - provider: "openrouter" - model_name: "google/gemini-2.0-flash-exp:free" - api_key: "${OPENROUTER_API_KEY}" + provider: "openai" + model_name: "gpt-5-nano" + api_key: "${OPENAI_API_KEY}" nano_model: - provider: "openrouter" - model_name: "google/gemini-2.0-flash-exp:free" - api_key: "${OPENROUTER_API_KEY}" + provider: "openai" + model_name: "gpt-5-nano" + api_key: "${OPENAI_API_KEY}" # Model configuration for judging evaluation responses # This model is used locally to assess the quality of agent responses From 035855f7d28edb6af8df7a43f73356e090b5d198 Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Tue, 21 Oct 2025 11:24:37 -0500 Subject: [PATCH 16/24] Cleanup and refactoring; fixed docker-compose setup --- .gitignore | 9 +- CLAUDE.md | 458 ++++++++++++++++++ Dockerfile.devtools | 2 +- Dockerfile.local | 21 +- Makefile | 29 +- Readme.md | 419 ++++++++++++++-- docker-compose.yml | 15 +- eval-server/nodejs/package-lock.json | 435 +---------------- eval-server/nodejs/package.json | 13 +- .../nodejs/src/lib/EvaluationLoader.js | 448 +++++++++++++++++ eval-server/nodejs/src/lib/EvaluationStack.js | 85 ++++ eval-server/nodejs/src/lib/judges/Judge.js | 80 +++ eval-server/nodejs/src/lib/judges/LLMJudge.js | 344 +++++++++++++ scripts/cleanup-chromium-locks.sh | 18 + scripts/init-container.sh | 27 ++ scripts/wrapper-with-cleanup.sh | 22 + supervisor/services/eval-server.conf | 2 +- 17 files changed, 1932 insertions(+), 495 deletions(-) create mode 100644 CLAUDE.md create mode 100644 eval-server/nodejs/src/lib/EvaluationLoader.js create mode 100644 eval-server/nodejs/src/lib/EvaluationStack.js create mode 100644 eval-server/nodejs/src/lib/judges/Judge.js create mode 100644 eval-server/nodejs/src/lib/judges/LLMJudge.js create mode 100644 scripts/cleanup-chromium-locks.sh create mode 100644 scripts/init-container.sh create mode 100644 scripts/wrapper-with-cleanup.sh diff --git a/.gitignore b/.gitignore index 8371b9d..1f8e6ff 100644 --- a/.gitignore +++ b/.gitignore @@ -60,4 +60,11 @@ chromium-data/ browser-operator-core/devtools-frontend/ browser-operator-core/depot_tools/ browser-operator-core/.devtools-built -browser-operator-core/.devtools-base-built \ No newline at end of file +browser-operator-core/.devtools-base-built + +# Eval server runtime files +eval-server/nodejs/clients/ +eval-server/nodejs/logs/ + +# Evaluation screenshots +evals/screenshots/ \ No newline at 
end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..01c2ef1 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,458 @@ +# Claude Code - Technical Documentation + +## Project Overview + +This project extends the kernel-images Chromium browser environment with: +- **Browser Operator DevTools**: Custom DevTools frontend with AI chat panel +- **Eval Server**: HTTP/WebSocket API for browser automation and evaluation +- **Local Development**: Docker Compose setup for rapid iteration +- **Cloud Deployment**: Google Cloud Run deployment (legacy) + +## Architecture + +### Core Components + +1. **Chromium Browser** (headful, GUI via WebRTC) + - CDP (Chrome DevTools Protocol) on port 9223 + - Custom DevTools frontend at http://localhost:8001/ + - Auto-opens DevTools for all tabs + +2. **Eval Server** (Node.js) + - HTTP API on port 8080 (`/v1/responses`, `/page/content`, `/page/screenshot`) + - WebSocket API on port 8082 (JSON-RPC 2.0) + - Manages browser tabs and automation + +3. **WebRTC Streaming** (Neko) + - Live browser view on port 8000 + - WebRTC control interface on port 8081 + +4. **DevTools Frontend** (nginx) + - Browser Operator custom DevTools on port 8001 + - Includes AI chat panel and automation features + +5. **Recording API** (kernel-images) + - Screen recording on port 444 + +### Service Dependencies + +``` +supervisord +โ”œโ”€โ”€ xorg (X11 display :1) +โ”œโ”€โ”€ mutter (window manager) +โ”œโ”€โ”€ dbus (message bus) +โ”œโ”€โ”€ chromium (browser + CDP port 9223) +โ”œโ”€โ”€ neko (WebRTC on ports 8000, 8081) +โ”œโ”€โ”€ kernel-images-api (recording on port 444) +โ”œโ”€โ”€ eval-server (HTTP 8080, WS 8082) +โ””โ”€โ”€ nginx-devtools (DevTools UI on port 8001) +``` + +## Directory Structure + +``` +web-agent/ +โ”œโ”€โ”€ browser-operator-core/ # Submodule: DevTools frontend source +โ”œโ”€โ”€ kernel-images/ # Submodule: Base browser environment +โ”œโ”€โ”€ eval-server/ +โ”‚ โ””โ”€โ”€ nodejs/ # Eval server source (use this, NOT submodule) +โ”‚ โ”œโ”€โ”€ src/ +โ”‚ โ”‚ โ”œโ”€โ”€ api-server.js # HTTP REST API +โ”‚ โ”‚ โ”œโ”€โ”€ evaluation-server.js # WebSocket + CDP +โ”‚ โ”‚ โ””โ”€โ”€ lib/ # EvaluationLoader, EvaluationStack, judges +โ”‚ โ”œโ”€โ”€ start.js # Server entrypoint +โ”‚ โ””โ”€โ”€ package.json +โ”œโ”€โ”€ evals/ +โ”‚ โ”œโ”€โ”€ run.py # Python evaluation runner +โ”‚ โ”œโ”€โ”€ lib/judge.py # LLMJudge, VisionJudge, SimpleJudge +โ”‚ โ””โ”€โ”€ data/ # Evaluation YAML files +โ”œโ”€โ”€ scripts/ +โ”‚ โ”œโ”€โ”€ init-container.sh # Auto-cleanup of lock files +โ”‚ โ””โ”€โ”€ cleanup-chromium-locks.sh +โ”œโ”€โ”€ supervisor/services/ # Service configs (override defaults) +โ”‚ โ”œโ”€โ”€ chromium.conf # Auto-open DevTools +โ”‚ โ”œโ”€โ”€ eval-server.conf # Eval server with CDP_PORT=9223 +โ”‚ โ”œโ”€โ”€ neko.conf +โ”‚ โ””โ”€โ”€ nginx-devtools.conf +โ”œโ”€โ”€ Dockerfile.local # Main Docker build +โ”œโ”€โ”€ Dockerfile.devtools # DevTools frontend build +โ”œโ”€โ”€ docker-compose.yml # Local deployment config +โ”œโ”€โ”€ run-local.sh # Interactive mode startup +โ”œโ”€โ”€ Makefile # Build/deployment commands +โ””โ”€โ”€ README.md +``` + +## Key Files and What They Do + +### Dockerfile.local +Multi-stage build that: +1. Copies pre-built DevTools from `browser-operator-devtools:latest` +2. Builds eval-server with `npm install` +3. Builds kernel-images Go API +4. Builds WebRTC client +5. Compiles custom Xorg drivers +6. Assembles final Ubuntu 22.04 image with all components +7. 
Adds init script for automatic lock cleanup + +**Critical sections:** +- Line 284: Copies `scripts/init-container.sh` for lock cleanup +- Line 288-294: Creates `/entrypoint.sh` wrapper +- Line 299: Sets entrypoint to run init before main wrapper + +### docker-compose.yml +Configures container with: +- Port mappings for all services (8000-8082, 9222, 444) +- Volume mounts: recordings, chromium-data, eval-server code +- tmpfs: `/dev/shm` and `/tmp` (prevents lock file persistence) +- Environment: `CHROMIUM_FLAGS` with custom DevTools frontend + +**Recent fixes:** +- Added missing ports 8000, 8001, 8081, 8082 +- Added `/tmp` tmpfs mount to prevent X11 lock persistence +- Added `--custom-devtools-frontend=http://localhost:8001/` + +### scripts/init-container.sh +Runs on every container start to clean: +- Chromium lock files (`SingletonLock`, `SingletonSocket`, `SingletonCookie`) +- X11 lock files (`/tmp/.X*-lock`) + +This prevents "profile in use" and "display already active" errors. + +### eval-server/nodejs/src/api-server.js +HTTP REST API with endpoints: +- `POST /v1/responses` - Execute browser automation tasks +- `POST /page/content` - Get page HTML/text content +- `POST /page/screenshot` - Capture screenshots +- `GET /status` - Health check + +### supervisor/services/eval-server.conf +**Critical environment variables:** +```ini +environment=NODE_ENV="production",PORT="8082",API_PORT="8080",HOST="0.0.0.0",CDP_PORT="9223" +``` + +Note: CDP_PORT must be 9223 (not 9222) to match Chromium configuration. + +### Makefile +Key targets: +- `make init` - Initialize git submodules +- `make build-devtools` - Build DevTools base (slow, ~30 min, cached) +- `make rebuild-devtools` - Fast rebuild with local changes +- `make build` - Build main image (auto-builds DevTools if missing) +- `make compose-up` - Start with docker-compose (background) +- `make run` - Start with run-local.sh (interactive) +- `make test` - Verify API and run simple eval +- `make stop` - Stop all containers +- `make clean` - Clean up everything + +### run-local.sh +Interactive Docker run script that: +- Sources kernel-images common build variables +- Creates local recordings directory +- Configures Chromium data persistence (customizable with `CHROMIUM_DATA_HOST`) +- **Cleans lock files from host before starting** (lines 84-89) +- Builds docker run arguments with all port mappings +- Supports `URLS` environment variable to open URLs on startup +- Uses custom DevTools frontend flag +- Runs container with `docker run -d` (detached but logs visible via docker logs) + +**Key difference from docker-compose:** +- Lock cleanup happens on HOST before container starts +- Eval server code is NOT volume-mounted (baked into image) +- More flexible for custom configurations via environment variables +- Better for seeing startup issues and debugging + +## Common Issues and Solutions + +### 1. Chromium Profile Lock Errors +**Symptom:** "The profile appears to be in use by another Chromium process" + +**Solution:** Now handled automatically by `scripts/init-container.sh` +- Runs on every container start +- Cleans lock files before services start +- No manual intervention needed + +### 2. X11 Display Lock Errors +**Symptom:** "Server is already active for display 1" + +**Solution:** Fixed by adding `/tmp` to tmpfs in docker-compose.yml +- Line 54: `- /tmp` in tmpfs section +- Prevents lock files from persisting across restarts + +### 3. 
CDP Connection Failures +**Symptom:** "Failed to connect to Chrome DevTools Protocol" + +**Solution:** Ensure CDP_PORT=9223 in `supervisor/services/eval-server.conf` +- Chromium runs on port 9223 (not 9222) +- Check logs: `docker logs kernel-browser-extended | grep CDP` + +### 4. Module Not Found Errors +**Symptom:** "Cannot find module 'js-yaml'" or "Cannot find module 'EvaluationLoader.js'" + +**Solution:** +- Ensure `eval-server/nodejs/` has all dependencies +- Run `cd eval-server/nodejs && npm install` +- Copy missing files from `browser-operator-core/eval-server/` if needed +- **Always use local `eval-server/`, NOT the submodule version** + +### 5. Docker Volume Caching on macOS +**Symptom:** File changes not visible in running container with docker-compose + +**Solution:** Completely recreate container +```bash +docker-compose down +docker-compose up -d +``` +macOS Docker has aggressive volume caching. + +**Note:** This only affects `make compose-up`. With `make run`, code is baked into the image, so you must rebuild to see changes. + +### 6. Port Already in Use +**Symptom:** "Ports are not available: UDP 56065 already in use" + +**Solution:** +```bash +# Remove existing container +docker rm -f kernel-browser-extended + +# Then start with your preferred method +make compose-up # OR make run +``` + +## Deployment Workflows + +### Two Local Deployment Options + +#### Option 1: Docker Compose (Recommended for Development) + +**Advantages:** +- Background operation +- Easy restart without rebuilding +- Volume-mounted eval-server code (live reload) +- Managed by docker-compose +- Better for long-running development + +**Usage:** +```bash +# First time setup +make init # Initialize submodules +make build # Build images (~30 min first time) + +# Start services in background +make compose-up + +# Verify +make test # Run simple eval test + +# View logs +make logs # Follow all logs + +# Iterate on eval-server code (NO REBUILD NEEDED) +vim eval-server/nodejs/src/api-server.js +docker-compose restart # Picks up changes immediately + +# Stop +make stop # OR docker-compose down +``` + +#### Option 2: Direct Docker Run (Interactive Mode) + +**Advantages:** +- Live logs in terminal +- Better for debugging +- See all output immediately +- Good for quick testing +- Chromium data location customizable + +**Disadvantages:** +- Requires rebuild for code changes +- Runs in foreground (blocks terminal) +- No volume mount for eval-server + +**Usage:** +```bash +# First time setup +make init # Initialize submodules +make build # Build images (~30 min first time) + +# Start in interactive mode (logs to stdout) +make run + +# In another terminal, verify +make test + +# Stop +# Press Ctrl+C in terminal running 'make run' +# OR: docker stop kernel-browser-extended + +# Iterate on eval-server code (REQUIRES REBUILD) +vim eval-server/nodejs/src/api-server.js +make rebuild +make run # Restart after rebuild +``` + +### Comparison: `make run` vs `make compose-up` + +| Aspect | `make run` | `make compose-up` | +|--------|-----------|-------------------| +| **Logs** | Live in terminal | Background, use `make logs` | +| **Stopping** | Ctrl+C or docker stop | `make stop` | +| **Eval server code** | Baked into image, rebuild needed | Volume-mounted, restart only | +| **DevTools code** | Baked into image, rebuild needed | Baked into image, rebuild needed | +| **Best for** | Debugging, seeing startup issues | Development iteration | +| **Script** | `run-local.sh` | `docker-compose.yml` | +| **Data location** | Easy to 
customize with env vars | Set in compose file or env var | +| **Lock cleanup** | Script cleans host before start | Container init cleans on start | +| **URLs on startup** | `URLS="..." make run` | Edit compose file | + +### Rebuild After Changes + +#### With Docker Compose: + +```bash +# Eval server changes (NO REBUILD) +vim eval-server/nodejs/src/api-server.js +docker-compose restart # Volume-mounted, picks up changes + +# DevTools changes +vim browser-operator-core/front_end/panels/ai_chat/... +make rebuild-devtools # Fast rebuild +docker-compose down +docker-compose up -d + +# Dockerfile changes +make rebuild # Full rebuild +make compose-up +``` + +#### With Direct Docker Run: + +```bash +# ANY code changes (eval-server OR DevTools) +make rebuild # Must rebuild +# Press Ctrl+C in terminal running 'make run' +make run # Restart + +# DevTools only changes (faster) +make rebuild-devtools # Fast rebuild +# Press Ctrl+C +make run + +# Dockerfile changes +make rebuild # Full rebuild +make run +``` + +### Advanced run-local.sh Options + +```bash +# Custom Chromium data directory +CHROMIUM_DATA_HOST=/custom/path make run + +# Ephemeral mode (no data persistence) +CHROMIUM_DATA_HOST="" make run + +# Open URLs on startup +URLS="https://google.com https://github.com" make run + +# Combine options +CHROMIUM_DATA_HOST=/tmp/browser URLS="https://example.com" make run +``` + +## Important Notes + +### Always Use Local eval-server/ +**DO NOT** use files from `browser-operator-core/eval-server/` + +The correct path is: `eval-server/nodejs/` + +Dockerfile.devtools has been updated to copy from local directory. + +### CDP Port is 9223, Not 9222 +The default Chrome DevTools port is 9222, but this project uses 9223. + +Check these files: +- `supervisor/services/eval-server.conf` - Must have `CDP_PORT="9223"` +- Chromium startup config uses port 9223 + +### Dependencies in eval-server/nodejs/ +Required packages: +- js-yaml (for parsing YAML eval files) +- express (HTTP server) +- ws (WebSocket server) +- chrome-remote-interface (CDP client) + +All managed by `package.json` and `npm install`. + +### Lock File Cleanup is Automatic +After implementing `scripts/init-container.sh`, you should never need to manually clean lock files again. The script runs on every container start. + +## Testing + +### Quick API Test +```bash +make test +``` + +Runs `evals/data/test-simple/math-001.yaml` which: +1. Checks API endpoint health +2. Sends simple math question via `/v1/responses` +3. Validates response using SimpleJudge +4. 
Reports PASS/FAIL + +### Running Specific Evals +```bash +cd evals +python3 run.py --path data/web-task-agent/flight-001.yaml --verbose +``` + +### Manual API Testing +```bash +# Health check +curl http://localhost:8080/status + +# Execute task +curl -X POST http://localhost:8080/v1/responses \ + -H "Content-Type: application/json" \ + -d '{ + "input": "Navigate to google.com", + "url": "about:blank", + "wait_timeout": 5000, + "model": { + "main_model": {"provider": "openai", "model": "gpt-4", "api_key": "..."} + } + }' + +# Get page content +curl -X POST http://localhost:8080/page/content \ + -H "Content-Type: application/json" \ + -d '{"clientId": "test", "tabId": "tab-001", "format": "html"}' + +# Capture screenshot +curl -X POST http://localhost:8080/page/screenshot \ + -H "Content-Type: application/json" \ + -d '{"clientId": "test", "tabId": "tab-001", "fullPage": false}' +``` + +## Access Points + +### Local Docker Compose Deployment + +| Service | URL | Purpose | +|---------|-----|---------| +| WebRTC Client | http://localhost:8000 | Live browser view with mouse/keyboard control | +| Enhanced DevTools UI | http://localhost:8001 | Custom DevTools with AI chat panel | +| Eval Server HTTP API | http://localhost:8080 | REST API for automation | +| WebRTC Neko | http://localhost:8081 | WebRTC control interface | +| Eval Server WebSocket | ws://localhost:8082 | JSON-RPC 2.0 bidirectional API | +| Chrome DevTools Protocol | http://localhost:9222/json | CDP endpoint list | +| Recording API | http://localhost:444/api | Screen recording controls | + +## Recent Changes Summary + +1. **Fixed docker-compose.yml** - Added missing port mappings (8000, 8001, 8081, 8082) +2. **Fixed tmpfs mounts** - Added `/tmp` to prevent X11 lock persistence +3. **Added automatic lock cleanup** - `scripts/init-container.sh` runs on every start +4. **Updated Chromium flags** - Added `--custom-devtools-frontend=http://localhost:8001/` +5. **Fixed CDP port** - Set `CDP_PORT="9223"` in eval-server supervisor config +6. **Created make test** - Quick verification of eval API functionality +7. 
**Fixed eval-server source** - Always use local `eval-server/`, not submodule diff --git a/Dockerfile.devtools b/Dockerfile.devtools index 9545ab0..9f9461b 100644 --- a/Dockerfile.devtools +++ b/Dockerfile.devtools @@ -69,7 +69,7 @@ FROM devtools-base AS devtools-local # Copy local changes from browser-operator-core submodule FIRST # This happens before checking out upstream, so we copy over the upstream code COPY browser-operator-core/front_end /workspace/devtools/devtools-frontend/front_end -COPY browser-operator-core/eval-server /workspace/devtools/devtools-frontend/eval-server +COPY eval-server /workspace/devtools/devtools-frontend/eval-server WORKDIR /workspace/devtools/devtools-frontend diff --git a/Dockerfile.local b/Dockerfile.local index 4a833cb..6685a9c 100644 --- a/Dockerfile.local +++ b/Dockerfile.local @@ -258,7 +258,7 @@ RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \ rm -rf /var/lib/apt/lists/* # Create eval server startup script -RUN echo '#!/bin/bash\ncd /opt/eval-server && node examples/with-http-wrapper.js' > /usr/local/bin/start-eval-server.sh && \ +RUN echo '#!/bin/bash\ncd /opt/eval-server && node start.js' > /usr/local/bin/start-eval-server.sh && \ chmod +x /usr/local/bin/start-eval-server.sh # ============================================================================ @@ -276,7 +276,24 @@ RUN mkdir -p /data/user-data /data/config /data/cache && \ # Declare volume for optional mounting of Chromium profiles and data VOLUME ["/data"] +# ============================================================================ +# Container Initialization Script +# ============================================================================ + +# Copy container initialization script that cleans up lock files +COPY scripts/init-container.sh /usr/local/bin/init-container.sh +RUN chmod +x /usr/local/bin/init-container.sh + +# Create a wrapper entrypoint that runs init script before main wrapper +RUN echo '#!/bin/bash\n\ +set -e\n\ +# Run initialization script\n\ +/usr/local/bin/init-container.sh\n\ +# Execute main wrapper\n\ +exec /wrapper.sh "$@"' > /entrypoint.sh && \ + chmod +x /entrypoint.sh + # Expose ports EXPOSE 8000 8001 8080 8081 8082 -ENTRYPOINT [ "/wrapper.sh" ] \ No newline at end of file +ENTRYPOINT [ "/entrypoint.sh" ] \ No newline at end of file diff --git a/Makefile b/Makefile index df5e2ef..5323481 100644 --- a/Makefile +++ b/Makefile @@ -127,31 +127,16 @@ info: ## Show connection information @echo " Enhanced DevTools UI: http://localhost:8001" @echo " DevTools Health: http://localhost:8001/health" -test: ## Test service endpoints - @echo "๐Ÿงช Testing service endpoints..." - @echo -n "WebRTC Client (8000): " - @curl -s -o /dev/null -w "%{http_code}" http://localhost:8000/ || echo "Failed to connect" +test: ## Test eval API with simple math eval + @echo "๐Ÿงช Testing Eval Server API..." @echo "" - @echo -n "Eval Server API (8081): " - @curl -s -o /dev/null -w "%{http_code}" http://localhost:8081/ || echo "Failed to connect" + @echo "1๏ธโƒฃ Checking API endpoint..." + @curl -s -o /dev/null -w " Status: %{http_code}\n" http://localhost:8080/status || (echo " โŒ API not responding"; exit 1) @echo "" - @echo -n "Chrome DevTools (9222): " - @curl -s -o /dev/null -w "%{http_code}" http://localhost:9222/json/version || echo "Failed to connect" + @echo "2๏ธโƒฃ Running simple eval test (test-simple/math-001.yaml)..." 
+ @cd evals && python3 run.py --path data/test-simple/math-001.yaml || (echo " โŒ Eval test failed"; exit 1) @echo "" - @echo -n "Recording API (444): " - @curl -s -o /dev/null -w "%{http_code}" http://localhost:444/ && echo " (404 is normal - API is running)" || echo "Failed to connect" - @echo "" - @echo -n "DevTools UI (8001): " - @curl -s -o /dev/null -w "%{http_code}" http://localhost:8001/ || echo "Failed to connect" - @echo "" - @echo -n "DevTools Health (8001): " - @curl -s -o /dev/null -w "%{http_code}" http://localhost:8001/health || echo "Failed to connect" - @echo "" - @echo "๐ŸŽฏ All services are ready! Access points:" - @echo " WebRTC Client: http://localhost:8000" - @echo " Eval Server API: http://localhost:8081" - @echo " Chrome DevTools: http://localhost:9222/json" - @echo " Enhanced DevTools UI: http://localhost:8001" + @echo "โœ… API is working correctly!" clean: stop ## Clean up everything @echo "๐Ÿงน Cleaning up..." diff --git a/Readme.md b/Readme.md index 1594f56..96d106a 100644 --- a/Readme.md +++ b/Readme.md @@ -1,36 +1,308 @@ -# Kernel Browser - Google Cloud Run Deployment +# Web Agent - Browser Automation & Evaluation Platform -Deploy the [kernel-images](https://github.com/onkernel/kernel-images) Chrome browser environment to Google Cloud Run with WebRTC support, Chrome DevTools Protocol, and screen recording capabilities. +Extended [kernel-images](https://github.com/onkernel/kernel-images) Chromium environment with Browser Operator DevTools and eval server for browser automation, testing, and AI agent evaluation. ## ๐Ÿ—๏ธ Architecture -This deployment provides: +This platform provides: +- **Browser Operator DevTools** - Custom DevTools frontend with AI chat panel +- **Eval Server API** - HTTP/WebSocket API for browser automation and evaluation - **Headful Chrome** with GUI access via WebRTC - **Chrome DevTools Protocol** for automation (Playwright, Puppeteer) - **Screen Recording API** for session capture -- **nginx Reverse Proxy** for Cloud Run port requirements -- **Auto-scaling** from 0 to multiple instances +- **Local Docker Compose** for development +- **Google Cloud Run** deployment option ## ๐Ÿ“‹ Prerequisites +### For Local Development +1. **Docker** and **Docker Compose** installed +2. **Make** utility +3. **Git** with submodule access +4. **Python 3** (for running evals) + +### For Cloud Run Deployment 1. **Google Cloud Account** with billing enabled 2. **gcloud CLI** installed and authenticated -3. **Docker** installed locally (for local builds) -4. **Git** with submodule access +3. All of the above + +--- + +## ๐Ÿš€ Local Development - Two Deployment Options + +### Option 1: Docker Compose (Recommended for Development) + +**Best for:** Background services, docker-compose workflows, persistent containers + +```bash +# 1. Initialize submodules +make init -## ๐Ÿš€ Quick Start +# 2. Build Docker images (takes ~30 minutes first time) +make build -### 1. Clone and Setup +# 3. Start all services in background +make compose-up + +# 4. Verify everything works +make test +``` + +### Option 2: Direct Docker Run (Interactive Mode) + +**Best for:** Interactive debugging, seeing live logs, quick testing ```bash -# The kernel-images submodule should already be initialized -cd /Users/tyson/codebase/blue-browser/web-agent +# 1. Initialize submodules +make init + +# 2. Build Docker images (takes ~30 minutes first time) +make build + +# 3. 
Start in interactive mode (logs to terminal) +make run + +# In another terminal, verify +make test +``` -# Verify submodule -git submodule status +### Access Points + +After starting with either `make compose-up` or `make run`, access: + +| Service | URL | Purpose | +|---------|-----|---------| +| **WebRTC Client** | http://localhost:8000 | Live browser view with control | +| **DevTools UI** | http://localhost:8001 | Enhanced DevTools with AI chat | +| **Eval Server API** | http://localhost:8080 | HTTP REST API for automation | +| **WebRTC Neko** | http://localhost:8081 | WebRTC control interface | +| **Eval Server WS** | ws://localhost:8082 | WebSocket JSON-RPC API | +| **CDP Endpoint** | http://localhost:9222/json | Chrome DevTools Protocol | +| **Recording API** | http://localhost:444/api | Screen recording controls | + +### Available Make Commands + +```bash +make help # Show all available commands +make init # Initialize git submodules +make build # Build images (smart caching) +make rebuild # Force complete rebuild +make build-devtools # Build DevTools base (~30 min) +make rebuild-devtools # Fast rebuild with local changes +make compose-up # Start in background +make run # Start in interactive mode +make stop # Stop all containers +make restart # Restart containers +make logs # View container logs +make test # Run API verification test +make clean # Clean up everything +``` + +### Comparison: `make run` vs `make compose-up` + +| Feature | `make run` | `make compose-up` | +|---------|------------|-------------------| +| **Log visibility** | Live logs in terminal | Background, use `make logs` | +| **Stopping** | Ctrl+C or `docker stop` | `make stop` or `docker-compose down` | +| **Restarting** | Stop and run again | `docker-compose restart` | +| **Use case** | Interactive debugging | Background development | +| **Startup script** | `run-local.sh` | `docker-compose.yml` | +| **Lock cleanup** | Script cleans before start | Container cleans on start | +| **Volume mounts** | Defined in script | Defined in compose file | + +### Development Workflow + +**With Docker Compose (make compose-up):** + +*Editing Eval Server Code:* +```bash +# 1. Make changes in eval-server/nodejs/ +vim eval-server/nodejs/src/api-server.js + +# 2. Restart container (no rebuild needed, volume-mounted) +docker-compose restart + +# 3. Test changes +make test +``` + +*Editing DevTools:* +```bash +# 1. Make changes in browser-operator-core/front_end/ +vim browser-operator-core/front_end/panels/ai_chat/... + +# 2. Rebuild DevTools only +make rebuild-devtools + +# 3. Restart containers +docker-compose down && docker-compose up -d +``` + +*Full Rebuild:* +```bash +make rebuild # Rebuild everything from scratch +make compose-up # Start containers +``` + +**With Direct Docker Run (make run):** + +*Editing Eval Server Code:* +```bash +# 1. Make changes in eval-server/nodejs/ +vim eval-server/nodejs/src/api-server.js + +# 2. Since eval-server is NOT volume-mounted in run mode, rebuild +make rebuild + +# 3. Stop and restart +# Press Ctrl+C in the terminal running 'make run' +make run +``` + +*Editing DevTools:* +```bash +# 1. Make changes in browser-operator-core/front_end/ +vim browser-operator-core/front_end/panels/ai_chat/... + +# 2. Rebuild DevTools only +make rebuild-devtools + +# 3. 
Stop and restart +# Press Ctrl+C in the terminal running 'make run' +make run +``` + +*Full Rebuild:* +```bash +make rebuild # Rebuild everything from scratch +# Press Ctrl+C in the terminal running 'make run' +make run # Start in interactive mode +``` + +### Customizing Browser Data Location + +**With `make run`:** +```bash +# Default: ./chromium-data +make run + +# Custom location +CHROMIUM_DATA_HOST=/path/to/data make run + +# Ephemeral (no persistence) +CHROMIUM_DATA_HOST="" make run +``` + +**With `make compose-up`:** +```bash +# Edit docker-compose.yml to change CHROMIUM_DATA_HOST +# Or set environment variable: +CHROMIUM_DATA_HOST=/path/to/data make compose-up ``` -### 2. Configure Google Cloud +### Opening URLs on Startup + +**With `make run`:** +```bash +# Open specific URLs when browser starts +URLS="https://google.com https://github.com" make run +``` + +**With `make compose-up`:** +```bash +# Add URLS to docker-compose.yml environment section +``` + +### Running Evaluations + +```bash +# Simple test +make test + +# Specific evaluation +cd evals +python3 run.py --path data/web-task-agent/flight-001.yaml --verbose + +# All evaluations in a directory +python3 run.py --path data/web-task-agent/ --verbose +``` + +### Troubleshooting + +**Container won't start (docker-compose):** +```bash +# Check logs +docker logs kernel-browser-extended + +# Clean restart +make stop +make clean +make build +make compose-up +``` + +**Container won't start (make run):** +```bash +# Stop existing container +docker stop kernel-browser-extended +docker rm kernel-browser-extended + +# Clean rebuild +make clean +make rebuild +make run +``` + +**Port conflicts:** +```bash +# Remove existing container +docker rm -f kernel-browser-extended + +# Then start with your preferred method +make compose-up # OR make run +``` + +**Lock file errors (should be automatic now):** +The system now automatically cleans lock files on startup. If you still see errors: + +*With docker-compose:* +```bash +docker-compose down +rm -f ./chromium-data/user-data/Singleton* +make compose-up +``` + +*With make run:* +```bash +# Press Ctrl+C to stop +rm -f ./chromium-data/user-data/Singleton* +make run +``` + +**Seeing stale code after changes (make run):** +```bash +# Eval server code is NOT volume-mounted in run mode +# You must rebuild after code changes +make rebuild +# Press Ctrl+C in terminal running 'make run' +make run +``` + +**Want to see live logs (docker-compose):** +```bash +# Option 1: Follow logs +make logs + +# Option 2: Switch to interactive mode +make stop +make run +``` + +--- + +## ๐Ÿš€ Google Cloud Run Deployment + +### Configure Google Cloud ```bash # Set your project ID @@ -42,7 +314,7 @@ gcloud auth login gcloud auth application-default login ``` -### 3. Deploy +### Deploy to Cloud Run ```bash # Automated deployment (recommended) @@ -52,7 +324,7 @@ gcloud auth application-default login ./deploy.sh --project your-project-id --region us-central1 ``` -### 4. 
Access Your Service +### Access Cloud Run Service After deployment, you'll get URLs like: ``` @@ -157,24 +429,57 @@ For production WebRTC, configure a TURN server: value: '[{"urls": ["turn:turn.example.com:3478"], "username": "user", "credential": "pass"}]' ``` -## ๐Ÿ“ File Structure +## ๐Ÿ“ Project Structure ``` web-agent/ -โ”œโ”€โ”€ kernel-images/ # Git submodule -โ”œโ”€โ”€ Dockerfile.cloudrun # Cloud Run optimized build -โ”œโ”€โ”€ nginx.conf # Reverse proxy config -โ”œโ”€โ”€ cloudrun-wrapper.sh # Cloud Run startup script -โ”œโ”€โ”€ service.yaml # Cloud Run service definition -โ”œโ”€โ”€ cloudbuild.yaml # CI/CD pipeline -โ”œโ”€โ”€ deploy.sh # Deployment script -โ”œโ”€โ”€ .gcloudignore # Build ignore rules -โ””โ”€โ”€ README.md # This file +โ”œโ”€โ”€ browser-operator-core/ # Submodule: DevTools frontend source +โ”œโ”€โ”€ kernel-images/ # Submodule: Base browser environment +โ”œโ”€โ”€ eval-server/ +โ”‚ โ””โ”€โ”€ nodejs/ # Eval server (use this, NOT submodule) +โ”‚ โ”œโ”€โ”€ src/ # API server, evaluation server, lib +โ”‚ โ”œโ”€โ”€ start.js # Server entrypoint +โ”‚ โ””โ”€โ”€ package.json +โ”œโ”€โ”€ evals/ +โ”‚ โ”œโ”€โ”€ run.py # Python evaluation runner +โ”‚ โ”œโ”€โ”€ lib/judge.py # Judge implementations +โ”‚ โ””โ”€โ”€ data/ # Evaluation YAML files +โ”œโ”€โ”€ scripts/ +โ”‚ โ””โ”€โ”€ init-container.sh # Auto-cleanup of lock files +โ”œโ”€โ”€ supervisor/services/ # Service configs (overrides) +โ”œโ”€โ”€ Dockerfile.local # Main Docker build +โ”œโ”€โ”€ Dockerfile.devtools # DevTools frontend build +โ”œโ”€โ”€ docker-compose.yml # Local deployment +โ”œโ”€โ”€ run-local.sh # Interactive mode +โ”œโ”€โ”€ Makefile # Build commands +โ”œโ”€โ”€ Dockerfile.cloudrun # Cloud Run build +โ”œโ”€โ”€ nginx.conf # Reverse proxy config +โ”œโ”€โ”€ service.yaml # Cloud Run service config +โ”œโ”€โ”€ cloudbuild.yaml # CI/CD pipeline +โ”œโ”€โ”€ deploy.sh # Cloud deployment script +โ”œโ”€โ”€ CLAUDE.md # Technical documentation +โ””โ”€โ”€ README.md # This file ``` ## ๐Ÿ› Troubleshooting -### Common Issues +### Local Development Issues + +See the detailed troubleshooting section under **Local Docker Compose Deployment** above. + +Common quick fixes: +```bash +# Clean restart +make stop && make clean && make build && make compose-up + +# Check logs +docker logs kernel-browser-extended + +# Verify services +docker exec kernel-browser-extended supervisorctl status +``` + +### Cloud Run Issues 1. 
**Build Timeout** ```bash @@ -192,7 +497,7 @@ web-agent/ - Check memory limits (8GB minimum) - Verify non-root user execution -### Debug Commands +### Cloud Run Debug Commands ```bash # View service logs @@ -258,11 +563,67 @@ Use `_NO_CACHE=true` only when: ## ๐Ÿ“š Additional Resources +- [CLAUDE.md](./CLAUDE.md) - Detailed technical documentation for Claude Code - [kernel-images Documentation](https://github.com/onkernel/kernel-images) +- [Browser Operator DevTools](https://github.com/BrowserOperator/browser-operator-core) - [Cloud Run Documentation](https://cloud.google.com/run/docs) - [WebRTC Documentation](https://webrtc.org/getting-started/) - [Chrome DevTools Protocol](https://chromedevtools.github.io/devtools-protocol/) +## ๐ŸŽฏ API Examples + +### Eval Server HTTP API + +```bash +# Execute browser task +curl -X POST http://localhost:8080/v1/responses \ + -H "Content-Type: application/json" \ + -d '{ + "input": "Navigate to google.com and search for puppies", + "url": "about:blank", + "wait_timeout": 5000, + "model": { + "main_model": { + "provider": "openai", + "model": "gpt-4", + "api_key": "your-api-key" + } + } + }' + +# Get page content +curl -X POST http://localhost:8080/page/content \ + -H "Content-Type: application/json" \ + -d '{"clientId": "test", "tabId": "tab-001", "format": "html"}' + +# Capture screenshot +curl -X POST http://localhost:8080/page/screenshot \ + -H "Content-Type: application/json" \ + -d '{"clientId": "test", "tabId": "tab-001", "fullPage": false}' +``` + +### WebSocket JSON-RPC API + +```javascript +const WebSocket = require('ws'); +const ws = new WebSocket('ws://localhost:8082'); + +ws.on('open', () => { + // Subscribe to evaluations + ws.send(JSON.stringify({ + jsonrpc: '2.0', + method: 'subscribe', + params: { clientId: 'my-client' }, + id: 1 + })); +}); + +ws.on('message', (data) => { + const response = JSON.parse(data); + console.log('Received:', response); +}); +``` + --- -**Need help?** Open an issue or check the kernel-images Discord community. \ No newline at end of file +**Need help?** Check [CLAUDE.md](./CLAUDE.md) for detailed technical docs or open an issue. 
\ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index d783215..1841cf0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -13,10 +13,18 @@ services: ports: # Chrome DevTools Protocol (matches kernel-images default) - "9222:9222" - # Recording API (matches kernel-images default) + # Recording API (matches kernel-images default) - "444:10001" # WebRTC client interface + - "8000:8000" + # Enhanced DevTools UI + - "8001:8001" + # Eval Server HTTP API - "8080:8080" + # WebRTC Neko interface + - "8081:8081" + # Eval Server WebSocket + - "8082:8082" # WebRTC UDP port range for local development - "56000-56100:56000-56100/udp" environment: @@ -30,8 +38,8 @@ services: - NEKO_WEBRTC_NAT1TO1=127.0.0.1 # Run as kernel user (not root) - RUN_AS_ROOT=false - # Chromium flags with persistent data directory - - CHROMIUM_FLAGS=--user-data-dir=/data/user-data --disable-dev-shm-usage --start-maximized --remote-allow-origins=* --no-sandbox --disable-setuid-sandbox + # Chromium flags with persistent data directory and custom DevTools frontend + - CHROMIUM_FLAGS=--user-data-dir=/data/user-data --disable-dev-shm-usage --start-maximized --remote-allow-origins=* --no-sandbox --disable-setuid-sandbox --custom-devtools-frontend=http://localhost:8001/ volumes: # Persist recordings in local directory - "./recordings:/recordings" @@ -43,6 +51,7 @@ services: - "./eval-server/nodejs:/opt/eval-server" tmpfs: - /dev/shm:size=2g + - /tmp restart: unless-stopped healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8080"] diff --git a/eval-server/nodejs/package-lock.json b/eval-server/nodejs/package-lock.json index 99f3ff7..b000a61 100644 --- a/eval-server/nodejs/package-lock.json +++ b/eval-server/nodejs/package-lock.json @@ -11,14 +11,10 @@ "dependencies": { "dotenv": "^16.3.1", "js-yaml": "^4.1.0", - "openai": "^4.24.1", "uuid": "^9.0.1", "winston": "^3.11.0", "ws": "^8.16.0" }, - "bin": { - "eval-server": "src/cli/index.js" - }, "devDependencies": { "@types/ws": "^8.5.10" }, @@ -50,21 +46,12 @@ "version": "24.0.13", "resolved": "https://registry.npmjs.org/@types/node/-/node-24.0.13.tgz", "integrity": "sha512-Qm9OYVOFHFYg3wJoTSrz80hoec5Lia/dPp84do3X7dZvLikQvM1YpmvTBEdIr/e+U8HTkFjLHLnl78K/qjf+jQ==", + "dev": true, "license": "MIT", "dependencies": { "undici-types": "~7.8.0" } }, - "node_modules/@types/node-fetch": { - "version": "2.6.12", - "resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.12.tgz", - "integrity": "sha512-8nneRWKCg3rMtF69nLQJnOYUcbafYeFSjqkw3jCRLsqkWFlHaoQrr5mXmofFGOx3DKn7UfmBMyov8ySvLRVldA==", - "license": "MIT", - "dependencies": { - "@types/node": "*", - "form-data": "^4.0.0" - } - }, "node_modules/@types/triple-beam": { "version": "1.3.5", "resolved": "https://registry.npmjs.org/@types/triple-beam/-/triple-beam-1.3.5.tgz", @@ -81,30 +68,6 @@ "@types/node": "*" } }, - "node_modules/abort-controller": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz", - "integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==", - "license": "MIT", - "dependencies": { - "event-target-shim": "^5.0.0" - }, - "engines": { - "node": ">=6.5" - } - }, - "node_modules/agentkeepalive": { - "version": "4.6.0", - "resolved": "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.6.0.tgz", - "integrity": "sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ==", - "license": "MIT", - "dependencies": { - 
"humanize-ms": "^1.2.1" - }, - "engines": { - "node": ">= 8.0.0" - } - }, "node_modules/argparse": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", @@ -117,25 +80,6 @@ "integrity": "sha512-htCUDlxyyCLMgaM3xXg0C0LW2xqfuQ6p05pCEIsXuyQ+a1koYKTuBMzRNwmybfLgvJDMd0r1LTn4+E0Ti6C2AA==", "license": "MIT" }, - "node_modules/asynckit": { - "version": "0.4.0", - "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", - "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==", - "license": "MIT" - }, - "node_modules/call-bind-apply-helpers": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz", - "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==", - "license": "MIT", - "dependencies": { - "es-errors": "^1.3.0", - "function-bind": "^1.1.2" - }, - "engines": { - "node": ">= 0.4" - } - }, "node_modules/color": { "version": "3.2.1", "resolved": "https://registry.npmjs.org/color/-/color-3.2.1.tgz", @@ -181,27 +125,6 @@ "text-hex": "1.0.x" } }, - "node_modules/combined-stream": { - "version": "1.0.8", - "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", - "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", - "license": "MIT", - "dependencies": { - "delayed-stream": "~1.0.0" - }, - "engines": { - "node": ">= 0.8" - } - }, - "node_modules/delayed-stream": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", - "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", - "license": "MIT", - "engines": { - "node": ">=0.4.0" - } - }, "node_modules/dotenv": { "version": "16.6.1", "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz", @@ -214,80 +137,12 @@ "url": "https://dotenvx.com" } }, - "node_modules/dunder-proto": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", - "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", - "license": "MIT", - "dependencies": { - "call-bind-apply-helpers": "^1.0.1", - "es-errors": "^1.3.0", - "gopd": "^1.2.0" - }, - "engines": { - "node": ">= 0.4" - } - }, "node_modules/enabled": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/enabled/-/enabled-2.0.0.tgz", "integrity": "sha512-AKrN98kuwOzMIdAizXGI86UFBoo26CL21UM763y1h/GMSJ4/OHU9k2YlsmBpyScFo/wbLzWQJBMCW4+IO3/+OQ==", "license": "MIT" }, - "node_modules/es-define-property": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", - "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==", - "license": "MIT", - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/es-errors": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", - "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", - "license": "MIT", - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/es-object-atoms": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz", - "integrity": 
"sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==", - "license": "MIT", - "dependencies": { - "es-errors": "^1.3.0" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/es-set-tostringtag": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz", - "integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==", - "license": "MIT", - "dependencies": { - "es-errors": "^1.3.0", - "get-intrinsic": "^1.2.6", - "has-tostringtag": "^1.0.2", - "hasown": "^2.0.2" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/event-target-shim": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz", - "integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==", - "license": "MIT", - "engines": { - "node": ">=6" - } - }, "node_modules/fecha": { "version": "4.2.3", "resolved": "https://registry.npmjs.org/fecha/-/fecha-4.2.3.tgz", @@ -300,147 +155,6 @@ "integrity": "sha512-GRnmB5gPyJpAhTQdSZTSp9uaPSvl09KoYcMQtsB9rQoOmzs9dH6ffeccH+Z+cv6P68Hu5bC6JjRh4Ah/mHSNRw==", "license": "MIT" }, - "node_modules/form-data": { - "version": "4.0.3", - "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.3.tgz", - "integrity": "sha512-qsITQPfmvMOSAdeyZ+12I1c+CKSstAFAwu+97zrnWAbIr5u8wfsExUzCesVLC8NgHuRUqNN4Zy6UPWUTRGslcA==", - "license": "MIT", - "dependencies": { - "asynckit": "^0.4.0", - "combined-stream": "^1.0.8", - "es-set-tostringtag": "^2.1.0", - "hasown": "^2.0.2", - "mime-types": "^2.1.12" - }, - "engines": { - "node": ">= 6" - } - }, - "node_modules/form-data-encoder": { - "version": "1.7.2", - "resolved": "https://registry.npmjs.org/form-data-encoder/-/form-data-encoder-1.7.2.tgz", - "integrity": "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==", - "license": "MIT" - }, - "node_modules/formdata-node": { - "version": "4.4.1", - "resolved": "https://registry.npmjs.org/formdata-node/-/formdata-node-4.4.1.tgz", - "integrity": "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==", - "license": "MIT", - "dependencies": { - "node-domexception": "1.0.0", - "web-streams-polyfill": "4.0.0-beta.3" - }, - "engines": { - "node": ">= 12.20" - } - }, - "node_modules/function-bind": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", - "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", - "license": "MIT", - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/get-intrinsic": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", - "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==", - "license": "MIT", - "dependencies": { - "call-bind-apply-helpers": "^1.0.2", - "es-define-property": "^1.0.1", - "es-errors": "^1.3.0", - "es-object-atoms": "^1.1.1", - "function-bind": "^1.1.2", - "get-proto": "^1.0.1", - "gopd": "^1.2.0", - "has-symbols": "^1.1.0", - "hasown": "^2.0.2", - "math-intrinsics": "^1.1.0" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/get-proto": { - "version": "1.0.1", - "resolved": 
"https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", - "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", - "license": "MIT", - "dependencies": { - "dunder-proto": "^1.0.1", - "es-object-atoms": "^1.0.0" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/gopd": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", - "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==", - "license": "MIT", - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/has-symbols": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", - "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==", - "license": "MIT", - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/has-tostringtag": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz", - "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==", - "license": "MIT", - "dependencies": { - "has-symbols": "^1.0.3" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/hasown": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", - "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", - "license": "MIT", - "dependencies": { - "function-bind": "^1.1.2" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/humanize-ms": { - "version": "1.2.1", - "resolved": "https://registry.npmjs.org/humanize-ms/-/humanize-ms-1.2.1.tgz", - "integrity": "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==", - "license": "MIT", - "dependencies": { - "ms": "^2.0.0" - } - }, "node_modules/inherits": { "version": "2.0.4", "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", @@ -500,82 +214,12 @@ "node": ">= 12.0.0" } }, - "node_modules/math-intrinsics": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", - "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==", - "license": "MIT", - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/mime-db": { - "version": "1.52.0", - "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", - "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/mime-types": { - "version": "2.1.35", - "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", - "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", - "license": "MIT", - "dependencies": { - "mime-db": "1.52.0" - }, - "engines": { - "node": ">= 0.6" - } - }, "node_modules/ms": { "version": "2.1.3", "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", "license": "MIT" }, - 
"node_modules/node-domexception": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/node-domexception/-/node-domexception-1.0.0.tgz", - "integrity": "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==", - "deprecated": "Use your platform's native DOMException instead", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/jimmywarting" - }, - { - "type": "github", - "url": "https://paypal.me/jimmywarting" - } - ], - "license": "MIT", - "engines": { - "node": ">=10.5.0" - } - }, - "node_modules/node-fetch": { - "version": "2.7.0", - "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", - "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", - "license": "MIT", - "dependencies": { - "whatwg-url": "^5.0.0" - }, - "engines": { - "node": "4.x || >=6.0.0" - }, - "peerDependencies": { - "encoding": "^0.1.0" - }, - "peerDependenciesMeta": { - "encoding": { - "optional": true - } - } - }, "node_modules/one-time": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/one-time/-/one-time-1.0.0.tgz", @@ -585,51 +229,6 @@ "fn.name": "1.x.x" } }, - "node_modules/openai": { - "version": "4.104.0", - "resolved": "https://registry.npmjs.org/openai/-/openai-4.104.0.tgz", - "integrity": "sha512-p99EFNsA/yX6UhVO93f5kJsDRLAg+CTA2RBqdHK4RtK8u5IJw32Hyb2dTGKbnnFmnuoBv5r7Z2CURI9sGZpSuA==", - "license": "Apache-2.0", - "dependencies": { - "@types/node": "^18.11.18", - "@types/node-fetch": "^2.6.4", - "abort-controller": "^3.0.0", - "agentkeepalive": "^4.2.1", - "form-data-encoder": "1.7.2", - "formdata-node": "^4.3.2", - "node-fetch": "^2.6.7" - }, - "bin": { - "openai": "bin/cli" - }, - "peerDependencies": { - "ws": "^8.18.0", - "zod": "^3.23.8" - }, - "peerDependenciesMeta": { - "ws": { - "optional": true - }, - "zod": { - "optional": true - } - } - }, - "node_modules/openai/node_modules/@types/node": { - "version": "18.19.118", - "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.118.tgz", - "integrity": "sha512-hIPK0hSrrcaoAu/gJMzN3QClXE4QdCdFvaenJ0JsjIbExP1JFFVH+RHcBt25c9n8bx5dkIfqKE+uw6BmBns7ug==", - "license": "MIT", - "dependencies": { - "undici-types": "~5.26.4" - } - }, - "node_modules/openai/node_modules/undici-types": { - "version": "5.26.5", - "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", - "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", - "license": "MIT" - }, "node_modules/readable-stream": { "version": "3.6.2", "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz", @@ -706,12 +305,6 @@ "integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg==", "license": "MIT" }, - "node_modules/tr46": { - "version": "0.0.3", - "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", - "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==", - "license": "MIT" - }, "node_modules/triple-beam": { "version": "1.4.1", "resolved": "https://registry.npmjs.org/triple-beam/-/triple-beam-1.4.1.tgz", @@ -725,6 +318,7 @@ "version": "7.8.0", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.8.0.tgz", "integrity": "sha512-9UJ2xGDvQ43tYyVMpuHlsgApydB8ZKfVYTsLDhXkFL/6gfkp+U8xTGdh8pMJv1SpZna0zxG1DwsKZsreLbXBxw==", + "dev": true, "license": "MIT" }, "node_modules/util-deprecate": { @@ 
-746,31 +340,6 @@ "uuid": "dist/bin/uuid" } }, - "node_modules/web-streams-polyfill": { - "version": "4.0.0-beta.3", - "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-4.0.0-beta.3.tgz", - "integrity": "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==", - "license": "MIT", - "engines": { - "node": ">= 14" - } - }, - "node_modules/webidl-conversions": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", - "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==", - "license": "BSD-2-Clause" - }, - "node_modules/whatwg-url": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", - "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", - "license": "MIT", - "dependencies": { - "tr46": "~0.0.3", - "webidl-conversions": "^3.0.0" - } - }, "node_modules/winston": { "version": "3.17.0", "resolved": "https://registry.npmjs.org/winston/-/winston-3.17.0.tgz", diff --git a/eval-server/nodejs/package.json b/eval-server/nodejs/package.json index c6315fa..add45c2 100644 --- a/eval-server/nodejs/package.json +++ b/eval-server/nodejs/package.json @@ -12,14 +12,21 @@ "scripts": { "start": "node src/lib/EvalServer.js" }, - "keywords": ["websocket", "browser-automation", "cdp", "http-api", "rpc"], + "keywords": [ + "websocket", + "browser-automation", + "cdp", + "http-api", + "rpc" + ], "author": "", "license": "MIT", "dependencies": { - "ws": "^8.16.0", + "dotenv": "^16.3.1", + "js-yaml": "^4.1.0", "uuid": "^9.0.1", "winston": "^3.11.0", - "dotenv": "^16.3.1" + "ws": "^8.16.0" }, "devDependencies": { "@types/ws": "^8.5.10" diff --git a/eval-server/nodejs/src/lib/EvaluationLoader.js b/eval-server/nodejs/src/lib/EvaluationLoader.js new file mode 100644 index 0000000..8f85459 --- /dev/null +++ b/eval-server/nodejs/src/lib/EvaluationLoader.js @@ -0,0 +1,448 @@ +// Copyright 2025 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +import fs from 'fs'; +import path from 'path'; +import yaml from 'js-yaml'; +import logger from '../logger.js'; + +/** + * EvaluationLoader - Handles loading and managing evaluations from YAML files + * + * Example usage: + * ```js + * const loader = new EvaluationLoader('./evals'); + * await loader.loadFromDirectory('./evals'); + * + * const evaluations = loader.getAllEvaluations(); + * const filtered = loader.getEvaluationsByCategory('action-agent'); + * const specific = loader.getEvaluationById('a11y-001'); + * ``` + */ +export class EvaluationLoader { + constructor(evalsDir = './evals') { + this.evalsDir = path.resolve(evalsDir); + this.evaluations = new Map(); // evaluationId -> evaluation + this.categories = new Map(); // category -> evaluations[] + this.configDefaults = null; + + // Ensure directory exists + if (!fs.existsSync(this.evalsDir)) { + fs.mkdirSync(this.evalsDir, { recursive: true }); + } + + this.loadConfigDefaults(); + } + + /** + * Load default model configuration from config.yaml + */ + loadConfigDefaults() { + try { + const configPath = path.resolve(this.evalsDir, 'config.yaml'); + if (fs.existsSync(configPath)) { + const configContent = fs.readFileSync(configPath, 'utf8'); + this.configDefaults = yaml.load(configContent); + logger.info('EvaluationLoader: Loaded config.yaml defaults', this.configDefaults); + } else { + // Don't warn about missing config.yaml - it's optional + this.configDefaults = null; + } + } catch (error) { + logger.error('EvaluationLoader: Failed to load config.yaml:', error); + this.configDefaults = null; + } + } + + /** + * Apply model precedence logic + * API calls OR test YAML models override config.yaml fallback + */ + applyModelPrecedence(evaluation, apiModelOverride = null) { + if (apiModelOverride) { + return { + ...(this.configDefaults?.model || {}), + ...apiModelOverride + }; + } + + const testModel = evaluation.model; + if (testModel && Object.keys(testModel).length > 0) { + return { + ...(this.configDefaults?.model || {}), + ...testModel + }; + } + + return this.configDefaults?.model || {}; + } + + /** + * Load all evaluations from the specified directory + */ + async loadFromDirectory(evalsDir = this.evalsDir) { + try { + this.evalsDir = path.resolve(evalsDir); + + // Clear existing evaluations + this.evaluations.clear(); + this.categories.clear(); + + // Reload config defaults + this.loadConfigDefaults(); + + // Find all category directories + const categories = fs.readdirSync(this.evalsDir) + .filter(dir => { + const fullPath = path.join(this.evalsDir, dir); + return fs.statSync(fullPath).isDirectory(); + }); + + let totalEvaluations = 0; + + for (const category of categories) { + const categoryDir = path.join(this.evalsDir, category); + const evalFiles = fs.readdirSync(categoryDir) + .filter(f => f.endsWith('.yaml') || f.endsWith('.yml')); + + const categoryEvaluations = []; + + for (const file of evalFiles) { + try { + const evalPath = path.join(categoryDir, file); + const evaluation = await this.loadEvaluationFile(evalPath, category); + + if (evaluation && evaluation.enabled !== false) { + this.evaluations.set(evaluation.id, evaluation); + categoryEvaluations.push(evaluation); + totalEvaluations++; + } + } catch (error) { + logger.error(`EvaluationLoader: Failed to load evaluation ${file}:`, error); + } + } + + if (categoryEvaluations.length > 0) { + this.categories.set(category, categoryEvaluations); + } + } + + logger.info(`EvaluationLoader: Loaded ${totalEvaluations} evaluations from ${categories.length} categories`); + 
return { totalEvaluations, categories: categories.length }; + + } catch (error) { + logger.error('EvaluationLoader: Failed to load evaluations:', error); + throw error; + } + } + + /** + * Load a specific evaluation file + */ + async loadEvaluationFile(filePath, category) { + try { + const yamlContent = fs.readFileSync(filePath, 'utf8'); + const evaluation = yaml.load(yamlContent); + + if (!evaluation || !evaluation.id) { + throw new Error('Evaluation must have an id field'); + } + + // Apply model precedence + const resolvedModel = this.applyModelPrecedence(evaluation); + + // Enhance evaluation with metadata + const enhancedEvaluation = { + ...evaluation, + model: resolvedModel, + category, + filePath, + status: 'pending', + loadedAt: new Date().toISOString() + }; + + // Validate required fields + this.validateEvaluation(enhancedEvaluation); + + return enhancedEvaluation; + + } catch (error) { + logger.error(`EvaluationLoader: Failed to load evaluation file ${filePath}:`, error); + throw error; + } + } + + /** + * Validate evaluation structure + */ + validateEvaluation(evaluation) { + const required = ['id', 'name', 'tool']; + + for (const field of required) { + if (!evaluation[field]) { + throw new Error(`Evaluation missing required field: ${field}`); + } + } + + // Validate tool is supported + const supportedTools = [ + 'action_agent', + 'research_agent', + 'schema_extractor', + 'streamlined_schema_extractor', + 'screenshot_verification', + 'web_task_agent', + 'chat' + ]; + + if (!supportedTools.includes(evaluation.tool)) { + logger.warn(`EvaluationLoader: Unknown tool type: ${evaluation.tool}`); + } + + return true; + } + + /** + * Get all loaded evaluations + */ + getAllEvaluations() { + return Array.from(this.evaluations.values()); + } + + /** + * Get evaluations by category + */ + getEvaluationsByCategory(category) { + return this.categories.get(category) || []; + } + + /** + * Get all available categories + */ + getCategories() { + return Array.from(this.categories.keys()); + } + + /** + * Get evaluation by ID + */ + getEvaluationById(evaluationId) { + return this.evaluations.get(evaluationId); + } + + /** + * Filter evaluations by criteria + */ + filterEvaluations(criteria = {}) { + let evaluations = this.getAllEvaluations(); + + // Filter by category + if (criteria.category) { + evaluations = evaluations.filter(e => e.category === criteria.category); + } + + // Filter by tool + if (criteria.tool) { + evaluations = evaluations.filter(e => e.tool === criteria.tool); + } + + // Filter by tags + if (criteria.tags && criteria.tags.length > 0) { + evaluations = evaluations.filter(e => { + const evalTags = e.metadata?.tags || []; + return criteria.tags.some(tag => evalTags.includes(tag)); + }); + } + + // Filter by enabled status + if (criteria.enabled !== undefined) { + evaluations = evaluations.filter(e => e.enabled === criteria.enabled); + } + + // Filter by priority + if (criteria.priority) { + evaluations = evaluations.filter(e => e.metadata?.priority === criteria.priority); + } + + return evaluations; + } + + /** + * Get evaluation statistics + */ + getStatistics() { + const evaluations = this.getAllEvaluations(); + const stats = { + total: evaluations.length, + byCategory: {}, + byTool: {}, + byStatus: {}, + enabled: 0, + disabled: 0 + }; + + for (const evaluation of evaluations) { + // Count by category + const category = evaluation.category; + stats.byCategory[category] = (stats.byCategory[category] || 0) + 1; + + // Count by tool + const tool = evaluation.tool; + 
stats.byTool[tool] = (stats.byTool[tool] || 0) + 1; + + // Count by status + const status = evaluation.status || 'pending'; + stats.byStatus[status] = (stats.byStatus[status] || 0) + 1; + + // Count enabled/disabled + if (evaluation.enabled !== false) { + stats.enabled++; + } else { + stats.disabled++; + } + } + + return stats; + } + + /** + * Reload evaluations from disk + */ + async reload() { + return this.loadFromDirectory(this.evalsDir); + } + + /** + * Create a new evaluation programmatically + */ + createEvaluation(evaluationData) { + const evaluation = { + id: evaluationData.id || `eval-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`, + name: evaluationData.name || 'Untitled Evaluation', + description: evaluationData.description || '', + enabled: evaluationData.enabled !== false, + tool: evaluationData.tool || 'chat', + timeout: evaluationData.timeout || 45000, + input: evaluationData.input || {}, + model: this.applyModelPrecedence(evaluationData, evaluationData.model), + validation: evaluationData.validation || { type: 'none' }, + metadata: { + tags: ['programmatic'], + priority: 'medium', + ...evaluationData.metadata + }, + category: evaluationData.category || 'programmatic', + status: 'pending', + loadedAt: new Date().toISOString(), + ...evaluationData + }; + + // Validate the evaluation + this.validateEvaluation(evaluation); + + // Store the evaluation + this.evaluations.set(evaluation.id, evaluation); + + // Add to category + const category = evaluation.category; + if (!this.categories.has(category)) { + this.categories.set(category, []); + } + this.categories.get(category).push(evaluation); + + logger.info(`EvaluationLoader: Created evaluation ${evaluation.id} in category ${category}`); + return evaluation; + } + + /** + * Remove an evaluation + */ + removeEvaluation(evaluationId) { + const evaluation = this.evaluations.get(evaluationId); + if (!evaluation) { + return false; + } + + // Remove from main map + this.evaluations.delete(evaluationId); + + // Remove from category + const category = evaluation.category; + if (this.categories.has(category)) { + const categoryEvals = this.categories.get(category); + const index = categoryEvals.findIndex(e => e.id === evaluationId); + if (index !== -1) { + categoryEvals.splice(index, 1); + + // Remove category if empty + if (categoryEvals.length === 0) { + this.categories.delete(category); + } + } + } + + logger.info(`EvaluationLoader: Removed evaluation ${evaluationId}`); + return true; + } + + /** + * Update an existing evaluation + */ + updateEvaluation(evaluationId, updates) { + const evaluation = this.evaluations.get(evaluationId); + if (!evaluation) { + throw new Error(`Evaluation ${evaluationId} not found`); + } + + // Apply updates + const updatedEvaluation = { + ...evaluation, + ...updates, + id: evaluationId, // Ensure ID doesn't change + updatedAt: new Date().toISOString() + }; + + // Validate updated evaluation + this.validateEvaluation(updatedEvaluation); + + // Update in storage + this.evaluations.set(evaluationId, updatedEvaluation); + + // Update in category if category changed + if (updates.category && updates.category !== evaluation.category) { + // Remove from old category + const oldCategory = evaluation.category; + if (this.categories.has(oldCategory)) { + const oldCategoryEvals = this.categories.get(oldCategory); + const index = oldCategoryEvals.findIndex(e => e.id === evaluationId); + if (index !== -1) { + oldCategoryEvals.splice(index, 1); + if (oldCategoryEvals.length === 0) { + 
this.categories.delete(oldCategory); + } + } + } + + // Add to new category + const newCategory = updates.category; + if (!this.categories.has(newCategory)) { + this.categories.set(newCategory, []); + } + this.categories.get(newCategory).push(updatedEvaluation); + } else { + // Update existing entry in category + const category = evaluation.category; + if (this.categories.has(category)) { + const categoryEvals = this.categories.get(category); + const index = categoryEvals.findIndex(e => e.id === evaluationId); + if (index !== -1) { + categoryEvals[index] = updatedEvaluation; + } + } + } + + logger.info(`EvaluationLoader: Updated evaluation ${evaluationId}`); + return updatedEvaluation; + } +} \ No newline at end of file diff --git a/eval-server/nodejs/src/lib/EvaluationStack.js b/eval-server/nodejs/src/lib/EvaluationStack.js new file mode 100644 index 0000000..04d7b36 --- /dev/null +++ b/eval-server/nodejs/src/lib/EvaluationStack.js @@ -0,0 +1,85 @@ +// Copyright 2025 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +/** + * EvaluationStack - A simple stack-like structure for managing evaluations + * + * Provides LIFO (Last In, First Out) access to evaluation objects. + * Useful for distributing different evaluations across multiple client connections. + */ +export class EvaluationStack { + constructor() { + this.evaluations = []; + } + + /** + * Add an evaluation to the top of the stack + * @param {Object} evaluation - The evaluation object to add + */ + push(evaluation) { + if (!evaluation || typeof evaluation !== 'object') { + throw new Error('Evaluation must be a valid object'); + } + + // Validate required fields + const requiredFields = ['id', 'name', 'tool', 'input']; + for (const field of requiredFields) { + if (!evaluation[field]) { + throw new Error(`Evaluation missing required field: ${field}`); + } + } + + this.evaluations.push(evaluation); + } + + /** + * Remove and return the evaluation from the top of the stack + * @returns {Object|null} The evaluation object, or null if stack is empty + */ + pop() { + return this.evaluations.pop() || null; + } + + /** + * Check if the stack is empty + * @returns {boolean} True if stack has no evaluations + */ + isEmpty() { + return this.evaluations.length === 0; + } + + /** + * Get the number of evaluations in the stack + * @returns {number} The stack size + */ + size() { + return this.evaluations.length; + } + + /** + * Peek at the top evaluation without removing it + * @returns {Object|null} The top evaluation object, or null if stack is empty + */ + peek() { + if (this.isEmpty()) { + return null; + } + return this.evaluations[this.evaluations.length - 1]; + } + + /** + * Clear all evaluations from the stack + */ + clear() { + this.evaluations = []; + } + + /** + * Get a copy of all evaluations in the stack (top to bottom) + * @returns {Array} Array of evaluation objects + */ + toArray() { + return [...this.evaluations].reverse(); + } +} \ No newline at end of file diff --git a/eval-server/nodejs/src/lib/judges/Judge.js b/eval-server/nodejs/src/lib/judges/Judge.js new file mode 100644 index 0000000..83b0f53 --- /dev/null +++ b/eval-server/nodejs/src/lib/judges/Judge.js @@ -0,0 +1,80 @@ +// Copyright 2025 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +/** + * Judge - Abstract interface for evaluation judges + * + * A Judge is responsible for evaluating the quality of responses from LLM agents. + * Different implementations can provide different evaluation strategies. + */ +export class Judge { + /** + * Evaluate an agent response against a task + * + * @param {string} task - The original task or prompt + * @param {string} agentResponse - The response from the agent + * @param {Object} options - Additional options for evaluation + * @returns {Promise} Evaluation result with scores and feedback + */ + async evaluate(task, agentResponse, options = {}) { + throw new Error('Judge.evaluate() must be implemented by subclass'); + } + + /** + * Get the name of this judge implementation + * @returns {string} The judge name + */ + getName() { + return this.constructor.name; + } + + /** + * Get configuration schema for this judge + * @returns {Object} Configuration schema + */ + getConfigSchema() { + return {}; + } + + /** + * Validate judge configuration + * @param {Object} config - Configuration to validate + * @returns {boolean} Whether configuration is valid + */ + validateConfig(config) { + return true; + } +} + +/** + * Default evaluation result structure + */ +export const DEFAULT_EVALUATION_RESULT = { + overall_score: null, + criteria_scores: {}, + reasoning: '', + strengths: [], + weaknesses: [], + suggestions: [], + metadata: { + judge: 'unknown', + timestamp: null, + duration: null + } +}; + +/** + * Utility function to create a standardized evaluation result + */ +export function createEvaluationResult(overrides = {}) { + return { + ...DEFAULT_EVALUATION_RESULT, + ...overrides, + metadata: { + ...DEFAULT_EVALUATION_RESULT.metadata, + ...overrides.metadata, + timestamp: new Date().toISOString() + } + }; +} \ No newline at end of file diff --git a/eval-server/nodejs/src/lib/judges/LLMJudge.js b/eval-server/nodejs/src/lib/judges/LLMJudge.js new file mode 100644 index 0000000..9e4c8a5 --- /dev/null +++ b/eval-server/nodejs/src/lib/judges/LLMJudge.js @@ -0,0 +1,344 @@ +// Copyright 2025 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +import OpenAI from 'openai'; +import { Judge, createEvaluationResult } from './Judge.js'; +import { CONFIG } from '../../config.js'; +import logger from '../../logger.js'; + +/** + * LLMJudge - Uses an LLM (like GPT-4) to evaluate agent responses + * + * This is a refactored version of the original LLMEvaluator class, + * now implementing the Judge interface for better modularity. 
+ */ +export class LLMJudge extends Judge { + constructor(config = {}) { + super(); + + this.config = { + apiKey: config.apiKey || CONFIG.llm.apiKey, + model: config.model || CONFIG.llm.model, + temperature: config.temperature || CONFIG.llm.temperature, + maxTokens: config.maxTokens || 1000, + ...config + }; + + if (!this.config.apiKey) { + throw new Error('OpenAI API key is required for LLMJudge'); + } + + this.openai = new OpenAI({ + apiKey: this.config.apiKey + }); + } + + /** + * Evaluate an agent response using an LLM + */ + async evaluate(task, agentResponse, options = {}) { + const startTime = Date.now(); + + try { + // Merge options with default config + const evalConfig = { + criteria: [], + model: this.config.model, + temperature: this.config.temperature, + ...options + }; + + const prompt = this.buildEvaluationPrompt(task, agentResponse, evalConfig); + + const completion = await this.openai.chat.completions.create({ + model: evalConfig.model, + messages: [ + { + role: 'system', + content: 'You are an expert evaluator of AI agent responses. Provide objective, detailed evaluations in the requested JSON format.' + }, + { + role: 'user', + content: prompt + } + ], + temperature: evalConfig.temperature, + max_tokens: this.config.maxTokens + }); + + const evaluation = completion.choices[0].message.content; + const usage = completion.usage; + const duration = Date.now() - startTime; + + logger.info('LLMJudge: Evaluation completed', { + tokens_used: usage.total_tokens, + model: evalConfig.model, + duration + }); + + const result = this.parseEvaluation(evaluation); + + // Add metadata + result.metadata = { + judge: this.getName(), + model: evalConfig.model, + timestamp: new Date().toISOString(), + duration, + tokens_used: usage.total_tokens, + criteria: evalConfig.criteria + }; + + return result; + + } catch (error) { + logger.error('LLMJudge: Evaluation failed', { error: error.message }); + + return createEvaluationResult({ + overall_score: 0, + reasoning: `Evaluation failed: ${error.message}`, + metadata: { + judge: this.getName(), + timestamp: new Date().toISOString(), + duration: Date.now() - startTime, + error: error.message + } + }); + } + } + + /** + * Build the evaluation prompt + */ + buildEvaluationPrompt(task, agentResponse, config) { + const { criteria } = config; + + let prompt = `Please evaluate the following AI agent response to a given task. + +TASK: +${task} + +AGENT RESPONSE: +${agentResponse} + +Please evaluate the response on the following criteria and provide a JSON response: + +`; + + // Use custom criteria if provided, otherwise use default criteria + if (criteria && criteria.length > 0) { + criteria.forEach((criterion, index) => { + prompt += `${index + 1}. **${criterion}**: Evaluate how well the response meets this criterion\n`; + }); + } else { + prompt += `1. **Correctness**: Is the response factually accurate and correct? +2. **Completeness**: Does the response fully address the task? +3. **Clarity**: Is the response clear and well-structured? +4. **Relevance**: Is the response relevant to the task? +5. **Helpfulness**: How helpful is the response to the user? 
+`; + } + + prompt += ` +Provide your evaluation in the following JSON format: +{ + "overall_score": , + "criteria_scores": {`; + + if (criteria && criteria.length > 0) { + criteria.forEach((criterion, index) => { + const key = criterion.toLowerCase().replace(/[^a-z0-9]/g, '_'); + prompt += `\n "${key}": `; + if (index < criteria.length - 1) prompt += ','; + }); + } else { + prompt += ` + "correctness": , + "completeness": , + "clarity": , + "relevance": , + "helpfulness": `; + } + + prompt += ` + }, + "reasoning": "", + "strengths": [""], + "weaknesses": [""], + "suggestions": [""] +}`; + + return prompt; + } + + /** + * Parse the LLM evaluation response + */ + parseEvaluation(evaluationText) { + try { + // Try to extract JSON from the response + const jsonMatch = evaluationText.match(/\{[\s\S]*\}/); + if (jsonMatch) { + const parsedResult = JSON.parse(jsonMatch[0]); + + // Validate and normalize the result + return createEvaluationResult({ + overall_score: this.normalizeScore(parsedResult.overall_score), + criteria_scores: this.normalizeCriteriaScores(parsedResult.criteria_scores || {}), + reasoning: parsedResult.reasoning || '', + strengths: Array.isArray(parsedResult.strengths) ? parsedResult.strengths : [], + weaknesses: Array.isArray(parsedResult.weaknesses) ? parsedResult.weaknesses : [], + suggestions: Array.isArray(parsedResult.suggestions) ? parsedResult.suggestions : [], + raw_evaluation: evaluationText + }); + } + + // If no JSON found, return a structured response with the raw text + return createEvaluationResult({ + overall_score: null, + criteria_scores: {}, + reasoning: evaluationText, + strengths: [], + weaknesses: [], + suggestions: [], + raw_evaluation: evaluationText + }); + + } catch (error) { + logger.warn('LLMJudge: Failed to parse evaluation JSON', { error: error.message }); + + return createEvaluationResult({ + overall_score: null, + criteria_scores: {}, + reasoning: evaluationText, + strengths: [], + weaknesses: [], + suggestions: [], + raw_evaluation: evaluationText, + parse_error: error.message + }); + } + } + + /** + * Normalize score to be between 0 and 10 + */ + normalizeScore(score) { + if (typeof score !== 'number' || isNaN(score)) { + return null; + } + + // Clamp score between 0 and 10 + return Math.max(0, Math.min(10, score)); + } + + /** + * Normalize criteria scores + */ + normalizeCriteriaScores(scores) { + const normalized = {}; + + for (const [criterion, score] of Object.entries(scores)) { + normalized[criterion] = this.normalizeScore(score); + } + + return normalized; + } + + /** + * Get configuration schema + */ + getConfigSchema() { + return { + type: 'object', + properties: { + apiKey: { + type: 'string', + description: 'OpenAI API key' + }, + model: { + type: 'string', + description: 'OpenAI model to use for evaluation', + default: 'gpt-4' + }, + temperature: { + type: 'number', + description: 'Temperature for LLM generation', + minimum: 0, + maximum: 2, + default: 0.1 + }, + maxTokens: { + type: 'number', + description: 'Maximum tokens for evaluation response', + minimum: 100, + maximum: 4000, + default: 1000 + } + }, + required: ['apiKey'] + }; + } + + /** + * Validate configuration + */ + validateConfig(config) { + if (!config.apiKey) { + throw new Error('LLMJudge requires an API key'); + } + + if (config.temperature !== undefined) { + if (typeof config.temperature !== 'number' || config.temperature < 0 || config.temperature > 2) { + throw new Error('Temperature must be a number between 0 and 2'); + } + } + + if (config.maxTokens !== 
undefined) { + if (typeof config.maxTokens !== 'number' || config.maxTokens < 100 || config.maxTokens > 4000) { + throw new Error('maxTokens must be a number between 100 and 4000'); + } + } + + return true; + } + + /** + * Get available OpenAI models for evaluation + */ + async getAvailableModels() { + try { + const models = await this.openai.models.list(); + return models.data + .filter(model => model.id.includes('gpt')) + .map(model => model.id) + .sort(); + } catch (error) { + logger.error('LLMJudge: Failed to fetch available models', { error: error.message }); + return ['gpt-4', 'gpt-3.5-turbo']; // Fallback list + } + } + + /** + * Test the judge with a simple evaluation + */ + async test() { + const testTask = 'Summarize the main points of artificial intelligence'; + const testResponse = 'AI is a technology that enables machines to perform tasks that typically require human intelligence, such as learning, reasoning, and problem-solving.'; + + try { + const result = await this.evaluate(testTask, testResponse); + return { + success: true, + result, + message: 'LLMJudge test completed successfully' + }; + } catch (error) { + return { + success: false, + error: error.message, + message: 'LLMJudge test failed' + }; + } + } +} \ No newline at end of file diff --git a/scripts/cleanup-chromium-locks.sh b/scripts/cleanup-chromium-locks.sh new file mode 100644 index 0000000..6dd678a --- /dev/null +++ b/scripts/cleanup-chromium-locks.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Cleanup Chromium lock files before starting services +# This prevents "profile in use" errors after container restarts + +set -e + +echo "๐Ÿงน Cleaning up Chromium lock files..." + +# Remove lock files from persistent data directory +rm -f /data/user-data/SingletonLock \ + /data/user-data/SingletonSocket \ + /data/user-data/SingletonCookie \ + 2>/dev/null || true + +# Remove X11 lock files +rm -f /tmp/.X*-lock 2>/dev/null || true + +echo "โœ… Chromium lock cleanup complete" diff --git a/scripts/init-container.sh b/scripts/init-container.sh new file mode 100644 index 0000000..9bfb3cf --- /dev/null +++ b/scripts/init-container.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Container initialization script +# Runs before services start to clean up stale lock files + +set +e # Don't exit on errors + +echo "๐Ÿ”ง [init] Running container initialization..." + +# Clean up Chromium lock files from persistent data directory +# These prevent "profile in use" errors after container restarts +if [ -d "/data/user-data" ]; then + echo "๐Ÿงน [init] Cleaning Chromium profile locks..." + rm -f /data/user-data/SingletonLock \ + /data/user-data/SingletonSocket \ + /data/user-data/SingletonCookie \ + 2>/dev/null || true +fi + +# Clean up X11 lock files +# These prevent "Server is already active for display" errors +if [ -d "/tmp" ]; then + echo "๐Ÿงน [init] Cleaning X11 lock files..." + rm -f /tmp/.X*-lock 2>/dev/null || true +fi + +echo "โœ… [init] Container initialization complete" +exit 0 diff --git a/scripts/wrapper-with-cleanup.sh b/scripts/wrapper-with-cleanup.sh new file mode 100644 index 0000000..efab2e1 --- /dev/null +++ b/scripts/wrapper-with-cleanup.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Wrapper script extension that adds Chromium lock cleanup +# This script will be injected into the Docker image + +# Add this right after supervisord starts +cleanup_chromium_locks() { + echo "[wrapper] ๐Ÿงน Cleaning up Chromium lock files..." 
+ + # Remove Chromium profile locks from persistent data directory + rm -f /data/user-data/SingletonLock \ + /data/user-data/SingletonSocket \ + /data/user-data/SingletonCookie \ + 2>/dev/null || true + + # Remove X11 lock files from /tmp + rm -f /tmp/.X*-lock 2>/dev/null || true + + echo "[wrapper] โœ… Chromium lock cleanup complete" +} + +# Export the function so it can be called from the main wrapper +export -f cleanup_chromium_locks diff --git a/supervisor/services/eval-server.conf b/supervisor/services/eval-server.conf index 530f00d..c35a1f0 100644 --- a/supervisor/services/eval-server.conf +++ b/supervisor/services/eval-server.conf @@ -4,5 +4,5 @@ autostart=true autorestart=true stdout_logfile=/var/log/supervisor/eval-server.log stderr_logfile=/var/log/supervisor/eval-server.error.log -environment=NODE_ENV="production" +environment=NODE_ENV="production",PORT="8082",API_PORT="8080",HOST="0.0.0.0",CDP_PORT="9223" priority=30 \ No newline at end of file From dc584343eb2ca678f1d3d5699bc8d5546f31c809 Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Tue, 21 Oct 2025 12:36:50 -0500 Subject: [PATCH 17/24] Adjust UDP ranges --- docker-compose.yml | 4 ++-- run-local.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 1841cf0..14b286a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -26,7 +26,7 @@ services: # Eval Server WebSocket - "8082:8082" # WebRTC UDP port range for local development - - "56000-56100:56000-56100/udp" + - "57000-57100:57000-57100/udp" environment: # Display settings - DISPLAY_NUM=1 @@ -34,7 +34,7 @@ services: - WIDTH=1024 # WebRTC settings - ENABLE_WEBRTC=true - - NEKO_WEBRTC_EPR=56000-56100 + - NEKO_WEBRTC_EPR=57000-57100 - NEKO_WEBRTC_NAT1TO1=127.0.0.1 # Run as kernel user (not root) - RUN_AS_ROOT=false diff --git a/run-local.sh b/run-local.sh index 1452635..b474a5a 100755 --- a/run-local.sh +++ b/run-local.sh @@ -141,9 +141,9 @@ if [[ "${ENABLE_WEBRTC:-}" == "true" ]]; then if [[ -n "${NEKO_ICESERVERS:-}" ]]; then RUN_ARGS+=( -e NEKO_ICESERVERS="$NEKO_ICESERVERS" ) else - RUN_ARGS+=( -e NEKO_WEBRTC_EPR=56000-56100 ) + RUN_ARGS+=( -e NEKO_WEBRTC_EPR=57000-57100 ) RUN_ARGS+=( -e NEKO_WEBRTC_NAT1TO1=127.0.0.1 ) - RUN_ARGS+=( -p 56000-56100:56000-56100/udp ) + RUN_ARGS+=( -p 57000-57100:57000-57100/udp ) fi fi From 6b1baa49941d37de021277e3b501ff40743b072f Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Tue, 21 Oct 2025 16:09:43 -0500 Subject: [PATCH 18/24] Remove unused code from eval-server --- eval-server/nodejs/src/api-server.js | 94 +--- eval-server/nodejs/src/client-manager.js | 176 +------ eval-server/nodejs/src/lib/EvalServer.js | 55 +-- .../nodejs/src/lib/EvaluationLoader.js | 448 ------------------ run-local.sh | 16 +- supervisor/services/chromium.conf | 2 +- 6 files changed, 15 insertions(+), 776 deletions(-) delete mode 100644 eval-server/nodejs/src/lib/EvaluationLoader.js diff --git a/eval-server/nodejs/src/api-server.js b/eval-server/nodejs/src/api-server.js index fd3b13d..6e72fa7 100644 --- a/eval-server/nodejs/src/api-server.js +++ b/eval-server/nodejs/src/api-server.js @@ -92,11 +92,8 @@ class APIServer { let result; - // Handle dynamic client evaluations route - if (pathname.startsWith('/clients/') && pathname.endsWith('/evaluations')) { - const clientId = pathname.split('/')[2]; - result = this.getClientEvaluations(clientId); - } else if (pathname.startsWith('/clients/') && pathname.endsWith('/tabs')) { + // Handle dynamic client tabs route + if 
(pathname.startsWith('/clients/') && pathname.endsWith('/tabs')) { // Handle dynamic client tabs route const clientId = pathname.split('/')[2]; result = this.getClientTabsById(clientId); @@ -110,14 +107,6 @@ class APIServer { result = this.getClients(); break; - case '/evaluate': - if (method !== 'POST') { - this.sendError(res, 405, 'Method not allowed'); - return; - } - result = await this.triggerEvaluation(JSON.parse(body)); - break; - case '/tabs/open': if (method !== 'POST') { this.sendError(res, 405, 'Method not allowed'); @@ -211,27 +200,6 @@ class APIServer { }); } - getClientEvaluations(clientId) { - if (!clientId) { - throw new Error('Client ID is required'); - } - - const evaluations = this.evaluationServer.getClientManager().getClientEvaluations(clientId); - return { - clientId, - evaluations: evaluations.map(evaluation => ({ - id: evaluation.id, - name: evaluation.name, - description: evaluation.description, - tool: evaluation.tool, - status: evaluation.status || 'pending', - enabled: evaluation.enabled !== false, - lastRun: evaluation.lastRun, - lastResult: evaluation.lastResult - })) - }; - } - getClientTabsById(clientId) { if (!clientId) { throw new Error('Client ID is required'); @@ -260,64 +228,6 @@ class APIServer { }; } - async triggerEvaluation(payload) { - const { clientId, evaluationId, runAll = false } = payload; - - if (!clientId) { - throw new Error('Client ID is required'); - } - - // Check if client is connected - const connection = this.evaluationServer.connectedClients.get(clientId); - if (!connection || !connection.ready) { - throw new Error(`Client '${clientId}' is not connected or not ready`); - } - - if (runAll) { - // Run all evaluations for the client - const evaluations = this.evaluationServer.getClientManager().getClientEvaluations(clientId); - const results = []; - - for (const evaluation of evaluations) { - try { - this.evaluationServer.getClientManager().updateEvaluationStatus(clientId, evaluation.id, 'pending'); - await this.evaluationServer.executeEvaluation(connection, evaluation); - results.push({ id: evaluation.id, status: 'completed' }); - } catch (error) { - results.push({ id: evaluation.id, status: 'failed', error: error.message }); - } - } - - return { - clientId, - type: 'batch', - results - }; - } - // Run specific evaluation - if (!evaluationId) { - throw new Error('Evaluation ID is required when runAll is false'); - } - - const evaluation = this.evaluationServer.getClientManager().getClientEvaluations(clientId) - .find(e => e.id === evaluationId); - - if (!evaluation) { - throw new Error(`Evaluation '${evaluationId}' not found for client '${clientId}'`); - } - - this.evaluationServer.getClientManager().updateEvaluationStatus(clientId, evaluationId, 'pending'); - await this.evaluationServer.executeEvaluation(connection, evaluation); - - return { - clientId, - evaluationId, - type: 'single', - status: 'completed' - }; - - } - async openTab(payload) { const { clientId, url = 'about:blank', background = false } = payload; diff --git a/eval-server/nodejs/src/client-manager.js b/eval-server/nodejs/src/client-manager.js index d21b88d..78f4c60 100644 --- a/eval-server/nodejs/src/client-manager.js +++ b/eval-server/nodejs/src/client-manager.js @@ -5,74 +5,17 @@ import { v4 as uuidv4 } from 'uuid'; import logger from './logger.js'; class ClientManager { - constructor(clientsDir = './clients', evalsDir = './evals') { + constructor(clientsDir = './clients') { this.clientsDir = path.resolve(clientsDir); - this.evalsDir = path.resolve(evalsDir); 
this.clients = new Map(); - this.evaluations = new Map(); // clientId -> evaluations array - this.configDefaults = null; // Config.yaml defaults for model precedence this.activeTabs = new Map(); // clientId -> Set of { tabId, connection, metadata } - + // Ensure directories exist if (!fs.existsSync(this.clientsDir)) { fs.mkdirSync(this.clientsDir, { recursive: true }); } - if (!fs.existsSync(this.evalsDir)) { - fs.mkdirSync(this.evalsDir, { recursive: true }); - } - - this.loadConfigDefaults(); - this.loadAllClients(); - this.loadAllEvaluations(); - } - - /** - * Load default model configuration from config.yaml - */ - loadConfigDefaults() { - try { - const configPath = path.resolve(this.evalsDir, 'config.yaml'); - if (fs.existsSync(configPath)) { - const configContent = fs.readFileSync(configPath, 'utf8'); - this.configDefaults = yaml.load(configContent); - logger.info('Loaded config.yaml defaults:', this.configDefaults); - } else { - // Don't warn about missing config.yaml - it's optional - this.configDefaults = null; - } - } catch (error) { - logger.error('Failed to load config.yaml:', error); - this.configDefaults = null; - } - } - - /** - * Apply model precedence: API calls OR test YAML models override config.yaml fallback - * Precedence logic: - * 1. API calls OR individual test YAML models (highest priority - either overrides everything) - * 2. config.yaml defaults (fallback only when neither API nor test YAML specify models) - * @param {Object} evaluation - Evaluation object with optional model configuration - * @param {import('../types/model-config').ModelConfig} apiModelOverride - Optional API model override - * @returns {import('../types/model-config').ModelConfig} Final model configuration - */ - applyModelPrecedence(evaluation, apiModelOverride = null) { - // Check if API override is provided - if (apiModelOverride) { - // API model override takes precedence over everything - // Ensure nested format is used - return apiModelOverride; - } - - // Check if evaluation has its own model config from YAML - const testModel = evaluation.model; - if (testModel && Object.keys(testModel).length > 0) { - // Test YAML model takes precedence - // Ensure nested format is returned - return testModel; - } - // Neither API nor test YAML specified models, use config.yaml defaults only - return this.configDefaults?.model || {}; + this.loadAllClients(); } /** @@ -130,80 +73,11 @@ class ClientManager { settings: config.settings || {}, yamlPath }); - - // Note: Evaluations are now loaded separately from the evals directory - // Initialize empty evaluations array for this client - if (!this.evaluations.has(clientId)) { - this.evaluations.set(clientId, []); - } - + logger.info(`Loaded client ${clientId}`); return config; } - /** - * Load all evaluations from the evals directory structure - */ - loadAllEvaluations() { - try { - // Clear existing evaluations to prevent duplicates on reload - this.evaluations.clear(); - - // Find all category directories - const categories = fs.readdirSync(this.evalsDir) - .filter(dir => fs.statSync(path.join(this.evalsDir, dir)).isDirectory()); - - let totalEvaluations = 0; - - for (const category of categories) { - const categoryDir = path.join(this.evalsDir, category); - const evalFiles = fs.readdirSync(categoryDir) - .filter(f => f.endsWith('.yaml') || f.endsWith('.yml')); - - for (const file of evalFiles) { - try { - const evalPath = path.join(categoryDir, file); - const yamlContent = fs.readFileSync(evalPath, 'utf8'); - const evaluation = yaml.load(yamlContent); - 
- if (evaluation.enabled !== false) { - // Apply model precedence: config.yaml overrides individual test models - const resolvedModel = this.applyModelPrecedence(evaluation); - - // Add evaluation to all clients for now - // In the future, you might want to have client-specific evaluation assignments - for (const [clientId] of this.clients) { - const clientEvals = this.evaluations.get(clientId) || []; - clientEvals.push({ - ...evaluation, - model: resolvedModel, // Use resolved model with precedence applied - clientId, - status: 'pending', - category, - filePath: evalPath - }); - this.evaluations.set(clientId, clientEvals); - } - totalEvaluations++; - } - } catch (error) { - logger.error(`Failed to load evaluation ${file}:`, error); - } - } - } - - // Update the client evaluation counts - for (const [clientId] of this.clients) { - const evalCount = this.evaluations.get(clientId)?.length || 0; - logger.info(`Loaded client ${clientId} with ${evalCount} evaluations`); - } - - logger.info(`Loaded ${totalEvaluations} evaluations from ${categories.length} categories`); - } catch (error) { - logger.error('Failed to load evaluations:', error); - } - } - /** * Register a new client with authentication */ @@ -225,8 +99,7 @@ class ClientManager { return { success: true, - clientName: client.name, - evaluationsCount: this.evaluations.get(clientId)?.length || 0 + clientName: client.name }; } @@ -237,38 +110,6 @@ class ClientManager { return this.clients.get(clientId); } - /** - * Get evaluations for a client - */ - getClientEvaluations(clientId) { - return this.evaluations.get(clientId) || []; - } - - /** - * Get next pending evaluation for a client - */ - getNextEvaluation(clientId) { - const evaluations = this.evaluations.get(clientId) || []; - return evaluations.find(e => e.status === 'pending'); - } - - /** - * Update evaluation status - */ - updateEvaluationStatus(clientId, evaluationId, status, result = null) { - const evaluations = this.evaluations.get(clientId); - if (!evaluations) return; - - const evaluation = evaluations.find(e => e.id === evaluationId); - if (evaluation) { - evaluation.status = status; - evaluation.lastRun = new Date().toISOString(); - if (result) { - evaluation.lastResult = result; - } - } - } - /** * Create a new client with default configuration */ @@ -305,13 +146,10 @@ class ClientManager { // Write YAML file const yamlContent = yaml.dump(defaultConfig, { indent: 2 }); fs.writeFileSync(yamlPath, yamlContent); - + // Load the new client this.loadClient(clientId); - - // Load evaluations for the new client - this.loadAllEvaluations(); - + logger.info(`Created new client: ${clientId}`); return { clientId, yamlPath }; } diff --git a/eval-server/nodejs/src/lib/EvalServer.js b/eval-server/nodejs/src/lib/EvalServer.js index 34db421..7c87249 100644 --- a/eval-server/nodejs/src/lib/EvalServer.js +++ b/eval-server/nodejs/src/lib/EvalServer.js @@ -10,7 +10,6 @@ import { ClientManager } from '../client-manager.js'; import { CONFIG, validateConfig } from '../config.js'; import logger, { logConnection, logEvaluation } from '../logger.js'; import { RpcClient } from '../rpc-client.js'; -import { EvaluationLoader } from './EvaluationLoader.js'; /** * EvalServer - A library for programmatically managing evaluation servers @@ -51,14 +50,12 @@ export class EvalServer extends EventEmitter { port: options.port || CONFIG.server.port, authKey: options.authKey || null, clientsDir: options.clientsDir || './clients', - evalsDir: options.evalsDir || './evals', ...options }; - + // Internal state 
this.connectedClients = new Map(); - this.clientManager = new ClientManager(this.config.clientsDir, this.config.evalsDir); - this.evaluationLoader = new EvaluationLoader(this.config.evalsDir); + this.clientManager = new ClientManager(this.config.clientsDir); this.judge = null; // Judge is optional - can be set later this.wss = null; this.isRunning = false; @@ -182,20 +179,6 @@ export class EvalServer extends EventEmitter { }; } - /** - * Load evaluations from YAML files - */ - async loadEvaluations(evalsDir = './evals') { - return this.evaluationLoader.loadFromDirectory(evalsDir); - } - - /** - * Get all available evaluations - */ - getEvaluations() { - return this.evaluationLoader.getAllEvaluations(); - } - /** * Get the client manager instance */ @@ -630,12 +613,6 @@ export class EvalServer extends EventEmitter { progress, message }); - - this.clientManager.updateEvaluationStatus( - connection.clientId, - evaluationId, - status - ); } /** @@ -709,13 +686,6 @@ export class EvalServer extends EventEmitter { tool: evaluation.tool }); - // Update status to running - this.clientManager.updateEvaluationStatus( - connection.clientId, - evaluation.id, - 'running' - ); - // Prepare model configuration - use client config if available, otherwise evaluation config, otherwise defaults let modelConfig = evaluation.model || {}; @@ -773,17 +743,6 @@ export class EvalServer extends EventEmitter { evaluation.timeout || 45000 ); - // Update evaluation status - this.clientManager.updateEvaluationStatus( - connection.clientId, - evaluation.id, - 'completed', - { - response, - duration: Date.now() - startTime - } - ); - // Log evaluation logEvaluation({ evaluationId: evaluation.id, @@ -804,16 +763,6 @@ export class EvalServer extends EventEmitter { error: error.message }); - this.clientManager.updateEvaluationStatus( - connection.clientId, - evaluation.id, - 'failed', - { - error: error.message, - duration: Date.now() - startTime - } - ); - throw error; } } diff --git a/eval-server/nodejs/src/lib/EvaluationLoader.js b/eval-server/nodejs/src/lib/EvaluationLoader.js deleted file mode 100644 index 8f85459..0000000 --- a/eval-server/nodejs/src/lib/EvaluationLoader.js +++ /dev/null @@ -1,448 +0,0 @@ -// Copyright 2025 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
- -import fs from 'fs'; -import path from 'path'; -import yaml from 'js-yaml'; -import logger from '../logger.js'; - -/** - * EvaluationLoader - Handles loading and managing evaluations from YAML files - * - * Example usage: - * ```js - * const loader = new EvaluationLoader('./evals'); - * await loader.loadFromDirectory('./evals'); - * - * const evaluations = loader.getAllEvaluations(); - * const filtered = loader.getEvaluationsByCategory('action-agent'); - * const specific = loader.getEvaluationById('a11y-001'); - * ``` - */ -export class EvaluationLoader { - constructor(evalsDir = './evals') { - this.evalsDir = path.resolve(evalsDir); - this.evaluations = new Map(); // evaluationId -> evaluation - this.categories = new Map(); // category -> evaluations[] - this.configDefaults = null; - - // Ensure directory exists - if (!fs.existsSync(this.evalsDir)) { - fs.mkdirSync(this.evalsDir, { recursive: true }); - } - - this.loadConfigDefaults(); - } - - /** - * Load default model configuration from config.yaml - */ - loadConfigDefaults() { - try { - const configPath = path.resolve(this.evalsDir, 'config.yaml'); - if (fs.existsSync(configPath)) { - const configContent = fs.readFileSync(configPath, 'utf8'); - this.configDefaults = yaml.load(configContent); - logger.info('EvaluationLoader: Loaded config.yaml defaults', this.configDefaults); - } else { - // Don't warn about missing config.yaml - it's optional - this.configDefaults = null; - } - } catch (error) { - logger.error('EvaluationLoader: Failed to load config.yaml:', error); - this.configDefaults = null; - } - } - - /** - * Apply model precedence logic - * API calls OR test YAML models override config.yaml fallback - */ - applyModelPrecedence(evaluation, apiModelOverride = null) { - if (apiModelOverride) { - return { - ...(this.configDefaults?.model || {}), - ...apiModelOverride - }; - } - - const testModel = evaluation.model; - if (testModel && Object.keys(testModel).length > 0) { - return { - ...(this.configDefaults?.model || {}), - ...testModel - }; - } - - return this.configDefaults?.model || {}; - } - - /** - * Load all evaluations from the specified directory - */ - async loadFromDirectory(evalsDir = this.evalsDir) { - try { - this.evalsDir = path.resolve(evalsDir); - - // Clear existing evaluations - this.evaluations.clear(); - this.categories.clear(); - - // Reload config defaults - this.loadConfigDefaults(); - - // Find all category directories - const categories = fs.readdirSync(this.evalsDir) - .filter(dir => { - const fullPath = path.join(this.evalsDir, dir); - return fs.statSync(fullPath).isDirectory(); - }); - - let totalEvaluations = 0; - - for (const category of categories) { - const categoryDir = path.join(this.evalsDir, category); - const evalFiles = fs.readdirSync(categoryDir) - .filter(f => f.endsWith('.yaml') || f.endsWith('.yml')); - - const categoryEvaluations = []; - - for (const file of evalFiles) { - try { - const evalPath = path.join(categoryDir, file); - const evaluation = await this.loadEvaluationFile(evalPath, category); - - if (evaluation && evaluation.enabled !== false) { - this.evaluations.set(evaluation.id, evaluation); - categoryEvaluations.push(evaluation); - totalEvaluations++; - } - } catch (error) { - logger.error(`EvaluationLoader: Failed to load evaluation ${file}:`, error); - } - } - - if (categoryEvaluations.length > 0) { - this.categories.set(category, categoryEvaluations); - } - } - - logger.info(`EvaluationLoader: Loaded ${totalEvaluations} evaluations from ${categories.length} categories`); - 
return { totalEvaluations, categories: categories.length }; - - } catch (error) { - logger.error('EvaluationLoader: Failed to load evaluations:', error); - throw error; - } - } - - /** - * Load a specific evaluation file - */ - async loadEvaluationFile(filePath, category) { - try { - const yamlContent = fs.readFileSync(filePath, 'utf8'); - const evaluation = yaml.load(yamlContent); - - if (!evaluation || !evaluation.id) { - throw new Error('Evaluation must have an id field'); - } - - // Apply model precedence - const resolvedModel = this.applyModelPrecedence(evaluation); - - // Enhance evaluation with metadata - const enhancedEvaluation = { - ...evaluation, - model: resolvedModel, - category, - filePath, - status: 'pending', - loadedAt: new Date().toISOString() - }; - - // Validate required fields - this.validateEvaluation(enhancedEvaluation); - - return enhancedEvaluation; - - } catch (error) { - logger.error(`EvaluationLoader: Failed to load evaluation file ${filePath}:`, error); - throw error; - } - } - - /** - * Validate evaluation structure - */ - validateEvaluation(evaluation) { - const required = ['id', 'name', 'tool']; - - for (const field of required) { - if (!evaluation[field]) { - throw new Error(`Evaluation missing required field: ${field}`); - } - } - - // Validate tool is supported - const supportedTools = [ - 'action_agent', - 'research_agent', - 'schema_extractor', - 'streamlined_schema_extractor', - 'screenshot_verification', - 'web_task_agent', - 'chat' - ]; - - if (!supportedTools.includes(evaluation.tool)) { - logger.warn(`EvaluationLoader: Unknown tool type: ${evaluation.tool}`); - } - - return true; - } - - /** - * Get all loaded evaluations - */ - getAllEvaluations() { - return Array.from(this.evaluations.values()); - } - - /** - * Get evaluations by category - */ - getEvaluationsByCategory(category) { - return this.categories.get(category) || []; - } - - /** - * Get all available categories - */ - getCategories() { - return Array.from(this.categories.keys()); - } - - /** - * Get evaluation by ID - */ - getEvaluationById(evaluationId) { - return this.evaluations.get(evaluationId); - } - - /** - * Filter evaluations by criteria - */ - filterEvaluations(criteria = {}) { - let evaluations = this.getAllEvaluations(); - - // Filter by category - if (criteria.category) { - evaluations = evaluations.filter(e => e.category === criteria.category); - } - - // Filter by tool - if (criteria.tool) { - evaluations = evaluations.filter(e => e.tool === criteria.tool); - } - - // Filter by tags - if (criteria.tags && criteria.tags.length > 0) { - evaluations = evaluations.filter(e => { - const evalTags = e.metadata?.tags || []; - return criteria.tags.some(tag => evalTags.includes(tag)); - }); - } - - // Filter by enabled status - if (criteria.enabled !== undefined) { - evaluations = evaluations.filter(e => e.enabled === criteria.enabled); - } - - // Filter by priority - if (criteria.priority) { - evaluations = evaluations.filter(e => e.metadata?.priority === criteria.priority); - } - - return evaluations; - } - - /** - * Get evaluation statistics - */ - getStatistics() { - const evaluations = this.getAllEvaluations(); - const stats = { - total: evaluations.length, - byCategory: {}, - byTool: {}, - byStatus: {}, - enabled: 0, - disabled: 0 - }; - - for (const evaluation of evaluations) { - // Count by category - const category = evaluation.category; - stats.byCategory[category] = (stats.byCategory[category] || 0) + 1; - - // Count by tool - const tool = evaluation.tool; - 
stats.byTool[tool] = (stats.byTool[tool] || 0) + 1; - - // Count by status - const status = evaluation.status || 'pending'; - stats.byStatus[status] = (stats.byStatus[status] || 0) + 1; - - // Count enabled/disabled - if (evaluation.enabled !== false) { - stats.enabled++; - } else { - stats.disabled++; - } - } - - return stats; - } - - /** - * Reload evaluations from disk - */ - async reload() { - return this.loadFromDirectory(this.evalsDir); - } - - /** - * Create a new evaluation programmatically - */ - createEvaluation(evaluationData) { - const evaluation = { - id: evaluationData.id || `eval-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`, - name: evaluationData.name || 'Untitled Evaluation', - description: evaluationData.description || '', - enabled: evaluationData.enabled !== false, - tool: evaluationData.tool || 'chat', - timeout: evaluationData.timeout || 45000, - input: evaluationData.input || {}, - model: this.applyModelPrecedence(evaluationData, evaluationData.model), - validation: evaluationData.validation || { type: 'none' }, - metadata: { - tags: ['programmatic'], - priority: 'medium', - ...evaluationData.metadata - }, - category: evaluationData.category || 'programmatic', - status: 'pending', - loadedAt: new Date().toISOString(), - ...evaluationData - }; - - // Validate the evaluation - this.validateEvaluation(evaluation); - - // Store the evaluation - this.evaluations.set(evaluation.id, evaluation); - - // Add to category - const category = evaluation.category; - if (!this.categories.has(category)) { - this.categories.set(category, []); - } - this.categories.get(category).push(evaluation); - - logger.info(`EvaluationLoader: Created evaluation ${evaluation.id} in category ${category}`); - return evaluation; - } - - /** - * Remove an evaluation - */ - removeEvaluation(evaluationId) { - const evaluation = this.evaluations.get(evaluationId); - if (!evaluation) { - return false; - } - - // Remove from main map - this.evaluations.delete(evaluationId); - - // Remove from category - const category = evaluation.category; - if (this.categories.has(category)) { - const categoryEvals = this.categories.get(category); - const index = categoryEvals.findIndex(e => e.id === evaluationId); - if (index !== -1) { - categoryEvals.splice(index, 1); - - // Remove category if empty - if (categoryEvals.length === 0) { - this.categories.delete(category); - } - } - } - - logger.info(`EvaluationLoader: Removed evaluation ${evaluationId}`); - return true; - } - - /** - * Update an existing evaluation - */ - updateEvaluation(evaluationId, updates) { - const evaluation = this.evaluations.get(evaluationId); - if (!evaluation) { - throw new Error(`Evaluation ${evaluationId} not found`); - } - - // Apply updates - const updatedEvaluation = { - ...evaluation, - ...updates, - id: evaluationId, // Ensure ID doesn't change - updatedAt: new Date().toISOString() - }; - - // Validate updated evaluation - this.validateEvaluation(updatedEvaluation); - - // Update in storage - this.evaluations.set(evaluationId, updatedEvaluation); - - // Update in category if category changed - if (updates.category && updates.category !== evaluation.category) { - // Remove from old category - const oldCategory = evaluation.category; - if (this.categories.has(oldCategory)) { - const oldCategoryEvals = this.categories.get(oldCategory); - const index = oldCategoryEvals.findIndex(e => e.id === evaluationId); - if (index !== -1) { - oldCategoryEvals.splice(index, 1); - if (oldCategoryEvals.length === 0) { - 
this.categories.delete(oldCategory); - } - } - } - - // Add to new category - const newCategory = updates.category; - if (!this.categories.has(newCategory)) { - this.categories.set(newCategory, []); - } - this.categories.get(newCategory).push(updatedEvaluation); - } else { - // Update existing entry in category - const category = evaluation.category; - if (this.categories.has(category)) { - const categoryEvals = this.categories.get(category); - const index = categoryEvals.findIndex(e => e.id === evaluationId); - if (index !== -1) { - categoryEvals[index] = updatedEvaluation; - } - } - } - - logger.info(`EvaluationLoader: Updated evaluation ${evaluationId}`); - return updatedEvaluation; - } -} \ No newline at end of file diff --git a/run-local.sh b/run-local.sh index b474a5a..d369bd4 100755 --- a/run-local.sh +++ b/run-local.sh @@ -37,7 +37,7 @@ export UKC_METRO="dummy-metro-for-local-run" # Local-friendly Chrome flags (less restrictive than cloud) + custom DevTools frontend -export CHROMIUM_FLAGS="--user-data-dir=/data/user-data --disable-dev-shm-usage --start-maximized --remote-allow-origins=* --no-sandbox --disable-setuid-sandbox --custom-devtools-frontend=http://localhost:8001/" +export CHROMIUM_FLAGS="--user-data-dir=/data/user-data --disable-dev-shm-usage --start-maximized --remote-allow-origins=* --no-sandbox --disable-setuid-sandbox --custom-devtools-frontend=http://localhost:8001/ --auto-open-devtools-for-tabs" echo "๐Ÿ”ง Configuration:" echo " Image: $IMAGE" @@ -91,18 +91,8 @@ else CHROMIUM_DATA_VOLUME="${CHROMIUM_DATA_REAL}:/data" fi -# Build Chromium flags file and mount -CHROMIUM_FLAGS_DEFAULT="--user-data-dir=/data/user-data --disable-dev-shm-usage --disable-gpu --start-maximized --disable-software-rasterizer --remote-allow-origins=*" -if [[ "$RUN_AS_ROOT" == "true" ]]; then - CHROMIUM_FLAGS_DEFAULT="$CHROMIUM_FLAGS_DEFAULT --no-sandbox --no-zygote" -fi -CHROMIUM_FLAGS="${CHROMIUM_FLAGS:-$CHROMIUM_FLAGS_DEFAULT}" -rm -rf .tmp/chromium -mkdir -p .tmp/chromium -FLAGS_FILE="$(pwd)/.tmp/chromium/flags" -echo "$CHROMIUM_FLAGS" > "$FLAGS_FILE" - # Build docker run argument list +# Note: CHROMIUM_FLAGS is already set above (line 40) with custom DevTools frontend RUN_ARGS=( --name "$NAME" --privileged @@ -120,7 +110,7 @@ RUN_ARGS=( -e HEIGHT=768 -e WIDTH=1024 -e RUN_AS_ROOT="$RUN_AS_ROOT" - --mount type=bind,src="$FLAGS_FILE",dst=/chromium/flags,ro + -e CHROMIUM_FLAGS="$CHROMIUM_FLAGS" ) # Add Chromium data volume if specified diff --git a/supervisor/services/chromium.conf b/supervisor/services/chromium.conf index 12b9fd7..778fb84 100644 --- a/supervisor/services/chromium.conf +++ b/supervisor/services/chromium.conf @@ -5,4 +5,4 @@ autorestart=true startsecs=5 stdout_logfile=/var/log/supervisord/chromium redirect_stderr=true -environment=HOME="/home/kernel",USER="kernel",CHROMIUM_FLAGS="--auto-open-devtools-for-tabs" \ No newline at end of file +environment=HOME="/home/kernel",USER="kernel" \ No newline at end of file From d0b464cbb19af1e66d2924a9845b58632756305b Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Tue, 21 Oct 2025 17:07:16 -0500 Subject: [PATCH 19/24] Refactored mounted directories --- .gitignore | 7 ++++++- Makefile | 2 +- docker-compose.yml | 8 +++----- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 1f8e6ff..35f7d1f 100644 --- a/.gitignore +++ b/.gitignore @@ -53,9 +53,12 @@ service-account-key.json *.bak *.backup -# Chromium persistent data +# Chromium persistent data (deprecated, now in @mount/) chromium-data/ +# All 
mounted volumes (recordings, chromium-data, eval-server, etc.) +@mount/ + # Browser Operator DevTools build artifacts browser-operator-core/devtools-frontend/ browser-operator-core/depot_tools/ @@ -65,6 +68,8 @@ browser-operator-core/.devtools-base-built # Eval server runtime files eval-server/nodejs/clients/ eval-server/nodejs/logs/ +eval-server/nodejs/node_modules/ +eval-server/nodejs/.env # Evaluation screenshots evals/screenshots/ \ No newline at end of file diff --git a/Makefile b/Makefile index 5323481..241b0b1 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,7 @@ help: ## Show this help message @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " %-15s %s\n", $$1, $$2}' @echo "" @echo "Chromium Data Persistence:" - @echo " - Browser data persists to ./chromium-data by default" + @echo " - Browser data persists to @mount/chromium-data by default" @echo " - Customize location: CHROMIUM_DATA_HOST=/path/to/data make run" @echo " - Disable persistence: CHROMIUM_DATA_HOST=\"\" make run" diff --git a/docker-compose.yml b/docker-compose.yml index 14b286a..8acb2e5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -42,13 +42,11 @@ services: - CHROMIUM_FLAGS=--user-data-dir=/data/user-data --disable-dev-shm-usage --start-maximized --remote-allow-origins=* --no-sandbox --disable-setuid-sandbox --custom-devtools-frontend=http://localhost:8001/ volumes: # Persist recordings in local directory - - "./recordings:/recordings" + - "./@mount/recordings:/recordings" # Mount Chromium flags file (will be created by run script) - - "./kernel-images/images/chromium-headful/.tmp/chromium/flags:/chromium/flags:ro" + - "./@mount/chromium-flags/flags:/chromium/flags:ro" # Persist Chromium data across container restarts (set CHROMIUM_DATA_HOST env var to customize path) - - "${CHROMIUM_DATA_HOST:-./chromium-data}:/data" - # Mount eval-server code for live updates during development - - "./eval-server/nodejs:/opt/eval-server" + - "${CHROMIUM_DATA_HOST:-./@mount/chromium-data}:/data" tmpfs: - /dev/shm:size=2g - /tmp From 501fbb59b792ad58d9e6099a294fbb467a42932c Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Tue, 21 Oct 2025 17:46:30 -0500 Subject: [PATCH 20/24] Refactoring --- CLAUDE.md | 57 ++++++++-- Dockerfile.cloudrun | 4 +- Dockerfile.kernel-cloud | 2 +- Dockerfile.local | 2 +- Makefile | 2 +- Readme.md | 42 ++++--- .../cloudrun/cloudbuild.yaml | 0 .../cloudrun/cloudrun-kernel-wrapper.sh | 0 .../cloudrun/cloudrun-wrapper.sh | 0 deploy.sh => deployment/cloudrun/deploy.sh | 14 ++- nginx.conf => deployment/cloudrun/nginx.conf | 0 .../cloudrun/service-secrets.yaml | 0 .../cloudrun/service.yaml | 0 .../cloudrun/supervisord-cloudrun.conf | 0 run-local.sh => deployment/local/run-local.sh | 13 ++- nginx-devtools-cloudrun.conf | 105 ------------------ .../nginx-devtools.conf | 0 scripts/cleanup-chromium-locks.sh | 18 --- .../test-eval-server.sh | 0 scripts/wrapper-with-cleanup.sh | 22 ---- 20 files changed, 94 insertions(+), 187 deletions(-) rename cloudbuild.yaml => deployment/cloudrun/cloudbuild.yaml (100%) rename cloudrun-kernel-wrapper.sh => deployment/cloudrun/cloudrun-kernel-wrapper.sh (100%) rename cloudrun-wrapper.sh => deployment/cloudrun/cloudrun-wrapper.sh (100%) rename deploy.sh => deployment/cloudrun/deploy.sh (96%) rename nginx.conf => deployment/cloudrun/nginx.conf (100%) rename service-secrets.yaml => deployment/cloudrun/service-secrets.yaml (100%) rename service.yaml => deployment/cloudrun/service.yaml (100%) rename supervisord-cloudrun.conf => 
deployment/cloudrun/supervisord-cloudrun.conf (100%) rename run-local.sh => deployment/local/run-local.sh (94%) delete mode 100644 nginx-devtools-cloudrun.conf rename nginx-devtools.conf => nginx/nginx-devtools.conf (100%) delete mode 100644 scripts/cleanup-chromium-locks.sh rename test-eval-server.sh => scripts/test-eval-server.sh (100%) delete mode 100644 scripts/wrapper-with-cleanup.sh diff --git a/CLAUDE.md b/CLAUDE.md index 01c2ef1..6548a90 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -53,6 +53,28 @@ supervisord web-agent/ โ”œโ”€โ”€ browser-operator-core/ # Submodule: DevTools frontend source โ”œโ”€โ”€ kernel-images/ # Submodule: Base browser environment +โ”œโ”€โ”€ deployment/ # Deployment configurations +โ”‚ โ”œโ”€โ”€ cloudrun/ # Google Cloud Run deployment +โ”‚ โ”‚ โ”œโ”€โ”€ deploy.sh # Cloud deployment script +โ”‚ โ”‚ โ”œโ”€โ”€ cloudbuild.yaml # CI/CD pipeline config +โ”‚ โ”‚ โ”œโ”€โ”€ service.yaml # Cloud Run service definition +โ”‚ โ”‚ โ”œโ”€โ”€ service-secrets.yaml # Service with Secret Manager +โ”‚ โ”‚ โ”œโ”€โ”€ cloudrun-wrapper.sh # Cloud Run entrypoint +โ”‚ โ”‚ โ”œโ”€โ”€ cloudrun-kernel-wrapper.sh # Alternative wrapper +โ”‚ โ”‚ โ”œโ”€โ”€ supervisord-cloudrun.conf # Supervisor for Cloud Run +โ”‚ โ”‚ โ””โ”€โ”€ nginx.conf # Reverse proxy config +โ”‚ โ””โ”€โ”€ local/ # Local deployment +โ”‚ โ””โ”€โ”€ run-local.sh # Interactive Docker run script +โ”œโ”€โ”€ nginx/ # Nginx configurations +โ”‚ โ””โ”€โ”€ nginx-devtools.conf # DevTools nginx config +โ”œโ”€โ”€ scripts/ # Utility scripts +โ”‚ โ”œโ”€โ”€ init-container.sh # Auto-cleanup of lock files +โ”‚ โ””โ”€โ”€ test-eval-server.sh # Eval server build test +โ”œโ”€โ”€ supervisor/services/ # Service configs (override defaults) +โ”‚ โ”œโ”€โ”€ chromium.conf # Auto-open DevTools +โ”‚ โ”œโ”€โ”€ eval-server.conf # Eval server with CDP_PORT=9223 +โ”‚ โ”œโ”€โ”€ neko.conf +โ”‚ โ””โ”€โ”€ nginx-devtools.conf โ”œโ”€โ”€ eval-server/ โ”‚ โ””โ”€โ”€ nodejs/ # Eval server source (use this, NOT submodule) โ”‚ โ”œโ”€โ”€ src/ @@ -65,20 +87,13 @@ web-agent/ โ”‚ โ”œโ”€โ”€ run.py # Python evaluation runner โ”‚ โ”œโ”€โ”€ lib/judge.py # LLMJudge, VisionJudge, SimpleJudge โ”‚ โ””โ”€โ”€ data/ # Evaluation YAML files -โ”œโ”€โ”€ scripts/ -โ”‚ โ”œโ”€โ”€ init-container.sh # Auto-cleanup of lock files -โ”‚ โ””โ”€โ”€ cleanup-chromium-locks.sh -โ”œโ”€โ”€ supervisor/services/ # Service configs (override defaults) -โ”‚ โ”œโ”€โ”€ chromium.conf # Auto-open DevTools -โ”‚ โ”œโ”€โ”€ eval-server.conf # Eval server with CDP_PORT=9223 -โ”‚ โ”œโ”€โ”€ neko.conf -โ”‚ โ””โ”€โ”€ nginx-devtools.conf -โ”œโ”€โ”€ Dockerfile.local # Main Docker build +โ”œโ”€โ”€ Dockerfile.local # Main Docker build (local dev) โ”œโ”€โ”€ Dockerfile.devtools # DevTools frontend build +โ”œโ”€โ”€ Dockerfile.cloudrun # Cloud Run build โ”œโ”€โ”€ docker-compose.yml # Local deployment config -โ”œโ”€โ”€ run-local.sh # Interactive mode startup โ”œโ”€โ”€ Makefile # Build/deployment commands -โ””โ”€โ”€ README.md +โ”œโ”€โ”€ CLAUDE.md # This file +โ””โ”€โ”€ README.md # User documentation ``` ## Key Files and What They Do @@ -144,7 +159,7 @@ Key targets: - `make stop` - Stop all containers - `make clean` - Clean up everything -### run-local.sh +### deployment/local/run-local.sh Interactive Docker run script that: - Sources kernel-images common build variables - Creates local recordings directory @@ -161,6 +176,24 @@ Interactive Docker run script that: - More flexible for custom configurations via environment variables - Better for seeing startup issues and debugging +### deployment/cloudrun/ +Contains all Google Cloud Run 
deployment files: +- `deploy.sh` - Automated deployment script with Twilio TURN setup +- `cloudbuild.yaml` - CI/CD pipeline for Cloud Build +- `service.yaml` / `service-secrets.yaml` - Cloud Run service definitions +- `cloudrun-wrapper.sh` - Cloud Run container entrypoint +- `supervisord-cloudrun.conf` - Supervisor configuration for Cloud Run +- `nginx.conf` - Reverse proxy for Cloud Run port requirements + +### nginx/ +Nginx configuration files: +- `nginx-devtools.conf` - DevTools UI server config (used by Dockerfile.local) + +### scripts/ +Utility scripts: +- `init-container.sh` - Automatic lock file cleanup on container start +- `test-eval-server.sh` - Test eval-server Docker build stage + ## Common Issues and Solutions ### 1. Chromium Profile Lock Errors diff --git a/Dockerfile.cloudrun b/Dockerfile.cloudrun index c66acf6..5156d27 100644 --- a/Dockerfile.cloudrun +++ b/Dockerfile.cloudrun @@ -259,7 +259,7 @@ COPY kernel-images/images/chromium-headful/image-chromium/ / COPY kernel-images/images/chromium-headful/start-chromium.sh /images/chromium-headful/start-chromium.sh RUN chmod +x /images/chromium-headful/start-chromium.sh COPY kernel-images/images/chromium-headful/supervisord.conf /etc/supervisor/supervisord.conf -COPY supervisord-cloudrun.conf /etc/supervisor/supervisord-cloudrun.conf +COPY deployment/cloudrun/supervisord-cloudrun.conf /etc/supervisor/supervisord-cloudrun.conf COPY kernel-images/images/chromium-headful/supervisor/services/ /etc/supervisor/conf.d/services/ # Copy the kernel-images API binary @@ -291,7 +291,7 @@ RUN chown -R kernel:kernel /opt/eval-server # Cloud Run specific: wrapper scripts (nginx config is inline) # DO NOT copy nginx.conf to avoid auto-start conflicts -COPY cloudrun-wrapper.sh /cloudrun-wrapper.sh +COPY deployment/cloudrun/cloudrun-wrapper.sh /cloudrun-wrapper.sh COPY twilio/twilio-credential-updater.sh /twilio-credential-updater.sh RUN chmod +x /cloudrun-wrapper.sh /twilio-credential-updater.sh diff --git a/Dockerfile.kernel-cloud b/Dockerfile.kernel-cloud index c0cefcd..a260864 100644 --- a/Dockerfile.kernel-cloud +++ b/Dockerfile.kernel-cloud @@ -208,7 +208,7 @@ RUN mkdir -p /chromium && \ chown -R kernel:kernel /chromium # Cloud Run wrapper that starts kernel-images services + nginx proxy -COPY cloudrun-kernel-wrapper.sh /cloudrun-kernel-wrapper.sh +COPY deployment/cloudrun/cloudrun-kernel-wrapper.sh /cloudrun-kernel-wrapper.sh RUN chmod +x /cloudrun-kernel-wrapper.sh # Cloud Run requires non-root execution diff --git a/Dockerfile.local b/Dockerfile.local index 6685a9c..19a688a 100644 --- a/Dockerfile.local +++ b/Dockerfile.local @@ -221,7 +221,7 @@ COPY --from=server-builder /out/kernel-images-api /usr/local/bin/kernel-images-a COPY --from=devtools-source /usr/share/nginx/html /usr/share/nginx/devtools # Create DevTools nginx configuration -COPY nginx-devtools.conf /etc/nginx/sites-available/devtools +COPY nginx/nginx-devtools.conf /etc/nginx/sites-available/devtools RUN ln -s /etc/nginx/sites-available/devtools /etc/nginx/sites-enabled/devtools && \ rm /etc/nginx/sites-enabled/default diff --git a/Makefile b/Makefile index 241b0b1..eefdc84 100644 --- a/Makefile +++ b/Makefile @@ -78,7 +78,7 @@ rebuild: init ## Force complete rebuild (including DevTools) run: ## Run extended container with DevTools (interactive) @echo "๐Ÿš€ Starting extended kernel-browser with DevTools..." 
@if [ -n "$(URLS)" ]; then echo "๐Ÿ“„ Opening URLs: $(URLS)"; fi - @./run-local.sh + @./deployment/local/run-local.sh compose-up: build ## Start with docker-compose (background) @echo "๐Ÿš€ Starting with docker-compose..." diff --git a/Readme.md b/Readme.md index 96d106a..c3d4991 100644 --- a/Readme.md +++ b/Readme.md @@ -318,10 +318,10 @@ gcloud auth application-default login ```bash # Automated deployment (recommended) -./deploy.sh +./deployment/cloudrun/deploy.sh # Or with custom settings -./deploy.sh --project your-project-id --region us-central1 +./deployment/cloudrun/deploy.sh --project your-project-id --region us-central1 ``` ### Access Cloud Run Service @@ -435,6 +435,24 @@ For production WebRTC, configure a TURN server: web-agent/ โ”œโ”€โ”€ browser-operator-core/ # Submodule: DevTools frontend source โ”œโ”€โ”€ kernel-images/ # Submodule: Base browser environment +โ”œโ”€โ”€ deployment/ # Deployment configurations +โ”‚ โ”œโ”€โ”€ cloudrun/ # Google Cloud Run deployment +โ”‚ โ”‚ โ”œโ”€โ”€ deploy.sh # Cloud deployment script +โ”‚ โ”‚ โ”œโ”€โ”€ cloudbuild.yaml # CI/CD pipeline config +โ”‚ โ”‚ โ”œโ”€โ”€ service.yaml # Cloud Run service definition +โ”‚ โ”‚ โ”œโ”€โ”€ service-secrets.yaml # Service with Secret Manager +โ”‚ โ”‚ โ”œโ”€โ”€ cloudrun-wrapper.sh # Cloud Run entrypoint +โ”‚ โ”‚ โ”œโ”€โ”€ cloudrun-kernel-wrapper.sh # Alternative wrapper +โ”‚ โ”‚ โ”œโ”€โ”€ supervisord-cloudrun.conf # Supervisor for Cloud Run +โ”‚ โ”‚ โ””โ”€โ”€ nginx.conf # Reverse proxy config +โ”‚ โ””โ”€โ”€ local/ # Local deployment +โ”‚ โ””โ”€โ”€ run-local.sh # Interactive Docker run script +โ”œโ”€โ”€ nginx/ # Nginx configurations +โ”‚ โ””โ”€โ”€ nginx-devtools.conf # DevTools nginx config +โ”œโ”€โ”€ scripts/ # Utility scripts +โ”‚ โ”œโ”€โ”€ init-container.sh # Auto-cleanup of lock files +โ”‚ โ””โ”€โ”€ test-eval-server.sh # Eval server build test +โ”œโ”€โ”€ supervisor/services/ # Service configs (overrides) โ”œโ”€โ”€ eval-server/ โ”‚ โ””โ”€โ”€ nodejs/ # Eval server (use this, NOT submodule) โ”‚ โ”œโ”€โ”€ src/ # API server, evaluation server, lib @@ -444,19 +462,11 @@ web-agent/ โ”‚ โ”œโ”€โ”€ run.py # Python evaluation runner โ”‚ โ”œโ”€โ”€ lib/judge.py # Judge implementations โ”‚ โ””โ”€โ”€ data/ # Evaluation YAML files -โ”œโ”€โ”€ scripts/ -โ”‚ โ””โ”€โ”€ init-container.sh # Auto-cleanup of lock files -โ”œโ”€โ”€ supervisor/services/ # Service configs (overrides) -โ”œโ”€โ”€ Dockerfile.local # Main Docker build +โ”œโ”€โ”€ Dockerfile.local # Main Docker build (local dev) โ”œโ”€โ”€ Dockerfile.devtools # DevTools frontend build -โ”œโ”€โ”€ docker-compose.yml # Local deployment -โ”œโ”€โ”€ run-local.sh # Interactive mode -โ”œโ”€โ”€ Makefile # Build commands โ”œโ”€โ”€ Dockerfile.cloudrun # Cloud Run build -โ”œโ”€โ”€ nginx.conf # Reverse proxy config -โ”œโ”€โ”€ service.yaml # Cloud Run service config -โ”œโ”€โ”€ cloudbuild.yaml # CI/CD pipeline -โ”œโ”€โ”€ deploy.sh # Cloud deployment script +โ”œโ”€โ”€ docker-compose.yml # Local deployment config +โ”œโ”€โ”€ Makefile # Build commands โ”œโ”€โ”€ CLAUDE.md # Technical documentation โ””โ”€โ”€ README.md # This file ``` @@ -541,13 +551,13 @@ The `cloudbuild.yaml` provides: ```bash # Normal build (with cache) - recommended for development -gcloud builds submit --config cloudbuild.yaml +gcloud builds submit --config deployment/cloudrun/cloudbuild.yaml # Force rebuild without cache - use when dependencies change -gcloud builds submit --config cloudbuild.yaml --substitutions=_NO_CACHE=true +gcloud builds submit --config deployment/cloudrun/cloudbuild.yaml --substitutions=_NO_CACHE=true # 
Automated deployment with Twilio TURN server setup -./deploy.sh +./deployment/cloudrun/deploy.sh ``` ### Cache Control diff --git a/cloudbuild.yaml b/deployment/cloudrun/cloudbuild.yaml similarity index 100% rename from cloudbuild.yaml rename to deployment/cloudrun/cloudbuild.yaml diff --git a/cloudrun-kernel-wrapper.sh b/deployment/cloudrun/cloudrun-kernel-wrapper.sh similarity index 100% rename from cloudrun-kernel-wrapper.sh rename to deployment/cloudrun/cloudrun-kernel-wrapper.sh diff --git a/cloudrun-wrapper.sh b/deployment/cloudrun/cloudrun-wrapper.sh similarity index 100% rename from cloudrun-wrapper.sh rename to deployment/cloudrun/cloudrun-wrapper.sh diff --git a/deploy.sh b/deployment/cloudrun/deploy.sh similarity index 96% rename from deploy.sh rename to deployment/cloudrun/deploy.sh index 55d344b..96c6a00 100755 --- a/deploy.sh +++ b/deployment/cloudrun/deploy.sh @@ -3,8 +3,16 @@ set -euo pipefail # Kernel-Browser Cloud Run Deployment Script +# NOTE: This script should be run from the project root directory echo "๐Ÿš€ Starting kernel-browser deployment to Google Cloud Run..." +# Verify we're in the project root +if [ ! -f "docker-compose.yml" ]; then + echo "โŒ Error: This script must be run from the project root directory" + echo " Expected to find docker-compose.yml in current directory" + exit 1 +fi + # Configuration PROJECT_ID="${PROJECT_ID:-}" REGION="${REGION:-us-central1}" @@ -298,7 +306,7 @@ deploy_with_cloudbuild() { # Submit build gcloud builds submit \ - --config=cloudbuild.yaml \ + --config=deployment/cloudrun/cloudbuild.yaml \ --project="$PROJECT_ID" \ --timeout="2h" \ --machine-type="e2-highcpu-32" @@ -321,11 +329,11 @@ deploy_local() { info "Deploying to Cloud Run..." # Choose appropriate service.yaml based on secrets availability - local service_file="service.yaml" + local service_file="deployment/cloudrun/service.yaml" if [ "${USE_SECRETS:-false}" = "true" ]; then if gcloud secrets describe twilio-account-sid --project="$PROJECT_ID" &>/dev/null && \ gcloud secrets describe twilio-auth-token --project="$PROJECT_ID" &>/dev/null; then - service_file="service-secrets.yaml" + service_file="deployment/cloudrun/service-secrets.yaml" info "Using service-secrets.yaml with Secret Manager references" else warning "Secrets not found, falling back to standard service.yaml" diff --git a/nginx.conf b/deployment/cloudrun/nginx.conf similarity index 100% rename from nginx.conf rename to deployment/cloudrun/nginx.conf diff --git a/service-secrets.yaml b/deployment/cloudrun/service-secrets.yaml similarity index 100% rename from service-secrets.yaml rename to deployment/cloudrun/service-secrets.yaml diff --git a/service.yaml b/deployment/cloudrun/service.yaml similarity index 100% rename from service.yaml rename to deployment/cloudrun/service.yaml diff --git a/supervisord-cloudrun.conf b/deployment/cloudrun/supervisord-cloudrun.conf similarity index 100% rename from supervisord-cloudrun.conf rename to deployment/cloudrun/supervisord-cloudrun.conf diff --git a/run-local.sh b/deployment/local/run-local.sh similarity index 94% rename from run-local.sh rename to deployment/local/run-local.sh index d369bd4..a32d516 100755 --- a/run-local.sh +++ b/deployment/local/run-local.sh @@ -5,9 +5,10 @@ set -e -o pipefail echo "๐Ÿš€ Starting kernel-browser (EXTENDED) locally using kernel-images run system..." -# Ensure we're in the right directory +# Get script directory and project root SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) -cd "$SCRIPT_DIR" +PROJECT_ROOT=$(cd "$SCRIPT_DIR/../.." 
&& pwd) +cd "$PROJECT_ROOT" # Check if kernel-images submodule exists if [ ! -d "kernel-images" ] || [ ! -f "kernel-images/images/chromium-headful/run-docker.sh" ]; then @@ -17,7 +18,7 @@ if [ ! -d "kernel-images" ] || [ ! -f "kernel-images/images/chromium-headful/run fi # Create local recordings directory -mkdir -p "$SCRIPT_DIR/recordings" +mkdir -p "$PROJECT_ROOT/recordings" # Change to kernel-images directory cd kernel-images/images/chromium-headful @@ -45,7 +46,7 @@ echo " Container: $NAME" echo " WebRTC: $ENABLE_WEBRTC" echo " DevTools UI: enabled" echo " Run as root: $RUN_AS_ROOT" -echo " Recordings: $SCRIPT_DIR/recordings" +echo " Recordings: $PROJECT_ROOT/recordings" echo "" echo "๐Ÿƒ Starting extended container with kernel-images run system..." @@ -57,7 +58,7 @@ echo "๐Ÿƒ Starting extended container with kernel-images run system..." source ../../shared/ensure-common-build-run-vars.sh chromium-headful # Directory on host where recordings will be saved -HOST_RECORDINGS_DIR="$SCRIPT_DIR/recordings" +HOST_RECORDINGS_DIR="$PROJECT_ROOT/recordings" mkdir -p "$HOST_RECORDINGS_DIR" # Chromium data directory for persistence @@ -68,7 +69,7 @@ if [[ "${CHROMIUM_DATA_HOST+set}" == "set" && -z "$CHROMIUM_DATA_HOST" ]]; then CHROMIUM_DATA_VOLUME="" else # Default to ./chromium-data if not specified - CHROMIUM_DATA_HOST="${CHROMIUM_DATA_HOST:-$SCRIPT_DIR/chromium-data}" + CHROMIUM_DATA_HOST="${CHROMIUM_DATA_HOST:-$PROJECT_ROOT/chromium-data}" echo "๐Ÿ—‚๏ธ Using persistent Chromium data directory: $CHROMIUM_DATA_HOST" CHROMIUM_DATA_REAL=$(realpath "$CHROMIUM_DATA_HOST" 2>/dev/null || echo "") if [[ -z "$CHROMIUM_DATA_REAL" ]]; then diff --git a/nginx-devtools-cloudrun.conf b/nginx-devtools-cloudrun.conf deleted file mode 100644 index 0514c8b..0000000 --- a/nginx-devtools-cloudrun.conf +++ /dev/null @@ -1,105 +0,0 @@ -# nginx configuration for DevTools frontend in Cloud Run -worker_processes 1; -pid /tmp/nginx-devtools.pid; - -events { - worker_connections 1024; -} - -http { - include /etc/nginx/mime.types; - default_type application/octet-stream; - - # Temporary paths for non-root execution - client_body_temp_path /tmp/nginx_devtools_client_temp; - proxy_temp_path /tmp/nginx_devtools_proxy_temp; - fastcgi_temp_path /tmp/nginx_devtools_fastcgi_temp; - uwsgi_temp_path /tmp/nginx_devtools_uwsgi_temp; - scgi_temp_path /tmp/nginx_devtools_scgi_temp; - - # Logging - access_log /tmp/nginx-devtools-access.log; - error_log /tmp/nginx-devtools-error.log; - - # Performance - sendfile on; - tcp_nopush on; - tcp_nodelay on; - keepalive_timeout 65; - types_hash_max_size 2048; - - server { - listen 8001; - server_name localhost; - - # Root directory for DevTools frontend - root /usr/share/nginx/devtools; - index inspector.html devtools_app.html index.html; - - # Compression - gzip on; - gzip_vary on; - gzip_min_length 1024; - gzip_types text/plain text/css text/xml text/javascript application/javascript application/xml+rss application/json application/wasm; - - # Security headers - add_header X-Frame-Options "SAMEORIGIN" always; - add_header X-Content-Type-Options "nosniff" always; - add_header X-XSS-Protection "1; mode=block" always; - - # CORS headers for DevTools - add_header Access-Control-Allow-Origin "*" always; - add_header Access-Control-Allow-Methods "GET, POST, OPTIONS" always; - add_header Access-Control-Allow-Headers "DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range" always; - - # Handle OPTIONS requests - if ($request_method = 'OPTIONS') { - return 204; - } - - 
# Cache control for static assets - location ~* \.(jpg|jpeg|png|gif|ico|css|js|svg|woff|woff2|ttf|eot|avif)$ { - expires 1d; - add_header Cache-Control "public, immutable"; - } - - # Specific handling for WebAssembly files - location ~ \.wasm$ { - add_header Content-Type application/wasm; - } - - # JSON files - location ~ \.json$ { - add_header Content-Type application/json; - } - - # Main location - location / { - try_files $uri $uri/ /index.html; - } - - # Specific paths for DevTools - location /front_end/ { - alias /usr/share/nginx/devtools/; - try_files $uri $uri/ =404; - } - - # Health check for DevTools service - location /health { - access_log off; - add_header Content-Type application/json; - return 200 '{"status": "healthy", "service": "devtools-frontend-cloudrun"}'; - } - - # Error pages - error_page 404 /404.html; - location = /404.html { - internal; - } - - error_page 500 502 503 504 /50x.html; - location = /50x.html { - internal; - } - } -} \ No newline at end of file diff --git a/nginx-devtools.conf b/nginx/nginx-devtools.conf similarity index 100% rename from nginx-devtools.conf rename to nginx/nginx-devtools.conf diff --git a/scripts/cleanup-chromium-locks.sh b/scripts/cleanup-chromium-locks.sh deleted file mode 100644 index 6dd678a..0000000 --- a/scripts/cleanup-chromium-locks.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -# Cleanup Chromium lock files before starting services -# This prevents "profile in use" errors after container restarts - -set -e - -echo "๐Ÿงน Cleaning up Chromium lock files..." - -# Remove lock files from persistent data directory -rm -f /data/user-data/SingletonLock \ - /data/user-data/SingletonSocket \ - /data/user-data/SingletonCookie \ - 2>/dev/null || true - -# Remove X11 lock files -rm -f /tmp/.X*-lock 2>/dev/null || true - -echo "โœ… Chromium lock cleanup complete" diff --git a/test-eval-server.sh b/scripts/test-eval-server.sh similarity index 100% rename from test-eval-server.sh rename to scripts/test-eval-server.sh diff --git a/scripts/wrapper-with-cleanup.sh b/scripts/wrapper-with-cleanup.sh deleted file mode 100644 index efab2e1..0000000 --- a/scripts/wrapper-with-cleanup.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# Wrapper script extension that adds Chromium lock cleanup -# This script will be injected into the Docker image - -# Add this right after supervisord starts -cleanup_chromium_locks() { - echo "[wrapper] ๐Ÿงน Cleaning up Chromium lock files..." 
- - # Remove Chromium profile locks from persistent data directory - rm -f /data/user-data/SingletonLock \ - /data/user-data/SingletonSocket \ - /data/user-data/SingletonCookie \ - 2>/dev/null || true - - # Remove X11 lock files from /tmp - rm -f /tmp/.X*-lock 2>/dev/null || true - - echo "[wrapper] โœ… Chromium lock cleanup complete" -} - -# Export the function so it can be called from the main wrapper -export -f cleanup_chromium_locks From 0c8530da0ea4fb012958610cf347242ee42f0f64 Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Tue, 21 Oct 2025 21:26:02 -0500 Subject: [PATCH 21/24] Refactoring and proper naming --- .gitignore | 10 +- Dockerfile.cloudrun | 30 ++-- Dockerfile.devtools | 2 +- Dockerfile.local | 14 +- Makefile | 18 +-- .../.env.example | 0 .../.gitignore | 0 .../README.md | 0 .../nodejs/.env.example | 0 .../nodejs/CLAUDE.md | 0 .../nodejs/README.md | 0 .../nodejs/package-lock.json | 0 .../nodejs/package.json | 8 +- .../nodejs/src/api-server.js | 68 ++++----- .../nodejs/src/client-manager.js | 6 +- .../nodejs/src/config.js | 2 +- .../nodejs/src/lib/BrowserAgentServer.js | 136 +++++++++--------- .../nodejs/src/lib/HTTPWrapper.js | 32 ++--- .../nodejs/src/lib/RequestStack.js | 85 +++++++++++ .../nodejs/src/lib/judges/Judge.js | 0 .../nodejs/src/lib/judges/LLMJudge.js | 0 .../nodejs/src/logger.js | 63 ++++---- .../nodejs/src/rpc-client.js | 0 .../nodejs/start.js | 14 +- .../start.js | 2 +- deployment/cloudrun/service-secrets.yaml | 2 +- deployment/cloudrun/service.yaml | 2 +- docker-compose.yml | 4 +- eval-server/nodejs/src/lib/EvaluationStack.js | 85 ----------- .../action-agent/action-agent-login-001.yaml | 3 +- scripts/test-browser-agent-server.sh | 30 ++++ scripts/test-eval-server.sh | 30 ---- ...-server.conf => browser-agent-server.conf} | 8 +- supervisor/services/browser-agent-server.conf | 8 ++ supervisor/services/eval-server.conf | 8 -- 35 files changed, 344 insertions(+), 326 deletions(-) rename {eval-server => browser-agent-server}/.env.example (100%) rename {eval-server => browser-agent-server}/.gitignore (100%) rename {eval-server => browser-agent-server}/README.md (100%) rename {eval-server => browser-agent-server}/nodejs/.env.example (100%) rename {eval-server => browser-agent-server}/nodejs/CLAUDE.md (100%) rename {eval-server => browser-agent-server}/nodejs/README.md (100%) rename {eval-server => browser-agent-server}/nodejs/package-lock.json (100%) rename {eval-server => browser-agent-server}/nodejs/package.json (78%) rename {eval-server => browser-agent-server}/nodejs/src/api-server.js (88%) rename {eval-server => browser-agent-server}/nodejs/src/client-manager.js (98%) rename {eval-server => browser-agent-server}/nodejs/src/config.js (95%) rename eval-server/nodejs/src/lib/EvalServer.js => browser-agent-server/nodejs/src/lib/BrowserAgentServer.js (92%) rename {eval-server => browser-agent-server}/nodejs/src/lib/HTTPWrapper.js (66%) create mode 100644 browser-agent-server/nodejs/src/lib/RequestStack.js rename {eval-server => browser-agent-server}/nodejs/src/lib/judges/Judge.js (100%) rename {eval-server => browser-agent-server}/nodejs/src/lib/judges/LLMJudge.js (100%) rename {eval-server => browser-agent-server}/nodejs/src/logger.js (59%) rename {eval-server => browser-agent-server}/nodejs/src/rpc-client.js (100%) rename {eval-server => browser-agent-server}/nodejs/start.js (66%) rename eval-server-start.js => browser-agent-server/start.js (96%) delete mode 100644 eval-server/nodejs/src/lib/EvaluationStack.js create mode 100755 
scripts/test-browser-agent-server.sh delete mode 100755 scripts/test-eval-server.sh rename supervisor/services-cloudrun/{eval-server.conf => browser-agent-server.conf} (55%) create mode 100644 supervisor/services/browser-agent-server.conf delete mode 100644 supervisor/services/eval-server.conf diff --git a/.gitignore b/.gitignore index 35f7d1f..b86fd31 100644 --- a/.gitignore +++ b/.gitignore @@ -56,7 +56,7 @@ service-account-key.json # Chromium persistent data (deprecated, now in @mount/) chromium-data/ -# All mounted volumes (recordings, chromium-data, eval-server, etc.) +# All mounted volumes (recordings, chromium-data, browser-agent-server, etc.) @mount/ # Browser Operator DevTools build artifacts @@ -66,10 +66,10 @@ browser-operator-core/.devtools-built browser-operator-core/.devtools-base-built # Eval server runtime files -eval-server/nodejs/clients/ -eval-server/nodejs/logs/ -eval-server/nodejs/node_modules/ -eval-server/nodejs/.env +browser-agent-server/nodejs/clients/ +browser-agent-server/nodejs/logs/ +browser-agent-server/nodejs/node_modules/ +browser-agent-server/nodejs/.env # Evaluation screenshots evals/screenshots/ \ No newline at end of file diff --git a/Dockerfile.cloudrun b/Dockerfile.cloudrun index 5156d27..d5a37a3 100644 --- a/Dockerfile.cloudrun +++ b/Dockerfile.cloudrun @@ -56,11 +56,11 @@ RUN sed -i 's/AUTOMATED_MODE: false/AUTOMATED_MODE: true/' front_end/panels/ai_c RUN npm run build # Eval-Server build stage -FROM node:22-bullseye-slim AS eval-server-builder -WORKDIR /eval-server -COPY eval-server/nodejs/package*.json ./ +FROM node:22-bullseye-slim AS browser-agent-server-builder +WORKDIR /browser-agent-server +COPY browser-agent-server/nodejs/package*.json ./ RUN npm install --production -COPY eval-server/nodejs/ ./ +COPY browser-agent-server/nodejs/ ./ # Multi-stage build using kernel-images as base FROM docker.io/golang:1.25.0 AS server-builder @@ -154,10 +154,10 @@ RUN apt-get update && \ nginx \ # PPA req software-properties-common \ - # Node.js for eval-server + # Node.js for browser-agent-server ca-certificates \ gnupg && \ - # Install Node.js 22.x for eval-server + # Install Node.js 22.x for browser-agent-server mkdir -p /etc/apt/keyrings && \ curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg && \ echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_22.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list && \ @@ -279,15 +279,15 @@ RUN chown -R kernel:kernel /usr/share/nginx/devtools # Eval-Server Integration # ============================================================================ -# Copy eval-server from builder -COPY --from=eval-server-builder /eval-server /opt/eval-server +# Copy browser-agent-server from builder +COPY --from=browser-agent-server-builder /browser-agent-server /opt/browser-agent-server -# Copy custom eval-server startup script INTO eval-server directory -COPY eval-server-start.js /opt/eval-server/start-cloudrun.js -RUN chmod +x /opt/eval-server/start-cloudrun.js +# Copy custom browser-agent-server startup script INTO browser-agent-server directory +COPY browser-agent-server/start.js /opt/browser-agent-server/start-cloudrun.js +RUN chmod +x /opt/browser-agent-server/start-cloudrun.js -# Set permissions for eval-server -RUN chown -R kernel:kernel /opt/eval-server +# Set permissions for browser-agent-server +RUN chown -R kernel:kernel /opt/browser-agent-server # Cloud Run specific: wrapper scripts (nginx config is inline) 
# DO NOT copy nginx.conf to avoid auto-start conflicts @@ -301,7 +301,7 @@ COPY supervisor/services-cloudrun/xorg.conf /etc/supervisor/conf.d/services-clou COPY supervisor/services-cloudrun/neko.conf /etc/supervisor/conf.d/services-cloudrun/neko.conf COPY supervisor/services-cloudrun/chromium.conf /etc/supervisor/conf.d/services-cloudrun/chromium.conf COPY supervisor/services-cloudrun/devtools-frontend.conf /etc/supervisor/conf.d/services-cloudrun/devtools-frontend.conf -COPY supervisor/services-cloudrun/eval-server.conf /etc/supervisor/conf.d/services-cloudrun/eval-server.conf +COPY supervisor/services-cloudrun/browser-agent-server.conf /etc/supervisor/conf.d/services-cloudrun/browser-agent-server.conf # Create nginx temp directories for non-root execution RUN mkdir -p /tmp/nginx_client_temp /tmp/nginx_proxy_temp /tmp/nginx_fastcgi_temp \ @@ -313,7 +313,7 @@ RUN mkdir -p /tmp/nginx_client_temp /tmp/nginx_proxy_temp /tmp/nginx_fastcgi_tem # Create supervisor log directories RUN mkdir -p /var/log/supervisord/chromium /var/log/supervisord/neko /var/log/supervisord/xorg \ /var/log/supervisord/dbus /var/log/supervisord/kernel-images-api /var/log/supervisord/mutter \ - /var/log/supervisord/nginx /var/log/supervisord/devtools-frontend /var/log/supervisord/eval-server && \ + /var/log/supervisord/nginx /var/log/supervisord/devtools-frontend /var/log/supervisord/browser-agent-server && \ chown -R kernel:kernel /var/log/supervisord # Create health check endpoint diff --git a/Dockerfile.devtools b/Dockerfile.devtools index 9f9461b..ca04745 100644 --- a/Dockerfile.devtools +++ b/Dockerfile.devtools @@ -69,7 +69,7 @@ FROM devtools-base AS devtools-local # Copy local changes from browser-operator-core submodule FIRST # This happens before checking out upstream, so we copy over the upstream code COPY browser-operator-core/front_end /workspace/devtools/devtools-frontend/front_end -COPY eval-server /workspace/devtools/devtools-frontend/eval-server +COPY browser-agent-server /workspace/devtools/devtools-frontend/browser-agent-server WORKDIR /workspace/devtools/devtools-frontend diff --git a/Dockerfile.local b/Dockerfile.local index 19a688a..ad69d1b 100644 --- a/Dockerfile.local +++ b/Dockerfile.local @@ -12,14 +12,14 @@ FROM browser-operator-devtools:latest AS devtools-source # ============================================================================ # Eval Server build stage # ============================================================================ -FROM --platform=linux/arm64 node:18-alpine AS eval-server-builder +FROM --platform=linux/arm64 node:18-alpine AS browser-agent-server-builder WORKDIR /workspace # Copy eval server from browser-operator-core submodule -COPY eval-server/nodejs /workspace/eval-server +COPY browser-agent-server/nodejs /workspace/browser-agent-server -WORKDIR /workspace/eval-server +WORKDIR /workspace/browser-agent-server # Install dependencies RUN npm install @@ -229,7 +229,7 @@ RUN ln -s /etc/nginx/sites-available/devtools /etc/nginx/sites-enabled/devtools COPY supervisor/services/nginx-devtools.conf /etc/supervisor/conf.d/services/nginx-devtools.conf # Add eval server service to supervisor -COPY supervisor/services/eval-server.conf /etc/supervisor/conf.d/services/eval-server.conf +COPY supervisor/services/browser-agent-server.conf /etc/supervisor/conf.d/services/browser-agent-server.conf # Add neko service to supervisor (configured for port 8000) COPY supervisor/services/neko.conf /etc/supervisor/conf.d/services/neko.conf @@ -250,7 +250,7 @@ RUN useradd -m -s /bin/bash 
kernel # ============================================================================ # Copy eval server from builder -COPY --from=eval-server-builder /workspace/eval-server /opt/eval-server +COPY --from=browser-agent-server-builder /workspace/browser-agent-server /opt/browser-agent-server # Install Node.js in final image for eval server RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \ @@ -258,8 +258,8 @@ RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \ rm -rf /var/lib/apt/lists/* # Create eval server startup script -RUN echo '#!/bin/bash\ncd /opt/eval-server && node start.js' > /usr/local/bin/start-eval-server.sh && \ - chmod +x /usr/local/bin/start-eval-server.sh +RUN echo '#!/bin/bash\ncd /opt/browser-agent-server && node start.js' > /usr/local/bin/start-browser-agent-server.sh && \ + chmod +x /usr/local/bin/start-browser-agent-server.sh # ============================================================================ # Chromium Data Directory Configuration diff --git a/Makefile b/Makefile index eefdc84..68bb4cb 100644 --- a/Makefile +++ b/Makefile @@ -120,15 +120,15 @@ shell: ## Get shell access to running container info: ## Show connection information @echo "" @echo "๐ŸŒ Service Access Points:" - @echo " WebRTC Client: http://localhost:8000" - @echo " Eval Server API: http://localhost:8081" - @echo " Chrome DevTools: http://localhost:9222/json" - @echo " Recording API: http://localhost:444/api" - @echo " Enhanced DevTools UI: http://localhost:8001" - @echo " DevTools Health: http://localhost:8001/health" - -test: ## Test eval API with simple math eval - @echo "๐Ÿงช Testing Eval Server API..." + @echo " WebRTC Client: http://localhost:8000" + @echo " Browser Agent Server API: http://localhost:8081" + @echo " Chrome DevTools: http://localhost:9222/json" + @echo " Recording API: http://localhost:444/api" + @echo " Enhanced DevTools UI: http://localhost:8001" + @echo " DevTools Health: http://localhost:8001/health" + +test: ## Test Browser Agent Server API with simple eval + @echo "๐Ÿงช Testing Browser Agent Server API..." @echo "" @echo "1๏ธโƒฃ Checking API endpoint..." 
@curl -s -o /dev/null -w " Status: %{http_code}\n" http://localhost:8080/status || (echo " โŒ API not responding"; exit 1) diff --git a/eval-server/.env.example b/browser-agent-server/.env.example similarity index 100% rename from eval-server/.env.example rename to browser-agent-server/.env.example diff --git a/eval-server/.gitignore b/browser-agent-server/.gitignore similarity index 100% rename from eval-server/.gitignore rename to browser-agent-server/.gitignore diff --git a/eval-server/README.md b/browser-agent-server/README.md similarity index 100% rename from eval-server/README.md rename to browser-agent-server/README.md diff --git a/eval-server/nodejs/.env.example b/browser-agent-server/nodejs/.env.example similarity index 100% rename from eval-server/nodejs/.env.example rename to browser-agent-server/nodejs/.env.example diff --git a/eval-server/nodejs/CLAUDE.md b/browser-agent-server/nodejs/CLAUDE.md similarity index 100% rename from eval-server/nodejs/CLAUDE.md rename to browser-agent-server/nodejs/CLAUDE.md diff --git a/eval-server/nodejs/README.md b/browser-agent-server/nodejs/README.md similarity index 100% rename from eval-server/nodejs/README.md rename to browser-agent-server/nodejs/README.md diff --git a/eval-server/nodejs/package-lock.json b/browser-agent-server/nodejs/package-lock.json similarity index 100% rename from eval-server/nodejs/package-lock.json rename to browser-agent-server/nodejs/package-lock.json diff --git a/eval-server/nodejs/package.json b/browser-agent-server/nodejs/package.json similarity index 78% rename from eval-server/nodejs/package.json rename to browser-agent-server/nodejs/package.json index add45c2..44d5270 100644 --- a/eval-server/nodejs/package.json +++ b/browser-agent-server/nodejs/package.json @@ -2,15 +2,15 @@ "name": "bo-eval-server", "version": "1.0.0", "description": "HTTP API wrapper for Browser Operator - WebSocket server with CDP integration", - "main": "src/lib/EvalServer.js", + "main": "src/lib/BrowserAgentServer.js", "type": "module", "exports": { - ".": "./src/lib/EvalServer.js", - "./EvalServer": "./src/lib/EvalServer.js", + ".": "./src/lib/BrowserAgentServer.js", + "./BrowserAgentServer": "./src/lib/BrowserAgentServer.js", "./HTTPWrapper": "./src/lib/HTTPWrapper.js" }, "scripts": { - "start": "node src/lib/EvalServer.js" + "start": "node start.js" }, "keywords": [ "websocket", diff --git a/eval-server/nodejs/src/api-server.js b/browser-agent-server/nodejs/src/api-server.js similarity index 88% rename from eval-server/nodejs/src/api-server.js rename to browser-agent-server/nodejs/src/api-server.js index 6e72fa7..bca2726 100644 --- a/eval-server/nodejs/src/api-server.js +++ b/browser-agent-server/nodejs/src/api-server.js @@ -10,11 +10,11 @@ import yaml from 'js-yaml'; import { v4 as uuidv4 } from 'uuid'; import logger from './logger.js'; -// No need to import EvaluationServer - it's passed as constructor parameter +// No need to import BrowserAgentServer - it's passed as constructor parameter class APIServer { - constructor(evaluationServer, port = 8081) { - this.evaluationServer = evaluationServer; + constructor(browserAgentServer, port = 8081) { + this.browserAgentServer = browserAgentServer; this.port = port; this.server = null; this.configDefaults = null; @@ -162,26 +162,26 @@ class APIServer { } getStatus() { - const status = this.evaluationServer.getStatus(); - const clients = this.evaluationServer.getClientManager().getAllClients(); + const status = this.browserAgentServer.getStatus(); + const clients = 
this.browserAgentServer.getClientManager().getAllClients(); return { server: status, clients: clients.map(client => ({ id: client.id, name: client.name, - connected: this.evaluationServer.connectedClients.has(client.id), - ready: this.evaluationServer.connectedClients.get(client.id)?.ready || false + connected: this.browserAgentServer.connectedClients.has(client.id), + ready: this.browserAgentServer.connectedClients.get(client.id)?.ready || false })) }; } getClients() { - const clients = this.evaluationServer.getClientManager().getAllClients(); - const connectedClients = this.evaluationServer.connectedClients; + const clients = this.browserAgentServer.getClientManager().getAllClients(); + const connectedClients = this.browserAgentServer.connectedClients; return clients.map(client => { - const tabs = this.evaluationServer.getClientManager().getClientTabs(client.id); + const tabs = this.browserAgentServer.getClientManager().getClientTabs(client.id); return { id: client.id, @@ -205,9 +205,9 @@ class APIServer { throw new Error('Client ID is required'); } - const tabs = this.evaluationServer.getClientManager().getClientTabs(clientId); - const connectedClients = this.evaluationServer.connectedClients; - const client = this.evaluationServer.getClientManager().getClient(clientId); + const tabs = this.browserAgentServer.getClientManager().getClientTabs(clientId); + const connectedClients = this.browserAgentServer.connectedClients; + const client = this.browserAgentServer.getClientManager().getClient(clientId); if (!client) { throw new Error(`Client '${clientId}' not found`); @@ -239,7 +239,7 @@ class APIServer { // Just extract the baseClientId (first part before colon if composite, or the whole ID) const baseClientId = clientId.split(':')[0]; - const result = await this.evaluationServer.openTab(baseClientId, { url, background }); + const result = await this.browserAgentServer.openTab(baseClientId, { url, background }); return { clientId: baseClientId, @@ -265,7 +265,7 @@ class APIServer { // Just extract the baseClientId const baseClientId = clientId.split(':')[0]; - const result = await this.evaluationServer.closeTab(baseClientId, { tabId }); + const result = await this.browserAgentServer.closeTab(baseClientId, { tabId }); return { clientId: baseClientId, @@ -296,8 +296,8 @@ class APIServer { // Call appropriate method based on format const result = format === 'html' - ? await this.evaluationServer.getPageHTML(tabId) - : await this.evaluationServer.getPageText(tabId); + ? 
await this.browserAgentServer.getPageHTML(tabId) + : await this.browserAgentServer.getPageText(tabId); return { clientId: baseClientId, @@ -324,7 +324,7 @@ class APIServer { logger.info('Capturing screenshot', { baseClientId, tabId, fullPage }); - const result = await this.evaluationServer.captureScreenshot(tabId, { fullPage }); + const result = await this.browserAgentServer.captureScreenshot(tabId, { fullPage }); return { clientId: baseClientId, @@ -373,7 +373,7 @@ class APIServer { // Open a new tab for this request at the specified URL logger.info('Opening new tab for responses request', { baseClientId, url: targetUrl }); - const tabResult = await this.evaluationServer.openTab(baseClientId, { + const tabResult = await this.browserAgentServer.openTab(baseClientId, { url: targetUrl, background: false }); @@ -392,19 +392,19 @@ class APIServer { await new Promise(resolve => setTimeout(resolve, waitTimeout)); } - // Create a dynamic evaluation for this request - const evaluation = this.createDynamicEvaluationNested(requestBody.input, nestedModelConfig); + // Create a dynamic request for this request + const request = this.createDynamicRequestNested(requestBody.input, nestedModelConfig); - // Execute the evaluation on the new tab's DevTools client - logger.info('Executing evaluation on new tab', { + // Execute the request on the new tab's DevTools client + logger.info('Executing request on new tab', { compositeClientId: tabResult.compositeClientId, - evaluationId: evaluation.id + requestId: request.id }); - const result = await this.evaluationServer.executeEvaluation(tabClient, evaluation); + const result = await this.browserAgentServer.executeRequest(tabClient, request); // Debug: log the result structure - logger.debug('executeEvaluation result:', result); + logger.debug('executeRequest result:', result); // Extract the response text from the result const responseText = this.extractResponseText(result); @@ -468,7 +468,7 @@ class APIServer { * Find a connected and ready client */ findReadyClient() { - for (const [clientId, connection] of this.evaluationServer.connectedClients) { + for (const [clientId, connection] of this.browserAgentServer.connectedClients) { if (connection.ready) { return connection; } @@ -481,11 +481,11 @@ class APIServer { * @returns {string} Base client ID */ findClientWithTabs() { - const clients = this.evaluationServer.getClientManager().getAllClients(); + const clients = this.browserAgentServer.getClientManager().getAllClients(); // First, try to find a client with existing tabs for (const client of clients) { - const tabs = this.evaluationServer.getClientManager().getClientTabs(client.id); + const tabs = this.browserAgentServer.getClientManager().getClientTabs(client.id); if (tabs.length > 0) { logger.info('Found client with tabs', { clientId: client.id, tabCount: tabs.length }); return client.id; @@ -514,7 +514,7 @@ class APIServer { logger.info('Waiting for client connection', { compositeClientId, maxWaitMs }); while (Date.now() - startTime < maxWaitMs) { - const connection = this.evaluationServer.connectedClients.get(compositeClientId); + const connection = this.browserAgentServer.connectedClients.get(compositeClientId); if (connection && connection.ready) { logger.info('Client connection established and ready', { @@ -537,13 +537,13 @@ class APIServer { * @param {import('./types/model-config').ModelConfig} nestedModelConfig - Model configuration * @returns {import('./types/model-config').EvaluationRequest} Evaluation request object */ - 
createDynamicEvaluationNested(input, nestedModelConfig) { - const evaluationId = `api-eval-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`; + createDynamicRequestNested(input, nestedModelConfig) { + const requestId = `api-req-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`; return { - id: evaluationId, + id: requestId, name: 'API Request', - description: 'Dynamic evaluation created from API request', + description: 'Dynamic request created from API request', enabled: true, tool: 'chat', timeout: 7200000, // 2 hours (increased for slow custom API) diff --git a/eval-server/nodejs/src/client-manager.js b/browser-agent-server/nodejs/src/client-manager.js similarity index 98% rename from eval-server/nodejs/src/client-manager.js rename to browser-agent-server/nodejs/src/client-manager.js index 78f4c60..b0d4ca6 100644 --- a/eval-server/nodejs/src/client-manager.js +++ b/browser-agent-server/nodejs/src/client-manager.js @@ -124,16 +124,16 @@ class ClientManager { async createClientWithId(clientId, clientName, secretKey = null) { const yamlPath = path.join(this.clientsDir, `${clientId}.yaml`); - // Create simplified client configuration (evaluations come from evals directory) + // Create simplified client configuration const defaultConfig = { client: { id: clientId, name: clientName, secret_key: secretKey, - description: `Auto-generated DevTools evaluation client` + description: `Auto-generated DevTools client` }, settings: { - max_concurrent_evaluations: 3, + max_concurrent_requests: 3, default_timeout: 45000, retry_policy: { max_retries: 2, diff --git a/eval-server/nodejs/src/config.js b/browser-agent-server/nodejs/src/config.js similarity index 95% rename from eval-server/nodejs/src/config.js rename to browser-agent-server/nodejs/src/config.js index b8945f9..b273dcd 100644 --- a/eval-server/nodejs/src/config.js +++ b/browser-agent-server/nodejs/src/config.js @@ -46,7 +46,7 @@ export const CONFIG = { rpc: { timeout: parseInt(process.env.RPC_TIMEOUT) || 7200000, // 2 hours default (increased for slow custom API) - maxConcurrentEvaluations: parseInt(process.env.MAX_CONCURRENT_EVALUATIONS) || 10 + maxConcurrentRequests: parseInt(process.env.MAX_CONCURRENT_REQUESTS) || 10 }, security: { diff --git a/eval-server/nodejs/src/lib/EvalServer.js b/browser-agent-server/nodejs/src/lib/BrowserAgentServer.js similarity index 92% rename from eval-server/nodejs/src/lib/EvalServer.js rename to browser-agent-server/nodejs/src/lib/BrowserAgentServer.js index 7c87249..f4a43b2 100644 --- a/eval-server/nodejs/src/lib/EvalServer.js +++ b/browser-agent-server/nodejs/src/lib/BrowserAgentServer.js @@ -8,39 +8,39 @@ import { WebSocketServer } from 'ws'; import { ClientManager } from '../client-manager.js'; import { CONFIG, validateConfig } from '../config.js'; -import logger, { logConnection, logEvaluation } from '../logger.js'; +import logger, { logConnection, logRequest } from '../logger.js'; import { RpcClient } from '../rpc-client.js'; /** - * EvalServer - A library for programmatically managing evaluation servers - * + * BrowserAgentServer - OpenAI-compatible HTTP API wrapper for Browser Operator + * * Example usage: * ```js - * const server = new EvalServer({ + * const server = new BrowserAgentServer({ * authKey: 'your-secret-key', * host: '127.0.0.1', * port: 8080 * }); - * + * * server.onConnect(client => { * console.log(`Client connected: ${client.id}`); - * - * client.evaluate({ - * id: "test_eval", - * name: "Bloomberg Eval", - * description: "Test Eval for Bloomberg website", + * + * 
client.execute({ + * id: "test_request", + * name: "Bloomberg Task", + * description: "Navigate to Bloomberg and summarize latest news", * input: { * objective: "Navigate to Bloomberg, summarize and return sentiment of the latest news." * } * }).then(response => { - * console.log('Evaluation response:', response); + * console.log('Request response:', response); * }); * }); - * + * * server.start(); * ``` */ -export class EvalServer extends EventEmitter { +export class BrowserAgentServer extends EventEmitter { constructor(options = {}) { super(); @@ -65,7 +65,7 @@ export class EvalServer extends EventEmitter { } /** - * Start the evaluation server + * Start the browser agent server */ async start() { if (this.isRunning) { @@ -91,14 +91,14 @@ export class EvalServer extends EventEmitter { }); this.isRunning = true; - logger.info(`Evaluation server started on ws://${this.config.host}:${this.config.port}`); + logger.info(`Browser agent server started on ws://${this.config.host}:${this.config.port}`); this.emit('started', { host: this.config.host, port: this.config.port }); return this; } /** - * Stop the evaluation server + * Stop the browser agent server */ async stop() { if (!this.isRunning) { @@ -120,7 +120,7 @@ export class EvalServer extends EventEmitter { this.connectedClients.clear(); this.isRunning = false; - logger.info('Evaluation server stopped'); + logger.info('Browser agent server stopped'); this.emit('stopped'); } @@ -143,8 +143,8 @@ export class EvalServer extends EventEmitter { } /** - * Set the judge for evaluations (optional) - * @param {Judge} judge - Judge instance for evaluation validation + * Set the judge for request validation (optional) + * @param {Judge} judge - Judge instance for request validation */ setJudge(judge) { // If server is already running, validate LLM config when setting judge @@ -154,7 +154,7 @@ export class EvalServer extends EventEmitter { throw new Error(`Cannot set judge: ${configErrors.join(', ')}`); } } - + this.judge = judge; return this; } @@ -280,7 +280,7 @@ export class EvalServer extends EventEmitter { return; } connection.ready = true; - logger.info('Client ready for evaluations', { + logger.info('Client ready for requests', { clientId: connection.clientId }); @@ -575,7 +575,7 @@ export class EvalServer extends EventEmitter { clientId, status: 'accepted', message: result.clientName ? 
`Welcome ${result.clientName}` : 'Client authenticated successfully', - evaluationsCount: result.evaluationsCount, + requestsCount: result.requestsCount, tabId: tabId, isComposite: isComposite }); @@ -604,11 +604,11 @@ export class EvalServer extends EventEmitter { handleStatusUpdate(connection, data) { if (!connection.registered) return; - const { evaluationId, status, progress, message } = data; + const { requestId, status, progress, message } = data; - logger.info('Evaluation status update', { + logger.info('Request status update', { clientId: connection.clientId, - evaluationId, + requestId, status, progress, message @@ -673,21 +673,21 @@ export class EvalServer extends EventEmitter { } /** - * Execute evaluation on a specific client + * Execute request on a specific client */ - async executeEvaluation(connection, evaluation) { + async executeRequest(connection, request) { const startTime = Date.now(); const rpcId = `rpc-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`; try { - logger.info('Starting evaluation', { + logger.info('Starting request', { clientId: connection.clientId, - evaluationId: evaluation.id, - tool: evaluation.tool + requestId: request.id, + tool: request.tool }); - // Prepare model configuration - use client config if available, otherwise evaluation config, otherwise defaults - let modelConfig = evaluation.model || {}; + // Prepare model configuration - use client config if available, otherwise request config, otherwise defaults + let modelConfig = request.model || {}; if (connection.llmConfig) { // New nested format: separate config objects for each model tier @@ -710,7 +710,7 @@ export class EvalServer extends EventEmitter { api_key: connection.llmConfig.apiKey, endpoint: connection.llmConfig.endpoint }, - // Include any evaluation-specific overrides + // Include any request-specific overrides ...modelConfig }; } @@ -720,16 +720,16 @@ export class EvalServer extends EventEmitter { jsonrpc: '2.0', method: 'evaluate', params: { - evaluationId: evaluation.id, - name: evaluation.name, - url: evaluation.target?.url || evaluation.url, - tool: evaluation.tool, - input: evaluation.input, + requestId: request.id, + name: request.name, + url: request.target?.url || request.url, + tool: request.tool, + input: request.input, model: modelConfig, - timeout: evaluation.timeout || 30000, + timeout: request.timeout || 30000, metadata: { - tags: evaluation.metadata?.tags || [], - retries: evaluation.settings?.retry_policy?.max_retries || 0 + tags: request.metadata?.tags || [], + retries: request.settings?.retry_policy?.max_retries || 0 } }, id: rpcId @@ -740,15 +740,15 @@ export class EvalServer extends EventEmitter { connection.ws, 'evaluate', rpcRequest.params, - evaluation.timeout || 45000 + request.timeout || 45000 ); - // Log evaluation - logEvaluation({ - evaluationId: evaluation.id, + // Log request + logRequest({ + requestId: request.id, clientId: connection.clientId, - name: evaluation.name, - tool: evaluation.tool, + name: request.name, + tool: request.tool, response, timestamp: new Date().toISOString(), duration: Date.now() - startTime @@ -757,9 +757,9 @@ export class EvalServer extends EventEmitter { return response; } catch (error) { - logger.error('Evaluation failed', { + logger.error('Request failed', { clientId: connection.clientId, - evaluationId: evaluation.id, + requestId: request.id, error: error.message }); @@ -1180,25 +1180,33 @@ class ClientProxy { } /** - * Execute an evaluation on this client + * Execute a request on this client */ - async 
evaluate(evaluation) { - // Ensure evaluation has required fields - const fullEvaluation = { - id: evaluation.id || `eval-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`, - name: evaluation.name || 'Dynamic Evaluation', - description: evaluation.description || 'Programmatically created evaluation', + async execute(request) { + // Ensure request has required fields + const fullRequest = { + id: request.id || `req-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`, + name: request.name || 'Dynamic Request', + description: request.description || 'Programmatically created request', enabled: true, - tool: evaluation.tool || 'chat', - timeout: evaluation.timeout || 45000, - input: evaluation.input || {}, - model: evaluation.model || {}, - validation: evaluation.validation || { type: 'none' }, - metadata: evaluation.metadata || { tags: ['api', 'dynamic'] }, - ...evaluation + tool: request.tool || 'chat', + timeout: request.timeout || 45000, + input: request.input || {}, + model: request.model || {}, + validation: request.validation || { type: 'none' }, + metadata: request.metadata || { tags: ['api', 'dynamic'] }, + ...request }; - return this.server.executeEvaluation(this.connection, fullEvaluation); + return this.server.executeRequest(this.connection, fullRequest); + } + + /** + * Alias for backward compatibility + * @deprecated Use execute() instead + */ + async evaluate(request) { + return this.execute(request); } /** diff --git a/eval-server/nodejs/src/lib/HTTPWrapper.js b/browser-agent-server/nodejs/src/lib/HTTPWrapper.js similarity index 66% rename from eval-server/nodejs/src/lib/HTTPWrapper.js rename to browser-agent-server/nodejs/src/lib/HTTPWrapper.js index f377690..c3653c5 100644 --- a/eval-server/nodejs/src/lib/HTTPWrapper.js +++ b/browser-agent-server/nodejs/src/lib/HTTPWrapper.js @@ -5,33 +5,33 @@ import { APIServer } from '../api-server.js'; /** - * HTTPWrapper - Optional HTTP API wrapper for EvalServer - * - * This provides an HTTP REST API on top of the core EvalServer, + * HTTPWrapper - Optional HTTP API wrapper for BrowserAgentServer + * + * This provides an HTTP REST API on top of the core BrowserAgentServer, * following the same pattern as the CLI wrapper. 
- * + * * Example usage: * ```js - * import { EvalServer } from './EvalServer.js'; + * import { BrowserAgentServer } from './BrowserAgentServer.js'; * import { HTTPWrapper } from './HTTPWrapper.js'; - * - * const evalServer = new EvalServer({ port: 8080 }); - * const httpWrapper = new HTTPWrapper(evalServer, { port: 8081 }); - * - * await evalServer.start(); + * + * const browserAgentServer = new BrowserAgentServer({ port: 8080 }); + * const httpWrapper = new HTTPWrapper(browserAgentServer, { port: 8081 }); + * + * await browserAgentServer.start(); * await httpWrapper.start(); * ``` */ export class HTTPWrapper { - constructor(evalServer, options = {}) { - this.evalServer = evalServer; + constructor(browserAgentServer, options = {}) { + this.browserAgentServer = browserAgentServer; this.config = { port: options.port || 8081, host: options.host || 'localhost', ...options }; - - this.apiServer = new APIServer(evalServer, this.config.port); + + this.apiServer = new APIServer(browserAgentServer, this.config.port); this.isRunning = false; } @@ -43,8 +43,8 @@ export class HTTPWrapper { throw new Error('HTTP wrapper is already running'); } - if (!this.evalServer.isRunning) { - throw new Error('EvalServer must be started before starting HTTP wrapper'); + if (!this.browserAgentServer.isRunning) { + throw new Error('BrowserAgentServer must be started before starting HTTP wrapper'); } this.apiServer.start(); diff --git a/browser-agent-server/nodejs/src/lib/RequestStack.js b/browser-agent-server/nodejs/src/lib/RequestStack.js new file mode 100644 index 0000000..9eb363b --- /dev/null +++ b/browser-agent-server/nodejs/src/lib/RequestStack.js @@ -0,0 +1,85 @@ +// Copyright 2025 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +/** + * RequestStack - A simple stack-like structure for managing requests + * + * Provides LIFO (Last In, First Out) access to request objects. + * Useful for distributing different requests across multiple client connections. 
+ */ +export class RequestStack { + constructor() { + this.requests = []; + } + + /** + * Add a request to the top of the stack + * @param {Object} request - The request object to add + */ + push(request) { + if (!request || typeof request !== 'object') { + throw new Error('Request must be a valid object'); + } + + // Validate required fields + const requiredFields = ['id', 'name', 'tool', 'input']; + for (const field of requiredFields) { + if (!request[field]) { + throw new Error(`Request missing required field: ${field}`); + } + } + + this.requests.push(request); + } + + /** + * Remove and return the request from the top of the stack + * @returns {Object|null} The request object, or null if stack is empty + */ + pop() { + return this.requests.pop() || null; + } + + /** + * Check if the stack is empty + * @returns {boolean} True if stack has no requests + */ + isEmpty() { + return this.requests.length === 0; + } + + /** + * Get the number of requests in the stack + * @returns {number} The stack size + */ + size() { + return this.requests.length; + } + + /** + * Peek at the top request without removing it + * @returns {Object|null} The top request object, or null if stack is empty + */ + peek() { + if (this.isEmpty()) { + return null; + } + return this.requests[this.requests.length - 1]; + } + + /** + * Clear all requests from the stack + */ + clear() { + this.requests = []; + } + + /** + * Get a copy of all requests in the stack (top to bottom) + * @returns {Array} Array of request objects + */ + toArray() { + return [...this.requests].reverse(); + } +} \ No newline at end of file diff --git a/eval-server/nodejs/src/lib/judges/Judge.js b/browser-agent-server/nodejs/src/lib/judges/Judge.js similarity index 100% rename from eval-server/nodejs/src/lib/judges/Judge.js rename to browser-agent-server/nodejs/src/lib/judges/Judge.js diff --git a/eval-server/nodejs/src/lib/judges/LLMJudge.js b/browser-agent-server/nodejs/src/lib/judges/LLMJudge.js similarity index 100% rename from eval-server/nodejs/src/lib/judges/LLMJudge.js rename to browser-agent-server/nodejs/src/lib/judges/LLMJudge.js diff --git a/eval-server/nodejs/src/logger.js b/browser-agent-server/nodejs/src/logger.js similarity index 59% rename from eval-server/nodejs/src/logger.js rename to browser-agent-server/nodejs/src/logger.js index c935eb9..dba2fe1 100644 --- a/eval-server/nodejs/src/logger.js +++ b/browser-agent-server/nodejs/src/logger.js @@ -32,40 +32,40 @@ const logger = winston.createLogger({ ] }); -// Create dedicated evaluation logger once to avoid recreating on each call -const evaluationLogger = winston.createLogger({ +// Create dedicated request logger once to avoid recreating on each call +const requestLogger = winston.createLogger({ format: winston.format.json(), transports: [ new winston.transports.File({ - filename: `${CONFIG.logging.dir}/evaluations.jsonl` + filename: `${CONFIG.logging.dir}/requests.jsonl` }) ] }); -export function logEvaluation(evaluationData) { +export function logRequest(requestData) { const logEntry = { - type: 'evaluation', + type: 'request', timestamp: new Date().toISOString(), - ...evaluationData + ...requestData }; - - // Pretty print evaluation summary to console + + // Pretty print request summary to console console.log('\n' + '='.repeat(80)); - console.log(`๐Ÿ“Š EVALUATION COMPLETED: ${evaluationData.name}`); + console.log(`๐Ÿ“Š REQUEST COMPLETED: ${requestData.name}`); console.log('='.repeat(80)); - console.log(`๐Ÿ†” ID: ${evaluationData.evaluationId}`); - console.log(`๐Ÿ”ง Tool: 
${evaluationData.tool}`); - console.log(`โฑ๏ธ Duration: ${evaluationData.duration}ms`); - console.log(`๐Ÿ‘ค Client: ${evaluationData.clientId}`); - - if (evaluationData.response?.output?.output) { - console.log(`\n๐Ÿ“ Output:\n${evaluationData.response.output.output}`); + console.log(`๐Ÿ†” ID: ${requestData.requestId}`); + console.log(`๐Ÿ”ง Tool: ${requestData.tool}`); + console.log(`โฑ๏ธ Duration: ${requestData.duration}ms`); + console.log(`๐Ÿ‘ค Client: ${requestData.clientId}`); + + if (requestData.response?.output?.output) { + console.log(`\n๐Ÿ“ Output:\n${requestData.response.output.output}`); } - - if (evaluationData.validation?.result) { - const val = evaluationData.validation.result; + + if (requestData.validation?.result) { + const val = requestData.validation.result; console.log(`\n๐Ÿ“‹ Validation:`); - console.log(` โœ… Passed: ${evaluationData.validation.passed ? 'YES' : 'NO'}`); + console.log(` โœ… Passed: ${requestData.validation.passed ? 'YES' : 'NO'}`); console.log(` ๐Ÿ“Š Overall Score: ${val.overall_score}/10`); if (val.strengths?.length > 0) { console.log(` ๐Ÿ’ช Strengths: ${val.strengths.join(', ')}`); @@ -74,14 +74,25 @@ export function logEvaluation(evaluationData) { console.log(` โš ๏ธ Weaknesses: ${val.weaknesses.join(', ')}`); } } - + console.log('='.repeat(80) + '\n'); - + // Also log structured data for file logs - logger.info('Evaluation completed', logEntry); - - // Also save to dedicated evaluation log - evaluationLogger.info(logEntry); + logger.info('Request completed', logEntry); + + // Also save to dedicated request log + requestLogger.info(logEntry); +} + +// Backward compatibility alias +export function logEvaluation(evaluationData) { + // Map evaluationId to requestId if present + const requestData = { ...evaluationData }; + if (evaluationData.evaluationId && !evaluationData.requestId) { + requestData.requestId = evaluationData.evaluationId; + delete requestData.evaluationId; + } + return logRequest(requestData); } export function logRpcCall(callData) { diff --git a/eval-server/nodejs/src/rpc-client.js b/browser-agent-server/nodejs/src/rpc-client.js similarity index 100% rename from eval-server/nodejs/src/rpc-client.js rename to browser-agent-server/nodejs/src/rpc-client.js diff --git a/eval-server/nodejs/start.js b/browser-agent-server/nodejs/start.js similarity index 66% rename from eval-server/nodejs/start.js rename to browser-agent-server/nodejs/start.js index a3d45bf..9a0bc5f 100644 --- a/eval-server/nodejs/start.js +++ b/browser-agent-server/nodejs/start.js @@ -1,25 +1,25 @@ -import { EvalServer } from "./src/lib/EvalServer.js"; +import { BrowserAgentServer } from "./src/lib/BrowserAgentServer.js"; import { HTTPWrapper } from "./src/lib/HTTPWrapper.js"; const WS_PORT = parseInt(process.env.PORT || "8082"); const HTTP_PORT = parseInt(process.env.API_PORT || "8081"); const HOST = process.env.HOST || "0.0.0.0"; -console.log("๐Ÿ”ง Creating EvalServer..."); -const evalServer = new EvalServer({ +console.log("๐Ÿ”ง Creating BrowserAgentServer..."); +const browserAgentServer = new BrowserAgentServer({ host: HOST, port: WS_PORT }); console.log("๐Ÿ”ง Creating HTTP wrapper..."); -const httpWrapper = new HTTPWrapper(evalServer, { +const httpWrapper = new HTTPWrapper(browserAgentServer, { port: HTTP_PORT, host: HOST }); -console.log("๐Ÿ”ง Starting EvalServer..."); -await evalServer.start(); -console.log(`โœ… EvalServer started on ws://${HOST}:${WS_PORT}`); +console.log("๐Ÿ”ง Starting BrowserAgentServer..."); +await browserAgentServer.start(); 
+console.log(`โœ… BrowserAgentServer started on ws://${HOST}:${WS_PORT}`); console.log("๐Ÿ”ง Starting HTTP wrapper..."); await httpWrapper.start(); diff --git a/eval-server-start.js b/browser-agent-server/start.js similarity index 96% rename from eval-server-start.js rename to browser-agent-server/start.js index bd4c8b8..f678a93 100644 --- a/eval-server-start.js +++ b/browser-agent-server/start.js @@ -1,6 +1,6 @@ #!/usr/bin/env node -// Custom eval-server startup script for Cloud Run +// Custom browser-agent-server startup script for Cloud Run // Uses environment variables for port configuration import { EvalServer } from './src/lib/EvalServer.js'; diff --git a/deployment/cloudrun/service-secrets.yaml b/deployment/cloudrun/service-secrets.yaml index bd585f0..3fa570c 100644 --- a/deployment/cloudrun/service-secrets.yaml +++ b/deployment/cloudrun/service-secrets.yaml @@ -101,7 +101,7 @@ spec: value: "127.0.0.1" # Force new revision - name: DEPLOYMENT_VERSION - value: "v16-eval-server-with-nodejs" + value: "v16-browser-agent-server-with-nodejs" traffic: - percent: 100 latestRevision: true \ No newline at end of file diff --git a/deployment/cloudrun/service.yaml b/deployment/cloudrun/service.yaml index 9a19969..48c6b57 100644 --- a/deployment/cloudrun/service.yaml +++ b/deployment/cloudrun/service.yaml @@ -91,7 +91,7 @@ spec: value: "127.0.0.1" # Force new revision - name: DEPLOYMENT_VERSION - value: "v16-eval-server-with-nodejs" + value: "v16-browser-agent-server-with-nodejs" traffic: - percent: 100 latestRevision: true \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 8acb2e5..008173f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,11 +19,11 @@ services: - "8000:8000" # Enhanced DevTools UI - "8001:8001" - # Eval Server HTTP API + # Browser Agent Server HTTP API - "8080:8080" # WebRTC Neko interface - "8081:8081" - # Eval Server WebSocket + # Browser Agent Server WebSocket - "8082:8082" # WebRTC UDP port range for local development - "57000-57100:57000-57100/udp" diff --git a/eval-server/nodejs/src/lib/EvaluationStack.js b/eval-server/nodejs/src/lib/EvaluationStack.js deleted file mode 100644 index 04d7b36..0000000 --- a/eval-server/nodejs/src/lib/EvaluationStack.js +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2025 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -/** - * EvaluationStack - A simple stack-like structure for managing evaluations - * - * Provides LIFO (Last In, First Out) access to evaluation objects. - * Useful for distributing different evaluations across multiple client connections. 
- */ -export class EvaluationStack { - constructor() { - this.evaluations = []; - } - - /** - * Add an evaluation to the top of the stack - * @param {Object} evaluation - The evaluation object to add - */ - push(evaluation) { - if (!evaluation || typeof evaluation !== 'object') { - throw new Error('Evaluation must be a valid object'); - } - - // Validate required fields - const requiredFields = ['id', 'name', 'tool', 'input']; - for (const field of requiredFields) { - if (!evaluation[field]) { - throw new Error(`Evaluation missing required field: ${field}`); - } - } - - this.evaluations.push(evaluation); - } - - /** - * Remove and return the evaluation from the top of the stack - * @returns {Object|null} The evaluation object, or null if stack is empty - */ - pop() { - return this.evaluations.pop() || null; - } - - /** - * Check if the stack is empty - * @returns {boolean} True if stack has no evaluations - */ - isEmpty() { - return this.evaluations.length === 0; - } - - /** - * Get the number of evaluations in the stack - * @returns {number} The stack size - */ - size() { - return this.evaluations.length; - } - - /** - * Peek at the top evaluation without removing it - * @returns {Object|null} The top evaluation object, or null if stack is empty - */ - peek() { - if (this.isEmpty()) { - return null; - } - return this.evaluations[this.evaluations.length - 1]; - } - - /** - * Clear all evaluations from the stack - */ - clear() { - this.evaluations = []; - } - - /** - * Get a copy of all evaluations in the stack (top to bottom) - * @returns {Array} Array of evaluation objects - */ - toArray() { - return [...this.evaluations].reverse(); - } -} \ No newline at end of file diff --git a/evals/data/action-agent/action-agent-login-001.yaml b/evals/data/action-agent/action-agent-login-001.yaml index 1b705ce..82b7f1b 100644 --- a/evals/data/action-agent/action-agent-login-001.yaml +++ b/evals/data/action-agent/action-agent-login-001.yaml @@ -27,7 +27,7 @@ validation: - "Filled username field with correct value" - "Filled password field with correct value" - "Handled password field type appropriately" - - "Used the provided input_data XML format correctly" + - "Agent used the provided input_data XML format correctly from the request" visual_verification: enabled: true capture_before: true @@ -35,7 +35,6 @@ validation: prompts: - "Verify the username field shows \"tomsmith\" entered" - "Confirm the password field has dots/asterisks indicating password entry" - - "Check that both fields are properly filled before submission" - "Ensure no validation errors are shown for the filled fields" metadata: diff --git a/scripts/test-browser-agent-server.sh b/scripts/test-browser-agent-server.sh new file mode 100755 index 0000000..5e96b12 --- /dev/null +++ b/scripts/test-browser-agent-server.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +echo "๐Ÿงช Testing browser-agent-server startup script..." + +# Build only the browser-agent-server stage +echo "๐Ÿ“ฆ Building browser-agent-server stage..." +docker build \ + --file Dockerfile.cloudrun \ + --target browser-agent-server-builder \ + -t browser-agent-server-test \ + . + +echo "โœ… Build successful!" 
+echo "" +echo "๐Ÿ“‚ Contents of /browser-agent-server:" +docker run --rm browser-agent-server-test ls -la /browser-agent-server + +echo "" +echo "๐Ÿ“„ Checking package.json:" +docker run --rm browser-agent-server-test cat /browser-agent-server/package.json | grep '"type"' + +echo "" +echo "๐Ÿ” Checking if node_modules exist:" +docker run --rm browser-agent-server-test ls -la /browser-agent-server/node_modules | head -5 + +echo "" +echo "โœ… All checks passed! Eval-server build is working." +echo "" +echo "Next: Test the full image with 'docker build -f Dockerfile.cloudrun -t kernel-browser:cloudrun-test .'" diff --git a/scripts/test-eval-server.sh b/scripts/test-eval-server.sh deleted file mode 100755 index a7d9569..0000000 --- a/scripts/test-eval-server.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -set -e - -echo "๐Ÿงช Testing eval-server startup script..." - -# Build only the eval-server stage -echo "๐Ÿ“ฆ Building eval-server stage..." -docker build \ - --file Dockerfile.cloudrun \ - --target eval-server-builder \ - -t eval-server-test \ - . - -echo "โœ… Build successful!" -echo "" -echo "๐Ÿ“‚ Contents of /eval-server:" -docker run --rm eval-server-test ls -la /eval-server - -echo "" -echo "๐Ÿ“„ Checking package.json:" -docker run --rm eval-server-test cat /eval-server/package.json | grep '"type"' - -echo "" -echo "๐Ÿ” Checking if node_modules exist:" -docker run --rm eval-server-test ls -la /eval-server/node_modules | head -5 - -echo "" -echo "โœ… All checks passed! Eval-server build is working." -echo "" -echo "Next: Test the full image with 'docker build -f Dockerfile.cloudrun -t kernel-browser:cloudrun-test .'" diff --git a/supervisor/services-cloudrun/eval-server.conf b/supervisor/services-cloudrun/browser-agent-server.conf similarity index 55% rename from supervisor/services-cloudrun/eval-server.conf rename to supervisor/services-cloudrun/browser-agent-server.conf index b605e98..2748f70 100644 --- a/supervisor/services-cloudrun/eval-server.conf +++ b/supervisor/services-cloudrun/browser-agent-server.conf @@ -1,11 +1,11 @@ -[program:eval-server] -command=/usr/bin/node /opt/eval-server/start-cloudrun.js -directory=/opt/eval-server +[program:browser-agent-server] +command=/usr/bin/node /opt/browser-agent-server/start-cloudrun.js +directory=/opt/browser-agent-server autostart=true autorestart=true startsecs=5 priority=25 -stdout_logfile=/var/log/supervisord/eval-server/eval-server.log +stdout_logfile=/var/log/supervisord/browser-agent-server/browser-agent-server.log stdout_logfile_maxbytes=50MB redirect_stderr=true environment=HOME="/home/kernel",USER="kernel",NODE_ENV="production",EVAL_SERVER_WS_PORT="8082",EVAL_SERVER_HTTP_PORT="8083",EVAL_SERVER_HOST="127.0.0.1" diff --git a/supervisor/services/browser-agent-server.conf b/supervisor/services/browser-agent-server.conf new file mode 100644 index 0000000..4e544ac --- /dev/null +++ b/supervisor/services/browser-agent-server.conf @@ -0,0 +1,8 @@ +[program:browser-agent-server] +command=/usr/local/bin/start-browser-agent-server.sh +autostart=true +autorestart=true +stdout_logfile=/var/log/supervisor/browser-agent-server.log +stderr_logfile=/var/log/supervisor/browser-agent-server.error.log +environment=NODE_ENV="production",PORT="8082",API_PORT="8080",HOST="0.0.0.0",CDP_PORT="9223" +priority=30 \ No newline at end of file diff --git a/supervisor/services/eval-server.conf b/supervisor/services/eval-server.conf deleted file mode 100644 index c35a1f0..0000000 --- a/supervisor/services/eval-server.conf +++ /dev/null @@ -1,8 +0,0 @@ 
-[program:eval-server] -command=/usr/local/bin/start-eval-server.sh -autostart=true -autorestart=true -stdout_logfile=/var/log/supervisor/eval-server.log -stderr_logfile=/var/log/supervisor/eval-server.error.log -environment=NODE_ENV="production",PORT="8082",API_PORT="8080",HOST="0.0.0.0",CDP_PORT="9223" -priority=30 \ No newline at end of file From 3044f90ba12e4cb4644c7eff99144c3697609032 Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Tue, 21 Oct 2025 21:53:52 -0500 Subject: [PATCH 22/24] Cleanup config --- browser-agent-server/.gitignore | 7 +++++++ browser-agent-server/nodejs/src/client-manager.js | 9 --------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/browser-agent-server/.gitignore b/browser-agent-server/.gitignore index 78e7d64..a1e6add 100644 --- a/browser-agent-server/.gitignore +++ b/browser-agent-server/.gitignore @@ -1,3 +1,10 @@ .env node_modules *.log + +# Exclude all client configuration files +nodejs/clients/*.yaml +nodejs/clients/*.yml + +# But keep the example file +!nodejs/clients/example-client.yaml diff --git a/browser-agent-server/nodejs/src/client-manager.js b/browser-agent-server/nodejs/src/client-manager.js index b0d4ca6..3c101c6 100644 --- a/browser-agent-server/nodejs/src/client-manager.js +++ b/browser-agent-server/nodejs/src/client-manager.js @@ -131,15 +131,6 @@ class ClientManager { name: clientName, secret_key: secretKey, description: `Auto-generated DevTools client` - }, - settings: { - max_concurrent_requests: 3, - default_timeout: 45000, - retry_policy: { - max_retries: 2, - backoff_multiplier: 2, - initial_delay: 1000 - } } }; From bd5061e9bb6f456aefd865e2164cc02e9f2fc4bc Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Tue, 21 Oct 2025 21:56:36 -0500 Subject: [PATCH 23/24] Example configs --- browser-agent-server/nodejs/clients/README.md | 41 +++++++++++++++++++ .../nodejs/clients/example-client.yaml | 15 +++++++ 2 files changed, 56 insertions(+) create mode 100644 browser-agent-server/nodejs/clients/README.md create mode 100644 browser-agent-server/nodejs/clients/example-client.yaml diff --git a/browser-agent-server/nodejs/clients/README.md b/browser-agent-server/nodejs/clients/README.md new file mode 100644 index 0000000..5c00064 --- /dev/null +++ b/browser-agent-server/nodejs/clients/README.md @@ -0,0 +1,41 @@ +# Client Configurations + +This directory contains client configuration files for the Browser Agent Server. + +## File Naming Convention + +Client configuration files **must** be named using the client's UUID: + +``` +{client-uuid}.yaml +``` + +Example: +``` +9907fd8d-92a8-4a6a-bce9-458ec8c57306.yaml +``` + +## Creating a New Client + +1. Copy `example-client.yaml` to a new file with your client's UUID: + ```bash + cp example-client.yaml {your-client-uuid}.yaml + ``` + +2. Edit the new file and update: + - `client.id` - Must match the filename (without .yaml extension) + - `client.name` - A friendly name for the client + - `client.secret_key` - Authentication secret (default: "hello") + - `client.description` - Description of the client + +## Auto-Discovery + +The server automatically discovers and loads all `.yaml` files in this directory on startup. The filename (without extension) is used as the client ID. + +## Security Note + +Client configuration files contain secret keys and are excluded from version control via `.gitignore`. Keep these files secure and never commit them to the repository. + +## Example Configuration + +See `example-client.yaml` for a template configuration file. 
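(Editorial aside, not part of the patch: the naming convention and auto-discovery behaviour described in the clients/README.md added above can be illustrated with a short sketch. This is not the actual client-manager.js implementation; `loadClientConfigs` is a hypothetical helper, and the use of the js-yaml package is an assumption about the parser in use.)

```js
// Illustrative sketch only -- assumes js-yaml and the {client-uuid}.yaml convention
// documented in browser-agent-server/nodejs/clients/README.md.
import fs from 'node:fs';
import path from 'node:path';
import yaml from 'js-yaml';

export function loadClientConfigs(clientsDir) {
  const clients = new Map();
  for (const file of fs.readdirSync(clientsDir)) {
    if (!file.endsWith('.yaml') && !file.endsWith('.yml')) continue;
    if (file === 'example-client.yaml') continue; // template, not a real client

    // The filename (without extension) is the client ID and must match client.id.
    const clientId = path.basename(file, path.extname(file));
    const config = yaml.load(fs.readFileSync(path.join(clientsDir, file), 'utf8'));

    if (config?.client?.id !== clientId) {
      throw new Error(`Client id mismatch in ${file}: expected ${clientId}, got ${config?.client?.id}`);
    }
    clients.set(clientId, config.client);
  }
  return clients;
}
```

A mismatch between filename and `client.id` is the most common way discovery silently fails, which is why the README insists the two must match.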
diff --git a/browser-agent-server/nodejs/clients/example-client.yaml b/browser-agent-server/nodejs/clients/example-client.yaml new file mode 100644 index 0000000..3356598 --- /dev/null +++ b/browser-agent-server/nodejs/clients/example-client.yaml @@ -0,0 +1,15 @@ +# Example Client Configuration +# +# To create a new client: +# 1. Copy this file and rename it to: {client-uuid}.yaml +# Example: 9907fd8d-92a8-4a6a-bce9-458ec8c57306.yaml +# 2. Update the client ID to match the filename +# 3. Set a secret key for authentication +# +# The filename MUST match the client ID for proper client discovery. + +client: + id: YOUR-CLIENT-UUID-HERE + name: My DevTools Client + secret_key: your-secret-key-here + description: Browser Operator DevTools client From 29081c5e25bf35ac9b53f702f6f231f65bff83d9 Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Wed, 22 Oct 2025 12:23:13 -0500 Subject: [PATCH 24/24] Fix the old ./src/lib/EvalServer.js reference --- browser-agent-server/start.js | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/browser-agent-server/start.js b/browser-agent-server/start.js index f678a93..159fc11 100644 --- a/browser-agent-server/start.js +++ b/browser-agent-server/start.js @@ -3,29 +3,29 @@ // Custom browser-agent-server startup script for Cloud Run // Uses environment variables for port configuration -import { EvalServer } from './src/lib/EvalServer.js'; +import { BrowserAgentServer } from './src/lib/BrowserAgentServer.js'; import { HTTPWrapper } from './src/lib/HTTPWrapper.js'; const WS_PORT = parseInt(process.env.EVAL_SERVER_WS_PORT || '8082'); const HTTP_PORT = parseInt(process.env.EVAL_SERVER_HTTP_PORT || '8083'); const HOST = process.env.EVAL_SERVER_HOST || '127.0.0.1'; -console.log('๐Ÿ”ง Creating EvalServer...'); -const evalServer = new EvalServer({ +console.log('๐Ÿ”ง Creating BrowserAgentServer...'); +const browserAgentServer = new BrowserAgentServer({ // No authKey - authentication disabled for automated mode host: HOST, port: WS_PORT }); console.log('๐Ÿ”ง Creating HTTP wrapper...'); -const httpWrapper = new HTTPWrapper(evalServer, { +const httpWrapper = new HTTPWrapper(browserAgentServer, { port: HTTP_PORT, host: HOST }); -console.log('๐Ÿ”ง Starting EvalServer...'); -await evalServer.start(); -console.log(`โœ… EvalServer started on ws://${HOST}:${WS_PORT}`); +console.log('๐Ÿ”ง Starting BrowserAgentServer...'); +await browserAgentServer.start(); +console.log(`โœ… BrowserAgentServer started on ws://${HOST}:${WS_PORT}`); console.log('๐Ÿ”ง Starting HTTP wrapper...'); await httpWrapper.start(); @@ -38,8 +38,8 @@ console.log(' Auth: Disabled (automated mode)'); // Add periodic status check setInterval(() => { - const evalServerStatus = evalServer.getStatus(); + const browserAgentServerStatus = browserAgentServer.getStatus(); const httpWrapperStatus = httpWrapper.getStatus(); - console.log(`๐Ÿ“Š EvalServer: ${evalServerStatus.connectedClients} clients, ${evalServerStatus.readyClients} ready`); + console.log(`๐Ÿ“Š BrowserAgentServer: ${browserAgentServerStatus.connectedClients} clients, ${browserAgentServerStatus.readyClients} ready`); console.log(`๐Ÿ“Š HTTP API: ${httpWrapperStatus.isRunning ? 'running' : 'stopped'} on ${httpWrapperStatus.url}`); }, 30000);
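(Editorial aside, not part of the patch: a minimal consumer of the renamed library, assembled from the JSDoc examples in BrowserAgentServer.js and HTTPWrapper.js above. Import paths assume the script sits next to nodejs/start.js; the ports and the objective text are placeholders.)

```js
// Minimal usage sketch mirroring the JSDoc in BrowserAgentServer.js / HTTPWrapper.js.
import { BrowserAgentServer } from './src/lib/BrowserAgentServer.js';
import { HTTPWrapper } from './src/lib/HTTPWrapper.js';

const server = new BrowserAgentServer({ host: '127.0.0.1', port: 8082 });
const http = new HTTPWrapper(server, { host: '127.0.0.1', port: 8081 });

server.onConnect(client => {
  // Dispatch a request to the newly connected DevTools client; execute() fills in
  // defaults (generated id, tool: 'chat', 45s timeout) for any fields left out.
  client.execute({
    name: 'Smoke test',
    input: { objective: 'Open example.com and report the page title.' }
  }).then(response => console.log('Request response:', response));
});

await server.start();  // WebSocket endpoint for DevTools clients
await http.start();    // HTTP REST API layered on top of the WebSocket server
```

Note the start order: HTTPWrapper.start() throws if the underlying BrowserAgentServer is not already running, which is why start.js (and this sketch) start the WebSocket server first.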