Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 143 additions & 0 deletions scripts/test-modal-tools.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#!/usr/bin/env node
/**
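* Modal sandbox tools — self-tests that never trigger a paid call.
*
* Runs without making any paid x402 calls. Section 6 does contact the live
* gateway, but relies only on free 400/402 responses. Covers: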
* 1. Tool registration: all 4 ModalXxx capabilities exist in `allCapabilities`.
* 2. Hidden by default: none of them appear in CORE_TOOL_NAMES.
* 3. Schema sanity: each spec has the expected required/optional fields.
* 4. normalizeCommand behavior (string ↔ array, invalid).
* 5. SessionSandboxTracker add / remove / drain semantics.
* 6. Gateway contract: re-runs the probe curls (free 400/402 responses)
* to verify the endpoint surface still matches the implementation
* (catches gateway-side breaking changes early).
*
* Run: node scripts/test-modal-tools.mjs
* Exit 0 = pass, exit 1 = fail.
*/

import { allCapabilities } from '../dist/tools/index.js';
import { CORE_TOOL_NAMES } from '../dist/tools/tool-categories.js';
import { sessionSandboxTracker } from '../dist/tools/modal.js';

let failures = 0;
function check(name, ok, detail = '') {
const mark = ok ? '\x1b[32m✓\x1b[0m' : '\x1b[31m✗\x1b[0m';
console.log(` ${mark} ${name}${detail ? ' — ' + detail : ''}`);
if (!ok) failures++;
}

// [1] Each expected Modal tool name must appear among the registered capability specs.
console.log('\n[1] Tool registration');
const expected = ['ModalCreate', 'ModalExec', 'ModalStatus', 'ModalTerminate'];
const registered = new Set();
for (const capability of allCapabilities) {
  registered.add(capability.spec.name);
}
expected.forEach((toolName) => {
  check(`${toolName} registered`, registered.has(toolName));
});

// [2] None of the Modal tool names may be listed in CORE_TOOL_NAMES.
console.log('\n[2] Hidden by default (must require ActivateTool)');
expected.forEach((toolName) => {
  const hidden = !CORE_TOOL_NAMES.has(toolName);
  check(`${toolName} NOT in CORE_TOOL_NAMES`, hidden);
});

console.log('\n[3] Schema sanity');

// Look up a capability by its spec name; yields undefined when it is not registered.
const findCapability = (toolName) => allCapabilities.find((cap) => cap.spec.name === toolName);

const create = findCapability('ModalCreate');
const exec = findCapability('ModalExec');
const status = findCapability('ModalStatus');
const terminate = findCapability('ModalTerminate');

// Small predicates shared by the checks below. Each one short-circuits on a
// missing capability so the check is reported as failed instead of throwing.
const requiresOnlySandboxId = (cap) =>
  cap && JSON.stringify(cap.spec.input_schema.required) === '["sandbox_id"]';
const isSerial = (cap) => cap && cap.concurrent === false;

const createPropNames = ['gpu', 'timeout', 'cpu', 'memory'];
check(
  'ModalCreate has gpu/timeout/cpu/memory props',
  create && createPropNames.every((key) => key in create.spec.input_schema.properties),
);
check(
  'ModalCreate has NO required fields',
  create && (!create.spec.input_schema.required || create.spec.input_schema.required.length === 0),
);
check(
  'ModalExec requires sandbox_id + command',
  exec &&
    exec.spec.input_schema.required?.includes('sandbox_id') &&
    exec.spec.input_schema.required?.includes('command'),
);
check('ModalStatus requires sandbox_id only', requiresOnlySandboxId(status));
check('ModalTerminate requires sandbox_id only', requiresOnlySandboxId(terminate));
check('ModalCreate concurrent=false (high-cost, must be serial)', isSerial(create));
check('ModalExec concurrent=false (writes shared sandbox state)', isSerial(exec));

console.log('\n[4] normalizeCommand — internal, exercised via ModalExec.execute');
// We can't easily import the un-exported helper, but ModalExec returns
// a clear error message on bad command, so probe via its public surface.
const stubScope = {
  workingDir: '/tmp',
  abortSignal: new AbortController().signal,
  onAskUser: undefined,
};
// Inputs whose `command` must be rejected by ModalExec.
const badCases = [
  { input: { sandbox_id: 'x', command: '' }, label: 'empty string command rejected' },
  { input: { sandbox_id: 'x', command: [] }, label: 'empty array command rejected' },
  { input: { sandbox_id: 'x', command: [1, 2] }, label: 'non-string array command rejected' },
  { input: { sandbox_id: 'x', command: null }, label: 'null command rejected' },
  { input: { sandbox_id: 'x' }, label: 'missing command rejected' },
];
for (const c of badCases) {
  let ok = false;
  let detail = '';
  if (!exec) {
    // Section [1] already reports the missing registration; record a failure
    // here too instead of crashing on `undefined.execute`.
    detail = 'ModalExec is not registered';
  } else {
    try {
      const r = await exec.execute(c.input, stubScope);
      ok = r.isError === true && /invalid command|command is required|expected/i.test(r.output);
    } catch (err) {
      // A throw/rejection is a failed check, not a reason to abort the whole
      // run before the remaining sections and the final summary.
      detail = `execute threw: ${err instanceof Error ? err.message : String(err)}`;
    }
  }
  check(c.label, ok, detail);
}

console.log('\n[5] SessionSandboxTracker semantics');
// Empty the tracker first so the counts asserted below start from zero.
sessionSandboxTracker.drainIds();

// Register two sandboxes, then confirm both are listed.
for (const [id, gpu] of [['sbx_a', 'cpu'], ['sbx_b', 'T4']]) {
  sessionSandboxTracker.add({ id, gpu, createdAt: Date.now() });
}
const afterAdd = sessionSandboxTracker.list();
check('add registers 2 sandboxes', afterAdd.length === 2);

// Removing one id must leave exactly one entry.
sessionSandboxTracker.remove('sbx_a');
const afterRemove = sessionSandboxTracker.list();
check('remove drops one', afterRemove.length === 1);

// drainIds must hand back the remaining id and leave the tracker empty.
const drained = sessionSandboxTracker.drainIds();
const onlyRemainingReturned = drained.length === 1 && drained[0] === 'sbx_b';
check('drainIds returns remaining ids', onlyRemainingReturned);
const afterDrain = sessionSandboxTracker.list();
check('drainIds clears the tracker', afterDrain.length === 0);

console.log('\n[6] Gateway contract probe (live, free — relies on 400/402 responses)');
const BASE = 'https://blockrun.ai/api';
async function probe(path, body, expectStatus, validator) {
const ctrl = new AbortController();
const t = setTimeout(() => ctrl.abort(), 8000);
try {
const r = await fetch(`${BASE}${path}`, {
method: 'POST',
signal: ctrl.signal,
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(body),
});
const text = await r.text().catch(() => '');
let json = {};
try { json = JSON.parse(text); } catch { /* ignore */ }
const ok = r.status === expectStatus && (validator ? validator(json) : true);
return { ok, status: r.status, json };
} catch (err) {
return { ok: false, status: 0, error: err.message };
} finally {
clearTimeout(t);
}
}

// Build the detail text for a probe result. When the request never produced a
// response (status 0), include the transport error so a timeout or DNS failure
// is distinguishable from an unexpected HTTP status.
const probeDetail = (result) =>
  result.error
    ? `got status ${result.status} (${result.error})`
    : `got status ${result.status}`;

// One entry per gateway expectation: request, expected status, optional body
// validator, and the label printed for the check.
const gatewayCases = [
  {
    label: 'create endpoint reachable, CPU price still $0.01',
    path: '/v1/modal/sandbox/create',
    body: {},
    status: 402,
    validator: j => j.price?.amount === '0.0100',
  },
  {
    label: 'H100 price still $0.40',
    path: '/v1/modal/sandbox/create',
    body: { gpu: 'H100' },
    status: 402,
    validator: j => j.price?.amount === '0.4000',
  },
  {
    label: 'invalid gpu rejected with 400',
    path: '/v1/modal/sandbox/create',
    body: { gpu: 'invalid_xxx' },
    status: 400,
  },
  {
    label: 'exec still requires sandbox_id + command-as-array',
    path: '/v1/modal/sandbox/exec',
    body: {},
    status: 400,
    validator: j =>
      j.details?.some(d => d.path?.[0] === 'sandbox_id') &&
      j.details?.some(d => d.path?.[0] === 'command' && d.expected === 'array'),
  },
  {
    label: 'status requires sandbox_id',
    path: '/v1/modal/sandbox/status',
    body: {},
    status: 400,
  },
  {
    label: 'terminate requires sandbox_id',
    path: '/v1/modal/sandbox/terminate',
    body: {},
    status: 400,
  },
  {
    label: 'image still locked to python:3.11',
    path: '/v1/modal/sandbox/create',
    body: { image: 'python:3.12' },
    status: 400,
    validator: j => j.details?.some(d => /python:3\.11/i.test(String(d.message ?? ''))),
  },
];

// Probes run one after another so the output order matches the table order.
for (const g of gatewayCases) {
  const result = await probe(g.path, g.body, g.status, g.validator);
  check(g.label, result.ok, probeDetail(result));
}

// Summary line, then exit 0 only when every check passed.
console.log(`\n${failures === 0 ? '\x1b[32mAll checks passed.\x1b[0m' : `\x1b[31m${failures} check(s) failed.\x1b[0m`}\n`);
process.exit(failures === 0 ? 0 : 1);
18 changes: 16 additions & 2 deletions src/agent/tool-guard.ts
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,23 @@ export class SessionToolGuard {
invocation: CapabilityInvocation,
scope: ExecutionScope
): Promise<CapabilityResult | null> {
// Hard-block tools that have failed too many times this session
// Hard-block tools that have failed too many times this session.
// Modal lifecycle tools are exempt: orphan sandboxes keep billing
// GPU time, and ModalTerminate is the only way to recover from
// agent-side. Auto-disabling it after 3 transient errors would
// strand a $0.40/hr H100 until the session ends. Same logic for
// media-gen tools: failures are usually transient (gateway hiccup,
// prompt rejection) and the user often wants to retry.
const FAILURE_EXEMPT = new Set([
'ImageGen',
'VideoGen',
'ModalCreate',
'ModalExec',
'ModalStatus',
'ModalTerminate',
]);
const errorCount = this.toolErrorCounts.get(invocation.name) ?? 0;
if (errorCount >= 3) {
if (errorCount >= 3 && !FAILURE_EXEMPT.has(invocation.name)) {
return {
output: `${invocation.name} has failed ${errorCount} times this session and is now disabled. ` +
'Tell the user what went wrong and suggest alternatives.',
Expand Down
41 changes: 41 additions & 0 deletions src/stats/insights.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,25 @@ export interface InsightsReport {
avgRequestCostUsd: number;
/** Efficiency: cost per 1K tokens */
costPer1KTokens: number;
/**
* Cost breakdown by capability category. Lets the UI show a clean
* "where did your USDC go" split alongside the per-model bar list.
* - chat: LLM token-billed calls (anything with non-zero tokens)
* - media: ImageGen / VideoGen / MusicGen (per_image / per_second / per_track)
* - sandbox: Modal GPU sandbox lifecycle (create / exec / status / terminate)
*
* Categorization rules, applied in this order (first match wins):
* - `modal/*` → sandbox
* - rows with 0 input + 0 output tokens → media (image/video/music are
* stored with 0 tokens by recordUsage; modal/* matches first)
* - everything else → chat
*/
byCategory: {
chatCostUsd: number;
mediaCostUsd: number;
sandboxCostUsd: number;
sandboxRequests: number;
};
}

// ─── Generate Report ──────────────────────────────────────────────────────
Expand All @@ -72,6 +91,11 @@ export function generateInsights(days = 30): InsightsReport {
let totalCost = 0;
let totalInput = 0;
let totalOutput = 0;
// Category totals — see InsightsReport.byCategory doc.
let chatCost = 0;
let mediaCost = 0;
let sandboxCost = 0;
let sandboxRequests = 0;
const modelAgg = new Map<string, {
requests: number;
costUsd: number;
Expand All @@ -85,6 +109,17 @@ export function generateInsights(days = 30): InsightsReport {
totalInput += r.inputTokens;
totalOutput += r.outputTokens;

// Categorize: modal/* always goes to sandbox; zero-token entries are
// media (image/video/music recordUsage stores 0/0 tokens); rest = chat.
if (r.model.startsWith('modal/')) {
sandboxCost += r.costUsd;
sandboxRequests++;
} else if ((r.inputTokens + r.outputTokens) === 0) {
mediaCost += r.costUsd;
} else {
chatCost += r.costUsd;
}

const existing = modelAgg.get(r.model) ?? {
requests: 0,
costUsd: 0,
Expand Down Expand Up @@ -164,6 +199,12 @@ export function generateInsights(days = 30): InsightsReport {
projections,
avgRequestCostUsd,
costPer1KTokens,
byCategory: {
chatCostUsd: chatCost,
mediaCostUsd: mediaCost,
sandboxCostUsd: sandboxCost,
sandboxRequests,
},
};
}

Expand Down
6 changes: 6 additions & 0 deletions src/tools/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import {
defiLlamaYieldsCapability,
defiLlamaPriceCapability,
} from './defillama.js';
import { modalCapabilities } from './modal.js';
import { createTradingCapabilities } from './trading-execute.js';
import { Portfolio } from '../trading/portfolio.js';
import { RiskEngine } from '../trading/risk.js';
Expand Down Expand Up @@ -180,6 +181,11 @@ export const allCapabilities: CapabilityHandler[] = [
defiLlamaChainsCapability,
defiLlamaYieldsCapability,
defiLlamaPriceCapability,
// Modal GPU sandbox tools — registered but hidden by default (not in
// CORE_TOOL_NAMES). Agent must `ActivateTool({names:["ModalCreate",...]})`
// before they appear in its tool inventory. High-cost ($0.40/H100 create)
// operations should not be in the default surface.
...modalCapabilities, // ModalCreate, ModalExec, ModalStatus, ModalTerminate
];

export {
Expand Down
Loading
Loading