From 0ad89f40ec39692ce2ebd0d742b6b1b6799a3b16 Mon Sep 17 00:00:00 2001 From: Burak Yigit Kaya Date: Tue, 19 May 2026 17:15:31 +0000 Subject: [PATCH] feat: improve recall tool description + add cross-session cue eval scenarios MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrite RECALL_TOOL_DESCRIPTION with dual-trigger structure so the LLM uses recall at layer 0 (early session) when users reference past sessions: (1) Cross-session references — explicit cue phrases like 'last time', 'we discussed', 'earlier', 'remember'. Prior sessions are never in context. (2) Missing details — file paths, decisions, preferences not visible in the current window. Extend the eval suite to test cross-session recall trigger sensitivity: - Add x-lore-recall-invoked response header to non-streaming recall paths - Add RECALL_TRIGGER scoring criterion and crossSessionCueRecall rubric - Pass recallInvoked metadata through judge for recall_trigger scoring - Add 8 new MSR-1 questions using conversational cross-session cues (msr1-q13 through msr1-q20) --- .lore.md | 18 ++-- packages/core/eval/harness.ts | 11 ++- packages/core/eval/judge.ts | 38 +++++++- .../eval/scenarios/multi-session-recall.ts | 93 ++++++++++++++++++- packages/core/src/recall.ts | 2 +- packages/gateway/src/pipeline.ts | 22 ++++- 6 files changed, 168 insertions(+), 16 deletions(-) diff --git a/.lore.md b/.lore.md index 7aaa985..7c6c500 100644 --- a/.lore.md +++ b/.lore.md @@ -34,7 +34,7 @@ * **git remote -v in hosted gateway — skip when header present, never run with client-controlled cwd**: \`LORE\_HOSTED\_MODE=1\` makes all FS-touching functions no-op: \`getGitRemote()\` returns null, \`config.load()\` skips \`.lore.json\`, agents-file/lat-reader/knowledge-watcher are no-ops. Activation: \`lore start\` (headless) enables hosted mode by default; opt-out via \`--local\` or \`LORE\_HOSTED\_MODE=0\`. \`lore run\` is always local. 
Flag set in \`initIfNeeded()\` from \`GatewayConfig.hostedMode\`. Never run \`git remote -v\` with client-controlled cwd. \`LORE\_REMOTE\_URL\` + local CLI: \`lore run\`/\`lore start\` skips local gateway and proxies to remote. Local CLI injects \`X-Lore-Git-Remote\`; remote gateway trusts it. CLI-less/SaaS: \`ANTHROPIC\_CUSTOM\_HEADERS\` requires a local \`lore\` CLI process — pure SaaS alternative not yet implemented. -* **LTM cache delete must be inside the 'changes made' guard in curator.ts**: Curator/recall path bugs: (1) \`ltmSessionCache.delete(sessionId)\` must be inside \`if (changesApplied)\` guard in curator.ts — unconditional placement forces expensive LTM rebuilds on every no-op run. (2) Recall follow-up requests must set \`cacheConversation: false\` — otherwise modified message array triggers full cache write at 5m TTL pricing. (3) Non-streaming recall follow-up path must NOT re-issue the upstream request — capture response body once to prevent double token cost and double cache prime. Strip \`recall\` from tools list to prevent re-invocation; convert \`tool\_use\`/\`tool\_result\` pair to plain text blocks. Thinking blocks must be preserved in assistant messages when extended thinking is enabled. +* **LTM cache delete must be inside the 'changes made' guard in curator.ts**: Curator/recall path bugs: (1) \`ltmSessionCache.delete(sessionId)\` must be inside \`if (changesApplied)\` guard in curator.ts — unconditional placement forces expensive LTM rebuilds on every no-op run. (2) Recall follow-up requests must set \`cacheConversation: false\` — otherwise modified message array triggers full cache write at 5m TTL pricing. (3) Non-streaming recall follow-up path must NOT re-issue the upstream request — capture response body once to prevent double token cost. Strip \`recall\` from tools list to prevent re-invocation; convert \`tool\_use\`/\`tool\_result\` pair to plain text blocks. 
Thinking blocks must be preserved in assistant messages when extended thinking is enabled. * **OpenAI/Responses API upstreams don't receive LTM — req.system passed through unchanged**: OpenAI/Responses API upstreams don't receive LTM injection — \`req.system\` is passed through unchanged. Only the Anthropic path in \`packages/gateway/src/pipeline.ts\` injects LTM into the system prompt. Sessions using OpenAI-protocol upstreams get no knowledge context. Fix: apply the same LTM injection logic to all upstream paths before forwarding. The LTM 3-block system prompt (stable preferences at 1h TTL, context-bound at 5m TTL) is Anthropic-only and must be adapted for other protocols. @@ -52,7 +52,7 @@ * **splitSegments() infinite recursion on oversized single messages**: splitSegments() infinite recursion on oversized single messages: In \`packages/core/src/distillation.ts\`, \`splitSegments()\` recurses infinitely when a single message exceeds \`maxSegmentTokens\` (16384). \`findSplitIndex()\` returns \`messages.length\` (=1), so \`left = messages.slice(0, 1)\` produces an identical recursive call. Triggered on large tool outputs (~49KB+). Fix: add base case after the \`totalTokens <= maxTokens\` guard — \`if (messages.length <= 1) return \[messages]\`. The oversized message becomes an indivisible segment. -* **TTL downgrade hysteresis: downgradeStreak field prevents compounding cache busts**: Auto-TTL downgrade hysteresis in \`packages/gateway/src/pipeline.ts\`: downgrade from 1h→5m TTL requires 3 consecutive short-gap turns (\`ttlDowngradeStreak\` in \`SessionState\`). Block downgrade if >50% of session tokens are cached. Reset streak on any long-gap turn. Subagent turns and tool-use continuations excluded from gap recording — capture \`prevStopReason\` before line 1667 overwrites it, skip when \`prevStopReason === 'tool\_use'\` or \`isSubagentTurn\`. State persistence tiers: (1) Immediate — session identity fields on mutation. 
(2) Per-turn — cost snapshot piggybacked on \`saveSessionTracking\` in \`postResponse\`. (3) 30s periodic — gradient EMAs and cache warming state via dirty flag + idle scheduler. Max data loss on crash: ~30s of gradient/warmup state. +* **TTL downgrade hysteresis: downgradeStreak field prevents compounding cache busts**: Auto-TTL downgrade hysteresis in \`packages/gateway/src/pipeline.ts\`: downgrade from 1h→5m TTL requires 3 consecutive short-gap turns (\`ttlDowngradeStreak\` in \`SessionState\`). Block downgrade if >50% of session tokens are cached. Reset streak on any long-gap turn. Subagent turns and tool-use continuations excluded from gap recording — capture \`prevStopReason\` before line 1667 overwrites it, skip when \`prevStopReason === 'tool\_use'\` or \`isSubagentTurn\`. State persistence: immediate (session identity), per-turn (cost snapshot), 30s periodic (gradient EMAs + cache warming via dirty flag). Max data loss on crash: ~30s. * **Upgrade lock double-acquisition bug: same process re-locks same file**: In \`packages/gateway/src/cli/lib/binary.ts\`, \`downloadBinaryToTemp()\` acquires a lock on \`\.lock\` and holds it. Then \`installBinary()\` computes the same install path and tries to \`acquireLock()\` again. \`handleExistingLock()\` only allows re-entry if \`existingPid === process.ppid\` (parent), but the lock was written by the same process (\`existingPid === process.pid\`), so it throws 'Another upgrade is already in progress'. Fix: in \`handleExistingLock\`, also allow re-entry when \`existingPid === process.pid\`. Double \`releaseLock()\` is safe — \`releaseLock\` swallows errors so the second call is a no-op after the file is deleted. @@ -70,17 +70,23 @@ ### Preference + +* **Always analyze root causes before proposing solutions, with explicit enumerated failure points**: When the user identifies a problem, they enumerate specific failure points explicitly and numbered before designing solutions. 
Mirror this structure: acknowledge the enumerated failure analysis, then address each failure point directly. Don't jump straight to a fix — validate or extend the root cause breakdown first. Also applies when helping design improvements to evals, tool descriptions, or system behavior. + -* **Always request critical self-review via subagent before merging PRs**: Before merging any PR, the user consistently asks the assistant to critically review its own code and PR description using a subagent for objectivity. The subagent review should identify real bugs, misleading logs, wrong parameters, dead code, and other issues categorized by severity (critical/medium/low). Only actionable issues should be fixed; cosmetic or deferred items are noted but skipped. After fixes are applied, all tests must pass before the commit is amended/pushed and the PR is merged. This pattern applies to every PR regardless of size or apparent simplicity. +* **Always request critical self-review via subagent before merging PRs**: Before merging any PR, critically review code and PR description using a subagent for objectivity. Subagent should identify real bugs, misleading logs, wrong parameters, dead code — categorized by severity (critical/medium/low). Only fix actionable issues; note but skip cosmetic/deferred items. All tests must pass before committing and merging. + + +* **Always request thorough architectural understanding before implementing eval features**: When starting work on the Lore eval suite, the user consistently asks for a comprehensive exploration of the existing system before making changes or additions. This includes requesting analysis of specific files, directory structures, type definitions, scenario formats, harness execution, and baseline implementations. The user wants to understand key functions, signatures, and measurable aspects before designing or building anything new. 
Follow this pattern by proactively reading and summarizing all relevant eval files (types.ts, harness.ts, judge.ts, baselines.ts, scenario files) when the user begins a new eval-related task, without waiting to be asked. -* **IDs starting with LOREAI-GATEWAY- are Sentry issue IDs**: Any identifier starting with \`LOREAI-GATEWAY-\` (e.g. \`LOREAI-GATEWAY-F\`) is a Sentry issue ID for the gateway project. Always treat these as Sentry issue references when encountered in conversation — fetch the issue via Sentry CLI/API to get stack traces, user counts, and release info before investigating the codebase. +* **IDs starting with LOREAI-GATEWAY- are Sentry issue IDs**: Any identifier starting with \`LOREAI-GATEWAY-\` (e.g. \`LOREAI-GATEWAY-F\`) is a Sentry issue ID for the gateway project. Always treat these as Sentry issue references — fetch via Sentry CLI/API to get stack traces, user counts, and release info before investigating the codebase. -* **Lore eval scores must beat or match tail-window — scoring below it means lost information**: Lore eval scores must beat or match tail-window baseline — scoring below it means lost information (treat as bug). \`inflateScenario(scenario, opts?)\` in \`packages/eval/src/inflate.ts\` — opts is \`{ targetTokens?, excludeKeywords? }\`, NOT positional args; silently fails. Token estimation: chars/4 (scenario convention; chars/3 in baselines.ts for budget safety). Auto-extracts protected keywords from question+referenceAnswer. Adjusts \`question.metadata.turnIndex\` after inflation. 8 replay fixtures, 16 scenarios, 130 questions, 6 baselines in CI. \`--inflate\` is incompatible with replay mode — run inflated scenarios in live mode only. Inflator buries preference-change turns (known issue). +* **Lore eval scores must beat or match tail-window — scoring below it means lost information**: Lore eval scores must beat or match tail-window baseline — scoring below means lost information (treat as bug). 
\`inflateScenario(scenario, opts?)\` in \`packages/eval/src/inflate.ts\` — opts is \`{ targetTokens?, excludeKeywords? }\`, NOT positional args; silently fails. Token estimation: chars/4 (scenario convention; chars/3 in baselines.ts for budget safety). Auto-extracts protected keywords from question+referenceAnswer. Adjusts \`question.metadata.turnIndex\` after inflation. 8 replay fixtures, 16 scenarios, 130 questions, 6 baselines in CI. \`--inflate\` incompatible with replay mode — run inflated scenarios in live mode only. Inflator buries preference-change turns (known issue). * **Prefer WASM backend over native onnxruntime-node for compiled binaries**: WASM backend for Bun \`--compile\` binaries with transformers.js: \`binaryExternalsPlugin\` in esbuild redirects \`onnxruntime-node\` → \`onnxruntime-web\` via \`onResolve\` (static imports only — does NOT redirect dynamic \`import()\` calls) and patches transformers.js CDN fallback via \`onLoad\` to read \`wasmPaths\` from \`globalThis.\_\_LORE\_VENDOR\_WASM\_PATHS\_\_\` (object form \`{ mjs, wasm }\` with exact hashed \`$bunfs\` filenames — directory strings fail because Bun hashes bundled WASM filenames). WASM files embedded as Bun \`{ type: 'file' }\` assets in the wrapper; wrapper sets \`globalThis.\_\_LORE\_VENDOR\_WASM\_PATHS\_\_\` before importing the worker. No onnxruntime import in wrapper or worker. For npm/CJS builds, \`onnxruntime-node\` stays external. WASM is ~2x faster on batches than native. Importing \`onnxruntime-web\` explicitly alongside the redirect creates two ort instances — 'cannot register backend cpu using priority 10' error. -* **Use Vitest as the project-wide testing framework, not Mocha + Chai + ts-node**: Use Vitest as the project-wide testing framework (migrated from Mocha + Chai + ts-node on May 19, 2026 — 312ms vs 30s startup). Always write new tests with \`import { describe, it, expect } from 'vitest'\`. Use kebab-case file naming (e.g., \`auth-integration.test.ts\`). 
Never revert to Mocha + Chai. Treat the most recent explicit framework directive as authoritative. +* **Use Vitest as the project-wide testing framework, not Mocha + Chai + ts-node**: Use Vitest as the project-wide testing framework (migrated from Mocha + Chai + ts-node, May 2026 — 312ms vs 30s startup). Always write new tests with \`import { describe, it, expect } from 'vitest'\`. Use kebab-case file naming (e.g., \`auth-integration.test.ts\`). Never revert to Mocha + Chai. Treat the most recent explicit framework directive as authoritative. diff --git a/packages/core/eval/harness.ts b/packages/core/eval/harness.ts index d0b3619..e1950ee 100644 --- a/packages/core/eval/harness.ts +++ b/packages/core/eval/harness.ts @@ -482,7 +482,7 @@ async function askQuestionViaGateway( question: string, gateway: GatewayHandle, model: string, -): Promise<{ hypothesis: string; tokens: TokenUsage }> { +): Promise<{ hypothesis: string; tokens: TokenUsage; recallInvoked: boolean }> { const requestBody = { model, system: QA_SYSTEM, @@ -509,6 +509,8 @@ async function askQuestionViaGateway( } const resp = await gateway.chat(requestBody); + const recallInvoked = + resp.headers.get("x-lore-recall-invoked") === "true"; const data = (await resp.json()) as { content?: Array<{ type: string; text?: string }>; usage?: { @@ -538,6 +540,7 @@ async function askQuestionViaGateway( return { hypothesis: text || data.error?.message || "[No response from gateway]", + recallInvoked, tokens: { input: data.usage?.input_tokens ?? 0, output: data.usage?.output_tokens ?? 
0, @@ -550,6 +553,7 @@ async function askQuestionViaGateway( return { hypothesis: "[Gateway rate limit exceeded after retries]", + recallInvoked: false, tokens: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, totalCost: 0 }, }; } @@ -763,6 +767,7 @@ export async function runScenario( for (const q of scenario.questions) { let hypothesis: string; let tokens: TokenUsage; + let recallInvoked = false; if (config.mode === "fixture" || !llm) { // Fixture mode: produce a placeholder hypothesis @@ -789,6 +794,7 @@ export async function runScenario( ); hypothesis = answer.hypothesis; tokens = answer.tokens; + recallInvoked = answer.recallInvoked; } else { // Non-gateway baselines: ask via direct LLM with rendered context const answer = await askQuestion(q.question, context, mode, llm); @@ -797,7 +803,7 @@ export async function runScenario( } // Score with the judge - const judgeResult = await judge(q, hypothesis, llm); + const judgeResult = await judge(q, hypothesis, llm, { recallInvoked }); const result: EvalResult = { timestamp: new Date().toISOString(), @@ -817,6 +823,7 @@ export async function runScenario( tags: q.metadata.tags, turnIndex: q.metadata.turnIndex, cumulativeTokens: q.metadata.cumulativeTokens, + recallInvoked, }, }; diff --git a/packages/core/eval/judge.ts b/packages/core/eval/judge.ts index 9870184..f7b702a 100644 --- a/packages/core/eval/judge.ts +++ b/packages/core/eval/judge.ts @@ -177,6 +177,17 @@ export const CROSS_PROJECT_AVAILABILITY: ScoringCriterion = { }, }; +export const RECALL_TRIGGER: ScoringCriterion = { + name: "recall_trigger", + description: + "Did the answer appropriately use recall for cross-session references?", + scale: { + 1: "Did not attempt recall despite clear cross-session reference cues", + 3: "Used recall but with poor query formulation or incomplete usage", + 5: "Proactively used recall with appropriate queries to retrieve cross-session information", + }, +}; + // 
--------------------------------------------------------------------------- // Pre-built rubrics // --------------------------------------------------------------------------- @@ -281,6 +292,17 @@ export const RUBRICS = { cross_project_availability: 0.3, }, } satisfies ScoringRubric, + + /** MSR-1 cross-session cue questions */ + crossSessionCueRecall: { + criteria: [FACTUAL_ACCURACY, COMPLETENESS, RECALL_TRIGGER, TEMPORAL_ATTRIBUTION], + weights: { + factual_accuracy: 0.25, + completeness: 0.25, + recall_trigger: 0.3, + temporal_attribution: 0.2, + }, + } satisfies ScoringRubric, } as const; // --------------------------------------------------------------------------- @@ -319,13 +341,25 @@ function buildJudgeUser( referenceAnswer: string, hypothesis: string, rubric: ScoringRubric, + metadata?: { recallInvoked?: boolean }, ): string { const criteria = buildCriteriaDescription(rubric); + + // Only include recall metadata when the rubric has a recall_trigger criterion + const hasRecallCriterion = rubric.criteria.some( + (c) => c.name === "recall_trigger", + ); + const recallSection = + hasRecallCriterion && metadata?.recallInvoked !== undefined + ? `\n\n## Recall Tool Usage\nThe recall tool (cross-session memory search) was **${metadata.recallInvoked ? "invoked" : "not invoked"}** when answering this question. Factor this into the recall_trigger score.\n\n` + : "\n\n"; + return ( `## Scoring Criteria\n\n${criteria}\n\n` + `## Question\n${question}\n\n` + `## Reference Answer\n${referenceAnswer}\n\n` + - `## Hypothesis (answer to evaluate)\n${hypothesis}\n\n` + + `## Hypothesis (answer to evaluate)\n${hypothesis}` + + recallSection + `Score each criterion on a 1-5 scale. 
Return JSON only.` ); } @@ -356,6 +390,7 @@ export async function judge( question: EvalQuestion, hypothesis: string, llm?: EvalLLMClient, + metadata?: { recallInvoked?: boolean }, ): Promise { const { rubric } = question; @@ -378,6 +413,7 @@ export async function judge( question.referenceAnswer, hypothesis, rubric, + metadata, ); const result = await llm.prompt(JUDGE_SYSTEM, userPrompt, { diff --git a/packages/core/eval/scenarios/multi-session-recall.ts b/packages/core/eval/scenarios/multi-session-recall.ts index 8a902ed..33e0ce1 100644 --- a/packages/core/eval/scenarios/multi-session-recall.ts +++ b/packages/core/eval/scenarios/multi-session-recall.ts @@ -4,7 +4,7 @@ * Three scenarios testing whether Lore can recall information from previous * sessions in a multi-session workflow: * - * MSR-1 Sequential Feature Development (3 sessions, 12 questions) + * MSR-1 Sequential Feature Development (3 sessions, 20 questions) * MSR-2 Deep History Recall (5 sessions, 15 questions) * MSR-3 Cross-Model Sessions (2 sessions, 6 questions) */ @@ -438,6 +438,97 @@ const msr1Questions: EvalQuestion[] = [ rubric: RUBRICS.multiSessionRecall, metadata: { difficulty: "hard", tags: ["synthesis", "architecture"] }, }, + // Cross-session cue questions — test whether conversational references + // to prior sessions trigger recall tool usage. Same factual content as + // some existing questions but phrased with natural cross-session cues. + { + id: "msr1-q13", + dimension: "recall", + scenario: "msr-1-sequential", + sessionRef: "msr1-session-1,msr1-session-2,msr1-session-3", + question: "Remember that auth bug we had? What was the root cause?", + referenceAnswer: + "The auth regression was caused by the interaction between two changes across sessions. In session 1, the rate limiter was fixed from IP-based to email-based keying (req.body?.email ?? req.ip). In session 2, OAuth callback routes were added that don't have a request body. 
OAuth callbacks fell back to the IP key because req.body was undefined, causing OAuth provider callbacks to consume the IP-based rate limit counter, which then blocked legitimate login requests sharing that IP.", + rubric: RUBRICS.crossSessionCueRecall, + metadata: { difficulty: "medium", tags: ["cross-session-cue", "recall-trigger", "bug"] }, + }, + { + id: "msr1-q14", + dimension: "recall", + scenario: "msr-1-sequential", + sessionRef: "msr1-session-2", + question: "We set up something for token refresh last time, how does it work?", + referenceAnswer: + "Refresh tokens are stored in the database with a 7-day expiry. On each use, the old refresh token is revoked and a new one is created (single-use rotation). This is implemented in src/auth/token-refresh.ts.", + rubric: RUBRICS.crossSessionCueRecall, + metadata: { difficulty: "medium", tags: ["cross-session-cue", "recall-trigger", "token"] }, + }, + { + id: "msr1-q15", + dimension: "recall", + scenario: "msr-1-sequential", + sessionRef: "msr1-session-1", + question: "Earlier we discussed why we went with JWT — what was the reasoning?", + referenceAnswer: + "JWT was chosen over session cookies because the API is consumed by both a React web frontend and a React Native mobile app. Stateless tokens avoid server-side session storage and work well with the existing CDN setup.", + rubric: RUBRICS.crossSessionCueRecall, + metadata: { difficulty: "medium", tags: ["cross-session-cue", "recall-trigger", "decision-rationale"] }, + }, + { + id: "msr1-q16", + dimension: "recall", + scenario: "msr-1-sequential", + sessionRef: "msr1-session-1", + question: "What was that password hashing library we picked, and why did we rule out the other one?", + referenceAnswer: + "bcrypt with 12 salt rounds, implemented in src/auth/password.ts. 
Argon2 was considered but rejected because bcrypt has better library support in the current Node 20 setup.", + rubric: RUBRICS.crossSessionCueRecall, + metadata: { difficulty: "medium", tags: ["cross-session-cue", "recall-trigger", "decision-rationale"] }, + }, + { + id: "msr1-q17", + dimension: "recall", + scenario: "msr-1-sequential", + sessionRef: "msr1-session-2", + question: "In our OAuth work, we had a specific reason for using PKCE instead of the simpler flow — remind me why?", + referenceAnswer: + "PKCE was chosen for three reasons: (1) the mobile app is a public client where PKCE is mandatory, (2) implicit grant is deprecated in OAuth 2.1, and (3) using the same flow for both web and mobile simplifies the codebase.", + rubric: RUBRICS.crossSessionCueRecall, + metadata: { difficulty: "medium", tags: ["cross-session-cue", "recall-trigger", "oauth"] }, + }, + { + id: "msr1-q18", + dimension: "recall", + scenario: "msr-1-sequential", + sessionRef: "msr1-session-1,msr1-session-2,msr1-session-3", + question: "We changed the rate limiter key at some point and then it caused issues later — can you walk me through what happened?", + referenceAnswer: + "In session 1, the rate limiter was keyed by req.ip, which meant users behind the same NAT shared a counter. It was fixed to use req.body?.email ?? req.ip. In session 3, this caused a regression: OAuth callback requests (added in session 2) don't have a body, so they fell back to IP-based keying. OAuth provider servers sharing IPs consumed the rate limit counter, blocking legitimate login requests. 
The fix added a path check to use a separate 'oauth:' prefix key for OAuth routes.", + rubric: RUBRICS.crossSessionCueRecall, + metadata: { difficulty: "hard", tags: ["cross-session-cue", "recall-trigger", "synthesis"] }, + }, + { + id: "msr1-q19", + dimension: "recall", + scenario: "msr-1-sequential", + sessionRef: "msr1-session-1,msr1-session-2", + question: "We built up the auth module over a couple of sessions — how many files ended up in src/auth/ and what are they?", + referenceAnswer: + "9 files total in src/auth/. Original 5 from session 1: jwt.ts, password.ts, middleware.ts, rate-limiter.ts, routes.ts. Added 4 in session 2: oauth-config.ts, oauth-pkce.ts, oauth-routes.ts, token-refresh.ts.", + rubric: RUBRICS.crossSessionCueRecall, + metadata: { difficulty: "hard", tags: ["cross-session-cue", "recall-trigger", "enumeration"] }, + }, + { + id: "msr1-q20", + dimension: "recall", + scenario: "msr-1-sequential", + sessionRef: "msr1-session-2", + question: "What were those callback URLs we configured for the OAuth providers? I need to set them up in production.", + referenceAnswer: + "Google: http://localhost:3000/auth/google/callback (overridable via GOOGLE_CALLBACK_URL env var). GitHub: http://localhost:3000/auth/github/callback (overridable via GITHUB_CALLBACK_URL env var). In production, set them to https://api.ourapp.com/auth/{provider}/callback.", + rubric: RUBRICS.crossSessionCueRecall, + metadata: { difficulty: "medium", tags: ["cross-session-cue", "recall-trigger", "config-value"] }, + }, ]; // ========================================================================= diff --git a/packages/core/src/recall.ts b/packages/core/src/recall.ts index 5f10e96..55d8273 100644 --- a/packages/core/src/recall.ts +++ b/packages/core/src/recall.ts @@ -966,7 +966,7 @@ export async function runRecall(input: RecallInput): Promise { /** Standard tool description reused verbatim by each host adapter. 
*/ export const RECALL_TOOL_DESCRIPTION = - "Search your persistent memory for this project. Your visible context is a trimmed window — older messages, decisions, and details may not be visible to you even within the current session. Use this tool whenever you need information that isn't in your current context: file paths, past decisions, user preferences, prior approaches, or anything from earlier in this conversation or previous sessions. Always prefer recall over assuming you don't have the information. Searches long-term knowledge, distilled history, and raw message archives."; + 'Search your persistent memory for this project. Two cases where you MUST use this tool: (1) Cross-session references — the user mentions past work, "last time", "before", "we discussed", "earlier", or "remember". Prior sessions are never in your context. (2) Missing details — file paths, past decisions, preferences, or approaches you don\'t see in your current window. Always prefer recall over assuming. Searches knowledge, distilled history, and message archives.'; /** Standard parameter descriptions reused by each host adapter. */ export const RECALL_PARAM_DESCRIPTIONS = { diff --git a/packages/gateway/src/pipeline.ts b/packages/gateway/src/pipeline.ts index b73f81e..0076098 100644 --- a/packages/gateway/src/pipeline.ts +++ b/packages/gateway/src/pipeline.ts @@ -1820,7 +1820,10 @@ async function accumulateNonStreamOpenAIStream( * Convert a GatewayResponse to a non-streaming HTTP Response. * Scales usage fields to prevent client auto-compaction. */ -function nonStreamHttpResponse(resp: GatewayResponse): Response { +function nonStreamHttpResponse( + resp: GatewayResponse, + extraHeaders?: Record<string, string>, +): Response { // Scale usage so the client's token total stays below auto-compact threshold. // postResponse() has already consumed the real values for calibration/bustRate. 
const scaledUsage = scaleUsageForClient({ @@ -1841,7 +1844,7 @@ function nonStreamHttpResponse(resp: GatewayResponse): Response { const body = buildAnthropicNonStreamResponse(scaledResp); return new Response(JSON.stringify(body), { status: 200, - headers: { "content-type": "application/json" }, + headers: { "content-type": "application/json", ...extraHeaders }, }); } @@ -3311,7 +3314,10 @@ async function handleConversationTurn( `recall (non-stream, mixed): stored result for session ${sessionState.sessionID.slice(0, 16)}`, ); postResponse(req, markerResp, sessionState, config, requestBody, genAiSpan); - return nonStreamHttpResponse(unsustainable ? injectContextWarning(markerResp) : markerResp); + return nonStreamHttpResponse( + unsustainable ? injectContextWarning(markerResp) : markerResp, + { "x-lore-recall-invoked": "true" }, + ); } // Recall-only — send follow-up request for seamless UX @@ -3339,7 +3345,10 @@ async function handleConversationTurn( ); // Fall back to response with marker (no continuation) postResponse(req, markerResp, sessionState, config, requestBody, genAiSpan); - return nonStreamHttpResponse(unsustainable ? injectContextWarning(markerResp) : markerResp); + return nonStreamHttpResponse( + unsustainable ? injectContextWarning(markerResp) : markerResp, + { "x-lore-recall-invoked": "true" }, + ); } let continuationResp = await accumulateNonStreamResponse(followUpResponse, followUpProtocol); @@ -3367,7 +3376,10 @@ async function handleConversationTurn( } postResponse(req, continuationResp, sessionState, config, requestBody, genAiSpan); - return nonStreamHttpResponse(unsustainable ? injectContextWarning(continuationResp) : continuationResp); + return nonStreamHttpResponse( + unsustainable ? injectContextWarning(continuationResp) : continuationResp, + { "x-lore-recall-invoked": "true" }, + ); } postResponse(req, resp, sessionState, config, requestBody, genAiSpan);