From 462817fa86e6e9377d4c6c7b8ddec8c1f16f11d1 Mon Sep 17 00:00:00 2001 From: Burak Yigit Kaya Date: Tue, 12 May 2026 17:01:50 +0000 Subject: [PATCH] fix: filter distilled messages from temporal FTS search and purge worker boilerplate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Temporal FTS search was hitting all 114K+ messages, but only ~14K undistilled ones are meaningful — the other 100K are already represented in distillation search results. Additionally, ~355 legacy worker boilerplate messages (curator/observer/reflector prompts averaging 100KB+ each, containing full conversation transcripts) were polluting keyword matches for virtually any domain query. Two fixes: 1. Add distilled=0 filter to search(), searchScored(), and searchLike() in temporal.ts — matches what vector search already does. Uses existing compound index for efficient filtering. 2. DB migration v20 purges legacy worker boilerplate messages that match known system prompt patterns (observer, curator, consolidation, reflector, eval). --- packages/core/src/db.ts | 14 ++++++++++++++ packages/core/src/temporal.ts | 14 ++++++++------ packages/core/test/db.test.ts | 2 +- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/packages/core/src/db.ts b/packages/core/src/db.ts index 5613f4d..a8783a5 100644 --- a/packages/core/src/db.ts +++ b/packages/core/src/db.ts @@ -470,6 +470,20 @@ const MIGRATIONS: string[] = [ ); CREATE INDEX IF NOT EXISTS idx_import_history_project ON import_history(project_id); `, + ` + -- Version 20: Purge worker boilerplate from temporal messages. + -- Legacy gateway/plugin worker calls (distillation observer, curator, + -- consolidation, reflector, eval) stored their full system prompts + -- (containing entire conversation transcripts, up to 1.6MB each) as + -- temporal messages. These pollute FTS search results by matching + -- virtually any domain keyword. Safe to delete: their actual output + -- (distillations, knowledge entries) is stored in dedicated tables. + DELETE FROM temporal_messages WHERE content LIKE '%You are a memory observer.%' + OR content LIKE '%You are a long-term memory curator.%' + OR content LIKE '%You are a long-term memory curator performing a consolidation pass.%' + OR content LIKE '%You are a memory reflector.%' + OR content LIKE '%You are evaluating distillation quality.%'; + `, ]; /** Return the resolved path of the SQLite database file. */ diff --git a/packages/core/src/temporal.ts b/packages/core/src/temporal.ts index 0113994..b78682c 100644 --- a/packages/core/src/temporal.ts +++ b/packages/core/src/temporal.ts @@ -171,6 +171,8 @@ export function markDistilled(ids: string[]) { .run(...ids); } +// Only searches undistilled messages — distilled content is already represented +// in distillation search results and would duplicate/dilute temporal hits. // LIKE-based fallback for when FTS5 fails unexpectedly. function searchLike(input: { pid: string; @@ -186,8 +188,8 @@ function searchLike(input: { const conditions = terms.map(() => "LOWER(content) LIKE ?").join(" AND "); const likeParams = terms.map((t) => `%${t}%`); const query = input.sessionID - ? `SELECT * FROM temporal_messages WHERE project_id = ? AND session_id = ? AND ${conditions} ORDER BY created_at DESC LIMIT ?` - : `SELECT * FROM temporal_messages WHERE project_id = ? AND ${conditions} ORDER BY created_at DESC LIMIT ?`; + ? `SELECT * FROM temporal_messages WHERE project_id = ? AND session_id = ? AND distilled = 0 AND ${conditions} ORDER BY created_at DESC LIMIT ?` + : `SELECT * FROM temporal_messages WHERE project_id = ? AND distilled = 0 AND ${conditions} ORDER BY created_at DESC LIMIT ?`; const params = input.sessionID ? [input.pid, input.sessionID, ...likeParams, input.limit] : [input.pid, ...likeParams, input.limit]; @@ -208,11 +210,11 @@ export function search(input: { const ftsSQL = input.sessionID ? `SELECT m.* FROM temporal_fts f CROSS JOIN temporal_messages m ON m.rowid = f.rowid - WHERE f.content MATCH ? AND m.project_id = ? AND m.session_id = ? + WHERE f.content MATCH ? AND m.project_id = ? AND m.session_id = ? AND m.distilled = 0 ORDER BY rank LIMIT ?` : `SELECT m.* FROM temporal_fts f CROSS JOIN temporal_messages m ON m.rowid = f.rowid - WHERE f.content MATCH ? AND m.project_id = ? + WHERE f.content MATCH ? AND m.project_id = ? AND m.distilled = 0 ORDER BY rank LIMIT ?`; try { @@ -251,11 +253,11 @@ export function searchScored(input: { const ftsSQL = input.sessionID ? `SELECT m.*, rank FROM temporal_fts f CROSS JOIN temporal_messages m ON m.rowid = f.rowid - WHERE f.content MATCH ? AND m.project_id = ? AND m.session_id = ? + WHERE f.content MATCH ? AND m.project_id = ? AND m.session_id = ? AND m.distilled = 0 ORDER BY rank LIMIT ?` : `SELECT m.*, rank FROM temporal_fts f CROSS JOIN temporal_messages m ON m.rowid = f.rowid - WHERE f.content MATCH ? AND m.project_id = ? + WHERE f.content MATCH ? AND m.project_id = ? AND m.distilled = 0 ORDER BY rank LIMIT ?`; try { diff --git a/packages/core/test/db.test.ts b/packages/core/test/db.test.ts index 331e719..caf098a 100644 --- a/packages/core/test/db.test.ts +++ b/packages/core/test/db.test.ts @@ -23,7 +23,7 @@ describe("db", () => { const row = db().query("SELECT version FROM schema_version").get() as { version: number; }; - expect(row.version).toBe(19); + expect(row.version).toBe(20); }); test("distillation_fts virtual table exists", () => {