Doorman11991 · Doorman11991 · May 31, 2026 · May 31, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,31 @@
 # Changelog
 
+## [Unreleased]
+
+### feat: hybrid code search — exact + semantic in one call (#67)
+
+Adds a `hybrid_search` tool ("grep on steroids") that fuses exact matching
+(regex/keyword) with semantic ranking over a symbol-aware local index, in a
+single call. It surfaces code that *does* what you describe even when it
+doesn't contain the query words, and ranks plain grep-style hits by relevance.
+
+- New module `src/tools/hybrid_search.js`: walks source files (honoring the
+  shared ignore list), splits them into symbol-centered chunks (lightweight
+  AST-ish boundary detection across JS/TS/Python/Go/Rust/Java/etc.), and scores
+  each chunk with BM25 + a hashed bag-of-words vector, boosting exact matches.
+- Modes: `hybrid` (default), `regex`, `keyword`, `semantic`.
+- Fully local and dependency-free — reuses the existing `src/rag/index_store`
+  scoring engine, so there are no model downloads, no native runtime, and no
+  external services. Inspired by the projects suggested in #67 (colgrep,
+  semble) but kept zero-dependency to match SmallCode's local-first design.
+- Wired into the executor, tool schemas, the search/code-intel routing
+  categories, and tool-call dedup. Path arguments are contained to the project
+  via `safeResolvePath`.
+- Tunables: `SMALLCODE_HYBRID_MAX_FILES` (default 1500),
+  `SMALLCODE_HYBRID_MAX_BYTES` (default 512KiB).
+- Test coverage: `test/hybrid_search.test.js` (11 cases). Full suite: 313
+  passing.
+
 ## [1.5.2] - 2026-05-30
 
 ### fix: restore terminal on suspend, termination, and crashes (#71)

diff --git a/README.md b/README.md
@@ -243,6 +243,9 @@ Never exceeds your model's context window. Tool results capped at 4k chars, mid-
 ### 2-Stage Tool Routing
 Halves the schema context overhead. Model picks a category (read/write/search/run/plan) first, then gets only relevant tool schemas. Critical for models with 8-16k context.
 
+### Hybrid Code Search ("grep on steroids")
+The `hybrid_search` tool answers a single query with both exact matching (regex/keyword, the precision of grep) and semantic ranking (find code that *does* a thing even when it doesn't contain the query words) over a symbol-aware local index. It's fully offline with zero model downloads — it reuses SmallCode's local BM25 + hashed-vector engine, so it runs instantly on CPU with no external services. Modes: `hybrid` (default, exact + semantic), `regex`, `keyword`, `semantic`. Inspired by [colgrep](https://github.com/lightonai/next-plaid) and [semble](https://github.com/MinishLab/semble) ([#67](https://github.com/Doorman11991/smallcode/issues/67)), kept dependency-free to match SmallCode's local-first design. Tune with `SMALLCODE_HYBRID_MAX_FILES` / `SMALLCODE_HYBRID_MAX_BYTES`.
+
 ### Early-Stop Detection
 Detects repetition loops, patch spirals (stuck on corrupted file → forces rewrite), and greeting regression (model lost context → re-injects task). Saves tokens and time.
 

diff --git a/bin/executor.js b/bin/executor.js
@@ -439,6 +439,25 @@ async function executeTool(name, args, ctx) {
       } catch { return { result: 'No matches found.' }; }
     }
 
+    case 'hybrid_search': {
+      // "grep on steroids" (issue #67): one call fuses exact regex/keyword
+      // matching with semantic ranking over a symbol-aware local index.
+      //
+      // Args: query (or pattern) — required search text; path — optional
+      // directory, resolved through safeResolvePath against cwd; mode — one of
+      // hybrid/regex/keyword/semantic (anything else falls back to hybrid);
+      // limit — clamped to 1..30, default 10.
+      // Returns { result } with at most 4000 chars of formatted hits, or
+      // { error } when validation fails or the search throws.
+      try {
+        const q = String(args.query || args.pattern || '').trim();
+        if (!q) return { error: 'hybrid_search: query is required' };
+        const safePath = args.path ? safeResolvePath(args.path, cwd) : { ok: true, fullPath: cwd };
+        if (!safePath.ok) return { error: `hybrid_search rejected: ${safePath.reason}` };
+        const allowedModes = new Set(['hybrid', 'regex', 'keyword', 'semantic']);
+        const mode = allowedModes.has(args.mode) ? args.mode : 'hybrid';
+        const limit = Math.max(1, Math.min(parseInt(args.limit, 10) || 10, 30));
+        // Loaded lazily so the module is only required when the tool is used.
+        const { hybridSearch, formatResults } = require('../src/tools/hybrid_search');
+        const results = hybridSearch(q, { root: safePath.fullPath || cwd, mode, limit });
+        return { result: sanitizeToolOutput(formatResults(results, q, mode)).slice(0, 4000) };
+      } catch (e) {
+        // Report a thrown failure through the same `error` channel as the
+        // validation failures above, so it is not mistaken for a search result.
+        return { error: `hybrid_search failed: ${e.message}` };
+      }
+    }
+
     case 'find_files': {
       try {
         // Smart listing (Feature #17): if no glob pattern, use scored file tree

diff --git a/bin/tools.js b/bin/tools.js
diff --git a/src/compiled/tool_router.js b/src/compiled/tool_router.js
@@ -230,13 +230,13 @@ function classifyToolCategory(message) {
 function getToolsForCategory(category) {
   switch (category) {
     case 'code_intel':
-      return ['graph_search', 'explain_symbol', 'read_file', 'find_files', 'search'];
+      return ['graph_search', 'explain_symbol', 'read_file', 'find_files', 'search', 'hybrid_search'];
     case 'read':
       return ['read_file', 'list_projects', 'graph_search', 'find_files', 'find_and_read'];
     case 'write':
       return ['read_file', 'write_file', 'patch', 'bash', 'read_and_patch', 'create_and_run'];
     case 'search':
-      return ['search', 'find_files', 'graph_search', 'read_file', 'explain_symbol', 'search_and_read'];
+      return ['search', 'find_files', 'graph_search', 'read_file', 'explain_symbol', 'search_and_read', 'hybrid_search'];
     case 'run':
       return ['bash', 'run', 'read_file'];
     case 'plan':

diff --git a/src/tools/dedup.js b/src/tools/dedup.js
@@ -27,6 +27,7 @@ const PURE_TOOLS = new Set([
   'list_files',
   'search',
   'grep',
+  'hybrid_search',
   'graph_search',
   'explain_symbol',
   'find_by_path',

diff --git a/src/tools/hybrid_search.js b/src/tools/hybrid_search.js
@@ -0,0 +1,254 @@
+// SmallCode — Hybrid Code Search ("grep on steroids")
+//
+// Realizes the ideas from issue #67: a single search call that fuses
+//   1. EXACT matching   — regex / keyword (the precision of grep)
+//   2. SEMANTIC ranking  — meaning-based similarity (find code that *does* a
+//                          thing even when it doesn't contain the query words)
+// over a symbol-aware (AST-ish) chunk index built in memory on each call.
+//
+// The referenced projects (colgrep = Rust + ColBERT multi-vector, semble =
+// Python + model2vec) are excellent but pull in heavy native/Python runtimes
+// and model downloads. SmallCode's whole premise is staying small and fully
+// local with zero external services, so this reuses the existing local hybrid
+// scoring engine (BM25 + hashed bag-of-words vectors from src/rag/index_store)
+// rather than shipping an embedding model. Same single-call hybrid ergonomics,
+// no new dependencies, no model weights, runs instantly on CPU.
+//
+// If a semantic embedding MCP (e.g. budget-aware-mcp) is connected, callers can
+// still layer it on top; this tool guarantees a useful local baseline offline.
+//
+// Configuration:
+//   SMALLCODE_HYBRID_MAX_FILES   max files to index per search (default 1500)
+//   SMALLCODE_HYBRID_MAX_BYTES   skip files larger than this (default 524288)
+
+'use strict';
+
+const fs = require('fs');
+const path = require('path');
+const { tokenize, embed, cosine, bm25Score } = require('../rag/index_store');
+const { SOURCE_EXTS, SKIP_DIRS } = require('./file_tree');
+
+// Per-search limits, overridable through the environment. A missing,
+// non-numeric, or zero value falls back to the default because of the
+// `|| default` guard.
+const MAX_FILES = parseInt(process.env.SMALLCODE_HYBRID_MAX_FILES, 10) || 1500;
+const MAX_BYTES = parseInt(process.env.SMALLCODE_HYBRID_MAX_BYTES, 10) || 512 * 1024;
+
+// Symbol-definition patterns across common languages. We don't build a full
+// AST — we detect definition boundaries so each chunk is centered on a
+// function/class/method, which is what makes semantic ranking meaningful.
+const SYMBOL_PATTERNS = [
+  // Patterns are tried in order; capture group 1 is the symbol name.
+  // Keyword-introduced callables: function / func / fn / def / sub <name>.
+  /\b(?:function|func|fn|def|sub)\s+([A-Za-z_$][\w$]*)/,
+  // Keyword-introduced types: class / struct / interface / enum / trait / impl / type <name>.
+  /\b(?:class|struct|interface|enum|trait|impl|type)\s+([A-Za-z_$][\w$]*)/,
+  // const/let/var bound to a function expression or a parenthesized arrow function.
+  /(?:^|\s)(?:const|let|var)\s+([A-Za-z_$][\w$]*)\s*=\s*(?:async\s*)?(?:function|\([^)]*\)\s*=>)/,
+  // Bare `name(...) {`. Note: this shape also matches statements such as `if (x) {`.
+  /\b([A-Za-z_$][\w$]*)\s*\([^)]*\)\s*\{/,            // method(...) {
+  // Modifier-prefixed members: public/private/protected/static/async <name>(
+  /(?:public|private|protected|static|async)\s+([A-Za-z_$][\w$]*)\s*\(/,
+];
+
+// Words the looser SYMBOL_PATTERNS entries can capture from control-flow or
+// anonymous-function lines (`if (x) {`, `} catch (e) {`, `async function (`).
+// They are never symbol names, and treating them as such would start a new
+// chunk at every branch or loop.
+const NON_SYMBOL_KEYWORDS = new Set([
+  'if', 'elseif', 'for', 'foreach', 'while', 'switch', 'catch',
+  'synchronized', 'function', 'return',
+]);
+
+/**
+ * Return the name of the symbol defined on `line`, or null when the line does
+ * not look like a definition.
+ *
+ * Patterns are tried in SYMBOL_PATTERNS order. A capture that is a
+ * control-flow keyword is rejected and the remaining patterns are still tried.
+ *
+ * @param {string} line - a single source line
+ * @returns {string|null} the captured identifier, or null
+ */
+function detectSymbol(line) {
+  for (const re of SYMBOL_PATTERNS) {
+    const m = line.match(re);
+    if (m && m[1] && !NON_SYMBOL_KEYWORDS.has(m[1])) return m[1];
+  }
+  return null;
+}
+
+/**
+ * Gather source files under `root` with an iterative depth-first walk.
+ *
+ * Skips entries whose name starts with '.' (except '.env'), directories listed
+ * in SKIP_DIRS, and files whose extension is not in SOURCE_EXTS. Directories
+ * that cannot be read are ignored. Stops as soon as `limit` files are found.
+ *
+ * @param {string} root - directory to start from
+ * @param {number} [limit=MAX_FILES] - maximum number of files to return
+ * @returns {string[]} file paths, each built by joining onto `root`
+ */
+function collectFiles(root, limit = MAX_FILES) {
+  const found = [];
+  const pending = [root];
+  while (pending.length && found.length < limit) {
+    const current = pending.pop();
+    let children;
+    try {
+      children = fs.readdirSync(current, { withFileTypes: true });
+    } catch {
+      continue; // unreadable directory: skip it, keep walking
+    }
+    for (const child of children) {
+      if (found.length >= limit) break;
+      const hidden = child.name.startsWith('.');
+      if (hidden && child.name !== '.env') continue;
+      const childPath = path.join(current, child.name);
+      if (child.isDirectory()) {
+        if (!SKIP_DIRS.has(child.name)) pending.push(childPath);
+        continue;
+      }
+      if (child.isFile() && SOURCE_EXTS.has(path.extname(child.name))) {
+        found.push(childPath);
+      }
+    }
+  }
+  return found;
+}
+
+/**
+ * Split a file into symbol-centered chunks without a real parser.
+ *
+ * A new chunk starts at each line where detectSymbol() finds a definition;
+ * lines before the first definition form a leading chunk. A chunk is also cut
+ * once it reaches 80 lines, so files with no detected symbols still split.
+ * Chunks containing only whitespace are not emitted.
+ *
+ * When a definition line arrives while the open chunk holds a single
+ * non-blank line (back-to-back definitions such as `class A {` followed by
+ * `foo() {`, or a one-line header before the first definition), that line is
+ * kept and merged into the same chunk instead of being discarded.
+ *
+ * @param {string} relPath - path stored on every chunk and used in its id
+ * @param {string} content - full file text
+ * @returns {Array<{id:string,path:string,startLine:number,endLine:number,symbol:string,code:string}>}
+ *   chunks in file order; startLine/endLine are 1-based and inclusive
+ */
+function chunkFile(relPath, content) {
+  const lines = content.split('\n');
+  const chunks = [];
+  let cur = null;
+  const open = (startLine, symbol) => ({ startLine, symbol, lines: [] });
+  const flush = () => {
+    if (!cur) return;
+    const code = cur.lines.join('\n');
+    if (!code.trim()) return;
+    chunks.push({
+      id: `${relPath}:${cur.startLine}`,
+      path: relPath,
+      startLine: cur.startLine,
+      endLine: cur.startLine + cur.lines.length - 1,
+      symbol: cur.symbol || '',
+      code,
+    });
+  };
+  for (let i = 0; i < lines.length; i++) {
+    const sym = detectSymbol(lines[i]);
+    if (cur === null) {
+      cur = open(i + 1, sym);
+    } else if (sym) {
+      if (cur.lines.length > 1) {
+        // Normal boundary: close the previous chunk and start a new one here.
+        flush();
+        cur = open(i + 1, sym);
+      } else if (!cur.lines.join('').trim()) {
+        // Nothing but whitespace collected so far: restart at this line.
+        cur = open(i + 1, sym);
+      } else if (!cur.symbol) {
+        // One real line and no symbol yet: keep the line, adopt this symbol.
+        cur.symbol = sym;
+      }
+      // Otherwise the open chunk is a lone definition line: keep it and its
+      // symbol, and let this definition join the same chunk.
+    }
+    cur.lines.push(lines[i]);
+    // Cap runaway chunks so a file with no detected symbols still splits.
+    if (cur.lines.length >= 80) { flush(); cur = open(i + 2, ''); }
+  }
+  flush();
+  return chunks;
+}
+
+/**
+ * Build the in-memory index searched by hybridSearch().
+ *
+ * Reads each collected file, skips those that are unreadable, larger than the
+ * byte limit, or contain a NUL character, chunks the rest with chunkFile(),
+ * and attaches term frequencies, token count, and an embedding to each chunk.
+ *
+ * @param {string} root - directory to index; chunk paths are relative to it
+ * @param {object} [opts]
+ * @param {number} [opts.maxFiles] - file cap (falls back to MAX_FILES)
+ * @param {number} [opts.maxBytes] - per-file size cap (falls back to MAX_BYTES)
+ * @returns {Array<object>} chunk records extended with termFreq, docLength, embedding
+ */
+function buildIndex(root, opts = {}) {
+  const index = [];
+  for (const file of collectFiles(root, opts.maxFiles || MAX_FILES)) {
+    let size;
+    try {
+      size = fs.statSync(file).size;
+    } catch {
+      continue;
+    }
+    if (size > (opts.maxBytes || MAX_BYTES)) continue;
+
+    let text;
+    try {
+      text = fs.readFileSync(file, 'utf-8');
+    } catch {
+      continue;
+    }
+    if (text.includes('\u0000')) continue; // treat as binary
+
+    // Forward slashes regardless of platform separator.
+    const relPath = path.relative(root, file).split(path.sep).join('/');
+    for (const chunk of chunkFile(relPath, text)) {
+      const searchable = [chunk.path, chunk.symbol, chunk.code].filter(Boolean).join('\n');
+      const tokens = tokenize(searchable);
+      const termFreq = Object.create(null);
+      for (const token of tokens) {
+        termFreq[token] = (termFreq[token] || 0) + 1;
+      }
+      index.push({
+        ...chunk,
+        termFreq,
+        docLength: tokens.length,
+        embedding: embed(searchable),
+      });
+    }
+  }
+  return index;
+}
+
+/**
+ * Turn the user's query into a case-insensitive RegExp.
+ *
+ * - 'semantic' mode does no exact matching, so it returns null.
+ * - 'keyword' mode escapes regex metacharacters and matches the query literally.
+ * - Any other mode compiles the query as a regex, falling back to the literal
+ *   form when the query is not a valid pattern.
+ *
+ * @param {string} query
+ * @param {string} mode
+ * @returns {RegExp|null}
+ */
+function compilePattern(query, mode) {
+  if (mode === 'semantic') return null;
+
+  const asLiteral = () => new RegExp(query.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'i');
+  if (mode === 'keyword') return asLiteral();
+
+  try {
+    return new RegExp(query, 'i');
+  } catch {
+    // Invalid regex: treat the query as plain text instead.
+    return asLiteral();
+  }
+}
+
+/**
+ * Corpus statistics passed to bm25Score().
+ *
+ * @param {Array<object>} docs - indexed chunks (reads termFreq and docLength)
+ * @param {string[]} queryTerms - query tokens
+ * @returns {{df: Map<string, number>, totalDocs: number, avgDocLength: number}}
+ *   df counts the chunks containing each query term; totalDocs and
+ *   avgDocLength are never 0 (both fall back to 1).
+ */
+function _stats(docs, queryTerms) {
+  const df = new Map();
+  for (const term of queryTerms) df.set(term, 0);
+
+  let totalLen = 0;
+  for (const doc of docs) {
+    totalLen += doc.docLength || 0;
+    for (const term of queryTerms) {
+      if (doc.termFreq[term]) df.set(term, df.get(term) + 1);
+    }
+  }
+
+  const docCount = docs.length || 1;
+  return { df, totalDocs: docCount, avgDocLength: totalLen / docCount || 1 };
+}
+
+/**
+ * Hybrid search over a project directory.
+ *
+ * Each chunk is scored as `bm25 + vectorWeight * cosine` (or cosine alone in
+ * 'semantic' mode). Chunks matching the compiled pattern get `exactBoost`
+ * plus 0.2 per match, for up to 5 matches. In 'regex' and 'keyword' modes,
+ * chunks without an exact match are dropped. Chunks whose score is <= 0 are
+ * dropped in every mode.
+ *
+ * @param {string} query
+ * @param {object} options
+ *   - root:   project root (default cwd)
+ *   - mode:   'hybrid' (default) | 'regex' | 'keyword' | 'semantic'
+ *   - limit:  max results (default 10)
+ *   - vectorWeight: semantic weight in fusion (default 0.6)
+ *   - exactBoost: score bonus when a chunk also matches exactly (default 2.0)
+ *   - maxFiles / maxBytes: forwarded to buildIndex()
+ *   - _index: prebuilt chunk array to search instead of indexing `root`
+ * @returns {Array<{path,startLine,endLine,symbol,score,exact,snippet}>}
+ *   best-first, at most `limit` entries; score is rounded to 4 decimals
+ */
+function hybridSearch(query, options = {}) {
+  const root = options.root || process.cwd();
+  const mode = options.mode || 'hybrid';
+  const limit = options.limit || 10;
+  const vectorWeight = options.vectorWeight ?? 0.6;
+  const exactBoost = options.exactBoost ?? 2.0;
+
+  const docs = options._index || buildIndex(root, options);
+  if (!docs.length) return [];
+
+  const regex = compilePattern(query, mode);
+  // Global twin used only for counting matches. Compiled once rather than per
+  // chunk; String#match resets lastIndex on a /g regex, so reuse is safe.
+  const countRegex = regex ? new RegExp(regex.source, 'gi') : null;
+  const exactOnly = mode === 'regex' || mode === 'keyword';
+  const queryTerms = [...new Set(tokenize(query))];
+  const queryEmbedding = embed(query);
+  const stats = _stats(docs, queryTerms);
+
+  const ranked = [];
+  for (const d of docs) {
+    let exactHits = 0;
+    if (countRegex) {
+      const m = d.code.match(countRegex);
+      if (m) exactHits = m.length;
+    }
+    const exact = exactHits > 0;
+    // 'regex'/'keyword' are exact-only: drop non-matching chunks entirely.
+    if (exactOnly && !exact) continue;
+
+    const bm25 = bm25Score(queryTerms, d, stats);
+    const vector = cosine(queryEmbedding, d.embedding || {});
+    let score = bm25 + vectorWeight * vector;
+    if (mode === 'semantic') score = vector;
+    if (exact) score += exactBoost + Math.min(exactHits, 5) * 0.2;
+
+    if (score <= 0) continue;
+    ranked.push({ doc: d, exact, score: Number(score.toFixed(4)) });
+  }
+
+  // Snippets are only built for the chunks that are actually returned.
+  return ranked
+    .sort((a, b) => b.score - a.score)
+    .slice(0, limit)
+    .map(({ doc, exact, score }) => ({
+      path: doc.path,
+      startLine: doc.startLine,
+      endLine: doc.endLine,
+      symbol: doc.symbol,
+      score,
+      exact,
+      snippet: _firstMatchSnippet(doc, regex),
+    }));
+}
+
+/**
+ * Pick one display line for a chunk: the first line matching `regex` when a
+ * regex is given and something matches, otherwise the first non-blank line.
+ * The line is trimmed and cut to 160 characters.
+ *
+ * @param {{code: string}} doc - indexed chunk
+ * @param {RegExp|null} regex - compiled exact pattern, or null
+ * @returns {string} the snippet ('' when the chunk has no non-blank line)
+ */
+function _firstMatchSnippet(doc, regex) {
+  const lines = doc.code.split('\n');
+  const clip = (text) => text.trim().slice(0, 160);
+
+  if (regex) {
+    for (const line of lines) {
+      if (regex.test(line)) return clip(line);
+    }
+  }
+
+  const firstNonBlank = lines.find((line) => line.trim());
+  return clip(firstNonBlank || '');
+}
+
+/**
+ * Render search results as a compact text block for the model.
+ *
+ * Output is a header line, a blank line, one `<marker> path:line symbol
+ * [score N]` line per result (followed by an indented snippet line when the
+ * result has one), a blank line, and a legend for the markers.
+ *
+ * @param {Array<object>} results - entries returned by hybridSearch()
+ * @param {string} query - echoed in the header
+ * @param {string} mode - echoed in the header
+ * @returns {string}
+ */
+function formatResults(results, query, mode) {
+  if (!results.length) return `No results for "${query}" (mode: ${mode}).`;
+
+  const entries = results.flatMap((r) => {
+    const marker = r.exact ? '●' : '○'; // ● exact+semantic, ○ semantic-only
+    const symbolPart = r.symbol ? ` ${r.symbol}` : '';
+    const headline = `${marker} ${r.path}:${r.startLine}${symbolPart}  [score ${r.score}]`;
+    return r.snippet ? [headline, `    ${r.snippet}`] : [headline];
+  });
+
+  return [
+    `Hybrid search: "${query}" (mode: ${mode}) — ${results.length} result(s)`,
+    '',
+    ...entries,
+    '',
+    '● exact + semantic match   ○ semantic match only',
+  ].join('\n');
+}
+
+// Exported API. `_stats` and `_firstMatchSnippet` are not exported and stay
+// internal to this module.
+module.exports = {
+  hybridSearch,
+  buildIndex,
+  chunkFile,
+  detectSymbol,
+  compilePattern,
+  formatResults,
+};
diff --git a/src/tools/two_stage_router.js b/src/tools/two_stage_router.js
@@ -21,7 +21,7 @@ const TOOL_CATEGORIES = {
   },
   search: {
     description: 'Search code by regex, search code graph, explain symbols',
-    tools: ['search', 'search_and_read', 'graph_search', 'explain_symbol', 'list_projects'],
+    tools: ['search', 'search_and_read', 'graph_search', 'explain_symbol', 'list_projects', 'hybrid_search'],
   },
   run: {
     description: 'Run shell commands, execute scripts',