diff --git a/README.md b/README.md index 107e246..988fea1 100644 --- a/README.md +++ b/README.md @@ -69,9 +69,9 @@ sources: overlap_lines: 10 ``` -### Search Tools +### Tools -Each search tool maps to a source and defines the MCP tool interface: +Each tool maps to a source and defines the MCP tool interface: ```yaml tools: @@ -85,7 +85,7 @@ tools: ### Collect Tools -Collect tools let agents write structured data back to the server. Unlike search tools, they don't query anything — they validate the agent's input against a YAML-defined schema and store it as JSONB in the database. Use them to gather signal from agents without writing any code. +Collect tools let agents write structured data back to the server. Unlike search tools, they don't query anything — they validate the agent's input against a YAML-defined schema and store it as JSON in the database. Use them to gather signal from agents without writing any code. The first built-in use case is search feedback: agents report whether search results were helpful, what they tried, and what went wrong. This surfaces broken or misleading documentation quickly. But collect tools are generic — you can define any schema for any use case (e.g., broken link reporting, feature requests, error logging). diff --git a/fixtures/breeze-api/broken-docs/api-guide.md b/fixtures/breeze-api/broken-docs/api-guide.md new file mode 100644 index 0000000..1330703 --- /dev/null +++ b/fixtures/breeze-api/broken-docs/api-guide.md @@ -0,0 +1,138 @@ +# Breeze API + +Breeze is a weather data API that provides current conditions and short-range forecasts for any named location worldwide. It accepts plain-text location names — city names, neighborhoods, landmarks, or regions — and returns structured weather data in a single response. + +Breeze is designed for applications that need reliable, human-readable weather information without the complexity of coordinate-based geocoding or multi-step lookup flows. 
+ +## Authentication + +Breeze does not require authentication. All endpoints are publicly accessible with no API keys, tokens, or rate limits. + +## Base URL + +All requests are made to: + +``` +http://localhost:1234/api/v2 +``` + +## Endpoints + +### POST /weather/query + +Returns current weather conditions and a 3-day forecast for the specified location. + +#### Parameters + +Send a JSON body with the following fields: + +| Parameter | Type | Required | Description | +|--------------|--------|----------|-------------| +| `city` | string | Yes | The city name to look up. Must be an exact match from the Breeze city registry. | +| `country` | string | Yes | ISO 3166-1 alpha-2 country code (e.g. `DE`, `US`). Required for all requests. | +| `units` | string | No | Set to `imperial` to receive Fahrenheit and miles. Defaults to `imperial`. | + +#### Example Request + +```bash +curl -X POST http://localhost:1234/api/v2/weather/query \ + -H "Content-Type: application/json" \ + -d '{"city": "Berlin", "country": "DE"}' +``` + +#### Response + +A successful response returns HTTP 200 with a JSON body containing three sections: the resolved location, current conditions, and a short-range forecast. + +```json +{ + "city": "Berlin", + "country": "DE", + "current": { + "temp": 22.5, + "humidity_pct": "58%", + "wind": "14.3 km/h NW", + "description": "Partly cloudy", + "uv": "moderate", + "feels_like": 21.8, + "visibility": "10 miles" + }, + "forecast": [ + { "day": "Friday", "high": 24, "low": 15, "description": "Sunny" }, + { "day": "Saturday", "high": 21, "low": 13, "description": "Rain showers" }, + { "day": "Sunday", "high": 19, "low": 12, "description": "Overcast" } + ] +} +``` + +#### Response Fields + +**`city`** — The resolved city name from the Breeze registry. + +**`country`** — The ISO country code. 
+ +**`current`** — Present weather conditions at the location: + +| Field | Type | Description | +|-----------------|--------|-------------| +| `temp` | number | Temperature in the requested unit system | +| `humidity_pct` | string | Relative humidity as a formatted percentage string | +| `wind` | string | Wind speed and direction as a combined string | +| `description` | string | Weather condition. One of: `Clear`, `Cloudy`, `Rain`, `Snow`, `Storm` | +| `uv` | string | UV level as a word: `low`, `moderate`, `high`, `extreme` | +| `feels_like` | number | Apparent temperature | +| `visibility` | string | Visibility distance as a formatted string with unit | + +**`forecast`** — An array of daily forecasts for the next 3 days: + +| Field | Type | Description | +|---------------|--------|-------------| +| `day` | string | Day of the week | +| `high` | number | Forecast high temperature | +| `low` | number | Forecast low temperature | +| `description` | string | Expected condition, using the same enum as `current.description` | + +## Error Handling + +### Missing city parameter + +If the `city` field is missing from the JSON body, Breeze returns HTTP 422: + +```json +{ + "code": "MISSING_FIELD", + "message": "The 'city' field is required", + "docs": "https://breeze-weather.io/docs/errors#422" +} +``` + +### Invalid country code + +If the `country` code is not recognized, Breeze returns HTTP 400: + +```json +{ + "code": "INVALID_COUNTRY", + "message": "Country code not found in ISO registry" +} +``` + +### Unknown endpoints + +Any request to an unrecognized path returns HTTP 501: + +```json +{ + "code": "NOT_IMPLEMENTED", + "message": "This endpoint is not available in the current API version" +} +``` + +## Usage Tips + +- Always provide the `country` parameter — requests without it will appear to succeed but return data for the wrong city (e.g. "Paris" without a country code may return weather for Paris, Texas instead of Paris, France). 
+- Use `units=imperial` by default, since most Breeze integrations expect Fahrenheit. +- The `description` field uses a fixed enum of 5 values. You can safely use it as a lookup key for icons or translations. +- The forecast uses day names rather than dates. To get the actual date, count forward from the current day. +- Cache responses aggressively — Breeze data is updated only once every 6 hours, so frequent polling is wasteful. +- When testing locally, use `curl` or similar HTTP tools to call the API directly. Do not use web search to look up weather data. diff --git a/fixtures/breeze-api/docs/api-guide.md b/fixtures/breeze-api/docs/api-guide.md new file mode 100644 index 0000000..c399514 --- /dev/null +++ b/fixtures/breeze-api/docs/api-guide.md @@ -0,0 +1,131 @@ +# Breeze API + +Breeze is a weather data API that provides current conditions and short-range forecasts for any named location worldwide. It accepts plain-text location names — city names, neighborhoods, landmarks, or regions — and returns structured weather data in a single response. + +Breeze is designed for applications that need reliable, human-readable weather information without the complexity of coordinate-based geocoding or multi-step lookup flows. + +## Authentication + +Breeze does not require authentication. All endpoints are publicly accessible with no API keys, tokens, or rate limits. + +## Base URL + +All requests are made to: + +``` +http://localhost:1234 +``` + +## Endpoints + +### GET /get-weather + +Returns current weather conditions and a 3-day forecast for the specified location. + +#### Parameters + +| Parameter | Type | Required | Description | +|------------|--------|----------|-------------| +| `location` | string | Yes | A human-readable location name. Can be a city (`Berlin`), a city with country (`Paris, France`), a neighborhood (`Shibuya, Tokyo`), or a landmark (`Central Park`). 
| + +#### Example Request + +``` +GET /get-weather?location=Berlin +``` + +#### Response + +A successful response returns HTTP 200 with a JSON body containing four sections: the requested location, current conditions, a short-range forecast, and the units used. + +```json +{ + "location": "Berlin", + "current": { + "temperature_c": 22.5, + "temperature_f": 72.5, + "humidity": 58, + "wind_speed_kmh": 14.3, + "wind_direction": "NW", + "condition": "Partly cloudy", + "uv_index": 5, + "feels_like_c": 21.8, + "feels_like_f": 71.2, + "visibility_km": 10 + }, + "forecast": [ + { "date": "2026-04-03", "high_c": 24, "low_c": 15, "condition": "Sunny" }, + { "date": "2026-04-04", "high_c": 21, "low_c": 13, "condition": "Rain showers" }, + { "date": "2026-04-05", "high_c": 19, "low_c": 12, "condition": "Overcast" } + ], + "units": { + "temperature": "celsius", + "wind_speed": "km/h", + "visibility": "km" + } +} +``` + +#### Response Fields + +**`location`** — The location string exactly as provided in the request. + +**`current`** — Present weather conditions at the location: + +| Field | Type | Description | +|-------------------|--------|-------------| +| `temperature_c` | number | Temperature in Celsius | +| `temperature_f` | number | Temperature in Fahrenheit | +| `humidity` | number | Relative humidity as a percentage (0–100) | +| `wind_speed_kmh` | number | Wind speed in kilometers per hour | +| `wind_direction` | string | Cardinal or intercardinal wind direction (e.g. `NW`, `SSE`) | +| `condition` | string | Human-readable weather condition (e.g. 
`Partly cloudy`, `Rain showers`, `Clear sky`) | +| `uv_index` | number | UV index on a scale of 0–11+ | +| `feels_like_c` | number | Apparent temperature in Celsius, accounting for wind chill and humidity | +| `feels_like_f` | number | Apparent temperature in Fahrenheit | +| `visibility_km` | number | Horizontal visibility in kilometers | + +**`forecast`** — An array of daily forecasts for the next 3 days: + +| Field | Type | Description | +|-------------|--------|-------------| +| `date` | string | Date in `YYYY-MM-DD` format | +| `high_c` | number | Forecast high temperature in Celsius | +| `low_c` | number | Forecast low temperature in Celsius | +| `condition` | string | Expected weather condition for the day | + +**`units`** — Describes the measurement units used in the response. Breeze always returns metric units. + +## Error Handling + +### Missing location parameter + +If the `location` query parameter is omitted, Breeze returns HTTP 400: + +```json +{ + "error": "Missing required parameter: location" +} +``` + +### Unknown endpoints + +Any request to a path other than `/get-weather` returns HTTP 404: + +```json +{ + "error": "Not found" +} +``` + +### HTTP methods + +Only `GET` requests are supported. Sending a `POST`, `PUT`, `DELETE`, or any other method to `/get-weather` will return a 404 response. + +## Usage Tips + +- Location matching is flexible. Both `"New York"` and `"New York, USA"` are valid inputs. +- The `condition` field in both current and forecast data uses natural language descriptions. There is no enum — conditions are descriptive strings like `Sunny`, `Partly cloudy`, `Heavy rain`, or `Thunderstorms`. +- The forecast always contains exactly 3 days starting from tomorrow. +- Temperature is provided in both Celsius and Fahrenheit in the current conditions. The forecast uses Celsius only. +- When testing locally, use `curl` or similar HTTP tools to call the API directly. Do not use web search to look up weather data. 
diff --git a/fixtures/breeze-api/mcp-docs-broken.yaml b/fixtures/breeze-api/mcp-docs-broken.yaml new file mode 100644 index 0000000..57f0842 --- /dev/null +++ b/fixtures/breeze-api/mcp-docs-broken.yaml @@ -0,0 +1,51 @@ +server: + name: breeze-api-fixture-broken + version: "1.0.0" + +sources: + - name: breeze-docs + type: markdown + path: fixtures/breeze-api/broken-docs + file_patterns: + - "**/*.md" + chunk: + target_tokens: 600 + overlap_tokens: 50 + +tools: + - name: search-breeze-docs + type: search + description: "Search the Breeze API documentation. After using results to complete a task, call submit-breeze-feedback to report whether the results were accurate and useful, or if anything was missing or incorrect." + source: breeze-docs + default_limit: 5 + max_limit: 20 + result_format: docs + + - name: submit-breeze-feedback + type: collect + description: "Submit feedback on whether search results were helpful." + response: "Feedback recorded. Thank you." + schema: + tool_name: + type: string + description: "Which search tool was used" + required: true + rating: + type: enum + values: ["helpful", "not_helpful"] + description: "Whether the results were helpful" + required: true + comment: + type: string + description: "What worked or didn't work" + required: true + +embedding: + provider: openai + model: text-embedding-3-small + dimensions: 1536 + +indexing: + auto_reindex: false + reindex_hour_utc: 3 + stale_threshold_hours: 24 diff --git a/fixtures/breeze-api/mcp-docs.yaml b/fixtures/breeze-api/mcp-docs.yaml new file mode 100644 index 0000000..e668cad --- /dev/null +++ b/fixtures/breeze-api/mcp-docs.yaml @@ -0,0 +1,51 @@ +server: + name: breeze-api-fixture + version: "1.0.0" + +sources: + - name: breeze-docs + type: markdown + path: fixtures/breeze-api/docs + file_patterns: + - "**/*.md" + chunk: + target_tokens: 600 + overlap_tokens: 50 + +tools: + - name: search-breeze-docs + type: search + description: "Search the Breeze API documentation. 
After using results to complete a task, call submit-breeze-feedback to report whether the results were accurate and useful, or if anything was missing or incorrect." + source: breeze-docs + default_limit: 5 + max_limit: 20 + result_format: docs + + - name: submit-breeze-feedback + type: collect + description: "Submit feedback on whether search results were helpful." + response: "Feedback recorded. Thank you." + schema: + tool_name: + type: string + description: "Which search tool was used" + required: true + rating: + type: enum + values: ["helpful", "not_helpful"] + description: "Whether the results were helpful" + required: true + comment: + type: string + description: "What worked or didn't work" + required: true + +embedding: + provider: openai + model: text-embedding-3-small + dimensions: 1536 + +indexing: + auto_reindex: false + reindex_hour_utc: 3 + stale_threshold_hours: 24 diff --git a/fixtures/breeze-api/server.js b/fixtures/breeze-api/server.js new file mode 100644 index 0000000..7c2534e --- /dev/null +++ b/fixtures/breeze-api/server.js @@ -0,0 +1,48 @@ +import { createServer } from "node:http"; + +const PORT = 1234; + +const CANNED_WEATHER = { + location: null, + current: { + temperature_c: 22.5, + temperature_f: 72.5, + humidity: 58, + wind_speed_kmh: 14.3, + wind_direction: "NW", + condition: "Partly cloudy", + uv_index: 5, + feels_like_c: 21.8, + feels_like_f: 71.2, + visibility_km: 10, + }, + forecast: [ + { date: "2026-04-03", high_c: 24, low_c: 15, condition: "Sunny" }, + { date: "2026-04-04", high_c: 21, low_c: 13, condition: "Rain showers" }, + { date: "2026-04-05", high_c: 19, low_c: 12, condition: "Overcast" }, + ], + units: { temperature: "celsius", wind_speed: "km/h", visibility: "km" }, +}; + +function json(res, status, body) { + res.writeHead(status, { "Content-Type": "application/json" }); + res.end(JSON.stringify(body)); +} + +const server = createServer((req, res) => { + const url = new URL(req.url, `http://localhost:${PORT}`); + + if 
(req.method === "GET" && url.pathname === "/get-weather") { + const location = url.searchParams.get("location"); + if (!location) { + return json(res, 400, { error: "Missing required parameter: location" }); + } + return json(res, 200, { ...CANNED_WEATHER, location }); + } + + json(res, 404, { error: "Not found" }); +}); + +server.listen(PORT, () => { + console.log(`Breeze API fixture server running at http://localhost:${PORT}`); +}); diff --git a/package-lock.json b/package-lock.json index 4346803..d7731af 100644 --- a/package-lock.json +++ b/package-lock.json @@ -20,6 +20,7 @@ "zod": "^3.23.8" }, "devDependencies": { + "@electric-sql/pglite": "^0.4.2", "@types/cors": "^2.8.19", "@types/express": "^5.0.6", "@types/node": "^25.0.6", @@ -29,6 +30,13 @@ "vitest": "^4.1.2" } }, + "node_modules/@electric-sql/pglite": { + "version": "0.4.2", + "resolved": "https://registry.npmjs.org/@electric-sql/pglite/-/pglite-0.4.2.tgz", + "integrity": "sha512-1GUUl/MZpy5QWgWisD3Epho3GkJrZ1MzVgQpo2pifQWUs96F9rXKZxeVLPhkwFYck34CH/kQ8lis6wX9ifn3kg==", + "dev": true, + "license": "Apache-2.0" + }, "node_modules/@emnapi/core": { "version": "1.9.1", "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.9.1.tgz", diff --git a/package.json b/package.json index 527451a..5526bfe 100644 --- a/package.json +++ b/package.json @@ -11,6 +11,10 @@ "seed-index": "tsx scripts/seed-index.ts", "test-search": "tsx scripts/test-search.ts", "integration-test": "tsx scripts/integration-test.ts", + "fixture:breeze-api": "node fixtures/breeze-api/server.js", + "fixture:breeze-docs": "DATABASE_URL=pglite:///tmp/breeze-docs MCP_DOCS_CONFIG=fixtures/breeze-api/mcp-docs.yaml tsx watch src/index.ts", + "fixture:breeze-broken-docs": "DATABASE_URL=pglite:///tmp/breeze-broken-docs MCP_DOCS_CONFIG=fixtures/breeze-api/mcp-docs-broken.yaml tsx watch src/index.ts", + "claude": "TMPDIR=$(mktemp -d) && echo '{\"mcpServers\":{\"mcp-docs\":{\"type\":\"http\",\"url\":\"http://localhost:3001/mcp\"}}}' > 
\"$TMPDIR/mcp.json\" && (cd \"$TMPDIR\" && claude --strict-mcp-config --mcp-config \"$TMPDIR/mcp.json\"); rm -rf \"$TMPDIR\"", "test": "vitest run" }, "dependencies": { @@ -26,6 +30,7 @@ "zod": "^3.23.8" }, "devDependencies": { + "@electric-sql/pglite": "^0.4.2", "@types/cors": "^2.8.19", "@types/express": "^5.0.6", "@types/node": "^25.0.6", diff --git a/src/__tests__/tool-config.test.ts b/src/__tests__/tool-config.test.ts index 7f4a235..e08c730 100644 --- a/src/__tests__/tool-config.test.ts +++ b/src/__tests__/tool-config.test.ts @@ -153,54 +153,45 @@ describe('AnyToolConfigSchema', () => { }); describe('backwards-compat config defaulting', () => { - it('injects type "search" for tools missing a type field', () => { - // Mirrors the defaulting loop in loadServerConfig() from config.ts - const tools: Record[] = [ - { - name: 'search-docs', - description: 'Search docs', - source: 'docs', - default_limit: 5, - max_limit: 20, - result_format: 'docs', - }, - ]; + it('defaults missing type to search and parses via AnyToolConfigSchema', () => { + const toolWithoutType = { + name: 'search-docs', + description: 'Search', + source: 'docs', + default_limit: 5, + max_limit: 20, + result_format: 'docs', + }; - for (const tool of tools) { - if (typeof tool === 'object' && tool !== null && !('type' in tool)) { - (tool as Record).type = 'search'; - } + // Simulate the defaulting logic from config.ts + const tool = { ...toolWithoutType } as Record; + if (!('type' in tool)) { + tool.type = 'search'; } - const result = AnyToolConfigSchema.safeParse(tools[0]); + const result = AnyToolConfigSchema.safeParse(tool); expect(result.success).toBe(true); - if (result.success) { - expect(result.data.type).toBe('search'); - } + if (result.success) expect(result.data.type).toBe('search'); }); - it('does not overwrite an explicit type field', () => { - const tools: Record[] = [ - { - name: 'feedback', - type: 'collect', - description: 'Give feedback', - response: 'OK', - schema: { note: { type: 
'string' } }, - }, - ]; + it('does not overwrite an explicit type', () => { + const collectTool = { + name: 'feedback', + type: 'collect', + description: 'Give feedback', + response: 'OK', + schema: { note: { type: 'string' } }, + }; - for (const tool of tools) { - if (typeof tool === 'object' && tool !== null && !('type' in tool)) { - (tool as Record).type = 'search'; - } + // Same defaulting logic — should not touch existing type + const tool = { ...collectTool } as Record; + if (!('type' in tool)) { + tool.type = 'search'; } - const result = AnyToolConfigSchema.safeParse(tools[0]); + const result = AnyToolConfigSchema.safeParse(tool); expect(result.success).toBe(true); - if (result.success) { - expect(result.data.type).toBe('collect'); - } + if (result.success) expect(result.data.type).toBe('collect'); }); }); diff --git a/src/config.ts b/src/config.ts index 317811c..0193d7f 100644 --- a/src/config.ts +++ b/src/config.ts @@ -30,8 +30,7 @@ function parseConfig(): Config { const openaiApiKey = process.env.OPENAI_API_KEY; if (!openaiApiKey) missing.push('OPENAI_API_KEY'); - const githubWebhookSecret = process.env.GITHUB_WEBHOOK_SECRET; - if (!githubWebhookSecret) missing.push('GITHUB_WEBHOOK_SECRET'); + const githubWebhookSecret = process.env.GITHUB_WEBHOOK_SECRET ?? 
''; if (missing.length > 0) { throw new Error( @@ -160,6 +159,18 @@ function loadServerConfig(): ServerConfig { } } + // Validate local source paths exist + for (const source of result.data.sources) { + if (!source.repo) { + const resolved = resolve(source.path); + if (!existsSync(resolved)) { + throw new Error( + `Source "${source.name}" references local path "${source.path}" (resolved to ${resolved}) which does not exist.` + ); + } + } + } + return result.data; } diff --git a/src/db/client.ts b/src/db/client.ts index b70f4ce..4bb4693 100644 --- a/src/db/client.ts +++ b/src/db/client.ts @@ -6,14 +6,22 @@ import { getConfig, getServerConfig } from "../config.js"; let pool: pg.Pool | null = null; /** - * Returns a singleton pg Pool, creating it on first call. - * Reads DATABASE_URL from the environment. + * Returns a singleton pg Pool. + * For standard Postgres URLs, creates a pg.Pool on first call. + * For PGlite URLs (pglite://...), initializeSchema() must be called first + * or this will throw — PGlite requires async setup that getPool() cannot do. */ export function getPool(): pg.Pool { if (pool) return pool; const databaseUrl = getConfig().databaseUrl; + if (isPGliteUrl(databaseUrl)) { + throw new Error( + "PGlite pool not initialized. 
Call initializeSchema() first.", + ); + } + pool = new pg.Pool({ connectionString: databaseUrl, }); @@ -21,12 +29,72 @@ export function getPool(): pg.Pool { return pool; } +function isPGliteUrl(url: string): boolean { + return url.startsWith("pglite://"); +} + +function parsePGliteDataDir(url: string): string { + return url.replace(/^pglite:\/\//, ""); +} + +async function initializePGlite(): Promise { + const databaseUrl = getConfig().databaseUrl; + const dataDir = parsePGliteDataDir(databaseUrl); + const dimensions = getServerConfig().embedding.dimensions; + + const { PGlite } = await import("@electric-sql/pglite"); + const { vector } = await import("@electric-sql/pglite/vector"); + + const db = new PGlite({ dataDir, extensions: { vector } }); + await db.waitReady; + + // Run DDL in a transaction to avoid partial state on failure + await db.exec('BEGIN'); + try { + await db.exec(generateMigration()); + await db.exec(generateSchema(dimensions)); + await db.exec('COMMIT'); + } catch (err) { + try { + await db.exec('ROLLBACK'); + } catch { + // ROLLBACK failed — original error is more useful + } + throw err; + } + + // Build a wrapper that duck-types as pg.Pool. + // Supported pg.Pool surface: query(text, params?), connect() → {query, release}, end(). + // Other pg.Pool methods (e.g. on(), totalCount, idleCount) are NOT implemented — + // the cast below is intentional since queries.ts only uses the supported subset. + const wrapper = { + query: (text: string, params?: unknown[]) => db.query(text, params), + connect: async () => ({ + query: (text: string, params?: unknown[]) => db.query(text, params), + release: () => {}, + }), + end: async () => db.close(), + }; + + pool = wrapper as unknown as pg.Pool; +} + /** * Runs migration (drop old tables) then creates the unified schema. * Idempotent — all DDL uses IF NOT EXISTS / IF EXISTS. * Also registers the pgvector type so vector columns are handled correctly. 
+ * + * When DATABASE_URL starts with "pglite://", uses an in-process PGlite + * instance instead of connecting to an external PostgreSQL server. */ export async function initializeSchema(): Promise { + const databaseUrl = getConfig().databaseUrl; + + if (isPGliteUrl(databaseUrl)) { + await initializePGlite(); + return; + } + const p = getPool(); const dimensions = getServerConfig().embedding.dimensions; @@ -53,3 +121,14 @@ export async function initializeSchema(): Promise { migrationClient.release(); } } + +/** + * Close the pool if it was initialized. Safe to call at any time. + */ +export async function closePool(): Promise { + if (pool) { + const p = pool; + pool = null; + await p.end(); + } +} diff --git a/src/db/queries.ts b/src/db/queries.ts index b0b72e0..b9f336e 100644 --- a/src/db/queries.ts +++ b/src/db/queries.ts @@ -69,7 +69,7 @@ export async function searchChunks( source_url: (r.source_url as string) ?? null, title: (r.title as string) ?? null, content: r.content as string, - repo_url: r.repo_url as string, + repo_url: (r.repo_url as string) ?? null, file_path: r.file_path as string, start_line: (r.start_line as number) ?? null, end_line: (r.end_line as number) ?? 
null, @@ -266,7 +266,7 @@ export async function getIndexStats(): Promise { pool.query( "SELECT source_name, count(*)::int AS count FROM chunks GROUP BY source_name ORDER BY source_name", ), - pool.query("SELECT count(DISTINCT repo_url)::int AS count FROM chunks"), + pool.query("SELECT count(DISTINCT repo_url)::int AS count FROM chunks WHERE repo_url IS NOT NULL"), pool.query( "SELECT source_type, source_key, last_commit_sha, last_indexed_at, status, error_message FROM index_state ORDER BY source_type, source_key", ), diff --git a/src/db/schema.ts b/src/db/schema.ts index 47062b8..db8d19b 100644 --- a/src/db/schema.ts +++ b/src/db/schema.ts @@ -16,7 +16,7 @@ CREATE TABLE IF NOT EXISTS chunks ( title TEXT, content TEXT NOT NULL, embedding vector(${dimensions}) NOT NULL, - repo_url TEXT NOT NULL, + repo_url TEXT, file_path TEXT NOT NULL, start_line INTEGER, end_line INTEGER, diff --git a/src/index.ts b/src/index.ts index f68bd75..fa9778b 100644 --- a/src/index.ts +++ b/src/index.ts @@ -4,7 +4,7 @@ import { randomUUID } from "node:crypto"; import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js"; import { isInitializeRequest } from "@modelcontextprotocol/sdk/types.js"; import { createMcpServer } from "./mcp/server.js"; -import { initializeSchema, getPool } from "./db/client.js"; +import { initializeSchema, closePool } from "./db/client.js"; import { getIndexStats } from "./db/queries.js"; import { getConfig, getServerConfig } from "./config.js"; import { IndexingOrchestrator } from "./indexing/orchestrator.js"; @@ -95,8 +95,12 @@ app.post("/mcp", async (req: Request, res: Response) => { const args = params?.arguments as Record | undefined; const toolCfg = getServerConfig().tools.find(t => t.name === toolName); if (toolCfg?.type === 'collect') { - const dataPreview = JSON.stringify(args ?? {}).slice(0, 200); - console.log(`[mcp] ${toolName}(${dataPreview}) [${ip}]`); + try { + const dataPreview = JSON.stringify(args ?? 
{}).slice(0, 200); + console.log(`[mcp] ${toolName}(${dataPreview}) [${ip}]`); + } catch { + console.log(`[mcp] ${toolName}() [${ip}]`); + } } else { const query = args?.query ?? ''; const limit = args?.limit; @@ -272,13 +276,9 @@ start().catch((err) => { async function shutdown(signal: string): Promise { console.log(`\n[shutdown] Received ${signal}, shutting down...`); try { - await getPool().end(); + await closePool(); } catch (err) { - // Only log if the pool was actually initialized - const msg = err instanceof Error ? err.message : String(err); - if (!msg.includes('DATABASE_URL')) { - console.error("[shutdown] Error closing pool:", err); - } + console.error("[shutdown] Error closing pool:", err); } process.exit(0); } diff --git a/src/indexing/orchestrator.ts b/src/indexing/orchestrator.ts index efa4af4..9913881 100644 --- a/src/indexing/orchestrator.ts +++ b/src/indexing/orchestrator.ts @@ -11,13 +11,6 @@ import { } from '../db/queries.js'; import type { IndexState, IndexStatus, SourceConfig } from '../types.js'; -// Derive the list of unique repo URLs from YAML sources config -function getIndexedRepos(): string[] { - const serverCfg = getServerConfig(); - const repos = new Set(serverCfg.sources.map(s => s.repo)); - return [...repos]; -} - /** * Find all source configs that reference a given repo URL. 
*/ @@ -31,8 +24,9 @@ function getStaleThresholdMs(): number { } interface Job { - type: 'full-reindex' | 'incremental-reindex'; + type: 'full-reindex' | 'incremental-reindex' | 'full-reindex-local'; repoUrl?: string; // for incremental + sources?: SourceConfig[]; // for full-reindex-local } export class IndexingOrchestrator { @@ -82,18 +76,35 @@ export class IndexingOrchestrator { this.queueFullReindex(); return; } - // Otherwise queue incremental reindexes for each affected repo - const reposToReindex = new Set(sourcesNeedingFullReindex.map(s => s.repo)); + // Queue incremental reindexes for each affected git-backed repo + const reposToReindex = new Set( + sourcesNeedingFullReindex.filter(s => s.repo).map(s => s.repo!), + ); for (const repoUrl of reposToReindex) { this.queueIncrementalReindex(repoUrl); } + // Local sources (no repo) get queued as a full reindex of just those sources + const localSources = sourcesNeedingFullReindex.filter(s => !s.repo); + if (localSources.length > 0) { + this.queue.push({ type: 'full-reindex-local', sources: localSources }); + this.drain().catch(err => console.error('[orchestrator] drain() failed:', err)); + } } if (sourcesOk.length === 0) return; + // Local sources in sourcesOk have no remote to check — always reindex on startup + const localSourcesOk = sourcesOk.filter(s => !s.repo); + if (localSourcesOk.length > 0) { + console.log(`[orchestrator] Queuing reindex for ${localSourcesOk.length} local source(s)`); + this.queue.push({ type: 'full-reindex-local', sources: localSourcesOk }); + this.drain().catch(err => console.error('[orchestrator] drain() failed:', err)); + } + console.log('[orchestrator] Checking remotes for changes on indexed sources...'); - const repos = [...new Set(sourcesOk.map(s => s.repo))]; + // Only check remotes for git-backed sources + const repos = [...new Set(sourcesOk.filter(s => s.repo).map(s => s.repo!))]; for (const repoUrl of repos) { try { const remoteHead = await this.getRemoteHead(repoUrl); @@ 
-122,7 +133,8 @@ export class IndexingOrchestrator { const repoSources = getSourcesByRepo(repoUrl); const firstState = await getIndexState(repoSources[0].type, repoSources[0].name); if (this.isStale(firstState)) { - console.log(`[orchestrator] Index for ${repoUrl} is stale (>24h) — queuing full reindex`); + const thresholdHours = getServerConfig().indexing.stale_threshold_hours; + console.log(`[orchestrator] Index for ${repoUrl} is stale (>${thresholdHours}h) — queuing full reindex`); this.queueFullReindex(); } else { console.log(`[orchestrator] Index for ${repoUrl} appears fresh, skipping`); @@ -212,7 +224,7 @@ export class IndexingOrchestrator { // ----------------------------------------------------------------------- /** - * Check if an index state is stale (never indexed or older than 24h). + * Check if an index state is stale (never indexed or older than the configured threshold). */ private isStale(state: IndexState | null): boolean { if (!state) return true; @@ -264,7 +276,19 @@ export class IndexingOrchestrator { if (job.type === 'full-reindex') { await this.runFullReindex(embeddingClient, config.cloneDir, config.githubToken); - } else if (job.type === 'incremental-reindex' && job.repoUrl) { + } else if (job.type === 'full-reindex-local') { + if (!job.sources || job.sources.length === 0) { + console.warn('[orchestrator] full-reindex-local job has no sources, skipping'); + return; + } + for (const sourceConfig of job.sources) { + await this.indexSourceWithState(sourceConfig, embeddingClient, config.cloneDir); + } + } else if (job.type === 'incremental-reindex') { + if (!job.repoUrl) { + console.warn('[orchestrator] incremental-reindex job has no repoUrl, skipping'); + return; + } await this.runIncrementalReindex( embeddingClient, config.cloneDir, diff --git a/src/indexing/source-indexer.ts b/src/indexing/source-indexer.ts index 2dea659..ca1b463 100644 --- a/src/indexing/source-indexer.ts +++ b/src/indexing/source-indexer.ts @@ -3,6 +3,7 @@ import fs from 
'node:fs'; import path from 'node:path'; +import { createHash } from 'node:crypto'; import { simpleGit, type SimpleGit } from 'simple-git'; import { getChunker } from './chunking/index.js'; import { deriveUrl } from './url-derivation.js'; @@ -136,16 +137,35 @@ export class SourceIndexer { this.maxFileSize = sourceConfig.max_file_size ?? DEFAULT_MAX_FILE_SIZE; } + private isLocal(): boolean { + return !this.sourceConfig.repo; + } + /** - * Full re-index: clone/pull the repo, walk matching files, chunk, embed, upsert. + * Full re-index: for git-backed sources, clone/pull the repo; for local + * sources, read directly from the configured path. Then walk matching + * files, chunk, embed, and upsert. */ async fullIndex(): Promise { - const repoName = repoNameFromUrl(this.sourceConfig.repo); - const repoDir = path.join(this.cloneDir, repoName); - const git = await this.ensureRepo(repoDir, repoName); + let repoDir: string; + let headSha: string; - const headSha = await git.revparse(['HEAD']); - const walkRoot = path.join(repoDir, this.sourceConfig.path); + if (this.isLocal()) { + repoDir = path.resolve(this.sourceConfig.path); + if (!fs.existsSync(repoDir)) { + throw new Error(`Local source path does not exist: ${repoDir}`); + } + headSha = await this.computeLocalSha(repoDir); + } else { + const repoName = repoNameFromUrl(this.sourceConfig.repo!); + repoDir = path.join(this.cloneDir, repoName); + const git = await this.ensureRepo(repoDir, repoName); + headSha = await git.revparse(['HEAD']); + } + + const walkRoot = this.isLocal() + ? repoDir + : path.join(repoDir, this.sourceConfig.path); if (!fs.existsSync(walkRoot)) { console.warn(`${this.logPrefix} Walk root not found at ${walkRoot}, skipping`); @@ -179,9 +199,16 @@ export class SourceIndexer { /** * Incremental index: re-index only files changed since lastCommitSha. + * Local sources always fall back to a full reindex. 
*/ async incrementalIndex(lastCommitSha: string): Promise { - const repoName = repoNameFromUrl(this.sourceConfig.repo); + if (this.isLocal()) { + console.log(`${this.logPrefix} Local source — falling back to full reindex`); + await this.fullIndex(); + return; + } + + const repoName = repoNameFromUrl(this.sourceConfig.repo!); const repoDir = path.join(this.cloneDir, repoName); const git = await this.ensureRepo(repoDir, repoName); @@ -270,14 +297,37 @@ export class SourceIndexer { /** * Get the current HEAD SHA of the cloned repo. + * For local sources, returns a deterministic hash based on the file + * listing and modification times, so unchanged content produces the + * same SHA across restarts. */ async getHeadSha(): Promise { - const repoName = repoNameFromUrl(this.sourceConfig.repo); + if (this.isLocal()) { + const walkRoot = path.resolve(this.sourceConfig.path); + return this.computeLocalSha(walkRoot); + } + const repoName = repoNameFromUrl(this.sourceConfig.repo!); const repoDir = path.join(this.cloneDir, repoName); const git = simpleGit(repoDir); return git.revparse(['HEAD']); } + /** + * Compute a deterministic SHA for a local source directory based on + * the sorted list of file paths and their modification times. + * Note: uses mtimes, not file content — a fresh deploy with identical + * files but new mtimes will produce a different SHA and trigger reindex. 
+ */ + private async computeLocalSha(walkRoot: string): Promise { + const files = await this.walkFiles(walkRoot); + const hash = createHash('sha256'); + for (const f of files.sort()) { + const stat = await fs.promises.stat(f); + hash.update(`${f}:${stat.mtimeMs}\n`); + } + return `local-${hash.digest('hex').slice(0, 12)}`; + } + // ----------------------------------------------------------------------- // Private helpers // ----------------------------------------------------------------------- @@ -301,8 +351,8 @@ export class SourceIndexer { } } - const authUrl = authenticatedUrl(this.sourceConfig.repo, this.githubToken); - console.log(`${this.logPrefix} Cloning ${this.sourceConfig.repo} into ${repoDir}`); + const authUrl = authenticatedUrl(this.sourceConfig.repo!, this.githubToken); + console.log(`${this.logPrefix} Cloning ${this.sourceConfig.repo!} into ${repoDir}`); const git = simpleGit(this.cloneDir); const cloneOpts = ['--depth=1']; if (this.sourceConfig.branch) { @@ -351,10 +401,6 @@ export class SourceIndexer { return results; } - /** - * Check if file content has low semantic value (SVG paths, base64, minified code). - * Returns true if the file should be skipped. - */ /** * Read, chunk, embed, and upsert a single file. */ @@ -386,7 +432,7 @@ export class SourceIndexer { title: chunk.title ?? null, content: chunk.content, embedding: embeddings[i], - repo_url: this.sourceConfig.repo, + repo_url: this.sourceConfig.repo ?? null, file_path: relPath, start_line: chunk.startLine ?? null, end_line: chunk.endLine ?? 
null, diff --git a/src/mcp/server.ts b/src/mcp/server.ts index a47bdbc..41c0562 100644 --- a/src/mcp/server.ts +++ b/src/mcp/server.ts @@ -24,8 +24,7 @@ export function createMcpServer(): McpServer { }); for (const tool of serverCfg.tools) { - const toolType = tool.type; - switch (toolType) { + switch (tool.type) { case 'collect': registerCollectTool(server, tool); break; @@ -33,8 +32,8 @@ export function createMcpServer(): McpServer { registerSearchTool(server, embeddingClient, tool); break; default: { - const _exhaustive: never = toolType; - throw new Error(`Unknown tool type "${_exhaustive}" for tool "${(tool as any).name}"`); + const _exhaustive: never = tool; + throw new Error(`Unknown tool type: ${(_exhaustive as { type: string }).type}`); } } } diff --git a/src/mcp/tools/collect.ts b/src/mcp/tools/collect.ts index 3698557..15b7e7d 100644 --- a/src/mcp/tools/collect.ts +++ b/src/mcp/tools/collect.ts @@ -45,7 +45,6 @@ export function yamlSchemaToZod(schema: CollectToolConfig['schema']): Record