From 68ec17a074cd1b159f29738fc0bd344b305ea81c Mon Sep 17 00:00:00 2001 From: Chris Alfano Date: Mon, 18 May 2026 22:31:38 -0400 Subject: [PATCH 1/3] refactor(importer): rewrite write path on gitsheets transact + slug-keyed paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The agent's first cut of the JSON importer (PR #57) bypassed gitsheets and wrote raw TOML files under legacyId-keyed paths (people/${legacyId}.toml). The stated reason was "clean diffs across re-runs when slugs rename" — a property that nothing in the runtime actually consumes, in exchange for: - a custom toToml() that has to match the lib's serializer - a hand-rolled wipeOwnedDirectories() + writeAllRecords() filesystem layer that has to match gitsheets' tree expectations - a `git cat-file --batch` UUID-forward-read pass to preserve UUIDs - a separate reconciliation step (file rename pass) before legacy-import could merge into fixture/main (issue #59) This rewrite uses the library as designed: - `openPublicStore` opens the gitsheets repo handle for the data repo - A single `store.transact` per snapshot: `await tx[sheet].clear()` empties each importer-owned sheet, then a loop of `await tx[sheet].upsert(record)` re-populates fresh. Deletes are captured implicitly — anything that was in the previous snapshot but isn't in the new one is gone after clear() - Path templates from `.gitsheets/.toml` produce slug-keyed paths directly (`people/${{ slug }}.toml`, `tags/${{ namespace }}/${{ slug }}.toml`, `project-memberships/${{ projectSlug }}/${{ personSlug }}.toml`, …), matching the runtime spec. legacy-import → fixture is now a plain `git merge`, no rename pass. - Zod schema validation runs on every record (the bare-git version skipped it once translation passed) - Pre-pass walks the previous snapshot via `store..query()` to preserve record UUIDs across re-runs. Composite-path sheets (project-memberships, tag-assignments) recover legacy IDs via uuid→legacyId reverse maps built from the simple-sheet pass. Composite-path records (memberships, updates, buzz) need slug fields in the upserted record for the path template to render; passthrough on the schemas lets us add `projectSlug` / `personSlug` alongside the schema-declared `projectId` / `personId`. Same pattern the API's runtime write services already use (`withMembershipPath`). Deletes: - `toToml()` (replaced by gitsheets' own TOML serializer) - `writeRecord()` / `writeAllRecords()` / `wipeOwnedDirectories()` (gitsheets Sheet operations replace them) - `git cat-file --batch` (gitsheets `query()` replaces it) - `ensureBranch` + `checkoutBranch` + `createImportCommit` (collapses to a small `ensureBranchCheckedOut` helper; transact does the commit) - The `--no-commit` CLI flag (gitsheets transact is atomic; `--dry-run` is the only meaningful preview) Known upstream limitations (filed): - JarvusInnovations/gitsheets#178 — `Sheet#clear()` walks + deletes every child instead of pointing the subtree at git's empty-tree hash - JarvusInnovations/gitsheets#179 — `Transaction#finalize()` commits any time `markMutated` is set, even when the resulting tree-hash equals the parent's. We work around #179 here with a post-transact tree-hash compare + `git update-ref` reset; clean fix is upstream. Tests: - All 22 importer tests pass against an updated `makeRepo()` fixture that seeds the full `.gitsheets/*.toml` matrix the gitsheets store requires - The "modified single record" idempotence test now asserts a slug-keyed diff (`projects/transit-app.toml`) instead of the legacyId-keyed `projects/100.toml` - File-content reads switched from working-tree `readFile` to `git show HEAD:` — gitsheets commits to the object DB without touching the working tree, so the previous test pattern was reading the pre-commit state by accident Live verification: import against codeforphilly.org (page-size=10000, delay-ms=0) produces 44,077 records in a slug-keyed tree on legacy-import, ready to merge into fixture. Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/api/scripts/import-laddr.ts | 11 +- apps/api/scripts/import-laddr/importer.ts | 794 +++++++--------------- apps/api/tests/import-laddr.test.ts | 92 ++- 3 files changed, 320 insertions(+), 577 deletions(-) diff --git a/apps/api/scripts/import-laddr.ts b/apps/api/scripts/import-laddr.ts index 5efc8d2..a45ebdc 100644 --- a/apps/api/scripts/import-laddr.ts +++ b/apps/api/scripts/import-laddr.ts @@ -11,7 +11,7 @@ * --source-host=codeforphilly.org \ * --data-repo=/path/to/codeforphilly-data \ * --branch=legacy-import \ - * [--dry-run] [--no-commit] [--limit=N] [--verbose] [--page-size=N] [--delay-ms=N] + * [--dry-run] [--limit=N] [--verbose] [--page-size=N] [--delay-ms=N] * * Defaults: * --source-host codeforphilly.org @@ -30,7 +30,6 @@ interface CliArgs { readonly dataRepo: string; readonly branch: string; readonly dryRun: boolean; - readonly noCommit: boolean; readonly limit: number | undefined; readonly verbose: boolean; readonly pageSize: number | undefined; @@ -76,7 +75,6 @@ function parseArgs(argv: readonly string[]): CliArgs { ? (opts['branch'] as string) : 'legacy-import', dryRun: opts['dry-run'] === true, - noCommit: opts['no-commit'] === true, limit: typeof limit === 'number' && Number.isFinite(limit) ? limit : undefined, verbose: opts['verbose'] === true, pageSize: typeof pageSize === 'number' && Number.isFinite(pageSize) ? pageSize : undefined, @@ -90,16 +88,13 @@ async function main(): Promise { console.log(`[import-laddr] source-host=${args.sourceHost}`); console.log(`[import-laddr] data-repo=${args.dataRepo}`); console.log(`[import-laddr] branch=${args.branch}`); - console.log( - `[import-laddr] dry-run=${args.dryRun} no-commit=${args.noCommit} limit=${args.limit ?? 'none'}`, - ); + console.log(`[import-laddr] dry-run=${args.dryRun} limit=${args.limit ?? 'none'}`); const report = await importLaddrFromJson({ sourceHost: args.sourceHost, dataRepo: args.dataRepo, branch: args.branch, dryRun: args.dryRun, - noCommit: args.noCommit, limit: args.limit, verbose: args.verbose, pageSize: args.pageSize, @@ -127,8 +122,6 @@ function printReport(report: ImportReport, args: CliArgs): void { } if (args.dryRun) { lines.push(`(dry-run: no writes performed)`); - } else if (args.noCommit) { - lines.push(`(no-commit: files staged, no commit made)`); } else if (report.noChanges) { lines.push(`(no changes from parent commit — branch unchanged)`); } else if (report.commitHash) { diff --git a/apps/api/scripts/import-laddr/importer.ts b/apps/api/scripts/import-laddr/importer.ts index a38e1d2..2c61c49 100644 --- a/apps/api/scripts/import-laddr/importer.ts +++ b/apps/api/scripts/import-laddr/importer.ts @@ -8,22 +8,22 @@ * Branch model: * - On first run, `legacy-import` is created from the `empty` branch (which * carries only `.gitsheets/` configs, no records). - * - On subsequent runs, the importer resets a working ref to the current - * `legacy-import` HEAD, removes every importer-owned directory, writes - * fresh files, and commits. - * - Records use `/.toml` paths (composite for memberships - * and tag-assignments) so re-runs overwrite stable filenames. The - * legacy-import branch is parallel history — the runtime spec's slug- - * based path templates apply once data is merged into `main`, which is - * an operator step outside this importer's scope. + * - On subsequent runs, the importer checks out the existing local branch, + * opens the gitsheets store, reads the previous snapshot's record UUIDs + * (to keep them stable across re-runs), then opens a single transaction + * that clears each importer-owned sheet and re-upserts the fresh data. + * - Files are written by gitsheets per each sheet's path template + * (e.g. `people/${{ slug }}.toml`) — the runtime API reads the same paths. + * The legacy-import branch's tree is shape-identical to fixture/main; the + * operator's merge into main is a plain `git merge`, no rename pass. * * Author identity on every commit: the pseudonymous Code for Philly API - * user (see plans/laddr-import-via-json.md). The agent's git config is - * never used. + * user. The agent's git config is never used (gitsheets honors `author` + * directly on `transact`). * * Side effects: - * - Writes/removes files in the data repo's working tree - * - Creates one commit on the local `legacy-import` branch + * - Checks out the target branch in the data repo's working tree + * - Creates one commit on the local `legacy-import` branch (via gitsheets) * - Does NOT push to origin (operator's call) * * Private-store side: out of scope for this importer. The JSON endpoints @@ -31,8 +31,7 @@ * newsletter prefs) will be imported separately on a future plan. */ import { execFile } from 'node:child_process'; -import { mkdir, readdir, rm, writeFile } from 'node:fs/promises'; -import { join, resolve } from 'node:path'; +import { resolve } from 'node:path'; import { promisify } from 'node:util'; const exec = promisify(execFile); @@ -56,6 +55,7 @@ import type { TagAssignment, } from '@cfp/shared/schemas'; +import { openPublicStore, type PublicStore } from '../../src/store/public.js'; import { fetchAllPages, RawPersonSchema, @@ -101,8 +101,6 @@ export interface ImportOptions { readonly initialParent?: string; /** If true, fetch + translate + report but do not write to the repo. */ readonly dryRun?: boolean; - /** If true, write files + stage but do not commit. */ - readonly noCommit?: boolean; /** Truncate each fetched resource to N rows (for dev loops). */ readonly limit?: number; /** Increase logging verbosity. */ @@ -132,25 +130,15 @@ export interface ImportReport { readonly branch: string; readonly counts: Record; readonly warnings: string[]; - /** Commit hash produced, or null in `--dry-run` / `--no-commit` / no-changes. */ + /** Commit hash produced, or null in `--dry-run` / no-changes. */ readonly commitHash: string | null; - /** True when the working tree after staging matches HEAD (so no commit was made). */ + /** True when the produced tree matches HEAD's (no commit was made). */ readonly noChanges: boolean; } const AUTHOR_NAME = 'Code for Philly API'; const AUTHOR_EMAIL = 'api@users.noreply.codeforphilly.org'; -const IMPORTER_OWNED_DIRS = [ - 'people', - 'projects', - 'tags', - 'project-memberships', - 'project-updates', - 'project-buzz', - 'tag-assignments', -] as const; - // --------------------------------------------------------------------------- // Entry point // --------------------------------------------------------------------------- @@ -180,16 +168,27 @@ export async function importLaddrFromJson(opts: ImportOptions): Promise( - '/tags', - RawTagSchema, - {}, - fetchOpts, - )) { + for await (const row of fetchAllPages('/tags', RawTagSchema, {}, fetchOpts)) { const translated = translateTag(row, ctx); if (translated === null) { counts.tags!.skipped++; @@ -229,7 +223,6 @@ export async function importLaddrFromJson(opts: ImportOptions): Promise = []; for await (const row of fetchAllPages( '/people', RawPersonSchema, @@ -262,11 +255,6 @@ export async function importLaddrFromJson(opts: ImportOptions): Promise` and create commit. + // 3. One atomic gitsheets transaction: + // - clear() each importer-owned sheet (deletes capture for free) + // - upsert() every translated record (path-template + schema validation) // ------------------------------------------------------------------------- - const repo = resolve(opts.dataRepo); - await ensureGitRepo(repo); - const parent = await ensureBranch(repo, branch, initialParent); - await checkoutBranch(repo, branch, parent); - await wipeOwnedDirectories(repo); - - const filesWritten = await writeAllRecords(repo, { - tags, - people, - projects, - memberships, - updates, - buzz, - tagAssignments, - tagAssignmentLegacyTuples, - idMaps, - warnings, - }); - - log(`[import] wrote ${filesWritten} files`); + if (store === null) throw new Error('[import-laddr] internal: store not opened'); + + const message = buildCommitMessage({ runAt, sourceHost: opts.sourceHost, counts }); + + const result = await store.transact( + { + message, + author: { name: AUTHOR_NAME, email: AUTHOR_EMAIL }, + trailers: { + Action: 'import.laddr.json', + 'Source-Host': opts.sourceHost, + 'Run-At': runAt, + }, + }, + async (tx) => { + log(`[import] clear + upsert tags (${tags.length})`); + await tx.tags.clear(); + for (const t of tags) await tx.tags.upsert(t); + + log(`[import] clear + upsert people (${people.length})`); + await tx.people.clear(); + for (const p of people) await tx.people.upsert(p); + + log(`[import] clear + upsert projects (${projects.length})`); + await tx.projects.clear(); + for (const p of projects) await tx.projects.upsert(p); + + log(`[import] clear + upsert project-memberships (${memberships.length})`); + await tx['project-memberships'].clear(); + for (const m of memberships) { + const projectSlug = idMaps.projectSlugByLegacy.get(m.legacyIds.projectLegacyId); + const personSlug = idMaps.personSlugByLegacy.get(m.legacyIds.personLegacyId); + if (!projectSlug || !personSlug) { + warnings.push( + `[project-memberships] skipped: missing slug for project=${m.legacyIds.projectLegacyId} or person=${m.legacyIds.personLegacyId}`, + ); + continue; + } + await tx['project-memberships'].upsert({ + ...m.record, + projectSlug, + personSlug, + } as ProjectMembership); + } - // ------------------------------------------------------------------------- - // 4. Stage and check for changes. - // ------------------------------------------------------------------------- - for (const dir of IMPORTER_OWNED_DIRS) { - await git(repo, 'add', '-A', '--', dir); - } + log(`[import] clear + upsert project-updates (${updates.length})`); + await tx['project-updates'].clear(); + for (const { record, projectLegacyId } of updates) { + const projectSlug = idMaps.projectSlugByLegacy.get(projectLegacyId); + if (!projectSlug) { + warnings.push( + `[project-updates] skipped: missing slug for project=${projectLegacyId}`, + ); + continue; + } + await tx['project-updates'].upsert({ ...record, projectSlug } as ProjectUpdate); + } - if (opts.noCommit) { - return { - runAt, - sourceHost: opts.sourceHost, - branch, - counts, - warnings: warningsList, - commitHash: null, - noChanges: false, - }; - } + log(`[import] clear + upsert project-buzz (${buzz.length})`); + await tx['project-buzz'].clear(); + for (const { record, projectLegacyId } of buzz) { + const projectSlug = idMaps.projectSlugByLegacy.get(projectLegacyId); + if (!projectSlug) { + warnings.push( + `[project-buzz] skipped: missing slug for project=${projectLegacyId}`, + ); + continue; + } + await tx['project-buzz'].upsert({ ...record, projectSlug } as ProjectBuzz); + } - // Compare the tree we built to the parent's tree — when nothing changed - // upstream, we want to exit cleanly without creating an empty commit. - const { stdout: porcelain } = await git(repo, 'status', '--porcelain'); - if (porcelain.trim() === '') { - log('[import] no changes from parent commit — nothing to commit'); - return { - runAt, - sourceHost: opts.sourceHost, - branch, - counts, - warnings: warningsList, - commitHash: null, - noChanges: true, - }; + log(`[import] clear + upsert tag-assignments (${tagAssignments.length})`); + await tx['tag-assignments'].clear(); + for (const ta of tagAssignments) await tx['tag-assignments'].upsert(ta); + }, + ); + + // gitsheets' Transaction#finalize doesn't compare the resulting tree-hash + // to the parent's tree-hash before committing — it commits whenever any + // mutating method was called, even if the tree ended up byte-identical + // (e.g. `clear()` + re-`upsert()` of the same records). Filed as + // JarvusInnovations/gitsheets#179. Until that lands, detect the no-op here + // and reset the ref so the snapshot history stays clean. + let commitHash: string | null = result.commitHash; + let noChanges = commitHash === null; + if (commitHash !== null && result.parentCommitHash !== null && result.treeHash !== null) { + const parentTreeHash = ( + await exec('git', ['rev-parse', `${result.parentCommitHash}^{tree}`], { cwd: repo }) + ).stdout.trim(); + if (parentTreeHash === result.treeHash) { + await exec('git', ['update-ref', `refs/heads/${branch}`, result.parentCommitHash, commitHash], { + cwd: repo, + }); + commitHash = null; + noChanges = true; + log('[import] tree matches parent — reset ref (no-op snapshot)'); + } } - const commitHash = await createImportCommit(repo, { - branch, - runAt, - sourceHost: opts.sourceHost, - counts, - }); - return { runAt, sourceHost: opts.sourceHost, @@ -491,255 +507,20 @@ export async function importLaddrFromJson(opts: ImportOptions): Promise { - let count = 0; - - // people/.toml - for (const r of b.people) { - if (r.legacyId === undefined) continue; - await writeRecord(repo, ['people', `${r.legacyId}.toml`], r); - count++; - } - // projects/.toml - for (const r of b.projects) { - if (r.legacyId === undefined) continue; - await writeRecord(repo, ['projects', `${r.legacyId}.toml`], r); - count++; - } - // tags/.toml - for (const r of b.tags) { - if (r.legacyId === undefined) continue; - await writeRecord(repo, ['tags', `${r.legacyId}.toml`], r); - count++; - } - // project-memberships/-.toml - for (const { record, legacyIds } of b.memberships) { - await writeRecord( - repo, - ['project-memberships', `${legacyIds.projectLegacyId}-${legacyIds.personLegacyId}.toml`], - record, - ); - count++; - } - // project-updates/.toml - for (const { record } of b.updates) { - if (record.legacyId === undefined) continue; - await writeRecord(repo, ['project-updates', `${record.legacyId}.toml`], record); - count++; - } - // project-buzz/.toml - for (const { record } of b.buzz) { - if (record.legacyId === undefined) continue; - await writeRecord(repo, ['project-buzz', `${record.legacyId}.toml`], record); - count++; - } - // tag-assignments/--.toml - for (let i = 0; i < b.tagAssignments.length; i++) { - const record = b.tagAssignments[i]!; - const legacy = b.tagAssignmentLegacyTuples[i]!; - await writeRecord( - repo, - [ - 'tag-assignments', - `${legacy.tagLegacyId}-${legacy.taggableType}-${legacy.taggableLegacyId}.toml`, - ], - record, - ); - count++; - } - - return count; -} - -async function writeRecord( - repo: string, - pathParts: readonly string[], - record: Record, -): Promise { - const full = join(repo, ...pathParts); - await mkdir(join(full, '..'), { recursive: true }); - await writeFile(full, toToml(record), 'utf8'); -} - -// --------------------------------------------------------------------------- -// TOML serialization (flat records; same shape as scripts/scrub-data.ts). -// Records are written with keys in a stable alphabetical order so consecutive -// snapshots produce stable diffs even if the in-memory object key order -// drifts. +// Helpers // --------------------------------------------------------------------------- -export function toToml(record: Record): string { - const keys = Object.keys(record).sort(); - const lines: string[] = []; - for (const key of keys) { - const value = record[key]; - if (value === null || value === undefined) continue; - if (typeof value === 'string') { - if (value.includes('\n')) { - // Use TOML's basic-multiline-string form; escape the rare embedded - // triple-quote sequence and any backslashes. - const escaped = value.replace(/\\/g, '\\\\').replace(/"""/g, '\\"""'); - lines.push(`${key} = """\n${escaped}\n"""`); - } else { - const escaped = value.replace(/\\/g, '\\\\').replace(/"/g, '\\"'); - lines.push(`${key} = "${escaped}"`); - } - } else if (typeof value === 'number') { - lines.push(`${key} = ${value}`); - } else if (typeof value === 'boolean') { - lines.push(`${key} = ${value}`); - } - // Arrays/objects intentionally not handled — all current v1 record fields - // are scalar at the top level. - } - return `${lines.join('\n')}\n`; -} - -// --------------------------------------------------------------------------- -// Git helpers -// --------------------------------------------------------------------------- - -function git( - cwd: string, - ...args: string[] -): Promise<{ stdout: string; stderr: string }> { - return exec('git', args, { cwd, maxBuffer: 256 * 1024 * 1024 }); -} - -async function ensureGitRepo(repo: string): Promise { - try { - await git(repo, 'rev-parse', '--git-dir'); - } catch (err) { - throw new Error( - `[import-laddr] ${repo} is not a git working directory: ${describe(err)}`, - { cause: err }, - ); - } -} - -/** - * Make sure the named branch exists locally. Returns the parent commit hash - * we should use as the snapshot's parent — current branch tip if it exists, - * else `initialParent`'s commit hash. - */ -async function ensureBranch( - repo: string, - branch: string, - initialParent: string, -): Promise { - try { - const { stdout } = await git(repo, 'rev-parse', '--verify', `refs/heads/${branch}`); - return stdout.trim(); - } catch { - // No local branch. Try `origin/` first; fall back to initialParent. - try { - const { stdout } = await git(repo, 'rev-parse', '--verify', `refs/remotes/origin/${branch}`); - return stdout.trim(); - } catch { - // ignore — fall through - } - const { stdout } = await git(repo, 'rev-parse', '--verify', initialParent); - return stdout.trim(); - } -} - -async function checkoutBranch( - repo: string, - branch: string, - parent: string, -): Promise { - // Force-reset working tree to the desired parent under the named branch. - await git(repo, 'checkout', '-B', branch, parent); -} - -async function wipeOwnedDirectories(repo: string): Promise { - for (const dir of IMPORTER_OWNED_DIRS) { - const full = join(repo, dir); - // `git rm -rf -- ` removes both the index entries and the working - // tree files in one shot. The first run on a fresh branch has nothing - // to remove, so swallow ENOENT-style failures. - try { - await git(repo, 'rm', '-rf', '--ignore-unmatch', '--', dir); - } catch { - // ignore — directory not present - } - // Defensively remove any leftover working-tree files (covers untracked - // detritus from a previous --no-commit run). - await rm(full, { recursive: true, force: true }); - } -} - interface CommitParams { - readonly branch: string; readonly runAt: string; readonly sourceHost: string; readonly counts: Record; } -async function createImportCommit( - repo: string, - p: CommitParams, -): Promise { - const env = { - ...process.env, - GIT_AUTHOR_NAME: AUTHOR_NAME, - GIT_AUTHOR_EMAIL: AUTHOR_EMAIL, - GIT_COMMITTER_NAME: AUTHOR_NAME, - GIT_COMMITTER_EMAIL: AUTHOR_EMAIL, - GIT_AUTHOR_DATE: p.runAt, - GIT_COMMITTER_DATE: p.runAt, - }; - - const message = buildCommitMessage(p); - const messageFile = join(repo, '.git', 'IMPORT_LADDR_MSG'); - await writeFile(messageFile, message, 'utf8'); - - // Use `--quiet` to keep `git commit`'s stdout small (the create-mode list - // for a 40k-file snapshot otherwise exceeds the default execFile buffer). - await exec('git', ['commit', '--quiet', '-F', messageFile], { - cwd: repo, - env, - maxBuffer: 256 * 1024 * 1024, - }); - - const { stdout: shaRaw } = await git(repo, 'rev-parse', 'HEAD'); - return shaRaw.trim(); -} - function buildCommitMessage(p: CommitParams): string { const c = p.counts; const subject = `import: snapshot from ${p.sourceHost} (${p.runAt})`; @@ -753,13 +534,9 @@ function buildCommitMessage(p: CommitParams): string { `${c['tag-assignments']!.imported} tag-assignments`, ].join(', '); - return `${subject}\n\n${summary}.\n\nAction: import.laddr.json\nSource-Host: ${p.sourceHost}\nRun-At: ${p.runAt}\n`; + return `${subject}\n\n${summary}.\n`; } -// --------------------------------------------------------------------------- -// Misc helpers -// --------------------------------------------------------------------------- - function blank(): EntityCounts { return { imported: 0, skipped: 0, errors: 0 }; } @@ -784,206 +561,141 @@ function describe(err: unknown): string { return String(err); } +// --------------------------------------------------------------------------- +// Git plumbing (branch checkout only — gitsheets handles commit creation) +// --------------------------------------------------------------------------- + +async function ensureGitRepo(repo: string): Promise { + try { + await exec('git', ['rev-parse', '--git-dir'], { cwd: repo }); + } catch (err) { + throw new Error( + `[import-laddr] ${repo} is not a git working directory: ${describe(err)}`, + { cause: err }, + ); + } +} + /** - * Read each importer-owned `.toml` file from the latest snapshot tip and - * extract the record's `id` field. Used to keep UUIDs stable across re-runs - * so an unchanged source produces an unchanged tree (idempotence). - * - * Reads from `refs/heads/` if it exists, then `refs/remotes/origin/ - * `, then the configured fallback. Returns an empty map if no parent - * exists yet (first run). - * - * Implementation note: `git cat-file --batch` is used to stream blob contents - * in a single subprocess rather than fork+exec per-file. Snapshots can have - * 40k+ files; per-file `git show` calls take many minutes. + * Check out the target branch in the working tree. On first run, create it + * from `origin/` if available, falling back to `initialParent` + * (typically `origin/empty`). */ -async function collectExistingIds( +async function ensureBranchCheckedOut( repo: string, branch: string, initialParent: string, -): Promise { - const ids = newExistingIds(); - let ref: string | null = null; - for (const candidate of [ - `refs/heads/${branch}`, - `refs/remotes/origin/${branch}`, - initialParent, - ]) { - try { - await git(repo, 'rev-parse', '--verify', candidate); - ref = candidate; - break; - } catch { - // try next - } +): Promise { + // Existing local branch: just switch. + try { + await exec('git', ['rev-parse', '--verify', `refs/heads/${branch}`], { cwd: repo }); + await exec('git', ['checkout', branch], { cwd: repo }); + return; + } catch { + // No local branch — fall through. } - if (ref === null) return ids; - - // `ls-tree -r` gives us mode + sha + filename for every file under the - // commit's tree. We need both blob sha (for cat-file --batch lookup) and - // path (so we know which sheet the record belongs to). - let listing: string; + // No local branch yet. Try origin/, fall back to initialParent. + let parent: string; try { - const { stdout } = await git(repo, 'ls-tree', '-r', ref); - listing = stdout; + await exec('git', ['rev-parse', '--verify', `refs/remotes/origin/${branch}`], { + cwd: repo, + }); + parent = `origin/${branch}`; } catch { - return ids; + parent = initialParent; } + await exec('git', ['checkout', '-b', branch, parent], { cwd: repo }); +} - interface Entry { - readonly sha: string; - readonly path: string; - } - const entries: Entry[] = []; - for (const line of listing.split('\n')) { - // Format: ` \t` - const tabIdx = line.indexOf('\t'); - if (tabIdx === -1) continue; - const meta = line.slice(0, tabIdx).split(/\s+/); - const path = line.slice(tabIdx + 1); - if (meta.length < 3) continue; - if (!path.endsWith('.toml')) continue; - let owned = false; - for (const dir of IMPORTER_OWNED_DIRS) { - if (path.startsWith(`${dir}/`)) { - owned = true; - break; +// --------------------------------------------------------------------------- +// Pre-pass: read existing UUIDs from the current snapshot so re-runs preserve +// each record's `id` field. Keys mirror the translator's `idFor(ctx, key)` +// calls. +// +// Simple sheets (people, projects, tags, project-updates, project-buzz) +// carry `legacyId` directly on the record, so we read them in one pass. +// Composite-path sheets (project-memberships, tag-assignments) don't store +// legacy IDs on the record; we recover them via uuid→legacyId reverse maps +// built from the simple-sheet pass. +// --------------------------------------------------------------------------- + +async function collectExistingIds( + store: PublicStore, + log: (msg: string) => void, +): Promise { + const ids = newExistingIds(); + let count = 0; + + // Pass 1: simple sheets. Also build uuid → legacyId reverse maps used by + // the composite-sheet pass below. + const personLegacyByUuid = new Map(); + const projectLegacyByUuid = new Map(); + const tagLegacyByUuid = new Map(); + + const simpleSheets = ['people', 'projects', 'tags', 'project-updates', 'project-buzz'] as const; + for (const sheetName of simpleSheets) { + const sheet = store[sheetName] as { query: () => AsyncIterable> }; + for await (const record of sheet.query()) { + const legacyId = record['legacyId']; + const id = record['id']; + if (typeof legacyId === 'number' && typeof id === 'string') { + ids.byFile.set(`${sheetName}/${legacyId}`, id); + count++; + if (sheetName === 'people') personLegacyByUuid.set(id, legacyId); + else if (sheetName === 'projects') projectLegacyByUuid.set(id, legacyId); + else if (sheetName === 'tags') tagLegacyByUuid.set(id, legacyId); } } - if (!owned) continue; - entries.push({ sha: meta[2]!, path }); } - if (entries.length === 0) return ids; - - // Spawn `git cat-file --batch` once; feed it newline-separated SHAs on stdin, - // parse the streamed ` blob \n\n` responses. - const blobs = await batchCatFile(repo, entries.map((e) => e.sha)); - for (let i = 0; i < entries.length; i++) { - const content = blobs[i] ?? ''; - const id = extractTomlString(content, 'id'); - if (id) { - const key = entries[i]!.path.replace(/\.toml$/, ''); - ids.byFile.set(key, id); + // Pass 2: composite-path sheets. Look up legacy IDs for the referenced + // entities; skip records that point at uuids we couldn't resolve. + const membershipsSheet = store['project-memberships'] as { + query: () => AsyncIterable>; + }; + for await (const record of membershipsSheet.query()) { + const projectId = record['projectId']; + const personId = record['personId']; + const id = record['id']; + if (typeof id !== 'string' || typeof projectId !== 'string' || typeof personId !== 'string') { + continue; } + const projectLegacyId = projectLegacyByUuid.get(projectId); + const personLegacyId = personLegacyByUuid.get(personId); + if (projectLegacyId === undefined || personLegacyId === undefined) continue; + ids.byFile.set(`project-memberships/${projectLegacyId}-${personLegacyId}`, id); + count++; } - return ids; -} - -/** - * Stream blob contents via a single `git cat-file --batch` invocation. Each - * input SHA produces one entry in the returned array, in the same order. - * - * The protocol: emit one SHA per line on stdin; for each, git emits a header - * line ` \n` followed by `` bytes of content and a - * trailing `\n`. On `missing` (unknown SHA), git emits ` missing\n` and - * no content. We treat missing as empty. - */ -async function batchCatFile(repo: string, shas: readonly string[]): Promise { - if (shas.length === 0) return []; - const { spawn } = await import('node:child_process'); - return await new Promise((resolve, reject) => { - const child = spawn('git', ['cat-file', '--batch'], { - cwd: repo, - stdio: ['pipe', 'pipe', 'pipe'], - }); - - const results: string[] = []; - let stderrAcc = ''; - let buf = Buffer.alloc(0); - let mode: 'header' | 'content' = 'header'; - let expected = 0; - - child.stderr.setEncoding('utf8'); - child.stderr.on('data', (chunk: string) => { - stderrAcc += chunk; - }); - - child.stdout.on('data', (chunk: Buffer) => { - buf = Buffer.concat([buf, chunk]); - while (true) { - if (mode === 'header') { - const nl = buf.indexOf(0x0a); - if (nl === -1) return; - const header = buf.slice(0, nl).toString('utf8'); - buf = buf.slice(nl + 1); - // header is ` ` or ` missing` - const parts = header.split(' '); - if (parts.length === 3 && parts[1] !== 'missing') { - expected = parseInt(parts[2]!, 10); - mode = 'content'; - } else { - // missing — no content body - results.push(''); - if (results.length === shas.length) { - try { - child.stdin.end(); - } catch { - // ignore - } - } - } - } else { - // content mode: wait for `expected` bytes + the trailing newline - if (buf.length < expected + 1) return; - const content = buf.slice(0, expected).toString('utf8'); - buf = buf.slice(expected + 1); // skip trailing newline - results.push(content); - mode = 'header'; - if (results.length === shas.length) { - try { - child.stdin.end(); - } catch { - // ignore - } - } - } - } - }); - - child.on('close', (code) => { - if (code !== 0 && results.length !== shas.length) { - reject(new Error(`git cat-file --batch exited ${code}: ${stderrAcc}`)); - } else { - resolve(results); - } - }); - child.on('error', reject); - - // Feed SHAs as a single write — git's batch mode reads to EOL. - const payload = shas.join('\n') + '\n'; - child.stdin.write(payload); - // Don't end stdin yet — close it when all entries have been read so the - // batch process drains cleanly. (Closing early on a slow consumer would - // truncate output.) - }); -} -function extractTomlString(content: string, key: string): string | null { - const re = new RegExp(`^${key}\\s*=\\s*"(.*)"$`, 'm'); - const m = content.match(re); - if (m === null) return null; - // Reverse the simple TOML escapes used by our writer. - return (m[1] ?? '').replace(/\\"/g, '"').replace(/\\\\/g, '\\'); -} - -// Exposed for direct invocation in tests that walk the tree. -export { IMPORTER_OWNED_DIRS }; - -// Used by tests that want to introspect the unused-but-imported readdir helper. -export async function listOwnedToml(repo: string): Promise { - const out: string[] = []; - for (const dir of IMPORTER_OWNED_DIRS) { - const full = join(repo, dir); - try { - for (const entry of await readdir(full, { withFileTypes: true })) { - if (entry.isFile() && entry.name.endsWith('.toml')) { - out.push(`${dir}/${entry.name}`); - } - } - } catch { - // dir not present + const tagAssignmentsSheet = store['tag-assignments'] as { + query: () => AsyncIterable>; + }; + for await (const record of tagAssignmentsSheet.query()) { + const tagId = record['tagId']; + const taggableId = record['taggableId']; + const taggableType = record['taggableType']; + const id = record['id']; + if ( + typeof id !== 'string' || + typeof tagId !== 'string' || + typeof taggableId !== 'string' || + (taggableType !== 'project' && taggableType !== 'person') + ) { + continue; } + const tagLegacyId = tagLegacyByUuid.get(tagId); + const taggableLegacyId = + taggableType === 'project' + ? projectLegacyByUuid.get(taggableId) + : personLegacyByUuid.get(taggableId); + if (tagLegacyId === undefined || taggableLegacyId === undefined) continue; + ids.byFile.set( + `tag-assignments/${tagLegacyId}-${taggableType}-${taggableLegacyId}`, + id, + ); + count++; } - return out; + + log(`[import] pre-pass: preserved ${count} record UUIDs from previous snapshot`); + return ids; } diff --git a/apps/api/tests/import-laddr.test.ts b/apps/api/tests/import-laddr.test.ts index d61b81b..bb3a62f 100644 --- a/apps/api/tests/import-laddr.test.ts +++ b/apps/api/tests/import-laddr.test.ts @@ -369,12 +369,39 @@ async function makeRepo(): Promise<{ path: string; cleanup: () => Promise await run('config', 'user.email', 'test@cfp.test'); await run('config', 'user.name', 'test'); await run('config', 'commit.gpgsign', 'false'); - // Create an "empty" branch with a .gitsheets seed similar to upstream + // Create an "empty" branch with the full .gitsheets config matrix. The + // importer opens the gitsheets store at boot, which requires every sheet + // declared on the validator map (PublicValidators) to have a corresponding + // .gitsheets/.toml. Mirrors the upstream codeforphilly-data repo. await mkdir(join(dir, '.gitsheets'), { recursive: true }); - await writeFile( - join(dir, '.gitsheets', 'people.toml'), - "[gitsheet]\nroot = 'people'\npath = '${{ slug }}'\n", - ); + const sheets: Array<[string, string]> = [ + ['people', "root = 'people'\npath = '${{ slug }}'\n"], + ['projects', "root = 'projects'\npath = '${{ slug }}'\n"], + ['tags', "root = 'tags'\npath = '${{ namespace }}/${{ slug }}'\n"], + [ + 'project-memberships', + "root = 'project-memberships'\npath = '${{ projectSlug }}/${{ personSlug }}'\n", + ], + [ + 'project-updates', + "root = 'project-updates'\npath = '${{ projectSlug }}/${{ number }}'\n", + ], + ['project-buzz', "root = 'project-buzz'\npath = '${{ projectSlug }}/${{ slug }}'\n"], + [ + 'tag-assignments', + "root = 'tag-assignments'\npath = '${{ taggableType }}/${{ taggableId }}/${{ tagId }}'\n", + ], + ['help-wanted-roles', "root = 'help-wanted-roles'\npath = '${{ projectSlug }}/${{ slug }}'\n"], + [ + 'help-wanted-interest', + "root = 'help-wanted-interest'\npath = '${{ roleId }}/${{ personId }}'\n", + ], + ['slug-history', "root = 'slug-history'\npath = '${{ entityType }}/${{ slug }}'\n"], + ['revocations', "root = 'revocations'\npath = '${{ jti }}'\n"], + ]; + for (const [name, body] of sheets) { + await writeFile(join(dir, '.gitsheets', `${name}.toml`), `[gitsheet]\n${body}`); + } await run('add', '.gitsheets'); await run('commit', '-m', 'initial empty branch'); await run('branch', '-M', 'empty'); // rename initial branch @@ -551,39 +578,48 @@ describe('importLaddrFromJson — orchestrator', () => { const tree = await exec('git', ['ls-tree', '-r', '--name-only', 'HEAD'], { cwd: repo }); const paths = tree.stdout.split('\n').filter(Boolean); - // people/.toml — keyed by legacyId, not slug - expect(paths).toContain('people/10.toml'); - expect(paths).toContain('people/20.toml'); - // projects/.toml - expect(paths).toContain('projects/100.toml'); - // tags/.toml - expect(paths).toContain('tags/1.toml'); - expect(paths).toContain('tags/2.toml'); - // composite memberships - expect(paths).toContain('project-memberships/100-10.toml'); - expect(paths).toContain('project-memberships/100-20.toml'); - // composite tag-assignments - expect(paths).toContain('tag-assignments/1-project-100.toml'); - expect(paths).toContain('tag-assignments/2-person-10.toml'); - // updates + buzz by legacyId - expect(paths).toContain('project-updates/500.toml'); - expect(paths).toContain('project-buzz/800.toml'); + // people/.toml — gitsheets path templates resolve `${{ slug }}` + expect(paths).toContain('people/alice.toml'); + expect(paths).toContain('people/bob.toml'); + // projects/.toml + expect(paths).toContain('projects/transit-app.toml'); + // tags//.toml — split from handle `topic.transit` + expect(paths).toContain('tags/topic/transit.toml'); + expect(paths).toContain('tags/tech/javascript.toml'); + // project-memberships//.toml + expect(paths).toContain('project-memberships/transit-app/alice.toml'); + expect(paths).toContain('project-memberships/transit-app/bob.toml'); + // tag-assignments: `taggableType/taggableId/tagId.toml`, UUID-keyed. + // Don't pin exact UUIDs — check shape + that both taggable types appear. + const tagAssignmentPaths = paths.filter((p) => p.startsWith('tag-assignments/')); + expect(tagAssignmentPaths.length).toBeGreaterThanOrEqual(2); + expect(tagAssignmentPaths.some((p) => p.startsWith('tag-assignments/project/'))).toBe(true); + expect(tagAssignmentPaths.some((p) => p.startsWith('tag-assignments/person/'))).toBe(true); + // project-updates//.toml + expect(paths).toContain('project-updates/transit-app/1.toml'); + // project-buzz//.toml — buzz slug derived from Handle + expect(paths).toContain('project-buzz/transit-app/transit-app-on-tv.toml'); + + // gitsheets writes commits directly to the git object DB without + // touching the working tree. Read blob contents via `git show HEAD:`. + const showBlob = async (path: string): Promise => + (await exec('git', ['show', `HEAD:${path}`], { cwd: repo })).stdout; // Stage lowercased - const projToml = await readFile(join(repo, 'projects/100.toml'), 'utf8'); + const projToml = await showBlob('projects/transit-app.toml'); expect(projToml).toContain('stage = "prototyping"'); expect(projToml).toContain('legacyId = 100'); // chatChannel preserved expect(projToml).toContain('chatChannel = "transit-app"'); // Person.slackSamlNameId == slug - const aliceToml = await readFile(join(repo, 'people/10.toml'), 'utf8'); + const aliceToml = await showBlob('people/alice.toml'); expect(aliceToml).toContain('slackSamlNameId = "alice"'); expect(aliceToml).toContain('slug = "alice"'); // No PII (email-shaped patterns / bcrypt hashes) in any committed file for (const path of paths.filter((p) => p.endsWith('.toml'))) { - const content = await readFile(join(repo, path), 'utf8'); + const content = await showBlob(path); expect(content, `email-like in ${path}`).not.toMatch(/[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}/); expect(content, `bcrypt-like in ${path}`).not.toMatch(/\$2[ayb]\$/); } @@ -698,10 +734,12 @@ describe('importLaddrFromJson — orchestrator', () => { expect(second.noChanges).toBe(false); // The diff between the two commits should touch exactly one file: - // projects/100.toml. + // the transit-app project itself (path is slug-keyed). All other + // records (memberships, tag-assignments, etc.) preserve their UUIDs + // and content across re-runs. const diff = await exec('git', ['diff', '--name-only', `${first.commitHash}..${second.commitHash}`], { cwd: repo }); const changed = diff.stdout.split('\n').filter(Boolean); - expect(changed).toEqual(['projects/100.toml']); + expect(changed).toEqual(['projects/transit-app.toml']); } finally { await cleanup(); } From 0afdd253f0c1dc1a98d1e8b43772f0dadb76d62c Mon Sep 17 00:00:00 2001 From: Chris Alfano Date: Mon, 18 May 2026 22:50:40 -0400 Subject: [PATCH 2/3] fix(deploy): size pod for the full laddr-imported dataset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous resource limits (768Mi memory, default Node heap ~400Mi) were fine for the 4-record fixture but OOM'd immediately on the real import: <--- Last few GCs ---> Mark-Compact (reduce) 383.2 (390.9) -> 382.7 (391.4) MB FATAL ERROR: Reached heap limit Allocation failed - JavaScript heap out of memory Two changes, both rooted in the in-memory model the API uses (full public dataset + secondary indices + FTS all sit in RAM by design): - Bump container memory: requests 384→768 Mi, limits 768→2048 Mi. Headroom for ~31k people, 268 projects, 10k tag-assignments, FTS index, plus ephemeral per-request work. - Set NODE_OPTIONS=--max-old-space-size=1536 in the env ConfigMap. Node's default old-space cap in containers is ~25% of memory (kernel-detected, capped low). Without raising it, even a generous container limit doesn't matter — the pod OOMs at the heap ceiling before it touches the cgroup limit. 1.5 Gi leaves ~25% headroom under the 2 Gi container ceiling for the runtime's own ephemeral allocations. Live-verified on sandbox: pod boots in ~20s, 269 projects + ~30k people loaded, FTS ready, facet aggregations correct, readiness probe green. Co-Authored-By: Claude Opus 4.7 (1M context) --- deploy/kustomize/base/configmap.yaml | 5 +++++ deploy/kustomize/base/deployment.yaml | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/deploy/kustomize/base/configmap.yaml b/deploy/kustomize/base/configmap.yaml index 2558798..5ebdac9 100644 --- a/deploy/kustomize/base/configmap.yaml +++ b/deploy/kustomize/base/configmap.yaml @@ -9,6 +9,11 @@ data: GIT_AUTHOR_EMAIL: "api@codeforphilly.org" GIT_AUTHOR_NAME: "CodeForPhilly API" NODE_ENV: "production" + # Bump Node's old-space heap ceiling above the default (~400Mi on Linux + # containers) so the in-memory record set + secondary indices + FTS fit. + # Keep ~25% headroom below the container's memory limit for ephemeral + # work (request handling, gitsheets tree mutations, etc.). + NODE_OPTIONS: "--max-old-space-size=1536" PORT: "3001" STORAGE_BACKEND: "filesystem" CFP_DATA_BRANCH: "fixture" diff --git a/deploy/kustomize/base/deployment.yaml b/deploy/kustomize/base/deployment.yaml index 2e01faf..1b6b271 100644 --- a/deploy/kustomize/base/deployment.yaml +++ b/deploy/kustomize/base/deployment.yaml @@ -64,10 +64,12 @@ spec: resources: requests: cpu: 100m - memory: 384Mi + memory: 768Mi limits: cpu: 1000m - memory: 768Mi + # Holds the full public dataset + secondary indices + FTS in + # memory (~31k people, 268 projects, 10k tag-assignments, …) + memory: 2Gi securityContext: runAsNonRoot: true runAsUser: 1000 From 88b3b225c45402d58589a206db05e57ae5d23f4b Mon Sep 17 00:00:00 2001 From: Chris Alfano Date: Tue, 19 May 2026 07:47:18 -0400 Subject: [PATCH 3/3] chore(deps): bump gitsheets to ^1.3.1, drop transact workaround + unused imports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gitsheets v1.3.1 ships fixes for the two upstream issues this PR's importer worked around: - JarvusInnovations/gitsheets#178 — `Sheet#clear()` now uses git's empty-tree hash via hologit's `TreeObject.clearChildren()`. Constant-time instead of walking + `deleteChild` per entry. Snapshot importers feel this most. - JarvusInnovations/gitsheets#179 — `Transaction#finalize` now compares resulting tree-hash to parent's tree-hash; no commit is created when they match. The post-transact `git update-ref` workaround in the importer can go away. Removed: - The post-transact tree-hash comparison + ref reset (no longer needed) - Unused `IdMaps` type import (left over from the rewrite) - Unused `readFile` import in the test (replaced by `git show HEAD:`) Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/api/package.json | 2 +- apps/api/scripts/import-laddr/importer.ts | 30 ++++------------------- apps/api/tests/import-laddr.test.ts | 2 +- package-lock.json | 28 ++++++++++----------- 4 files changed, 21 insertions(+), 41 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index dd8162d..fd18494 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -31,7 +31,7 @@ "bcryptjs": "^3.0.3", "better-sqlite3": "^12.10.0", "fastify": "^5.8.5", - "gitsheets": "^1.2.0", + "gitsheets": "^1.3.1", "jose": "^6.2.3", "samlify": "^2.13.0", "uuidv7": "^1.2.1", diff --git a/apps/api/scripts/import-laddr/importer.ts b/apps/api/scripts/import-laddr/importer.ts index 2c61c49..ae56fdf 100644 --- a/apps/api/scripts/import-laddr/importer.ts +++ b/apps/api/scripts/import-laddr/importer.ts @@ -81,7 +81,6 @@ import { translateTagAssignment, translateUpdate, type ExistingIds, - type IdMaps, type TranslateCtx, type Warnings, } from './translators.js'; @@ -478,36 +477,17 @@ export async function importLaddrFromJson(opts: ImportOptions): Promise