diff --git a/apps/api/package.json b/apps/api/package.json index dd8162d..fd18494 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -31,7 +31,7 @@ "bcryptjs": "^3.0.3", "better-sqlite3": "^12.10.0", "fastify": "^5.8.5", - "gitsheets": "^1.2.0", + "gitsheets": "^1.3.1", "jose": "^6.2.3", "samlify": "^2.13.0", "uuidv7": "^1.2.1", diff --git a/apps/api/scripts/import-laddr.ts b/apps/api/scripts/import-laddr.ts index 5efc8d2..a45ebdc 100644 --- a/apps/api/scripts/import-laddr.ts +++ b/apps/api/scripts/import-laddr.ts @@ -11,7 +11,7 @@ * --source-host=codeforphilly.org \ * --data-repo=/path/to/codeforphilly-data \ * --branch=legacy-import \ - * [--dry-run] [--no-commit] [--limit=N] [--verbose] [--page-size=N] [--delay-ms=N] + * [--dry-run] [--limit=N] [--verbose] [--page-size=N] [--delay-ms=N] * * Defaults: * --source-host codeforphilly.org @@ -30,7 +30,6 @@ interface CliArgs { readonly dataRepo: string; readonly branch: string; readonly dryRun: boolean; - readonly noCommit: boolean; readonly limit: number | undefined; readonly verbose: boolean; readonly pageSize: number | undefined; @@ -76,7 +75,6 @@ function parseArgs(argv: readonly string[]): CliArgs { ? (opts['branch'] as string) : 'legacy-import', dryRun: opts['dry-run'] === true, - noCommit: opts['no-commit'] === true, limit: typeof limit === 'number' && Number.isFinite(limit) ? limit : undefined, verbose: opts['verbose'] === true, pageSize: typeof pageSize === 'number' && Number.isFinite(pageSize) ? pageSize : undefined, @@ -90,16 +88,13 @@ async function main(): Promise { console.log(`[import-laddr] source-host=${args.sourceHost}`); console.log(`[import-laddr] data-repo=${args.dataRepo}`); console.log(`[import-laddr] branch=${args.branch}`); - console.log( - `[import-laddr] dry-run=${args.dryRun} no-commit=${args.noCommit} limit=${args.limit ?? 'none'}`, - ); + console.log(`[import-laddr] dry-run=${args.dryRun} limit=${args.limit ?? 'none'}`); const report = await importLaddrFromJson({ sourceHost: args.sourceHost, dataRepo: args.dataRepo, branch: args.branch, dryRun: args.dryRun, - noCommit: args.noCommit, limit: args.limit, verbose: args.verbose, pageSize: args.pageSize, @@ -127,8 +122,6 @@ function printReport(report: ImportReport, args: CliArgs): void { } if (args.dryRun) { lines.push(`(dry-run: no writes performed)`); - } else if (args.noCommit) { - lines.push(`(no-commit: files staged, no commit made)`); } else if (report.noChanges) { lines.push(`(no changes from parent commit — branch unchanged)`); } else if (report.commitHash) { diff --git a/apps/api/scripts/import-laddr/importer.ts b/apps/api/scripts/import-laddr/importer.ts index a38e1d2..ae56fdf 100644 --- a/apps/api/scripts/import-laddr/importer.ts +++ b/apps/api/scripts/import-laddr/importer.ts @@ -8,22 +8,22 @@ * Branch model: * - On first run, `legacy-import` is created from the `empty` branch (which * carries only `.gitsheets/` configs, no records). - * - On subsequent runs, the importer resets a working ref to the current - * `legacy-import` HEAD, removes every importer-owned directory, writes - * fresh files, and commits. - * - Records use `/.toml` paths (composite for memberships - * and tag-assignments) so re-runs overwrite stable filenames. The - * legacy-import branch is parallel history — the runtime spec's slug- - * based path templates apply once data is merged into `main`, which is - * an operator step outside this importer's scope. + * - On subsequent runs, the importer checks out the existing local branch, + * opens the gitsheets store, reads the previous snapshot's record UUIDs + * (to keep them stable across re-runs), then opens a single transaction + * that clears each importer-owned sheet and re-upserts the fresh data. + * - Files are written by gitsheets per each sheet's path template + * (e.g. `people/${{ slug }}.toml`) — the runtime API reads the same paths. + * The legacy-import branch's tree is shape-identical to fixture/main; the + * operator's merge into main is a plain `git merge`, no rename pass. * * Author identity on every commit: the pseudonymous Code for Philly API - * user (see plans/laddr-import-via-json.md). The agent's git config is - * never used. + * user. The agent's git config is never used (gitsheets honors `author` + * directly on `transact`). * * Side effects: - * - Writes/removes files in the data repo's working tree - * - Creates one commit on the local `legacy-import` branch + * - Checks out the target branch in the data repo's working tree + * - Creates one commit on the local `legacy-import` branch (via gitsheets) * - Does NOT push to origin (operator's call) * * Private-store side: out of scope for this importer. The JSON endpoints @@ -31,8 +31,7 @@ * newsletter prefs) will be imported separately on a future plan. */ import { execFile } from 'node:child_process'; -import { mkdir, readdir, rm, writeFile } from 'node:fs/promises'; -import { join, resolve } from 'node:path'; +import { resolve } from 'node:path'; import { promisify } from 'node:util'; const exec = promisify(execFile); @@ -56,6 +55,7 @@ import type { TagAssignment, } from '@cfp/shared/schemas'; +import { openPublicStore, type PublicStore } from '../../src/store/public.js'; import { fetchAllPages, RawPersonSchema, @@ -81,7 +81,6 @@ import { translateTagAssignment, translateUpdate, type ExistingIds, - type IdMaps, type TranslateCtx, type Warnings, } from './translators.js'; @@ -101,8 +100,6 @@ export interface ImportOptions { readonly initialParent?: string; /** If true, fetch + translate + report but do not write to the repo. */ readonly dryRun?: boolean; - /** If true, write files + stage but do not commit. */ - readonly noCommit?: boolean; /** Truncate each fetched resource to N rows (for dev loops). */ readonly limit?: number; /** Increase logging verbosity. */ @@ -132,25 +129,15 @@ export interface ImportReport { readonly branch: string; readonly counts: Record; readonly warnings: string[]; - /** Commit hash produced, or null in `--dry-run` / `--no-commit` / no-changes. */ + /** Commit hash produced, or null in `--dry-run` / no-changes. */ readonly commitHash: string | null; - /** True when the working tree after staging matches HEAD (so no commit was made). */ + /** True when the produced tree matches HEAD's (no commit was made). */ readonly noChanges: boolean; } const AUTHOR_NAME = 'Code for Philly API'; const AUTHOR_EMAIL = 'api@users.noreply.codeforphilly.org'; -const IMPORTER_OWNED_DIRS = [ - 'people', - 'projects', - 'tags', - 'project-memberships', - 'project-updates', - 'project-buzz', - 'tag-assignments', -] as const; - // --------------------------------------------------------------------------- // Entry point // --------------------------------------------------------------------------- @@ -180,16 +167,27 @@ export async function importLaddrFromJson(opts: ImportOptions): Promise( - '/tags', - RawTagSchema, - {}, - fetchOpts, - )) { + for await (const row of fetchAllPages('/tags', RawTagSchema, {}, fetchOpts)) { const translated = translateTag(row, ctx); if (translated === null) { counts.tags!.skipped++; @@ -229,7 +222,6 @@ export async function importLaddrFromJson(opts: ImportOptions): Promise = []; for await (const row of fetchAllPages( '/people', RawPersonSchema, @@ -262,11 +254,6 @@ export async function importLaddrFromJson(opts: ImportOptions): Promise` and create commit. - // ------------------------------------------------------------------------- - const repo = resolve(opts.dataRepo); - await ensureGitRepo(repo); - const parent = await ensureBranch(repo, branch, initialParent); - await checkoutBranch(repo, branch, parent); - await wipeOwnedDirectories(repo); - - const filesWritten = await writeAllRecords(repo, { - tags, - people, - projects, - memberships, - updates, - buzz, - tagAssignments, - tagAssignmentLegacyTuples, - idMaps, - warnings, - }); - - log(`[import] wrote ${filesWritten} files`); - - // ------------------------------------------------------------------------- - // 4. Stage and check for changes. + // 3. One atomic gitsheets transaction: + // - clear() each importer-owned sheet (deletes capture for free) + // - upsert() every translated record (path-template + schema validation) // ------------------------------------------------------------------------- - for (const dir of IMPORTER_OWNED_DIRS) { - await git(repo, 'add', '-A', '--', dir); - } + if (store === null) throw new Error('[import-laddr] internal: store not opened'); + + const message = buildCommitMessage({ runAt, sourceHost: opts.sourceHost, counts }); + + const result = await store.transact( + { + message, + author: { name: AUTHOR_NAME, email: AUTHOR_EMAIL }, + trailers: { + Action: 'import.laddr.json', + 'Source-Host': opts.sourceHost, + 'Run-At': runAt, + }, + }, + async (tx) => { + log(`[import] clear + upsert tags (${tags.length})`); + await tx.tags.clear(); + for (const t of tags) await tx.tags.upsert(t); + + log(`[import] clear + upsert people (${people.length})`); + await tx.people.clear(); + for (const p of people) await tx.people.upsert(p); + + log(`[import] clear + upsert projects (${projects.length})`); + await tx.projects.clear(); + for (const p of projects) await tx.projects.upsert(p); + + log(`[import] clear + upsert project-memberships (${memberships.length})`); + await tx['project-memberships'].clear(); + for (const m of memberships) { + const projectSlug = idMaps.projectSlugByLegacy.get(m.legacyIds.projectLegacyId); + const personSlug = idMaps.personSlugByLegacy.get(m.legacyIds.personLegacyId); + if (!projectSlug || !personSlug) { + warnings.push( + `[project-memberships] skipped: missing slug for project=${m.legacyIds.projectLegacyId} or person=${m.legacyIds.personLegacyId}`, + ); + continue; + } + await tx['project-memberships'].upsert({ + ...m.record, + projectSlug, + personSlug, + } as ProjectMembership); + } - if (opts.noCommit) { - return { - runAt, - sourceHost: opts.sourceHost, - branch, - counts, - warnings: warningsList, - commitHash: null, - noChanges: false, - }; - } + log(`[import] clear + upsert project-updates (${updates.length})`); + await tx['project-updates'].clear(); + for (const { record, projectLegacyId } of updates) { + const projectSlug = idMaps.projectSlugByLegacy.get(projectLegacyId); + if (!projectSlug) { + warnings.push( + `[project-updates] skipped: missing slug for project=${projectLegacyId}`, + ); + continue; + } + await tx['project-updates'].upsert({ ...record, projectSlug } as ProjectUpdate); + } - // Compare the tree we built to the parent's tree — when nothing changed - // upstream, we want to exit cleanly without creating an empty commit. - const { stdout: porcelain } = await git(repo, 'status', '--porcelain'); - if (porcelain.trim() === '') { - log('[import] no changes from parent commit — nothing to commit'); - return { - runAt, - sourceHost: opts.sourceHost, - branch, - counts, - warnings: warningsList, - commitHash: null, - noChanges: true, - }; - } + log(`[import] clear + upsert project-buzz (${buzz.length})`); + await tx['project-buzz'].clear(); + for (const { record, projectLegacyId } of buzz) { + const projectSlug = idMaps.projectSlugByLegacy.get(projectLegacyId); + if (!projectSlug) { + warnings.push( + `[project-buzz] skipped: missing slug for project=${projectLegacyId}`, + ); + continue; + } + await tx['project-buzz'].upsert({ ...record, projectSlug } as ProjectBuzz); + } - const commitHash = await createImportCommit(repo, { - branch, - runAt, - sourceHost: opts.sourceHost, - counts, - }); + log(`[import] clear + upsert tag-assignments (${tagAssignments.length})`); + await tx['tag-assignments'].clear(); + for (const ta of tagAssignments) await tx['tag-assignments'].upsert(ta); + }, + ); + // gitsheets v1.3.1+ compares the post-transact tree-hash to the parent's + // tree-hash and returns commitHash=null when they match (no-op snapshot). + // No workaround needed. return { runAt, sourceHost: opts.sourceHost, branch, counts, warnings: warningsList, - commitHash, - noChanges: false, + commitHash: result.commitHash, + noChanges: result.commitHash === null, }; } // --------------------------------------------------------------------------- -// Filesystem writers -// --------------------------------------------------------------------------- - -interface WriteBundle { - readonly tags: readonly Tag[]; - readonly people: readonly Person[]; - readonly projects: readonly Project[]; - readonly memberships: readonly { - record: ProjectMembership; - legacyIds: { projectLegacyId: number; personLegacyId: number }; - }[]; - readonly updates: readonly { - record: ProjectUpdate; - projectLegacyId: number; - }[]; - readonly buzz: readonly { - record: ProjectBuzz; - projectLegacyId: number; - }[]; - readonly tagAssignments: readonly TagAssignment[]; - readonly tagAssignmentLegacyTuples: readonly { - tagLegacyId: number; - taggableLegacyId: number; - taggableType: 'project' | 'person'; - }[]; - readonly idMaps: IdMaps; - readonly warnings: Warnings; -} - -async function writeAllRecords(repo: string, b: WriteBundle): Promise { - let count = 0; - - // people/.toml - for (const r of b.people) { - if (r.legacyId === undefined) continue; - await writeRecord(repo, ['people', `${r.legacyId}.toml`], r); - count++; - } - // projects/.toml - for (const r of b.projects) { - if (r.legacyId === undefined) continue; - await writeRecord(repo, ['projects', `${r.legacyId}.toml`], r); - count++; - } - // tags/.toml - for (const r of b.tags) { - if (r.legacyId === undefined) continue; - await writeRecord(repo, ['tags', `${r.legacyId}.toml`], r); - count++; - } - // project-memberships/-.toml - for (const { record, legacyIds } of b.memberships) { - await writeRecord( - repo, - ['project-memberships', `${legacyIds.projectLegacyId}-${legacyIds.personLegacyId}.toml`], - record, - ); - count++; - } - // project-updates/.toml - for (const { record } of b.updates) { - if (record.legacyId === undefined) continue; - await writeRecord(repo, ['project-updates', `${record.legacyId}.toml`], record); - count++; - } - // project-buzz/.toml - for (const { record } of b.buzz) { - if (record.legacyId === undefined) continue; - await writeRecord(repo, ['project-buzz', `${record.legacyId}.toml`], record); - count++; - } - // tag-assignments/--.toml - for (let i = 0; i < b.tagAssignments.length; i++) { - const record = b.tagAssignments[i]!; - const legacy = b.tagAssignmentLegacyTuples[i]!; - await writeRecord( - repo, - [ - 'tag-assignments', - `${legacy.tagLegacyId}-${legacy.taggableType}-${legacy.taggableLegacyId}.toml`, - ], - record, - ); - count++; - } - - return count; -} - -async function writeRecord( - repo: string, - pathParts: readonly string[], - record: Record, -): Promise { - const full = join(repo, ...pathParts); - await mkdir(join(full, '..'), { recursive: true }); - await writeFile(full, toToml(record), 'utf8'); -} - -// --------------------------------------------------------------------------- -// TOML serialization (flat records; same shape as scripts/scrub-data.ts). -// Records are written with keys in a stable alphabetical order so consecutive -// snapshots produce stable diffs even if the in-memory object key order -// drifts. +// Helpers // --------------------------------------------------------------------------- -export function toToml(record: Record): string { - const keys = Object.keys(record).sort(); - const lines: string[] = []; - for (const key of keys) { - const value = record[key]; - if (value === null || value === undefined) continue; - if (typeof value === 'string') { - if (value.includes('\n')) { - // Use TOML's basic-multiline-string form; escape the rare embedded - // triple-quote sequence and any backslashes. - const escaped = value.replace(/\\/g, '\\\\').replace(/"""/g, '\\"""'); - lines.push(`${key} = """\n${escaped}\n"""`); - } else { - const escaped = value.replace(/\\/g, '\\\\').replace(/"/g, '\\"'); - lines.push(`${key} = "${escaped}"`); - } - } else if (typeof value === 'number') { - lines.push(`${key} = ${value}`); - } else if (typeof value === 'boolean') { - lines.push(`${key} = ${value}`); - } - // Arrays/objects intentionally not handled — all current v1 record fields - // are scalar at the top level. - } - return `${lines.join('\n')}\n`; -} - -// --------------------------------------------------------------------------- -// Git helpers -// --------------------------------------------------------------------------- - -function git( - cwd: string, - ...args: string[] -): Promise<{ stdout: string; stderr: string }> { - return exec('git', args, { cwd, maxBuffer: 256 * 1024 * 1024 }); -} - -async function ensureGitRepo(repo: string): Promise { - try { - await git(repo, 'rev-parse', '--git-dir'); - } catch (err) { - throw new Error( - `[import-laddr] ${repo} is not a git working directory: ${describe(err)}`, - { cause: err }, - ); - } -} - -/** - * Make sure the named branch exists locally. Returns the parent commit hash - * we should use as the snapshot's parent — current branch tip if it exists, - * else `initialParent`'s commit hash. - */ -async function ensureBranch( - repo: string, - branch: string, - initialParent: string, -): Promise { - try { - const { stdout } = await git(repo, 'rev-parse', '--verify', `refs/heads/${branch}`); - return stdout.trim(); - } catch { - // No local branch. Try `origin/` first; fall back to initialParent. - try { - const { stdout } = await git(repo, 'rev-parse', '--verify', `refs/remotes/origin/${branch}`); - return stdout.trim(); - } catch { - // ignore — fall through - } - const { stdout } = await git(repo, 'rev-parse', '--verify', initialParent); - return stdout.trim(); - } -} - -async function checkoutBranch( - repo: string, - branch: string, - parent: string, -): Promise { - // Force-reset working tree to the desired parent under the named branch. - await git(repo, 'checkout', '-B', branch, parent); -} - -async function wipeOwnedDirectories(repo: string): Promise { - for (const dir of IMPORTER_OWNED_DIRS) { - const full = join(repo, dir); - // `git rm -rf -- ` removes both the index entries and the working - // tree files in one shot. The first run on a fresh branch has nothing - // to remove, so swallow ENOENT-style failures. - try { - await git(repo, 'rm', '-rf', '--ignore-unmatch', '--', dir); - } catch { - // ignore — directory not present - } - // Defensively remove any leftover working-tree files (covers untracked - // detritus from a previous --no-commit run). - await rm(full, { recursive: true, force: true }); - } -} - interface CommitParams { - readonly branch: string; readonly runAt: string; readonly sourceHost: string; readonly counts: Record; } -async function createImportCommit( - repo: string, - p: CommitParams, -): Promise { - const env = { - ...process.env, - GIT_AUTHOR_NAME: AUTHOR_NAME, - GIT_AUTHOR_EMAIL: AUTHOR_EMAIL, - GIT_COMMITTER_NAME: AUTHOR_NAME, - GIT_COMMITTER_EMAIL: AUTHOR_EMAIL, - GIT_AUTHOR_DATE: p.runAt, - GIT_COMMITTER_DATE: p.runAt, - }; - - const message = buildCommitMessage(p); - const messageFile = join(repo, '.git', 'IMPORT_LADDR_MSG'); - await writeFile(messageFile, message, 'utf8'); - - // Use `--quiet` to keep `git commit`'s stdout small (the create-mode list - // for a 40k-file snapshot otherwise exceeds the default execFile buffer). - await exec('git', ['commit', '--quiet', '-F', messageFile], { - cwd: repo, - env, - maxBuffer: 256 * 1024 * 1024, - }); - - const { stdout: shaRaw } = await git(repo, 'rev-parse', 'HEAD'); - return shaRaw.trim(); -} - function buildCommitMessage(p: CommitParams): string { const c = p.counts; const subject = `import: snapshot from ${p.sourceHost} (${p.runAt})`; @@ -753,13 +514,9 @@ function buildCommitMessage(p: CommitParams): string { `${c['tag-assignments']!.imported} tag-assignments`, ].join(', '); - return `${subject}\n\n${summary}.\n\nAction: import.laddr.json\nSource-Host: ${p.sourceHost}\nRun-At: ${p.runAt}\n`; + return `${subject}\n\n${summary}.\n`; } -// --------------------------------------------------------------------------- -// Misc helpers -// --------------------------------------------------------------------------- - function blank(): EntityCounts { return { imported: 0, skipped: 0, errors: 0 }; } @@ -784,206 +541,141 @@ function describe(err: unknown): string { return String(err); } +// --------------------------------------------------------------------------- +// Git plumbing (branch checkout only — gitsheets handles commit creation) +// --------------------------------------------------------------------------- + +async function ensureGitRepo(repo: string): Promise { + try { + await exec('git', ['rev-parse', '--git-dir'], { cwd: repo }); + } catch (err) { + throw new Error( + `[import-laddr] ${repo} is not a git working directory: ${describe(err)}`, + { cause: err }, + ); + } +} + /** - * Read each importer-owned `.toml` file from the latest snapshot tip and - * extract the record's `id` field. Used to keep UUIDs stable across re-runs - * so an unchanged source produces an unchanged tree (idempotence). - * - * Reads from `refs/heads/` if it exists, then `refs/remotes/origin/ - * `, then the configured fallback. Returns an empty map if no parent - * exists yet (first run). - * - * Implementation note: `git cat-file --batch` is used to stream blob contents - * in a single subprocess rather than fork+exec per-file. Snapshots can have - * 40k+ files; per-file `git show` calls take many minutes. + * Check out the target branch in the working tree. On first run, create it + * from `origin/` if available, falling back to `initialParent` + * (typically `origin/empty`). */ -async function collectExistingIds( +async function ensureBranchCheckedOut( repo: string, branch: string, initialParent: string, -): Promise { - const ids = newExistingIds(); - let ref: string | null = null; - for (const candidate of [ - `refs/heads/${branch}`, - `refs/remotes/origin/${branch}`, - initialParent, - ]) { - try { - await git(repo, 'rev-parse', '--verify', candidate); - ref = candidate; - break; - } catch { - // try next - } +): Promise { + // Existing local branch: just switch. + try { + await exec('git', ['rev-parse', '--verify', `refs/heads/${branch}`], { cwd: repo }); + await exec('git', ['checkout', branch], { cwd: repo }); + return; + } catch { + // No local branch — fall through. } - if (ref === null) return ids; - - // `ls-tree -r` gives us mode + sha + filename for every file under the - // commit's tree. We need both blob sha (for cat-file --batch lookup) and - // path (so we know which sheet the record belongs to). - let listing: string; + // No local branch yet. Try origin/, fall back to initialParent. + let parent: string; try { - const { stdout } = await git(repo, 'ls-tree', '-r', ref); - listing = stdout; + await exec('git', ['rev-parse', '--verify', `refs/remotes/origin/${branch}`], { + cwd: repo, + }); + parent = `origin/${branch}`; } catch { - return ids; + parent = initialParent; } + await exec('git', ['checkout', '-b', branch, parent], { cwd: repo }); +} - interface Entry { - readonly sha: string; - readonly path: string; - } - const entries: Entry[] = []; - for (const line of listing.split('\n')) { - // Format: ` \t` - const tabIdx = line.indexOf('\t'); - if (tabIdx === -1) continue; - const meta = line.slice(0, tabIdx).split(/\s+/); - const path = line.slice(tabIdx + 1); - if (meta.length < 3) continue; - if (!path.endsWith('.toml')) continue; - let owned = false; - for (const dir of IMPORTER_OWNED_DIRS) { - if (path.startsWith(`${dir}/`)) { - owned = true; - break; +// --------------------------------------------------------------------------- +// Pre-pass: read existing UUIDs from the current snapshot so re-runs preserve +// each record's `id` field. Keys mirror the translator's `idFor(ctx, key)` +// calls. +// +// Simple sheets (people, projects, tags, project-updates, project-buzz) +// carry `legacyId` directly on the record, so we read them in one pass. +// Composite-path sheets (project-memberships, tag-assignments) don't store +// legacy IDs on the record; we recover them via uuid→legacyId reverse maps +// built from the simple-sheet pass. +// --------------------------------------------------------------------------- + +async function collectExistingIds( + store: PublicStore, + log: (msg: string) => void, +): Promise { + const ids = newExistingIds(); + let count = 0; + + // Pass 1: simple sheets. Also build uuid → legacyId reverse maps used by + // the composite-sheet pass below. + const personLegacyByUuid = new Map(); + const projectLegacyByUuid = new Map(); + const tagLegacyByUuid = new Map(); + + const simpleSheets = ['people', 'projects', 'tags', 'project-updates', 'project-buzz'] as const; + for (const sheetName of simpleSheets) { + const sheet = store[sheetName] as { query: () => AsyncIterable> }; + for await (const record of sheet.query()) { + const legacyId = record['legacyId']; + const id = record['id']; + if (typeof legacyId === 'number' && typeof id === 'string') { + ids.byFile.set(`${sheetName}/${legacyId}`, id); + count++; + if (sheetName === 'people') personLegacyByUuid.set(id, legacyId); + else if (sheetName === 'projects') projectLegacyByUuid.set(id, legacyId); + else if (sheetName === 'tags') tagLegacyByUuid.set(id, legacyId); } } - if (!owned) continue; - entries.push({ sha: meta[2]!, path }); } - if (entries.length === 0) return ids; - - // Spawn `git cat-file --batch` once; feed it newline-separated SHAs on stdin, - // parse the streamed ` blob \n\n` responses. - const blobs = await batchCatFile(repo, entries.map((e) => e.sha)); - for (let i = 0; i < entries.length; i++) { - const content = blobs[i] ?? ''; - const id = extractTomlString(content, 'id'); - if (id) { - const key = entries[i]!.path.replace(/\.toml$/, ''); - ids.byFile.set(key, id); + // Pass 2: composite-path sheets. Look up legacy IDs for the referenced + // entities; skip records that point at uuids we couldn't resolve. + const membershipsSheet = store['project-memberships'] as { + query: () => AsyncIterable>; + }; + for await (const record of membershipsSheet.query()) { + const projectId = record['projectId']; + const personId = record['personId']; + const id = record['id']; + if (typeof id !== 'string' || typeof projectId !== 'string' || typeof personId !== 'string') { + continue; } + const projectLegacyId = projectLegacyByUuid.get(projectId); + const personLegacyId = personLegacyByUuid.get(personId); + if (projectLegacyId === undefined || personLegacyId === undefined) continue; + ids.byFile.set(`project-memberships/${projectLegacyId}-${personLegacyId}`, id); + count++; } - return ids; -} - -/** - * Stream blob contents via a single `git cat-file --batch` invocation. Each - * input SHA produces one entry in the returned array, in the same order. - * - * The protocol: emit one SHA per line on stdin; for each, git emits a header - * line ` \n` followed by `` bytes of content and a - * trailing `\n`. On `missing` (unknown SHA), git emits ` missing\n` and - * no content. We treat missing as empty. - */ -async function batchCatFile(repo: string, shas: readonly string[]): Promise { - if (shas.length === 0) return []; - const { spawn } = await import('node:child_process'); - return await new Promise((resolve, reject) => { - const child = spawn('git', ['cat-file', '--batch'], { - cwd: repo, - stdio: ['pipe', 'pipe', 'pipe'], - }); - - const results: string[] = []; - let stderrAcc = ''; - let buf = Buffer.alloc(0); - let mode: 'header' | 'content' = 'header'; - let expected = 0; - - child.stderr.setEncoding('utf8'); - child.stderr.on('data', (chunk: string) => { - stderrAcc += chunk; - }); - - child.stdout.on('data', (chunk: Buffer) => { - buf = Buffer.concat([buf, chunk]); - while (true) { - if (mode === 'header') { - const nl = buf.indexOf(0x0a); - if (nl === -1) return; - const header = buf.slice(0, nl).toString('utf8'); - buf = buf.slice(nl + 1); - // header is ` ` or ` missing` - const parts = header.split(' '); - if (parts.length === 3 && parts[1] !== 'missing') { - expected = parseInt(parts[2]!, 10); - mode = 'content'; - } else { - // missing — no content body - results.push(''); - if (results.length === shas.length) { - try { - child.stdin.end(); - } catch { - // ignore - } - } - } - } else { - // content mode: wait for `expected` bytes + the trailing newline - if (buf.length < expected + 1) return; - const content = buf.slice(0, expected).toString('utf8'); - buf = buf.slice(expected + 1); // skip trailing newline - results.push(content); - mode = 'header'; - if (results.length === shas.length) { - try { - child.stdin.end(); - } catch { - // ignore - } - } - } - } - }); - - child.on('close', (code) => { - if (code !== 0 && results.length !== shas.length) { - reject(new Error(`git cat-file --batch exited ${code}: ${stderrAcc}`)); - } else { - resolve(results); - } - }); - child.on('error', reject); - - // Feed SHAs as a single write — git's batch mode reads to EOL. - const payload = shas.join('\n') + '\n'; - child.stdin.write(payload); - // Don't end stdin yet — close it when all entries have been read so the - // batch process drains cleanly. (Closing early on a slow consumer would - // truncate output.) - }); -} -function extractTomlString(content: string, key: string): string | null { - const re = new RegExp(`^${key}\\s*=\\s*"(.*)"$`, 'm'); - const m = content.match(re); - if (m === null) return null; - // Reverse the simple TOML escapes used by our writer. - return (m[1] ?? '').replace(/\\"/g, '"').replace(/\\\\/g, '\\'); -} - -// Exposed for direct invocation in tests that walk the tree. -export { IMPORTER_OWNED_DIRS }; - -// Used by tests that want to introspect the unused-but-imported readdir helper. -export async function listOwnedToml(repo: string): Promise { - const out: string[] = []; - for (const dir of IMPORTER_OWNED_DIRS) { - const full = join(repo, dir); - try { - for (const entry of await readdir(full, { withFileTypes: true })) { - if (entry.isFile() && entry.name.endsWith('.toml')) { - out.push(`${dir}/${entry.name}`); - } - } - } catch { - // dir not present + const tagAssignmentsSheet = store['tag-assignments'] as { + query: () => AsyncIterable>; + }; + for await (const record of tagAssignmentsSheet.query()) { + const tagId = record['tagId']; + const taggableId = record['taggableId']; + const taggableType = record['taggableType']; + const id = record['id']; + if ( + typeof id !== 'string' || + typeof tagId !== 'string' || + typeof taggableId !== 'string' || + (taggableType !== 'project' && taggableType !== 'person') + ) { + continue; } + const tagLegacyId = tagLegacyByUuid.get(tagId); + const taggableLegacyId = + taggableType === 'project' + ? projectLegacyByUuid.get(taggableId) + : personLegacyByUuid.get(taggableId); + if (tagLegacyId === undefined || taggableLegacyId === undefined) continue; + ids.byFile.set( + `tag-assignments/${tagLegacyId}-${taggableType}-${taggableLegacyId}`, + id, + ); + count++; } - return out; + + log(`[import] pre-pass: preserved ${count} record UUIDs from previous snapshot`); + return ids; } diff --git a/apps/api/tests/import-laddr.test.ts b/apps/api/tests/import-laddr.test.ts index d61b81b..6af273f 100644 --- a/apps/api/tests/import-laddr.test.ts +++ b/apps/api/tests/import-laddr.test.ts @@ -7,7 +7,7 @@ * dev (see plans/laddr-import-via-json.md). */ import { execFile } from 'node:child_process'; -import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises'; +import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; import { promisify } from 'node:util'; @@ -369,12 +369,39 @@ async function makeRepo(): Promise<{ path: string; cleanup: () => Promise await run('config', 'user.email', 'test@cfp.test'); await run('config', 'user.name', 'test'); await run('config', 'commit.gpgsign', 'false'); - // Create an "empty" branch with a .gitsheets seed similar to upstream + // Create an "empty" branch with the full .gitsheets config matrix. The + // importer opens the gitsheets store at boot, which requires every sheet + // declared on the validator map (PublicValidators) to have a corresponding + // .gitsheets/.toml. Mirrors the upstream codeforphilly-data repo. await mkdir(join(dir, '.gitsheets'), { recursive: true }); - await writeFile( - join(dir, '.gitsheets', 'people.toml'), - "[gitsheet]\nroot = 'people'\npath = '${{ slug }}'\n", - ); + const sheets: Array<[string, string]> = [ + ['people', "root = 'people'\npath = '${{ slug }}'\n"], + ['projects', "root = 'projects'\npath = '${{ slug }}'\n"], + ['tags', "root = 'tags'\npath = '${{ namespace }}/${{ slug }}'\n"], + [ + 'project-memberships', + "root = 'project-memberships'\npath = '${{ projectSlug }}/${{ personSlug }}'\n", + ], + [ + 'project-updates', + "root = 'project-updates'\npath = '${{ projectSlug }}/${{ number }}'\n", + ], + ['project-buzz', "root = 'project-buzz'\npath = '${{ projectSlug }}/${{ slug }}'\n"], + [ + 'tag-assignments', + "root = 'tag-assignments'\npath = '${{ taggableType }}/${{ taggableId }}/${{ tagId }}'\n", + ], + ['help-wanted-roles', "root = 'help-wanted-roles'\npath = '${{ projectSlug }}/${{ slug }}'\n"], + [ + 'help-wanted-interest', + "root = 'help-wanted-interest'\npath = '${{ roleId }}/${{ personId }}'\n", + ], + ['slug-history', "root = 'slug-history'\npath = '${{ entityType }}/${{ slug }}'\n"], + ['revocations', "root = 'revocations'\npath = '${{ jti }}'\n"], + ]; + for (const [name, body] of sheets) { + await writeFile(join(dir, '.gitsheets', `${name}.toml`), `[gitsheet]\n${body}`); + } await run('add', '.gitsheets'); await run('commit', '-m', 'initial empty branch'); await run('branch', '-M', 'empty'); // rename initial branch @@ -551,39 +578,48 @@ describe('importLaddrFromJson — orchestrator', () => { const tree = await exec('git', ['ls-tree', '-r', '--name-only', 'HEAD'], { cwd: repo }); const paths = tree.stdout.split('\n').filter(Boolean); - // people/.toml — keyed by legacyId, not slug - expect(paths).toContain('people/10.toml'); - expect(paths).toContain('people/20.toml'); - // projects/.toml - expect(paths).toContain('projects/100.toml'); - // tags/.toml - expect(paths).toContain('tags/1.toml'); - expect(paths).toContain('tags/2.toml'); - // composite memberships - expect(paths).toContain('project-memberships/100-10.toml'); - expect(paths).toContain('project-memberships/100-20.toml'); - // composite tag-assignments - expect(paths).toContain('tag-assignments/1-project-100.toml'); - expect(paths).toContain('tag-assignments/2-person-10.toml'); - // updates + buzz by legacyId - expect(paths).toContain('project-updates/500.toml'); - expect(paths).toContain('project-buzz/800.toml'); + // people/.toml — gitsheets path templates resolve `${{ slug }}` + expect(paths).toContain('people/alice.toml'); + expect(paths).toContain('people/bob.toml'); + // projects/.toml + expect(paths).toContain('projects/transit-app.toml'); + // tags//.toml — split from handle `topic.transit` + expect(paths).toContain('tags/topic/transit.toml'); + expect(paths).toContain('tags/tech/javascript.toml'); + // project-memberships//.toml + expect(paths).toContain('project-memberships/transit-app/alice.toml'); + expect(paths).toContain('project-memberships/transit-app/bob.toml'); + // tag-assignments: `taggableType/taggableId/tagId.toml`, UUID-keyed. + // Don't pin exact UUIDs — check shape + that both taggable types appear. + const tagAssignmentPaths = paths.filter((p) => p.startsWith('tag-assignments/')); + expect(tagAssignmentPaths.length).toBeGreaterThanOrEqual(2); + expect(tagAssignmentPaths.some((p) => p.startsWith('tag-assignments/project/'))).toBe(true); + expect(tagAssignmentPaths.some((p) => p.startsWith('tag-assignments/person/'))).toBe(true); + // project-updates//.toml + expect(paths).toContain('project-updates/transit-app/1.toml'); + // project-buzz//.toml — buzz slug derived from Handle + expect(paths).toContain('project-buzz/transit-app/transit-app-on-tv.toml'); + + // gitsheets writes commits directly to the git object DB without + // touching the working tree. Read blob contents via `git show HEAD:`. + const showBlob = async (path: string): Promise => + (await exec('git', ['show', `HEAD:${path}`], { cwd: repo })).stdout; // Stage lowercased - const projToml = await readFile(join(repo, 'projects/100.toml'), 'utf8'); + const projToml = await showBlob('projects/transit-app.toml'); expect(projToml).toContain('stage = "prototyping"'); expect(projToml).toContain('legacyId = 100'); // chatChannel preserved expect(projToml).toContain('chatChannel = "transit-app"'); // Person.slackSamlNameId == slug - const aliceToml = await readFile(join(repo, 'people/10.toml'), 'utf8'); + const aliceToml = await showBlob('people/alice.toml'); expect(aliceToml).toContain('slackSamlNameId = "alice"'); expect(aliceToml).toContain('slug = "alice"'); // No PII (email-shaped patterns / bcrypt hashes) in any committed file for (const path of paths.filter((p) => p.endsWith('.toml'))) { - const content = await readFile(join(repo, path), 'utf8'); + const content = await showBlob(path); expect(content, `email-like in ${path}`).not.toMatch(/[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}/); expect(content, `bcrypt-like in ${path}`).not.toMatch(/\$2[ayb]\$/); } @@ -698,10 +734,12 @@ describe('importLaddrFromJson — orchestrator', () => { expect(second.noChanges).toBe(false); // The diff between the two commits should touch exactly one file: - // projects/100.toml. + // the transit-app project itself (path is slug-keyed). All other + // records (memberships, tag-assignments, etc.) preserve their UUIDs + // and content across re-runs. const diff = await exec('git', ['diff', '--name-only', `${first.commitHash}..${second.commitHash}`], { cwd: repo }); const changed = diff.stdout.split('\n').filter(Boolean); - expect(changed).toEqual(['projects/100.toml']); + expect(changed).toEqual(['projects/transit-app.toml']); } finally { await cleanup(); } diff --git a/deploy/kustomize/base/configmap.yaml b/deploy/kustomize/base/configmap.yaml index 2558798..5ebdac9 100644 --- a/deploy/kustomize/base/configmap.yaml +++ b/deploy/kustomize/base/configmap.yaml @@ -9,6 +9,11 @@ data: GIT_AUTHOR_EMAIL: "api@codeforphilly.org" GIT_AUTHOR_NAME: "CodeForPhilly API" NODE_ENV: "production" + # Bump Node's old-space heap ceiling above the default (~400Mi on Linux + # containers) so the in-memory record set + secondary indices + FTS fit. + # Keep ~25% headroom below the container's memory limit for ephemeral + # work (request handling, gitsheets tree mutations, etc.). + NODE_OPTIONS: "--max-old-space-size=1536" PORT: "3001" STORAGE_BACKEND: "filesystem" CFP_DATA_BRANCH: "fixture" diff --git a/deploy/kustomize/base/deployment.yaml b/deploy/kustomize/base/deployment.yaml index 2e01faf..1b6b271 100644 --- a/deploy/kustomize/base/deployment.yaml +++ b/deploy/kustomize/base/deployment.yaml @@ -64,10 +64,12 @@ spec: resources: requests: cpu: 100m - memory: 384Mi + memory: 768Mi limits: cpu: 1000m - memory: 768Mi + # Holds the full public dataset + secondary indices + FTS in + # memory (~31k people, 268 projects, 10k tag-assignments, …) + memory: 2Gi securityContext: runAsNonRoot: true runAsUser: 1000 diff --git a/package-lock.json b/package-lock.json index 286c924..4ac914c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -41,7 +41,7 @@ "bcryptjs": "^3.0.3", "better-sqlite3": "^12.10.0", "fastify": "^5.8.5", - "gitsheets": "^1.2.0", + "gitsheets": "^1.3.1", "jose": "^6.2.3", "samlify": "^2.13.0", "uuidv7": "^1.2.1", @@ -8409,9 +8409,9 @@ } }, "node_modules/git-client": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/git-client/-/git-client-1.11.1.tgz", - "integrity": "sha512-N3tsTCjaELPf3HiAI+Fg4ua74hRl6AT1v6EICbhvykg0pnAXfWqdjGXOY5TgA99r555N2SOtqIgKuwluB9F5kQ==", + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/git-client/-/git-client-1.12.1.tgz", + "integrity": "sha512-Px7HE2ug+IKiOhNE/vezkwpDE7IUYHnoRHQfds2BlRf6CZglNq7lmlxVoRtprJ4ZHTq0ZWrQLZtrcMxFw18zvw==", "license": "MIT", "dependencies": { "async-exit-hook": "^2.0.1", @@ -8427,9 +8427,9 @@ "license": "MIT" }, "node_modules/gitsheets": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/gitsheets/-/gitsheets-1.2.0.tgz", - "integrity": "sha512-IqYitv/+SMf8va5+m7KvotZWlTwgD9BIpBEJhIuTySnY0g1hQoSUTEg67w3+NdrqMFzXeWnmw+3VPCgs8I2NXQ==", + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/gitsheets/-/gitsheets-1.3.1.tgz", + "integrity": "sha512-5s+ewIZu76nytdtTQKjdVWL4u5Ci+VYBT609I6JPDem+Xm9V2GpeV5BGUfJ7QWYdhqytYAaUyeipj/3pjHmUwQ==", "license": "Apache-2.0", "dependencies": { "@iarna/toml": "^2.2.5", @@ -8437,7 +8437,7 @@ "ajv-formats": "^3.0.1", "csv-parse": "^6.2.1", "csv-stringify": "^6.7.0", - "hologit": "^0.49.1", + "hologit": "^0.50.2", "markdownlint": "^0.40.0", "rfc6902": "^5.2.0", "sort-keys": "^6.0.0", @@ -8845,9 +8845,9 @@ } }, "node_modules/hologit": { - "version": "0.49.1", - "resolved": "https://registry.npmjs.org/hologit/-/hologit-0.49.1.tgz", - "integrity": "sha512-W9gUMB4cKdYoOniE6MRxEc1k426O22khyuJ7atraj85IEFiJQEAs4TWxwKbcdVDJ09XwQt9sMrOm05KMmP1KBA==", + "version": "0.50.2", + "resolved": "https://registry.npmjs.org/hologit/-/hologit-0.50.2.tgz", + "integrity": "sha512-9StuAaE8TkdnJE/cA5vdq+pwt/hZH1JGa9qsGfAz7rvwsXrVlQT/lBJd+pedpJHOJgPdA4NXC0lqljFzOWV+7A==", "license": "MIT", "os": [ "darwin", @@ -8856,13 +8856,13 @@ "dependencies": { "@iarna/toml": "^2.2.5", "async-exit-hook": "^2.0.1", - "axios": "^1.13.6", + "axios": "^1.14.0", "chokidar": "^5.0.0", "debounce": "^3.0.0", "fb-watchman": "^2.0.2", - "git-client": "^1.11.0", + "git-client": "^1.12.0", "hab-client": "^1.1.3", - "handlebars": "^4.7.8", + "handlebars": "^4.7.9", "minimatch": "^10.2.4", "mz": "^2.7.0", "mz-modules": "^2.1.0",