From 7cd84efdc230877fc3e19ccf32786b293cc76554 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hermann?= Date: Thu, 2 Apr 2026 15:42:00 +0200 Subject: [PATCH] Implement RFC 0015 filemeta path normalization --- docs/spec.md | 3 +- internal/core/models.go | 2 +- internal/engine/backup_scan.go | 28 +++----- internal/engine/backup_test.go | 19 +++-- internal/engine/backup_upload.go | 3 +- internal/engine/diff.go | 44 ++++++++++-- internal/engine/diff_test.go | 48 +++++++++++++ internal/engine/filemeta_paths.go | 64 +++++++++++++++++ internal/engine/restore.go | 25 ++----- internal/engine/restore_test.go | 89 ++++++++++++++++++++++++ rfcs/0015-filemeta-path-normalization.md | 13 ++-- 11 files changed, 274 insertions(+), 64 deletions(-) create mode 100644 internal/engine/filemeta_paths.go diff --git a/docs/spec.md b/docs/spec.md index 3acbf9b..fef216c 100644 --- a/docs/spec.md +++ b/docs/spec.md @@ -154,7 +154,6 @@ All objects are stored under a flat key namespace of the form `/`. "name": "invoice.pdf", "type": "file", "parents": ["filemeta/"], - "paths": [], "content_hash": "", "content_ref": "", "size": 21733, @@ -177,7 +176,7 @@ All objects are stored under a flat key namespace of the form `/`. | `parents` | List of `filemeta/` refs pointing to parent metadata objects | | `content_hash` | SHA-256 of the raw file content | | `content_ref` | Opaque content reference used as `content/` key; HMAC of `content_hash` for encrypted repos, plain `content_hash` for unencrypted repos | -| `paths` | Reserved for future use (multi-path support) | +| `paths` | Optional legacy compatibility field; new snapshots typically omit it and derive display paths from `parents` + `name` | | `extra` | Source-specific metadata (e.g. MIME type) | | `mode` | POSIX file mode bits (e.g. `0755` = `493`). Omitted if zero. | | `uid` | Numeric owner user ID. Omitted if zero. | diff --git a/internal/core/models.go b/internal/core/models.go index 9368290..42b7f1c 100644 --- a/internal/core/models.go +++ b/internal/core/models.go @@ -34,7 +34,7 @@ type FileMeta struct { Name string `json:"name"` Type FileType `json:"type"` // "file" or "folder" Parents []string `json:"parents"` // List of "filemeta/" refs (NOT raw IDs) - Paths []string `json:"paths"` + Paths []string `json:"paths,omitempty"` ContentHash string `json:"content_hash"` // SHA256 of the file content ContentRef string `json:"content_ref,omitempty"` // HMAC(dedupKey, ContentHash) for secure backend lookup Size int64 `json:"size"` diff --git a/internal/engine/backup_scan.go b/internal/engine/backup_scan.go index a07f580..08535fc 100644 --- a/internal/engine/backup_scan.go +++ b/internal/engine/backup_scan.go @@ -210,7 +210,8 @@ func (bm *BackupManager) detectChange(ctx context.Context, oldRoot string, meta meta.ContentRef = oldMeta.ContentRef } - newRef, _, err := meta.Ref() + newPersisted := persistedFileMeta(*meta) + newRef, _, err := newPersisted.Ref() if err != nil { return false, "", err } @@ -273,7 +274,8 @@ func (bm *BackupManager) insertFolder(_ context.Context, root string, meta *core meta.ContentHash = "" meta.Size = 0 - metaRef, metaData, err := meta.Ref() + persisted := persistedFileMeta(*meta) + metaRef, metaData, err := persisted.Ref() if err != nil { return "", err } @@ -353,25 +355,13 @@ func (bm *BackupManager) recordRemoved(ft core.FileType) { // the parent chain in the HAMT tree. This is used for incremental/changes // sources that can't build a path map (the parent may not be in the change set). func (bm *BackupManager) buildPathFromTree(ctx context.Context, root string, meta *core.FileMeta) string { - const maxDepth = 50 - parts := []string{meta.Name} - curParents := meta.Parents - for i := 0; i < maxDepth && len(curParents) > 0; i++ { - parent := bm.lookupMetaByFileID(ctx, root, curParents[0]) + return fileMetaPath(*meta, func(parentID string) (core.FileMeta, bool) { + parent := bm.lookupMetaByFileID(ctx, root, parentID) if parent == nil { - break + return core.FileMeta{}, false } - // Short-circuit: if parent already has a resolved path, prepend it. - if len(parent.Paths) > 0 { - return parent.Paths[0] + "/" + strings.Join(parts, "/") - } - parts = append(parts, parent.Name) - curParents = parent.Parents - } - for i, j := 0, len(parts)-1; i < j; i, j = i+1, j-1 { - parts[i], parts[j] = parts[j], parts[i] - } - return strings.Join(parts, "/") + return *parent, true + }) } // lookupMetaByFileID resolves a FileID to its FileMeta via the HAMT tree. diff --git a/internal/engine/backup_test.go b/internal/engine/backup_test.go index 983b449..b8824a6 100644 --- a/internal/engine/backup_test.go +++ b/internal/engine/backup_test.go @@ -13,7 +13,8 @@ import ( // TestBackupManager_ResolvesPathsForOpaqueIDs verifies that when a source // emits FileMeta without Paths (e.g. incremental/changes sources with opaque -// cloud IDs), the backup engine resolves Paths by walking the HAMT parent chain. +// cloud IDs), the backup engine resolves scan-time paths by walking the HAMT +// parent chain, but does not persist those paths in stored FileMeta objects. func TestBackupManager_ResolvesPathsForOpaqueIDs(t *testing.T) { ctx := context.Background() src := NewMockSource() @@ -56,11 +57,11 @@ func TestBackupManager_ResolvesPathsForOpaqueIDs(t *testing.T) { t.Fatalf("Backup failed: %v", err) } - // Read back the stored FileMeta and verify Paths were resolved. + // Read back the stored FileMeta and verify Paths were not persisted. readStore := store.NewCompressedStore(dest) tree := hamt.NewTree(readStore) - checkPath := func(parentID, fileID, expectedPath string) { + checkNoStoredPath := func(parentID, fileID string) { t.Helper() ref, err := tree.Lookup(result.Root, parentID, fileID) if err != nil || ref == "" { @@ -74,16 +75,14 @@ func TestBackupManager_ResolvesPathsForOpaqueIDs(t *testing.T) { if err := json.Unmarshal(data, &fm); err != nil { t.Fatalf("Unmarshal: %v", err) } - if len(fm.Paths) == 0 { - t.Errorf("%s: Paths is empty, expected %q", fileID, expectedPath) - } else if fm.Paths[0] != expectedPath { - t.Errorf("%s: Paths[0]=%q, expected %q", fileID, fm.Paths[0], expectedPath) + if len(fm.Paths) != 0 { + t.Errorf("%s: persisted Paths=%v, expected omitted paths", fileID, fm.Paths) } } - checkPath("", "FOLDER_A", "Documents") - checkPath("FOLDER_A", "FOLDER_B", "Documents/Photos") - checkPath("FOLDER_B", "FILE_C", "Documents/Photos/pic.jpg") + checkNoStoredPath("", "FOLDER_A") + checkNoStoredPath("FOLDER_A", "FOLDER_B") + checkNoStoredPath("FOLDER_B", "FILE_C") } func TestBackupManager_Run(t *testing.T) { diff --git a/internal/engine/backup_upload.go b/internal/engine/backup_upload.go index 317a799..13f26ac 100644 --- a/internal/engine/backup_upload.go +++ b/internal/engine/backup_upload.go @@ -120,7 +120,8 @@ func (bm *BackupManager) processFile(ctx context.Context, meta core.FileMeta, ph meta.ContentRef = contentRef meta.Size = size - metaRef, metaData, err := meta.Ref() + persisted := persistedFileMeta(meta) + metaRef, metaData, err := persisted.Ref() if err != nil { return uploadResult{err: err} } diff --git a/internal/engine/diff.go b/internal/engine/diff.go index e01a413..25ce740 100644 --- a/internal/engine/diff.go +++ b/internal/engine/diff.go @@ -170,9 +170,17 @@ func (dm *DiffManager) loadSnapshot(ctx context.Context, ref string) (*core.Snap func (dm *DiffManager) diffRoots(root1, root2 string) ([]FileChange, error) { var changes []FileChange + oldByID, err := dm.collectMetadata(root1) + if err != nil { + return nil, err + } + newByID, err := dm.collectMetadata(root2) + if err != nil { + return nil, err + } - err := dm.tree.Diff(root1, root2, func(d hamt.DiffEntry) error { - change, err := dm.toFileChange(d) + err = dm.tree.Diff(root1, root2, func(d hamt.DiffEntry) error { + change, err := dm.toFileChange(d, oldByID, newByID) if err != nil { return err } @@ -182,14 +190,25 @@ func (dm *DiffManager) diffRoots(root1, root2 string) ([]FileChange, error) { return changes, err } -func (dm *DiffManager) toFileChange(d hamt.DiffEntry) (FileChange, error) { +func (dm *DiffManager) toFileChange(d hamt.DiffEntry, oldByID, newByID map[string]core.FileMeta) (FileChange, error) { ct, metaRef := classifyEntry(d) meta, err := dm.loadMeta(metaRef) if err != nil { return FileChange{}, err } - return FileChange{Type: ct, Path: meta.Name, Meta: *meta}, nil + byID := newByID + if ct == ChangeRemoved { + byID = oldByID + } + return FileChange{ + Type: ct, + Path: fileMetaPath(*meta, func(parentID string) (core.FileMeta, bool) { + parent, ok := byID[parentID] + return parent, ok + }), + Meta: *meta, + }, nil } func classifyEntry(d hamt.DiffEntry) (ChangeType, string) { @@ -204,6 +223,9 @@ func classifyEntry(d hamt.DiffEntry) (ChangeType, string) { } func (dm *DiffManager) loadMeta(ref string) (*core.FileMeta, error) { + if dm.metaCache == nil { + dm.metaCache = make(map[string]core.FileMeta) + } if fm, ok := dm.metaCache[ref]; ok { return &fm, nil } @@ -215,5 +237,19 @@ func (dm *DiffManager) loadMeta(ref string) (*core.FileMeta, error) { if err := json.Unmarshal(data, &fm); err != nil { return nil, err } + dm.metaCache[ref] = fm return &fm, nil } + +func (dm *DiffManager) collectMetadata(root string) (map[string]core.FileMeta, error) { + byID := make(map[string]core.FileMeta) + err := dm.tree.Walk(root, func(_, valueRef string) error { + fm, err := dm.loadMeta(valueRef) + if err != nil { + return err + } + byID[fm.FileID] = *fm + return nil + }) + return byID, err +} diff --git a/internal/engine/diff_test.go b/internal/engine/diff_test.go index d267778..1914cb5 100644 --- a/internal/engine/diff_test.go +++ b/internal/engine/diff_test.go @@ -65,6 +65,47 @@ func TestDiffManager_Run(t *testing.T) { } } +func TestDiffManager_DerivesPathsFromParentsWithoutPersistedPaths(t *testing.T) { + ctx := context.Background() + store := NewMockStore() + + rootDir := createMetaWithID(ctx, store, core.FileMeta{ + FileID: "dir", + Name: "docs", + Type: core.FileTypeFolder, + }) + oldFile := createMetaWithID(ctx, store, core.FileMeta{ + FileID: "file", + Name: "guide.txt", + Type: core.FileTypeFile, + Parents: []string{"dir"}, + Size: 100, + Paths: []string{"docs/guide.txt"}, + }) + newFile := createMetaWithID(ctx, store, core.FileMeta{ + FileID: "file", + Name: "guide.txt", + Type: core.FileTypeFile, + Parents: []string{"dir"}, + Size: 200, + }) + + root1 := createHamt(t, store, []string{"dir", "file"}, []string{rootDir, oldFile}) + root2 := createHamt(t, store, []string{"dir", "file"}, []string{rootDir, newFile}) + + dm := NewDiffManager(store) + changes, err := dm.diffRoots(root1, root2) + if err != nil { + t.Fatalf("Diff failed: %v", err) + } + if len(changes) != 1 { + t.Fatalf("Expected 1 change, got %d", len(changes)) + } + if changes[0].Path != "docs/guide.txt" { + t.Fatalf("Expected derived path docs/guide.txt, got %q", changes[0].Path) + } +} + func createMeta(ctx context.Context, s *MockStore, name string, size int64) string { m := core.FileMeta{Name: name, Size: size} h, d, _ := core.ComputeJSONHash(&m) @@ -73,6 +114,13 @@ func createMeta(ctx context.Context, s *MockStore, name string, size int64) stri return ref } +func createMetaWithID(ctx context.Context, s *MockStore, m core.FileMeta) string { + h, d, _ := core.ComputeJSONHash(&m) + ref := "filemeta/" + h + _ = s.Put(ctx, ref, d) + return ref +} + func createHamt(t *testing.T, s *MockStore, ids []string, refs []string) string { tree := hamt.NewTree(s) root := "" diff --git a/internal/engine/filemeta_paths.go b/internal/engine/filemeta_paths.go new file mode 100644 index 0000000..2871841 --- /dev/null +++ b/internal/engine/filemeta_paths.go @@ -0,0 +1,64 @@ +package engine + +import ( + "path" + "strings" + + "github.com/cloudstic/cli/internal/core" +) + +func persistedFileMeta(meta core.FileMeta) core.FileMeta { + persisted := meta + persisted.Paths = nil + return persisted +} + +func fileMetaPath(meta core.FileMeta, lookupParent func(string) (core.FileMeta, bool)) string { + if p := normalizeMetaPath(firstMetaPath(meta)); p != "" { + return p + } + + const maxDepth = 50 + parts := []string{meta.Name} + cur := meta + for i := 0; i < maxDepth && len(cur.Parents) > 0; i++ { + parent, ok := lookupParent(cur.Parents[0]) + if !ok { + break + } + if p := normalizeMetaPath(firstMetaPath(parent)); p != "" { + return path.Join(append([]string{p}, reverseParts(parts)...)...) + } + parts = append(parts, parent.Name) + cur = parent + } + return path.Join(reverseParts(parts)...) +} + +func firstMetaPath(meta core.FileMeta) string { + if len(meta.Paths) == 0 { + return "" + } + return meta.Paths[0] +} + +func normalizeMetaPath(p string) string { + p = strings.TrimSpace(strings.ReplaceAll(p, "\\", "/")) + if p == "" { + return "" + } + clean := path.Clean("/" + p) + clean = strings.TrimPrefix(clean, "/") + if clean == "." { + return "" + } + return clean +} + +func reverseParts(parts []string) []string { + reversed := make([]string, len(parts)) + for i := range parts { + reversed[len(parts)-1-i] = parts[i] + } + return reversed +} diff --git a/internal/engine/restore.go b/internal/engine/restore.go index 511b5ae..d652c8c 100644 --- a/internal/engine/restore.go +++ b/internal/engine/restore.go @@ -499,27 +499,10 @@ func (rm *RestoreManager) writeFileContent(ctx context.Context, w io.Writer, met } func buildRestorePath(meta core.FileMeta, byID map[string]core.FileMeta) string { - // Fast path: use stored Paths when available (new snapshots). - if len(meta.Paths) > 0 { - return meta.Paths[0] - } - - // Fallback: reconstruct from parent chain (old snapshots). - const maxDepth = 50 - parts := []string{meta.Name} - cur := meta - for i := 0; i < maxDepth && len(cur.Parents) > 0; i++ { - parent, ok := byID[cur.Parents[0]] - if !ok { - break - } - parts = append(parts, parent.Name) - cur = parent - } - for i, j := 0, len(parts)-1; i < j; i, j = i+1, j-1 { - parts[i], parts[j] = parts[j], parts[i] - } - return path.Join(parts...) + return fileMetaPath(meta, func(parentID string) (core.FileMeta, bool) { + parent, ok := byID[parentID] + return parent, ok + }) } // filterByPath returns only the entries whose restore path matches the given filter. diff --git a/internal/engine/restore_test.go b/internal/engine/restore_test.go index 763325d..485c5bb 100644 --- a/internal/engine/restore_test.go +++ b/internal/engine/restore_test.go @@ -10,6 +10,7 @@ import ( "testing" "github.com/cloudstic/cli/internal/core" + "github.com/cloudstic/cli/internal/hamt" "github.com/cloudstic/cli/internal/ui" "github.com/cloudstic/cli/pkg/store" ) @@ -656,3 +657,91 @@ func TestRestoreManager_PathFilter_DryRun(t *testing.T) { t.Errorf("Expected empty buffer in dry run, got %d bytes", buf.Len()) } } + +func TestRestoreManager_PathFilter_MixedLegacyAndNormalizedFileMeta(t *testing.T) { + ctx := context.Background() + dest := NewMockStore() + + rootMeta := core.FileMeta{ + FileID: "dir", + Name: "docs", + Type: core.FileTypeFolder, + Paths: []string{"docs"}, + } + childMeta := core.FileMeta{ + FileID: "file", + Name: "guide.txt", + Type: core.FileTypeFile, + Parents: []string{"dir"}, + Size: int64(len("guide")), + } + + rootRef, rootData, err := rootMeta.Ref() + if err != nil { + t.Fatalf("root meta ref: %v", err) + } + if err := dest.Put(ctx, rootRef, rootData); err != nil { + t.Fatalf("put root meta: %v", err) + } + + content := core.Content{Type: core.ObjectTypeContent, Size: childMeta.Size, DataInlineB64: []byte("guide")} + contentHash, contentData, err := core.ComputeJSONHash(&content) + if err != nil { + t.Fatalf("content ref: %v", err) + } + contentRef := "content/" + contentHash + if err := dest.Put(ctx, contentRef, contentData); err != nil { + t.Fatalf("put content: %v", err) + } + + childMeta.ContentHash = core.ComputeHash([]byte("guide")) + childMeta.ContentRef = contentHash + childRef, childData, err := childMeta.Ref() + if err != nil { + t.Fatalf("child meta ref: %v", err) + } + if err := dest.Put(ctx, childRef, childData); err != nil { + t.Fatalf("put child meta: %v", err) + } + + tree := hamt.NewTree(dest) + root, err := tree.Insert("", "", rootMeta.FileID, rootRef) + if err != nil { + t.Fatalf("insert root: %v", err) + } + root, err = tree.Insert(root, rootMeta.FileID, childMeta.FileID, childRef) + if err != nil { + t.Fatalf("insert child: %v", err) + } + + snap := core.Snapshot{Root: root, Seq: 1, Created: "2026-04-02T00:00:00Z"} + snapHash, snapData, err := core.ComputeJSONHash(&snap) + if err != nil { + t.Fatalf("snapshot ref: %v", err) + } + snapRef := "snapshot/" + snapHash + if err := dest.Put(ctx, snapRef, snapData); err != nil { + t.Fatalf("put snapshot: %v", err) + } + if err := dest.Put(ctx, "index/latest", createIndex(snapRef, 1)); err != nil { + t.Fatalf("put latest index: %v", err) + } + + rsMgr := NewRestoreManager(dest, ui.NewNoOpReporter()) + var buf bytes.Buffer + result, err := rsMgr.Run(ctx, NewZipRestoreWriter(&buf), "latest", WithRestorePath("docs/guide.txt")) + if err != nil { + t.Fatalf("Restore failed: %v", err) + } + if result.FilesWritten != 1 { + t.Fatalf("Expected 1 file, got %d", result.FilesWritten) + } + + entries := zipEntries(t, &buf) + if _, ok := entries["docs/guide.txt"]; !ok { + t.Fatal("docs/guide.txt not restored") + } + if _, ok := entries["docs/"]; !ok { + t.Fatal("docs/ ancestor directory not restored") + } +} diff --git a/rfcs/0015-filemeta-path-normalization.md b/rfcs/0015-filemeta-path-normalization.md index a62a418..82b1f26 100644 --- a/rfcs/0015-filemeta-path-normalization.md +++ b/rfcs/0015-filemeta-path-normalization.md @@ -1,6 +1,6 @@ # RFC 0015: FileMeta Path Normalization -- **Status:** Draft +- **Status:** Implemented - **Date:** 2026-03-17 - **Affects:** `internal/core`, `internal/engine`, `pkg/source`, docs - **Related:** [RFC 0001](./0001-hamt-evolution.md), [RFC 0006](./0006-direct-to-filesystem-restore.md) @@ -212,14 +212,15 @@ The extra reconstruction cost is acceptable because: ## Rollout plan 1. Add shared helpers for normalized ephemeral path handling and derived-path - reconstruction. + reconstruction. (done) 2. Update engine consumers so correctness no longer depends on persisted - `Paths`. + `Paths`. (done) 3. Change backup persistence to clear `Paths` before hashing and writing new - `FileMeta` objects. -4. Update docs/spec examples to show `paths` as optional legacy compatibility + `FileMeta` objects. (done) +4. Update docs/spec examples to show `paths` as optional legacy compatibility. + (done) data. -5. Validate mixed old/new snapshot behavior in integration tests. +5. Validate mixed old/new snapshot behavior in integration tests. (done) ## Open questions