From 5c572cf12eced2407a869bd00c71d9d74777bb09 Mon Sep 17 00:00:00 2001
From: Wantong <jwtty0919@gmail.com>
Date: Tue, 28 Apr 2026 15:17:33 -0700
Subject: [PATCH 01/19] fix: add pod/rs guardrail check consistently (#682)

---
 pkg/webhook/webhook.go | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go
index 82710a39c..2fe915f79 100644
--- a/pkg/webhook/webhook.go
+++ b/pkg/webhook/webhook.go
@@ -641,12 +641,9 @@ func (w *Config) buildFleetGuardRailValidatingWebhooks() []admv1.ValidatingWebho
 
 	// Build core v1 resources list, conditionally including pods if workload is enabled
 	coreV1Resources := []string{bindingResourceName, configMapResourceName, endPointResourceName,
-		limitRangeResourceName, persistentVolumeClaimsName, persistentVolumeClaimsName + "/status", podTemplateResourceName,
+		limitRangeResourceName, persistentVolumeClaimsName, persistentVolumeClaimsName + "/status", podTemplateResourceName, podResourceName, podResourceName + "/status",
 		replicationControllerResourceName, replicationControllerResourceName + "/status", resourceQuotaResourceName, resourceQuotaResourceName + "/status", secretResourceName,
 		serviceAccountResourceName, servicesResourceName, servicesResourceName + "/status"}
-	if w.enableWorkload {
-		coreV1Resources = append(coreV1Resources, podResourceName, podResourceName+"/status")
-	}
 
 	namespacedResourcesRules = append(namespacedResourcesRules, admv1.RuleWithOperations{
 		Operations: cuOperations,
@@ -654,11 +651,8 @@ func (w *Config) buildFleetGuardRailValidatingWebhooks() []admv1.ValidatingWebho
 	})
 
 	// Build apps/v1 resources list, conditionally including replicasets if workload is enabled
-	appsV1Resources := []string{controllerRevisionResourceName, daemonSetResourceName, daemonSetResourceName + "/status",
+	appsV1Resources := []string{controllerRevisionResourceName, daemonSetResourceName, daemonSetResourceName + "/status", replicaSetResourceName, replicaSetResourceName + "/status",
 		deploymentResourceName, deploymentResourceName + "/status", statefulSetResourceName, statefulSetResourceName + "/status"}
-	if w.enableWorkload {
-		appsV1Resources = append(appsV1Resources, replicaSetResourceName, replicaSetResourceName+"/status")
-	}
 
 	namespacedResourcesRules = append(namespacedResourcesRules, admv1.RuleWithOperations{
 		Operations: cuOperations,

From 34ef3309fadba18198f9adfd0af5dae46c4ac866 Mon Sep 17 00:00:00 2001
From: Arvind Thirumurugan <arvindthirumurugan@gmail.com>
Date: Tue, 28 Apr 2026 17:37:10 -0700
Subject: [PATCH 02/19] chore: update token writer file permission (#675)

---
 docker/refresh-token.Dockerfile       |  2 +
 pkg/authtoken/token_writer.go         |  8 ++-
 pkg/utils/writefile/writefile.go      | 34 ++++++++++
 pkg/utils/writefile/writefile_test.go | 89 +++++++++++++++++++++++++++
 pkg/webhook/webhook.go                |  5 +-
 5 files changed, 134 insertions(+), 4 deletions(-)
 create mode 100644 pkg/utils/writefile/writefile.go
 create mode 100644 pkg/utils/writefile/writefile_test.go

diff --git a/docker/refresh-token.Dockerfile b/docker/refresh-token.Dockerfile
index 20da08c80..22b99f038 100644
--- a/docker/refresh-token.Dockerfile
+++ b/docker/refresh-token.Dockerfile
@@ -15,6 +15,8 @@ RUN go mod download
 # Copy the go source
 COPY cmd/authtoken/main.go main.go
 COPY pkg/authtoken pkg/authtoken
+# writefile is a dependency of pkg/authtoken for secure file creation (0600 permissions)
+COPY pkg/utils/writefile pkg/utils/writefile
 
 ARG TARGETARCH
 
diff --git a/pkg/authtoken/token_writer.go b/pkg/authtoken/token_writer.go
index 48b965f3f..66591f715 100644
--- a/pkg/authtoken/token_writer.go
+++ b/pkg/authtoken/token_writer.go
@@ -18,9 +18,10 @@ package authtoken
 import (
 	"fmt"
 	"io"
-	"os"
 
 	"k8s.io/klog/v2"
+
+	"github.com/kubefleet-dev/kubefleet/pkg/utils/writefile"
 )
 
 type Factory struct {
@@ -31,8 +32,11 @@ func NewFactory(filePath string) Factory {
 	return Factory{filePath: filePath}
 }
 
+// Create opens the token file for writing using CreateSecureFile.
+// The member-agent container runs as the same UID and reads this file via client-go's
+// BearerTokenFile (O_RDONLY), so owner permission is sufficient for both writer and reader.
 func (w Factory) Create() (io.WriteCloser, error) {
-	wc, err := os.Create(w.filePath)
+	wc, err := writefile.CreateSecureFile(w.filePath)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/utils/writefile/writefile.go b/pkg/utils/writefile/writefile.go
new file mode 100644
index 000000000..f161bfeca
--- /dev/null
+++ b/pkg/utils/writefile/writefile.go
@@ -0,0 +1,34 @@
+/*
+Copyright 2025 The KubeFleet Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package writefile
+
+import (
+	"os"
+	"path/filepath"
+)
+
+// CreateSecureFile creates or truncates a file with owner-only permissions (0600).
+// The path is sanitized with filepath.Clean to resolve any relative or redundant elements.
+//
+//   - O_WRONLY: write-only, since callers only need to write sensitive data.
+//   - O_CREATE: creates the file if it does not exist.
+//   - O_TRUNC: empties the file before writing so that no leftover bytes from a
+//     previous (possibly longer) write remain in the file.
+//   - 0600: owner read/write only, preventing other users from accessing the file.
+func CreateSecureFile(path string) (*os.File, error) {
+	return os.OpenFile(filepath.Clean(path), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0600)
+}
diff --git a/pkg/utils/writefile/writefile_test.go b/pkg/utils/writefile/writefile_test.go
new file mode 100644
index 000000000..7d389ac66
--- /dev/null
+++ b/pkg/utils/writefile/writefile_test.go
@@ -0,0 +1,89 @@
+/*
+Copyright 2025 The KubeFleet Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package writefile
+
+import (
+	"io"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+)
+
+func TestCreateSecureFile(t *testing.T) {
+	tests := []struct {
+		name         string
+		oldContent   string
+		writeContent string
+	}{
+		{
+			name:         "creates new file with restricted permissions",
+			writeContent: "sensitive-data",
+		},
+		{
+			name:         "truncates longer existing content",
+			oldContent:   "this-is-a-very-long-existing-content-that-should-be-truncated",
+			writeContent: "short",
+		},
+		{
+			name:         "overwrites shorter existing content",
+			oldContent:   "short",
+			writeContent: "a-much-longer-replacement-content",
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			dir := t.TempDir()
+			filePath := filepath.Join(dir, "secure-file")
+
+			writeAndClose := func(content string) {
+				f, err := CreateSecureFile(filePath)
+				if err != nil {
+					t.Fatalf("CreateSecureFile(%q) returned error: %v", filePath, err)
+				}
+				if _, err := io.WriteString(f, content); err != nil {
+					t.Fatalf("io.WriteString() returned error: %v", err)
+				}
+				f.Close()
+			}
+
+			if tc.oldContent != "" {
+				writeAndClose(tc.oldContent)
+			}
+
+			writeAndClose(tc.writeContent)
+
+			got, err := os.ReadFile(filePath)
+			if err != nil {
+				t.Fatalf("os.ReadFile(%q) returned error: %v", filePath, err)
+			}
+			if diff := cmp.Diff(tc.writeContent, string(got)); diff != "" {
+				t.Errorf("CreateSecureFile() file content mismatch (-want +got):\n%s", diff)
+			}
+
+			info, err := os.Stat(filePath)
+			if err != nil {
+				t.Fatalf("os.Stat(%q) returned error: %v", filePath, err)
+			}
+			if gotPerm := info.Mode().Perm(); gotPerm != 0600 {
+				t.Errorf("CreateSecureFile() file permission = %o, want %o", gotPerm, 0600)
+			}
+		})
+	}
+}
diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go
index 2fe915f79..e3bf5beb9 100644
--- a/pkg/webhook/webhook.go
+++ b/pkg/webhook/webhook.go
@@ -55,6 +55,7 @@ import (
 	clusterv1beta1 "github.com/kubefleet-dev/kubefleet/apis/cluster/v1beta1"
 	placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1"
 	"github.com/kubefleet-dev/kubefleet/cmd/hubagent/options"
+	"github.com/kubefleet-dev/kubefleet/pkg/utils/writefile"
 	"github.com/kubefleet-dev/kubefleet/pkg/webhook/clusterresourceoverride"
 	"github.com/kubefleet-dev/kubefleet/pkg/webhook/clusterresourceplacement"
 	"github.com/kubefleet-dev/kubefleet/pkg/webhook/clusterresourceplacementdisruptionbudget"
@@ -936,7 +937,7 @@ func genCertAndKeyFile(certData, keyData []byte, certDir string) error {
 		return fmt.Errorf("could not create directory %q to store certificates: %w", certDir, err)
 	}
 	certPath := filepath.Join(certDir, fleetWebhookCertFileName)
-	f, err := os.OpenFile(filepath.Clean(certPath), os.O_CREATE|os.O_TRUNC|os.O_RDWR, 0600)
+	f, err := writefile.CreateSecureFile(certPath)
 	if err != nil {
 		return fmt.Errorf("could not open %q: %w", certPath, err)
 	}
@@ -950,7 +951,7 @@ func genCertAndKeyFile(certData, keyData []byte, certDir string) error {
 	}
 
 	keyPath := filepath.Join(certDir, fleetWebhookKeyFileName)
-	kf, err := os.OpenFile(filepath.Clean(keyPath), os.O_CREATE|os.O_TRUNC|os.O_RDWR, 0600)
+	kf, err := writefile.CreateSecureFile(keyPath)
 	if err != nil {
 		return fmt.Errorf("could not open %q: %w", keyPath, err)
 	}

From 5355ec8e7407f337ab83d88e42ee98c6981e15aa Mon Sep 17 00:00:00 2001
From: Akash Kumar <91385321+AkashKumar7902@users.noreply.github.com>
Date: Wed, 29 Apr 2026 18:56:25 +0530
Subject: [PATCH 03/19] docs: clarify hub-agent cert secret default (#673)

Signed-off-by: Akash Kumar <meakash7902@gmail.com>
---
 charts/hub-agent/README.md   | 12 +++++++-----
 charts/hub-agent/values.yaml |  1 +
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/charts/hub-agent/README.md b/charts/hub-agent/README.md
index b49e1bd15..9582dda49 100644
--- a/charts/hub-agent/README.md
+++ b/charts/hub-agent/README.md
@@ -164,7 +164,8 @@ helm install hub-agent oci://ghcr.io/kubefleet-dev/kubefleet/charts/hub-agent \
   --create-namespace \
   --set useCertManager=true \
   --set enableWorkload=true \
-  --set enableWebhook=true
+  --set enableWebhook=true \
+  --set webhookCertSecretName=fleet-webhook-server-cert
 
 # Or using traditional repository
 helm install hub-agent kubefleet/hub-agent \
@@ -172,12 +173,13 @@ helm install hub-agent kubefleet/hub-agent \
   --create-namespace \
   --set useCertManager=true \
   --set enableWorkload=true \
-  --set enableWebhook=true
+  --set enableWebhook=true \
+  --set webhookCertSecretName=fleet-webhook-server-cert
 ```
 
 The `webhookCertSecretName` parameter specifies the Secret name for the certificate:
-- Default: `fleet-webhook-server-cert`
-- When using cert-manager, this is where cert-manager stores the certificate
+- Default: `unset`; there is no default in `values.yaml`
+- When using cert-manager, set this to the Secret where cert-manager stores the certificate
 - Must match the secret name referenced in the deployment volume mount
 
 Example with custom secret name:
@@ -198,4 +200,4 @@ helm install hub-agent kubefleet/hub-agent \
   --set useCertManager=true \
   --set enableWorkload=true \
   --set webhookCertSecretName=my-webhook-secret
-```
\ No newline at end of file
+```
diff --git a/charts/hub-agent/values.yaml b/charts/hub-agent/values.yaml
index e87061b58..4964cba84 100644
--- a/charts/hub-agent/values.yaml
+++ b/charts/hub-agent/values.yaml
@@ -23,6 +23,7 @@ enableWorkload: false
 useCertManager: false
 # webhookCertSecretName is ONLY used when useCertManager=true
 # It specifies the name of the Secret where cert-manager stores the certificate
+# Example:
 # webhookCertSecretName: fleet-webhook-server-cert
 
 forceDeleteWaitTime: 15m0s

From 81ecb5bb0d09c7301b3643c93d3311ff117fdf98 Mon Sep 17 00:00:00 2001
From: Yetkin Timocin <ytimocin@microsoft.com>
Date: Wed, 29 Apr 2026 10:20:19 -0700
Subject: [PATCH 04/19] fix: deterministic envelope Work names to prevent
 duplicates (#665)

* fix: deterministic envelope Work names to prevent duplicates

Envelope Work names were generated with a fresh uuid.NewUUID() on every
Create, so two back-to-back reconciles of the same binding with a stale
informer-cache List could each build a Work with different names and
both succeed. The `len > 1` branch then returned UnexpectedBehaviorError
without any recovery, leaving the binding permanently stuck in
WorkSynchronized=False/SyncWorkFailed and the placement never
converging on that cluster.

buildNewWorkForEnvelopeCR now derives the name suffix from a truncated
SHA-256 hash of (envelope type, namespace, name). Two concurrent Creates
for the same (binding, envelope) produce the same name and collide at
the API server; the existing AlreadyExists handling in upsertWork
requeues into the len == 1 update branch.

Auto-deletion on detection of multiple Works is deliberately avoided:
the member agent may have already applied both Works, so deleting one
would trigger resource fluctuation on the member side. When the `len > 1`
state is still observed (possible only in environments that were stuck
before this change), the controller now emits a Warning Event on the
binding (reason=DuplicateEnvelopeWorks) and a clear log message
instructing the operator to delete all but the oldest Work in the
affected namespace.

Fixes #630

Signed-off-by: Yetkin Timocin <ytimocin@microsoft.com>

* Addressing feedback

Signed-off-by: Yetkin Timocin <ytimocin@microsoft.com>

---------

Signed-off-by: Yetkin Timocin <ytimocin@microsoft.com>
---
 pkg/controllers/workgenerator/envelope.go     |  49 ++++-
 .../workgenerator/envelope_test.go            | 192 +++++++++++++++++-
 2 files changed, 224 insertions(+), 17 deletions(-)

diff --git a/pkg/controllers/workgenerator/envelope.go b/pkg/controllers/workgenerator/envelope.go
index 24cff56cd..95453274b 100644
--- a/pkg/controllers/workgenerator/envelope.go
+++ b/pkg/controllers/workgenerator/envelope.go
@@ -18,13 +18,15 @@ package workgenerator
 
 import (
 	"context"
+	"crypto/sha256"
+	"encoding/hex"
 	"fmt"
 	"sort"
 	"strings"
 
+	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
-	"k8s.io/apimachinery/pkg/util/uuid"
 	"k8s.io/klog/v2"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
@@ -87,18 +89,27 @@ func (r *Reconciler) createOrUpdateEnvelopeCRWorkObj(
 	var work *fleetv1beta1.Work
 	switch {
 	case len(workList.Items) > 1:
-		// Multiple matching work objects found; this should never occur under normal conditions.
-		wrappedErr := fmt.Errorf("%d work objects found for the same envelope %v, only one expected", len(workList.Items), envelopeReader.GetEnvelopeObjRef())
-		klog.ErrorS(wrappedErr, "Failed to create or update work object for envelope",
-			"resourceBinding", klog.KObj(binding),
-			"resourceSnapshot", klog.KObj(resourceSnapshot),
-			"envelope", envelopeReader.GetEnvelopeObjRef())
-		// Log the work object names to help debug.
+		// Multiple matching work objects found. This can only happen in environments that
+		// were stuck on this state *before* deterministic Work naming was introduced in
+		// buildNewWorkForEnvelopeCR; newly-raced reconciles are now prevented by the
+		// API server's name-uniqueness check. We deliberately do NOT auto-delete the
+		// extras: both Works may have been applied by the member agent, and deleting one
+		// would trigger member-side resource cleanup and user-visible fluctuation on
+		// resources the survivor still owns. Surface the state for operator cleanup
+		// (manually delete all but the oldest Work in the affected namespace).
 		workNames := make([]string, len(workList.Items))
 		for i := range workList.Items {
 			workNames[i] = workList.Items[i].Name
 		}
-		klog.ErrorS(wrappedErr, "Duplicate work objects found", "works", workNames)
+		wrappedErr := fmt.Errorf("%d work objects found for the same envelope %v, only one expected", len(workList.Items), envelopeReader.GetEnvelopeObjRef())
+		klog.ErrorS(wrappedErr, "Duplicate envelope Work objects found; manual cleanup required — delete all but the oldest Work in the namespace",
+			"works", workNames,
+			"resourceBinding", klog.KObj(binding),
+			"resourceSnapshot", klog.KObj(resourceSnapshot),
+			"envelope", envelopeReader.GetEnvelopeObjRef())
+		r.recorder.Eventf(binding, corev1.EventTypeWarning, "DuplicateEnvelopeWorks",
+			"Multiple Work objects (%v) found for envelope %v in namespace %s; delete all but the oldest to recover",
+			workNames, envelopeReader.GetEnvelopeObjRef(), fmt.Sprintf(utils.NamespaceNameFormat, binding.GetBindingSpec().TargetCluster))
 		return nil, controller.NewUnexpectedBehaviorError(wrappedErr)
 	case len(workList.Items) == 1:
 		klog.V(2).InfoS("Found existing work object for the envelope; updating it",
@@ -200,6 +211,24 @@ func refreshWorkForEnvelopeCR(
 	work.Spec.ApplyStrategy = resourceBinding.GetBindingSpec().ApplyStrategy
 }
 
+// envelopeWorkNameSuffix returns a deterministic suffix for the envelope Work's name,
+// derived from the envelope's identity. Using a deterministic suffix (instead of a random
+// UUID) means two concurrent Create attempts for the same envelope collide on name at the
+// API server, which enforces the "one Work per (binding, envelope)" invariant server-side.
+// The hash is truncated to 8 bytes of SHA-256 (rendered as 16 hex characters), keeping
+// the overall Work name well under the DNS-1123 subdomain limit; collision probability
+// at the scale of a single binding's envelope set is ~2^-64.
+func envelopeWorkNameSuffix(envelopeReader fleetv1beta1.EnvelopeReader) string {
+	// NOTE: ClusterResourceEnvelope has an empty namespace; ResourceEnvelope has a namespace.
+	// Including the type disambiguates an unlikely name collision across kinds.
+	identity := fmt.Sprintf("%s/%s/%s",
+		envelopeReader.GetEnvelopeType(),
+		envelopeReader.GetNamespace(),
+		envelopeReader.GetName())
+	sum := sha256.Sum256([]byte(identity))
+	return hex.EncodeToString(sum[:8])
+}
+
 func buildNewWorkForEnvelopeCR(
 	workNamePrefix string,
 	resourceBinding fleetv1beta1.BindingObj,
@@ -208,7 +237,7 @@ func buildNewWorkForEnvelopeCR(
 	manifests []fleetv1beta1.Manifest,
 	resourceOverrideSnapshotHash, clusterResourceOverrideSnapshotHash string,
 ) *fleetv1beta1.Work {
-	workName := fmt.Sprintf(fleetv1beta1.WorkNameWithEnvelopeCRFmt, workNamePrefix, uuid.NewUUID())
+	workName := fmt.Sprintf(fleetv1beta1.WorkNameWithEnvelopeCRFmt, workNamePrefix, envelopeWorkNameSuffix(envelopeReader))
 	workNamespace := fmt.Sprintf(utils.NamespaceNameFormat, resourceBinding.GetBindingSpec().TargetCluster)
 
 	// Create the labels map
diff --git a/pkg/controllers/workgenerator/envelope_test.go b/pkg/controllers/workgenerator/envelope_test.go
index f262766f3..a3283c482 100644
--- a/pkg/controllers/workgenerator/envelope_test.go
+++ b/pkg/controllers/workgenerator/envelope_test.go
@@ -17,8 +17,11 @@ limitations under the License.
 package workgenerator
 
 import (
+	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
+	"strings"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
@@ -32,9 +35,12 @@ import (
 
 	fleetv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1"
 	"github.com/kubefleet-dev/kubefleet/pkg/utils"
+	"github.com/kubefleet-dev/kubefleet/pkg/utils/controller"
 	"github.com/kubefleet-dev/kubefleet/test/utils/informer"
 )
 
+const testWorkNamePrefix = "test-work"
+
 func TestExtractManifestsFromEnvelopeCR(t *testing.T) {
 	tests := []struct {
 		name           string
@@ -255,8 +261,6 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) {
 	ignoreWorkMeta := cmpopts.IgnoreFields(metav1.ObjectMeta{}, "Name", "OwnerReferences")
 	scheme := serviceScheme(t)
 
-	workNamePrefix := "test-work"
-
 	resourceSnapshot := &fleetv1beta1.ClusterResourceSnapshot{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: "test-snapshot",
@@ -306,7 +310,7 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) {
 	// Create an existing work for update test
 	existingWork := &fleetv1beta1.Work{
 		ObjectMeta: metav1.ObjectMeta{
-			Name:      workNamePrefix,
+			Name:      testWorkNamePrefix,
 			Namespace: "fleet-member-test-cluster-1",
 			Labels: map[string]string{
 				fleetv1beta1.ParentBindingLabel:     resourceBinding.Name,
@@ -483,7 +487,13 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) {
 			wantErr:                             true,
 		},
 		{
-			name:                                "two existing works should result in error",
+			// Duplicate envelope Works indicate an environment that was stuck before
+			// deterministic naming was introduced. The controller surfaces the state
+			// (error + Event) but does NOT auto-delete, because deleting a Work the
+			// member agent has already applied would trigger resource fluctuation on
+			// the member side. Full post-conditions are verified in
+			// TestCreateOrUpdateEnvelopeCRWorkObj_DuplicateWorksSurfaceWithoutMutation.
+			name:                                "two existing works surface an UnexpectedBehaviorError without mutation",
 			envelopeReader:                      resourceEnvelope,
 			resourceOverrideSnapshotHash:        "new-resource-hash",
 			clusterResourceOverrideSnapshotHash: "new-cluster-resource-hash",
@@ -512,7 +522,7 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) {
 			}
 
 			// Call the function under test
-			got, err := r.createOrUpdateEnvelopeCRWorkObj(ctx, tt.envelopeReader, workNamePrefix,
+			got, err := r.createOrUpdateEnvelopeCRWorkObj(ctx, tt.envelopeReader, testWorkNamePrefix,
 				resourceBinding, resourceSnapshot, tt.resourceOverrideSnapshotHash, tt.clusterResourceOverrideSnapshotHash)
 
 			if (err != nil) != tt.wantErr {
@@ -532,7 +542,6 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) {
 func TestProcessOneSelectedResource(t *testing.T) {
 	scheme := serviceScheme(t)
 
-	workNamePrefix := "test-work"
 	resourceBinding := &fleetv1beta1.ClusterResourceBinding{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: "test-binding",
@@ -685,7 +694,7 @@ func TestProcessOneSelectedResource(t *testing.T) {
 				tt.selectedResource,
 				resourceBinding,
 				snapshot,
-				workNamePrefix,
+				testWorkNamePrefix,
 				tt.resourceOverrideSnapshotHash,
 				tt.clusterResourceOverrideSnapshotHash,
 				activeWork,
@@ -725,3 +734,172 @@ func createResourceContent(t *testing.T, obj runtime.Object) *fleetv1beta1.Resou
 		},
 	}
 }
+
+// TestEnvelopeWorkNameSuffix verifies that the suffix is a stable, length-bounded function
+// of the envelope's identity. Determinism is what lets the API server enforce the
+// "one Work per (binding, envelope)" invariant via name uniqueness.
+func TestEnvelopeWorkNameSuffix(t *testing.T) {
+	base := &fleetv1beta1.ResourceEnvelope{
+		ObjectMeta: metav1.ObjectMeta{Name: "env-a", Namespace: "ns-1"},
+	}
+
+	tests := []struct {
+		name      string
+		other     fleetv1beta1.EnvelopeReader
+		wantEqual bool // true → suffix must match base; false → suffix must differ
+	}{
+		{
+			name:      "same identity",
+			other:     &fleetv1beta1.ResourceEnvelope{ObjectMeta: metav1.ObjectMeta{Name: "env-a", Namespace: "ns-1"}},
+			wantEqual: true,
+		},
+		{
+			name:      "different name",
+			other:     &fleetv1beta1.ResourceEnvelope{ObjectMeta: metav1.ObjectMeta{Name: "env-b", Namespace: "ns-1"}},
+			wantEqual: false,
+		},
+		{
+			name:      "different namespace",
+			other:     &fleetv1beta1.ResourceEnvelope{ObjectMeta: metav1.ObjectMeta{Name: "env-a", Namespace: "ns-2"}},
+			wantEqual: false,
+		},
+		{
+			// Same name as base, but cluster-scoped — the type field in the hash input disambiguates.
+			name:      "different type, same name",
+			other:     &fleetv1beta1.ClusterResourceEnvelope{ObjectMeta: metav1.ObjectMeta{Name: "env-a"}},
+			wantEqual: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			gotBase := envelopeWorkNameSuffix(base)
+			gotOther := envelopeWorkNameSuffix(tt.other)
+			if tt.wantEqual && gotBase != gotOther {
+				t.Errorf("envelopeWorkNameSuffix(%v) = %q, envelopeWorkNameSuffix(%v) = %q, want equal", base, gotBase, tt.other, gotOther)
+			}
+			if !tt.wantEqual && gotBase == gotOther {
+				t.Errorf("envelopeWorkNameSuffix(%v) = envelopeWorkNameSuffix(%v) = %q, want different", base, tt.other, gotBase)
+			}
+		})
+	}
+
+	// Suffix is bounded and DNS-1123-safe (lowercase hex).
+	suffix := envelopeWorkNameSuffix(base)
+	if len(suffix) != 16 {
+		t.Errorf("envelopeWorkNameSuffix length = %d, want 16", len(suffix))
+	}
+	if strings.ToLower(suffix) != suffix {
+		t.Errorf("envelopeWorkNameSuffix(%v) = %q, want lowercase", base, suffix)
+	}
+}
+
+// TestCreateOrUpdateEnvelopeCRWorkObj_DuplicateWorksSurfaceWithoutMutation verifies that
+// when the label query returns multiple Works for the same (binding, envelope), the
+// controller surfaces the condition (UnexpectedBehaviorError + Event) but does NOT
+// mutate cluster state. Auto-deleting duplicates is unsafe because the member agent
+// may have applied them; deleting one triggers resource cleanup that conflicts with
+// the survivor. Deterministic naming (see buildNewWorkForEnvelopeCR) prevents new
+// duplicates; pre-existing ones require operator cleanup.
+func TestCreateOrUpdateEnvelopeCRWorkObj_DuplicateWorksSurfaceWithoutMutation(t *testing.T) {
+	scheme := serviceScheme(t)
+	ctx := context.Background()
+
+	resourceSnapshot := &fleetv1beta1.ClusterResourceSnapshot{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:   "test-snapshot",
+			Labels: map[string]string{fleetv1beta1.PlacementTrackingLabel: "test-crp"},
+		},
+	}
+	resourceBinding := &fleetv1beta1.ClusterResourceBinding{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:   "test-binding",
+			Labels: map[string]string{fleetv1beta1.PlacementTrackingLabel: "test-crp"},
+		},
+		Spec: fleetv1beta1.ResourceBindingSpec{
+			TargetCluster:        "test-cluster-1",
+			ResourceSnapshotName: resourceSnapshot.Name,
+		},
+	}
+	resourceEnvelope := &fleetv1beta1.ResourceEnvelope{
+		ObjectMeta: metav1.ObjectMeta{Name: "dup-envelope", Namespace: "default"},
+		Data: map[string]runtime.RawExtension{
+			"configmap": {Raw: []byte(`{"apiVersion":"v1","kind":"ConfigMap","metadata":{"name":"cm","namespace":"default"}}`)},
+		},
+	}
+
+	workNamespace := fmt.Sprintf(utils.NamespaceNameFormat, resourceBinding.Spec.TargetCluster)
+	labelsForWork := map[string]string{
+		fleetv1beta1.ParentBindingLabel:     resourceBinding.Name,
+		fleetv1beta1.PlacementTrackingLabel: resourceBinding.Labels[fleetv1beta1.PlacementTrackingLabel],
+		fleetv1beta1.EnvelopeTypeLabel:      string(fleetv1beta1.ResourceEnvelopeType),
+		fleetv1beta1.EnvelopeNameLabel:      resourceEnvelope.Name,
+		fleetv1beta1.EnvelopeNamespaceLabel: resourceEnvelope.Namespace,
+	}
+	dupNames := []string{"dup-envelope-a", "dup-envelope-b", "dup-envelope-c"}
+	mkWork := func(name string) *fleetv1beta1.Work {
+		return &fleetv1beta1.Work{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      name,
+				Namespace: workNamespace,
+				Labels:    labelsForWork,
+			},
+		}
+	}
+	objs := make([]client.Object, 0, len(dupNames))
+	for _, n := range dupNames {
+		objs = append(objs, mkWork(n))
+	}
+	fakeClient := fake.NewClientBuilder().
+		WithScheme(scheme).
+		WithObjects(objs...).
+		Build()
+
+	recorder := record.NewFakeRecorder(10)
+	r := &Reconciler{
+		Client:          fakeClient,
+		recorder:        recorder,
+		InformerManager: &informer.FakeManager{},
+	}
+
+	got, err := r.createOrUpdateEnvelopeCRWorkObj(ctx, resourceEnvelope, testWorkNamePrefix,
+		resourceBinding, resourceSnapshot, "", "")
+
+	if got != nil {
+		t.Errorf("createOrUpdateEnvelopeCRWorkObj() = %v, want nil on duplicate-detected path", got)
+	}
+	if err == nil {
+		t.Fatalf("createOrUpdateEnvelopeCRWorkObj() error = nil, want UnexpectedBehaviorError")
+	}
+	if !errors.Is(err, controller.ErrUnexpectedBehavior) {
+		t.Errorf("createOrUpdateEnvelopeCRWorkObj() error = %v, want wrapping controller.ErrUnexpectedBehavior", err)
+	}
+
+	// Post-condition: all duplicates remain untouched — no auto-deletion.
+	remaining := &fleetv1beta1.WorkList{}
+	if err := fakeClient.List(ctx, remaining, client.InNamespace(workNamespace)); err != nil {
+		t.Fatalf("List Works in %q: %v", workNamespace, err)
+	}
+	sortStrings := cmpopts.SortSlices(func(a, b string) bool { return a < b })
+	if diff := cmp.Diff(dupNames, workNames(remaining.Items), sortStrings); diff != "" {
+		t.Errorf("Works after call mismatch (-want +got):\n%s", diff)
+	}
+
+	// Post-condition: a Warning Event was emitted so operators can see the stuck state.
+	select {
+	case ev := <-recorder.Events:
+		if !strings.Contains(ev, "DuplicateEnvelopeWorks") {
+			t.Errorf("event = %q, want reason DuplicateEnvelopeWorks", ev)
+		}
+	default:
+		t.Error("no Event emitted for duplicate envelope Works; operators would have no surface signal")
+	}
+}
+
+func workNames(items []fleetv1beta1.Work) []string {
+	out := make([]string, len(items))
+	for i := range items {
+		out[i] = items[i].Name
+	}
+	return out
+}

From abfa3ef4bfa8f5b267b801a4205fa37d2a731d76 Mon Sep 17 00:00:00 2001
From: Yetkin Timocin <ytimocin@microsoft.com>
Date: Wed, 29 Apr 2026 10:21:17 -0700
Subject: [PATCH 05/19] fix: member agent validate opts, fix klog.ErrorS, and
 kubectl-fleet CLI improvements (#575)

Signed-off-by: Yetkin Timocin <ytimocin@microsoft.com>
---
 Makefile                                      |  1 +
 cmd/memberagent/main.go                       | 12 +++++-
 test/e2e/cluster_staged_updaterun_test.go     |  2 +-
 test/e2e/drain_tool_test.go                   |  8 ++--
 tools/fleet/README.md                         | 42 +++++++++----------
 tools/fleet/cmd/approve/approve.go            | 19 +++++----
 tools/fleet/cmd/draincluster/draincluster.go  | 25 ++++++-----
 .../cmd/uncordoncluster/uncordoncluster.go    | 27 ++++++------
 tools/fleet/main.go                           | 16 +++++++
 tools/utils/common.go                         | 13 ++++++
 10 files changed, 102 insertions(+), 63 deletions(-)

diff --git a/Makefile b/Makefile
index 4e4ffa70a..210ed1a6e 100644
--- a/Makefile
+++ b/Makefile
@@ -224,6 +224,7 @@ generate: $(CONTROLLER_GEN) ## Generate deep copy methods
 build: generate fmt vet ## Build agent binaries
 	go build -o bin/hubagent cmd/hubagent/main.go
 	go build -o bin/memberagent cmd/memberagent/main.go
+	go build -o bin/kubectl-fleet ./tools/fleet/
 
 .PHONY: run-hubagent
 run-hubagent: manifests generate fmt vet ## Run hub-agent from your host
diff --git a/cmd/memberagent/main.go b/cmd/memberagent/main.go
index a710ab2ca..e281f48f3 100644
--- a/cmd/memberagent/main.go
+++ b/cmd/memberagent/main.go
@@ -91,6 +91,11 @@ func main() {
 		klog.InfoS("flag:", "name", f.Name, "value", f.Value)
 	})
 
+	if errs := opts.Validate(); len(errs) != 0 {
+		klog.ErrorS(errs.ToAggregate(), "invalid parameter")
+		klog.FlushAndExit(klog.ExitFlushTimeout, 1)
+	}
+
 	// Set up controller-runtime logger
 	ctrl.SetLogger(zap.New(zap.UseDevMode(true)))
 
@@ -213,13 +218,16 @@ func buildHubConfig(hubURL string, useCertificateAuth bool, tlsClientInsecure bo
 			return err
 		})
 		if err != nil {
-			klog.ErrorS(err, "Failed to retrieve token file from the path %s", tokenFilePath)
+			klog.ErrorS(err, "Failed to retrieve token file", "path", tokenFilePath)
 			return nil, err
 		}
 		hubConfig.BearerTokenFile = tokenFilePath
 	}
 
 	hubConfig.TLSClientConfig.Insecure = tlsClientInsecure
+	if tlsClientInsecure {
+		klog.Warning("TLS verification is disabled for hub cluster connection. This is insecure and should not be used in production.")
+	}
 	if !tlsClientInsecure {
 		caBundle, ok := os.LookupEnv("CA_BUNDLE")
 		if ok && caBundle == "" {
@@ -257,7 +265,7 @@ func buildHubConfig(hubURL string, useCertificateAuth bool, tlsClientInsecure bo
 		r := textproto.NewReader(bufio.NewReader(strings.NewReader(header)))
 		h, err := r.ReadMIMEHeader()
 		if err != nil && !errors.Is(err, io.EOF) {
-			klog.ErrorS(err, "Failed to parse HUB_KUBE_HEADER %q", header)
+			klog.ErrorS(err, "Failed to parse HUB_KUBE_HEADER", "header", header)
 			return nil, err
 		}
 		hubConfig.WrapTransport = func(rt http.RoundTripper) http.RoundTripper {
diff --git a/test/e2e/cluster_staged_updaterun_test.go b/test/e2e/cluster_staged_updaterun_test.go
index df5c8b4a7..3215362ea 100644
--- a/test/e2e/cluster_staged_updaterun_test.go
+++ b/test/e2e/cluster_staged_updaterun_test.go
@@ -2203,7 +2203,7 @@ func approveClusterApprovalRequest(stageName, updateRunName, stageTask string) {
 
 	// Use kubectl-fleet approve plugin to approve the request
 	cmd := exec.Command(fleetBinaryPath, "approve", "clusterapprovalrequest",
-		"--hubClusterContext", "kind-hub",
+		"--hub-cluster-context", "kind-hub",
 		"--name", approvalRequestName)
 	output, err := cmd.CombinedOutput()
 	Expect(err).ToNot(HaveOccurred(), "kubectl-fleet approve failed: %s", string(output))
diff --git a/test/e2e/drain_tool_test.go b/test/e2e/drain_tool_test.go
index 07ab7e6a1..408f8a670 100644
--- a/test/e2e/drain_tool_test.go
+++ b/test/e2e/drain_tool_test.go
@@ -360,8 +360,8 @@ var _ = Describe("Drain is allowed on one cluster, blocked on others - ClusterRe
 func runDrainClusterBinary(hubClusterName, memberClusterName string) {
 	By(fmt.Sprintf("draining cluster %s", memberClusterName))
 	cmd := exec.Command(fleetBinaryPath, "draincluster",
-		"--hubClusterContext", hubClusterName,
-		"--clusterName", memberClusterName)
+		"--hub-cluster-context", hubClusterName,
+		"--cluster-name", memberClusterName)
 	_, err := cmd.CombinedOutput()
 	Expect(err).ToNot(HaveOccurred(), "Drain command failed with error: %v", err)
 }
@@ -369,8 +369,8 @@ func runDrainClusterBinary(hubClusterName, memberClusterName string) {
 func runUncordonClusterBinary(hubClusterName, memberClusterName string) {
 	By(fmt.Sprintf("uncordoning cluster %s", memberClusterName))
 	cmd := exec.Command(fleetBinaryPath, "uncordoncluster",
-		"--hubClusterContext", hubClusterName,
-		"--clusterName", memberClusterName)
+		"--hub-cluster-context", hubClusterName,
+		"--cluster-name", memberClusterName)
 	_, err := cmd.CombinedOutput()
 	Expect(err).ToNot(HaveOccurred(), "Uncordon command failed with error: %v", err)
 }
diff --git a/tools/fleet/README.md b/tools/fleet/README.md
index 897978db3..c7405896d 100644
--- a/tools/fleet/README.md
+++ b/tools/fleet/README.md
@@ -72,25 +72,25 @@ Use the `approve` subcommand to approve approval request resources for staged up
 
 **Cluster-scoped (ClusterApprovalRequest):**
 ```bash
-kubectl fleet approve clusterapprovalrequest --hubClusterContext <hub-cluster-context> --name <approval-request-name>
+kubectl fleet approve clusterapprovalrequest --hub-cluster-context <hub-cluster-context> --name <approval-request-name>
 # Or using alias:
-kubectl fleet approve careq --hubClusterContext <hub-cluster-context> --name <approval-request-name>
+kubectl fleet approve careq --hub-cluster-context <hub-cluster-context> --name <approval-request-name>
 ```
 
 **Namespace-scoped (ApprovalRequest):**
 ```bash
-kubectl fleet approve approvalrequest --hubClusterContext <hub-cluster-context> --name <approval-request-name> --namespace <namespace>
+kubectl fleet approve approvalrequest --hub-cluster-context <hub-cluster-context> --name <approval-request-name> --namespace <namespace>
 # Or using alias:
-kubectl fleet approve areq --hubClusterContext <hub-cluster-context> --name <approval-request-name> -n <namespace>
+kubectl fleet approve areq --hub-cluster-context <hub-cluster-context> --name <approval-request-name> -n <namespace>
 ```
 
 Example:
 ```bash
 # Approve a ClusterApprovalRequest
-kubectl fleet approve clusterapprovalrequest --hubClusterContext hub --name my-approval-request
+kubectl fleet approve clusterapprovalrequest --hub-cluster-context hub --name my-approval-request
 
 # Approve an ApprovalRequest in a specific namespace
-kubectl fleet approve approvalrequest --hubClusterContext hub --name my-approval-request --namespace my-namespace
+kubectl fleet approve approvalrequest --hub-cluster-context hub --name my-approval-request --namespace my-namespace
 ```
 
 ### Drain a Member Cluster
@@ -98,12 +98,12 @@ kubectl fleet approve approvalrequest --hubClusterContext hub --name my-approval
 Use the `draincluster` subcommand to remove all resources propagated to a member cluster from the hub cluster by any `Placement` resource. This is useful when you want to temporarily move all workloads off a member cluster in preparation for an event like upgrade or reconfiguration.
 
 ```bash
-kubectl fleet draincluster --hubClusterContext <hub-cluster-context> --clusterName <memberClusterName>
+kubectl fleet draincluster --hub-cluster-context <hub-cluster-context> --cluster-name <memberClusterName>
 ```
 
 Example:
 ```bash
-kubectl fleet draincluster --hubClusterContext hub --clusterName member-cluster-1
+kubectl fleet draincluster --hub-cluster-context hub --cluster-name member-cluster-1
 ```
 
 ### Uncordon a Member Cluster
@@ -111,12 +111,12 @@ kubectl fleet draincluster --hubClusterContext hub --clusterName member-cluster-
 Use the `uncordoncluster` subcommand to uncordon a member cluster that has been previously drained, allowing resources to be propagated to the cluster again.
 
 ```bash
-kubectl fleet uncordoncluster --hubClusterContext <hub-cluster-context> --clusterName <memberClusterName>
+kubectl fleet uncordoncluster --hub-cluster-context <hub-cluster-context> --cluster-name <memberClusterName>
 ```
 
 Example:
 ```bash
-kubectl fleet uncordoncluster --hubClusterContext hub --clusterName member-cluster-1
+kubectl fleet uncordoncluster --hub-cluster-context hub --cluster-name member-cluster-1
 ```
 
 ## Subcommands
@@ -156,13 +156,13 @@ If the `cordon` taint is not present on the member cluster, the command will hav
 ## Flags
 
 The `approve` subcommand uses the following flags:
-- `--hubClusterContext`: kubectl context for the hub cluster (required)
+- `--hub-cluster-context`: kubectl context for the hub cluster (required)
 - `--name`: name of the resource to approve (required)
 - `--namespace`, `-n`: namespace of the resource to approve (required for `approvalrequest`, not allowed for `clusterapprovalrequest`)
 
 Both `draincluster` and `uncordoncluster` subcommands use the following flags:
-- `--hubClusterContext`: kubectl context for the hub cluster (required)
-- `--clusterName`: name of the member cluster to operate on (required)
+- `--hub-cluster-context`: kubectl context for the hub cluster (required)
+- `--cluster-name`: name of the member cluster to operate on (required)
 
 ## Examples
 
@@ -173,31 +173,31 @@ Both `draincluster` and `uncordoncluster` subcommands use the following flags:
 kubectl config get-contexts
 
 # 2. Drain a cluster for maintenance
-kubectl fleet draincluster --hubClusterContext production-hub --clusterName worker-node-1
+kubectl fleet draincluster --hub-cluster-context production-hub --cluster-name worker-node-1
 
 # 3. Perform maintenance on the worker-node-1 cluster
 # ... maintenance operations ...
 
 # 4. After maintenance, uncordon the cluster to allow workloads back
-kubectl fleet uncordoncluster --hubClusterContext production-hub --clusterName worker-node-1
+kubectl fleet uncordoncluster --hub-cluster-context production-hub --cluster-name worker-node-1
 ```
 
 ### Additional Examples
 
 ```bash
 # Approve a ClusterApprovalRequest for staged updates
-kubectl fleet approve clusterapprovalrequest --hubClusterContext hub --name update-approval-stage-1
+kubectl fleet approve clusterapprovalrequest --hub-cluster-context hub --name update-approval-stage-1
 
 # Approve a ApprovalRequest for staged updates
-kubectl fleet approve approvalrequest --hubClusterContext hub --name update-approval-stage-1 --namespace test-namespace
+kubectl fleet approve approvalrequest --hub-cluster-context hub --name update-approval-stage-1 --namespace test-namespace
 
 # Drain multiple clusters (run separately for each cluster)
-kubectl fleet draincluster --hubClusterContext hub --clusterName east-cluster
-kubectl fleet draincluster --hubClusterContext hub --clusterName west-cluster
+kubectl fleet draincluster --hub-cluster-context hub --cluster-name east-cluster
+kubectl fleet draincluster --hub-cluster-context hub --cluster-name west-cluster
 
 # Uncordon clusters after maintenance
-kubectl fleet uncordoncluster --hubClusterContext hub --clusterName east-cluster
-kubectl fleet uncordoncluster --hubClusterContext hub --clusterName west-cluster
+kubectl fleet uncordoncluster --hub-cluster-context hub --cluster-name east-cluster
+kubectl fleet uncordoncluster --hub-cluster-context hub --cluster-name west-cluster
 ```
 
 ## Troubleshooting
diff --git a/tools/fleet/cmd/approve/approve.go b/tools/fleet/cmd/approve/approve.go
index 72834bd9c..aa97605c7 100644
--- a/tools/fleet/cmd/approve/approve.go
+++ b/tools/fleet/cmd/approve/approve.go
@@ -20,11 +20,11 @@ import (
 	"context"
 	"fmt"
 	"log"
+	"time"
 
 	"github.com/spf13/cobra"
 	"k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/util/retry"
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -87,6 +87,7 @@ type approveOptions struct {
 	hubClusterContext string
 	name              string
 	namespace         string
+	timeout           time.Duration
 
 	hubClient client.Client
 }
@@ -119,16 +120,19 @@ For namespace-scoped resources (approvalrequest), you must also specify the --na
 			if err := o.setupClient(); err != nil {
 				return err
 			}
-			return o.run(cmd.Context(), cfg)
+			ctx, cancel := context.WithTimeout(cmd.Context(), o.timeout)
+			defer cancel()
+			return o.run(ctx, cfg)
 		},
 	}
 
-	cmd.Flags().StringVar(&o.hubClusterContext, "hubClusterContext", "", "The name of the kubeconfig context to use for the hub cluster")
+	cmd.Flags().StringVar(&o.hubClusterContext, "hub-cluster-context", "", "The name of the kubeconfig context to use for the hub cluster")
 	cmd.Flags().StringVar(&o.name, "name", "", "The name of the resource to approve")
 	cmd.Flags().StringVarP(&o.namespace, "namespace", "n", "", "The namespace of the resource to approve (required for namespace-scoped resources)")
+	cmd.Flags().DurationVar(&o.timeout, "timeout", 5*time.Minute, "Maximum time to wait for the operation to complete")
 
 	// Mark required flags.
-	_ = cmd.MarkFlagRequired("hubClusterContext")
+	_ = cmd.MarkFlagRequired("hub-cluster-context")
 	_ = cmd.MarkFlagRequired("name")
 
 	return cmd
@@ -208,10 +212,9 @@ func (o *approveOptions) approveApprovalRequest(ctx context.Context) error {
 
 // setupClient creates and configures the Kubernetes client
 func (o *approveOptions) setupClient() error {
-	scheme := runtime.NewScheme()
-
-	if err := placementv1beta1.AddToScheme(scheme); err != nil {
-		return fmt.Errorf("failed to add custom APIs (placement) to the runtime scheme: %w", err)
+	scheme, err := toolsutils.NewFleetScheme()
+	if err != nil {
+		return fmt.Errorf("failed to create runtime scheme: %w", err)
 	}
 
 	hubClient, err := toolsutils.GetClusterClientFromClusterContext(o.hubClusterContext, scheme)
diff --git a/tools/fleet/cmd/draincluster/draincluster.go b/tools/fleet/cmd/draincluster/draincluster.go
index 9616bd35b..7dc6f334b 100644
--- a/tools/fleet/cmd/draincluster/draincluster.go
+++ b/tools/fleet/cmd/draincluster/draincluster.go
@@ -20,11 +20,11 @@ import (
 	"context"
 	"fmt"
 	"log"
+	"time"
 
 	"github.com/spf13/cobra"
 	k8errors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/uuid"
 	"k8s.io/apimachinery/pkg/util/validation"
@@ -49,6 +49,7 @@ const (
 type drainOptions struct {
 	hubClusterContext string
 	clusterName       string
+	timeout           time.Duration
 
 	hubClient client.Client
 }
@@ -70,18 +71,20 @@ func NewCmdDrainCluster() *cobra.Command {
 	}
 
 	// Add flags specific to drain command
-	cmd.Flags().StringVar(&o.hubClusterContext, "hubClusterContext", "", "kubectl context for the hub cluster (required)")
-	cmd.Flags().StringVar(&o.clusterName, "clusterName", "", "name of the member cluster (required)")
+	cmd.Flags().StringVar(&o.hubClusterContext, "hub-cluster-context", "", "kubectl context for the hub cluster (required)")
+	cmd.Flags().StringVar(&o.clusterName, "cluster-name", "", "name of the member cluster (required)")
+	cmd.Flags().DurationVar(&o.timeout, "timeout", 5*time.Minute, "Maximum time to wait for the operation to complete")
 
 	// Mark required flags
-	_ = cmd.MarkFlagRequired("hubClusterContext")
-	_ = cmd.MarkFlagRequired("clusterName")
+	_ = cmd.MarkFlagRequired("hub-cluster-context")
+	_ = cmd.MarkFlagRequired("cluster-name")
 
 	return cmd
 }
 
 func (o *drainOptions) runDrain() error {
-	ctx := context.Background()
+	ctx, cancel := context.WithTimeout(context.Background(), o.timeout)
+	defer cancel()
 
 	isDrainSuccessful, err := o.drain(ctx)
 	if err != nil {
@@ -111,13 +114,9 @@ func (o *drainOptions) runDrain() error {
 
 // setupClient creates and configures the Kubernetes client
 func (o *drainOptions) setupClient() error {
-	scheme := runtime.NewScheme()
-
-	if err := clusterv1beta1.AddToScheme(scheme); err != nil {
-		return fmt.Errorf("failed to add custom APIs (cluster) to the runtime scheme: %w", err)
-	}
-	if err := placementv1beta1.AddToScheme(scheme); err != nil {
-		return fmt.Errorf("failed to add custom APIs (placement) to the runtime scheme: %w", err)
+	scheme, err := toolsutils.NewFleetScheme()
+	if err != nil {
+		return fmt.Errorf("failed to create runtime scheme: %w", err)
 	}
 
 	hubClient, err := toolsutils.GetClusterClientFromClusterContext(o.hubClusterContext, scheme)
diff --git a/tools/fleet/cmd/uncordoncluster/uncordoncluster.go b/tools/fleet/cmd/uncordoncluster/uncordoncluster.go
index aba52f121..32b7d1779 100644
--- a/tools/fleet/cmd/uncordoncluster/uncordoncluster.go
+++ b/tools/fleet/cmd/uncordoncluster/uncordoncluster.go
@@ -20,15 +20,14 @@ import (
 	"context"
 	"fmt"
 	"log"
+	"time"
 
 	"github.com/spf13/cobra"
-	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/util/retry"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
 	clusterv1beta1 "github.com/kubefleet-dev/kubefleet/apis/cluster/v1beta1"
-	placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1"
 	toolsutils "github.com/kubefleet-dev/kubefleet/tools/utils"
 )
 
@@ -36,6 +35,7 @@ import (
 type uncordonOptions struct {
 	hubClusterContext string
 	clusterName       string
+	timeout           time.Duration
 
 	hubClient client.Client
 }
@@ -57,18 +57,21 @@ func NewCmdUncordonCluster() *cobra.Command {
 	}
 
 	// Add flags specific to uncordon command
-	cmd.Flags().StringVar(&o.hubClusterContext, "hubClusterContext", "", "kubectl context for the hub cluster (required)")
-	cmd.Flags().StringVar(&o.clusterName, "clusterName", "", "name of the member cluster (required)")
+	cmd.Flags().StringVar(&o.hubClusterContext, "hub-cluster-context", "", "kubectl context for the hub cluster (required)")
+	cmd.Flags().StringVar(&o.clusterName, "cluster-name", "", "name of the member cluster (required)")
+	cmd.Flags().DurationVar(&o.timeout, "timeout", 5*time.Minute, "Maximum time to wait for the operation to complete")
 
 	// Mark required flags
-	_ = cmd.MarkFlagRequired("hubClusterContext")
-	_ = cmd.MarkFlagRequired("clusterName")
+	_ = cmd.MarkFlagRequired("hub-cluster-context")
+	_ = cmd.MarkFlagRequired("cluster-name")
 
 	return cmd
 }
 
 func (o *uncordonOptions) runUncordon() error {
-	ctx := context.Background()
+	ctx, cancel := context.WithTimeout(context.Background(), o.timeout)
+	defer cancel()
+
 	if err := o.uncordon(ctx); err != nil {
 		return fmt.Errorf("failed to uncordon cluster %s: %w", o.clusterName, err)
 	}
@@ -79,13 +82,9 @@ func (o *uncordonOptions) runUncordon() error {
 
 // setupClient creates and configures the Kubernetes client
 func (o *uncordonOptions) setupClient() error {
-	scheme := runtime.NewScheme()
-
-	if err := clusterv1beta1.AddToScheme(scheme); err != nil {
-		return fmt.Errorf("failed to add custom APIs (cluster) to the runtime scheme: %w", err)
-	}
-	if err := placementv1beta1.AddToScheme(scheme); err != nil {
-		return fmt.Errorf("failed to add custom APIs (placement) to the runtime scheme: %w", err)
+	scheme, err := toolsutils.NewFleetScheme()
+	if err != nil {
+		return fmt.Errorf("failed to create runtime scheme: %w", err)
 	}
 
 	hubClient, err := toolsutils.GetClusterClientFromClusterContext(o.hubClusterContext, scheme)
diff --git a/tools/fleet/main.go b/tools/fleet/main.go
index 7c614bf4e..7f3156ad4 100644
--- a/tools/fleet/main.go
+++ b/tools/fleet/main.go
@@ -17,6 +17,7 @@ limitations under the License.
 package main
 
 import (
+	"fmt"
 	"log"
 
 	"github.com/spf13/cobra"
@@ -26,6 +27,12 @@ import (
 	"github.com/kubefleet-dev/kubefleet/tools/fleet/cmd/uncordoncluster"
 )
 
+// These variables are set via ldflags at build time.
+var (
+	version = "dev"
+	commit  = "unknown"
+)
+
 func main() {
 	rootCmd := &cobra.Command{
 		Use:   "kubectl-fleet",
@@ -33,6 +40,15 @@ func main() {
 		Long:  "kubectl-fleet is a kubectl plugin for managing KubeFleet member clusters",
 	}
 
+	// Add version subcommand
+	rootCmd.AddCommand(&cobra.Command{
+		Use:   "version",
+		Short: "Print the version information",
+		Run: func(cmd *cobra.Command, args []string) {
+			fmt.Printf("kubectl-fleet %s (%s)\n", version, commit)
+		},
+	})
+
 	// Add subcommands
 	rootCmd.AddCommand(approve.NewCmdApprove())
 	rootCmd.AddCommand(draincluster.NewCmdDrainCluster())
diff --git a/tools/utils/common.go b/tools/utils/common.go
index 6439dee14..d62ddae4d 100644
--- a/tools/utils/common.go
+++ b/tools/utils/common.go
@@ -13,6 +13,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
 	clusterv1beta1 "github.com/kubefleet-dev/kubefleet/apis/cluster/v1beta1"
+	placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1"
 )
 
 var (
@@ -24,6 +25,18 @@ var (
 	}
 )
 
+// NewFleetScheme creates a runtime.Scheme with all Fleet API types registered.
+func NewFleetScheme() (*runtime.Scheme, error) {
+	scheme := runtime.NewScheme()
+	if err := clusterv1beta1.AddToScheme(scheme); err != nil {
+		return nil, err
+	}
+	if err := placementv1beta1.AddToScheme(scheme); err != nil {
+		return nil, err
+	}
+	return scheme, nil
+}
+
 // GetClusterClientFromClusterContext creates a new client.Client for the given cluster context and scheme.
 func GetClusterClientFromClusterContext(clusterContext string, scheme *runtime.Scheme) (client.Client, error) {
 	clusterConfig := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(

From 447629b09f73a2cfc91de244cd27fe00f27a3feb Mon Sep 17 00:00:00 2001
From: Yetkin Timocin <ytimocin@microsoft.com>
Date: Thu, 30 Apr 2026 16:24:34 -0700
Subject: [PATCH 06/19] fix: don't treat in-progress binding states as terminal
 failures (#670)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

HasBindingFailed previously returned true for any condition with
Status=False, regardless of the Reason. Several in-progress reasons on
the per-cluster binding conditions (Overridden, WorkSynchronized,
Applied, Available) are expressed as Status=False rather than
Status=Unknown — most notably WorkNotSynchronizedYetReason and
NotAvailableYetReason. The over-broad classification caused upstream
controllers (rollout, updaterun) to bail on bindings that were still
making forward progress.

Switch HasBindingFailed to consult an explicit allowlist of
non-terminal reasons (nonTerminalBindingFailureReasons) drawn from the
condition package: OverriddenPendingReason, WorkNotSynchronizedYetReason,
ApplyPendingReason, NotAvailableYetReason. Anything outside that set is
treated as terminal — the safer default, since wrongly classifying an
unknown reason as "transient" would stall rollouts forever.

Per code review feedback (the original draft used a `strings.Contains`
on the literal "failed", which is fragile against typos and convention
drift), the allowlist uses typed constants. New "Pending" / "NotYet"
reasons added to pkg/utils/condition must also be added to the
allowlist; this is documented inline.

Tests:

- TestHasBindingFailed_NonTerminalReasons covers each known transient
  reason returning false, each known terminal reason returning true,
  an unknown-reason fall-through to terminal, and a mixed-condition
  case where a transient reason on one condition does not mask a
  terminal failure on another.
- Existing TestHasBindingFailed (which uses camelCase reason strings
  not in the allowlist) still passes — those legacy strings are
  correctly classified as terminal.
- TestCheckClusterUpdateResult in the updaterun package was encoding
  the previous bug: it asserted that WorkNotSynchronizedYet should
  produce an error from checkClusterUpdateResult. Updated the case to
  match corrected semantics (in-progress, no error) and added a new
  case for the terminal SyncWorkFailedReason path. This aligns with
  the existing TODO at execution.go:626 that called for distinguishing
  recoverable from non-recoverable failures.

Fixes #648

Signed-off-by: Yetkin Timocin <ytimocin@microsoft.com>
Co-authored-by: Claude Code <noreply@anthropic.com>
---
 pkg/controllers/updaterun/execution_test.go | 23 ++++-
 pkg/utils/binding/binding.go                | 47 ++++++++--
 pkg/utils/binding/binding_test.go           | 97 +++++++++++++++++++++
 3 files changed, 161 insertions(+), 6 deletions(-)

diff --git a/pkg/controllers/updaterun/execution_test.go b/pkg/controllers/updaterun/execution_test.go
index daedfdfe3..c65d8fb1f 100644
--- a/pkg/controllers/updaterun/execution_test.go
+++ b/pkg/controllers/updaterun/execution_test.go
@@ -242,7 +242,9 @@ func TestCheckClusterUpdateResult(t *testing.T) {
 			wantErr:       true,
 		},
 		{
-			name: "checkClusterUpdateResult should return false and error if the binding has false workSynchronized condition",
+			// WorkNotSynchronizedYet is an in-progress reason, not a terminal failure (issue #648).
+			// checkClusterUpdateResult now treats it as "still updating" rather than an error.
+			name: "checkClusterUpdateResult should return false but no error if workSynchronized is in-progress (WorkNotSynchronizedYet)",
 			binding: &placementv1beta1.ClusterResourceBinding{
 				ObjectMeta: metav1.ObjectMeta{Generation: 1},
 				Status: placementv1beta1.ResourceBindingStatus{
@@ -258,6 +260,25 @@ func TestCheckClusterUpdateResult(t *testing.T) {
 			},
 			clusterStatus: &placementv1beta1.ClusterUpdatingStatus{ClusterName: "test-cluster"},
 			wantSucceeded: false,
+			wantErr:       false,
+		},
+		{
+			name: "checkClusterUpdateResult should return false and error if workSynchronized failed terminally (SyncWorkFailed)",
+			binding: &placementv1beta1.ClusterResourceBinding{
+				ObjectMeta: metav1.ObjectMeta{Generation: 1},
+				Status: placementv1beta1.ResourceBindingStatus{
+					Conditions: []metav1.Condition{
+						{
+							Type:               string(placementv1beta1.ResourceBindingWorkSynchronized),
+							Status:             metav1.ConditionFalse,
+							ObservedGeneration: 1,
+							Reason:             condition.SyncWorkFailedReason,
+						},
+					},
+				},
+			},
+			clusterStatus: &placementv1beta1.ClusterUpdatingStatus{ClusterName: "test-cluster"},
+			wantSucceeded: false,
 			wantErr:       true,
 		},
 		{
diff --git a/pkg/utils/binding/binding.go b/pkg/utils/binding/binding.go
index 7fb449615..c4eacea44 100644
--- a/pkg/utils/binding/binding.go
+++ b/pkg/utils/binding/binding.go
@@ -23,14 +23,51 @@ import (
 	"github.com/kubefleet-dev/kubefleet/pkg/utils/condition"
 )
 
-// HasBindingFailed checks if BindingObj has failed based on its conditions.
+// nonTerminalBindingFailureReasons is the closed set of `Reason` strings that the per-cluster
+// binding conditions (Overridden, WorkSynchronized, Applied, Available) can carry while still
+// `Status=False` *without* representing a terminal failure. Each one means the binding is still
+// progressing and the controller should keep waiting rather than treat the binding as failed.
+//
+// The set is intentionally an allowlist: any new `Status=False` reason added to the codebase is
+// treated as terminal until it is explicitly added here. This is the safer default because the
+// alternative (treat unknown reasons as transient) would stall the rollout controller forever
+// on a genuinely-failing binding it does not yet know how to classify.
+//
+// When you add a new in-progress / pending / "not yet" reason to pkg/utils/condition/reason.go
+// for any of the above conditions, also add it here.
+var nonTerminalBindingFailureReasons = map[string]struct{}{
+	condition.OverriddenPendingReason:      {},
+	condition.WorkNotSynchronizedYetReason: {},
+	condition.ApplyPendingReason:           {},
+	condition.NotAvailableYetReason:        {},
+}
+
+// HasBindingFailed reports whether a binding has reached a terminal failure on any of its
+// per-cluster conditions (Overridden, WorkSynchronized, Applied, Available).
+//
+// A condition counts as a terminal failure when its `Status` is `False` *and* its `Reason` is
+// not in `nonTerminalBindingFailureReasons`. The Reason check is necessary because several of
+// the in-progress states (e.g. `WorkNotSynchronizedYetReason`, `NotAvailableYetReason`) are
+// expressed as `Status=False` rather than `Status=Unknown`. Treating those as failures was the
+// previous bug — it caused the rollout controller to drop bindings that were still progressing
+// out of `canBeReadyBindings`, stalling rollout decisions.
+//
+// Unknown reasons fall through to "terminal" by design; see the comment on
+// `nonTerminalBindingFailureReasons`.
 func HasBindingFailed(binding placementv1beta1.BindingObj) bool {
 	for i := condition.OverriddenCondition; i <= condition.AvailableCondition; i++ {
-		if condition.IsConditionStatusFalse(binding.GetCondition(string(i.ResourceBindingConditionType())), binding.GetGeneration()) {
-			// TODO: parse the reason of the condition to see if the failure is recoverable/retriable or not
-			klog.V(2).Infof("binding %s has condition %s with status false", klog.KObj(binding), string(i.ResourceBindingConditionType()))
-			return true
+		c := binding.GetCondition(string(i.ResourceBindingConditionType()))
+		if !condition.IsConditionStatusFalse(c, binding.GetGeneration()) {
+			continue
+		}
+		if _, transient := nonTerminalBindingFailureReasons[c.Reason]; transient {
+			klog.V(3).Infof("binding %s has condition %s status false with non-terminal reason %q; treating as in-progress",
+				klog.KObj(binding), string(i.ResourceBindingConditionType()), c.Reason)
+			continue
 		}
+		klog.V(2).Infof("binding %s has terminal failure on condition %s (reason %q)",
+			klog.KObj(binding), string(i.ResourceBindingConditionType()), c.Reason)
+		return true
 	}
 	return false
 }
diff --git a/pkg/utils/binding/binding_test.go b/pkg/utils/binding/binding_test.go
index b190f4c1e..722d0c340 100644
--- a/pkg/utils/binding/binding_test.go
+++ b/pkg/utils/binding/binding_test.go
@@ -22,6 +22,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
 	placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1"
+	"github.com/kubefleet-dev/kubefleet/pkg/utils/condition"
 )
 
 func TestHasBindingFailed(t *testing.T) {
@@ -326,6 +327,102 @@ func TestHasBindingFailed(t *testing.T) {
 	}
 }
 
+// TestHasBindingFailed_NonTerminalReasons verifies that conditions with Status=False but a
+// known in-progress reason (WorkNotSynchronizedYet, NotAvailableYet, OverriddenPending,
+// ApplyPending) are NOT treated as failures. This is the bug fix for issue #648.
+func TestHasBindingFailed_NonTerminalReasons(t *testing.T) {
+	bindingWithCondition := func(condType string, reason string) *placementv1beta1.ClusterResourceBinding {
+		return &placementv1beta1.ClusterResourceBinding{
+			ObjectMeta: metav1.ObjectMeta{Name: "test-binding", Generation: 1},
+			Status: placementv1beta1.ResourceBindingStatus{
+				Conditions: []metav1.Condition{
+					{
+						Type:               condType,
+						Status:             metav1.ConditionFalse,
+						ObservedGeneration: 1,
+						Reason:             reason,
+						Message:            "in progress",
+					},
+				},
+			},
+		}
+	}
+
+	tests := []struct {
+		name    string
+		binding *placementv1beta1.ClusterResourceBinding
+		want    bool
+	}{
+		{
+			name:    "WorkSynchronized=False with WorkNotSynchronizedYet is in-progress, not a failure",
+			binding: bindingWithCondition(string(placementv1beta1.ResourceBindingWorkSynchronized), condition.WorkNotSynchronizedYetReason),
+			want:    false,
+		},
+		{
+			name:    "Available=False with NotAvailableYet is in-progress, not a failure",
+			binding: bindingWithCondition(string(placementv1beta1.ResourceBindingAvailable), condition.NotAvailableYetReason),
+			want:    false,
+		},
+		{
+			name:    "Overridden=False with OverriddenPending is in-progress, not a failure",
+			binding: bindingWithCondition(string(placementv1beta1.ResourceBindingOverridden), condition.OverriddenPendingReason),
+			want:    false,
+		},
+		{
+			name:    "Applied=False with ApplyPending is in-progress, not a failure",
+			binding: bindingWithCondition(string(placementv1beta1.ResourceBindingApplied), condition.ApplyPendingReason),
+			want:    false,
+		},
+		{
+			name:    "Applied=False with ApplyFailed is a terminal failure",
+			binding: bindingWithCondition(string(placementv1beta1.ResourceBindingApplied), condition.ApplyFailedReason),
+			want:    true,
+		},
+		{
+			name:    "Overridden=False with OverriddenFailed is a terminal failure",
+			binding: bindingWithCondition(string(placementv1beta1.ResourceBindingOverridden), condition.OverriddenFailedReason),
+			want:    true,
+		},
+		{
+			name:    "Status=False with an unknown reason defaults to terminal (safer default)",
+			binding: bindingWithCondition(string(placementv1beta1.ResourceBindingApplied), "SomeReasonNobodyAddedToTheAllowlist"),
+			want:    true,
+		},
+		{
+			name: "non-terminal reason on one condition does not mask a terminal failure on another",
+			binding: &placementv1beta1.ClusterResourceBinding{
+				ObjectMeta: metav1.ObjectMeta{Name: "test-binding", Generation: 1},
+				Status: placementv1beta1.ResourceBindingStatus{
+					Conditions: []metav1.Condition{
+						{
+							Type:               string(placementv1beta1.ResourceBindingOverridden),
+							Status:             metav1.ConditionFalse,
+							ObservedGeneration: 1,
+							Reason:             condition.OverriddenPendingReason,
+						},
+						{
+							Type:               string(placementv1beta1.ResourceBindingApplied),
+							Status:             metav1.ConditionFalse,
+							ObservedGeneration: 1,
+							Reason:             condition.ApplyFailedReason,
+						},
+					},
+				},
+			},
+			want: true,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got := HasBindingFailed(tc.binding)
+			if got != tc.want {
+				t.Errorf("HasBindingFailed(%v) = %v, want %v", tc.binding, got, tc.want)
+			}
+		})
+	}
+}
+
 func TestIsBindingReportDiff(t *testing.T) {
 	tests := []struct {
 		name    string

From 97f9ed4afc1c3a12e53bc7cc288fd7905af4ede8 Mon Sep 17 00:00:00 2001
From: Copilot <198982749+Copilot@users.noreply.github.com>
Date: Wed, 6 May 2026 10:51:53 -0700
Subject: [PATCH 07/19] fix: correct TimedWait waitTime regex to reject invalid
 duration strings (#688)

---
 apis/placement/v1/stageupdate_types.go        |   3 +-
 apis/placement/v1alpha1/stagedupdate_types.go |   2 +-
 apis/placement/v1beta1/stageupdate_types.go   |   3 +-
 ...etes-fleet.io_clusterstagedupdateruns.yaml |  34 ++--
 ...leet.io_clusterstagedupdatestrategies.yaml |  34 ++--
 ....kubernetes-fleet.io_stagedupdateruns.yaml |  32 ++--
 ...netes-fleet.io_stagedupdatestrategies.yaml |  32 ++--
 .../api_validation_integration_test.go        | 172 ++++++++++++++++++
 8 files changed, 243 insertions(+), 69 deletions(-)

diff --git a/apis/placement/v1/stageupdate_types.go b/apis/placement/v1/stageupdate_types.go
index afcc88f96..2e06af937 100644
--- a/apis/placement/v1/stageupdate_types.go
+++ b/apis/placement/v1/stageupdate_types.go
@@ -209,7 +209,8 @@ type StageTask struct {
 	Type StageTaskType `json:"type"`
 
 	// The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
-	// +kubebuilder:validation:Pattern="^0|([0-9]+(\\.[0-9]+)?(s|m|h))+$"
+	// Only hours (h), minutes (m), and seconds (s) units are accepted.
+	// +kubebuilder:validation:Pattern="^(?:(?:0|[1-9][0-9]*)(\\.[0-9]+)?(?:s|m|h))+$"
 	// +kubebuilder:validation:Type=string
 	// +kubebuilder:validation:Optional
 	WaitTime *metav1.Duration `json:"waitTime,omitempty"`
diff --git a/apis/placement/v1alpha1/stagedupdate_types.go b/apis/placement/v1alpha1/stagedupdate_types.go
index b6389a5d4..e8644d357 100644
--- a/apis/placement/v1alpha1/stagedupdate_types.go
+++ b/apis/placement/v1alpha1/stagedupdate_types.go
@@ -153,7 +153,7 @@ type AfterStageTask struct {
 	Type AfterStageTaskType `json:"type"`
 
 	// The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
-	// +kubebuilder:validation:Pattern="^0|([0-9]+(\\.[0-9]+)?(s|m|h))+$"
+	// +kubebuilder:validation:Pattern="^(?:0|(?:(?:0|[1-9][0-9]*)(\\.[0-9]+)?(?:s|m|h))+)$"
 	// +kubebuilder:validation:Type=string
 	// +kubebuilder:validation:Optional
 	WaitTime metav1.Duration `json:"waitTime,omitempty"`
diff --git a/apis/placement/v1beta1/stageupdate_types.go b/apis/placement/v1beta1/stageupdate_types.go
index 6e26851b9..0054bba0e 100644
--- a/apis/placement/v1beta1/stageupdate_types.go
+++ b/apis/placement/v1beta1/stageupdate_types.go
@@ -347,7 +347,8 @@ type StageTask struct {
 	Type StageTaskType `json:"type"`
 
 	// The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
-	// +kubebuilder:validation:Pattern="^0|([0-9]+(\\.[0-9]+)?(s|m|h))+$"
+	// Only hours (h), minutes (m), and seconds (s) units are accepted.
+	// +kubebuilder:validation:Pattern="^(?:(?:0|[1-9][0-9]*)(\\.[0-9]+)?(?:s|m|h))+$"
 	// +kubebuilder:validation:Type=string
 	// +kubebuilder:validation:Optional
 	WaitTime *metav1.Duration `json:"waitTime,omitempty"`
diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml
index 2b34db4d3..27dc987da 100644
--- a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml
+++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml
@@ -845,10 +845,10 @@ spec:
                                 - Approval
                                 type: string
                               waitTime:
-                                description: The time to wait after all the clusters
-                                  in the current stage complete the update before
-                                  moving to the next stage.
-                                pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                                description: |-
+                                  The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                                  Only hours (h), minutes (m), and seconds (s) units are accepted.
+                                pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                                 type: string
                             required:
                             - type
@@ -879,10 +879,10 @@ spec:
                                 - Approval
                                 type: string
                               waitTime:
-                                description: The time to wait after all the clusters
-                                  in the current stage complete the update before
-                                  moving to the next stage.
-                                pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                                description: |-
+                                  The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                                  Only hours (h), minutes (m), and seconds (s) units are accepted.
+                                pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                                 type: string
                             required:
                             - type
@@ -2070,7 +2070,7 @@ spec:
                                 description: The time to wait after all the clusters
                                   in the current stage complete the update before
                                   moving to the next stage.
-                                pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                                pattern: ^(?:0|(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+)$
                                 type: string
                             required:
                             - type
@@ -3268,10 +3268,10 @@ spec:
                                 - Approval
                                 type: string
                               waitTime:
-                                description: The time to wait after all the clusters
-                                  in the current stage complete the update before
-                                  moving to the next stage.
-                                pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                                description: |-
+                                  The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                                  Only hours (h), minutes (m), and seconds (s) units are accepted.
+                                pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                                 type: string
                             required:
                             - type
@@ -3302,10 +3302,10 @@ spec:
                                 - Approval
                                 type: string
                               waitTime:
-                                description: The time to wait after all the clusters
-                                  in the current stage complete the update before
-                                  moving to the next stage.
-                                pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                                description: |-
+                                  The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                                  Only hours (h), minutes (m), and seconds (s) units are accepted.
+                                pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                                 type: string
                             required:
                             - type
diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml
index 9be8428dc..b134a75b5 100644
--- a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml
+++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml
@@ -70,10 +70,10 @@ spec:
                             - Approval
                             type: string
                           waitTime:
-                            description: The time to wait after all the clusters in
-                              the current stage complete the update before moving
-                              to the next stage.
-                            pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                            description: |-
+                              The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                              Only hours (h), minutes (m), and seconds (s) units are accepted.
+                            pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                             type: string
                         required:
                         - type
@@ -101,10 +101,10 @@ spec:
                             - Approval
                             type: string
                           waitTime:
-                            description: The time to wait after all the clusters in
-                              the current stage complete the update before moving
-                              to the next stage.
-                            pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                            description: |-
+                              The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                              Only hours (h), minutes (m), and seconds (s) units are accepted.
+                            pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                             type: string
                         required:
                         - type
@@ -264,7 +264,7 @@ spec:
                             description: The time to wait after all the clusters in
                               the current stage complete the update before moving
                               to the next stage.
-                            pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                            pattern: ^(?:0|(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+)$
                             type: string
                         required:
                         - type
@@ -399,10 +399,10 @@ spec:
                             - Approval
                             type: string
                           waitTime:
-                            description: The time to wait after all the clusters in
-                              the current stage complete the update before moving
-                              to the next stage.
-                            pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                            description: |-
+                              The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                              Only hours (h), minutes (m), and seconds (s) units are accepted.
+                            pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                             type: string
                         required:
                         - type
@@ -430,10 +430,10 @@ spec:
                             - Approval
                             type: string
                           waitTime:
-                            description: The time to wait after all the clusters in
-                              the current stage complete the update before moving
-                              to the next stage.
-                            pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                            description: |-
+                              The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                              Only hours (h), minutes (m), and seconds (s) units are accepted.
+                            pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                             type: string
                         required:
                         - type
diff --git a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml
index ff9152c9d..81b506f4a 100644
--- a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml
+++ b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml
@@ -845,10 +845,10 @@ spec:
                                 - Approval
                                 type: string
                               waitTime:
-                                description: The time to wait after all the clusters
-                                  in the current stage complete the update before
-                                  moving to the next stage.
-                                pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                                description: |-
+                                  The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                                  Only hours (h), minutes (m), and seconds (s) units are accepted.
+                                pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                                 type: string
                             required:
                             - type
@@ -879,10 +879,10 @@ spec:
                                 - Approval
                                 type: string
                               waitTime:
-                                description: The time to wait after all the clusters
-                                  in the current stage complete the update before
-                                  moving to the next stage.
-                                pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                                description: |-
+                                  The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                                  Only hours (h), minutes (m), and seconds (s) units are accepted.
+                                pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                                 type: string
                             required:
                             - type
@@ -2188,10 +2188,10 @@ spec:
                                 - Approval
                                 type: string
                               waitTime:
-                                description: The time to wait after all the clusters
-                                  in the current stage complete the update before
-                                  moving to the next stage.
-                                pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                                description: |-
+                                  The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                                  Only hours (h), minutes (m), and seconds (s) units are accepted.
+                                pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                                 type: string
                             required:
                             - type
@@ -2222,10 +2222,10 @@ spec:
                                 - Approval
                                 type: string
                               waitTime:
-                                description: The time to wait after all the clusters
-                                  in the current stage complete the update before
-                                  moving to the next stage.
-                                pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                                description: |-
+                                  The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                                  Only hours (h), minutes (m), and seconds (s) units are accepted.
+                                pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                                 type: string
                             required:
                             - type
diff --git a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml
index 5315f7322..ccfb44e85 100644
--- a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml
+++ b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml
@@ -70,10 +70,10 @@ spec:
                             - Approval
                             type: string
                           waitTime:
-                            description: The time to wait after all the clusters in
-                              the current stage complete the update before moving
-                              to the next stage.
-                            pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                            description: |-
+                              The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                              Only hours (h), minutes (m), and seconds (s) units are accepted.
+                            pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                             type: string
                         required:
                         - type
@@ -101,10 +101,10 @@ spec:
                             - Approval
                             type: string
                           waitTime:
-                            description: The time to wait after all the clusters in
-                              the current stage complete the update before moving
-                              to the next stage.
-                            pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                            description: |-
+                              The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                              Only hours (h), minutes (m), and seconds (s) units are accepted.
+                            pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                             type: string
                         required:
                         - type
@@ -259,10 +259,10 @@ spec:
                             - Approval
                             type: string
                           waitTime:
-                            description: The time to wait after all the clusters in
-                              the current stage complete the update before moving
-                              to the next stage.
-                            pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                            description: |-
+                              The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                              Only hours (h), minutes (m), and seconds (s) units are accepted.
+                            pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                             type: string
                         required:
                         - type
@@ -290,10 +290,10 @@ spec:
                             - Approval
                             type: string
                           waitTime:
-                            description: The time to wait after all the clusters in
-                              the current stage complete the update before moving
-                              to the next stage.
-                            pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                            description: |-
+                              The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                              Only hours (h), minutes (m), and seconds (s) units are accepted.
+                            pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                             type: string
                         required:
                         - type
diff --git a/test/apis/placement/v1beta1/api_validation_integration_test.go b/test/apis/placement/v1beta1/api_validation_integration_test.go
index aae616ef7..797cab861 100644
--- a/test/apis/placement/v1beta1/api_validation_integration_test.go
+++ b/test/apis/placement/v1beta1/api_validation_integration_test.go
@@ -27,6 +27,7 @@ import (
 	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
 	k8sErrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
 	"k8s.io/apimachinery/pkg/util/intstr"
 
 	placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1"
@@ -1685,6 +1686,60 @@ var _ = Describe("Test placement v1beta1 API validation", func() {
 			Expect(*strategy.Spec.Stages[0].MaxConcurrency).Should(Equal(maxConcurrency))
 			Expect(hubClient.Delete(ctx, &strategy)).Should(Succeed())
 		})
+
+		It("Should allow creation of ClusterStagedUpdateStrategy with waitTime set to '0s'", func() {
+			obj := &unstructured.Unstructured{
+				Object: map[string]interface{}{
+					"apiVersion": "placement.kubernetes-fleet.io/v1beta1",
+					"kind":       "ClusterStagedUpdateStrategy",
+					"metadata": map[string]interface{}{
+						"name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()),
+					},
+					"spec": map[string]interface{}{
+						"stages": []interface{}{
+							map[string]interface{}{
+								"name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1),
+								"afterStageTasks": []interface{}{
+									map[string]interface{}{
+										"type":     "TimedWait",
+										"waitTime": "0s",
+									},
+								},
+							},
+						},
+					},
+				},
+			}
+			Expect(hubClient.Create(ctx, obj)).Should(Succeed())
+			Expect(hubClient.Delete(ctx, obj)).Should(Succeed())
+		})
+
+		It("Should allow creation of ClusterStagedUpdateStrategy with waitTime as a compound duration '1h30m'", func() {
+			obj := &unstructured.Unstructured{
+				Object: map[string]interface{}{
+					"apiVersion": "placement.kubernetes-fleet.io/v1beta1",
+					"kind":       "ClusterStagedUpdateStrategy",
+					"metadata": map[string]interface{}{
+						"name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()),
+					},
+					"spec": map[string]interface{}{
+						"stages": []interface{}{
+							map[string]interface{}{
+								"name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1),
+								"afterStageTasks": []interface{}{
+									map[string]interface{}{
+										"type":     "TimedWait",
+										"waitTime": "1h30m",
+									},
+								},
+							},
+						},
+					},
+				},
+			}
+			Expect(hubClient.Create(ctx, obj)).Should(Succeed())
+			Expect(hubClient.Delete(ctx, obj)).Should(Succeed())
+		})
 	})
 
 	Context("Test ClusterStagedUpdateStrategy API validation - invalid cases", func() {
@@ -2096,6 +2151,123 @@ var _ = Describe("Test placement v1beta1 API validation", func() {
 			Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{})))
 			Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].maxConcurrency in body should match"))
 		})
+
+		It("Should deny creation of ClusterStagedUpdateStrategy with invalid waitTime '0abc'", func() {
+			obj := &unstructured.Unstructured{
+				Object: map[string]interface{}{
+					"apiVersion": "placement.kubernetes-fleet.io/v1beta1",
+					"kind":       "ClusterStagedUpdateStrategy",
+					"metadata": map[string]interface{}{
+						"name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()),
+					},
+					"spec": map[string]interface{}{
+						"stages": []interface{}{
+							map[string]interface{}{
+								"name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1),
+								"afterStageTasks": []interface{}{
+									map[string]interface{}{
+										"type":     "TimedWait",
+										"waitTime": "0abc",
+									},
+								},
+							},
+						},
+					},
+				},
+			}
+			err := hubClient.Create(ctx, obj)
+			var statusErr *k8sErrors.StatusError
+			Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{})))
+			Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].afterStageTasks\\[0\\].waitTime in body should match"))
+		})
+
+		It("Should deny creation of ClusterStagedUpdateStrategy with invalid waitTime '1' (number without unit)", func() {
+			obj := &unstructured.Unstructured{
+				Object: map[string]interface{}{
+					"apiVersion": "placement.kubernetes-fleet.io/v1beta1",
+					"kind":       "ClusterStagedUpdateStrategy",
+					"metadata": map[string]interface{}{
+						"name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()),
+					},
+					"spec": map[string]interface{}{
+						"stages": []interface{}{
+							map[string]interface{}{
+								"name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1),
+								"afterStageTasks": []interface{}{
+									map[string]interface{}{
+										"type":     "TimedWait",
+										"waitTime": "1",
+									},
+								},
+							},
+						},
+					},
+				},
+			}
+			err := hubClient.Create(ctx, obj)
+			var statusErr *k8sErrors.StatusError
+			Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{})))
+			Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].afterStageTasks\\[0\\].waitTime in body should match"))
+		})
+
+		It("Should deny creation of ClusterStagedUpdateStrategy with invalid waitTime '1d' (unsupported unit)", func() {
+			obj := &unstructured.Unstructured{
+				Object: map[string]interface{}{
+					"apiVersion": "placement.kubernetes-fleet.io/v1beta1",
+					"kind":       "ClusterStagedUpdateStrategy",
+					"metadata": map[string]interface{}{
+						"name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()),
+					},
+					"spec": map[string]interface{}{
+						"stages": []interface{}{
+							map[string]interface{}{
+								"name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1),
+								"afterStageTasks": []interface{}{
+									map[string]interface{}{
+										"type":     "TimedWait",
+										"waitTime": "1d",
+									},
+								},
+							},
+						},
+					},
+				},
+			}
+			err := hubClient.Create(ctx, obj)
+			var statusErr *k8sErrors.StatusError
+			Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{})))
+			Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].afterStageTasks\\[0\\].waitTime in body should match"))
+		})
+
+		It("Should deny creation of ClusterStagedUpdateStrategy with invalid waitTime '01h' (leading zero)", func() {
+			obj := &unstructured.Unstructured{
+				Object: map[string]interface{}{
+					"apiVersion": "placement.kubernetes-fleet.io/v1beta1",
+					"kind":       "ClusterStagedUpdateStrategy",
+					"metadata": map[string]interface{}{
+						"name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()),
+					},
+					"spec": map[string]interface{}{
+						"stages": []interface{}{
+							map[string]interface{}{
+								"name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1),
+								"afterStageTasks": []interface{}{
+									map[string]interface{}{
+										"type":     "TimedWait",
+										"waitTime": "01h",
+									},
+								},
+							},
+						},
+					},
+				},
+			}
+			err := hubClient.Create(ctx, obj)
+			var statusErr *k8sErrors.StatusError
+			Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{})))
+			Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].afterStageTasks\\[0\\].waitTime in body should match"))
+		})
+
 	})
 
 	Context("Test ClusterApprovalRequest API validation - valid cases", func() {

From 1c525ee64405b9036efe86903b500c8fdaaed70d Mon Sep 17 00:00:00 2001
From: Wei Weng <wengwei01@gmail.com>
Date: Thu, 7 May 2026 06:59:15 -0400
Subject: [PATCH 08/19] chore: Move NamespaceWithResourceSelectors to v1 (#697)

---
 .../v1/clusterresourceplacement_types.go      | 74 ++++++++++++++++++-
 ...tes-fleet.io_clusterresourceoverrides.yaml | 19 +++++
 ...t.io_clusterresourceoverridesnapshots.yaml | 19 +++++
 ...es-fleet.io_clusterresourceplacements.yaml | 38 ++++++++++
 ...ubernetes-fleet.io_resourceplacements.yaml | 38 ++++++++++
 5 files changed, 184 insertions(+), 4 deletions(-)

diff --git a/apis/placement/v1/clusterresourceplacement_types.go b/apis/placement/v1/clusterresourceplacement_types.go
index b6d452ae8..bf4086ecc 100644
--- a/apis/placement/v1/clusterresourceplacement_types.go
+++ b/apis/placement/v1/clusterresourceplacement_types.go
@@ -123,13 +123,15 @@ type ClusterResourcePlacement struct {
 }
 
 // PlacementSpec defines the desired state of ClusterResourcePlacement and ResourcePlacement.
+// +kubebuilder:validation:XValidation:rule="size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && x.group == \"\" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope == 'NamespaceWithResourceSelectors')) <= 1",message="only one namespace selector with NamespaceWithResourceSelectors mode is allowed"
+// +kubebuilder:validation:XValidation:rule="size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && x.group == \"\" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope == 'NamespaceWithResourceSelectors')) == 0 || (size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && x.group == \"\" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope == 'NamespaceWithResourceSelectors' && has(x.name) && size(x.name) > 0 && !has(x.labelSelector))) == 1)",message="namespace selector with NamespaceWithResourceSelectors mode must select by name (not by label)"
+// +kubebuilder:validation:XValidation:rule="size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && x.group == \"\" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope == 'NamespaceWithResourceSelectors')) == 0 || size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && x.group == \"\" && x.version == 'v1')) == 1",message="when using NamespaceWithResourceSelectors mode, only one namespace selector is allowed (cannot mix with other namespace selectors)"
 type PlacementSpec struct {
-	// +kubebuilder:validation:MinItems=1
-	// +kubebuilder:validation:MaxItems=100
-
 	// ResourceSelectors is an array of selectors used to select cluster scoped resources. The selectors are `ORed`.
 	// You can have 1-100 selectors.
 	// +kubebuilder:validation:Required
+	// +kubebuilder:validation:MinItems=1
+	// +kubebuilder:validation:MaxItems=100
 	ResourceSelectors []ResourceSelectorTerm `json:"resourceSelectors"`
 
 	// Policy defines how to select member clusters to place the selected resources.
@@ -185,6 +187,24 @@ type ResourceSelectorTerm struct {
 	// For ClusterResourcePlacement, you can use SelectionScope to control what gets selected:
 	// - NamespaceOnly: Only the namespace object itself
 	// - NamespaceWithResources: The namespace AND all resources within it (default)
+	// - NamespaceWithResourceSelectors: The namespace AND resources specified by additional selectors
+	//
+	// When SelectionScope is NamespaceWithResourceSelectors, you can define additional ResourceSelectorTerms
+	// (after the namespace selector) to specify which resources to include. These additional selectors can
+	// target both namespace-scoped resources (within the selected namespace) and cluster-scoped resources.
+	//
+	// Important requirements for NamespaceWithResourceSelectors mode:
+	// - Exactly one namespace selector with this mode is allowed
+	// - The namespace selector must select by name (not by label)
+	// - Only one namespace selector is allowed when using this mode (cannot mix with other namespace selectors)
+	// - All requirements are validated via CEL at API validation time
+	// - If the selected namespace is deleted after CRP creation, the controller will report an error condition
+	//
+	// Example using NamespaceWithResourceSelectors:
+	// - Namespace selector: {Group: "", Version: "v1", Kind: "Namespace", Name: "prod", SelectionScope: "NamespaceWithResourceSelectors"}
+	// - Additional selector: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {app: "frontend"}}
+	// - Third selector: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "admin"}
+	// This selects: the "prod" namespace, all Deployments with label app=frontend in "prod", and the "admin" ClusterRole.
 	//
 	// +kubebuilder:validation:Required
 	Kind string `json:"kind"`
@@ -204,7 +224,7 @@ type ResourceSelectorTerm struct {
 	// SelectionScope defines the scope of resource selections when the Kind is `namespace`.
 	// This field is only applicable when Kind is "Namespace" and is ignored for other resource kinds.
 	// See the Kind field documentation for detailed examples and usage patterns.
-	// +kubebuilder:validation:Enum=NamespaceOnly;NamespaceWithResources
+	// +kubebuilder:validation:Enum=NamespaceOnly;NamespaceWithResources;NamespaceWithResourceSelectors
 	// +kubebuilder:default=NamespaceWithResources
 	// +kubebuilder:validation:Optional
 	SelectionScope SelectionScope `json:"selectionScope,omitempty"`
@@ -230,6 +250,52 @@ const (
 	// Note: This is the default value. When you select a namespace without specifying SelectionScope,
 	// this mode is used automatically.
 	NamespaceWithResources SelectionScope = "NamespaceWithResources"
+
+	// NamespaceWithResourceSelectors allows fine-grained selection of specific resources within a namespace.
+	// The namespace itself is always selected, and you can optionally specify which resources to include
+	// by adding additional ResourceSelectorTerm entries after the namespace selector.
+	//
+	// Use cases:
+	// 1. Select only specific resource types from a namespace (e.g., only Deployments and Services)
+	// 2. Select resources matching certain labels within a namespace
+	// 3. Include specific cluster-scoped resources along with namespace-scoped resources
+	//
+	// How "additional selectors" work:
+	// - Exactly one namespace selector with NamespaceWithResourceSelectors mode is required
+	// - This selector must select a namespace by name (label selectors not allowed)
+	// - ADDITIONAL selectors specify which resources to include:
+	//   - Namespace-scoped resources are filtered to only those within the selected namespace
+	//   - Cluster-scoped resources are included as specified (not limited to the namespace)
+	// - If no additional selectors are provided, only the namespace object itself is selected
+	//
+	// Example 1 - Select specific deployments from a namespace:
+	//   Selector 1: {Group: "", Version: "v1", Kind: "Namespace", Name: "production", SelectionScope: "NamespaceWithResourceSelectors"}
+	//   Selector 2: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {tier: "frontend"}}
+	//   Result: The "production" namespace + all Deployments labeled tier=frontend within "production"
+	//
+	// Example 2 - Select namespace with multiple resource types:
+	//   Selector 1: {Group: "", Version: "v1", Kind: "Namespace", Name: "app", SelectionScope: "NamespaceWithResourceSelectors"}
+	//   Selector 2: {Group: "apps", Version: "v1", Kind: "Deployment"}
+	//   Selector 3: {Group: "", Version: "v1", Kind: "Service"}
+	//   Result: The "app" namespace + ALL Deployments and Services within "app"
+	//
+	// Example 3 - Include cluster-scoped resources:
+	//   Selector 1: {Group: "", Version: "v1", Kind: "Namespace", Name: "app", SelectionScope: "NamespaceWithResourceSelectors"}
+	//   Selector 2: {Group: "apps", Version: "v1", Kind: "Deployment"}
+	//   Selector 3: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "app-admin"}
+	//   Result: The "app" namespace + ALL Deployments in "app" + the "app-admin" ClusterRole
+	//
+	// Important constraints:
+	// - Exactly ONE namespace selector with NamespaceWithResourceSelectors mode is allowed
+	// - The namespace selector must select by name (label selectors not allowed)
+	// - Only ONE namespace selector total is allowed when using this mode (cannot mix with other namespace selectors)
+	// - All constraints are enforced via CEL at API validation time
+	//
+	// Runtime behavior:
+	// - If the selected namespace is deleted after the CRP is created, the controller will detect this during
+	//   the next reconciliation and report an error condition in the CRP status
+	// - The CRP will transition to a failed state until the namespace is recreated or the CRP is updated
+	NamespaceWithResourceSelectors SelectionScope = "NamespaceWithResourceSelectors"
 )
 
 // PlacementPolicy contains the rules to select target member clusters to place the selected resources.
diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml
index 79927f6d1..f3f4cdfef 100644
--- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml
+++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml
@@ -72,6 +72,24 @@ spec:
                         For ClusterResourcePlacement, you can use SelectionScope to control what gets selected:
                         - NamespaceOnly: Only the namespace object itself
                         - NamespaceWithResources: The namespace AND all resources within it (default)
+                        - NamespaceWithResourceSelectors: The namespace AND resources specified by additional selectors
+
+                        When SelectionScope is NamespaceWithResourceSelectors, you can define additional ResourceSelectorTerms
+                        (after the namespace selector) to specify which resources to include. These additional selectors can
+                        target both namespace-scoped resources (within the selected namespace) and cluster-scoped resources.
+
+                        Important requirements for NamespaceWithResourceSelectors mode:
+                        - Exactly one namespace selector with this mode is allowed
+                        - The namespace selector must select by name (not by label)
+                        - Only one namespace selector is allowed when using this mode (cannot mix with other namespace selectors)
+                        - All requirements are validated via CEL at API validation time
+                        - If the selected namespace is deleted after CRP creation, the controller will report an error condition
+
+                        Example using NamespaceWithResourceSelectors:
+                        - Namespace selector: {Group: "", Version: "v1", Kind: "Namespace", Name: "prod", SelectionScope: "NamespaceWithResourceSelectors"}
+                        - Additional selector: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {app: "frontend"}}
+                        - Third selector: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "admin"}
+                        This selects: the "prod" namespace, all Deployments with label app=frontend in "prod", and the "admin" ClusterRole.
                       type: string
                     labelSelector:
                       description: |-
@@ -133,6 +151,7 @@ spec:
                       enum:
                       - NamespaceOnly
                       - NamespaceWithResources
+                      - NamespaceWithResourceSelectors
                       type: string
                     version:
                       description: Version of the resource to be selected.
diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml
index 6b80c6832..a57b5b3d2 100644
--- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml
+++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml
@@ -86,6 +86,24 @@ spec:
                             For ClusterResourcePlacement, you can use SelectionScope to control what gets selected:
                             - NamespaceOnly: Only the namespace object itself
                             - NamespaceWithResources: The namespace AND all resources within it (default)
+                            - NamespaceWithResourceSelectors: The namespace AND resources specified by additional selectors
+
+                            When SelectionScope is NamespaceWithResourceSelectors, you can define additional ResourceSelectorTerms
+                            (after the namespace selector) to specify which resources to include. These additional selectors can
+                            target both namespace-scoped resources (within the selected namespace) and cluster-scoped resources.
+
+                            Important requirements for NamespaceWithResourceSelectors mode:
+                            - Exactly one namespace selector with this mode is allowed
+                            - The namespace selector must select by name (not by label)
+                            - Only one namespace selector is allowed when using this mode (cannot mix with other namespace selectors)
+                            - All requirements are validated via CEL at API validation time
+                            - If the selected namespace is deleted after CRP creation, the controller will report an error condition
+
+                            Example using NamespaceWithResourceSelectors:
+                            - Namespace selector: {Group: "", Version: "v1", Kind: "Namespace", Name: "prod", SelectionScope: "NamespaceWithResourceSelectors"}
+                            - Additional selector: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {app: "frontend"}}
+                            - Third selector: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "admin"}
+                            This selects: the "prod" namespace, all Deployments with label app=frontend in "prod", and the "admin" ClusterRole.
                           type: string
                         labelSelector:
                           description: |-
@@ -147,6 +165,7 @@ spec:
                           enum:
                           - NamespaceOnly
                           - NamespaceWithResources
+                          - NamespaceWithResourceSelectors
                           type: string
                         version:
                           description: Version of the resource to be selected.
diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml
index 8ffe14382..5c586555d 100644
--- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml
+++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml
@@ -554,6 +554,24 @@ spec:
                         For ClusterResourcePlacement, you can use SelectionScope to control what gets selected:
                         - NamespaceOnly: Only the namespace object itself
                         - NamespaceWithResources: The namespace AND all resources within it (default)
+                        - NamespaceWithResourceSelectors: The namespace AND resources specified by additional selectors
+
+                        When SelectionScope is NamespaceWithResourceSelectors, you can define additional ResourceSelectorTerms
+                        (after the namespace selector) to specify which resources to include. These additional selectors can
+                        target both namespace-scoped resources (within the selected namespace) and cluster-scoped resources.
+
+                        Important requirements for NamespaceWithResourceSelectors mode:
+                        - Exactly one namespace selector with this mode is allowed
+                        - The namespace selector must select by name (not by label)
+                        - Only one namespace selector is allowed when using this mode (cannot mix with other namespace selectors)
+                        - All requirements are validated via CEL at API validation time
+                        - If the selected namespace is deleted after CRP creation, the controller will report an error condition
+
+                        Example using NamespaceWithResourceSelectors:
+                        - Namespace selector: {Group: "", Version: "v1", Kind: "Namespace", Name: "prod", SelectionScope: "NamespaceWithResourceSelectors"}
+                        - Additional selector: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {app: "frontend"}}
+                        - Third selector: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "admin"}
+                        This selects: the "prod" namespace, all Deployments with label app=frontend in "prod", and the "admin" ClusterRole.
                       type: string
                     labelSelector:
                       description: |-
@@ -615,6 +633,7 @@ spec:
                       enum:
                       - NamespaceOnly
                       - NamespaceWithResources
+                      - NamespaceWithResourceSelectors
                       type: string
                     version:
                       description: Version of the resource to be selected.
@@ -961,6 +980,25 @@ spec:
             - message: statusReportingScope is immutable
               rule: '!has(oldSelf.statusReportingScope) || self.statusReportingScope
                 == oldSelf.statusReportingScope'
+            - message: only one namespace selector with NamespaceWithResourceSelectors
+                mode is allowed
+              rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' &&
+                x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope
+                == 'NamespaceWithResourceSelectors')) <= 1
+            - message: namespace selector with NamespaceWithResourceSelectors mode
+                must select by name (not by label)
+              rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' &&
+                x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope
+                == 'NamespaceWithResourceSelectors')) == 0 || (size(self.resourceSelectors.filter(x,
+                x.kind == 'Namespace' && x.group == "" && x.version == 'v1' && has(x.selectionScope)
+                && x.selectionScope == 'NamespaceWithResourceSelectors' && has(x.name)
+                && size(x.name) > 0 && !has(x.labelSelector))) == 1)
+            - message: when using NamespaceWithResourceSelectors mode, only one namespace
+                selector is allowed (cannot mix with other namespace selectors)
+              rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' &&
+                x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope
+                == 'NamespaceWithResourceSelectors')) == 0 || size(self.resourceSelectors.filter(x,
+                x.kind == 'Namespace' && x.group == "" && x.version == 'v1')) == 1
           status:
             description: The observed status of ClusterResourcePlacement.
             properties:
diff --git a/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml b/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml
index 8ab1049c5..db0cdea5d 100644
--- a/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml
+++ b/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml
@@ -546,6 +546,24 @@ spec:
                         For ClusterResourcePlacement, you can use SelectionScope to control what gets selected:
                         - NamespaceOnly: Only the namespace object itself
                         - NamespaceWithResources: The namespace AND all resources within it (default)
+                        - NamespaceWithResourceSelectors: The namespace AND resources specified by additional selectors
+
+                        When SelectionScope is NamespaceWithResourceSelectors, you can define additional ResourceSelectorTerms
+                        (after the namespace selector) to specify which resources to include. These additional selectors can
+                        target both namespace-scoped resources (within the selected namespace) and cluster-scoped resources.
+
+                        Important requirements for NamespaceWithResourceSelectors mode:
+                        - Exactly one namespace selector with this mode is allowed
+                        - The namespace selector must select by name (not by label)
+                        - Only one namespace selector is allowed when using this mode (cannot mix with other namespace selectors)
+                        - All requirements are validated via CEL at API validation time
+                        - If the selected namespace is deleted after CRP creation, the controller will report an error condition
+
+                        Example using NamespaceWithResourceSelectors:
+                        - Namespace selector: {Group: "", Version: "v1", Kind: "Namespace", Name: "prod", SelectionScope: "NamespaceWithResourceSelectors"}
+                        - Additional selector: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {app: "frontend"}}
+                        - Third selector: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "admin"}
+                        This selects: the "prod" namespace, all Deployments with label app=frontend in "prod", and the "admin" ClusterRole.
                       type: string
                     labelSelector:
                       description: |-
@@ -607,6 +625,7 @@ spec:
                       enum:
                       - NamespaceOnly
                       - NamespaceWithResources
+                      - NamespaceWithResourceSelectors
                       type: string
                     version:
                       description: Version of the resource to be selected.
@@ -946,6 +965,25 @@ spec:
             x-kubernetes-validations:
             - message: policy cannot be removed once set
               rule: '!(has(oldSelf.policy) && !has(self.policy))'
+            - message: only one namespace selector with NamespaceWithResourceSelectors
+                mode is allowed
+              rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' &&
+                x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope
+                == 'NamespaceWithResourceSelectors')) <= 1
+            - message: namespace selector with NamespaceWithResourceSelectors mode
+                must select by name (not by label)
+              rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' &&
+                x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope
+                == 'NamespaceWithResourceSelectors')) == 0 || (size(self.resourceSelectors.filter(x,
+                x.kind == 'Namespace' && x.group == "" && x.version == 'v1' && has(x.selectionScope)
+                && x.selectionScope == 'NamespaceWithResourceSelectors' && has(x.name)
+                && size(x.name) > 0 && !has(x.labelSelector))) == 1)
+            - message: when using NamespaceWithResourceSelectors mode, only one namespace
+                selector is allowed (cannot mix with other namespace selectors)
+              rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' &&
+                x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope
+                == 'NamespaceWithResourceSelectors')) == 0 || size(self.resourceSelectors.filter(x,
+                x.kind == 'Namespace' && x.group == "" && x.version == 'v1')) == 1
           status:
             description: The observed status of ResourcePlacement.
             properties:

From dd2db6f74e20ccd612712a22b9e738a815f28016 Mon Sep 17 00:00:00 2001
From: Yetkin Timocin <ytimocin@microsoft.com>
Date: Fri, 8 May 2026 19:34:59 -0700
Subject: [PATCH 09/19] refactor: extract IsLatest/LookupLatest policy snapshot
 helpers (#667)

* refactor: extract IsLatest/LookupLatest policy snapshot helpers

Move the two duplicated "latest policy snapshot" lookups into
pkg/utils/controller/policy_snapshot_resolver.go so both the scheduler
and the policy-snapshot watcher share the implementation.

Adds two helpers:

- IsLatestPolicySnapshot(snapshot) (bool, error): inspects the
  IsLatestSnapshot label. Returns (false, ErrUnexpectedBehavior-wrapped)
  for missing or non-bool values. The watcher's Reconcile now uses this
  in place of inline label parsing.

- LookupLatestPolicySnapshot(ctx, client.Reader, placement)
  (PolicySnapshotObj, error): wraps the existing FetchLatestPolicySnapshot
  and enforces the "exactly one latest snapshot" invariant. Returns a
  plain error for the (transient) zero case and ErrUnexpectedBehavior for
  the >1 case. The scheduler's lookupLatestPolicySnapshot now delegates
  to this and keeps only its own log + retry handling.

The watcher's buildCustomPredicate continues to do its own ParseBool
because it compares old-vs-new label transitions, which doesn't fit the
helper's signature.

Adds table-driven tests for both helpers covering: label-true/false,
missing label, nil labels map, malformed value, exactly-one snapshot,
zero snapshots, multiple snapshots, ignoring snapshots from other
placements, and ignoring non-latest snapshots. Drops two now-unused
imports (strconv, labels) from scheduler.go.

Fixes #643

Co-Authored-By: Claude Code <noreply@anthropic.com>
Signed-off-by: Yetkin Timocin <ytimocin@microsoft.com>

* chore: address review feedback

Signed-off-by: Yetkin Timocin <ytimocin@microsoft.com>

* test: align integration substring with new error wording

Signed-off-by: Yetkin Timocin <ytimocin@microsoft.com>

* test: cover LookupLatestPolicySnapshot list-error branch

Address codecov/patch shortfall on PR #667. The List-failure path in
controller.LookupLatestPolicySnapshot and the corresponding wrapper
in scheduler.lookupLatestPolicySnapshot were not exercised by any
existing unit test. Add an interceptor-based test in each package
that injects a non-StatusError on List, asserts (nil, error) and
that the error wraps ErrUnexpectedBehavior (NewAPIServerError
promotes non-Status errors via isUnexpectedCacheError).

Signed-off-by: Yetkin Timocin <ytimocin@microsoft.com>

---------

Signed-off-by: Yetkin Timocin <ytimocin@microsoft.com>
Co-authored-by: Claude Code <noreply@anthropic.com>
---
 pkg/controllers/placement/controller.go       |  32 +-
 pkg/controllers/updaterun/initialization.go   |  26 +-
 .../initialization_integration_test.go        |   4 +-
 .../updaterun/validation_integration_test.go  |   2 +-
 pkg/scheduler/scheduler.go                    |  62 +--
 pkg/scheduler/scheduler_test.go               |  39 ++
 .../schedulingpolicysnapshot/watcher.go       |  27 +-
 pkg/utils/controller/controller.go            |  13 +
 .../controller/policy_snapshot_resolver.go    |  73 ++-
 .../policy_snapshot_resolver_test.go          | 440 ++++++++++--------
 10 files changed, 406 insertions(+), 312 deletions(-)

diff --git a/pkg/controllers/placement/controller.go b/pkg/controllers/placement/controller.go
index a41d8f4c9..e6a80a899 100644
--- a/pkg/controllers/placement/controller.go
+++ b/pkg/controllers/placement/controller.go
@@ -507,25 +507,23 @@ func (r *Reconciler) ensureLatestPolicySnapshot(ctx context.Context, placementOb
 // 2 & 3 should never happen.
 func (r *Reconciler) lookupLatestSchedulingPolicySnapshot(ctx context.Context, placement fleetv1beta1.PlacementObj) (fleetv1beta1.PolicySnapshotObj, int, error) {
 	placementKey := types.NamespacedName{Name: placement.GetName(), Namespace: placement.GetNamespace()}
-	snapshotList, err := controller.FetchLatestPolicySnapshot(ctx, r.Client, placementKey)
-	if err != nil {
-		return nil, -1, err
-	}
 	placementKObj := klog.KObj(placement)
-	policySnapshotItems := snapshotList.GetPolicySnapshotObjs()
-	if len(policySnapshotItems) == 1 {
-		policyIndex, err := labels.ParsePolicyIndexFromLabel(policySnapshotItems[0])
-		if err != nil {
-			klog.ErrorS(err, "Failed to parse the policy index label", "placement", placementKObj, "policySnapshot", klog.KObj(policySnapshotItems[0]))
-			return nil, -1, controller.NewUnexpectedBehaviorError(err)
-		}
-		return policySnapshotItems[0], policyIndex, nil
-	} else if len(policySnapshotItems) > 1 {
-		// It means there are multiple active snapshots and should never happen.
-		err := fmt.Errorf("there are %d active schedulingPolicySnapshots owned by placement %v", len(policySnapshotItems), placementKey)
-		klog.ErrorS(err, "Invalid schedulingPolicySnapshots", "placement", placementKObj)
-		return nil, -1, controller.NewUnexpectedBehaviorError(err)
+	activeSnapshot, err := controller.LookupLatestPolicySnapshot(ctx, r.Client, placementKey)
+	switch {
+	case err == nil:
+		policyIndex, parseErr := labels.ParsePolicyIndexFromLabel(activeSnapshot)
+		if parseErr != nil {
+			klog.ErrorS(parseErr, "Failed to parse the policy index label", "placement", placementKObj, "policySnapshot", klog.KObj(activeSnapshot))
+			return nil, -1, controller.NewUnexpectedBehaviorError(parseErr)
+		}
+		return activeSnapshot, policyIndex, nil
+	case errors.Is(err, controller.ErrNoLatestPolicySnapshot):
+		// No active snapshot found; recover below by picking the largest-index snapshot.
+	default:
+		// API or invariant-violation error — propagate.
+		return nil, -1, err
 	}
+
 	// When there are no active snapshots, find the one who has the largest policy index.
 	// It should be rare only when placement controller is crashed before creating the new active snapshot.
 	sortedList, err := r.listSortedSchedulingPolicySnapshots(ctx, placement)
diff --git a/pkg/controllers/updaterun/initialization.go b/pkg/controllers/updaterun/initialization.go
index c41c2174a..4eca2b1d9 100644
--- a/pkg/controllers/updaterun/initialization.go
+++ b/pkg/controllers/updaterun/initialization.go
@@ -125,29 +125,19 @@ func (r *Reconciler) determinePolicySnapshot(
 	updateRunRef := klog.KObj(updateRun)
 
 	// Get the latest policy snapshot.
-	policySnapshotList, err := controller.FetchLatestPolicySnapshot(ctx, r.Client, placementKey)
+	latestPolicySnapshot, err := controller.LookupLatestPolicySnapshot(ctx, r.Client, placementKey)
 	if err != nil {
-		klog.ErrorS(err, "Failed to list the latest policy snapshots", "placement", placementKey, "updateRun", updateRunRef)
-		// err can be retried.
-		return nil, -1, controller.NewAPIServerError(true, err)
-	}
-
-	policySnapshotObjs := policySnapshotList.GetPolicySnapshotObjs()
-	if len(policySnapshotObjs) != 1 {
-		if len(policySnapshotObjs) > 1 {
-			err := controller.NewUnexpectedBehaviorError(fmt.Errorf("more than one (%d in actual) latest policy snapshots associated with the placement: `%s`", len(policySnapshotObjs), placementKey))
-			klog.ErrorS(err, "Failed to find the latest policy snapshot", "placement", placementKey, "updateRun", updateRunRef)
-			// no more retries for this error.
+		klog.ErrorS(err, "Failed to find the latest policy snapshot", "placement", placementKey, "updateRun", updateRunRef)
+		// Missing or duplicate-active snapshots are placement-state invariants the run cannot
+		// recover from on its own — fail validation. Anything else (e.g. List errors classified as
+		// ErrAPIServerError, or unexpected cache failures promoted to ErrUnexpectedBehavior) is
+		// propagated raw so controller-runtime retries.
+		if errors.Is(err, controller.ErrNoLatestPolicySnapshot) || errors.Is(err, controller.ErrMultipleActivePolicySnapshots) {
 			return nil, -1, fmt.Errorf("%w: %s", errValidationFailed, err.Error())
 		}
-		// no latest policy snapshot found.
-		err := fmt.Errorf("no latest policy snapshot associated with the placement: `%s`", placementKey)
-		klog.ErrorS(err, "Failed to find the latest policy snapshot", "placement", placementKey, "updateRun", updateRunRef)
-		// again, no more retries here.
-		return nil, -1, fmt.Errorf("%w: %s", errValidationFailed, err.Error())
+		return nil, -1, err
 	}
 
-	latestPolicySnapshot := policySnapshotObjs[0]
 	policySnapshotRef := klog.KObj(latestPolicySnapshot)
 	policyIndex, foundIndex := latestPolicySnapshot.GetLabels()[placementv1beta1.PolicyIndexLabel]
 	if !foundIndex || len(policyIndex) == 0 {
diff --git a/pkg/controllers/updaterun/initialization_integration_test.go b/pkg/controllers/updaterun/initialization_integration_test.go
index 0516cb9e5..f208fc67a 100644
--- a/pkg/controllers/updaterun/initialization_integration_test.go
+++ b/pkg/controllers/updaterun/initialization_integration_test.go
@@ -210,7 +210,7 @@ var _ = Describe("Updaterun initialization tests", func() {
 			Expect(k8sClient.Create(ctx, updateRun)).To(Succeed())
 
 			By("Validating the initialization failed")
-			validateFailedInitCondition(ctx, updateRun, "no latest policy snapshot associated")
+			validateFailedInitCondition(ctx, updateRun, "no latest policy snapshot found")
 		})
 
 		It("Should fail to initialize if there are multiple latest policy snapshots", func() {
@@ -223,7 +223,7 @@ var _ = Describe("Updaterun initialization tests", func() {
 			Expect(k8sClient.Create(ctx, updateRun)).To(Succeed())
 
 			By("Validating the initialization failed")
-			validateFailedInitCondition(ctx, updateRun, "more than one (2 in actual) latest policy snapshots associated")
+			validateFailedInitCondition(ctx, updateRun, "multiple active policy snapshots found")
 
 			By("Deleting the second scheduling policy snapshot")
 			Expect(k8sClient.Delete(ctx, snapshot2)).Should(Succeed())
diff --git a/pkg/controllers/updaterun/validation_integration_test.go b/pkg/controllers/updaterun/validation_integration_test.go
index 11265bd3a..fe2182573 100644
--- a/pkg/controllers/updaterun/validation_integration_test.go
+++ b/pkg/controllers/updaterun/validation_integration_test.go
@@ -219,7 +219,7 @@ var _ = Describe("UpdateRun validation tests", func() {
 
 				By("Validating the validation failed")
 				wantStatus = generateFailedValidationStatus(updateRun, wantStatus)
-				validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "no latest policy snapshot associated")
+				validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "no latest policy snapshot found")
 
 				By("Checking update run status metrics are emitted")
 				validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateFailedMetric(placementv1beta1.StateRun, updateRun, string(failureType)))
diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go
index 672cd0460..30cc6e84e 100644
--- a/pkg/scheduler/scheduler.go
+++ b/pkg/scheduler/scheduler.go
@@ -21,12 +21,10 @@ import (
 	"context"
 	"errors"
 	"fmt"
-	"strconv"
 	"sync"
 	"time"
 
 	apiErrors "k8s.io/apimachinery/pkg/api/errors"
-	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/types"
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 	"k8s.io/client-go/tools/record"
@@ -347,57 +345,21 @@ func (s *Scheduler) cleanUpAllBindingsFor(ctx context.Context, placement fleetv1
 }
 
 // lookupLatestPolicySnapshot returns the latest (i.e., active) policy snapshot associated with a placement.
-// TODO(ryan): move this to a common lib
+//
+// It delegates to controller.LookupLatestPolicySnapshot so the same lookup can be reused outside the
+// scheduler. The scheduler lists with a cached client; this does not have any consistency concern as
+// sources will always trigger the scheduler when a new policy snapshot is created.
+//
+// The "no latest snapshot" case can happen transiently when a placement is newly created or its
+// latest snapshot is being replaced; the scheduler will be triggered again when the situation
+// corrects itself, so this is logged and surfaced as a non-unexpected error.
 func (s *Scheduler) lookupLatestPolicySnapshot(ctx context.Context, placement fleetv1beta1.PlacementObj) (fleetv1beta1.PolicySnapshotObj, error) {
-	placementRef := klog.KObj(placement)
-	// Prepare the list options to filter policy snapshots by the placement name and the latest snapshot label.
-	var listOptions []client.ListOption
-	labelSelector := labels.SelectorFromSet(labels.Set{
-		fleetv1beta1.PlacementTrackingLabel: placement.GetName(),
-		fleetv1beta1.IsLatestSnapshotLabel:  strconv.FormatBool(true),
-	})
-	listOptions = append(listOptions, &client.ListOptions{LabelSelector: labelSelector})
-	// Find out the latest policy snapshot associated with the placement.
-	var policySnapshotList fleetv1beta1.PolicySnapshotList
-	if placement.GetNamespace() == "" {
-		policySnapshotList = &fleetv1beta1.ClusterSchedulingPolicySnapshotList{}
-	} else {
-		policySnapshotList = &fleetv1beta1.SchedulingPolicySnapshotList{}
-		listOptions = append(listOptions, client.InNamespace(placement.GetNamespace()))
-	}
-
-	// The scheduler lists with a cached client; this does not have any consistency concern as sources
-	// will always trigger the scheduler when a new policy snapshot is created.
-	if err := s.client.List(ctx, policySnapshotList, listOptions...); err != nil {
-		klog.ErrorS(err, "Failed to list policy snapshots of a placement", "placement", placementRef)
-		return nil, controller.NewAPIServerError(true, err)
-	}
-	policySnapshots := policySnapshotList.GetPolicySnapshotObjs()
-	switch {
-	case len(policySnapshots) == 0:
-		// There is no latest policy snapshot associated with the placement; it could happen when
-		// * the placement is newly created; or
-		// * the new policy snapshot is in the middle of being replaced.
-		//
-		// Either way, it is out of the scheduler's scope to handle such a case; the scheduler will
-		// be triggered again if the situation is corrected.
-		err := fmt.Errorf("no latest policy snapshot associated with placement")
-		klog.ErrorS(err, "Failed to find the latest policy snapshot, will retry", "placement", placementRef)
-		return nil, err
-	case len(policySnapshots) > 1:
-		// There are multiple active policy snapshots associated with the placement; normally this
-		// will never happen, as only one policy snapshot can be active in the sequence.
-		//
-		// Similarly, it is out of the scheduler's scope to handle such a case; the scheduler will
-		// report this unexpected occurrence but does not register it as a scheduler-side error.
-		// If (and when) the situation is corrected, the scheduler will be triggered again.
-		err := controller.NewUnexpectedBehaviorError(fmt.Errorf("too many active policy snapshots: %d, want 1", len(policySnapshots)))
-		klog.ErrorS(err, "There are multiple latest policy snapshots associated with placement", "placement", placementRef)
+	snapshot, err := controller.LookupLatestPolicySnapshot(ctx, s.client, types.NamespacedName{Namespace: placement.GetNamespace(), Name: placement.GetName()})
+	if err != nil {
+		klog.ErrorS(err, "Failed to look up latest policy snapshot, will retry", "placement", klog.KObj(placement))
 		return nil, err
-	default:
-		// Found the one and only active policy snapshot.
-		return policySnapshots[0], nil
 	}
+	return snapshot, nil
 }
 
 // addSchedulerCleanupFinalizer adds the scheduler cleanup finalizer to a placement (if it does not
diff --git a/pkg/scheduler/scheduler_test.go b/pkg/scheduler/scheduler_test.go
index 6a33a6943..8a6ccad45 100644
--- a/pkg/scheduler/scheduler_test.go
+++ b/pkg/scheduler/scheduler_test.go
@@ -18,6 +18,8 @@ package scheduler
 
 import (
 	"context"
+	"errors"
+	"fmt"
 	"log"
 	"os"
 	"strconv"
@@ -30,10 +32,13 @@ import (
 	"github.com/prometheus/client_golang/prometheus/testutil"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/kubernetes/scheme"
+	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+	"sigs.k8s.io/controller-runtime/pkg/client/interceptor"
 
 	fleetv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1"
 	hubmetrics "github.com/kubefleet-dev/kubefleet/pkg/metrics/hub"
+	"github.com/kubefleet-dev/kubefleet/pkg/utils/controller"
 )
 
 const (
@@ -591,6 +596,40 @@ func TestLookupLatestPolicySnapshot(t *testing.T) {
 	}
 }
 
+// TestLookupLatestPolicySnapshot_ListError covers the error path: when the underlying
+// controller.LookupLatestPolicySnapshot returns an error, the scheduler logs and propagates
+// it without panicking on the nil snapshot.
+func TestLookupLatestPolicySnapshot_ListError(t *testing.T) {
+	placement := &fleetv1beta1.ClusterResourcePlacement{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:       crpName,
+			Finalizers: []string{fleetv1beta1.SchedulerCleanupFinalizer},
+		},
+	}
+	listErr := fmt.Errorf("simulated cache failure")
+	fakeClient := interceptor.NewClient(
+		fake.NewClientBuilder().WithScheme(scheme.Scheme).WithObjects(placement).Build(),
+		interceptor.Funcs{
+			List: func(_ context.Context, _ client.WithWatch, _ client.ObjectList, _ ...client.ListOption) error {
+				return listErr
+			},
+		},
+	)
+
+	s := &Scheduler{client: fakeClient, uncachedReader: fakeClient}
+
+	got, err := s.lookupLatestPolicySnapshot(context.Background(), placement)
+	if err == nil {
+		t.Fatalf("lookupLatestPolicySnapshot() = %v, nil; want non-nil error", got)
+	}
+	if got != nil {
+		t.Errorf("lookupLatestPolicySnapshot() returned snapshot %v, want nil on error", got)
+	}
+	if !errors.Is(err, controller.ErrUnexpectedBehavior) {
+		t.Errorf("lookupLatestPolicySnapshot() error = %v, want wrapping ErrUnexpectedBehavior", err)
+	}
+}
+
 func TestObserveSchedulingCycleMetrics(t *testing.T) {
 	metricMetadata := `
 		# HELP scheduling_cycle_duration_milliseconds The duration of a scheduling cycle run in milliseconds
diff --git a/pkg/scheduler/watchers/schedulingpolicysnapshot/watcher.go b/pkg/scheduler/watchers/schedulingpolicysnapshot/watcher.go
index 79d9e5a96..54a453b7e 100644
--- a/pkg/scheduler/watchers/schedulingpolicysnapshot/watcher.go
+++ b/pkg/scheduler/watchers/schedulingpolicysnapshot/watcher.go
@@ -74,32 +74,15 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
 		return ctrl.Result{}, nil
 	}
 
-	// Verify if the policy snapshot is currently active.
-	// TODO: create a lib to check if the policy snapshot is latest.
-	isLatestVal, ok := policySnapshot.GetLabels()[fleetv1beta1.IsLatestSnapshotLabel]
-	if !ok {
-		// The IsLatestSnapshot label is not present; normally this should never occur.
-		klog.ErrorS(controller.NewUnexpectedBehaviorError(fmt.Errorf("IsLatestSnapshotLabel is missing")),
-			"IsLatestSnapshot label is not present",
-			"policySnapshot", policySnapshotRef)
-		// This is not a situation that the controller can recover by itself. Should the label
-		// value be corrected, the controller will be triggered again.
-		return ctrl.Result{}, nil
-	}
-	isLatest, err := strconv.ParseBool(isLatestVal)
+	// Verify if the policy snapshot is currently active. A missing or malformed IsLatestSnapshot
+	// label is treated as "not latest" — the controller cannot recover from those states by
+	// itself; the placement controller will repair the label and re-trigger reconciliation.
+	isLatest, err := controller.IsLatestPolicySnapshot(policySnapshot)
 	if err != nil {
-		// The label value is not a correctly formatted boolean value; normally this should never
-		// occur.
-		klog.ErrorS(controller.NewUnexpectedBehaviorError(err),
-			"Failed to parse IsLatestSnapshot value",
-			"policySnapshot", policySnapshotRef)
-		// This is not an error that the controller can recover by itself; ignore the error.
-		// Should the label value be corrected, the controller will be triggered again.
+		klog.ErrorS(err, "Failed to determine whether policy snapshot is latest", "policySnapshot", policySnapshotRef)
 		return ctrl.Result{}, nil
 	}
 	if !isLatest {
-		// The policy snapshot is not currently active, i.e., it is not the latest policy snapshot;
-		// ignore it.
 		return ctrl.Result{}, nil
 	}
 
diff --git a/pkg/utils/controller/controller.go b/pkg/utils/controller/controller.go
index 4dbeb1cd0..22bd64a56 100644
--- a/pkg/utils/controller/controller.go
+++ b/pkg/utils/controller/controller.go
@@ -63,6 +63,19 @@ var (
 
 	// ErrUserError indicates the error is caused by the user and customer needs to take the action.
 	ErrUserError = errors.New("failed to process the request due to a client error")
+
+	// ErrNoLatestPolicySnapshot indicates that no active (latest) policy snapshot was found for a
+	// placement. This is a transient condition during placement creation or rotation and is typically
+	// recoverable by the next reconcile; callers branch on it via errors.Is to distinguish it from
+	// real API or invariant-violation errors.
+	ErrNoLatestPolicySnapshot = errors.New("no latest policy snapshot found")
+
+	// ErrMultipleActivePolicySnapshots indicates that more than one policy snapshot for the same
+	// placement carries IsLatestSnapshot=true, violating the "exactly one active snapshot"
+	// invariant. Callers branch on it via errors.Is so they can fail fast on a real invariant
+	// violation without misclassifying transient cache errors that share the ErrUnexpectedBehavior
+	// category.
+	ErrMultipleActivePolicySnapshots = errors.New("multiple active policy snapshots found")
 )
 
 // NewUnexpectedBehaviorError returns ErrUnexpectedBehavior type error when err is not nil.
diff --git a/pkg/utils/controller/policy_snapshot_resolver.go b/pkg/utils/controller/policy_snapshot_resolver.go
index 29bb13f04..722a65a16 100644
--- a/pkg/utils/controller/policy_snapshot_resolver.go
+++ b/pkg/utils/controller/policy_snapshot_resolver.go
@@ -110,34 +110,69 @@ func BuildPolicySnapshot(placementObj fleetv1beta1.PlacementObj, policySnapshotI
 	return snapshot
 }
 
-// FetchLatestPolicySnapshot fetches the latest policy snapshot for a given placement.
-// For cluster-scoped placements, it fetches ClusterSchedulingPolicySnapshot.
-// For namespaced placements, it fetches SchedulingPolicySnapshot.
-func FetchLatestPolicySnapshot(ctx context.Context, k8Client client.Reader, placementKey types.NamespacedName) (fleetv1beta1.PolicySnapshotList, error) {
-	namespace := placementKey.Namespace
-	name := placementKey.Name
+// IsLatestPolicySnapshot reports whether the given policy snapshot carries the IsLatestSnapshot
+// label set to "true". A non-nil error is returned if the label is missing or its value cannot be
+// parsed as a bool; in those cases the boolean result is false.
+//
+// Callers typically log the error and treat the snapshot as non-latest, since the controller cannot
+// recover from a malformed label by itself — the placement controller will repair it and re-trigger
+// reconciliation.
+func IsLatestPolicySnapshot(snapshot fleetv1beta1.PolicySnapshotObj) (bool, error) {
+	val, ok := snapshot.GetLabels()[fleetv1beta1.IsLatestSnapshotLabel]
+	if !ok {
+		return false, NewUnexpectedBehaviorError(fmt.Errorf("%s label is missing", fleetv1beta1.IsLatestSnapshotLabel))
+	}
+	isLatest, err := strconv.ParseBool(val)
+	if err != nil {
+		return false, NewUnexpectedBehaviorError(fmt.Errorf("failed to parse %s label value %q: %w", fleetv1beta1.IsLatestSnapshotLabel, val, err))
+	}
+	return isLatest, nil
+}
 
+// LookupLatestPolicySnapshot returns the single active (latest) policy snapshot associated with the
+// placement identified by placementKey, enforcing the "exactly one latest snapshot" invariant.
+// A namespaced placementKey selects SchedulingPolicySnapshot; a cluster-scoped key (empty
+// namespace) selects ClusterSchedulingPolicySnapshot.
+//
+// Returns:
+//   - (snapshot, nil) when exactly one latest snapshot exists.
+//   - (nil, ErrNoLatestPolicySnapshot-wrapped error) when none exist. This is a transient state for
+//     newly-created placements or in-progress rotations; callers branch on it via errors.Is.
+//   - (nil, ErrMultipleActivePolicySnapshots-wrapped error) when more than one exists.
+//   - (nil, ErrAPIServerError-wrapped error) when the List call fails. The List error is also
+//     promoted to ErrUnexpectedBehavior for unexpected cache failures; callers must NOT collapse
+//     ErrUnexpectedBehavior with the dedicated invariant sentinels above when classifying retries.
+func LookupLatestPolicySnapshot(ctx context.Context, k8Client client.Reader, placementKey types.NamespacedName) (fleetv1beta1.PolicySnapshotObj, error) {
 	var policySnapshotList fleetv1beta1.PolicySnapshotList
-	var listOptions []client.ListOption
-	listOptions = append(listOptions, client.MatchingLabels{
-		fleetv1beta1.PlacementTrackingLabel: name,
-		fleetv1beta1.IsLatestSnapshotLabel:  strconv.FormatBool(true),
-	})
-
-	if namespace != "" {
-		// This is a namespaced SchedulingPolicySnapshotList
+	listOptions := []client.ListOption{
+		client.MatchingLabels{
+			fleetv1beta1.PlacementTrackingLabel: placementKey.Name,
+			fleetv1beta1.IsLatestSnapshotLabel:  strconv.FormatBool(true),
+		},
+	}
+	if placementKey.Namespace != "" {
 		policySnapshotList = &fleetv1beta1.SchedulingPolicySnapshotList{}
-		listOptions = append(listOptions, client.InNamespace(namespace))
+		listOptions = append(listOptions, client.InNamespace(placementKey.Namespace))
 	} else {
-		// This is a cluster-scoped ClusterSchedulingPolicySnapshotList
 		policySnapshotList = &fleetv1beta1.ClusterSchedulingPolicySnapshotList{}
 	}
-
 	if err := k8Client.List(ctx, policySnapshotList, listOptions...); err != nil {
 		klog.ErrorS(err, "Failed to list the policySnapshots associated with the placement", "placement", placementKey)
-		return nil, err
+		return nil, NewAPIServerError(true, err)
+	}
+
+	policySnapshots := policySnapshotList.GetPolicySnapshotObjs()
+	switch len(policySnapshots) {
+	case 0:
+		return nil, fmt.Errorf("%w for placement %s", ErrNoLatestPolicySnapshot, placementKey)
+	case 1:
+		return policySnapshots[0], nil
+	default:
+		// Wrap both the specific invariant sentinel and the categorical ErrUnexpectedBehavior so
+		// callers that branch on either continue to work; the specific sentinel lets retry-gate
+		// callers distinguish this invariant violation from transient ErrUnexpectedBehavior errors.
+		return nil, fmt.Errorf("%w: %w for placement %s: got %d, want 1", ErrUnexpectedBehavior, ErrMultipleActivePolicySnapshots, placementKey, len(policySnapshots))
 	}
-	return policySnapshotList, nil
 }
 
 // ListPolicySnapshots lists all policy snapshots associated with a placement key.
diff --git a/pkg/utils/controller/policy_snapshot_resolver_test.go b/pkg/utils/controller/policy_snapshot_resolver_test.go
index c58f07ac1..777ccb759 100644
--- a/pkg/utils/controller/policy_snapshot_resolver_test.go
+++ b/pkg/utils/controller/policy_snapshot_resolver_test.go
@@ -18,6 +18,8 @@ package controller
 
 import (
 	"context"
+	"errors"
+	"fmt"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
@@ -28,6 +30,7 @@ import (
 	"k8s.io/utils/ptr"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+	"sigs.k8s.io/controller-runtime/pkg/client/interceptor"
 
 	fleetv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1"
 )
@@ -489,7 +492,7 @@ func TestBuildPolicySnapshot(t *testing.T) {
 	}
 }
 
-func TestFetchLatestPolicySnapshot(t *testing.T) {
+func TestListPolicySnapshots(t *testing.T) {
 	scheme := runtime.NewScheme()
 	if err := fleetv1beta1.AddToScheme(scheme); err != nil {
 		t.Fatalf("Failed to add to scheme: %v", err)
@@ -502,7 +505,7 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 		expectedSnapshots int
 	}{
 		{
-			name: "fetch latest cluster scoped policy snapshots",
+			name: "list cluster scoped policy snapshots",
 			placementKey: types.NamespacedName{
 				Name: placementName,
 			},
@@ -512,7 +515,6 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 						Name: "test-placement-0",
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "true",
 						},
 					},
 				},
@@ -521,15 +523,22 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 						Name: "test-placement-1",
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "false",
+						},
+					},
+				},
+				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: "other-placement-0",
+						Labels: map[string]string{
+							fleetv1beta1.PlacementTrackingLabel: "other-placement",
 						},
 					},
 				},
 			},
-			expectedSnapshots: 1,
+			expectedSnapshots: 2,
 		},
 		{
-			name: "fetch latest cluster scoped policy snapshots - mixed setup",
+			name: "list cluster scoped policy snapshots - mixed setup",
 			placementKey: types.NamespacedName{
 				Name: placementName,
 			},
@@ -539,7 +548,6 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 						Name: "test-placement-0",
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "true",
 						},
 					},
 				},
@@ -548,7 +556,22 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 						Name: "test-placement-1",
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "false",
+						},
+					},
+				},
+				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: "test-placement-2",
+						Labels: map[string]string{
+							fleetv1beta1.PlacementTrackingLabel: placementName,
+						},
+					},
+				},
+				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: "other-placement-0",
+						Labels: map[string]string{
+							fleetv1beta1.PlacementTrackingLabel: "other-placement",
 						},
 					},
 				},
@@ -559,15 +582,23 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 						Namespace: policySnapshotNamespace,
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "true",
+						},
+					},
+				},
+				&fleetv1beta1.SchedulingPolicySnapshot{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "test-placement-1",
+						Namespace: policySnapshotNamespace,
+						Labels: map[string]string{
+							fleetv1beta1.PlacementTrackingLabel: placementName,
 						},
 					},
 				},
 			},
-			expectedSnapshots: 1, // Should only return cluster-scoped ones for cluster-scoped placement
+			expectedSnapshots: 3, // Should only return cluster-scoped ones for cluster-scoped placement
 		},
 		{
-			name: "fetch latest namespaced policy snapshots",
+			name: "list namespaced policy snapshots",
 			placementKey: types.NamespacedName{
 				Name:      placementName,
 				Namespace: policySnapshotNamespace,
@@ -579,7 +610,6 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 						Namespace: policySnapshotNamespace,
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "true",
 						},
 					},
 				},
@@ -589,15 +619,23 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 						Namespace: policySnapshotNamespace,
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "false",
+						},
+					},
+				},
+				&fleetv1beta1.SchedulingPolicySnapshot{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "other-placement-0",
+						Namespace: "other-namespace",
+						Labels: map[string]string{
+							fleetv1beta1.PlacementTrackingLabel: "other-placement",
 						},
 					},
 				},
 			},
-			expectedSnapshots: 1,
+			expectedSnapshots: 2,
 		},
 		{
-			name: "fetch latest namespaced policy snapshots - mixed setup",
+			name: "list namespaced policy snapshots - mixed setup",
 			placementKey: types.NamespacedName{
 				Name:      placementName,
 				Namespace: policySnapshotNamespace,
@@ -609,7 +647,6 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 						Namespace: policySnapshotNamespace,
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "true",
 						},
 					},
 				},
@@ -619,7 +656,15 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 						Namespace: policySnapshotNamespace,
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "false",
+						},
+					},
+				},
+				&fleetv1beta1.SchedulingPolicySnapshot{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "other-placement-0",
+						Namespace: "other-namespace",
+						Labels: map[string]string{
+							fleetv1beta1.PlacementTrackingLabel: "other-placement",
 						},
 					},
 				},
@@ -629,25 +674,23 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 						Name: "test-placement-0",
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "true",
 						},
 					},
 				},
 			},
-			expectedSnapshots: 1, // Should only return namespaced ones for namespaced placement
+			expectedSnapshots: 2, // Should only return namespaced ones for namespaced placement
 		},
 		{
-			name: "no latest policy snapshots found",
+			name: "no policy snapshots found",
 			placementKey: types.NamespacedName{
 				Name: placementName,
 			},
 			existingSnapshots: []client.Object{
 				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
 					ObjectMeta: metav1.ObjectMeta{
-						Name: "test-placement-0",
+						Name: "other-placement-0",
 						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "false",
+							fleetv1beta1.PlacementTrackingLabel: "other-placement",
 						},
 					},
 				},
@@ -664,9 +707,9 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 				WithObjects(tc.existingSnapshots...).
 				Build()
 
-			snapshots, err := FetchLatestPolicySnapshot(ctx, fakeClient, tc.placementKey)
+			snapshots, err := ListPolicySnapshots(ctx, fakeClient, tc.placementKey)
 			if err != nil {
-				t.Fatalf("FetchLatestPolicySnapshot() = %v, want nil", err)
+				t.Fatalf("ListPolicySnapshots() = %v, want nil", err)
 			}
 
 			if len(snapshots.GetPolicySnapshotObjs()) != tc.expectedSnapshots {
@@ -676,210 +719,204 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 	}
 }
 
-func TestListPolicySnapshots(t *testing.T) {
+func TestIsLatestPolicySnapshot(t *testing.T) {
+	snapshotWithLabels := func(labels map[string]string) fleetv1beta1.PolicySnapshotObj {
+		return &fleetv1beta1.ClusterSchedulingPolicySnapshot{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:   policySnapshotName,
+				Labels: labels,
+			},
+		}
+	}
+
+	testCases := []struct {
+		name     string
+		snapshot fleetv1beta1.PolicySnapshotObj
+		want     bool
+		wantErr  bool
+	}{
+		{
+			name:     "label true returns true",
+			snapshot: snapshotWithLabels(map[string]string{fleetv1beta1.IsLatestSnapshotLabel: "true"}),
+			want:     true,
+			wantErr:  false,
+		},
+		{
+			name:     "label false returns false",
+			snapshot: snapshotWithLabels(map[string]string{fleetv1beta1.IsLatestSnapshotLabel: "false"}),
+			want:     false,
+			wantErr:  false,
+		},
+		{
+			name:     "label missing returns false with unexpected-behavior error",
+			snapshot: snapshotWithLabels(map[string]string{fleetv1beta1.PlacementTrackingLabel: placementName}),
+			want:     false,
+			wantErr:  true,
+		},
+		{
+			name:     "nil labels returns false with unexpected-behavior error",
+			snapshot: snapshotWithLabels(nil),
+			want:     false,
+			wantErr:  true,
+		},
+		{
+			name:     "label malformed returns false with unexpected-behavior error",
+			snapshot: snapshotWithLabels(map[string]string{fleetv1beta1.IsLatestSnapshotLabel: "yes"}),
+			want:     false,
+			wantErr:  true,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			got, err := IsLatestPolicySnapshot(tc.snapshot)
+			if (err != nil) != tc.wantErr {
+				t.Errorf("IsLatestPolicySnapshot() error = %v, wantErr %v", err, tc.wantErr)
+			}
+			if err != nil && !errors.Is(err, ErrUnexpectedBehavior) {
+				t.Errorf("IsLatestPolicySnapshot() error = %v, want wrapping ErrUnexpectedBehavior", err)
+			}
+			if got != tc.want {
+				t.Errorf("IsLatestPolicySnapshot() = %v, want %v", got, tc.want)
+			}
+		})
+	}
+}
+
+func TestLookupLatestPolicySnapshot(t *testing.T) {
 	scheme := runtime.NewScheme()
 	if err := fleetv1beta1.AddToScheme(scheme); err != nil {
 		t.Fatalf("Failed to add to scheme: %v", err)
 	}
 
+	clusterKey := types.NamespacedName{Name: placementName}
+	namespacedKey := types.NamespacedName{Name: placementName, Namespace: policySnapshotNamespace}
+
+	clusterLatestSnapshot := func(name string) *fleetv1beta1.ClusterSchedulingPolicySnapshot {
+		return &fleetv1beta1.ClusterSchedulingPolicySnapshot{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: name,
+				Labels: map[string]string{
+					fleetv1beta1.PlacementTrackingLabel: placementName,
+					fleetv1beta1.IsLatestSnapshotLabel:  "true",
+				},
+			},
+		}
+	}
+	namespacedLatestSnapshot := func(name string) *fleetv1beta1.SchedulingPolicySnapshot {
+		return &fleetv1beta1.SchedulingPolicySnapshot{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      name,
+				Namespace: policySnapshotNamespace,
+				Labels: map[string]string{
+					fleetv1beta1.PlacementTrackingLabel: placementName,
+					fleetv1beta1.IsLatestSnapshotLabel:  "true",
+				},
+			},
+		}
+	}
+
 	testCases := []struct {
 		name              string
 		placementKey      types.NamespacedName
 		existingSnapshots []client.Object
-		expectedSnapshots int
+		wantName          string
+		wantErrSentinel   error // nil means a snapshot is expected; otherwise the error must wrap this sentinel.
 	}{
 		{
-			name: "list cluster scoped policy snapshots",
-			placementKey: types.NamespacedName{
-				Name: placementName,
-			},
+			name:              "exactly one cluster-scoped latest snapshot",
+			placementKey:      clusterKey,
+			existingSnapshots: []client.Object{clusterLatestSnapshot("test-placement-0")},
+			wantName:          "test-placement-0",
+		},
+		{
+			name:              "exactly one namespaced latest snapshot",
+			placementKey:      namespacedKey,
+			existingSnapshots: []client.Object{namespacedLatestSnapshot("test-placement-0")},
+			wantName:          "test-placement-0",
+		},
+		{
+			name:              "no latest snapshot returns ErrNoLatestPolicySnapshot",
+			placementKey:      clusterKey,
+			existingSnapshots: nil,
+			wantErrSentinel:   ErrNoLatestPolicySnapshot,
+		},
+		{
+			name:         "only non-latest snapshots present returns ErrNoLatestPolicySnapshot",
+			placementKey: clusterKey,
 			existingSnapshots: []client.Object{
 				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
 					ObjectMeta: metav1.ObjectMeta{
 						Name: "test-placement-0",
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
-				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name: "test-placement-1",
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
-				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name: "other-placement-0",
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: "other-placement",
+							fleetv1beta1.IsLatestSnapshotLabel:  "false",
 						},
 					},
 				},
 			},
-			expectedSnapshots: 2,
+			wantErrSentinel: ErrNoLatestPolicySnapshot,
 		},
 		{
-			name: "list cluster scoped policy snapshots - mixed setup",
-			placementKey: types.NamespacedName{
-				Name: placementName,
-			},
+			name:         "multiple latest snapshots returns ErrMultipleActivePolicySnapshots",
+			placementKey: clusterKey,
 			existingSnapshots: []client.Object{
-				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name: "test-placement-0",
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
-				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name: "test-placement-1",
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
-				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name: "test-placement-2",
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
-				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name: "other-placement-0",
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: "other-placement",
-						},
-					},
-				},
-				// Add namespaced snapshots to test mixed scenario
-				&fleetv1beta1.SchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name:      "test-placement-0",
-						Namespace: policySnapshotNamespace,
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
-				&fleetv1beta1.SchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name:      "test-placement-1",
-						Namespace: policySnapshotNamespace,
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
+				clusterLatestSnapshot("test-placement-0"),
+				clusterLatestSnapshot("test-placement-1"),
 			},
-			expectedSnapshots: 3, // Should only return cluster-scoped ones for cluster-scoped placement
+			wantErrSentinel: ErrMultipleActivePolicySnapshots,
 		},
 		{
-			name: "list namespaced policy snapshots",
-			placementKey: types.NamespacedName{
-				Name:      placementName,
-				Namespace: policySnapshotNamespace,
-			},
+			name:         "ignores snapshots from other placements",
+			placementKey: clusterKey,
 			existingSnapshots: []client.Object{
-				&fleetv1beta1.SchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name:      "test-placement-0",
-						Namespace: policySnapshotNamespace,
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
-				&fleetv1beta1.SchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name:      "test-placement-1",
-						Namespace: policySnapshotNamespace,
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
-				&fleetv1beta1.SchedulingPolicySnapshot{
+				clusterLatestSnapshot("test-placement-0"),
+				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
 					ObjectMeta: metav1.ObjectMeta{
-						Name:      "other-placement-0",
-						Namespace: "other-namespace",
+						Name: "other-placement-0",
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: "other-placement",
+							fleetv1beta1.IsLatestSnapshotLabel:  "true",
 						},
 					},
 				},
 			},
-			expectedSnapshots: 2,
+			wantName: "test-placement-0",
 		},
 		{
-			name: "list namespaced policy snapshots - mixed setup",
-			placementKey: types.NamespacedName{
-				Name:      placementName,
-				Namespace: policySnapshotNamespace,
-			},
+			name:         "ignores non-latest snapshots",
+			placementKey: clusterKey,
 			existingSnapshots: []client.Object{
-				&fleetv1beta1.SchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name:      "test-placement-0",
-						Namespace: policySnapshotNamespace,
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
-				&fleetv1beta1.SchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name:      "test-placement-1",
-						Namespace: policySnapshotNamespace,
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
-				&fleetv1beta1.SchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name:      "other-placement-0",
-						Namespace: "other-namespace",
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: "other-placement",
-						},
-					},
-				},
-				// Add cluster-scoped snapshots to test mixed scenario
+				clusterLatestSnapshot("test-placement-1"),
 				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
 					ObjectMeta: metav1.ObjectMeta{
 						Name: "test-placement-0",
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
+							fleetv1beta1.IsLatestSnapshotLabel:  "false",
 						},
 					},
 				},
 			},
-			expectedSnapshots: 2, // Should only return namespaced ones for namespaced placement
+			wantName: "test-placement-1",
 		},
 		{
-			name: "no policy snapshots found",
-			placementKey: types.NamespacedName{
-				Name: placementName,
+			name:         "cluster-scoped lookup ignores a same-named latest namespaced snapshot",
+			placementKey: clusterKey,
+			existingSnapshots: []client.Object{
+				clusterLatestSnapshot("test-placement-0"),
+				namespacedLatestSnapshot("test-placement-0"),
 			},
+			wantName: "test-placement-0",
+		},
+		{
+			name:         "namespaced lookup ignores a same-named latest cluster snapshot",
+			placementKey: namespacedKey,
 			existingSnapshots: []client.Object{
-				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name: "other-placement-0",
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: "other-placement",
-						},
-					},
-				},
+				namespacedLatestSnapshot("test-placement-0"),
+				clusterLatestSnapshot("test-placement-0"),
 			},
-			expectedSnapshots: 0,
+			wantName: "test-placement-0",
 		},
 	}
 
@@ -891,14 +928,51 @@ func TestListPolicySnapshots(t *testing.T) {
 				WithObjects(tc.existingSnapshots...).
 				Build()
 
-			snapshots, err := ListPolicySnapshots(ctx, fakeClient, tc.placementKey)
-			if err != nil {
-				t.Fatalf("ListPolicySnapshots() = %v, want nil", err)
+			got, err := LookupLatestPolicySnapshot(ctx, fakeClient, tc.placementKey)
+			if (err != nil) != (tc.wantErrSentinel != nil) {
+				t.Fatalf("LookupLatestPolicySnapshot() error = %v, wantErrSentinel %v", err, tc.wantErrSentinel)
 			}
-
-			if len(snapshots.GetPolicySnapshotObjs()) != tc.expectedSnapshots {
-				t.Errorf("Expected %d snapshots, got %d", tc.expectedSnapshots, len(snapshots.GetPolicySnapshotObjs()))
+			if tc.wantErrSentinel != nil {
+				if !errors.Is(err, tc.wantErrSentinel) {
+					t.Errorf("LookupLatestPolicySnapshot() error = %v, want wrapping %v", err, tc.wantErrSentinel)
+				}
+				return
+			}
+			if got == nil || got.GetName() != tc.wantName {
+				t.Errorf("LookupLatestPolicySnapshot() returned snapshot %v, want name %q", got, tc.wantName)
 			}
 		})
 	}
 }
+
+// TestLookupLatestPolicySnapshot_ListError covers the API-error path: when the underlying List
+// call fails with an error that isUnexpectedCacheError treats as unexpected (i.e. not a
+// *apierrors.StatusError, context.Canceled, or context.DeadlineExceeded), NewAPIServerError
+// promotes it to ErrUnexpectedBehavior.
+func TestLookupLatestPolicySnapshot_ListError(t *testing.T) {
+	scheme := runtime.NewScheme()
+	if err := fleetv1beta1.AddToScheme(scheme); err != nil {
+		t.Fatalf("Failed to add to scheme: %v", err)
+	}
+
+	listErr := fmt.Errorf("simulated cache failure")
+	fakeClient := interceptor.NewClient(
+		fake.NewClientBuilder().WithScheme(scheme).Build(),
+		interceptor.Funcs{
+			List: func(_ context.Context, _ client.WithWatch, _ client.ObjectList, _ ...client.ListOption) error {
+				return listErr
+			},
+		},
+	)
+
+	got, err := LookupLatestPolicySnapshot(context.Background(), fakeClient, types.NamespacedName{Name: placementName})
+	if err == nil {
+		t.Fatalf("LookupLatestPolicySnapshot() = %v, nil; want non-nil error", got)
+	}
+	if got != nil {
+		t.Errorf("LookupLatestPolicySnapshot() returned snapshot %v, want nil", got)
+	}
+	if !errors.Is(err, ErrUnexpectedBehavior) {
+		t.Errorf("LookupLatestPolicySnapshot() error = %v, want wrapping ErrUnexpectedBehavior", err)
+	}
+}

From 444e487c14e238c78525b8d8c0a231480d2f9d57 Mon Sep 17 00:00:00 2001
From: michaelawyu <chenyu1@microsoft.com>
Date: Mon, 11 May 2026 15:52:36 +1000
Subject: [PATCH 10/19] feat: add new error handling utilities (#679)

---
 pkg/utils/errors/errors.go      | 240 +++++++++++++
 pkg/utils/errors/errors_test.go | 579 ++++++++++++++++++++++++++++++++
 2 files changed, 819 insertions(+)
 create mode 100644 pkg/utils/errors/errors.go
 create mode 100644 pkg/utils/errors/errors_test.go

diff --git a/pkg/utils/errors/errors.go b/pkg/utils/errors/errors.go
new file mode 100644
index 000000000..3f04059e5
--- /dev/null
+++ b/pkg/utils/errors/errors.go
@@ -0,0 +1,240 @@
+/*
+Copyright 2026 The KubeFleet Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// TO-DO: apply the new error pattern incrementally when it is ready to do so.
+
+// errors features custom error categorizing, wrapping and string formatting logic
+// used in the KubeFleet project.
+package errors
+
+import (
+	"errors"
+	"fmt"
+	"runtime"
+
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+)
+
+type ErrCategory string
+
+const (
+	// An error is considered of the unexpected error category when a KubeFleet controller
+	// encounters a situation that it does not know how to handle and cannot be resolved
+	// by itself or by user fixing their inputs.
+	ErrCategoryUnexpected ErrCategory = "unexpected"
+
+	// An error is considered of the transient error category when a KubeFleet controller
+	// encounters a situation that would self-resolve soon, e.g., cache not sync'd yet.
+	ErrCategoryTransient ErrCategory = "transient"
+
+	// An error is considered of the API server error category when it arises during a call
+	// to the Kubernetes API server.
+	ErrCategoryAPIServer ErrCategory = "APIserver"
+
+	// An error is considered of the user error category when it arises due to some
+	// incorrect input or action from the user side and can resolve when the
+	// user fixes their inputs.
+	ErrCategoryUser ErrCategory = "user"
+
+	// An error is considered uncategorized when it does not fit into any of the other
+	// predefined error categories or no category has been explicitly assigned.
+	ErrCategoryUncategorized ErrCategory = "uncategorized"
+)
+
+var _ error = &Error{}
+
+type Error struct {
+	// category is the category of the error.
+	category ErrCategory
+
+	// wrapped is an error that this error wraps.
+	wrapped error
+
+	// desc is the additional description about the wrapped error.
+	desc string
+
+	// attrs are the additional attributes associated with the wrapped error.
+	attrs []interface{}
+}
+
+func (e *Error) categoryWithDefault() ErrCategory {
+	if len(e.category) == 0 {
+		return ErrCategoryUncategorized
+	}
+	return e.category
+}
+
+// Error implements the error interface.
+//
+// Note the output intentionally does not include the additional attributes, so as to keep the cardinality
+// low; to print them out, pass them explicitly to the klog functions.
+func (e *Error) Error() string {
+	switch {
+	case e.wrapped != nil && len(e.desc) == 0:
+		// With wrapped error but no description.
+		return e.wrapped.Error()
+	case e.wrapped != nil:
+		// With wrapped error, with description.
+		return fmt.Sprintf("%s: %s", e.desc, e.wrapped.Error())
+	case e.wrapped == nil && len(e.desc) > 0:
+		// No wrapped error but with description.
+		return e.desc
+	default:
+		// No wrapped error and no description.
+		return "an unknown error occurred"
+	}
+}
+
+// Unwrap returns the wrapped error, so that errors.Is and errors.As can traverse the error chain as
+// expected.
+func (e *Error) Unwrap() error {
+	return e.wrapped
+}
+
+// Wraps wraps an existing error with a higher-level description and optional key-value attributes,
+// and returns a new *Error. If the existing error (or its children) is already an *Error,
+// Wraps will surface the category and the attributes to the top-level.
+func Wraps(err error, desc string, kvs ...interface{}) *Error {
+	// Find if any of error down in the chain is already an *Error; if so, use its category and surface its
+	// attributes to the current level.
+	var category ErrCategory
+	var mergedKVs []interface{}
+	var childError *Error
+	if errors.As(err, &childError) {
+		category = childError.category
+		mergedKVs = append(mergedKVs, childError.attrs...)
+	}
+	mergedKVs = append(mergedKVs, kvs...)
+	if len(category) == 0 {
+		category = ErrCategoryUncategorized
+	}
+
+	return &Error{
+		category: category,
+		wrapped:  err,
+		desc:     desc,
+		attrs:    mergedKVs,
+	}
+}
+
+// Args returns the k-v attributes associated with an *Error (and its children).
+//
+// The function is designed to work with klog calls; to use it, call klog.ErrorS with:
+//
+// klog.ErrorS(err, "additional top-level error description", Args(err)...).
+//
+// If you need to add additional key-value attributes at the top level in addition to those
+// already captured from the error chain, append them to Args as follows:
+//
+// klog.ErrorS(err, "additional top-level error description", append(Args(err), "k", "v")...).
+func Args(err error, kvs ...interface{}) []interface{} {
+	var compositeKVs []interface{}
+	var childError *Error
+	if errors.As(err, &childError) {
+		// Make sure that the error category is always included as the first kv pair.
+		compositeKVs = append(compositeKVs, "errCategory", childError.categoryWithDefault())
+		compositeKVs = append(compositeKVs, childError.attrs...)
+		compositeKVs = append(compositeKVs, kvs...)
+	}
+	return compositeKVs
+}
+
+// CallerFrameOverview is a simple struct that summarizes a single call frame.
+type CallerFrameOverview struct {
+	Function string
+	File     string
+	Line     int
+}
+
+// captureCallers captures the current call stack and returns a slice of CallerFrameOverview
+// summarizing each frame in the stack, skipping the top 3 frames for simplicity.
+func captureCallers() []CallerFrameOverview {
+	pcs := make([]uintptr, 32)
+	// Skip 3 frames (the runtime.Callers function itself, this utility function, and the caller of this utility)
+	// when capturing the call stack for simplicity reasons.
+	n := runtime.Callers(3, pcs)
+	frames := runtime.CallersFrames(pcs[:n])
+	var callerFrames []CallerFrameOverview
+	for {
+		frame, more := frames.Next()
+		callerFrames = append(callerFrames, CallerFrameOverview{
+			Function: frame.Function,
+			File:     frame.File,
+			Line:     frame.Line,
+		})
+		if !more {
+			break
+		}
+	}
+	return callerFrames
+}
+
+// NewUnexpectedError creates a new Error categorized as ErrCategoryUnexpected. It is supposed to be called
+// at the lowest level possible, where the error category can be determined. The error can then be returned
+// and wrapped (w/ additional attributes if applicable).
+func NewUnexpectedError(err error, desc string, kvs ...interface{}) *Error {
+	// For unexpected errors, collect the caller frames.
+	//
+	// Note (chenyu1): previously for unexpected errors, we use the debug.Stack() to print out a full stack
+	// trace in the logs. Effective as it is, the operation is expensive and it always captures the full stack,
+	// where the top of the stack is always the error handling utility function/method itself rather than
+	// the actual caller site where the error originates. Furthermore, as debug.Stack() returns a multi-line string,
+	// often the logging backend cannot render it properly, making it difficult to read/summarize. To address this,
+	// in this new implementation we switch to runtime.Callers and runtime.CallersFrames to capture the call stack
+	// in a structured way with the unrelated stacks skipped.
+	frames := captureCallers()
+	kvs = append(kvs, "callers", frames)
+
+	newErr := Wraps(err, desc, kvs...)
+	newErr.category = ErrCategoryUnexpected
+	return newErr
+}
+
+// NewTransientError creates a new Error categorized as ErrCategoryTransient. It is supposed to be called
+// at the lowest level possible, where the error category can be determined. The error can then be returned
+// and wrapped (w/ additional attributes if applicable).
+func NewTransientError(err error, desc string, kvs ...interface{}) *Error {
+	newErr := Wraps(err, desc, kvs...)
+	newErr.category = ErrCategoryTransient
+	return newErr
+}
+
+// NewUserError creates a new Error categorized as ErrCategoryUser. It is supposed to be called
+// at the lowest level possible, where the error category can be determined. The error can then be returned
+// and wrapped (w/ additional attributes if applicable).
+func NewUserError(err error, desc string, kvs ...interface{}) *Error {
+	newErr := Wraps(err, desc, kvs...)
+	newErr.category = ErrCategoryUser
+	return newErr
+}
+
+// NewAPIServerError creates a new Error categorized as ErrCategoryAPIServer. It is supposed to be called
+// at the lowest level possible, where the error category can be determined. The error can then be returned
+// and wrapped (w/ additional attributes if applicable).
+func NewAPIServerError(err error, desc string, cached bool, kvs ...interface{}) *Error {
+	// For API server errors, collect caching and HTTP response related information.
+	kvs = append(kvs, "cached", cached)
+	kvs = append(kvs, "APIServerErrReason", apierrors.ReasonForError(err))
+
+	apiErr, ok := err.(apierrors.APIStatus)
+	if ok {
+		kvs = append(kvs, "APIServerResHTTPCode", apiErr.Status().Code)
+	}
+
+	newErr := Wraps(err, desc, kvs...)
+	newErr.category = ErrCategoryAPIServer
+	return newErr
+}
diff --git a/pkg/utils/errors/errors_test.go b/pkg/utils/errors/errors_test.go
new file mode 100644
index 000000000..95d406477
--- /dev/null
+++ b/pkg/utils/errors/errors_test.go
@@ -0,0 +1,579 @@
+/*
+Copyright 2026 The KubeFleet Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package errors
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"log/slog"
+	"strings"
+	"testing"
+
+	"github.com/go-logr/logr"
+	"github.com/google/go-cmp/cmp"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/klog/v2"
+)
+
+var (
+	// wrappedErrComparer is a cmp option that compares the .wrapped field by pointer equality,
+	// avoiding the need to recurse into unexported fields of standard-library error types.
+	wrappedErrComparer = cmp.FilterPath(
+		func(p cmp.Path) bool { return p.Last().String() == ".wrapped" },
+		cmp.Comparer(func(x, y error) bool { return x == y }), //nolint:errorlint
+	)
+)
+
+// TestCommonUsePatterns demonstrates how the common error handling patterns are enabled by the errors package.
+func TestCommonUsePatterns(t *testing.T) {
+	var bytesBuf bytes.Buffer
+	slogHandler := slog.NewTextHandler(&bytesBuf, &slog.HandlerOptions{Level: slog.LevelDebug})
+	logger := logr.FromSlogHandler(slogHandler)
+	klog.SetLogger(logger)
+
+	// Example 1: handling API server errors with rich context and structured logging.
+	rawAPIServerErr := apierrors.NewAlreadyExists(schema.GroupResource{Group: "", Resource: "namespaces"}, "work")
+	categorizedAPIServerErr := NewAPIServerError(rawAPIServerErr, "failed to create namespace", false, "k1", "v1")
+	wrappedErr := Wraps(categorizedAPIServerErr, "additional high-level error description", "k2", "v2", "k3", "v3")
+	klog.ErrorS(wrappedErr, "additional top/controller-level error description", Args(wrappedErr)...)
+	// If there needs to be kv attributes at the top level as well:
+	// klog.ErrorS(wrappedErr, "additional top/controller-level error description", append(Args(wrappedErr), "k4", "v4", "k5", "v5")...)
+	klog.Flush()
+
+	outputStr := readFromBuffer(t, &bytesBuf)
+
+	// The full error output looks like the follows:
+	// errors_test.go:53: time=2026-04-29T02:39:44.853+10:00 level=ERROR msg="additional top/controller-level error description" err="additional high-level error description: failed to create namespace: namespaces \"work\" already exists" errCategory=APIserver k1=v1 cached=false APIServerErrReason=AlreadyExists APIServerResHTTPCode=409 k2=v2 k3=v3
+	wantSubStrings := []string{
+		"msg=\"additional top/controller-level error description\"",
+		"err=\"additional high-level error description: failed to create namespace: namespaces \\\"work\\\" already exists\"",
+		"errCategory=APIserver",
+		"k1=v1",
+		"k2=v2",
+		"k3=v3",
+		"cached=false",
+		"APIServerErrReason=AlreadyExists",
+		"APIServerResHTTPCode=409",
+	}
+	for _, subStr := range wantSubStrings {
+		if strings.Contains(outputStr, subStr) == false {
+			t.Errorf("got %s\nwant to contain %s\n", outputStr, subStr)
+		}
+	}
+
+	// Example 2: handling unexpected errors with rich context and structured logging.
+	rawUtilityCallErr := fmt.Errorf("cannot calculate resource hash")
+	categorizedUnexpectedErr := NewUnexpectedError(rawUtilityCallErr, "additional low-level error description", "k1", "v1")
+	wrappedUnexpectedErr := Wraps(categorizedUnexpectedErr, "additional high-level error description", "k2", "v2")
+	klog.ErrorS(wrappedUnexpectedErr, "additional top/controller-level error description", Args(wrappedUnexpectedErr)...)
+	klog.Flush()
+
+	outputStr = readFromBuffer(t, &bytesBuf)
+
+	// The full error output looks like the follows:
+	// errors_test.go:86: time=2026-04-29T02:49:04.560+10:00 level=ERROR msg="additional top/controller-level error description" err="additional high-level error description: additional low-level error description: cannot calculate resource hash" errCategory=unexpected k1=v1 callers="[{Function:github.com/kubefleet-dev/kubefleet/pkg/utils/errors.TestCommonUsePatterns File:SomeFilePath Line:74} {Function:testing.tRunner File:SomeFilePath Line:1934} {Function:runtime.goexit File:SomeFilePath Line:1268}]" k2=v2
+	wantSubStrings = []string{
+		"msg=\"additional top/controller-level error description\"",
+		"err=\"additional high-level error description: additional low-level error description: cannot calculate resource hash\"",
+		"errCategory=unexpected",
+		"k1=v1",
+		"callers=",
+		"Function:github.com/kubefleet-dev/kubefleet/pkg/utils/errors.TestCommonUsePatterns",
+		"k2=v2",
+	}
+	for _, subStr := range wantSubStrings {
+		if strings.Contains(outputStr, subStr) == false {
+			t.Errorf("got %s\nwant to contain %s\n", outputStr, subStr)
+		}
+	}
+
+	// Example 3: handling uncategorized errors with rich context and structured logging.
+	rawErr := fmt.Errorf("something went wrong")
+	uncategorizedErr := Wraps(rawErr, "additional low-level error description", "k1", "v1")
+	wrappedUncategorizedErr := Wraps(uncategorizedErr, "additional high-level error description", "k2", "v2")
+	klog.ErrorS(wrappedUncategorizedErr, "additional top/controller-level error description", Args(wrappedUncategorizedErr)...)
+	klog.Flush()
+
+	outputStr = readFromBuffer(t, &bytesBuf)
+
+	// The full error output looks like the follows:
+	// errors_test.go:119: time=2026-04-29T02:59:04.560+10:00 level=ERROR msg="additional top/controller-level error description" err="additional high-level error description: additional low-level error description: something went wrong" errCategory=uncategorized k1=v1 k2=v2
+	wantSubStrings = []string{
+		"msg=\"additional top/controller-level error description\"",
+		"err=\"additional high-level error description: additional low-level error description: something went wrong\"",
+		"errCategory=uncategorized",
+		"k1=v1",
+		"k2=v2",
+	}
+	for _, subStr := range wantSubStrings {
+		if strings.Contains(outputStr, subStr) == false {
+			t.Errorf("got %s\nwant to contain %s\n", outputStr, subStr)
+		}
+	}
+}
+
+func TestErrorStringer(t *testing.T) {
+	wrappedErr := fmt.Errorf("something went wrong")
+
+	testCases := []struct {
+		name    string
+		err     *Error
+		wantMsg string
+	}{
+		{
+			name:    "no wrapped error, no description",
+			err:     &Error{},
+			wantMsg: "an unknown error occurred",
+		},
+		{
+			name:    "no wrapped error, with description",
+			err:     &Error{desc: "operation failed"},
+			wantMsg: "operation failed",
+		},
+		{
+			name:    "with wrapped error, no description",
+			err:     &Error{wrapped: wrappedErr},
+			wantMsg: "something went wrong",
+		},
+		{
+			name:    "with wrapped error, with description",
+			err:     &Error{wrapped: wrappedErr, desc: "operation failed"},
+			wantMsg: "operation failed: something went wrong",
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := tc.err.Error()
+			if got != tc.wantMsg {
+				t.Errorf("Error() = %q, want %q", got, tc.wantMsg)
+			}
+		})
+	}
+}
+
+func TestWraps(t *testing.T) {
+	plainErr := fmt.Errorf("plain error")
+	categorizedChildErr := &Error{
+		category: ErrCategoryUser,
+		desc:     "child desc",
+		attrs:    []interface{}{"ck1", "cv1"},
+	}
+
+	testCases := []struct {
+		name    string
+		err     error
+		desc    string
+		kvs     []interface{}
+		wantErr *Error
+	}{
+		{
+			name: "wrapping a plain error, no kvs",
+			err:  plainErr,
+			desc: "high level desc",
+			wantErr: &Error{
+				category: ErrCategoryUncategorized,
+				wrapped:  plainErr,
+				desc:     "high level desc",
+			},
+		},
+		{
+			name: "wrapping a plain error, with kvs",
+			err:  plainErr,
+			desc: "high level desc",
+			kvs:  []interface{}{"k1", "v1"},
+			wantErr: &Error{
+				category: ErrCategoryUncategorized,
+				wrapped:  plainErr,
+				desc:     "high level desc",
+				attrs:    []interface{}{"k1", "v1"},
+			},
+		},
+		{
+			name: "wrapping an *Error, no extra kvs",
+			err:  categorizedChildErr,
+			desc: "high level desc",
+			wantErr: &Error{
+				category: ErrCategoryUser,
+				wrapped:  categorizedChildErr,
+				desc:     "high level desc",
+				attrs:    []interface{}{"ck1", "cv1"},
+			},
+		},
+		{
+			name: "wrapping an *Error, with extra kvs",
+			err:  categorizedChildErr,
+			desc: "high level desc",
+			kvs:  []interface{}{"k2", "v2"},
+			wantErr: &Error{
+				category: ErrCategoryUser,
+				wrapped:  categorizedChildErr,
+				desc:     "high level desc",
+				attrs:    []interface{}{"ck1", "cv1", "k2", "v2"},
+			},
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			gotErr := Wraps(tc.err, tc.desc, tc.kvs...)
+			if diff := cmp.Diff(gotErr, tc.wantErr,
+				cmp.AllowUnexported(Error{}),
+				wrappedErrComparer,
+			); diff != "" {
+				t.Errorf("Wraps() mismatch (-got, +want):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestArgs(t *testing.T) {
+	testCases := []struct {
+		name    string
+		err     error
+		kvs     []interface{}
+		wantKVs []interface{}
+	}{
+		{
+			name:    "plain error (not an *Error)",
+			err:     fmt.Errorf("plain error"),
+			wantKVs: nil,
+		},
+		{
+			name:    "*Error with no category set, no attrs, no extra kvs",
+			err:     &Error{},
+			wantKVs: []interface{}{"errCategory", ErrCategoryUncategorized},
+		},
+		{
+			name:    "*Error with explicit category, no attrs, no extra kvs",
+			err:     &Error{category: ErrCategoryUser},
+			wantKVs: []interface{}{"errCategory", ErrCategoryUser},
+		},
+		{
+			name:    "*Error with explicit category and attrs, no extra kvs",
+			err:     &Error{category: ErrCategoryAPIServer, attrs: []interface{}{"k1", "v1", "k2", "v2"}},
+			wantKVs: []interface{}{"errCategory", ErrCategoryAPIServer, "k1", "v1", "k2", "v2"},
+		},
+		{
+			name:    "*Error with explicit category and attrs, with extra kvs",
+			err:     &Error{category: ErrCategoryUnexpected, attrs: []interface{}{"k1", "v1"}},
+			kvs:     []interface{}{"k2", "v2"},
+			wantKVs: []interface{}{"errCategory", ErrCategoryUnexpected, "k1", "v1", "k2", "v2"},
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			gotKVs := Args(tc.err, tc.kvs...)
+			if diff := cmp.Diff(gotKVs, tc.wantKVs); diff != "" {
+				t.Errorf("Args() mismatch (-got, +want):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestUnwrap(t *testing.T) {
+	innerErr := fmt.Errorf("inner error")
+	testCases := []struct {
+		name    string
+		err     *Error
+		wantErr error
+	}{
+		{
+			name:    "nil wrapped error",
+			err:     &Error{},
+			wantErr: nil,
+		},
+		{
+			name:    "non-nil wrapped error",
+			err:     &Error{wrapped: innerErr},
+			wantErr: innerErr,
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			gotErr := tc.err.Unwrap()
+			// Note: here we actually want to just compare the error values by pointer equality.
+			if gotErr != tc.wantErr { //nolint:errorlint
+				t.Errorf("Unwrap() = %v, want %v", gotErr, tc.wantErr)
+			}
+		})
+	}
+}
+
+func TestNewTransientError(t *testing.T) {
+	plainErr := fmt.Errorf("plain error")
+	categorizedChildErr := &Error{
+		category: ErrCategoryUser,
+		desc:     "child desc",
+		attrs:    []interface{}{"ck1", "cv1"},
+	}
+
+	testCases := []struct {
+		name    string
+		err     error
+		desc    string
+		kvs     []interface{}
+		wantErr *Error
+	}{
+		{
+			name: "plain error, no kvs",
+			err:  plainErr,
+			desc: "transient condition",
+			wantErr: &Error{
+				category: ErrCategoryTransient,
+				wrapped:  plainErr,
+				desc:     "transient condition",
+			},
+		},
+		{
+			name: "plain error, with kvs",
+			err:  plainErr,
+			desc: "transient condition",
+			kvs:  []interface{}{"k1", "v1"},
+			wantErr: &Error{
+				category: ErrCategoryTransient,
+				wrapped:  plainErr,
+				desc:     "transient condition",
+				attrs:    []interface{}{"k1", "v1"},
+			},
+		},
+		// This is technically speaking possible, but in practice users should not need to override categories.
+		{
+			name: "wrapping an *Error, category overridden to transient",
+			err:  categorizedChildErr,
+			desc: "transient condition",
+			kvs:  []interface{}{"k2", "v2"},
+			wantErr: &Error{
+				category: ErrCategoryTransient,
+				wrapped:  categorizedChildErr,
+				desc:     "transient condition",
+				attrs:    []interface{}{"ck1", "cv1", "k2", "v2"},
+			},
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			gotErr := NewTransientError(tc.err, tc.desc, tc.kvs...)
+			if diff := cmp.Diff(gotErr, tc.wantErr,
+				cmp.AllowUnexported(Error{}),
+				wrappedErrComparer,
+			); diff != "" {
+				t.Errorf("NewTransientError() mismatch (-got, +want):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestNewUserError(t *testing.T) {
+	plainErr := fmt.Errorf("plain error")
+	categorizedChildErr := &Error{
+		category: ErrCategoryTransient,
+		desc:     "child desc",
+		attrs:    []interface{}{"ck1", "cv1"},
+	}
+
+	testCases := []struct {
+		name    string
+		err     error
+		desc    string
+		kvs     []interface{}
+		wantErr *Error
+	}{
+		{
+			name: "plain error, no kvs",
+			err:  plainErr,
+			desc: "invalid input",
+			wantErr: &Error{
+				category: ErrCategoryUser,
+				wrapped:  plainErr,
+				desc:     "invalid input",
+			},
+		},
+		{
+			name: "plain error, with kvs",
+			err:  plainErr,
+			desc: "invalid input",
+			kvs:  []interface{}{"k1", "v1"},
+			wantErr: &Error{
+				category: ErrCategoryUser,
+				wrapped:  plainErr,
+				desc:     "invalid input",
+				attrs:    []interface{}{"k1", "v1"},
+			},
+		},
+		{
+			// This is technically speaking possible, but in practice users should not need to override categories.
+			name: "wrapping an *Error, category overridden to user",
+			err:  categorizedChildErr,
+			desc: "invalid input",
+			kvs:  []interface{}{"k2", "v2"},
+			wantErr: &Error{
+				category: ErrCategoryUser,
+				wrapped:  categorizedChildErr,
+				desc:     "invalid input",
+				attrs:    []interface{}{"ck1", "cv1", "k2", "v2"},
+			},
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			gotErr := NewUserError(tc.err, tc.desc, tc.kvs...)
+			if diff := cmp.Diff(gotErr, tc.wantErr,
+				cmp.AllowUnexported(Error{}),
+				wrappedErrComparer,
+			); diff != "" {
+				t.Errorf("NewUserError() mismatch (-got, +want):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestNewAPIServerError(t *testing.T) {
+	apiServerErr := apierrors.NewAlreadyExists(schema.GroupResource{Group: "", Resource: "namespaces"}, "fleet")
+	plainErr := fmt.Errorf("plain error")
+
+	testCases := []struct {
+		name    string
+		err     error
+		desc    string
+		cached  bool
+		kvs     []interface{}
+		wantErr *Error
+	}{
+		{
+			name:   "API server error, not cached, no extra kvs",
+			err:    apiServerErr,
+			desc:   "failed to create namespace",
+			cached: false,
+			wantErr: &Error{
+				category: ErrCategoryAPIServer,
+				wrapped:  apiServerErr,
+				desc:     "failed to create namespace",
+				attrs: []interface{}{
+					"cached", false,
+					"APIServerErrReason", apierrors.ReasonForError(apiServerErr),
+					"APIServerResHTTPCode", apiServerErr.Status().Code,
+				},
+			},
+		},
+		{
+			name:   "API server error, cached, with extra kvs",
+			err:    apiServerErr,
+			desc:   "failed to create namespace",
+			cached: true,
+			kvs:    []interface{}{"k1", "v1"},
+			wantErr: &Error{
+				category: ErrCategoryAPIServer,
+				wrapped:  apiServerErr,
+				desc:     "failed to create namespace",
+				attrs: []interface{}{
+					"k1", "v1",
+					"cached", true,
+					"APIServerErrReason", apierrors.ReasonForError(apiServerErr),
+					"APIServerResHTTPCode", apiServerErr.Status().Code,
+				},
+			},
+		},
+		{
+			name:   "non API server error, no HTTPCode in attrs",
+			err:    plainErr,
+			desc:   "unexpected failure",
+			cached: false,
+			wantErr: &Error{
+				category: ErrCategoryAPIServer,
+				wrapped:  plainErr,
+				desc:     "unexpected failure",
+				attrs: []interface{}{
+					"cached", false,
+					"APIServerErrReason", apierrors.ReasonForError(plainErr),
+				},
+			},
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			gotErr := NewAPIServerError(tc.err, tc.desc, tc.cached, tc.kvs...)
+			if diff := cmp.Diff(gotErr, tc.wantErr,
+				cmp.AllowUnexported(Error{}),
+				wrappedErrComparer,
+			); diff != "" {
+				t.Errorf("NewAPIServerError() mismatch (-got, +want):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestNewUnexpectedError(t *testing.T) {
+	plainErr := fmt.Errorf("plain error")
+	gotErr := NewUnexpectedError(plainErr, "low-level desc", "k1", "v1")
+
+	// Verify the category.
+	if gotErr.category != ErrCategoryUnexpected {
+		t.Errorf("NewUnexpectedError() category = %v, want %v", gotErr.category, ErrCategoryUnexpected)
+	}
+	// Verify wrapped error and description via Error().
+	wantMsg := "low-level desc: plain error"
+	if got := gotErr.Error(); got != wantMsg {
+		t.Errorf("NewUnexpectedError() Error() = %q, want %q", got, wantMsg)
+	}
+	// Verify that the user-provided kvs appear at the start of attrs.
+	if len(gotErr.attrs) < 2 || gotErr.attrs[0] != "k1" || gotErr.attrs[1] != "v1" {
+		t.Errorf("NewUnexpectedError() attrs = %v, want k1=v1 at start", gotErr.attrs)
+	}
+	// Verify that the callers key is present in attrs and its frame count is not zero.
+	found := false
+	canCast := false
+	var frames []CallerFrameOverview
+	for i := 0; i < len(gotErr.attrs)-1; i++ {
+		if gotErr.attrs[i] == "callers" {
+			found = true
+			frames, canCast = gotErr.attrs[i+1].([]CallerFrameOverview)
+			break
+		}
+	}
+	if !found {
+		t.Errorf("NewUnexpectedError() attrs = %v, want callers key present", gotErr.attrs)
+	}
+	if !canCast {
+		t.Errorf("NewUnexpectedError() callers value has type %T, want []CallerFrameOverview", gotErr.attrs)
+	}
+	if len(frames) == 0 {
+		t.Errorf("NewUnexpectedError() callers = %v, want non-zero frame count", frames)
+	}
+
+	// Verify the first frame (the current test code).
+	callerFunc := frames[0].Function
+	if callerFunc != "github.com/kubefleet-dev/kubefleet/pkg/utils/errors.TestNewUnexpectedError" {
+		t.Errorf("NewUnexpectedError() first caller function = %s, want TestNewUnexpectedError", callerFunc)
+	}
+	callerFile := frames[0].File
+	if !strings.HasSuffix(callerFile, "pkg/utils/errors/errors_test.go") {
+		t.Errorf("NewUnexpectedError() first caller file = %s, want to end with pkg/utils/errors/errors_test.go", callerFile)
+	}
+	// Note: skip verifying the exact line number as it may change with code edits.
+}
+
+func readFromBuffer(t *testing.T, buf *bytes.Buffer) string {
+	output := make([]byte, 1000)
+	n, err := buf.Read(output)
+	if err != nil && err != io.EOF {
+		t.Fatalf("Failed to read from bytes buffer: %v", err)
+	}
+	output = output[:n]
+	outputStr := string(output)
+	return outputStr
+}

From 4846c3c3f72daa9e1030c5794e93c2171c4cc97e Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 11 May 2026 15:18:08 -0700
Subject: [PATCH 11/19] chore: bump github/codeql-action from 4.35.2 to 4.35.3
 (#699)

Bumps [github/codeql-action](https://github.com/github/codeql-action) from 4.35.2 to 4.35.3.
- [Release notes](https://github.com/github/codeql-action/releases)
- [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md)
- [Commits](https://github.com/github/codeql-action/compare/95e58e9a2cdfd71adc6e0353d5c52f41a045d225...e46ed2cbd01164d986452f91f178727624ae40d7)

---
updated-dependencies:
- dependency-name: github/codeql-action
  dependency-version: 4.35.3
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Simon Waight <simon.waight@gmail.com>
---
 .github/workflows/codeql-analysis.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index ccf33b9b8..2d761329b 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -42,7 +42,7 @@ jobs:
 
     # Initializes the CodeQL tools for scanning.
     - name: Initialize CodeQL
-      uses: github/codeql-action/init@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4
+      uses: github/codeql-action/init@e46ed2cbd01164d986452f91f178727624ae40d7 # v4
       with:
         languages: ${{ matrix.language }}
         # If you wish to specify custom queries, you can do so here or in a config file.
@@ -56,7 +56,7 @@ jobs:
     # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
     # If this step fails, then you should remove it and run the build manually (see below)
     - name: Autobuild
-      uses: github/codeql-action/autobuild@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4
+      uses: github/codeql-action/autobuild@e46ed2cbd01164d986452f91f178727624ae40d7 # v4
 
     # ℹ️ Command-line programs to run using the OS shell.
     # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
@@ -69,4 +69,4 @@ jobs:
     #   ./location_of_script_within_repo/buildscript.sh
 
     - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4
+      uses: github/codeql-action/analyze@e46ed2cbd01164d986452f91f178727624ae40d7 # v4

From 4a5301ff572aa311563781be9d54aab41f45ec3f Mon Sep 17 00:00:00 2001
From: michaelawyu <chenyu1@microsoft.com>
Date: Tue, 12 May 2026 11:09:15 +1000
Subject: [PATCH 12/19] feat: guardrail: added additional service account
 protection (#702)

---
 pkg/webhook/webhook.go            | 18 +++++--
 test/e2e/fleet_guard_rail_test.go | 86 +++++++++++++++++++++++++++++++
 test/e2e/resources_test.go        |  1 +
 test/e2e/utils_test.go            |  2 +-
 4 files changed, 102 insertions(+), 5 deletions(-)

diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go
index e3bf5beb9..9f2d921b0 100644
--- a/pkg/webhook/webhook.go
+++ b/pkg/webhook/webhook.go
@@ -641,10 +641,20 @@ func (w *Config) buildFleetGuardRailValidatingWebhooks() []admv1.ValidatingWebho
 	}
 
 	// Build core v1 resources list, conditionally including pods if workload is enabled
-	coreV1Resources := []string{bindingResourceName, configMapResourceName, endPointResourceName,
-		limitRangeResourceName, persistentVolumeClaimsName, persistentVolumeClaimsName + "/status", podTemplateResourceName, podResourceName, podResourceName + "/status",
-		replicationControllerResourceName, replicationControllerResourceName + "/status", resourceQuotaResourceName, resourceQuotaResourceName + "/status", secretResourceName,
-		serviceAccountResourceName, servicesResourceName, servicesResourceName + "/status"}
+	coreV1Resources := []string{
+		bindingResourceName,
+		configMapResourceName,
+		endPointResourceName,
+		limitRangeResourceName,
+		persistentVolumeClaimsName, persistentVolumeClaimsName + "/status",
+		podTemplateResourceName,
+		podResourceName, podResourceName + "/status",
+		replicationControllerResourceName, replicationControllerResourceName + "/status",
+		resourceQuotaResourceName, resourceQuotaResourceName + "/status",
+		secretResourceName,
+		serviceAccountResourceName, serviceAccountResourceName + "/token",
+		servicesResourceName, servicesResourceName + "/status",
+	}
 
 	namespacedResourcesRules = append(namespacedResourcesRules, admv1.RuleWithOperations{
 		Operations: cuOperations,
diff --git a/test/e2e/fleet_guard_rail_test.go b/test/e2e/fleet_guard_rail_test.go
index 322af5ff1..043304783 100644
--- a/test/e2e/fleet_guard_rail_test.go
+++ b/test/e2e/fleet_guard_rail_test.go
@@ -24,6 +24,7 @@ import (
 	. "github.com/onsi/gomega"
 	admissionv1 "k8s.io/api/admission/v1"
 	appsv1 "k8s.io/api/apps/v1"
+	authenticationv1 "k8s.io/api/authentication/v1"
 	corev1 "k8s.io/api/core/v1"
 	policyv1 "k8s.io/api/policy/v1"
 	rbacv1 "k8s.io/api/rbac/v1"
@@ -1304,3 +1305,88 @@ var _ = Describe("fleet guard rail restrict internal fleet resources from being
 		Expect(checkIfStatusErrorWithMessage(impersonateHubClient.Create(ctx, &ise), fmt.Sprintf(validation.ResourceDeniedFormat, testUser, utils.GenerateGroupString(testGroups), admissionv1.Create, &iseGVK, "", types.NamespacedName{Name: ise.Name, Namespace: ise.Namespace}))).Should(Succeed())
 	})
 })
+
+var _ = Describe("fleet guard rail webhook tests for service accounts in restricted namespaces", Ordered, func() {
+	svcAccountGVK := metav1.GroupVersionKind{Group: corev1.SchemeGroupVersion.Group, Version: corev1.SchemeGroupVersion.Version, Kind: "ServiceAccount"}
+	tokenReqGVK := metav1.GroupVersionKind{Group: "authentication.k8s.io", Version: "v1", Kind: "TokenRequest"}
+
+	svcAccountName := fmt.Sprintf(svcAccountNameTemplate, GinkgoParallelProcess())
+	svcAccountToAddName := "added-sa"
+	kubeSystemNamespaceName := "kube-system"
+
+	var svcAccount *corev1.ServiceAccount
+	BeforeAll(func() {
+		// Create a service account in the kube-system namespace.
+		svcAccount = &corev1.ServiceAccount{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      svcAccountName,
+				Namespace: kubeSystemNamespaceName,
+			},
+		}
+		Expect(hubClient.Create(ctx, svcAccount)).Should(Succeed(), "Failed to create service account")
+	})
+
+	AfterAll(func() {
+		// Ensure the removal of the created service account.
+		Eventually(func() error {
+			svcAccount := corev1.ServiceAccount{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      svcAccountName,
+					Namespace: kubeSystemNamespaceName,
+				},
+			}
+			if err := hubClient.Delete(ctx, &svcAccount); err != nil {
+				if k8sErrors.IsNotFound(err) {
+					return nil
+				}
+				return fmt.Errorf("failed to delete service account: %w", err)
+			}
+
+			if err := hubClient.Get(ctx, types.NamespacedName{Name: svcAccountName, Namespace: kubeSystemNamespaceName}, &corev1.ServiceAccount{}); err != nil {
+				if k8sErrors.IsNotFound(err) {
+					return nil
+				}
+				return fmt.Errorf("failed to retrieve service account: %w", err)
+			}
+			return fmt.Errorf("service account still exists")
+		}, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to delete service account")
+	})
+
+	It("should deny creation of service accounts in kube-system namespace for non-whitelisted users", func() {
+		newSvcAccount := &corev1.ServiceAccount{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      svcAccountToAddName,
+				Namespace: kubeSystemNamespaceName,
+			},
+		}
+		wantErrMsg := fmt.Sprintf(validation.ResourceDeniedFormat,
+			testUser,
+			utils.GenerateGroupString(testGroups),
+			admissionv1.Create,
+			svcAccountGVK, "",
+			types.NamespacedName{Name: newSvcAccount.Name, Namespace: newSvcAccount.Namespace},
+		)
+		Expect(checkIfStatusErrorWithMessage(impersonateHubClient.Create(ctx, newSvcAccount), wantErrMsg)).Should(Succeed(), "Failed to deny creation of service account in kube-system namespace")
+	})
+
+	It("should deny access to service account token subresource in restricted namespace for non-whitelisted users", func() {
+		tokenRequest := &authenticationv1.TokenRequest{
+			ObjectMeta: metav1.ObjectMeta{
+				Namespace: kubeSystemNamespaceName,
+				Name:      svcAccountName,
+			},
+			Spec: authenticationv1.TokenRequestSpec{
+				Audiences:         []string{"experimental"},
+				ExpirationSeconds: ptr.To(int64(3600)),
+			},
+		}
+		wantErrMsg := fmt.Sprintf(validation.ResourceDeniedFormat,
+			testUser,
+			utils.GenerateGroupString(testGroups),
+			admissionv1.Create,
+			tokenReqGVK, "token",
+			types.NamespacedName{Name: svcAccountName, Namespace: kubeSystemNamespaceName},
+		)
+		Expect(checkIfStatusErrorWithMessage(impersonateHubClient.SubResource("token").Create(ctx, svcAccount, tokenRequest), wantErrMsg)).Should(Succeed(), "Failed to deny access to service account token subresource in restricted namespace")
+	})
+})
diff --git a/test/e2e/resources_test.go b/test/e2e/resources_test.go
index b4cb066cf..f096b52c0 100644
--- a/test/e2e/resources_test.go
+++ b/test/e2e/resources_test.go
@@ -54,6 +54,7 @@ const (
 	clusterStagedUpdateRunNameWithSubIndexTemplate = "cur-%d-%d"
 	stagedUpdateRunStrategyNameTemplate            = "sus-%d"
 	stagedUpdateRunNameWithSubIndexTemplate        = "sur-%d-%d"
+	svcAccountNameTemplate                         = "sa-%d"
 
 	customDeletionBlockerFinalizer = "kubernetes-fleet.io/custom-deletion-blocker-finalizer"
 	workNamespaceLabelName         = "process"
diff --git a/test/e2e/utils_test.go b/test/e2e/utils_test.go
index 9e7da13b3..48a7bdc0e 100644
--- a/test/e2e/utils_test.go
+++ b/test/e2e/utils_test.go
@@ -1749,7 +1749,7 @@ func checkIfStatusErrorWithMessage(err error, errorMsg string) error {
 			return nil
 		}
 	}
-	return fmt.Errorf("error message %s not found in error %w", errorMsg, err)
+	return fmt.Errorf("error message (%s) not found in given error (%w)", errorMsg, err)
 }
 
 // buildOwnerReference builds an owner reference given a cluster and a placement name.

From 359d2b2f722b200bd7e190ffc3e5b76ef5cbd466 Mon Sep 17 00:00:00 2001
From: Simon Waight <simon.waight@gmail.com>
Date: Tue, 12 May 2026 13:55:51 +1000
Subject: [PATCH 13/19] chore: update community meetings. (#701)

Update community meetings.

Signed-off-by: Simon Waight <simon.waight@microsoft.com>
---
 README.md | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index d5782ce8e..776e3ecd4 100644
--- a/README.md
+++ b/README.md
@@ -9,13 +9,14 @@
 ![cncf_logo](screenshots/cncf-logo.png)
 
 KubeFleet is a sandbox project of the [Cloud Native Computing Foundation](https://cncf.io/) (CNCF) that works on any Kubernetes cluster.
-We are working towards the vision where we will eventually be able to treat each Kubernetes cluster as a [cattle](https://cloudscaling.com/blog/cloud-computing/the-history-of-pets-vs-cattle/).
+
+We are working towards the vision to enable you treat your Kubernetes clusters as a flexible container runtime environment where workloads can run anywhere within your fleet.
 
 ## What is KubeFleet?
-KubeFleet contains a set of Kubernetes controllers and CRDs which provide an advanced cloud-native solution for multi-cluster application management.
 
-Use KubeFleet to schedule workloads smartly, roll out changes progressively, and perform administrative tasks easily, across a group of Kubernetes clusters on any cloud or on-premises clusters.
+KubeFleet contains a set of Kubernetes controllers and CRDs which provide an advanced cloud-native solution for multi-cluster application management.
 
+Use KubeFleet to schedule workloads intelligently, roll out changes progressively, and perform administrative tasks easily, across a group of Kubernetes clusters in any location.
 
 ## Quickstart
 
@@ -24,12 +25,15 @@ Use KubeFleet to schedule workloads smartly, roll out changes progressively, and
 ## Key benefits and capabilities
 
 ### Centralized policy-driven fleet governance
+
 KubeFleet utilizes a hub-spoke architecture that creates a single control plane for the fleet. It allows fleet administrators to apply uniform cloud native policies on every member cluster, whether they reside in public clouds, private data centers, or edge locations. This greatly simplifies governance across large, geographically distributed fleets spanning hybrid and multi-cloud environments.
 
 ### Progressive Rollouts with Safeguards
+
 KubeFleet provides a cloud native progressive rollout plans sequence updates across the entire fleet with health verification at each step. The application owner can pause or rollback to any previous versions when they observe failures, limiting blast radius. This keeps multi-cluster application deployments reliable and predictable spanning edge, on-premises, and cloud environments.
 
 ### Powerful Multi-Cluster Scheduling
+
 KubeFleet's scheduler evaluates member cluster properties, available capacity, and declarative placement policies to select optimal destinations for workloads. It supports cluster affinity and anti-affinity rules, topology spread constraints to distribute workloads across failure domains, and resource-based placement to ensure sufficient compute, memory, and storage. The scheduler continuously reconciles as fleet conditions change, automatically adapting to cluster additions, removals, or capacity shifts across edge, on-premises, and cloud environments. For more details, please refer to the [KubeFleet website](https://kubefleet.dev/docs/).
 
 ## Documentation
@@ -46,22 +50,14 @@ You can reach the KubeFleet community and developers via the following channels:
 
 ## Community Meetings
 
-March 2026: we're currently revamping our community call schedule and will have more to share soon.
-
-Future plans will land on our [community repository](https://github.com/kubefleet-dev/community).
+Please refer to the [calendar](https://zoom-lfx.platform.linuxfoundation.org/meetings/kubefleet?view=month) for the latest schedule. 
 
-<!--
-We host bi-weekly community meetings that alternate between US/EU and APAC friendly time. In these sessions the community will showcase demos and discuss the current and future state of the project.
+We aim to hold one meeting per month for each of our US/EU and APAC/India communities.
 
-Please refer to the [calendar](https://zoom-lfx.platform.linuxfoundation.org/meetings/kubefleet?view=month) for the latest schedule:
-* Wednesdays at 09:30am PT [US/EU](https://zoom-lfx.platform.linuxfoundation.org/meeting/93624014488?password=27667a5c-9238-4b5a-b4d8-96daadaa9fa4) (weekly). [Convert to your timezone](https://dateful.com/convert/pacific-time-pt?t=930am).
-* Thursday at 9:00am CST [APAC](https://zoom-lfx.platform.linuxfoundation.org/meeting/98901589453?password=9ab588fd-1214-40c3-84c2-757c124e984f) (biweekly).  [Convert to your timezone](https://dateful.com/convert/beijing-china?t=9am).
-
-For more meeting information, minutes and recordings, please see the [KubeFleet community meeting doc](https://docs.google.com/document/d/1iMcHn11fPlb9ZGoMHiGEBvdIc44W1CjZvsPH3eBg6pA/edit?usp=sharing).
-
--->
+For more meeting information please see the [KubeFleet community repository](https://github.com/kubefleet-dev/community#community-meetings).
 
 ## Code of Conduct
+
 Participation in KubeFleet is governed by the [CNCF Code of Conduct](https://github.com/cncf/foundation/blob/master/code-of-conduct.md). See the [Code of Conduct](CODE_OF_CONDUCT.md) for more information.
 
 ## Contributing
@@ -69,8 +65,8 @@ Participation in KubeFleet is governed by the [CNCF Code of Conduct](https://git
 The [contribution guide](CONTRIBUTING.md) covers everything you need to know about how you can contribute to KubeFleet.
 
 ## Support
-For more information, see [SUPPORT](SUPPORT.md).
 
+For more information, see [SUPPORT](SUPPORT.md).
 
 [1]:  https://img.shields.io/github/v/release/kubefleet-dev/kubefleet
 [2]:  https://goreportcard.com/badge/go.goms.io/fleet
@@ -79,4 +75,5 @@ For more information, see [SUPPORT](SUPPORT.md).
 [5]:  https://img.shields.io/github/go-mod/go-version/kubefleet-dev/kubefleet
 
 Copyright The KubeFleet Authors.
-The Linux Foundation® (TLF) has registered trademarks and uses trademarks. For a list of TLF trademarks, see [Trademark Usage](https://www.linuxfoundation.org/trademark-usage/).
+
+The Linux Foundation® (TLF) has registered trademarks and uses trademarks. For a list of TLF trademarks, see [Trademark Usage](https://www.linuxfoundation.org/trademark-usage/).
\ No newline at end of file

From e12aa9f38cc13332c228170669d95d215518d5ab Mon Sep 17 00:00:00 2001
From: Wantong <jwtty0919@gmail.com>
Date: Mon, 11 May 2026 22:27:58 -0700
Subject: [PATCH 14/19] fix: add securityContext to hub-agent and member-agent
 deployment (#703)

---
 charts/hub-agent/templates/deployment.yaml    | 23 +++++++++++++++----
 charts/member-agent/templates/deployment.yaml | 19 +++++++++++++++
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/charts/hub-agent/templates/deployment.yaml b/charts/hub-agent/templates/deployment.yaml
index c1acd945c..3bdca63fa 100644
--- a/charts/hub-agent/templates/deployment.yaml
+++ b/charts/hub-agent/templates/deployment.yaml
@@ -19,10 +19,23 @@ spec:
         {{- include "hub-agent.selectorLabels" . | nindent 8 }}
     spec:
       serviceAccountName: {{ include "hub-agent.fullname" . }}-sa
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 65532
+        runAsGroup: 65532
+        fsGroup: 65532
+        seccompProfile:
+          type: RuntimeDefault
       containers:
         - name: {{ .Chart.Name }}
           image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
           imagePullPolicy: {{ .Values.image.pullPolicy }}
+          securityContext:
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop:
+                - ALL
           args:
             - --leader-elect=true
             - --enable-webhook={{ .Values.enableWebhook }}
@@ -77,22 +90,24 @@ spec:
                 fieldPath: metadata.namespace
           resources:
             {{- toYaml .Values.resources | nindent 12 }}
-          {{- if .Values.useCertManager }}
           volumeMounts:
           - name: webhook-cert
             # This path must match FleetWebhookCertDir in pkg/webhook/webhook.go
             mountPath: /tmp/k8s-webhook-server/serving-certs
+            {{- if .Values.useCertManager }}
             readOnly: true
-          {{- end }}
-      {{- if .Values.useCertManager }}
+            {{- end }}
       volumes:
       - name: webhook-cert
+        {{- if .Values.useCertManager }}
         secret:
           secretName: {{ .Values.webhookCertSecretName }}
           # defaultMode 0444 (read for all) allows the container process to read the certs
           # regardless of the user/group it runs as
           defaultMode: 0444
-      {{- end }}
+        {{- else }}
+        emptyDir: {}
+        {{- end }}
       {{- with .Values.affinity }}
       affinity:
         {{- toYaml . | nindent 8 }}
diff --git a/charts/member-agent/templates/deployment.yaml b/charts/member-agent/templates/deployment.yaml
index ae83e5d95..30b564c4d 100644
--- a/charts/member-agent/templates/deployment.yaml
+++ b/charts/member-agent/templates/deployment.yaml
@@ -16,10 +16,23 @@ spec:
     spec:
       restartPolicy: Always
       serviceAccountName: {{ include "member-agent.fullname" . }}-sa
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 65532
+        runAsGroup: 65532
+        fsGroup: 65532
+        seccompProfile:
+          type: RuntimeDefault
       containers:
         - name: {{ include "member-agent.fullname" . }}
           image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
           imagePullPolicy: {{ .Values.image.pullPolicy }}
+          securityContext:
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop:
+                - ALL
           ports:
             - name: http
               containerPort: 80
@@ -140,6 +153,12 @@ spec:
         - name: refresh-token
           image: "{{ .Values.refreshtoken.repository }}:{{ .Values.refreshtoken.tag | default .Chart.AppVersion }}"
           imagePullPolicy: {{ .Values.refreshtoken.pullPolicy }}
+          securityContext:
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop:
+                - ALL
           args:
             {{- $provider := .Values.config.provider }}
             - {{ $provider }}

From 57640d658fbc9e4f6394cd688539d1d8a87e3340 Mon Sep 17 00:00:00 2001
From: Yetkin Timocin <ytimocin@microsoft.com>
Date: Tue, 12 May 2026 13:20:21 -0700
Subject: [PATCH 15/19] test: cover snapshot cross-scope isolation in
 DeleteResourceSnapshots (#669)

---
 go.mod                                        |  2 +-
 .../resource_snapshot_resolver_test.go        | 65 +++++++++++++------
 2 files changed, 45 insertions(+), 22 deletions(-)

diff --git a/go.mod b/go.mod
index 8dfac998d..1b36cfd48 100644
--- a/go.mod
+++ b/go.mod
@@ -8,6 +8,7 @@ require (
 	github.com/Azure/karpenter-provider-azure v1.5.1
 	github.com/crossplane/crossplane-runtime/v2 v2.1.0
 	github.com/evanphx/json-patch/v5 v5.9.11
+	github.com/go-logr/logr v1.4.3
 	github.com/google/go-cmp v0.7.0
 	github.com/onsi/ginkgo/v2 v2.23.4
 	github.com/onsi/gomega v1.37.0
@@ -65,7 +66,6 @@ require (
 	github.com/fsnotify/fsnotify v1.9.0 // indirect
 	github.com/fxamacker/cbor/v2 v2.9.0 // indirect
 	github.com/go-errors/errors v1.4.2 // indirect
-	github.com/go-logr/logr v1.4.3 // indirect
 	github.com/go-logr/zapr v1.3.0 // indirect
 	github.com/go-openapi/jsonpointer v0.21.1 // indirect
 	github.com/go-openapi/jsonreference v0.21.0 // indirect
diff --git a/pkg/utils/controller/resource_snapshot_resolver_test.go b/pkg/utils/controller/resource_snapshot_resolver_test.go
index 22670ad81..16b8d6941 100644
--- a/pkg/utils/controller/resource_snapshot_resolver_test.go
+++ b/pkg/utils/controller/resource_snapshot_resolver_test.go
@@ -1683,10 +1683,12 @@ func TestDeleteResourceSnapshots(t *testing.T) {
 	}
 
 	tests := []struct {
-		name          string
-		placementObj  fleetv1beta1.PlacementObj
-		objects       []client.Object
-		expectedError string
+		name         string
+		placementObj fleetv1beta1.PlacementObj
+		objects      []client.Object
+		// Snapshots that must still exist after the delete. Nil means every seeded object should be deleted.
+		wantSurvivors []types.NamespacedName
+		wantError     string
 	}{
 		{
 			name: "delete resource snapshots - namespaced",
@@ -1770,8 +1772,8 @@ func TestDeleteResourceSnapshots(t *testing.T) {
 						},
 					},
 				},
-				// Namespaced snapshots with same placement name (should NOT be deleted)
-				// TODO: find a way to test this
+				// Namespaced snapshot sharing the same placement name; cross-scope isolation
+				// requires it to survive a cluster-scoped delete.
 				&fleetv1beta1.ResourceSnapshot{
 					ObjectMeta: metav1.ObjectMeta{
 						Name:      "test-namespaced-snapshot",
@@ -1782,6 +1784,9 @@ func TestDeleteResourceSnapshots(t *testing.T) {
 					},
 				},
 			},
+			wantSurvivors: []types.NamespacedName{
+				{Namespace: "test-namespace", Name: "test-namespaced-snapshot"},
+			},
 		},
 		{
 			name: "mixed environment - delete namespaced snapshots only",
@@ -1792,8 +1797,8 @@ func TestDeleteResourceSnapshots(t *testing.T) {
 				},
 			},
 			objects: []client.Object{
-				// Cluster-scoped snapshots with same placement name (should NOT be deleted)
-				// TODO: find a way to test this
+				// Cluster-scoped snapshot sharing the same placement name; cross-scope isolation
+				// requires it to survive a namespaced delete.
 				&fleetv1beta1.ClusterResourceSnapshot{
 					ObjectMeta: metav1.ObjectMeta{
 						Name: "test-cluster-snapshot",
@@ -1832,6 +1837,12 @@ func TestDeleteResourceSnapshots(t *testing.T) {
 					},
 				},
 			},
+			wantSurvivors: []types.NamespacedName{
+				// Cluster-scoped survivor (different scope, same placement name).
+				{Name: "test-cluster-snapshot"},
+				// Different-namespace survivor (same scope, different namespace).
+				{Namespace: "other-namespace", Name: "other-namespaced-snapshot"},
+			},
 		},
 	}
 
@@ -1841,12 +1852,12 @@ func TestDeleteResourceSnapshots(t *testing.T) {
 
 			err := DeleteResourceSnapshots(context.Background(), k8Client, tt.placementObj)
 
-			if tt.expectedError != "" {
+			if tt.wantError != "" {
 				if err == nil {
 					t.Fatalf("Expected error but got nil")
 				}
-				if !strings.Contains(err.Error(), tt.expectedError) {
-					t.Errorf("Expected error to contain: %s, but got: %v", tt.expectedError, err)
+				if !strings.Contains(err.Error(), tt.wantError) {
+					t.Errorf("Expected error to contain: %s, but got: %v", tt.wantError, err)
 				}
 				return
 			}
@@ -1855,18 +1866,30 @@ func TestDeleteResourceSnapshots(t *testing.T) {
 				t.Fatalf("Expected no error but got: %v", err)
 			}
 
-			// Verify snapshots were deleted by checking they no longer exist
-			placementKey := types.NamespacedName{
-				Name:      tt.placementObj.GetName(),
-				Namespace: tt.placementObj.GetNamespace(),
+			// Unfiltered list of both scopes; exact match guards the cross-scope regression.
+			var clusterList fleetv1beta1.ClusterResourceSnapshotList
+			if err := k8Client.List(context.Background(), &clusterList); err != nil {
+				t.Fatalf("Failed to list ClusterResourceSnapshots after deletion: %v", err)
 			}
-			result, err := ListAllResourceSnapshots(context.Background(), k8Client, placementKey)
-			if err != nil {
-				t.Fatalf("Expected no error when listing snapshots after deletion, but got: %v", err)
+			var nsList fleetv1beta1.ResourceSnapshotList
+			if err := k8Client.List(context.Background(), &nsList); err != nil {
+				t.Fatalf("Failed to list ResourceSnapshots after deletion: %v", err)
 			}
-
-			if len(result.GetResourceSnapshotObjs()) != 0 {
-				t.Errorf("Expected 0 resource snapshots after deletion, got %d", len(result.GetResourceSnapshotObjs()))
+			got := make([]types.NamespacedName, 0, len(clusterList.Items)+len(nsList.Items))
+			for _, s := range clusterList.Items {
+				got = append(got, types.NamespacedName{Name: s.Name})
+			}
+			for _, s := range nsList.Items {
+				got = append(got, types.NamespacedName{Namespace: s.Namespace, Name: s.Name})
+			}
+			less := func(a, b types.NamespacedName) bool {
+				if a.Namespace != b.Namespace {
+					return a.Namespace < b.Namespace
+				}
+				return a.Name < b.Name
+			}
+			if diff := cmp.Diff(tt.wantSurvivors, got, cmpopts.SortSlices(less), cmpopts.EquateEmpty()); diff != "" {
+				t.Errorf("surviving snapshots mismatch (-want +got):\n%s", diff)
 			}
 		})
 	}

From 63da14cf6db5f579aabc9120f44723073c22f563 Mon Sep 17 00:00:00 2001
From: Yetkin Timocin <ytimocin@microsoft.com>
Date: Tue, 12 May 2026 20:03:54 -0700
Subject: [PATCH 16/19] fix: specify which object failed in override error
 message (#686)

* fix: specify which object failed in override error message

Signed-off-by: Yetkin Timocin <ytimocin@microsoft.com>

* fix: harden formatOverrideTarget fallback and align IsClusterMatched log

Address review feedback on PR #686:

- formatOverrideTarget: fall back to "Unknown" when the unstructured
  target has an empty Kind, so a malformed snapshot doesn't render as
  `"" "name"` in user-facing error messages.
- applyOverrideRules: drop the NewUnexpectedBehaviorError log wrapper on
  the IsClusterMatched failure path. The outer applyOverrides wrap
  already tags this as ErrUserError, so classifying the log line as
  "unexpected" contradicted the user-error classification.
- applyOverrides: switch the outer wrap from the inline
  fmt.Errorf("%w: ...", controller.ErrUserError, ...) form to
  controller.NewUserError(fmt.Errorf(...)) to match the form used by
  every other call site in the codebase. Behaviorally identical
  (same Error() string, same errors.Is, same trim output at
  workgenerator/controller.go:188-194).
- Add TestFormatOverrideTarget covering namespaced, cluster-scoped, and
  the new empty-Kind fallback (both scopes).
- Trim the verbose doc and test comments introduced by this PR.

Signed-off-by: Yetkin Timocin <ytimocin@microsoft.com>

---------

Signed-off-by: Yetkin Timocin <ytimocin@microsoft.com>
---
 .../controller_integration_test.go            | 145 ++++++++---
 pkg/controllers/workgenerator/override.go     |  54 ++--
 .../workgenerator/override_test.go            | 238 +++++++++++++++---
 pkg/controllers/workgenerator/suite_test.go   |  42 ++++
 4 files changed, 387 insertions(+), 92 deletions(-)

diff --git a/pkg/controllers/workgenerator/controller_integration_test.go b/pkg/controllers/workgenerator/controller_integration_test.go
index 1e7e64b1a..19bbafb9f 100644
--- a/pkg/controllers/workgenerator/controller_integration_test.go
+++ b/pkg/controllers/workgenerator/controller_integration_test.go
@@ -57,10 +57,12 @@ var (
 	validClusterResourceOverrideSnapshotName   = "cro-1"
 	validResourceOverrideSnapshotName          = "ro-1"
 	invalidClusterResourceOverrideSnapshotName = "cro-2" // the overridden manifest is invalid
+	invalidResourceOverrideSnapshotName        = "ro-2"  // the overridden manifest is invalid
 
 	validClusterResourceOverrideSnapshot   placementv1beta1.ClusterResourceOverrideSnapshot
 	validResourceOverrideSnapshot          placementv1beta1.ResourceOverrideSnapshot
 	invalidClusterResourceOverrideSnapshot placementv1beta1.ClusterResourceOverrideSnapshot
+	invalidResourceOverrideSnapshot        placementv1beta1.ResourceOverrideSnapshot
 
 	bindingStatusCmpOpts = cmp.Options{
 		cmpopts.SortSlices(utils.LessFuncConditionByType),
@@ -280,7 +282,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 			// delete the binding
 			Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
 			Expect(k8sClient.Delete(ctx, binding)).Should(Succeed())
-			By(fmt.Sprintf("resource binding  %s is deleted", binding.Name))
+			By(fmt.Sprintf("resource binding %s is deleted", binding.Name))
 			// check the binding is deleted
 			Eventually(func() bool {
 				err := k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)
@@ -302,7 +304,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceCRD, testNameSpace, testResource,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name))
 				spec := placementv1beta1.ResourceBindingSpec{
 					State:                placementv1beta1.BindingStateBound,
 					ResourceSnapshotName: masterSnapshot.Name,
@@ -383,7 +385,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 				Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
 				binding.Spec.State = placementv1beta1.BindingStateUnscheduled
 				Expect(k8sClient.Update(ctx, binding)).Should(Succeed())
-				By(fmt.Sprintf("resource binding  %s updated to be unscheduled", binding.Name))
+				By(fmt.Sprintf("resource binding %s updated to be unscheduled", binding.Name))
 				updateRolloutStartedGeneration(&binding)
 				Consistently(func() error {
 					return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work)
@@ -589,7 +591,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testConfigMap, testResourceEnvelope, testResourceCRD, testNameSpace,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name))
 				spec := placementv1beta1.ResourceBindingSpec{
 					State:                placementv1beta1.BindingStateBound,
 					ResourceSnapshotName: masterSnapshot.Name,
@@ -692,12 +694,12 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceEnvelope2, testResourceCRD, testNameSpace,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("another master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("another master resource snapshot %s created", masterSnapshot.Name))
 				// update binding
 				Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
 				binding.Spec.ResourceSnapshotName = masterSnapshot.Name
 				Expect(k8sClient.Update(ctx, binding)).Should(Succeed())
-				By(fmt.Sprintf("resource binding  %s updated", binding.Name))
+				By(fmt.Sprintf("resource binding %s updated", binding.Name))
 				updateRolloutStartedGeneration(&binding)
 				// check the binding status till the bound condition is true for the second generation
 				Eventually(func() bool {
@@ -711,7 +713,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					return condition.IsConditionStatusTrue(
 						meta.FindStatusCondition(binding.Status.Conditions, string(placementv1beta1.ResourceBindingWorkSynchronized)), binding.GetGeneration())
 				}, timeout, interval).Should(BeTrue(), fmt.Sprintf("binding(%s) condition should be true", binding.Name))
-				By(fmt.Sprintf("resource binding  %s is reconciled", binding.Name))
+				By(fmt.Sprintf("resource binding %s is reconciled", binding.Name))
 				// check the work that contains none enveloped object is updated
 				work := placementv1beta1.Work{}
 				Eventually(func() error {
@@ -790,12 +792,12 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceCRD, testNameSpace,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("another master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("another master resource snapshot %s created", masterSnapshot.Name))
 				// update binding
 				Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
 				binding.Spec.ResourceSnapshotName = masterSnapshot.Name
 				Expect(k8sClient.Update(ctx, binding)).Should(Succeed())
-				By(fmt.Sprintf("resource binding  %s updated", binding.Name))
+				By(fmt.Sprintf("resource binding %s updated", binding.Name))
 				updateRolloutStartedGeneration(&binding)
 				// check the binding status till the bound condition is true for the second binding generation
 				Eventually(func() bool {
@@ -809,7 +811,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					return condition.IsConditionStatusTrue(
 						meta.FindStatusCondition(binding.Status.Conditions, string(placementv1beta1.ResourceBindingWorkSynchronized)), binding.GetGeneration())
 				}, timeout, interval).Should(BeTrue(), fmt.Sprintf("binding(%s) condition should be true", binding.Name))
-				By(fmt.Sprintf("resource binding  %s is reconciled", binding.Name))
+				By(fmt.Sprintf("resource binding %s is reconciled", binding.Name))
 				// check the enveloped work is deleted
 				Eventually(func() error {
 					envelopWorkLabelMatcher := client.MatchingLabels{
@@ -839,7 +841,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testClusterScopedEnvelope, testResourceCRD, testNameSpace,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name))
 				spec := placementv1beta1.ResourceBindingSpec{
 					State:                placementv1beta1.BindingStateBound,
 					ResourceSnapshotName: masterSnapshot.Name,
@@ -942,12 +944,12 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceCRD, testNameSpace,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("another master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("another master resource snapshot %s created", masterSnapshot.Name))
 				// update binding
 				Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
 				binding.Spec.ResourceSnapshotName = masterSnapshot.Name
 				Expect(k8sClient.Update(ctx, binding)).Should(Succeed())
-				By(fmt.Sprintf("resource binding  %s updated", binding.Name))
+				By(fmt.Sprintf("resource binding %s updated", binding.Name))
 				updateRolloutStartedGeneration(&binding)
 				// check the binding status till the bound condition is true for the second binding generation
 				Eventually(func() bool {
@@ -961,7 +963,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					return condition.IsConditionStatusTrue(
 						meta.FindStatusCondition(binding.Status.Conditions, string(placementv1beta1.ResourceBindingWorkSynchronized)), binding.GetGeneration())
 				}, timeout, interval).Should(BeTrue(), fmt.Sprintf("binding(%s) condition should be true", binding.Name))
-				By(fmt.Sprintf("resource binding  %s is reconciled", binding.Name))
+				By(fmt.Sprintf("resource binding %s is reconciled", binding.Name))
 				// check the enveloped work is deleted
 				Eventually(func() error {
 					envelopWorkLabelMatcher := client.MatchingLabels{
@@ -990,7 +992,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceCRD, testNameSpace, testResource,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name))
 				// create binding
 				spec := placementv1beta1.ResourceBindingSpec{
 					State:                placementv1beta1.BindingStateBound,
@@ -1003,19 +1005,19 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testConfigMap, testPdb,
 				})
 				Expect(k8sClient.Create(ctx, secondSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("secondary resource snapshot  %s created", secondSnapshot.Name))
+				By(fmt.Sprintf("secondary resource snapshot %s created", secondSnapshot.Name))
 			})
 
 			AfterEach(func() {
 				// delete the master resource snapshot
 				Expect(k8sClient.Delete(ctx, masterSnapshot)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{}))
-				By(fmt.Sprintf("master resource snapshot  %s deleted", masterSnapshot.Name))
+				By(fmt.Sprintf("master resource snapshot %s deleted", masterSnapshot.Name))
 				// delete the second resource snapshot
 				Expect(k8sClient.Delete(ctx, secondSnapshot)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{}))
-				By(fmt.Sprintf("secondary resource snapshot  %s deleted", secondSnapshot.Name))
+				By(fmt.Sprintf("secondary resource snapshot %s deleted", secondSnapshot.Name))
 				// delete the binding
 				Expect(k8sClient.Delete(ctx, binding)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{}))
-				By(fmt.Sprintf("resource binding  %s deleted", binding.Name))
+				By(fmt.Sprintf("resource binding %s deleted", binding.Name))
 			})
 
 			It("Should create all the work in the target namespace after all the resource snapshot are created", func() {
@@ -1033,7 +1035,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 				}
 				diff := cmp.Diff(expectedManifest, work.Spec.Workload.Manifests)
 				Expect(diff).Should(BeEmpty(), fmt.Sprintf("work manifest(%s) mismatch (-want +got):\n%s", work.Name, diff))
-				// check the work for the secondary resource snapshot is created, it's name is crp-subindex
+				// check the work for the secondary resource snapshot is created, its name is crp-subindex
 				secondWork := placementv1beta1.Work{}
 				Eventually(func() error {
 					return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.WorkNameWithSubindexFmt, testCRPName, 1), Namespace: memberClusterNamespaceName}, &secondWork)
@@ -1099,7 +1101,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 				}
 				diff := cmp.Diff(expectedManifest, work.Spec.Workload.Manifests)
 				Expect(diff).Should(BeEmpty(), fmt.Sprintf("work manifest(%s) mismatch (-want +got):\n%s", work.Name, diff))
-				// check the work for the secondary resource snapshot is created, it's name is crp-subindex
+				// check the work for the secondary resource snapshot is created, its name is crp-subindex
 				secondWork := placementv1beta1.Work{}
 				Eventually(func() error {
 					return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.WorkNameWithSubindexFmt, testCRPName, 1), Namespace: memberClusterNamespaceName}, &secondWork)
@@ -1155,7 +1157,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work)
 				}, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster")
 				By(fmt.Sprintf("first work %s is created in %s", work.Name, work.Namespace))
-				// check the work for the secondary resource snapshot is created, it's name is crp-subindex
+				// check the work for the secondary resource snapshot is created, its name is crp-subindex
 				Eventually(func() error {
 					return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.WorkNameWithSubindexFmt, testCRPName, 1), Namespace: memberClusterNamespaceName}, &work)
 				}, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster")
@@ -1165,25 +1167,25 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceCRD, testNameSpace,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("new master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("new master resource snapshot %s created", masterSnapshot.Name))
 				// update binding
 				Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
 				binding.Spec.ResourceSnapshotName = masterSnapshot.Name
 				Expect(k8sClient.Update(ctx, binding)).Should(Succeed())
 				updateRolloutStartedGeneration(&binding)
-				By(fmt.Sprintf("resource binding  %s updated", binding.Name))
+				By(fmt.Sprintf("resource binding %s updated", binding.Name))
 				// Now create the second resource snapshot
 				secondSnapshot = generateClusterResourceSnapshot(3, 3, 1, [][]byte{
 					testResource, testConfigMap,
 				})
 				Expect(k8sClient.Create(ctx, secondSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("new secondary resource snapshot  %s created", secondSnapshot.Name))
+				By(fmt.Sprintf("new secondary resource snapshot %s created", secondSnapshot.Name))
 				// Now create the third resource snapshot
 				thirdSnapshot := generateClusterResourceSnapshot(3, 3, 2, [][]byte{
 					testPdb,
 				})
 				Expect(k8sClient.Create(ctx, thirdSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("third resource snapshot  %s created", secondSnapshot.Name))
+				By(fmt.Sprintf("third resource snapshot %s created", secondSnapshot.Name))
 				// check the work for the master resource snapshot is created
 				expectedManifest := []placementv1beta1.Manifest{
 					{RawExtension: runtime.RawExtension{Raw: testResourceCRD}},
@@ -1201,7 +1203,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					return nil
 				}, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster")
 				By(fmt.Sprintf("first work %s is updated in %s", work.Name, work.Namespace))
-				// check the work for the secondary resource snapshot is created, it's name is crp-subindex
+				// check the work for the secondary resource snapshot is created, its name is crp-subindex
 				expectedManifest = []placementv1beta1.Manifest{
 					{RawExtension: runtime.RawExtension{Raw: testResource}},
 					{RawExtension: runtime.RawExtension{Raw: testConfigMap}},
@@ -1220,7 +1222,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					return nil
 				}, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster")
 				By(fmt.Sprintf("second work %s is updated in %s", work.Name, work.Namespace))
-				// check the work for the third resource snapshot is created, it's name is crp-subindex
+				// check the work for the third resource snapshot is created, its name is crp-subindex
 				expectedManifest = []placementv1beta1.Manifest{
 					{RawExtension: runtime.RawExtension{Raw: testPdb}},
 				}
@@ -1247,7 +1249,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work)
 				}, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster")
 				By(fmt.Sprintf("first work %s is created in %s", work.Name, work.Namespace))
-				// check the work for the secondary resource snapshot is created, it's name is crp-subindex
+				// check the work for the secondary resource snapshot is created, its name is crp-subindex
 				Eventually(func() error {
 					return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.WorkNameWithSubindexFmt, testCRPName, 1), Namespace: memberClusterNamespaceName}, &work)
 				}, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster")
@@ -1257,13 +1259,13 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceCRD, testNameSpace, testResource, testConfigMap, testPdb,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("new master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("new master resource snapshot %s created", masterSnapshot.Name))
 				// update binding
 				Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
 				binding.Spec.ResourceSnapshotName = masterSnapshot.Name
 				Expect(k8sClient.Update(ctx, binding)).Should(Succeed())
 				updateRolloutStartedGeneration(&binding)
-				By(fmt.Sprintf("resource binding  %s updated", binding.Name))
+				By(fmt.Sprintf("resource binding %s updated", binding.Name))
 				//inspect the work manifest that should have been updated to contain all
 				expectedManifest := []placementv1beta1.Manifest{
 					{RawExtension: runtime.RawExtension{Raw: testResourceCRD}},
@@ -1301,7 +1303,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work)
 				}, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster")
 				By(fmt.Sprintf("first work %s is created in %s", work.Name, work.Namespace))
-				// check the work for the secondary resource snapshot is created, it's name is crp-subindex
+				// check the work for the secondary resource snapshot is created, its name is crp-subindex
 				work2 := placementv1beta1.Work{}
 				Eventually(func() error {
 					return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.WorkNameWithSubindexFmt, testCRPName, 1), Namespace: memberClusterNamespaceName}, &work2)
@@ -1310,7 +1312,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 				// delete the binding
 				Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
 				Expect(k8sClient.Delete(ctx, binding)).Should(Succeed())
-				By(fmt.Sprintf("resource binding  %s is deleted", binding.Name))
+				By(fmt.Sprintf("resource binding %s is deleted", binding.Name))
 
 				// verify that all associated works have been deleted
 				Eventually(func() error {
@@ -1352,7 +1354,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceCRD, testNameSpace, testResource,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name))
 				crolist := []string{validClusterResourceOverrideSnapshotName}
 				roList := []placementv1beta1.NamespacedName{
 					{
@@ -1446,7 +1448,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 				Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
 				binding.Spec.State = placementv1beta1.BindingStateUnscheduled
 				Expect(k8sClient.Update(ctx, binding)).Should(Succeed())
-				By(fmt.Sprintf("resource binding  %s updated to be unscheduled", binding.Name))
+				By(fmt.Sprintf("resource binding %s updated to be unscheduled", binding.Name))
 				updateRolloutStartedGeneration(&binding)
 				Consistently(func() error {
 					return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work)
@@ -1472,7 +1474,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceCRD, testNameSpace, testResource,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name))
 				spec := placementv1beta1.ResourceBindingSpec{
 					State:                placementv1beta1.BindingStateBound,
 					ResourceSnapshotName: masterSnapshot.Name,
@@ -1519,7 +1521,70 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					}
 					return cmp.Diff(wantStatus, binding.Status, cmpConditionOption)
 				}, timeout, interval).Should(BeEmpty(), fmt.Sprintf("binding(%s) mismatch (-want +got)", binding.Name))
-				Expect(binding.GetCondition(string(placementv1beta1.ResourceBindingOverridden)).Message).Should(ContainSubstring("Failed to apply the override rules on the resources: add operation does not apply"))
+				message := binding.GetCondition(string(placementv1beta1.ResourceBindingOverridden)).Message
+				Expect(message).Should(MatchRegexp(`ClusterResourceOverrideSnapshot "[^"]+" failed to apply on \w+ "[^"]+".*: .*add operation does not apply`))
+			})
+		})
+
+		Context("Test Bound ClusterResourceBinding with a single resource snapshot and invalid resource override", func() {
+			var masterSnapshot *placementv1beta1.ClusterResourceSnapshot
+
+			BeforeEach(func() {
+				masterSnapshot = generateClusterResourceSnapshot(1, 1, 0, [][]byte{
+					testResourceCRD, testNameSpace, testResource,
+				})
+				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
+				By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name))
+				spec := placementv1beta1.ResourceBindingSpec{
+					State:                placementv1beta1.BindingStateBound,
+					ResourceSnapshotName: masterSnapshot.Name,
+					TargetCluster:        memberClusterName,
+					ResourceOverrideSnapshots: []placementv1beta1.NamespacedName{
+						{
+							Name:      invalidResourceOverrideSnapshotName,
+							Namespace: appNamespaceName,
+						},
+					},
+				}
+				createClusterResourceBinding(&binding, spec)
+			})
+
+			AfterEach(func() {
+				By("Deleting master clusterResourceSnapshot")
+				Expect(k8sClient.Delete(ctx, masterSnapshot)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{}))
+			})
+
+			It("Should not create the work in the target namespace and surface ResourceOverrideSnapshot identity in the condition message", func() {
+				work := placementv1beta1.Work{}
+				Consistently(func() bool {
+					err := k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work)
+					return errors.IsNotFound(err)
+				}, duration, interval).Should(BeTrue(), "controller should not create work in hub cluster when an override fails")
+				Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
+				Expect(len(binding.Finalizers)).Should(Equal(1))
+
+				Eventually(func() string {
+					Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
+					wantStatus := placementv1beta1.ResourceBindingStatus{
+						Conditions: []metav1.Condition{
+							{
+								Type:               string(placementv1beta1.ResourceBindingRolloutStarted),
+								Status:             metav1.ConditionTrue,
+								Reason:             condition.RolloutStartedReason,
+								ObservedGeneration: binding.GetGeneration(),
+							},
+							{
+								Type:               string(placementv1beta1.ResourceBindingOverridden),
+								Status:             metav1.ConditionFalse,
+								Reason:             condition.OverriddenFailedReason,
+								ObservedGeneration: binding.GetGeneration(),
+							},
+						},
+					}
+					return cmp.Diff(wantStatus, binding.Status, cmpConditionOption)
+				}, timeout, interval).Should(BeEmpty(), fmt.Sprintf("binding(%s) mismatch (-want +got)", binding.Name))
+				message := binding.GetCondition(string(placementv1beta1.ResourceBindingOverridden)).Message
+				Expect(message).Should(MatchRegexp(`ResourceOverrideSnapshot "[^"]+" failed to apply on \w+ "[^"]+".*: .*add operation does not apply`))
 			})
 		})
 
@@ -1531,7 +1596,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceCRD, testNameSpace, testResource,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name))
 				spec := placementv1beta1.ResourceBindingSpec{
 					State:                placementv1beta1.BindingStateBound,
 					ResourceSnapshotName: masterSnapshot.Name,
@@ -1588,7 +1653,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceCRD, testNameSpace, testResource,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name))
 				spec := placementv1beta1.ResourceBindingSpec{
 					State:                placementv1beta1.BindingStateBound,
 					ResourceSnapshotName: masterSnapshot.Name,
@@ -1673,7 +1738,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 				Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
 				binding.Spec.State = placementv1beta1.BindingStateUnscheduled
 				Expect(k8sClient.Update(ctx, binding)).Should(Succeed())
-				By(fmt.Sprintf("resource binding  %s updated to be unscheduled", binding.Name))
+				By(fmt.Sprintf("resource binding %s updated to be unscheduled", binding.Name))
 				updateRolloutStartedGeneration(&binding)
 				rolloutCond := binding.GetCondition(string(placementv1beta1.ResourceBindingRolloutStarted))
 				Consistently(func() error {
diff --git a/pkg/controllers/workgenerator/override.go b/pkg/controllers/workgenerator/override.go
index a5af94417..91c79f6a4 100644
--- a/pkg/controllers/workgenerator/override.go
+++ b/pkg/controllers/workgenerator/override.go
@@ -35,6 +35,10 @@ import (
 	"github.com/kubefleet-dev/kubefleet/pkg/utils/overrider"
 )
 
+// fetchClusterResourceOverrideSnapshots returns the binding's CRO snapshots keyed by the
+// ResourceIdentifier their selectors target. Order matches the binding's snapshot list so
+// callers can apply deterministically.
+//
 // TODO: combine the following two functions into one, as they are very similar.
 func (r *Reconciler) fetchClusterResourceOverrideSnapshots(ctx context.Context, resourceBinding placementv1beta1.BindingObj) (map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ClusterResourceOverrideSnapshot, error) {
 	croMap := make(map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ClusterResourceOverrideSnapshot)
@@ -70,6 +74,9 @@ func (r *Reconciler) fetchClusterResourceOverrideSnapshots(ctx context.Context,
 	return croMap, nil
 }
 
+// fetchResourceOverrideSnapshots returns the binding's RO snapshots keyed by the
+// ResourceIdentifier their selectors target. Order matches the binding's snapshot list so
+// callers can apply deterministically.
 func (r *Reconciler) fetchResourceOverrideSnapshots(ctx context.Context, resourceBinding placementv1beta1.BindingObj) (map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ResourceOverrideSnapshot, error) {
 	roMap := make(map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ResourceOverrideSnapshot)
 
@@ -127,11 +134,11 @@ func (r *Reconciler) applyOverrides(resource *placementv1beta1.ResourceContent,
 		Kind:    gvk.Kind,
 		Name:    uResource.GetName(),
 	}
-	isClusterScopeResource := r.InformerManager.IsClusterScopedResources(gvk)
+	isClusterScopedResource := r.InformerManager.IsClusterScopedResources(gvk)
 
 	// For the namespace scoped resource, it could be selected by the namespace itself.
-	// use the namespace as the key
-	if !isClusterScopeResource {
+	// Use the namespace as the key.
+	if !isClusterScopedResource {
 		key = placementv1beta1.ResourceIdentifier{
 			Group:   utils.NamespaceMetaGVK.Group,
 			Version: utils.NamespaceMetaGVK.Version,
@@ -149,14 +156,15 @@ func (r *Reconciler) applyOverrides(resource *placementv1beta1.ResourceContent,
 		}
 		if err := applyOverrideRules(resource, cluster, snapshot.Spec.OverrideSpec.Policy.OverrideRules); err != nil {
 			klog.ErrorS(err, "Failed to apply the override rules", "clusterResourceOverrideSnapshot", klog.KObj(snapshot))
-			return false, err
+			return false, controller.NewUserError(fmt.Errorf("ClusterResourceOverrideSnapshot %q failed to apply on %s: %s",
+				snapshot.Name, formatOverrideTarget(&uResource), err.Error()))
 		}
 	}
 	klog.V(2).InfoS("Applied clusterResourceOverrideSnapshots", "resource", klog.KObj(&uResource), "numberOfOverrides", len(croMap[key]))
 
 	// If the resource is selected by both ClusterResourceOverride and ResourceOverride, ResourceOverride will win when resolving conflicts.
 	// Apply ResourceOverrideSnapshots.
-	if !isClusterScopeResource {
+	if !isClusterScopedResource {
 		key = placementv1beta1.ResourceIdentifier{
 			Group:     gvk.Group,
 			Version:   gvk.Version,
@@ -172,7 +180,8 @@ func (r *Reconciler) applyOverrides(resource *placementv1beta1.ResourceContent,
 			}
 			if err := applyOverrideRules(resource, cluster, snapshot.Spec.OverrideSpec.Policy.OverrideRules); err != nil {
 				klog.ErrorS(err, "Failed to apply the override rules", "resourceOverrideSnapshot", klog.KObj(snapshot))
-				return false, err
+				return false, controller.NewUserError(fmt.Errorf("ResourceOverrideSnapshot %q failed to apply on %s: %s",
+					snapshot.Name, formatOverrideTarget(&uResource), err.Error()))
 			}
 		}
 		klog.V(2).InfoS("Applied resourceOverrideSnapshots", "resource", klog.KObj(&uResource), "numberOfOverrides", len(roMap[key]))
@@ -180,12 +189,29 @@ func (r *Reconciler) applyOverrides(resource *placementv1beta1.ResourceContent,
 	return resource.Raw == nil, nil
 }
 
+// formatOverrideTarget renders the target as e.g. `Deployment "my-app" in namespace "default"`
+// for inclusion in a user-facing error message.
+func formatOverrideTarget(target *unstructured.Unstructured) string {
+	// Fall back to "Unknown" so a snapshot missing `kind` doesn't render as `"" "name"`.
+	kind := target.GetObjectKind().GroupVersionKind().Kind
+	if kind == "" {
+		kind = "Unknown"
+	}
+	if ns := target.GetNamespace(); ns != "" {
+		return fmt.Sprintf("%s %q in namespace %q", kind, target.GetName(), ns)
+	}
+	return fmt.Sprintf("%s %q", kind, target.GetName())
+}
+
+// applyOverrideRules applies matching rules to the resource. A DeleteOverrideType rule clears
+// the resource and stops; otherwise JSON patches apply in order. Errors are returned raw — the
+// caller (applyOverrides) tags them as user errors so we don't double-wrap the sentinel.
 func applyOverrideRules(resource *placementv1beta1.ResourceContent, cluster *clusterv1beta1.MemberCluster, rules []placementv1beta1.OverrideRule) error {
 	for _, rule := range rules {
 		matched, err := overrider.IsClusterMatched(cluster, rule)
 		if err != nil {
-			klog.ErrorS(controller.NewUnexpectedBehaviorError(err), "Found an invalid override rule")
-			return controller.NewUserError(err) // should not happen though and should be rejected by the webhook
+			klog.ErrorS(err, "Found an invalid override rule")
+			return err
 		}
 		if !matched {
 			continue
@@ -198,7 +224,7 @@ func applyOverrideRules(resource *placementv1beta1.ResourceContent, cluster *clu
 		// Apply JSONPatchOverrides by default
 		if err = applyJSONPatchOverride(resource, cluster, rule.JSONPatchOverrides); err != nil {
 			klog.ErrorS(err, "Failed to apply JSON patch override")
-			return controller.NewUserError(err)
+			return err
 		}
 	}
 	return nil
@@ -207,11 +233,11 @@ func applyOverrideRules(resource *placementv1beta1.ResourceContent, cluster *clu
 // applyJSONPatchOverride applies a JSON patch on the selected resources following [RFC 6902](https://datatracker.ietf.org/doc/html/rfc6902).
 func applyJSONPatchOverride(resourceContent *placementv1beta1.ResourceContent, cluster *clusterv1beta1.MemberCluster, overrides []placementv1beta1.JSONPatchOverride) error {
 	var err error
-	if len(overrides) == 0 { // do nothing
+	if len(overrides) == 0 {
 		return nil
 	}
-	// go through the JSON patch overrides to replace the built-in variables before json Marshal
-	// as it may contain the built-in variables that cannot be marshaled directly
+	// Go through the JSON patch overrides to replace the built-in variables before json.Marshal,
+	// as the patch values may contain built-in variables that cannot be marshaled directly.
 	for i := range overrides {
 		// Process the JSON string to replace variables
 		jsonStr := string(overrides[i].Value.Raw)
@@ -263,14 +289,14 @@ func replaceClusterLabelKeyVariables(input string, cluster *clusterv1beta1.Membe
 		// extract the key value user wants to replace
 		endIdx := strings.Index(result[startIdx+prefixLen:], "}")
 		if endIdx == -1 {
-			klog.V(2).InfoS("malformed key ${MEMBER-CLUSTER-LABEL-KEY without the closing `}`", "input", input)
+			klog.V(2).InfoS("Malformed key ${MEMBER-CLUSTER-LABEL-KEY without the closing `}`", "input", input)
 			return "", fmt.Errorf("input %s is missing the closing bracket `}`", input)
 		}
 		endIdx += startIdx + prefixLen
 		// extract the key name
 		keyName := result[startIdx+prefixLen : endIdx]
 		// check if the key exists in the cluster labels
-		labelValue, exists := cluster.ObjectMeta.Labels[keyName]
+		labelValue, exists := cluster.Labels[keyName]
 		if !exists {
 			klog.V(2).InfoS("Label key not found on cluster", "key", keyName, "cluster", cluster.Name)
 			return "", fmt.Errorf("label key %s not found on cluster %s", keyName, cluster.Name)
diff --git a/pkg/controllers/workgenerator/override_test.go b/pkg/controllers/workgenerator/override_test.go
index a1e9ed8f2..324d2c02e 100644
--- a/pkg/controllers/workgenerator/override_test.go
+++ b/pkg/controllers/workgenerator/override_test.go
@@ -20,6 +20,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"strings"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
@@ -428,6 +429,9 @@ func TestFetchResourceOverrideSnapshot(t *testing.T) {
 }
 
 func TestApplyOverrides_clusterScopedResource(t *testing.T) {
+	// FakeManager.IsClusterScopedResources returns (APIResources[gvk] == IsClusterScopedResource).
+	// With the flag set to false and APIResources listing namespace-scoped GVKs, listed GVKs
+	// report as namespace-scoped and unlisted GVKs (ClusterRole) report as cluster-scoped.
 	fakeInformer := informer.FakeManager{
 		APIResources: map[schema.GroupVersionKind]bool{
 			{
@@ -450,7 +454,9 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) {
 		croMap          map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ClusterResourceOverrideSnapshot
 		wantClusterRole rbacv1.ClusterRole
 		wantErr         error
-		wantDeleted     bool
+		// wantErrSubstr asserts the failure message names the failing snapshot and target.
+		wantErrSubstr []string
+		wantDeleted   bool
 	}{
 		{
 			name: "empty overrides",
@@ -844,6 +850,9 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) {
 					Name:    "clusterrole-name",
 				}: {
 					{
+						ObjectMeta: metav1.ObjectMeta{
+							Name: "conflicting-rules-cro-snapshot",
+						},
 						Spec: placementv1beta1.ClusterResourceOverrideSnapshotSpec{
 							OverrideSpec: placementv1beta1.ClusterResourceOverrideSpec{
 								Policy: &placementv1beta1.OverridePolicy{
@@ -895,6 +904,10 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) {
 				},
 			},
 			wantErr: controller.ErrUserError,
+			wantErrSubstr: []string{
+				`ClusterResourceOverrideSnapshot "conflicting-rules-cro-snapshot"`,
+				`ClusterRole "clusterrole-name"`,
+			},
 		},
 		{
 			name: "invalid json patch of clusterResourceOverride",
@@ -921,6 +934,9 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) {
 					Name:    "clusterrole-name",
 				}: {
 					{
+						ObjectMeta: metav1.ObjectMeta{
+							Name: "test-cro-snapshot",
+						},
 						Spec: placementv1beta1.ClusterResourceOverrideSnapshotSpec{
 							OverrideSpec: placementv1beta1.ClusterResourceOverrideSpec{
 								Policy: &placementv1beta1.OverridePolicy{
@@ -953,6 +969,71 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) {
 				},
 			},
 			wantErr: controller.ErrUserError,
+			wantErrSubstr: []string{
+				`ClusterResourceOverrideSnapshot "test-cro-snapshot"`,
+				`ClusterRole "clusterrole-name"`,
+			},
+		},
+		{
+			// Invalid LabelSelector Operator makes IsClusterMatched fail — exercises the
+			// applyOverrideRules raw-error return that applyOverrides wraps.
+			name: "invalid cluster label selector in clusterResourceOverride",
+			clusterRole: rbacv1.ClusterRole{
+				TypeMeta: clusterRoleType,
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "clusterrole-name",
+				},
+			},
+			cluster: clusterv1beta1.MemberCluster{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "cluster-1",
+				},
+			},
+			croMap: map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ClusterResourceOverrideSnapshot{
+				{
+					Group:   "rbac.authorization.k8s.io",
+					Version: "v1",
+					Kind:    "ClusterRole",
+					Name:    "clusterrole-name",
+				}: {
+					{
+						ObjectMeta: metav1.ObjectMeta{
+							Name: "bad-selector-cro-snapshot",
+						},
+						Spec: placementv1beta1.ClusterResourceOverrideSnapshotSpec{
+							OverrideSpec: placementv1beta1.ClusterResourceOverrideSpec{
+								Policy: &placementv1beta1.OverridePolicy{
+									OverrideRules: []placementv1beta1.OverrideRule{
+										{
+											ClusterSelector: &placementv1beta1.ClusterSelector{
+												ClusterSelectorTerms: []placementv1beta1.ClusterSelectorTerm{
+													{
+														LabelSelector: &metav1.LabelSelector{
+															MatchExpressions: []metav1.LabelSelectorRequirement{
+																{
+																	Key:      "key1",
+																	Operator: "InvalidOperator",
+																	Values:   []string{"value1"},
+																},
+															},
+														},
+													},
+												},
+											},
+										},
+									},
+								},
+							},
+						},
+					},
+				},
+			},
+			wantErr: controller.ErrUserError,
+			wantErrSubstr: []string{
+				`ClusterResourceOverrideSnapshot "bad-selector-cro-snapshot"`,
+				`ClusterRole "clusterrole-name"`,
+				`invalid cluster label selector`,
+			},
 		},
 		{
 			name: "delete during the clusterResourceOverride",
@@ -1118,6 +1199,11 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) {
 				t.Fatalf("applyOverrides() gotDeleted %v, want %v", gotDeleted, tc.wantDeleted)
 			}
 			if tc.wantErr != nil {
+				for _, want := range tc.wantErrSubstr {
+					if !strings.Contains(err.Error(), want) {
+						t.Errorf("applyOverrides() error = %q, want to contain %q", err.Error(), want)
+					}
+				}
 				return
 			}
 			if tc.wantDeleted {
@@ -1126,12 +1212,12 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) {
 
 			var u unstructured.Unstructured
 			if err := u.UnmarshalJSON(rc.Raw); err != nil {
-				t.Fatalf("Failed to unmarshl the result: %v, want nil", err)
+				t.Fatalf("Failed to unmarshal the result: %v, want nil", err)
 			}
 
 			var clusterRole rbacv1.ClusterRole
 			if err := runtime.DefaultUnstructuredConverter.FromUnstructured(u.Object, &clusterRole); err != nil {
-				t.Fatalf("Failed to convert the result to clusterole: %v, want nil", err)
+				t.Fatalf("Failed to convert the result to clusterRole: %v, want nil", err)
 			}
 
 			if diff := cmp.Diff(tc.wantClusterRole, clusterRole); diff != "" {
@@ -1141,7 +1227,8 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) {
 	}
 }
 
-func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
+func TestApplyOverrides_namespaceScopedResource(t *testing.T) {
+	// See TestApplyOverrides_clusterScopedResource for the IsClusterScopedResource: false rationale.
 	fakeInformer := informer.FakeManager{
 		APIResources: map[schema.GroupVersionKind]bool{
 			{
@@ -1165,7 +1252,9 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 		roMap          map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ResourceOverrideSnapshot
 		wantDeployment appsv1.Deployment
 		wantErr        error
-		wantDelete     bool
+		// wantErrSubstr asserts the failure message names the failing snapshot and target.
+		wantErrSubstr []string
+		wantDeleted   bool
 	}{
 		{
 			name: "empty overrides",
@@ -1664,6 +1753,9 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 					Name:    "deployment-namespace",
 				}: {
 					{
+						ObjectMeta: metav1.ObjectMeta{
+							Name: "invalid-patch-cro-snapshot",
+						},
 						Spec: placementv1beta1.ClusterResourceOverrideSnapshotSpec{
 							OverrideSpec: placementv1beta1.ClusterResourceOverrideSpec{
 								Policy: &placementv1beta1.OverridePolicy{
@@ -1697,6 +1789,11 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 				},
 			},
 			wantErr: controller.ErrUserError,
+			wantErrSubstr: []string{
+				`ClusterResourceOverrideSnapshot "invalid-patch-cro-snapshot"`,
+				`Deployment "deployment-name"`,
+				`namespace "deployment-namespace"`,
+			},
 		},
 		{
 			name: "invalid json patch of resourceOverride",
@@ -1728,6 +1825,10 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 					Namespace: "deployment-namespace",
 				}: {
 					{
+						ObjectMeta: metav1.ObjectMeta{
+							Name:      "invalid-patch-ro-snapshot",
+							Namespace: "deployment-namespace",
+						},
 						Spec: placementv1beta1.ResourceOverrideSnapshotSpec{
 							OverrideSpec: placementv1beta1.ResourceOverrideSpec{
 								Policy: &placementv1beta1.OverridePolicy{
@@ -1751,6 +1852,11 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 				},
 			},
 			wantErr: controller.ErrUserError,
+			wantErrSubstr: []string{
+				`ResourceOverrideSnapshot "invalid-patch-ro-snapshot"`,
+				`Deployment "deployment-name"`,
+				`namespace "deployment-namespace"`,
+			},
 		},
 		{
 			name: "delete type of resourceOverride",
@@ -1797,7 +1903,7 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 					},
 				},
 			},
-			wantDelete: true,
+			wantDeleted: true,
 		},
 		{
 			name: "resourceOverride delete the cro override",
@@ -1884,7 +1990,7 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 					},
 				},
 			},
-			wantDelete: true,
+			wantDeleted: true,
 		},
 		{
 			name: "resourceOverride no-op when the cro delete",
@@ -1970,7 +2076,7 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 					},
 				},
 			},
-			wantDelete: true,
+			wantDeleted: true,
 		},
 		{
 			name: "cluster name as value in json patch of resourceOverride",
@@ -2218,6 +2324,10 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 					Namespace: "deployment-namespace",
 				}: {
 					{
+						ObjectMeta: metav1.ObjectMeta{
+							Name:      "test-ro-snapshot",
+							Namespace: "deployment-namespace",
+						},
 						Spec: placementv1beta1.ResourceOverrideSnapshotSpec{
 							OverrideSpec: placementv1beta1.ResourceOverrideSpec{
 								Policy: &placementv1beta1.OverridePolicy{
@@ -2240,6 +2350,11 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 				},
 			},
 			wantErr: controller.ErrUserError,
+			wantErrSubstr: []string{
+				`ResourceOverrideSnapshot "test-ro-snapshot"`,
+				`Deployment "deployment-name"`,
+				`namespace "deployment-namespace"`,
+			},
 		},
 	}
 	for _, tc := range tests {
@@ -2252,19 +2367,24 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 			if gotErr, wantErr := err != nil, tc.wantErr != nil; gotErr != wantErr || !errors.Is(err, tc.wantErr) {
 				t.Fatalf("applyOverrides() got error %v, want error %v", err, tc.wantErr)
 			}
-			if gotDeleted != tc.wantDelete {
-				t.Fatalf("applyOverrides() gotDeleted %v, want %v", gotDeleted, tc.wantDelete)
+			if gotDeleted != tc.wantDeleted {
+				t.Fatalf("applyOverrides() gotDeleted %v, want %v", gotDeleted, tc.wantDeleted)
 			}
 			if tc.wantErr != nil {
+				for _, want := range tc.wantErrSubstr {
+					if !strings.Contains(err.Error(), want) {
+						t.Errorf("applyOverrides() error = %q, want to contain %q", err.Error(), want)
+					}
+				}
 				return
 			}
-			if tc.wantDelete {
+			if tc.wantDeleted {
 				return
 			}
 
 			var u unstructured.Unstructured
 			if err := u.UnmarshalJSON(rc.Raw); err != nil {
-				t.Fatalf("Failed to unmarshl the result: %v, want nil", err)
+				t.Fatalf("Failed to unmarshal the result: %v, want nil", err)
 			}
 
 			var deployment appsv1.Deployment
@@ -2757,7 +2877,7 @@ func TestApplyJSONPatchOverride(t *testing.T) {
 
 			var u unstructured.Unstructured
 			if err := u.UnmarshalJSON(rc.Raw); err != nil {
-				t.Fatalf("Failed to unmarshl the result: %v, want nil", err)
+				t.Fatalf("Failed to unmarshal the result: %v, want nil", err)
 			}
 
 			var deployment appsv1.Deployment
@@ -2774,10 +2894,10 @@ func TestApplyJSONPatchOverride(t *testing.T) {
 
 func TestReplaceClusterLabelKeyVariables(t *testing.T) {
 	tests := map[string]struct {
-		cluster   *clusterv1beta1.MemberCluster
-		input     string
-		expected  string
-		expectErr bool
+		cluster *clusterv1beta1.MemberCluster
+		input   string
+		want    string
+		wantErr bool
 	}{
 		"No clusterLabelKey variables": {
 			cluster: &clusterv1beta1.MemberCluster{
@@ -2787,8 +2907,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) {
 					},
 				},
 			},
-			input:    "The cluster is in us-west-1",
-			expected: "The cluster is in us-west-1",
+			input: "The cluster is in us-west-1",
+			want:  "The cluster is in us-west-1",
 		},
 		"ClusterLabelKey Variable replaced": {
 			cluster: &clusterv1beta1.MemberCluster{
@@ -2798,8 +2918,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) {
 					},
 				},
 			},
-			input:    "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region}",
-			expected: "The cluster is in us-west-1",
+			input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region}",
+			want:  "The cluster is in us-west-1",
 		},
 		"The clusterLabelKey key is misspelled": {
 			cluster: &clusterv1beta1.MemberCluster{
@@ -2807,8 +2927,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) {
 					Labels: map[string]string{},
 				},
 			},
-			input:    "The cluster is in $MEMBER-CLUSTER-LABEL-KEY-region",
-			expected: "The cluster is in $MEMBER-CLUSTER-LABEL-KEY-region",
+			input: "The cluster is in $MEMBER-CLUSTER-LABEL-KEY-region",
+			want:  "The cluster is in $MEMBER-CLUSTER-LABEL-KEY-region",
 		},
 		"Multiple complex clusterLabelKey variables replaced": {
 			cluster: &clusterv1beta1.MemberCluster{
@@ -2819,8 +2939,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) {
 					},
 				},
 			},
-			input:    "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-fleet.azure.com/location-region_public} and environment is ${MEMBER-CLUSTER-LABEL-KEY-fleet.azure.com/env}",
-			expected: "The cluster is in us-west-1 and environment is prod",
+			input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-fleet.azure.com/location-region_public} and environment is ${MEMBER-CLUSTER-LABEL-KEY-fleet.azure.com/env}",
+			want:  "The cluster is in us-west-1 and environment is prod",
 		},
 		"The clusterLabelKey key is not found": {
 			cluster: &clusterv1beta1.MemberCluster{
@@ -2828,8 +2948,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) {
 					Labels: map[string]string{},
 				},
 			},
-			input:     "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region}",
-			expectErr: true,
+			input:   "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region}",
+			wantErr: true,
 		},
 		"ClusterLabelKey Variable key case not match": {
 			cluster: &clusterv1beta1.MemberCluster{
@@ -2839,10 +2959,10 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) {
 					},
 				},
 			},
-			input:     "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-REGION}",
-			expectErr: true,
+			input:   "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-REGION}",
+			wantErr: true,
 		},
-		"Invalid  clusterLabelKey variable format": {
+		"Invalid clusterLabelKey variable format": {
 			cluster: &clusterv1beta1.MemberCluster{
 				ObjectMeta: metav1.ObjectMeta{
 					Labels: map[string]string{
@@ -2850,8 +2970,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) {
 					},
 				},
 			},
-			input:     "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region",
-			expectErr: true,
+			input:   "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region",
+			wantErr: true,
 		},
 		"ClusterLabelKey variable key empty": {
 			cluster: &clusterv1beta1.MemberCluster{
@@ -2861,19 +2981,61 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) {
 					},
 				},
 			},
-			input:     "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-}",
-			expectErr: true,
+			input:   "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-}",
+			wantErr: true,
 		},
 	}
 
 	for name, tc := range tests {
 		t.Run(name, func(t *testing.T) {
 			result, err := replaceClusterLabelKeyVariables(tc.input, tc.cluster)
-			if gotErr := err != nil; gotErr != tc.expectErr {
-				t.Fatalf("applyJSONPatchOverride() = error %v, want %v", err, tc.expectErr)
+			if gotErr := err != nil; gotErr != tc.wantErr {
+				t.Fatalf("replaceClusterLabelKeyVariables() = error %v, want %v", err, tc.wantErr)
 			}
-			if result != tc.expected {
-				t.Errorf("replaceClusterLabelKeyVariables() = %v, want %v", result, tc.expected)
+			if result != tc.want {
+				t.Errorf("replaceClusterLabelKeyVariables() = %v, want %v", result, tc.want)
+			}
+		})
+	}
+}
+
+func TestFormatOverrideTarget(t *testing.T) {
+	tests := map[string]struct {
+		kind      string
+		name      string
+		namespace string
+		want      string
+	}{
+		"namespaced resource": {
+			kind:      "Deployment",
+			name:      "my-app",
+			namespace: "default",
+			want:      `Deployment "my-app" in namespace "default"`,
+		},
+		"cluster-scoped resource": {
+			kind: "Namespace",
+			name: "app",
+			want: `Namespace "app"`,
+		},
+		"empty kind falls back to Unknown": {
+			name:      "my-app",
+			namespace: "default",
+			want:      `Unknown "my-app" in namespace "default"`,
+		},
+		"empty kind, cluster-scoped": {
+			name: "app",
+			want: `Unknown "app"`,
+		},
+	}
+
+	for name, tc := range tests {
+		t.Run(name, func(t *testing.T) {
+			target := &unstructured.Unstructured{}
+			target.SetGroupVersionKind(schema.GroupVersionKind{Kind: tc.kind})
+			target.SetName(tc.name)
+			target.SetNamespace(tc.namespace)
+			if got := formatOverrideTarget(target); got != tc.want {
+				t.Errorf("formatOverrideTarget() = %q, want %q", got, tc.want)
 			}
 		})
 	}
diff --git a/pkg/controllers/workgenerator/suite_test.go b/pkg/controllers/workgenerator/suite_test.go
index 88e6850b8..e607846f3 100644
--- a/pkg/controllers/workgenerator/suite_test.go
+++ b/pkg/controllers/workgenerator/suite_test.go
@@ -301,6 +301,47 @@ func createOverrides() {
 	}
 	Expect(k8sClient.Create(ctx, &invalidClusterResourceOverrideSnapshot)).Should(Succeed(), "Failed to create the cro-2")
 	By(fmt.Sprintf("Invalid cluster resource override snapshot %s created", invalidClusterResourceOverrideSnapshotName))
+
+	invalidResourceOverrideSnapshot = placementv1beta1.ResourceOverrideSnapshot{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      invalidResourceOverrideSnapshotName,
+			Namespace: appNamespaceName,
+			Labels: map[string]string{
+				placementv1beta1.IsLatestSnapshotLabel: "true",
+			},
+		},
+		Spec: placementv1beta1.ResourceOverrideSnapshotSpec{
+			OverrideSpec: placementv1beta1.ResourceOverrideSpec{
+				ResourceSelectors: []placementv1beta1.ResourceSelector{
+					{
+						Group:   "test.kubernetes-fleet.io",
+						Version: "v1alpha1",
+						Kind:    "TestResource",
+						Name:    "random-test-resource",
+					},
+				},
+				Policy: &placementv1beta1.OverridePolicy{
+					OverrideRules: []placementv1beta1.OverrideRule{
+						{
+							ClusterSelector: &placementv1beta1.ClusterSelector{
+								ClusterSelectorTerms: []placementv1beta1.ClusterSelectorTerm{}, // select all members
+							},
+							JSONPatchOverrides: []placementv1beta1.JSONPatchOverride{
+								{
+									Operator: placementv1beta1.JSONPatchOverrideOpAdd,
+									Path:     "/invalid/path",
+									Value:    apiextensionsv1.JSON{Raw: []byte(`"new-value"`)},
+								},
+							},
+						},
+					},
+				},
+			},
+			OverrideHash: []byte("123"),
+		},
+	}
+	Expect(k8sClient.Create(ctx, &invalidResourceOverrideSnapshot)).Should(Succeed(), "Failed to create the ro-2")
+	By(fmt.Sprintf("Invalid resource override snapshot %s created", invalidResourceOverrideSnapshotName))
 }
 
 var _ = AfterSuite(func() {
@@ -309,6 +350,7 @@ var _ = AfterSuite(func() {
 	Expect(k8sClient.Delete(ctx, &validClusterResourceOverrideSnapshot)).Should(Succeed(), "Failed to delete the cro-1")
 	Expect(k8sClient.Delete(ctx, &validResourceOverrideSnapshot)).Should(Succeed(), "Failed to delete the ro-1")
 	Expect(k8sClient.Delete(ctx, &invalidClusterResourceOverrideSnapshot)).Should(Succeed(), "Failed to delete the cro-2")
+	Expect(k8sClient.Delete(ctx, &invalidResourceOverrideSnapshot)).Should(Succeed(), "Failed to delete the ro-2")
 	Expect(k8sClient.Delete(ctx, &appNamespace)).Should(Succeed(), "Failed to delete app namespace")
 
 	cancel()

From ce6b31dd4ac9d39de94738a0f89e00a1874fa234 Mon Sep 17 00:00:00 2001
From: michaelawyu <chenyu1@microsoft.com>
Date: Wed, 13 May 2026 15:39:35 +1000
Subject: [PATCH 17/19] fix: address a RO directory issue in templates (#706)

---
 charts/hub-agent/templates/deployment.yaml | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/charts/hub-agent/templates/deployment.yaml b/charts/hub-agent/templates/deployment.yaml
index 3bdca63fa..882a3fa12 100644
--- a/charts/hub-agent/templates/deployment.yaml
+++ b/charts/hub-agent/templates/deployment.yaml
@@ -90,13 +90,21 @@ spec:
                 fieldPath: metadata.namespace
           resources:
             {{- toYaml .Values.resources | nindent 12 }}
+          {{- if .Values.useCertManager }}
           volumeMounts:
           - name: webhook-cert
             # This path must match FleetWebhookCertDir in pkg/webhook/webhook.go
             mountPath: /tmp/k8s-webhook-server/serving-certs
-            {{- if .Values.useCertManager }}
             readOnly: true
-            {{- end }}
+          {{- else }}
+          volumeMounts:
+          - name: webhook-cert
+            # This path must match FleetWebhookCertDir in pkg/webhook/webhook.go.
+            # Note that this must be mounted one level up from the hardcoded path, otherwise
+            # the read only root filesystem setup would block the agent from attempting to
+            # clear the directory.
+            mountPath: /tmp/k8s-webhook-server/
+          {{- end }}
       volumes:
       - name: webhook-cert
         {{- if .Values.useCertManager }}

From 9dee67a64663d6037450165d2ba37b91f07795eb Mon Sep 17 00:00:00 2001
From: Britania Rodriguez Reyes <britaniar@microsoft.com>
Date: Wed, 13 May 2026 13:41:07 -0700
Subject: [PATCH 18/19] Fix failing e2e test now that enableWorkload is False
 meaning PVC can be placed

Signed-off-by: Britania Rodriguez Reyes <britaniar@microsoft.com>
---
 test/e2e/placement_selecting_resources_test.go          | 3 ++-
 test/e2e/resource_placement_selecting_resources_test.go | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/e2e/placement_selecting_resources_test.go b/test/e2e/placement_selecting_resources_test.go
index 10d6a56c3..b12ffa399 100644
--- a/test/e2e/placement_selecting_resources_test.go
+++ b/test/e2e/placement_selecting_resources_test.go
@@ -1392,11 +1392,12 @@ var _ = Describe("creating CRP and checking selected resources order", Ordered,
 
 	It("should update CRP status with the correct order of the selected resources", func() {
 		// Define the expected resources in order
-		// Note: PVCs are not propagated, so they should not appear in selected resources
+		// Note: PVCs are propagated when enableWorkload=false (the default for E2E tests)
 		expectedResources := []placementv1beta1.ResourceIdentifier{
 			{Kind: "Namespace", Name: nsName, Version: "v1"},
 			{Kind: "Secret", Name: secret.Name, Namespace: nsName, Version: "v1"},
 			{Kind: "ConfigMap", Name: configMap.Name, Namespace: nsName, Version: "v1"},
+			{Kind: "PersistentVolumeClaim", Name: pvc.Name, Namespace: nsName, Version: "v1"},
 			{Group: "rbac.authorization.k8s.io", Kind: "Role", Name: role.Name, Namespace: nsName, Version: "v1"},
 		}
 
diff --git a/test/e2e/resource_placement_selecting_resources_test.go b/test/e2e/resource_placement_selecting_resources_test.go
index 2e6966274..9737dd787 100644
--- a/test/e2e/resource_placement_selecting_resources_test.go
+++ b/test/e2e/resource_placement_selecting_resources_test.go
@@ -1057,10 +1057,11 @@ var _ = Describe("testing RP selecting resources", Label("resourceplacement"), f
 
 		It("should update RP status with the correct order of the selected resources", func() {
 			// Define the expected resources in order.
-			// Note: PVCs are not propagated, so they should not appear in selected resources
+			// Note: PVCs are propagated when enableWorkload=false (the default for E2E tests)
 			expectedResources := []placementv1beta1.ResourceIdentifier{
 				{Kind: "Secret", Name: secret.Name, Namespace: nsName, Version: "v1"},
 				{Kind: "ConfigMap", Name: configMap.Name, Namespace: nsName, Version: "v1"},
+				{Kind: "PersistentVolumeClaim", Name: pvc.Name, Namespace: nsName, Version: "v1"},
 				{Group: "rbac.authorization.k8s.io", Kind: "Role", Name: role.Name, Namespace: nsName, Version: "v1"},
 			}
 

From 4b121aa34c4a6281726c37f134ed69531b45af82 Mon Sep 17 00:00:00 2001
From: Britania Rodriguez Reyes <britaniar@microsoft.com>
Date: Wed, 13 May 2026 15:53:42 -0700
Subject: [PATCH 19/19] Disable test due to enableWorkload=false change

Signed-off-by: Britania Rodriguez Reyes <britaniar@microsoft.com>
---
 test/e2e/resource_placement_hub_workload_test.go | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/test/e2e/resource_placement_hub_workload_test.go b/test/e2e/resource_placement_hub_workload_test.go
index b088109ee..0e644d145 100644
--- a/test/e2e/resource_placement_hub_workload_test.go
+++ b/test/e2e/resource_placement_hub_workload_test.go
@@ -33,6 +33,9 @@ import (
 	placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1"
 )
 
+// This test requires enableWorkload=true because it creates workloads on the hub cluster
+// and expects them to become ready. With enableWorkload=false, the pod webhook blocks
+// pod creation on the hub cluster.
 var _ = Describe("placing workloads using a CRP with PickAll policy", Label("resourceplacement"), Ordered, func() {
 	crpName := fmt.Sprintf(crpNameTemplate, GinkgoParallelProcess())
 	var testDeployment appsv1.Deployment
@@ -41,6 +44,8 @@ var _ = Describe("placing workloads using a CRP with PickAll policy", Label("res
 	var testStatefulSet appsv1.StatefulSet
 
 	BeforeAll(func() {
+		Skip("This test requires enableWorkload=true to run workloads on the hub cluster")
+
 		// Read the test manifests
 		readDeploymentTestManifest(&testDeployment)
 		readDaemonSetTestManifest(&testDaemonSet)