diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index ccf33b9b8..2d761329b 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -42,7 +42,7 @@ jobs:
 
     # Initializes the CodeQL tools for scanning.
     - name: Initialize CodeQL
-      uses: github/codeql-action/init@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4
+      uses: github/codeql-action/init@e46ed2cbd01164d986452f91f178727624ae40d7 # v4
       with:
         languages: ${{ matrix.language }}
         # If you wish to specify custom queries, you can do so here or in a config file.
@@ -56,7 +56,7 @@ jobs:
     # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
     # If this step fails, then you should remove it and run the build manually (see below)
     - name: Autobuild
-      uses: github/codeql-action/autobuild@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4
+      uses: github/codeql-action/autobuild@e46ed2cbd01164d986452f91f178727624ae40d7 # v4
 
     # ℹ️ Command-line programs to run using the OS shell.
     # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
@@ -69,4 +69,4 @@ jobs:
     #   ./location_of_script_within_repo/buildscript.sh
 
     - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4
+      uses: github/codeql-action/analyze@e46ed2cbd01164d986452f91f178727624ae40d7 # v4
diff --git a/Makefile b/Makefile
index e2941acc7..be3741254 100644
--- a/Makefile
+++ b/Makefile
@@ -275,6 +275,7 @@ build: generate fmt vet ## Build agent binaries
 	go build -o bin/hubagent cmd/hubagent/main.go
 	go build -o bin/memberagent cmd/memberagent/main.go
 	go build -o bin/crdinstaller cmd/crdinstaller/main.go
+	go build -o bin/kubectl-fleet ./tools/fleet/
 
 .PHONY: run-hubagent
 run-hubagent: manifests generate fmt vet ## Run hub-agent from your host
diff --git a/apis/placement/v1/clusterresourceplacement_types.go b/apis/placement/v1/clusterresourceplacement_types.go
index 798d09dcf..795841e72 100644
--- a/apis/placement/v1/clusterresourceplacement_types.go
+++ b/apis/placement/v1/clusterresourceplacement_types.go
@@ -123,13 +123,15 @@ type ClusterResourcePlacement struct {
 }
 
 // PlacementSpec defines the desired state of ClusterResourcePlacement and ResourcePlacement.
+// +kubebuilder:validation:XValidation:rule="size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && x.group == \"\" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope == 'NamespaceWithResourceSelectors')) <= 1",message="only one namespace selector with NamespaceWithResourceSelectors mode is allowed"
+// +kubebuilder:validation:XValidation:rule="size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && x.group == \"\" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope == 'NamespaceWithResourceSelectors')) == 0 || (size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && x.group == \"\" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope == 'NamespaceWithResourceSelectors' && has(x.name) && size(x.name) > 0 && !has(x.labelSelector))) == 1)",message="namespace selector with NamespaceWithResourceSelectors mode must select by name (not by label)"
+// +kubebuilder:validation:XValidation:rule="size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && x.group == \"\" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope == 'NamespaceWithResourceSelectors')) == 0 || size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && x.group == \"\" && x.version == 'v1')) == 1",message="when using NamespaceWithResourceSelectors mode, only one namespace selector is allowed (cannot mix with other namespace selectors)"
 type PlacementSpec struct {
-	// +kubebuilder:validation:MinItems=1
-	// +kubebuilder:validation:MaxItems=100
-
 	// ResourceSelectors is an array of selectors used to select cluster scoped resources. The selectors are `ORed`.
 	// You can have 1-100 selectors.
 	// +kubebuilder:validation:Required
+	// +kubebuilder:validation:MinItems=1
+	// +kubebuilder:validation:MaxItems=100
 	ResourceSelectors []ResourceSelectorTerm `json:"resourceSelectors"`
 
 	// Policy defines how to select member clusters to place the selected resources.
@@ -185,6 +187,24 @@ type ResourceSelectorTerm struct {
 	// For ClusterResourcePlacement, you can use SelectionScope to control what gets selected:
 	// - NamespaceOnly: Only the namespace object itself
 	// - NamespaceWithResources: The namespace AND all resources within it (default)
+	// - NamespaceWithResourceSelectors: The namespace AND resources specified by additional selectors
+	//
+	// When SelectionScope is NamespaceWithResourceSelectors, you can define additional ResourceSelectorTerms
+	// (after the namespace selector) to specify which resources to include. These additional selectors can
+	// target both namespace-scoped resources (within the selected namespace) and cluster-scoped resources.
+	//
+	// Important requirements for NamespaceWithResourceSelectors mode:
+	// - Exactly one namespace selector with this mode is allowed
+	// - The namespace selector must select by name (not by label)
+	// - Only one namespace selector is allowed when using this mode (cannot mix with other namespace selectors)
+	// - All requirements are validated via CEL at API validation time
+	// - If the selected namespace is deleted after CRP creation, the controller will report an error condition
+	//
+	// Example using NamespaceWithResourceSelectors:
+	// - Namespace selector: {Group: "", Version: "v1", Kind: "Namespace", Name: "prod", SelectionScope: "NamespaceWithResourceSelectors"}
+	// - Additional selector: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {app: "frontend"}}
+	// - Third selector: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "admin"}
+	// This selects: the "prod" namespace, all Deployments with label app=frontend in "prod", and the "admin" ClusterRole.
 	//
 	// +kubebuilder:validation:Required
 	Kind string `json:"kind"`
@@ -204,7 +224,7 @@ type ResourceSelectorTerm struct {
 	// SelectionScope defines the scope of resource selections when the Kind is `namespace`.
 	// This field is only applicable when Kind is "Namespace" and is ignored for other resource kinds.
 	// See the Kind field documentation for detailed examples and usage patterns.
-	// +kubebuilder:validation:Enum=NamespaceOnly;NamespaceWithResources
+	// +kubebuilder:validation:Enum=NamespaceOnly;NamespaceWithResources;NamespaceWithResourceSelectors
 	// +kubebuilder:default=NamespaceWithResources
 	// +kubebuilder:validation:Optional
 	SelectionScope SelectionScope `json:"selectionScope,omitempty"`
@@ -230,6 +250,52 @@ const (
 	// Note: This is the default value. When you select a namespace without specifying SelectionScope,
 	// this mode is used automatically.
 	NamespaceWithResources SelectionScope = "NamespaceWithResources"
+
+	// NamespaceWithResourceSelectors allows fine-grained selection of specific resources within a namespace.
+	// The namespace itself is always selected, and you can optionally specify which resources to include
+	// by adding additional ResourceSelectorTerm entries after the namespace selector.
+	//
+	// Use cases:
+	// 1. Select only specific resource types from a namespace (e.g., only Deployments and Services)
+	// 2. Select resources matching certain labels within a namespace
+	// 3. Include specific cluster-scoped resources along with namespace-scoped resources
+	//
+	// How "additional selectors" work:
+	// - Exactly one namespace selector with NamespaceWithResourceSelectors mode is required
+	// - This selector must select a namespace by name (label selectors not allowed)
+	// - ADDITIONAL selectors specify which resources to include:
+	//   - Namespace-scoped resources are filtered to only those within the selected namespace
+	//   - Cluster-scoped resources are included as specified (not limited to the namespace)
+	// - If no additional selectors are provided, only the namespace object itself is selected
+	//
+	// Example 1 - Select specific deployments from a namespace:
+	//   Selector 1: {Group: "", Version: "v1", Kind: "Namespace", Name: "production", SelectionScope: "NamespaceWithResourceSelectors"}
+	//   Selector 2: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {tier: "frontend"}}
+	//   Result: The "production" namespace + all Deployments labeled tier=frontend within "production"
+	//
+	// Example 2 - Select namespace with multiple resource types:
+	//   Selector 1: {Group: "", Version: "v1", Kind: "Namespace", Name: "app", SelectionScope: "NamespaceWithResourceSelectors"}
+	//   Selector 2: {Group: "apps", Version: "v1", Kind: "Deployment"}
+	//   Selector 3: {Group: "", Version: "v1", Kind: "Service"}
+	//   Result: The "app" namespace + ALL Deployments and Services within "app"
+	//
+	// Example 3 - Include cluster-scoped resources:
+	//   Selector 1: {Group: "", Version: "v1", Kind: "Namespace", Name: "app", SelectionScope: "NamespaceWithResourceSelectors"}
+	//   Selector 2: {Group: "apps", Version: "v1", Kind: "Deployment"}
+	//   Selector 3: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "app-admin"}
+	//   Result: The "app" namespace + ALL Deployments in "app" + the "app-admin" ClusterRole
+	//
+	// Important constraints:
+	// - Exactly ONE namespace selector with NamespaceWithResourceSelectors mode is allowed
+	// - The namespace selector must select by name (label selectors not allowed)
+	// - Only ONE namespace selector total is allowed when using this mode (cannot mix with other namespace selectors)
+	// - All constraints are enforced via CEL at API validation time
+	//
+	// Runtime behavior:
+	// - If the selected namespace is deleted after the CRP is created, the controller will detect this during
+	//   the next reconciliation and report an error condition in the CRP status
+	// - The CRP will transition to a failed state until the namespace is recreated or the CRP is updated
+	NamespaceWithResourceSelectors SelectionScope = "NamespaceWithResourceSelectors"
 )
 
 // PlacementPolicy contains the rules to select target member clusters to place the selected resources.
diff --git a/apis/placement/v1/stageupdate_types.go b/apis/placement/v1/stageupdate_types.go
index afcc88f96..2e06af937 100644
--- a/apis/placement/v1/stageupdate_types.go
+++ b/apis/placement/v1/stageupdate_types.go
@@ -209,7 +209,8 @@ type StageTask struct {
 	Type StageTaskType `json:"type"`
 
 	// The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
-	// +kubebuilder:validation:Pattern="^0|([0-9]+(\\.[0-9]+)?(s|m|h))+$"
+	// Only hours (h), minutes (m), and seconds (s) units are accepted.
+	// +kubebuilder:validation:Pattern="^(?:(?:0|[1-9][0-9]*)(\\.[0-9]+)?(?:s|m|h))+$"
 	// +kubebuilder:validation:Type=string
 	// +kubebuilder:validation:Optional
 	WaitTime *metav1.Duration `json:"waitTime,omitempty"`
diff --git a/apis/placement/v1alpha1/stagedupdate_types.go b/apis/placement/v1alpha1/stagedupdate_types.go
index 5ab660fec..25352ca7d 100644
--- a/apis/placement/v1alpha1/stagedupdate_types.go
+++ b/apis/placement/v1alpha1/stagedupdate_types.go
@@ -153,7 +153,7 @@ type AfterStageTask struct {
 	Type AfterStageTaskType `json:"type"`
 
 	// The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
-	// +kubebuilder:validation:Pattern="^0|([0-9]+(\\.[0-9]+)?(s|m|h))+$"
+	// +kubebuilder:validation:Pattern="^(?:0|(?:(?:0|[1-9][0-9]*)(\\.[0-9]+)?(?:s|m|h))+)$"
 	// +kubebuilder:validation:Type=string
 	// +kubebuilder:validation:Optional
 	WaitTime metav1.Duration `json:"waitTime,omitempty"`
diff --git a/apis/placement/v1beta1/stageupdate_types.go b/apis/placement/v1beta1/stageupdate_types.go
index ef68dd087..7c6fe3842 100644
--- a/apis/placement/v1beta1/stageupdate_types.go
+++ b/apis/placement/v1beta1/stageupdate_types.go
@@ -347,7 +347,8 @@ type StageTask struct {
 	Type StageTaskType `json:"type"`
 
 	// The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
-	// +kubebuilder:validation:Pattern="^0|([0-9]+(\\.[0-9]+)?(s|m|h))+$"
+	// Only hours (h), minutes (m), and seconds (s) units are accepted.
+	// +kubebuilder:validation:Pattern="^(?:(?:0|[1-9][0-9]*)(\\.[0-9]+)?(?:s|m|h))+$"
 	// +kubebuilder:validation:Type=string
 	// +kubebuilder:validation:Optional
 	WaitTime *metav1.Duration `json:"waitTime,omitempty"`
diff --git a/charts/hub-agent/README.md b/charts/hub-agent/README.md
index b49e1bd15..9582dda49 100644
--- a/charts/hub-agent/README.md
+++ b/charts/hub-agent/README.md
@@ -164,7 +164,8 @@ helm install hub-agent oci://ghcr.io/kubefleet-dev/kubefleet/charts/hub-agent \
   --create-namespace \
   --set useCertManager=true \
   --set enableWorkload=true \
-  --set enableWebhook=true
+  --set enableWebhook=true \
+  --set webhookCertSecretName=fleet-webhook-server-cert
 
 # Or using traditional repository
 helm install hub-agent kubefleet/hub-agent \
@@ -172,12 +173,13 @@ helm install hub-agent kubefleet/hub-agent \
   --create-namespace \
   --set useCertManager=true \
   --set enableWorkload=true \
-  --set enableWebhook=true
+  --set enableWebhook=true \
+  --set webhookCertSecretName=fleet-webhook-server-cert
 ```
 
 The `webhookCertSecretName` parameter specifies the Secret name for the certificate:
-- Default: `fleet-webhook-server-cert`
-- When using cert-manager, this is where cert-manager stores the certificate
+- Default: `unset`; there is no default in `values.yaml`
+- When using cert-manager, set this to the Secret where cert-manager stores the certificate
 - Must match the secret name referenced in the deployment volume mount
 
 Example with custom secret name:
@@ -198,4 +200,4 @@ helm install hub-agent kubefleet/hub-agent \
   --set useCertManager=true \
   --set enableWorkload=true \
   --set webhookCertSecretName=my-webhook-secret
-```
\ No newline at end of file
+```
diff --git a/charts/hub-agent/templates/deployment.yaml b/charts/hub-agent/templates/deployment.yaml
index 23fd4197e..e3d115344 100644
--- a/charts/hub-agent/templates/deployment.yaml
+++ b/charts/hub-agent/templates/deployment.yaml
@@ -19,6 +19,13 @@ spec:
         {{- include "hub-agent.selectorLabels" . | nindent 8 }}
     spec:
       serviceAccountName: {{ include "hub-agent.fullname" . }}-sa
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 65532
+        runAsGroup: 65532
+        fsGroup: 65532
+        seccompProfile:
+          type: RuntimeDefault
       initContainers:
         {{- if .Values.crdInstaller.enabled }}
         - name: crd-installer
@@ -32,6 +39,12 @@ spec:
         - name: {{ .Chart.Name }}
           image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
           imagePullPolicy: {{ .Values.image.pullPolicy }}
+          securityContext:
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop:
+                - ALL
           args:
             - --leader-elect=true
             - --enable-webhook={{ .Values.enableWebhook }}
@@ -92,16 +105,26 @@ spec:
             # This path must match FleetWebhookCertDir in pkg/webhook/webhook.go
             mountPath: /tmp/k8s-webhook-server/serving-certs
             readOnly: true
+          {{- else }}
+          volumeMounts:
+          - name: webhook-cert
+            # This path must match FleetWebhookCertDir in pkg/webhook/webhook.go.
+            # Note that this must be mounted one level up from the hardcoded path, otherwise
+            # the read only root filesystem setup would block the agent from attempting to
+            # clear the directory.
+            mountPath: /tmp/k8s-webhook-server/
           {{- end }}
-      {{- if .Values.useCertManager }}
       volumes:
       - name: webhook-cert
+        {{- if .Values.useCertManager }}
         secret:
           secretName: {{ .Values.webhookCertSecretName }}
           # defaultMode 0444 (read for all) allows the container process to read the certs
           # regardless of the user/group it runs as
           defaultMode: 0444
-      {{- end }}
+        {{- else }}
+        emptyDir: {}
+        {{- end }}
       {{- with .Values.affinity }}
       affinity:
         {{- toYaml . | nindent 8 }}
diff --git a/charts/hub-agent/values.yaml b/charts/hub-agent/values.yaml
index 90c1106a6..fd24c9182 100644
--- a/charts/hub-agent/values.yaml
+++ b/charts/hub-agent/values.yaml
@@ -32,6 +32,7 @@ enableWorkload: false
 useCertManager: false
 # webhookCertSecretName is ONLY used when useCertManager=true
 # It specifies the name of the Secret where cert-manager stores the certificate
+# Example:
 # webhookCertSecretName: fleet-webhook-server-cert
 
 forceDeleteWaitTime: 15m0s
diff --git a/charts/member-agent/templates/deployment.yaml b/charts/member-agent/templates/deployment.yaml
index be57774c6..9aa9ab267 100644
--- a/charts/member-agent/templates/deployment.yaml
+++ b/charts/member-agent/templates/deployment.yaml
@@ -16,6 +16,13 @@ spec:
     spec:
       restartPolicy: Always
       serviceAccountName: {{ include "member-agent.fullname" . }}-sa
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 65532
+        runAsGroup: 65532
+        fsGroup: 65532
+        seccompProfile:
+          type: RuntimeDefault
       initContainers:
         {{- if .Values.crdInstaller.enabled }}
         - name: crd-installer
@@ -29,6 +36,12 @@ spec:
         - name: {{ include "member-agent.fullname" . }}
           image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
           imagePullPolicy: {{ .Values.image.pullPolicy }}
+          securityContext:
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop:
+                - ALL
           ports:
             - name: http
               containerPort: 80
@@ -149,6 +162,12 @@ spec:
         - name: refresh-token
           image: "{{ .Values.refreshtoken.repository }}:{{ .Values.refreshtoken.tag | default .Chart.AppVersion }}"
           imagePullPolicy: {{ .Values.refreshtoken.pullPolicy }}
+          securityContext:
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop:
+                - ALL
           args:
             {{- $provider := .Values.config.provider }}
             - {{ $provider }}
diff --git a/cmd/memberagent/main.go b/cmd/memberagent/main.go
index c5ca386d0..94745b021 100644
--- a/cmd/memberagent/main.go
+++ b/cmd/memberagent/main.go
@@ -91,6 +91,11 @@ func main() {
 		klog.InfoS("flag:", "name", f.Name, "value", f.Value)
 	})
 
+	if errs := opts.Validate(); len(errs) != 0 {
+		klog.ErrorS(errs.ToAggregate(), "invalid parameter")
+		klog.FlushAndExit(klog.ExitFlushTimeout, 1)
+	}
+
 	// Set up controller-runtime logger
 	ctrl.SetLogger(zap.New(zap.UseDevMode(true)))
 
@@ -213,13 +218,16 @@ func buildHubConfig(hubURL string, useCertificateAuth bool, tlsClientInsecure bo
 			return err
 		})
 		if err != nil {
-			klog.ErrorS(err, "Failed to retrieve token file from the path %s", tokenFilePath)
+			klog.ErrorS(err, "Failed to retrieve token file", "path", tokenFilePath)
 			return nil, err
 		}
 		hubConfig.BearerTokenFile = tokenFilePath
 	}
 
 	hubConfig.TLSClientConfig.Insecure = tlsClientInsecure
+	if tlsClientInsecure {
+		klog.Warning("TLS verification is disabled for hub cluster connection. This is insecure and should not be used in production.")
+	}
 	if !tlsClientInsecure {
 		caBundle, ok := os.LookupEnv("CA_BUNDLE")
 		if ok && caBundle == "" {
@@ -257,7 +265,7 @@ func buildHubConfig(hubURL string, useCertificateAuth bool, tlsClientInsecure bo
 		r := textproto.NewReader(bufio.NewReader(strings.NewReader(header)))
 		h, err := r.ReadMIMEHeader()
 		if err != nil && !errors.Is(err, io.EOF) {
-			klog.ErrorS(err, "Failed to parse HUB_KUBE_HEADER %q", header)
+			klog.ErrorS(err, "Failed to parse HUB_KUBE_HEADER", "header", header)
 			return nil, err
 		}
 		hubConfig.WrapTransport = func(rt http.RoundTripper) http.RoundTripper {
diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml
index 79927f6d1..f3f4cdfef 100644
--- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml
+++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml
@@ -72,6 +72,24 @@ spec:
                         For ClusterResourcePlacement, you can use SelectionScope to control what gets selected:
                         - NamespaceOnly: Only the namespace object itself
                         - NamespaceWithResources: The namespace AND all resources within it (default)
+                        - NamespaceWithResourceSelectors: The namespace AND resources specified by additional selectors
+
+                        When SelectionScope is NamespaceWithResourceSelectors, you can define additional ResourceSelectorTerms
+                        (after the namespace selector) to specify which resources to include. These additional selectors can
+                        target both namespace-scoped resources (within the selected namespace) and cluster-scoped resources.
+
+                        Important requirements for NamespaceWithResourceSelectors mode:
+                        - Exactly one namespace selector with this mode is allowed
+                        - The namespace selector must select by name (not by label)
+                        - Only one namespace selector is allowed when using this mode (cannot mix with other namespace selectors)
+                        - All requirements are validated via CEL at API validation time
+                        - If the selected namespace is deleted after CRP creation, the controller will report an error condition
+
+                        Example using NamespaceWithResourceSelectors:
+                        - Namespace selector: {Group: "", Version: "v1", Kind: "Namespace", Name: "prod", SelectionScope: "NamespaceWithResourceSelectors"}
+                        - Additional selector: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {app: "frontend"}}
+                        - Third selector: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "admin"}
+                        This selects: the "prod" namespace, all Deployments with label app=frontend in "prod", and the "admin" ClusterRole.
                       type: string
                     labelSelector:
                       description: |-
@@ -133,6 +151,7 @@ spec:
                       enum:
                       - NamespaceOnly
                       - NamespaceWithResources
+                      - NamespaceWithResourceSelectors
                       type: string
                     version:
                       description: Version of the resource to be selected.
diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml
index 6b80c6832..a57b5b3d2 100644
--- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml
+++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml
@@ -86,6 +86,24 @@ spec:
                             For ClusterResourcePlacement, you can use SelectionScope to control what gets selected:
                             - NamespaceOnly: Only the namespace object itself
                             - NamespaceWithResources: The namespace AND all resources within it (default)
+                            - NamespaceWithResourceSelectors: The namespace AND resources specified by additional selectors
+
+                            When SelectionScope is NamespaceWithResourceSelectors, you can define additional ResourceSelectorTerms
+                            (after the namespace selector) to specify which resources to include. These additional selectors can
+                            target both namespace-scoped resources (within the selected namespace) and cluster-scoped resources.
+
+                            Important requirements for NamespaceWithResourceSelectors mode:
+                            - Exactly one namespace selector with this mode is allowed
+                            - The namespace selector must select by name (not by label)
+                            - Only one namespace selector is allowed when using this mode (cannot mix with other namespace selectors)
+                            - All requirements are validated via CEL at API validation time
+                            - If the selected namespace is deleted after CRP creation, the controller will report an error condition
+
+                            Example using NamespaceWithResourceSelectors:
+                            - Namespace selector: {Group: "", Version: "v1", Kind: "Namespace", Name: "prod", SelectionScope: "NamespaceWithResourceSelectors"}
+                            - Additional selector: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {app: "frontend"}}
+                            - Third selector: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "admin"}
+                            This selects: the "prod" namespace, all Deployments with label app=frontend in "prod", and the "admin" ClusterRole.
                           type: string
                         labelSelector:
                           description: |-
@@ -147,6 +165,7 @@ spec:
                           enum:
                           - NamespaceOnly
                           - NamespaceWithResources
+                          - NamespaceWithResourceSelectors
                           type: string
                         version:
                           description: Version of the resource to be selected.
diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml
index 8ffe14382..5c586555d 100644
--- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml
+++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml
@@ -554,6 +554,24 @@ spec:
                         For ClusterResourcePlacement, you can use SelectionScope to control what gets selected:
                         - NamespaceOnly: Only the namespace object itself
                         - NamespaceWithResources: The namespace AND all resources within it (default)
+                        - NamespaceWithResourceSelectors: The namespace AND resources specified by additional selectors
+
+                        When SelectionScope is NamespaceWithResourceSelectors, you can define additional ResourceSelectorTerms
+                        (after the namespace selector) to specify which resources to include. These additional selectors can
+                        target both namespace-scoped resources (within the selected namespace) and cluster-scoped resources.
+
+                        Important requirements for NamespaceWithResourceSelectors mode:
+                        - Exactly one namespace selector with this mode is allowed
+                        - The namespace selector must select by name (not by label)
+                        - Only one namespace selector is allowed when using this mode (cannot mix with other namespace selectors)
+                        - All requirements are validated via CEL at API validation time
+                        - If the selected namespace is deleted after CRP creation, the controller will report an error condition
+
+                        Example using NamespaceWithResourceSelectors:
+                        - Namespace selector: {Group: "", Version: "v1", Kind: "Namespace", Name: "prod", SelectionScope: "NamespaceWithResourceSelectors"}
+                        - Additional selector: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {app: "frontend"}}
+                        - Third selector: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "admin"}
+                        This selects: the "prod" namespace, all Deployments with label app=frontend in "prod", and the "admin" ClusterRole.
                       type: string
                     labelSelector:
                       description: |-
@@ -615,6 +633,7 @@ spec:
                       enum:
                       - NamespaceOnly
                       - NamespaceWithResources
+                      - NamespaceWithResourceSelectors
                       type: string
                     version:
                       description: Version of the resource to be selected.
@@ -961,6 +980,25 @@ spec:
             - message: statusReportingScope is immutable
               rule: '!has(oldSelf.statusReportingScope) || self.statusReportingScope
                 == oldSelf.statusReportingScope'
+            - message: only one namespace selector with NamespaceWithResourceSelectors
+                mode is allowed
+              rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' &&
+                x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope
+                == 'NamespaceWithResourceSelectors')) <= 1
+            - message: namespace selector with NamespaceWithResourceSelectors mode
+                must select by name (not by label)
+              rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' &&
+                x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope
+                == 'NamespaceWithResourceSelectors')) == 0 || (size(self.resourceSelectors.filter(x,
+                x.kind == 'Namespace' && x.group == "" && x.version == 'v1' && has(x.selectionScope)
+                && x.selectionScope == 'NamespaceWithResourceSelectors' && has(x.name)
+                && size(x.name) > 0 && !has(x.labelSelector))) == 1)
+            - message: when using NamespaceWithResourceSelectors mode, only one namespace
+                selector is allowed (cannot mix with other namespace selectors)
+              rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' &&
+                x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope
+                == 'NamespaceWithResourceSelectors')) == 0 || size(self.resourceSelectors.filter(x,
+                x.kind == 'Namespace' && x.group == "" && x.version == 'v1')) == 1
           status:
             description: The observed status of ClusterResourcePlacement.
             properties:
diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml
index 2b34db4d3..27dc987da 100644
--- a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml
+++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml
@@ -845,10 +845,10 @@ spec:
                                 - Approval
                                 type: string
                               waitTime:
-                                description: The time to wait after all the clusters
-                                  in the current stage complete the update before
-                                  moving to the next stage.
-                                pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                                description: |-
+                                  The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                                  Only hours (h), minutes (m), and seconds (s) units are accepted.
+                                pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                                 type: string
                             required:
                             - type
@@ -879,10 +879,10 @@ spec:
                                 - Approval
                                 type: string
                               waitTime:
-                                description: The time to wait after all the clusters
-                                  in the current stage complete the update before
-                                  moving to the next stage.
-                                pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                                description: |-
+                                  The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                                  Only hours (h), minutes (m), and seconds (s) units are accepted.
+                                pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                                 type: string
                             required:
                             - type
@@ -2070,7 +2070,7 @@ spec:
                                 description: The time to wait after all the clusters
                                   in the current stage complete the update before
                                   moving to the next stage.
-                                pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                                pattern: ^(?:0|(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+)$
                                 type: string
                             required:
                             - type
@@ -3268,10 +3268,10 @@ spec:
                                 - Approval
                                 type: string
                               waitTime:
-                                description: The time to wait after all the clusters
-                                  in the current stage complete the update before
-                                  moving to the next stage.
-                                pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                                description: |-
+                                  The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                                  Only hours (h), minutes (m), and seconds (s) units are accepted.
+                                pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                                 type: string
                             required:
                             - type
@@ -3302,10 +3302,10 @@ spec:
                                 - Approval
                                 type: string
                               waitTime:
-                                description: The time to wait after all the clusters
-                                  in the current stage complete the update before
-                                  moving to the next stage.
-                                pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                                description: |-
+                                  The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                                  Only hours (h), minutes (m), and seconds (s) units are accepted.
+                                pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                                 type: string
                             required:
                             - type
diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml
index 9be8428dc..b134a75b5 100644
--- a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml
+++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml
@@ -70,10 +70,10 @@ spec:
                             - Approval
                             type: string
                           waitTime:
-                            description: The time to wait after all the clusters in
-                              the current stage complete the update before moving
-                              to the next stage.
-                            pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                            description: |-
+                              The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                              Only hours (h), minutes (m), and seconds (s) units are accepted.
+                            pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                             type: string
                         required:
                         - type
@@ -101,10 +101,10 @@ spec:
                             - Approval
                             type: string
                           waitTime:
-                            description: The time to wait after all the clusters in
-                              the current stage complete the update before moving
-                              to the next stage.
-                            pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                            description: |-
+                              The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                              Only hours (h), minutes (m), and seconds (s) units are accepted.
+                            pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                             type: string
                         required:
                         - type
@@ -264,7 +264,7 @@ spec:
                             description: The time to wait after all the clusters in
                               the current stage complete the update before moving
                               to the next stage.
-                            pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                            pattern: ^(?:0|(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+)$
                             type: string
                         required:
                         - type
@@ -399,10 +399,10 @@ spec:
                             - Approval
                             type: string
                           waitTime:
-                            description: The time to wait after all the clusters in
-                              the current stage complete the update before moving
-                              to the next stage.
-                            pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                            description: |-
+                              The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                              Only hours (h), minutes (m), and seconds (s) units are accepted.
+                            pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                             type: string
                         required:
                         - type
@@ -430,10 +430,10 @@ spec:
                             - Approval
                             type: string
                           waitTime:
-                            description: The time to wait after all the clusters in
-                              the current stage complete the update before moving
-                              to the next stage.
-                            pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                            description: |-
+                              The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                              Only hours (h), minutes (m), and seconds (s) units are accepted.
+                            pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                             type: string
                         required:
                         - type
diff --git a/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml b/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml
index 8ab1049c5..db0cdea5d 100644
--- a/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml
+++ b/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml
@@ -546,6 +546,24 @@ spec:
                         For ClusterResourcePlacement, you can use SelectionScope to control what gets selected:
                         - NamespaceOnly: Only the namespace object itself
                         - NamespaceWithResources: The namespace AND all resources within it (default)
+                        - NamespaceWithResourceSelectors: The namespace AND resources specified by additional selectors
+
+                        When SelectionScope is NamespaceWithResourceSelectors, you can define additional ResourceSelectorTerms
+                        (after the namespace selector) to specify which resources to include. These additional selectors can
+                        target both namespace-scoped resources (within the selected namespace) and cluster-scoped resources.
+
+                        Important requirements for NamespaceWithResourceSelectors mode:
+                        - Exactly one namespace selector with this mode is allowed
+                        - The namespace selector must select by name (not by label)
+                        - Only one namespace selector is allowed when using this mode (cannot mix with other namespace selectors)
+                        - All requirements are validated via CEL at API validation time
+                        - If the selected namespace is deleted after CRP creation, the controller will report an error condition
+
+                        Example using NamespaceWithResourceSelectors:
+                        - Namespace selector: {Group: "", Version: "v1", Kind: "Namespace", Name: "prod", SelectionScope: "NamespaceWithResourceSelectors"}
+                        - Additional selector: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {app: "frontend"}}
+                        - Third selector: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "admin"}
+                        This selects: the "prod" namespace, all Deployments with label app=frontend in "prod", and the "admin" ClusterRole.
                       type: string
                     labelSelector:
                       description: |-
@@ -607,6 +625,7 @@ spec:
                       enum:
                       - NamespaceOnly
                       - NamespaceWithResources
+                      - NamespaceWithResourceSelectors
                       type: string
                     version:
                       description: Version of the resource to be selected.
@@ -946,6 +965,25 @@ spec:
             x-kubernetes-validations:
             - message: policy cannot be removed once set
               rule: '!(has(oldSelf.policy) && !has(self.policy))'
+            - message: only one namespace selector with NamespaceWithResourceSelectors
+                mode is allowed
+              rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' &&
+                x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope
+                == 'NamespaceWithResourceSelectors')) <= 1
+            - message: namespace selector with NamespaceWithResourceSelectors mode
+                must select by name (not by label)
+              rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' &&
+                x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope
+                == 'NamespaceWithResourceSelectors')) == 0 || (size(self.resourceSelectors.filter(x,
+                x.kind == 'Namespace' && x.group == "" && x.version == 'v1' && has(x.selectionScope)
+                && x.selectionScope == 'NamespaceWithResourceSelectors' && has(x.name)
+                && size(x.name) > 0 && !has(x.labelSelector))) == 1)
+            - message: when using NamespaceWithResourceSelectors mode, only one namespace
+                selector is allowed (cannot mix with other namespace selectors)
+              rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' &&
+                x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope
+                == 'NamespaceWithResourceSelectors')) == 0 || size(self.resourceSelectors.filter(x,
+                x.kind == 'Namespace' && x.group == "" && x.version == 'v1')) == 1
           status:
             description: The observed status of ResourcePlacement.
             properties:
diff --git a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml
index ff9152c9d..81b506f4a 100644
--- a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml
+++ b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml
@@ -845,10 +845,10 @@ spec:
                                 - Approval
                                 type: string
                               waitTime:
-                                description: The time to wait after all the clusters
-                                  in the current stage complete the update before
-                                  moving to the next stage.
-                                pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                                description: |-
+                                  The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                                  Only hours (h), minutes (m), and seconds (s) units are accepted.
+                                pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                                 type: string
                             required:
                             - type
@@ -879,10 +879,10 @@ spec:
                                 - Approval
                                 type: string
                               waitTime:
-                                description: The time to wait after all the clusters
-                                  in the current stage complete the update before
-                                  moving to the next stage.
-                                pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                                description: |-
+                                  The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                                  Only hours (h), minutes (m), and seconds (s) units are accepted.
+                                pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                                 type: string
                             required:
                             - type
@@ -2188,10 +2188,10 @@ spec:
                                 - Approval
                                 type: string
                               waitTime:
-                                description: The time to wait after all the clusters
-                                  in the current stage complete the update before
-                                  moving to the next stage.
-                                pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                                description: |-
+                                  The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                                  Only hours (h), minutes (m), and seconds (s) units are accepted.
+                                pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                                 type: string
                             required:
                             - type
@@ -2222,10 +2222,10 @@ spec:
                                 - Approval
                                 type: string
                               waitTime:
-                                description: The time to wait after all the clusters
-                                  in the current stage complete the update before
-                                  moving to the next stage.
-                                pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                                description: |-
+                                  The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                                  Only hours (h), minutes (m), and seconds (s) units are accepted.
+                                pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                                 type: string
                             required:
                             - type
diff --git a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml
index 5315f7322..ccfb44e85 100644
--- a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml
+++ b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml
@@ -70,10 +70,10 @@ spec:
                             - Approval
                             type: string
                           waitTime:
-                            description: The time to wait after all the clusters in
-                              the current stage complete the update before moving
-                              to the next stage.
-                            pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                            description: |-
+                              The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                              Only hours (h), minutes (m), and seconds (s) units are accepted.
+                            pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                             type: string
                         required:
                         - type
@@ -101,10 +101,10 @@ spec:
                             - Approval
                             type: string
                           waitTime:
-                            description: The time to wait after all the clusters in
-                              the current stage complete the update before moving
-                              to the next stage.
-                            pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                            description: |-
+                              The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                              Only hours (h), minutes (m), and seconds (s) units are accepted.
+                            pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                             type: string
                         required:
                         - type
@@ -259,10 +259,10 @@ spec:
                             - Approval
                             type: string
                           waitTime:
-                            description: The time to wait after all the clusters in
-                              the current stage complete the update before moving
-                              to the next stage.
-                            pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                            description: |-
+                              The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                              Only hours (h), minutes (m), and seconds (s) units are accepted.
+                            pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                             type: string
                         required:
                         - type
@@ -290,10 +290,10 @@ spec:
                             - Approval
                             type: string
                           waitTime:
-                            description: The time to wait after all the clusters in
-                              the current stage complete the update before moving
-                              to the next stage.
-                            pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$
+                            description: |-
+                              The time to wait after all the clusters in the current stage complete the update before moving to the next stage.
+                              Only hours (h), minutes (m), and seconds (s) units are accepted.
+                            pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$
                             type: string
                         required:
                         - type
diff --git a/docker/refresh-token.Dockerfile b/docker/refresh-token.Dockerfile
index d62a3e1df..10e5a05a5 100644
--- a/docker/refresh-token.Dockerfile
+++ b/docker/refresh-token.Dockerfile
@@ -15,6 +15,8 @@ RUN go mod download
 # Copy the go source
 COPY cmd/authtoken/main.go main.go
 COPY pkg/authtoken pkg/authtoken
+# writefile is a dependency of pkg/authtoken for secure file creation (0600 permissions)
+COPY pkg/utils/writefile pkg/utils/writefile
 
 ARG TARGETARCH
 
diff --git a/go.mod b/go.mod
index d89f80974..70c80ae6e 100644
--- a/go.mod
+++ b/go.mod
@@ -9,6 +9,7 @@ require (
 	github.com/crossplane/crossplane-runtime/v2 v2.1.0
 	github.com/evanphx/json-patch/v5 v5.9.11
 	github.com/gofrs/uuid v4.4.0+incompatible
+	github.com/go-logr/logr v1.4.3
 	github.com/google/go-cmp v0.7.0
 	github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3
 	github.com/onsi/ginkgo/v2 v2.23.4
@@ -70,7 +71,6 @@ require (
 	github.com/fsnotify/fsnotify v1.9.0 // indirect
 	github.com/fxamacker/cbor/v2 v2.9.0 // indirect
 	github.com/go-errors/errors v1.4.2 // indirect
-	github.com/go-logr/logr v1.4.3 // indirect
 	github.com/go-logr/zapr v1.3.0 // indirect
 	github.com/go-openapi/jsonpointer v0.21.1 // indirect
 	github.com/go-openapi/jsonreference v0.21.0 // indirect
diff --git a/pkg/authtoken/token_writer.go b/pkg/authtoken/token_writer.go
index 48b965f3f..a7b2f32c1 100644
--- a/pkg/authtoken/token_writer.go
+++ b/pkg/authtoken/token_writer.go
@@ -18,9 +18,10 @@ package authtoken
 import (
 	"fmt"
 	"io"
-	"os"
 
 	"k8s.io/klog/v2"
+
+	"go.goms.io/fleet/pkg/utils/writefile"
 )
 
 type Factory struct {
@@ -31,8 +32,11 @@ func NewFactory(filePath string) Factory {
 	return Factory{filePath: filePath}
 }
 
+// Create opens the token file for writing using CreateSecureFile.
+// The member-agent container runs as the same UID and reads this file via client-go's
+// BearerTokenFile (O_RDONLY), so owner permission is sufficient for both writer and reader.
 func (w Factory) Create() (io.WriteCloser, error) {
-	wc, err := os.Create(w.filePath)
+	wc, err := writefile.CreateSecureFile(w.filePath)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/controllers/placement/controller.go b/pkg/controllers/placement/controller.go
index 8def38445..42a5a3276 100644
--- a/pkg/controllers/placement/controller.go
+++ b/pkg/controllers/placement/controller.go
@@ -507,25 +507,23 @@ func (r *Reconciler) ensureLatestPolicySnapshot(ctx context.Context, placementOb
 // 2 & 3 should never happen.
 func (r *Reconciler) lookupLatestSchedulingPolicySnapshot(ctx context.Context, placement fleetv1beta1.PlacementObj) (fleetv1beta1.PolicySnapshotObj, int, error) {
 	placementKey := types.NamespacedName{Name: placement.GetName(), Namespace: placement.GetNamespace()}
-	snapshotList, err := controller.FetchLatestPolicySnapshot(ctx, r.Client, placementKey)
-	if err != nil {
-		return nil, -1, err
-	}
 	placementKObj := klog.KObj(placement)
-	policySnapshotItems := snapshotList.GetPolicySnapshotObjs()
-	if len(policySnapshotItems) == 1 {
-		policyIndex, err := labels.ParsePolicyIndexFromLabel(policySnapshotItems[0])
-		if err != nil {
-			klog.ErrorS(err, "Failed to parse the policy index label", "placement", placementKObj, "policySnapshot", klog.KObj(policySnapshotItems[0]))
-			return nil, -1, controller.NewUnexpectedBehaviorError(err)
-		}
-		return policySnapshotItems[0], policyIndex, nil
-	} else if len(policySnapshotItems) > 1 {
-		// It means there are multiple active snapshots and should never happen.
-		err := fmt.Errorf("there are %d active schedulingPolicySnapshots owned by placement %v", len(policySnapshotItems), placementKey)
-		klog.ErrorS(err, "Invalid schedulingPolicySnapshots", "placement", placementKObj)
-		return nil, -1, controller.NewUnexpectedBehaviorError(err)
+	activeSnapshot, err := controller.LookupLatestPolicySnapshot(ctx, r.Client, placementKey)
+	switch {
+	case err == nil:
+		policyIndex, parseErr := labels.ParsePolicyIndexFromLabel(activeSnapshot)
+		if parseErr != nil {
+			klog.ErrorS(parseErr, "Failed to parse the policy index label", "placement", placementKObj, "policySnapshot", klog.KObj(activeSnapshot))
+			return nil, -1, controller.NewUnexpectedBehaviorError(parseErr)
+		}
+		return activeSnapshot, policyIndex, nil
+	case errors.Is(err, controller.ErrNoLatestPolicySnapshot):
+		// No active snapshot found; recover below by picking the largest-index snapshot.
+	default:
+		// API or invariant-violation error — propagate.
+		return nil, -1, err
 	}
+
 	// When there are no active snapshots, find the one who has the largest policy index.
 	// It should be rare only when placement controller is crashed before creating the new active snapshot.
 	sortedList, err := r.listSortedSchedulingPolicySnapshots(ctx, placement)
diff --git a/pkg/controllers/updaterun/execution_test.go b/pkg/controllers/updaterun/execution_test.go
index 4b9dc09aa..f84c540cc 100644
--- a/pkg/controllers/updaterun/execution_test.go
+++ b/pkg/controllers/updaterun/execution_test.go
@@ -242,7 +242,9 @@ func TestCheckClusterUpdateResult(t *testing.T) {
 			wantErr:       true,
 		},
 		{
-			name: "checkClusterUpdateResult should return false and error if the binding has false workSynchronized condition",
+			// WorkNotSynchronizedYet is an in-progress reason, not a terminal failure (issue #648).
+			// checkClusterUpdateResult now treats it as "still updating" rather than an error.
+			name: "checkClusterUpdateResult should return false but no error if workSynchronized is in-progress (WorkNotSynchronizedYet)",
 			binding: &placementv1beta1.ClusterResourceBinding{
 				ObjectMeta: metav1.ObjectMeta{Generation: 1},
 				Status: placementv1beta1.ResourceBindingStatus{
@@ -258,6 +260,25 @@ func TestCheckClusterUpdateResult(t *testing.T) {
 			},
 			clusterStatus: &placementv1beta1.ClusterUpdatingStatus{ClusterName: "test-cluster"},
 			wantSucceeded: false,
+			wantErr:       false,
+		},
+		{
+			name: "checkClusterUpdateResult should return false and error if workSynchronized failed terminally (SyncWorkFailed)",
+			binding: &placementv1beta1.ClusterResourceBinding{
+				ObjectMeta: metav1.ObjectMeta{Generation: 1},
+				Status: placementv1beta1.ResourceBindingStatus{
+					Conditions: []metav1.Condition{
+						{
+							Type:               string(placementv1beta1.ResourceBindingWorkSynchronized),
+							Status:             metav1.ConditionFalse,
+							ObservedGeneration: 1,
+							Reason:             condition.SyncWorkFailedReason,
+						},
+					},
+				},
+			},
+			clusterStatus: &placementv1beta1.ClusterUpdatingStatus{ClusterName: "test-cluster"},
+			wantSucceeded: false,
 			wantErr:       true,
 		},
 		{
diff --git a/pkg/controllers/updaterun/initialization.go b/pkg/controllers/updaterun/initialization.go
index 1cdf8bcff..2cf0a95d4 100644
--- a/pkg/controllers/updaterun/initialization.go
+++ b/pkg/controllers/updaterun/initialization.go
@@ -125,29 +125,19 @@ func (r *Reconciler) determinePolicySnapshot(
 	updateRunRef := klog.KObj(updateRun)
 
 	// Get the latest policy snapshot.
-	policySnapshotList, err := controller.FetchLatestPolicySnapshot(ctx, r.Client, placementKey)
+	latestPolicySnapshot, err := controller.LookupLatestPolicySnapshot(ctx, r.Client, placementKey)
 	if err != nil {
-		klog.ErrorS(err, "Failed to list the latest policy snapshots", "placement", placementKey, "updateRun", updateRunRef)
-		// err can be retried.
-		return nil, -1, controller.NewAPIServerError(true, err)
-	}
-
-	policySnapshotObjs := policySnapshotList.GetPolicySnapshotObjs()
-	if len(policySnapshotObjs) != 1 {
-		if len(policySnapshotObjs) > 1 {
-			err := controller.NewUnexpectedBehaviorError(fmt.Errorf("more than one (%d in actual) latest policy snapshots associated with the placement: `%s`", len(policySnapshotObjs), placementKey))
-			klog.ErrorS(err, "Failed to find the latest policy snapshot", "placement", placementKey, "updateRun", updateRunRef)
-			// no more retries for this error.
+		klog.ErrorS(err, "Failed to find the latest policy snapshot", "placement", placementKey, "updateRun", updateRunRef)
+		// Missing or duplicate-active snapshots are placement-state invariants the run cannot
+		// recover from on its own — fail validation. Anything else (e.g. List errors classified as
+		// ErrAPIServerError, or unexpected cache failures promoted to ErrUnexpectedBehavior) is
+		// propagated raw so controller-runtime retries.
+		if errors.Is(err, controller.ErrNoLatestPolicySnapshot) || errors.Is(err, controller.ErrMultipleActivePolicySnapshots) {
 			return nil, -1, fmt.Errorf("%w: %s", errValidationFailed, err.Error())
 		}
-		// no latest policy snapshot found.
-		err := fmt.Errorf("no latest policy snapshot associated with the placement: `%s`", placementKey)
-		klog.ErrorS(err, "Failed to find the latest policy snapshot", "placement", placementKey, "updateRun", updateRunRef)
-		// again, no more retries here.
-		return nil, -1, fmt.Errorf("%w: %s", errValidationFailed, err.Error())
+		return nil, -1, err
 	}
 
-	latestPolicySnapshot := policySnapshotObjs[0]
 	policySnapshotRef := klog.KObj(latestPolicySnapshot)
 	policyIndex, foundIndex := latestPolicySnapshot.GetLabels()[placementv1beta1.PolicyIndexLabel]
 	if !foundIndex || len(policyIndex) == 0 {
diff --git a/pkg/controllers/updaterun/initialization_integration_test.go b/pkg/controllers/updaterun/initialization_integration_test.go
index 9fa92d4d5..d7aea3463 100644
--- a/pkg/controllers/updaterun/initialization_integration_test.go
+++ b/pkg/controllers/updaterun/initialization_integration_test.go
@@ -210,7 +210,7 @@ var _ = Describe("Updaterun initialization tests", func() {
 			Expect(k8sClient.Create(ctx, updateRun)).To(Succeed())
 
 			By("Validating the initialization failed")
-			validateFailedInitCondition(ctx, updateRun, "no latest policy snapshot associated")
+			validateFailedInitCondition(ctx, updateRun, "no latest policy snapshot found")
 		})
 
 		It("Should fail to initialize if there are multiple latest policy snapshots", func() {
@@ -223,7 +223,7 @@ var _ = Describe("Updaterun initialization tests", func() {
 			Expect(k8sClient.Create(ctx, updateRun)).To(Succeed())
 
 			By("Validating the initialization failed")
-			validateFailedInitCondition(ctx, updateRun, "more than one (2 in actual) latest policy snapshots associated")
+			validateFailedInitCondition(ctx, updateRun, "multiple active policy snapshots found")
 
 			By("Deleting the second scheduling policy snapshot")
 			Expect(k8sClient.Delete(ctx, snapshot2)).Should(Succeed())
diff --git a/pkg/controllers/updaterun/validation_integration_test.go b/pkg/controllers/updaterun/validation_integration_test.go
index 54a34369d..53e88f57a 100644
--- a/pkg/controllers/updaterun/validation_integration_test.go
+++ b/pkg/controllers/updaterun/validation_integration_test.go
@@ -219,7 +219,7 @@ var _ = Describe("UpdateRun validation tests", func() {
 
 				By("Validating the validation failed")
 				wantStatus = generateFailedValidationStatus(updateRun, wantStatus)
-				validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "no latest policy snapshot associated")
+				validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "no latest policy snapshot found")
 
 				By("Checking update run status metrics are emitted")
 				validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateFailedMetric(placementv1beta1.StateRun, updateRun, string(failureType)))
diff --git a/pkg/controllers/workgenerator/controller_integration_test.go b/pkg/controllers/workgenerator/controller_integration_test.go
index 8ed516447..62f1abaa1 100644
--- a/pkg/controllers/workgenerator/controller_integration_test.go
+++ b/pkg/controllers/workgenerator/controller_integration_test.go
@@ -57,10 +57,12 @@ var (
 	validClusterResourceOverrideSnapshotName   = "cro-1"
 	validResourceOverrideSnapshotName          = "ro-1"
 	invalidClusterResourceOverrideSnapshotName = "cro-2" // the overridden manifest is invalid
+	invalidResourceOverrideSnapshotName        = "ro-2"  // the overridden manifest is invalid
 
 	validClusterResourceOverrideSnapshot   placementv1beta1.ClusterResourceOverrideSnapshot
 	validResourceOverrideSnapshot          placementv1beta1.ResourceOverrideSnapshot
 	invalidClusterResourceOverrideSnapshot placementv1beta1.ClusterResourceOverrideSnapshot
+	invalidResourceOverrideSnapshot        placementv1beta1.ResourceOverrideSnapshot
 
 	bindingStatusCmpOpts = cmp.Options{
 		cmpopts.SortSlices(utils.LessFuncConditionByType),
@@ -280,7 +282,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 			// delete the binding
 			Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
 			Expect(k8sClient.Delete(ctx, binding)).Should(Succeed())
-			By(fmt.Sprintf("resource binding  %s is deleted", binding.Name))
+			By(fmt.Sprintf("resource binding %s is deleted", binding.Name))
 			// check the binding is deleted
 			Eventually(func() bool {
 				err := k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)
@@ -302,7 +304,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceCRD, testNameSpace, testResource,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name))
 				spec := placementv1beta1.ResourceBindingSpec{
 					State:                placementv1beta1.BindingStateBound,
 					ResourceSnapshotName: masterSnapshot.Name,
@@ -383,7 +385,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 				Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
 				binding.Spec.State = placementv1beta1.BindingStateUnscheduled
 				Expect(k8sClient.Update(ctx, binding)).Should(Succeed())
-				By(fmt.Sprintf("resource binding  %s updated to be unscheduled", binding.Name))
+				By(fmt.Sprintf("resource binding %s updated to be unscheduled", binding.Name))
 				updateRolloutStartedGeneration(&binding)
 				Consistently(func() error {
 					return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work)
@@ -589,7 +591,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testConfigMap, testResourceEnvelope, testResourceCRD, testNameSpace,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name))
 				spec := placementv1beta1.ResourceBindingSpec{
 					State:                placementv1beta1.BindingStateBound,
 					ResourceSnapshotName: masterSnapshot.Name,
@@ -692,12 +694,12 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceEnvelope2, testResourceCRD, testNameSpace,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("another master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("another master resource snapshot %s created", masterSnapshot.Name))
 				// update binding
 				Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
 				binding.Spec.ResourceSnapshotName = masterSnapshot.Name
 				Expect(k8sClient.Update(ctx, binding)).Should(Succeed())
-				By(fmt.Sprintf("resource binding  %s updated", binding.Name))
+				By(fmt.Sprintf("resource binding %s updated", binding.Name))
 				updateRolloutStartedGeneration(&binding)
 				// check the binding status till the bound condition is true for the second generation
 				Eventually(func() bool {
@@ -711,7 +713,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					return condition.IsConditionStatusTrue(
 						meta.FindStatusCondition(binding.Status.Conditions, string(placementv1beta1.ResourceBindingWorkSynchronized)), binding.GetGeneration())
 				}, timeout, interval).Should(BeTrue(), fmt.Sprintf("binding(%s) condition should be true", binding.Name))
-				By(fmt.Sprintf("resource binding  %s is reconciled", binding.Name))
+				By(fmt.Sprintf("resource binding %s is reconciled", binding.Name))
 				// check the work that contains none enveloped object is updated
 				work := placementv1beta1.Work{}
 				Eventually(func() error {
@@ -790,12 +792,12 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceCRD, testNameSpace,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("another master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("another master resource snapshot %s created", masterSnapshot.Name))
 				// update binding
 				Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
 				binding.Spec.ResourceSnapshotName = masterSnapshot.Name
 				Expect(k8sClient.Update(ctx, binding)).Should(Succeed())
-				By(fmt.Sprintf("resource binding  %s updated", binding.Name))
+				By(fmt.Sprintf("resource binding %s updated", binding.Name))
 				updateRolloutStartedGeneration(&binding)
 				// check the binding status till the bound condition is true for the second binding generation
 				Eventually(func() bool {
@@ -809,7 +811,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					return condition.IsConditionStatusTrue(
 						meta.FindStatusCondition(binding.Status.Conditions, string(placementv1beta1.ResourceBindingWorkSynchronized)), binding.GetGeneration())
 				}, timeout, interval).Should(BeTrue(), fmt.Sprintf("binding(%s) condition should be true", binding.Name))
-				By(fmt.Sprintf("resource binding  %s is reconciled", binding.Name))
+				By(fmt.Sprintf("resource binding %s is reconciled", binding.Name))
 				// check the enveloped work is deleted
 				Eventually(func() error {
 					envelopWorkLabelMatcher := client.MatchingLabels{
@@ -839,7 +841,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testClusterScopedEnvelope, testResourceCRD, testNameSpace,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name))
 				spec := placementv1beta1.ResourceBindingSpec{
 					State:                placementv1beta1.BindingStateBound,
 					ResourceSnapshotName: masterSnapshot.Name,
@@ -942,12 +944,12 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceCRD, testNameSpace,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("another master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("another master resource snapshot %s created", masterSnapshot.Name))
 				// update binding
 				Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
 				binding.Spec.ResourceSnapshotName = masterSnapshot.Name
 				Expect(k8sClient.Update(ctx, binding)).Should(Succeed())
-				By(fmt.Sprintf("resource binding  %s updated", binding.Name))
+				By(fmt.Sprintf("resource binding %s updated", binding.Name))
 				updateRolloutStartedGeneration(&binding)
 				// check the binding status till the bound condition is true for the second binding generation
 				Eventually(func() bool {
@@ -961,7 +963,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					return condition.IsConditionStatusTrue(
 						meta.FindStatusCondition(binding.Status.Conditions, string(placementv1beta1.ResourceBindingWorkSynchronized)), binding.GetGeneration())
 				}, timeout, interval).Should(BeTrue(), fmt.Sprintf("binding(%s) condition should be true", binding.Name))
-				By(fmt.Sprintf("resource binding  %s is reconciled", binding.Name))
+				By(fmt.Sprintf("resource binding %s is reconciled", binding.Name))
 				// check the enveloped work is deleted
 				Eventually(func() error {
 					envelopWorkLabelMatcher := client.MatchingLabels{
@@ -990,7 +992,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceCRD, testNameSpace, testResource,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name))
 				// create binding
 				spec := placementv1beta1.ResourceBindingSpec{
 					State:                placementv1beta1.BindingStateBound,
@@ -1003,19 +1005,19 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testConfigMap, testPdb,
 				})
 				Expect(k8sClient.Create(ctx, secondSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("secondary resource snapshot  %s created", secondSnapshot.Name))
+				By(fmt.Sprintf("secondary resource snapshot %s created", secondSnapshot.Name))
 			})
 
 			AfterEach(func() {
 				// delete the master resource snapshot
 				Expect(k8sClient.Delete(ctx, masterSnapshot)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{}))
-				By(fmt.Sprintf("master resource snapshot  %s deleted", masterSnapshot.Name))
+				By(fmt.Sprintf("master resource snapshot %s deleted", masterSnapshot.Name))
 				// delete the second resource snapshot
 				Expect(k8sClient.Delete(ctx, secondSnapshot)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{}))
-				By(fmt.Sprintf("secondary resource snapshot  %s deleted", secondSnapshot.Name))
+				By(fmt.Sprintf("secondary resource snapshot %s deleted", secondSnapshot.Name))
 				// delete the binding
 				Expect(k8sClient.Delete(ctx, binding)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{}))
-				By(fmt.Sprintf("resource binding  %s deleted", binding.Name))
+				By(fmt.Sprintf("resource binding %s deleted", binding.Name))
 			})
 
 			It("Should create all the work in the target namespace after all the resource snapshot are created", func() {
@@ -1033,7 +1035,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 				}
 				diff := cmp.Diff(expectedManifest, work.Spec.Workload.Manifests)
 				Expect(diff).Should(BeEmpty(), fmt.Sprintf("work manifest(%s) mismatch (-want +got):\n%s", work.Name, diff))
-				// check the work for the secondary resource snapshot is created, it's name is crp-subindex
+				// check the work for the secondary resource snapshot is created, its name is crp-subindex
 				secondWork := placementv1beta1.Work{}
 				Eventually(func() error {
 					return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.WorkNameWithSubindexFmt, testCRPName, 1), Namespace: memberClusterNamespaceName}, &secondWork)
@@ -1099,7 +1101,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 				}
 				diff := cmp.Diff(expectedManifest, work.Spec.Workload.Manifests)
 				Expect(diff).Should(BeEmpty(), fmt.Sprintf("work manifest(%s) mismatch (-want +got):\n%s", work.Name, diff))
-				// check the work for the secondary resource snapshot is created, it's name is crp-subindex
+				// check the work for the secondary resource snapshot is created, its name is crp-subindex
 				secondWork := placementv1beta1.Work{}
 				Eventually(func() error {
 					return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.WorkNameWithSubindexFmt, testCRPName, 1), Namespace: memberClusterNamespaceName}, &secondWork)
@@ -1155,7 +1157,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work)
 				}, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster")
 				By(fmt.Sprintf("first work %s is created in %s", work.Name, work.Namespace))
-				// check the work for the secondary resource snapshot is created, it's name is crp-subindex
+				// check the work for the secondary resource snapshot is created, its name is crp-subindex
 				Eventually(func() error {
 					return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.WorkNameWithSubindexFmt, testCRPName, 1), Namespace: memberClusterNamespaceName}, &work)
 				}, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster")
@@ -1165,25 +1167,25 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceCRD, testNameSpace,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("new master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("new master resource snapshot %s created", masterSnapshot.Name))
 				// update binding
 				Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
 				binding.Spec.ResourceSnapshotName = masterSnapshot.Name
 				Expect(k8sClient.Update(ctx, binding)).Should(Succeed())
 				updateRolloutStartedGeneration(&binding)
-				By(fmt.Sprintf("resource binding  %s updated", binding.Name))
+				By(fmt.Sprintf("resource binding %s updated", binding.Name))
 				// Now create the second resource snapshot
 				secondSnapshot = generateClusterResourceSnapshot(3, 3, 1, [][]byte{
 					testResource, testConfigMap,
 				})
 				Expect(k8sClient.Create(ctx, secondSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("new secondary resource snapshot  %s created", secondSnapshot.Name))
+				By(fmt.Sprintf("new secondary resource snapshot %s created", secondSnapshot.Name))
 				// Now create the third resource snapshot
 				thirdSnapshot := generateClusterResourceSnapshot(3, 3, 2, [][]byte{
 					testPdb,
 				})
 				Expect(k8sClient.Create(ctx, thirdSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("third resource snapshot  %s created", secondSnapshot.Name))
+				By(fmt.Sprintf("third resource snapshot %s created", secondSnapshot.Name))
 				// check the work for the master resource snapshot is created
 				expectedManifest := []placementv1beta1.Manifest{
 					{RawExtension: runtime.RawExtension{Raw: testResourceCRD}},
@@ -1201,7 +1203,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					return nil
 				}, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster")
 				By(fmt.Sprintf("first work %s is updated in %s", work.Name, work.Namespace))
-				// check the work for the secondary resource snapshot is created, it's name is crp-subindex
+				// check the work for the secondary resource snapshot is created, its name is crp-subindex
 				expectedManifest = []placementv1beta1.Manifest{
 					{RawExtension: runtime.RawExtension{Raw: testResource}},
 					{RawExtension: runtime.RawExtension{Raw: testConfigMap}},
@@ -1220,7 +1222,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					return nil
 				}, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster")
 				By(fmt.Sprintf("second work %s is updated in %s", work.Name, work.Namespace))
-				// check the work for the third resource snapshot is created, it's name is crp-subindex
+				// check the work for the third resource snapshot is created, its name is crp-subindex
 				expectedManifest = []placementv1beta1.Manifest{
 					{RawExtension: runtime.RawExtension{Raw: testPdb}},
 				}
@@ -1247,7 +1249,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work)
 				}, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster")
 				By(fmt.Sprintf("first work %s is created in %s", work.Name, work.Namespace))
-				// check the work for the secondary resource snapshot is created, it's name is crp-subindex
+				// check the work for the secondary resource snapshot is created, its name is crp-subindex
 				Eventually(func() error {
 					return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.WorkNameWithSubindexFmt, testCRPName, 1), Namespace: memberClusterNamespaceName}, &work)
 				}, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster")
@@ -1257,13 +1259,13 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceCRD, testNameSpace, testResource, testConfigMap, testPdb,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("new master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("new master resource snapshot %s created", masterSnapshot.Name))
 				// update binding
 				Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
 				binding.Spec.ResourceSnapshotName = masterSnapshot.Name
 				Expect(k8sClient.Update(ctx, binding)).Should(Succeed())
 				updateRolloutStartedGeneration(&binding)
-				By(fmt.Sprintf("resource binding  %s updated", binding.Name))
+				By(fmt.Sprintf("resource binding %s updated", binding.Name))
 				//inspect the work manifest that should have been updated to contain all
 				expectedManifest := []placementv1beta1.Manifest{
 					{RawExtension: runtime.RawExtension{Raw: testResourceCRD}},
@@ -1301,7 +1303,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work)
 				}, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster")
 				By(fmt.Sprintf("first work %s is created in %s", work.Name, work.Namespace))
-				// check the work for the secondary resource snapshot is created, it's name is crp-subindex
+				// check the work for the secondary resource snapshot is created, its name is crp-subindex
 				work2 := placementv1beta1.Work{}
 				Eventually(func() error {
 					return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.WorkNameWithSubindexFmt, testCRPName, 1), Namespace: memberClusterNamespaceName}, &work2)
@@ -1310,7 +1312,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 				// delete the binding
 				Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
 				Expect(k8sClient.Delete(ctx, binding)).Should(Succeed())
-				By(fmt.Sprintf("resource binding  %s is deleted", binding.Name))
+				By(fmt.Sprintf("resource binding %s is deleted", binding.Name))
 
 				// verify that all associated works have been deleted
 				Eventually(func() error {
@@ -1352,7 +1354,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceCRD, testNameSpace, testResource,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name))
 				crolist := []string{validClusterResourceOverrideSnapshotName}
 				roList := []placementv1beta1.NamespacedName{
 					{
@@ -1446,7 +1448,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 				Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
 				binding.Spec.State = placementv1beta1.BindingStateUnscheduled
 				Expect(k8sClient.Update(ctx, binding)).Should(Succeed())
-				By(fmt.Sprintf("resource binding  %s updated to be unscheduled", binding.Name))
+				By(fmt.Sprintf("resource binding %s updated to be unscheduled", binding.Name))
 				updateRolloutStartedGeneration(&binding)
 				Consistently(func() error {
 					return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work)
@@ -1472,7 +1474,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceCRD, testNameSpace, testResource,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name))
 				spec := placementv1beta1.ResourceBindingSpec{
 					State:                placementv1beta1.BindingStateBound,
 					ResourceSnapshotName: masterSnapshot.Name,
@@ -1519,7 +1521,70 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					}
 					return cmp.Diff(wantStatus, binding.Status, cmpConditionOption)
 				}, timeout, interval).Should(BeEmpty(), fmt.Sprintf("binding(%s) mismatch (-want +got)", binding.Name))
-				Expect(binding.GetCondition(string(placementv1beta1.ResourceBindingOverridden)).Message).Should(ContainSubstring("Failed to apply the override rules on the resources: add operation does not apply"))
+				message := binding.GetCondition(string(placementv1beta1.ResourceBindingOverridden)).Message
+				Expect(message).Should(MatchRegexp(`ClusterResourceOverrideSnapshot "[^"]+" failed to apply on \w+ "[^"]+".*: .*add operation does not apply`))
+			})
+		})
+
+		Context("Test Bound ClusterResourceBinding with a single resource snapshot and invalid resource override", func() {
+			var masterSnapshot *placementv1beta1.ClusterResourceSnapshot
+
+			BeforeEach(func() {
+				masterSnapshot = generateClusterResourceSnapshot(1, 1, 0, [][]byte{
+					testResourceCRD, testNameSpace, testResource,
+				})
+				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
+				By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name))
+				spec := placementv1beta1.ResourceBindingSpec{
+					State:                placementv1beta1.BindingStateBound,
+					ResourceSnapshotName: masterSnapshot.Name,
+					TargetCluster:        memberClusterName,
+					ResourceOverrideSnapshots: []placementv1beta1.NamespacedName{
+						{
+							Name:      invalidResourceOverrideSnapshotName,
+							Namespace: appNamespaceName,
+						},
+					},
+				}
+				createClusterResourceBinding(&binding, spec)
+			})
+
+			AfterEach(func() {
+				By("Deleting master clusterResourceSnapshot")
+				Expect(k8sClient.Delete(ctx, masterSnapshot)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{}))
+			})
+
+			It("Should not create the work in the target namespace and surface ResourceOverrideSnapshot identity in the condition message", func() {
+				work := placementv1beta1.Work{}
+				Consistently(func() bool {
+					err := k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work)
+					return errors.IsNotFound(err)
+				}, duration, interval).Should(BeTrue(), "controller should not create work in hub cluster when an override fails")
+				Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
+				Expect(len(binding.Finalizers)).Should(Equal(1))
+
+				Eventually(func() string {
+					Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
+					wantStatus := placementv1beta1.ResourceBindingStatus{
+						Conditions: []metav1.Condition{
+							{
+								Type:               string(placementv1beta1.ResourceBindingRolloutStarted),
+								Status:             metav1.ConditionTrue,
+								Reason:             condition.RolloutStartedReason,
+								ObservedGeneration: binding.GetGeneration(),
+							},
+							{
+								Type:               string(placementv1beta1.ResourceBindingOverridden),
+								Status:             metav1.ConditionFalse,
+								Reason:             condition.OverriddenFailedReason,
+								ObservedGeneration: binding.GetGeneration(),
+							},
+						},
+					}
+					return cmp.Diff(wantStatus, binding.Status, cmpConditionOption)
+				}, timeout, interval).Should(BeEmpty(), fmt.Sprintf("binding(%s) mismatch (-want +got)", binding.Name))
+				message := binding.GetCondition(string(placementv1beta1.ResourceBindingOverridden)).Message
+				Expect(message).Should(MatchRegexp(`ResourceOverrideSnapshot "[^"]+" failed to apply on \w+ "[^"]+".*: .*add operation does not apply`))
 			})
 		})
 
@@ -1531,7 +1596,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceCRD, testNameSpace, testResource,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name))
 				spec := placementv1beta1.ResourceBindingSpec{
 					State:                placementv1beta1.BindingStateBound,
 					ResourceSnapshotName: masterSnapshot.Name,
@@ -1588,7 +1653,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 					testResourceCRD, testNameSpace, testResource,
 				})
 				Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
-				By(fmt.Sprintf("master resource snapshot  %s created", masterSnapshot.Name))
+				By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name))
 				spec := placementv1beta1.ResourceBindingSpec{
 					State:                placementv1beta1.BindingStateBound,
 					ResourceSnapshotName: masterSnapshot.Name,
@@ -1673,7 +1738,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement",
 				Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
 				binding.Spec.State = placementv1beta1.BindingStateUnscheduled
 				Expect(k8sClient.Update(ctx, binding)).Should(Succeed())
-				By(fmt.Sprintf("resource binding  %s updated to be unscheduled", binding.Name))
+				By(fmt.Sprintf("resource binding %s updated to be unscheduled", binding.Name))
 				updateRolloutStartedGeneration(&binding)
 				rolloutCond := binding.GetCondition(string(placementv1beta1.ResourceBindingRolloutStarted))
 				Consistently(func() error {
diff --git a/pkg/controllers/workgenerator/envelope.go b/pkg/controllers/workgenerator/envelope.go
index b7134b77c..0ecf12cdc 100644
--- a/pkg/controllers/workgenerator/envelope.go
+++ b/pkg/controllers/workgenerator/envelope.go
@@ -18,13 +18,15 @@ package workgenerator
 
 import (
 	"context"
+	"crypto/sha256"
+	"encoding/hex"
 	"fmt"
 	"sort"
 	"strings"
 
+	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
-	"k8s.io/apimachinery/pkg/util/uuid"
 	"k8s.io/klog/v2"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
@@ -87,18 +89,27 @@ func (r *Reconciler) createOrUpdateEnvelopeCRWorkObj(
 	var work *fleetv1beta1.Work
 	switch {
 	case len(workList.Items) > 1:
-		// Multiple matching work objects found; this should never occur under normal conditions.
-		wrappedErr := fmt.Errorf("%d work objects found for the same envelope %v, only one expected", len(workList.Items), envelopeReader.GetEnvelopeObjRef())
-		klog.ErrorS(wrappedErr, "Failed to create or update work object for envelope",
-			"resourceBinding", klog.KObj(binding),
-			"resourceSnapshot", klog.KObj(resourceSnapshot),
-			"envelope", envelopeReader.GetEnvelopeObjRef())
-		// Log the work object names to help debug.
+		// Multiple matching work objects found. This can only happen in environments that
+		// were stuck on this state *before* deterministic Work naming was introduced in
+		// buildNewWorkForEnvelopeCR; newly-raced reconciles are now prevented by the
+		// API server's name-uniqueness check. We deliberately do NOT auto-delete the
+		// extras: both Works may have been applied by the member agent, and deleting one
+		// would trigger member-side resource cleanup and user-visible fluctuation on
+		// resources the survivor still owns. Surface the state for operator cleanup
+		// (manually delete all but the oldest Work in the affected namespace).
 		workNames := make([]string, len(workList.Items))
 		for i := range workList.Items {
 			workNames[i] = workList.Items[i].Name
 		}
-		klog.ErrorS(wrappedErr, "Duplicate work objects found", "works", workNames)
+		wrappedErr := fmt.Errorf("%d work objects found for the same envelope %v, only one expected", len(workList.Items), envelopeReader.GetEnvelopeObjRef())
+		klog.ErrorS(wrappedErr, "Duplicate envelope Work objects found; manual cleanup required — delete all but the oldest Work in the namespace",
+			"works", workNames,
+			"resourceBinding", klog.KObj(binding),
+			"resourceSnapshot", klog.KObj(resourceSnapshot),
+			"envelope", envelopeReader.GetEnvelopeObjRef())
+		r.recorder.Eventf(binding, corev1.EventTypeWarning, "DuplicateEnvelopeWorks",
+			"Multiple Work objects (%v) found for envelope %v in namespace %s; delete all but the oldest to recover",
+			workNames, envelopeReader.GetEnvelopeObjRef(), fmt.Sprintf(utils.NamespaceNameFormat, binding.GetBindingSpec().TargetCluster))
 		return nil, controller.NewUnexpectedBehaviorError(wrappedErr)
 	case len(workList.Items) == 1:
 		klog.V(2).InfoS("Found existing work object for the envelope; updating it",
@@ -200,6 +211,24 @@ func refreshWorkForEnvelopeCR(
 	work.Spec.ApplyStrategy = resourceBinding.GetBindingSpec().ApplyStrategy
 }
 
+// envelopeWorkNameSuffix returns a deterministic suffix for the envelope Work's name,
+// derived from the envelope's identity. Using a deterministic suffix (instead of a random
+// UUID) means two concurrent Create attempts for the same envelope collide on name at the
+// API server, which enforces the "one Work per (binding, envelope)" invariant server-side.
+// The hash is truncated to 8 bytes of SHA-256 (rendered as 16 hex characters), keeping
+// the overall Work name well under the DNS-1123 subdomain limit; collision probability
+// at the scale of a single binding's envelope set is ~2^-64.
+func envelopeWorkNameSuffix(envelopeReader fleetv1beta1.EnvelopeReader) string {
+	// NOTE: ClusterResourceEnvelope has an empty namespace; ResourceEnvelope has a namespace.
+	// Including the type disambiguates an unlikely name collision across kinds.
+	identity := fmt.Sprintf("%s/%s/%s",
+		envelopeReader.GetEnvelopeType(),
+		envelopeReader.GetNamespace(),
+		envelopeReader.GetName())
+	sum := sha256.Sum256([]byte(identity))
+	return hex.EncodeToString(sum[:8])
+}
+
 func buildNewWorkForEnvelopeCR(
 	workNamePrefix string,
 	resourceBinding fleetv1beta1.BindingObj,
@@ -208,7 +237,7 @@ func buildNewWorkForEnvelopeCR(
 	manifests []fleetv1beta1.Manifest,
 	resourceOverrideSnapshotHash, clusterResourceOverrideSnapshotHash string,
 ) *fleetv1beta1.Work {
-	workName := fmt.Sprintf(fleetv1beta1.WorkNameWithEnvelopeCRFmt, workNamePrefix, uuid.NewUUID())
+	workName := fmt.Sprintf(fleetv1beta1.WorkNameWithEnvelopeCRFmt, workNamePrefix, envelopeWorkNameSuffix(envelopeReader))
 	workNamespace := fmt.Sprintf(utils.NamespaceNameFormat, resourceBinding.GetBindingSpec().TargetCluster)
 
 	// Create the labels map
diff --git a/pkg/controllers/workgenerator/envelope_test.go b/pkg/controllers/workgenerator/envelope_test.go
index 29514ff74..bb7c26e26 100644
--- a/pkg/controllers/workgenerator/envelope_test.go
+++ b/pkg/controllers/workgenerator/envelope_test.go
@@ -17,8 +17,11 @@ limitations under the License.
 package workgenerator
 
 import (
+	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
+	"strings"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
@@ -32,9 +35,12 @@ import (
 
 	fleetv1beta1 "go.goms.io/fleet/apis/placement/v1beta1"
 	"go.goms.io/fleet/pkg/utils"
+	"go.goms.io/fleet/pkg/utils/controller"
 	"go.goms.io/fleet/test/utils/informer"
 )
 
+const testWorkNamePrefix = "test-work"
+
 func TestExtractManifestsFromEnvelopeCR(t *testing.T) {
 	tests := []struct {
 		name           string
@@ -255,8 +261,6 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) {
 	ignoreWorkMeta := cmpopts.IgnoreFields(metav1.ObjectMeta{}, "Name", "OwnerReferences")
 	scheme := serviceScheme(t)
 
-	workNamePrefix := "test-work"
-
 	resourceSnapshot := &fleetv1beta1.ClusterResourceSnapshot{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: "test-snapshot",
@@ -306,7 +310,7 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) {
 	// Create an existing work for update test
 	existingWork := &fleetv1beta1.Work{
 		ObjectMeta: metav1.ObjectMeta{
-			Name:      workNamePrefix,
+			Name:      testWorkNamePrefix,
 			Namespace: "fleet-member-test-cluster-1",
 			Labels: map[string]string{
 				fleetv1beta1.ParentBindingLabel:     resourceBinding.Name,
@@ -483,7 +487,13 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) {
 			wantErr:                             true,
 		},
 		{
-			name:                                "two existing works should result in error",
+			// Duplicate envelope Works indicate an environment that was stuck before
+			// deterministic naming was introduced. The controller surfaces the state
+			// (error + Event) but does NOT auto-delete, because deleting a Work the
+			// member agent has already applied would trigger resource fluctuation on
+			// the member side. Full post-conditions are verified in
+			// TestCreateOrUpdateEnvelopeCRWorkObj_DuplicateWorksSurfaceWithoutMutation.
+			name:                                "two existing works surface an UnexpectedBehaviorError without mutation",
 			envelopeReader:                      resourceEnvelope,
 			resourceOverrideSnapshotHash:        "new-resource-hash",
 			clusterResourceOverrideSnapshotHash: "new-cluster-resource-hash",
@@ -512,7 +522,7 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) {
 			}
 
 			// Call the function under test
-			got, err := r.createOrUpdateEnvelopeCRWorkObj(ctx, tt.envelopeReader, workNamePrefix,
+			got, err := r.createOrUpdateEnvelopeCRWorkObj(ctx, tt.envelopeReader, testWorkNamePrefix,
 				resourceBinding, resourceSnapshot, tt.resourceOverrideSnapshotHash, tt.clusterResourceOverrideSnapshotHash)
 
 			if (err != nil) != tt.wantErr {
@@ -532,7 +542,6 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) {
 func TestProcessOneSelectedResource(t *testing.T) {
 	scheme := serviceScheme(t)
 
-	workNamePrefix := "test-work"
 	resourceBinding := &fleetv1beta1.ClusterResourceBinding{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: "test-binding",
@@ -685,7 +694,7 @@ func TestProcessOneSelectedResource(t *testing.T) {
 				tt.selectedResource,
 				resourceBinding,
 				snapshot,
-				workNamePrefix,
+				testWorkNamePrefix,
 				tt.resourceOverrideSnapshotHash,
 				tt.clusterResourceOverrideSnapshotHash,
 				activeWork,
@@ -725,3 +734,172 @@ func createResourceContent(t *testing.T, obj runtime.Object) *fleetv1beta1.Resou
 		},
 	}
 }
+
+// TestEnvelopeWorkNameSuffix verifies that the suffix is a stable, length-bounded function
+// of the envelope's identity. Determinism is what lets the API server enforce the
+// "one Work per (binding, envelope)" invariant via name uniqueness.
+func TestEnvelopeWorkNameSuffix(t *testing.T) {
+	base := &fleetv1beta1.ResourceEnvelope{
+		ObjectMeta: metav1.ObjectMeta{Name: "env-a", Namespace: "ns-1"},
+	}
+
+	tests := []struct {
+		name      string
+		other     fleetv1beta1.EnvelopeReader
+		wantEqual bool // true → suffix must match base; false → suffix must differ
+	}{
+		{
+			name:      "same identity",
+			other:     &fleetv1beta1.ResourceEnvelope{ObjectMeta: metav1.ObjectMeta{Name: "env-a", Namespace: "ns-1"}},
+			wantEqual: true,
+		},
+		{
+			name:      "different name",
+			other:     &fleetv1beta1.ResourceEnvelope{ObjectMeta: metav1.ObjectMeta{Name: "env-b", Namespace: "ns-1"}},
+			wantEqual: false,
+		},
+		{
+			name:      "different namespace",
+			other:     &fleetv1beta1.ResourceEnvelope{ObjectMeta: metav1.ObjectMeta{Name: "env-a", Namespace: "ns-2"}},
+			wantEqual: false,
+		},
+		{
+			// Same name as base, but cluster-scoped — the type field in the hash input disambiguates.
+			name:      "different type, same name",
+			other:     &fleetv1beta1.ClusterResourceEnvelope{ObjectMeta: metav1.ObjectMeta{Name: "env-a"}},
+			wantEqual: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			gotBase := envelopeWorkNameSuffix(base)
+			gotOther := envelopeWorkNameSuffix(tt.other)
+			if tt.wantEqual && gotBase != gotOther {
+				t.Errorf("envelopeWorkNameSuffix(%v) = %q, envelopeWorkNameSuffix(%v) = %q, want equal", base, gotBase, tt.other, gotOther)
+			}
+			if !tt.wantEqual && gotBase == gotOther {
+				t.Errorf("envelopeWorkNameSuffix(%v) = envelopeWorkNameSuffix(%v) = %q, want different", base, tt.other, gotBase)
+			}
+		})
+	}
+
+	// Suffix is bounded and DNS-1123-safe (lowercase hex).
+	suffix := envelopeWorkNameSuffix(base)
+	if len(suffix) != 16 {
+		t.Errorf("envelopeWorkNameSuffix length = %d, want 16", len(suffix))
+	}
+	if strings.ToLower(suffix) != suffix {
+		t.Errorf("envelopeWorkNameSuffix(%v) = %q, want lowercase", base, suffix)
+	}
+}
+
+// TestCreateOrUpdateEnvelopeCRWorkObj_DuplicateWorksSurfaceWithoutMutation verifies that
+// when the label query returns multiple Works for the same (binding, envelope), the
+// controller surfaces the condition (UnexpectedBehaviorError + Event) but does NOT
+// mutate cluster state. Auto-deleting duplicates is unsafe because the member agent
+// may have applied them; deleting one triggers resource cleanup that conflicts with
+// the survivor. Deterministic naming (see buildNewWorkForEnvelopeCR) prevents new
+// duplicates; pre-existing ones require operator cleanup.
+func TestCreateOrUpdateEnvelopeCRWorkObj_DuplicateWorksSurfaceWithoutMutation(t *testing.T) {
+	scheme := serviceScheme(t)
+	ctx := context.Background()
+
+	resourceSnapshot := &fleetv1beta1.ClusterResourceSnapshot{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:   "test-snapshot",
+			Labels: map[string]string{fleetv1beta1.PlacementTrackingLabel: "test-crp"},
+		},
+	}
+	resourceBinding := &fleetv1beta1.ClusterResourceBinding{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:   "test-binding",
+			Labels: map[string]string{fleetv1beta1.PlacementTrackingLabel: "test-crp"},
+		},
+		Spec: fleetv1beta1.ResourceBindingSpec{
+			TargetCluster:        "test-cluster-1",
+			ResourceSnapshotName: resourceSnapshot.Name,
+		},
+	}
+	resourceEnvelope := &fleetv1beta1.ResourceEnvelope{
+		ObjectMeta: metav1.ObjectMeta{Name: "dup-envelope", Namespace: "default"},
+		Data: map[string]runtime.RawExtension{
+			"configmap": {Raw: []byte(`{"apiVersion":"v1","kind":"ConfigMap","metadata":{"name":"cm","namespace":"default"}}`)},
+		},
+	}
+
+	workNamespace := fmt.Sprintf(utils.NamespaceNameFormat, resourceBinding.Spec.TargetCluster)
+	labelsForWork := map[string]string{
+		fleetv1beta1.ParentBindingLabel:     resourceBinding.Name,
+		fleetv1beta1.PlacementTrackingLabel: resourceBinding.Labels[fleetv1beta1.PlacementTrackingLabel],
+		fleetv1beta1.EnvelopeTypeLabel:      string(fleetv1beta1.ResourceEnvelopeType),
+		fleetv1beta1.EnvelopeNameLabel:      resourceEnvelope.Name,
+		fleetv1beta1.EnvelopeNamespaceLabel: resourceEnvelope.Namespace,
+	}
+	dupNames := []string{"dup-envelope-a", "dup-envelope-b", "dup-envelope-c"}
+	mkWork := func(name string) *fleetv1beta1.Work {
+		return &fleetv1beta1.Work{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      name,
+				Namespace: workNamespace,
+				Labels:    labelsForWork,
+			},
+		}
+	}
+	objs := make([]client.Object, 0, len(dupNames))
+	for _, n := range dupNames {
+		objs = append(objs, mkWork(n))
+	}
+	fakeClient := fake.NewClientBuilder().
+		WithScheme(scheme).
+		WithObjects(objs...).
+		Build()
+
+	recorder := record.NewFakeRecorder(10)
+	r := &Reconciler{
+		Client:          fakeClient,
+		recorder:        recorder,
+		InformerManager: &informer.FakeManager{},
+	}
+
+	got, err := r.createOrUpdateEnvelopeCRWorkObj(ctx, resourceEnvelope, testWorkNamePrefix,
+		resourceBinding, resourceSnapshot, "", "")
+
+	if got != nil {
+		t.Errorf("createOrUpdateEnvelopeCRWorkObj() = %v, want nil on duplicate-detected path", got)
+	}
+	if err == nil {
+		t.Fatalf("createOrUpdateEnvelopeCRWorkObj() error = nil, want UnexpectedBehaviorError")
+	}
+	if !errors.Is(err, controller.ErrUnexpectedBehavior) {
+		t.Errorf("createOrUpdateEnvelopeCRWorkObj() error = %v, want wrapping controller.ErrUnexpectedBehavior", err)
+	}
+
+	// Post-condition: all duplicates remain untouched — no auto-deletion.
+	remaining := &fleetv1beta1.WorkList{}
+	if err := fakeClient.List(ctx, remaining, client.InNamespace(workNamespace)); err != nil {
+		t.Fatalf("List Works in %q: %v", workNamespace, err)
+	}
+	sortStrings := cmpopts.SortSlices(func(a, b string) bool { return a < b })
+	if diff := cmp.Diff(dupNames, workNames(remaining.Items), sortStrings); diff != "" {
+		t.Errorf("Works after call mismatch (-want +got):\n%s", diff)
+	}
+
+	// Post-condition: a Warning Event was emitted so operators can see the stuck state.
+	select {
+	case ev := <-recorder.Events:
+		if !strings.Contains(ev, "DuplicateEnvelopeWorks") {
+			t.Errorf("event = %q, want reason DuplicateEnvelopeWorks", ev)
+		}
+	default:
+		t.Error("no Event emitted for duplicate envelope Works; operators would have no surface signal")
+	}
+}
+
+func workNames(items []fleetv1beta1.Work) []string {
+	out := make([]string, len(items))
+	for i := range items {
+		out[i] = items[i].Name
+	}
+	return out
+}
diff --git a/pkg/controllers/workgenerator/override.go b/pkg/controllers/workgenerator/override.go
index 5c6ecb339..72b879cf3 100644
--- a/pkg/controllers/workgenerator/override.go
+++ b/pkg/controllers/workgenerator/override.go
@@ -35,6 +35,10 @@ import (
 	"go.goms.io/fleet/pkg/utils/overrider"
 )
 
+// fetchClusterResourceOverrideSnapshots returns the binding's CRO snapshots keyed by the
+// ResourceIdentifier their selectors target. Order matches the binding's snapshot list so
+// callers can apply deterministically.
+//
 // TODO: combine the following two functions into one, as they are very similar.
 func (r *Reconciler) fetchClusterResourceOverrideSnapshots(ctx context.Context, resourceBinding placementv1beta1.BindingObj) (map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ClusterResourceOverrideSnapshot, error) {
 	croMap := make(map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ClusterResourceOverrideSnapshot)
@@ -70,6 +74,9 @@ func (r *Reconciler) fetchClusterResourceOverrideSnapshots(ctx context.Context,
 	return croMap, nil
 }
 
+// fetchResourceOverrideSnapshots returns the binding's RO snapshots keyed by the
+// ResourceIdentifier their selectors target. Order matches the binding's snapshot list so
+// callers can apply deterministically.
 func (r *Reconciler) fetchResourceOverrideSnapshots(ctx context.Context, resourceBinding placementv1beta1.BindingObj) (map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ResourceOverrideSnapshot, error) {
 	roMap := make(map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ResourceOverrideSnapshot)
 
@@ -127,11 +134,11 @@ func (r *Reconciler) applyOverrides(resource *placementv1beta1.ResourceContent,
 		Kind:    gvk.Kind,
 		Name:    uResource.GetName(),
 	}
-	isClusterScopeResource := r.InformerManager.IsClusterScopedResources(gvk)
+	isClusterScopedResource := r.InformerManager.IsClusterScopedResources(gvk)
 
 	// For the namespace scoped resource, it could be selected by the namespace itself.
-	// use the namespace as the key
-	if !isClusterScopeResource {
+	// Use the namespace as the key.
+	if !isClusterScopedResource {
 		key = placementv1beta1.ResourceIdentifier{
 			Group:   utils.NamespaceMetaGVK.Group,
 			Version: utils.NamespaceMetaGVK.Version,
@@ -149,14 +156,15 @@ func (r *Reconciler) applyOverrides(resource *placementv1beta1.ResourceContent,
 		}
 		if err := applyOverrideRules(resource, cluster, snapshot.Spec.OverrideSpec.Policy.OverrideRules); err != nil {
 			klog.ErrorS(err, "Failed to apply the override rules", "clusterResourceOverrideSnapshot", klog.KObj(snapshot))
-			return false, err
+			return false, controller.NewUserError(fmt.Errorf("ClusterResourceOverrideSnapshot %q failed to apply on %s: %s",
+				snapshot.Name, formatOverrideTarget(&uResource), err.Error()))
 		}
 	}
 	klog.V(2).InfoS("Applied clusterResourceOverrideSnapshots", "resource", klog.KObj(&uResource), "numberOfOverrides", len(croMap[key]))
 
 	// If the resource is selected by both ClusterResourceOverride and ResourceOverride, ResourceOverride will win when resolving conflicts.
 	// Apply ResourceOverrideSnapshots.
-	if !isClusterScopeResource {
+	if !isClusterScopedResource {
 		key = placementv1beta1.ResourceIdentifier{
 			Group:     gvk.Group,
 			Version:   gvk.Version,
@@ -172,7 +180,8 @@ func (r *Reconciler) applyOverrides(resource *placementv1beta1.ResourceContent,
 			}
 			if err := applyOverrideRules(resource, cluster, snapshot.Spec.OverrideSpec.Policy.OverrideRules); err != nil {
 				klog.ErrorS(err, "Failed to apply the override rules", "resourceOverrideSnapshot", klog.KObj(snapshot))
-				return false, err
+				return false, controller.NewUserError(fmt.Errorf("ResourceOverrideSnapshot %q failed to apply on %s: %s",
+					snapshot.Name, formatOverrideTarget(&uResource), err.Error()))
 			}
 		}
 		klog.V(2).InfoS("Applied resourceOverrideSnapshots", "resource", klog.KObj(&uResource), "numberOfOverrides", len(roMap[key]))
@@ -180,12 +189,29 @@ func (r *Reconciler) applyOverrides(resource *placementv1beta1.ResourceContent,
 	return resource.Raw == nil, nil
 }
 
+// formatOverrideTarget renders the target as e.g. `Deployment "my-app" in namespace "default"`
+// for inclusion in a user-facing error message.
+func formatOverrideTarget(target *unstructured.Unstructured) string {
+	// Fall back to "Unknown" so a snapshot missing `kind` doesn't render as `"" "name"`.
+	kind := target.GetObjectKind().GroupVersionKind().Kind
+	if kind == "" {
+		kind = "Unknown"
+	}
+	if ns := target.GetNamespace(); ns != "" {
+		return fmt.Sprintf("%s %q in namespace %q", kind, target.GetName(), ns)
+	}
+	return fmt.Sprintf("%s %q", kind, target.GetName())
+}
+
+// applyOverrideRules applies matching rules to the resource. A DeleteOverrideType rule clears
+// the resource and stops; otherwise JSON patches apply in order. Errors are returned raw — the
+// caller (applyOverrides) tags them as user errors so we don't double-wrap the sentinel.
 func applyOverrideRules(resource *placementv1beta1.ResourceContent, cluster *clusterv1beta1.MemberCluster, rules []placementv1beta1.OverrideRule) error {
 	for _, rule := range rules {
 		matched, err := overrider.IsClusterMatched(cluster, rule)
 		if err != nil {
-			klog.ErrorS(controller.NewUnexpectedBehaviorError(err), "Found an invalid override rule")
-			return controller.NewUserError(err) // should not happen though and should be rejected by the webhook
+			klog.ErrorS(err, "Found an invalid override rule")
+			return err
 		}
 		if !matched {
 			continue
@@ -198,7 +224,7 @@ func applyOverrideRules(resource *placementv1beta1.ResourceContent, cluster *clu
 		// Apply JSONPatchOverrides by default
 		if err = applyJSONPatchOverride(resource, cluster, rule.JSONPatchOverrides); err != nil {
 			klog.ErrorS(err, "Failed to apply JSON patch override")
-			return controller.NewUserError(err)
+			return err
 		}
 	}
 	return nil
@@ -207,11 +233,11 @@ func applyOverrideRules(resource *placementv1beta1.ResourceContent, cluster *clu
 // applyJSONPatchOverride applies a JSON patch on the selected resources following [RFC 6902](https://datatracker.ietf.org/doc/html/rfc6902).
 func applyJSONPatchOverride(resourceContent *placementv1beta1.ResourceContent, cluster *clusterv1beta1.MemberCluster, overrides []placementv1beta1.JSONPatchOverride) error {
 	var err error
-	if len(overrides) == 0 { // do nothing
+	if len(overrides) == 0 {
 		return nil
 	}
-	// go through the JSON patch overrides to replace the built-in variables before json Marshal
-	// as it may contain the built-in variables that cannot be marshaled directly
+	// Go through the JSON patch overrides to replace the built-in variables before json.Marshal,
+	// as the patch values may contain built-in variables that cannot be marshaled directly.
 	for i := range overrides {
 		// Process the JSON string to replace variables
 		jsonStr := string(overrides[i].Value.Raw)
@@ -263,14 +289,14 @@ func replaceClusterLabelKeyVariables(input string, cluster *clusterv1beta1.Membe
 		// extract the key value user wants to replace
 		endIdx := strings.Index(result[startIdx+prefixLen:], "}")
 		if endIdx == -1 {
-			klog.V(2).InfoS("malformed key ${MEMBER-CLUSTER-LABEL-KEY without the closing `}`", "input", input)
+			klog.V(2).InfoS("Malformed key ${MEMBER-CLUSTER-LABEL-KEY without the closing `}`", "input", input)
 			return "", fmt.Errorf("input %s is missing the closing bracket `}`", input)
 		}
 		endIdx += startIdx + prefixLen
 		// extract the key name
 		keyName := result[startIdx+prefixLen : endIdx]
 		// check if the key exists in the cluster labels
-		labelValue, exists := cluster.ObjectMeta.Labels[keyName]
+		labelValue, exists := cluster.Labels[keyName]
 		if !exists {
 			klog.V(2).InfoS("Label key not found on cluster", "key", keyName, "cluster", cluster.Name)
 			return "", fmt.Errorf("label key %s not found on cluster %s", keyName, cluster.Name)
diff --git a/pkg/controllers/workgenerator/override_test.go b/pkg/controllers/workgenerator/override_test.go
index a0b18498a..88c43dd86 100644
--- a/pkg/controllers/workgenerator/override_test.go
+++ b/pkg/controllers/workgenerator/override_test.go
@@ -20,6 +20,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"strings"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
@@ -428,6 +429,9 @@ func TestFetchResourceOverrideSnapshot(t *testing.T) {
 }
 
 func TestApplyOverrides_clusterScopedResource(t *testing.T) {
+	// FakeManager.IsClusterScopedResources returns (APIResources[gvk] == IsClusterScopedResource).
+	// With the flag set to false and APIResources listing namespace-scoped GVKs, listed GVKs
+	// report as namespace-scoped and unlisted GVKs (ClusterRole) report as cluster-scoped.
 	fakeInformer := informer.FakeManager{
 		APIResources: map[schema.GroupVersionKind]bool{
 			{
@@ -450,7 +454,9 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) {
 		croMap          map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ClusterResourceOverrideSnapshot
 		wantClusterRole rbacv1.ClusterRole
 		wantErr         error
-		wantDeleted     bool
+		// wantErrSubstr asserts the failure message names the failing snapshot and target.
+		wantErrSubstr []string
+		wantDeleted   bool
 	}{
 		{
 			name: "empty overrides",
@@ -844,6 +850,9 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) {
 					Name:    "clusterrole-name",
 				}: {
 					{
+						ObjectMeta: metav1.ObjectMeta{
+							Name: "conflicting-rules-cro-snapshot",
+						},
 						Spec: placementv1beta1.ClusterResourceOverrideSnapshotSpec{
 							OverrideSpec: placementv1beta1.ClusterResourceOverrideSpec{
 								Policy: &placementv1beta1.OverridePolicy{
@@ -895,6 +904,10 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) {
 				},
 			},
 			wantErr: controller.ErrUserError,
+			wantErrSubstr: []string{
+				`ClusterResourceOverrideSnapshot "conflicting-rules-cro-snapshot"`,
+				`ClusterRole "clusterrole-name"`,
+			},
 		},
 		{
 			name: "invalid json patch of clusterResourceOverride",
@@ -921,6 +934,9 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) {
 					Name:    "clusterrole-name",
 				}: {
 					{
+						ObjectMeta: metav1.ObjectMeta{
+							Name: "test-cro-snapshot",
+						},
 						Spec: placementv1beta1.ClusterResourceOverrideSnapshotSpec{
 							OverrideSpec: placementv1beta1.ClusterResourceOverrideSpec{
 								Policy: &placementv1beta1.OverridePolicy{
@@ -953,6 +969,71 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) {
 				},
 			},
 			wantErr: controller.ErrUserError,
+			wantErrSubstr: []string{
+				`ClusterResourceOverrideSnapshot "test-cro-snapshot"`,
+				`ClusterRole "clusterrole-name"`,
+			},
+		},
+		{
+			// Invalid LabelSelector Operator makes IsClusterMatched fail — exercises the
+			// applyOverrideRules raw-error return that applyOverrides wraps.
+			name: "invalid cluster label selector in clusterResourceOverride",
+			clusterRole: rbacv1.ClusterRole{
+				TypeMeta: clusterRoleType,
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "clusterrole-name",
+				},
+			},
+			cluster: clusterv1beta1.MemberCluster{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "cluster-1",
+				},
+			},
+			croMap: map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ClusterResourceOverrideSnapshot{
+				{
+					Group:   "rbac.authorization.k8s.io",
+					Version: "v1",
+					Kind:    "ClusterRole",
+					Name:    "clusterrole-name",
+				}: {
+					{
+						ObjectMeta: metav1.ObjectMeta{
+							Name: "bad-selector-cro-snapshot",
+						},
+						Spec: placementv1beta1.ClusterResourceOverrideSnapshotSpec{
+							OverrideSpec: placementv1beta1.ClusterResourceOverrideSpec{
+								Policy: &placementv1beta1.OverridePolicy{
+									OverrideRules: []placementv1beta1.OverrideRule{
+										{
+											ClusterSelector: &placementv1beta1.ClusterSelector{
+												ClusterSelectorTerms: []placementv1beta1.ClusterSelectorTerm{
+													{
+														LabelSelector: &metav1.LabelSelector{
+															MatchExpressions: []metav1.LabelSelectorRequirement{
+																{
+																	Key:      "key1",
+																	Operator: "InvalidOperator",
+																	Values:   []string{"value1"},
+																},
+															},
+														},
+													},
+												},
+											},
+										},
+									},
+								},
+							},
+						},
+					},
+				},
+			},
+			wantErr: controller.ErrUserError,
+			wantErrSubstr: []string{
+				`ClusterResourceOverrideSnapshot "bad-selector-cro-snapshot"`,
+				`ClusterRole "clusterrole-name"`,
+				`invalid cluster label selector`,
+			},
 		},
 		{
 			name: "delete during the clusterResourceOverride",
@@ -1118,6 +1199,11 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) {
 				t.Fatalf("applyOverrides() gotDeleted %v, want %v", gotDeleted, tc.wantDeleted)
 			}
 			if tc.wantErr != nil {
+				for _, want := range tc.wantErrSubstr {
+					if !strings.Contains(err.Error(), want) {
+						t.Errorf("applyOverrides() error = %q, want to contain %q", err.Error(), want)
+					}
+				}
 				return
 			}
 			if tc.wantDeleted {
@@ -1126,12 +1212,12 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) {
 
 			var u unstructured.Unstructured
 			if err := u.UnmarshalJSON(rc.Raw); err != nil {
-				t.Fatalf("Failed to unmarshl the result: %v, want nil", err)
+				t.Fatalf("Failed to unmarshal the result: %v, want nil", err)
 			}
 
 			var clusterRole rbacv1.ClusterRole
 			if err := runtime.DefaultUnstructuredConverter.FromUnstructured(u.Object, &clusterRole); err != nil {
-				t.Fatalf("Failed to convert the result to clusterole: %v, want nil", err)
+				t.Fatalf("Failed to convert the result to clusterRole: %v, want nil", err)
 			}
 
 			if diff := cmp.Diff(tc.wantClusterRole, clusterRole); diff != "" {
@@ -1141,7 +1227,8 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) {
 	}
 }
 
-func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
+func TestApplyOverrides_namespaceScopedResource(t *testing.T) {
+	// See TestApplyOverrides_clusterScopedResource for the IsClusterScopedResource: false rationale.
 	fakeInformer := informer.FakeManager{
 		APIResources: map[schema.GroupVersionKind]bool{
 			{
@@ -1165,7 +1252,9 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 		roMap          map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ResourceOverrideSnapshot
 		wantDeployment appsv1.Deployment
 		wantErr        error
-		wantDelete     bool
+		// wantErrSubstr asserts the failure message names the failing snapshot and target.
+		wantErrSubstr []string
+		wantDeleted   bool
 	}{
 		{
 			name: "empty overrides",
@@ -1664,6 +1753,9 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 					Name:    "deployment-namespace",
 				}: {
 					{
+						ObjectMeta: metav1.ObjectMeta{
+							Name: "invalid-patch-cro-snapshot",
+						},
 						Spec: placementv1beta1.ClusterResourceOverrideSnapshotSpec{
 							OverrideSpec: placementv1beta1.ClusterResourceOverrideSpec{
 								Policy: &placementv1beta1.OverridePolicy{
@@ -1697,6 +1789,11 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 				},
 			},
 			wantErr: controller.ErrUserError,
+			wantErrSubstr: []string{
+				`ClusterResourceOverrideSnapshot "invalid-patch-cro-snapshot"`,
+				`Deployment "deployment-name"`,
+				`namespace "deployment-namespace"`,
+			},
 		},
 		{
 			name: "invalid json patch of resourceOverride",
@@ -1728,6 +1825,10 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 					Namespace: "deployment-namespace",
 				}: {
 					{
+						ObjectMeta: metav1.ObjectMeta{
+							Name:      "invalid-patch-ro-snapshot",
+							Namespace: "deployment-namespace",
+						},
 						Spec: placementv1beta1.ResourceOverrideSnapshotSpec{
 							OverrideSpec: placementv1beta1.ResourceOverrideSpec{
 								Policy: &placementv1beta1.OverridePolicy{
@@ -1751,6 +1852,11 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 				},
 			},
 			wantErr: controller.ErrUserError,
+			wantErrSubstr: []string{
+				`ResourceOverrideSnapshot "invalid-patch-ro-snapshot"`,
+				`Deployment "deployment-name"`,
+				`namespace "deployment-namespace"`,
+			},
 		},
 		{
 			name: "delete type of resourceOverride",
@@ -1797,7 +1903,7 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 					},
 				},
 			},
-			wantDelete: true,
+			wantDeleted: true,
 		},
 		{
 			name: "resourceOverride delete the cro override",
@@ -1884,7 +1990,7 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 					},
 				},
 			},
-			wantDelete: true,
+			wantDeleted: true,
 		},
 		{
 			name: "resourceOverride no-op when the cro delete",
@@ -1970,7 +2076,7 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 					},
 				},
 			},
-			wantDelete: true,
+			wantDeleted: true,
 		},
 		{
 			name: "cluster name as value in json patch of resourceOverride",
@@ -2218,6 +2324,10 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 					Namespace: "deployment-namespace",
 				}: {
 					{
+						ObjectMeta: metav1.ObjectMeta{
+							Name:      "test-ro-snapshot",
+							Namespace: "deployment-namespace",
+						},
 						Spec: placementv1beta1.ResourceOverrideSnapshotSpec{
 							OverrideSpec: placementv1beta1.ResourceOverrideSpec{
 								Policy: &placementv1beta1.OverridePolicy{
@@ -2240,6 +2350,11 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 				},
 			},
 			wantErr: controller.ErrUserError,
+			wantErrSubstr: []string{
+				`ResourceOverrideSnapshot "test-ro-snapshot"`,
+				`Deployment "deployment-name"`,
+				`namespace "deployment-namespace"`,
+			},
 		},
 	}
 	for _, tc := range tests {
@@ -2252,19 +2367,24 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) {
 			if gotErr, wantErr := err != nil, tc.wantErr != nil; gotErr != wantErr || !errors.Is(err, tc.wantErr) {
 				t.Fatalf("applyOverrides() got error %v, want error %v", err, tc.wantErr)
 			}
-			if gotDeleted != tc.wantDelete {
-				t.Fatalf("applyOverrides() gotDeleted %v, want %v", gotDeleted, tc.wantDelete)
+			if gotDeleted != tc.wantDeleted {
+				t.Fatalf("applyOverrides() gotDeleted %v, want %v", gotDeleted, tc.wantDeleted)
 			}
 			if tc.wantErr != nil {
+				for _, want := range tc.wantErrSubstr {
+					if !strings.Contains(err.Error(), want) {
+						t.Errorf("applyOverrides() error = %q, want to contain %q", err.Error(), want)
+					}
+				}
 				return
 			}
-			if tc.wantDelete {
+			if tc.wantDeleted {
 				return
 			}
 
 			var u unstructured.Unstructured
 			if err := u.UnmarshalJSON(rc.Raw); err != nil {
-				t.Fatalf("Failed to unmarshl the result: %v, want nil", err)
+				t.Fatalf("Failed to unmarshal the result: %v, want nil", err)
 			}
 
 			var deployment appsv1.Deployment
@@ -2757,7 +2877,7 @@ func TestApplyJSONPatchOverride(t *testing.T) {
 
 			var u unstructured.Unstructured
 			if err := u.UnmarshalJSON(rc.Raw); err != nil {
-				t.Fatalf("Failed to unmarshl the result: %v, want nil", err)
+				t.Fatalf("Failed to unmarshal the result: %v, want nil", err)
 			}
 
 			var deployment appsv1.Deployment
@@ -2774,10 +2894,10 @@ func TestApplyJSONPatchOverride(t *testing.T) {
 
 func TestReplaceClusterLabelKeyVariables(t *testing.T) {
 	tests := map[string]struct {
-		cluster   *clusterv1beta1.MemberCluster
-		input     string
-		expected  string
-		expectErr bool
+		cluster *clusterv1beta1.MemberCluster
+		input   string
+		want    string
+		wantErr bool
 	}{
 		"No clusterLabelKey variables": {
 			cluster: &clusterv1beta1.MemberCluster{
@@ -2787,8 +2907,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) {
 					},
 				},
 			},
-			input:    "The cluster is in us-west-1",
-			expected: "The cluster is in us-west-1",
+			input: "The cluster is in us-west-1",
+			want:  "The cluster is in us-west-1",
 		},
 		"ClusterLabelKey Variable replaced": {
 			cluster: &clusterv1beta1.MemberCluster{
@@ -2798,8 +2918,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) {
 					},
 				},
 			},
-			input:    "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region}",
-			expected: "The cluster is in us-west-1",
+			input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region}",
+			want:  "The cluster is in us-west-1",
 		},
 		"The clusterLabelKey key is misspelled": {
 			cluster: &clusterv1beta1.MemberCluster{
@@ -2807,8 +2927,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) {
 					Labels: map[string]string{},
 				},
 			},
-			input:    "The cluster is in $MEMBER-CLUSTER-LABEL-KEY-region",
-			expected: "The cluster is in $MEMBER-CLUSTER-LABEL-KEY-region",
+			input: "The cluster is in $MEMBER-CLUSTER-LABEL-KEY-region",
+			want:  "The cluster is in $MEMBER-CLUSTER-LABEL-KEY-region",
 		},
 		"Multiple complex clusterLabelKey variables replaced": {
 			cluster: &clusterv1beta1.MemberCluster{
@@ -2819,8 +2939,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) {
 					},
 				},
 			},
-			input:    "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-fleet.azure.com/location-region_public} and environment is ${MEMBER-CLUSTER-LABEL-KEY-fleet.azure.com/env}",
-			expected: "The cluster is in us-west-1 and environment is prod",
+			input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-fleet.azure.com/location-region_public} and environment is ${MEMBER-CLUSTER-LABEL-KEY-fleet.azure.com/env}",
+			want:  "The cluster is in us-west-1 and environment is prod",
 		},
 		"The clusterLabelKey key is not found": {
 			cluster: &clusterv1beta1.MemberCluster{
@@ -2828,8 +2948,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) {
 					Labels: map[string]string{},
 				},
 			},
-			input:     "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region}",
-			expectErr: true,
+			input:   "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region}",
+			wantErr: true,
 		},
 		"ClusterLabelKey Variable key case not match": {
 			cluster: &clusterv1beta1.MemberCluster{
@@ -2839,10 +2959,10 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) {
 					},
 				},
 			},
-			input:     "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-REGION}",
-			expectErr: true,
+			input:   "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-REGION}",
+			wantErr: true,
 		},
-		"Invalid  clusterLabelKey variable format": {
+		"Invalid clusterLabelKey variable format": {
 			cluster: &clusterv1beta1.MemberCluster{
 				ObjectMeta: metav1.ObjectMeta{
 					Labels: map[string]string{
@@ -2850,8 +2970,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) {
 					},
 				},
 			},
-			input:     "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region",
-			expectErr: true,
+			input:   "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region",
+			wantErr: true,
 		},
 		"ClusterLabelKey variable key empty": {
 			cluster: &clusterv1beta1.MemberCluster{
@@ -2861,19 +2981,61 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) {
 					},
 				},
 			},
-			input:     "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-}",
-			expectErr: true,
+			input:   "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-}",
+			wantErr: true,
 		},
 	}
 
 	for name, tc := range tests {
 		t.Run(name, func(t *testing.T) {
 			result, err := replaceClusterLabelKeyVariables(tc.input, tc.cluster)
-			if gotErr := err != nil; gotErr != tc.expectErr {
-				t.Fatalf("applyJSONPatchOverride() = error %v, want %v", err, tc.expectErr)
+			if gotErr := err != nil; gotErr != tc.wantErr {
+				t.Fatalf("replaceClusterLabelKeyVariables() = error %v, want %v", err, tc.wantErr)
 			}
-			if result != tc.expected {
-				t.Errorf("replaceClusterLabelKeyVariables() = %v, want %v", result, tc.expected)
+			if result != tc.want {
+				t.Errorf("replaceClusterLabelKeyVariables() = %v, want %v", result, tc.want)
+			}
+		})
+	}
+}
+
+func TestFormatOverrideTarget(t *testing.T) {
+	tests := map[string]struct {
+		kind      string
+		name      string
+		namespace string
+		want      string
+	}{
+		"namespaced resource": {
+			kind:      "Deployment",
+			name:      "my-app",
+			namespace: "default",
+			want:      `Deployment "my-app" in namespace "default"`,
+		},
+		"cluster-scoped resource": {
+			kind: "Namespace",
+			name: "app",
+			want: `Namespace "app"`,
+		},
+		"empty kind falls back to Unknown": {
+			name:      "my-app",
+			namespace: "default",
+			want:      `Unknown "my-app" in namespace "default"`,
+		},
+		"empty kind, cluster-scoped": {
+			name: "app",
+			want: `Unknown "app"`,
+		},
+	}
+
+	for name, tc := range tests {
+		t.Run(name, func(t *testing.T) {
+			target := &unstructured.Unstructured{}
+			target.SetGroupVersionKind(schema.GroupVersionKind{Kind: tc.kind})
+			target.SetName(tc.name)
+			target.SetNamespace(tc.namespace)
+			if got := formatOverrideTarget(target); got != tc.want {
+				t.Errorf("formatOverrideTarget() = %q, want %q", got, tc.want)
 			}
 		})
 	}
diff --git a/pkg/controllers/workgenerator/suite_test.go b/pkg/controllers/workgenerator/suite_test.go
index 24c03edd4..d7b999110 100644
--- a/pkg/controllers/workgenerator/suite_test.go
+++ b/pkg/controllers/workgenerator/suite_test.go
@@ -301,6 +301,47 @@ func createOverrides() {
 	}
 	Expect(k8sClient.Create(ctx, &invalidClusterResourceOverrideSnapshot)).Should(Succeed(), "Failed to create the cro-2")
 	By(fmt.Sprintf("Invalid cluster resource override snapshot %s created", invalidClusterResourceOverrideSnapshotName))
+
+	invalidResourceOverrideSnapshot = placementv1beta1.ResourceOverrideSnapshot{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      invalidResourceOverrideSnapshotName,
+			Namespace: appNamespaceName,
+			Labels: map[string]string{
+				placementv1beta1.IsLatestSnapshotLabel: "true",
+			},
+		},
+		Spec: placementv1beta1.ResourceOverrideSnapshotSpec{
+			OverrideSpec: placementv1beta1.ResourceOverrideSpec{
+				ResourceSelectors: []placementv1beta1.ResourceSelector{
+					{
+						Group:   "test.kubernetes-fleet.io",
+						Version: "v1alpha1",
+						Kind:    "TestResource",
+						Name:    "random-test-resource",
+					},
+				},
+				Policy: &placementv1beta1.OverridePolicy{
+					OverrideRules: []placementv1beta1.OverrideRule{
+						{
+							ClusterSelector: &placementv1beta1.ClusterSelector{
+								ClusterSelectorTerms: []placementv1beta1.ClusterSelectorTerm{}, // select all members
+							},
+							JSONPatchOverrides: []placementv1beta1.JSONPatchOverride{
+								{
+									Operator: placementv1beta1.JSONPatchOverrideOpAdd,
+									Path:     "/invalid/path",
+									Value:    apiextensionsv1.JSON{Raw: []byte(`"new-value"`)},
+								},
+							},
+						},
+					},
+				},
+			},
+			OverrideHash: []byte("123"),
+		},
+	}
+	Expect(k8sClient.Create(ctx, &invalidResourceOverrideSnapshot)).Should(Succeed(), "Failed to create the ro-2")
+	By(fmt.Sprintf("Invalid resource override snapshot %s created", invalidResourceOverrideSnapshotName))
 }
 
 var _ = AfterSuite(func() {
@@ -309,6 +350,7 @@ var _ = AfterSuite(func() {
 	Expect(k8sClient.Delete(ctx, &validClusterResourceOverrideSnapshot)).Should(Succeed(), "Failed to delete the cro-1")
 	Expect(k8sClient.Delete(ctx, &validResourceOverrideSnapshot)).Should(Succeed(), "Failed to delete the ro-1")
 	Expect(k8sClient.Delete(ctx, &invalidClusterResourceOverrideSnapshot)).Should(Succeed(), "Failed to delete the cro-2")
+	Expect(k8sClient.Delete(ctx, &invalidResourceOverrideSnapshot)).Should(Succeed(), "Failed to delete the ro-2")
 	Expect(k8sClient.Delete(ctx, &appNamespace)).Should(Succeed(), "Failed to delete app namespace")
 
 	cancel()
diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go
index 866f7670b..7dfa3a943 100644
--- a/pkg/scheduler/scheduler.go
+++ b/pkg/scheduler/scheduler.go
@@ -21,12 +21,10 @@ import (
 	"context"
 	"errors"
 	"fmt"
-	"strconv"
 	"sync"
 	"time"
 
 	apiErrors "k8s.io/apimachinery/pkg/api/errors"
-	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/types"
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 	"k8s.io/client-go/tools/record"
@@ -347,57 +345,21 @@ func (s *Scheduler) cleanUpAllBindingsFor(ctx context.Context, placement fleetv1
 }
 
 // lookupLatestPolicySnapshot returns the latest (i.e., active) policy snapshot associated with a placement.
-// TODO(ryan): move this to a common lib
+//
+// It delegates to controller.LookupLatestPolicySnapshot so the same lookup can be reused outside the
+// scheduler. The scheduler lists with a cached client; this does not have any consistency concern as
+// sources will always trigger the scheduler when a new policy snapshot is created.
+//
+// The "no latest snapshot" case can happen transiently when a placement is newly created or its
+// latest snapshot is being replaced; the scheduler will be triggered again when the situation
+// corrects itself, so this is logged and surfaced as a non-unexpected error.
 func (s *Scheduler) lookupLatestPolicySnapshot(ctx context.Context, placement fleetv1beta1.PlacementObj) (fleetv1beta1.PolicySnapshotObj, error) {
-	placementRef := klog.KObj(placement)
-	// Prepare the list options to filter policy snapshots by the placement name and the latest snapshot label.
-	var listOptions []client.ListOption
-	labelSelector := labels.SelectorFromSet(labels.Set{
-		fleetv1beta1.PlacementTrackingLabel: placement.GetName(),
-		fleetv1beta1.IsLatestSnapshotLabel:  strconv.FormatBool(true),
-	})
-	listOptions = append(listOptions, &client.ListOptions{LabelSelector: labelSelector})
-	// Find out the latest policy snapshot associated with the placement.
-	var policySnapshotList fleetv1beta1.PolicySnapshotList
-	if placement.GetNamespace() == "" {
-		policySnapshotList = &fleetv1beta1.ClusterSchedulingPolicySnapshotList{}
-	} else {
-		policySnapshotList = &fleetv1beta1.SchedulingPolicySnapshotList{}
-		listOptions = append(listOptions, client.InNamespace(placement.GetNamespace()))
-	}
-
-	// The scheduler lists with a cached client; this does not have any consistency concern as sources
-	// will always trigger the scheduler when a new policy snapshot is created.
-	if err := s.client.List(ctx, policySnapshotList, listOptions...); err != nil {
-		klog.ErrorS(err, "Failed to list policy snapshots of a placement", "placement", placementRef)
-		return nil, controller.NewAPIServerError(true, err)
-	}
-	policySnapshots := policySnapshotList.GetPolicySnapshotObjs()
-	switch {
-	case len(policySnapshots) == 0:
-		// There is no latest policy snapshot associated with the placement; it could happen when
-		// * the placement is newly created; or
-		// * the new policy snapshot is in the middle of being replaced.
-		//
-		// Either way, it is out of the scheduler's scope to handle such a case; the scheduler will
-		// be triggered again if the situation is corrected.
-		err := fmt.Errorf("no latest policy snapshot associated with placement")
-		klog.ErrorS(err, "Failed to find the latest policy snapshot, will retry", "placement", placementRef)
-		return nil, err
-	case len(policySnapshots) > 1:
-		// There are multiple active policy snapshots associated with the placement; normally this
-		// will never happen, as only one policy snapshot can be active in the sequence.
-		//
-		// Similarly, it is out of the scheduler's scope to handle such a case; the scheduler will
-		// report this unexpected occurrence but does not register it as a scheduler-side error.
-		// If (and when) the situation is corrected, the scheduler will be triggered again.
-		err := controller.NewUnexpectedBehaviorError(fmt.Errorf("too many active policy snapshots: %d, want 1", len(policySnapshots)))
-		klog.ErrorS(err, "There are multiple latest policy snapshots associated with placement", "placement", placementRef)
+	snapshot, err := controller.LookupLatestPolicySnapshot(ctx, s.client, types.NamespacedName{Namespace: placement.GetNamespace(), Name: placement.GetName()})
+	if err != nil {
+		klog.ErrorS(err, "Failed to look up latest policy snapshot, will retry", "placement", klog.KObj(placement))
 		return nil, err
-	default:
-		// Found the one and only active policy snapshot.
-		return policySnapshots[0], nil
 	}
+	return snapshot, nil
 }
 
 // addSchedulerCleanupFinalizer adds the scheduler cleanup finalizer to a placement (if it does not
diff --git a/pkg/scheduler/scheduler_test.go b/pkg/scheduler/scheduler_test.go
index acbf8a482..a252638a4 100644
--- a/pkg/scheduler/scheduler_test.go
+++ b/pkg/scheduler/scheduler_test.go
@@ -18,6 +18,8 @@ package scheduler
 
 import (
 	"context"
+	"errors"
+	"fmt"
 	"log"
 	"os"
 	"strconv"
@@ -30,10 +32,13 @@ import (
 	"github.com/prometheus/client_golang/prometheus/testutil"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/kubernetes/scheme"
+	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+	"sigs.k8s.io/controller-runtime/pkg/client/interceptor"
 
 	fleetv1beta1 "go.goms.io/fleet/apis/placement/v1beta1"
 	hubmetrics "go.goms.io/fleet/pkg/metrics/hub"
+	"go.goms.io/fleet/pkg/utils/controller"
 )
 
 const (
@@ -591,6 +596,40 @@ func TestLookupLatestPolicySnapshot(t *testing.T) {
 	}
 }
 
+// TestLookupLatestPolicySnapshot_ListError covers the error path: when the underlying
+// controller.LookupLatestPolicySnapshot returns an error, the scheduler logs and propagates
+// it without panicking on the nil snapshot.
+func TestLookupLatestPolicySnapshot_ListError(t *testing.T) {
+	placement := &fleetv1beta1.ClusterResourcePlacement{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:       crpName,
+			Finalizers: []string{fleetv1beta1.SchedulerCleanupFinalizer},
+		},
+	}
+	listErr := fmt.Errorf("simulated cache failure")
+	fakeClient := interceptor.NewClient(
+		fake.NewClientBuilder().WithScheme(scheme.Scheme).WithObjects(placement).Build(),
+		interceptor.Funcs{
+			List: func(_ context.Context, _ client.WithWatch, _ client.ObjectList, _ ...client.ListOption) error {
+				return listErr
+			},
+		},
+	)
+
+	s := &Scheduler{client: fakeClient, uncachedReader: fakeClient}
+
+	got, err := s.lookupLatestPolicySnapshot(context.Background(), placement)
+	if err == nil {
+		t.Fatalf("lookupLatestPolicySnapshot() = %v, nil; want non-nil error", got)
+	}
+	if got != nil {
+		t.Errorf("lookupLatestPolicySnapshot() returned snapshot %v, want nil on error", got)
+	}
+	if !errors.Is(err, controller.ErrUnexpectedBehavior) {
+		t.Errorf("lookupLatestPolicySnapshot() error = %v, want wrapping ErrUnexpectedBehavior", err)
+	}
+}
+
 func TestObserveSchedulingCycleMetrics(t *testing.T) {
 	metricMetadata := `
 		# HELP scheduling_cycle_duration_milliseconds The duration of a scheduling cycle run in milliseconds
diff --git a/pkg/scheduler/watchers/schedulingpolicysnapshot/watcher.go b/pkg/scheduler/watchers/schedulingpolicysnapshot/watcher.go
index 70a01eac4..1cca8b048 100644
--- a/pkg/scheduler/watchers/schedulingpolicysnapshot/watcher.go
+++ b/pkg/scheduler/watchers/schedulingpolicysnapshot/watcher.go
@@ -74,32 +74,15 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
 		return ctrl.Result{}, nil
 	}
 
-	// Verify if the policy snapshot is currently active.
-	// TODO: create a lib to check if the policy snapshot is latest.
-	isLatestVal, ok := policySnapshot.GetLabels()[fleetv1beta1.IsLatestSnapshotLabel]
-	if !ok {
-		// The IsLatestSnapshot label is not present; normally this should never occur.
-		klog.ErrorS(controller.NewUnexpectedBehaviorError(fmt.Errorf("IsLatestSnapshotLabel is missing")),
-			"IsLatestSnapshot label is not present",
-			"policySnapshot", policySnapshotRef)
-		// This is not a situation that the controller can recover by itself. Should the label
-		// value be corrected, the controller will be triggered again.
-		return ctrl.Result{}, nil
-	}
-	isLatest, err := strconv.ParseBool(isLatestVal)
+	// Verify if the policy snapshot is currently active. A missing or malformed IsLatestSnapshot
+	// label is treated as "not latest" — the controller cannot recover from those states by
+	// itself; the placement controller will repair the label and re-trigger reconciliation.
+	isLatest, err := controller.IsLatestPolicySnapshot(policySnapshot)
 	if err != nil {
-		// The label value is not a correctly formatted boolean value; normally this should never
-		// occur.
-		klog.ErrorS(controller.NewUnexpectedBehaviorError(err),
-			"Failed to parse IsLatestSnapshot value",
-			"policySnapshot", policySnapshotRef)
-		// This is not an error that the controller can recover by itself; ignore the error.
-		// Should the label value be corrected, the controller will be triggered again.
+		klog.ErrorS(err, "Failed to determine whether policy snapshot is latest", "policySnapshot", policySnapshotRef)
 		return ctrl.Result{}, nil
 	}
 	if !isLatest {
-		// The policy snapshot is not currently active, i.e., it is not the latest policy snapshot;
-		// ignore it.
 		return ctrl.Result{}, nil
 	}
 
diff --git a/pkg/utils/binding/binding.go b/pkg/utils/binding/binding.go
index 29b3483c6..6d7a98361 100644
--- a/pkg/utils/binding/binding.go
+++ b/pkg/utils/binding/binding.go
@@ -23,14 +23,51 @@ import (
 	"go.goms.io/fleet/pkg/utils/condition"
 )
 
-// HasBindingFailed checks if BindingObj has failed based on its conditions.
+// nonTerminalBindingFailureReasons is the closed set of `Reason` strings that the per-cluster
+// binding conditions (Overridden, WorkSynchronized, Applied, Available) can carry while still
+// `Status=False` *without* representing a terminal failure. Each one means the binding is still
+// progressing and the controller should keep waiting rather than treat the binding as failed.
+//
+// The set is intentionally an allowlist: any new `Status=False` reason added to the codebase is
+// treated as terminal until it is explicitly added here. This is the safer default because the
+// alternative (treat unknown reasons as transient) would stall the rollout controller forever
+// on a genuinely-failing binding it does not yet know how to classify.
+//
+// When you add a new in-progress / pending / "not yet" reason to pkg/utils/condition/reason.go
+// for any of the above conditions, also add it here.
+var nonTerminalBindingFailureReasons = map[string]struct{}{
+	condition.OverriddenPendingReason:      {},
+	condition.WorkNotSynchronizedYetReason: {},
+	condition.ApplyPendingReason:           {},
+	condition.NotAvailableYetReason:        {},
+}
+
+// HasBindingFailed reports whether a binding has reached a terminal failure on any of its
+// per-cluster conditions (Overridden, WorkSynchronized, Applied, Available).
+//
+// A condition counts as a terminal failure when its `Status` is `False` *and* its `Reason` is
+// not in `nonTerminalBindingFailureReasons`. The Reason check is necessary because several of
+// the in-progress states (e.g. `WorkNotSynchronizedYetReason`, `NotAvailableYetReason`) are
+// expressed as `Status=False` rather than `Status=Unknown`. Treating those as failures was the
+// previous bug — it caused the rollout controller to drop bindings that were still progressing
+// out of `canBeReadyBindings`, stalling rollout decisions.
+//
+// Unknown reasons fall through to "terminal" by design; see the comment on
+// `nonTerminalBindingFailureReasons`.
 func HasBindingFailed(binding placementv1beta1.BindingObj) bool {
 	for i := condition.OverriddenCondition; i <= condition.AvailableCondition; i++ {
-		if condition.IsConditionStatusFalse(binding.GetCondition(string(i.ResourceBindingConditionType())), binding.GetGeneration()) {
-			// TODO: parse the reason of the condition to see if the failure is recoverable/retriable or not
-			klog.V(2).Infof("binding %s has condition %s with status false", klog.KObj(binding), string(i.ResourceBindingConditionType()))
-			return true
+		c := binding.GetCondition(string(i.ResourceBindingConditionType()))
+		if !condition.IsConditionStatusFalse(c, binding.GetGeneration()) {
+			continue
+		}
+		if _, transient := nonTerminalBindingFailureReasons[c.Reason]; transient {
+			klog.V(3).Infof("binding %s has condition %s status false with non-terminal reason %q; treating as in-progress",
+				klog.KObj(binding), string(i.ResourceBindingConditionType()), c.Reason)
+			continue
 		}
+		klog.V(2).Infof("binding %s has terminal failure on condition %s (reason %q)",
+			klog.KObj(binding), string(i.ResourceBindingConditionType()), c.Reason)
+		return true
 	}
 	return false
 }
diff --git a/pkg/utils/binding/binding_test.go b/pkg/utils/binding/binding_test.go
index e471afa11..b3ecb6b06 100644
--- a/pkg/utils/binding/binding_test.go
+++ b/pkg/utils/binding/binding_test.go
@@ -22,6 +22,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
 	placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1"
+	"go.goms.io/fleet/pkg/utils/condition"
 )
 
 func TestHasBindingFailed(t *testing.T) {
@@ -326,6 +327,102 @@ func TestHasBindingFailed(t *testing.T) {
 	}
 }
 
+// TestHasBindingFailed_NonTerminalReasons verifies that conditions with Status=False but a
+// known in-progress reason (WorkNotSynchronizedYet, NotAvailableYet, OverriddenPending,
+// ApplyPending) are NOT treated as failures. This is the bug fix for issue #648.
+func TestHasBindingFailed_NonTerminalReasons(t *testing.T) {
+	bindingWithCondition := func(condType string, reason string) *placementv1beta1.ClusterResourceBinding {
+		return &placementv1beta1.ClusterResourceBinding{
+			ObjectMeta: metav1.ObjectMeta{Name: "test-binding", Generation: 1},
+			Status: placementv1beta1.ResourceBindingStatus{
+				Conditions: []metav1.Condition{
+					{
+						Type:               condType,
+						Status:             metav1.ConditionFalse,
+						ObservedGeneration: 1,
+						Reason:             reason,
+						Message:            "in progress",
+					},
+				},
+			},
+		}
+	}
+
+	tests := []struct {
+		name    string
+		binding *placementv1beta1.ClusterResourceBinding
+		want    bool
+	}{
+		{
+			name:    "WorkSynchronized=False with WorkNotSynchronizedYet is in-progress, not a failure",
+			binding: bindingWithCondition(string(placementv1beta1.ResourceBindingWorkSynchronized), condition.WorkNotSynchronizedYetReason),
+			want:    false,
+		},
+		{
+			name:    "Available=False with NotAvailableYet is in-progress, not a failure",
+			binding: bindingWithCondition(string(placementv1beta1.ResourceBindingAvailable), condition.NotAvailableYetReason),
+			want:    false,
+		},
+		{
+			name:    "Overridden=False with OverriddenPending is in-progress, not a failure",
+			binding: bindingWithCondition(string(placementv1beta1.ResourceBindingOverridden), condition.OverriddenPendingReason),
+			want:    false,
+		},
+		{
+			name:    "Applied=False with ApplyPending is in-progress, not a failure",
+			binding: bindingWithCondition(string(placementv1beta1.ResourceBindingApplied), condition.ApplyPendingReason),
+			want:    false,
+		},
+		{
+			name:    "Applied=False with ApplyFailed is a terminal failure",
+			binding: bindingWithCondition(string(placementv1beta1.ResourceBindingApplied), condition.ApplyFailedReason),
+			want:    true,
+		},
+		{
+			name:    "Overridden=False with OverriddenFailed is a terminal failure",
+			binding: bindingWithCondition(string(placementv1beta1.ResourceBindingOverridden), condition.OverriddenFailedReason),
+			want:    true,
+		},
+		{
+			name:    "Status=False with an unknown reason defaults to terminal (safer default)",
+			binding: bindingWithCondition(string(placementv1beta1.ResourceBindingApplied), "SomeReasonNobodyAddedToTheAllowlist"),
+			want:    true,
+		},
+		{
+			name: "non-terminal reason on one condition does not mask a terminal failure on another",
+			binding: &placementv1beta1.ClusterResourceBinding{
+				ObjectMeta: metav1.ObjectMeta{Name: "test-binding", Generation: 1},
+				Status: placementv1beta1.ResourceBindingStatus{
+					Conditions: []metav1.Condition{
+						{
+							Type:               string(placementv1beta1.ResourceBindingOverridden),
+							Status:             metav1.ConditionFalse,
+							ObservedGeneration: 1,
+							Reason:             condition.OverriddenPendingReason,
+						},
+						{
+							Type:               string(placementv1beta1.ResourceBindingApplied),
+							Status:             metav1.ConditionFalse,
+							ObservedGeneration: 1,
+							Reason:             condition.ApplyFailedReason,
+						},
+					},
+				},
+			},
+			want: true,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got := HasBindingFailed(tc.binding)
+			if got != tc.want {
+				t.Errorf("HasBindingFailed(%v) = %v, want %v", tc.binding, got, tc.want)
+			}
+		})
+	}
+}
+
 func TestIsBindingReportDiff(t *testing.T) {
 	tests := []struct {
 		name    string
diff --git a/pkg/utils/controller/controller.go b/pkg/utils/controller/controller.go
index 0da49c41c..1e5067931 100644
--- a/pkg/utils/controller/controller.go
+++ b/pkg/utils/controller/controller.go
@@ -63,6 +63,19 @@ var (
 
 	// ErrUserError indicates the error is caused by the user and customer needs to take the action.
 	ErrUserError = errors.New("failed to process the request due to a client error")
+
+	// ErrNoLatestPolicySnapshot indicates that no active (latest) policy snapshot was found for a
+	// placement. This is a transient condition during placement creation or rotation and is typically
+	// recoverable by the next reconcile; callers branch on it via errors.Is to distinguish it from
+	// real API or invariant-violation errors.
+	ErrNoLatestPolicySnapshot = errors.New("no latest policy snapshot found")
+
+	// ErrMultipleActivePolicySnapshots indicates that more than one policy snapshot for the same
+	// placement carries IsLatestSnapshot=true, violating the "exactly one active snapshot"
+	// invariant. Callers branch on it via errors.Is so they can fail fast on a real invariant
+	// violation without misclassifying transient cache errors that share the ErrUnexpectedBehavior
+	// category.
+	ErrMultipleActivePolicySnapshots = errors.New("multiple active policy snapshots found")
 )
 
 // NewUnexpectedBehaviorError returns ErrUnexpectedBehavior type error when err is not nil.
diff --git a/pkg/utils/controller/policy_snapshot_resolver.go b/pkg/utils/controller/policy_snapshot_resolver.go
index 5406c7e1e..5a2fd8ea2 100644
--- a/pkg/utils/controller/policy_snapshot_resolver.go
+++ b/pkg/utils/controller/policy_snapshot_resolver.go
@@ -110,34 +110,69 @@ func BuildPolicySnapshot(placementObj fleetv1beta1.PlacementObj, policySnapshotI
 	return snapshot
 }
 
-// FetchLatestPolicySnapshot fetches the latest policy snapshot for a given placement.
-// For cluster-scoped placements, it fetches ClusterSchedulingPolicySnapshot.
-// For namespaced placements, it fetches SchedulingPolicySnapshot.
-func FetchLatestPolicySnapshot(ctx context.Context, k8Client client.Reader, placementKey types.NamespacedName) (fleetv1beta1.PolicySnapshotList, error) {
-	namespace := placementKey.Namespace
-	name := placementKey.Name
+// IsLatestPolicySnapshot reports whether the given policy snapshot carries the IsLatestSnapshot
+// label set to "true". A non-nil error is returned if the label is missing or its value cannot be
+// parsed as a bool; in those cases the boolean result is false.
+//
+// Callers typically log the error and treat the snapshot as non-latest, since the controller cannot
+// recover from a malformed label by itself — the placement controller will repair it and re-trigger
+// reconciliation.
+func IsLatestPolicySnapshot(snapshot fleetv1beta1.PolicySnapshotObj) (bool, error) {
+	val, ok := snapshot.GetLabels()[fleetv1beta1.IsLatestSnapshotLabel]
+	if !ok {
+		return false, NewUnexpectedBehaviorError(fmt.Errorf("%s label is missing", fleetv1beta1.IsLatestSnapshotLabel))
+	}
+	isLatest, err := strconv.ParseBool(val)
+	if err != nil {
+		return false, NewUnexpectedBehaviorError(fmt.Errorf("failed to parse %s label value %q: %w", fleetv1beta1.IsLatestSnapshotLabel, val, err))
+	}
+	return isLatest, nil
+}
 
+// LookupLatestPolicySnapshot returns the single active (latest) policy snapshot associated with the
+// placement identified by placementKey, enforcing the "exactly one latest snapshot" invariant.
+// A namespaced placementKey selects SchedulingPolicySnapshot; a cluster-scoped key (empty
+// namespace) selects ClusterSchedulingPolicySnapshot.
+//
+// Returns:
+//   - (snapshot, nil) when exactly one latest snapshot exists.
+//   - (nil, ErrNoLatestPolicySnapshot-wrapped error) when none exist. This is a transient state for
+//     newly-created placements or in-progress rotations; callers branch on it via errors.Is.
+//   - (nil, ErrMultipleActivePolicySnapshots-wrapped error) when more than one exists.
+//   - (nil, ErrAPIServerError-wrapped error) when the List call fails. The List error is also
+//     promoted to ErrUnexpectedBehavior for unexpected cache failures; callers must NOT collapse
+//     ErrUnexpectedBehavior with the dedicated invariant sentinels above when classifying retries.
+func LookupLatestPolicySnapshot(ctx context.Context, k8Client client.Reader, placementKey types.NamespacedName) (fleetv1beta1.PolicySnapshotObj, error) {
 	var policySnapshotList fleetv1beta1.PolicySnapshotList
-	var listOptions []client.ListOption
-	listOptions = append(listOptions, client.MatchingLabels{
-		fleetv1beta1.PlacementTrackingLabel: name,
-		fleetv1beta1.IsLatestSnapshotLabel:  strconv.FormatBool(true),
-	})
-
-	if namespace != "" {
-		// This is a namespaced SchedulingPolicySnapshotList
+	listOptions := []client.ListOption{
+		client.MatchingLabels{
+			fleetv1beta1.PlacementTrackingLabel: placementKey.Name,
+			fleetv1beta1.IsLatestSnapshotLabel:  strconv.FormatBool(true),
+		},
+	}
+	if placementKey.Namespace != "" {
 		policySnapshotList = &fleetv1beta1.SchedulingPolicySnapshotList{}
-		listOptions = append(listOptions, client.InNamespace(namespace))
+		listOptions = append(listOptions, client.InNamespace(placementKey.Namespace))
 	} else {
-		// This is a cluster-scoped ClusterSchedulingPolicySnapshotList
 		policySnapshotList = &fleetv1beta1.ClusterSchedulingPolicySnapshotList{}
 	}
-
 	if err := k8Client.List(ctx, policySnapshotList, listOptions...); err != nil {
 		klog.ErrorS(err, "Failed to list the policySnapshots associated with the placement", "placement", placementKey)
-		return nil, err
+		return nil, NewAPIServerError(true, err)
+	}
+
+	policySnapshots := policySnapshotList.GetPolicySnapshotObjs()
+	switch len(policySnapshots) {
+	case 0:
+		return nil, fmt.Errorf("%w for placement %s", ErrNoLatestPolicySnapshot, placementKey)
+	case 1:
+		return policySnapshots[0], nil
+	default:
+		// Wrap both the specific invariant sentinel and the categorical ErrUnexpectedBehavior so
+		// callers that branch on either continue to work; the specific sentinel lets retry-gate
+		// callers distinguish this invariant violation from transient ErrUnexpectedBehavior errors.
+		return nil, fmt.Errorf("%w: %w for placement %s: got %d, want 1", ErrUnexpectedBehavior, ErrMultipleActivePolicySnapshots, placementKey, len(policySnapshots))
 	}
-	return policySnapshotList, nil
 }
 
 // ListPolicySnapshots lists all policy snapshots associated with a placement key.
diff --git a/pkg/utils/controller/policy_snapshot_resolver_test.go b/pkg/utils/controller/policy_snapshot_resolver_test.go
index 379c939a3..56a856de8 100644
--- a/pkg/utils/controller/policy_snapshot_resolver_test.go
+++ b/pkg/utils/controller/policy_snapshot_resolver_test.go
@@ -18,6 +18,8 @@ package controller
 
 import (
 	"context"
+	"errors"
+	"fmt"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
@@ -28,6 +30,7 @@ import (
 	"k8s.io/utils/ptr"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+	"sigs.k8s.io/controller-runtime/pkg/client/interceptor"
 
 	fleetv1beta1 "go.goms.io/fleet/apis/placement/v1beta1"
 )
@@ -489,7 +492,7 @@ func TestBuildPolicySnapshot(t *testing.T) {
 	}
 }
 
-func TestFetchLatestPolicySnapshot(t *testing.T) {
+func TestListPolicySnapshots(t *testing.T) {
 	scheme := runtime.NewScheme()
 	if err := fleetv1beta1.AddToScheme(scheme); err != nil {
 		t.Fatalf("Failed to add to scheme: %v", err)
@@ -502,7 +505,7 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 		expectedSnapshots int
 	}{
 		{
-			name: "fetch latest cluster scoped policy snapshots",
+			name: "list cluster scoped policy snapshots",
 			placementKey: types.NamespacedName{
 				Name: placementName,
 			},
@@ -512,7 +515,6 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 						Name: "test-placement-0",
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "true",
 						},
 					},
 				},
@@ -521,15 +523,22 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 						Name: "test-placement-1",
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "false",
+						},
+					},
+				},
+				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: "other-placement-0",
+						Labels: map[string]string{
+							fleetv1beta1.PlacementTrackingLabel: "other-placement",
 						},
 					},
 				},
 			},
-			expectedSnapshots: 1,
+			expectedSnapshots: 2,
 		},
 		{
-			name: "fetch latest cluster scoped policy snapshots - mixed setup",
+			name: "list cluster scoped policy snapshots - mixed setup",
 			placementKey: types.NamespacedName{
 				Name: placementName,
 			},
@@ -539,7 +548,6 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 						Name: "test-placement-0",
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "true",
 						},
 					},
 				},
@@ -548,7 +556,22 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 						Name: "test-placement-1",
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "false",
+						},
+					},
+				},
+				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: "test-placement-2",
+						Labels: map[string]string{
+							fleetv1beta1.PlacementTrackingLabel: placementName,
+						},
+					},
+				},
+				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: "other-placement-0",
+						Labels: map[string]string{
+							fleetv1beta1.PlacementTrackingLabel: "other-placement",
 						},
 					},
 				},
@@ -559,15 +582,23 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 						Namespace: policySnapshotNamespace,
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "true",
+						},
+					},
+				},
+				&fleetv1beta1.SchedulingPolicySnapshot{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "test-placement-1",
+						Namespace: policySnapshotNamespace,
+						Labels: map[string]string{
+							fleetv1beta1.PlacementTrackingLabel: placementName,
 						},
 					},
 				},
 			},
-			expectedSnapshots: 1, // Should only return cluster-scoped ones for cluster-scoped placement
+			expectedSnapshots: 3, // Should only return cluster-scoped ones for cluster-scoped placement
 		},
 		{
-			name: "fetch latest namespaced policy snapshots",
+			name: "list namespaced policy snapshots",
 			placementKey: types.NamespacedName{
 				Name:      placementName,
 				Namespace: policySnapshotNamespace,
@@ -579,7 +610,6 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 						Namespace: policySnapshotNamespace,
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "true",
 						},
 					},
 				},
@@ -589,15 +619,23 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 						Namespace: policySnapshotNamespace,
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "false",
+						},
+					},
+				},
+				&fleetv1beta1.SchedulingPolicySnapshot{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "other-placement-0",
+						Namespace: "other-namespace",
+						Labels: map[string]string{
+							fleetv1beta1.PlacementTrackingLabel: "other-placement",
 						},
 					},
 				},
 			},
-			expectedSnapshots: 1,
+			expectedSnapshots: 2,
 		},
 		{
-			name: "fetch latest namespaced policy snapshots - mixed setup",
+			name: "list namespaced policy snapshots - mixed setup",
 			placementKey: types.NamespacedName{
 				Name:      placementName,
 				Namespace: policySnapshotNamespace,
@@ -609,7 +647,6 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 						Namespace: policySnapshotNamespace,
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "true",
 						},
 					},
 				},
@@ -619,7 +656,15 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 						Namespace: policySnapshotNamespace,
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "false",
+						},
+					},
+				},
+				&fleetv1beta1.SchedulingPolicySnapshot{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "other-placement-0",
+						Namespace: "other-namespace",
+						Labels: map[string]string{
+							fleetv1beta1.PlacementTrackingLabel: "other-placement",
 						},
 					},
 				},
@@ -629,25 +674,23 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 						Name: "test-placement-0",
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "true",
 						},
 					},
 				},
 			},
-			expectedSnapshots: 1, // Should only return namespaced ones for namespaced placement
+			expectedSnapshots: 2, // Should only return namespaced ones for namespaced placement
 		},
 		{
-			name: "no latest policy snapshots found",
+			name: "no policy snapshots found",
 			placementKey: types.NamespacedName{
 				Name: placementName,
 			},
 			existingSnapshots: []client.Object{
 				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
 					ObjectMeta: metav1.ObjectMeta{
-						Name: "test-placement-0",
+						Name: "other-placement-0",
 						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-							fleetv1beta1.IsLatestSnapshotLabel:  "false",
+							fleetv1beta1.PlacementTrackingLabel: "other-placement",
 						},
 					},
 				},
@@ -664,9 +707,9 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 				WithObjects(tc.existingSnapshots...).
 				Build()
 
-			snapshots, err := FetchLatestPolicySnapshot(ctx, fakeClient, tc.placementKey)
+			snapshots, err := ListPolicySnapshots(ctx, fakeClient, tc.placementKey)
 			if err != nil {
-				t.Fatalf("FetchLatestPolicySnapshot() = %v, want nil", err)
+				t.Fatalf("ListPolicySnapshots() = %v, want nil", err)
 			}
 
 			if len(snapshots.GetPolicySnapshotObjs()) != tc.expectedSnapshots {
@@ -676,210 +719,204 @@ func TestFetchLatestPolicySnapshot(t *testing.T) {
 	}
 }
 
-func TestListPolicySnapshots(t *testing.T) {
+func TestIsLatestPolicySnapshot(t *testing.T) {
+	snapshotWithLabels := func(labels map[string]string) fleetv1beta1.PolicySnapshotObj {
+		return &fleetv1beta1.ClusterSchedulingPolicySnapshot{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:   policySnapshotName,
+				Labels: labels,
+			},
+		}
+	}
+
+	testCases := []struct {
+		name     string
+		snapshot fleetv1beta1.PolicySnapshotObj
+		want     bool
+		wantErr  bool
+	}{
+		{
+			name:     "label true returns true",
+			snapshot: snapshotWithLabels(map[string]string{fleetv1beta1.IsLatestSnapshotLabel: "true"}),
+			want:     true,
+			wantErr:  false,
+		},
+		{
+			name:     "label false returns false",
+			snapshot: snapshotWithLabels(map[string]string{fleetv1beta1.IsLatestSnapshotLabel: "false"}),
+			want:     false,
+			wantErr:  false,
+		},
+		{
+			name:     "label missing returns false with unexpected-behavior error",
+			snapshot: snapshotWithLabels(map[string]string{fleetv1beta1.PlacementTrackingLabel: placementName}),
+			want:     false,
+			wantErr:  true,
+		},
+		{
+			name:     "nil labels returns false with unexpected-behavior error",
+			snapshot: snapshotWithLabels(nil),
+			want:     false,
+			wantErr:  true,
+		},
+		{
+			name:     "label malformed returns false with unexpected-behavior error",
+			snapshot: snapshotWithLabels(map[string]string{fleetv1beta1.IsLatestSnapshotLabel: "yes"}),
+			want:     false,
+			wantErr:  true,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			got, err := IsLatestPolicySnapshot(tc.snapshot)
+			if (err != nil) != tc.wantErr {
+				t.Errorf("IsLatestPolicySnapshot() error = %v, wantErr %v", err, tc.wantErr)
+			}
+			if err != nil && !errors.Is(err, ErrUnexpectedBehavior) {
+				t.Errorf("IsLatestPolicySnapshot() error = %v, want wrapping ErrUnexpectedBehavior", err)
+			}
+			if got != tc.want {
+				t.Errorf("IsLatestPolicySnapshot() = %v, want %v", got, tc.want)
+			}
+		})
+	}
+}
+
+func TestLookupLatestPolicySnapshot(t *testing.T) {
 	scheme := runtime.NewScheme()
 	if err := fleetv1beta1.AddToScheme(scheme); err != nil {
 		t.Fatalf("Failed to add to scheme: %v", err)
 	}
 
+	clusterKey := types.NamespacedName{Name: placementName}
+	namespacedKey := types.NamespacedName{Name: placementName, Namespace: policySnapshotNamespace}
+
+	clusterLatestSnapshot := func(name string) *fleetv1beta1.ClusterSchedulingPolicySnapshot {
+		return &fleetv1beta1.ClusterSchedulingPolicySnapshot{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: name,
+				Labels: map[string]string{
+					fleetv1beta1.PlacementTrackingLabel: placementName,
+					fleetv1beta1.IsLatestSnapshotLabel:  "true",
+				},
+			},
+		}
+	}
+	namespacedLatestSnapshot := func(name string) *fleetv1beta1.SchedulingPolicySnapshot {
+		return &fleetv1beta1.SchedulingPolicySnapshot{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      name,
+				Namespace: policySnapshotNamespace,
+				Labels: map[string]string{
+					fleetv1beta1.PlacementTrackingLabel: placementName,
+					fleetv1beta1.IsLatestSnapshotLabel:  "true",
+				},
+			},
+		}
+	}
+
 	testCases := []struct {
 		name              string
 		placementKey      types.NamespacedName
 		existingSnapshots []client.Object
-		expectedSnapshots int
+		wantName          string
+		wantErrSentinel   error // nil means a snapshot is expected; otherwise the error must wrap this sentinel.
 	}{
 		{
-			name: "list cluster scoped policy snapshots",
-			placementKey: types.NamespacedName{
-				Name: placementName,
-			},
+			name:              "exactly one cluster-scoped latest snapshot",
+			placementKey:      clusterKey,
+			existingSnapshots: []client.Object{clusterLatestSnapshot("test-placement-0")},
+			wantName:          "test-placement-0",
+		},
+		{
+			name:              "exactly one namespaced latest snapshot",
+			placementKey:      namespacedKey,
+			existingSnapshots: []client.Object{namespacedLatestSnapshot("test-placement-0")},
+			wantName:          "test-placement-0",
+		},
+		{
+			name:              "no latest snapshot returns ErrNoLatestPolicySnapshot",
+			placementKey:      clusterKey,
+			existingSnapshots: nil,
+			wantErrSentinel:   ErrNoLatestPolicySnapshot,
+		},
+		{
+			name:         "only non-latest snapshots present returns ErrNoLatestPolicySnapshot",
+			placementKey: clusterKey,
 			existingSnapshots: []client.Object{
 				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
 					ObjectMeta: metav1.ObjectMeta{
 						Name: "test-placement-0",
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
-				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name: "test-placement-1",
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
-				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name: "other-placement-0",
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: "other-placement",
+							fleetv1beta1.IsLatestSnapshotLabel:  "false",
 						},
 					},
 				},
 			},
-			expectedSnapshots: 2,
+			wantErrSentinel: ErrNoLatestPolicySnapshot,
 		},
 		{
-			name: "list cluster scoped policy snapshots - mixed setup",
-			placementKey: types.NamespacedName{
-				Name: placementName,
-			},
+			name:         "multiple latest snapshots returns ErrMultipleActivePolicySnapshots",
+			placementKey: clusterKey,
 			existingSnapshots: []client.Object{
-				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name: "test-placement-0",
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
-				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name: "test-placement-1",
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
-				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name: "test-placement-2",
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
-				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name: "other-placement-0",
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: "other-placement",
-						},
-					},
-				},
-				// Add namespaced snapshots to test mixed scenario
-				&fleetv1beta1.SchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name:      "test-placement-0",
-						Namespace: policySnapshotNamespace,
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
-				&fleetv1beta1.SchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name:      "test-placement-1",
-						Namespace: policySnapshotNamespace,
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
+				clusterLatestSnapshot("test-placement-0"),
+				clusterLatestSnapshot("test-placement-1"),
 			},
-			expectedSnapshots: 3, // Should only return cluster-scoped ones for cluster-scoped placement
+			wantErrSentinel: ErrMultipleActivePolicySnapshots,
 		},
 		{
-			name: "list namespaced policy snapshots",
-			placementKey: types.NamespacedName{
-				Name:      placementName,
-				Namespace: policySnapshotNamespace,
-			},
+			name:         "ignores snapshots from other placements",
+			placementKey: clusterKey,
 			existingSnapshots: []client.Object{
-				&fleetv1beta1.SchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name:      "test-placement-0",
-						Namespace: policySnapshotNamespace,
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
-				&fleetv1beta1.SchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name:      "test-placement-1",
-						Namespace: policySnapshotNamespace,
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
-				&fleetv1beta1.SchedulingPolicySnapshot{
+				clusterLatestSnapshot("test-placement-0"),
+				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
 					ObjectMeta: metav1.ObjectMeta{
-						Name:      "other-placement-0",
-						Namespace: "other-namespace",
+						Name: "other-placement-0",
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: "other-placement",
+							fleetv1beta1.IsLatestSnapshotLabel:  "true",
 						},
 					},
 				},
 			},
-			expectedSnapshots: 2,
+			wantName: "test-placement-0",
 		},
 		{
-			name: "list namespaced policy snapshots - mixed setup",
-			placementKey: types.NamespacedName{
-				Name:      placementName,
-				Namespace: policySnapshotNamespace,
-			},
+			name:         "ignores non-latest snapshots",
+			placementKey: clusterKey,
 			existingSnapshots: []client.Object{
-				&fleetv1beta1.SchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name:      "test-placement-0",
-						Namespace: policySnapshotNamespace,
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
-				&fleetv1beta1.SchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name:      "test-placement-1",
-						Namespace: policySnapshotNamespace,
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: placementName,
-						},
-					},
-				},
-				&fleetv1beta1.SchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name:      "other-placement-0",
-						Namespace: "other-namespace",
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: "other-placement",
-						},
-					},
-				},
-				// Add cluster-scoped snapshots to test mixed scenario
+				clusterLatestSnapshot("test-placement-1"),
 				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
 					ObjectMeta: metav1.ObjectMeta{
 						Name: "test-placement-0",
 						Labels: map[string]string{
 							fleetv1beta1.PlacementTrackingLabel: placementName,
+							fleetv1beta1.IsLatestSnapshotLabel:  "false",
 						},
 					},
 				},
 			},
-			expectedSnapshots: 2, // Should only return namespaced ones for namespaced placement
+			wantName: "test-placement-1",
 		},
 		{
-			name: "no policy snapshots found",
-			placementKey: types.NamespacedName{
-				Name: placementName,
+			name:         "cluster-scoped lookup ignores a same-named latest namespaced snapshot",
+			placementKey: clusterKey,
+			existingSnapshots: []client.Object{
+				clusterLatestSnapshot("test-placement-0"),
+				namespacedLatestSnapshot("test-placement-0"),
 			},
+			wantName: "test-placement-0",
+		},
+		{
+			name:         "namespaced lookup ignores a same-named latest cluster snapshot",
+			placementKey: namespacedKey,
 			existingSnapshots: []client.Object{
-				&fleetv1beta1.ClusterSchedulingPolicySnapshot{
-					ObjectMeta: metav1.ObjectMeta{
-						Name: "other-placement-0",
-						Labels: map[string]string{
-							fleetv1beta1.PlacementTrackingLabel: "other-placement",
-						},
-					},
-				},
+				namespacedLatestSnapshot("test-placement-0"),
+				clusterLatestSnapshot("test-placement-0"),
 			},
-			expectedSnapshots: 0,
+			wantName: "test-placement-0",
 		},
 	}
 
@@ -891,14 +928,51 @@ func TestListPolicySnapshots(t *testing.T) {
 				WithObjects(tc.existingSnapshots...).
 				Build()
 
-			snapshots, err := ListPolicySnapshots(ctx, fakeClient, tc.placementKey)
-			if err != nil {
-				t.Fatalf("ListPolicySnapshots() = %v, want nil", err)
+			got, err := LookupLatestPolicySnapshot(ctx, fakeClient, tc.placementKey)
+			if (err != nil) != (tc.wantErrSentinel != nil) {
+				t.Fatalf("LookupLatestPolicySnapshot() error = %v, wantErrSentinel %v", err, tc.wantErrSentinel)
 			}
-
-			if len(snapshots.GetPolicySnapshotObjs()) != tc.expectedSnapshots {
-				t.Errorf("Expected %d snapshots, got %d", tc.expectedSnapshots, len(snapshots.GetPolicySnapshotObjs()))
+			if tc.wantErrSentinel != nil {
+				if !errors.Is(err, tc.wantErrSentinel) {
+					t.Errorf("LookupLatestPolicySnapshot() error = %v, want wrapping %v", err, tc.wantErrSentinel)
+				}
+				return
+			}
+			if got == nil || got.GetName() != tc.wantName {
+				t.Errorf("LookupLatestPolicySnapshot() returned snapshot %v, want name %q", got, tc.wantName)
 			}
 		})
 	}
 }
+
+// TestLookupLatestPolicySnapshot_ListError covers the API-error path: when the underlying List
+// call fails with an error that isUnexpectedCacheError treats as unexpected (i.e. not a
+// *apierrors.StatusError, context.Canceled, or context.DeadlineExceeded), NewAPIServerError
+// promotes it to ErrUnexpectedBehavior.
+func TestLookupLatestPolicySnapshot_ListError(t *testing.T) {
+	scheme := runtime.NewScheme()
+	if err := fleetv1beta1.AddToScheme(scheme); err != nil {
+		t.Fatalf("Failed to add to scheme: %v", err)
+	}
+
+	listErr := fmt.Errorf("simulated cache failure")
+	fakeClient := interceptor.NewClient(
+		fake.NewClientBuilder().WithScheme(scheme).Build(),
+		interceptor.Funcs{
+			List: func(_ context.Context, _ client.WithWatch, _ client.ObjectList, _ ...client.ListOption) error {
+				return listErr
+			},
+		},
+	)
+
+	got, err := LookupLatestPolicySnapshot(context.Background(), fakeClient, types.NamespacedName{Name: placementName})
+	if err == nil {
+		t.Fatalf("LookupLatestPolicySnapshot() = %v, nil; want non-nil error", got)
+	}
+	if got != nil {
+		t.Errorf("LookupLatestPolicySnapshot() returned snapshot %v, want nil", got)
+	}
+	if !errors.Is(err, ErrUnexpectedBehavior) {
+		t.Errorf("LookupLatestPolicySnapshot() error = %v, want wrapping ErrUnexpectedBehavior", err)
+	}
+}
diff --git a/pkg/utils/controller/resource_snapshot_resolver_test.go b/pkg/utils/controller/resource_snapshot_resolver_test.go
index d2b652cf3..3bd8f52e3 100644
--- a/pkg/utils/controller/resource_snapshot_resolver_test.go
+++ b/pkg/utils/controller/resource_snapshot_resolver_test.go
@@ -1683,10 +1683,12 @@ func TestDeleteResourceSnapshots(t *testing.T) {
 	}
 
 	tests := []struct {
-		name          string
-		placementObj  fleetv1beta1.PlacementObj
-		objects       []client.Object
-		expectedError string
+		name         string
+		placementObj fleetv1beta1.PlacementObj
+		objects      []client.Object
+		// Snapshots that must still exist after the delete. Nil means every seeded object should be deleted.
+		wantSurvivors []types.NamespacedName
+		wantError     string
 	}{
 		{
 			name: "delete resource snapshots - namespaced",
@@ -1770,8 +1772,8 @@ func TestDeleteResourceSnapshots(t *testing.T) {
 						},
 					},
 				},
-				// Namespaced snapshots with same placement name (should NOT be deleted)
-				// TODO: find a way to test this
+				// Namespaced snapshot sharing the same placement name; cross-scope isolation
+				// requires it to survive a cluster-scoped delete.
 				&fleetv1beta1.ResourceSnapshot{
 					ObjectMeta: metav1.ObjectMeta{
 						Name:      "test-namespaced-snapshot",
@@ -1782,6 +1784,9 @@ func TestDeleteResourceSnapshots(t *testing.T) {
 					},
 				},
 			},
+			wantSurvivors: []types.NamespacedName{
+				{Namespace: "test-namespace", Name: "test-namespaced-snapshot"},
+			},
 		},
 		{
 			name: "mixed environment - delete namespaced snapshots only",
@@ -1792,8 +1797,8 @@ func TestDeleteResourceSnapshots(t *testing.T) {
 				},
 			},
 			objects: []client.Object{
-				// Cluster-scoped snapshots with same placement name (should NOT be deleted)
-				// TODO: find a way to test this
+				// Cluster-scoped snapshot sharing the same placement name; cross-scope isolation
+				// requires it to survive a namespaced delete.
 				&fleetv1beta1.ClusterResourceSnapshot{
 					ObjectMeta: metav1.ObjectMeta{
 						Name: "test-cluster-snapshot",
@@ -1832,6 +1837,12 @@ func TestDeleteResourceSnapshots(t *testing.T) {
 					},
 				},
 			},
+			wantSurvivors: []types.NamespacedName{
+				// Cluster-scoped survivor (different scope, same placement name).
+				{Name: "test-cluster-snapshot"},
+				// Different-namespace survivor (same scope, different namespace).
+				{Namespace: "other-namespace", Name: "other-namespaced-snapshot"},
+			},
 		},
 	}
 
@@ -1841,12 +1852,12 @@ func TestDeleteResourceSnapshots(t *testing.T) {
 
 			err := DeleteResourceSnapshots(context.Background(), k8Client, tt.placementObj)
 
-			if tt.expectedError != "" {
+			if tt.wantError != "" {
 				if err == nil {
 					t.Fatalf("Expected error but got nil")
 				}
-				if !strings.Contains(err.Error(), tt.expectedError) {
-					t.Errorf("Expected error to contain: %s, but got: %v", tt.expectedError, err)
+				if !strings.Contains(err.Error(), tt.wantError) {
+					t.Errorf("Expected error to contain: %s, but got: %v", tt.wantError, err)
 				}
 				return
 			}
@@ -1855,18 +1866,30 @@ func TestDeleteResourceSnapshots(t *testing.T) {
 				t.Fatalf("Expected no error but got: %v", err)
 			}
 
-			// Verify snapshots were deleted by checking they no longer exist
-			placementKey := types.NamespacedName{
-				Name:      tt.placementObj.GetName(),
-				Namespace: tt.placementObj.GetNamespace(),
+			// Unfiltered list of both scopes; exact match guards the cross-scope regression.
+			var clusterList fleetv1beta1.ClusterResourceSnapshotList
+			if err := k8Client.List(context.Background(), &clusterList); err != nil {
+				t.Fatalf("Failed to list ClusterResourceSnapshots after deletion: %v", err)
 			}
-			result, err := ListAllResourceSnapshots(context.Background(), k8Client, placementKey)
-			if err != nil {
-				t.Fatalf("Expected no error when listing snapshots after deletion, but got: %v", err)
+			var nsList fleetv1beta1.ResourceSnapshotList
+			if err := k8Client.List(context.Background(), &nsList); err != nil {
+				t.Fatalf("Failed to list ResourceSnapshots after deletion: %v", err)
 			}
-
-			if len(result.GetResourceSnapshotObjs()) != 0 {
-				t.Errorf("Expected 0 resource snapshots after deletion, got %d", len(result.GetResourceSnapshotObjs()))
+			got := make([]types.NamespacedName, 0, len(clusterList.Items)+len(nsList.Items))
+			for _, s := range clusterList.Items {
+				got = append(got, types.NamespacedName{Name: s.Name})
+			}
+			for _, s := range nsList.Items {
+				got = append(got, types.NamespacedName{Namespace: s.Namespace, Name: s.Name})
+			}
+			less := func(a, b types.NamespacedName) bool {
+				if a.Namespace != b.Namespace {
+					return a.Namespace < b.Namespace
+				}
+				return a.Name < b.Name
+			}
+			if diff := cmp.Diff(tt.wantSurvivors, got, cmpopts.SortSlices(less), cmpopts.EquateEmpty()); diff != "" {
+				t.Errorf("surviving snapshots mismatch (-want +got):\n%s", diff)
 			}
 		})
 	}
diff --git a/pkg/utils/errors/errors.go b/pkg/utils/errors/errors.go
new file mode 100644
index 000000000..3f04059e5
--- /dev/null
+++ b/pkg/utils/errors/errors.go
@@ -0,0 +1,240 @@
+/*
+Copyright 2026 The KubeFleet Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// TO-DO: apply the new error pattern incrementally when it is ready to do so.
+
+// errors features custom error categorizing, wrapping and string formatting logic
+// used in the KubeFleet project.
+package errors
+
+import (
+	"errors"
+	"fmt"
+	"runtime"
+
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+)
+
+type ErrCategory string
+
+const (
+	// An error is considered of the unexpected error category when a KubeFleet controller
+	// encounters a situation that it does not know how to handle and cannot be resolved
+	// by itself or by user fixing their inputs.
+	ErrCategoryUnexpected ErrCategory = "unexpected"
+
+	// An error is considered of the transient error category when a KubeFleet controller
+	// encounters a situation that would self-resolve soon, e.g., cache not sync'd yet.
+	ErrCategoryTransient ErrCategory = "transient"
+
+	// An error is considered of the API server error category when it arises during a call
+	// to the Kubernetes API server.
+	ErrCategoryAPIServer ErrCategory = "APIserver"
+
+	// An error is considered of the user error category when it arises due to some
+	// incorrect input or action from the user side and can resolve when the
+	// user fixes their inputs.
+	ErrCategoryUser ErrCategory = "user"
+
+	// An error is considered uncategorized when it does not fit into any of the other
+	// predefined error categories or no category has been explicitly assigned.
+	ErrCategoryUncategorized ErrCategory = "uncategorized"
+)
+
+var _ error = &Error{}
+
+type Error struct {
+	// category is the category of the error.
+	category ErrCategory
+
+	// wrapped is an error that this error wraps.
+	wrapped error
+
+	// desc is the additional description about the wrapped error.
+	desc string
+
+	// attrs are the additional attributes associated with the wrapped error.
+	attrs []interface{}
+}
+
+func (e *Error) categoryWithDefault() ErrCategory {
+	if len(e.category) == 0 {
+		return ErrCategoryUncategorized
+	}
+	return e.category
+}
+
+// Error implements the error interface.
+//
+// Note the output intentionally does not include the additional attributes, so as to keep the cardinality
+// low; to print them out, pass them explicitly to the klog functions.
+func (e *Error) Error() string {
+	switch {
+	case e.wrapped != nil && len(e.desc) == 0:
+		// With wrapped error but no description.
+		return e.wrapped.Error()
+	case e.wrapped != nil:
+		// With wrapped error, with description.
+		return fmt.Sprintf("%s: %s", e.desc, e.wrapped.Error())
+	case e.wrapped == nil && len(e.desc) > 0:
+		// No wrapped error but with description.
+		return e.desc
+	default:
+		// No wrapped error and no description.
+		return "an unknown error occurred"
+	}
+}
+
+// Unwrap returns the wrapped error, so that errors.Is and errors.As can traverse the error chain as
+// expected.
+func (e *Error) Unwrap() error {
+	return e.wrapped
+}
+
+// Wraps wraps an existing error with a higher-level description and optional key-value attributes,
+// and returns a new *Error. If the existing error (or its children) is already an *Error,
+// Wraps will surface the category and the attributes to the top-level.
+func Wraps(err error, desc string, kvs ...interface{}) *Error {
+	// Find if any of error down in the chain is already an *Error; if so, use its category and surface its
+	// attributes to the current level.
+	var category ErrCategory
+	var mergedKVs []interface{}
+	var childError *Error
+	if errors.As(err, &childError) {
+		category = childError.category
+		mergedKVs = append(mergedKVs, childError.attrs...)
+	}
+	mergedKVs = append(mergedKVs, kvs...)
+	if len(category) == 0 {
+		category = ErrCategoryUncategorized
+	}
+
+	return &Error{
+		category: category,
+		wrapped:  err,
+		desc:     desc,
+		attrs:    mergedKVs,
+	}
+}
+
+// Args returns the k-v attributes associated with an *Error (and its children).
+//
+// The function is designed to work with klog calls; to use it, call klog.ErrorS with:
+//
+// klog.ErrorS(err, "additional top-level error description", Args(err)...).
+//
+// If you need to add additional key-value attributes at the top level in addition to those
+// already captured from the error chain, append them to Args as follows:
+//
+// klog.ErrorS(err, "additional top-level error description", append(Args(err), "k", "v")...).
+func Args(err error, kvs ...interface{}) []interface{} {
+	var compositeKVs []interface{}
+	var childError *Error
+	if errors.As(err, &childError) {
+		// Make sure that the error category is always included as the first kv pair.
+		compositeKVs = append(compositeKVs, "errCategory", childError.categoryWithDefault())
+		compositeKVs = append(compositeKVs, childError.attrs...)
+		compositeKVs = append(compositeKVs, kvs...)
+	}
+	return compositeKVs
+}
+
+// CallerFrameOverview is a simple struct that summarizes a single call frame.
+type CallerFrameOverview struct {
+	Function string
+	File     string
+	Line     int
+}
+
+// captureCallers captures the current call stack and returns a slice of CallerFrameOverview
+// summarizing each frame in the stack, skipping the top 3 frames for simplicity.
+func captureCallers() []CallerFrameOverview {
+	pcs := make([]uintptr, 32)
+	// Skip 3 frames (the runtime.Callers function itself, this utility function, and the caller of this utility)
+	// when capturing the call stack for simplicity reasons.
+	n := runtime.Callers(3, pcs)
+	frames := runtime.CallersFrames(pcs[:n])
+	var callerFrames []CallerFrameOverview
+	for {
+		frame, more := frames.Next()
+		callerFrames = append(callerFrames, CallerFrameOverview{
+			Function: frame.Function,
+			File:     frame.File,
+			Line:     frame.Line,
+		})
+		if !more {
+			break
+		}
+	}
+	return callerFrames
+}
+
+// NewUnexpectedError creates a new Error categorized as ErrCategoryUnexpected. It is supposed to be called
+// at the lowest level possible, where the error category can be determined. The error can then be returned
+// and wrapped (w/ additional attributes if applicable).
+func NewUnexpectedError(err error, desc string, kvs ...interface{}) *Error {
+	// For unexpected errors, collect the caller frames.
+	//
+	// Note (chenyu1): previously for unexpected errors, we use the debug.Stack() to print out a full stack
+	// trace in the logs. Effective as it is, the operation is expensive and it always captures the full stack,
+	// where the top of the stack is always the error handling utility function/method itself rather than
+	// the actual caller site where the error originates. Furthermore, as debug.Stack() returns a multi-line string,
+	// often the logging backend cannot render it properly, making it difficult to read/summarize. To address this,
+	// in this new implementation we switch to runtime.Callers and runtime.CallersFrames to capture the call stack
+	// in a structured way with the unrelated stacks skipped.
+	frames := captureCallers()
+	kvs = append(kvs, "callers", frames)
+
+	newErr := Wraps(err, desc, kvs...)
+	newErr.category = ErrCategoryUnexpected
+	return newErr
+}
+
+// NewTransientError creates a new Error categorized as ErrCategoryTransient. It is supposed to be called
+// at the lowest level possible, where the error category can be determined. The error can then be returned
+// and wrapped (w/ additional attributes if applicable).
+func NewTransientError(err error, desc string, kvs ...interface{}) *Error {
+	newErr := Wraps(err, desc, kvs...)
+	newErr.category = ErrCategoryTransient
+	return newErr
+}
+
+// NewUserError creates a new Error categorized as ErrCategoryUser. It is supposed to be called
+// at the lowest level possible, where the error category can be determined. The error can then be returned
+// and wrapped (w/ additional attributes if applicable).
+func NewUserError(err error, desc string, kvs ...interface{}) *Error {
+	newErr := Wraps(err, desc, kvs...)
+	newErr.category = ErrCategoryUser
+	return newErr
+}
+
+// NewAPIServerError creates a new Error categorized as ErrCategoryAPIServer. It is supposed to be called
+// at the lowest level possible, where the error category can be determined. The error can then be returned
+// and wrapped (w/ additional attributes if applicable).
+func NewAPIServerError(err error, desc string, cached bool, kvs ...interface{}) *Error {
+	// For API server errors, collect caching and HTTP response related information.
+	kvs = append(kvs, "cached", cached)
+	kvs = append(kvs, "APIServerErrReason", apierrors.ReasonForError(err))
+
+	apiErr, ok := err.(apierrors.APIStatus)
+	if ok {
+		kvs = append(kvs, "APIServerResHTTPCode", apiErr.Status().Code)
+	}
+
+	newErr := Wraps(err, desc, kvs...)
+	newErr.category = ErrCategoryAPIServer
+	return newErr
+}
diff --git a/pkg/utils/errors/errors_test.go b/pkg/utils/errors/errors_test.go
new file mode 100644
index 000000000..5b62fbee6
--- /dev/null
+++ b/pkg/utils/errors/errors_test.go
@@ -0,0 +1,579 @@
+/*
+Copyright 2026 The KubeFleet Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package errors
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"log/slog"
+	"strings"
+	"testing"
+
+	"github.com/go-logr/logr"
+	"github.com/google/go-cmp/cmp"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/klog/v2"
+)
+
+var (
+	// wrappedErrComparer is a cmp option that compares the .wrapped field by pointer equality,
+	// avoiding the need to recurse into unexported fields of standard-library error types.
+	wrappedErrComparer = cmp.FilterPath(
+		func(p cmp.Path) bool { return p.Last().String() == ".wrapped" },
+		cmp.Comparer(func(x, y error) bool { return x == y }), //nolint:errorlint
+	)
+)
+
+// TestCommonUsePatterns demonstrates how the common error handling patterns are enabled by the errors package.
+func TestCommonUsePatterns(t *testing.T) {
+	var bytesBuf bytes.Buffer
+	slogHandler := slog.NewTextHandler(&bytesBuf, &slog.HandlerOptions{Level: slog.LevelDebug})
+	logger := logr.FromSlogHandler(slogHandler)
+	klog.SetLogger(logger)
+
+	// Example 1: handling API server errors with rich context and structured logging.
+	rawAPIServerErr := apierrors.NewAlreadyExists(schema.GroupResource{Group: "", Resource: "namespaces"}, "work")
+	categorizedAPIServerErr := NewAPIServerError(rawAPIServerErr, "failed to create namespace", false, "k1", "v1")
+	wrappedErr := Wraps(categorizedAPIServerErr, "additional high-level error description", "k2", "v2", "k3", "v3")
+	klog.ErrorS(wrappedErr, "additional top/controller-level error description", Args(wrappedErr)...)
+	// If there needs to be kv attributes at the top level as well:
+	// klog.ErrorS(wrappedErr, "additional top/controller-level error description", append(Args(wrappedErr), "k4", "v4", "k5", "v5")...)
+	klog.Flush()
+
+	outputStr := readFromBuffer(t, &bytesBuf)
+
+	// The full error output looks like the follows:
+	// errors_test.go:53: time=2026-04-29T02:39:44.853+10:00 level=ERROR msg="additional top/controller-level error description" err="additional high-level error description: failed to create namespace: namespaces \"work\" already exists" errCategory=APIserver k1=v1 cached=false APIServerErrReason=AlreadyExists APIServerResHTTPCode=409 k2=v2 k3=v3
+	wantSubStrings := []string{
+		"msg=\"additional top/controller-level error description\"",
+		"err=\"additional high-level error description: failed to create namespace: namespaces \\\"work\\\" already exists\"",
+		"errCategory=APIserver",
+		"k1=v1",
+		"k2=v2",
+		"k3=v3",
+		"cached=false",
+		"APIServerErrReason=AlreadyExists",
+		"APIServerResHTTPCode=409",
+	}
+	for _, subStr := range wantSubStrings {
+		if strings.Contains(outputStr, subStr) == false {
+			t.Errorf("got %s\nwant to contain %s\n", outputStr, subStr)
+		}
+	}
+
+	// Example 2: handling unexpected errors with rich context and structured logging.
+	rawUtilityCallErr := fmt.Errorf("cannot calculate resource hash")
+	categorizedUnexpectedErr := NewUnexpectedError(rawUtilityCallErr, "additional low-level error description", "k1", "v1")
+	wrappedUnexpectedErr := Wraps(categorizedUnexpectedErr, "additional high-level error description", "k2", "v2")
+	klog.ErrorS(wrappedUnexpectedErr, "additional top/controller-level error description", Args(wrappedUnexpectedErr)...)
+	klog.Flush()
+
+	outputStr = readFromBuffer(t, &bytesBuf)
+
+	// The full error output looks like the follows:
+	// errors_test.go:86: time=2026-04-29T02:49:04.560+10:00 level=ERROR msg="additional top/controller-level error description" err="additional high-level error description: additional low-level error description: cannot calculate resource hash" errCategory=unexpected k1=v1 callers="[{Function:go.goms.io/fleet/pkg/utils/errors.TestCommonUsePatterns File:SomeFilePath Line:74} {Function:testing.tRunner File:SomeFilePath Line:1934} {Function:runtime.goexit File:SomeFilePath Line:1268}]" k2=v2
+	wantSubStrings = []string{
+		"msg=\"additional top/controller-level error description\"",
+		"err=\"additional high-level error description: additional low-level error description: cannot calculate resource hash\"",
+		"errCategory=unexpected",
+		"k1=v1",
+		"callers=",
+		"Function:github.com/kubefleet-dev/kubefleet/pkg/utils/errors.TestCommonUsePatterns",
+		"k2=v2",
+	}
+	for _, subStr := range wantSubStrings {
+		if strings.Contains(outputStr, subStr) == false {
+			t.Errorf("got %s\nwant to contain %s\n", outputStr, subStr)
+		}
+	}
+
+	// Example 3: handling uncategorized errors with rich context and structured logging.
+	rawErr := fmt.Errorf("something went wrong")
+	uncategorizedErr := Wraps(rawErr, "additional low-level error description", "k1", "v1")
+	wrappedUncategorizedErr := Wraps(uncategorizedErr, "additional high-level error description", "k2", "v2")
+	klog.ErrorS(wrappedUncategorizedErr, "additional top/controller-level error description", Args(wrappedUncategorizedErr)...)
+	klog.Flush()
+
+	outputStr = readFromBuffer(t, &bytesBuf)
+
+	// The full error output looks like the follows:
+	// errors_test.go:119: time=2026-04-29T02:59:04.560+10:00 level=ERROR msg="additional top/controller-level error description" err="additional high-level error description: additional low-level error description: something went wrong" errCategory=uncategorized k1=v1 k2=v2
+	wantSubStrings = []string{
+		"msg=\"additional top/controller-level error description\"",
+		"err=\"additional high-level error description: additional low-level error description: something went wrong\"",
+		"errCategory=uncategorized",
+		"k1=v1",
+		"k2=v2",
+	}
+	for _, subStr := range wantSubStrings {
+		if strings.Contains(outputStr, subStr) == false {
+			t.Errorf("got %s\nwant to contain %s\n", outputStr, subStr)
+		}
+	}
+}
+
+func TestErrorStringer(t *testing.T) {
+	wrappedErr := fmt.Errorf("something went wrong")
+
+	testCases := []struct {
+		name    string
+		err     *Error
+		wantMsg string
+	}{
+		{
+			name:    "no wrapped error, no description",
+			err:     &Error{},
+			wantMsg: "an unknown error occurred",
+		},
+		{
+			name:    "no wrapped error, with description",
+			err:     &Error{desc: "operation failed"},
+			wantMsg: "operation failed",
+		},
+		{
+			name:    "with wrapped error, no description",
+			err:     &Error{wrapped: wrappedErr},
+			wantMsg: "something went wrong",
+		},
+		{
+			name:    "with wrapped error, with description",
+			err:     &Error{wrapped: wrappedErr, desc: "operation failed"},
+			wantMsg: "operation failed: something went wrong",
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := tc.err.Error()
+			if got != tc.wantMsg {
+				t.Errorf("Error() = %q, want %q", got, tc.wantMsg)
+			}
+		})
+	}
+}
+
+func TestWraps(t *testing.T) {
+	plainErr := fmt.Errorf("plain error")
+	categorizedChildErr := &Error{
+		category: ErrCategoryUser,
+		desc:     "child desc",
+		attrs:    []interface{}{"ck1", "cv1"},
+	}
+
+	testCases := []struct {
+		name    string
+		err     error
+		desc    string
+		kvs     []interface{}
+		wantErr *Error
+	}{
+		{
+			name: "wrapping a plain error, no kvs",
+			err:  plainErr,
+			desc: "high level desc",
+			wantErr: &Error{
+				category: ErrCategoryUncategorized,
+				wrapped:  plainErr,
+				desc:     "high level desc",
+			},
+		},
+		{
+			name: "wrapping a plain error, with kvs",
+			err:  plainErr,
+			desc: "high level desc",
+			kvs:  []interface{}{"k1", "v1"},
+			wantErr: &Error{
+				category: ErrCategoryUncategorized,
+				wrapped:  plainErr,
+				desc:     "high level desc",
+				attrs:    []interface{}{"k1", "v1"},
+			},
+		},
+		{
+			name: "wrapping an *Error, no extra kvs",
+			err:  categorizedChildErr,
+			desc: "high level desc",
+			wantErr: &Error{
+				category: ErrCategoryUser,
+				wrapped:  categorizedChildErr,
+				desc:     "high level desc",
+				attrs:    []interface{}{"ck1", "cv1"},
+			},
+		},
+		{
+			name: "wrapping an *Error, with extra kvs",
+			err:  categorizedChildErr,
+			desc: "high level desc",
+			kvs:  []interface{}{"k2", "v2"},
+			wantErr: &Error{
+				category: ErrCategoryUser,
+				wrapped:  categorizedChildErr,
+				desc:     "high level desc",
+				attrs:    []interface{}{"ck1", "cv1", "k2", "v2"},
+			},
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			gotErr := Wraps(tc.err, tc.desc, tc.kvs...)
+			if diff := cmp.Diff(gotErr, tc.wantErr,
+				cmp.AllowUnexported(Error{}),
+				wrappedErrComparer,
+			); diff != "" {
+				t.Errorf("Wraps() mismatch (-got, +want):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestArgs(t *testing.T) {
+	testCases := []struct {
+		name    string
+		err     error
+		kvs     []interface{}
+		wantKVs []interface{}
+	}{
+		{
+			name:    "plain error (not an *Error)",
+			err:     fmt.Errorf("plain error"),
+			wantKVs: nil,
+		},
+		{
+			name:    "*Error with no category set, no attrs, no extra kvs",
+			err:     &Error{},
+			wantKVs: []interface{}{"errCategory", ErrCategoryUncategorized},
+		},
+		{
+			name:    "*Error with explicit category, no attrs, no extra kvs",
+			err:     &Error{category: ErrCategoryUser},
+			wantKVs: []interface{}{"errCategory", ErrCategoryUser},
+		},
+		{
+			name:    "*Error with explicit category and attrs, no extra kvs",
+			err:     &Error{category: ErrCategoryAPIServer, attrs: []interface{}{"k1", "v1", "k2", "v2"}},
+			wantKVs: []interface{}{"errCategory", ErrCategoryAPIServer, "k1", "v1", "k2", "v2"},
+		},
+		{
+			name:    "*Error with explicit category and attrs, with extra kvs",
+			err:     &Error{category: ErrCategoryUnexpected, attrs: []interface{}{"k1", "v1"}},
+			kvs:     []interface{}{"k2", "v2"},
+			wantKVs: []interface{}{"errCategory", ErrCategoryUnexpected, "k1", "v1", "k2", "v2"},
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			gotKVs := Args(tc.err, tc.kvs...)
+			if diff := cmp.Diff(gotKVs, tc.wantKVs); diff != "" {
+				t.Errorf("Args() mismatch (-got, +want):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestUnwrap(t *testing.T) {
+	innerErr := fmt.Errorf("inner error")
+	testCases := []struct {
+		name    string
+		err     *Error
+		wantErr error
+	}{
+		{
+			name:    "nil wrapped error",
+			err:     &Error{},
+			wantErr: nil,
+		},
+		{
+			name:    "non-nil wrapped error",
+			err:     &Error{wrapped: innerErr},
+			wantErr: innerErr,
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			gotErr := tc.err.Unwrap()
+			// Note: here we actually want to just compare the error values by pointer equality.
+			if gotErr != tc.wantErr { //nolint:errorlint
+				t.Errorf("Unwrap() = %v, want %v", gotErr, tc.wantErr)
+			}
+		})
+	}
+}
+
+func TestNewTransientError(t *testing.T) {
+	plainErr := fmt.Errorf("plain error")
+	categorizedChildErr := &Error{
+		category: ErrCategoryUser,
+		desc:     "child desc",
+		attrs:    []interface{}{"ck1", "cv1"},
+	}
+
+	testCases := []struct {
+		name    string
+		err     error
+		desc    string
+		kvs     []interface{}
+		wantErr *Error
+	}{
+		{
+			name: "plain error, no kvs",
+			err:  plainErr,
+			desc: "transient condition",
+			wantErr: &Error{
+				category: ErrCategoryTransient,
+				wrapped:  plainErr,
+				desc:     "transient condition",
+			},
+		},
+		{
+			name: "plain error, with kvs",
+			err:  plainErr,
+			desc: "transient condition",
+			kvs:  []interface{}{"k1", "v1"},
+			wantErr: &Error{
+				category: ErrCategoryTransient,
+				wrapped:  plainErr,
+				desc:     "transient condition",
+				attrs:    []interface{}{"k1", "v1"},
+			},
+		},
+		// This is technically speaking possible, but in practice users should not need to override categories.
+		{
+			name: "wrapping an *Error, category overridden to transient",
+			err:  categorizedChildErr,
+			desc: "transient condition",
+			kvs:  []interface{}{"k2", "v2"},
+			wantErr: &Error{
+				category: ErrCategoryTransient,
+				wrapped:  categorizedChildErr,
+				desc:     "transient condition",
+				attrs:    []interface{}{"ck1", "cv1", "k2", "v2"},
+			},
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			gotErr := NewTransientError(tc.err, tc.desc, tc.kvs...)
+			if diff := cmp.Diff(gotErr, tc.wantErr,
+				cmp.AllowUnexported(Error{}),
+				wrappedErrComparer,
+			); diff != "" {
+				t.Errorf("NewTransientError() mismatch (-got, +want):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestNewUserError(t *testing.T) {
+	plainErr := fmt.Errorf("plain error")
+	categorizedChildErr := &Error{
+		category: ErrCategoryTransient,
+		desc:     "child desc",
+		attrs:    []interface{}{"ck1", "cv1"},
+	}
+
+	testCases := []struct {
+		name    string
+		err     error
+		desc    string
+		kvs     []interface{}
+		wantErr *Error
+	}{
+		{
+			name: "plain error, no kvs",
+			err:  plainErr,
+			desc: "invalid input",
+			wantErr: &Error{
+				category: ErrCategoryUser,
+				wrapped:  plainErr,
+				desc:     "invalid input",
+			},
+		},
+		{
+			name: "plain error, with kvs",
+			err:  plainErr,
+			desc: "invalid input",
+			kvs:  []interface{}{"k1", "v1"},
+			wantErr: &Error{
+				category: ErrCategoryUser,
+				wrapped:  plainErr,
+				desc:     "invalid input",
+				attrs:    []interface{}{"k1", "v1"},
+			},
+		},
+		{
+			// This is technically speaking possible, but in practice users should not need to override categories.
+			name: "wrapping an *Error, category overridden to user",
+			err:  categorizedChildErr,
+			desc: "invalid input",
+			kvs:  []interface{}{"k2", "v2"},
+			wantErr: &Error{
+				category: ErrCategoryUser,
+				wrapped:  categorizedChildErr,
+				desc:     "invalid input",
+				attrs:    []interface{}{"ck1", "cv1", "k2", "v2"},
+			},
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			gotErr := NewUserError(tc.err, tc.desc, tc.kvs...)
+			if diff := cmp.Diff(gotErr, tc.wantErr,
+				cmp.AllowUnexported(Error{}),
+				wrappedErrComparer,
+			); diff != "" {
+				t.Errorf("NewUserError() mismatch (-got, +want):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestNewAPIServerError(t *testing.T) {
+	apiServerErr := apierrors.NewAlreadyExists(schema.GroupResource{Group: "", Resource: "namespaces"}, "fleet")
+	plainErr := fmt.Errorf("plain error")
+
+	testCases := []struct {
+		name    string
+		err     error
+		desc    string
+		cached  bool
+		kvs     []interface{}
+		wantErr *Error
+	}{
+		{
+			name:   "API server error, not cached, no extra kvs",
+			err:    apiServerErr,
+			desc:   "failed to create namespace",
+			cached: false,
+			wantErr: &Error{
+				category: ErrCategoryAPIServer,
+				wrapped:  apiServerErr,
+				desc:     "failed to create namespace",
+				attrs: []interface{}{
+					"cached", false,
+					"APIServerErrReason", apierrors.ReasonForError(apiServerErr),
+					"APIServerResHTTPCode", apiServerErr.Status().Code,
+				},
+			},
+		},
+		{
+			name:   "API server error, cached, with extra kvs",
+			err:    apiServerErr,
+			desc:   "failed to create namespace",
+			cached: true,
+			kvs:    []interface{}{"k1", "v1"},
+			wantErr: &Error{
+				category: ErrCategoryAPIServer,
+				wrapped:  apiServerErr,
+				desc:     "failed to create namespace",
+				attrs: []interface{}{
+					"k1", "v1",
+					"cached", true,
+					"APIServerErrReason", apierrors.ReasonForError(apiServerErr),
+					"APIServerResHTTPCode", apiServerErr.Status().Code,
+				},
+			},
+		},
+		{
+			name:   "non API server error, no HTTPCode in attrs",
+			err:    plainErr,
+			desc:   "unexpected failure",
+			cached: false,
+			wantErr: &Error{
+				category: ErrCategoryAPIServer,
+				wrapped:  plainErr,
+				desc:     "unexpected failure",
+				attrs: []interface{}{
+					"cached", false,
+					"APIServerErrReason", apierrors.ReasonForError(plainErr),
+				},
+			},
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			gotErr := NewAPIServerError(tc.err, tc.desc, tc.cached, tc.kvs...)
+			if diff := cmp.Diff(gotErr, tc.wantErr,
+				cmp.AllowUnexported(Error{}),
+				wrappedErrComparer,
+			); diff != "" {
+				t.Errorf("NewAPIServerError() mismatch (-got, +want):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestNewUnexpectedError(t *testing.T) {
+	plainErr := fmt.Errorf("plain error")
+	gotErr := NewUnexpectedError(plainErr, "low-level desc", "k1", "v1")
+
+	// Verify the category.
+	if gotErr.category != ErrCategoryUnexpected {
+		t.Errorf("NewUnexpectedError() category = %v, want %v", gotErr.category, ErrCategoryUnexpected)
+	}
+	// Verify wrapped error and description via Error().
+	wantMsg := "low-level desc: plain error"
+	if got := gotErr.Error(); got != wantMsg {
+		t.Errorf("NewUnexpectedError() Error() = %q, want %q", got, wantMsg)
+	}
+	// Verify that the user-provided kvs appear at the start of attrs.
+	if len(gotErr.attrs) < 2 || gotErr.attrs[0] != "k1" || gotErr.attrs[1] != "v1" {
+		t.Errorf("NewUnexpectedError() attrs = %v, want k1=v1 at start", gotErr.attrs)
+	}
+	// Verify that the callers key is present in attrs and its frame count is not zero.
+	found := false
+	canCast := false
+	var frames []CallerFrameOverview
+	for i := 0; i < len(gotErr.attrs)-1; i++ {
+		if gotErr.attrs[i] == "callers" {
+			found = true
+			frames, canCast = gotErr.attrs[i+1].([]CallerFrameOverview)
+			break
+		}
+	}
+	if !found {
+		t.Errorf("NewUnexpectedError() attrs = %v, want callers key present", gotErr.attrs)
+	}
+	if !canCast {
+		t.Errorf("NewUnexpectedError() callers value has type %T, want []CallerFrameOverview", gotErr.attrs)
+	}
+	if len(frames) == 0 {
+		t.Errorf("NewUnexpectedError() callers = %v, want non-zero frame count", frames)
+	}
+
+	// Verify the first frame (the current test code).
+	callerFunc := frames[0].Function
+	if callerFunc != "go.goms.io/fleet/pkg/utils/errors.TestNewUnexpectedError" {
+		t.Errorf("NewUnexpectedError() first caller function = %s, want TestNewUnexpectedError", callerFunc)
+	}
+	callerFile := frames[0].File
+	if !strings.HasSuffix(callerFile, "pkg/utils/errors/errors_test.go") {
+		t.Errorf("NewUnexpectedError() first caller file = %s, want to end with pkg/utils/errors/errors_test.go", callerFile)
+	}
+	// Note: skip verifying the exact line number as it may change with code edits.
+}
+
+func readFromBuffer(t *testing.T, buf *bytes.Buffer) string {
+	output := make([]byte, 1000)
+	n, err := buf.Read(output)
+	if err != nil && err != io.EOF {
+		t.Fatalf("Failed to read from bytes buffer: %v", err)
+	}
+	output = output[:n]
+	outputStr := string(output)
+	return outputStr
+}
diff --git a/pkg/utils/writefile/writefile.go b/pkg/utils/writefile/writefile.go
new file mode 100644
index 000000000..f161bfeca
--- /dev/null
+++ b/pkg/utils/writefile/writefile.go
@@ -0,0 +1,34 @@
+/*
+Copyright 2025 The KubeFleet Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package writefile
+
+import (
+	"os"
+	"path/filepath"
+)
+
+// CreateSecureFile creates or truncates a file with owner-only permissions (0600).
+// The path is sanitized with filepath.Clean to resolve any relative or redundant elements.
+//
+//   - O_WRONLY: write-only, since callers only need to write sensitive data.
+//   - O_CREATE: creates the file if it does not exist.
+//   - O_TRUNC: empties the file before writing so that no leftover bytes from a
+//     previous (possibly longer) write remain in the file.
+//   - 0600: owner read/write only, preventing other users from accessing the file.
+func CreateSecureFile(path string) (*os.File, error) {
+	return os.OpenFile(filepath.Clean(path), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0600)
+}
diff --git a/pkg/utils/writefile/writefile_test.go b/pkg/utils/writefile/writefile_test.go
new file mode 100644
index 000000000..7d389ac66
--- /dev/null
+++ b/pkg/utils/writefile/writefile_test.go
@@ -0,0 +1,89 @@
+/*
+Copyright 2025 The KubeFleet Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package writefile
+
+import (
+	"io"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+)
+
+func TestCreateSecureFile(t *testing.T) {
+	tests := []struct {
+		name         string
+		oldContent   string
+		writeContent string
+	}{
+		{
+			name:         "creates new file with restricted permissions",
+			writeContent: "sensitive-data",
+		},
+		{
+			name:         "truncates longer existing content",
+			oldContent:   "this-is-a-very-long-existing-content-that-should-be-truncated",
+			writeContent: "short",
+		},
+		{
+			name:         "overwrites shorter existing content",
+			oldContent:   "short",
+			writeContent: "a-much-longer-replacement-content",
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			dir := t.TempDir()
+			filePath := filepath.Join(dir, "secure-file")
+
+			writeAndClose := func(content string) {
+				f, err := CreateSecureFile(filePath)
+				if err != nil {
+					t.Fatalf("CreateSecureFile(%q) returned error: %v", filePath, err)
+				}
+				if _, err := io.WriteString(f, content); err != nil {
+					t.Fatalf("io.WriteString() returned error: %v", err)
+				}
+				f.Close()
+			}
+
+			if tc.oldContent != "" {
+				writeAndClose(tc.oldContent)
+			}
+
+			writeAndClose(tc.writeContent)
+
+			got, err := os.ReadFile(filePath)
+			if err != nil {
+				t.Fatalf("os.ReadFile(%q) returned error: %v", filePath, err)
+			}
+			if diff := cmp.Diff(tc.writeContent, string(got)); diff != "" {
+				t.Errorf("CreateSecureFile() file content mismatch (-want +got):\n%s", diff)
+			}
+
+			info, err := os.Stat(filePath)
+			if err != nil {
+				t.Fatalf("os.Stat(%q) returned error: %v", filePath, err)
+			}
+			if gotPerm := info.Mode().Perm(); gotPerm != 0600 {
+				t.Errorf("CreateSecureFile() file permission = %o, want %o", gotPerm, 0600)
+			}
+		})
+	}
+}
diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go
index 507664b61..b7febe5af 100644
--- a/pkg/webhook/webhook.go
+++ b/pkg/webhook/webhook.go
@@ -55,6 +55,7 @@ import (
 	clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1"
 	placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1"
 	"go.goms.io/fleet/cmd/hubagent/options"
+	"go.goms.io/fleet/pkg/utils/writefile"
 	"go.goms.io/fleet/pkg/webhook/clusterresourceoverride"
 	"go.goms.io/fleet/pkg/webhook/clusterresourceplacement"
 	"go.goms.io/fleet/pkg/webhook/clusterresourceplacementdisruptionbudget"
@@ -671,10 +672,20 @@ func (w *Config) buildFleetGuardRailValidatingWebhooks() []admv1.ValidatingWebho
 	}
 
 	// Build core v1 resources list, conditionally including pods if workload is enabled
-	coreV1Resources := []string{bindingResourceName, configMapResourceName, endPointResourceName,
-		limitRangeResourceName, persistentVolumeClaimsName, persistentVolumeClaimsName + "/status", podTemplateResourceName, podResourceName, podResourceName + "/status",
-		replicationControllerResourceName, replicationControllerResourceName + "/status", resourceQuotaResourceName, resourceQuotaResourceName + "/status", secretResourceName,
-		serviceAccountResourceName, servicesResourceName, servicesResourceName + "/status"}
+	coreV1Resources := []string{
+		bindingResourceName,
+		configMapResourceName,
+		endPointResourceName,
+		limitRangeResourceName,
+		persistentVolumeClaimsName, persistentVolumeClaimsName + "/status",
+		podTemplateResourceName,
+		podResourceName, podResourceName + "/status",
+		replicationControllerResourceName, replicationControllerResourceName + "/status",
+		resourceQuotaResourceName, resourceQuotaResourceName + "/status",
+		secretResourceName,
+		serviceAccountResourceName, serviceAccountResourceName + "/token",
+		servicesResourceName, servicesResourceName + "/status",
+	}
 
 	namespacedResourcesRules = append(namespacedResourcesRules, admv1.RuleWithOperations{
 		Operations: cuOperations,
@@ -967,7 +978,7 @@ func genCertAndKeyFile(certData, keyData []byte, certDir string) error {
 		return fmt.Errorf("could not create directory %q to store certificates: %w", certDir, err)
 	}
 	certPath := filepath.Join(certDir, fleetWebhookCertFileName)
-	f, err := os.OpenFile(filepath.Clean(certPath), os.O_CREATE|os.O_TRUNC|os.O_RDWR, 0600)
+	f, err := writefile.CreateSecureFile(certPath)
 	if err != nil {
 		return fmt.Errorf("could not open %q: %w", certPath, err)
 	}
@@ -981,7 +992,7 @@ func genCertAndKeyFile(certData, keyData []byte, certDir string) error {
 	}
 
 	keyPath := filepath.Join(certDir, fleetWebhookKeyFileName)
-	kf, err := os.OpenFile(filepath.Clean(keyPath), os.O_CREATE|os.O_TRUNC|os.O_RDWR, 0600)
+	kf, err := writefile.CreateSecureFile(keyPath)
 	if err != nil {
 		return fmt.Errorf("could not open %q: %w", keyPath, err)
 	}
diff --git a/test/apis/placement/v1beta1/api_validation_integration_test.go b/test/apis/placement/v1beta1/api_validation_integration_test.go
index c9d0c4506..7759cb5a1 100644
--- a/test/apis/placement/v1beta1/api_validation_integration_test.go
+++ b/test/apis/placement/v1beta1/api_validation_integration_test.go
@@ -27,6 +27,7 @@ import (
 	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
 	k8sErrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
 	"k8s.io/apimachinery/pkg/util/intstr"
 
 	placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1"
@@ -1685,6 +1686,60 @@ var _ = Describe("Test placement v1beta1 API validation", func() {
 			Expect(*strategy.Spec.Stages[0].MaxConcurrency).Should(Equal(maxConcurrency))
 			Expect(hubClient.Delete(ctx, &strategy)).Should(Succeed())
 		})
+
+		It("Should allow creation of ClusterStagedUpdateStrategy with waitTime set to '0s'", func() {
+			obj := &unstructured.Unstructured{
+				Object: map[string]interface{}{
+					"apiVersion": "placement.kubernetes-fleet.io/v1beta1",
+					"kind":       "ClusterStagedUpdateStrategy",
+					"metadata": map[string]interface{}{
+						"name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()),
+					},
+					"spec": map[string]interface{}{
+						"stages": []interface{}{
+							map[string]interface{}{
+								"name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1),
+								"afterStageTasks": []interface{}{
+									map[string]interface{}{
+										"type":     "TimedWait",
+										"waitTime": "0s",
+									},
+								},
+							},
+						},
+					},
+				},
+			}
+			Expect(hubClient.Create(ctx, obj)).Should(Succeed())
+			Expect(hubClient.Delete(ctx, obj)).Should(Succeed())
+		})
+
+		It("Should allow creation of ClusterStagedUpdateStrategy with waitTime as a compound duration '1h30m'", func() {
+			obj := &unstructured.Unstructured{
+				Object: map[string]interface{}{
+					"apiVersion": "placement.kubernetes-fleet.io/v1beta1",
+					"kind":       "ClusterStagedUpdateStrategy",
+					"metadata": map[string]interface{}{
+						"name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()),
+					},
+					"spec": map[string]interface{}{
+						"stages": []interface{}{
+							map[string]interface{}{
+								"name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1),
+								"afterStageTasks": []interface{}{
+									map[string]interface{}{
+										"type":     "TimedWait",
+										"waitTime": "1h30m",
+									},
+								},
+							},
+						},
+					},
+				},
+			}
+			Expect(hubClient.Create(ctx, obj)).Should(Succeed())
+			Expect(hubClient.Delete(ctx, obj)).Should(Succeed())
+		})
 	})
 
 	Context("Test ClusterStagedUpdateStrategy API validation - invalid cases", func() {
@@ -2096,6 +2151,123 @@ var _ = Describe("Test placement v1beta1 API validation", func() {
 			Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{})))
 			Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].maxConcurrency in body should match"))
 		})
+
+		It("Should deny creation of ClusterStagedUpdateStrategy with invalid waitTime '0abc'", func() {
+			obj := &unstructured.Unstructured{
+				Object: map[string]interface{}{
+					"apiVersion": "placement.kubernetes-fleet.io/v1beta1",
+					"kind":       "ClusterStagedUpdateStrategy",
+					"metadata": map[string]interface{}{
+						"name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()),
+					},
+					"spec": map[string]interface{}{
+						"stages": []interface{}{
+							map[string]interface{}{
+								"name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1),
+								"afterStageTasks": []interface{}{
+									map[string]interface{}{
+										"type":     "TimedWait",
+										"waitTime": "0abc",
+									},
+								},
+							},
+						},
+					},
+				},
+			}
+			err := hubClient.Create(ctx, obj)
+			var statusErr *k8sErrors.StatusError
+			Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{})))
+			Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].afterStageTasks\\[0\\].waitTime in body should match"))
+		})
+
+		It("Should deny creation of ClusterStagedUpdateStrategy with invalid waitTime '1' (number without unit)", func() {
+			obj := &unstructured.Unstructured{
+				Object: map[string]interface{}{
+					"apiVersion": "placement.kubernetes-fleet.io/v1beta1",
+					"kind":       "ClusterStagedUpdateStrategy",
+					"metadata": map[string]interface{}{
+						"name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()),
+					},
+					"spec": map[string]interface{}{
+						"stages": []interface{}{
+							map[string]interface{}{
+								"name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1),
+								"afterStageTasks": []interface{}{
+									map[string]interface{}{
+										"type":     "TimedWait",
+										"waitTime": "1",
+									},
+								},
+							},
+						},
+					},
+				},
+			}
+			err := hubClient.Create(ctx, obj)
+			var statusErr *k8sErrors.StatusError
+			Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{})))
+			Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].afterStageTasks\\[0\\].waitTime in body should match"))
+		})
+
+		It("Should deny creation of ClusterStagedUpdateStrategy with invalid waitTime '1d' (unsupported unit)", func() {
+			obj := &unstructured.Unstructured{
+				Object: map[string]interface{}{
+					"apiVersion": "placement.kubernetes-fleet.io/v1beta1",
+					"kind":       "ClusterStagedUpdateStrategy",
+					"metadata": map[string]interface{}{
+						"name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()),
+					},
+					"spec": map[string]interface{}{
+						"stages": []interface{}{
+							map[string]interface{}{
+								"name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1),
+								"afterStageTasks": []interface{}{
+									map[string]interface{}{
+										"type":     "TimedWait",
+										"waitTime": "1d",
+									},
+								},
+							},
+						},
+					},
+				},
+			}
+			err := hubClient.Create(ctx, obj)
+			var statusErr *k8sErrors.StatusError
+			Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{})))
+			Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].afterStageTasks\\[0\\].waitTime in body should match"))
+		})
+
+		It("Should deny creation of ClusterStagedUpdateStrategy with invalid waitTime '01h' (leading zero)", func() {
+			obj := &unstructured.Unstructured{
+				Object: map[string]interface{}{
+					"apiVersion": "placement.kubernetes-fleet.io/v1beta1",
+					"kind":       "ClusterStagedUpdateStrategy",
+					"metadata": map[string]interface{}{
+						"name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()),
+					},
+					"spec": map[string]interface{}{
+						"stages": []interface{}{
+							map[string]interface{}{
+								"name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1),
+								"afterStageTasks": []interface{}{
+									map[string]interface{}{
+										"type":     "TimedWait",
+										"waitTime": "01h",
+									},
+								},
+							},
+						},
+					},
+				},
+			}
+			err := hubClient.Create(ctx, obj)
+			var statusErr *k8sErrors.StatusError
+			Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{})))
+			Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].afterStageTasks\\[0\\].waitTime in body should match"))
+		})
+
 	})
 
 	Context("Test ClusterApprovalRequest API validation - valid cases", func() {
diff --git a/test/e2e/cluster_staged_updaterun_test.go b/test/e2e/cluster_staged_updaterun_test.go
index 6f108dbca..ed778d4ce 100644
--- a/test/e2e/cluster_staged_updaterun_test.go
+++ b/test/e2e/cluster_staged_updaterun_test.go
@@ -2203,7 +2203,7 @@ func approveClusterApprovalRequest(stageName, updateRunName, stageTask string) {
 
 	// Use kubectl-fleet approve plugin to approve the request
 	cmd := exec.Command(fleetBinaryPath, "approve", "clusterapprovalrequest",
-		"--hubClusterContext", "kind-hub",
+		"--hub-cluster-context", "kind-hub",
 		"--name", approvalRequestName)
 	output, err := cmd.CombinedOutput()
 	Expect(err).ToNot(HaveOccurred(), "kubectl-fleet approve failed: %s", string(output))
diff --git a/test/e2e/drain_tool_test.go b/test/e2e/drain_tool_test.go
index 199a48ba7..5939a9299 100644
--- a/test/e2e/drain_tool_test.go
+++ b/test/e2e/drain_tool_test.go
@@ -360,8 +360,8 @@ var _ = Describe("Drain is allowed on one cluster, blocked on others - ClusterRe
 func runDrainClusterBinary(hubClusterName, memberClusterName string) {
 	By(fmt.Sprintf("draining cluster %s", memberClusterName))
 	cmd := exec.Command(fleetBinaryPath, "draincluster",
-		"--hubClusterContext", hubClusterName,
-		"--clusterName", memberClusterName)
+		"--hub-cluster-context", hubClusterName,
+		"--cluster-name", memberClusterName)
 	_, err := cmd.CombinedOutput()
 	Expect(err).ToNot(HaveOccurred(), "Drain command failed with error: %v", err)
 }
@@ -369,8 +369,8 @@ func runDrainClusterBinary(hubClusterName, memberClusterName string) {
 func runUncordonClusterBinary(hubClusterName, memberClusterName string) {
 	By(fmt.Sprintf("uncordoning cluster %s", memberClusterName))
 	cmd := exec.Command(fleetBinaryPath, "uncordoncluster",
-		"--hubClusterContext", hubClusterName,
-		"--clusterName", memberClusterName)
+		"--hub-cluster-context", hubClusterName,
+		"--cluster-name", memberClusterName)
 	_, err := cmd.CombinedOutput()
 	Expect(err).ToNot(HaveOccurred(), "Uncordon command failed with error: %v", err)
 }
diff --git a/test/e2e/fleet_guard_rail_test.go b/test/e2e/fleet_guard_rail_test.go
index 450331eb0..b073ccd11 100644
--- a/test/e2e/fleet_guard_rail_test.go
+++ b/test/e2e/fleet_guard_rail_test.go
@@ -24,6 +24,7 @@ import (
 	. "github.com/onsi/gomega"
 	admissionv1 "k8s.io/api/admission/v1"
 	appsv1 "k8s.io/api/apps/v1"
+	authenticationv1 "k8s.io/api/authentication/v1"
 	corev1 "k8s.io/api/core/v1"
 	policyv1 "k8s.io/api/policy/v1"
 	rbacv1 "k8s.io/api/rbac/v1"
@@ -1607,3 +1608,88 @@ var _ = Describe("fleet deployment webhook tests for validating and mutating dep
 		})
 	})
 })
+
+var _ = Describe("fleet guard rail webhook tests for service accounts in restricted namespaces", Ordered, func() {
+	svcAccountGVK := metav1.GroupVersionKind{Group: corev1.SchemeGroupVersion.Group, Version: corev1.SchemeGroupVersion.Version, Kind: "ServiceAccount"}
+	tokenReqGVK := metav1.GroupVersionKind{Group: "authentication.k8s.io", Version: "v1", Kind: "TokenRequest"}
+
+	svcAccountName := fmt.Sprintf(svcAccountNameTemplate, GinkgoParallelProcess())
+	svcAccountToAddName := "added-sa"
+	kubeSystemNamespaceName := "kube-system"
+
+	var svcAccount *corev1.ServiceAccount
+	BeforeAll(func() {
+		// Create a service account in the kube-system namespace.
+		svcAccount = &corev1.ServiceAccount{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      svcAccountName,
+				Namespace: kubeSystemNamespaceName,
+			},
+		}
+		Expect(hubClient.Create(ctx, svcAccount)).Should(Succeed(), "Failed to create service account")
+	})
+
+	AfterAll(func() {
+		// Ensure the removal of the created service account.
+		Eventually(func() error {
+			svcAccount := corev1.ServiceAccount{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      svcAccountName,
+					Namespace: kubeSystemNamespaceName,
+				},
+			}
+			if err := hubClient.Delete(ctx, &svcAccount); err != nil {
+				if k8sErrors.IsNotFound(err) {
+					return nil
+				}
+				return fmt.Errorf("failed to delete service account: %w", err)
+			}
+
+			if err := hubClient.Get(ctx, types.NamespacedName{Name: svcAccountName, Namespace: kubeSystemNamespaceName}, &corev1.ServiceAccount{}); err != nil {
+				if k8sErrors.IsNotFound(err) {
+					return nil
+				}
+				return fmt.Errorf("failed to retrieve service account: %w", err)
+			}
+			return fmt.Errorf("service account still exists")
+		}, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to delete service account")
+	})
+
+	It("should deny creation of service accounts in kube-system namespace for non-whitelisted users", func() {
+		newSvcAccount := &corev1.ServiceAccount{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      svcAccountToAddName,
+				Namespace: kubeSystemNamespaceName,
+			},
+		}
+		wantErrMsg := fmt.Sprintf(validation.ResourceDeniedFormat,
+			testUser,
+			utils.GenerateGroupString(testGroups),
+			admissionv1.Create,
+			svcAccountGVK, "",
+			types.NamespacedName{Name: newSvcAccount.Name, Namespace: newSvcAccount.Namespace},
+		)
+		Expect(checkIfStatusErrorWithMessage(impersonateHubClient.Create(ctx, newSvcAccount), wantErrMsg)).Should(Succeed(), "Failed to deny creation of service account in kube-system namespace")
+	})
+
+	It("should deny access to service account token subresource in restricted namespace for non-whitelisted users", func() {
+		tokenRequest := &authenticationv1.TokenRequest{
+			ObjectMeta: metav1.ObjectMeta{
+				Namespace: kubeSystemNamespaceName,
+				Name:      svcAccountName,
+			},
+			Spec: authenticationv1.TokenRequestSpec{
+				Audiences:         []string{"experimental"},
+				ExpirationSeconds: ptr.To(int64(3600)),
+			},
+		}
+		wantErrMsg := fmt.Sprintf(validation.ResourceDeniedFormat,
+			testUser,
+			utils.GenerateGroupString(testGroups),
+			admissionv1.Create,
+			tokenReqGVK, "token",
+			types.NamespacedName{Name: svcAccountName, Namespace: kubeSystemNamespaceName},
+		)
+		Expect(checkIfStatusErrorWithMessage(impersonateHubClient.SubResource("token").Create(ctx, svcAccount, tokenRequest), wantErrMsg)).Should(Succeed(), "Failed to deny access to service account token subresource in restricted namespace")
+	})
+})
diff --git a/test/e2e/placement_selecting_resources_test.go b/test/e2e/placement_selecting_resources_test.go
index 10d6a56c3..b12ffa399 100644
--- a/test/e2e/placement_selecting_resources_test.go
+++ b/test/e2e/placement_selecting_resources_test.go
@@ -1392,11 +1392,12 @@ var _ = Describe("creating CRP and checking selected resources order", Ordered,
 
 	It("should update CRP status with the correct order of the selected resources", func() {
 		// Define the expected resources in order
-		// Note: PVCs are not propagated, so they should not appear in selected resources
+		// Note: PVCs are propagated when enableWorkload=false (the default for E2E tests)
 		expectedResources := []placementv1beta1.ResourceIdentifier{
 			{Kind: "Namespace", Name: nsName, Version: "v1"},
 			{Kind: "Secret", Name: secret.Name, Namespace: nsName, Version: "v1"},
 			{Kind: "ConfigMap", Name: configMap.Name, Namespace: nsName, Version: "v1"},
+			{Kind: "PersistentVolumeClaim", Name: pvc.Name, Namespace: nsName, Version: "v1"},
 			{Group: "rbac.authorization.k8s.io", Kind: "Role", Name: role.Name, Namespace: nsName, Version: "v1"},
 		}
 
diff --git a/test/e2e/resource_placement_hub_workload_test.go b/test/e2e/resource_placement_hub_workload_test.go
index b088109ee..0e644d145 100644
--- a/test/e2e/resource_placement_hub_workload_test.go
+++ b/test/e2e/resource_placement_hub_workload_test.go
@@ -33,6 +33,9 @@ import (
 	placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1"
 )
 
+// This test requires enableWorkload=true because it creates workloads on the hub cluster
+// and expects them to become ready. With enableWorkload=false, the pod webhook blocks
+// pod creation on the hub cluster.
 var _ = Describe("placing workloads using a CRP with PickAll policy", Label("resourceplacement"), Ordered, func() {
 	crpName := fmt.Sprintf(crpNameTemplate, GinkgoParallelProcess())
 	var testDeployment appsv1.Deployment
@@ -41,6 +44,8 @@ var _ = Describe("placing workloads using a CRP with PickAll policy", Label("res
 	var testStatefulSet appsv1.StatefulSet
 
 	BeforeAll(func() {
+		Skip("This test requires enableWorkload=true to run workloads on the hub cluster")
+
 		// Read the test manifests
 		readDeploymentTestManifest(&testDeployment)
 		readDaemonSetTestManifest(&testDaemonSet)
diff --git a/test/e2e/resource_placement_selecting_resources_test.go b/test/e2e/resource_placement_selecting_resources_test.go
index 2e6966274..9737dd787 100644
--- a/test/e2e/resource_placement_selecting_resources_test.go
+++ b/test/e2e/resource_placement_selecting_resources_test.go
@@ -1057,10 +1057,11 @@ var _ = Describe("testing RP selecting resources", Label("resourceplacement"), f
 
 		It("should update RP status with the correct order of the selected resources", func() {
 			// Define the expected resources in order.
-			// Note: PVCs are not propagated, so they should not appear in selected resources
+			// Note: PVCs are propagated when enableWorkload=false (the default for E2E tests)
 			expectedResources := []placementv1beta1.ResourceIdentifier{
 				{Kind: "Secret", Name: secret.Name, Namespace: nsName, Version: "v1"},
 				{Kind: "ConfigMap", Name: configMap.Name, Namespace: nsName, Version: "v1"},
+				{Kind: "PersistentVolumeClaim", Name: pvc.Name, Namespace: nsName, Version: "v1"},
 				{Group: "rbac.authorization.k8s.io", Kind: "Role", Name: role.Name, Namespace: nsName, Version: "v1"},
 			}
 
diff --git a/test/e2e/resources_test.go b/test/e2e/resources_test.go
index bf1d15318..e631a8c3a 100644
--- a/test/e2e/resources_test.go
+++ b/test/e2e/resources_test.go
@@ -54,6 +54,7 @@ const (
 	clusterStagedUpdateRunNameWithSubIndexTemplate = "cur-%d-%d"
 	stagedUpdateRunStrategyNameTemplate            = "sus-%d"
 	stagedUpdateRunNameWithSubIndexTemplate        = "sur-%d-%d"
+	svcAccountNameTemplate                         = "sa-%d"
 
 	customDeletionBlockerFinalizer = "kubernetes-fleet.io/custom-deletion-blocker-finalizer"
 	workNamespaceLabelName         = "process"
diff --git a/test/e2e/utils_test.go b/test/e2e/utils_test.go
index 4f0432bcf..a0116847c 100644
--- a/test/e2e/utils_test.go
+++ b/test/e2e/utils_test.go
@@ -1749,7 +1749,7 @@ func checkIfStatusErrorWithMessage(err error, errorMsg string) error {
 			return nil
 		}
 	}
-	return fmt.Errorf("error message %s not found in error %w", errorMsg, err)
+	return fmt.Errorf("error message (%s) not found in given error (%w)", errorMsg, err)
 }
 
 // buildOwnerReference builds an owner reference given a cluster and a placement name.
diff --git a/tools/fleet/README.md b/tools/fleet/README.md
index 897978db3..c7405896d 100644
--- a/tools/fleet/README.md
+++ b/tools/fleet/README.md
@@ -72,25 +72,25 @@ Use the `approve` subcommand to approve approval request resources for staged up
 
 **Cluster-scoped (ClusterApprovalRequest):**
 ```bash
-kubectl fleet approve clusterapprovalrequest --hubClusterContext <hub-cluster-context> --name <approval-request-name>
+kubectl fleet approve clusterapprovalrequest --hub-cluster-context <hub-cluster-context> --name <approval-request-name>
 # Or using alias:
-kubectl fleet approve careq --hubClusterContext <hub-cluster-context> --name <approval-request-name>
+kubectl fleet approve careq --hub-cluster-context <hub-cluster-context> --name <approval-request-name>
 ```
 
 **Namespace-scoped (ApprovalRequest):**
 ```bash
-kubectl fleet approve approvalrequest --hubClusterContext <hub-cluster-context> --name <approval-request-name> --namespace <namespace>
+kubectl fleet approve approvalrequest --hub-cluster-context <hub-cluster-context> --name <approval-request-name> --namespace <namespace>
 # Or using alias:
-kubectl fleet approve areq --hubClusterContext <hub-cluster-context> --name <approval-request-name> -n <namespace>
+kubectl fleet approve areq --hub-cluster-context <hub-cluster-context> --name <approval-request-name> -n <namespace>
 ```
 
 Example:
 ```bash
 # Approve a ClusterApprovalRequest
-kubectl fleet approve clusterapprovalrequest --hubClusterContext hub --name my-approval-request
+kubectl fleet approve clusterapprovalrequest --hub-cluster-context hub --name my-approval-request
 
 # Approve an ApprovalRequest in a specific namespace
-kubectl fleet approve approvalrequest --hubClusterContext hub --name my-approval-request --namespace my-namespace
+kubectl fleet approve approvalrequest --hub-cluster-context hub --name my-approval-request --namespace my-namespace
 ```
 
 ### Drain a Member Cluster
@@ -98,12 +98,12 @@ kubectl fleet approve approvalrequest --hubClusterContext hub --name my-approval
 Use the `draincluster` subcommand to remove all resources propagated to a member cluster from the hub cluster by any `Placement` resource. This is useful when you want to temporarily move all workloads off a member cluster in preparation for an event like upgrade or reconfiguration.
 
 ```bash
-kubectl fleet draincluster --hubClusterContext <hub-cluster-context> --clusterName <memberClusterName>
+kubectl fleet draincluster --hub-cluster-context <hub-cluster-context> --cluster-name <memberClusterName>
 ```
 
 Example:
 ```bash
-kubectl fleet draincluster --hubClusterContext hub --clusterName member-cluster-1
+kubectl fleet draincluster --hub-cluster-context hub --cluster-name member-cluster-1
 ```
 
 ### Uncordon a Member Cluster
@@ -111,12 +111,12 @@ kubectl fleet draincluster --hubClusterContext hub --clusterName member-cluster-
 Use the `uncordoncluster` subcommand to uncordon a member cluster that has been previously drained, allowing resources to be propagated to the cluster again.
 
 ```bash
-kubectl fleet uncordoncluster --hubClusterContext <hub-cluster-context> --clusterName <memberClusterName>
+kubectl fleet uncordoncluster --hub-cluster-context <hub-cluster-context> --cluster-name <memberClusterName>
 ```
 
 Example:
 ```bash
-kubectl fleet uncordoncluster --hubClusterContext hub --clusterName member-cluster-1
+kubectl fleet uncordoncluster --hub-cluster-context hub --cluster-name member-cluster-1
 ```
 
 ## Subcommands
@@ -156,13 +156,13 @@ If the `cordon` taint is not present on the member cluster, the command will hav
 ## Flags
 
 The `approve` subcommand uses the following flags:
-- `--hubClusterContext`: kubectl context for the hub cluster (required)
+- `--hub-cluster-context`: kubectl context for the hub cluster (required)
 - `--name`: name of the resource to approve (required)
 - `--namespace`, `-n`: namespace of the resource to approve (required for `approvalrequest`, not allowed for `clusterapprovalrequest`)
 
 Both `draincluster` and `uncordoncluster` subcommands use the following flags:
-- `--hubClusterContext`: kubectl context for the hub cluster (required)
-- `--clusterName`: name of the member cluster to operate on (required)
+- `--hub-cluster-context`: kubectl context for the hub cluster (required)
+- `--cluster-name`: name of the member cluster to operate on (required)
 
 ## Examples
 
@@ -173,31 +173,31 @@ Both `draincluster` and `uncordoncluster` subcommands use the following flags:
 kubectl config get-contexts
 
 # 2. Drain a cluster for maintenance
-kubectl fleet draincluster --hubClusterContext production-hub --clusterName worker-node-1
+kubectl fleet draincluster --hub-cluster-context production-hub --cluster-name worker-node-1
 
 # 3. Perform maintenance on the worker-node-1 cluster
 # ... maintenance operations ...
 
 # 4. After maintenance, uncordon the cluster to allow workloads back
-kubectl fleet uncordoncluster --hubClusterContext production-hub --clusterName worker-node-1
+kubectl fleet uncordoncluster --hub-cluster-context production-hub --cluster-name worker-node-1
 ```
 
 ### Additional Examples
 
 ```bash
 # Approve a ClusterApprovalRequest for staged updates
-kubectl fleet approve clusterapprovalrequest --hubClusterContext hub --name update-approval-stage-1
+kubectl fleet approve clusterapprovalrequest --hub-cluster-context hub --name update-approval-stage-1
 
 # Approve a ApprovalRequest for staged updates
-kubectl fleet approve approvalrequest --hubClusterContext hub --name update-approval-stage-1 --namespace test-namespace
+kubectl fleet approve approvalrequest --hub-cluster-context hub --name update-approval-stage-1 --namespace test-namespace
 
 # Drain multiple clusters (run separately for each cluster)
-kubectl fleet draincluster --hubClusterContext hub --clusterName east-cluster
-kubectl fleet draincluster --hubClusterContext hub --clusterName west-cluster
+kubectl fleet draincluster --hub-cluster-context hub --cluster-name east-cluster
+kubectl fleet draincluster --hub-cluster-context hub --cluster-name west-cluster
 
 # Uncordon clusters after maintenance
-kubectl fleet uncordoncluster --hubClusterContext hub --clusterName east-cluster
-kubectl fleet uncordoncluster --hubClusterContext hub --clusterName west-cluster
+kubectl fleet uncordoncluster --hub-cluster-context hub --cluster-name east-cluster
+kubectl fleet uncordoncluster --hub-cluster-context hub --cluster-name west-cluster
 ```
 
 ## Troubleshooting
diff --git a/tools/fleet/cmd/approve/approve.go b/tools/fleet/cmd/approve/approve.go
index 7d216e5db..9406cf0c3 100644
--- a/tools/fleet/cmd/approve/approve.go
+++ b/tools/fleet/cmd/approve/approve.go
@@ -20,11 +20,11 @@ import (
 	"context"
 	"fmt"
 	"log"
+	"time"
 
 	"github.com/spf13/cobra"
 	"k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/util/retry"
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -87,6 +87,7 @@ type approveOptions struct {
 	hubClusterContext string
 	name              string
 	namespace         string
+	timeout           time.Duration
 
 	hubClient client.Client
 }
@@ -119,16 +120,19 @@ For namespace-scoped resources (approvalrequest), you must also specify the --na
 			if err := o.setupClient(); err != nil {
 				return err
 			}
-			return o.run(cmd.Context(), cfg)
+			ctx, cancel := context.WithTimeout(cmd.Context(), o.timeout)
+			defer cancel()
+			return o.run(ctx, cfg)
 		},
 	}
 
-	cmd.Flags().StringVar(&o.hubClusterContext, "hubClusterContext", "", "The name of the kubeconfig context to use for the hub cluster")
+	cmd.Flags().StringVar(&o.hubClusterContext, "hub-cluster-context", "", "The name of the kubeconfig context to use for the hub cluster")
 	cmd.Flags().StringVar(&o.name, "name", "", "The name of the resource to approve")
 	cmd.Flags().StringVarP(&o.namespace, "namespace", "n", "", "The namespace of the resource to approve (required for namespace-scoped resources)")
+	cmd.Flags().DurationVar(&o.timeout, "timeout", 5*time.Minute, "Maximum time to wait for the operation to complete")
 
 	// Mark required flags.
-	_ = cmd.MarkFlagRequired("hubClusterContext")
+	_ = cmd.MarkFlagRequired("hub-cluster-context")
 	_ = cmd.MarkFlagRequired("name")
 
 	return cmd
@@ -208,10 +212,9 @@ func (o *approveOptions) approveApprovalRequest(ctx context.Context) error {
 
 // setupClient creates and configures the Kubernetes client
 func (o *approveOptions) setupClient() error {
-	scheme := runtime.NewScheme()
-
-	if err := placementv1beta1.AddToScheme(scheme); err != nil {
-		return fmt.Errorf("failed to add custom APIs (placement) to the runtime scheme: %w", err)
+	scheme, err := toolsutils.NewFleetScheme()
+	if err != nil {
+		return fmt.Errorf("failed to create runtime scheme: %w", err)
 	}
 
 	hubClient, err := toolsutils.GetClusterClientFromClusterContext(o.hubClusterContext, scheme)
diff --git a/tools/fleet/cmd/draincluster/draincluster.go b/tools/fleet/cmd/draincluster/draincluster.go
index 5efd974dc..31b472165 100644
--- a/tools/fleet/cmd/draincluster/draincluster.go
+++ b/tools/fleet/cmd/draincluster/draincluster.go
@@ -20,11 +20,11 @@ import (
 	"context"
 	"fmt"
 	"log"
+	"time"
 
 	"github.com/spf13/cobra"
 	k8errors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/uuid"
 	"k8s.io/apimachinery/pkg/util/validation"
@@ -49,6 +49,7 @@ const (
 type drainOptions struct {
 	hubClusterContext string
 	clusterName       string
+	timeout           time.Duration
 
 	hubClient client.Client
 }
@@ -70,18 +71,20 @@ func NewCmdDrainCluster() *cobra.Command {
 	}
 
 	// Add flags specific to drain command
-	cmd.Flags().StringVar(&o.hubClusterContext, "hubClusterContext", "", "kubectl context for the hub cluster (required)")
-	cmd.Flags().StringVar(&o.clusterName, "clusterName", "", "name of the member cluster (required)")
+	cmd.Flags().StringVar(&o.hubClusterContext, "hub-cluster-context", "", "kubectl context for the hub cluster (required)")
+	cmd.Flags().StringVar(&o.clusterName, "cluster-name", "", "name of the member cluster (required)")
+	cmd.Flags().DurationVar(&o.timeout, "timeout", 5*time.Minute, "Maximum time to wait for the operation to complete")
 
 	// Mark required flags
-	_ = cmd.MarkFlagRequired("hubClusterContext")
-	_ = cmd.MarkFlagRequired("clusterName")
+	_ = cmd.MarkFlagRequired("hub-cluster-context")
+	_ = cmd.MarkFlagRequired("cluster-name")
 
 	return cmd
 }
 
 func (o *drainOptions) runDrain() error {
-	ctx := context.Background()
+	ctx, cancel := context.WithTimeout(context.Background(), o.timeout)
+	defer cancel()
 
 	isDrainSuccessful, err := o.drain(ctx)
 	if err != nil {
@@ -111,13 +114,9 @@ func (o *drainOptions) runDrain() error {
 
 // setupClient creates and configures the Kubernetes client
 func (o *drainOptions) setupClient() error {
-	scheme := runtime.NewScheme()
-
-	if err := clusterv1beta1.AddToScheme(scheme); err != nil {
-		return fmt.Errorf("failed to add custom APIs (cluster) to the runtime scheme: %w", err)
-	}
-	if err := placementv1beta1.AddToScheme(scheme); err != nil {
-		return fmt.Errorf("failed to add custom APIs (placement) to the runtime scheme: %w", err)
+	scheme, err := toolsutils.NewFleetScheme()
+	if err != nil {
+		return fmt.Errorf("failed to create runtime scheme: %w", err)
 	}
 
 	hubClient, err := toolsutils.GetClusterClientFromClusterContext(o.hubClusterContext, scheme)
diff --git a/tools/fleet/cmd/uncordoncluster/uncordoncluster.go b/tools/fleet/cmd/uncordoncluster/uncordoncluster.go
index a16a41d36..ad4a15ef9 100644
--- a/tools/fleet/cmd/uncordoncluster/uncordoncluster.go
+++ b/tools/fleet/cmd/uncordoncluster/uncordoncluster.go
@@ -20,15 +20,14 @@ import (
 	"context"
 	"fmt"
 	"log"
+	"time"
 
 	"github.com/spf13/cobra"
-	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/util/retry"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
 	clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1"
-	placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1"
 	toolsutils "go.goms.io/fleet/tools/utils"
 )
 
@@ -36,6 +35,7 @@ import (
 type uncordonOptions struct {
 	hubClusterContext string
 	clusterName       string
+	timeout           time.Duration
 
 	hubClient client.Client
 }
@@ -57,18 +57,21 @@ func NewCmdUncordonCluster() *cobra.Command {
 	}
 
 	// Add flags specific to uncordon command
-	cmd.Flags().StringVar(&o.hubClusterContext, "hubClusterContext", "", "kubectl context for the hub cluster (required)")
-	cmd.Flags().StringVar(&o.clusterName, "clusterName", "", "name of the member cluster (required)")
+	cmd.Flags().StringVar(&o.hubClusterContext, "hub-cluster-context", "", "kubectl context for the hub cluster (required)")
+	cmd.Flags().StringVar(&o.clusterName, "cluster-name", "", "name of the member cluster (required)")
+	cmd.Flags().DurationVar(&o.timeout, "timeout", 5*time.Minute, "Maximum time to wait for the operation to complete")
 
 	// Mark required flags
-	_ = cmd.MarkFlagRequired("hubClusterContext")
-	_ = cmd.MarkFlagRequired("clusterName")
+	_ = cmd.MarkFlagRequired("hub-cluster-context")
+	_ = cmd.MarkFlagRequired("cluster-name")
 
 	return cmd
 }
 
 func (o *uncordonOptions) runUncordon() error {
-	ctx := context.Background()
+	ctx, cancel := context.WithTimeout(context.Background(), o.timeout)
+	defer cancel()
+
 	if err := o.uncordon(ctx); err != nil {
 		return fmt.Errorf("failed to uncordon cluster %s: %w", o.clusterName, err)
 	}
@@ -79,13 +82,9 @@ func (o *uncordonOptions) runUncordon() error {
 
 // setupClient creates and configures the Kubernetes client
 func (o *uncordonOptions) setupClient() error {
-	scheme := runtime.NewScheme()
-
-	if err := clusterv1beta1.AddToScheme(scheme); err != nil {
-		return fmt.Errorf("failed to add custom APIs (cluster) to the runtime scheme: %w", err)
-	}
-	if err := placementv1beta1.AddToScheme(scheme); err != nil {
-		return fmt.Errorf("failed to add custom APIs (placement) to the runtime scheme: %w", err)
+	scheme, err := toolsutils.NewFleetScheme()
+	if err != nil {
+		return fmt.Errorf("failed to create runtime scheme: %w", err)
 	}
 
 	hubClient, err := toolsutils.GetClusterClientFromClusterContext(o.hubClusterContext, scheme)
diff --git a/tools/fleet/main.go b/tools/fleet/main.go
index 989eee78c..6fdcadf7e 100644
--- a/tools/fleet/main.go
+++ b/tools/fleet/main.go
@@ -17,6 +17,7 @@ limitations under the License.
 package main
 
 import (
+	"fmt"
 	"log"
 
 	"github.com/spf13/cobra"
@@ -26,6 +27,12 @@ import (
 	"go.goms.io/fleet/tools/fleet/cmd/uncordoncluster"
 )
 
+// These variables are set via ldflags at build time.
+var (
+	version = "dev"
+	commit  = "unknown"
+)
+
 func main() {
 	rootCmd := &cobra.Command{
 		Use:   "kubectl-fleet",
@@ -33,6 +40,15 @@ func main() {
 		Long:  "kubectl-fleet is a kubectl plugin for managing KubeFleet member clusters",
 	}
 
+	// Add version subcommand
+	rootCmd.AddCommand(&cobra.Command{
+		Use:   "version",
+		Short: "Print the version information",
+		Run: func(cmd *cobra.Command, args []string) {
+			fmt.Printf("kubectl-fleet %s (%s)\n", version, commit)
+		},
+	})
+
 	// Add subcommands
 	rootCmd.AddCommand(approve.NewCmdApprove())
 	rootCmd.AddCommand(draincluster.NewCmdDrainCluster())
diff --git a/tools/utils/common.go b/tools/utils/common.go
index 23f68c852..98c14ef5a 100644
--- a/tools/utils/common.go
+++ b/tools/utils/common.go
@@ -13,6 +13,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
 	clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1"
+	placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1"
 )
 
 var (
@@ -24,6 +25,18 @@ var (
 	}
 )
 
+// NewFleetScheme creates a runtime.Scheme with all Fleet API types registered.
+func NewFleetScheme() (*runtime.Scheme, error) {
+	scheme := runtime.NewScheme()
+	if err := clusterv1beta1.AddToScheme(scheme); err != nil {
+		return nil, err
+	}
+	if err := placementv1beta1.AddToScheme(scheme); err != nil {
+		return nil, err
+	}
+	return scheme, nil
+}
+
 // GetClusterClientFromClusterContext creates a new client.Client for the given cluster context and scheme.
 func GetClusterClientFromClusterContext(clusterContext string, scheme *runtime.Scheme) (client.Client, error) {
 	clusterConfig := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(