From 5c572cf12eced2407a869bd00c71d9d74777bb09 Mon Sep 17 00:00:00 2001 From: Wantong Date: Tue, 28 Apr 2026 15:17:33 -0700 Subject: [PATCH 01/19] fix: add pod/rs guardrail check consistently (#682) --- pkg/webhook/webhook.go | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index 82710a39c..2fe915f79 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -641,12 +641,9 @@ func (w *Config) buildFleetGuardRailValidatingWebhooks() []admv1.ValidatingWebho // Build core v1 resources list, conditionally including pods if workload is enabled coreV1Resources := []string{bindingResourceName, configMapResourceName, endPointResourceName, - limitRangeResourceName, persistentVolumeClaimsName, persistentVolumeClaimsName + "/status", podTemplateResourceName, + limitRangeResourceName, persistentVolumeClaimsName, persistentVolumeClaimsName + "/status", podTemplateResourceName, podResourceName, podResourceName + "/status", replicationControllerResourceName, replicationControllerResourceName + "/status", resourceQuotaResourceName, resourceQuotaResourceName + "/status", secretResourceName, serviceAccountResourceName, servicesResourceName, servicesResourceName + "/status"} - if w.enableWorkload { - coreV1Resources = append(coreV1Resources, podResourceName, podResourceName+"/status") - } namespacedResourcesRules = append(namespacedResourcesRules, admv1.RuleWithOperations{ Operations: cuOperations, @@ -654,11 +651,8 @@ func (w *Config) buildFleetGuardRailValidatingWebhooks() []admv1.ValidatingWebho }) // Build apps/v1 resources list, conditionally including replicasets if workload is enabled - appsV1Resources := []string{controllerRevisionResourceName, daemonSetResourceName, daemonSetResourceName + "/status", + appsV1Resources := []string{controllerRevisionResourceName, daemonSetResourceName, daemonSetResourceName + "/status", replicaSetResourceName, replicaSetResourceName + "/status", deploymentResourceName, deploymentResourceName + "/status", statefulSetResourceName, statefulSetResourceName + "/status"} - if w.enableWorkload { - appsV1Resources = append(appsV1Resources, replicaSetResourceName, replicaSetResourceName+"/status") - } namespacedResourcesRules = append(namespacedResourcesRules, admv1.RuleWithOperations{ Operations: cuOperations, From 34ef3309fadba18198f9adfd0af5dae46c4ac866 Mon Sep 17 00:00:00 2001 From: Arvind Thirumurugan Date: Tue, 28 Apr 2026 17:37:10 -0700 Subject: [PATCH 02/19] chore: update token writer file permission (#675) --- docker/refresh-token.Dockerfile | 2 + pkg/authtoken/token_writer.go | 8 ++- pkg/utils/writefile/writefile.go | 34 ++++++++++ pkg/utils/writefile/writefile_test.go | 89 +++++++++++++++++++++++++++ pkg/webhook/webhook.go | 5 +- 5 files changed, 134 insertions(+), 4 deletions(-) create mode 100644 pkg/utils/writefile/writefile.go create mode 100644 pkg/utils/writefile/writefile_test.go diff --git a/docker/refresh-token.Dockerfile b/docker/refresh-token.Dockerfile index 20da08c80..22b99f038 100644 --- a/docker/refresh-token.Dockerfile +++ b/docker/refresh-token.Dockerfile @@ -15,6 +15,8 @@ RUN go mod download # Copy the go source COPY cmd/authtoken/main.go main.go COPY pkg/authtoken pkg/authtoken +# writefile is a dependency of pkg/authtoken for secure file creation (0600 permissions) +COPY pkg/utils/writefile pkg/utils/writefile ARG TARGETARCH diff --git a/pkg/authtoken/token_writer.go b/pkg/authtoken/token_writer.go index 48b965f3f..66591f715 100644 --- a/pkg/authtoken/token_writer.go +++ b/pkg/authtoken/token_writer.go @@ -18,9 +18,10 @@ package authtoken import ( "fmt" "io" - "os" "k8s.io/klog/v2" + + "github.com/kubefleet-dev/kubefleet/pkg/utils/writefile" ) type Factory struct { @@ -31,8 +32,11 @@ func NewFactory(filePath string) Factory { return Factory{filePath: filePath} } +// Create opens the token file for writing using CreateSecureFile. +// The member-agent container runs as the same UID and reads this file via client-go's +// BearerTokenFile (O_RDONLY), so owner permission is sufficient for both writer and reader. func (w Factory) Create() (io.WriteCloser, error) { - wc, err := os.Create(w.filePath) + wc, err := writefile.CreateSecureFile(w.filePath) if err != nil { return nil, err } diff --git a/pkg/utils/writefile/writefile.go b/pkg/utils/writefile/writefile.go new file mode 100644 index 000000000..f161bfeca --- /dev/null +++ b/pkg/utils/writefile/writefile.go @@ -0,0 +1,34 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package writefile + +import ( + "os" + "path/filepath" +) + +// CreateSecureFile creates or truncates a file with owner-only permissions (0600). +// The path is sanitized with filepath.Clean to resolve any relative or redundant elements. +// +// - O_WRONLY: write-only, since callers only need to write sensitive data. +// - O_CREATE: creates the file if it does not exist. +// - O_TRUNC: empties the file before writing so that no leftover bytes from a +// previous (possibly longer) write remain in the file. +// - 0600: owner read/write only, preventing other users from accessing the file. +func CreateSecureFile(path string) (*os.File, error) { + return os.OpenFile(filepath.Clean(path), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0600) +} diff --git a/pkg/utils/writefile/writefile_test.go b/pkg/utils/writefile/writefile_test.go new file mode 100644 index 000000000..7d389ac66 --- /dev/null +++ b/pkg/utils/writefile/writefile_test.go @@ -0,0 +1,89 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package writefile + +import ( + "io" + "os" + "path/filepath" + "testing" + + "github.com/google/go-cmp/cmp" +) + +func TestCreateSecureFile(t *testing.T) { + tests := []struct { + name string + oldContent string + writeContent string + }{ + { + name: "creates new file with restricted permissions", + writeContent: "sensitive-data", + }, + { + name: "truncates longer existing content", + oldContent: "this-is-a-very-long-existing-content-that-should-be-truncated", + writeContent: "short", + }, + { + name: "overwrites shorter existing content", + oldContent: "short", + writeContent: "a-much-longer-replacement-content", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + dir := t.TempDir() + filePath := filepath.Join(dir, "secure-file") + + writeAndClose := func(content string) { + f, err := CreateSecureFile(filePath) + if err != nil { + t.Fatalf("CreateSecureFile(%q) returned error: %v", filePath, err) + } + if _, err := io.WriteString(f, content); err != nil { + t.Fatalf("io.WriteString() returned error: %v", err) + } + f.Close() + } + + if tc.oldContent != "" { + writeAndClose(tc.oldContent) + } + + writeAndClose(tc.writeContent) + + got, err := os.ReadFile(filePath) + if err != nil { + t.Fatalf("os.ReadFile(%q) returned error: %v", filePath, err) + } + if diff := cmp.Diff(tc.writeContent, string(got)); diff != "" { + t.Errorf("CreateSecureFile() file content mismatch (-want +got):\n%s", diff) + } + + info, err := os.Stat(filePath) + if err != nil { + t.Fatalf("os.Stat(%q) returned error: %v", filePath, err) + } + if gotPerm := info.Mode().Perm(); gotPerm != 0600 { + t.Errorf("CreateSecureFile() file permission = %o, want %o", gotPerm, 0600) + } + }) + } +} diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index 2fe915f79..e3bf5beb9 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -55,6 +55,7 @@ import ( clusterv1beta1 "github.com/kubefleet-dev/kubefleet/apis/cluster/v1beta1" placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" "github.com/kubefleet-dev/kubefleet/cmd/hubagent/options" + "github.com/kubefleet-dev/kubefleet/pkg/utils/writefile" "github.com/kubefleet-dev/kubefleet/pkg/webhook/clusterresourceoverride" "github.com/kubefleet-dev/kubefleet/pkg/webhook/clusterresourceplacement" "github.com/kubefleet-dev/kubefleet/pkg/webhook/clusterresourceplacementdisruptionbudget" @@ -936,7 +937,7 @@ func genCertAndKeyFile(certData, keyData []byte, certDir string) error { return fmt.Errorf("could not create directory %q to store certificates: %w", certDir, err) } certPath := filepath.Join(certDir, fleetWebhookCertFileName) - f, err := os.OpenFile(filepath.Clean(certPath), os.O_CREATE|os.O_TRUNC|os.O_RDWR, 0600) + f, err := writefile.CreateSecureFile(certPath) if err != nil { return fmt.Errorf("could not open %q: %w", certPath, err) } @@ -950,7 +951,7 @@ func genCertAndKeyFile(certData, keyData []byte, certDir string) error { } keyPath := filepath.Join(certDir, fleetWebhookKeyFileName) - kf, err := os.OpenFile(filepath.Clean(keyPath), os.O_CREATE|os.O_TRUNC|os.O_RDWR, 0600) + kf, err := writefile.CreateSecureFile(keyPath) if err != nil { return fmt.Errorf("could not open %q: %w", keyPath, err) } From 5355ec8e7407f337ab83d88e42ee98c6981e15aa Mon Sep 17 00:00:00 2001 From: Akash Kumar <91385321+AkashKumar7902@users.noreply.github.com> Date: Wed, 29 Apr 2026 18:56:25 +0530 Subject: [PATCH 03/19] docs: clarify hub-agent cert secret default (#673) Signed-off-by: Akash Kumar --- charts/hub-agent/README.md | 12 +++++++----- charts/hub-agent/values.yaml | 1 + 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/charts/hub-agent/README.md b/charts/hub-agent/README.md index b49e1bd15..9582dda49 100644 --- a/charts/hub-agent/README.md +++ b/charts/hub-agent/README.md @@ -164,7 +164,8 @@ helm install hub-agent oci://ghcr.io/kubefleet-dev/kubefleet/charts/hub-agent \ --create-namespace \ --set useCertManager=true \ --set enableWorkload=true \ - --set enableWebhook=true + --set enableWebhook=true \ + --set webhookCertSecretName=fleet-webhook-server-cert # Or using traditional repository helm install hub-agent kubefleet/hub-agent \ @@ -172,12 +173,13 @@ helm install hub-agent kubefleet/hub-agent \ --create-namespace \ --set useCertManager=true \ --set enableWorkload=true \ - --set enableWebhook=true + --set enableWebhook=true \ + --set webhookCertSecretName=fleet-webhook-server-cert ``` The `webhookCertSecretName` parameter specifies the Secret name for the certificate: -- Default: `fleet-webhook-server-cert` -- When using cert-manager, this is where cert-manager stores the certificate +- Default: `unset`; there is no default in `values.yaml` +- When using cert-manager, set this to the Secret where cert-manager stores the certificate - Must match the secret name referenced in the deployment volume mount Example with custom secret name: @@ -198,4 +200,4 @@ helm install hub-agent kubefleet/hub-agent \ --set useCertManager=true \ --set enableWorkload=true \ --set webhookCertSecretName=my-webhook-secret -``` \ No newline at end of file +``` diff --git a/charts/hub-agent/values.yaml b/charts/hub-agent/values.yaml index e87061b58..4964cba84 100644 --- a/charts/hub-agent/values.yaml +++ b/charts/hub-agent/values.yaml @@ -23,6 +23,7 @@ enableWorkload: false useCertManager: false # webhookCertSecretName is ONLY used when useCertManager=true # It specifies the name of the Secret where cert-manager stores the certificate +# Example: # webhookCertSecretName: fleet-webhook-server-cert forceDeleteWaitTime: 15m0s From 81ecb5bb0d09c7301b3643c93d3311ff117fdf98 Mon Sep 17 00:00:00 2001 From: Yetkin Timocin Date: Wed, 29 Apr 2026 10:20:19 -0700 Subject: [PATCH 04/19] fix: deterministic envelope Work names to prevent duplicates (#665) * fix: deterministic envelope Work names to prevent duplicates Envelope Work names were generated with a fresh uuid.NewUUID() on every Create, so two back-to-back reconciles of the same binding with a stale informer-cache List could each build a Work with different names and both succeed. The `len > 1` branch then returned UnexpectedBehaviorError without any recovery, leaving the binding permanently stuck in WorkSynchronized=False/SyncWorkFailed and the placement never converging on that cluster. buildNewWorkForEnvelopeCR now derives the name suffix from a truncated SHA-256 hash of (envelope type, namespace, name). Two concurrent Creates for the same (binding, envelope) produce the same name and collide at the API server; the existing AlreadyExists handling in upsertWork requeues into the len == 1 update branch. Auto-deletion on detection of multiple Works is deliberately avoided: the member agent may have already applied both Works, so deleting one would trigger resource fluctuation on the member side. When the `len > 1` state is still observed (possible only in environments that were stuck before this change), the controller now emits a Warning Event on the binding (reason=DuplicateEnvelopeWorks) and a clear log message instructing the operator to delete all but the oldest Work in the affected namespace. Fixes #630 Signed-off-by: Yetkin Timocin * Addressing feedback Signed-off-by: Yetkin Timocin --------- Signed-off-by: Yetkin Timocin --- pkg/controllers/workgenerator/envelope.go | 49 ++++- .../workgenerator/envelope_test.go | 192 +++++++++++++++++- 2 files changed, 224 insertions(+), 17 deletions(-) diff --git a/pkg/controllers/workgenerator/envelope.go b/pkg/controllers/workgenerator/envelope.go index 24cff56cd..95453274b 100644 --- a/pkg/controllers/workgenerator/envelope.go +++ b/pkg/controllers/workgenerator/envelope.go @@ -18,13 +18,15 @@ package workgenerator import ( "context" + "crypto/sha256" + "encoding/hex" "fmt" "sort" "strings" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/util/uuid" "k8s.io/klog/v2" "sigs.k8s.io/controller-runtime/pkg/client" @@ -87,18 +89,27 @@ func (r *Reconciler) createOrUpdateEnvelopeCRWorkObj( var work *fleetv1beta1.Work switch { case len(workList.Items) > 1: - // Multiple matching work objects found; this should never occur under normal conditions. - wrappedErr := fmt.Errorf("%d work objects found for the same envelope %v, only one expected", len(workList.Items), envelopeReader.GetEnvelopeObjRef()) - klog.ErrorS(wrappedErr, "Failed to create or update work object for envelope", - "resourceBinding", klog.KObj(binding), - "resourceSnapshot", klog.KObj(resourceSnapshot), - "envelope", envelopeReader.GetEnvelopeObjRef()) - // Log the work object names to help debug. + // Multiple matching work objects found. This can only happen in environments that + // were stuck on this state *before* deterministic Work naming was introduced in + // buildNewWorkForEnvelopeCR; newly-raced reconciles are now prevented by the + // API server's name-uniqueness check. We deliberately do NOT auto-delete the + // extras: both Works may have been applied by the member agent, and deleting one + // would trigger member-side resource cleanup and user-visible fluctuation on + // resources the survivor still owns. Surface the state for operator cleanup + // (manually delete all but the oldest Work in the affected namespace). workNames := make([]string, len(workList.Items)) for i := range workList.Items { workNames[i] = workList.Items[i].Name } - klog.ErrorS(wrappedErr, "Duplicate work objects found", "works", workNames) + wrappedErr := fmt.Errorf("%d work objects found for the same envelope %v, only one expected", len(workList.Items), envelopeReader.GetEnvelopeObjRef()) + klog.ErrorS(wrappedErr, "Duplicate envelope Work objects found; manual cleanup required — delete all but the oldest Work in the namespace", + "works", workNames, + "resourceBinding", klog.KObj(binding), + "resourceSnapshot", klog.KObj(resourceSnapshot), + "envelope", envelopeReader.GetEnvelopeObjRef()) + r.recorder.Eventf(binding, corev1.EventTypeWarning, "DuplicateEnvelopeWorks", + "Multiple Work objects (%v) found for envelope %v in namespace %s; delete all but the oldest to recover", + workNames, envelopeReader.GetEnvelopeObjRef(), fmt.Sprintf(utils.NamespaceNameFormat, binding.GetBindingSpec().TargetCluster)) return nil, controller.NewUnexpectedBehaviorError(wrappedErr) case len(workList.Items) == 1: klog.V(2).InfoS("Found existing work object for the envelope; updating it", @@ -200,6 +211,24 @@ func refreshWorkForEnvelopeCR( work.Spec.ApplyStrategy = resourceBinding.GetBindingSpec().ApplyStrategy } +// envelopeWorkNameSuffix returns a deterministic suffix for the envelope Work's name, +// derived from the envelope's identity. Using a deterministic suffix (instead of a random +// UUID) means two concurrent Create attempts for the same envelope collide on name at the +// API server, which enforces the "one Work per (binding, envelope)" invariant server-side. +// The hash is truncated to 8 bytes of SHA-256 (rendered as 16 hex characters), keeping +// the overall Work name well under the DNS-1123 subdomain limit; collision probability +// at the scale of a single binding's envelope set is ~2^-64. +func envelopeWorkNameSuffix(envelopeReader fleetv1beta1.EnvelopeReader) string { + // NOTE: ClusterResourceEnvelope has an empty namespace; ResourceEnvelope has a namespace. + // Including the type disambiguates an unlikely name collision across kinds. + identity := fmt.Sprintf("%s/%s/%s", + envelopeReader.GetEnvelopeType(), + envelopeReader.GetNamespace(), + envelopeReader.GetName()) + sum := sha256.Sum256([]byte(identity)) + return hex.EncodeToString(sum[:8]) +} + func buildNewWorkForEnvelopeCR( workNamePrefix string, resourceBinding fleetv1beta1.BindingObj, @@ -208,7 +237,7 @@ func buildNewWorkForEnvelopeCR( manifests []fleetv1beta1.Manifest, resourceOverrideSnapshotHash, clusterResourceOverrideSnapshotHash string, ) *fleetv1beta1.Work { - workName := fmt.Sprintf(fleetv1beta1.WorkNameWithEnvelopeCRFmt, workNamePrefix, uuid.NewUUID()) + workName := fmt.Sprintf(fleetv1beta1.WorkNameWithEnvelopeCRFmt, workNamePrefix, envelopeWorkNameSuffix(envelopeReader)) workNamespace := fmt.Sprintf(utils.NamespaceNameFormat, resourceBinding.GetBindingSpec().TargetCluster) // Create the labels map diff --git a/pkg/controllers/workgenerator/envelope_test.go b/pkg/controllers/workgenerator/envelope_test.go index f262766f3..a3283c482 100644 --- a/pkg/controllers/workgenerator/envelope_test.go +++ b/pkg/controllers/workgenerator/envelope_test.go @@ -17,8 +17,11 @@ limitations under the License. package workgenerator import ( + "context" "encoding/json" + "errors" "fmt" + "strings" "testing" "github.com/google/go-cmp/cmp" @@ -32,9 +35,12 @@ import ( fleetv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" "github.com/kubefleet-dev/kubefleet/pkg/utils" + "github.com/kubefleet-dev/kubefleet/pkg/utils/controller" "github.com/kubefleet-dev/kubefleet/test/utils/informer" ) +const testWorkNamePrefix = "test-work" + func TestExtractManifestsFromEnvelopeCR(t *testing.T) { tests := []struct { name string @@ -255,8 +261,6 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) { ignoreWorkMeta := cmpopts.IgnoreFields(metav1.ObjectMeta{}, "Name", "OwnerReferences") scheme := serviceScheme(t) - workNamePrefix := "test-work" - resourceSnapshot := &fleetv1beta1.ClusterResourceSnapshot{ ObjectMeta: metav1.ObjectMeta{ Name: "test-snapshot", @@ -306,7 +310,7 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) { // Create an existing work for update test existingWork := &fleetv1beta1.Work{ ObjectMeta: metav1.ObjectMeta{ - Name: workNamePrefix, + Name: testWorkNamePrefix, Namespace: "fleet-member-test-cluster-1", Labels: map[string]string{ fleetv1beta1.ParentBindingLabel: resourceBinding.Name, @@ -483,7 +487,13 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) { wantErr: true, }, { - name: "two existing works should result in error", + // Duplicate envelope Works indicate an environment that was stuck before + // deterministic naming was introduced. The controller surfaces the state + // (error + Event) but does NOT auto-delete, because deleting a Work the + // member agent has already applied would trigger resource fluctuation on + // the member side. Full post-conditions are verified in + // TestCreateOrUpdateEnvelopeCRWorkObj_DuplicateWorksSurfaceWithoutMutation. + name: "two existing works surface an UnexpectedBehaviorError without mutation", envelopeReader: resourceEnvelope, resourceOverrideSnapshotHash: "new-resource-hash", clusterResourceOverrideSnapshotHash: "new-cluster-resource-hash", @@ -512,7 +522,7 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) { } // Call the function under test - got, err := r.createOrUpdateEnvelopeCRWorkObj(ctx, tt.envelopeReader, workNamePrefix, + got, err := r.createOrUpdateEnvelopeCRWorkObj(ctx, tt.envelopeReader, testWorkNamePrefix, resourceBinding, resourceSnapshot, tt.resourceOverrideSnapshotHash, tt.clusterResourceOverrideSnapshotHash) if (err != nil) != tt.wantErr { @@ -532,7 +542,6 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) { func TestProcessOneSelectedResource(t *testing.T) { scheme := serviceScheme(t) - workNamePrefix := "test-work" resourceBinding := &fleetv1beta1.ClusterResourceBinding{ ObjectMeta: metav1.ObjectMeta{ Name: "test-binding", @@ -685,7 +694,7 @@ func TestProcessOneSelectedResource(t *testing.T) { tt.selectedResource, resourceBinding, snapshot, - workNamePrefix, + testWorkNamePrefix, tt.resourceOverrideSnapshotHash, tt.clusterResourceOverrideSnapshotHash, activeWork, @@ -725,3 +734,172 @@ func createResourceContent(t *testing.T, obj runtime.Object) *fleetv1beta1.Resou }, } } + +// TestEnvelopeWorkNameSuffix verifies that the suffix is a stable, length-bounded function +// of the envelope's identity. Determinism is what lets the API server enforce the +// "one Work per (binding, envelope)" invariant via name uniqueness. +func TestEnvelopeWorkNameSuffix(t *testing.T) { + base := &fleetv1beta1.ResourceEnvelope{ + ObjectMeta: metav1.ObjectMeta{Name: "env-a", Namespace: "ns-1"}, + } + + tests := []struct { + name string + other fleetv1beta1.EnvelopeReader + wantEqual bool // true → suffix must match base; false → suffix must differ + }{ + { + name: "same identity", + other: &fleetv1beta1.ResourceEnvelope{ObjectMeta: metav1.ObjectMeta{Name: "env-a", Namespace: "ns-1"}}, + wantEqual: true, + }, + { + name: "different name", + other: &fleetv1beta1.ResourceEnvelope{ObjectMeta: metav1.ObjectMeta{Name: "env-b", Namespace: "ns-1"}}, + wantEqual: false, + }, + { + name: "different namespace", + other: &fleetv1beta1.ResourceEnvelope{ObjectMeta: metav1.ObjectMeta{Name: "env-a", Namespace: "ns-2"}}, + wantEqual: false, + }, + { + // Same name as base, but cluster-scoped — the type field in the hash input disambiguates. + name: "different type, same name", + other: &fleetv1beta1.ClusterResourceEnvelope{ObjectMeta: metav1.ObjectMeta{Name: "env-a"}}, + wantEqual: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotBase := envelopeWorkNameSuffix(base) + gotOther := envelopeWorkNameSuffix(tt.other) + if tt.wantEqual && gotBase != gotOther { + t.Errorf("envelopeWorkNameSuffix(%v) = %q, envelopeWorkNameSuffix(%v) = %q, want equal", base, gotBase, tt.other, gotOther) + } + if !tt.wantEqual && gotBase == gotOther { + t.Errorf("envelopeWorkNameSuffix(%v) = envelopeWorkNameSuffix(%v) = %q, want different", base, tt.other, gotBase) + } + }) + } + + // Suffix is bounded and DNS-1123-safe (lowercase hex). + suffix := envelopeWorkNameSuffix(base) + if len(suffix) != 16 { + t.Errorf("envelopeWorkNameSuffix length = %d, want 16", len(suffix)) + } + if strings.ToLower(suffix) != suffix { + t.Errorf("envelopeWorkNameSuffix(%v) = %q, want lowercase", base, suffix) + } +} + +// TestCreateOrUpdateEnvelopeCRWorkObj_DuplicateWorksSurfaceWithoutMutation verifies that +// when the label query returns multiple Works for the same (binding, envelope), the +// controller surfaces the condition (UnexpectedBehaviorError + Event) but does NOT +// mutate cluster state. Auto-deleting duplicates is unsafe because the member agent +// may have applied them; deleting one triggers resource cleanup that conflicts with +// the survivor. Deterministic naming (see buildNewWorkForEnvelopeCR) prevents new +// duplicates; pre-existing ones require operator cleanup. +func TestCreateOrUpdateEnvelopeCRWorkObj_DuplicateWorksSurfaceWithoutMutation(t *testing.T) { + scheme := serviceScheme(t) + ctx := context.Background() + + resourceSnapshot := &fleetv1beta1.ClusterResourceSnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-snapshot", + Labels: map[string]string{fleetv1beta1.PlacementTrackingLabel: "test-crp"}, + }, + } + resourceBinding := &fleetv1beta1.ClusterResourceBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-binding", + Labels: map[string]string{fleetv1beta1.PlacementTrackingLabel: "test-crp"}, + }, + Spec: fleetv1beta1.ResourceBindingSpec{ + TargetCluster: "test-cluster-1", + ResourceSnapshotName: resourceSnapshot.Name, + }, + } + resourceEnvelope := &fleetv1beta1.ResourceEnvelope{ + ObjectMeta: metav1.ObjectMeta{Name: "dup-envelope", Namespace: "default"}, + Data: map[string]runtime.RawExtension{ + "configmap": {Raw: []byte(`{"apiVersion":"v1","kind":"ConfigMap","metadata":{"name":"cm","namespace":"default"}}`)}, + }, + } + + workNamespace := fmt.Sprintf(utils.NamespaceNameFormat, resourceBinding.Spec.TargetCluster) + labelsForWork := map[string]string{ + fleetv1beta1.ParentBindingLabel: resourceBinding.Name, + fleetv1beta1.PlacementTrackingLabel: resourceBinding.Labels[fleetv1beta1.PlacementTrackingLabel], + fleetv1beta1.EnvelopeTypeLabel: string(fleetv1beta1.ResourceEnvelopeType), + fleetv1beta1.EnvelopeNameLabel: resourceEnvelope.Name, + fleetv1beta1.EnvelopeNamespaceLabel: resourceEnvelope.Namespace, + } + dupNames := []string{"dup-envelope-a", "dup-envelope-b", "dup-envelope-c"} + mkWork := func(name string) *fleetv1beta1.Work { + return &fleetv1beta1.Work{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: workNamespace, + Labels: labelsForWork, + }, + } + } + objs := make([]client.Object, 0, len(dupNames)) + for _, n := range dupNames { + objs = append(objs, mkWork(n)) + } + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(objs...). + Build() + + recorder := record.NewFakeRecorder(10) + r := &Reconciler{ + Client: fakeClient, + recorder: recorder, + InformerManager: &informer.FakeManager{}, + } + + got, err := r.createOrUpdateEnvelopeCRWorkObj(ctx, resourceEnvelope, testWorkNamePrefix, + resourceBinding, resourceSnapshot, "", "") + + if got != nil { + t.Errorf("createOrUpdateEnvelopeCRWorkObj() = %v, want nil on duplicate-detected path", got) + } + if err == nil { + t.Fatalf("createOrUpdateEnvelopeCRWorkObj() error = nil, want UnexpectedBehaviorError") + } + if !errors.Is(err, controller.ErrUnexpectedBehavior) { + t.Errorf("createOrUpdateEnvelopeCRWorkObj() error = %v, want wrapping controller.ErrUnexpectedBehavior", err) + } + + // Post-condition: all duplicates remain untouched — no auto-deletion. + remaining := &fleetv1beta1.WorkList{} + if err := fakeClient.List(ctx, remaining, client.InNamespace(workNamespace)); err != nil { + t.Fatalf("List Works in %q: %v", workNamespace, err) + } + sortStrings := cmpopts.SortSlices(func(a, b string) bool { return a < b }) + if diff := cmp.Diff(dupNames, workNames(remaining.Items), sortStrings); diff != "" { + t.Errorf("Works after call mismatch (-want +got):\n%s", diff) + } + + // Post-condition: a Warning Event was emitted so operators can see the stuck state. + select { + case ev := <-recorder.Events: + if !strings.Contains(ev, "DuplicateEnvelopeWorks") { + t.Errorf("event = %q, want reason DuplicateEnvelopeWorks", ev) + } + default: + t.Error("no Event emitted for duplicate envelope Works; operators would have no surface signal") + } +} + +func workNames(items []fleetv1beta1.Work) []string { + out := make([]string, len(items)) + for i := range items { + out[i] = items[i].Name + } + return out +} From abfa3ef4bfa8f5b267b801a4205fa37d2a731d76 Mon Sep 17 00:00:00 2001 From: Yetkin Timocin Date: Wed, 29 Apr 2026 10:21:17 -0700 Subject: [PATCH 05/19] fix: member agent validate opts, fix klog.ErrorS, and kubectl-fleet CLI improvements (#575) Signed-off-by: Yetkin Timocin --- Makefile | 1 + cmd/memberagent/main.go | 12 +++++- test/e2e/cluster_staged_updaterun_test.go | 2 +- test/e2e/drain_tool_test.go | 8 ++-- tools/fleet/README.md | 42 +++++++++---------- tools/fleet/cmd/approve/approve.go | 19 +++++---- tools/fleet/cmd/draincluster/draincluster.go | 25 ++++++----- .../cmd/uncordoncluster/uncordoncluster.go | 27 ++++++------ tools/fleet/main.go | 16 +++++++ tools/utils/common.go | 13 ++++++ 10 files changed, 102 insertions(+), 63 deletions(-) diff --git a/Makefile b/Makefile index 4e4ffa70a..210ed1a6e 100644 --- a/Makefile +++ b/Makefile @@ -224,6 +224,7 @@ generate: $(CONTROLLER_GEN) ## Generate deep copy methods build: generate fmt vet ## Build agent binaries go build -o bin/hubagent cmd/hubagent/main.go go build -o bin/memberagent cmd/memberagent/main.go + go build -o bin/kubectl-fleet ./tools/fleet/ .PHONY: run-hubagent run-hubagent: manifests generate fmt vet ## Run hub-agent from your host diff --git a/cmd/memberagent/main.go b/cmd/memberagent/main.go index a710ab2ca..e281f48f3 100644 --- a/cmd/memberagent/main.go +++ b/cmd/memberagent/main.go @@ -91,6 +91,11 @@ func main() { klog.InfoS("flag:", "name", f.Name, "value", f.Value) }) + if errs := opts.Validate(); len(errs) != 0 { + klog.ErrorS(errs.ToAggregate(), "invalid parameter") + klog.FlushAndExit(klog.ExitFlushTimeout, 1) + } + // Set up controller-runtime logger ctrl.SetLogger(zap.New(zap.UseDevMode(true))) @@ -213,13 +218,16 @@ func buildHubConfig(hubURL string, useCertificateAuth bool, tlsClientInsecure bo return err }) if err != nil { - klog.ErrorS(err, "Failed to retrieve token file from the path %s", tokenFilePath) + klog.ErrorS(err, "Failed to retrieve token file", "path", tokenFilePath) return nil, err } hubConfig.BearerTokenFile = tokenFilePath } hubConfig.TLSClientConfig.Insecure = tlsClientInsecure + if tlsClientInsecure { + klog.Warning("TLS verification is disabled for hub cluster connection. This is insecure and should not be used in production.") + } if !tlsClientInsecure { caBundle, ok := os.LookupEnv("CA_BUNDLE") if ok && caBundle == "" { @@ -257,7 +265,7 @@ func buildHubConfig(hubURL string, useCertificateAuth bool, tlsClientInsecure bo r := textproto.NewReader(bufio.NewReader(strings.NewReader(header))) h, err := r.ReadMIMEHeader() if err != nil && !errors.Is(err, io.EOF) { - klog.ErrorS(err, "Failed to parse HUB_KUBE_HEADER %q", header) + klog.ErrorS(err, "Failed to parse HUB_KUBE_HEADER", "header", header) return nil, err } hubConfig.WrapTransport = func(rt http.RoundTripper) http.RoundTripper { diff --git a/test/e2e/cluster_staged_updaterun_test.go b/test/e2e/cluster_staged_updaterun_test.go index df5c8b4a7..3215362ea 100644 --- a/test/e2e/cluster_staged_updaterun_test.go +++ b/test/e2e/cluster_staged_updaterun_test.go @@ -2203,7 +2203,7 @@ func approveClusterApprovalRequest(stageName, updateRunName, stageTask string) { // Use kubectl-fleet approve plugin to approve the request cmd := exec.Command(fleetBinaryPath, "approve", "clusterapprovalrequest", - "--hubClusterContext", "kind-hub", + "--hub-cluster-context", "kind-hub", "--name", approvalRequestName) output, err := cmd.CombinedOutput() Expect(err).ToNot(HaveOccurred(), "kubectl-fleet approve failed: %s", string(output)) diff --git a/test/e2e/drain_tool_test.go b/test/e2e/drain_tool_test.go index 07ab7e6a1..408f8a670 100644 --- a/test/e2e/drain_tool_test.go +++ b/test/e2e/drain_tool_test.go @@ -360,8 +360,8 @@ var _ = Describe("Drain is allowed on one cluster, blocked on others - ClusterRe func runDrainClusterBinary(hubClusterName, memberClusterName string) { By(fmt.Sprintf("draining cluster %s", memberClusterName)) cmd := exec.Command(fleetBinaryPath, "draincluster", - "--hubClusterContext", hubClusterName, - "--clusterName", memberClusterName) + "--hub-cluster-context", hubClusterName, + "--cluster-name", memberClusterName) _, err := cmd.CombinedOutput() Expect(err).ToNot(HaveOccurred(), "Drain command failed with error: %v", err) } @@ -369,8 +369,8 @@ func runDrainClusterBinary(hubClusterName, memberClusterName string) { func runUncordonClusterBinary(hubClusterName, memberClusterName string) { By(fmt.Sprintf("uncordoning cluster %s", memberClusterName)) cmd := exec.Command(fleetBinaryPath, "uncordoncluster", - "--hubClusterContext", hubClusterName, - "--clusterName", memberClusterName) + "--hub-cluster-context", hubClusterName, + "--cluster-name", memberClusterName) _, err := cmd.CombinedOutput() Expect(err).ToNot(HaveOccurred(), "Uncordon command failed with error: %v", err) } diff --git a/tools/fleet/README.md b/tools/fleet/README.md index 897978db3..c7405896d 100644 --- a/tools/fleet/README.md +++ b/tools/fleet/README.md @@ -72,25 +72,25 @@ Use the `approve` subcommand to approve approval request resources for staged up **Cluster-scoped (ClusterApprovalRequest):** ```bash -kubectl fleet approve clusterapprovalrequest --hubClusterContext --name +kubectl fleet approve clusterapprovalrequest --hub-cluster-context --name # Or using alias: -kubectl fleet approve careq --hubClusterContext --name +kubectl fleet approve careq --hub-cluster-context --name ``` **Namespace-scoped (ApprovalRequest):** ```bash -kubectl fleet approve approvalrequest --hubClusterContext --name --namespace +kubectl fleet approve approvalrequest --hub-cluster-context --name --namespace # Or using alias: -kubectl fleet approve areq --hubClusterContext --name -n +kubectl fleet approve areq --hub-cluster-context --name -n ``` Example: ```bash # Approve a ClusterApprovalRequest -kubectl fleet approve clusterapprovalrequest --hubClusterContext hub --name my-approval-request +kubectl fleet approve clusterapprovalrequest --hub-cluster-context hub --name my-approval-request # Approve an ApprovalRequest in a specific namespace -kubectl fleet approve approvalrequest --hubClusterContext hub --name my-approval-request --namespace my-namespace +kubectl fleet approve approvalrequest --hub-cluster-context hub --name my-approval-request --namespace my-namespace ``` ### Drain a Member Cluster @@ -98,12 +98,12 @@ kubectl fleet approve approvalrequest --hubClusterContext hub --name my-approval Use the `draincluster` subcommand to remove all resources propagated to a member cluster from the hub cluster by any `Placement` resource. This is useful when you want to temporarily move all workloads off a member cluster in preparation for an event like upgrade or reconfiguration. ```bash -kubectl fleet draincluster --hubClusterContext --clusterName +kubectl fleet draincluster --hub-cluster-context --cluster-name ``` Example: ```bash -kubectl fleet draincluster --hubClusterContext hub --clusterName member-cluster-1 +kubectl fleet draincluster --hub-cluster-context hub --cluster-name member-cluster-1 ``` ### Uncordon a Member Cluster @@ -111,12 +111,12 @@ kubectl fleet draincluster --hubClusterContext hub --clusterName member-cluster- Use the `uncordoncluster` subcommand to uncordon a member cluster that has been previously drained, allowing resources to be propagated to the cluster again. ```bash -kubectl fleet uncordoncluster --hubClusterContext --clusterName +kubectl fleet uncordoncluster --hub-cluster-context --cluster-name ``` Example: ```bash -kubectl fleet uncordoncluster --hubClusterContext hub --clusterName member-cluster-1 +kubectl fleet uncordoncluster --hub-cluster-context hub --cluster-name member-cluster-1 ``` ## Subcommands @@ -156,13 +156,13 @@ If the `cordon` taint is not present on the member cluster, the command will hav ## Flags The `approve` subcommand uses the following flags: -- `--hubClusterContext`: kubectl context for the hub cluster (required) +- `--hub-cluster-context`: kubectl context for the hub cluster (required) - `--name`: name of the resource to approve (required) - `--namespace`, `-n`: namespace of the resource to approve (required for `approvalrequest`, not allowed for `clusterapprovalrequest`) Both `draincluster` and `uncordoncluster` subcommands use the following flags: -- `--hubClusterContext`: kubectl context for the hub cluster (required) -- `--clusterName`: name of the member cluster to operate on (required) +- `--hub-cluster-context`: kubectl context for the hub cluster (required) +- `--cluster-name`: name of the member cluster to operate on (required) ## Examples @@ -173,31 +173,31 @@ Both `draincluster` and `uncordoncluster` subcommands use the following flags: kubectl config get-contexts # 2. Drain a cluster for maintenance -kubectl fleet draincluster --hubClusterContext production-hub --clusterName worker-node-1 +kubectl fleet draincluster --hub-cluster-context production-hub --cluster-name worker-node-1 # 3. Perform maintenance on the worker-node-1 cluster # ... maintenance operations ... # 4. After maintenance, uncordon the cluster to allow workloads back -kubectl fleet uncordoncluster --hubClusterContext production-hub --clusterName worker-node-1 +kubectl fleet uncordoncluster --hub-cluster-context production-hub --cluster-name worker-node-1 ``` ### Additional Examples ```bash # Approve a ClusterApprovalRequest for staged updates -kubectl fleet approve clusterapprovalrequest --hubClusterContext hub --name update-approval-stage-1 +kubectl fleet approve clusterapprovalrequest --hub-cluster-context hub --name update-approval-stage-1 # Approve a ApprovalRequest for staged updates -kubectl fleet approve approvalrequest --hubClusterContext hub --name update-approval-stage-1 --namespace test-namespace +kubectl fleet approve approvalrequest --hub-cluster-context hub --name update-approval-stage-1 --namespace test-namespace # Drain multiple clusters (run separately for each cluster) -kubectl fleet draincluster --hubClusterContext hub --clusterName east-cluster -kubectl fleet draincluster --hubClusterContext hub --clusterName west-cluster +kubectl fleet draincluster --hub-cluster-context hub --cluster-name east-cluster +kubectl fleet draincluster --hub-cluster-context hub --cluster-name west-cluster # Uncordon clusters after maintenance -kubectl fleet uncordoncluster --hubClusterContext hub --clusterName east-cluster -kubectl fleet uncordoncluster --hubClusterContext hub --clusterName west-cluster +kubectl fleet uncordoncluster --hub-cluster-context hub --cluster-name east-cluster +kubectl fleet uncordoncluster --hub-cluster-context hub --cluster-name west-cluster ``` ## Troubleshooting diff --git a/tools/fleet/cmd/approve/approve.go b/tools/fleet/cmd/approve/approve.go index 72834bd9c..aa97605c7 100644 --- a/tools/fleet/cmd/approve/approve.go +++ b/tools/fleet/cmd/approve/approve.go @@ -20,11 +20,11 @@ import ( "context" "fmt" "log" + "time" "github.com/spf13/cobra" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/util/retry" "sigs.k8s.io/controller-runtime/pkg/client" @@ -87,6 +87,7 @@ type approveOptions struct { hubClusterContext string name string namespace string + timeout time.Duration hubClient client.Client } @@ -119,16 +120,19 @@ For namespace-scoped resources (approvalrequest), you must also specify the --na if err := o.setupClient(); err != nil { return err } - return o.run(cmd.Context(), cfg) + ctx, cancel := context.WithTimeout(cmd.Context(), o.timeout) + defer cancel() + return o.run(ctx, cfg) }, } - cmd.Flags().StringVar(&o.hubClusterContext, "hubClusterContext", "", "The name of the kubeconfig context to use for the hub cluster") + cmd.Flags().StringVar(&o.hubClusterContext, "hub-cluster-context", "", "The name of the kubeconfig context to use for the hub cluster") cmd.Flags().StringVar(&o.name, "name", "", "The name of the resource to approve") cmd.Flags().StringVarP(&o.namespace, "namespace", "n", "", "The namespace of the resource to approve (required for namespace-scoped resources)") + cmd.Flags().DurationVar(&o.timeout, "timeout", 5*time.Minute, "Maximum time to wait for the operation to complete") // Mark required flags. - _ = cmd.MarkFlagRequired("hubClusterContext") + _ = cmd.MarkFlagRequired("hub-cluster-context") _ = cmd.MarkFlagRequired("name") return cmd @@ -208,10 +212,9 @@ func (o *approveOptions) approveApprovalRequest(ctx context.Context) error { // setupClient creates and configures the Kubernetes client func (o *approveOptions) setupClient() error { - scheme := runtime.NewScheme() - - if err := placementv1beta1.AddToScheme(scheme); err != nil { - return fmt.Errorf("failed to add custom APIs (placement) to the runtime scheme: %w", err) + scheme, err := toolsutils.NewFleetScheme() + if err != nil { + return fmt.Errorf("failed to create runtime scheme: %w", err) } hubClient, err := toolsutils.GetClusterClientFromClusterContext(o.hubClusterContext, scheme) diff --git a/tools/fleet/cmd/draincluster/draincluster.go b/tools/fleet/cmd/draincluster/draincluster.go index 9616bd35b..7dc6f334b 100644 --- a/tools/fleet/cmd/draincluster/draincluster.go +++ b/tools/fleet/cmd/draincluster/draincluster.go @@ -20,11 +20,11 @@ import ( "context" "fmt" "log" + "time" "github.com/spf13/cobra" k8errors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/uuid" "k8s.io/apimachinery/pkg/util/validation" @@ -49,6 +49,7 @@ const ( type drainOptions struct { hubClusterContext string clusterName string + timeout time.Duration hubClient client.Client } @@ -70,18 +71,20 @@ func NewCmdDrainCluster() *cobra.Command { } // Add flags specific to drain command - cmd.Flags().StringVar(&o.hubClusterContext, "hubClusterContext", "", "kubectl context for the hub cluster (required)") - cmd.Flags().StringVar(&o.clusterName, "clusterName", "", "name of the member cluster (required)") + cmd.Flags().StringVar(&o.hubClusterContext, "hub-cluster-context", "", "kubectl context for the hub cluster (required)") + cmd.Flags().StringVar(&o.clusterName, "cluster-name", "", "name of the member cluster (required)") + cmd.Flags().DurationVar(&o.timeout, "timeout", 5*time.Minute, "Maximum time to wait for the operation to complete") // Mark required flags - _ = cmd.MarkFlagRequired("hubClusterContext") - _ = cmd.MarkFlagRequired("clusterName") + _ = cmd.MarkFlagRequired("hub-cluster-context") + _ = cmd.MarkFlagRequired("cluster-name") return cmd } func (o *drainOptions) runDrain() error { - ctx := context.Background() + ctx, cancel := context.WithTimeout(context.Background(), o.timeout) + defer cancel() isDrainSuccessful, err := o.drain(ctx) if err != nil { @@ -111,13 +114,9 @@ func (o *drainOptions) runDrain() error { // setupClient creates and configures the Kubernetes client func (o *drainOptions) setupClient() error { - scheme := runtime.NewScheme() - - if err := clusterv1beta1.AddToScheme(scheme); err != nil { - return fmt.Errorf("failed to add custom APIs (cluster) to the runtime scheme: %w", err) - } - if err := placementv1beta1.AddToScheme(scheme); err != nil { - return fmt.Errorf("failed to add custom APIs (placement) to the runtime scheme: %w", err) + scheme, err := toolsutils.NewFleetScheme() + if err != nil { + return fmt.Errorf("failed to create runtime scheme: %w", err) } hubClient, err := toolsutils.GetClusterClientFromClusterContext(o.hubClusterContext, scheme) diff --git a/tools/fleet/cmd/uncordoncluster/uncordoncluster.go b/tools/fleet/cmd/uncordoncluster/uncordoncluster.go index aba52f121..32b7d1779 100644 --- a/tools/fleet/cmd/uncordoncluster/uncordoncluster.go +++ b/tools/fleet/cmd/uncordoncluster/uncordoncluster.go @@ -20,15 +20,14 @@ import ( "context" "fmt" "log" + "time" "github.com/spf13/cobra" - "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/util/retry" "sigs.k8s.io/controller-runtime/pkg/client" clusterv1beta1 "github.com/kubefleet-dev/kubefleet/apis/cluster/v1beta1" - placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" toolsutils "github.com/kubefleet-dev/kubefleet/tools/utils" ) @@ -36,6 +35,7 @@ import ( type uncordonOptions struct { hubClusterContext string clusterName string + timeout time.Duration hubClient client.Client } @@ -57,18 +57,21 @@ func NewCmdUncordonCluster() *cobra.Command { } // Add flags specific to uncordon command - cmd.Flags().StringVar(&o.hubClusterContext, "hubClusterContext", "", "kubectl context for the hub cluster (required)") - cmd.Flags().StringVar(&o.clusterName, "clusterName", "", "name of the member cluster (required)") + cmd.Flags().StringVar(&o.hubClusterContext, "hub-cluster-context", "", "kubectl context for the hub cluster (required)") + cmd.Flags().StringVar(&o.clusterName, "cluster-name", "", "name of the member cluster (required)") + cmd.Flags().DurationVar(&o.timeout, "timeout", 5*time.Minute, "Maximum time to wait for the operation to complete") // Mark required flags - _ = cmd.MarkFlagRequired("hubClusterContext") - _ = cmd.MarkFlagRequired("clusterName") + _ = cmd.MarkFlagRequired("hub-cluster-context") + _ = cmd.MarkFlagRequired("cluster-name") return cmd } func (o *uncordonOptions) runUncordon() error { - ctx := context.Background() + ctx, cancel := context.WithTimeout(context.Background(), o.timeout) + defer cancel() + if err := o.uncordon(ctx); err != nil { return fmt.Errorf("failed to uncordon cluster %s: %w", o.clusterName, err) } @@ -79,13 +82,9 @@ func (o *uncordonOptions) runUncordon() error { // setupClient creates and configures the Kubernetes client func (o *uncordonOptions) setupClient() error { - scheme := runtime.NewScheme() - - if err := clusterv1beta1.AddToScheme(scheme); err != nil { - return fmt.Errorf("failed to add custom APIs (cluster) to the runtime scheme: %w", err) - } - if err := placementv1beta1.AddToScheme(scheme); err != nil { - return fmt.Errorf("failed to add custom APIs (placement) to the runtime scheme: %w", err) + scheme, err := toolsutils.NewFleetScheme() + if err != nil { + return fmt.Errorf("failed to create runtime scheme: %w", err) } hubClient, err := toolsutils.GetClusterClientFromClusterContext(o.hubClusterContext, scheme) diff --git a/tools/fleet/main.go b/tools/fleet/main.go index 7c614bf4e..7f3156ad4 100644 --- a/tools/fleet/main.go +++ b/tools/fleet/main.go @@ -17,6 +17,7 @@ limitations under the License. package main import ( + "fmt" "log" "github.com/spf13/cobra" @@ -26,6 +27,12 @@ import ( "github.com/kubefleet-dev/kubefleet/tools/fleet/cmd/uncordoncluster" ) +// These variables are set via ldflags at build time. +var ( + version = "dev" + commit = "unknown" +) + func main() { rootCmd := &cobra.Command{ Use: "kubectl-fleet", @@ -33,6 +40,15 @@ func main() { Long: "kubectl-fleet is a kubectl plugin for managing KubeFleet member clusters", } + // Add version subcommand + rootCmd.AddCommand(&cobra.Command{ + Use: "version", + Short: "Print the version information", + Run: func(cmd *cobra.Command, args []string) { + fmt.Printf("kubectl-fleet %s (%s)\n", version, commit) + }, + }) + // Add subcommands rootCmd.AddCommand(approve.NewCmdApprove()) rootCmd.AddCommand(draincluster.NewCmdDrainCluster()) diff --git a/tools/utils/common.go b/tools/utils/common.go index 6439dee14..d62ddae4d 100644 --- a/tools/utils/common.go +++ b/tools/utils/common.go @@ -13,6 +13,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" clusterv1beta1 "github.com/kubefleet-dev/kubefleet/apis/cluster/v1beta1" + placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" ) var ( @@ -24,6 +25,18 @@ var ( } ) +// NewFleetScheme creates a runtime.Scheme with all Fleet API types registered. +func NewFleetScheme() (*runtime.Scheme, error) { + scheme := runtime.NewScheme() + if err := clusterv1beta1.AddToScheme(scheme); err != nil { + return nil, err + } + if err := placementv1beta1.AddToScheme(scheme); err != nil { + return nil, err + } + return scheme, nil +} + // GetClusterClientFromClusterContext creates a new client.Client for the given cluster context and scheme. func GetClusterClientFromClusterContext(clusterContext string, scheme *runtime.Scheme) (client.Client, error) { clusterConfig := clientcmd.NewNonInteractiveDeferredLoadingClientConfig( From 447629b09f73a2cfc91de244cd27fe00f27a3feb Mon Sep 17 00:00:00 2001 From: Yetkin Timocin Date: Thu, 30 Apr 2026 16:24:34 -0700 Subject: [PATCH 06/19] fix: don't treat in-progress binding states as terminal failures (#670) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HasBindingFailed previously returned true for any condition with Status=False, regardless of the Reason. Several in-progress reasons on the per-cluster binding conditions (Overridden, WorkSynchronized, Applied, Available) are expressed as Status=False rather than Status=Unknown — most notably WorkNotSynchronizedYetReason and NotAvailableYetReason. The over-broad classification caused upstream controllers (rollout, updaterun) to bail on bindings that were still making forward progress. Switch HasBindingFailed to consult an explicit allowlist of non-terminal reasons (nonTerminalBindingFailureReasons) drawn from the condition package: OverriddenPendingReason, WorkNotSynchronizedYetReason, ApplyPendingReason, NotAvailableYetReason. Anything outside that set is treated as terminal — the safer default, since wrongly classifying an unknown reason as "transient" would stall rollouts forever. Per code review feedback (the original draft used a `strings.Contains` on the literal "failed", which is fragile against typos and convention drift), the allowlist uses typed constants. New "Pending" / "NotYet" reasons added to pkg/utils/condition must also be added to the allowlist; this is documented inline. Tests: - TestHasBindingFailed_NonTerminalReasons covers each known transient reason returning false, each known terminal reason returning true, an unknown-reason fall-through to terminal, and a mixed-condition case where a transient reason on one condition does not mask a terminal failure on another. - Existing TestHasBindingFailed (which uses camelCase reason strings not in the allowlist) still passes — those legacy strings are correctly classified as terminal. - TestCheckClusterUpdateResult in the updaterun package was encoding the previous bug: it asserted that WorkNotSynchronizedYet should produce an error from checkClusterUpdateResult. Updated the case to match corrected semantics (in-progress, no error) and added a new case for the terminal SyncWorkFailedReason path. This aligns with the existing TODO at execution.go:626 that called for distinguishing recoverable from non-recoverable failures. Fixes #648 Signed-off-by: Yetkin Timocin Co-authored-by: Claude Code --- pkg/controllers/updaterun/execution_test.go | 23 ++++- pkg/utils/binding/binding.go | 47 ++++++++-- pkg/utils/binding/binding_test.go | 97 +++++++++++++++++++++ 3 files changed, 161 insertions(+), 6 deletions(-) diff --git a/pkg/controllers/updaterun/execution_test.go b/pkg/controllers/updaterun/execution_test.go index daedfdfe3..c65d8fb1f 100644 --- a/pkg/controllers/updaterun/execution_test.go +++ b/pkg/controllers/updaterun/execution_test.go @@ -242,7 +242,9 @@ func TestCheckClusterUpdateResult(t *testing.T) { wantErr: true, }, { - name: "checkClusterUpdateResult should return false and error if the binding has false workSynchronized condition", + // WorkNotSynchronizedYet is an in-progress reason, not a terminal failure (issue #648). + // checkClusterUpdateResult now treats it as "still updating" rather than an error. + name: "checkClusterUpdateResult should return false but no error if workSynchronized is in-progress (WorkNotSynchronizedYet)", binding: &placementv1beta1.ClusterResourceBinding{ ObjectMeta: metav1.ObjectMeta{Generation: 1}, Status: placementv1beta1.ResourceBindingStatus{ @@ -258,6 +260,25 @@ func TestCheckClusterUpdateResult(t *testing.T) { }, clusterStatus: &placementv1beta1.ClusterUpdatingStatus{ClusterName: "test-cluster"}, wantSucceeded: false, + wantErr: false, + }, + { + name: "checkClusterUpdateResult should return false and error if workSynchronized failed terminally (SyncWorkFailed)", + binding: &placementv1beta1.ClusterResourceBinding{ + ObjectMeta: metav1.ObjectMeta{Generation: 1}, + Status: placementv1beta1.ResourceBindingStatus{ + Conditions: []metav1.Condition{ + { + Type: string(placementv1beta1.ResourceBindingWorkSynchronized), + Status: metav1.ConditionFalse, + ObservedGeneration: 1, + Reason: condition.SyncWorkFailedReason, + }, + }, + }, + }, + clusterStatus: &placementv1beta1.ClusterUpdatingStatus{ClusterName: "test-cluster"}, + wantSucceeded: false, wantErr: true, }, { diff --git a/pkg/utils/binding/binding.go b/pkg/utils/binding/binding.go index 7fb449615..c4eacea44 100644 --- a/pkg/utils/binding/binding.go +++ b/pkg/utils/binding/binding.go @@ -23,14 +23,51 @@ import ( "github.com/kubefleet-dev/kubefleet/pkg/utils/condition" ) -// HasBindingFailed checks if BindingObj has failed based on its conditions. +// nonTerminalBindingFailureReasons is the closed set of `Reason` strings that the per-cluster +// binding conditions (Overridden, WorkSynchronized, Applied, Available) can carry while still +// `Status=False` *without* representing a terminal failure. Each one means the binding is still +// progressing and the controller should keep waiting rather than treat the binding as failed. +// +// The set is intentionally an allowlist: any new `Status=False` reason added to the codebase is +// treated as terminal until it is explicitly added here. This is the safer default because the +// alternative (treat unknown reasons as transient) would stall the rollout controller forever +// on a genuinely-failing binding it does not yet know how to classify. +// +// When you add a new in-progress / pending / "not yet" reason to pkg/utils/condition/reason.go +// for any of the above conditions, also add it here. +var nonTerminalBindingFailureReasons = map[string]struct{}{ + condition.OverriddenPendingReason: {}, + condition.WorkNotSynchronizedYetReason: {}, + condition.ApplyPendingReason: {}, + condition.NotAvailableYetReason: {}, +} + +// HasBindingFailed reports whether a binding has reached a terminal failure on any of its +// per-cluster conditions (Overridden, WorkSynchronized, Applied, Available). +// +// A condition counts as a terminal failure when its `Status` is `False` *and* its `Reason` is +// not in `nonTerminalBindingFailureReasons`. The Reason check is necessary because several of +// the in-progress states (e.g. `WorkNotSynchronizedYetReason`, `NotAvailableYetReason`) are +// expressed as `Status=False` rather than `Status=Unknown`. Treating those as failures was the +// previous bug — it caused the rollout controller to drop bindings that were still progressing +// out of `canBeReadyBindings`, stalling rollout decisions. +// +// Unknown reasons fall through to "terminal" by design; see the comment on +// `nonTerminalBindingFailureReasons`. func HasBindingFailed(binding placementv1beta1.BindingObj) bool { for i := condition.OverriddenCondition; i <= condition.AvailableCondition; i++ { - if condition.IsConditionStatusFalse(binding.GetCondition(string(i.ResourceBindingConditionType())), binding.GetGeneration()) { - // TODO: parse the reason of the condition to see if the failure is recoverable/retriable or not - klog.V(2).Infof("binding %s has condition %s with status false", klog.KObj(binding), string(i.ResourceBindingConditionType())) - return true + c := binding.GetCondition(string(i.ResourceBindingConditionType())) + if !condition.IsConditionStatusFalse(c, binding.GetGeneration()) { + continue + } + if _, transient := nonTerminalBindingFailureReasons[c.Reason]; transient { + klog.V(3).Infof("binding %s has condition %s status false with non-terminal reason %q; treating as in-progress", + klog.KObj(binding), string(i.ResourceBindingConditionType()), c.Reason) + continue } + klog.V(2).Infof("binding %s has terminal failure on condition %s (reason %q)", + klog.KObj(binding), string(i.ResourceBindingConditionType()), c.Reason) + return true } return false } diff --git a/pkg/utils/binding/binding_test.go b/pkg/utils/binding/binding_test.go index b190f4c1e..722d0c340 100644 --- a/pkg/utils/binding/binding_test.go +++ b/pkg/utils/binding/binding_test.go @@ -22,6 +22,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" + "github.com/kubefleet-dev/kubefleet/pkg/utils/condition" ) func TestHasBindingFailed(t *testing.T) { @@ -326,6 +327,102 @@ func TestHasBindingFailed(t *testing.T) { } } +// TestHasBindingFailed_NonTerminalReasons verifies that conditions with Status=False but a +// known in-progress reason (WorkNotSynchronizedYet, NotAvailableYet, OverriddenPending, +// ApplyPending) are NOT treated as failures. This is the bug fix for issue #648. +func TestHasBindingFailed_NonTerminalReasons(t *testing.T) { + bindingWithCondition := func(condType string, reason string) *placementv1beta1.ClusterResourceBinding { + return &placementv1beta1.ClusterResourceBinding{ + ObjectMeta: metav1.ObjectMeta{Name: "test-binding", Generation: 1}, + Status: placementv1beta1.ResourceBindingStatus{ + Conditions: []metav1.Condition{ + { + Type: condType, + Status: metav1.ConditionFalse, + ObservedGeneration: 1, + Reason: reason, + Message: "in progress", + }, + }, + }, + } + } + + tests := []struct { + name string + binding *placementv1beta1.ClusterResourceBinding + want bool + }{ + { + name: "WorkSynchronized=False with WorkNotSynchronizedYet is in-progress, not a failure", + binding: bindingWithCondition(string(placementv1beta1.ResourceBindingWorkSynchronized), condition.WorkNotSynchronizedYetReason), + want: false, + }, + { + name: "Available=False with NotAvailableYet is in-progress, not a failure", + binding: bindingWithCondition(string(placementv1beta1.ResourceBindingAvailable), condition.NotAvailableYetReason), + want: false, + }, + { + name: "Overridden=False with OverriddenPending is in-progress, not a failure", + binding: bindingWithCondition(string(placementv1beta1.ResourceBindingOverridden), condition.OverriddenPendingReason), + want: false, + }, + { + name: "Applied=False with ApplyPending is in-progress, not a failure", + binding: bindingWithCondition(string(placementv1beta1.ResourceBindingApplied), condition.ApplyPendingReason), + want: false, + }, + { + name: "Applied=False with ApplyFailed is a terminal failure", + binding: bindingWithCondition(string(placementv1beta1.ResourceBindingApplied), condition.ApplyFailedReason), + want: true, + }, + { + name: "Overridden=False with OverriddenFailed is a terminal failure", + binding: bindingWithCondition(string(placementv1beta1.ResourceBindingOverridden), condition.OverriddenFailedReason), + want: true, + }, + { + name: "Status=False with an unknown reason defaults to terminal (safer default)", + binding: bindingWithCondition(string(placementv1beta1.ResourceBindingApplied), "SomeReasonNobodyAddedToTheAllowlist"), + want: true, + }, + { + name: "non-terminal reason on one condition does not mask a terminal failure on another", + binding: &placementv1beta1.ClusterResourceBinding{ + ObjectMeta: metav1.ObjectMeta{Name: "test-binding", Generation: 1}, + Status: placementv1beta1.ResourceBindingStatus{ + Conditions: []metav1.Condition{ + { + Type: string(placementv1beta1.ResourceBindingOverridden), + Status: metav1.ConditionFalse, + ObservedGeneration: 1, + Reason: condition.OverriddenPendingReason, + }, + { + Type: string(placementv1beta1.ResourceBindingApplied), + Status: metav1.ConditionFalse, + ObservedGeneration: 1, + Reason: condition.ApplyFailedReason, + }, + }, + }, + }, + want: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := HasBindingFailed(tc.binding) + if got != tc.want { + t.Errorf("HasBindingFailed(%v) = %v, want %v", tc.binding, got, tc.want) + } + }) + } +} + func TestIsBindingReportDiff(t *testing.T) { tests := []struct { name string From 97f9ed4afc1c3a12e53bc7cc288fd7905af4ede8 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Wed, 6 May 2026 10:51:53 -0700 Subject: [PATCH 07/19] fix: correct TimedWait waitTime regex to reject invalid duration strings (#688) --- apis/placement/v1/stageupdate_types.go | 3 +- apis/placement/v1alpha1/stagedupdate_types.go | 2 +- apis/placement/v1beta1/stageupdate_types.go | 3 +- ...etes-fleet.io_clusterstagedupdateruns.yaml | 34 ++-- ...leet.io_clusterstagedupdatestrategies.yaml | 34 ++-- ....kubernetes-fleet.io_stagedupdateruns.yaml | 32 ++-- ...netes-fleet.io_stagedupdatestrategies.yaml | 32 ++-- .../api_validation_integration_test.go | 172 ++++++++++++++++++ 8 files changed, 243 insertions(+), 69 deletions(-) diff --git a/apis/placement/v1/stageupdate_types.go b/apis/placement/v1/stageupdate_types.go index afcc88f96..2e06af937 100644 --- a/apis/placement/v1/stageupdate_types.go +++ b/apis/placement/v1/stageupdate_types.go @@ -209,7 +209,8 @@ type StageTask struct { Type StageTaskType `json:"type"` // The time to wait after all the clusters in the current stage complete the update before moving to the next stage. - // +kubebuilder:validation:Pattern="^0|([0-9]+(\\.[0-9]+)?(s|m|h))+$" + // Only hours (h), minutes (m), and seconds (s) units are accepted. + // +kubebuilder:validation:Pattern="^(?:(?:0|[1-9][0-9]*)(\\.[0-9]+)?(?:s|m|h))+$" // +kubebuilder:validation:Type=string // +kubebuilder:validation:Optional WaitTime *metav1.Duration `json:"waitTime,omitempty"` diff --git a/apis/placement/v1alpha1/stagedupdate_types.go b/apis/placement/v1alpha1/stagedupdate_types.go index b6389a5d4..e8644d357 100644 --- a/apis/placement/v1alpha1/stagedupdate_types.go +++ b/apis/placement/v1alpha1/stagedupdate_types.go @@ -153,7 +153,7 @@ type AfterStageTask struct { Type AfterStageTaskType `json:"type"` // The time to wait after all the clusters in the current stage complete the update before moving to the next stage. - // +kubebuilder:validation:Pattern="^0|([0-9]+(\\.[0-9]+)?(s|m|h))+$" + // +kubebuilder:validation:Pattern="^(?:0|(?:(?:0|[1-9][0-9]*)(\\.[0-9]+)?(?:s|m|h))+)$" // +kubebuilder:validation:Type=string // +kubebuilder:validation:Optional WaitTime metav1.Duration `json:"waitTime,omitempty"` diff --git a/apis/placement/v1beta1/stageupdate_types.go b/apis/placement/v1beta1/stageupdate_types.go index 6e26851b9..0054bba0e 100644 --- a/apis/placement/v1beta1/stageupdate_types.go +++ b/apis/placement/v1beta1/stageupdate_types.go @@ -347,7 +347,8 @@ type StageTask struct { Type StageTaskType `json:"type"` // The time to wait after all the clusters in the current stage complete the update before moving to the next stage. - // +kubebuilder:validation:Pattern="^0|([0-9]+(\\.[0-9]+)?(s|m|h))+$" + // Only hours (h), minutes (m), and seconds (s) units are accepted. + // +kubebuilder:validation:Pattern="^(?:(?:0|[1-9][0-9]*)(\\.[0-9]+)?(?:s|m|h))+$" // +kubebuilder:validation:Type=string // +kubebuilder:validation:Optional WaitTime *metav1.Duration `json:"waitTime,omitempty"` diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml index 2b34db4d3..27dc987da 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml @@ -845,10 +845,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters - in the current stage complete the update before - moving to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -879,10 +879,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters - in the current stage complete the update before - moving to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -2070,7 +2070,7 @@ spec: description: The time to wait after all the clusters in the current stage complete the update before moving to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + pattern: ^(?:0|(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+)$ type: string required: - type @@ -3268,10 +3268,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters - in the current stage complete the update before - moving to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -3302,10 +3302,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters - in the current stage complete the update before - moving to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml index 9be8428dc..b134a75b5 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml @@ -70,10 +70,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters in - the current stage complete the update before moving - to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -101,10 +101,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters in - the current stage complete the update before moving - to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -264,7 +264,7 @@ spec: description: The time to wait after all the clusters in the current stage complete the update before moving to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + pattern: ^(?:0|(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+)$ type: string required: - type @@ -399,10 +399,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters in - the current stage complete the update before moving - to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -430,10 +430,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters in - the current stage complete the update before moving - to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type diff --git a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml index ff9152c9d..81b506f4a 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml @@ -845,10 +845,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters - in the current stage complete the update before - moving to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -879,10 +879,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters - in the current stage complete the update before - moving to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -2188,10 +2188,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters - in the current stage complete the update before - moving to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -2222,10 +2222,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters - in the current stage complete the update before - moving to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type diff --git a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml index 5315f7322..ccfb44e85 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml @@ -70,10 +70,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters in - the current stage complete the update before moving - to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -101,10 +101,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters in - the current stage complete the update before moving - to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -259,10 +259,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters in - the current stage complete the update before moving - to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -290,10 +290,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters in - the current stage complete the update before moving - to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type diff --git a/test/apis/placement/v1beta1/api_validation_integration_test.go b/test/apis/placement/v1beta1/api_validation_integration_test.go index aae616ef7..797cab861 100644 --- a/test/apis/placement/v1beta1/api_validation_integration_test.go +++ b/test/apis/placement/v1beta1/api_validation_integration_test.go @@ -27,6 +27,7 @@ import ( apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" k8sErrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/util/intstr" placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" @@ -1685,6 +1686,60 @@ var _ = Describe("Test placement v1beta1 API validation", func() { Expect(*strategy.Spec.Stages[0].MaxConcurrency).Should(Equal(maxConcurrency)) Expect(hubClient.Delete(ctx, &strategy)).Should(Succeed()) }) + + It("Should allow creation of ClusterStagedUpdateStrategy with waitTime set to '0s'", func() { + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "placement.kubernetes-fleet.io/v1beta1", + "kind": "ClusterStagedUpdateStrategy", + "metadata": map[string]interface{}{ + "name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + "spec": map[string]interface{}{ + "stages": []interface{}{ + map[string]interface{}{ + "name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1), + "afterStageTasks": []interface{}{ + map[string]interface{}{ + "type": "TimedWait", + "waitTime": "0s", + }, + }, + }, + }, + }, + }, + } + Expect(hubClient.Create(ctx, obj)).Should(Succeed()) + Expect(hubClient.Delete(ctx, obj)).Should(Succeed()) + }) + + It("Should allow creation of ClusterStagedUpdateStrategy with waitTime as a compound duration '1h30m'", func() { + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "placement.kubernetes-fleet.io/v1beta1", + "kind": "ClusterStagedUpdateStrategy", + "metadata": map[string]interface{}{ + "name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + "spec": map[string]interface{}{ + "stages": []interface{}{ + map[string]interface{}{ + "name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1), + "afterStageTasks": []interface{}{ + map[string]interface{}{ + "type": "TimedWait", + "waitTime": "1h30m", + }, + }, + }, + }, + }, + }, + } + Expect(hubClient.Create(ctx, obj)).Should(Succeed()) + Expect(hubClient.Delete(ctx, obj)).Should(Succeed()) + }) }) Context("Test ClusterStagedUpdateStrategy API validation - invalid cases", func() { @@ -2096,6 +2151,123 @@ var _ = Describe("Test placement v1beta1 API validation", func() { Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].maxConcurrency in body should match")) }) + + It("Should deny creation of ClusterStagedUpdateStrategy with invalid waitTime '0abc'", func() { + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "placement.kubernetes-fleet.io/v1beta1", + "kind": "ClusterStagedUpdateStrategy", + "metadata": map[string]interface{}{ + "name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + "spec": map[string]interface{}{ + "stages": []interface{}{ + map[string]interface{}{ + "name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1), + "afterStageTasks": []interface{}{ + map[string]interface{}{ + "type": "TimedWait", + "waitTime": "0abc", + }, + }, + }, + }, + }, + }, + } + err := hubClient.Create(ctx, obj) + var statusErr *k8sErrors.StatusError + Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) + Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].afterStageTasks\\[0\\].waitTime in body should match")) + }) + + It("Should deny creation of ClusterStagedUpdateStrategy with invalid waitTime '1' (number without unit)", func() { + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "placement.kubernetes-fleet.io/v1beta1", + "kind": "ClusterStagedUpdateStrategy", + "metadata": map[string]interface{}{ + "name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + "spec": map[string]interface{}{ + "stages": []interface{}{ + map[string]interface{}{ + "name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1), + "afterStageTasks": []interface{}{ + map[string]interface{}{ + "type": "TimedWait", + "waitTime": "1", + }, + }, + }, + }, + }, + }, + } + err := hubClient.Create(ctx, obj) + var statusErr *k8sErrors.StatusError + Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) + Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].afterStageTasks\\[0\\].waitTime in body should match")) + }) + + It("Should deny creation of ClusterStagedUpdateStrategy with invalid waitTime '1d' (unsupported unit)", func() { + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "placement.kubernetes-fleet.io/v1beta1", + "kind": "ClusterStagedUpdateStrategy", + "metadata": map[string]interface{}{ + "name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + "spec": map[string]interface{}{ + "stages": []interface{}{ + map[string]interface{}{ + "name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1), + "afterStageTasks": []interface{}{ + map[string]interface{}{ + "type": "TimedWait", + "waitTime": "1d", + }, + }, + }, + }, + }, + }, + } + err := hubClient.Create(ctx, obj) + var statusErr *k8sErrors.StatusError + Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) + Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].afterStageTasks\\[0\\].waitTime in body should match")) + }) + + It("Should deny creation of ClusterStagedUpdateStrategy with invalid waitTime '01h' (leading zero)", func() { + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "placement.kubernetes-fleet.io/v1beta1", + "kind": "ClusterStagedUpdateStrategy", + "metadata": map[string]interface{}{ + "name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + "spec": map[string]interface{}{ + "stages": []interface{}{ + map[string]interface{}{ + "name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1), + "afterStageTasks": []interface{}{ + map[string]interface{}{ + "type": "TimedWait", + "waitTime": "01h", + }, + }, + }, + }, + }, + }, + } + err := hubClient.Create(ctx, obj) + var statusErr *k8sErrors.StatusError + Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) + Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].afterStageTasks\\[0\\].waitTime in body should match")) + }) + }) Context("Test ClusterApprovalRequest API validation - valid cases", func() { From 1c525ee64405b9036efe86903b500c8fdaaed70d Mon Sep 17 00:00:00 2001 From: Wei Weng Date: Thu, 7 May 2026 06:59:15 -0400 Subject: [PATCH 08/19] chore: Move NamespaceWithResourceSelectors to v1 (#697) --- .../v1/clusterresourceplacement_types.go | 74 ++++++++++++++++++- ...tes-fleet.io_clusterresourceoverrides.yaml | 19 +++++ ...t.io_clusterresourceoverridesnapshots.yaml | 19 +++++ ...es-fleet.io_clusterresourceplacements.yaml | 38 ++++++++++ ...ubernetes-fleet.io_resourceplacements.yaml | 38 ++++++++++ 5 files changed, 184 insertions(+), 4 deletions(-) diff --git a/apis/placement/v1/clusterresourceplacement_types.go b/apis/placement/v1/clusterresourceplacement_types.go index b6d452ae8..bf4086ecc 100644 --- a/apis/placement/v1/clusterresourceplacement_types.go +++ b/apis/placement/v1/clusterresourceplacement_types.go @@ -123,13 +123,15 @@ type ClusterResourcePlacement struct { } // PlacementSpec defines the desired state of ClusterResourcePlacement and ResourcePlacement. +// +kubebuilder:validation:XValidation:rule="size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && x.group == \"\" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope == 'NamespaceWithResourceSelectors')) <= 1",message="only one namespace selector with NamespaceWithResourceSelectors mode is allowed" +// +kubebuilder:validation:XValidation:rule="size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && x.group == \"\" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope == 'NamespaceWithResourceSelectors')) == 0 || (size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && x.group == \"\" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope == 'NamespaceWithResourceSelectors' && has(x.name) && size(x.name) > 0 && !has(x.labelSelector))) == 1)",message="namespace selector with NamespaceWithResourceSelectors mode must select by name (not by label)" +// +kubebuilder:validation:XValidation:rule="size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && x.group == \"\" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope == 'NamespaceWithResourceSelectors')) == 0 || size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && x.group == \"\" && x.version == 'v1')) == 1",message="when using NamespaceWithResourceSelectors mode, only one namespace selector is allowed (cannot mix with other namespace selectors)" type PlacementSpec struct { - // +kubebuilder:validation:MinItems=1 - // +kubebuilder:validation:MaxItems=100 - // ResourceSelectors is an array of selectors used to select cluster scoped resources. The selectors are `ORed`. // You can have 1-100 selectors. // +kubebuilder:validation:Required + // +kubebuilder:validation:MinItems=1 + // +kubebuilder:validation:MaxItems=100 ResourceSelectors []ResourceSelectorTerm `json:"resourceSelectors"` // Policy defines how to select member clusters to place the selected resources. @@ -185,6 +187,24 @@ type ResourceSelectorTerm struct { // For ClusterResourcePlacement, you can use SelectionScope to control what gets selected: // - NamespaceOnly: Only the namespace object itself // - NamespaceWithResources: The namespace AND all resources within it (default) + // - NamespaceWithResourceSelectors: The namespace AND resources specified by additional selectors + // + // When SelectionScope is NamespaceWithResourceSelectors, you can define additional ResourceSelectorTerms + // (after the namespace selector) to specify which resources to include. These additional selectors can + // target both namespace-scoped resources (within the selected namespace) and cluster-scoped resources. + // + // Important requirements for NamespaceWithResourceSelectors mode: + // - Exactly one namespace selector with this mode is allowed + // - The namespace selector must select by name (not by label) + // - Only one namespace selector is allowed when using this mode (cannot mix with other namespace selectors) + // - All requirements are validated via CEL at API validation time + // - If the selected namespace is deleted after CRP creation, the controller will report an error condition + // + // Example using NamespaceWithResourceSelectors: + // - Namespace selector: {Group: "", Version: "v1", Kind: "Namespace", Name: "prod", SelectionScope: "NamespaceWithResourceSelectors"} + // - Additional selector: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {app: "frontend"}} + // - Third selector: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "admin"} + // This selects: the "prod" namespace, all Deployments with label app=frontend in "prod", and the "admin" ClusterRole. // // +kubebuilder:validation:Required Kind string `json:"kind"` @@ -204,7 +224,7 @@ type ResourceSelectorTerm struct { // SelectionScope defines the scope of resource selections when the Kind is `namespace`. // This field is only applicable when Kind is "Namespace" and is ignored for other resource kinds. // See the Kind field documentation for detailed examples and usage patterns. - // +kubebuilder:validation:Enum=NamespaceOnly;NamespaceWithResources + // +kubebuilder:validation:Enum=NamespaceOnly;NamespaceWithResources;NamespaceWithResourceSelectors // +kubebuilder:default=NamespaceWithResources // +kubebuilder:validation:Optional SelectionScope SelectionScope `json:"selectionScope,omitempty"` @@ -230,6 +250,52 @@ const ( // Note: This is the default value. When you select a namespace without specifying SelectionScope, // this mode is used automatically. NamespaceWithResources SelectionScope = "NamespaceWithResources" + + // NamespaceWithResourceSelectors allows fine-grained selection of specific resources within a namespace. + // The namespace itself is always selected, and you can optionally specify which resources to include + // by adding additional ResourceSelectorTerm entries after the namespace selector. + // + // Use cases: + // 1. Select only specific resource types from a namespace (e.g., only Deployments and Services) + // 2. Select resources matching certain labels within a namespace + // 3. Include specific cluster-scoped resources along with namespace-scoped resources + // + // How "additional selectors" work: + // - Exactly one namespace selector with NamespaceWithResourceSelectors mode is required + // - This selector must select a namespace by name (label selectors not allowed) + // - ADDITIONAL selectors specify which resources to include: + // - Namespace-scoped resources are filtered to only those within the selected namespace + // - Cluster-scoped resources are included as specified (not limited to the namespace) + // - If no additional selectors are provided, only the namespace object itself is selected + // + // Example 1 - Select specific deployments from a namespace: + // Selector 1: {Group: "", Version: "v1", Kind: "Namespace", Name: "production", SelectionScope: "NamespaceWithResourceSelectors"} + // Selector 2: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {tier: "frontend"}} + // Result: The "production" namespace + all Deployments labeled tier=frontend within "production" + // + // Example 2 - Select namespace with multiple resource types: + // Selector 1: {Group: "", Version: "v1", Kind: "Namespace", Name: "app", SelectionScope: "NamespaceWithResourceSelectors"} + // Selector 2: {Group: "apps", Version: "v1", Kind: "Deployment"} + // Selector 3: {Group: "", Version: "v1", Kind: "Service"} + // Result: The "app" namespace + ALL Deployments and Services within "app" + // + // Example 3 - Include cluster-scoped resources: + // Selector 1: {Group: "", Version: "v1", Kind: "Namespace", Name: "app", SelectionScope: "NamespaceWithResourceSelectors"} + // Selector 2: {Group: "apps", Version: "v1", Kind: "Deployment"} + // Selector 3: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "app-admin"} + // Result: The "app" namespace + ALL Deployments in "app" + the "app-admin" ClusterRole + // + // Important constraints: + // - Exactly ONE namespace selector with NamespaceWithResourceSelectors mode is allowed + // - The namespace selector must select by name (label selectors not allowed) + // - Only ONE namespace selector total is allowed when using this mode (cannot mix with other namespace selectors) + // - All constraints are enforced via CEL at API validation time + // + // Runtime behavior: + // - If the selected namespace is deleted after the CRP is created, the controller will detect this during + // the next reconciliation and report an error condition in the CRP status + // - The CRP will transition to a failed state until the namespace is recreated or the CRP is updated + NamespaceWithResourceSelectors SelectionScope = "NamespaceWithResourceSelectors" ) // PlacementPolicy contains the rules to select target member clusters to place the selected resources. diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml index 79927f6d1..f3f4cdfef 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml @@ -72,6 +72,24 @@ spec: For ClusterResourcePlacement, you can use SelectionScope to control what gets selected: - NamespaceOnly: Only the namespace object itself - NamespaceWithResources: The namespace AND all resources within it (default) + - NamespaceWithResourceSelectors: The namespace AND resources specified by additional selectors + + When SelectionScope is NamespaceWithResourceSelectors, you can define additional ResourceSelectorTerms + (after the namespace selector) to specify which resources to include. These additional selectors can + target both namespace-scoped resources (within the selected namespace) and cluster-scoped resources. + + Important requirements for NamespaceWithResourceSelectors mode: + - Exactly one namespace selector with this mode is allowed + - The namespace selector must select by name (not by label) + - Only one namespace selector is allowed when using this mode (cannot mix with other namespace selectors) + - All requirements are validated via CEL at API validation time + - If the selected namespace is deleted after CRP creation, the controller will report an error condition + + Example using NamespaceWithResourceSelectors: + - Namespace selector: {Group: "", Version: "v1", Kind: "Namespace", Name: "prod", SelectionScope: "NamespaceWithResourceSelectors"} + - Additional selector: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {app: "frontend"}} + - Third selector: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "admin"} + This selects: the "prod" namespace, all Deployments with label app=frontend in "prod", and the "admin" ClusterRole. type: string labelSelector: description: |- @@ -133,6 +151,7 @@ spec: enum: - NamespaceOnly - NamespaceWithResources + - NamespaceWithResourceSelectors type: string version: description: Version of the resource to be selected. diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml index 6b80c6832..a57b5b3d2 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml @@ -86,6 +86,24 @@ spec: For ClusterResourcePlacement, you can use SelectionScope to control what gets selected: - NamespaceOnly: Only the namespace object itself - NamespaceWithResources: The namespace AND all resources within it (default) + - NamespaceWithResourceSelectors: The namespace AND resources specified by additional selectors + + When SelectionScope is NamespaceWithResourceSelectors, you can define additional ResourceSelectorTerms + (after the namespace selector) to specify which resources to include. These additional selectors can + target both namespace-scoped resources (within the selected namespace) and cluster-scoped resources. + + Important requirements for NamespaceWithResourceSelectors mode: + - Exactly one namespace selector with this mode is allowed + - The namespace selector must select by name (not by label) + - Only one namespace selector is allowed when using this mode (cannot mix with other namespace selectors) + - All requirements are validated via CEL at API validation time + - If the selected namespace is deleted after CRP creation, the controller will report an error condition + + Example using NamespaceWithResourceSelectors: + - Namespace selector: {Group: "", Version: "v1", Kind: "Namespace", Name: "prod", SelectionScope: "NamespaceWithResourceSelectors"} + - Additional selector: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {app: "frontend"}} + - Third selector: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "admin"} + This selects: the "prod" namespace, all Deployments with label app=frontend in "prod", and the "admin" ClusterRole. type: string labelSelector: description: |- @@ -147,6 +165,7 @@ spec: enum: - NamespaceOnly - NamespaceWithResources + - NamespaceWithResourceSelectors type: string version: description: Version of the resource to be selected. diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml index 8ffe14382..5c586555d 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml @@ -554,6 +554,24 @@ spec: For ClusterResourcePlacement, you can use SelectionScope to control what gets selected: - NamespaceOnly: Only the namespace object itself - NamespaceWithResources: The namespace AND all resources within it (default) + - NamespaceWithResourceSelectors: The namespace AND resources specified by additional selectors + + When SelectionScope is NamespaceWithResourceSelectors, you can define additional ResourceSelectorTerms + (after the namespace selector) to specify which resources to include. These additional selectors can + target both namespace-scoped resources (within the selected namespace) and cluster-scoped resources. + + Important requirements for NamespaceWithResourceSelectors mode: + - Exactly one namespace selector with this mode is allowed + - The namespace selector must select by name (not by label) + - Only one namespace selector is allowed when using this mode (cannot mix with other namespace selectors) + - All requirements are validated via CEL at API validation time + - If the selected namespace is deleted after CRP creation, the controller will report an error condition + + Example using NamespaceWithResourceSelectors: + - Namespace selector: {Group: "", Version: "v1", Kind: "Namespace", Name: "prod", SelectionScope: "NamespaceWithResourceSelectors"} + - Additional selector: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {app: "frontend"}} + - Third selector: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "admin"} + This selects: the "prod" namespace, all Deployments with label app=frontend in "prod", and the "admin" ClusterRole. type: string labelSelector: description: |- @@ -615,6 +633,7 @@ spec: enum: - NamespaceOnly - NamespaceWithResources + - NamespaceWithResourceSelectors type: string version: description: Version of the resource to be selected. @@ -961,6 +980,25 @@ spec: - message: statusReportingScope is immutable rule: '!has(oldSelf.statusReportingScope) || self.statusReportingScope == oldSelf.statusReportingScope' + - message: only one namespace selector with NamespaceWithResourceSelectors + mode is allowed + rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && + x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope + == 'NamespaceWithResourceSelectors')) <= 1 + - message: namespace selector with NamespaceWithResourceSelectors mode + must select by name (not by label) + rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && + x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope + == 'NamespaceWithResourceSelectors')) == 0 || (size(self.resourceSelectors.filter(x, + x.kind == 'Namespace' && x.group == "" && x.version == 'v1' && has(x.selectionScope) + && x.selectionScope == 'NamespaceWithResourceSelectors' && has(x.name) + && size(x.name) > 0 && !has(x.labelSelector))) == 1) + - message: when using NamespaceWithResourceSelectors mode, only one namespace + selector is allowed (cannot mix with other namespace selectors) + rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && + x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope + == 'NamespaceWithResourceSelectors')) == 0 || size(self.resourceSelectors.filter(x, + x.kind == 'Namespace' && x.group == "" && x.version == 'v1')) == 1 status: description: The observed status of ClusterResourcePlacement. properties: diff --git a/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml b/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml index 8ab1049c5..db0cdea5d 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml @@ -546,6 +546,24 @@ spec: For ClusterResourcePlacement, you can use SelectionScope to control what gets selected: - NamespaceOnly: Only the namespace object itself - NamespaceWithResources: The namespace AND all resources within it (default) + - NamespaceWithResourceSelectors: The namespace AND resources specified by additional selectors + + When SelectionScope is NamespaceWithResourceSelectors, you can define additional ResourceSelectorTerms + (after the namespace selector) to specify which resources to include. These additional selectors can + target both namespace-scoped resources (within the selected namespace) and cluster-scoped resources. + + Important requirements for NamespaceWithResourceSelectors mode: + - Exactly one namespace selector with this mode is allowed + - The namespace selector must select by name (not by label) + - Only one namespace selector is allowed when using this mode (cannot mix with other namespace selectors) + - All requirements are validated via CEL at API validation time + - If the selected namespace is deleted after CRP creation, the controller will report an error condition + + Example using NamespaceWithResourceSelectors: + - Namespace selector: {Group: "", Version: "v1", Kind: "Namespace", Name: "prod", SelectionScope: "NamespaceWithResourceSelectors"} + - Additional selector: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {app: "frontend"}} + - Third selector: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "admin"} + This selects: the "prod" namespace, all Deployments with label app=frontend in "prod", and the "admin" ClusterRole. type: string labelSelector: description: |- @@ -607,6 +625,7 @@ spec: enum: - NamespaceOnly - NamespaceWithResources + - NamespaceWithResourceSelectors type: string version: description: Version of the resource to be selected. @@ -946,6 +965,25 @@ spec: x-kubernetes-validations: - message: policy cannot be removed once set rule: '!(has(oldSelf.policy) && !has(self.policy))' + - message: only one namespace selector with NamespaceWithResourceSelectors + mode is allowed + rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && + x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope + == 'NamespaceWithResourceSelectors')) <= 1 + - message: namespace selector with NamespaceWithResourceSelectors mode + must select by name (not by label) + rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && + x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope + == 'NamespaceWithResourceSelectors')) == 0 || (size(self.resourceSelectors.filter(x, + x.kind == 'Namespace' && x.group == "" && x.version == 'v1' && has(x.selectionScope) + && x.selectionScope == 'NamespaceWithResourceSelectors' && has(x.name) + && size(x.name) > 0 && !has(x.labelSelector))) == 1) + - message: when using NamespaceWithResourceSelectors mode, only one namespace + selector is allowed (cannot mix with other namespace selectors) + rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && + x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope + == 'NamespaceWithResourceSelectors')) == 0 || size(self.resourceSelectors.filter(x, + x.kind == 'Namespace' && x.group == "" && x.version == 'v1')) == 1 status: description: The observed status of ResourcePlacement. properties: From dd2db6f74e20ccd612712a22b9e738a815f28016 Mon Sep 17 00:00:00 2001 From: Yetkin Timocin Date: Fri, 8 May 2026 19:34:59 -0700 Subject: [PATCH 09/19] refactor: extract IsLatest/LookupLatest policy snapshot helpers (#667) * refactor: extract IsLatest/LookupLatest policy snapshot helpers Move the two duplicated "latest policy snapshot" lookups into pkg/utils/controller/policy_snapshot_resolver.go so both the scheduler and the policy-snapshot watcher share the implementation. Adds two helpers: - IsLatestPolicySnapshot(snapshot) (bool, error): inspects the IsLatestSnapshot label. Returns (false, ErrUnexpectedBehavior-wrapped) for missing or non-bool values. The watcher's Reconcile now uses this in place of inline label parsing. - LookupLatestPolicySnapshot(ctx, client.Reader, placement) (PolicySnapshotObj, error): wraps the existing FetchLatestPolicySnapshot and enforces the "exactly one latest snapshot" invariant. Returns a plain error for the (transient) zero case and ErrUnexpectedBehavior for the >1 case. The scheduler's lookupLatestPolicySnapshot now delegates to this and keeps only its own log + retry handling. The watcher's buildCustomPredicate continues to do its own ParseBool because it compares old-vs-new label transitions, which doesn't fit the helper's signature. Adds table-driven tests for both helpers covering: label-true/false, missing label, nil labels map, malformed value, exactly-one snapshot, zero snapshots, multiple snapshots, ignoring snapshots from other placements, and ignoring non-latest snapshots. Drops two now-unused imports (strconv, labels) from scheduler.go. Fixes #643 Co-Authored-By: Claude Code Signed-off-by: Yetkin Timocin * chore: address review feedback Signed-off-by: Yetkin Timocin * test: align integration substring with new error wording Signed-off-by: Yetkin Timocin * test: cover LookupLatestPolicySnapshot list-error branch Address codecov/patch shortfall on PR #667. The List-failure path in controller.LookupLatestPolicySnapshot and the corresponding wrapper in scheduler.lookupLatestPolicySnapshot were not exercised by any existing unit test. Add an interceptor-based test in each package that injects a non-StatusError on List, asserts (nil, error) and that the error wraps ErrUnexpectedBehavior (NewAPIServerError promotes non-Status errors via isUnexpectedCacheError). Signed-off-by: Yetkin Timocin --------- Signed-off-by: Yetkin Timocin Co-authored-by: Claude Code --- pkg/controllers/placement/controller.go | 32 +- pkg/controllers/updaterun/initialization.go | 26 +- .../initialization_integration_test.go | 4 +- .../updaterun/validation_integration_test.go | 2 +- pkg/scheduler/scheduler.go | 62 +-- pkg/scheduler/scheduler_test.go | 39 ++ .../schedulingpolicysnapshot/watcher.go | 27 +- pkg/utils/controller/controller.go | 13 + .../controller/policy_snapshot_resolver.go | 73 ++- .../policy_snapshot_resolver_test.go | 440 ++++++++++-------- 10 files changed, 406 insertions(+), 312 deletions(-) diff --git a/pkg/controllers/placement/controller.go b/pkg/controllers/placement/controller.go index a41d8f4c9..e6a80a899 100644 --- a/pkg/controllers/placement/controller.go +++ b/pkg/controllers/placement/controller.go @@ -507,25 +507,23 @@ func (r *Reconciler) ensureLatestPolicySnapshot(ctx context.Context, placementOb // 2 & 3 should never happen. func (r *Reconciler) lookupLatestSchedulingPolicySnapshot(ctx context.Context, placement fleetv1beta1.PlacementObj) (fleetv1beta1.PolicySnapshotObj, int, error) { placementKey := types.NamespacedName{Name: placement.GetName(), Namespace: placement.GetNamespace()} - snapshotList, err := controller.FetchLatestPolicySnapshot(ctx, r.Client, placementKey) - if err != nil { - return nil, -1, err - } placementKObj := klog.KObj(placement) - policySnapshotItems := snapshotList.GetPolicySnapshotObjs() - if len(policySnapshotItems) == 1 { - policyIndex, err := labels.ParsePolicyIndexFromLabel(policySnapshotItems[0]) - if err != nil { - klog.ErrorS(err, "Failed to parse the policy index label", "placement", placementKObj, "policySnapshot", klog.KObj(policySnapshotItems[0])) - return nil, -1, controller.NewUnexpectedBehaviorError(err) - } - return policySnapshotItems[0], policyIndex, nil - } else if len(policySnapshotItems) > 1 { - // It means there are multiple active snapshots and should never happen. - err := fmt.Errorf("there are %d active schedulingPolicySnapshots owned by placement %v", len(policySnapshotItems), placementKey) - klog.ErrorS(err, "Invalid schedulingPolicySnapshots", "placement", placementKObj) - return nil, -1, controller.NewUnexpectedBehaviorError(err) + activeSnapshot, err := controller.LookupLatestPolicySnapshot(ctx, r.Client, placementKey) + switch { + case err == nil: + policyIndex, parseErr := labels.ParsePolicyIndexFromLabel(activeSnapshot) + if parseErr != nil { + klog.ErrorS(parseErr, "Failed to parse the policy index label", "placement", placementKObj, "policySnapshot", klog.KObj(activeSnapshot)) + return nil, -1, controller.NewUnexpectedBehaviorError(parseErr) + } + return activeSnapshot, policyIndex, nil + case errors.Is(err, controller.ErrNoLatestPolicySnapshot): + // No active snapshot found; recover below by picking the largest-index snapshot. + default: + // API or invariant-violation error — propagate. + return nil, -1, err } + // When there are no active snapshots, find the one who has the largest policy index. // It should be rare only when placement controller is crashed before creating the new active snapshot. sortedList, err := r.listSortedSchedulingPolicySnapshots(ctx, placement) diff --git a/pkg/controllers/updaterun/initialization.go b/pkg/controllers/updaterun/initialization.go index c41c2174a..4eca2b1d9 100644 --- a/pkg/controllers/updaterun/initialization.go +++ b/pkg/controllers/updaterun/initialization.go @@ -125,29 +125,19 @@ func (r *Reconciler) determinePolicySnapshot( updateRunRef := klog.KObj(updateRun) // Get the latest policy snapshot. - policySnapshotList, err := controller.FetchLatestPolicySnapshot(ctx, r.Client, placementKey) + latestPolicySnapshot, err := controller.LookupLatestPolicySnapshot(ctx, r.Client, placementKey) if err != nil { - klog.ErrorS(err, "Failed to list the latest policy snapshots", "placement", placementKey, "updateRun", updateRunRef) - // err can be retried. - return nil, -1, controller.NewAPIServerError(true, err) - } - - policySnapshotObjs := policySnapshotList.GetPolicySnapshotObjs() - if len(policySnapshotObjs) != 1 { - if len(policySnapshotObjs) > 1 { - err := controller.NewUnexpectedBehaviorError(fmt.Errorf("more than one (%d in actual) latest policy snapshots associated with the placement: `%s`", len(policySnapshotObjs), placementKey)) - klog.ErrorS(err, "Failed to find the latest policy snapshot", "placement", placementKey, "updateRun", updateRunRef) - // no more retries for this error. + klog.ErrorS(err, "Failed to find the latest policy snapshot", "placement", placementKey, "updateRun", updateRunRef) + // Missing or duplicate-active snapshots are placement-state invariants the run cannot + // recover from on its own — fail validation. Anything else (e.g. List errors classified as + // ErrAPIServerError, or unexpected cache failures promoted to ErrUnexpectedBehavior) is + // propagated raw so controller-runtime retries. + if errors.Is(err, controller.ErrNoLatestPolicySnapshot) || errors.Is(err, controller.ErrMultipleActivePolicySnapshots) { return nil, -1, fmt.Errorf("%w: %s", errValidationFailed, err.Error()) } - // no latest policy snapshot found. - err := fmt.Errorf("no latest policy snapshot associated with the placement: `%s`", placementKey) - klog.ErrorS(err, "Failed to find the latest policy snapshot", "placement", placementKey, "updateRun", updateRunRef) - // again, no more retries here. - return nil, -1, fmt.Errorf("%w: %s", errValidationFailed, err.Error()) + return nil, -1, err } - latestPolicySnapshot := policySnapshotObjs[0] policySnapshotRef := klog.KObj(latestPolicySnapshot) policyIndex, foundIndex := latestPolicySnapshot.GetLabels()[placementv1beta1.PolicyIndexLabel] if !foundIndex || len(policyIndex) == 0 { diff --git a/pkg/controllers/updaterun/initialization_integration_test.go b/pkg/controllers/updaterun/initialization_integration_test.go index 0516cb9e5..f208fc67a 100644 --- a/pkg/controllers/updaterun/initialization_integration_test.go +++ b/pkg/controllers/updaterun/initialization_integration_test.go @@ -210,7 +210,7 @@ var _ = Describe("Updaterun initialization tests", func() { Expect(k8sClient.Create(ctx, updateRun)).To(Succeed()) By("Validating the initialization failed") - validateFailedInitCondition(ctx, updateRun, "no latest policy snapshot associated") + validateFailedInitCondition(ctx, updateRun, "no latest policy snapshot found") }) It("Should fail to initialize if there are multiple latest policy snapshots", func() { @@ -223,7 +223,7 @@ var _ = Describe("Updaterun initialization tests", func() { Expect(k8sClient.Create(ctx, updateRun)).To(Succeed()) By("Validating the initialization failed") - validateFailedInitCondition(ctx, updateRun, "more than one (2 in actual) latest policy snapshots associated") + validateFailedInitCondition(ctx, updateRun, "multiple active policy snapshots found") By("Deleting the second scheduling policy snapshot") Expect(k8sClient.Delete(ctx, snapshot2)).Should(Succeed()) diff --git a/pkg/controllers/updaterun/validation_integration_test.go b/pkg/controllers/updaterun/validation_integration_test.go index 11265bd3a..fe2182573 100644 --- a/pkg/controllers/updaterun/validation_integration_test.go +++ b/pkg/controllers/updaterun/validation_integration_test.go @@ -219,7 +219,7 @@ var _ = Describe("UpdateRun validation tests", func() { By("Validating the validation failed") wantStatus = generateFailedValidationStatus(updateRun, wantStatus) - validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "no latest policy snapshot associated") + validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "no latest policy snapshot found") By("Checking update run status metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateFailedMetric(placementv1beta1.StateRun, updateRun, string(failureType))) diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index 672cd0460..30cc6e84e 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -21,12 +21,10 @@ import ( "context" "errors" "fmt" - "strconv" "sync" "time" apiErrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/client-go/tools/record" @@ -347,57 +345,21 @@ func (s *Scheduler) cleanUpAllBindingsFor(ctx context.Context, placement fleetv1 } // lookupLatestPolicySnapshot returns the latest (i.e., active) policy snapshot associated with a placement. -// TODO(ryan): move this to a common lib +// +// It delegates to controller.LookupLatestPolicySnapshot so the same lookup can be reused outside the +// scheduler. The scheduler lists with a cached client; this does not have any consistency concern as +// sources will always trigger the scheduler when a new policy snapshot is created. +// +// The "no latest snapshot" case can happen transiently when a placement is newly created or its +// latest snapshot is being replaced; the scheduler will be triggered again when the situation +// corrects itself, so this is logged and surfaced as a non-unexpected error. func (s *Scheduler) lookupLatestPolicySnapshot(ctx context.Context, placement fleetv1beta1.PlacementObj) (fleetv1beta1.PolicySnapshotObj, error) { - placementRef := klog.KObj(placement) - // Prepare the list options to filter policy snapshots by the placement name and the latest snapshot label. - var listOptions []client.ListOption - labelSelector := labels.SelectorFromSet(labels.Set{ - fleetv1beta1.PlacementTrackingLabel: placement.GetName(), - fleetv1beta1.IsLatestSnapshotLabel: strconv.FormatBool(true), - }) - listOptions = append(listOptions, &client.ListOptions{LabelSelector: labelSelector}) - // Find out the latest policy snapshot associated with the placement. - var policySnapshotList fleetv1beta1.PolicySnapshotList - if placement.GetNamespace() == "" { - policySnapshotList = &fleetv1beta1.ClusterSchedulingPolicySnapshotList{} - } else { - policySnapshotList = &fleetv1beta1.SchedulingPolicySnapshotList{} - listOptions = append(listOptions, client.InNamespace(placement.GetNamespace())) - } - - // The scheduler lists with a cached client; this does not have any consistency concern as sources - // will always trigger the scheduler when a new policy snapshot is created. - if err := s.client.List(ctx, policySnapshotList, listOptions...); err != nil { - klog.ErrorS(err, "Failed to list policy snapshots of a placement", "placement", placementRef) - return nil, controller.NewAPIServerError(true, err) - } - policySnapshots := policySnapshotList.GetPolicySnapshotObjs() - switch { - case len(policySnapshots) == 0: - // There is no latest policy snapshot associated with the placement; it could happen when - // * the placement is newly created; or - // * the new policy snapshot is in the middle of being replaced. - // - // Either way, it is out of the scheduler's scope to handle such a case; the scheduler will - // be triggered again if the situation is corrected. - err := fmt.Errorf("no latest policy snapshot associated with placement") - klog.ErrorS(err, "Failed to find the latest policy snapshot, will retry", "placement", placementRef) - return nil, err - case len(policySnapshots) > 1: - // There are multiple active policy snapshots associated with the placement; normally this - // will never happen, as only one policy snapshot can be active in the sequence. - // - // Similarly, it is out of the scheduler's scope to handle such a case; the scheduler will - // report this unexpected occurrence but does not register it as a scheduler-side error. - // If (and when) the situation is corrected, the scheduler will be triggered again. - err := controller.NewUnexpectedBehaviorError(fmt.Errorf("too many active policy snapshots: %d, want 1", len(policySnapshots))) - klog.ErrorS(err, "There are multiple latest policy snapshots associated with placement", "placement", placementRef) + snapshot, err := controller.LookupLatestPolicySnapshot(ctx, s.client, types.NamespacedName{Namespace: placement.GetNamespace(), Name: placement.GetName()}) + if err != nil { + klog.ErrorS(err, "Failed to look up latest policy snapshot, will retry", "placement", klog.KObj(placement)) return nil, err - default: - // Found the one and only active policy snapshot. - return policySnapshots[0], nil } + return snapshot, nil } // addSchedulerCleanupFinalizer adds the scheduler cleanup finalizer to a placement (if it does not diff --git a/pkg/scheduler/scheduler_test.go b/pkg/scheduler/scheduler_test.go index 6a33a6943..8a6ccad45 100644 --- a/pkg/scheduler/scheduler_test.go +++ b/pkg/scheduler/scheduler_test.go @@ -18,6 +18,8 @@ package scheduler import ( "context" + "errors" + "fmt" "log" "os" "strconv" @@ -30,10 +32,13 @@ import ( "github.com/prometheus/client_golang/prometheus/testutil" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" fleetv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" hubmetrics "github.com/kubefleet-dev/kubefleet/pkg/metrics/hub" + "github.com/kubefleet-dev/kubefleet/pkg/utils/controller" ) const ( @@ -591,6 +596,40 @@ func TestLookupLatestPolicySnapshot(t *testing.T) { } } +// TestLookupLatestPolicySnapshot_ListError covers the error path: when the underlying +// controller.LookupLatestPolicySnapshot returns an error, the scheduler logs and propagates +// it without panicking on the nil snapshot. +func TestLookupLatestPolicySnapshot_ListError(t *testing.T) { + placement := &fleetv1beta1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: crpName, + Finalizers: []string{fleetv1beta1.SchedulerCleanupFinalizer}, + }, + } + listErr := fmt.Errorf("simulated cache failure") + fakeClient := interceptor.NewClient( + fake.NewClientBuilder().WithScheme(scheme.Scheme).WithObjects(placement).Build(), + interceptor.Funcs{ + List: func(_ context.Context, _ client.WithWatch, _ client.ObjectList, _ ...client.ListOption) error { + return listErr + }, + }, + ) + + s := &Scheduler{client: fakeClient, uncachedReader: fakeClient} + + got, err := s.lookupLatestPolicySnapshot(context.Background(), placement) + if err == nil { + t.Fatalf("lookupLatestPolicySnapshot() = %v, nil; want non-nil error", got) + } + if got != nil { + t.Errorf("lookupLatestPolicySnapshot() returned snapshot %v, want nil on error", got) + } + if !errors.Is(err, controller.ErrUnexpectedBehavior) { + t.Errorf("lookupLatestPolicySnapshot() error = %v, want wrapping ErrUnexpectedBehavior", err) + } +} + func TestObserveSchedulingCycleMetrics(t *testing.T) { metricMetadata := ` # HELP scheduling_cycle_duration_milliseconds The duration of a scheduling cycle run in milliseconds diff --git a/pkg/scheduler/watchers/schedulingpolicysnapshot/watcher.go b/pkg/scheduler/watchers/schedulingpolicysnapshot/watcher.go index 79d9e5a96..54a453b7e 100644 --- a/pkg/scheduler/watchers/schedulingpolicysnapshot/watcher.go +++ b/pkg/scheduler/watchers/schedulingpolicysnapshot/watcher.go @@ -74,32 +74,15 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu return ctrl.Result{}, nil } - // Verify if the policy snapshot is currently active. - // TODO: create a lib to check if the policy snapshot is latest. - isLatestVal, ok := policySnapshot.GetLabels()[fleetv1beta1.IsLatestSnapshotLabel] - if !ok { - // The IsLatestSnapshot label is not present; normally this should never occur. - klog.ErrorS(controller.NewUnexpectedBehaviorError(fmt.Errorf("IsLatestSnapshotLabel is missing")), - "IsLatestSnapshot label is not present", - "policySnapshot", policySnapshotRef) - // This is not a situation that the controller can recover by itself. Should the label - // value be corrected, the controller will be triggered again. - return ctrl.Result{}, nil - } - isLatest, err := strconv.ParseBool(isLatestVal) + // Verify if the policy snapshot is currently active. A missing or malformed IsLatestSnapshot + // label is treated as "not latest" — the controller cannot recover from those states by + // itself; the placement controller will repair the label and re-trigger reconciliation. + isLatest, err := controller.IsLatestPolicySnapshot(policySnapshot) if err != nil { - // The label value is not a correctly formatted boolean value; normally this should never - // occur. - klog.ErrorS(controller.NewUnexpectedBehaviorError(err), - "Failed to parse IsLatestSnapshot value", - "policySnapshot", policySnapshotRef) - // This is not an error that the controller can recover by itself; ignore the error. - // Should the label value be corrected, the controller will be triggered again. + klog.ErrorS(err, "Failed to determine whether policy snapshot is latest", "policySnapshot", policySnapshotRef) return ctrl.Result{}, nil } if !isLatest { - // The policy snapshot is not currently active, i.e., it is not the latest policy snapshot; - // ignore it. return ctrl.Result{}, nil } diff --git a/pkg/utils/controller/controller.go b/pkg/utils/controller/controller.go index 4dbeb1cd0..22bd64a56 100644 --- a/pkg/utils/controller/controller.go +++ b/pkg/utils/controller/controller.go @@ -63,6 +63,19 @@ var ( // ErrUserError indicates the error is caused by the user and customer needs to take the action. ErrUserError = errors.New("failed to process the request due to a client error") + + // ErrNoLatestPolicySnapshot indicates that no active (latest) policy snapshot was found for a + // placement. This is a transient condition during placement creation or rotation and is typically + // recoverable by the next reconcile; callers branch on it via errors.Is to distinguish it from + // real API or invariant-violation errors. + ErrNoLatestPolicySnapshot = errors.New("no latest policy snapshot found") + + // ErrMultipleActivePolicySnapshots indicates that more than one policy snapshot for the same + // placement carries IsLatestSnapshot=true, violating the "exactly one active snapshot" + // invariant. Callers branch on it via errors.Is so they can fail fast on a real invariant + // violation without misclassifying transient cache errors that share the ErrUnexpectedBehavior + // category. + ErrMultipleActivePolicySnapshots = errors.New("multiple active policy snapshots found") ) // NewUnexpectedBehaviorError returns ErrUnexpectedBehavior type error when err is not nil. diff --git a/pkg/utils/controller/policy_snapshot_resolver.go b/pkg/utils/controller/policy_snapshot_resolver.go index 29bb13f04..722a65a16 100644 --- a/pkg/utils/controller/policy_snapshot_resolver.go +++ b/pkg/utils/controller/policy_snapshot_resolver.go @@ -110,34 +110,69 @@ func BuildPolicySnapshot(placementObj fleetv1beta1.PlacementObj, policySnapshotI return snapshot } -// FetchLatestPolicySnapshot fetches the latest policy snapshot for a given placement. -// For cluster-scoped placements, it fetches ClusterSchedulingPolicySnapshot. -// For namespaced placements, it fetches SchedulingPolicySnapshot. -func FetchLatestPolicySnapshot(ctx context.Context, k8Client client.Reader, placementKey types.NamespacedName) (fleetv1beta1.PolicySnapshotList, error) { - namespace := placementKey.Namespace - name := placementKey.Name +// IsLatestPolicySnapshot reports whether the given policy snapshot carries the IsLatestSnapshot +// label set to "true". A non-nil error is returned if the label is missing or its value cannot be +// parsed as a bool; in those cases the boolean result is false. +// +// Callers typically log the error and treat the snapshot as non-latest, since the controller cannot +// recover from a malformed label by itself — the placement controller will repair it and re-trigger +// reconciliation. +func IsLatestPolicySnapshot(snapshot fleetv1beta1.PolicySnapshotObj) (bool, error) { + val, ok := snapshot.GetLabels()[fleetv1beta1.IsLatestSnapshotLabel] + if !ok { + return false, NewUnexpectedBehaviorError(fmt.Errorf("%s label is missing", fleetv1beta1.IsLatestSnapshotLabel)) + } + isLatest, err := strconv.ParseBool(val) + if err != nil { + return false, NewUnexpectedBehaviorError(fmt.Errorf("failed to parse %s label value %q: %w", fleetv1beta1.IsLatestSnapshotLabel, val, err)) + } + return isLatest, nil +} +// LookupLatestPolicySnapshot returns the single active (latest) policy snapshot associated with the +// placement identified by placementKey, enforcing the "exactly one latest snapshot" invariant. +// A namespaced placementKey selects SchedulingPolicySnapshot; a cluster-scoped key (empty +// namespace) selects ClusterSchedulingPolicySnapshot. +// +// Returns: +// - (snapshot, nil) when exactly one latest snapshot exists. +// - (nil, ErrNoLatestPolicySnapshot-wrapped error) when none exist. This is a transient state for +// newly-created placements or in-progress rotations; callers branch on it via errors.Is. +// - (nil, ErrMultipleActivePolicySnapshots-wrapped error) when more than one exists. +// - (nil, ErrAPIServerError-wrapped error) when the List call fails. The List error is also +// promoted to ErrUnexpectedBehavior for unexpected cache failures; callers must NOT collapse +// ErrUnexpectedBehavior with the dedicated invariant sentinels above when classifying retries. +func LookupLatestPolicySnapshot(ctx context.Context, k8Client client.Reader, placementKey types.NamespacedName) (fleetv1beta1.PolicySnapshotObj, error) { var policySnapshotList fleetv1beta1.PolicySnapshotList - var listOptions []client.ListOption - listOptions = append(listOptions, client.MatchingLabels{ - fleetv1beta1.PlacementTrackingLabel: name, - fleetv1beta1.IsLatestSnapshotLabel: strconv.FormatBool(true), - }) - - if namespace != "" { - // This is a namespaced SchedulingPolicySnapshotList + listOptions := []client.ListOption{ + client.MatchingLabels{ + fleetv1beta1.PlacementTrackingLabel: placementKey.Name, + fleetv1beta1.IsLatestSnapshotLabel: strconv.FormatBool(true), + }, + } + if placementKey.Namespace != "" { policySnapshotList = &fleetv1beta1.SchedulingPolicySnapshotList{} - listOptions = append(listOptions, client.InNamespace(namespace)) + listOptions = append(listOptions, client.InNamespace(placementKey.Namespace)) } else { - // This is a cluster-scoped ClusterSchedulingPolicySnapshotList policySnapshotList = &fleetv1beta1.ClusterSchedulingPolicySnapshotList{} } - if err := k8Client.List(ctx, policySnapshotList, listOptions...); err != nil { klog.ErrorS(err, "Failed to list the policySnapshots associated with the placement", "placement", placementKey) - return nil, err + return nil, NewAPIServerError(true, err) + } + + policySnapshots := policySnapshotList.GetPolicySnapshotObjs() + switch len(policySnapshots) { + case 0: + return nil, fmt.Errorf("%w for placement %s", ErrNoLatestPolicySnapshot, placementKey) + case 1: + return policySnapshots[0], nil + default: + // Wrap both the specific invariant sentinel and the categorical ErrUnexpectedBehavior so + // callers that branch on either continue to work; the specific sentinel lets retry-gate + // callers distinguish this invariant violation from transient ErrUnexpectedBehavior errors. + return nil, fmt.Errorf("%w: %w for placement %s: got %d, want 1", ErrUnexpectedBehavior, ErrMultipleActivePolicySnapshots, placementKey, len(policySnapshots)) } - return policySnapshotList, nil } // ListPolicySnapshots lists all policy snapshots associated with a placement key. diff --git a/pkg/utils/controller/policy_snapshot_resolver_test.go b/pkg/utils/controller/policy_snapshot_resolver_test.go index c58f07ac1..777ccb759 100644 --- a/pkg/utils/controller/policy_snapshot_resolver_test.go +++ b/pkg/utils/controller/policy_snapshot_resolver_test.go @@ -18,6 +18,8 @@ package controller import ( "context" + "errors" + "fmt" "testing" "github.com/google/go-cmp/cmp" @@ -28,6 +30,7 @@ import ( "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" fleetv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" ) @@ -489,7 +492,7 @@ func TestBuildPolicySnapshot(t *testing.T) { } } -func TestFetchLatestPolicySnapshot(t *testing.T) { +func TestListPolicySnapshots(t *testing.T) { scheme := runtime.NewScheme() if err := fleetv1beta1.AddToScheme(scheme); err != nil { t.Fatalf("Failed to add to scheme: %v", err) @@ -502,7 +505,7 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { expectedSnapshots int }{ { - name: "fetch latest cluster scoped policy snapshots", + name: "list cluster scoped policy snapshots", placementKey: types.NamespacedName{ Name: placementName, }, @@ -512,7 +515,6 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { Name: "test-placement-0", Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "true", }, }, }, @@ -521,15 +523,22 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { Name: "test-placement-1", Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "false", + }, + }, + }, + &fleetv1beta1.ClusterSchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: "other-placement-0", + Labels: map[string]string{ + fleetv1beta1.PlacementTrackingLabel: "other-placement", }, }, }, }, - expectedSnapshots: 1, + expectedSnapshots: 2, }, { - name: "fetch latest cluster scoped policy snapshots - mixed setup", + name: "list cluster scoped policy snapshots - mixed setup", placementKey: types.NamespacedName{ Name: placementName, }, @@ -539,7 +548,6 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { Name: "test-placement-0", Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "true", }, }, }, @@ -548,7 +556,22 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { Name: "test-placement-1", Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "false", + }, + }, + }, + &fleetv1beta1.ClusterSchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-placement-2", + Labels: map[string]string{ + fleetv1beta1.PlacementTrackingLabel: placementName, + }, + }, + }, + &fleetv1beta1.ClusterSchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: "other-placement-0", + Labels: map[string]string{ + fleetv1beta1.PlacementTrackingLabel: "other-placement", }, }, }, @@ -559,15 +582,23 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { Namespace: policySnapshotNamespace, Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "true", + }, + }, + }, + &fleetv1beta1.SchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-placement-1", + Namespace: policySnapshotNamespace, + Labels: map[string]string{ + fleetv1beta1.PlacementTrackingLabel: placementName, }, }, }, }, - expectedSnapshots: 1, // Should only return cluster-scoped ones for cluster-scoped placement + expectedSnapshots: 3, // Should only return cluster-scoped ones for cluster-scoped placement }, { - name: "fetch latest namespaced policy snapshots", + name: "list namespaced policy snapshots", placementKey: types.NamespacedName{ Name: placementName, Namespace: policySnapshotNamespace, @@ -579,7 +610,6 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { Namespace: policySnapshotNamespace, Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "true", }, }, }, @@ -589,15 +619,23 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { Namespace: policySnapshotNamespace, Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "false", + }, + }, + }, + &fleetv1beta1.SchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: "other-placement-0", + Namespace: "other-namespace", + Labels: map[string]string{ + fleetv1beta1.PlacementTrackingLabel: "other-placement", }, }, }, }, - expectedSnapshots: 1, + expectedSnapshots: 2, }, { - name: "fetch latest namespaced policy snapshots - mixed setup", + name: "list namespaced policy snapshots - mixed setup", placementKey: types.NamespacedName{ Name: placementName, Namespace: policySnapshotNamespace, @@ -609,7 +647,6 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { Namespace: policySnapshotNamespace, Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "true", }, }, }, @@ -619,7 +656,15 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { Namespace: policySnapshotNamespace, Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "false", + }, + }, + }, + &fleetv1beta1.SchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: "other-placement-0", + Namespace: "other-namespace", + Labels: map[string]string{ + fleetv1beta1.PlacementTrackingLabel: "other-placement", }, }, }, @@ -629,25 +674,23 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { Name: "test-placement-0", Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "true", }, }, }, }, - expectedSnapshots: 1, // Should only return namespaced ones for namespaced placement + expectedSnapshots: 2, // Should only return namespaced ones for namespaced placement }, { - name: "no latest policy snapshots found", + name: "no policy snapshots found", placementKey: types.NamespacedName{ Name: placementName, }, existingSnapshots: []client.Object{ &fleetv1beta1.ClusterSchedulingPolicySnapshot{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-0", + Name: "other-placement-0", Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "false", + fleetv1beta1.PlacementTrackingLabel: "other-placement", }, }, }, @@ -664,9 +707,9 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { WithObjects(tc.existingSnapshots...). Build() - snapshots, err := FetchLatestPolicySnapshot(ctx, fakeClient, tc.placementKey) + snapshots, err := ListPolicySnapshots(ctx, fakeClient, tc.placementKey) if err != nil { - t.Fatalf("FetchLatestPolicySnapshot() = %v, want nil", err) + t.Fatalf("ListPolicySnapshots() = %v, want nil", err) } if len(snapshots.GetPolicySnapshotObjs()) != tc.expectedSnapshots { @@ -676,210 +719,204 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { } } -func TestListPolicySnapshots(t *testing.T) { +func TestIsLatestPolicySnapshot(t *testing.T) { + snapshotWithLabels := func(labels map[string]string) fleetv1beta1.PolicySnapshotObj { + return &fleetv1beta1.ClusterSchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: policySnapshotName, + Labels: labels, + }, + } + } + + testCases := []struct { + name string + snapshot fleetv1beta1.PolicySnapshotObj + want bool + wantErr bool + }{ + { + name: "label true returns true", + snapshot: snapshotWithLabels(map[string]string{fleetv1beta1.IsLatestSnapshotLabel: "true"}), + want: true, + wantErr: false, + }, + { + name: "label false returns false", + snapshot: snapshotWithLabels(map[string]string{fleetv1beta1.IsLatestSnapshotLabel: "false"}), + want: false, + wantErr: false, + }, + { + name: "label missing returns false with unexpected-behavior error", + snapshot: snapshotWithLabels(map[string]string{fleetv1beta1.PlacementTrackingLabel: placementName}), + want: false, + wantErr: true, + }, + { + name: "nil labels returns false with unexpected-behavior error", + snapshot: snapshotWithLabels(nil), + want: false, + wantErr: true, + }, + { + name: "label malformed returns false with unexpected-behavior error", + snapshot: snapshotWithLabels(map[string]string{fleetv1beta1.IsLatestSnapshotLabel: "yes"}), + want: false, + wantErr: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got, err := IsLatestPolicySnapshot(tc.snapshot) + if (err != nil) != tc.wantErr { + t.Errorf("IsLatestPolicySnapshot() error = %v, wantErr %v", err, tc.wantErr) + } + if err != nil && !errors.Is(err, ErrUnexpectedBehavior) { + t.Errorf("IsLatestPolicySnapshot() error = %v, want wrapping ErrUnexpectedBehavior", err) + } + if got != tc.want { + t.Errorf("IsLatestPolicySnapshot() = %v, want %v", got, tc.want) + } + }) + } +} + +func TestLookupLatestPolicySnapshot(t *testing.T) { scheme := runtime.NewScheme() if err := fleetv1beta1.AddToScheme(scheme); err != nil { t.Fatalf("Failed to add to scheme: %v", err) } + clusterKey := types.NamespacedName{Name: placementName} + namespacedKey := types.NamespacedName{Name: placementName, Namespace: policySnapshotNamespace} + + clusterLatestSnapshot := func(name string) *fleetv1beta1.ClusterSchedulingPolicySnapshot { + return &fleetv1beta1.ClusterSchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{ + fleetv1beta1.PlacementTrackingLabel: placementName, + fleetv1beta1.IsLatestSnapshotLabel: "true", + }, + }, + } + } + namespacedLatestSnapshot := func(name string) *fleetv1beta1.SchedulingPolicySnapshot { + return &fleetv1beta1.SchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: policySnapshotNamespace, + Labels: map[string]string{ + fleetv1beta1.PlacementTrackingLabel: placementName, + fleetv1beta1.IsLatestSnapshotLabel: "true", + }, + }, + } + } + testCases := []struct { name string placementKey types.NamespacedName existingSnapshots []client.Object - expectedSnapshots int + wantName string + wantErrSentinel error // nil means a snapshot is expected; otherwise the error must wrap this sentinel. }{ { - name: "list cluster scoped policy snapshots", - placementKey: types.NamespacedName{ - Name: placementName, - }, + name: "exactly one cluster-scoped latest snapshot", + placementKey: clusterKey, + existingSnapshots: []client.Object{clusterLatestSnapshot("test-placement-0")}, + wantName: "test-placement-0", + }, + { + name: "exactly one namespaced latest snapshot", + placementKey: namespacedKey, + existingSnapshots: []client.Object{namespacedLatestSnapshot("test-placement-0")}, + wantName: "test-placement-0", + }, + { + name: "no latest snapshot returns ErrNoLatestPolicySnapshot", + placementKey: clusterKey, + existingSnapshots: nil, + wantErrSentinel: ErrNoLatestPolicySnapshot, + }, + { + name: "only non-latest snapshots present returns ErrNoLatestPolicySnapshot", + placementKey: clusterKey, existingSnapshots: []client.Object{ &fleetv1beta1.ClusterSchedulingPolicySnapshot{ ObjectMeta: metav1.ObjectMeta{ Name: "test-placement-0", Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, - &fleetv1beta1.ClusterSchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-1", - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, - &fleetv1beta1.ClusterSchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "other-placement-0", - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: "other-placement", + fleetv1beta1.IsLatestSnapshotLabel: "false", }, }, }, }, - expectedSnapshots: 2, + wantErrSentinel: ErrNoLatestPolicySnapshot, }, { - name: "list cluster scoped policy snapshots - mixed setup", - placementKey: types.NamespacedName{ - Name: placementName, - }, + name: "multiple latest snapshots returns ErrMultipleActivePolicySnapshots", + placementKey: clusterKey, existingSnapshots: []client.Object{ - &fleetv1beta1.ClusterSchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-0", - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, - &fleetv1beta1.ClusterSchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-1", - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, - &fleetv1beta1.ClusterSchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-2", - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, - &fleetv1beta1.ClusterSchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "other-placement-0", - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: "other-placement", - }, - }, - }, - // Add namespaced snapshots to test mixed scenario - &fleetv1beta1.SchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-0", - Namespace: policySnapshotNamespace, - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, - &fleetv1beta1.SchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-1", - Namespace: policySnapshotNamespace, - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, + clusterLatestSnapshot("test-placement-0"), + clusterLatestSnapshot("test-placement-1"), }, - expectedSnapshots: 3, // Should only return cluster-scoped ones for cluster-scoped placement + wantErrSentinel: ErrMultipleActivePolicySnapshots, }, { - name: "list namespaced policy snapshots", - placementKey: types.NamespacedName{ - Name: placementName, - Namespace: policySnapshotNamespace, - }, + name: "ignores snapshots from other placements", + placementKey: clusterKey, existingSnapshots: []client.Object{ - &fleetv1beta1.SchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-0", - Namespace: policySnapshotNamespace, - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, - &fleetv1beta1.SchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-1", - Namespace: policySnapshotNamespace, - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, - &fleetv1beta1.SchedulingPolicySnapshot{ + clusterLatestSnapshot("test-placement-0"), + &fleetv1beta1.ClusterSchedulingPolicySnapshot{ ObjectMeta: metav1.ObjectMeta{ - Name: "other-placement-0", - Namespace: "other-namespace", + Name: "other-placement-0", Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: "other-placement", + fleetv1beta1.IsLatestSnapshotLabel: "true", }, }, }, }, - expectedSnapshots: 2, + wantName: "test-placement-0", }, { - name: "list namespaced policy snapshots - mixed setup", - placementKey: types.NamespacedName{ - Name: placementName, - Namespace: policySnapshotNamespace, - }, + name: "ignores non-latest snapshots", + placementKey: clusterKey, existingSnapshots: []client.Object{ - &fleetv1beta1.SchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-0", - Namespace: policySnapshotNamespace, - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, - &fleetv1beta1.SchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-1", - Namespace: policySnapshotNamespace, - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, - &fleetv1beta1.SchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "other-placement-0", - Namespace: "other-namespace", - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: "other-placement", - }, - }, - }, - // Add cluster-scoped snapshots to test mixed scenario + clusterLatestSnapshot("test-placement-1"), &fleetv1beta1.ClusterSchedulingPolicySnapshot{ ObjectMeta: metav1.ObjectMeta{ Name: "test-placement-0", Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, + fleetv1beta1.IsLatestSnapshotLabel: "false", }, }, }, }, - expectedSnapshots: 2, // Should only return namespaced ones for namespaced placement + wantName: "test-placement-1", }, { - name: "no policy snapshots found", - placementKey: types.NamespacedName{ - Name: placementName, + name: "cluster-scoped lookup ignores a same-named latest namespaced snapshot", + placementKey: clusterKey, + existingSnapshots: []client.Object{ + clusterLatestSnapshot("test-placement-0"), + namespacedLatestSnapshot("test-placement-0"), }, + wantName: "test-placement-0", + }, + { + name: "namespaced lookup ignores a same-named latest cluster snapshot", + placementKey: namespacedKey, existingSnapshots: []client.Object{ - &fleetv1beta1.ClusterSchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "other-placement-0", - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: "other-placement", - }, - }, - }, + namespacedLatestSnapshot("test-placement-0"), + clusterLatestSnapshot("test-placement-0"), }, - expectedSnapshots: 0, + wantName: "test-placement-0", }, } @@ -891,14 +928,51 @@ func TestListPolicySnapshots(t *testing.T) { WithObjects(tc.existingSnapshots...). Build() - snapshots, err := ListPolicySnapshots(ctx, fakeClient, tc.placementKey) - if err != nil { - t.Fatalf("ListPolicySnapshots() = %v, want nil", err) + got, err := LookupLatestPolicySnapshot(ctx, fakeClient, tc.placementKey) + if (err != nil) != (tc.wantErrSentinel != nil) { + t.Fatalf("LookupLatestPolicySnapshot() error = %v, wantErrSentinel %v", err, tc.wantErrSentinel) } - - if len(snapshots.GetPolicySnapshotObjs()) != tc.expectedSnapshots { - t.Errorf("Expected %d snapshots, got %d", tc.expectedSnapshots, len(snapshots.GetPolicySnapshotObjs())) + if tc.wantErrSentinel != nil { + if !errors.Is(err, tc.wantErrSentinel) { + t.Errorf("LookupLatestPolicySnapshot() error = %v, want wrapping %v", err, tc.wantErrSentinel) + } + return + } + if got == nil || got.GetName() != tc.wantName { + t.Errorf("LookupLatestPolicySnapshot() returned snapshot %v, want name %q", got, tc.wantName) } }) } } + +// TestLookupLatestPolicySnapshot_ListError covers the API-error path: when the underlying List +// call fails with an error that isUnexpectedCacheError treats as unexpected (i.e. not a +// *apierrors.StatusError, context.Canceled, or context.DeadlineExceeded), NewAPIServerError +// promotes it to ErrUnexpectedBehavior. +func TestLookupLatestPolicySnapshot_ListError(t *testing.T) { + scheme := runtime.NewScheme() + if err := fleetv1beta1.AddToScheme(scheme); err != nil { + t.Fatalf("Failed to add to scheme: %v", err) + } + + listErr := fmt.Errorf("simulated cache failure") + fakeClient := interceptor.NewClient( + fake.NewClientBuilder().WithScheme(scheme).Build(), + interceptor.Funcs{ + List: func(_ context.Context, _ client.WithWatch, _ client.ObjectList, _ ...client.ListOption) error { + return listErr + }, + }, + ) + + got, err := LookupLatestPolicySnapshot(context.Background(), fakeClient, types.NamespacedName{Name: placementName}) + if err == nil { + t.Fatalf("LookupLatestPolicySnapshot() = %v, nil; want non-nil error", got) + } + if got != nil { + t.Errorf("LookupLatestPolicySnapshot() returned snapshot %v, want nil", got) + } + if !errors.Is(err, ErrUnexpectedBehavior) { + t.Errorf("LookupLatestPolicySnapshot() error = %v, want wrapping ErrUnexpectedBehavior", err) + } +} From 444e487c14e238c78525b8d8c0a231480d2f9d57 Mon Sep 17 00:00:00 2001 From: michaelawyu Date: Mon, 11 May 2026 15:52:36 +1000 Subject: [PATCH 10/19] feat: add new error handling utilities (#679) --- pkg/utils/errors/errors.go | 240 +++++++++++++ pkg/utils/errors/errors_test.go | 579 ++++++++++++++++++++++++++++++++ 2 files changed, 819 insertions(+) create mode 100644 pkg/utils/errors/errors.go create mode 100644 pkg/utils/errors/errors_test.go diff --git a/pkg/utils/errors/errors.go b/pkg/utils/errors/errors.go new file mode 100644 index 000000000..3f04059e5 --- /dev/null +++ b/pkg/utils/errors/errors.go @@ -0,0 +1,240 @@ +/* +Copyright 2026 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// TO-DO: apply the new error pattern incrementally when it is ready to do so. + +// errors features custom error categorizing, wrapping and string formatting logic +// used in the KubeFleet project. +package errors + +import ( + "errors" + "fmt" + "runtime" + + apierrors "k8s.io/apimachinery/pkg/api/errors" +) + +type ErrCategory string + +const ( + // An error is considered of the unexpected error category when a KubeFleet controller + // encounters a situation that it does not know how to handle and cannot be resolved + // by itself or by user fixing their inputs. + ErrCategoryUnexpected ErrCategory = "unexpected" + + // An error is considered of the transient error category when a KubeFleet controller + // encounters a situation that would self-resolve soon, e.g., cache not sync'd yet. + ErrCategoryTransient ErrCategory = "transient" + + // An error is considered of the API server error category when it arises during a call + // to the Kubernetes API server. + ErrCategoryAPIServer ErrCategory = "APIserver" + + // An error is considered of the user error category when it arises due to some + // incorrect input or action from the user side and can resolve when the + // user fixes their inputs. + ErrCategoryUser ErrCategory = "user" + + // An error is considered uncategorized when it does not fit into any of the other + // predefined error categories or no category has been explicitly assigned. + ErrCategoryUncategorized ErrCategory = "uncategorized" +) + +var _ error = &Error{} + +type Error struct { + // category is the category of the error. + category ErrCategory + + // wrapped is an error that this error wraps. + wrapped error + + // desc is the additional description about the wrapped error. + desc string + + // attrs are the additional attributes associated with the wrapped error. + attrs []interface{} +} + +func (e *Error) categoryWithDefault() ErrCategory { + if len(e.category) == 0 { + return ErrCategoryUncategorized + } + return e.category +} + +// Error implements the error interface. +// +// Note the output intentionally does not include the additional attributes, so as to keep the cardinality +// low; to print them out, pass them explicitly to the klog functions. +func (e *Error) Error() string { + switch { + case e.wrapped != nil && len(e.desc) == 0: + // With wrapped error but no description. + return e.wrapped.Error() + case e.wrapped != nil: + // With wrapped error, with description. + return fmt.Sprintf("%s: %s", e.desc, e.wrapped.Error()) + case e.wrapped == nil && len(e.desc) > 0: + // No wrapped error but with description. + return e.desc + default: + // No wrapped error and no description. + return "an unknown error occurred" + } +} + +// Unwrap returns the wrapped error, so that errors.Is and errors.As can traverse the error chain as +// expected. +func (e *Error) Unwrap() error { + return e.wrapped +} + +// Wraps wraps an existing error with a higher-level description and optional key-value attributes, +// and returns a new *Error. If the existing error (or its children) is already an *Error, +// Wraps will surface the category and the attributes to the top-level. +func Wraps(err error, desc string, kvs ...interface{}) *Error { + // Find if any of error down in the chain is already an *Error; if so, use its category and surface its + // attributes to the current level. + var category ErrCategory + var mergedKVs []interface{} + var childError *Error + if errors.As(err, &childError) { + category = childError.category + mergedKVs = append(mergedKVs, childError.attrs...) + } + mergedKVs = append(mergedKVs, kvs...) + if len(category) == 0 { + category = ErrCategoryUncategorized + } + + return &Error{ + category: category, + wrapped: err, + desc: desc, + attrs: mergedKVs, + } +} + +// Args returns the k-v attributes associated with an *Error (and its children). +// +// The function is designed to work with klog calls; to use it, call klog.ErrorS with: +// +// klog.ErrorS(err, "additional top-level error description", Args(err)...). +// +// If you need to add additional key-value attributes at the top level in addition to those +// already captured from the error chain, append them to Args as follows: +// +// klog.ErrorS(err, "additional top-level error description", append(Args(err), "k", "v")...). +func Args(err error, kvs ...interface{}) []interface{} { + var compositeKVs []interface{} + var childError *Error + if errors.As(err, &childError) { + // Make sure that the error category is always included as the first kv pair. + compositeKVs = append(compositeKVs, "errCategory", childError.categoryWithDefault()) + compositeKVs = append(compositeKVs, childError.attrs...) + compositeKVs = append(compositeKVs, kvs...) + } + return compositeKVs +} + +// CallerFrameOverview is a simple struct that summarizes a single call frame. +type CallerFrameOverview struct { + Function string + File string + Line int +} + +// captureCallers captures the current call stack and returns a slice of CallerFrameOverview +// summarizing each frame in the stack, skipping the top 3 frames for simplicity. +func captureCallers() []CallerFrameOverview { + pcs := make([]uintptr, 32) + // Skip 3 frames (the runtime.Callers function itself, this utility function, and the caller of this utility) + // when capturing the call stack for simplicity reasons. + n := runtime.Callers(3, pcs) + frames := runtime.CallersFrames(pcs[:n]) + var callerFrames []CallerFrameOverview + for { + frame, more := frames.Next() + callerFrames = append(callerFrames, CallerFrameOverview{ + Function: frame.Function, + File: frame.File, + Line: frame.Line, + }) + if !more { + break + } + } + return callerFrames +} + +// NewUnexpectedError creates a new Error categorized as ErrCategoryUnexpected. It is supposed to be called +// at the lowest level possible, where the error category can be determined. The error can then be returned +// and wrapped (w/ additional attributes if applicable). +func NewUnexpectedError(err error, desc string, kvs ...interface{}) *Error { + // For unexpected errors, collect the caller frames. + // + // Note (chenyu1): previously for unexpected errors, we use the debug.Stack() to print out a full stack + // trace in the logs. Effective as it is, the operation is expensive and it always captures the full stack, + // where the top of the stack is always the error handling utility function/method itself rather than + // the actual caller site where the error originates. Furthermore, as debug.Stack() returns a multi-line string, + // often the logging backend cannot render it properly, making it difficult to read/summarize. To address this, + // in this new implementation we switch to runtime.Callers and runtime.CallersFrames to capture the call stack + // in a structured way with the unrelated stacks skipped. + frames := captureCallers() + kvs = append(kvs, "callers", frames) + + newErr := Wraps(err, desc, kvs...) + newErr.category = ErrCategoryUnexpected + return newErr +} + +// NewTransientError creates a new Error categorized as ErrCategoryTransient. It is supposed to be called +// at the lowest level possible, where the error category can be determined. The error can then be returned +// and wrapped (w/ additional attributes if applicable). +func NewTransientError(err error, desc string, kvs ...interface{}) *Error { + newErr := Wraps(err, desc, kvs...) + newErr.category = ErrCategoryTransient + return newErr +} + +// NewUserError creates a new Error categorized as ErrCategoryUser. It is supposed to be called +// at the lowest level possible, where the error category can be determined. The error can then be returned +// and wrapped (w/ additional attributes if applicable). +func NewUserError(err error, desc string, kvs ...interface{}) *Error { + newErr := Wraps(err, desc, kvs...) + newErr.category = ErrCategoryUser + return newErr +} + +// NewAPIServerError creates a new Error categorized as ErrCategoryAPIServer. It is supposed to be called +// at the lowest level possible, where the error category can be determined. The error can then be returned +// and wrapped (w/ additional attributes if applicable). +func NewAPIServerError(err error, desc string, cached bool, kvs ...interface{}) *Error { + // For API server errors, collect caching and HTTP response related information. + kvs = append(kvs, "cached", cached) + kvs = append(kvs, "APIServerErrReason", apierrors.ReasonForError(err)) + + apiErr, ok := err.(apierrors.APIStatus) + if ok { + kvs = append(kvs, "APIServerResHTTPCode", apiErr.Status().Code) + } + + newErr := Wraps(err, desc, kvs...) + newErr.category = ErrCategoryAPIServer + return newErr +} diff --git a/pkg/utils/errors/errors_test.go b/pkg/utils/errors/errors_test.go new file mode 100644 index 000000000..95d406477 --- /dev/null +++ b/pkg/utils/errors/errors_test.go @@ -0,0 +1,579 @@ +/* +Copyright 2026 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package errors + +import ( + "bytes" + "fmt" + "io" + "log/slog" + "strings" + "testing" + + "github.com/go-logr/logr" + "github.com/google/go-cmp/cmp" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/klog/v2" +) + +var ( + // wrappedErrComparer is a cmp option that compares the .wrapped field by pointer equality, + // avoiding the need to recurse into unexported fields of standard-library error types. + wrappedErrComparer = cmp.FilterPath( + func(p cmp.Path) bool { return p.Last().String() == ".wrapped" }, + cmp.Comparer(func(x, y error) bool { return x == y }), //nolint:errorlint + ) +) + +// TestCommonUsePatterns demonstrates how the common error handling patterns are enabled by the errors package. +func TestCommonUsePatterns(t *testing.T) { + var bytesBuf bytes.Buffer + slogHandler := slog.NewTextHandler(&bytesBuf, &slog.HandlerOptions{Level: slog.LevelDebug}) + logger := logr.FromSlogHandler(slogHandler) + klog.SetLogger(logger) + + // Example 1: handling API server errors with rich context and structured logging. + rawAPIServerErr := apierrors.NewAlreadyExists(schema.GroupResource{Group: "", Resource: "namespaces"}, "work") + categorizedAPIServerErr := NewAPIServerError(rawAPIServerErr, "failed to create namespace", false, "k1", "v1") + wrappedErr := Wraps(categorizedAPIServerErr, "additional high-level error description", "k2", "v2", "k3", "v3") + klog.ErrorS(wrappedErr, "additional top/controller-level error description", Args(wrappedErr)...) + // If there needs to be kv attributes at the top level as well: + // klog.ErrorS(wrappedErr, "additional top/controller-level error description", append(Args(wrappedErr), "k4", "v4", "k5", "v5")...) + klog.Flush() + + outputStr := readFromBuffer(t, &bytesBuf) + + // The full error output looks like the follows: + // errors_test.go:53: time=2026-04-29T02:39:44.853+10:00 level=ERROR msg="additional top/controller-level error description" err="additional high-level error description: failed to create namespace: namespaces \"work\" already exists" errCategory=APIserver k1=v1 cached=false APIServerErrReason=AlreadyExists APIServerResHTTPCode=409 k2=v2 k3=v3 + wantSubStrings := []string{ + "msg=\"additional top/controller-level error description\"", + "err=\"additional high-level error description: failed to create namespace: namespaces \\\"work\\\" already exists\"", + "errCategory=APIserver", + "k1=v1", + "k2=v2", + "k3=v3", + "cached=false", + "APIServerErrReason=AlreadyExists", + "APIServerResHTTPCode=409", + } + for _, subStr := range wantSubStrings { + if strings.Contains(outputStr, subStr) == false { + t.Errorf("got %s\nwant to contain %s\n", outputStr, subStr) + } + } + + // Example 2: handling unexpected errors with rich context and structured logging. + rawUtilityCallErr := fmt.Errorf("cannot calculate resource hash") + categorizedUnexpectedErr := NewUnexpectedError(rawUtilityCallErr, "additional low-level error description", "k1", "v1") + wrappedUnexpectedErr := Wraps(categorizedUnexpectedErr, "additional high-level error description", "k2", "v2") + klog.ErrorS(wrappedUnexpectedErr, "additional top/controller-level error description", Args(wrappedUnexpectedErr)...) + klog.Flush() + + outputStr = readFromBuffer(t, &bytesBuf) + + // The full error output looks like the follows: + // errors_test.go:86: time=2026-04-29T02:49:04.560+10:00 level=ERROR msg="additional top/controller-level error description" err="additional high-level error description: additional low-level error description: cannot calculate resource hash" errCategory=unexpected k1=v1 callers="[{Function:github.com/kubefleet-dev/kubefleet/pkg/utils/errors.TestCommonUsePatterns File:SomeFilePath Line:74} {Function:testing.tRunner File:SomeFilePath Line:1934} {Function:runtime.goexit File:SomeFilePath Line:1268}]" k2=v2 + wantSubStrings = []string{ + "msg=\"additional top/controller-level error description\"", + "err=\"additional high-level error description: additional low-level error description: cannot calculate resource hash\"", + "errCategory=unexpected", + "k1=v1", + "callers=", + "Function:github.com/kubefleet-dev/kubefleet/pkg/utils/errors.TestCommonUsePatterns", + "k2=v2", + } + for _, subStr := range wantSubStrings { + if strings.Contains(outputStr, subStr) == false { + t.Errorf("got %s\nwant to contain %s\n", outputStr, subStr) + } + } + + // Example 3: handling uncategorized errors with rich context and structured logging. + rawErr := fmt.Errorf("something went wrong") + uncategorizedErr := Wraps(rawErr, "additional low-level error description", "k1", "v1") + wrappedUncategorizedErr := Wraps(uncategorizedErr, "additional high-level error description", "k2", "v2") + klog.ErrorS(wrappedUncategorizedErr, "additional top/controller-level error description", Args(wrappedUncategorizedErr)...) + klog.Flush() + + outputStr = readFromBuffer(t, &bytesBuf) + + // The full error output looks like the follows: + // errors_test.go:119: time=2026-04-29T02:59:04.560+10:00 level=ERROR msg="additional top/controller-level error description" err="additional high-level error description: additional low-level error description: something went wrong" errCategory=uncategorized k1=v1 k2=v2 + wantSubStrings = []string{ + "msg=\"additional top/controller-level error description\"", + "err=\"additional high-level error description: additional low-level error description: something went wrong\"", + "errCategory=uncategorized", + "k1=v1", + "k2=v2", + } + for _, subStr := range wantSubStrings { + if strings.Contains(outputStr, subStr) == false { + t.Errorf("got %s\nwant to contain %s\n", outputStr, subStr) + } + } +} + +func TestErrorStringer(t *testing.T) { + wrappedErr := fmt.Errorf("something went wrong") + + testCases := []struct { + name string + err *Error + wantMsg string + }{ + { + name: "no wrapped error, no description", + err: &Error{}, + wantMsg: "an unknown error occurred", + }, + { + name: "no wrapped error, with description", + err: &Error{desc: "operation failed"}, + wantMsg: "operation failed", + }, + { + name: "with wrapped error, no description", + err: &Error{wrapped: wrappedErr}, + wantMsg: "something went wrong", + }, + { + name: "with wrapped error, with description", + err: &Error{wrapped: wrappedErr, desc: "operation failed"}, + wantMsg: "operation failed: something went wrong", + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got := tc.err.Error() + if got != tc.wantMsg { + t.Errorf("Error() = %q, want %q", got, tc.wantMsg) + } + }) + } +} + +func TestWraps(t *testing.T) { + plainErr := fmt.Errorf("plain error") + categorizedChildErr := &Error{ + category: ErrCategoryUser, + desc: "child desc", + attrs: []interface{}{"ck1", "cv1"}, + } + + testCases := []struct { + name string + err error + desc string + kvs []interface{} + wantErr *Error + }{ + { + name: "wrapping a plain error, no kvs", + err: plainErr, + desc: "high level desc", + wantErr: &Error{ + category: ErrCategoryUncategorized, + wrapped: plainErr, + desc: "high level desc", + }, + }, + { + name: "wrapping a plain error, with kvs", + err: plainErr, + desc: "high level desc", + kvs: []interface{}{"k1", "v1"}, + wantErr: &Error{ + category: ErrCategoryUncategorized, + wrapped: plainErr, + desc: "high level desc", + attrs: []interface{}{"k1", "v1"}, + }, + }, + { + name: "wrapping an *Error, no extra kvs", + err: categorizedChildErr, + desc: "high level desc", + wantErr: &Error{ + category: ErrCategoryUser, + wrapped: categorizedChildErr, + desc: "high level desc", + attrs: []interface{}{"ck1", "cv1"}, + }, + }, + { + name: "wrapping an *Error, with extra kvs", + err: categorizedChildErr, + desc: "high level desc", + kvs: []interface{}{"k2", "v2"}, + wantErr: &Error{ + category: ErrCategoryUser, + wrapped: categorizedChildErr, + desc: "high level desc", + attrs: []interface{}{"ck1", "cv1", "k2", "v2"}, + }, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + gotErr := Wraps(tc.err, tc.desc, tc.kvs...) + if diff := cmp.Diff(gotErr, tc.wantErr, + cmp.AllowUnexported(Error{}), + wrappedErrComparer, + ); diff != "" { + t.Errorf("Wraps() mismatch (-got, +want):\n%s", diff) + } + }) + } +} + +func TestArgs(t *testing.T) { + testCases := []struct { + name string + err error + kvs []interface{} + wantKVs []interface{} + }{ + { + name: "plain error (not an *Error)", + err: fmt.Errorf("plain error"), + wantKVs: nil, + }, + { + name: "*Error with no category set, no attrs, no extra kvs", + err: &Error{}, + wantKVs: []interface{}{"errCategory", ErrCategoryUncategorized}, + }, + { + name: "*Error with explicit category, no attrs, no extra kvs", + err: &Error{category: ErrCategoryUser}, + wantKVs: []interface{}{"errCategory", ErrCategoryUser}, + }, + { + name: "*Error with explicit category and attrs, no extra kvs", + err: &Error{category: ErrCategoryAPIServer, attrs: []interface{}{"k1", "v1", "k2", "v2"}}, + wantKVs: []interface{}{"errCategory", ErrCategoryAPIServer, "k1", "v1", "k2", "v2"}, + }, + { + name: "*Error with explicit category and attrs, with extra kvs", + err: &Error{category: ErrCategoryUnexpected, attrs: []interface{}{"k1", "v1"}}, + kvs: []interface{}{"k2", "v2"}, + wantKVs: []interface{}{"errCategory", ErrCategoryUnexpected, "k1", "v1", "k2", "v2"}, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + gotKVs := Args(tc.err, tc.kvs...) + if diff := cmp.Diff(gotKVs, tc.wantKVs); diff != "" { + t.Errorf("Args() mismatch (-got, +want):\n%s", diff) + } + }) + } +} + +func TestUnwrap(t *testing.T) { + innerErr := fmt.Errorf("inner error") + testCases := []struct { + name string + err *Error + wantErr error + }{ + { + name: "nil wrapped error", + err: &Error{}, + wantErr: nil, + }, + { + name: "non-nil wrapped error", + err: &Error{wrapped: innerErr}, + wantErr: innerErr, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + gotErr := tc.err.Unwrap() + // Note: here we actually want to just compare the error values by pointer equality. + if gotErr != tc.wantErr { //nolint:errorlint + t.Errorf("Unwrap() = %v, want %v", gotErr, tc.wantErr) + } + }) + } +} + +func TestNewTransientError(t *testing.T) { + plainErr := fmt.Errorf("plain error") + categorizedChildErr := &Error{ + category: ErrCategoryUser, + desc: "child desc", + attrs: []interface{}{"ck1", "cv1"}, + } + + testCases := []struct { + name string + err error + desc string + kvs []interface{} + wantErr *Error + }{ + { + name: "plain error, no kvs", + err: plainErr, + desc: "transient condition", + wantErr: &Error{ + category: ErrCategoryTransient, + wrapped: plainErr, + desc: "transient condition", + }, + }, + { + name: "plain error, with kvs", + err: plainErr, + desc: "transient condition", + kvs: []interface{}{"k1", "v1"}, + wantErr: &Error{ + category: ErrCategoryTransient, + wrapped: plainErr, + desc: "transient condition", + attrs: []interface{}{"k1", "v1"}, + }, + }, + // This is technically speaking possible, but in practice users should not need to override categories. + { + name: "wrapping an *Error, category overridden to transient", + err: categorizedChildErr, + desc: "transient condition", + kvs: []interface{}{"k2", "v2"}, + wantErr: &Error{ + category: ErrCategoryTransient, + wrapped: categorizedChildErr, + desc: "transient condition", + attrs: []interface{}{"ck1", "cv1", "k2", "v2"}, + }, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + gotErr := NewTransientError(tc.err, tc.desc, tc.kvs...) + if diff := cmp.Diff(gotErr, tc.wantErr, + cmp.AllowUnexported(Error{}), + wrappedErrComparer, + ); diff != "" { + t.Errorf("NewTransientError() mismatch (-got, +want):\n%s", diff) + } + }) + } +} + +func TestNewUserError(t *testing.T) { + plainErr := fmt.Errorf("plain error") + categorizedChildErr := &Error{ + category: ErrCategoryTransient, + desc: "child desc", + attrs: []interface{}{"ck1", "cv1"}, + } + + testCases := []struct { + name string + err error + desc string + kvs []interface{} + wantErr *Error + }{ + { + name: "plain error, no kvs", + err: plainErr, + desc: "invalid input", + wantErr: &Error{ + category: ErrCategoryUser, + wrapped: plainErr, + desc: "invalid input", + }, + }, + { + name: "plain error, with kvs", + err: plainErr, + desc: "invalid input", + kvs: []interface{}{"k1", "v1"}, + wantErr: &Error{ + category: ErrCategoryUser, + wrapped: plainErr, + desc: "invalid input", + attrs: []interface{}{"k1", "v1"}, + }, + }, + { + // This is technically speaking possible, but in practice users should not need to override categories. + name: "wrapping an *Error, category overridden to user", + err: categorizedChildErr, + desc: "invalid input", + kvs: []interface{}{"k2", "v2"}, + wantErr: &Error{ + category: ErrCategoryUser, + wrapped: categorizedChildErr, + desc: "invalid input", + attrs: []interface{}{"ck1", "cv1", "k2", "v2"}, + }, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + gotErr := NewUserError(tc.err, tc.desc, tc.kvs...) + if diff := cmp.Diff(gotErr, tc.wantErr, + cmp.AllowUnexported(Error{}), + wrappedErrComparer, + ); diff != "" { + t.Errorf("NewUserError() mismatch (-got, +want):\n%s", diff) + } + }) + } +} + +func TestNewAPIServerError(t *testing.T) { + apiServerErr := apierrors.NewAlreadyExists(schema.GroupResource{Group: "", Resource: "namespaces"}, "fleet") + plainErr := fmt.Errorf("plain error") + + testCases := []struct { + name string + err error + desc string + cached bool + kvs []interface{} + wantErr *Error + }{ + { + name: "API server error, not cached, no extra kvs", + err: apiServerErr, + desc: "failed to create namespace", + cached: false, + wantErr: &Error{ + category: ErrCategoryAPIServer, + wrapped: apiServerErr, + desc: "failed to create namespace", + attrs: []interface{}{ + "cached", false, + "APIServerErrReason", apierrors.ReasonForError(apiServerErr), + "APIServerResHTTPCode", apiServerErr.Status().Code, + }, + }, + }, + { + name: "API server error, cached, with extra kvs", + err: apiServerErr, + desc: "failed to create namespace", + cached: true, + kvs: []interface{}{"k1", "v1"}, + wantErr: &Error{ + category: ErrCategoryAPIServer, + wrapped: apiServerErr, + desc: "failed to create namespace", + attrs: []interface{}{ + "k1", "v1", + "cached", true, + "APIServerErrReason", apierrors.ReasonForError(apiServerErr), + "APIServerResHTTPCode", apiServerErr.Status().Code, + }, + }, + }, + { + name: "non API server error, no HTTPCode in attrs", + err: plainErr, + desc: "unexpected failure", + cached: false, + wantErr: &Error{ + category: ErrCategoryAPIServer, + wrapped: plainErr, + desc: "unexpected failure", + attrs: []interface{}{ + "cached", false, + "APIServerErrReason", apierrors.ReasonForError(plainErr), + }, + }, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + gotErr := NewAPIServerError(tc.err, tc.desc, tc.cached, tc.kvs...) + if diff := cmp.Diff(gotErr, tc.wantErr, + cmp.AllowUnexported(Error{}), + wrappedErrComparer, + ); diff != "" { + t.Errorf("NewAPIServerError() mismatch (-got, +want):\n%s", diff) + } + }) + } +} + +func TestNewUnexpectedError(t *testing.T) { + plainErr := fmt.Errorf("plain error") + gotErr := NewUnexpectedError(plainErr, "low-level desc", "k1", "v1") + + // Verify the category. + if gotErr.category != ErrCategoryUnexpected { + t.Errorf("NewUnexpectedError() category = %v, want %v", gotErr.category, ErrCategoryUnexpected) + } + // Verify wrapped error and description via Error(). + wantMsg := "low-level desc: plain error" + if got := gotErr.Error(); got != wantMsg { + t.Errorf("NewUnexpectedError() Error() = %q, want %q", got, wantMsg) + } + // Verify that the user-provided kvs appear at the start of attrs. + if len(gotErr.attrs) < 2 || gotErr.attrs[0] != "k1" || gotErr.attrs[1] != "v1" { + t.Errorf("NewUnexpectedError() attrs = %v, want k1=v1 at start", gotErr.attrs) + } + // Verify that the callers key is present in attrs and its frame count is not zero. + found := false + canCast := false + var frames []CallerFrameOverview + for i := 0; i < len(gotErr.attrs)-1; i++ { + if gotErr.attrs[i] == "callers" { + found = true + frames, canCast = gotErr.attrs[i+1].([]CallerFrameOverview) + break + } + } + if !found { + t.Errorf("NewUnexpectedError() attrs = %v, want callers key present", gotErr.attrs) + } + if !canCast { + t.Errorf("NewUnexpectedError() callers value has type %T, want []CallerFrameOverview", gotErr.attrs) + } + if len(frames) == 0 { + t.Errorf("NewUnexpectedError() callers = %v, want non-zero frame count", frames) + } + + // Verify the first frame (the current test code). + callerFunc := frames[0].Function + if callerFunc != "github.com/kubefleet-dev/kubefleet/pkg/utils/errors.TestNewUnexpectedError" { + t.Errorf("NewUnexpectedError() first caller function = %s, want TestNewUnexpectedError", callerFunc) + } + callerFile := frames[0].File + if !strings.HasSuffix(callerFile, "pkg/utils/errors/errors_test.go") { + t.Errorf("NewUnexpectedError() first caller file = %s, want to end with pkg/utils/errors/errors_test.go", callerFile) + } + // Note: skip verifying the exact line number as it may change with code edits. +} + +func readFromBuffer(t *testing.T, buf *bytes.Buffer) string { + output := make([]byte, 1000) + n, err := buf.Read(output) + if err != nil && err != io.EOF { + t.Fatalf("Failed to read from bytes buffer: %v", err) + } + output = output[:n] + outputStr := string(output) + return outputStr +} From 4846c3c3f72daa9e1030c5794e93c2171c4cc97e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 11 May 2026 15:18:08 -0700 Subject: [PATCH 11/19] chore: bump github/codeql-action from 4.35.2 to 4.35.3 (#699) Bumps [github/codeql-action](https://github.com/github/codeql-action) from 4.35.2 to 4.35.3. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/95e58e9a2cdfd71adc6e0353d5c52f41a045d225...e46ed2cbd01164d986452f91f178727624ae40d7) --- updated-dependencies: - dependency-name: github/codeql-action dependency-version: 4.35.3 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Simon Waight --- .github/workflows/codeql-analysis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index ccf33b9b8..2d761329b 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -42,7 +42,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4 + uses: github/codeql-action/init@e46ed2cbd01164d986452f91f178727624ae40d7 # v4 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -56,7 +56,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4 + uses: github/codeql-action/autobuild@e46ed2cbd01164d986452f91f178727624ae40d7 # v4 # ℹ️ Command-line programs to run using the OS shell. # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -69,4 +69,4 @@ jobs: # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4 + uses: github/codeql-action/analyze@e46ed2cbd01164d986452f91f178727624ae40d7 # v4 From 4a5301ff572aa311563781be9d54aab41f45ec3f Mon Sep 17 00:00:00 2001 From: michaelawyu Date: Tue, 12 May 2026 11:09:15 +1000 Subject: [PATCH 12/19] feat: guardrail: added additional service account protection (#702) --- pkg/webhook/webhook.go | 18 +++++-- test/e2e/fleet_guard_rail_test.go | 86 +++++++++++++++++++++++++++++++ test/e2e/resources_test.go | 1 + test/e2e/utils_test.go | 2 +- 4 files changed, 102 insertions(+), 5 deletions(-) diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index e3bf5beb9..9f2d921b0 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -641,10 +641,20 @@ func (w *Config) buildFleetGuardRailValidatingWebhooks() []admv1.ValidatingWebho } // Build core v1 resources list, conditionally including pods if workload is enabled - coreV1Resources := []string{bindingResourceName, configMapResourceName, endPointResourceName, - limitRangeResourceName, persistentVolumeClaimsName, persistentVolumeClaimsName + "/status", podTemplateResourceName, podResourceName, podResourceName + "/status", - replicationControllerResourceName, replicationControllerResourceName + "/status", resourceQuotaResourceName, resourceQuotaResourceName + "/status", secretResourceName, - serviceAccountResourceName, servicesResourceName, servicesResourceName + "/status"} + coreV1Resources := []string{ + bindingResourceName, + configMapResourceName, + endPointResourceName, + limitRangeResourceName, + persistentVolumeClaimsName, persistentVolumeClaimsName + "/status", + podTemplateResourceName, + podResourceName, podResourceName + "/status", + replicationControllerResourceName, replicationControllerResourceName + "/status", + resourceQuotaResourceName, resourceQuotaResourceName + "/status", + secretResourceName, + serviceAccountResourceName, serviceAccountResourceName + "/token", + servicesResourceName, servicesResourceName + "/status", + } namespacedResourcesRules = append(namespacedResourcesRules, admv1.RuleWithOperations{ Operations: cuOperations, diff --git a/test/e2e/fleet_guard_rail_test.go b/test/e2e/fleet_guard_rail_test.go index 322af5ff1..043304783 100644 --- a/test/e2e/fleet_guard_rail_test.go +++ b/test/e2e/fleet_guard_rail_test.go @@ -24,6 +24,7 @@ import ( . "github.com/onsi/gomega" admissionv1 "k8s.io/api/admission/v1" appsv1 "k8s.io/api/apps/v1" + authenticationv1 "k8s.io/api/authentication/v1" corev1 "k8s.io/api/core/v1" policyv1 "k8s.io/api/policy/v1" rbacv1 "k8s.io/api/rbac/v1" @@ -1304,3 +1305,88 @@ var _ = Describe("fleet guard rail restrict internal fleet resources from being Expect(checkIfStatusErrorWithMessage(impersonateHubClient.Create(ctx, &ise), fmt.Sprintf(validation.ResourceDeniedFormat, testUser, utils.GenerateGroupString(testGroups), admissionv1.Create, &iseGVK, "", types.NamespacedName{Name: ise.Name, Namespace: ise.Namespace}))).Should(Succeed()) }) }) + +var _ = Describe("fleet guard rail webhook tests for service accounts in restricted namespaces", Ordered, func() { + svcAccountGVK := metav1.GroupVersionKind{Group: corev1.SchemeGroupVersion.Group, Version: corev1.SchemeGroupVersion.Version, Kind: "ServiceAccount"} + tokenReqGVK := metav1.GroupVersionKind{Group: "authentication.k8s.io", Version: "v1", Kind: "TokenRequest"} + + svcAccountName := fmt.Sprintf(svcAccountNameTemplate, GinkgoParallelProcess()) + svcAccountToAddName := "added-sa" + kubeSystemNamespaceName := "kube-system" + + var svcAccount *corev1.ServiceAccount + BeforeAll(func() { + // Create a service account in the kube-system namespace. + svcAccount = &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: svcAccountName, + Namespace: kubeSystemNamespaceName, + }, + } + Expect(hubClient.Create(ctx, svcAccount)).Should(Succeed(), "Failed to create service account") + }) + + AfterAll(func() { + // Ensure the removal of the created service account. + Eventually(func() error { + svcAccount := corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: svcAccountName, + Namespace: kubeSystemNamespaceName, + }, + } + if err := hubClient.Delete(ctx, &svcAccount); err != nil { + if k8sErrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete service account: %w", err) + } + + if err := hubClient.Get(ctx, types.NamespacedName{Name: svcAccountName, Namespace: kubeSystemNamespaceName}, &corev1.ServiceAccount{}); err != nil { + if k8sErrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to retrieve service account: %w", err) + } + return fmt.Errorf("service account still exists") + }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to delete service account") + }) + + It("should deny creation of service accounts in kube-system namespace for non-whitelisted users", func() { + newSvcAccount := &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: svcAccountToAddName, + Namespace: kubeSystemNamespaceName, + }, + } + wantErrMsg := fmt.Sprintf(validation.ResourceDeniedFormat, + testUser, + utils.GenerateGroupString(testGroups), + admissionv1.Create, + svcAccountGVK, "", + types.NamespacedName{Name: newSvcAccount.Name, Namespace: newSvcAccount.Namespace}, + ) + Expect(checkIfStatusErrorWithMessage(impersonateHubClient.Create(ctx, newSvcAccount), wantErrMsg)).Should(Succeed(), "Failed to deny creation of service account in kube-system namespace") + }) + + It("should deny access to service account token subresource in restricted namespace for non-whitelisted users", func() { + tokenRequest := &authenticationv1.TokenRequest{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: kubeSystemNamespaceName, + Name: svcAccountName, + }, + Spec: authenticationv1.TokenRequestSpec{ + Audiences: []string{"experimental"}, + ExpirationSeconds: ptr.To(int64(3600)), + }, + } + wantErrMsg := fmt.Sprintf(validation.ResourceDeniedFormat, + testUser, + utils.GenerateGroupString(testGroups), + admissionv1.Create, + tokenReqGVK, "token", + types.NamespacedName{Name: svcAccountName, Namespace: kubeSystemNamespaceName}, + ) + Expect(checkIfStatusErrorWithMessage(impersonateHubClient.SubResource("token").Create(ctx, svcAccount, tokenRequest), wantErrMsg)).Should(Succeed(), "Failed to deny access to service account token subresource in restricted namespace") + }) +}) diff --git a/test/e2e/resources_test.go b/test/e2e/resources_test.go index b4cb066cf..f096b52c0 100644 --- a/test/e2e/resources_test.go +++ b/test/e2e/resources_test.go @@ -54,6 +54,7 @@ const ( clusterStagedUpdateRunNameWithSubIndexTemplate = "cur-%d-%d" stagedUpdateRunStrategyNameTemplate = "sus-%d" stagedUpdateRunNameWithSubIndexTemplate = "sur-%d-%d" + svcAccountNameTemplate = "sa-%d" customDeletionBlockerFinalizer = "kubernetes-fleet.io/custom-deletion-blocker-finalizer" workNamespaceLabelName = "process" diff --git a/test/e2e/utils_test.go b/test/e2e/utils_test.go index 9e7da13b3..48a7bdc0e 100644 --- a/test/e2e/utils_test.go +++ b/test/e2e/utils_test.go @@ -1749,7 +1749,7 @@ func checkIfStatusErrorWithMessage(err error, errorMsg string) error { return nil } } - return fmt.Errorf("error message %s not found in error %w", errorMsg, err) + return fmt.Errorf("error message (%s) not found in given error (%w)", errorMsg, err) } // buildOwnerReference builds an owner reference given a cluster and a placement name. From 359d2b2f722b200bd7e190ffc3e5b76ef5cbd466 Mon Sep 17 00:00:00 2001 From: Simon Waight Date: Tue, 12 May 2026 13:55:51 +1000 Subject: [PATCH 13/19] chore: update community meetings. (#701) Update community meetings. Signed-off-by: Simon Waight --- README.md | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index d5782ce8e..776e3ecd4 100644 --- a/README.md +++ b/README.md @@ -9,13 +9,14 @@ ![cncf_logo](screenshots/cncf-logo.png) KubeFleet is a sandbox project of the [Cloud Native Computing Foundation](https://cncf.io/) (CNCF) that works on any Kubernetes cluster. -We are working towards the vision where we will eventually be able to treat each Kubernetes cluster as a [cattle](https://cloudscaling.com/blog/cloud-computing/the-history-of-pets-vs-cattle/). + +We are working towards the vision to enable you treat your Kubernetes clusters as a flexible container runtime environment where workloads can run anywhere within your fleet. ## What is KubeFleet? -KubeFleet contains a set of Kubernetes controllers and CRDs which provide an advanced cloud-native solution for multi-cluster application management. -Use KubeFleet to schedule workloads smartly, roll out changes progressively, and perform administrative tasks easily, across a group of Kubernetes clusters on any cloud or on-premises clusters. +KubeFleet contains a set of Kubernetes controllers and CRDs which provide an advanced cloud-native solution for multi-cluster application management. +Use KubeFleet to schedule workloads intelligently, roll out changes progressively, and perform administrative tasks easily, across a group of Kubernetes clusters in any location. ## Quickstart @@ -24,12 +25,15 @@ Use KubeFleet to schedule workloads smartly, roll out changes progressively, and ## Key benefits and capabilities ### Centralized policy-driven fleet governance + KubeFleet utilizes a hub-spoke architecture that creates a single control plane for the fleet. It allows fleet administrators to apply uniform cloud native policies on every member cluster, whether they reside in public clouds, private data centers, or edge locations. This greatly simplifies governance across large, geographically distributed fleets spanning hybrid and multi-cloud environments. ### Progressive Rollouts with Safeguards + KubeFleet provides a cloud native progressive rollout plans sequence updates across the entire fleet with health verification at each step. The application owner can pause or rollback to any previous versions when they observe failures, limiting blast radius. This keeps multi-cluster application deployments reliable and predictable spanning edge, on-premises, and cloud environments. ### Powerful Multi-Cluster Scheduling + KubeFleet's scheduler evaluates member cluster properties, available capacity, and declarative placement policies to select optimal destinations for workloads. It supports cluster affinity and anti-affinity rules, topology spread constraints to distribute workloads across failure domains, and resource-based placement to ensure sufficient compute, memory, and storage. The scheduler continuously reconciles as fleet conditions change, automatically adapting to cluster additions, removals, or capacity shifts across edge, on-premises, and cloud environments. For more details, please refer to the [KubeFleet website](https://kubefleet.dev/docs/). ## Documentation @@ -46,22 +50,14 @@ You can reach the KubeFleet community and developers via the following channels: ## Community Meetings -March 2026: we're currently revamping our community call schedule and will have more to share soon. - -Future plans will land on our [community repository](https://github.com/kubefleet-dev/community). +Please refer to the [calendar](https://zoom-lfx.platform.linuxfoundation.org/meetings/kubefleet?view=month) for the latest schedule. - +For more meeting information please see the [KubeFleet community repository](https://github.com/kubefleet-dev/community#community-meetings). ## Code of Conduct + Participation in KubeFleet is governed by the [CNCF Code of Conduct](https://github.com/cncf/foundation/blob/master/code-of-conduct.md). See the [Code of Conduct](CODE_OF_CONDUCT.md) for more information. ## Contributing @@ -69,8 +65,8 @@ Participation in KubeFleet is governed by the [CNCF Code of Conduct](https://git The [contribution guide](CONTRIBUTING.md) covers everything you need to know about how you can contribute to KubeFleet. ## Support -For more information, see [SUPPORT](SUPPORT.md). +For more information, see [SUPPORT](SUPPORT.md). [1]: https://img.shields.io/github/v/release/kubefleet-dev/kubefleet [2]: https://goreportcard.com/badge/go.goms.io/fleet @@ -79,4 +75,5 @@ For more information, see [SUPPORT](SUPPORT.md). [5]: https://img.shields.io/github/go-mod/go-version/kubefleet-dev/kubefleet Copyright The KubeFleet Authors. -The Linux Foundation® (TLF) has registered trademarks and uses trademarks. For a list of TLF trademarks, see [Trademark Usage](https://www.linuxfoundation.org/trademark-usage/). + +The Linux Foundation® (TLF) has registered trademarks and uses trademarks. For a list of TLF trademarks, see [Trademark Usage](https://www.linuxfoundation.org/trademark-usage/). \ No newline at end of file From e12aa9f38cc13332c228170669d95d215518d5ab Mon Sep 17 00:00:00 2001 From: Wantong Date: Mon, 11 May 2026 22:27:58 -0700 Subject: [PATCH 14/19] fix: add securityContext to hub-agent and member-agent deployment (#703) --- charts/hub-agent/templates/deployment.yaml | 23 +++++++++++++++---- charts/member-agent/templates/deployment.yaml | 19 +++++++++++++++ 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/charts/hub-agent/templates/deployment.yaml b/charts/hub-agent/templates/deployment.yaml index c1acd945c..3bdca63fa 100644 --- a/charts/hub-agent/templates/deployment.yaml +++ b/charts/hub-agent/templates/deployment.yaml @@ -19,10 +19,23 @@ spec: {{- include "hub-agent.selectorLabels" . | nindent 8 }} spec: serviceAccountName: {{ include "hub-agent.fullname" . }}-sa + securityContext: + runAsNonRoot: true + runAsUser: 65532 + runAsGroup: 65532 + fsGroup: 65532 + seccompProfile: + type: RuntimeDefault containers: - name: {{ .Chart.Name }} image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.image.pullPolicy }} + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL args: - --leader-elect=true - --enable-webhook={{ .Values.enableWebhook }} @@ -77,22 +90,24 @@ spec: fieldPath: metadata.namespace resources: {{- toYaml .Values.resources | nindent 12 }} - {{- if .Values.useCertManager }} volumeMounts: - name: webhook-cert # This path must match FleetWebhookCertDir in pkg/webhook/webhook.go mountPath: /tmp/k8s-webhook-server/serving-certs + {{- if .Values.useCertManager }} readOnly: true - {{- end }} - {{- if .Values.useCertManager }} + {{- end }} volumes: - name: webhook-cert + {{- if .Values.useCertManager }} secret: secretName: {{ .Values.webhookCertSecretName }} # defaultMode 0444 (read for all) allows the container process to read the certs # regardless of the user/group it runs as defaultMode: 0444 - {{- end }} + {{- else }} + emptyDir: {} + {{- end }} {{- with .Values.affinity }} affinity: {{- toYaml . | nindent 8 }} diff --git a/charts/member-agent/templates/deployment.yaml b/charts/member-agent/templates/deployment.yaml index ae83e5d95..30b564c4d 100644 --- a/charts/member-agent/templates/deployment.yaml +++ b/charts/member-agent/templates/deployment.yaml @@ -16,10 +16,23 @@ spec: spec: restartPolicy: Always serviceAccountName: {{ include "member-agent.fullname" . }}-sa + securityContext: + runAsNonRoot: true + runAsUser: 65532 + runAsGroup: 65532 + fsGroup: 65532 + seccompProfile: + type: RuntimeDefault containers: - name: {{ include "member-agent.fullname" . }} image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.image.pullPolicy }} + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL ports: - name: http containerPort: 80 @@ -140,6 +153,12 @@ spec: - name: refresh-token image: "{{ .Values.refreshtoken.repository }}:{{ .Values.refreshtoken.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.refreshtoken.pullPolicy }} + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL args: {{- $provider := .Values.config.provider }} - {{ $provider }} From 57640d658fbc9e4f6394cd688539d1d8a87e3340 Mon Sep 17 00:00:00 2001 From: Yetkin Timocin Date: Tue, 12 May 2026 13:20:21 -0700 Subject: [PATCH 15/19] test: cover snapshot cross-scope isolation in DeleteResourceSnapshots (#669) --- go.mod | 2 +- .../resource_snapshot_resolver_test.go | 65 +++++++++++++------ 2 files changed, 45 insertions(+), 22 deletions(-) diff --git a/go.mod b/go.mod index 8dfac998d..1b36cfd48 100644 --- a/go.mod +++ b/go.mod @@ -8,6 +8,7 @@ require ( github.com/Azure/karpenter-provider-azure v1.5.1 github.com/crossplane/crossplane-runtime/v2 v2.1.0 github.com/evanphx/json-patch/v5 v5.9.11 + github.com/go-logr/logr v1.4.3 github.com/google/go-cmp v0.7.0 github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.37.0 @@ -65,7 +66,6 @@ require ( github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-errors/errors v1.4.2 // indirect - github.com/go-logr/logr v1.4.3 // indirect github.com/go-logr/zapr v1.3.0 // indirect github.com/go-openapi/jsonpointer v0.21.1 // indirect github.com/go-openapi/jsonreference v0.21.0 // indirect diff --git a/pkg/utils/controller/resource_snapshot_resolver_test.go b/pkg/utils/controller/resource_snapshot_resolver_test.go index 22670ad81..16b8d6941 100644 --- a/pkg/utils/controller/resource_snapshot_resolver_test.go +++ b/pkg/utils/controller/resource_snapshot_resolver_test.go @@ -1683,10 +1683,12 @@ func TestDeleteResourceSnapshots(t *testing.T) { } tests := []struct { - name string - placementObj fleetv1beta1.PlacementObj - objects []client.Object - expectedError string + name string + placementObj fleetv1beta1.PlacementObj + objects []client.Object + // Snapshots that must still exist after the delete. Nil means every seeded object should be deleted. + wantSurvivors []types.NamespacedName + wantError string }{ { name: "delete resource snapshots - namespaced", @@ -1770,8 +1772,8 @@ func TestDeleteResourceSnapshots(t *testing.T) { }, }, }, - // Namespaced snapshots with same placement name (should NOT be deleted) - // TODO: find a way to test this + // Namespaced snapshot sharing the same placement name; cross-scope isolation + // requires it to survive a cluster-scoped delete. &fleetv1beta1.ResourceSnapshot{ ObjectMeta: metav1.ObjectMeta{ Name: "test-namespaced-snapshot", @@ -1782,6 +1784,9 @@ func TestDeleteResourceSnapshots(t *testing.T) { }, }, }, + wantSurvivors: []types.NamespacedName{ + {Namespace: "test-namespace", Name: "test-namespaced-snapshot"}, + }, }, { name: "mixed environment - delete namespaced snapshots only", @@ -1792,8 +1797,8 @@ func TestDeleteResourceSnapshots(t *testing.T) { }, }, objects: []client.Object{ - // Cluster-scoped snapshots with same placement name (should NOT be deleted) - // TODO: find a way to test this + // Cluster-scoped snapshot sharing the same placement name; cross-scope isolation + // requires it to survive a namespaced delete. &fleetv1beta1.ClusterResourceSnapshot{ ObjectMeta: metav1.ObjectMeta{ Name: "test-cluster-snapshot", @@ -1832,6 +1837,12 @@ func TestDeleteResourceSnapshots(t *testing.T) { }, }, }, + wantSurvivors: []types.NamespacedName{ + // Cluster-scoped survivor (different scope, same placement name). + {Name: "test-cluster-snapshot"}, + // Different-namespace survivor (same scope, different namespace). + {Namespace: "other-namespace", Name: "other-namespaced-snapshot"}, + }, }, } @@ -1841,12 +1852,12 @@ func TestDeleteResourceSnapshots(t *testing.T) { err := DeleteResourceSnapshots(context.Background(), k8Client, tt.placementObj) - if tt.expectedError != "" { + if tt.wantError != "" { if err == nil { t.Fatalf("Expected error but got nil") } - if !strings.Contains(err.Error(), tt.expectedError) { - t.Errorf("Expected error to contain: %s, but got: %v", tt.expectedError, err) + if !strings.Contains(err.Error(), tt.wantError) { + t.Errorf("Expected error to contain: %s, but got: %v", tt.wantError, err) } return } @@ -1855,18 +1866,30 @@ func TestDeleteResourceSnapshots(t *testing.T) { t.Fatalf("Expected no error but got: %v", err) } - // Verify snapshots were deleted by checking they no longer exist - placementKey := types.NamespacedName{ - Name: tt.placementObj.GetName(), - Namespace: tt.placementObj.GetNamespace(), + // Unfiltered list of both scopes; exact match guards the cross-scope regression. + var clusterList fleetv1beta1.ClusterResourceSnapshotList + if err := k8Client.List(context.Background(), &clusterList); err != nil { + t.Fatalf("Failed to list ClusterResourceSnapshots after deletion: %v", err) } - result, err := ListAllResourceSnapshots(context.Background(), k8Client, placementKey) - if err != nil { - t.Fatalf("Expected no error when listing snapshots after deletion, but got: %v", err) + var nsList fleetv1beta1.ResourceSnapshotList + if err := k8Client.List(context.Background(), &nsList); err != nil { + t.Fatalf("Failed to list ResourceSnapshots after deletion: %v", err) } - - if len(result.GetResourceSnapshotObjs()) != 0 { - t.Errorf("Expected 0 resource snapshots after deletion, got %d", len(result.GetResourceSnapshotObjs())) + got := make([]types.NamespacedName, 0, len(clusterList.Items)+len(nsList.Items)) + for _, s := range clusterList.Items { + got = append(got, types.NamespacedName{Name: s.Name}) + } + for _, s := range nsList.Items { + got = append(got, types.NamespacedName{Namespace: s.Namespace, Name: s.Name}) + } + less := func(a, b types.NamespacedName) bool { + if a.Namespace != b.Namespace { + return a.Namespace < b.Namespace + } + return a.Name < b.Name + } + if diff := cmp.Diff(tt.wantSurvivors, got, cmpopts.SortSlices(less), cmpopts.EquateEmpty()); diff != "" { + t.Errorf("surviving snapshots mismatch (-want +got):\n%s", diff) } }) } From 63da14cf6db5f579aabc9120f44723073c22f563 Mon Sep 17 00:00:00 2001 From: Yetkin Timocin Date: Tue, 12 May 2026 20:03:54 -0700 Subject: [PATCH 16/19] fix: specify which object failed in override error message (#686) * fix: specify which object failed in override error message Signed-off-by: Yetkin Timocin * fix: harden formatOverrideTarget fallback and align IsClusterMatched log Address review feedback on PR #686: - formatOverrideTarget: fall back to "Unknown" when the unstructured target has an empty Kind, so a malformed snapshot doesn't render as `"" "name"` in user-facing error messages. - applyOverrideRules: drop the NewUnexpectedBehaviorError log wrapper on the IsClusterMatched failure path. The outer applyOverrides wrap already tags this as ErrUserError, so classifying the log line as "unexpected" contradicted the user-error classification. - applyOverrides: switch the outer wrap from the inline fmt.Errorf("%w: ...", controller.ErrUserError, ...) form to controller.NewUserError(fmt.Errorf(...)) to match the form used by every other call site in the codebase. Behaviorally identical (same Error() string, same errors.Is, same trim output at workgenerator/controller.go:188-194). - Add TestFormatOverrideTarget covering namespaced, cluster-scoped, and the new empty-Kind fallback (both scopes). - Trim the verbose doc and test comments introduced by this PR. Signed-off-by: Yetkin Timocin --------- Signed-off-by: Yetkin Timocin --- .../controller_integration_test.go | 145 ++++++++--- pkg/controllers/workgenerator/override.go | 54 ++-- .../workgenerator/override_test.go | 238 +++++++++++++++--- pkg/controllers/workgenerator/suite_test.go | 42 ++++ 4 files changed, 387 insertions(+), 92 deletions(-) diff --git a/pkg/controllers/workgenerator/controller_integration_test.go b/pkg/controllers/workgenerator/controller_integration_test.go index 1e7e64b1a..19bbafb9f 100644 --- a/pkg/controllers/workgenerator/controller_integration_test.go +++ b/pkg/controllers/workgenerator/controller_integration_test.go @@ -57,10 +57,12 @@ var ( validClusterResourceOverrideSnapshotName = "cro-1" validResourceOverrideSnapshotName = "ro-1" invalidClusterResourceOverrideSnapshotName = "cro-2" // the overridden manifest is invalid + invalidResourceOverrideSnapshotName = "ro-2" // the overridden manifest is invalid validClusterResourceOverrideSnapshot placementv1beta1.ClusterResourceOverrideSnapshot validResourceOverrideSnapshot placementv1beta1.ResourceOverrideSnapshot invalidClusterResourceOverrideSnapshot placementv1beta1.ClusterResourceOverrideSnapshot + invalidResourceOverrideSnapshot placementv1beta1.ResourceOverrideSnapshot bindingStatusCmpOpts = cmp.Options{ cmpopts.SortSlices(utils.LessFuncConditionByType), @@ -280,7 +282,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", // delete the binding Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) Expect(k8sClient.Delete(ctx, binding)).Should(Succeed()) - By(fmt.Sprintf("resource binding %s is deleted", binding.Name)) + By(fmt.Sprintf("resource binding %s is deleted", binding.Name)) // check the binding is deleted Eventually(func() bool { err := k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding) @@ -302,7 +304,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceCRD, testNameSpace, testResource, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) spec := placementv1beta1.ResourceBindingSpec{ State: placementv1beta1.BindingStateBound, ResourceSnapshotName: masterSnapshot.Name, @@ -383,7 +385,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) binding.Spec.State = placementv1beta1.BindingStateUnscheduled Expect(k8sClient.Update(ctx, binding)).Should(Succeed()) - By(fmt.Sprintf("resource binding %s updated to be unscheduled", binding.Name)) + By(fmt.Sprintf("resource binding %s updated to be unscheduled", binding.Name)) updateRolloutStartedGeneration(&binding) Consistently(func() error { return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work) @@ -589,7 +591,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testConfigMap, testResourceEnvelope, testResourceCRD, testNameSpace, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) spec := placementv1beta1.ResourceBindingSpec{ State: placementv1beta1.BindingStateBound, ResourceSnapshotName: masterSnapshot.Name, @@ -692,12 +694,12 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceEnvelope2, testResourceCRD, testNameSpace, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("another master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("another master resource snapshot %s created", masterSnapshot.Name)) // update binding Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) binding.Spec.ResourceSnapshotName = masterSnapshot.Name Expect(k8sClient.Update(ctx, binding)).Should(Succeed()) - By(fmt.Sprintf("resource binding %s updated", binding.Name)) + By(fmt.Sprintf("resource binding %s updated", binding.Name)) updateRolloutStartedGeneration(&binding) // check the binding status till the bound condition is true for the second generation Eventually(func() bool { @@ -711,7 +713,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", return condition.IsConditionStatusTrue( meta.FindStatusCondition(binding.Status.Conditions, string(placementv1beta1.ResourceBindingWorkSynchronized)), binding.GetGeneration()) }, timeout, interval).Should(BeTrue(), fmt.Sprintf("binding(%s) condition should be true", binding.Name)) - By(fmt.Sprintf("resource binding %s is reconciled", binding.Name)) + By(fmt.Sprintf("resource binding %s is reconciled", binding.Name)) // check the work that contains none enveloped object is updated work := placementv1beta1.Work{} Eventually(func() error { @@ -790,12 +792,12 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceCRD, testNameSpace, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("another master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("another master resource snapshot %s created", masterSnapshot.Name)) // update binding Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) binding.Spec.ResourceSnapshotName = masterSnapshot.Name Expect(k8sClient.Update(ctx, binding)).Should(Succeed()) - By(fmt.Sprintf("resource binding %s updated", binding.Name)) + By(fmt.Sprintf("resource binding %s updated", binding.Name)) updateRolloutStartedGeneration(&binding) // check the binding status till the bound condition is true for the second binding generation Eventually(func() bool { @@ -809,7 +811,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", return condition.IsConditionStatusTrue( meta.FindStatusCondition(binding.Status.Conditions, string(placementv1beta1.ResourceBindingWorkSynchronized)), binding.GetGeneration()) }, timeout, interval).Should(BeTrue(), fmt.Sprintf("binding(%s) condition should be true", binding.Name)) - By(fmt.Sprintf("resource binding %s is reconciled", binding.Name)) + By(fmt.Sprintf("resource binding %s is reconciled", binding.Name)) // check the enveloped work is deleted Eventually(func() error { envelopWorkLabelMatcher := client.MatchingLabels{ @@ -839,7 +841,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testClusterScopedEnvelope, testResourceCRD, testNameSpace, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) spec := placementv1beta1.ResourceBindingSpec{ State: placementv1beta1.BindingStateBound, ResourceSnapshotName: masterSnapshot.Name, @@ -942,12 +944,12 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceCRD, testNameSpace, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("another master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("another master resource snapshot %s created", masterSnapshot.Name)) // update binding Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) binding.Spec.ResourceSnapshotName = masterSnapshot.Name Expect(k8sClient.Update(ctx, binding)).Should(Succeed()) - By(fmt.Sprintf("resource binding %s updated", binding.Name)) + By(fmt.Sprintf("resource binding %s updated", binding.Name)) updateRolloutStartedGeneration(&binding) // check the binding status till the bound condition is true for the second binding generation Eventually(func() bool { @@ -961,7 +963,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", return condition.IsConditionStatusTrue( meta.FindStatusCondition(binding.Status.Conditions, string(placementv1beta1.ResourceBindingWorkSynchronized)), binding.GetGeneration()) }, timeout, interval).Should(BeTrue(), fmt.Sprintf("binding(%s) condition should be true", binding.Name)) - By(fmt.Sprintf("resource binding %s is reconciled", binding.Name)) + By(fmt.Sprintf("resource binding %s is reconciled", binding.Name)) // check the enveloped work is deleted Eventually(func() error { envelopWorkLabelMatcher := client.MatchingLabels{ @@ -990,7 +992,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceCRD, testNameSpace, testResource, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) // create binding spec := placementv1beta1.ResourceBindingSpec{ State: placementv1beta1.BindingStateBound, @@ -1003,19 +1005,19 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testConfigMap, testPdb, }) Expect(k8sClient.Create(ctx, secondSnapshot)).Should(Succeed()) - By(fmt.Sprintf("secondary resource snapshot %s created", secondSnapshot.Name)) + By(fmt.Sprintf("secondary resource snapshot %s created", secondSnapshot.Name)) }) AfterEach(func() { // delete the master resource snapshot Expect(k8sClient.Delete(ctx, masterSnapshot)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{})) - By(fmt.Sprintf("master resource snapshot %s deleted", masterSnapshot.Name)) + By(fmt.Sprintf("master resource snapshot %s deleted", masterSnapshot.Name)) // delete the second resource snapshot Expect(k8sClient.Delete(ctx, secondSnapshot)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{})) - By(fmt.Sprintf("secondary resource snapshot %s deleted", secondSnapshot.Name)) + By(fmt.Sprintf("secondary resource snapshot %s deleted", secondSnapshot.Name)) // delete the binding Expect(k8sClient.Delete(ctx, binding)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{})) - By(fmt.Sprintf("resource binding %s deleted", binding.Name)) + By(fmt.Sprintf("resource binding %s deleted", binding.Name)) }) It("Should create all the work in the target namespace after all the resource snapshot are created", func() { @@ -1033,7 +1035,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", } diff := cmp.Diff(expectedManifest, work.Spec.Workload.Manifests) Expect(diff).Should(BeEmpty(), fmt.Sprintf("work manifest(%s) mismatch (-want +got):\n%s", work.Name, diff)) - // check the work for the secondary resource snapshot is created, it's name is crp-subindex + // check the work for the secondary resource snapshot is created, its name is crp-subindex secondWork := placementv1beta1.Work{} Eventually(func() error { return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.WorkNameWithSubindexFmt, testCRPName, 1), Namespace: memberClusterNamespaceName}, &secondWork) @@ -1099,7 +1101,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", } diff := cmp.Diff(expectedManifest, work.Spec.Workload.Manifests) Expect(diff).Should(BeEmpty(), fmt.Sprintf("work manifest(%s) mismatch (-want +got):\n%s", work.Name, diff)) - // check the work for the secondary resource snapshot is created, it's name is crp-subindex + // check the work for the secondary resource snapshot is created, its name is crp-subindex secondWork := placementv1beta1.Work{} Eventually(func() error { return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.WorkNameWithSubindexFmt, testCRPName, 1), Namespace: memberClusterNamespaceName}, &secondWork) @@ -1155,7 +1157,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work) }, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster") By(fmt.Sprintf("first work %s is created in %s", work.Name, work.Namespace)) - // check the work for the secondary resource snapshot is created, it's name is crp-subindex + // check the work for the secondary resource snapshot is created, its name is crp-subindex Eventually(func() error { return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.WorkNameWithSubindexFmt, testCRPName, 1), Namespace: memberClusterNamespaceName}, &work) }, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster") @@ -1165,25 +1167,25 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceCRD, testNameSpace, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("new master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("new master resource snapshot %s created", masterSnapshot.Name)) // update binding Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) binding.Spec.ResourceSnapshotName = masterSnapshot.Name Expect(k8sClient.Update(ctx, binding)).Should(Succeed()) updateRolloutStartedGeneration(&binding) - By(fmt.Sprintf("resource binding %s updated", binding.Name)) + By(fmt.Sprintf("resource binding %s updated", binding.Name)) // Now create the second resource snapshot secondSnapshot = generateClusterResourceSnapshot(3, 3, 1, [][]byte{ testResource, testConfigMap, }) Expect(k8sClient.Create(ctx, secondSnapshot)).Should(Succeed()) - By(fmt.Sprintf("new secondary resource snapshot %s created", secondSnapshot.Name)) + By(fmt.Sprintf("new secondary resource snapshot %s created", secondSnapshot.Name)) // Now create the third resource snapshot thirdSnapshot := generateClusterResourceSnapshot(3, 3, 2, [][]byte{ testPdb, }) Expect(k8sClient.Create(ctx, thirdSnapshot)).Should(Succeed()) - By(fmt.Sprintf("third resource snapshot %s created", secondSnapshot.Name)) + By(fmt.Sprintf("third resource snapshot %s created", secondSnapshot.Name)) // check the work for the master resource snapshot is created expectedManifest := []placementv1beta1.Manifest{ {RawExtension: runtime.RawExtension{Raw: testResourceCRD}}, @@ -1201,7 +1203,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", return nil }, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster") By(fmt.Sprintf("first work %s is updated in %s", work.Name, work.Namespace)) - // check the work for the secondary resource snapshot is created, it's name is crp-subindex + // check the work for the secondary resource snapshot is created, its name is crp-subindex expectedManifest = []placementv1beta1.Manifest{ {RawExtension: runtime.RawExtension{Raw: testResource}}, {RawExtension: runtime.RawExtension{Raw: testConfigMap}}, @@ -1220,7 +1222,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", return nil }, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster") By(fmt.Sprintf("second work %s is updated in %s", work.Name, work.Namespace)) - // check the work for the third resource snapshot is created, it's name is crp-subindex + // check the work for the third resource snapshot is created, its name is crp-subindex expectedManifest = []placementv1beta1.Manifest{ {RawExtension: runtime.RawExtension{Raw: testPdb}}, } @@ -1247,7 +1249,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work) }, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster") By(fmt.Sprintf("first work %s is created in %s", work.Name, work.Namespace)) - // check the work for the secondary resource snapshot is created, it's name is crp-subindex + // check the work for the secondary resource snapshot is created, its name is crp-subindex Eventually(func() error { return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.WorkNameWithSubindexFmt, testCRPName, 1), Namespace: memberClusterNamespaceName}, &work) }, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster") @@ -1257,13 +1259,13 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceCRD, testNameSpace, testResource, testConfigMap, testPdb, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("new master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("new master resource snapshot %s created", masterSnapshot.Name)) // update binding Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) binding.Spec.ResourceSnapshotName = masterSnapshot.Name Expect(k8sClient.Update(ctx, binding)).Should(Succeed()) updateRolloutStartedGeneration(&binding) - By(fmt.Sprintf("resource binding %s updated", binding.Name)) + By(fmt.Sprintf("resource binding %s updated", binding.Name)) //inspect the work manifest that should have been updated to contain all expectedManifest := []placementv1beta1.Manifest{ {RawExtension: runtime.RawExtension{Raw: testResourceCRD}}, @@ -1301,7 +1303,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work) }, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster") By(fmt.Sprintf("first work %s is created in %s", work.Name, work.Namespace)) - // check the work for the secondary resource snapshot is created, it's name is crp-subindex + // check the work for the secondary resource snapshot is created, its name is crp-subindex work2 := placementv1beta1.Work{} Eventually(func() error { return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.WorkNameWithSubindexFmt, testCRPName, 1), Namespace: memberClusterNamespaceName}, &work2) @@ -1310,7 +1312,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", // delete the binding Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) Expect(k8sClient.Delete(ctx, binding)).Should(Succeed()) - By(fmt.Sprintf("resource binding %s is deleted", binding.Name)) + By(fmt.Sprintf("resource binding %s is deleted", binding.Name)) // verify that all associated works have been deleted Eventually(func() error { @@ -1352,7 +1354,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceCRD, testNameSpace, testResource, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) crolist := []string{validClusterResourceOverrideSnapshotName} roList := []placementv1beta1.NamespacedName{ { @@ -1446,7 +1448,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) binding.Spec.State = placementv1beta1.BindingStateUnscheduled Expect(k8sClient.Update(ctx, binding)).Should(Succeed()) - By(fmt.Sprintf("resource binding %s updated to be unscheduled", binding.Name)) + By(fmt.Sprintf("resource binding %s updated to be unscheduled", binding.Name)) updateRolloutStartedGeneration(&binding) Consistently(func() error { return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work) @@ -1472,7 +1474,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceCRD, testNameSpace, testResource, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) spec := placementv1beta1.ResourceBindingSpec{ State: placementv1beta1.BindingStateBound, ResourceSnapshotName: masterSnapshot.Name, @@ -1519,7 +1521,70 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", } return cmp.Diff(wantStatus, binding.Status, cmpConditionOption) }, timeout, interval).Should(BeEmpty(), fmt.Sprintf("binding(%s) mismatch (-want +got)", binding.Name)) - Expect(binding.GetCondition(string(placementv1beta1.ResourceBindingOverridden)).Message).Should(ContainSubstring("Failed to apply the override rules on the resources: add operation does not apply")) + message := binding.GetCondition(string(placementv1beta1.ResourceBindingOverridden)).Message + Expect(message).Should(MatchRegexp(`ClusterResourceOverrideSnapshot "[^"]+" failed to apply on \w+ "[^"]+".*: .*add operation does not apply`)) + }) + }) + + Context("Test Bound ClusterResourceBinding with a single resource snapshot and invalid resource override", func() { + var masterSnapshot *placementv1beta1.ClusterResourceSnapshot + + BeforeEach(func() { + masterSnapshot = generateClusterResourceSnapshot(1, 1, 0, [][]byte{ + testResourceCRD, testNameSpace, testResource, + }) + Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) + By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) + spec := placementv1beta1.ResourceBindingSpec{ + State: placementv1beta1.BindingStateBound, + ResourceSnapshotName: masterSnapshot.Name, + TargetCluster: memberClusterName, + ResourceOverrideSnapshots: []placementv1beta1.NamespacedName{ + { + Name: invalidResourceOverrideSnapshotName, + Namespace: appNamespaceName, + }, + }, + } + createClusterResourceBinding(&binding, spec) + }) + + AfterEach(func() { + By("Deleting master clusterResourceSnapshot") + Expect(k8sClient.Delete(ctx, masterSnapshot)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{})) + }) + + It("Should not create the work in the target namespace and surface ResourceOverrideSnapshot identity in the condition message", func() { + work := placementv1beta1.Work{} + Consistently(func() bool { + err := k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work) + return errors.IsNotFound(err) + }, duration, interval).Should(BeTrue(), "controller should not create work in hub cluster when an override fails") + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) + Expect(len(binding.Finalizers)).Should(Equal(1)) + + Eventually(func() string { + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) + wantStatus := placementv1beta1.ResourceBindingStatus{ + Conditions: []metav1.Condition{ + { + Type: string(placementv1beta1.ResourceBindingRolloutStarted), + Status: metav1.ConditionTrue, + Reason: condition.RolloutStartedReason, + ObservedGeneration: binding.GetGeneration(), + }, + { + Type: string(placementv1beta1.ResourceBindingOverridden), + Status: metav1.ConditionFalse, + Reason: condition.OverriddenFailedReason, + ObservedGeneration: binding.GetGeneration(), + }, + }, + } + return cmp.Diff(wantStatus, binding.Status, cmpConditionOption) + }, timeout, interval).Should(BeEmpty(), fmt.Sprintf("binding(%s) mismatch (-want +got)", binding.Name)) + message := binding.GetCondition(string(placementv1beta1.ResourceBindingOverridden)).Message + Expect(message).Should(MatchRegexp(`ResourceOverrideSnapshot "[^"]+" failed to apply on \w+ "[^"]+".*: .*add operation does not apply`)) }) }) @@ -1531,7 +1596,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceCRD, testNameSpace, testResource, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) spec := placementv1beta1.ResourceBindingSpec{ State: placementv1beta1.BindingStateBound, ResourceSnapshotName: masterSnapshot.Name, @@ -1588,7 +1653,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceCRD, testNameSpace, testResource, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) spec := placementv1beta1.ResourceBindingSpec{ State: placementv1beta1.BindingStateBound, ResourceSnapshotName: masterSnapshot.Name, @@ -1673,7 +1738,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) binding.Spec.State = placementv1beta1.BindingStateUnscheduled Expect(k8sClient.Update(ctx, binding)).Should(Succeed()) - By(fmt.Sprintf("resource binding %s updated to be unscheduled", binding.Name)) + By(fmt.Sprintf("resource binding %s updated to be unscheduled", binding.Name)) updateRolloutStartedGeneration(&binding) rolloutCond := binding.GetCondition(string(placementv1beta1.ResourceBindingRolloutStarted)) Consistently(func() error { diff --git a/pkg/controllers/workgenerator/override.go b/pkg/controllers/workgenerator/override.go index a5af94417..91c79f6a4 100644 --- a/pkg/controllers/workgenerator/override.go +++ b/pkg/controllers/workgenerator/override.go @@ -35,6 +35,10 @@ import ( "github.com/kubefleet-dev/kubefleet/pkg/utils/overrider" ) +// fetchClusterResourceOverrideSnapshots returns the binding's CRO snapshots keyed by the +// ResourceIdentifier their selectors target. Order matches the binding's snapshot list so +// callers can apply deterministically. +// // TODO: combine the following two functions into one, as they are very similar. func (r *Reconciler) fetchClusterResourceOverrideSnapshots(ctx context.Context, resourceBinding placementv1beta1.BindingObj) (map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ClusterResourceOverrideSnapshot, error) { croMap := make(map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ClusterResourceOverrideSnapshot) @@ -70,6 +74,9 @@ func (r *Reconciler) fetchClusterResourceOverrideSnapshots(ctx context.Context, return croMap, nil } +// fetchResourceOverrideSnapshots returns the binding's RO snapshots keyed by the +// ResourceIdentifier their selectors target. Order matches the binding's snapshot list so +// callers can apply deterministically. func (r *Reconciler) fetchResourceOverrideSnapshots(ctx context.Context, resourceBinding placementv1beta1.BindingObj) (map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ResourceOverrideSnapshot, error) { roMap := make(map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ResourceOverrideSnapshot) @@ -127,11 +134,11 @@ func (r *Reconciler) applyOverrides(resource *placementv1beta1.ResourceContent, Kind: gvk.Kind, Name: uResource.GetName(), } - isClusterScopeResource := r.InformerManager.IsClusterScopedResources(gvk) + isClusterScopedResource := r.InformerManager.IsClusterScopedResources(gvk) // For the namespace scoped resource, it could be selected by the namespace itself. - // use the namespace as the key - if !isClusterScopeResource { + // Use the namespace as the key. + if !isClusterScopedResource { key = placementv1beta1.ResourceIdentifier{ Group: utils.NamespaceMetaGVK.Group, Version: utils.NamespaceMetaGVK.Version, @@ -149,14 +156,15 @@ func (r *Reconciler) applyOverrides(resource *placementv1beta1.ResourceContent, } if err := applyOverrideRules(resource, cluster, snapshot.Spec.OverrideSpec.Policy.OverrideRules); err != nil { klog.ErrorS(err, "Failed to apply the override rules", "clusterResourceOverrideSnapshot", klog.KObj(snapshot)) - return false, err + return false, controller.NewUserError(fmt.Errorf("ClusterResourceOverrideSnapshot %q failed to apply on %s: %s", + snapshot.Name, formatOverrideTarget(&uResource), err.Error())) } } klog.V(2).InfoS("Applied clusterResourceOverrideSnapshots", "resource", klog.KObj(&uResource), "numberOfOverrides", len(croMap[key])) // If the resource is selected by both ClusterResourceOverride and ResourceOverride, ResourceOverride will win when resolving conflicts. // Apply ResourceOverrideSnapshots. - if !isClusterScopeResource { + if !isClusterScopedResource { key = placementv1beta1.ResourceIdentifier{ Group: gvk.Group, Version: gvk.Version, @@ -172,7 +180,8 @@ func (r *Reconciler) applyOverrides(resource *placementv1beta1.ResourceContent, } if err := applyOverrideRules(resource, cluster, snapshot.Spec.OverrideSpec.Policy.OverrideRules); err != nil { klog.ErrorS(err, "Failed to apply the override rules", "resourceOverrideSnapshot", klog.KObj(snapshot)) - return false, err + return false, controller.NewUserError(fmt.Errorf("ResourceOverrideSnapshot %q failed to apply on %s: %s", + snapshot.Name, formatOverrideTarget(&uResource), err.Error())) } } klog.V(2).InfoS("Applied resourceOverrideSnapshots", "resource", klog.KObj(&uResource), "numberOfOverrides", len(roMap[key])) @@ -180,12 +189,29 @@ func (r *Reconciler) applyOverrides(resource *placementv1beta1.ResourceContent, return resource.Raw == nil, nil } +// formatOverrideTarget renders the target as e.g. `Deployment "my-app" in namespace "default"` +// for inclusion in a user-facing error message. +func formatOverrideTarget(target *unstructured.Unstructured) string { + // Fall back to "Unknown" so a snapshot missing `kind` doesn't render as `"" "name"`. + kind := target.GetObjectKind().GroupVersionKind().Kind + if kind == "" { + kind = "Unknown" + } + if ns := target.GetNamespace(); ns != "" { + return fmt.Sprintf("%s %q in namespace %q", kind, target.GetName(), ns) + } + return fmt.Sprintf("%s %q", kind, target.GetName()) +} + +// applyOverrideRules applies matching rules to the resource. A DeleteOverrideType rule clears +// the resource and stops; otherwise JSON patches apply in order. Errors are returned raw — the +// caller (applyOverrides) tags them as user errors so we don't double-wrap the sentinel. func applyOverrideRules(resource *placementv1beta1.ResourceContent, cluster *clusterv1beta1.MemberCluster, rules []placementv1beta1.OverrideRule) error { for _, rule := range rules { matched, err := overrider.IsClusterMatched(cluster, rule) if err != nil { - klog.ErrorS(controller.NewUnexpectedBehaviorError(err), "Found an invalid override rule") - return controller.NewUserError(err) // should not happen though and should be rejected by the webhook + klog.ErrorS(err, "Found an invalid override rule") + return err } if !matched { continue @@ -198,7 +224,7 @@ func applyOverrideRules(resource *placementv1beta1.ResourceContent, cluster *clu // Apply JSONPatchOverrides by default if err = applyJSONPatchOverride(resource, cluster, rule.JSONPatchOverrides); err != nil { klog.ErrorS(err, "Failed to apply JSON patch override") - return controller.NewUserError(err) + return err } } return nil @@ -207,11 +233,11 @@ func applyOverrideRules(resource *placementv1beta1.ResourceContent, cluster *clu // applyJSONPatchOverride applies a JSON patch on the selected resources following [RFC 6902](https://datatracker.ietf.org/doc/html/rfc6902). func applyJSONPatchOverride(resourceContent *placementv1beta1.ResourceContent, cluster *clusterv1beta1.MemberCluster, overrides []placementv1beta1.JSONPatchOverride) error { var err error - if len(overrides) == 0 { // do nothing + if len(overrides) == 0 { return nil } - // go through the JSON patch overrides to replace the built-in variables before json Marshal - // as it may contain the built-in variables that cannot be marshaled directly + // Go through the JSON patch overrides to replace the built-in variables before json.Marshal, + // as the patch values may contain built-in variables that cannot be marshaled directly. for i := range overrides { // Process the JSON string to replace variables jsonStr := string(overrides[i].Value.Raw) @@ -263,14 +289,14 @@ func replaceClusterLabelKeyVariables(input string, cluster *clusterv1beta1.Membe // extract the key value user wants to replace endIdx := strings.Index(result[startIdx+prefixLen:], "}") if endIdx == -1 { - klog.V(2).InfoS("malformed key ${MEMBER-CLUSTER-LABEL-KEY without the closing `}`", "input", input) + klog.V(2).InfoS("Malformed key ${MEMBER-CLUSTER-LABEL-KEY without the closing `}`", "input", input) return "", fmt.Errorf("input %s is missing the closing bracket `}`", input) } endIdx += startIdx + prefixLen // extract the key name keyName := result[startIdx+prefixLen : endIdx] // check if the key exists in the cluster labels - labelValue, exists := cluster.ObjectMeta.Labels[keyName] + labelValue, exists := cluster.Labels[keyName] if !exists { klog.V(2).InfoS("Label key not found on cluster", "key", keyName, "cluster", cluster.Name) return "", fmt.Errorf("label key %s not found on cluster %s", keyName, cluster.Name) diff --git a/pkg/controllers/workgenerator/override_test.go b/pkg/controllers/workgenerator/override_test.go index a1e9ed8f2..324d2c02e 100644 --- a/pkg/controllers/workgenerator/override_test.go +++ b/pkg/controllers/workgenerator/override_test.go @@ -20,6 +20,7 @@ import ( "context" "errors" "fmt" + "strings" "testing" "github.com/google/go-cmp/cmp" @@ -428,6 +429,9 @@ func TestFetchResourceOverrideSnapshot(t *testing.T) { } func TestApplyOverrides_clusterScopedResource(t *testing.T) { + // FakeManager.IsClusterScopedResources returns (APIResources[gvk] == IsClusterScopedResource). + // With the flag set to false and APIResources listing namespace-scoped GVKs, listed GVKs + // report as namespace-scoped and unlisted GVKs (ClusterRole) report as cluster-scoped. fakeInformer := informer.FakeManager{ APIResources: map[schema.GroupVersionKind]bool{ { @@ -450,7 +454,9 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) { croMap map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ClusterResourceOverrideSnapshot wantClusterRole rbacv1.ClusterRole wantErr error - wantDeleted bool + // wantErrSubstr asserts the failure message names the failing snapshot and target. + wantErrSubstr []string + wantDeleted bool }{ { name: "empty overrides", @@ -844,6 +850,9 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) { Name: "clusterrole-name", }: { { + ObjectMeta: metav1.ObjectMeta{ + Name: "conflicting-rules-cro-snapshot", + }, Spec: placementv1beta1.ClusterResourceOverrideSnapshotSpec{ OverrideSpec: placementv1beta1.ClusterResourceOverrideSpec{ Policy: &placementv1beta1.OverridePolicy{ @@ -895,6 +904,10 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) { }, }, wantErr: controller.ErrUserError, + wantErrSubstr: []string{ + `ClusterResourceOverrideSnapshot "conflicting-rules-cro-snapshot"`, + `ClusterRole "clusterrole-name"`, + }, }, { name: "invalid json patch of clusterResourceOverride", @@ -921,6 +934,9 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) { Name: "clusterrole-name", }: { { + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cro-snapshot", + }, Spec: placementv1beta1.ClusterResourceOverrideSnapshotSpec{ OverrideSpec: placementv1beta1.ClusterResourceOverrideSpec{ Policy: &placementv1beta1.OverridePolicy{ @@ -953,6 +969,71 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) { }, }, wantErr: controller.ErrUserError, + wantErrSubstr: []string{ + `ClusterResourceOverrideSnapshot "test-cro-snapshot"`, + `ClusterRole "clusterrole-name"`, + }, + }, + { + // Invalid LabelSelector Operator makes IsClusterMatched fail — exercises the + // applyOverrideRules raw-error return that applyOverrides wraps. + name: "invalid cluster label selector in clusterResourceOverride", + clusterRole: rbacv1.ClusterRole{ + TypeMeta: clusterRoleType, + ObjectMeta: metav1.ObjectMeta{ + Name: "clusterrole-name", + }, + }, + cluster: clusterv1beta1.MemberCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cluster-1", + }, + }, + croMap: map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ClusterResourceOverrideSnapshot{ + { + Group: "rbac.authorization.k8s.io", + Version: "v1", + Kind: "ClusterRole", + Name: "clusterrole-name", + }: { + { + ObjectMeta: metav1.ObjectMeta{ + Name: "bad-selector-cro-snapshot", + }, + Spec: placementv1beta1.ClusterResourceOverrideSnapshotSpec{ + OverrideSpec: placementv1beta1.ClusterResourceOverrideSpec{ + Policy: &placementv1beta1.OverridePolicy{ + OverrideRules: []placementv1beta1.OverrideRule{ + { + ClusterSelector: &placementv1beta1.ClusterSelector{ + ClusterSelectorTerms: []placementv1beta1.ClusterSelectorTerm{ + { + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "key1", + Operator: "InvalidOperator", + Values: []string{"value1"}, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + wantErr: controller.ErrUserError, + wantErrSubstr: []string{ + `ClusterResourceOverrideSnapshot "bad-selector-cro-snapshot"`, + `ClusterRole "clusterrole-name"`, + `invalid cluster label selector`, + }, }, { name: "delete during the clusterResourceOverride", @@ -1118,6 +1199,11 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) { t.Fatalf("applyOverrides() gotDeleted %v, want %v", gotDeleted, tc.wantDeleted) } if tc.wantErr != nil { + for _, want := range tc.wantErrSubstr { + if !strings.Contains(err.Error(), want) { + t.Errorf("applyOverrides() error = %q, want to contain %q", err.Error(), want) + } + } return } if tc.wantDeleted { @@ -1126,12 +1212,12 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) { var u unstructured.Unstructured if err := u.UnmarshalJSON(rc.Raw); err != nil { - t.Fatalf("Failed to unmarshl the result: %v, want nil", err) + t.Fatalf("Failed to unmarshal the result: %v, want nil", err) } var clusterRole rbacv1.ClusterRole if err := runtime.DefaultUnstructuredConverter.FromUnstructured(u.Object, &clusterRole); err != nil { - t.Fatalf("Failed to convert the result to clusterole: %v, want nil", err) + t.Fatalf("Failed to convert the result to clusterRole: %v, want nil", err) } if diff := cmp.Diff(tc.wantClusterRole, clusterRole); diff != "" { @@ -1141,7 +1227,8 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) { } } -func TestApplyOverrides_namespacedScopeResource(t *testing.T) { +func TestApplyOverrides_namespaceScopedResource(t *testing.T) { + // See TestApplyOverrides_clusterScopedResource for the IsClusterScopedResource: false rationale. fakeInformer := informer.FakeManager{ APIResources: map[schema.GroupVersionKind]bool{ { @@ -1165,7 +1252,9 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { roMap map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ResourceOverrideSnapshot wantDeployment appsv1.Deployment wantErr error - wantDelete bool + // wantErrSubstr asserts the failure message names the failing snapshot and target. + wantErrSubstr []string + wantDeleted bool }{ { name: "empty overrides", @@ -1664,6 +1753,9 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { Name: "deployment-namespace", }: { { + ObjectMeta: metav1.ObjectMeta{ + Name: "invalid-patch-cro-snapshot", + }, Spec: placementv1beta1.ClusterResourceOverrideSnapshotSpec{ OverrideSpec: placementv1beta1.ClusterResourceOverrideSpec{ Policy: &placementv1beta1.OverridePolicy{ @@ -1697,6 +1789,11 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { }, }, wantErr: controller.ErrUserError, + wantErrSubstr: []string{ + `ClusterResourceOverrideSnapshot "invalid-patch-cro-snapshot"`, + `Deployment "deployment-name"`, + `namespace "deployment-namespace"`, + }, }, { name: "invalid json patch of resourceOverride", @@ -1728,6 +1825,10 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { Namespace: "deployment-namespace", }: { { + ObjectMeta: metav1.ObjectMeta{ + Name: "invalid-patch-ro-snapshot", + Namespace: "deployment-namespace", + }, Spec: placementv1beta1.ResourceOverrideSnapshotSpec{ OverrideSpec: placementv1beta1.ResourceOverrideSpec{ Policy: &placementv1beta1.OverridePolicy{ @@ -1751,6 +1852,11 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { }, }, wantErr: controller.ErrUserError, + wantErrSubstr: []string{ + `ResourceOverrideSnapshot "invalid-patch-ro-snapshot"`, + `Deployment "deployment-name"`, + `namespace "deployment-namespace"`, + }, }, { name: "delete type of resourceOverride", @@ -1797,7 +1903,7 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { }, }, }, - wantDelete: true, + wantDeleted: true, }, { name: "resourceOverride delete the cro override", @@ -1884,7 +1990,7 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { }, }, }, - wantDelete: true, + wantDeleted: true, }, { name: "resourceOverride no-op when the cro delete", @@ -1970,7 +2076,7 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { }, }, }, - wantDelete: true, + wantDeleted: true, }, { name: "cluster name as value in json patch of resourceOverride", @@ -2218,6 +2324,10 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { Namespace: "deployment-namespace", }: { { + ObjectMeta: metav1.ObjectMeta{ + Name: "test-ro-snapshot", + Namespace: "deployment-namespace", + }, Spec: placementv1beta1.ResourceOverrideSnapshotSpec{ OverrideSpec: placementv1beta1.ResourceOverrideSpec{ Policy: &placementv1beta1.OverridePolicy{ @@ -2240,6 +2350,11 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { }, }, wantErr: controller.ErrUserError, + wantErrSubstr: []string{ + `ResourceOverrideSnapshot "test-ro-snapshot"`, + `Deployment "deployment-name"`, + `namespace "deployment-namespace"`, + }, }, } for _, tc := range tests { @@ -2252,19 +2367,24 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { if gotErr, wantErr := err != nil, tc.wantErr != nil; gotErr != wantErr || !errors.Is(err, tc.wantErr) { t.Fatalf("applyOverrides() got error %v, want error %v", err, tc.wantErr) } - if gotDeleted != tc.wantDelete { - t.Fatalf("applyOverrides() gotDeleted %v, want %v", gotDeleted, tc.wantDelete) + if gotDeleted != tc.wantDeleted { + t.Fatalf("applyOverrides() gotDeleted %v, want %v", gotDeleted, tc.wantDeleted) } if tc.wantErr != nil { + for _, want := range tc.wantErrSubstr { + if !strings.Contains(err.Error(), want) { + t.Errorf("applyOverrides() error = %q, want to contain %q", err.Error(), want) + } + } return } - if tc.wantDelete { + if tc.wantDeleted { return } var u unstructured.Unstructured if err := u.UnmarshalJSON(rc.Raw); err != nil { - t.Fatalf("Failed to unmarshl the result: %v, want nil", err) + t.Fatalf("Failed to unmarshal the result: %v, want nil", err) } var deployment appsv1.Deployment @@ -2757,7 +2877,7 @@ func TestApplyJSONPatchOverride(t *testing.T) { var u unstructured.Unstructured if err := u.UnmarshalJSON(rc.Raw); err != nil { - t.Fatalf("Failed to unmarshl the result: %v, want nil", err) + t.Fatalf("Failed to unmarshal the result: %v, want nil", err) } var deployment appsv1.Deployment @@ -2774,10 +2894,10 @@ func TestApplyJSONPatchOverride(t *testing.T) { func TestReplaceClusterLabelKeyVariables(t *testing.T) { tests := map[string]struct { - cluster *clusterv1beta1.MemberCluster - input string - expected string - expectErr bool + cluster *clusterv1beta1.MemberCluster + input string + want string + wantErr bool }{ "No clusterLabelKey variables": { cluster: &clusterv1beta1.MemberCluster{ @@ -2787,8 +2907,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) { }, }, }, - input: "The cluster is in us-west-1", - expected: "The cluster is in us-west-1", + input: "The cluster is in us-west-1", + want: "The cluster is in us-west-1", }, "ClusterLabelKey Variable replaced": { cluster: &clusterv1beta1.MemberCluster{ @@ -2798,8 +2918,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) { }, }, }, - input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region}", - expected: "The cluster is in us-west-1", + input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region}", + want: "The cluster is in us-west-1", }, "The clusterLabelKey key is misspelled": { cluster: &clusterv1beta1.MemberCluster{ @@ -2807,8 +2927,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) { Labels: map[string]string{}, }, }, - input: "The cluster is in $MEMBER-CLUSTER-LABEL-KEY-region", - expected: "The cluster is in $MEMBER-CLUSTER-LABEL-KEY-region", + input: "The cluster is in $MEMBER-CLUSTER-LABEL-KEY-region", + want: "The cluster is in $MEMBER-CLUSTER-LABEL-KEY-region", }, "Multiple complex clusterLabelKey variables replaced": { cluster: &clusterv1beta1.MemberCluster{ @@ -2819,8 +2939,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) { }, }, }, - input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-fleet.azure.com/location-region_public} and environment is ${MEMBER-CLUSTER-LABEL-KEY-fleet.azure.com/env}", - expected: "The cluster is in us-west-1 and environment is prod", + input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-fleet.azure.com/location-region_public} and environment is ${MEMBER-CLUSTER-LABEL-KEY-fleet.azure.com/env}", + want: "The cluster is in us-west-1 and environment is prod", }, "The clusterLabelKey key is not found": { cluster: &clusterv1beta1.MemberCluster{ @@ -2828,8 +2948,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) { Labels: map[string]string{}, }, }, - input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region}", - expectErr: true, + input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region}", + wantErr: true, }, "ClusterLabelKey Variable key case not match": { cluster: &clusterv1beta1.MemberCluster{ @@ -2839,10 +2959,10 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) { }, }, }, - input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-REGION}", - expectErr: true, + input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-REGION}", + wantErr: true, }, - "Invalid clusterLabelKey variable format": { + "Invalid clusterLabelKey variable format": { cluster: &clusterv1beta1.MemberCluster{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ @@ -2850,8 +2970,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) { }, }, }, - input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region", - expectErr: true, + input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region", + wantErr: true, }, "ClusterLabelKey variable key empty": { cluster: &clusterv1beta1.MemberCluster{ @@ -2861,19 +2981,61 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) { }, }, }, - input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-}", - expectErr: true, + input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-}", + wantErr: true, }, } for name, tc := range tests { t.Run(name, func(t *testing.T) { result, err := replaceClusterLabelKeyVariables(tc.input, tc.cluster) - if gotErr := err != nil; gotErr != tc.expectErr { - t.Fatalf("applyJSONPatchOverride() = error %v, want %v", err, tc.expectErr) + if gotErr := err != nil; gotErr != tc.wantErr { + t.Fatalf("replaceClusterLabelKeyVariables() = error %v, want %v", err, tc.wantErr) } - if result != tc.expected { - t.Errorf("replaceClusterLabelKeyVariables() = %v, want %v", result, tc.expected) + if result != tc.want { + t.Errorf("replaceClusterLabelKeyVariables() = %v, want %v", result, tc.want) + } + }) + } +} + +func TestFormatOverrideTarget(t *testing.T) { + tests := map[string]struct { + kind string + name string + namespace string + want string + }{ + "namespaced resource": { + kind: "Deployment", + name: "my-app", + namespace: "default", + want: `Deployment "my-app" in namespace "default"`, + }, + "cluster-scoped resource": { + kind: "Namespace", + name: "app", + want: `Namespace "app"`, + }, + "empty kind falls back to Unknown": { + name: "my-app", + namespace: "default", + want: `Unknown "my-app" in namespace "default"`, + }, + "empty kind, cluster-scoped": { + name: "app", + want: `Unknown "app"`, + }, + } + + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + target := &unstructured.Unstructured{} + target.SetGroupVersionKind(schema.GroupVersionKind{Kind: tc.kind}) + target.SetName(tc.name) + target.SetNamespace(tc.namespace) + if got := formatOverrideTarget(target); got != tc.want { + t.Errorf("formatOverrideTarget() = %q, want %q", got, tc.want) } }) } diff --git a/pkg/controllers/workgenerator/suite_test.go b/pkg/controllers/workgenerator/suite_test.go index 88e6850b8..e607846f3 100644 --- a/pkg/controllers/workgenerator/suite_test.go +++ b/pkg/controllers/workgenerator/suite_test.go @@ -301,6 +301,47 @@ func createOverrides() { } Expect(k8sClient.Create(ctx, &invalidClusterResourceOverrideSnapshot)).Should(Succeed(), "Failed to create the cro-2") By(fmt.Sprintf("Invalid cluster resource override snapshot %s created", invalidClusterResourceOverrideSnapshotName)) + + invalidResourceOverrideSnapshot = placementv1beta1.ResourceOverrideSnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: invalidResourceOverrideSnapshotName, + Namespace: appNamespaceName, + Labels: map[string]string{ + placementv1beta1.IsLatestSnapshotLabel: "true", + }, + }, + Spec: placementv1beta1.ResourceOverrideSnapshotSpec{ + OverrideSpec: placementv1beta1.ResourceOverrideSpec{ + ResourceSelectors: []placementv1beta1.ResourceSelector{ + { + Group: "test.kubernetes-fleet.io", + Version: "v1alpha1", + Kind: "TestResource", + Name: "random-test-resource", + }, + }, + Policy: &placementv1beta1.OverridePolicy{ + OverrideRules: []placementv1beta1.OverrideRule{ + { + ClusterSelector: &placementv1beta1.ClusterSelector{ + ClusterSelectorTerms: []placementv1beta1.ClusterSelectorTerm{}, // select all members + }, + JSONPatchOverrides: []placementv1beta1.JSONPatchOverride{ + { + Operator: placementv1beta1.JSONPatchOverrideOpAdd, + Path: "/invalid/path", + Value: apiextensionsv1.JSON{Raw: []byte(`"new-value"`)}, + }, + }, + }, + }, + }, + }, + OverrideHash: []byte("123"), + }, + } + Expect(k8sClient.Create(ctx, &invalidResourceOverrideSnapshot)).Should(Succeed(), "Failed to create the ro-2") + By(fmt.Sprintf("Invalid resource override snapshot %s created", invalidResourceOverrideSnapshotName)) } var _ = AfterSuite(func() { @@ -309,6 +350,7 @@ var _ = AfterSuite(func() { Expect(k8sClient.Delete(ctx, &validClusterResourceOverrideSnapshot)).Should(Succeed(), "Failed to delete the cro-1") Expect(k8sClient.Delete(ctx, &validResourceOverrideSnapshot)).Should(Succeed(), "Failed to delete the ro-1") Expect(k8sClient.Delete(ctx, &invalidClusterResourceOverrideSnapshot)).Should(Succeed(), "Failed to delete the cro-2") + Expect(k8sClient.Delete(ctx, &invalidResourceOverrideSnapshot)).Should(Succeed(), "Failed to delete the ro-2") Expect(k8sClient.Delete(ctx, &appNamespace)).Should(Succeed(), "Failed to delete app namespace") cancel() From ce6b31dd4ac9d39de94738a0f89e00a1874fa234 Mon Sep 17 00:00:00 2001 From: michaelawyu Date: Wed, 13 May 2026 15:39:35 +1000 Subject: [PATCH 17/19] fix: address a RO directory issue in templates (#706) --- charts/hub-agent/templates/deployment.yaml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/charts/hub-agent/templates/deployment.yaml b/charts/hub-agent/templates/deployment.yaml index 3bdca63fa..882a3fa12 100644 --- a/charts/hub-agent/templates/deployment.yaml +++ b/charts/hub-agent/templates/deployment.yaml @@ -90,13 +90,21 @@ spec: fieldPath: metadata.namespace resources: {{- toYaml .Values.resources | nindent 12 }} + {{- if .Values.useCertManager }} volumeMounts: - name: webhook-cert # This path must match FleetWebhookCertDir in pkg/webhook/webhook.go mountPath: /tmp/k8s-webhook-server/serving-certs - {{- if .Values.useCertManager }} readOnly: true - {{- end }} + {{- else }} + volumeMounts: + - name: webhook-cert + # This path must match FleetWebhookCertDir in pkg/webhook/webhook.go. + # Note that this must be mounted one level up from the hardcoded path, otherwise + # the read only root filesystem setup would block the agent from attempting to + # clear the directory. + mountPath: /tmp/k8s-webhook-server/ + {{- end }} volumes: - name: webhook-cert {{- if .Values.useCertManager }} From 9dee67a64663d6037450165d2ba37b91f07795eb Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes Date: Wed, 13 May 2026 13:41:07 -0700 Subject: [PATCH 18/19] Fix failing e2e test now that enableWorkload is False meaning PVC can be placed Signed-off-by: Britania Rodriguez Reyes --- test/e2e/placement_selecting_resources_test.go | 3 ++- test/e2e/resource_placement_selecting_resources_test.go | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/test/e2e/placement_selecting_resources_test.go b/test/e2e/placement_selecting_resources_test.go index 10d6a56c3..b12ffa399 100644 --- a/test/e2e/placement_selecting_resources_test.go +++ b/test/e2e/placement_selecting_resources_test.go @@ -1392,11 +1392,12 @@ var _ = Describe("creating CRP and checking selected resources order", Ordered, It("should update CRP status with the correct order of the selected resources", func() { // Define the expected resources in order - // Note: PVCs are not propagated, so they should not appear in selected resources + // Note: PVCs are propagated when enableWorkload=false (the default for E2E tests) expectedResources := []placementv1beta1.ResourceIdentifier{ {Kind: "Namespace", Name: nsName, Version: "v1"}, {Kind: "Secret", Name: secret.Name, Namespace: nsName, Version: "v1"}, {Kind: "ConfigMap", Name: configMap.Name, Namespace: nsName, Version: "v1"}, + {Kind: "PersistentVolumeClaim", Name: pvc.Name, Namespace: nsName, Version: "v1"}, {Group: "rbac.authorization.k8s.io", Kind: "Role", Name: role.Name, Namespace: nsName, Version: "v1"}, } diff --git a/test/e2e/resource_placement_selecting_resources_test.go b/test/e2e/resource_placement_selecting_resources_test.go index 2e6966274..9737dd787 100644 --- a/test/e2e/resource_placement_selecting_resources_test.go +++ b/test/e2e/resource_placement_selecting_resources_test.go @@ -1057,10 +1057,11 @@ var _ = Describe("testing RP selecting resources", Label("resourceplacement"), f It("should update RP status with the correct order of the selected resources", func() { // Define the expected resources in order. - // Note: PVCs are not propagated, so they should not appear in selected resources + // Note: PVCs are propagated when enableWorkload=false (the default for E2E tests) expectedResources := []placementv1beta1.ResourceIdentifier{ {Kind: "Secret", Name: secret.Name, Namespace: nsName, Version: "v1"}, {Kind: "ConfigMap", Name: configMap.Name, Namespace: nsName, Version: "v1"}, + {Kind: "PersistentVolumeClaim", Name: pvc.Name, Namespace: nsName, Version: "v1"}, {Group: "rbac.authorization.k8s.io", Kind: "Role", Name: role.Name, Namespace: nsName, Version: "v1"}, } From 4b121aa34c4a6281726c37f134ed69531b45af82 Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes Date: Wed, 13 May 2026 15:53:42 -0700 Subject: [PATCH 19/19] Disable test due to enableWorkload=false change Signed-off-by: Britania Rodriguez Reyes --- test/e2e/resource_placement_hub_workload_test.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/e2e/resource_placement_hub_workload_test.go b/test/e2e/resource_placement_hub_workload_test.go index b088109ee..0e644d145 100644 --- a/test/e2e/resource_placement_hub_workload_test.go +++ b/test/e2e/resource_placement_hub_workload_test.go @@ -33,6 +33,9 @@ import ( placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" ) +// This test requires enableWorkload=true because it creates workloads on the hub cluster +// and expects them to become ready. With enableWorkload=false, the pod webhook blocks +// pod creation on the hub cluster. var _ = Describe("placing workloads using a CRP with PickAll policy", Label("resourceplacement"), Ordered, func() { crpName := fmt.Sprintf(crpNameTemplate, GinkgoParallelProcess()) var testDeployment appsv1.Deployment @@ -41,6 +44,8 @@ var _ = Describe("placing workloads using a CRP with PickAll policy", Label("res var testStatefulSet appsv1.StatefulSet BeforeAll(func() { + Skip("This test requires enableWorkload=true to run workloads on the hub cluster") + // Read the test manifests readDeploymentTestManifest(&testDeployment) readDaemonSetTestManifest(&testDaemonSet)