diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index ccf33b9b8..2d761329b 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -42,7 +42,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4 + uses: github/codeql-action/init@e46ed2cbd01164d986452f91f178727624ae40d7 # v4 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -56,7 +56,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4 + uses: github/codeql-action/autobuild@e46ed2cbd01164d986452f91f178727624ae40d7 # v4 # â„šī¸ Command-line programs to run using the OS shell. # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -69,4 +69,4 @@ jobs: # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4 + uses: github/codeql-action/analyze@e46ed2cbd01164d986452f91f178727624ae40d7 # v4 diff --git a/Makefile b/Makefile index e2941acc7..be3741254 100644 --- a/Makefile +++ b/Makefile @@ -275,6 +275,7 @@ build: generate fmt vet ## Build agent binaries go build -o bin/hubagent cmd/hubagent/main.go go build -o bin/memberagent cmd/memberagent/main.go go build -o bin/crdinstaller cmd/crdinstaller/main.go + go build -o bin/kubectl-fleet ./tools/fleet/ .PHONY: run-hubagent run-hubagent: manifests generate fmt vet ## Run hub-agent from your host diff --git a/apis/placement/v1/clusterresourceplacement_types.go b/apis/placement/v1/clusterresourceplacement_types.go index 798d09dcf..795841e72 100644 --- a/apis/placement/v1/clusterresourceplacement_types.go +++ b/apis/placement/v1/clusterresourceplacement_types.go @@ -123,13 +123,15 @@ type ClusterResourcePlacement struct { } // PlacementSpec defines the desired state of ClusterResourcePlacement and ResourcePlacement. +// +kubebuilder:validation:XValidation:rule="size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && x.group == \"\" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope == 'NamespaceWithResourceSelectors')) <= 1",message="only one namespace selector with NamespaceWithResourceSelectors mode is allowed" +// +kubebuilder:validation:XValidation:rule="size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && x.group == \"\" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope == 'NamespaceWithResourceSelectors')) == 0 || (size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && x.group == \"\" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope == 'NamespaceWithResourceSelectors' && has(x.name) && size(x.name) > 0 && !has(x.labelSelector))) == 1)",message="namespace selector with NamespaceWithResourceSelectors mode must select by name (not by label)" +// +kubebuilder:validation:XValidation:rule="size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && x.group == \"\" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope == 'NamespaceWithResourceSelectors')) == 0 || size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && x.group == \"\" && x.version == 'v1')) == 1",message="when using NamespaceWithResourceSelectors mode, only one namespace selector is allowed (cannot mix with other namespace selectors)" type PlacementSpec struct { - // +kubebuilder:validation:MinItems=1 - // +kubebuilder:validation:MaxItems=100 - // ResourceSelectors is an array of selectors used to select cluster scoped resources. The selectors are `ORed`. // You can have 1-100 selectors. // +kubebuilder:validation:Required + // +kubebuilder:validation:MinItems=1 + // +kubebuilder:validation:MaxItems=100 ResourceSelectors []ResourceSelectorTerm `json:"resourceSelectors"` // Policy defines how to select member clusters to place the selected resources. @@ -185,6 +187,24 @@ type ResourceSelectorTerm struct { // For ClusterResourcePlacement, you can use SelectionScope to control what gets selected: // - NamespaceOnly: Only the namespace object itself // - NamespaceWithResources: The namespace AND all resources within it (default) + // - NamespaceWithResourceSelectors: The namespace AND resources specified by additional selectors + // + // When SelectionScope is NamespaceWithResourceSelectors, you can define additional ResourceSelectorTerms + // (after the namespace selector) to specify which resources to include. These additional selectors can + // target both namespace-scoped resources (within the selected namespace) and cluster-scoped resources. + // + // Important requirements for NamespaceWithResourceSelectors mode: + // - Exactly one namespace selector with this mode is allowed + // - The namespace selector must select by name (not by label) + // - Only one namespace selector is allowed when using this mode (cannot mix with other namespace selectors) + // - All requirements are validated via CEL at API validation time + // - If the selected namespace is deleted after CRP creation, the controller will report an error condition + // + // Example using NamespaceWithResourceSelectors: + // - Namespace selector: {Group: "", Version: "v1", Kind: "Namespace", Name: "prod", SelectionScope: "NamespaceWithResourceSelectors"} + // - Additional selector: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {app: "frontend"}} + // - Third selector: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "admin"} + // This selects: the "prod" namespace, all Deployments with label app=frontend in "prod", and the "admin" ClusterRole. // // +kubebuilder:validation:Required Kind string `json:"kind"` @@ -204,7 +224,7 @@ type ResourceSelectorTerm struct { // SelectionScope defines the scope of resource selections when the Kind is `namespace`. // This field is only applicable when Kind is "Namespace" and is ignored for other resource kinds. // See the Kind field documentation for detailed examples and usage patterns. - // +kubebuilder:validation:Enum=NamespaceOnly;NamespaceWithResources + // +kubebuilder:validation:Enum=NamespaceOnly;NamespaceWithResources;NamespaceWithResourceSelectors // +kubebuilder:default=NamespaceWithResources // +kubebuilder:validation:Optional SelectionScope SelectionScope `json:"selectionScope,omitempty"` @@ -230,6 +250,52 @@ const ( // Note: This is the default value. When you select a namespace without specifying SelectionScope, // this mode is used automatically. NamespaceWithResources SelectionScope = "NamespaceWithResources" + + // NamespaceWithResourceSelectors allows fine-grained selection of specific resources within a namespace. + // The namespace itself is always selected, and you can optionally specify which resources to include + // by adding additional ResourceSelectorTerm entries after the namespace selector. + // + // Use cases: + // 1. Select only specific resource types from a namespace (e.g., only Deployments and Services) + // 2. Select resources matching certain labels within a namespace + // 3. Include specific cluster-scoped resources along with namespace-scoped resources + // + // How "additional selectors" work: + // - Exactly one namespace selector with NamespaceWithResourceSelectors mode is required + // - This selector must select a namespace by name (label selectors not allowed) + // - ADDITIONAL selectors specify which resources to include: + // - Namespace-scoped resources are filtered to only those within the selected namespace + // - Cluster-scoped resources are included as specified (not limited to the namespace) + // - If no additional selectors are provided, only the namespace object itself is selected + // + // Example 1 - Select specific deployments from a namespace: + // Selector 1: {Group: "", Version: "v1", Kind: "Namespace", Name: "production", SelectionScope: "NamespaceWithResourceSelectors"} + // Selector 2: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {tier: "frontend"}} + // Result: The "production" namespace + all Deployments labeled tier=frontend within "production" + // + // Example 2 - Select namespace with multiple resource types: + // Selector 1: {Group: "", Version: "v1", Kind: "Namespace", Name: "app", SelectionScope: "NamespaceWithResourceSelectors"} + // Selector 2: {Group: "apps", Version: "v1", Kind: "Deployment"} + // Selector 3: {Group: "", Version: "v1", Kind: "Service"} + // Result: The "app" namespace + ALL Deployments and Services within "app" + // + // Example 3 - Include cluster-scoped resources: + // Selector 1: {Group: "", Version: "v1", Kind: "Namespace", Name: "app", SelectionScope: "NamespaceWithResourceSelectors"} + // Selector 2: {Group: "apps", Version: "v1", Kind: "Deployment"} + // Selector 3: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "app-admin"} + // Result: The "app" namespace + ALL Deployments in "app" + the "app-admin" ClusterRole + // + // Important constraints: + // - Exactly ONE namespace selector with NamespaceWithResourceSelectors mode is allowed + // - The namespace selector must select by name (label selectors not allowed) + // - Only ONE namespace selector total is allowed when using this mode (cannot mix with other namespace selectors) + // - All constraints are enforced via CEL at API validation time + // + // Runtime behavior: + // - If the selected namespace is deleted after the CRP is created, the controller will detect this during + // the next reconciliation and report an error condition in the CRP status + // - The CRP will transition to a failed state until the namespace is recreated or the CRP is updated + NamespaceWithResourceSelectors SelectionScope = "NamespaceWithResourceSelectors" ) // PlacementPolicy contains the rules to select target member clusters to place the selected resources. diff --git a/apis/placement/v1/stageupdate_types.go b/apis/placement/v1/stageupdate_types.go index afcc88f96..2e06af937 100644 --- a/apis/placement/v1/stageupdate_types.go +++ b/apis/placement/v1/stageupdate_types.go @@ -209,7 +209,8 @@ type StageTask struct { Type StageTaskType `json:"type"` // The time to wait after all the clusters in the current stage complete the update before moving to the next stage. - // +kubebuilder:validation:Pattern="^0|([0-9]+(\\.[0-9]+)?(s|m|h))+$" + // Only hours (h), minutes (m), and seconds (s) units are accepted. + // +kubebuilder:validation:Pattern="^(?:(?:0|[1-9][0-9]*)(\\.[0-9]+)?(?:s|m|h))+$" // +kubebuilder:validation:Type=string // +kubebuilder:validation:Optional WaitTime *metav1.Duration `json:"waitTime,omitempty"` diff --git a/apis/placement/v1alpha1/stagedupdate_types.go b/apis/placement/v1alpha1/stagedupdate_types.go index 5ab660fec..25352ca7d 100644 --- a/apis/placement/v1alpha1/stagedupdate_types.go +++ b/apis/placement/v1alpha1/stagedupdate_types.go @@ -153,7 +153,7 @@ type AfterStageTask struct { Type AfterStageTaskType `json:"type"` // The time to wait after all the clusters in the current stage complete the update before moving to the next stage. - // +kubebuilder:validation:Pattern="^0|([0-9]+(\\.[0-9]+)?(s|m|h))+$" + // +kubebuilder:validation:Pattern="^(?:0|(?:(?:0|[1-9][0-9]*)(\\.[0-9]+)?(?:s|m|h))+)$" // +kubebuilder:validation:Type=string // +kubebuilder:validation:Optional WaitTime metav1.Duration `json:"waitTime,omitempty"` diff --git a/apis/placement/v1beta1/stageupdate_types.go b/apis/placement/v1beta1/stageupdate_types.go index ef68dd087..7c6fe3842 100644 --- a/apis/placement/v1beta1/stageupdate_types.go +++ b/apis/placement/v1beta1/stageupdate_types.go @@ -347,7 +347,8 @@ type StageTask struct { Type StageTaskType `json:"type"` // The time to wait after all the clusters in the current stage complete the update before moving to the next stage. - // +kubebuilder:validation:Pattern="^0|([0-9]+(\\.[0-9]+)?(s|m|h))+$" + // Only hours (h), minutes (m), and seconds (s) units are accepted. + // +kubebuilder:validation:Pattern="^(?:(?:0|[1-9][0-9]*)(\\.[0-9]+)?(?:s|m|h))+$" // +kubebuilder:validation:Type=string // +kubebuilder:validation:Optional WaitTime *metav1.Duration `json:"waitTime,omitempty"` diff --git a/charts/hub-agent/README.md b/charts/hub-agent/README.md index b49e1bd15..9582dda49 100644 --- a/charts/hub-agent/README.md +++ b/charts/hub-agent/README.md @@ -164,7 +164,8 @@ helm install hub-agent oci://ghcr.io/kubefleet-dev/kubefleet/charts/hub-agent \ --create-namespace \ --set useCertManager=true \ --set enableWorkload=true \ - --set enableWebhook=true + --set enableWebhook=true \ + --set webhookCertSecretName=fleet-webhook-server-cert # Or using traditional repository helm install hub-agent kubefleet/hub-agent \ @@ -172,12 +173,13 @@ helm install hub-agent kubefleet/hub-agent \ --create-namespace \ --set useCertManager=true \ --set enableWorkload=true \ - --set enableWebhook=true + --set enableWebhook=true \ + --set webhookCertSecretName=fleet-webhook-server-cert ``` The `webhookCertSecretName` parameter specifies the Secret name for the certificate: -- Default: `fleet-webhook-server-cert` -- When using cert-manager, this is where cert-manager stores the certificate +- Default: `unset`; there is no default in `values.yaml` +- When using cert-manager, set this to the Secret where cert-manager stores the certificate - Must match the secret name referenced in the deployment volume mount Example with custom secret name: @@ -198,4 +200,4 @@ helm install hub-agent kubefleet/hub-agent \ --set useCertManager=true \ --set enableWorkload=true \ --set webhookCertSecretName=my-webhook-secret -``` \ No newline at end of file +``` diff --git a/charts/hub-agent/templates/deployment.yaml b/charts/hub-agent/templates/deployment.yaml index 23fd4197e..e3d115344 100644 --- a/charts/hub-agent/templates/deployment.yaml +++ b/charts/hub-agent/templates/deployment.yaml @@ -19,6 +19,13 @@ spec: {{- include "hub-agent.selectorLabels" . | nindent 8 }} spec: serviceAccountName: {{ include "hub-agent.fullname" . }}-sa + securityContext: + runAsNonRoot: true + runAsUser: 65532 + runAsGroup: 65532 + fsGroup: 65532 + seccompProfile: + type: RuntimeDefault initContainers: {{- if .Values.crdInstaller.enabled }} - name: crd-installer @@ -32,6 +39,12 @@ spec: - name: {{ .Chart.Name }} image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.image.pullPolicy }} + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL args: - --leader-elect=true - --enable-webhook={{ .Values.enableWebhook }} @@ -92,16 +105,26 @@ spec: # This path must match FleetWebhookCertDir in pkg/webhook/webhook.go mountPath: /tmp/k8s-webhook-server/serving-certs readOnly: true + {{- else }} + volumeMounts: + - name: webhook-cert + # This path must match FleetWebhookCertDir in pkg/webhook/webhook.go. + # Note that this must be mounted one level up from the hardcoded path, otherwise + # the read only root filesystem setup would block the agent from attempting to + # clear the directory. + mountPath: /tmp/k8s-webhook-server/ {{- end }} - {{- if .Values.useCertManager }} volumes: - name: webhook-cert + {{- if .Values.useCertManager }} secret: secretName: {{ .Values.webhookCertSecretName }} # defaultMode 0444 (read for all) allows the container process to read the certs # regardless of the user/group it runs as defaultMode: 0444 - {{- end }} + {{- else }} + emptyDir: {} + {{- end }} {{- with .Values.affinity }} affinity: {{- toYaml . | nindent 8 }} diff --git a/charts/hub-agent/values.yaml b/charts/hub-agent/values.yaml index 90c1106a6..fd24c9182 100644 --- a/charts/hub-agent/values.yaml +++ b/charts/hub-agent/values.yaml @@ -32,6 +32,7 @@ enableWorkload: false useCertManager: false # webhookCertSecretName is ONLY used when useCertManager=true # It specifies the name of the Secret where cert-manager stores the certificate +# Example: # webhookCertSecretName: fleet-webhook-server-cert forceDeleteWaitTime: 15m0s diff --git a/charts/member-agent/templates/deployment.yaml b/charts/member-agent/templates/deployment.yaml index be57774c6..9aa9ab267 100644 --- a/charts/member-agent/templates/deployment.yaml +++ b/charts/member-agent/templates/deployment.yaml @@ -16,6 +16,13 @@ spec: spec: restartPolicy: Always serviceAccountName: {{ include "member-agent.fullname" . }}-sa + securityContext: + runAsNonRoot: true + runAsUser: 65532 + runAsGroup: 65532 + fsGroup: 65532 + seccompProfile: + type: RuntimeDefault initContainers: {{- if .Values.crdInstaller.enabled }} - name: crd-installer @@ -29,6 +36,12 @@ spec: - name: {{ include "member-agent.fullname" . }} image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.image.pullPolicy }} + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL ports: - name: http containerPort: 80 @@ -149,6 +162,12 @@ spec: - name: refresh-token image: "{{ .Values.refreshtoken.repository }}:{{ .Values.refreshtoken.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.refreshtoken.pullPolicy }} + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL args: {{- $provider := .Values.config.provider }} - {{ $provider }} diff --git a/cmd/memberagent/main.go b/cmd/memberagent/main.go index c5ca386d0..94745b021 100644 --- a/cmd/memberagent/main.go +++ b/cmd/memberagent/main.go @@ -91,6 +91,11 @@ func main() { klog.InfoS("flag:", "name", f.Name, "value", f.Value) }) + if errs := opts.Validate(); len(errs) != 0 { + klog.ErrorS(errs.ToAggregate(), "invalid parameter") + klog.FlushAndExit(klog.ExitFlushTimeout, 1) + } + // Set up controller-runtime logger ctrl.SetLogger(zap.New(zap.UseDevMode(true))) @@ -213,13 +218,16 @@ func buildHubConfig(hubURL string, useCertificateAuth bool, tlsClientInsecure bo return err }) if err != nil { - klog.ErrorS(err, "Failed to retrieve token file from the path %s", tokenFilePath) + klog.ErrorS(err, "Failed to retrieve token file", "path", tokenFilePath) return nil, err } hubConfig.BearerTokenFile = tokenFilePath } hubConfig.TLSClientConfig.Insecure = tlsClientInsecure + if tlsClientInsecure { + klog.Warning("TLS verification is disabled for hub cluster connection. This is insecure and should not be used in production.") + } if !tlsClientInsecure { caBundle, ok := os.LookupEnv("CA_BUNDLE") if ok && caBundle == "" { @@ -257,7 +265,7 @@ func buildHubConfig(hubURL string, useCertificateAuth bool, tlsClientInsecure bo r := textproto.NewReader(bufio.NewReader(strings.NewReader(header))) h, err := r.ReadMIMEHeader() if err != nil && !errors.Is(err, io.EOF) { - klog.ErrorS(err, "Failed to parse HUB_KUBE_HEADER %q", header) + klog.ErrorS(err, "Failed to parse HUB_KUBE_HEADER", "header", header) return nil, err } hubConfig.WrapTransport = func(rt http.RoundTripper) http.RoundTripper { diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml index 79927f6d1..f3f4cdfef 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml @@ -72,6 +72,24 @@ spec: For ClusterResourcePlacement, you can use SelectionScope to control what gets selected: - NamespaceOnly: Only the namespace object itself - NamespaceWithResources: The namespace AND all resources within it (default) + - NamespaceWithResourceSelectors: The namespace AND resources specified by additional selectors + + When SelectionScope is NamespaceWithResourceSelectors, you can define additional ResourceSelectorTerms + (after the namespace selector) to specify which resources to include. These additional selectors can + target both namespace-scoped resources (within the selected namespace) and cluster-scoped resources. + + Important requirements for NamespaceWithResourceSelectors mode: + - Exactly one namespace selector with this mode is allowed + - The namespace selector must select by name (not by label) + - Only one namespace selector is allowed when using this mode (cannot mix with other namespace selectors) + - All requirements are validated via CEL at API validation time + - If the selected namespace is deleted after CRP creation, the controller will report an error condition + + Example using NamespaceWithResourceSelectors: + - Namespace selector: {Group: "", Version: "v1", Kind: "Namespace", Name: "prod", SelectionScope: "NamespaceWithResourceSelectors"} + - Additional selector: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {app: "frontend"}} + - Third selector: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "admin"} + This selects: the "prod" namespace, all Deployments with label app=frontend in "prod", and the "admin" ClusterRole. type: string labelSelector: description: |- @@ -133,6 +151,7 @@ spec: enum: - NamespaceOnly - NamespaceWithResources + - NamespaceWithResourceSelectors type: string version: description: Version of the resource to be selected. diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml index 6b80c6832..a57b5b3d2 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml @@ -86,6 +86,24 @@ spec: For ClusterResourcePlacement, you can use SelectionScope to control what gets selected: - NamespaceOnly: Only the namespace object itself - NamespaceWithResources: The namespace AND all resources within it (default) + - NamespaceWithResourceSelectors: The namespace AND resources specified by additional selectors + + When SelectionScope is NamespaceWithResourceSelectors, you can define additional ResourceSelectorTerms + (after the namespace selector) to specify which resources to include. These additional selectors can + target both namespace-scoped resources (within the selected namespace) and cluster-scoped resources. + + Important requirements for NamespaceWithResourceSelectors mode: + - Exactly one namespace selector with this mode is allowed + - The namespace selector must select by name (not by label) + - Only one namespace selector is allowed when using this mode (cannot mix with other namespace selectors) + - All requirements are validated via CEL at API validation time + - If the selected namespace is deleted after CRP creation, the controller will report an error condition + + Example using NamespaceWithResourceSelectors: + - Namespace selector: {Group: "", Version: "v1", Kind: "Namespace", Name: "prod", SelectionScope: "NamespaceWithResourceSelectors"} + - Additional selector: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {app: "frontend"}} + - Third selector: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "admin"} + This selects: the "prod" namespace, all Deployments with label app=frontend in "prod", and the "admin" ClusterRole. type: string labelSelector: description: |- @@ -147,6 +165,7 @@ spec: enum: - NamespaceOnly - NamespaceWithResources + - NamespaceWithResourceSelectors type: string version: description: Version of the resource to be selected. diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml index 8ffe14382..5c586555d 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml @@ -554,6 +554,24 @@ spec: For ClusterResourcePlacement, you can use SelectionScope to control what gets selected: - NamespaceOnly: Only the namespace object itself - NamespaceWithResources: The namespace AND all resources within it (default) + - NamespaceWithResourceSelectors: The namespace AND resources specified by additional selectors + + When SelectionScope is NamespaceWithResourceSelectors, you can define additional ResourceSelectorTerms + (after the namespace selector) to specify which resources to include. These additional selectors can + target both namespace-scoped resources (within the selected namespace) and cluster-scoped resources. + + Important requirements for NamespaceWithResourceSelectors mode: + - Exactly one namespace selector with this mode is allowed + - The namespace selector must select by name (not by label) + - Only one namespace selector is allowed when using this mode (cannot mix with other namespace selectors) + - All requirements are validated via CEL at API validation time + - If the selected namespace is deleted after CRP creation, the controller will report an error condition + + Example using NamespaceWithResourceSelectors: + - Namespace selector: {Group: "", Version: "v1", Kind: "Namespace", Name: "prod", SelectionScope: "NamespaceWithResourceSelectors"} + - Additional selector: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {app: "frontend"}} + - Third selector: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "admin"} + This selects: the "prod" namespace, all Deployments with label app=frontend in "prod", and the "admin" ClusterRole. type: string labelSelector: description: |- @@ -615,6 +633,7 @@ spec: enum: - NamespaceOnly - NamespaceWithResources + - NamespaceWithResourceSelectors type: string version: description: Version of the resource to be selected. @@ -961,6 +980,25 @@ spec: - message: statusReportingScope is immutable rule: '!has(oldSelf.statusReportingScope) || self.statusReportingScope == oldSelf.statusReportingScope' + - message: only one namespace selector with NamespaceWithResourceSelectors + mode is allowed + rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && + x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope + == 'NamespaceWithResourceSelectors')) <= 1 + - message: namespace selector with NamespaceWithResourceSelectors mode + must select by name (not by label) + rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && + x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope + == 'NamespaceWithResourceSelectors')) == 0 || (size(self.resourceSelectors.filter(x, + x.kind == 'Namespace' && x.group == "" && x.version == 'v1' && has(x.selectionScope) + && x.selectionScope == 'NamespaceWithResourceSelectors' && has(x.name) + && size(x.name) > 0 && !has(x.labelSelector))) == 1) + - message: when using NamespaceWithResourceSelectors mode, only one namespace + selector is allowed (cannot mix with other namespace selectors) + rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && + x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope + == 'NamespaceWithResourceSelectors')) == 0 || size(self.resourceSelectors.filter(x, + x.kind == 'Namespace' && x.group == "" && x.version == 'v1')) == 1 status: description: The observed status of ClusterResourcePlacement. properties: diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml index 2b34db4d3..27dc987da 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml @@ -845,10 +845,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters - in the current stage complete the update before - moving to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -879,10 +879,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters - in the current stage complete the update before - moving to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -2070,7 +2070,7 @@ spec: description: The time to wait after all the clusters in the current stage complete the update before moving to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + pattern: ^(?:0|(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+)$ type: string required: - type @@ -3268,10 +3268,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters - in the current stage complete the update before - moving to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -3302,10 +3302,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters - in the current stage complete the update before - moving to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml index 9be8428dc..b134a75b5 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml @@ -70,10 +70,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters in - the current stage complete the update before moving - to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -101,10 +101,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters in - the current stage complete the update before moving - to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -264,7 +264,7 @@ spec: description: The time to wait after all the clusters in the current stage complete the update before moving to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + pattern: ^(?:0|(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+)$ type: string required: - type @@ -399,10 +399,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters in - the current stage complete the update before moving - to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -430,10 +430,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters in - the current stage complete the update before moving - to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type diff --git a/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml b/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml index 8ab1049c5..db0cdea5d 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml @@ -546,6 +546,24 @@ spec: For ClusterResourcePlacement, you can use SelectionScope to control what gets selected: - NamespaceOnly: Only the namespace object itself - NamespaceWithResources: The namespace AND all resources within it (default) + - NamespaceWithResourceSelectors: The namespace AND resources specified by additional selectors + + When SelectionScope is NamespaceWithResourceSelectors, you can define additional ResourceSelectorTerms + (after the namespace selector) to specify which resources to include. These additional selectors can + target both namespace-scoped resources (within the selected namespace) and cluster-scoped resources. + + Important requirements for NamespaceWithResourceSelectors mode: + - Exactly one namespace selector with this mode is allowed + - The namespace selector must select by name (not by label) + - Only one namespace selector is allowed when using this mode (cannot mix with other namespace selectors) + - All requirements are validated via CEL at API validation time + - If the selected namespace is deleted after CRP creation, the controller will report an error condition + + Example using NamespaceWithResourceSelectors: + - Namespace selector: {Group: "", Version: "v1", Kind: "Namespace", Name: "prod", SelectionScope: "NamespaceWithResourceSelectors"} + - Additional selector: {Group: "apps", Version: "v1", Kind: "Deployment", LabelSelector: {app: "frontend"}} + - Third selector: {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole", Name: "admin"} + This selects: the "prod" namespace, all Deployments with label app=frontend in "prod", and the "admin" ClusterRole. type: string labelSelector: description: |- @@ -607,6 +625,7 @@ spec: enum: - NamespaceOnly - NamespaceWithResources + - NamespaceWithResourceSelectors type: string version: description: Version of the resource to be selected. @@ -946,6 +965,25 @@ spec: x-kubernetes-validations: - message: policy cannot be removed once set rule: '!(has(oldSelf.policy) && !has(self.policy))' + - message: only one namespace selector with NamespaceWithResourceSelectors + mode is allowed + rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && + x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope + == 'NamespaceWithResourceSelectors')) <= 1 + - message: namespace selector with NamespaceWithResourceSelectors mode + must select by name (not by label) + rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && + x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope + == 'NamespaceWithResourceSelectors')) == 0 || (size(self.resourceSelectors.filter(x, + x.kind == 'Namespace' && x.group == "" && x.version == 'v1' && has(x.selectionScope) + && x.selectionScope == 'NamespaceWithResourceSelectors' && has(x.name) + && size(x.name) > 0 && !has(x.labelSelector))) == 1) + - message: when using NamespaceWithResourceSelectors mode, only one namespace + selector is allowed (cannot mix with other namespace selectors) + rule: size(self.resourceSelectors.filter(x, x.kind == 'Namespace' && + x.group == "" && x.version == 'v1' && has(x.selectionScope) && x.selectionScope + == 'NamespaceWithResourceSelectors')) == 0 || size(self.resourceSelectors.filter(x, + x.kind == 'Namespace' && x.group == "" && x.version == 'v1')) == 1 status: description: The observed status of ResourcePlacement. properties: diff --git a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml index ff9152c9d..81b506f4a 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml @@ -845,10 +845,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters - in the current stage complete the update before - moving to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -879,10 +879,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters - in the current stage complete the update before - moving to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -2188,10 +2188,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters - in the current stage complete the update before - moving to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -2222,10 +2222,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters - in the current stage complete the update before - moving to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type diff --git a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml index 5315f7322..ccfb44e85 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml @@ -70,10 +70,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters in - the current stage complete the update before moving - to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -101,10 +101,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters in - the current stage complete the update before moving - to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -259,10 +259,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters in - the current stage complete the update before moving - to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type @@ -290,10 +290,10 @@ spec: - Approval type: string waitTime: - description: The time to wait after all the clusters in - the current stage complete the update before moving - to the next stage. - pattern: ^0|([0-9]+(\.[0-9]+)?(s|m|h))+$ + description: |- + The time to wait after all the clusters in the current stage complete the update before moving to the next stage. + Only hours (h), minutes (m), and seconds (s) units are accepted. + pattern: ^(?:(?:0|[1-9][0-9]*)(\.[0-9]+)?(?:s|m|h))+$ type: string required: - type diff --git a/docker/refresh-token.Dockerfile b/docker/refresh-token.Dockerfile index d62a3e1df..10e5a05a5 100644 --- a/docker/refresh-token.Dockerfile +++ b/docker/refresh-token.Dockerfile @@ -15,6 +15,8 @@ RUN go mod download # Copy the go source COPY cmd/authtoken/main.go main.go COPY pkg/authtoken pkg/authtoken +# writefile is a dependency of pkg/authtoken for secure file creation (0600 permissions) +COPY pkg/utils/writefile pkg/utils/writefile ARG TARGETARCH diff --git a/go.mod b/go.mod index d89f80974..70c80ae6e 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/crossplane/crossplane-runtime/v2 v2.1.0 github.com/evanphx/json-patch/v5 v5.9.11 github.com/gofrs/uuid v4.4.0+incompatible + github.com/go-logr/logr v1.4.3 github.com/google/go-cmp v0.7.0 github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 github.com/onsi/ginkgo/v2 v2.23.4 @@ -70,7 +71,6 @@ require ( github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-errors/errors v1.4.2 // indirect - github.com/go-logr/logr v1.4.3 // indirect github.com/go-logr/zapr v1.3.0 // indirect github.com/go-openapi/jsonpointer v0.21.1 // indirect github.com/go-openapi/jsonreference v0.21.0 // indirect diff --git a/pkg/authtoken/token_writer.go b/pkg/authtoken/token_writer.go index 48b965f3f..a7b2f32c1 100644 --- a/pkg/authtoken/token_writer.go +++ b/pkg/authtoken/token_writer.go @@ -18,9 +18,10 @@ package authtoken import ( "fmt" "io" - "os" "k8s.io/klog/v2" + + "go.goms.io/fleet/pkg/utils/writefile" ) type Factory struct { @@ -31,8 +32,11 @@ func NewFactory(filePath string) Factory { return Factory{filePath: filePath} } +// Create opens the token file for writing using CreateSecureFile. +// The member-agent container runs as the same UID and reads this file via client-go's +// BearerTokenFile (O_RDONLY), so owner permission is sufficient for both writer and reader. func (w Factory) Create() (io.WriteCloser, error) { - wc, err := os.Create(w.filePath) + wc, err := writefile.CreateSecureFile(w.filePath) if err != nil { return nil, err } diff --git a/pkg/controllers/placement/controller.go b/pkg/controllers/placement/controller.go index 8def38445..42a5a3276 100644 --- a/pkg/controllers/placement/controller.go +++ b/pkg/controllers/placement/controller.go @@ -507,25 +507,23 @@ func (r *Reconciler) ensureLatestPolicySnapshot(ctx context.Context, placementOb // 2 & 3 should never happen. func (r *Reconciler) lookupLatestSchedulingPolicySnapshot(ctx context.Context, placement fleetv1beta1.PlacementObj) (fleetv1beta1.PolicySnapshotObj, int, error) { placementKey := types.NamespacedName{Name: placement.GetName(), Namespace: placement.GetNamespace()} - snapshotList, err := controller.FetchLatestPolicySnapshot(ctx, r.Client, placementKey) - if err != nil { - return nil, -1, err - } placementKObj := klog.KObj(placement) - policySnapshotItems := snapshotList.GetPolicySnapshotObjs() - if len(policySnapshotItems) == 1 { - policyIndex, err := labels.ParsePolicyIndexFromLabel(policySnapshotItems[0]) - if err != nil { - klog.ErrorS(err, "Failed to parse the policy index label", "placement", placementKObj, "policySnapshot", klog.KObj(policySnapshotItems[0])) - return nil, -1, controller.NewUnexpectedBehaviorError(err) - } - return policySnapshotItems[0], policyIndex, nil - } else if len(policySnapshotItems) > 1 { - // It means there are multiple active snapshots and should never happen. - err := fmt.Errorf("there are %d active schedulingPolicySnapshots owned by placement %v", len(policySnapshotItems), placementKey) - klog.ErrorS(err, "Invalid schedulingPolicySnapshots", "placement", placementKObj) - return nil, -1, controller.NewUnexpectedBehaviorError(err) + activeSnapshot, err := controller.LookupLatestPolicySnapshot(ctx, r.Client, placementKey) + switch { + case err == nil: + policyIndex, parseErr := labels.ParsePolicyIndexFromLabel(activeSnapshot) + if parseErr != nil { + klog.ErrorS(parseErr, "Failed to parse the policy index label", "placement", placementKObj, "policySnapshot", klog.KObj(activeSnapshot)) + return nil, -1, controller.NewUnexpectedBehaviorError(parseErr) + } + return activeSnapshot, policyIndex, nil + case errors.Is(err, controller.ErrNoLatestPolicySnapshot): + // No active snapshot found; recover below by picking the largest-index snapshot. + default: + // API or invariant-violation error — propagate. + return nil, -1, err } + // When there are no active snapshots, find the one who has the largest policy index. // It should be rare only when placement controller is crashed before creating the new active snapshot. sortedList, err := r.listSortedSchedulingPolicySnapshots(ctx, placement) diff --git a/pkg/controllers/updaterun/execution_test.go b/pkg/controllers/updaterun/execution_test.go index 4b9dc09aa..f84c540cc 100644 --- a/pkg/controllers/updaterun/execution_test.go +++ b/pkg/controllers/updaterun/execution_test.go @@ -242,7 +242,9 @@ func TestCheckClusterUpdateResult(t *testing.T) { wantErr: true, }, { - name: "checkClusterUpdateResult should return false and error if the binding has false workSynchronized condition", + // WorkNotSynchronizedYet is an in-progress reason, not a terminal failure (issue #648). + // checkClusterUpdateResult now treats it as "still updating" rather than an error. + name: "checkClusterUpdateResult should return false but no error if workSynchronized is in-progress (WorkNotSynchronizedYet)", binding: &placementv1beta1.ClusterResourceBinding{ ObjectMeta: metav1.ObjectMeta{Generation: 1}, Status: placementv1beta1.ResourceBindingStatus{ @@ -258,6 +260,25 @@ func TestCheckClusterUpdateResult(t *testing.T) { }, clusterStatus: &placementv1beta1.ClusterUpdatingStatus{ClusterName: "test-cluster"}, wantSucceeded: false, + wantErr: false, + }, + { + name: "checkClusterUpdateResult should return false and error if workSynchronized failed terminally (SyncWorkFailed)", + binding: &placementv1beta1.ClusterResourceBinding{ + ObjectMeta: metav1.ObjectMeta{Generation: 1}, + Status: placementv1beta1.ResourceBindingStatus{ + Conditions: []metav1.Condition{ + { + Type: string(placementv1beta1.ResourceBindingWorkSynchronized), + Status: metav1.ConditionFalse, + ObservedGeneration: 1, + Reason: condition.SyncWorkFailedReason, + }, + }, + }, + }, + clusterStatus: &placementv1beta1.ClusterUpdatingStatus{ClusterName: "test-cluster"}, + wantSucceeded: false, wantErr: true, }, { diff --git a/pkg/controllers/updaterun/initialization.go b/pkg/controllers/updaterun/initialization.go index 1cdf8bcff..2cf0a95d4 100644 --- a/pkg/controllers/updaterun/initialization.go +++ b/pkg/controllers/updaterun/initialization.go @@ -125,29 +125,19 @@ func (r *Reconciler) determinePolicySnapshot( updateRunRef := klog.KObj(updateRun) // Get the latest policy snapshot. - policySnapshotList, err := controller.FetchLatestPolicySnapshot(ctx, r.Client, placementKey) + latestPolicySnapshot, err := controller.LookupLatestPolicySnapshot(ctx, r.Client, placementKey) if err != nil { - klog.ErrorS(err, "Failed to list the latest policy snapshots", "placement", placementKey, "updateRun", updateRunRef) - // err can be retried. - return nil, -1, controller.NewAPIServerError(true, err) - } - - policySnapshotObjs := policySnapshotList.GetPolicySnapshotObjs() - if len(policySnapshotObjs) != 1 { - if len(policySnapshotObjs) > 1 { - err := controller.NewUnexpectedBehaviorError(fmt.Errorf("more than one (%d in actual) latest policy snapshots associated with the placement: `%s`", len(policySnapshotObjs), placementKey)) - klog.ErrorS(err, "Failed to find the latest policy snapshot", "placement", placementKey, "updateRun", updateRunRef) - // no more retries for this error. + klog.ErrorS(err, "Failed to find the latest policy snapshot", "placement", placementKey, "updateRun", updateRunRef) + // Missing or duplicate-active snapshots are placement-state invariants the run cannot + // recover from on its own — fail validation. Anything else (e.g. List errors classified as + // ErrAPIServerError, or unexpected cache failures promoted to ErrUnexpectedBehavior) is + // propagated raw so controller-runtime retries. + if errors.Is(err, controller.ErrNoLatestPolicySnapshot) || errors.Is(err, controller.ErrMultipleActivePolicySnapshots) { return nil, -1, fmt.Errorf("%w: %s", errValidationFailed, err.Error()) } - // no latest policy snapshot found. - err := fmt.Errorf("no latest policy snapshot associated with the placement: `%s`", placementKey) - klog.ErrorS(err, "Failed to find the latest policy snapshot", "placement", placementKey, "updateRun", updateRunRef) - // again, no more retries here. - return nil, -1, fmt.Errorf("%w: %s", errValidationFailed, err.Error()) + return nil, -1, err } - latestPolicySnapshot := policySnapshotObjs[0] policySnapshotRef := klog.KObj(latestPolicySnapshot) policyIndex, foundIndex := latestPolicySnapshot.GetLabels()[placementv1beta1.PolicyIndexLabel] if !foundIndex || len(policyIndex) == 0 { diff --git a/pkg/controllers/updaterun/initialization_integration_test.go b/pkg/controllers/updaterun/initialization_integration_test.go index 9fa92d4d5..d7aea3463 100644 --- a/pkg/controllers/updaterun/initialization_integration_test.go +++ b/pkg/controllers/updaterun/initialization_integration_test.go @@ -210,7 +210,7 @@ var _ = Describe("Updaterun initialization tests", func() { Expect(k8sClient.Create(ctx, updateRun)).To(Succeed()) By("Validating the initialization failed") - validateFailedInitCondition(ctx, updateRun, "no latest policy snapshot associated") + validateFailedInitCondition(ctx, updateRun, "no latest policy snapshot found") }) It("Should fail to initialize if there are multiple latest policy snapshots", func() { @@ -223,7 +223,7 @@ var _ = Describe("Updaterun initialization tests", func() { Expect(k8sClient.Create(ctx, updateRun)).To(Succeed()) By("Validating the initialization failed") - validateFailedInitCondition(ctx, updateRun, "more than one (2 in actual) latest policy snapshots associated") + validateFailedInitCondition(ctx, updateRun, "multiple active policy snapshots found") By("Deleting the second scheduling policy snapshot") Expect(k8sClient.Delete(ctx, snapshot2)).Should(Succeed()) diff --git a/pkg/controllers/updaterun/validation_integration_test.go b/pkg/controllers/updaterun/validation_integration_test.go index 54a34369d..53e88f57a 100644 --- a/pkg/controllers/updaterun/validation_integration_test.go +++ b/pkg/controllers/updaterun/validation_integration_test.go @@ -219,7 +219,7 @@ var _ = Describe("UpdateRun validation tests", func() { By("Validating the validation failed") wantStatus = generateFailedValidationStatus(updateRun, wantStatus) - validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "no latest policy snapshot associated") + validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "no latest policy snapshot found") By("Checking update run status metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateFailedMetric(placementv1beta1.StateRun, updateRun, string(failureType))) diff --git a/pkg/controllers/workgenerator/controller_integration_test.go b/pkg/controllers/workgenerator/controller_integration_test.go index 8ed516447..62f1abaa1 100644 --- a/pkg/controllers/workgenerator/controller_integration_test.go +++ b/pkg/controllers/workgenerator/controller_integration_test.go @@ -57,10 +57,12 @@ var ( validClusterResourceOverrideSnapshotName = "cro-1" validResourceOverrideSnapshotName = "ro-1" invalidClusterResourceOverrideSnapshotName = "cro-2" // the overridden manifest is invalid + invalidResourceOverrideSnapshotName = "ro-2" // the overridden manifest is invalid validClusterResourceOverrideSnapshot placementv1beta1.ClusterResourceOverrideSnapshot validResourceOverrideSnapshot placementv1beta1.ResourceOverrideSnapshot invalidClusterResourceOverrideSnapshot placementv1beta1.ClusterResourceOverrideSnapshot + invalidResourceOverrideSnapshot placementv1beta1.ResourceOverrideSnapshot bindingStatusCmpOpts = cmp.Options{ cmpopts.SortSlices(utils.LessFuncConditionByType), @@ -280,7 +282,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", // delete the binding Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) Expect(k8sClient.Delete(ctx, binding)).Should(Succeed()) - By(fmt.Sprintf("resource binding %s is deleted", binding.Name)) + By(fmt.Sprintf("resource binding %s is deleted", binding.Name)) // check the binding is deleted Eventually(func() bool { err := k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding) @@ -302,7 +304,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceCRD, testNameSpace, testResource, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) spec := placementv1beta1.ResourceBindingSpec{ State: placementv1beta1.BindingStateBound, ResourceSnapshotName: masterSnapshot.Name, @@ -383,7 +385,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) binding.Spec.State = placementv1beta1.BindingStateUnscheduled Expect(k8sClient.Update(ctx, binding)).Should(Succeed()) - By(fmt.Sprintf("resource binding %s updated to be unscheduled", binding.Name)) + By(fmt.Sprintf("resource binding %s updated to be unscheduled", binding.Name)) updateRolloutStartedGeneration(&binding) Consistently(func() error { return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work) @@ -589,7 +591,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testConfigMap, testResourceEnvelope, testResourceCRD, testNameSpace, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) spec := placementv1beta1.ResourceBindingSpec{ State: placementv1beta1.BindingStateBound, ResourceSnapshotName: masterSnapshot.Name, @@ -692,12 +694,12 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceEnvelope2, testResourceCRD, testNameSpace, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("another master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("another master resource snapshot %s created", masterSnapshot.Name)) // update binding Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) binding.Spec.ResourceSnapshotName = masterSnapshot.Name Expect(k8sClient.Update(ctx, binding)).Should(Succeed()) - By(fmt.Sprintf("resource binding %s updated", binding.Name)) + By(fmt.Sprintf("resource binding %s updated", binding.Name)) updateRolloutStartedGeneration(&binding) // check the binding status till the bound condition is true for the second generation Eventually(func() bool { @@ -711,7 +713,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", return condition.IsConditionStatusTrue( meta.FindStatusCondition(binding.Status.Conditions, string(placementv1beta1.ResourceBindingWorkSynchronized)), binding.GetGeneration()) }, timeout, interval).Should(BeTrue(), fmt.Sprintf("binding(%s) condition should be true", binding.Name)) - By(fmt.Sprintf("resource binding %s is reconciled", binding.Name)) + By(fmt.Sprintf("resource binding %s is reconciled", binding.Name)) // check the work that contains none enveloped object is updated work := placementv1beta1.Work{} Eventually(func() error { @@ -790,12 +792,12 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceCRD, testNameSpace, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("another master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("another master resource snapshot %s created", masterSnapshot.Name)) // update binding Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) binding.Spec.ResourceSnapshotName = masterSnapshot.Name Expect(k8sClient.Update(ctx, binding)).Should(Succeed()) - By(fmt.Sprintf("resource binding %s updated", binding.Name)) + By(fmt.Sprintf("resource binding %s updated", binding.Name)) updateRolloutStartedGeneration(&binding) // check the binding status till the bound condition is true for the second binding generation Eventually(func() bool { @@ -809,7 +811,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", return condition.IsConditionStatusTrue( meta.FindStatusCondition(binding.Status.Conditions, string(placementv1beta1.ResourceBindingWorkSynchronized)), binding.GetGeneration()) }, timeout, interval).Should(BeTrue(), fmt.Sprintf("binding(%s) condition should be true", binding.Name)) - By(fmt.Sprintf("resource binding %s is reconciled", binding.Name)) + By(fmt.Sprintf("resource binding %s is reconciled", binding.Name)) // check the enveloped work is deleted Eventually(func() error { envelopWorkLabelMatcher := client.MatchingLabels{ @@ -839,7 +841,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testClusterScopedEnvelope, testResourceCRD, testNameSpace, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) spec := placementv1beta1.ResourceBindingSpec{ State: placementv1beta1.BindingStateBound, ResourceSnapshotName: masterSnapshot.Name, @@ -942,12 +944,12 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceCRD, testNameSpace, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("another master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("another master resource snapshot %s created", masterSnapshot.Name)) // update binding Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) binding.Spec.ResourceSnapshotName = masterSnapshot.Name Expect(k8sClient.Update(ctx, binding)).Should(Succeed()) - By(fmt.Sprintf("resource binding %s updated", binding.Name)) + By(fmt.Sprintf("resource binding %s updated", binding.Name)) updateRolloutStartedGeneration(&binding) // check the binding status till the bound condition is true for the second binding generation Eventually(func() bool { @@ -961,7 +963,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", return condition.IsConditionStatusTrue( meta.FindStatusCondition(binding.Status.Conditions, string(placementv1beta1.ResourceBindingWorkSynchronized)), binding.GetGeneration()) }, timeout, interval).Should(BeTrue(), fmt.Sprintf("binding(%s) condition should be true", binding.Name)) - By(fmt.Sprintf("resource binding %s is reconciled", binding.Name)) + By(fmt.Sprintf("resource binding %s is reconciled", binding.Name)) // check the enveloped work is deleted Eventually(func() error { envelopWorkLabelMatcher := client.MatchingLabels{ @@ -990,7 +992,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceCRD, testNameSpace, testResource, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) // create binding spec := placementv1beta1.ResourceBindingSpec{ State: placementv1beta1.BindingStateBound, @@ -1003,19 +1005,19 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testConfigMap, testPdb, }) Expect(k8sClient.Create(ctx, secondSnapshot)).Should(Succeed()) - By(fmt.Sprintf("secondary resource snapshot %s created", secondSnapshot.Name)) + By(fmt.Sprintf("secondary resource snapshot %s created", secondSnapshot.Name)) }) AfterEach(func() { // delete the master resource snapshot Expect(k8sClient.Delete(ctx, masterSnapshot)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{})) - By(fmt.Sprintf("master resource snapshot %s deleted", masterSnapshot.Name)) + By(fmt.Sprintf("master resource snapshot %s deleted", masterSnapshot.Name)) // delete the second resource snapshot Expect(k8sClient.Delete(ctx, secondSnapshot)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{})) - By(fmt.Sprintf("secondary resource snapshot %s deleted", secondSnapshot.Name)) + By(fmt.Sprintf("secondary resource snapshot %s deleted", secondSnapshot.Name)) // delete the binding Expect(k8sClient.Delete(ctx, binding)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{})) - By(fmt.Sprintf("resource binding %s deleted", binding.Name)) + By(fmt.Sprintf("resource binding %s deleted", binding.Name)) }) It("Should create all the work in the target namespace after all the resource snapshot are created", func() { @@ -1033,7 +1035,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", } diff := cmp.Diff(expectedManifest, work.Spec.Workload.Manifests) Expect(diff).Should(BeEmpty(), fmt.Sprintf("work manifest(%s) mismatch (-want +got):\n%s", work.Name, diff)) - // check the work for the secondary resource snapshot is created, it's name is crp-subindex + // check the work for the secondary resource snapshot is created, its name is crp-subindex secondWork := placementv1beta1.Work{} Eventually(func() error { return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.WorkNameWithSubindexFmt, testCRPName, 1), Namespace: memberClusterNamespaceName}, &secondWork) @@ -1099,7 +1101,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", } diff := cmp.Diff(expectedManifest, work.Spec.Workload.Manifests) Expect(diff).Should(BeEmpty(), fmt.Sprintf("work manifest(%s) mismatch (-want +got):\n%s", work.Name, diff)) - // check the work for the secondary resource snapshot is created, it's name is crp-subindex + // check the work for the secondary resource snapshot is created, its name is crp-subindex secondWork := placementv1beta1.Work{} Eventually(func() error { return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.WorkNameWithSubindexFmt, testCRPName, 1), Namespace: memberClusterNamespaceName}, &secondWork) @@ -1155,7 +1157,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work) }, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster") By(fmt.Sprintf("first work %s is created in %s", work.Name, work.Namespace)) - // check the work for the secondary resource snapshot is created, it's name is crp-subindex + // check the work for the secondary resource snapshot is created, its name is crp-subindex Eventually(func() error { return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.WorkNameWithSubindexFmt, testCRPName, 1), Namespace: memberClusterNamespaceName}, &work) }, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster") @@ -1165,25 +1167,25 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceCRD, testNameSpace, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("new master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("new master resource snapshot %s created", masterSnapshot.Name)) // update binding Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) binding.Spec.ResourceSnapshotName = masterSnapshot.Name Expect(k8sClient.Update(ctx, binding)).Should(Succeed()) updateRolloutStartedGeneration(&binding) - By(fmt.Sprintf("resource binding %s updated", binding.Name)) + By(fmt.Sprintf("resource binding %s updated", binding.Name)) // Now create the second resource snapshot secondSnapshot = generateClusterResourceSnapshot(3, 3, 1, [][]byte{ testResource, testConfigMap, }) Expect(k8sClient.Create(ctx, secondSnapshot)).Should(Succeed()) - By(fmt.Sprintf("new secondary resource snapshot %s created", secondSnapshot.Name)) + By(fmt.Sprintf("new secondary resource snapshot %s created", secondSnapshot.Name)) // Now create the third resource snapshot thirdSnapshot := generateClusterResourceSnapshot(3, 3, 2, [][]byte{ testPdb, }) Expect(k8sClient.Create(ctx, thirdSnapshot)).Should(Succeed()) - By(fmt.Sprintf("third resource snapshot %s created", secondSnapshot.Name)) + By(fmt.Sprintf("third resource snapshot %s created", secondSnapshot.Name)) // check the work for the master resource snapshot is created expectedManifest := []placementv1beta1.Manifest{ {RawExtension: runtime.RawExtension{Raw: testResourceCRD}}, @@ -1201,7 +1203,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", return nil }, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster") By(fmt.Sprintf("first work %s is updated in %s", work.Name, work.Namespace)) - // check the work for the secondary resource snapshot is created, it's name is crp-subindex + // check the work for the secondary resource snapshot is created, its name is crp-subindex expectedManifest = []placementv1beta1.Manifest{ {RawExtension: runtime.RawExtension{Raw: testResource}}, {RawExtension: runtime.RawExtension{Raw: testConfigMap}}, @@ -1220,7 +1222,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", return nil }, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster") By(fmt.Sprintf("second work %s is updated in %s", work.Name, work.Namespace)) - // check the work for the third resource snapshot is created, it's name is crp-subindex + // check the work for the third resource snapshot is created, its name is crp-subindex expectedManifest = []placementv1beta1.Manifest{ {RawExtension: runtime.RawExtension{Raw: testPdb}}, } @@ -1247,7 +1249,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work) }, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster") By(fmt.Sprintf("first work %s is created in %s", work.Name, work.Namespace)) - // check the work for the secondary resource snapshot is created, it's name is crp-subindex + // check the work for the secondary resource snapshot is created, its name is crp-subindex Eventually(func() error { return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.WorkNameWithSubindexFmt, testCRPName, 1), Namespace: memberClusterNamespaceName}, &work) }, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster") @@ -1257,13 +1259,13 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceCRD, testNameSpace, testResource, testConfigMap, testPdb, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("new master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("new master resource snapshot %s created", masterSnapshot.Name)) // update binding Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) binding.Spec.ResourceSnapshotName = masterSnapshot.Name Expect(k8sClient.Update(ctx, binding)).Should(Succeed()) updateRolloutStartedGeneration(&binding) - By(fmt.Sprintf("resource binding %s updated", binding.Name)) + By(fmt.Sprintf("resource binding %s updated", binding.Name)) //inspect the work manifest that should have been updated to contain all expectedManifest := []placementv1beta1.Manifest{ {RawExtension: runtime.RawExtension{Raw: testResourceCRD}}, @@ -1301,7 +1303,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work) }, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster") By(fmt.Sprintf("first work %s is created in %s", work.Name, work.Namespace)) - // check the work for the secondary resource snapshot is created, it's name is crp-subindex + // check the work for the secondary resource snapshot is created, its name is crp-subindex work2 := placementv1beta1.Work{} Eventually(func() error { return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.WorkNameWithSubindexFmt, testCRPName, 1), Namespace: memberClusterNamespaceName}, &work2) @@ -1310,7 +1312,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", // delete the binding Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) Expect(k8sClient.Delete(ctx, binding)).Should(Succeed()) - By(fmt.Sprintf("resource binding %s is deleted", binding.Name)) + By(fmt.Sprintf("resource binding %s is deleted", binding.Name)) // verify that all associated works have been deleted Eventually(func() error { @@ -1352,7 +1354,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceCRD, testNameSpace, testResource, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) crolist := []string{validClusterResourceOverrideSnapshotName} roList := []placementv1beta1.NamespacedName{ { @@ -1446,7 +1448,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) binding.Spec.State = placementv1beta1.BindingStateUnscheduled Expect(k8sClient.Update(ctx, binding)).Should(Succeed()) - By(fmt.Sprintf("resource binding %s updated to be unscheduled", binding.Name)) + By(fmt.Sprintf("resource binding %s updated to be unscheduled", binding.Name)) updateRolloutStartedGeneration(&binding) Consistently(func() error { return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work) @@ -1472,7 +1474,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceCRD, testNameSpace, testResource, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) spec := placementv1beta1.ResourceBindingSpec{ State: placementv1beta1.BindingStateBound, ResourceSnapshotName: masterSnapshot.Name, @@ -1519,7 +1521,70 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", } return cmp.Diff(wantStatus, binding.Status, cmpConditionOption) }, timeout, interval).Should(BeEmpty(), fmt.Sprintf("binding(%s) mismatch (-want +got)", binding.Name)) - Expect(binding.GetCondition(string(placementv1beta1.ResourceBindingOverridden)).Message).Should(ContainSubstring("Failed to apply the override rules on the resources: add operation does not apply")) + message := binding.GetCondition(string(placementv1beta1.ResourceBindingOverridden)).Message + Expect(message).Should(MatchRegexp(`ClusterResourceOverrideSnapshot "[^"]+" failed to apply on \w+ "[^"]+".*: .*add operation does not apply`)) + }) + }) + + Context("Test Bound ClusterResourceBinding with a single resource snapshot and invalid resource override", func() { + var masterSnapshot *placementv1beta1.ClusterResourceSnapshot + + BeforeEach(func() { + masterSnapshot = generateClusterResourceSnapshot(1, 1, 0, [][]byte{ + testResourceCRD, testNameSpace, testResource, + }) + Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) + By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) + spec := placementv1beta1.ResourceBindingSpec{ + State: placementv1beta1.BindingStateBound, + ResourceSnapshotName: masterSnapshot.Name, + TargetCluster: memberClusterName, + ResourceOverrideSnapshots: []placementv1beta1.NamespacedName{ + { + Name: invalidResourceOverrideSnapshotName, + Namespace: appNamespaceName, + }, + }, + } + createClusterResourceBinding(&binding, spec) + }) + + AfterEach(func() { + By("Deleting master clusterResourceSnapshot") + Expect(k8sClient.Delete(ctx, masterSnapshot)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{})) + }) + + It("Should not create the work in the target namespace and surface ResourceOverrideSnapshot identity in the condition message", func() { + work := placementv1beta1.Work{} + Consistently(func() bool { + err := k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work) + return errors.IsNotFound(err) + }, duration, interval).Should(BeTrue(), "controller should not create work in hub cluster when an override fails") + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) + Expect(len(binding.Finalizers)).Should(Equal(1)) + + Eventually(func() string { + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) + wantStatus := placementv1beta1.ResourceBindingStatus{ + Conditions: []metav1.Condition{ + { + Type: string(placementv1beta1.ResourceBindingRolloutStarted), + Status: metav1.ConditionTrue, + Reason: condition.RolloutStartedReason, + ObservedGeneration: binding.GetGeneration(), + }, + { + Type: string(placementv1beta1.ResourceBindingOverridden), + Status: metav1.ConditionFalse, + Reason: condition.OverriddenFailedReason, + ObservedGeneration: binding.GetGeneration(), + }, + }, + } + return cmp.Diff(wantStatus, binding.Status, cmpConditionOption) + }, timeout, interval).Should(BeEmpty(), fmt.Sprintf("binding(%s) mismatch (-want +got)", binding.Name)) + message := binding.GetCondition(string(placementv1beta1.ResourceBindingOverridden)).Message + Expect(message).Should(MatchRegexp(`ResourceOverrideSnapshot "[^"]+" failed to apply on \w+ "[^"]+".*: .*add operation does not apply`)) }) }) @@ -1531,7 +1596,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceCRD, testNameSpace, testResource, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) spec := placementv1beta1.ResourceBindingSpec{ State: placementv1beta1.BindingStateBound, ResourceSnapshotName: masterSnapshot.Name, @@ -1588,7 +1653,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", testResourceCRD, testNameSpace, testResource, }) Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) - By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) + By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name)) spec := placementv1beta1.ResourceBindingSpec{ State: placementv1beta1.BindingStateBound, ResourceSnapshotName: masterSnapshot.Name, @@ -1673,7 +1738,7 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed()) binding.Spec.State = placementv1beta1.BindingStateUnscheduled Expect(k8sClient.Update(ctx, binding)).Should(Succeed()) - By(fmt.Sprintf("resource binding %s updated to be unscheduled", binding.Name)) + By(fmt.Sprintf("resource binding %s updated to be unscheduled", binding.Name)) updateRolloutStartedGeneration(&binding) rolloutCond := binding.GetCondition(string(placementv1beta1.ResourceBindingRolloutStarted)) Consistently(func() error { diff --git a/pkg/controllers/workgenerator/envelope.go b/pkg/controllers/workgenerator/envelope.go index b7134b77c..0ecf12cdc 100644 --- a/pkg/controllers/workgenerator/envelope.go +++ b/pkg/controllers/workgenerator/envelope.go @@ -18,13 +18,15 @@ package workgenerator import ( "context" + "crypto/sha256" + "encoding/hex" "fmt" "sort" "strings" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/util/uuid" "k8s.io/klog/v2" "sigs.k8s.io/controller-runtime/pkg/client" @@ -87,18 +89,27 @@ func (r *Reconciler) createOrUpdateEnvelopeCRWorkObj( var work *fleetv1beta1.Work switch { case len(workList.Items) > 1: - // Multiple matching work objects found; this should never occur under normal conditions. - wrappedErr := fmt.Errorf("%d work objects found for the same envelope %v, only one expected", len(workList.Items), envelopeReader.GetEnvelopeObjRef()) - klog.ErrorS(wrappedErr, "Failed to create or update work object for envelope", - "resourceBinding", klog.KObj(binding), - "resourceSnapshot", klog.KObj(resourceSnapshot), - "envelope", envelopeReader.GetEnvelopeObjRef()) - // Log the work object names to help debug. + // Multiple matching work objects found. This can only happen in environments that + // were stuck on this state *before* deterministic Work naming was introduced in + // buildNewWorkForEnvelopeCR; newly-raced reconciles are now prevented by the + // API server's name-uniqueness check. We deliberately do NOT auto-delete the + // extras: both Works may have been applied by the member agent, and deleting one + // would trigger member-side resource cleanup and user-visible fluctuation on + // resources the survivor still owns. Surface the state for operator cleanup + // (manually delete all but the oldest Work in the affected namespace). workNames := make([]string, len(workList.Items)) for i := range workList.Items { workNames[i] = workList.Items[i].Name } - klog.ErrorS(wrappedErr, "Duplicate work objects found", "works", workNames) + wrappedErr := fmt.Errorf("%d work objects found for the same envelope %v, only one expected", len(workList.Items), envelopeReader.GetEnvelopeObjRef()) + klog.ErrorS(wrappedErr, "Duplicate envelope Work objects found; manual cleanup required — delete all but the oldest Work in the namespace", + "works", workNames, + "resourceBinding", klog.KObj(binding), + "resourceSnapshot", klog.KObj(resourceSnapshot), + "envelope", envelopeReader.GetEnvelopeObjRef()) + r.recorder.Eventf(binding, corev1.EventTypeWarning, "DuplicateEnvelopeWorks", + "Multiple Work objects (%v) found for envelope %v in namespace %s; delete all but the oldest to recover", + workNames, envelopeReader.GetEnvelopeObjRef(), fmt.Sprintf(utils.NamespaceNameFormat, binding.GetBindingSpec().TargetCluster)) return nil, controller.NewUnexpectedBehaviorError(wrappedErr) case len(workList.Items) == 1: klog.V(2).InfoS("Found existing work object for the envelope; updating it", @@ -200,6 +211,24 @@ func refreshWorkForEnvelopeCR( work.Spec.ApplyStrategy = resourceBinding.GetBindingSpec().ApplyStrategy } +// envelopeWorkNameSuffix returns a deterministic suffix for the envelope Work's name, +// derived from the envelope's identity. Using a deterministic suffix (instead of a random +// UUID) means two concurrent Create attempts for the same envelope collide on name at the +// API server, which enforces the "one Work per (binding, envelope)" invariant server-side. +// The hash is truncated to 8 bytes of SHA-256 (rendered as 16 hex characters), keeping +// the overall Work name well under the DNS-1123 subdomain limit; collision probability +// at the scale of a single binding's envelope set is ~2^-64. +func envelopeWorkNameSuffix(envelopeReader fleetv1beta1.EnvelopeReader) string { + // NOTE: ClusterResourceEnvelope has an empty namespace; ResourceEnvelope has a namespace. + // Including the type disambiguates an unlikely name collision across kinds. + identity := fmt.Sprintf("%s/%s/%s", + envelopeReader.GetEnvelopeType(), + envelopeReader.GetNamespace(), + envelopeReader.GetName()) + sum := sha256.Sum256([]byte(identity)) + return hex.EncodeToString(sum[:8]) +} + func buildNewWorkForEnvelopeCR( workNamePrefix string, resourceBinding fleetv1beta1.BindingObj, @@ -208,7 +237,7 @@ func buildNewWorkForEnvelopeCR( manifests []fleetv1beta1.Manifest, resourceOverrideSnapshotHash, clusterResourceOverrideSnapshotHash string, ) *fleetv1beta1.Work { - workName := fmt.Sprintf(fleetv1beta1.WorkNameWithEnvelopeCRFmt, workNamePrefix, uuid.NewUUID()) + workName := fmt.Sprintf(fleetv1beta1.WorkNameWithEnvelopeCRFmt, workNamePrefix, envelopeWorkNameSuffix(envelopeReader)) workNamespace := fmt.Sprintf(utils.NamespaceNameFormat, resourceBinding.GetBindingSpec().TargetCluster) // Create the labels map diff --git a/pkg/controllers/workgenerator/envelope_test.go b/pkg/controllers/workgenerator/envelope_test.go index 29514ff74..bb7c26e26 100644 --- a/pkg/controllers/workgenerator/envelope_test.go +++ b/pkg/controllers/workgenerator/envelope_test.go @@ -17,8 +17,11 @@ limitations under the License. package workgenerator import ( + "context" "encoding/json" + "errors" "fmt" + "strings" "testing" "github.com/google/go-cmp/cmp" @@ -32,9 +35,12 @@ import ( fleetv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" "go.goms.io/fleet/pkg/utils" + "go.goms.io/fleet/pkg/utils/controller" "go.goms.io/fleet/test/utils/informer" ) +const testWorkNamePrefix = "test-work" + func TestExtractManifestsFromEnvelopeCR(t *testing.T) { tests := []struct { name string @@ -255,8 +261,6 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) { ignoreWorkMeta := cmpopts.IgnoreFields(metav1.ObjectMeta{}, "Name", "OwnerReferences") scheme := serviceScheme(t) - workNamePrefix := "test-work" - resourceSnapshot := &fleetv1beta1.ClusterResourceSnapshot{ ObjectMeta: metav1.ObjectMeta{ Name: "test-snapshot", @@ -306,7 +310,7 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) { // Create an existing work for update test existingWork := &fleetv1beta1.Work{ ObjectMeta: metav1.ObjectMeta{ - Name: workNamePrefix, + Name: testWorkNamePrefix, Namespace: "fleet-member-test-cluster-1", Labels: map[string]string{ fleetv1beta1.ParentBindingLabel: resourceBinding.Name, @@ -483,7 +487,13 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) { wantErr: true, }, { - name: "two existing works should result in error", + // Duplicate envelope Works indicate an environment that was stuck before + // deterministic naming was introduced. The controller surfaces the state + // (error + Event) but does NOT auto-delete, because deleting a Work the + // member agent has already applied would trigger resource fluctuation on + // the member side. Full post-conditions are verified in + // TestCreateOrUpdateEnvelopeCRWorkObj_DuplicateWorksSurfaceWithoutMutation. + name: "two existing works surface an UnexpectedBehaviorError without mutation", envelopeReader: resourceEnvelope, resourceOverrideSnapshotHash: "new-resource-hash", clusterResourceOverrideSnapshotHash: "new-cluster-resource-hash", @@ -512,7 +522,7 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) { } // Call the function under test - got, err := r.createOrUpdateEnvelopeCRWorkObj(ctx, tt.envelopeReader, workNamePrefix, + got, err := r.createOrUpdateEnvelopeCRWorkObj(ctx, tt.envelopeReader, testWorkNamePrefix, resourceBinding, resourceSnapshot, tt.resourceOverrideSnapshotHash, tt.clusterResourceOverrideSnapshotHash) if (err != nil) != tt.wantErr { @@ -532,7 +542,6 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) { func TestProcessOneSelectedResource(t *testing.T) { scheme := serviceScheme(t) - workNamePrefix := "test-work" resourceBinding := &fleetv1beta1.ClusterResourceBinding{ ObjectMeta: metav1.ObjectMeta{ Name: "test-binding", @@ -685,7 +694,7 @@ func TestProcessOneSelectedResource(t *testing.T) { tt.selectedResource, resourceBinding, snapshot, - workNamePrefix, + testWorkNamePrefix, tt.resourceOverrideSnapshotHash, tt.clusterResourceOverrideSnapshotHash, activeWork, @@ -725,3 +734,172 @@ func createResourceContent(t *testing.T, obj runtime.Object) *fleetv1beta1.Resou }, } } + +// TestEnvelopeWorkNameSuffix verifies that the suffix is a stable, length-bounded function +// of the envelope's identity. Determinism is what lets the API server enforce the +// "one Work per (binding, envelope)" invariant via name uniqueness. +func TestEnvelopeWorkNameSuffix(t *testing.T) { + base := &fleetv1beta1.ResourceEnvelope{ + ObjectMeta: metav1.ObjectMeta{Name: "env-a", Namespace: "ns-1"}, + } + + tests := []struct { + name string + other fleetv1beta1.EnvelopeReader + wantEqual bool // true → suffix must match base; false → suffix must differ + }{ + { + name: "same identity", + other: &fleetv1beta1.ResourceEnvelope{ObjectMeta: metav1.ObjectMeta{Name: "env-a", Namespace: "ns-1"}}, + wantEqual: true, + }, + { + name: "different name", + other: &fleetv1beta1.ResourceEnvelope{ObjectMeta: metav1.ObjectMeta{Name: "env-b", Namespace: "ns-1"}}, + wantEqual: false, + }, + { + name: "different namespace", + other: &fleetv1beta1.ResourceEnvelope{ObjectMeta: metav1.ObjectMeta{Name: "env-a", Namespace: "ns-2"}}, + wantEqual: false, + }, + { + // Same name as base, but cluster-scoped — the type field in the hash input disambiguates. + name: "different type, same name", + other: &fleetv1beta1.ClusterResourceEnvelope{ObjectMeta: metav1.ObjectMeta{Name: "env-a"}}, + wantEqual: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotBase := envelopeWorkNameSuffix(base) + gotOther := envelopeWorkNameSuffix(tt.other) + if tt.wantEqual && gotBase != gotOther { + t.Errorf("envelopeWorkNameSuffix(%v) = %q, envelopeWorkNameSuffix(%v) = %q, want equal", base, gotBase, tt.other, gotOther) + } + if !tt.wantEqual && gotBase == gotOther { + t.Errorf("envelopeWorkNameSuffix(%v) = envelopeWorkNameSuffix(%v) = %q, want different", base, tt.other, gotBase) + } + }) + } + + // Suffix is bounded and DNS-1123-safe (lowercase hex). + suffix := envelopeWorkNameSuffix(base) + if len(suffix) != 16 { + t.Errorf("envelopeWorkNameSuffix length = %d, want 16", len(suffix)) + } + if strings.ToLower(suffix) != suffix { + t.Errorf("envelopeWorkNameSuffix(%v) = %q, want lowercase", base, suffix) + } +} + +// TestCreateOrUpdateEnvelopeCRWorkObj_DuplicateWorksSurfaceWithoutMutation verifies that +// when the label query returns multiple Works for the same (binding, envelope), the +// controller surfaces the condition (UnexpectedBehaviorError + Event) but does NOT +// mutate cluster state. Auto-deleting duplicates is unsafe because the member agent +// may have applied them; deleting one triggers resource cleanup that conflicts with +// the survivor. Deterministic naming (see buildNewWorkForEnvelopeCR) prevents new +// duplicates; pre-existing ones require operator cleanup. +func TestCreateOrUpdateEnvelopeCRWorkObj_DuplicateWorksSurfaceWithoutMutation(t *testing.T) { + scheme := serviceScheme(t) + ctx := context.Background() + + resourceSnapshot := &fleetv1beta1.ClusterResourceSnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-snapshot", + Labels: map[string]string{fleetv1beta1.PlacementTrackingLabel: "test-crp"}, + }, + } + resourceBinding := &fleetv1beta1.ClusterResourceBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-binding", + Labels: map[string]string{fleetv1beta1.PlacementTrackingLabel: "test-crp"}, + }, + Spec: fleetv1beta1.ResourceBindingSpec{ + TargetCluster: "test-cluster-1", + ResourceSnapshotName: resourceSnapshot.Name, + }, + } + resourceEnvelope := &fleetv1beta1.ResourceEnvelope{ + ObjectMeta: metav1.ObjectMeta{Name: "dup-envelope", Namespace: "default"}, + Data: map[string]runtime.RawExtension{ + "configmap": {Raw: []byte(`{"apiVersion":"v1","kind":"ConfigMap","metadata":{"name":"cm","namespace":"default"}}`)}, + }, + } + + workNamespace := fmt.Sprintf(utils.NamespaceNameFormat, resourceBinding.Spec.TargetCluster) + labelsForWork := map[string]string{ + fleetv1beta1.ParentBindingLabel: resourceBinding.Name, + fleetv1beta1.PlacementTrackingLabel: resourceBinding.Labels[fleetv1beta1.PlacementTrackingLabel], + fleetv1beta1.EnvelopeTypeLabel: string(fleetv1beta1.ResourceEnvelopeType), + fleetv1beta1.EnvelopeNameLabel: resourceEnvelope.Name, + fleetv1beta1.EnvelopeNamespaceLabel: resourceEnvelope.Namespace, + } + dupNames := []string{"dup-envelope-a", "dup-envelope-b", "dup-envelope-c"} + mkWork := func(name string) *fleetv1beta1.Work { + return &fleetv1beta1.Work{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: workNamespace, + Labels: labelsForWork, + }, + } + } + objs := make([]client.Object, 0, len(dupNames)) + for _, n := range dupNames { + objs = append(objs, mkWork(n)) + } + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(objs...). + Build() + + recorder := record.NewFakeRecorder(10) + r := &Reconciler{ + Client: fakeClient, + recorder: recorder, + InformerManager: &informer.FakeManager{}, + } + + got, err := r.createOrUpdateEnvelopeCRWorkObj(ctx, resourceEnvelope, testWorkNamePrefix, + resourceBinding, resourceSnapshot, "", "") + + if got != nil { + t.Errorf("createOrUpdateEnvelopeCRWorkObj() = %v, want nil on duplicate-detected path", got) + } + if err == nil { + t.Fatalf("createOrUpdateEnvelopeCRWorkObj() error = nil, want UnexpectedBehaviorError") + } + if !errors.Is(err, controller.ErrUnexpectedBehavior) { + t.Errorf("createOrUpdateEnvelopeCRWorkObj() error = %v, want wrapping controller.ErrUnexpectedBehavior", err) + } + + // Post-condition: all duplicates remain untouched — no auto-deletion. + remaining := &fleetv1beta1.WorkList{} + if err := fakeClient.List(ctx, remaining, client.InNamespace(workNamespace)); err != nil { + t.Fatalf("List Works in %q: %v", workNamespace, err) + } + sortStrings := cmpopts.SortSlices(func(a, b string) bool { return a < b }) + if diff := cmp.Diff(dupNames, workNames(remaining.Items), sortStrings); diff != "" { + t.Errorf("Works after call mismatch (-want +got):\n%s", diff) + } + + // Post-condition: a Warning Event was emitted so operators can see the stuck state. + select { + case ev := <-recorder.Events: + if !strings.Contains(ev, "DuplicateEnvelopeWorks") { + t.Errorf("event = %q, want reason DuplicateEnvelopeWorks", ev) + } + default: + t.Error("no Event emitted for duplicate envelope Works; operators would have no surface signal") + } +} + +func workNames(items []fleetv1beta1.Work) []string { + out := make([]string, len(items)) + for i := range items { + out[i] = items[i].Name + } + return out +} diff --git a/pkg/controllers/workgenerator/override.go b/pkg/controllers/workgenerator/override.go index 5c6ecb339..72b879cf3 100644 --- a/pkg/controllers/workgenerator/override.go +++ b/pkg/controllers/workgenerator/override.go @@ -35,6 +35,10 @@ import ( "go.goms.io/fleet/pkg/utils/overrider" ) +// fetchClusterResourceOverrideSnapshots returns the binding's CRO snapshots keyed by the +// ResourceIdentifier their selectors target. Order matches the binding's snapshot list so +// callers can apply deterministically. +// // TODO: combine the following two functions into one, as they are very similar. func (r *Reconciler) fetchClusterResourceOverrideSnapshots(ctx context.Context, resourceBinding placementv1beta1.BindingObj) (map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ClusterResourceOverrideSnapshot, error) { croMap := make(map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ClusterResourceOverrideSnapshot) @@ -70,6 +74,9 @@ func (r *Reconciler) fetchClusterResourceOverrideSnapshots(ctx context.Context, return croMap, nil } +// fetchResourceOverrideSnapshots returns the binding's RO snapshots keyed by the +// ResourceIdentifier their selectors target. Order matches the binding's snapshot list so +// callers can apply deterministically. func (r *Reconciler) fetchResourceOverrideSnapshots(ctx context.Context, resourceBinding placementv1beta1.BindingObj) (map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ResourceOverrideSnapshot, error) { roMap := make(map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ResourceOverrideSnapshot) @@ -127,11 +134,11 @@ func (r *Reconciler) applyOverrides(resource *placementv1beta1.ResourceContent, Kind: gvk.Kind, Name: uResource.GetName(), } - isClusterScopeResource := r.InformerManager.IsClusterScopedResources(gvk) + isClusterScopedResource := r.InformerManager.IsClusterScopedResources(gvk) // For the namespace scoped resource, it could be selected by the namespace itself. - // use the namespace as the key - if !isClusterScopeResource { + // Use the namespace as the key. + if !isClusterScopedResource { key = placementv1beta1.ResourceIdentifier{ Group: utils.NamespaceMetaGVK.Group, Version: utils.NamespaceMetaGVK.Version, @@ -149,14 +156,15 @@ func (r *Reconciler) applyOverrides(resource *placementv1beta1.ResourceContent, } if err := applyOverrideRules(resource, cluster, snapshot.Spec.OverrideSpec.Policy.OverrideRules); err != nil { klog.ErrorS(err, "Failed to apply the override rules", "clusterResourceOverrideSnapshot", klog.KObj(snapshot)) - return false, err + return false, controller.NewUserError(fmt.Errorf("ClusterResourceOverrideSnapshot %q failed to apply on %s: %s", + snapshot.Name, formatOverrideTarget(&uResource), err.Error())) } } klog.V(2).InfoS("Applied clusterResourceOverrideSnapshots", "resource", klog.KObj(&uResource), "numberOfOverrides", len(croMap[key])) // If the resource is selected by both ClusterResourceOverride and ResourceOverride, ResourceOverride will win when resolving conflicts. // Apply ResourceOverrideSnapshots. - if !isClusterScopeResource { + if !isClusterScopedResource { key = placementv1beta1.ResourceIdentifier{ Group: gvk.Group, Version: gvk.Version, @@ -172,7 +180,8 @@ func (r *Reconciler) applyOverrides(resource *placementv1beta1.ResourceContent, } if err := applyOverrideRules(resource, cluster, snapshot.Spec.OverrideSpec.Policy.OverrideRules); err != nil { klog.ErrorS(err, "Failed to apply the override rules", "resourceOverrideSnapshot", klog.KObj(snapshot)) - return false, err + return false, controller.NewUserError(fmt.Errorf("ResourceOverrideSnapshot %q failed to apply on %s: %s", + snapshot.Name, formatOverrideTarget(&uResource), err.Error())) } } klog.V(2).InfoS("Applied resourceOverrideSnapshots", "resource", klog.KObj(&uResource), "numberOfOverrides", len(roMap[key])) @@ -180,12 +189,29 @@ func (r *Reconciler) applyOverrides(resource *placementv1beta1.ResourceContent, return resource.Raw == nil, nil } +// formatOverrideTarget renders the target as e.g. `Deployment "my-app" in namespace "default"` +// for inclusion in a user-facing error message. +func formatOverrideTarget(target *unstructured.Unstructured) string { + // Fall back to "Unknown" so a snapshot missing `kind` doesn't render as `"" "name"`. + kind := target.GetObjectKind().GroupVersionKind().Kind + if kind == "" { + kind = "Unknown" + } + if ns := target.GetNamespace(); ns != "" { + return fmt.Sprintf("%s %q in namespace %q", kind, target.GetName(), ns) + } + return fmt.Sprintf("%s %q", kind, target.GetName()) +} + +// applyOverrideRules applies matching rules to the resource. A DeleteOverrideType rule clears +// the resource and stops; otherwise JSON patches apply in order. Errors are returned raw — the +// caller (applyOverrides) tags them as user errors so we don't double-wrap the sentinel. func applyOverrideRules(resource *placementv1beta1.ResourceContent, cluster *clusterv1beta1.MemberCluster, rules []placementv1beta1.OverrideRule) error { for _, rule := range rules { matched, err := overrider.IsClusterMatched(cluster, rule) if err != nil { - klog.ErrorS(controller.NewUnexpectedBehaviorError(err), "Found an invalid override rule") - return controller.NewUserError(err) // should not happen though and should be rejected by the webhook + klog.ErrorS(err, "Found an invalid override rule") + return err } if !matched { continue @@ -198,7 +224,7 @@ func applyOverrideRules(resource *placementv1beta1.ResourceContent, cluster *clu // Apply JSONPatchOverrides by default if err = applyJSONPatchOverride(resource, cluster, rule.JSONPatchOverrides); err != nil { klog.ErrorS(err, "Failed to apply JSON patch override") - return controller.NewUserError(err) + return err } } return nil @@ -207,11 +233,11 @@ func applyOverrideRules(resource *placementv1beta1.ResourceContent, cluster *clu // applyJSONPatchOverride applies a JSON patch on the selected resources following [RFC 6902](https://datatracker.ietf.org/doc/html/rfc6902). func applyJSONPatchOverride(resourceContent *placementv1beta1.ResourceContent, cluster *clusterv1beta1.MemberCluster, overrides []placementv1beta1.JSONPatchOverride) error { var err error - if len(overrides) == 0 { // do nothing + if len(overrides) == 0 { return nil } - // go through the JSON patch overrides to replace the built-in variables before json Marshal - // as it may contain the built-in variables that cannot be marshaled directly + // Go through the JSON patch overrides to replace the built-in variables before json.Marshal, + // as the patch values may contain built-in variables that cannot be marshaled directly. for i := range overrides { // Process the JSON string to replace variables jsonStr := string(overrides[i].Value.Raw) @@ -263,14 +289,14 @@ func replaceClusterLabelKeyVariables(input string, cluster *clusterv1beta1.Membe // extract the key value user wants to replace endIdx := strings.Index(result[startIdx+prefixLen:], "}") if endIdx == -1 { - klog.V(2).InfoS("malformed key ${MEMBER-CLUSTER-LABEL-KEY without the closing `}`", "input", input) + klog.V(2).InfoS("Malformed key ${MEMBER-CLUSTER-LABEL-KEY without the closing `}`", "input", input) return "", fmt.Errorf("input %s is missing the closing bracket `}`", input) } endIdx += startIdx + prefixLen // extract the key name keyName := result[startIdx+prefixLen : endIdx] // check if the key exists in the cluster labels - labelValue, exists := cluster.ObjectMeta.Labels[keyName] + labelValue, exists := cluster.Labels[keyName] if !exists { klog.V(2).InfoS("Label key not found on cluster", "key", keyName, "cluster", cluster.Name) return "", fmt.Errorf("label key %s not found on cluster %s", keyName, cluster.Name) diff --git a/pkg/controllers/workgenerator/override_test.go b/pkg/controllers/workgenerator/override_test.go index a0b18498a..88c43dd86 100644 --- a/pkg/controllers/workgenerator/override_test.go +++ b/pkg/controllers/workgenerator/override_test.go @@ -20,6 +20,7 @@ import ( "context" "errors" "fmt" + "strings" "testing" "github.com/google/go-cmp/cmp" @@ -428,6 +429,9 @@ func TestFetchResourceOverrideSnapshot(t *testing.T) { } func TestApplyOverrides_clusterScopedResource(t *testing.T) { + // FakeManager.IsClusterScopedResources returns (APIResources[gvk] == IsClusterScopedResource). + // With the flag set to false and APIResources listing namespace-scoped GVKs, listed GVKs + // report as namespace-scoped and unlisted GVKs (ClusterRole) report as cluster-scoped. fakeInformer := informer.FakeManager{ APIResources: map[schema.GroupVersionKind]bool{ { @@ -450,7 +454,9 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) { croMap map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ClusterResourceOverrideSnapshot wantClusterRole rbacv1.ClusterRole wantErr error - wantDeleted bool + // wantErrSubstr asserts the failure message names the failing snapshot and target. + wantErrSubstr []string + wantDeleted bool }{ { name: "empty overrides", @@ -844,6 +850,9 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) { Name: "clusterrole-name", }: { { + ObjectMeta: metav1.ObjectMeta{ + Name: "conflicting-rules-cro-snapshot", + }, Spec: placementv1beta1.ClusterResourceOverrideSnapshotSpec{ OverrideSpec: placementv1beta1.ClusterResourceOverrideSpec{ Policy: &placementv1beta1.OverridePolicy{ @@ -895,6 +904,10 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) { }, }, wantErr: controller.ErrUserError, + wantErrSubstr: []string{ + `ClusterResourceOverrideSnapshot "conflicting-rules-cro-snapshot"`, + `ClusterRole "clusterrole-name"`, + }, }, { name: "invalid json patch of clusterResourceOverride", @@ -921,6 +934,9 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) { Name: "clusterrole-name", }: { { + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cro-snapshot", + }, Spec: placementv1beta1.ClusterResourceOverrideSnapshotSpec{ OverrideSpec: placementv1beta1.ClusterResourceOverrideSpec{ Policy: &placementv1beta1.OverridePolicy{ @@ -953,6 +969,71 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) { }, }, wantErr: controller.ErrUserError, + wantErrSubstr: []string{ + `ClusterResourceOverrideSnapshot "test-cro-snapshot"`, + `ClusterRole "clusterrole-name"`, + }, + }, + { + // Invalid LabelSelector Operator makes IsClusterMatched fail — exercises the + // applyOverrideRules raw-error return that applyOverrides wraps. + name: "invalid cluster label selector in clusterResourceOverride", + clusterRole: rbacv1.ClusterRole{ + TypeMeta: clusterRoleType, + ObjectMeta: metav1.ObjectMeta{ + Name: "clusterrole-name", + }, + }, + cluster: clusterv1beta1.MemberCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cluster-1", + }, + }, + croMap: map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ClusterResourceOverrideSnapshot{ + { + Group: "rbac.authorization.k8s.io", + Version: "v1", + Kind: "ClusterRole", + Name: "clusterrole-name", + }: { + { + ObjectMeta: metav1.ObjectMeta{ + Name: "bad-selector-cro-snapshot", + }, + Spec: placementv1beta1.ClusterResourceOverrideSnapshotSpec{ + OverrideSpec: placementv1beta1.ClusterResourceOverrideSpec{ + Policy: &placementv1beta1.OverridePolicy{ + OverrideRules: []placementv1beta1.OverrideRule{ + { + ClusterSelector: &placementv1beta1.ClusterSelector{ + ClusterSelectorTerms: []placementv1beta1.ClusterSelectorTerm{ + { + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "key1", + Operator: "InvalidOperator", + Values: []string{"value1"}, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + wantErr: controller.ErrUserError, + wantErrSubstr: []string{ + `ClusterResourceOverrideSnapshot "bad-selector-cro-snapshot"`, + `ClusterRole "clusterrole-name"`, + `invalid cluster label selector`, + }, }, { name: "delete during the clusterResourceOverride", @@ -1118,6 +1199,11 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) { t.Fatalf("applyOverrides() gotDeleted %v, want %v", gotDeleted, tc.wantDeleted) } if tc.wantErr != nil { + for _, want := range tc.wantErrSubstr { + if !strings.Contains(err.Error(), want) { + t.Errorf("applyOverrides() error = %q, want to contain %q", err.Error(), want) + } + } return } if tc.wantDeleted { @@ -1126,12 +1212,12 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) { var u unstructured.Unstructured if err := u.UnmarshalJSON(rc.Raw); err != nil { - t.Fatalf("Failed to unmarshl the result: %v, want nil", err) + t.Fatalf("Failed to unmarshal the result: %v, want nil", err) } var clusterRole rbacv1.ClusterRole if err := runtime.DefaultUnstructuredConverter.FromUnstructured(u.Object, &clusterRole); err != nil { - t.Fatalf("Failed to convert the result to clusterole: %v, want nil", err) + t.Fatalf("Failed to convert the result to clusterRole: %v, want nil", err) } if diff := cmp.Diff(tc.wantClusterRole, clusterRole); diff != "" { @@ -1141,7 +1227,8 @@ func TestApplyOverrides_clusterScopedResource(t *testing.T) { } } -func TestApplyOverrides_namespacedScopeResource(t *testing.T) { +func TestApplyOverrides_namespaceScopedResource(t *testing.T) { + // See TestApplyOverrides_clusterScopedResource for the IsClusterScopedResource: false rationale. fakeInformer := informer.FakeManager{ APIResources: map[schema.GroupVersionKind]bool{ { @@ -1165,7 +1252,9 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { roMap map[placementv1beta1.ResourceIdentifier][]*placementv1beta1.ResourceOverrideSnapshot wantDeployment appsv1.Deployment wantErr error - wantDelete bool + // wantErrSubstr asserts the failure message names the failing snapshot and target. + wantErrSubstr []string + wantDeleted bool }{ { name: "empty overrides", @@ -1664,6 +1753,9 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { Name: "deployment-namespace", }: { { + ObjectMeta: metav1.ObjectMeta{ + Name: "invalid-patch-cro-snapshot", + }, Spec: placementv1beta1.ClusterResourceOverrideSnapshotSpec{ OverrideSpec: placementv1beta1.ClusterResourceOverrideSpec{ Policy: &placementv1beta1.OverridePolicy{ @@ -1697,6 +1789,11 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { }, }, wantErr: controller.ErrUserError, + wantErrSubstr: []string{ + `ClusterResourceOverrideSnapshot "invalid-patch-cro-snapshot"`, + `Deployment "deployment-name"`, + `namespace "deployment-namespace"`, + }, }, { name: "invalid json patch of resourceOverride", @@ -1728,6 +1825,10 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { Namespace: "deployment-namespace", }: { { + ObjectMeta: metav1.ObjectMeta{ + Name: "invalid-patch-ro-snapshot", + Namespace: "deployment-namespace", + }, Spec: placementv1beta1.ResourceOverrideSnapshotSpec{ OverrideSpec: placementv1beta1.ResourceOverrideSpec{ Policy: &placementv1beta1.OverridePolicy{ @@ -1751,6 +1852,11 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { }, }, wantErr: controller.ErrUserError, + wantErrSubstr: []string{ + `ResourceOverrideSnapshot "invalid-patch-ro-snapshot"`, + `Deployment "deployment-name"`, + `namespace "deployment-namespace"`, + }, }, { name: "delete type of resourceOverride", @@ -1797,7 +1903,7 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { }, }, }, - wantDelete: true, + wantDeleted: true, }, { name: "resourceOverride delete the cro override", @@ -1884,7 +1990,7 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { }, }, }, - wantDelete: true, + wantDeleted: true, }, { name: "resourceOverride no-op when the cro delete", @@ -1970,7 +2076,7 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { }, }, }, - wantDelete: true, + wantDeleted: true, }, { name: "cluster name as value in json patch of resourceOverride", @@ -2218,6 +2324,10 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { Namespace: "deployment-namespace", }: { { + ObjectMeta: metav1.ObjectMeta{ + Name: "test-ro-snapshot", + Namespace: "deployment-namespace", + }, Spec: placementv1beta1.ResourceOverrideSnapshotSpec{ OverrideSpec: placementv1beta1.ResourceOverrideSpec{ Policy: &placementv1beta1.OverridePolicy{ @@ -2240,6 +2350,11 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { }, }, wantErr: controller.ErrUserError, + wantErrSubstr: []string{ + `ResourceOverrideSnapshot "test-ro-snapshot"`, + `Deployment "deployment-name"`, + `namespace "deployment-namespace"`, + }, }, } for _, tc := range tests { @@ -2252,19 +2367,24 @@ func TestApplyOverrides_namespacedScopeResource(t *testing.T) { if gotErr, wantErr := err != nil, tc.wantErr != nil; gotErr != wantErr || !errors.Is(err, tc.wantErr) { t.Fatalf("applyOverrides() got error %v, want error %v", err, tc.wantErr) } - if gotDeleted != tc.wantDelete { - t.Fatalf("applyOverrides() gotDeleted %v, want %v", gotDeleted, tc.wantDelete) + if gotDeleted != tc.wantDeleted { + t.Fatalf("applyOverrides() gotDeleted %v, want %v", gotDeleted, tc.wantDeleted) } if tc.wantErr != nil { + for _, want := range tc.wantErrSubstr { + if !strings.Contains(err.Error(), want) { + t.Errorf("applyOverrides() error = %q, want to contain %q", err.Error(), want) + } + } return } - if tc.wantDelete { + if tc.wantDeleted { return } var u unstructured.Unstructured if err := u.UnmarshalJSON(rc.Raw); err != nil { - t.Fatalf("Failed to unmarshl the result: %v, want nil", err) + t.Fatalf("Failed to unmarshal the result: %v, want nil", err) } var deployment appsv1.Deployment @@ -2757,7 +2877,7 @@ func TestApplyJSONPatchOverride(t *testing.T) { var u unstructured.Unstructured if err := u.UnmarshalJSON(rc.Raw); err != nil { - t.Fatalf("Failed to unmarshl the result: %v, want nil", err) + t.Fatalf("Failed to unmarshal the result: %v, want nil", err) } var deployment appsv1.Deployment @@ -2774,10 +2894,10 @@ func TestApplyJSONPatchOverride(t *testing.T) { func TestReplaceClusterLabelKeyVariables(t *testing.T) { tests := map[string]struct { - cluster *clusterv1beta1.MemberCluster - input string - expected string - expectErr bool + cluster *clusterv1beta1.MemberCluster + input string + want string + wantErr bool }{ "No clusterLabelKey variables": { cluster: &clusterv1beta1.MemberCluster{ @@ -2787,8 +2907,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) { }, }, }, - input: "The cluster is in us-west-1", - expected: "The cluster is in us-west-1", + input: "The cluster is in us-west-1", + want: "The cluster is in us-west-1", }, "ClusterLabelKey Variable replaced": { cluster: &clusterv1beta1.MemberCluster{ @@ -2798,8 +2918,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) { }, }, }, - input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region}", - expected: "The cluster is in us-west-1", + input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region}", + want: "The cluster is in us-west-1", }, "The clusterLabelKey key is misspelled": { cluster: &clusterv1beta1.MemberCluster{ @@ -2807,8 +2927,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) { Labels: map[string]string{}, }, }, - input: "The cluster is in $MEMBER-CLUSTER-LABEL-KEY-region", - expected: "The cluster is in $MEMBER-CLUSTER-LABEL-KEY-region", + input: "The cluster is in $MEMBER-CLUSTER-LABEL-KEY-region", + want: "The cluster is in $MEMBER-CLUSTER-LABEL-KEY-region", }, "Multiple complex clusterLabelKey variables replaced": { cluster: &clusterv1beta1.MemberCluster{ @@ -2819,8 +2939,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) { }, }, }, - input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-fleet.azure.com/location-region_public} and environment is ${MEMBER-CLUSTER-LABEL-KEY-fleet.azure.com/env}", - expected: "The cluster is in us-west-1 and environment is prod", + input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-fleet.azure.com/location-region_public} and environment is ${MEMBER-CLUSTER-LABEL-KEY-fleet.azure.com/env}", + want: "The cluster is in us-west-1 and environment is prod", }, "The clusterLabelKey key is not found": { cluster: &clusterv1beta1.MemberCluster{ @@ -2828,8 +2948,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) { Labels: map[string]string{}, }, }, - input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region}", - expectErr: true, + input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region}", + wantErr: true, }, "ClusterLabelKey Variable key case not match": { cluster: &clusterv1beta1.MemberCluster{ @@ -2839,10 +2959,10 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) { }, }, }, - input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-REGION}", - expectErr: true, + input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-REGION}", + wantErr: true, }, - "Invalid clusterLabelKey variable format": { + "Invalid clusterLabelKey variable format": { cluster: &clusterv1beta1.MemberCluster{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ @@ -2850,8 +2970,8 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) { }, }, }, - input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region", - expectErr: true, + input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-region", + wantErr: true, }, "ClusterLabelKey variable key empty": { cluster: &clusterv1beta1.MemberCluster{ @@ -2861,19 +2981,61 @@ func TestReplaceClusterLabelKeyVariables(t *testing.T) { }, }, }, - input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-}", - expectErr: true, + input: "The cluster is in ${MEMBER-CLUSTER-LABEL-KEY-}", + wantErr: true, }, } for name, tc := range tests { t.Run(name, func(t *testing.T) { result, err := replaceClusterLabelKeyVariables(tc.input, tc.cluster) - if gotErr := err != nil; gotErr != tc.expectErr { - t.Fatalf("applyJSONPatchOverride() = error %v, want %v", err, tc.expectErr) + if gotErr := err != nil; gotErr != tc.wantErr { + t.Fatalf("replaceClusterLabelKeyVariables() = error %v, want %v", err, tc.wantErr) } - if result != tc.expected { - t.Errorf("replaceClusterLabelKeyVariables() = %v, want %v", result, tc.expected) + if result != tc.want { + t.Errorf("replaceClusterLabelKeyVariables() = %v, want %v", result, tc.want) + } + }) + } +} + +func TestFormatOverrideTarget(t *testing.T) { + tests := map[string]struct { + kind string + name string + namespace string + want string + }{ + "namespaced resource": { + kind: "Deployment", + name: "my-app", + namespace: "default", + want: `Deployment "my-app" in namespace "default"`, + }, + "cluster-scoped resource": { + kind: "Namespace", + name: "app", + want: `Namespace "app"`, + }, + "empty kind falls back to Unknown": { + name: "my-app", + namespace: "default", + want: `Unknown "my-app" in namespace "default"`, + }, + "empty kind, cluster-scoped": { + name: "app", + want: `Unknown "app"`, + }, + } + + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + target := &unstructured.Unstructured{} + target.SetGroupVersionKind(schema.GroupVersionKind{Kind: tc.kind}) + target.SetName(tc.name) + target.SetNamespace(tc.namespace) + if got := formatOverrideTarget(target); got != tc.want { + t.Errorf("formatOverrideTarget() = %q, want %q", got, tc.want) } }) } diff --git a/pkg/controllers/workgenerator/suite_test.go b/pkg/controllers/workgenerator/suite_test.go index 24c03edd4..d7b999110 100644 --- a/pkg/controllers/workgenerator/suite_test.go +++ b/pkg/controllers/workgenerator/suite_test.go @@ -301,6 +301,47 @@ func createOverrides() { } Expect(k8sClient.Create(ctx, &invalidClusterResourceOverrideSnapshot)).Should(Succeed(), "Failed to create the cro-2") By(fmt.Sprintf("Invalid cluster resource override snapshot %s created", invalidClusterResourceOverrideSnapshotName)) + + invalidResourceOverrideSnapshot = placementv1beta1.ResourceOverrideSnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: invalidResourceOverrideSnapshotName, + Namespace: appNamespaceName, + Labels: map[string]string{ + placementv1beta1.IsLatestSnapshotLabel: "true", + }, + }, + Spec: placementv1beta1.ResourceOverrideSnapshotSpec{ + OverrideSpec: placementv1beta1.ResourceOverrideSpec{ + ResourceSelectors: []placementv1beta1.ResourceSelector{ + { + Group: "test.kubernetes-fleet.io", + Version: "v1alpha1", + Kind: "TestResource", + Name: "random-test-resource", + }, + }, + Policy: &placementv1beta1.OverridePolicy{ + OverrideRules: []placementv1beta1.OverrideRule{ + { + ClusterSelector: &placementv1beta1.ClusterSelector{ + ClusterSelectorTerms: []placementv1beta1.ClusterSelectorTerm{}, // select all members + }, + JSONPatchOverrides: []placementv1beta1.JSONPatchOverride{ + { + Operator: placementv1beta1.JSONPatchOverrideOpAdd, + Path: "/invalid/path", + Value: apiextensionsv1.JSON{Raw: []byte(`"new-value"`)}, + }, + }, + }, + }, + }, + }, + OverrideHash: []byte("123"), + }, + } + Expect(k8sClient.Create(ctx, &invalidResourceOverrideSnapshot)).Should(Succeed(), "Failed to create the ro-2") + By(fmt.Sprintf("Invalid resource override snapshot %s created", invalidResourceOverrideSnapshotName)) } var _ = AfterSuite(func() { @@ -309,6 +350,7 @@ var _ = AfterSuite(func() { Expect(k8sClient.Delete(ctx, &validClusterResourceOverrideSnapshot)).Should(Succeed(), "Failed to delete the cro-1") Expect(k8sClient.Delete(ctx, &validResourceOverrideSnapshot)).Should(Succeed(), "Failed to delete the ro-1") Expect(k8sClient.Delete(ctx, &invalidClusterResourceOverrideSnapshot)).Should(Succeed(), "Failed to delete the cro-2") + Expect(k8sClient.Delete(ctx, &invalidResourceOverrideSnapshot)).Should(Succeed(), "Failed to delete the ro-2") Expect(k8sClient.Delete(ctx, &appNamespace)).Should(Succeed(), "Failed to delete app namespace") cancel() diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index 866f7670b..7dfa3a943 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -21,12 +21,10 @@ import ( "context" "errors" "fmt" - "strconv" "sync" "time" apiErrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/client-go/tools/record" @@ -347,57 +345,21 @@ func (s *Scheduler) cleanUpAllBindingsFor(ctx context.Context, placement fleetv1 } // lookupLatestPolicySnapshot returns the latest (i.e., active) policy snapshot associated with a placement. -// TODO(ryan): move this to a common lib +// +// It delegates to controller.LookupLatestPolicySnapshot so the same lookup can be reused outside the +// scheduler. The scheduler lists with a cached client; this does not have any consistency concern as +// sources will always trigger the scheduler when a new policy snapshot is created. +// +// The "no latest snapshot" case can happen transiently when a placement is newly created or its +// latest snapshot is being replaced; the scheduler will be triggered again when the situation +// corrects itself, so this is logged and surfaced as a non-unexpected error. func (s *Scheduler) lookupLatestPolicySnapshot(ctx context.Context, placement fleetv1beta1.PlacementObj) (fleetv1beta1.PolicySnapshotObj, error) { - placementRef := klog.KObj(placement) - // Prepare the list options to filter policy snapshots by the placement name and the latest snapshot label. - var listOptions []client.ListOption - labelSelector := labels.SelectorFromSet(labels.Set{ - fleetv1beta1.PlacementTrackingLabel: placement.GetName(), - fleetv1beta1.IsLatestSnapshotLabel: strconv.FormatBool(true), - }) - listOptions = append(listOptions, &client.ListOptions{LabelSelector: labelSelector}) - // Find out the latest policy snapshot associated with the placement. - var policySnapshotList fleetv1beta1.PolicySnapshotList - if placement.GetNamespace() == "" { - policySnapshotList = &fleetv1beta1.ClusterSchedulingPolicySnapshotList{} - } else { - policySnapshotList = &fleetv1beta1.SchedulingPolicySnapshotList{} - listOptions = append(listOptions, client.InNamespace(placement.GetNamespace())) - } - - // The scheduler lists with a cached client; this does not have any consistency concern as sources - // will always trigger the scheduler when a new policy snapshot is created. - if err := s.client.List(ctx, policySnapshotList, listOptions...); err != nil { - klog.ErrorS(err, "Failed to list policy snapshots of a placement", "placement", placementRef) - return nil, controller.NewAPIServerError(true, err) - } - policySnapshots := policySnapshotList.GetPolicySnapshotObjs() - switch { - case len(policySnapshots) == 0: - // There is no latest policy snapshot associated with the placement; it could happen when - // * the placement is newly created; or - // * the new policy snapshot is in the middle of being replaced. - // - // Either way, it is out of the scheduler's scope to handle such a case; the scheduler will - // be triggered again if the situation is corrected. - err := fmt.Errorf("no latest policy snapshot associated with placement") - klog.ErrorS(err, "Failed to find the latest policy snapshot, will retry", "placement", placementRef) - return nil, err - case len(policySnapshots) > 1: - // There are multiple active policy snapshots associated with the placement; normally this - // will never happen, as only one policy snapshot can be active in the sequence. - // - // Similarly, it is out of the scheduler's scope to handle such a case; the scheduler will - // report this unexpected occurrence but does not register it as a scheduler-side error. - // If (and when) the situation is corrected, the scheduler will be triggered again. - err := controller.NewUnexpectedBehaviorError(fmt.Errorf("too many active policy snapshots: %d, want 1", len(policySnapshots))) - klog.ErrorS(err, "There are multiple latest policy snapshots associated with placement", "placement", placementRef) + snapshot, err := controller.LookupLatestPolicySnapshot(ctx, s.client, types.NamespacedName{Namespace: placement.GetNamespace(), Name: placement.GetName()}) + if err != nil { + klog.ErrorS(err, "Failed to look up latest policy snapshot, will retry", "placement", klog.KObj(placement)) return nil, err - default: - // Found the one and only active policy snapshot. - return policySnapshots[0], nil } + return snapshot, nil } // addSchedulerCleanupFinalizer adds the scheduler cleanup finalizer to a placement (if it does not diff --git a/pkg/scheduler/scheduler_test.go b/pkg/scheduler/scheduler_test.go index acbf8a482..a252638a4 100644 --- a/pkg/scheduler/scheduler_test.go +++ b/pkg/scheduler/scheduler_test.go @@ -18,6 +18,8 @@ package scheduler import ( "context" + "errors" + "fmt" "log" "os" "strconv" @@ -30,10 +32,13 @@ import ( "github.com/prometheus/client_golang/prometheus/testutil" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" fleetv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" hubmetrics "go.goms.io/fleet/pkg/metrics/hub" + "go.goms.io/fleet/pkg/utils/controller" ) const ( @@ -591,6 +596,40 @@ func TestLookupLatestPolicySnapshot(t *testing.T) { } } +// TestLookupLatestPolicySnapshot_ListError covers the error path: when the underlying +// controller.LookupLatestPolicySnapshot returns an error, the scheduler logs and propagates +// it without panicking on the nil snapshot. +func TestLookupLatestPolicySnapshot_ListError(t *testing.T) { + placement := &fleetv1beta1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: crpName, + Finalizers: []string{fleetv1beta1.SchedulerCleanupFinalizer}, + }, + } + listErr := fmt.Errorf("simulated cache failure") + fakeClient := interceptor.NewClient( + fake.NewClientBuilder().WithScheme(scheme.Scheme).WithObjects(placement).Build(), + interceptor.Funcs{ + List: func(_ context.Context, _ client.WithWatch, _ client.ObjectList, _ ...client.ListOption) error { + return listErr + }, + }, + ) + + s := &Scheduler{client: fakeClient, uncachedReader: fakeClient} + + got, err := s.lookupLatestPolicySnapshot(context.Background(), placement) + if err == nil { + t.Fatalf("lookupLatestPolicySnapshot() = %v, nil; want non-nil error", got) + } + if got != nil { + t.Errorf("lookupLatestPolicySnapshot() returned snapshot %v, want nil on error", got) + } + if !errors.Is(err, controller.ErrUnexpectedBehavior) { + t.Errorf("lookupLatestPolicySnapshot() error = %v, want wrapping ErrUnexpectedBehavior", err) + } +} + func TestObserveSchedulingCycleMetrics(t *testing.T) { metricMetadata := ` # HELP scheduling_cycle_duration_milliseconds The duration of a scheduling cycle run in milliseconds diff --git a/pkg/scheduler/watchers/schedulingpolicysnapshot/watcher.go b/pkg/scheduler/watchers/schedulingpolicysnapshot/watcher.go index 70a01eac4..1cca8b048 100644 --- a/pkg/scheduler/watchers/schedulingpolicysnapshot/watcher.go +++ b/pkg/scheduler/watchers/schedulingpolicysnapshot/watcher.go @@ -74,32 +74,15 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu return ctrl.Result{}, nil } - // Verify if the policy snapshot is currently active. - // TODO: create a lib to check if the policy snapshot is latest. - isLatestVal, ok := policySnapshot.GetLabels()[fleetv1beta1.IsLatestSnapshotLabel] - if !ok { - // The IsLatestSnapshot label is not present; normally this should never occur. - klog.ErrorS(controller.NewUnexpectedBehaviorError(fmt.Errorf("IsLatestSnapshotLabel is missing")), - "IsLatestSnapshot label is not present", - "policySnapshot", policySnapshotRef) - // This is not a situation that the controller can recover by itself. Should the label - // value be corrected, the controller will be triggered again. - return ctrl.Result{}, nil - } - isLatest, err := strconv.ParseBool(isLatestVal) + // Verify if the policy snapshot is currently active. A missing or malformed IsLatestSnapshot + // label is treated as "not latest" — the controller cannot recover from those states by + // itself; the placement controller will repair the label and re-trigger reconciliation. + isLatest, err := controller.IsLatestPolicySnapshot(policySnapshot) if err != nil { - // The label value is not a correctly formatted boolean value; normally this should never - // occur. - klog.ErrorS(controller.NewUnexpectedBehaviorError(err), - "Failed to parse IsLatestSnapshot value", - "policySnapshot", policySnapshotRef) - // This is not an error that the controller can recover by itself; ignore the error. - // Should the label value be corrected, the controller will be triggered again. + klog.ErrorS(err, "Failed to determine whether policy snapshot is latest", "policySnapshot", policySnapshotRef) return ctrl.Result{}, nil } if !isLatest { - // The policy snapshot is not currently active, i.e., it is not the latest policy snapshot; - // ignore it. return ctrl.Result{}, nil } diff --git a/pkg/utils/binding/binding.go b/pkg/utils/binding/binding.go index 29b3483c6..6d7a98361 100644 --- a/pkg/utils/binding/binding.go +++ b/pkg/utils/binding/binding.go @@ -23,14 +23,51 @@ import ( "go.goms.io/fleet/pkg/utils/condition" ) -// HasBindingFailed checks if BindingObj has failed based on its conditions. +// nonTerminalBindingFailureReasons is the closed set of `Reason` strings that the per-cluster +// binding conditions (Overridden, WorkSynchronized, Applied, Available) can carry while still +// `Status=False` *without* representing a terminal failure. Each one means the binding is still +// progressing and the controller should keep waiting rather than treat the binding as failed. +// +// The set is intentionally an allowlist: any new `Status=False` reason added to the codebase is +// treated as terminal until it is explicitly added here. This is the safer default because the +// alternative (treat unknown reasons as transient) would stall the rollout controller forever +// on a genuinely-failing binding it does not yet know how to classify. +// +// When you add a new in-progress / pending / "not yet" reason to pkg/utils/condition/reason.go +// for any of the above conditions, also add it here. +var nonTerminalBindingFailureReasons = map[string]struct{}{ + condition.OverriddenPendingReason: {}, + condition.WorkNotSynchronizedYetReason: {}, + condition.ApplyPendingReason: {}, + condition.NotAvailableYetReason: {}, +} + +// HasBindingFailed reports whether a binding has reached a terminal failure on any of its +// per-cluster conditions (Overridden, WorkSynchronized, Applied, Available). +// +// A condition counts as a terminal failure when its `Status` is `False` *and* its `Reason` is +// not in `nonTerminalBindingFailureReasons`. The Reason check is necessary because several of +// the in-progress states (e.g. `WorkNotSynchronizedYetReason`, `NotAvailableYetReason`) are +// expressed as `Status=False` rather than `Status=Unknown`. Treating those as failures was the +// previous bug — it caused the rollout controller to drop bindings that were still progressing +// out of `canBeReadyBindings`, stalling rollout decisions. +// +// Unknown reasons fall through to "terminal" by design; see the comment on +// `nonTerminalBindingFailureReasons`. func HasBindingFailed(binding placementv1beta1.BindingObj) bool { for i := condition.OverriddenCondition; i <= condition.AvailableCondition; i++ { - if condition.IsConditionStatusFalse(binding.GetCondition(string(i.ResourceBindingConditionType())), binding.GetGeneration()) { - // TODO: parse the reason of the condition to see if the failure is recoverable/retriable or not - klog.V(2).Infof("binding %s has condition %s with status false", klog.KObj(binding), string(i.ResourceBindingConditionType())) - return true + c := binding.GetCondition(string(i.ResourceBindingConditionType())) + if !condition.IsConditionStatusFalse(c, binding.GetGeneration()) { + continue + } + if _, transient := nonTerminalBindingFailureReasons[c.Reason]; transient { + klog.V(3).Infof("binding %s has condition %s status false with non-terminal reason %q; treating as in-progress", + klog.KObj(binding), string(i.ResourceBindingConditionType()), c.Reason) + continue } + klog.V(2).Infof("binding %s has terminal failure on condition %s (reason %q)", + klog.KObj(binding), string(i.ResourceBindingConditionType()), c.Reason) + return true } return false } diff --git a/pkg/utils/binding/binding_test.go b/pkg/utils/binding/binding_test.go index e471afa11..b3ecb6b06 100644 --- a/pkg/utils/binding/binding_test.go +++ b/pkg/utils/binding/binding_test.go @@ -22,6 +22,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" + "go.goms.io/fleet/pkg/utils/condition" ) func TestHasBindingFailed(t *testing.T) { @@ -326,6 +327,102 @@ func TestHasBindingFailed(t *testing.T) { } } +// TestHasBindingFailed_NonTerminalReasons verifies that conditions with Status=False but a +// known in-progress reason (WorkNotSynchronizedYet, NotAvailableYet, OverriddenPending, +// ApplyPending) are NOT treated as failures. This is the bug fix for issue #648. +func TestHasBindingFailed_NonTerminalReasons(t *testing.T) { + bindingWithCondition := func(condType string, reason string) *placementv1beta1.ClusterResourceBinding { + return &placementv1beta1.ClusterResourceBinding{ + ObjectMeta: metav1.ObjectMeta{Name: "test-binding", Generation: 1}, + Status: placementv1beta1.ResourceBindingStatus{ + Conditions: []metav1.Condition{ + { + Type: condType, + Status: metav1.ConditionFalse, + ObservedGeneration: 1, + Reason: reason, + Message: "in progress", + }, + }, + }, + } + } + + tests := []struct { + name string + binding *placementv1beta1.ClusterResourceBinding + want bool + }{ + { + name: "WorkSynchronized=False with WorkNotSynchronizedYet is in-progress, not a failure", + binding: bindingWithCondition(string(placementv1beta1.ResourceBindingWorkSynchronized), condition.WorkNotSynchronizedYetReason), + want: false, + }, + { + name: "Available=False with NotAvailableYet is in-progress, not a failure", + binding: bindingWithCondition(string(placementv1beta1.ResourceBindingAvailable), condition.NotAvailableYetReason), + want: false, + }, + { + name: "Overridden=False with OverriddenPending is in-progress, not a failure", + binding: bindingWithCondition(string(placementv1beta1.ResourceBindingOverridden), condition.OverriddenPendingReason), + want: false, + }, + { + name: "Applied=False with ApplyPending is in-progress, not a failure", + binding: bindingWithCondition(string(placementv1beta1.ResourceBindingApplied), condition.ApplyPendingReason), + want: false, + }, + { + name: "Applied=False with ApplyFailed is a terminal failure", + binding: bindingWithCondition(string(placementv1beta1.ResourceBindingApplied), condition.ApplyFailedReason), + want: true, + }, + { + name: "Overridden=False with OverriddenFailed is a terminal failure", + binding: bindingWithCondition(string(placementv1beta1.ResourceBindingOverridden), condition.OverriddenFailedReason), + want: true, + }, + { + name: "Status=False with an unknown reason defaults to terminal (safer default)", + binding: bindingWithCondition(string(placementv1beta1.ResourceBindingApplied), "SomeReasonNobodyAddedToTheAllowlist"), + want: true, + }, + { + name: "non-terminal reason on one condition does not mask a terminal failure on another", + binding: &placementv1beta1.ClusterResourceBinding{ + ObjectMeta: metav1.ObjectMeta{Name: "test-binding", Generation: 1}, + Status: placementv1beta1.ResourceBindingStatus{ + Conditions: []metav1.Condition{ + { + Type: string(placementv1beta1.ResourceBindingOverridden), + Status: metav1.ConditionFalse, + ObservedGeneration: 1, + Reason: condition.OverriddenPendingReason, + }, + { + Type: string(placementv1beta1.ResourceBindingApplied), + Status: metav1.ConditionFalse, + ObservedGeneration: 1, + Reason: condition.ApplyFailedReason, + }, + }, + }, + }, + want: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := HasBindingFailed(tc.binding) + if got != tc.want { + t.Errorf("HasBindingFailed(%v) = %v, want %v", tc.binding, got, tc.want) + } + }) + } +} + func TestIsBindingReportDiff(t *testing.T) { tests := []struct { name string diff --git a/pkg/utils/controller/controller.go b/pkg/utils/controller/controller.go index 0da49c41c..1e5067931 100644 --- a/pkg/utils/controller/controller.go +++ b/pkg/utils/controller/controller.go @@ -63,6 +63,19 @@ var ( // ErrUserError indicates the error is caused by the user and customer needs to take the action. ErrUserError = errors.New("failed to process the request due to a client error") + + // ErrNoLatestPolicySnapshot indicates that no active (latest) policy snapshot was found for a + // placement. This is a transient condition during placement creation or rotation and is typically + // recoverable by the next reconcile; callers branch on it via errors.Is to distinguish it from + // real API or invariant-violation errors. + ErrNoLatestPolicySnapshot = errors.New("no latest policy snapshot found") + + // ErrMultipleActivePolicySnapshots indicates that more than one policy snapshot for the same + // placement carries IsLatestSnapshot=true, violating the "exactly one active snapshot" + // invariant. Callers branch on it via errors.Is so they can fail fast on a real invariant + // violation without misclassifying transient cache errors that share the ErrUnexpectedBehavior + // category. + ErrMultipleActivePolicySnapshots = errors.New("multiple active policy snapshots found") ) // NewUnexpectedBehaviorError returns ErrUnexpectedBehavior type error when err is not nil. diff --git a/pkg/utils/controller/policy_snapshot_resolver.go b/pkg/utils/controller/policy_snapshot_resolver.go index 5406c7e1e..5a2fd8ea2 100644 --- a/pkg/utils/controller/policy_snapshot_resolver.go +++ b/pkg/utils/controller/policy_snapshot_resolver.go @@ -110,34 +110,69 @@ func BuildPolicySnapshot(placementObj fleetv1beta1.PlacementObj, policySnapshotI return snapshot } -// FetchLatestPolicySnapshot fetches the latest policy snapshot for a given placement. -// For cluster-scoped placements, it fetches ClusterSchedulingPolicySnapshot. -// For namespaced placements, it fetches SchedulingPolicySnapshot. -func FetchLatestPolicySnapshot(ctx context.Context, k8Client client.Reader, placementKey types.NamespacedName) (fleetv1beta1.PolicySnapshotList, error) { - namespace := placementKey.Namespace - name := placementKey.Name +// IsLatestPolicySnapshot reports whether the given policy snapshot carries the IsLatestSnapshot +// label set to "true". A non-nil error is returned if the label is missing or its value cannot be +// parsed as a bool; in those cases the boolean result is false. +// +// Callers typically log the error and treat the snapshot as non-latest, since the controller cannot +// recover from a malformed label by itself — the placement controller will repair it and re-trigger +// reconciliation. +func IsLatestPolicySnapshot(snapshot fleetv1beta1.PolicySnapshotObj) (bool, error) { + val, ok := snapshot.GetLabels()[fleetv1beta1.IsLatestSnapshotLabel] + if !ok { + return false, NewUnexpectedBehaviorError(fmt.Errorf("%s label is missing", fleetv1beta1.IsLatestSnapshotLabel)) + } + isLatest, err := strconv.ParseBool(val) + if err != nil { + return false, NewUnexpectedBehaviorError(fmt.Errorf("failed to parse %s label value %q: %w", fleetv1beta1.IsLatestSnapshotLabel, val, err)) + } + return isLatest, nil +} +// LookupLatestPolicySnapshot returns the single active (latest) policy snapshot associated with the +// placement identified by placementKey, enforcing the "exactly one latest snapshot" invariant. +// A namespaced placementKey selects SchedulingPolicySnapshot; a cluster-scoped key (empty +// namespace) selects ClusterSchedulingPolicySnapshot. +// +// Returns: +// - (snapshot, nil) when exactly one latest snapshot exists. +// - (nil, ErrNoLatestPolicySnapshot-wrapped error) when none exist. This is a transient state for +// newly-created placements or in-progress rotations; callers branch on it via errors.Is. +// - (nil, ErrMultipleActivePolicySnapshots-wrapped error) when more than one exists. +// - (nil, ErrAPIServerError-wrapped error) when the List call fails. The List error is also +// promoted to ErrUnexpectedBehavior for unexpected cache failures; callers must NOT collapse +// ErrUnexpectedBehavior with the dedicated invariant sentinels above when classifying retries. +func LookupLatestPolicySnapshot(ctx context.Context, k8Client client.Reader, placementKey types.NamespacedName) (fleetv1beta1.PolicySnapshotObj, error) { var policySnapshotList fleetv1beta1.PolicySnapshotList - var listOptions []client.ListOption - listOptions = append(listOptions, client.MatchingLabels{ - fleetv1beta1.PlacementTrackingLabel: name, - fleetv1beta1.IsLatestSnapshotLabel: strconv.FormatBool(true), - }) - - if namespace != "" { - // This is a namespaced SchedulingPolicySnapshotList + listOptions := []client.ListOption{ + client.MatchingLabels{ + fleetv1beta1.PlacementTrackingLabel: placementKey.Name, + fleetv1beta1.IsLatestSnapshotLabel: strconv.FormatBool(true), + }, + } + if placementKey.Namespace != "" { policySnapshotList = &fleetv1beta1.SchedulingPolicySnapshotList{} - listOptions = append(listOptions, client.InNamespace(namespace)) + listOptions = append(listOptions, client.InNamespace(placementKey.Namespace)) } else { - // This is a cluster-scoped ClusterSchedulingPolicySnapshotList policySnapshotList = &fleetv1beta1.ClusterSchedulingPolicySnapshotList{} } - if err := k8Client.List(ctx, policySnapshotList, listOptions...); err != nil { klog.ErrorS(err, "Failed to list the policySnapshots associated with the placement", "placement", placementKey) - return nil, err + return nil, NewAPIServerError(true, err) + } + + policySnapshots := policySnapshotList.GetPolicySnapshotObjs() + switch len(policySnapshots) { + case 0: + return nil, fmt.Errorf("%w for placement %s", ErrNoLatestPolicySnapshot, placementKey) + case 1: + return policySnapshots[0], nil + default: + // Wrap both the specific invariant sentinel and the categorical ErrUnexpectedBehavior so + // callers that branch on either continue to work; the specific sentinel lets retry-gate + // callers distinguish this invariant violation from transient ErrUnexpectedBehavior errors. + return nil, fmt.Errorf("%w: %w for placement %s: got %d, want 1", ErrUnexpectedBehavior, ErrMultipleActivePolicySnapshots, placementKey, len(policySnapshots)) } - return policySnapshotList, nil } // ListPolicySnapshots lists all policy snapshots associated with a placement key. diff --git a/pkg/utils/controller/policy_snapshot_resolver_test.go b/pkg/utils/controller/policy_snapshot_resolver_test.go index 379c939a3..56a856de8 100644 --- a/pkg/utils/controller/policy_snapshot_resolver_test.go +++ b/pkg/utils/controller/policy_snapshot_resolver_test.go @@ -18,6 +18,8 @@ package controller import ( "context" + "errors" + "fmt" "testing" "github.com/google/go-cmp/cmp" @@ -28,6 +30,7 @@ import ( "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" fleetv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" ) @@ -489,7 +492,7 @@ func TestBuildPolicySnapshot(t *testing.T) { } } -func TestFetchLatestPolicySnapshot(t *testing.T) { +func TestListPolicySnapshots(t *testing.T) { scheme := runtime.NewScheme() if err := fleetv1beta1.AddToScheme(scheme); err != nil { t.Fatalf("Failed to add to scheme: %v", err) @@ -502,7 +505,7 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { expectedSnapshots int }{ { - name: "fetch latest cluster scoped policy snapshots", + name: "list cluster scoped policy snapshots", placementKey: types.NamespacedName{ Name: placementName, }, @@ -512,7 +515,6 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { Name: "test-placement-0", Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "true", }, }, }, @@ -521,15 +523,22 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { Name: "test-placement-1", Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "false", + }, + }, + }, + &fleetv1beta1.ClusterSchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: "other-placement-0", + Labels: map[string]string{ + fleetv1beta1.PlacementTrackingLabel: "other-placement", }, }, }, }, - expectedSnapshots: 1, + expectedSnapshots: 2, }, { - name: "fetch latest cluster scoped policy snapshots - mixed setup", + name: "list cluster scoped policy snapshots - mixed setup", placementKey: types.NamespacedName{ Name: placementName, }, @@ -539,7 +548,6 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { Name: "test-placement-0", Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "true", }, }, }, @@ -548,7 +556,22 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { Name: "test-placement-1", Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "false", + }, + }, + }, + &fleetv1beta1.ClusterSchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-placement-2", + Labels: map[string]string{ + fleetv1beta1.PlacementTrackingLabel: placementName, + }, + }, + }, + &fleetv1beta1.ClusterSchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: "other-placement-0", + Labels: map[string]string{ + fleetv1beta1.PlacementTrackingLabel: "other-placement", }, }, }, @@ -559,15 +582,23 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { Namespace: policySnapshotNamespace, Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "true", + }, + }, + }, + &fleetv1beta1.SchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-placement-1", + Namespace: policySnapshotNamespace, + Labels: map[string]string{ + fleetv1beta1.PlacementTrackingLabel: placementName, }, }, }, }, - expectedSnapshots: 1, // Should only return cluster-scoped ones for cluster-scoped placement + expectedSnapshots: 3, // Should only return cluster-scoped ones for cluster-scoped placement }, { - name: "fetch latest namespaced policy snapshots", + name: "list namespaced policy snapshots", placementKey: types.NamespacedName{ Name: placementName, Namespace: policySnapshotNamespace, @@ -579,7 +610,6 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { Namespace: policySnapshotNamespace, Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "true", }, }, }, @@ -589,15 +619,23 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { Namespace: policySnapshotNamespace, Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "false", + }, + }, + }, + &fleetv1beta1.SchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: "other-placement-0", + Namespace: "other-namespace", + Labels: map[string]string{ + fleetv1beta1.PlacementTrackingLabel: "other-placement", }, }, }, }, - expectedSnapshots: 1, + expectedSnapshots: 2, }, { - name: "fetch latest namespaced policy snapshots - mixed setup", + name: "list namespaced policy snapshots - mixed setup", placementKey: types.NamespacedName{ Name: placementName, Namespace: policySnapshotNamespace, @@ -609,7 +647,6 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { Namespace: policySnapshotNamespace, Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "true", }, }, }, @@ -619,7 +656,15 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { Namespace: policySnapshotNamespace, Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "false", + }, + }, + }, + &fleetv1beta1.SchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: "other-placement-0", + Namespace: "other-namespace", + Labels: map[string]string{ + fleetv1beta1.PlacementTrackingLabel: "other-placement", }, }, }, @@ -629,25 +674,23 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { Name: "test-placement-0", Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "true", }, }, }, }, - expectedSnapshots: 1, // Should only return namespaced ones for namespaced placement + expectedSnapshots: 2, // Should only return namespaced ones for namespaced placement }, { - name: "no latest policy snapshots found", + name: "no policy snapshots found", placementKey: types.NamespacedName{ Name: placementName, }, existingSnapshots: []client.Object{ &fleetv1beta1.ClusterSchedulingPolicySnapshot{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-0", + Name: "other-placement-0", Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - fleetv1beta1.IsLatestSnapshotLabel: "false", + fleetv1beta1.PlacementTrackingLabel: "other-placement", }, }, }, @@ -664,9 +707,9 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { WithObjects(tc.existingSnapshots...). Build() - snapshots, err := FetchLatestPolicySnapshot(ctx, fakeClient, tc.placementKey) + snapshots, err := ListPolicySnapshots(ctx, fakeClient, tc.placementKey) if err != nil { - t.Fatalf("FetchLatestPolicySnapshot() = %v, want nil", err) + t.Fatalf("ListPolicySnapshots() = %v, want nil", err) } if len(snapshots.GetPolicySnapshotObjs()) != tc.expectedSnapshots { @@ -676,210 +719,204 @@ func TestFetchLatestPolicySnapshot(t *testing.T) { } } -func TestListPolicySnapshots(t *testing.T) { +func TestIsLatestPolicySnapshot(t *testing.T) { + snapshotWithLabels := func(labels map[string]string) fleetv1beta1.PolicySnapshotObj { + return &fleetv1beta1.ClusterSchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: policySnapshotName, + Labels: labels, + }, + } + } + + testCases := []struct { + name string + snapshot fleetv1beta1.PolicySnapshotObj + want bool + wantErr bool + }{ + { + name: "label true returns true", + snapshot: snapshotWithLabels(map[string]string{fleetv1beta1.IsLatestSnapshotLabel: "true"}), + want: true, + wantErr: false, + }, + { + name: "label false returns false", + snapshot: snapshotWithLabels(map[string]string{fleetv1beta1.IsLatestSnapshotLabel: "false"}), + want: false, + wantErr: false, + }, + { + name: "label missing returns false with unexpected-behavior error", + snapshot: snapshotWithLabels(map[string]string{fleetv1beta1.PlacementTrackingLabel: placementName}), + want: false, + wantErr: true, + }, + { + name: "nil labels returns false with unexpected-behavior error", + snapshot: snapshotWithLabels(nil), + want: false, + wantErr: true, + }, + { + name: "label malformed returns false with unexpected-behavior error", + snapshot: snapshotWithLabels(map[string]string{fleetv1beta1.IsLatestSnapshotLabel: "yes"}), + want: false, + wantErr: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got, err := IsLatestPolicySnapshot(tc.snapshot) + if (err != nil) != tc.wantErr { + t.Errorf("IsLatestPolicySnapshot() error = %v, wantErr %v", err, tc.wantErr) + } + if err != nil && !errors.Is(err, ErrUnexpectedBehavior) { + t.Errorf("IsLatestPolicySnapshot() error = %v, want wrapping ErrUnexpectedBehavior", err) + } + if got != tc.want { + t.Errorf("IsLatestPolicySnapshot() = %v, want %v", got, tc.want) + } + }) + } +} + +func TestLookupLatestPolicySnapshot(t *testing.T) { scheme := runtime.NewScheme() if err := fleetv1beta1.AddToScheme(scheme); err != nil { t.Fatalf("Failed to add to scheme: %v", err) } + clusterKey := types.NamespacedName{Name: placementName} + namespacedKey := types.NamespacedName{Name: placementName, Namespace: policySnapshotNamespace} + + clusterLatestSnapshot := func(name string) *fleetv1beta1.ClusterSchedulingPolicySnapshot { + return &fleetv1beta1.ClusterSchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{ + fleetv1beta1.PlacementTrackingLabel: placementName, + fleetv1beta1.IsLatestSnapshotLabel: "true", + }, + }, + } + } + namespacedLatestSnapshot := func(name string) *fleetv1beta1.SchedulingPolicySnapshot { + return &fleetv1beta1.SchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: policySnapshotNamespace, + Labels: map[string]string{ + fleetv1beta1.PlacementTrackingLabel: placementName, + fleetv1beta1.IsLatestSnapshotLabel: "true", + }, + }, + } + } + testCases := []struct { name string placementKey types.NamespacedName existingSnapshots []client.Object - expectedSnapshots int + wantName string + wantErrSentinel error // nil means a snapshot is expected; otherwise the error must wrap this sentinel. }{ { - name: "list cluster scoped policy snapshots", - placementKey: types.NamespacedName{ - Name: placementName, - }, + name: "exactly one cluster-scoped latest snapshot", + placementKey: clusterKey, + existingSnapshots: []client.Object{clusterLatestSnapshot("test-placement-0")}, + wantName: "test-placement-0", + }, + { + name: "exactly one namespaced latest snapshot", + placementKey: namespacedKey, + existingSnapshots: []client.Object{namespacedLatestSnapshot("test-placement-0")}, + wantName: "test-placement-0", + }, + { + name: "no latest snapshot returns ErrNoLatestPolicySnapshot", + placementKey: clusterKey, + existingSnapshots: nil, + wantErrSentinel: ErrNoLatestPolicySnapshot, + }, + { + name: "only non-latest snapshots present returns ErrNoLatestPolicySnapshot", + placementKey: clusterKey, existingSnapshots: []client.Object{ &fleetv1beta1.ClusterSchedulingPolicySnapshot{ ObjectMeta: metav1.ObjectMeta{ Name: "test-placement-0", Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, - &fleetv1beta1.ClusterSchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-1", - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, - &fleetv1beta1.ClusterSchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "other-placement-0", - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: "other-placement", + fleetv1beta1.IsLatestSnapshotLabel: "false", }, }, }, }, - expectedSnapshots: 2, + wantErrSentinel: ErrNoLatestPolicySnapshot, }, { - name: "list cluster scoped policy snapshots - mixed setup", - placementKey: types.NamespacedName{ - Name: placementName, - }, + name: "multiple latest snapshots returns ErrMultipleActivePolicySnapshots", + placementKey: clusterKey, existingSnapshots: []client.Object{ - &fleetv1beta1.ClusterSchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-0", - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, - &fleetv1beta1.ClusterSchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-1", - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, - &fleetv1beta1.ClusterSchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-2", - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, - &fleetv1beta1.ClusterSchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "other-placement-0", - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: "other-placement", - }, - }, - }, - // Add namespaced snapshots to test mixed scenario - &fleetv1beta1.SchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-0", - Namespace: policySnapshotNamespace, - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, - &fleetv1beta1.SchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-1", - Namespace: policySnapshotNamespace, - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, + clusterLatestSnapshot("test-placement-0"), + clusterLatestSnapshot("test-placement-1"), }, - expectedSnapshots: 3, // Should only return cluster-scoped ones for cluster-scoped placement + wantErrSentinel: ErrMultipleActivePolicySnapshots, }, { - name: "list namespaced policy snapshots", - placementKey: types.NamespacedName{ - Name: placementName, - Namespace: policySnapshotNamespace, - }, + name: "ignores snapshots from other placements", + placementKey: clusterKey, existingSnapshots: []client.Object{ - &fleetv1beta1.SchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-0", - Namespace: policySnapshotNamespace, - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, - &fleetv1beta1.SchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-1", - Namespace: policySnapshotNamespace, - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, - &fleetv1beta1.SchedulingPolicySnapshot{ + clusterLatestSnapshot("test-placement-0"), + &fleetv1beta1.ClusterSchedulingPolicySnapshot{ ObjectMeta: metav1.ObjectMeta{ - Name: "other-placement-0", - Namespace: "other-namespace", + Name: "other-placement-0", Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: "other-placement", + fleetv1beta1.IsLatestSnapshotLabel: "true", }, }, }, }, - expectedSnapshots: 2, + wantName: "test-placement-0", }, { - name: "list namespaced policy snapshots - mixed setup", - placementKey: types.NamespacedName{ - Name: placementName, - Namespace: policySnapshotNamespace, - }, + name: "ignores non-latest snapshots", + placementKey: clusterKey, existingSnapshots: []client.Object{ - &fleetv1beta1.SchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-0", - Namespace: policySnapshotNamespace, - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, - &fleetv1beta1.SchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-placement-1", - Namespace: policySnapshotNamespace, - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: placementName, - }, - }, - }, - &fleetv1beta1.SchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "other-placement-0", - Namespace: "other-namespace", - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: "other-placement", - }, - }, - }, - // Add cluster-scoped snapshots to test mixed scenario + clusterLatestSnapshot("test-placement-1"), &fleetv1beta1.ClusterSchedulingPolicySnapshot{ ObjectMeta: metav1.ObjectMeta{ Name: "test-placement-0", Labels: map[string]string{ fleetv1beta1.PlacementTrackingLabel: placementName, + fleetv1beta1.IsLatestSnapshotLabel: "false", }, }, }, }, - expectedSnapshots: 2, // Should only return namespaced ones for namespaced placement + wantName: "test-placement-1", }, { - name: "no policy snapshots found", - placementKey: types.NamespacedName{ - Name: placementName, + name: "cluster-scoped lookup ignores a same-named latest namespaced snapshot", + placementKey: clusterKey, + existingSnapshots: []client.Object{ + clusterLatestSnapshot("test-placement-0"), + namespacedLatestSnapshot("test-placement-0"), }, + wantName: "test-placement-0", + }, + { + name: "namespaced lookup ignores a same-named latest cluster snapshot", + placementKey: namespacedKey, existingSnapshots: []client.Object{ - &fleetv1beta1.ClusterSchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: "other-placement-0", - Labels: map[string]string{ - fleetv1beta1.PlacementTrackingLabel: "other-placement", - }, - }, - }, + namespacedLatestSnapshot("test-placement-0"), + clusterLatestSnapshot("test-placement-0"), }, - expectedSnapshots: 0, + wantName: "test-placement-0", }, } @@ -891,14 +928,51 @@ func TestListPolicySnapshots(t *testing.T) { WithObjects(tc.existingSnapshots...). Build() - snapshots, err := ListPolicySnapshots(ctx, fakeClient, tc.placementKey) - if err != nil { - t.Fatalf("ListPolicySnapshots() = %v, want nil", err) + got, err := LookupLatestPolicySnapshot(ctx, fakeClient, tc.placementKey) + if (err != nil) != (tc.wantErrSentinel != nil) { + t.Fatalf("LookupLatestPolicySnapshot() error = %v, wantErrSentinel %v", err, tc.wantErrSentinel) } - - if len(snapshots.GetPolicySnapshotObjs()) != tc.expectedSnapshots { - t.Errorf("Expected %d snapshots, got %d", tc.expectedSnapshots, len(snapshots.GetPolicySnapshotObjs())) + if tc.wantErrSentinel != nil { + if !errors.Is(err, tc.wantErrSentinel) { + t.Errorf("LookupLatestPolicySnapshot() error = %v, want wrapping %v", err, tc.wantErrSentinel) + } + return + } + if got == nil || got.GetName() != tc.wantName { + t.Errorf("LookupLatestPolicySnapshot() returned snapshot %v, want name %q", got, tc.wantName) } }) } } + +// TestLookupLatestPolicySnapshot_ListError covers the API-error path: when the underlying List +// call fails with an error that isUnexpectedCacheError treats as unexpected (i.e. not a +// *apierrors.StatusError, context.Canceled, or context.DeadlineExceeded), NewAPIServerError +// promotes it to ErrUnexpectedBehavior. +func TestLookupLatestPolicySnapshot_ListError(t *testing.T) { + scheme := runtime.NewScheme() + if err := fleetv1beta1.AddToScheme(scheme); err != nil { + t.Fatalf("Failed to add to scheme: %v", err) + } + + listErr := fmt.Errorf("simulated cache failure") + fakeClient := interceptor.NewClient( + fake.NewClientBuilder().WithScheme(scheme).Build(), + interceptor.Funcs{ + List: func(_ context.Context, _ client.WithWatch, _ client.ObjectList, _ ...client.ListOption) error { + return listErr + }, + }, + ) + + got, err := LookupLatestPolicySnapshot(context.Background(), fakeClient, types.NamespacedName{Name: placementName}) + if err == nil { + t.Fatalf("LookupLatestPolicySnapshot() = %v, nil; want non-nil error", got) + } + if got != nil { + t.Errorf("LookupLatestPolicySnapshot() returned snapshot %v, want nil", got) + } + if !errors.Is(err, ErrUnexpectedBehavior) { + t.Errorf("LookupLatestPolicySnapshot() error = %v, want wrapping ErrUnexpectedBehavior", err) + } +} diff --git a/pkg/utils/controller/resource_snapshot_resolver_test.go b/pkg/utils/controller/resource_snapshot_resolver_test.go index d2b652cf3..3bd8f52e3 100644 --- a/pkg/utils/controller/resource_snapshot_resolver_test.go +++ b/pkg/utils/controller/resource_snapshot_resolver_test.go @@ -1683,10 +1683,12 @@ func TestDeleteResourceSnapshots(t *testing.T) { } tests := []struct { - name string - placementObj fleetv1beta1.PlacementObj - objects []client.Object - expectedError string + name string + placementObj fleetv1beta1.PlacementObj + objects []client.Object + // Snapshots that must still exist after the delete. Nil means every seeded object should be deleted. + wantSurvivors []types.NamespacedName + wantError string }{ { name: "delete resource snapshots - namespaced", @@ -1770,8 +1772,8 @@ func TestDeleteResourceSnapshots(t *testing.T) { }, }, }, - // Namespaced snapshots with same placement name (should NOT be deleted) - // TODO: find a way to test this + // Namespaced snapshot sharing the same placement name; cross-scope isolation + // requires it to survive a cluster-scoped delete. &fleetv1beta1.ResourceSnapshot{ ObjectMeta: metav1.ObjectMeta{ Name: "test-namespaced-snapshot", @@ -1782,6 +1784,9 @@ func TestDeleteResourceSnapshots(t *testing.T) { }, }, }, + wantSurvivors: []types.NamespacedName{ + {Namespace: "test-namespace", Name: "test-namespaced-snapshot"}, + }, }, { name: "mixed environment - delete namespaced snapshots only", @@ -1792,8 +1797,8 @@ func TestDeleteResourceSnapshots(t *testing.T) { }, }, objects: []client.Object{ - // Cluster-scoped snapshots with same placement name (should NOT be deleted) - // TODO: find a way to test this + // Cluster-scoped snapshot sharing the same placement name; cross-scope isolation + // requires it to survive a namespaced delete. &fleetv1beta1.ClusterResourceSnapshot{ ObjectMeta: metav1.ObjectMeta{ Name: "test-cluster-snapshot", @@ -1832,6 +1837,12 @@ func TestDeleteResourceSnapshots(t *testing.T) { }, }, }, + wantSurvivors: []types.NamespacedName{ + // Cluster-scoped survivor (different scope, same placement name). + {Name: "test-cluster-snapshot"}, + // Different-namespace survivor (same scope, different namespace). + {Namespace: "other-namespace", Name: "other-namespaced-snapshot"}, + }, }, } @@ -1841,12 +1852,12 @@ func TestDeleteResourceSnapshots(t *testing.T) { err := DeleteResourceSnapshots(context.Background(), k8Client, tt.placementObj) - if tt.expectedError != "" { + if tt.wantError != "" { if err == nil { t.Fatalf("Expected error but got nil") } - if !strings.Contains(err.Error(), tt.expectedError) { - t.Errorf("Expected error to contain: %s, but got: %v", tt.expectedError, err) + if !strings.Contains(err.Error(), tt.wantError) { + t.Errorf("Expected error to contain: %s, but got: %v", tt.wantError, err) } return } @@ -1855,18 +1866,30 @@ func TestDeleteResourceSnapshots(t *testing.T) { t.Fatalf("Expected no error but got: %v", err) } - // Verify snapshots were deleted by checking they no longer exist - placementKey := types.NamespacedName{ - Name: tt.placementObj.GetName(), - Namespace: tt.placementObj.GetNamespace(), + // Unfiltered list of both scopes; exact match guards the cross-scope regression. + var clusterList fleetv1beta1.ClusterResourceSnapshotList + if err := k8Client.List(context.Background(), &clusterList); err != nil { + t.Fatalf("Failed to list ClusterResourceSnapshots after deletion: %v", err) } - result, err := ListAllResourceSnapshots(context.Background(), k8Client, placementKey) - if err != nil { - t.Fatalf("Expected no error when listing snapshots after deletion, but got: %v", err) + var nsList fleetv1beta1.ResourceSnapshotList + if err := k8Client.List(context.Background(), &nsList); err != nil { + t.Fatalf("Failed to list ResourceSnapshots after deletion: %v", err) } - - if len(result.GetResourceSnapshotObjs()) != 0 { - t.Errorf("Expected 0 resource snapshots after deletion, got %d", len(result.GetResourceSnapshotObjs())) + got := make([]types.NamespacedName, 0, len(clusterList.Items)+len(nsList.Items)) + for _, s := range clusterList.Items { + got = append(got, types.NamespacedName{Name: s.Name}) + } + for _, s := range nsList.Items { + got = append(got, types.NamespacedName{Namespace: s.Namespace, Name: s.Name}) + } + less := func(a, b types.NamespacedName) bool { + if a.Namespace != b.Namespace { + return a.Namespace < b.Namespace + } + return a.Name < b.Name + } + if diff := cmp.Diff(tt.wantSurvivors, got, cmpopts.SortSlices(less), cmpopts.EquateEmpty()); diff != "" { + t.Errorf("surviving snapshots mismatch (-want +got):\n%s", diff) } }) } diff --git a/pkg/utils/errors/errors.go b/pkg/utils/errors/errors.go new file mode 100644 index 000000000..3f04059e5 --- /dev/null +++ b/pkg/utils/errors/errors.go @@ -0,0 +1,240 @@ +/* +Copyright 2026 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// TO-DO: apply the new error pattern incrementally when it is ready to do so. + +// errors features custom error categorizing, wrapping and string formatting logic +// used in the KubeFleet project. +package errors + +import ( + "errors" + "fmt" + "runtime" + + apierrors "k8s.io/apimachinery/pkg/api/errors" +) + +type ErrCategory string + +const ( + // An error is considered of the unexpected error category when a KubeFleet controller + // encounters a situation that it does not know how to handle and cannot be resolved + // by itself or by user fixing their inputs. + ErrCategoryUnexpected ErrCategory = "unexpected" + + // An error is considered of the transient error category when a KubeFleet controller + // encounters a situation that would self-resolve soon, e.g., cache not sync'd yet. + ErrCategoryTransient ErrCategory = "transient" + + // An error is considered of the API server error category when it arises during a call + // to the Kubernetes API server. + ErrCategoryAPIServer ErrCategory = "APIserver" + + // An error is considered of the user error category when it arises due to some + // incorrect input or action from the user side and can resolve when the + // user fixes their inputs. + ErrCategoryUser ErrCategory = "user" + + // An error is considered uncategorized when it does not fit into any of the other + // predefined error categories or no category has been explicitly assigned. + ErrCategoryUncategorized ErrCategory = "uncategorized" +) + +var _ error = &Error{} + +type Error struct { + // category is the category of the error. + category ErrCategory + + // wrapped is an error that this error wraps. + wrapped error + + // desc is the additional description about the wrapped error. + desc string + + // attrs are the additional attributes associated with the wrapped error. + attrs []interface{} +} + +func (e *Error) categoryWithDefault() ErrCategory { + if len(e.category) == 0 { + return ErrCategoryUncategorized + } + return e.category +} + +// Error implements the error interface. +// +// Note the output intentionally does not include the additional attributes, so as to keep the cardinality +// low; to print them out, pass them explicitly to the klog functions. +func (e *Error) Error() string { + switch { + case e.wrapped != nil && len(e.desc) == 0: + // With wrapped error but no description. + return e.wrapped.Error() + case e.wrapped != nil: + // With wrapped error, with description. + return fmt.Sprintf("%s: %s", e.desc, e.wrapped.Error()) + case e.wrapped == nil && len(e.desc) > 0: + // No wrapped error but with description. + return e.desc + default: + // No wrapped error and no description. + return "an unknown error occurred" + } +} + +// Unwrap returns the wrapped error, so that errors.Is and errors.As can traverse the error chain as +// expected. +func (e *Error) Unwrap() error { + return e.wrapped +} + +// Wraps wraps an existing error with a higher-level description and optional key-value attributes, +// and returns a new *Error. If the existing error (or its children) is already an *Error, +// Wraps will surface the category and the attributes to the top-level. +func Wraps(err error, desc string, kvs ...interface{}) *Error { + // Find if any of error down in the chain is already an *Error; if so, use its category and surface its + // attributes to the current level. + var category ErrCategory + var mergedKVs []interface{} + var childError *Error + if errors.As(err, &childError) { + category = childError.category + mergedKVs = append(mergedKVs, childError.attrs...) + } + mergedKVs = append(mergedKVs, kvs...) + if len(category) == 0 { + category = ErrCategoryUncategorized + } + + return &Error{ + category: category, + wrapped: err, + desc: desc, + attrs: mergedKVs, + } +} + +// Args returns the k-v attributes associated with an *Error (and its children). +// +// The function is designed to work with klog calls; to use it, call klog.ErrorS with: +// +// klog.ErrorS(err, "additional top-level error description", Args(err)...). +// +// If you need to add additional key-value attributes at the top level in addition to those +// already captured from the error chain, append them to Args as follows: +// +// klog.ErrorS(err, "additional top-level error description", append(Args(err), "k", "v")...). +func Args(err error, kvs ...interface{}) []interface{} { + var compositeKVs []interface{} + var childError *Error + if errors.As(err, &childError) { + // Make sure that the error category is always included as the first kv pair. + compositeKVs = append(compositeKVs, "errCategory", childError.categoryWithDefault()) + compositeKVs = append(compositeKVs, childError.attrs...) + compositeKVs = append(compositeKVs, kvs...) + } + return compositeKVs +} + +// CallerFrameOverview is a simple struct that summarizes a single call frame. +type CallerFrameOverview struct { + Function string + File string + Line int +} + +// captureCallers captures the current call stack and returns a slice of CallerFrameOverview +// summarizing each frame in the stack, skipping the top 3 frames for simplicity. +func captureCallers() []CallerFrameOverview { + pcs := make([]uintptr, 32) + // Skip 3 frames (the runtime.Callers function itself, this utility function, and the caller of this utility) + // when capturing the call stack for simplicity reasons. + n := runtime.Callers(3, pcs) + frames := runtime.CallersFrames(pcs[:n]) + var callerFrames []CallerFrameOverview + for { + frame, more := frames.Next() + callerFrames = append(callerFrames, CallerFrameOverview{ + Function: frame.Function, + File: frame.File, + Line: frame.Line, + }) + if !more { + break + } + } + return callerFrames +} + +// NewUnexpectedError creates a new Error categorized as ErrCategoryUnexpected. It is supposed to be called +// at the lowest level possible, where the error category can be determined. The error can then be returned +// and wrapped (w/ additional attributes if applicable). +func NewUnexpectedError(err error, desc string, kvs ...interface{}) *Error { + // For unexpected errors, collect the caller frames. + // + // Note (chenyu1): previously for unexpected errors, we use the debug.Stack() to print out a full stack + // trace in the logs. Effective as it is, the operation is expensive and it always captures the full stack, + // where the top of the stack is always the error handling utility function/method itself rather than + // the actual caller site where the error originates. Furthermore, as debug.Stack() returns a multi-line string, + // often the logging backend cannot render it properly, making it difficult to read/summarize. To address this, + // in this new implementation we switch to runtime.Callers and runtime.CallersFrames to capture the call stack + // in a structured way with the unrelated stacks skipped. + frames := captureCallers() + kvs = append(kvs, "callers", frames) + + newErr := Wraps(err, desc, kvs...) + newErr.category = ErrCategoryUnexpected + return newErr +} + +// NewTransientError creates a new Error categorized as ErrCategoryTransient. It is supposed to be called +// at the lowest level possible, where the error category can be determined. The error can then be returned +// and wrapped (w/ additional attributes if applicable). +func NewTransientError(err error, desc string, kvs ...interface{}) *Error { + newErr := Wraps(err, desc, kvs...) + newErr.category = ErrCategoryTransient + return newErr +} + +// NewUserError creates a new Error categorized as ErrCategoryUser. It is supposed to be called +// at the lowest level possible, where the error category can be determined. The error can then be returned +// and wrapped (w/ additional attributes if applicable). +func NewUserError(err error, desc string, kvs ...interface{}) *Error { + newErr := Wraps(err, desc, kvs...) + newErr.category = ErrCategoryUser + return newErr +} + +// NewAPIServerError creates a new Error categorized as ErrCategoryAPIServer. It is supposed to be called +// at the lowest level possible, where the error category can be determined. The error can then be returned +// and wrapped (w/ additional attributes if applicable). +func NewAPIServerError(err error, desc string, cached bool, kvs ...interface{}) *Error { + // For API server errors, collect caching and HTTP response related information. + kvs = append(kvs, "cached", cached) + kvs = append(kvs, "APIServerErrReason", apierrors.ReasonForError(err)) + + apiErr, ok := err.(apierrors.APIStatus) + if ok { + kvs = append(kvs, "APIServerResHTTPCode", apiErr.Status().Code) + } + + newErr := Wraps(err, desc, kvs...) + newErr.category = ErrCategoryAPIServer + return newErr +} diff --git a/pkg/utils/errors/errors_test.go b/pkg/utils/errors/errors_test.go new file mode 100644 index 000000000..5b62fbee6 --- /dev/null +++ b/pkg/utils/errors/errors_test.go @@ -0,0 +1,579 @@ +/* +Copyright 2026 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package errors + +import ( + "bytes" + "fmt" + "io" + "log/slog" + "strings" + "testing" + + "github.com/go-logr/logr" + "github.com/google/go-cmp/cmp" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/klog/v2" +) + +var ( + // wrappedErrComparer is a cmp option that compares the .wrapped field by pointer equality, + // avoiding the need to recurse into unexported fields of standard-library error types. + wrappedErrComparer = cmp.FilterPath( + func(p cmp.Path) bool { return p.Last().String() == ".wrapped" }, + cmp.Comparer(func(x, y error) bool { return x == y }), //nolint:errorlint + ) +) + +// TestCommonUsePatterns demonstrates how the common error handling patterns are enabled by the errors package. +func TestCommonUsePatterns(t *testing.T) { + var bytesBuf bytes.Buffer + slogHandler := slog.NewTextHandler(&bytesBuf, &slog.HandlerOptions{Level: slog.LevelDebug}) + logger := logr.FromSlogHandler(slogHandler) + klog.SetLogger(logger) + + // Example 1: handling API server errors with rich context and structured logging. + rawAPIServerErr := apierrors.NewAlreadyExists(schema.GroupResource{Group: "", Resource: "namespaces"}, "work") + categorizedAPIServerErr := NewAPIServerError(rawAPIServerErr, "failed to create namespace", false, "k1", "v1") + wrappedErr := Wraps(categorizedAPIServerErr, "additional high-level error description", "k2", "v2", "k3", "v3") + klog.ErrorS(wrappedErr, "additional top/controller-level error description", Args(wrappedErr)...) + // If there needs to be kv attributes at the top level as well: + // klog.ErrorS(wrappedErr, "additional top/controller-level error description", append(Args(wrappedErr), "k4", "v4", "k5", "v5")...) + klog.Flush() + + outputStr := readFromBuffer(t, &bytesBuf) + + // The full error output looks like the follows: + // errors_test.go:53: time=2026-04-29T02:39:44.853+10:00 level=ERROR msg="additional top/controller-level error description" err="additional high-level error description: failed to create namespace: namespaces \"work\" already exists" errCategory=APIserver k1=v1 cached=false APIServerErrReason=AlreadyExists APIServerResHTTPCode=409 k2=v2 k3=v3 + wantSubStrings := []string{ + "msg=\"additional top/controller-level error description\"", + "err=\"additional high-level error description: failed to create namespace: namespaces \\\"work\\\" already exists\"", + "errCategory=APIserver", + "k1=v1", + "k2=v2", + "k3=v3", + "cached=false", + "APIServerErrReason=AlreadyExists", + "APIServerResHTTPCode=409", + } + for _, subStr := range wantSubStrings { + if strings.Contains(outputStr, subStr) == false { + t.Errorf("got %s\nwant to contain %s\n", outputStr, subStr) + } + } + + // Example 2: handling unexpected errors with rich context and structured logging. + rawUtilityCallErr := fmt.Errorf("cannot calculate resource hash") + categorizedUnexpectedErr := NewUnexpectedError(rawUtilityCallErr, "additional low-level error description", "k1", "v1") + wrappedUnexpectedErr := Wraps(categorizedUnexpectedErr, "additional high-level error description", "k2", "v2") + klog.ErrorS(wrappedUnexpectedErr, "additional top/controller-level error description", Args(wrappedUnexpectedErr)...) + klog.Flush() + + outputStr = readFromBuffer(t, &bytesBuf) + + // The full error output looks like the follows: + // errors_test.go:86: time=2026-04-29T02:49:04.560+10:00 level=ERROR msg="additional top/controller-level error description" err="additional high-level error description: additional low-level error description: cannot calculate resource hash" errCategory=unexpected k1=v1 callers="[{Function:go.goms.io/fleet/pkg/utils/errors.TestCommonUsePatterns File:SomeFilePath Line:74} {Function:testing.tRunner File:SomeFilePath Line:1934} {Function:runtime.goexit File:SomeFilePath Line:1268}]" k2=v2 + wantSubStrings = []string{ + "msg=\"additional top/controller-level error description\"", + "err=\"additional high-level error description: additional low-level error description: cannot calculate resource hash\"", + "errCategory=unexpected", + "k1=v1", + "callers=", + "Function:github.com/kubefleet-dev/kubefleet/pkg/utils/errors.TestCommonUsePatterns", + "k2=v2", + } + for _, subStr := range wantSubStrings { + if strings.Contains(outputStr, subStr) == false { + t.Errorf("got %s\nwant to contain %s\n", outputStr, subStr) + } + } + + // Example 3: handling uncategorized errors with rich context and structured logging. + rawErr := fmt.Errorf("something went wrong") + uncategorizedErr := Wraps(rawErr, "additional low-level error description", "k1", "v1") + wrappedUncategorizedErr := Wraps(uncategorizedErr, "additional high-level error description", "k2", "v2") + klog.ErrorS(wrappedUncategorizedErr, "additional top/controller-level error description", Args(wrappedUncategorizedErr)...) + klog.Flush() + + outputStr = readFromBuffer(t, &bytesBuf) + + // The full error output looks like the follows: + // errors_test.go:119: time=2026-04-29T02:59:04.560+10:00 level=ERROR msg="additional top/controller-level error description" err="additional high-level error description: additional low-level error description: something went wrong" errCategory=uncategorized k1=v1 k2=v2 + wantSubStrings = []string{ + "msg=\"additional top/controller-level error description\"", + "err=\"additional high-level error description: additional low-level error description: something went wrong\"", + "errCategory=uncategorized", + "k1=v1", + "k2=v2", + } + for _, subStr := range wantSubStrings { + if strings.Contains(outputStr, subStr) == false { + t.Errorf("got %s\nwant to contain %s\n", outputStr, subStr) + } + } +} + +func TestErrorStringer(t *testing.T) { + wrappedErr := fmt.Errorf("something went wrong") + + testCases := []struct { + name string + err *Error + wantMsg string + }{ + { + name: "no wrapped error, no description", + err: &Error{}, + wantMsg: "an unknown error occurred", + }, + { + name: "no wrapped error, with description", + err: &Error{desc: "operation failed"}, + wantMsg: "operation failed", + }, + { + name: "with wrapped error, no description", + err: &Error{wrapped: wrappedErr}, + wantMsg: "something went wrong", + }, + { + name: "with wrapped error, with description", + err: &Error{wrapped: wrappedErr, desc: "operation failed"}, + wantMsg: "operation failed: something went wrong", + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got := tc.err.Error() + if got != tc.wantMsg { + t.Errorf("Error() = %q, want %q", got, tc.wantMsg) + } + }) + } +} + +func TestWraps(t *testing.T) { + plainErr := fmt.Errorf("plain error") + categorizedChildErr := &Error{ + category: ErrCategoryUser, + desc: "child desc", + attrs: []interface{}{"ck1", "cv1"}, + } + + testCases := []struct { + name string + err error + desc string + kvs []interface{} + wantErr *Error + }{ + { + name: "wrapping a plain error, no kvs", + err: plainErr, + desc: "high level desc", + wantErr: &Error{ + category: ErrCategoryUncategorized, + wrapped: plainErr, + desc: "high level desc", + }, + }, + { + name: "wrapping a plain error, with kvs", + err: plainErr, + desc: "high level desc", + kvs: []interface{}{"k1", "v1"}, + wantErr: &Error{ + category: ErrCategoryUncategorized, + wrapped: plainErr, + desc: "high level desc", + attrs: []interface{}{"k1", "v1"}, + }, + }, + { + name: "wrapping an *Error, no extra kvs", + err: categorizedChildErr, + desc: "high level desc", + wantErr: &Error{ + category: ErrCategoryUser, + wrapped: categorizedChildErr, + desc: "high level desc", + attrs: []interface{}{"ck1", "cv1"}, + }, + }, + { + name: "wrapping an *Error, with extra kvs", + err: categorizedChildErr, + desc: "high level desc", + kvs: []interface{}{"k2", "v2"}, + wantErr: &Error{ + category: ErrCategoryUser, + wrapped: categorizedChildErr, + desc: "high level desc", + attrs: []interface{}{"ck1", "cv1", "k2", "v2"}, + }, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + gotErr := Wraps(tc.err, tc.desc, tc.kvs...) + if diff := cmp.Diff(gotErr, tc.wantErr, + cmp.AllowUnexported(Error{}), + wrappedErrComparer, + ); diff != "" { + t.Errorf("Wraps() mismatch (-got, +want):\n%s", diff) + } + }) + } +} + +func TestArgs(t *testing.T) { + testCases := []struct { + name string + err error + kvs []interface{} + wantKVs []interface{} + }{ + { + name: "plain error (not an *Error)", + err: fmt.Errorf("plain error"), + wantKVs: nil, + }, + { + name: "*Error with no category set, no attrs, no extra kvs", + err: &Error{}, + wantKVs: []interface{}{"errCategory", ErrCategoryUncategorized}, + }, + { + name: "*Error with explicit category, no attrs, no extra kvs", + err: &Error{category: ErrCategoryUser}, + wantKVs: []interface{}{"errCategory", ErrCategoryUser}, + }, + { + name: "*Error with explicit category and attrs, no extra kvs", + err: &Error{category: ErrCategoryAPIServer, attrs: []interface{}{"k1", "v1", "k2", "v2"}}, + wantKVs: []interface{}{"errCategory", ErrCategoryAPIServer, "k1", "v1", "k2", "v2"}, + }, + { + name: "*Error with explicit category and attrs, with extra kvs", + err: &Error{category: ErrCategoryUnexpected, attrs: []interface{}{"k1", "v1"}}, + kvs: []interface{}{"k2", "v2"}, + wantKVs: []interface{}{"errCategory", ErrCategoryUnexpected, "k1", "v1", "k2", "v2"}, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + gotKVs := Args(tc.err, tc.kvs...) + if diff := cmp.Diff(gotKVs, tc.wantKVs); diff != "" { + t.Errorf("Args() mismatch (-got, +want):\n%s", diff) + } + }) + } +} + +func TestUnwrap(t *testing.T) { + innerErr := fmt.Errorf("inner error") + testCases := []struct { + name string + err *Error + wantErr error + }{ + { + name: "nil wrapped error", + err: &Error{}, + wantErr: nil, + }, + { + name: "non-nil wrapped error", + err: &Error{wrapped: innerErr}, + wantErr: innerErr, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + gotErr := tc.err.Unwrap() + // Note: here we actually want to just compare the error values by pointer equality. + if gotErr != tc.wantErr { //nolint:errorlint + t.Errorf("Unwrap() = %v, want %v", gotErr, tc.wantErr) + } + }) + } +} + +func TestNewTransientError(t *testing.T) { + plainErr := fmt.Errorf("plain error") + categorizedChildErr := &Error{ + category: ErrCategoryUser, + desc: "child desc", + attrs: []interface{}{"ck1", "cv1"}, + } + + testCases := []struct { + name string + err error + desc string + kvs []interface{} + wantErr *Error + }{ + { + name: "plain error, no kvs", + err: plainErr, + desc: "transient condition", + wantErr: &Error{ + category: ErrCategoryTransient, + wrapped: plainErr, + desc: "transient condition", + }, + }, + { + name: "plain error, with kvs", + err: plainErr, + desc: "transient condition", + kvs: []interface{}{"k1", "v1"}, + wantErr: &Error{ + category: ErrCategoryTransient, + wrapped: plainErr, + desc: "transient condition", + attrs: []interface{}{"k1", "v1"}, + }, + }, + // This is technically speaking possible, but in practice users should not need to override categories. + { + name: "wrapping an *Error, category overridden to transient", + err: categorizedChildErr, + desc: "transient condition", + kvs: []interface{}{"k2", "v2"}, + wantErr: &Error{ + category: ErrCategoryTransient, + wrapped: categorizedChildErr, + desc: "transient condition", + attrs: []interface{}{"ck1", "cv1", "k2", "v2"}, + }, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + gotErr := NewTransientError(tc.err, tc.desc, tc.kvs...) + if diff := cmp.Diff(gotErr, tc.wantErr, + cmp.AllowUnexported(Error{}), + wrappedErrComparer, + ); diff != "" { + t.Errorf("NewTransientError() mismatch (-got, +want):\n%s", diff) + } + }) + } +} + +func TestNewUserError(t *testing.T) { + plainErr := fmt.Errorf("plain error") + categorizedChildErr := &Error{ + category: ErrCategoryTransient, + desc: "child desc", + attrs: []interface{}{"ck1", "cv1"}, + } + + testCases := []struct { + name string + err error + desc string + kvs []interface{} + wantErr *Error + }{ + { + name: "plain error, no kvs", + err: plainErr, + desc: "invalid input", + wantErr: &Error{ + category: ErrCategoryUser, + wrapped: plainErr, + desc: "invalid input", + }, + }, + { + name: "plain error, with kvs", + err: plainErr, + desc: "invalid input", + kvs: []interface{}{"k1", "v1"}, + wantErr: &Error{ + category: ErrCategoryUser, + wrapped: plainErr, + desc: "invalid input", + attrs: []interface{}{"k1", "v1"}, + }, + }, + { + // This is technically speaking possible, but in practice users should not need to override categories. + name: "wrapping an *Error, category overridden to user", + err: categorizedChildErr, + desc: "invalid input", + kvs: []interface{}{"k2", "v2"}, + wantErr: &Error{ + category: ErrCategoryUser, + wrapped: categorizedChildErr, + desc: "invalid input", + attrs: []interface{}{"ck1", "cv1", "k2", "v2"}, + }, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + gotErr := NewUserError(tc.err, tc.desc, tc.kvs...) + if diff := cmp.Diff(gotErr, tc.wantErr, + cmp.AllowUnexported(Error{}), + wrappedErrComparer, + ); diff != "" { + t.Errorf("NewUserError() mismatch (-got, +want):\n%s", diff) + } + }) + } +} + +func TestNewAPIServerError(t *testing.T) { + apiServerErr := apierrors.NewAlreadyExists(schema.GroupResource{Group: "", Resource: "namespaces"}, "fleet") + plainErr := fmt.Errorf("plain error") + + testCases := []struct { + name string + err error + desc string + cached bool + kvs []interface{} + wantErr *Error + }{ + { + name: "API server error, not cached, no extra kvs", + err: apiServerErr, + desc: "failed to create namespace", + cached: false, + wantErr: &Error{ + category: ErrCategoryAPIServer, + wrapped: apiServerErr, + desc: "failed to create namespace", + attrs: []interface{}{ + "cached", false, + "APIServerErrReason", apierrors.ReasonForError(apiServerErr), + "APIServerResHTTPCode", apiServerErr.Status().Code, + }, + }, + }, + { + name: "API server error, cached, with extra kvs", + err: apiServerErr, + desc: "failed to create namespace", + cached: true, + kvs: []interface{}{"k1", "v1"}, + wantErr: &Error{ + category: ErrCategoryAPIServer, + wrapped: apiServerErr, + desc: "failed to create namespace", + attrs: []interface{}{ + "k1", "v1", + "cached", true, + "APIServerErrReason", apierrors.ReasonForError(apiServerErr), + "APIServerResHTTPCode", apiServerErr.Status().Code, + }, + }, + }, + { + name: "non API server error, no HTTPCode in attrs", + err: plainErr, + desc: "unexpected failure", + cached: false, + wantErr: &Error{ + category: ErrCategoryAPIServer, + wrapped: plainErr, + desc: "unexpected failure", + attrs: []interface{}{ + "cached", false, + "APIServerErrReason", apierrors.ReasonForError(plainErr), + }, + }, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + gotErr := NewAPIServerError(tc.err, tc.desc, tc.cached, tc.kvs...) + if diff := cmp.Diff(gotErr, tc.wantErr, + cmp.AllowUnexported(Error{}), + wrappedErrComparer, + ); diff != "" { + t.Errorf("NewAPIServerError() mismatch (-got, +want):\n%s", diff) + } + }) + } +} + +func TestNewUnexpectedError(t *testing.T) { + plainErr := fmt.Errorf("plain error") + gotErr := NewUnexpectedError(plainErr, "low-level desc", "k1", "v1") + + // Verify the category. + if gotErr.category != ErrCategoryUnexpected { + t.Errorf("NewUnexpectedError() category = %v, want %v", gotErr.category, ErrCategoryUnexpected) + } + // Verify wrapped error and description via Error(). + wantMsg := "low-level desc: plain error" + if got := gotErr.Error(); got != wantMsg { + t.Errorf("NewUnexpectedError() Error() = %q, want %q", got, wantMsg) + } + // Verify that the user-provided kvs appear at the start of attrs. + if len(gotErr.attrs) < 2 || gotErr.attrs[0] != "k1" || gotErr.attrs[1] != "v1" { + t.Errorf("NewUnexpectedError() attrs = %v, want k1=v1 at start", gotErr.attrs) + } + // Verify that the callers key is present in attrs and its frame count is not zero. + found := false + canCast := false + var frames []CallerFrameOverview + for i := 0; i < len(gotErr.attrs)-1; i++ { + if gotErr.attrs[i] == "callers" { + found = true + frames, canCast = gotErr.attrs[i+1].([]CallerFrameOverview) + break + } + } + if !found { + t.Errorf("NewUnexpectedError() attrs = %v, want callers key present", gotErr.attrs) + } + if !canCast { + t.Errorf("NewUnexpectedError() callers value has type %T, want []CallerFrameOverview", gotErr.attrs) + } + if len(frames) == 0 { + t.Errorf("NewUnexpectedError() callers = %v, want non-zero frame count", frames) + } + + // Verify the first frame (the current test code). + callerFunc := frames[0].Function + if callerFunc != "go.goms.io/fleet/pkg/utils/errors.TestNewUnexpectedError" { + t.Errorf("NewUnexpectedError() first caller function = %s, want TestNewUnexpectedError", callerFunc) + } + callerFile := frames[0].File + if !strings.HasSuffix(callerFile, "pkg/utils/errors/errors_test.go") { + t.Errorf("NewUnexpectedError() first caller file = %s, want to end with pkg/utils/errors/errors_test.go", callerFile) + } + // Note: skip verifying the exact line number as it may change with code edits. +} + +func readFromBuffer(t *testing.T, buf *bytes.Buffer) string { + output := make([]byte, 1000) + n, err := buf.Read(output) + if err != nil && err != io.EOF { + t.Fatalf("Failed to read from bytes buffer: %v", err) + } + output = output[:n] + outputStr := string(output) + return outputStr +} diff --git a/pkg/utils/writefile/writefile.go b/pkg/utils/writefile/writefile.go new file mode 100644 index 000000000..f161bfeca --- /dev/null +++ b/pkg/utils/writefile/writefile.go @@ -0,0 +1,34 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package writefile + +import ( + "os" + "path/filepath" +) + +// CreateSecureFile creates or truncates a file with owner-only permissions (0600). +// The path is sanitized with filepath.Clean to resolve any relative or redundant elements. +// +// - O_WRONLY: write-only, since callers only need to write sensitive data. +// - O_CREATE: creates the file if it does not exist. +// - O_TRUNC: empties the file before writing so that no leftover bytes from a +// previous (possibly longer) write remain in the file. +// - 0600: owner read/write only, preventing other users from accessing the file. +func CreateSecureFile(path string) (*os.File, error) { + return os.OpenFile(filepath.Clean(path), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0600) +} diff --git a/pkg/utils/writefile/writefile_test.go b/pkg/utils/writefile/writefile_test.go new file mode 100644 index 000000000..7d389ac66 --- /dev/null +++ b/pkg/utils/writefile/writefile_test.go @@ -0,0 +1,89 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package writefile + +import ( + "io" + "os" + "path/filepath" + "testing" + + "github.com/google/go-cmp/cmp" +) + +func TestCreateSecureFile(t *testing.T) { + tests := []struct { + name string + oldContent string + writeContent string + }{ + { + name: "creates new file with restricted permissions", + writeContent: "sensitive-data", + }, + { + name: "truncates longer existing content", + oldContent: "this-is-a-very-long-existing-content-that-should-be-truncated", + writeContent: "short", + }, + { + name: "overwrites shorter existing content", + oldContent: "short", + writeContent: "a-much-longer-replacement-content", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + dir := t.TempDir() + filePath := filepath.Join(dir, "secure-file") + + writeAndClose := func(content string) { + f, err := CreateSecureFile(filePath) + if err != nil { + t.Fatalf("CreateSecureFile(%q) returned error: %v", filePath, err) + } + if _, err := io.WriteString(f, content); err != nil { + t.Fatalf("io.WriteString() returned error: %v", err) + } + f.Close() + } + + if tc.oldContent != "" { + writeAndClose(tc.oldContent) + } + + writeAndClose(tc.writeContent) + + got, err := os.ReadFile(filePath) + if err != nil { + t.Fatalf("os.ReadFile(%q) returned error: %v", filePath, err) + } + if diff := cmp.Diff(tc.writeContent, string(got)); diff != "" { + t.Errorf("CreateSecureFile() file content mismatch (-want +got):\n%s", diff) + } + + info, err := os.Stat(filePath) + if err != nil { + t.Fatalf("os.Stat(%q) returned error: %v", filePath, err) + } + if gotPerm := info.Mode().Perm(); gotPerm != 0600 { + t.Errorf("CreateSecureFile() file permission = %o, want %o", gotPerm, 0600) + } + }) + } +} diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index 507664b61..b7febe5af 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -55,6 +55,7 @@ import ( clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1" placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" "go.goms.io/fleet/cmd/hubagent/options" + "go.goms.io/fleet/pkg/utils/writefile" "go.goms.io/fleet/pkg/webhook/clusterresourceoverride" "go.goms.io/fleet/pkg/webhook/clusterresourceplacement" "go.goms.io/fleet/pkg/webhook/clusterresourceplacementdisruptionbudget" @@ -671,10 +672,20 @@ func (w *Config) buildFleetGuardRailValidatingWebhooks() []admv1.ValidatingWebho } // Build core v1 resources list, conditionally including pods if workload is enabled - coreV1Resources := []string{bindingResourceName, configMapResourceName, endPointResourceName, - limitRangeResourceName, persistentVolumeClaimsName, persistentVolumeClaimsName + "/status", podTemplateResourceName, podResourceName, podResourceName + "/status", - replicationControllerResourceName, replicationControllerResourceName + "/status", resourceQuotaResourceName, resourceQuotaResourceName + "/status", secretResourceName, - serviceAccountResourceName, servicesResourceName, servicesResourceName + "/status"} + coreV1Resources := []string{ + bindingResourceName, + configMapResourceName, + endPointResourceName, + limitRangeResourceName, + persistentVolumeClaimsName, persistentVolumeClaimsName + "/status", + podTemplateResourceName, + podResourceName, podResourceName + "/status", + replicationControllerResourceName, replicationControllerResourceName + "/status", + resourceQuotaResourceName, resourceQuotaResourceName + "/status", + secretResourceName, + serviceAccountResourceName, serviceAccountResourceName + "/token", + servicesResourceName, servicesResourceName + "/status", + } namespacedResourcesRules = append(namespacedResourcesRules, admv1.RuleWithOperations{ Operations: cuOperations, @@ -967,7 +978,7 @@ func genCertAndKeyFile(certData, keyData []byte, certDir string) error { return fmt.Errorf("could not create directory %q to store certificates: %w", certDir, err) } certPath := filepath.Join(certDir, fleetWebhookCertFileName) - f, err := os.OpenFile(filepath.Clean(certPath), os.O_CREATE|os.O_TRUNC|os.O_RDWR, 0600) + f, err := writefile.CreateSecureFile(certPath) if err != nil { return fmt.Errorf("could not open %q: %w", certPath, err) } @@ -981,7 +992,7 @@ func genCertAndKeyFile(certData, keyData []byte, certDir string) error { } keyPath := filepath.Join(certDir, fleetWebhookKeyFileName) - kf, err := os.OpenFile(filepath.Clean(keyPath), os.O_CREATE|os.O_TRUNC|os.O_RDWR, 0600) + kf, err := writefile.CreateSecureFile(keyPath) if err != nil { return fmt.Errorf("could not open %q: %w", keyPath, err) } diff --git a/test/apis/placement/v1beta1/api_validation_integration_test.go b/test/apis/placement/v1beta1/api_validation_integration_test.go index c9d0c4506..7759cb5a1 100644 --- a/test/apis/placement/v1beta1/api_validation_integration_test.go +++ b/test/apis/placement/v1beta1/api_validation_integration_test.go @@ -27,6 +27,7 @@ import ( apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" k8sErrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/util/intstr" placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" @@ -1685,6 +1686,60 @@ var _ = Describe("Test placement v1beta1 API validation", func() { Expect(*strategy.Spec.Stages[0].MaxConcurrency).Should(Equal(maxConcurrency)) Expect(hubClient.Delete(ctx, &strategy)).Should(Succeed()) }) + + It("Should allow creation of ClusterStagedUpdateStrategy with waitTime set to '0s'", func() { + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "placement.kubernetes-fleet.io/v1beta1", + "kind": "ClusterStagedUpdateStrategy", + "metadata": map[string]interface{}{ + "name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + "spec": map[string]interface{}{ + "stages": []interface{}{ + map[string]interface{}{ + "name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1), + "afterStageTasks": []interface{}{ + map[string]interface{}{ + "type": "TimedWait", + "waitTime": "0s", + }, + }, + }, + }, + }, + }, + } + Expect(hubClient.Create(ctx, obj)).Should(Succeed()) + Expect(hubClient.Delete(ctx, obj)).Should(Succeed()) + }) + + It("Should allow creation of ClusterStagedUpdateStrategy with waitTime as a compound duration '1h30m'", func() { + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "placement.kubernetes-fleet.io/v1beta1", + "kind": "ClusterStagedUpdateStrategy", + "metadata": map[string]interface{}{ + "name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + "spec": map[string]interface{}{ + "stages": []interface{}{ + map[string]interface{}{ + "name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1), + "afterStageTasks": []interface{}{ + map[string]interface{}{ + "type": "TimedWait", + "waitTime": "1h30m", + }, + }, + }, + }, + }, + }, + } + Expect(hubClient.Create(ctx, obj)).Should(Succeed()) + Expect(hubClient.Delete(ctx, obj)).Should(Succeed()) + }) }) Context("Test ClusterStagedUpdateStrategy API validation - invalid cases", func() { @@ -2096,6 +2151,123 @@ var _ = Describe("Test placement v1beta1 API validation", func() { Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].maxConcurrency in body should match")) }) + + It("Should deny creation of ClusterStagedUpdateStrategy with invalid waitTime '0abc'", func() { + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "placement.kubernetes-fleet.io/v1beta1", + "kind": "ClusterStagedUpdateStrategy", + "metadata": map[string]interface{}{ + "name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + "spec": map[string]interface{}{ + "stages": []interface{}{ + map[string]interface{}{ + "name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1), + "afterStageTasks": []interface{}{ + map[string]interface{}{ + "type": "TimedWait", + "waitTime": "0abc", + }, + }, + }, + }, + }, + }, + } + err := hubClient.Create(ctx, obj) + var statusErr *k8sErrors.StatusError + Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) + Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].afterStageTasks\\[0\\].waitTime in body should match")) + }) + + It("Should deny creation of ClusterStagedUpdateStrategy with invalid waitTime '1' (number without unit)", func() { + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "placement.kubernetes-fleet.io/v1beta1", + "kind": "ClusterStagedUpdateStrategy", + "metadata": map[string]interface{}{ + "name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + "spec": map[string]interface{}{ + "stages": []interface{}{ + map[string]interface{}{ + "name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1), + "afterStageTasks": []interface{}{ + map[string]interface{}{ + "type": "TimedWait", + "waitTime": "1", + }, + }, + }, + }, + }, + }, + } + err := hubClient.Create(ctx, obj) + var statusErr *k8sErrors.StatusError + Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) + Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].afterStageTasks\\[0\\].waitTime in body should match")) + }) + + It("Should deny creation of ClusterStagedUpdateStrategy with invalid waitTime '1d' (unsupported unit)", func() { + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "placement.kubernetes-fleet.io/v1beta1", + "kind": "ClusterStagedUpdateStrategy", + "metadata": map[string]interface{}{ + "name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + "spec": map[string]interface{}{ + "stages": []interface{}{ + map[string]interface{}{ + "name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1), + "afterStageTasks": []interface{}{ + map[string]interface{}{ + "type": "TimedWait", + "waitTime": "1d", + }, + }, + }, + }, + }, + }, + } + err := hubClient.Create(ctx, obj) + var statusErr *k8sErrors.StatusError + Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) + Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].afterStageTasks\\[0\\].waitTime in body should match")) + }) + + It("Should deny creation of ClusterStagedUpdateStrategy with invalid waitTime '01h' (leading zero)", func() { + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "placement.kubernetes-fleet.io/v1beta1", + "kind": "ClusterStagedUpdateStrategy", + "metadata": map[string]interface{}{ + "name": fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + "spec": map[string]interface{}{ + "stages": []interface{}{ + map[string]interface{}{ + "name": fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1), + "afterStageTasks": []interface{}{ + map[string]interface{}{ + "type": "TimedWait", + "waitTime": "01h", + }, + }, + }, + }, + }, + }, + } + err := hubClient.Create(ctx, obj) + var statusErr *k8sErrors.StatusError + Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) + Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].afterStageTasks\\[0\\].waitTime in body should match")) + }) + }) Context("Test ClusterApprovalRequest API validation - valid cases", func() { diff --git a/test/e2e/cluster_staged_updaterun_test.go b/test/e2e/cluster_staged_updaterun_test.go index 6f108dbca..ed778d4ce 100644 --- a/test/e2e/cluster_staged_updaterun_test.go +++ b/test/e2e/cluster_staged_updaterun_test.go @@ -2203,7 +2203,7 @@ func approveClusterApprovalRequest(stageName, updateRunName, stageTask string) { // Use kubectl-fleet approve plugin to approve the request cmd := exec.Command(fleetBinaryPath, "approve", "clusterapprovalrequest", - "--hubClusterContext", "kind-hub", + "--hub-cluster-context", "kind-hub", "--name", approvalRequestName) output, err := cmd.CombinedOutput() Expect(err).ToNot(HaveOccurred(), "kubectl-fleet approve failed: %s", string(output)) diff --git a/test/e2e/drain_tool_test.go b/test/e2e/drain_tool_test.go index 199a48ba7..5939a9299 100644 --- a/test/e2e/drain_tool_test.go +++ b/test/e2e/drain_tool_test.go @@ -360,8 +360,8 @@ var _ = Describe("Drain is allowed on one cluster, blocked on others - ClusterRe func runDrainClusterBinary(hubClusterName, memberClusterName string) { By(fmt.Sprintf("draining cluster %s", memberClusterName)) cmd := exec.Command(fleetBinaryPath, "draincluster", - "--hubClusterContext", hubClusterName, - "--clusterName", memberClusterName) + "--hub-cluster-context", hubClusterName, + "--cluster-name", memberClusterName) _, err := cmd.CombinedOutput() Expect(err).ToNot(HaveOccurred(), "Drain command failed with error: %v", err) } @@ -369,8 +369,8 @@ func runDrainClusterBinary(hubClusterName, memberClusterName string) { func runUncordonClusterBinary(hubClusterName, memberClusterName string) { By(fmt.Sprintf("uncordoning cluster %s", memberClusterName)) cmd := exec.Command(fleetBinaryPath, "uncordoncluster", - "--hubClusterContext", hubClusterName, - "--clusterName", memberClusterName) + "--hub-cluster-context", hubClusterName, + "--cluster-name", memberClusterName) _, err := cmd.CombinedOutput() Expect(err).ToNot(HaveOccurred(), "Uncordon command failed with error: %v", err) } diff --git a/test/e2e/fleet_guard_rail_test.go b/test/e2e/fleet_guard_rail_test.go index 450331eb0..b073ccd11 100644 --- a/test/e2e/fleet_guard_rail_test.go +++ b/test/e2e/fleet_guard_rail_test.go @@ -24,6 +24,7 @@ import ( . "github.com/onsi/gomega" admissionv1 "k8s.io/api/admission/v1" appsv1 "k8s.io/api/apps/v1" + authenticationv1 "k8s.io/api/authentication/v1" corev1 "k8s.io/api/core/v1" policyv1 "k8s.io/api/policy/v1" rbacv1 "k8s.io/api/rbac/v1" @@ -1607,3 +1608,88 @@ var _ = Describe("fleet deployment webhook tests for validating and mutating dep }) }) }) + +var _ = Describe("fleet guard rail webhook tests for service accounts in restricted namespaces", Ordered, func() { + svcAccountGVK := metav1.GroupVersionKind{Group: corev1.SchemeGroupVersion.Group, Version: corev1.SchemeGroupVersion.Version, Kind: "ServiceAccount"} + tokenReqGVK := metav1.GroupVersionKind{Group: "authentication.k8s.io", Version: "v1", Kind: "TokenRequest"} + + svcAccountName := fmt.Sprintf(svcAccountNameTemplate, GinkgoParallelProcess()) + svcAccountToAddName := "added-sa" + kubeSystemNamespaceName := "kube-system" + + var svcAccount *corev1.ServiceAccount + BeforeAll(func() { + // Create a service account in the kube-system namespace. + svcAccount = &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: svcAccountName, + Namespace: kubeSystemNamespaceName, + }, + } + Expect(hubClient.Create(ctx, svcAccount)).Should(Succeed(), "Failed to create service account") + }) + + AfterAll(func() { + // Ensure the removal of the created service account. + Eventually(func() error { + svcAccount := corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: svcAccountName, + Namespace: kubeSystemNamespaceName, + }, + } + if err := hubClient.Delete(ctx, &svcAccount); err != nil { + if k8sErrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete service account: %w", err) + } + + if err := hubClient.Get(ctx, types.NamespacedName{Name: svcAccountName, Namespace: kubeSystemNamespaceName}, &corev1.ServiceAccount{}); err != nil { + if k8sErrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to retrieve service account: %w", err) + } + return fmt.Errorf("service account still exists") + }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to delete service account") + }) + + It("should deny creation of service accounts in kube-system namespace for non-whitelisted users", func() { + newSvcAccount := &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: svcAccountToAddName, + Namespace: kubeSystemNamespaceName, + }, + } + wantErrMsg := fmt.Sprintf(validation.ResourceDeniedFormat, + testUser, + utils.GenerateGroupString(testGroups), + admissionv1.Create, + svcAccountGVK, "", + types.NamespacedName{Name: newSvcAccount.Name, Namespace: newSvcAccount.Namespace}, + ) + Expect(checkIfStatusErrorWithMessage(impersonateHubClient.Create(ctx, newSvcAccount), wantErrMsg)).Should(Succeed(), "Failed to deny creation of service account in kube-system namespace") + }) + + It("should deny access to service account token subresource in restricted namespace for non-whitelisted users", func() { + tokenRequest := &authenticationv1.TokenRequest{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: kubeSystemNamespaceName, + Name: svcAccountName, + }, + Spec: authenticationv1.TokenRequestSpec{ + Audiences: []string{"experimental"}, + ExpirationSeconds: ptr.To(int64(3600)), + }, + } + wantErrMsg := fmt.Sprintf(validation.ResourceDeniedFormat, + testUser, + utils.GenerateGroupString(testGroups), + admissionv1.Create, + tokenReqGVK, "token", + types.NamespacedName{Name: svcAccountName, Namespace: kubeSystemNamespaceName}, + ) + Expect(checkIfStatusErrorWithMessage(impersonateHubClient.SubResource("token").Create(ctx, svcAccount, tokenRequest), wantErrMsg)).Should(Succeed(), "Failed to deny access to service account token subresource in restricted namespace") + }) +}) diff --git a/test/e2e/placement_selecting_resources_test.go b/test/e2e/placement_selecting_resources_test.go index 10d6a56c3..b12ffa399 100644 --- a/test/e2e/placement_selecting_resources_test.go +++ b/test/e2e/placement_selecting_resources_test.go @@ -1392,11 +1392,12 @@ var _ = Describe("creating CRP and checking selected resources order", Ordered, It("should update CRP status with the correct order of the selected resources", func() { // Define the expected resources in order - // Note: PVCs are not propagated, so they should not appear in selected resources + // Note: PVCs are propagated when enableWorkload=false (the default for E2E tests) expectedResources := []placementv1beta1.ResourceIdentifier{ {Kind: "Namespace", Name: nsName, Version: "v1"}, {Kind: "Secret", Name: secret.Name, Namespace: nsName, Version: "v1"}, {Kind: "ConfigMap", Name: configMap.Name, Namespace: nsName, Version: "v1"}, + {Kind: "PersistentVolumeClaim", Name: pvc.Name, Namespace: nsName, Version: "v1"}, {Group: "rbac.authorization.k8s.io", Kind: "Role", Name: role.Name, Namespace: nsName, Version: "v1"}, } diff --git a/test/e2e/resource_placement_hub_workload_test.go b/test/e2e/resource_placement_hub_workload_test.go index b088109ee..0e644d145 100644 --- a/test/e2e/resource_placement_hub_workload_test.go +++ b/test/e2e/resource_placement_hub_workload_test.go @@ -33,6 +33,9 @@ import ( placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" ) +// This test requires enableWorkload=true because it creates workloads on the hub cluster +// and expects them to become ready. With enableWorkload=false, the pod webhook blocks +// pod creation on the hub cluster. var _ = Describe("placing workloads using a CRP with PickAll policy", Label("resourceplacement"), Ordered, func() { crpName := fmt.Sprintf(crpNameTemplate, GinkgoParallelProcess()) var testDeployment appsv1.Deployment @@ -41,6 +44,8 @@ var _ = Describe("placing workloads using a CRP with PickAll policy", Label("res var testStatefulSet appsv1.StatefulSet BeforeAll(func() { + Skip("This test requires enableWorkload=true to run workloads on the hub cluster") + // Read the test manifests readDeploymentTestManifest(&testDeployment) readDaemonSetTestManifest(&testDaemonSet) diff --git a/test/e2e/resource_placement_selecting_resources_test.go b/test/e2e/resource_placement_selecting_resources_test.go index 2e6966274..9737dd787 100644 --- a/test/e2e/resource_placement_selecting_resources_test.go +++ b/test/e2e/resource_placement_selecting_resources_test.go @@ -1057,10 +1057,11 @@ var _ = Describe("testing RP selecting resources", Label("resourceplacement"), f It("should update RP status with the correct order of the selected resources", func() { // Define the expected resources in order. - // Note: PVCs are not propagated, so they should not appear in selected resources + // Note: PVCs are propagated when enableWorkload=false (the default for E2E tests) expectedResources := []placementv1beta1.ResourceIdentifier{ {Kind: "Secret", Name: secret.Name, Namespace: nsName, Version: "v1"}, {Kind: "ConfigMap", Name: configMap.Name, Namespace: nsName, Version: "v1"}, + {Kind: "PersistentVolumeClaim", Name: pvc.Name, Namespace: nsName, Version: "v1"}, {Group: "rbac.authorization.k8s.io", Kind: "Role", Name: role.Name, Namespace: nsName, Version: "v1"}, } diff --git a/test/e2e/resources_test.go b/test/e2e/resources_test.go index bf1d15318..e631a8c3a 100644 --- a/test/e2e/resources_test.go +++ b/test/e2e/resources_test.go @@ -54,6 +54,7 @@ const ( clusterStagedUpdateRunNameWithSubIndexTemplate = "cur-%d-%d" stagedUpdateRunStrategyNameTemplate = "sus-%d" stagedUpdateRunNameWithSubIndexTemplate = "sur-%d-%d" + svcAccountNameTemplate = "sa-%d" customDeletionBlockerFinalizer = "kubernetes-fleet.io/custom-deletion-blocker-finalizer" workNamespaceLabelName = "process" diff --git a/test/e2e/utils_test.go b/test/e2e/utils_test.go index 4f0432bcf..a0116847c 100644 --- a/test/e2e/utils_test.go +++ b/test/e2e/utils_test.go @@ -1749,7 +1749,7 @@ func checkIfStatusErrorWithMessage(err error, errorMsg string) error { return nil } } - return fmt.Errorf("error message %s not found in error %w", errorMsg, err) + return fmt.Errorf("error message (%s) not found in given error (%w)", errorMsg, err) } // buildOwnerReference builds an owner reference given a cluster and a placement name. diff --git a/tools/fleet/README.md b/tools/fleet/README.md index 897978db3..c7405896d 100644 --- a/tools/fleet/README.md +++ b/tools/fleet/README.md @@ -72,25 +72,25 @@ Use the `approve` subcommand to approve approval request resources for staged up **Cluster-scoped (ClusterApprovalRequest):** ```bash -kubectl fleet approve clusterapprovalrequest --hubClusterContext --name +kubectl fleet approve clusterapprovalrequest --hub-cluster-context --name # Or using alias: -kubectl fleet approve careq --hubClusterContext --name +kubectl fleet approve careq --hub-cluster-context --name ``` **Namespace-scoped (ApprovalRequest):** ```bash -kubectl fleet approve approvalrequest --hubClusterContext --name --namespace +kubectl fleet approve approvalrequest --hub-cluster-context --name --namespace # Or using alias: -kubectl fleet approve areq --hubClusterContext --name -n +kubectl fleet approve areq --hub-cluster-context --name -n ``` Example: ```bash # Approve a ClusterApprovalRequest -kubectl fleet approve clusterapprovalrequest --hubClusterContext hub --name my-approval-request +kubectl fleet approve clusterapprovalrequest --hub-cluster-context hub --name my-approval-request # Approve an ApprovalRequest in a specific namespace -kubectl fleet approve approvalrequest --hubClusterContext hub --name my-approval-request --namespace my-namespace +kubectl fleet approve approvalrequest --hub-cluster-context hub --name my-approval-request --namespace my-namespace ``` ### Drain a Member Cluster @@ -98,12 +98,12 @@ kubectl fleet approve approvalrequest --hubClusterContext hub --name my-approval Use the `draincluster` subcommand to remove all resources propagated to a member cluster from the hub cluster by any `Placement` resource. This is useful when you want to temporarily move all workloads off a member cluster in preparation for an event like upgrade or reconfiguration. ```bash -kubectl fleet draincluster --hubClusterContext --clusterName +kubectl fleet draincluster --hub-cluster-context --cluster-name ``` Example: ```bash -kubectl fleet draincluster --hubClusterContext hub --clusterName member-cluster-1 +kubectl fleet draincluster --hub-cluster-context hub --cluster-name member-cluster-1 ``` ### Uncordon a Member Cluster @@ -111,12 +111,12 @@ kubectl fleet draincluster --hubClusterContext hub --clusterName member-cluster- Use the `uncordoncluster` subcommand to uncordon a member cluster that has been previously drained, allowing resources to be propagated to the cluster again. ```bash -kubectl fleet uncordoncluster --hubClusterContext --clusterName +kubectl fleet uncordoncluster --hub-cluster-context --cluster-name ``` Example: ```bash -kubectl fleet uncordoncluster --hubClusterContext hub --clusterName member-cluster-1 +kubectl fleet uncordoncluster --hub-cluster-context hub --cluster-name member-cluster-1 ``` ## Subcommands @@ -156,13 +156,13 @@ If the `cordon` taint is not present on the member cluster, the command will hav ## Flags The `approve` subcommand uses the following flags: -- `--hubClusterContext`: kubectl context for the hub cluster (required) +- `--hub-cluster-context`: kubectl context for the hub cluster (required) - `--name`: name of the resource to approve (required) - `--namespace`, `-n`: namespace of the resource to approve (required for `approvalrequest`, not allowed for `clusterapprovalrequest`) Both `draincluster` and `uncordoncluster` subcommands use the following flags: -- `--hubClusterContext`: kubectl context for the hub cluster (required) -- `--clusterName`: name of the member cluster to operate on (required) +- `--hub-cluster-context`: kubectl context for the hub cluster (required) +- `--cluster-name`: name of the member cluster to operate on (required) ## Examples @@ -173,31 +173,31 @@ Both `draincluster` and `uncordoncluster` subcommands use the following flags: kubectl config get-contexts # 2. Drain a cluster for maintenance -kubectl fleet draincluster --hubClusterContext production-hub --clusterName worker-node-1 +kubectl fleet draincluster --hub-cluster-context production-hub --cluster-name worker-node-1 # 3. Perform maintenance on the worker-node-1 cluster # ... maintenance operations ... # 4. After maintenance, uncordon the cluster to allow workloads back -kubectl fleet uncordoncluster --hubClusterContext production-hub --clusterName worker-node-1 +kubectl fleet uncordoncluster --hub-cluster-context production-hub --cluster-name worker-node-1 ``` ### Additional Examples ```bash # Approve a ClusterApprovalRequest for staged updates -kubectl fleet approve clusterapprovalrequest --hubClusterContext hub --name update-approval-stage-1 +kubectl fleet approve clusterapprovalrequest --hub-cluster-context hub --name update-approval-stage-1 # Approve a ApprovalRequest for staged updates -kubectl fleet approve approvalrequest --hubClusterContext hub --name update-approval-stage-1 --namespace test-namespace +kubectl fleet approve approvalrequest --hub-cluster-context hub --name update-approval-stage-1 --namespace test-namespace # Drain multiple clusters (run separately for each cluster) -kubectl fleet draincluster --hubClusterContext hub --clusterName east-cluster -kubectl fleet draincluster --hubClusterContext hub --clusterName west-cluster +kubectl fleet draincluster --hub-cluster-context hub --cluster-name east-cluster +kubectl fleet draincluster --hub-cluster-context hub --cluster-name west-cluster # Uncordon clusters after maintenance -kubectl fleet uncordoncluster --hubClusterContext hub --clusterName east-cluster -kubectl fleet uncordoncluster --hubClusterContext hub --clusterName west-cluster +kubectl fleet uncordoncluster --hub-cluster-context hub --cluster-name east-cluster +kubectl fleet uncordoncluster --hub-cluster-context hub --cluster-name west-cluster ``` ## Troubleshooting diff --git a/tools/fleet/cmd/approve/approve.go b/tools/fleet/cmd/approve/approve.go index 7d216e5db..9406cf0c3 100644 --- a/tools/fleet/cmd/approve/approve.go +++ b/tools/fleet/cmd/approve/approve.go @@ -20,11 +20,11 @@ import ( "context" "fmt" "log" + "time" "github.com/spf13/cobra" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/util/retry" "sigs.k8s.io/controller-runtime/pkg/client" @@ -87,6 +87,7 @@ type approveOptions struct { hubClusterContext string name string namespace string + timeout time.Duration hubClient client.Client } @@ -119,16 +120,19 @@ For namespace-scoped resources (approvalrequest), you must also specify the --na if err := o.setupClient(); err != nil { return err } - return o.run(cmd.Context(), cfg) + ctx, cancel := context.WithTimeout(cmd.Context(), o.timeout) + defer cancel() + return o.run(ctx, cfg) }, } - cmd.Flags().StringVar(&o.hubClusterContext, "hubClusterContext", "", "The name of the kubeconfig context to use for the hub cluster") + cmd.Flags().StringVar(&o.hubClusterContext, "hub-cluster-context", "", "The name of the kubeconfig context to use for the hub cluster") cmd.Flags().StringVar(&o.name, "name", "", "The name of the resource to approve") cmd.Flags().StringVarP(&o.namespace, "namespace", "n", "", "The namespace of the resource to approve (required for namespace-scoped resources)") + cmd.Flags().DurationVar(&o.timeout, "timeout", 5*time.Minute, "Maximum time to wait for the operation to complete") // Mark required flags. - _ = cmd.MarkFlagRequired("hubClusterContext") + _ = cmd.MarkFlagRequired("hub-cluster-context") _ = cmd.MarkFlagRequired("name") return cmd @@ -208,10 +212,9 @@ func (o *approveOptions) approveApprovalRequest(ctx context.Context) error { // setupClient creates and configures the Kubernetes client func (o *approveOptions) setupClient() error { - scheme := runtime.NewScheme() - - if err := placementv1beta1.AddToScheme(scheme); err != nil { - return fmt.Errorf("failed to add custom APIs (placement) to the runtime scheme: %w", err) + scheme, err := toolsutils.NewFleetScheme() + if err != nil { + return fmt.Errorf("failed to create runtime scheme: %w", err) } hubClient, err := toolsutils.GetClusterClientFromClusterContext(o.hubClusterContext, scheme) diff --git a/tools/fleet/cmd/draincluster/draincluster.go b/tools/fleet/cmd/draincluster/draincluster.go index 5efd974dc..31b472165 100644 --- a/tools/fleet/cmd/draincluster/draincluster.go +++ b/tools/fleet/cmd/draincluster/draincluster.go @@ -20,11 +20,11 @@ import ( "context" "fmt" "log" + "time" "github.com/spf13/cobra" k8errors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/uuid" "k8s.io/apimachinery/pkg/util/validation" @@ -49,6 +49,7 @@ const ( type drainOptions struct { hubClusterContext string clusterName string + timeout time.Duration hubClient client.Client } @@ -70,18 +71,20 @@ func NewCmdDrainCluster() *cobra.Command { } // Add flags specific to drain command - cmd.Flags().StringVar(&o.hubClusterContext, "hubClusterContext", "", "kubectl context for the hub cluster (required)") - cmd.Flags().StringVar(&o.clusterName, "clusterName", "", "name of the member cluster (required)") + cmd.Flags().StringVar(&o.hubClusterContext, "hub-cluster-context", "", "kubectl context for the hub cluster (required)") + cmd.Flags().StringVar(&o.clusterName, "cluster-name", "", "name of the member cluster (required)") + cmd.Flags().DurationVar(&o.timeout, "timeout", 5*time.Minute, "Maximum time to wait for the operation to complete") // Mark required flags - _ = cmd.MarkFlagRequired("hubClusterContext") - _ = cmd.MarkFlagRequired("clusterName") + _ = cmd.MarkFlagRequired("hub-cluster-context") + _ = cmd.MarkFlagRequired("cluster-name") return cmd } func (o *drainOptions) runDrain() error { - ctx := context.Background() + ctx, cancel := context.WithTimeout(context.Background(), o.timeout) + defer cancel() isDrainSuccessful, err := o.drain(ctx) if err != nil { @@ -111,13 +114,9 @@ func (o *drainOptions) runDrain() error { // setupClient creates and configures the Kubernetes client func (o *drainOptions) setupClient() error { - scheme := runtime.NewScheme() - - if err := clusterv1beta1.AddToScheme(scheme); err != nil { - return fmt.Errorf("failed to add custom APIs (cluster) to the runtime scheme: %w", err) - } - if err := placementv1beta1.AddToScheme(scheme); err != nil { - return fmt.Errorf("failed to add custom APIs (placement) to the runtime scheme: %w", err) + scheme, err := toolsutils.NewFleetScheme() + if err != nil { + return fmt.Errorf("failed to create runtime scheme: %w", err) } hubClient, err := toolsutils.GetClusterClientFromClusterContext(o.hubClusterContext, scheme) diff --git a/tools/fleet/cmd/uncordoncluster/uncordoncluster.go b/tools/fleet/cmd/uncordoncluster/uncordoncluster.go index a16a41d36..ad4a15ef9 100644 --- a/tools/fleet/cmd/uncordoncluster/uncordoncluster.go +++ b/tools/fleet/cmd/uncordoncluster/uncordoncluster.go @@ -20,15 +20,14 @@ import ( "context" "fmt" "log" + "time" "github.com/spf13/cobra" - "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/util/retry" "sigs.k8s.io/controller-runtime/pkg/client" clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1" - placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" toolsutils "go.goms.io/fleet/tools/utils" ) @@ -36,6 +35,7 @@ import ( type uncordonOptions struct { hubClusterContext string clusterName string + timeout time.Duration hubClient client.Client } @@ -57,18 +57,21 @@ func NewCmdUncordonCluster() *cobra.Command { } // Add flags specific to uncordon command - cmd.Flags().StringVar(&o.hubClusterContext, "hubClusterContext", "", "kubectl context for the hub cluster (required)") - cmd.Flags().StringVar(&o.clusterName, "clusterName", "", "name of the member cluster (required)") + cmd.Flags().StringVar(&o.hubClusterContext, "hub-cluster-context", "", "kubectl context for the hub cluster (required)") + cmd.Flags().StringVar(&o.clusterName, "cluster-name", "", "name of the member cluster (required)") + cmd.Flags().DurationVar(&o.timeout, "timeout", 5*time.Minute, "Maximum time to wait for the operation to complete") // Mark required flags - _ = cmd.MarkFlagRequired("hubClusterContext") - _ = cmd.MarkFlagRequired("clusterName") + _ = cmd.MarkFlagRequired("hub-cluster-context") + _ = cmd.MarkFlagRequired("cluster-name") return cmd } func (o *uncordonOptions) runUncordon() error { - ctx := context.Background() + ctx, cancel := context.WithTimeout(context.Background(), o.timeout) + defer cancel() + if err := o.uncordon(ctx); err != nil { return fmt.Errorf("failed to uncordon cluster %s: %w", o.clusterName, err) } @@ -79,13 +82,9 @@ func (o *uncordonOptions) runUncordon() error { // setupClient creates and configures the Kubernetes client func (o *uncordonOptions) setupClient() error { - scheme := runtime.NewScheme() - - if err := clusterv1beta1.AddToScheme(scheme); err != nil { - return fmt.Errorf("failed to add custom APIs (cluster) to the runtime scheme: %w", err) - } - if err := placementv1beta1.AddToScheme(scheme); err != nil { - return fmt.Errorf("failed to add custom APIs (placement) to the runtime scheme: %w", err) + scheme, err := toolsutils.NewFleetScheme() + if err != nil { + return fmt.Errorf("failed to create runtime scheme: %w", err) } hubClient, err := toolsutils.GetClusterClientFromClusterContext(o.hubClusterContext, scheme) diff --git a/tools/fleet/main.go b/tools/fleet/main.go index 989eee78c..6fdcadf7e 100644 --- a/tools/fleet/main.go +++ b/tools/fleet/main.go @@ -17,6 +17,7 @@ limitations under the License. package main import ( + "fmt" "log" "github.com/spf13/cobra" @@ -26,6 +27,12 @@ import ( "go.goms.io/fleet/tools/fleet/cmd/uncordoncluster" ) +// These variables are set via ldflags at build time. +var ( + version = "dev" + commit = "unknown" +) + func main() { rootCmd := &cobra.Command{ Use: "kubectl-fleet", @@ -33,6 +40,15 @@ func main() { Long: "kubectl-fleet is a kubectl plugin for managing KubeFleet member clusters", } + // Add version subcommand + rootCmd.AddCommand(&cobra.Command{ + Use: "version", + Short: "Print the version information", + Run: func(cmd *cobra.Command, args []string) { + fmt.Printf("kubectl-fleet %s (%s)\n", version, commit) + }, + }) + // Add subcommands rootCmd.AddCommand(approve.NewCmdApprove()) rootCmd.AddCommand(draincluster.NewCmdDrainCluster()) diff --git a/tools/utils/common.go b/tools/utils/common.go index 23f68c852..98c14ef5a 100644 --- a/tools/utils/common.go +++ b/tools/utils/common.go @@ -13,6 +13,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1" + placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" ) var ( @@ -24,6 +25,18 @@ var ( } ) +// NewFleetScheme creates a runtime.Scheme with all Fleet API types registered. +func NewFleetScheme() (*runtime.Scheme, error) { + scheme := runtime.NewScheme() + if err := clusterv1beta1.AddToScheme(scheme); err != nil { + return nil, err + } + if err := placementv1beta1.AddToScheme(scheme); err != nil { + return nil, err + } + return scheme, nil +} + // GetClusterClientFromClusterContext creates a new client.Client for the given cluster context and scheme. func GetClusterClientFromClusterContext(clusterContext string, scheme *runtime.Scheme) (client.Client, error) { clusterConfig := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(