Skip to content

Commit

Permalink
fix(controller): always try to update status (#1154) (#1163)
Browse files Browse the repository at this point in the history
* fix(controller): always try to update status

* improve check-operator resiliency

* improve how the status conditions are updated

* Apply suggestions from code review



* Update upgrade.go

* nit

---------

Co-authored-by: Cedric Lamoriniere <cedric.lamoriniere@datadoghq.com>
  • Loading branch information
celenechang and clamoriniere committed May 1, 2024
1 parent 6f06176 commit 70db137
Show file tree
Hide file tree
Showing 7 changed files with 188 additions and 23 deletions.
8 changes: 8 additions & 0 deletions apis/datadoghq/v2alpha1/condition.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,14 @@ func SetDatadogAgentStatusCondition(status *DatadogAgentStatus, condition *metav
}
}

// DeleteDatadogAgentStatusCondition is used to delete a condition
func DeleteDatadogAgentStatusCondition(status *DatadogAgentStatus, conditionType string) {
idConditionComplete := getIndexForConditionType(status, conditionType)
if idConditionComplete >= 0 {
status.Conditions = append(status.Conditions[:idConditionComplete], status.Conditions[idConditionComplete+1:]...)
}
}

// NewDatadogAgentStatusCondition returns new metav1.Condition instance
func NewDatadogAgentStatusCondition(conditionType string, conditionStatus metav1.ConditionStatus, now metav1.Time, reason, message string) metav1.Condition {
return metav1.Condition{
Expand Down
103 changes: 103 additions & 0 deletions apis/datadoghq/v2alpha1/condition_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
package v2alpha1

import (
"testing"

apiutils "github.com/DataDog/datadog-operator/apis/utils"
"github.com/google/go-cmp/cmp"
assert "github.com/stretchr/testify/require"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestDeleteDatadogAgentStatusCondition(t *testing.T) {
type args struct {
status *DatadogAgentStatus
condition string
}
tests := []struct {
name string
args args
expectedStatus *DatadogAgentStatus
}{
{
name: "empty status",
args: args{
status: &DatadogAgentStatus{},
condition: "fooType",
},
expectedStatus: &DatadogAgentStatus{},
},
{
name: "not present status",
args: args{
status: &DatadogAgentStatus{
Conditions: []v1.Condition{
{
Type: "barType",
},
},
},
condition: "fooType",
},
expectedStatus: &DatadogAgentStatus{
Conditions: []v1.Condition{
{
Type: "barType",
},
},
},
},
{
name: "status present at the end",
args: args{
status: &DatadogAgentStatus{
Conditions: []v1.Condition{
{
Type: "barType",
},
{
Type: "fooType",
},
},
},
condition: "fooType",
},
expectedStatus: &DatadogAgentStatus{
Conditions: []v1.Condition{
{
Type: "barType",
},
},
},
},
{
name: "status present at the begining",
args: args{
status: &DatadogAgentStatus{
Conditions: []v1.Condition{
{
Type: "fooType",
},
{
Type: "barType",
},
},
},
condition: "fooType",
},
expectedStatus: &DatadogAgentStatus{
Conditions: []v1.Condition{
{
Type: "barType",
},
},
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
DeleteDatadogAgentStatusCondition(tt.args.status, tt.args.condition)
assert.True(t, apiutils.IsEqualStruct(tt.args.status, tt.expectedStatus), "status \ndiff = %s", cmp.Diff(tt.args.status, tt.expectedStatus))
})
}
}
31 changes: 18 additions & 13 deletions cmd/check-operator/upgrade/upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,23 +158,27 @@ func (o *Options) getV2Status() (common.StatusWrapper, error) {
return common.NewV2StatusWrapper(datadogAgent), nil
}

func isReconcileError(conditions []metav1.Condition) bool {
func isReconcileError(conditions []metav1.Condition) error {
for _, condition := range conditions {
if (condition.Type == "DatadogAgentReconcileError" && condition.Status == metav1.ConditionTrue) ||
(condition.Type == "AgentReconcile" && condition.Status == metav1.ConditionFalse) ||
(condition.Type == "ClusterAgentReconcile" && condition.Status == metav1.ConditionFalse) ||
(condition.Type == "ClusterChecksRunnerReconcile" && condition.Status == metav1.ConditionFalse) {
return true
switch {
case condition.Type == "DatadogAgentReconcileError" && condition.Status == metav1.ConditionTrue:
return fmt.Errorf("datadogAgent reconciliation error message: %s", condition.Message)
case condition.Type == "AgentReconcile" && condition.Status == metav1.ConditionFalse:
return fmt.Errorf("agent reconciliation error message: %s", condition.Message)
case condition.Type == "ClusterAgentReconcile" && condition.Status == metav1.ConditionFalse:
return fmt.Errorf("cluster Agent reconciliation error message: %s", condition.Message)
case condition.Type == "ClusterChecksRunnerReconcile" && condition.Status == metav1.ConditionFalse:
return fmt.Errorf("cluster Check Runner reconciliation error message: %s", condition.Message)
}
}
return false
return nil
}

// Run use to run the command.
func (o *Options) Run() error {
o.printOutf("Start checking rolling-update status")
agentDone, dcaDone, clcDone := false, false, false
checkFunc := func() (bool, error) {
var agentDone, dcaDone, ccrDone, reconcileError bool
v2Available, err := common.IsV2Available(o.Clientset)
if err != nil {
return false, fmt.Errorf("unable to detect if CRD v2 is available, err:%w", err)
Expand All @@ -195,8 +199,9 @@ func (o *Options) Run() error {
return false, fmt.Errorf("unable to get the DatadogAgent.status, err:%w", err)
}

if isReconcileError(status.GetStatusCondition()) {
return false, fmt.Errorf("got reconcile error")
if err = isReconcileError(status.GetStatusCondition()); err != nil {
o.printOutf("received a reconcile error: %v", err)
reconcileError = true
}

if !agentDone {
Expand All @@ -207,11 +212,11 @@ func (o *Options) Run() error {
dcaDone = o.isDeploymentDone(status.GetClusterAgentStatus(), o.dcaMinUpToDate, "Cluster Agent")
}

if !clcDone {
clcDone = o.isDeploymentDone(status.GetClusterChecksRunnerStatus(), o.clcMinUpToDate, "Cluster Check Runner")
if !ccrDone {
ccrDone = o.isDeploymentDone(status.GetClusterChecksRunnerStatus(), o.clcMinUpToDate, "Cluster Check Runner")
}

if agentDone && dcaDone && clcDone {
if agentDone && dcaDone && ccrDone && !reconcileError {
return true, nil
}

Expand Down
6 changes: 6 additions & 0 deletions controllers/datadogagent/controller_reconcile_agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ func (r *Reconciler) reconcileV2Agent(logger logr.Logger, requiredComponents fea
if err := r.deleteV2DaemonSet(daemonsetLogger, dda, daemonset, newStatus); err != nil {
return reconcile.Result{}, err
}
deleteStatusWithAgent(newStatus)
return reconcile.Result{}, nil
}

Expand Down Expand Up @@ -241,6 +242,11 @@ func (r *Reconciler) deleteV2ExtendedDaemonSet(logger logr.Logger, dda *datadogh
return nil
}

func deleteStatusWithAgent(newStatus *datadoghqv2alpha1.DatadogAgentStatus) {
newStatus.Agent = nil
datadoghqv2alpha1.DeleteDatadogAgentStatusCondition(newStatus, datadoghqv2alpha1.AgentReconcileConditionType)
}

// removeStaleStatus removes a DaemonSet's status from a DatadogAgent's
// status based on the DaemonSet's name
func removeStaleStatus(ddaStatus *datadoghqv2alpha1.DatadogAgentStatus, name string) {
Expand Down
8 changes: 7 additions & 1 deletion controllers/datadogagent/controller_reconcile_ccr.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,12 @@ func (r *Reconciler) cleanupV2ClusterChecksRunner(logger logr.Logger, dda *datad
}
}

newStatus.ClusterChecksRunner = nil
deleteStatusWithClusterChecksRunner(newStatus)

return reconcile.Result{}, nil
}

func deleteStatusWithClusterChecksRunner(newStatus *datadoghqv2alpha1.DatadogAgentStatus) {
newStatus.ClusterChecksRunner = nil
datadoghqv2alpha1.DeleteDatadogAgentStatusCondition(newStatus, datadoghqv2alpha1.ClusterChecksRunnerReconcileConditionType)
}
18 changes: 17 additions & 1 deletion controllers/datadogagent/controller_reconcile_dca.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,13 @@ import (
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

func (r *Reconciler) reconcileV2ClusterAgent(logger logr.Logger, requiredComponents feature.RequiredComponents, features []feature.Feature, dda *datadoghqv2alpha1.DatadogAgent, resourcesManager feature.ResourceManagers, newStatus *datadoghqv2alpha1.DatadogAgentStatus) (reconcile.Result, error) {
var result reconcile.Result
now := metav1.NewTime(time.Now())

// Start by creating the Default Cluster-Agent deployment
deployment := componentdca.NewDefaultClusterAgentDeployment(dda)
Expand All @@ -35,11 +37,17 @@ func (r *Reconciler) reconcileV2ClusterAgent(logger logr.Logger, requiredCompone
deployment.Spec.Template = *override.ApplyGlobalSettingsClusterAgent(logger, podManagers, dda, resourcesManager)

// Apply features changes on the Deployment.Spec.Template
var featErrors []error
for _, feat := range features {
if errFeat := feat.ManageClusterAgent(podManagers); errFeat != nil {
return result, errFeat
featErrors = append(featErrors, errFeat)
}
}
if len(featErrors) > 0 {
err := utilerrors.NewAggregate(featErrors)
updateStatusV2WithClusterAgent(deployment, newStatus, now, metav1.ConditionFalse, "ClusterAgent feature error", err.Error())
return result, err
}

deploymentLogger := logger.WithValues("component", datadoghqv2alpha1.ClusterAgentComponentName)

Expand All @@ -61,12 +69,14 @@ func (r *Reconciler) reconcileV2ClusterAgent(logger logr.Logger, requiredCompone
true,
)
}
deleteStatusV2WithClusterAgent(newStatus)
return r.cleanupV2ClusterAgent(deploymentLogger, dda, deployment, resourcesManager, newStatus)
}
override.PodTemplateSpec(logger, podManagers, componentOverride, datadoghqv2alpha1.ClusterAgentComponentName, dda.Name)
override.Deployment(deployment, componentOverride)
} else if !dcaEnabled {
// If the override is not defined, then disable based on dcaEnabled value
deleteStatusV2WithClusterAgent(newStatus)
return r.cleanupV2ClusterAgent(deploymentLogger, dda, deployment, resourcesManager, newStatus)
}

Expand All @@ -78,6 +88,11 @@ func updateStatusV2WithClusterAgent(dca *appsv1.Deployment, newStatus *datadoghq
datadoghqv2alpha1.UpdateDatadogAgentStatusConditions(newStatus, updateTime, datadoghqv2alpha1.ClusterAgentReconcileConditionType, status, reason, message, true)
}

func deleteStatusV2WithClusterAgent(newStatus *datadoghqv2alpha1.DatadogAgentStatus) {
newStatus.ClusterAgent = nil
datadoghqv2alpha1.DeleteDatadogAgentStatusCondition(newStatus, datadoghqv2alpha1.ClusterAgentReconcileConditionType)
}

func (r *Reconciler) cleanupV2ClusterAgent(logger logr.Logger, dda *datadoghqv2alpha1.DatadogAgent, deployment *appsv1.Deployment, resourcesManager feature.ResourceManagers, newStatus *datadoghqv2alpha1.DatadogAgentStatus) (reconcile.Result, error) {
nsName := types.NamespacedName{
Name: deployment.GetName(),
Expand Down Expand Up @@ -113,5 +128,6 @@ func (r *Reconciler) cleanupV2ClusterAgent(logger logr.Logger, dda *datadoghqv2a
}

newStatus.ClusterAgent = nil

return reconcile.Result{}, nil
}
37 changes: 29 additions & 8 deletions controllers/datadogagent/controller_reconcile_v2.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ func (r *Reconciler) internalReconcileV2(ctx context.Context, request reconcile.
func (r *Reconciler) reconcileInstanceV2(ctx context.Context, logger logr.Logger, instance *datadoghqv2alpha1.DatadogAgent) (reconcile.Result, error) {
var result reconcile.Result
newStatus := instance.Status.DeepCopy()
now := metav1.NewTime(time.Now())

features, requiredComponents := feature.BuildFeatures(instance, reconcilerOptionsToFeatureOptions(&r.options, logger))
// update list of enabled features for metrics forwarder
Expand Down Expand Up @@ -123,9 +124,15 @@ func (r *Reconciler) reconcileInstanceV2(ctx context.Context, logger logr.Logger
errs = append(errs, featErr)
}
}
if len(errs) > 0 {
return r.updateStatusIfNeededV2(logger, instance, newStatus, result, errors.NewAggregate(errs))
}

// Examine user configuration to override any external dependencies (e.g. RBACs)
errs = append(errs, override.Dependencies(logger, resourceManagers, instance)...)
errs = override.Dependencies(logger, resourceManagers, instance)
if len(errs) > 0 {
return r.updateStatusIfNeededV2(logger, instance, newStatus, result, errors.NewAggregate(errs))
}

userSpecifiedClusterAgentToken := instance.Spec.Global.ClusterAgentToken != nil || instance.Spec.Global.ClusterAgentTokenSecret != nil
if !userSpecifiedClusterAgentToken {
Expand All @@ -141,6 +148,9 @@ func (r *Reconciler) reconcileInstanceV2(ctx context.Context, logger logr.Logger
result, err = r.reconcileV2ClusterAgent(logger, requiredComponents, features, instance, resourceManagers, newStatus)
if utils.ShouldReturn(result, err) {
return r.updateStatusIfNeededV2(logger, instance, newStatus, result, err)
} else {
// Update the status to make it the ClusterAgentReconcileConditionType successful
datadoghqv2alpha1.UpdateDatadogAgentStatusConditions(newStatus, now, datadoghqv2alpha1.ClusterAgentReconcileConditionType, metav1.ConditionTrue, "reconcile_succeed", "reconcile succeed", false)
}

// Start with an "empty" profile and provider
Expand All @@ -153,7 +163,7 @@ func (r *Reconciler) reconcileInstanceV2(ctx context.Context, logger logr.Logger
// Get a node list for profiles and introspection
nodeList, e := r.getNodeList(ctx)
if e != nil {
return reconcile.Result{}, e
return r.updateStatusIfNeededV2(logger, instance, newStatus, result, e)
}

if r.options.IntrospectionEnabled {
Expand All @@ -162,13 +172,13 @@ func (r *Reconciler) reconcileInstanceV2(ctx context.Context, logger logr.Logger

if r.options.DatadogAgentProfileEnabled {
profileList, profilesByNode, e := r.profilesToApply(ctx, logger, nodeList)
if err != nil {
return reconcile.Result{}, e
if e != nil {
return r.updateStatusIfNeededV2(logger, instance, newStatus, result, e)
}
profiles = profileList

if err = r.handleProfiles(ctx, profilesByNode, instance.Namespace); err != nil {
return reconcile.Result{}, err
return r.updateStatusIfNeededV2(logger, instance, newStatus, result, err)
}
}
}
Expand All @@ -177,18 +187,29 @@ func (r *Reconciler) reconcileInstanceV2(ctx context.Context, logger logr.Logger
for provider := range providerList {
result, err = r.reconcileV2Agent(logger, requiredComponents, features, instance, resourceManagers, newStatus, provider, providerList, &profile)
if utils.ShouldReturn(result, err) {
return r.updateStatusIfNeededV2(logger, instance, newStatus, result, err)
// If the agent reconcile failed, we should not continue with the other profiles
errs = append(errs, err)
}
}
}

if err = r.cleanupExtraneousDaemonSets(ctx, logger, instance, newStatus, providerList, profiles); err != nil {
errs = append(errs, err)
logger.Error(err, "Error cleaning up old DaemonSets")
}
if utils.ShouldReturn(result, errors.NewAggregate(errs)) {
return r.updateStatusIfNeededV2(logger, instance, newStatus, result, errors.NewAggregate(errs))
} else {
// Update the status to set AgentReconcileConditionType to successful
datadoghqv2alpha1.UpdateDatadogAgentStatusConditions(newStatus, now, datadoghqv2alpha1.AgentReconcileConditionType, metav1.ConditionTrue, "reconcile_succeed", "reconcile succeed", false)
}

result, err = r.reconcileV2ClusterChecksRunner(logger, requiredComponents, features, instance, resourceManagers, newStatus)
if utils.ShouldReturn(result, err) {
return r.updateStatusIfNeededV2(logger, instance, newStatus, result, err)
} else {
// Update the status to set ClusterChecksRunnerReconcileConditionType to successful
datadoghqv2alpha1.UpdateDatadogAgentStatusConditions(newStatus, now, datadoghqv2alpha1.ClusterChecksRunnerReconcileConditionType, metav1.ConditionTrue, "reconcile_succeed", "reconcile succeed", false)
}

// ------------------------------
Expand All @@ -197,15 +218,15 @@ func (r *Reconciler) reconcileInstanceV2(ctx context.Context, logger logr.Logger
errs = append(errs, depsStore.Apply(ctx, r.client)...)
if len(errs) > 0 {
logger.V(2).Info("Dependencies apply error", "errs", errs)
return result, errors.NewAggregate(errs)
return r.updateStatusIfNeededV2(logger, instance, newStatus, result, errors.NewAggregate(errs))
}

// -----------------------------
// Cleanup unused dependencies
// -----------------------------
// Run it after the deployments reconcile
if errs = depsStore.Cleanup(ctx, r.client); len(errs) > 0 {
return result, errors.NewAggregate(errs)
return r.updateStatusIfNeededV2(logger, instance, newStatus, result, errors.NewAggregate(errs))
}

// Always requeue
Expand Down

0 comments on commit 70db137

Please sign in to comment.