Skip to content

Commit

Permalink
[eds] Introduce PodCannotStart condition and canary auto-pause handli…
Browse files Browse the repository at this point in the history
…ng (#68)

* [eds] Introduce PodCannotStart condition and auto-pause handling

- Introduces logic to handle certain failure waiting states as cause for auto-pausing canary.
- Simplifies logic for reported container states in terminated reason (will basically take anything).
- e2e tests for PodCannotStart scenarios (container config error and image pull errors).

* Address review comment and (hopefully) fix e2e flaky test

* Implement MaxSlowStartDuration for AutoPause

* Move failure cases to a different path

* Fix a bug in e2e test waiting for changes in ERS but watching EDS

- Improve logging
- Colorize e2e test milestone logs
- Add error logs around reconciler scenarios on update, still require more investigation.

* Refetch pod before adding/deleting label + example of a slow start config
  • Loading branch information
xornivore committed Dec 8, 2020
1 parent c125046 commit efa4090
Show file tree
Hide file tree
Showing 22 changed files with 719 additions and 107 deletions.
5 changes: 5 additions & 0 deletions api/v1alpha1/extendeddaemonset_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ type ExtendedDaemonSetSpecStrategyCanaryAutoPause struct {
Enabled *bool `json:"enabled,omitempty"`
// MaxRestarts defines the number of tolerable (per pod) Canary pod restarts after which the Canary deployment is autopaused
MaxRestarts *int32 `json:"maxRestarts,omitempty"`
// MaxSlowStartDuration defines the maximum slow start duration for a pod (stuck in Creating state) after which the
// Canary deployment is autopaused
MaxSlowStartDuration *metav1.Duration `json:"maxSlowStartDuration,omitempty"`
}

// ExtendedDaemonSetSpecStrategyCanaryAutoFail defines the canary deployment AutoFail parameters of the ExtendedDaemonSet
Expand Down Expand Up @@ -126,6 +129,8 @@ const (
ExtendedDaemonSetStatusReasonOOM ExtendedDaemonSetStatusReason = "OOMKilled"
// ExtendedDaemonSetStatusRestartsTimeoutExceeded represents timeout on restarts as the reason for the ExtendedDaemonSet status
ExtendedDaemonSetStatusRestartsTimeoutExceeded ExtendedDaemonSetStatusReason = "RestartsTimeoutExceeded"
// ExtendedDaemonSetStatusSlowStartTimeoutExceeded represents timeout on slow starts as the reason for the ExtendedDaemonSet status
ExtendedDaemonSetStatusSlowStartTimeoutExceeded ExtendedDaemonSetStatusReason = "SlowStartTimeoutExceeded"
// ExtendedDaemonSetStatusReasonUnknown represents an Unknown reason for the status state
ExtendedDaemonSetStatusReasonUnknown ExtendedDaemonSetStatusReason = "Unknown"
)
Expand Down
2 changes: 2 additions & 0 deletions api/v1alpha1/extendeddaemonsetreplicaset_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ const (
ConditionTypePodDeletion ExtendedDaemonSetReplicaSetConditionType = "PodDeletion"
// ConditionTypePodRestarting Pod(s) restarting condition
ConditionTypePodRestarting ExtendedDaemonSetReplicaSetConditionType = "PodRestarting"
// ConditionTypePodCannotStart Pod(s) cannot start condition
ConditionTypePodCannotStart ExtendedDaemonSetReplicaSetConditionType = "PodCannotStart"
// ConditionTypeLastFullSync last time the ExtendedDaemonSetReplicaSet sync when to the end of the reconcile function
ConditionTypeLastFullSync ExtendedDaemonSetReplicaSetConditionType = "LastFullSync"
)
Expand Down
5 changes: 5 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions api/v1alpha1/zz_generated.openapi.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions config/crd/bases/v1/datadoghq.com_extendeddaemonsets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,11 @@ spec:
deployment is autopaused
format: int32
type: integer
maxSlowStartDuration:
description: MaxSlowStartDuration defines the maximum
slow start duration for a pod (stuck in Creating state)
after which the Canary deployment is autopaused
type: string
type: object
duration:
type: string
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,11 @@ spec:
is autopaused
format: int32
type: integer
maxSlowStartDuration:
description: MaxSlowStartDuration defines the maximum slow
start duration for a pod (stuck in Creating state) after
which the Canary deployment is autopaused
type: string
type: object
duration:
type: string
Expand Down
15 changes: 12 additions & 3 deletions controllers/extendeddaemonset/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ package extendeddaemonset
import (
"context"
"fmt"
"math/rand"
"strings"
"sync"
"time"
Expand Down Expand Up @@ -64,9 +65,11 @@ func NewReconciler(options ReconcilerOptions, client client.Client, scheme *runt
// The Controller will requeue the Request to be processed again if the returned error is non-nil or
// Result.Requeue is true, otherwise upon completion it will remove the work from the queue.
func (r *Reconciler) Reconcile(ctx context.Context, request reconcile.Request) (reconcile.Result, error) {
reqLogger := r.log.WithValues("Request.Namespace", request.Namespace, "Request.Name", request.Name)
reqLogger.Info("Reconciling ExtendedDaemonSet")
now := time.Now()
rand := rand.Uint32()
reqLogger := r.log.WithValues("Req.NS", request.Namespace, "Req.Name", request.Name, "Req.TS", now.Unix(), "Req.Rand", rand)
reqLogger.Info("Reconciling ExtendedDaemonSet")

// Fetch the ExtendedDaemonSet instance
instance := &datadoghqv1alpha1.ExtendedDaemonSet{}
err := r.client.Get(context.TODO(), request.NamespacedName, instance)
Expand All @@ -86,7 +89,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, request reconcile.Request) (
defaultedInstance := datadoghqv1alpha1.DefaultExtendedDaemonSet(instance)
err = r.client.Update(context.TODO(), defaultedInstance)
if err != nil {
reqLogger.Error(err, "failed to update ExtendedDaemonSet")
reqLogger.Error(err, "Failed to update ExtendedDaemonSet")
return reconcile.Result{}, err
}
// ExtendedDaemonSet is now defaulted return and requeue
Expand Down Expand Up @@ -269,19 +272,25 @@ func (r *Reconciler) updateInstanceWithCurrentRS(logger logr.Logger, daemonset *
if updateDaemonsetSpec {
// Make and use a copy because undesired behaviors occur when making two update calls
newDaemonsetCopy := newDaemonset.DeepCopy()
logger.Info("Updating ExtendedDaemonSet")
if err := r.client.Update(context.TODO(), newDaemonsetCopy); err != nil {
logger.Error(err, "Failed to update ExtendedDaemonSet")
return newDaemonsetCopy, reconcile.Result{}, err
}

// This ensures that the first client update respects the desired new status
newDaemonsetCopy.Status = newDaemonset.Status
logger.Info("Updating ExtendedDaemonSet status")
if err := r.client.Status().Update(context.TODO(), newDaemonsetCopy); err != nil {
logger.Error(err, "Failed to update ExtendedDaemonSet status")
return newDaemonsetCopy, reconcile.Result{}, err
}
return newDaemonsetCopy, reconcile.Result{}, nil
}

logger.Info("Updating ExtendedDaemonSet status")
if err := r.client.Status().Update(context.TODO(), newDaemonset); err != nil {
logger.Error(err, "Failed to update ExtendedDaemonSet status")
return newDaemonset, reconcile.Result{}, err
}
return newDaemonset, reconcile.Result{}, nil
Expand Down
7 changes: 1 addition & 6 deletions controllers/extendeddaemonset/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,7 @@ func IsCanaryDeploymentPaused(dsAnnotations map[string]string) (bool, datadoghqv
isPaused, found := dsAnnotations[datadoghqv1alpha1.ExtendedDaemonSetCanaryPausedAnnotationKey]
if found && isPaused == "true" { //nolint:goconst
if reason, found := dsAnnotations[datadoghqv1alpha1.ExtendedDaemonSetCanaryPausedReasonAnnotationKey]; found {
switch reason {
case
string(datadoghqv1alpha1.ExtendedDaemonSetStatusReasonCLB),
string(datadoghqv1alpha1.ExtendedDaemonSetStatusReasonOOM):
return true, datadoghqv1alpha1.ExtendedDaemonSetStatusReason(reason)
}
return true, datadoghqv1alpha1.ExtendedDaemonSetStatusReason(reason)
}
return true, datadoghqv1alpha1.ExtendedDaemonSetStatusReasonUnknown
}
Expand Down
6 changes: 3 additions & 3 deletions controllers/extendeddaemonset/utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,15 @@ func TestIsCanaryDeploymentPaused(t *testing.T) {
wantReason: datadoghqv1alpha1.ExtendedDaemonSetStatusReasonUnknown,
},
{
name: "pause annotation is `true` and has an unsupported reason, expect true and `unknown` reason",
name: "pause annotation is `true` and has a not very well known reason reason, expect true and actual reason",
args: args{
dsAnnotations: map[string]string{
"extendeddaemonset.datadoghq.com/canary-paused": "true",
"extendeddaemonset.datadoghq.com/canary-paused-reason": "SomeUnsupportedReason",
"extendeddaemonset.datadoghq.com/canary-paused-reason": "SomeOddReason",
},
},
want: true,
wantReason: datadoghqv1alpha1.ExtendedDaemonSetStatusReasonUnknown,
wantReason: datadoghqv1alpha1.ExtendedDaemonSetStatusReason("SomeOddReason"),
},
}
for _, tt := range tests {
Expand Down
Loading

0 comments on commit efa4090

Please sign in to comment.