Skip to content

Commit

Permalink
persist manual unpause until end of a canary deployment, and then res…
Browse files Browse the repository at this point in the history
…et (#77)

* persist manual unpause until end of a canary deployment, and then reset

* fix clearCanaryAnnotations, update from feedback
  • Loading branch information
celenechang authored and clamoriniere committed Jan 21, 2021
1 parent 99ef906 commit a38c7ac
Show file tree
Hide file tree
Showing 10 changed files with 99 additions and 31 deletions.
7 changes: 7 additions & 0 deletions api/v1alpha1/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ const (
ExtendedDaemonSetCanaryPausedAnnotationKey = "extendeddaemonset.datadoghq.com/canary-paused"
// ExtendedDaemonSetCanaryPausedReasonAnnotationKey annotation key used on ExtendedDaemonset to provide a reason that the a canary deployment is paused.
ExtendedDaemonSetCanaryPausedReasonAnnotationKey = "extendeddaemonset.datadoghq.com/canary-paused-reason"
// ExtendedDaemonSetCanaryUnpausedAnnotationKey annotation key used on ExtendedDaemonset in order to detect if a canary deployment is manually unpaused.
ExtendedDaemonSetCanaryUnpausedAnnotationKey = "extendeddaemonset.datadoghq.com/canary-unpaused"
// ExtendedDaemonSetCanaryFailedAnnotationKey annotation key used on ExtendedDaemonset in order to detect if a canary deployment has failed.
ExtendedDaemonSetCanaryFailedAnnotationKey = "extendeddaemonset.datadoghq.com/canary-failed"
// ExtendedDaemonSetCanaryFailedReasonAnnotationKey annotation key used on ExtendedDaemonset in order to provide a reason that the a canary deployment is failed.
Expand All @@ -34,4 +36,9 @@ const (
ExtendedDaemonSetRessourceNodeAnnotationKey = "resources.extendeddaemonset.datadoghq.com/%s.%s.%s"
// MD5NodeExtendedDaemonSetAnnotationKey annotation key use on Pods in order to identify which Node Resources Overwride have been used to generate it.
MD5NodeExtendedDaemonSetAnnotationKey = "extendeddaemonset.datadoghq.com/nodehash"

// ValueStringTrue is the string value of bool `true`
ValueStringTrue = "true"
// ValueStringFalse is the string value of bool `false`
ValueStringFalse = "false"
)
27 changes: 20 additions & 7 deletions controllers/extendeddaemonset/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ func (r *Reconciler) updateInstanceWithCurrentRS(logger logr.Logger, now time.Ti
metaNow := metav1.NewTime(now)

var updateDaemonsetSpec bool
var updateDaemonsetAnnotations bool
// If the deployment is in Canary phase, then update status (and spec as needed)
if daemonset.Spec.Strategy.Canary != nil {

Expand Down Expand Up @@ -252,7 +253,7 @@ func (r *Reconciler) updateInstanceWithCurrentRS(logger logr.Logger, now time.Ti
}
} else {
// if the Canary Deployment is not active anymore remove the canary annotations
clearCanaryAnnotations(newDaemonset)
updateDaemonsetAnnotations = clearCanaryAnnotations(newDaemonset)
}
}

Expand All @@ -268,10 +269,11 @@ func (r *Reconciler) updateInstanceWithCurrentRS(logger logr.Logger, now time.Ti
return extendedDaemonsetCopy, reconcile.Result{}, fmt.Errorf("failed to update ExtendedDaemonSet status, %w", err)
}

if updateDaemonsetSpec {
if updateDaemonsetSpec || updateDaemonsetAnnotations {
// we use the `extendedDaemonsetCopy` instance to have last version. that contains the latest metadata info (resource version)
// Copy the spec part into the extendedDaemonsetCopy.
extendedDaemonsetCopy.Spec = *newDaemonset.Spec.DeepCopy()
extendedDaemonsetCopy.Annotations = newDaemonset.Annotations
// In case of canaryFailed, we also update the ExtendedDaemonset.Spec
logger.Info("Updating ExtendedDaemonSet.Spec")
if err := r.client.Update(context.TODO(), extendedDaemonsetCopy); err != nil {
Expand Down Expand Up @@ -566,11 +568,22 @@ func (r *Reconciler) cleanupReplicaSet(logger logr.Logger, rsList *datadoghqv1al
return utilserrors.NewAggregate(errs)
}

func clearCanaryAnnotations(eds *datadoghqv1alpha1.ExtendedDaemonSet) {
delete(eds.Annotations, datadoghqv1alpha1.ExtendedDaemonSetCanaryPausedAnnotationKey)
delete(eds.Annotations, datadoghqv1alpha1.ExtendedDaemonSetCanaryPausedReasonAnnotationKey)
delete(eds.Annotations, datadoghqv1alpha1.ExtendedDaemonSetCanaryFailedAnnotationKey)
delete(eds.Annotations, datadoghqv1alpha1.ExtendedDaemonSetCanaryFailedReasonAnnotationKey)
func clearCanaryAnnotations(eds *datadoghqv1alpha1.ExtendedDaemonSet) bool {
keysToDelete := []string{
datadoghqv1alpha1.ExtendedDaemonSetCanaryPausedAnnotationKey,
datadoghqv1alpha1.ExtendedDaemonSetCanaryPausedReasonAnnotationKey,
datadoghqv1alpha1.ExtendedDaemonSetCanaryUnpausedAnnotationKey,
datadoghqv1alpha1.ExtendedDaemonSetCanaryFailedAnnotationKey,
datadoghqv1alpha1.ExtendedDaemonSetCanaryFailedReasonAnnotationKey,
}
var updated bool
for _, key := range keysToDelete {
if _, ok := eds.Annotations[key]; ok {
delete(eds.Annotations, key)
updated = true
}
}
return updated
}

type podsCounterType struct {
Expand Down
4 changes: 2 additions & 2 deletions controllers/extendeddaemonset/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -915,8 +915,8 @@ func TestReconcileExtendedDaemonSet_updateInstanceWithCurrentRS(t *testing.T) {
t.Errorf("ReconcileExtendedDaemonSet.updateInstanceWithCurrentRS() error = %v, wantErr %v", err, tt.wantErr)
return
}
assert.Equal(t, tt.want, got, "econcileExtendedDaemonSet.updateInstanceWithCurrentRS()")
assert.Equal(t, tt.wantResult, got1, "econcileExtendedDaemonSet.updateInstanceWithCurrentRS().result")
assert.Equal(t, tt.want, got, "ReconcileExtendedDaemonSet.updateInstanceWithCurrentRS()")
assert.Equal(t, tt.wantResult, got1, "ReconcileExtendedDaemonSet.updateInstanceWithCurrentRS().result")
})
}
}
Expand Down
14 changes: 11 additions & 3 deletions controllers/extendeddaemonset/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ func IsCanaryDeploymentEnded(specCanary *datadoghqv1alpha1.ExtendedDaemonSetSpec

// IsCanaryDeploymentPaused checks if the Canary deployment has been paused
func IsCanaryDeploymentPaused(dsAnnotations map[string]string, ers *datadoghqv1alpha1.ExtendedDaemonSetReplicaSet) (bool, datadoghqv1alpha1.ExtendedDaemonSetStatusReason) {

// check ERS status to detect if a Canary paused
if ers != nil && conditions.IsConditionTrue(&ers.Status, datadoghqv1alpha1.ConditionTypeCanaryPaused) {
cond := conditions.GetExtendedDaemonSetReplicaSetStatusCondition(&ers.Status, datadoghqv1alpha1.ConditionTypeCanaryPaused)
Expand All @@ -70,7 +69,7 @@ func IsCanaryDeploymentPaused(dsAnnotations map[string]string, ers *datadoghqv1a

// Check annotations is a user have added the pause annotation.
isPaused, found := dsAnnotations[datadoghqv1alpha1.ExtendedDaemonSetCanaryPausedAnnotationKey]
if found && isPaused == "true" { //nolint:goconst
if found && isPaused == v1alpha1.ValueStringTrue {
if reason, found := dsAnnotations[datadoghqv1alpha1.ExtendedDaemonSetCanaryPausedReasonAnnotationKey]; found {
return true, datadoghqv1alpha1.ExtendedDaemonSetStatusReason(reason)
}
Expand All @@ -79,6 +78,15 @@ func IsCanaryDeploymentPaused(dsAnnotations map[string]string, ers *datadoghqv1a
return false, ""
}

// IsCanaryDeploymentUnpaused checks if the Canary deployment has been manually unpaused
func IsCanaryDeploymentUnpaused(dsAnnotations map[string]string) bool {
isUnpaused, found := dsAnnotations[datadoghqv1alpha1.ExtendedDaemonSetCanaryUnpausedAnnotationKey]
if found {
return isUnpaused == v1alpha1.ValueStringTrue
}
return false
}

// IsCanaryDeploymentValid used to know if the Canary deployment has been declared
// valid even if its duration has not finished yet.
// If the ExtendedDaemonSet has the corresponding annotation: return true
Expand All @@ -98,7 +106,7 @@ func IsCanaryDeploymentFailed(dsAnnotations map[string]string, ers *datadoghqv1a

// Check also failed annotations if a user wanted to force a canary failure
if value, found := dsAnnotations[datadoghqv1alpha1.ExtendedDaemonSetCanaryFailedAnnotationKey]; found {
return value == "true"
return value == v1alpha1.ValueStringTrue
}
return false
}
Expand Down
34 changes: 25 additions & 9 deletions controllers/extendeddaemonset_e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ func ginkgoLog(format string, a ...interface{}) {
fmt.Fprintf(GinkgoWriter, format, a...)
}

// These tests may take several minutes to run, check you go test timeout
// These tests may take several minutes to run, check your go test timeout
var _ = Describe("ExtendedDaemonSet e2e updates and recovery", func() {
Context("Initial deployment", func() {
name := "eds-fail"
Expand Down Expand Up @@ -200,7 +200,7 @@ var _ = Describe("ExtendedDaemonSet e2e updates and recovery", func() {
Expect(k8sClient.Get(ctx, key, eds)).Should(Succeed())
info("EDS status:\n%s\n", spew.Sdump(eds.Status))

clearCanaryAnnotations(eds)
_ = clearCanaryAnnotations(eds)

eds.Spec.Template.Spec.Containers[0].Image = fmt.Sprintf("gcr.io/google-containers/alpine-with-bash:1.0")
eds.Spec.Template.Spec.Containers[0].Command = []string{
Expand All @@ -224,6 +224,11 @@ var _ = Describe("ExtendedDaemonSet e2e updates and recovery", func() {
Eventually(withEDS(key, eds, func() bool {
return eds.Status.State == datadoghqv1alpha1.ExtendedDaemonSetStatusStateRunning
}), timeout, interval).Should(BeTrue())

Eventually(withEDS(key, eds, func() bool {
_, found := eds.Annotations[datadoghqv1alpha1.ExtendedDaemonSetCanaryFailedAnnotationKey]
return found
}), timeout, interval).Should(BeFalse())
})

It("Should delete EDS", func() {
Expand Down Expand Up @@ -448,7 +453,7 @@ var _ = Describe("ExtendedDaemonSet e2e PodCannotStart condition", func() {
})
})

// These tests may take several minutes to run, check you go test timeout
// These tests may take several minutes to run, check your go test timeout
var _ = Describe("ExtendedDaemonSet e2e successful canary deployment update", func() {
Context("Initial deployment", func() {
name := fmt.Sprintf("eds-foo-%d", time.Now().Unix())
Expand Down Expand Up @@ -581,7 +586,7 @@ var _ = Describe("ExtendedDaemonSet e2e successful canary deployment update", fu

})

// This test may take ~30s to run, check you go test timeout
// This test may take ~30s to run, check your go test timeout
var _ = Describe("ExtendedDaemonSet Controller", func() {
const timeout = time.Second * 30
const longTimeout = time.Second * 120
Expand Down Expand Up @@ -725,11 +730,22 @@ func withList(listOptions []client.ListOption, obj runtime.Object, desc string,
}
}

func clearCanaryAnnotations(eds *datadoghqv1alpha1.ExtendedDaemonSet) {
delete(eds.Annotations, datadoghqv1alpha1.ExtendedDaemonSetCanaryPausedAnnotationKey)
delete(eds.Annotations, datadoghqv1alpha1.ExtendedDaemonSetCanaryPausedReasonAnnotationKey)
delete(eds.Annotations, datadoghqv1alpha1.ExtendedDaemonSetCanaryFailedAnnotationKey)
delete(eds.Annotations, datadoghqv1alpha1.ExtendedDaemonSetCanaryFailedReasonAnnotationKey)
func clearCanaryAnnotations(eds *datadoghqv1alpha1.ExtendedDaemonSet) bool {
keysToDelete := []string{
datadoghqv1alpha1.ExtendedDaemonSetCanaryPausedAnnotationKey,
datadoghqv1alpha1.ExtendedDaemonSetCanaryPausedReasonAnnotationKey,
datadoghqv1alpha1.ExtendedDaemonSetCanaryUnpausedAnnotationKey,
datadoghqv1alpha1.ExtendedDaemonSetCanaryFailedAnnotationKey,
datadoghqv1alpha1.ExtendedDaemonSetCanaryFailedReasonAnnotationKey,
}
var updated bool
for _, key := range keysToDelete {
if _, ok := eds.Annotations[key]; ok {
delete(eds.Annotations, key)
updated = true
}
}
return updated
}

func deleteEDS(k8sclient client.Client, key types.NamespacedName) condFn {
Expand Down
10 changes: 8 additions & 2 deletions controllers/extendeddaemonsetreplicaset/strategy/canary.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ func manageCanaryStatus(annotations map[string]string, params *Parameters, now t

result.IsFailed = eds.IsCanaryDeploymentFailed(annotations, params.Replicaset)
result.IsPaused, _ = eds.IsCanaryDeploymentPaused(annotations, params.Replicaset)
result.IsUnpaused = eds.IsCanaryDeploymentUnpaused(annotations)

var (
metaNow = metav1.NewTime(now)
Expand Down Expand Up @@ -185,7 +186,8 @@ func manageCanaryPodFailures(pods []*v1.Pod, params *Parameters, result *Result,
continue
}

if autoFailEnabled && restartCount > autoFailMaxRestarts {
switch {
case autoFailEnabled && restartCount > autoFailMaxRestarts:
result.IsFailed = true
result.FailedReason = highRestartReason
params.Logger.Info(
Expand All @@ -194,7 +196,11 @@ func manageCanaryPodFailures(pods []*v1.Pod, params *Parameters, result *Result,
"MaxRestarts", autoFailMaxRestarts,
"Reason", highRestartReason,
)
} else if !result.IsPaused && autoPauseEnabled {
case result.IsUnpaused:
// Unpausing is a manual action and takes precedence
result.IsPaused = false
result.PausedReason = ""
case !result.IsPaused && autoPauseEnabled:
// Handle cases related to failure to start states
if cannotStart {
result.IsPaused = true
Expand Down
2 changes: 2 additions & 0 deletions controllers/extendeddaemonsetreplicaset/strategy/type.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ type Result struct {
IsPaused bool
// PausedReason provides the reason for the paused deployment
PausedReason datadoghqv1alpha1.ExtendedDaemonSetStatusReason
// IsUnpaused represents if the deployment was manually unpaused
IsUnpaused bool

// IsFailed represents failed state of the deployment
IsFailed bool
Expand Down
2 changes: 2 additions & 0 deletions examples/foo-eds_v1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ spec:
canary:
replicas: 1
duration: 5m
autoFail:
enabled: false
rollingUpdate:
maxParallelPodCreation: 1
slowStartIntervalDuration: 1m
Expand Down
9 changes: 6 additions & 3 deletions pkg/plugin/canary/fail.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,16 +133,19 @@ func (o *failOptions) run() error {
}

if eds.Spec.Strategy.Canary == nil {
return fmt.Errorf("the ExtendedDaemonset does not have a canary")
return fmt.Errorf("the ExtendedDaemonset does not have a canary strategy")
}
if eds.Status.Canary == nil {
return fmt.Errorf("the ExtendedDaemonset does not have an active canary deployment")
}

newEds := eds.DeepCopy()
if newEds.Annotations == nil {
newEds.Annotations = make(map[string]string)
} else if isFailed, ok := newEds.Annotations[v1alpha1.ExtendedDaemonSetCanaryFailedAnnotationKey]; ok {
if o.failStatus && isFailed == "true" {
if o.failStatus && isFailed == v1alpha1.ValueStringTrue {
return fmt.Errorf("canary deployment already failed")
} else if !o.failStatus && isFailed == "false" {
} else if !o.failStatus && isFailed == v1alpha1.ValueStringFalse {
return fmt.Errorf("canary deployment already reset")
}
}
Expand Down
21 changes: 16 additions & 5 deletions pkg/plugin/canary/pause.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,21 +161,32 @@ func (o *pauseOptions) run() error {
}

if eds.Spec.Strategy.Canary == nil {
return fmt.Errorf("the ExtendedDaemonset does not have a canary")
return fmt.Errorf("the ExtendedDaemonset does not have a canary strategy")
}
if eds.Status.Canary == nil {
return fmt.Errorf("the ExtendedDaemonset does not have an active canary deployment")
}

newEds := eds.DeepCopy()

if newEds.Annotations == nil {
newEds.Annotations = make(map[string]string)
} else if isPaused, ok := newEds.Annotations[v1alpha1.ExtendedDaemonSetCanaryPausedAnnotationKey]; ok {
if o.pauseStatus && isPaused == "true" {
if o.pauseStatus && isPaused == v1alpha1.ValueStringTrue {
return fmt.Errorf("canary deployment already paused")
} else if !o.pauseStatus && isPaused == "false" {
return fmt.Errorf("canary deployment already unpaused")
} else if !o.pauseStatus && isPaused == v1alpha1.ValueStringFalse {
return fmt.Errorf("canary deployment is not paused; cannot unpause")
}
}
newEds.Annotations[v1alpha1.ExtendedDaemonSetCanaryPausedAnnotationKey] = fmt.Sprintf("%v", o.pauseStatus)
// Set appropriate annotation depending on whether cmd is to pause or unpause
if o.pauseStatus {
newEds.Annotations[v1alpha1.ExtendedDaemonSetCanaryPausedAnnotationKey] = v1alpha1.ValueStringTrue
// Set to false in case it was previously true
newEds.Annotations[v1alpha1.ExtendedDaemonSetCanaryUnpausedAnnotationKey] = v1alpha1.ValueStringFalse
} else {
newEds.Annotations[v1alpha1.ExtendedDaemonSetCanaryPausedAnnotationKey] = v1alpha1.ValueStringFalse
newEds.Annotations[v1alpha1.ExtendedDaemonSetCanaryUnpausedAnnotationKey] = v1alpha1.ValueStringTrue
}

patch := client.MergeFrom(eds)
if err = o.client.Patch(context.TODO(), newEds, patch); err != nil {
Expand Down

0 comments on commit a38c7ac

Please sign in to comment.