/
upgrade.go
270 lines (220 loc) · 8.14 KB
/
upgrade.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2016-present Datadog, Inc.
package upgrade
import (
"context"
"fmt"
"os"
"strconv"
"time"
"github.com/spf13/cobra"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/cli-runtime/pkg/genericclioptions"
"sigs.k8s.io/controller-runtime/pkg/client"
commonv1 "github.com/DataDog/datadog-operator/apis/datadoghq/common/v1"
"github.com/DataDog/datadog-operator/apis/datadoghq/v1alpha1"
"github.com/DataDog/datadog-operator/apis/datadoghq/v2alpha1"
"github.com/DataDog/datadog-operator/pkg/plugin/common"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// Options holds the settings used by the upgrade command, which waits for the
// rolling-update of a DatadogAgent's components to finish.
type Options struct {
genericclioptions.IOStreams
common.Options
// args holds the raw positional arguments recorded by Complete.
args []string
// datadogAgentName is the name of the DatadogAgent resource to watch; it is
// taken from the first positional argument and is required by Validate.
datadogAgentName string
// checkPeriod is the polling interval handed to wait.Poll in Run.
checkPeriod time.Duration
// checkTimeout is the overall polling timeout handed to wait.Poll in Run.
checkTimeout time.Duration
// agentCompletionPct is a ratio (NewOptions divides the environment value
// by 100); isAgentDone reports completion once the up-to-date pod count
// exceeds Current multiplied by this ratio.
agentCompletionPct float64
// agentCompletionMin is the number of not-yet-updated agent pods at or
// below which isAgentDone also reports completion.
agentCompletionMin int32
// dcaMinUpToDate is the minimum number of updated replicas required for the
// Cluster Agent to be considered done.
dcaMinUpToDate int32
// clcMinUpToDate is the minimum number of updated replicas required for the
// Cluster Check Runner to be considered done.
clcMinUpToDate int32
}
// NewOptions provides an instance of Options with default values.
//
// Defaults can be overridden through environment variables:
//   - AGENT_COMPLETION_PCT: a percentage (e.g. "95"), stored as a ratio (value / 100).
//   - AGENT_COMPLETION_MIN, DCA_MIN_UP_TO_DATE, CLC_MIN_UP_TO_DATE: base-10 32-bit integers.
//   - CHECK_TIMEOUT_MINUTES: base-10 32-bit integer, interpreted as minutes.
//
// A value that cannot be parsed leaves the corresponding default untouched and
// a warning is written to streams.ErrOut (when it is set) so that the
// misconfiguration does not go unnoticed.
func NewOptions(streams genericclioptions.IOStreams) *Options {
	opts := &Options{
		IOStreams:          streams,
		checkPeriod:        30 * time.Second,
		checkTimeout:       2 * time.Hour,
		agentCompletionPct: 0.95,
		agentCompletionMin: 10,
		dcaMinUpToDate:     1,
		clcMinUpToDate:     2,
	}
	opts.SetConfigFlags()

	// warnInvalid reports an override that was ignored. ErrOut may be nil when
	// the caller passes zero-value IOStreams, so guard against it.
	warnInvalid := func(name, val string, err error) {
		if streams.ErrOut == nil {
			return
		}
		_, _ = fmt.Fprintf(streams.ErrOut, "ignoring invalid value %q for environment variable %s: %v\n", val, name, err)
	}

	// overrideInt32 parses the named variable as an int32 and, on success,
	// hands the value to set.
	overrideInt32 := func(name string, set func(int32)) {
		val, found := os.LookupEnv(name)
		if !found {
			return
		}
		parsed, err := strconv.ParseInt(val, 10, 32)
		if err != nil {
			warnInvalid(name, val, err)
			return
		}
		set(int32(parsed))
	}

	if val, found := os.LookupEnv("AGENT_COMPLETION_PCT"); found {
		if pct, err := strconv.ParseFloat(val, 64); err != nil {
			warnInvalid("AGENT_COMPLETION_PCT", val, err)
		} else {
			// The variable is expressed as a percentage; the option is a ratio.
			opts.agentCompletionPct = pct / 100
		}
	}
	overrideInt32("AGENT_COMPLETION_MIN", func(v int32) { opts.agentCompletionMin = v })
	overrideInt32("DCA_MIN_UP_TO_DATE", func(v int32) { opts.dcaMinUpToDate = v })
	overrideInt32("CLC_MIN_UP_TO_DATE", func(v int32) { opts.clcMinUpToDate = v })
	overrideInt32("CHECK_TIMEOUT_MINUTES", func(v int32) { opts.checkTimeout = time.Duration(v) * time.Minute })

	return opts
}
// NewCmdUpgrade builds the "upgrade" cobra command. When executed, the command
// completes the options from its arguments, validates them and then runs the
// rolling-update check.
func NewCmdUpgrade(streams genericclioptions.IOStreams) *cobra.Command {
	opts := NewOptions(streams)

	// runE chains Complete, Validate and Run, stopping at the first error.
	runE := func(c *cobra.Command, args []string) error {
		err := opts.Complete(c, args)
		if err == nil {
			err = opts.Validate()
		}
		if err != nil {
			return err
		}
		return opts.Run()
	}

	upgradeCmd := &cobra.Command{
		Use:          "upgrade [DatadogAgent name]",
		Short:        "Wait until the rolling-update of all agent components is finished",
		Example:      "./check-operator upgrade datadog-agent",
		SilenceUsage: true,
		RunE:         runE,
	}
	opts.ConfigFlags.AddFlags(upgradeCmd.Flags())

	return upgradeCmd
}
// Complete records the positional arguments, takes the first one (if any) as
// the DatadogAgent name, and then initializes the embedded common options.
func (o *Options) Complete(cmd *cobra.Command, args []string) error {
	o.args = args
	if len(o.args) >= 1 {
		o.datadogAgentName = o.args[0]
	}

	return o.Init(cmd)
}
// Validate returns an error when no DatadogAgent name has been provided.
func (o *Options) Validate() error {
	if o.datadogAgentName != "" {
		return nil
	}

	return fmt.Errorf("the DatadogAgent name is required")
}
// getV1Status fetches the v1alpha1 DatadogAgent named by the options and wraps
// it in a StatusWrapper. A not-found error is returned as-is so that callers
// can detect it; any other error is wrapped with context.
func (o *Options) getV1Status() (common.StatusWrapper, error) {
	key := client.ObjectKey{Namespace: o.UserNamespace, Name: o.datadogAgentName}
	dda := &v1alpha1.DatadogAgent{}

	switch err := o.Client.Get(context.TODO(), key, dda); {
	case err == nil:
		return common.NewV1StatusWrapper(dda), nil
	case errors.IsNotFound(err):
		return nil, err
	default:
		return nil, fmt.Errorf("unable to get DatadogAgent, err: %w", err)
	}
}
// getV2Status fetches the v2alpha1 DatadogAgent named by the options and wraps
// it in a StatusWrapper. A not-found error is returned as-is so that callers
// can detect it; any other error is wrapped with context.
func (o *Options) getV2Status() (common.StatusWrapper, error) {
	key := client.ObjectKey{Namespace: o.UserNamespace, Name: o.datadogAgentName}
	dda := &v2alpha1.DatadogAgent{}

	switch err := o.Client.Get(context.TODO(), key, dda); {
	case err == nil:
		return common.NewV2StatusWrapper(dda), nil
	case errors.IsNotFound(err):
		return nil, err
	default:
		return nil, fmt.Errorf("unable to get DatadogAgent, err: %w", err)
	}
}
// isReconcileError reports whether any condition signals a failed
// reconciliation: either "DatadogAgentReconcileError" is true, or one of the
// per-component reconcile conditions is false.
func isReconcileError(conditions []metav1.Condition) bool {
	for i := range conditions {
		cond := &conditions[i]
		switch cond.Type {
		case "DatadogAgentReconcileError":
			if cond.Status == metav1.ConditionTrue {
				return true
			}
		case "AgentReconcile", "ClusterAgentReconcile", "ClusterChecksRunnerReconcile":
			if cond.Status == metav1.ConditionFalse {
				return true
			}
		}
	}

	return false
}
// Run polls the DatadogAgent status every checkPeriod, for at most
// checkTimeout, until the Agent, Cluster Agent and Cluster Check Runner are
// all considered upgraded. It stops early with an error when the status
// cannot be read or reports a reconcile error, and stops successfully when
// the DatadogAgent resource is not found.
func (o *Options) Run() error {
	o.printOutf("Start checking rolling-update status")

	// Once a component has been seen as done it is not evaluated again.
	var agentDone, dcaDone, clcDone bool

	poll := func() (bool, error) {
		v2Available, err := common.IsV2Available(o.Clientset)
		if err != nil {
			return false, fmt.Errorf("unable to detect if CRD v2 is available, err:%w", err)
		}

		var status common.StatusWrapper
		if !v2Available {
			o.printOutf("Only v1alpha1 is available")
			status, err = o.getV1Status()
		} else {
			o.printOutf("v2alpha1 is available")
			status, err = o.getV2Status()
		}

		switch {
		case errors.IsNotFound(err):
			o.printOutf("Got a not found error while getting %s/%s. Assuming this DatadogAgent CR has never been deployed in this environment", o.UserNamespace, o.datadogAgentName)
			return true, nil
		case err != nil:
			return false, fmt.Errorf("unable to get the DatadogAgent.status, err:%w", err)
		}

		if isReconcileError(status.GetStatusCondition()) {
			return false, fmt.Errorf("got reconcile error")
		}

		// Short-circuit evaluation keeps already-finished components from
		// being checked (and logged) again.
		agentDone = agentDone || o.isAgentDone(status.GetAgentStatus())
		dcaDone = dcaDone || o.isDeploymentDone(status.GetClusterAgentStatus(), o.dcaMinUpToDate, "Cluster Agent")
		clcDone = clcDone || o.isDeploymentDone(status.GetClusterChecksRunnerStatus(), o.clcMinUpToDate, "Cluster Check Runner")

		if agentDone && dcaDone && clcDone {
			return true, nil
		}

		o.printOutf("One or multiple components are still upgrading...")
		if agent := status.GetAgentStatus(); agent != nil {
			o.printOutf("[Agent] nb pods: %d, nb updated pods: %d", agent.Current, agent.UpToDate)
		}
		if dca := status.GetClusterAgentStatus(); dca != nil {
			o.printOutf("[Cluster Agent] nb pods: %d, nb updated pods: %d", dca.Replicas, dca.UpdatedReplicas)
		}
		if clc := status.GetClusterChecksRunnerStatus(); clc != nil {
			o.printOutf("[Cluster Check Runner] nb pods: %d, nb updated pods: %d", clc.Replicas, clc.UpdatedReplicas)
		}

		return false, nil
	}

	return wait.Poll(o.checkPeriod, o.checkTimeout, poll)
}
// isAgentDone reports whether the agent DaemonSet upgrade can be considered
// finished. A nil status counts as done. Otherwise the upgrade is done when
// the up-to-date pod count exceeds Current*agentCompletionPct, or when at most
// agentCompletionMin pods remain to be updated.
func (o *Options) isAgentDone(status *commonv1.DaemonSetStatus) bool {
	if status == nil {
		return true
	}

	pctReached := float64(status.UpToDate) > float64(status.Current)*o.agentCompletionPct
	remaining := status.Current - status.UpToDate
	if !pctReached && remaining > o.agentCompletionMin {
		return false
	}

	o.printOutf("[Agent] upgrade is now finished (reached threshold): %d, nb updated pods: %d, threshold pct: %f, min threshold: %d", status.Current, status.UpToDate, o.agentCompletionPct, o.agentCompletionMin)
	return true
}
// isDeploymentDone reports whether a Deployment-backed component can be
// considered upgraded. A nil status counts as done. Otherwise the component is
// done once status.UpdatedReplicas is at least minUpToDate; component is the
// label used in the log line emitted when that happens.
func (o *Options) isDeploymentDone(status *commonv1.DeploymentStatus, minUpToDate int32, component string) bool {
	if status == nil {
		return true
	}

	if status.UpdatedReplicas < minUpToDate {
		return false
	}

	// Report the threshold that was actually applied to this component rather
	// than always printing the Cluster Agent one.
	o.printOutf("[%s] upgrade is now finished (reached threshold): %d, nb updated pods: %d, min up-to-date threshold: %d", component, status.Replicas, status.UpdatedReplicas, minUpToDate)
	return true
}
// printOutf writes a formatted line to o.Out, prefixed with the current UTC
// timestamp and the namespace/name of the DatadogAgent. Write errors are
// deliberately ignored: this is best-effort progress output.
func (o *Options) printOutf(format string, a ...interface{}) {
	stamp := time.Now().UTC().Format("2006-01-02T15:04:05.999Z")

	params := make([]interface{}, 0, len(a)+3)
	params = append(params, stamp, o.UserNamespace, o.datadogAgentName)
	params = append(params, a...)

	_, _ = fmt.Fprintf(o.Out, "[%s] DatadogAgent '%s/%s': "+format+"\n", params...)
}