-
Notifications
You must be signed in to change notification settings - Fork 80
/
setup.go
453 lines (406 loc) · 20.8 KB
/
setup.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
/*
* setup.go
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package setup
import (
"flag"
"fmt"
"io"
"io/fs"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"log"
"os"
"path"
"sigs.k8s.io/controller-runtime/pkg/cache"
"strconv"
"strings"
"time"
"github.com/go-logr/logr"
fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/api/v1beta2"
"github.com/FoundationDB/fdb-kubernetes-operator/controllers"
"github.com/FoundationDB/fdb-kubernetes-operator/fdbclient"
"github.com/FoundationDB/fdb-kubernetes-operator/internal"
"gopkg.in/natefinch/lumberjack.v2"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/klog/v2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
"sigs.k8s.io/controller-runtime/pkg/manager"
)
// operatorVersion is the version string that StartManager prints when Options.PrintVersion is set.
// NOTE(review): the "latest" default is presumably overridden at build time (e.g. via -ldflags) — confirm.
var operatorVersion = "latest"
// Options provides all configuration Options for the operator.
// The fields are bound to command line flags by BindFlags; the flag name for each field is given in its comment.
type Options struct {
// EnableLeaderElection enables leader election for the controller manager ("enable-leader-election").
EnableLeaderElection bool
// CleanUpOldLogFile defines if the operator should delete old fdbcli log files ("cleanup-old-cli-logs").
CleanUpOldLogFile bool
// CompressOldFiles defines whether rotated operator log files are compressed using gzip ("compress").
CompressOldFiles bool
// PrintVersion makes the operator print its version and exit ("version").
PrintVersion bool
// EnableRestartIncompatibleProcesses enables restarting incompatible fdbserver processes
// ("enable-restart-incompatible-processes").
EnableRestartIncompatibleProcesses bool
// ServerSideApply enables server side apply ("server-side-apply").
ServerSideApply bool
// EnableRecoveryState enables the use of the recovery state for the minimum uptime between bounces
// if the FDB version supports it ("enable-recovery-state").
EnableRecoveryState bool
// CacheDatabaseStatus defines the default value for caching the database status ("cache-database-status").
CacheDatabaseStatus bool
// EnableNodeIndex is deprecated and not used anymore ("enable-node-index").
EnableNodeIndex bool
// MetricsAddr is the address the metric endpoint binds to ("metrics-addr").
MetricsAddr string
// LeaderElectionID determines the name of the resource that leader election uses for holding the
// leader lock ("leader-election-id").
LeaderElectionID string
// LogFile is the path to a file to write logs to; when empty only stdout is used ("log-file").
LogFile string
// LogFilePermission is the octal file permission for the log file, only used if LogFile is set
// ("log-file-permission").
LogFilePermission string
// LabelSelector defines a label selector that is used to select resources ("label-selector").
LabelSelector string
// ClusterLabelKeyForNodeTrigger is the label key used to trigger a reconciliation if a node resource
// changes ("cluster-label-key-for-node-trigger").
ClusterLabelKeyForNodeTrigger string
// WatchNamespace defines which namespace the operator should watch; the flag defaults to the
// WATCH_NAMESPACE environment variable ("watch-namespace").
WatchNamespace string
// CliTimeout is the timeout for CLI commands in seconds ("cli-timeout").
CliTimeout int
// MaxCliTimeout is the maximum timeout in seconds for CLI commands that are known to be potentially
// slow ("max-cli-timeout").
MaxCliTimeout int
// MaxConcurrentReconciles is the maximum number of concurrent reconciles for all controllers
// ("max-concurrent-reconciles").
MaxConcurrentReconciles int
// LogFileMaxSize is the maximum size in megabytes of the operator log file before it gets rotated
// ("log-file-max-size").
LogFileMaxSize int
// LogFileMaxAge is the maximum age in days to retain old operator log files ("log-file-max-age").
LogFileMaxAge int
// MaxNumberOfOldLogFiles is the maximum number of old operator log files to retain ("max-old-log-files").
MaxNumberOfOldLogFiles int
// MinimumRecoveryTimeForExclusion is the minimum uptime of the cluster before exclusions are allowed
// ("minimum-recovery-time-for-exclusion").
MinimumRecoveryTimeForExclusion float64
// MinimumRecoveryTimeForInclusion is the minimum uptime of the cluster before inclusions are allowed
// ("minimum-recovery-time-for-inclusion").
MinimumRecoveryTimeForInclusion float64
// LogFileMinAge is the minimum age of fdbcli log files before they are removed when CleanUpOldLogFile
// is set; StartManager also uses it as the interval of the cleanup ticker ("log-file-min-age").
LogFileMinAge time.Duration
// GetTimeout is the http timeout for get requests to the FDB sidecar ("get-timeout").
GetTimeout time.Duration
// PostTimeout is the http timeout for post requests to the FDB sidecar ("post-timeout").
PostTimeout time.Duration
// MaintenanceListStaleDuration is the duration after which stale entries are deleted from the
// maintenance list ("maintenance-list-stale-duration").
MaintenanceListStaleDuration time.Duration
// MaintenanceListWaitDuration is the duration for which a process in the maintenance list in a
// different zone is assumed to block the maintenance zone reset ("maintenance-list-wait-duration").
MaintenanceListWaitDuration time.Duration
// LeaseDuration is the duration that non-leader candidates will
// wait to force acquire leadership. This is measured against time of
// last observed ack. Default is 15 seconds.
LeaseDuration time.Duration
// RenewDeadline is the duration that the acting controlplane will retry
// refreshing leadership before giving up. Default is 10 seconds.
RenewDeadline time.Duration
// RetryPeriod is the duration the LeaderElector clients should wait
// between tries of actions. Default is 2 seconds.
RetryPeriod time.Duration
// DeprecationOptions holds the deprecation settings handed to the cluster reconciler; BindFlags binds
// its UseFutureDefaults field to "use-future-defaults".
DeprecationOptions internal.DeprecationOptions
// MinimumRequiredUptimeCCBounce is the minimum required uptime of the cluster before the operator is
// allowed to restart the CC if there is a failed tester process ("minimum-required-uptime-for-cc-bounce").
MinimumRequiredUptimeCCBounce time.Duration
}
// BindFlags registers every operator option as a flag on the given flag set.
// Nothing is parsed here: the fields of o receive their values once the caller
// parses the flag set. The flags are grouped by concern below; the registration
// order has no influence on parsing.
func (o *Options) BindFlags(fs *flag.FlagSet) {
	// General operator behaviour.
	fs.BoolVar(&o.PrintVersion, "version", false,
		"Prints the version of the operator and exits.")
	fs.StringVar(&o.MetricsAddr, "metrics-addr", ":8080",
		"The address the metric endpoint binds to.")
	fs.StringVar(&o.WatchNamespace, "watch-namespace", os.Getenv("WATCH_NAMESPACE"),
		"Defines which namespace the operator should watch.")
	fs.StringVar(&o.LabelSelector, "label-selector", "",
		"Defines a label-selector that will be used to select resources.")
	fs.StringVar(&o.ClusterLabelKeyForNodeTrigger, "cluster-label-key-for-node-trigger", "",
		"The label key to use to trigger a reconciliation if a node resources changes.")
	fs.IntVar(&o.MaxConcurrentReconciles, "max-concurrent-reconciles", 1,
		"Defines the maximum number of concurrent reconciles for all controllers.")
	fs.BoolVar(&o.DeprecationOptions.UseFutureDefaults, "use-future-defaults", false,
		"Apply defaults from the next major version of the operator. This is only intended for use in development.")
	fs.BoolVar(&o.ServerSideApply, "server-side-apply", false,
		"This flag enables server side apply.")
	fs.BoolVar(&o.EnableNodeIndex, "enable-node-index", false,
		"Deprecated, not used anymore. Defines if the operator should add an index for accessing node objects. This requires a ClusterRoleBinding with node access. If the taint feature should be used, this setting should be set to true.")

	// Leader election.
	fs.BoolVar(&o.EnableLeaderElection, "enable-leader-election", true,
		"Enable leader election for controller manager. Enabling this will ensure there is only one active controller manager.")
	fs.StringVar(&o.LeaderElectionID, "leader-election-id", "fdb-kubernetes-operator",
		"LeaderElectionID determines the name of the resource that leader election will use for holding the leader lock.")
	fs.DurationVar(&o.LeaseDuration, "leader-election-lease-duration", 15*time.Second,
		"the duration that non-leader candidates will wait to force acquire leadership.")
	fs.DurationVar(&o.RenewDeadline, "leader-election-renew-deadline", 10*time.Second,
		"the duration that the acting controlplane will retry refreshing leadership before giving up.")
	fs.DurationVar(&o.RetryPeriod, "leader-election-retry-period", 2*time.Second,
		"the duration the LeaderElector clients should wait between tries of action.")

	// Operator log file and its rotation.
	fs.StringVar(&o.LogFile, "log-file", "",
		"The path to a file to write logs to.")
	fs.StringVar(&o.LogFilePermission, "log-file-permission", "0644",
		"The file permission for the log file. Only used if log-file is set. Only the octal representation is supported.")
	fs.IntVar(&o.LogFileMaxSize, "log-file-max-size", 250,
		"Defines the maximum size in megabytes of the operator log file before it gets rotated.")
	fs.IntVar(&o.LogFileMaxAge, "log-file-max-age", 28,
		"Defines the maximum age to retain old operator log file in number of days.")
	fs.IntVar(&o.MaxNumberOfOldLogFiles, "max-old-log-files", 3,
		"Defines the maximum number of old operator log files to retain.")
	fs.BoolVar(&o.CompressOldFiles, "compress", false,
		"Defines whether the rotated log files should be compressed using gzip or not.")

	// fdbcli invocation and its log files.
	fs.IntVar(&o.CliTimeout, "cli-timeout", 10,
		"The timeout to use for CLI commands in seconds.")
	fs.IntVar(&o.MaxCliTimeout, "max-cli-timeout", 40,
		"The maximum timeout to use for CLI commands in seconds. This timeout is used for CLI requests that are known to be potentially slow like get status or exclude.")
	fs.BoolVar(&o.CleanUpOldLogFile, "cleanup-old-cli-logs", true,
		"Defines if the operator should delete old fdbcli log files.")
	fs.DurationVar(&o.LogFileMinAge, "log-file-min-age", 5*time.Minute,
		"Defines the minimum age of fdbcli log files before removing when \"--cleanup-old-cli-logs\" is set.")

	// HTTP timeouts towards the FDB sidecar.
	fs.DurationVar(&o.GetTimeout, "get-timeout", 5*time.Second,
		"http timeout for get requests to the FDB sidecar.")
	fs.DurationVar(&o.PostTimeout, "post-timeout", 10*time.Second,
		"http timeout for post requests to the FDB sidecar.")

	// Cluster reconciliation: maintenance, bounces, recoveries and status caching.
	fs.DurationVar(&o.MaintenanceListStaleDuration, "maintenance-list-stale-duration", 4*time.Hour,
		"the duration after stale entries will be deleted form the maintenance list. Only has an affect if the operator is allowed to reset the maintenance zone.")
	fs.DurationVar(&o.MaintenanceListWaitDuration, "maintenance-list-wait-duration", 5*time.Minute,
		"the duration where a process in the maintenance list in a different zone will be assumed to block the maintenance zone reset. Only has an affect if the operator is allowed to reset the maintenance zone.")
	fs.DurationVar(&o.MinimumRequiredUptimeCCBounce, "minimum-required-uptime-for-cc-bounce", time.Hour,
		"the minimum required uptime of the cluster before allowing the operator to restart the CC if there is a failed tester process.")
	fs.BoolVar(&o.EnableRestartIncompatibleProcesses, "enable-restart-incompatible-processes", true,
		"This flag enables/disables in the operator to restart incompatible fdbserver processes.")
	fs.BoolVar(&o.EnableRecoveryState, "enable-recovery-state", true,
		"This flag enables the use of the recovery state for the minimum uptime between bounced if the FDB version supports it.")
	fs.BoolVar(&o.CacheDatabaseStatus, "cache-database-status", true,
		"Defines the default value for caching the database status.")
	fs.Float64Var(&o.MinimumRecoveryTimeForInclusion, "minimum-recovery-time-for-inclusion", 600,
		"Defines the minimum uptime of the cluster before inclusions are allowed. For clusters after 7.1 this will use the recovery state. This should reduce the risk of frequent recoveries because of inclusions.")
	fs.Float64Var(&o.MinimumRecoveryTimeForExclusion, "minimum-recovery-time-for-exclusion", 120,
		"Defines the minimum uptime of the cluster before exclusions are allowed. For clusters after 7.1 this will use the recovery state. This should reduce the risk of frequent recoveries because of exclusions.")
}
// StartManager will start the FoundationDB operator manager.
// Each reconciler that is not nil will be added to the list of reconcilers.
// For all reconcilers the Client, Recorder and if appropriate the namespace will be set.
//
// The function configures the global loggers (controller-runtime and klog),
// the fdbclient CLI timeouts, the client cache (restricted by the label
// selector and watch namespace when they are set), moves the FDB binaries into
// place and, if requested, starts the periodic fdbcli log file cleaner.
//
// Any setup failure terminates the process (log.Fatalf or os.Exit(1)); if
// operatorOpts.PrintVersion is set the version is printed and the process exits
// with status 0. The returned *os.File is always nil; it is only kept for
// signature compatibility.
func StartManager(
	scheme *runtime.Scheme,
	operatorOpts Options,
	logOpts zap.Options,
	clusterReconciler *controllers.FoundationDBClusterReconciler,
	backupReconciler *controllers.FoundationDBBackupReconciler,
	restoreReconciler *controllers.FoundationDBRestoreReconciler,
	logr logr.Logger,
	watchedObjects ...client.Object) (manager.Manager, *os.File) {
	if operatorOpts.PrintVersion {
		fmt.Printf("version: %s\n", operatorVersion)
		os.Exit(0)
	}

	logWriter, err := setupLogger(operatorOpts)
	if err != nil {
		log.Fatalf("unable to setup logger: %s, got error: %s\n", operatorOpts.LogFile, err.Error())
	}

	logger := zap.New(
		zap.UseFlagOptions(&logOpts),
		zap.WriteTo(logWriter))
	ctrl.SetLogger(logger)
	// Might be called by controller-runtime in the future: https://github.com/kubernetes-sigs/controller-runtime/issues/1420
	klog.SetLogger(logger)

	setupLog := logger.WithName("setup")
	fdbclient.DefaultCLITimeout = time.Duration(operatorOpts.CliTimeout) * time.Second
	fdbclient.MaxCliTimeout = time.Duration(operatorOpts.MaxCliTimeout) * time.Second

	// The selector may be passed with surrounding quotes. Trim them once so that
	// the cache selector and the reconciler selector are derived from the same value.
	rawLabelSelector := strings.Trim(operatorOpts.LabelSelector, "\"")

	// Define the cache options for the client cache used by the operator. If no label selector is defined, the
	// default cache configuration will be used. The namespace has to be set before the options are handed to
	// cache.BuilderWithOptions, because the builder receives a copy of the struct.
	cacheOptions := cache.Options{
		Namespace: operatorOpts.WatchNamespace,
	}

	// Only if a label selector is defined we have to update the cache options.
	if rawLabelSelector != "" {
		// Parse the label selector, if the label selector is not parsable terminate the operator.
		cacheLabelSelector, parseErr := labels.Parse(rawLabelSelector)
		if parseErr != nil {
			log.Fatalf("could not parse label selector: %s, got error: %s", operatorOpts.LabelSelector, parseErr)
		}

		selector := cache.ObjectSelector{
			Label: cacheLabelSelector,
		}

		// Set the label selector for all resources that the operator manages, this should reduce the resources that
		// are cached by the operator if a label selector is provided.
		cacheOptions.SelectorsByObject = map[client.Object]cache.ObjectSelector{
			&fdbv1beta2.FoundationDBCluster{}: selector,
			&corev1.Pod{}:                     selector,
			&corev1.PersistentVolumeClaim{}:   selector,
			&corev1.ConfigMap{}:               selector,
			&corev1.Service{}:                 selector,
			&appsv1.Deployment{}:              selector,
		}

		// Make sure we set the label selector for any additional watched objects.
		for _, object := range watchedObjects {
			cacheOptions.SelectorsByObject[object] = selector
		}
	}

	options := ctrl.Options{
		Scheme:             scheme,
		MetricsBindAddress: operatorOpts.MetricsAddr,
		LeaderElection:     operatorOpts.EnableLeaderElection,
		LeaderElectionID:   operatorOpts.LeaderElectionID,
		LeaseDuration:      &operatorOpts.LeaseDuration,
		RenewDeadline:      &operatorOpts.RenewDeadline,
		RetryPeriod:        &operatorOpts.RetryPeriod,
		Port:               9443,
		Namespace:          operatorOpts.WatchNamespace,
		NewCache:           cache.BuilderWithOptions(cacheOptions),
	}

	if operatorOpts.WatchNamespace != "" {
		setupLog.Info("Operator starting in single namespace mode", "namespace", options.Namespace)
	} else {
		setupLog.Info("Operator starting in Global mode")
	}

	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), options)
	if err != nil {
		setupLog.Error(err, "unable to start manager")
		os.Exit(1)
	}

	if err := moveFDBBinaries(setupLog); err != nil {
		setupLog.Error(err, "unable to move FDB binaries")
		os.Exit(1)
	}

	labelSelector, err := metav1.ParseToLabelSelector(rawLabelSelector)
	if err != nil {
		setupLog.Error(err, "unable to parse provided label selector")
		os.Exit(1)
	}

	if clusterReconciler != nil {
		clusterReconciler.Client = mgr.GetClient()
		clusterReconciler.Recorder = mgr.GetEventRecorderFor("foundationdbcluster-controller")
		clusterReconciler.DeprecationOptions = operatorOpts.DeprecationOptions
		clusterReconciler.DatabaseClientProvider = fdbclient.NewDatabaseClientProvider(logger)
		clusterReconciler.GetTimeout = operatorOpts.GetTimeout
		clusterReconciler.PostTimeout = operatorOpts.PostTimeout
		clusterReconciler.Log = logr.WithName("controllers").WithName("FoundationDBCluster")
		clusterReconciler.EnableRestartIncompatibleProcesses = operatorOpts.EnableRestartIncompatibleProcesses
		clusterReconciler.ServerSideApply = operatorOpts.ServerSideApply
		clusterReconciler.EnableRecoveryState = operatorOpts.EnableRecoveryState
		clusterReconciler.CacheDatabaseStatusForReconciliationDefault = operatorOpts.CacheDatabaseStatus
		clusterReconciler.MinimumRequiredUptimeCCBounce = operatorOpts.MinimumRequiredUptimeCCBounce
		clusterReconciler.MaintenanceListStaleDuration = operatorOpts.MaintenanceListStaleDuration
		clusterReconciler.MaintenanceListWaitDuration = operatorOpts.MaintenanceListWaitDuration
		clusterReconciler.MinimumRecoveryTimeForInclusion = operatorOpts.MinimumRecoveryTimeForInclusion
		clusterReconciler.MinimumRecoveryTimeForExclusion = operatorOpts.MinimumRecoveryTimeForExclusion
		clusterReconciler.ClusterLabelKeyForNodeTrigger = strings.Trim(operatorOpts.ClusterLabelKeyForNodeTrigger, "\"")
		clusterReconciler.Namespace = operatorOpts.WatchNamespace

		if err := clusterReconciler.SetupWithManager(mgr, operatorOpts.MaxConcurrentReconciles, *labelSelector, watchedObjects...); err != nil {
			setupLog.Error(err, "unable to create controller", "controller", "FoundationDBCluster")
			os.Exit(1)
		}

		// A metrics address of "0" disables the metrics endpoint, so no custom metrics are registered.
		if operatorOpts.MetricsAddr != "0" {
			controllers.InitCustomMetrics(clusterReconciler)
		}
	}

	if backupReconciler != nil {
		backupReconciler.Client = mgr.GetClient()
		backupReconciler.Recorder = mgr.GetEventRecorderFor("foundationdbbackup-controller")
		backupReconciler.DatabaseClientProvider = fdbclient.NewDatabaseClientProvider(logger)
		backupReconciler.Log = logr.WithName("controllers").WithName("FoundationDBBackup")
		backupReconciler.ServerSideApply = operatorOpts.ServerSideApply

		if err := backupReconciler.SetupWithManager(mgr, operatorOpts.MaxConcurrentReconciles, *labelSelector); err != nil {
			setupLog.Error(err, "unable to create controller", "controller", "FoundationDBBackup")
			os.Exit(1)
		}
	}

	if restoreReconciler != nil {
		restoreReconciler.Client = mgr.GetClient()
		restoreReconciler.Recorder = mgr.GetEventRecorderFor("foundationdbrestore-controller")
		restoreReconciler.DatabaseClientProvider = fdbclient.NewDatabaseClientProvider(logger)
		restoreReconciler.Log = logr.WithName("controllers").WithName("FoundationDBRestore")
		restoreReconciler.ServerSideApply = operatorOpts.ServerSideApply

		if err := restoreReconciler.SetupWithManager(mgr, operatorOpts.MaxConcurrentReconciles, *labelSelector); err != nil {
			setupLog.Error(err, "unable to create controller", "controller", "FoundationDBRestore")
			os.Exit(1)
		}
	}

	if operatorOpts.CleanUpOldLogFile {
		// time.NewTicker panics for a non-positive interval, so reject such a value with a clear message.
		if operatorOpts.LogFileMinAge <= 0 {
			setupLog.Error(
				fmt.Errorf("log-file-min-age must be positive, got %s", operatorOpts.LogFileMinAge),
				"unable to setup log file cleaner")
			os.Exit(1)
		}

		setupLog.V(1).Info("setup log file cleaner", "LogFileMinAge", operatorOpts.LogFileMinAge.String())
		cleaner := internal.NewCliLogFileCleaner(logger, operatorOpts.LogFileMinAge)
		ticker := time.NewTicker(operatorOpts.LogFileMinAge)
		// The cleaner runs for the whole lifetime of the operator process, so the ticker is never stopped.
		go func() {
			for range ticker.C {
				cleaner.CleanupOldCliLogs()
			}
		}()
	}

	// +kubebuilder:scaffold:builder
	setupLog.Info("setup manager")

	return mgr, nil
}
// moveFDBBinaries moves FDB binaries that are pulled from setup containers into
// the correct locations.
//
// For every directory below $FDB_BINARY_DIR whose name matches
// fdbv1beta2.VersionRegex:
//   - all entries of <dir>/bin/<dir> are moved to
//     $FDB_BINARY_DIR/<version.GetBinaryVersion()>, and
//   - <dir>/lib/libfdb_c.so is moved to
//     $FDB_NETWORK_OPTION_EXTERNAL_CLIENT_DIRECTORY/libfdb_c_<version>.so.
//
// A missing bin directory or library file is skipped. The external client
// directory is created if it does not exist yet. The first error encountered is
// returned and stops the processing.
func moveFDBBinaries(log logr.Logger) error {
	binFile, err := os.Open(os.Getenv("FDB_BINARY_DIR"))
	if err != nil {
		return err
	}
	defer binFile.Close()

	// Keep the library directory as a path instead of a file handle: the handle is
	// nil when the directory had to be created, and it is only needed to build paths.
	libDirPath := os.Getenv("FDB_NETWORK_OPTION_EXTERNAL_CLIENT_DIRECTORY")
	libDir, err := os.Open(libDirPath)
	switch {
	case err == nil:
		// The directory was only opened to verify that it is accessible.
		_ = libDir.Close()
	case os.IsNotExist(err):
		if err = os.MkdirAll(libDirPath, os.ModeDir|os.ModePerm); err != nil {
			return err
		}
	default:
		return err
	}

	binDir, err := binFile.Readdir(0)
	if err != nil {
		return err
	}

	for _, binEntry := range binDir {
		if !binEntry.IsDir() || !fdbv1beta2.VersionRegex.Match([]byte(binEntry.Name())) {
			continue
		}

		version, err := fdbv1beta2.ParseFdbVersion(binEntry.Name())
		if err != nil {
			return err
		}

		versionBinPath := path.Join(binFile.Name(), binEntry.Name(), "bin", binEntry.Name())
		minorVersionPath := path.Join(binFile.Name(), version.GetBinaryVersion())
		if err := moveFDBVersionBinaries(log, versionBinPath, minorVersionPath); err != nil {
			return err
		}

		currentPath := path.Join(binFile.Name(), binEntry.Name(), "lib", "libfdb_c.so")
		if _, err := os.Stat(currentPath); err != nil {
			if os.IsNotExist(err) {
				// No client library was shipped for this version.
				continue
			}

			return err
		}

		newPath := path.Join(libDirPath, fmt.Sprintf("libfdb_c_%s.so", version))
		log.Info("Moving FDB library file", "currentPath", currentPath, "newPath", newPath)
		if err := os.Rename(currentPath, newPath); err != nil {
			return err
		}
	}

	return nil
}

// moveFDBVersionBinaries moves every entry of sourceDir into targetDir and
// creates targetDir if required. A missing sourceDir is not an error.
func moveFDBVersionBinaries(log logr.Logger, sourceDir string, targetDir string) error {
	source, err := os.Open(sourceDir)
	if err != nil {
		if os.IsNotExist(err) {
			return nil
		}

		return err
	}
	defer source.Close()

	if err := os.MkdirAll(targetDir, os.ModeDir|os.ModePerm); err != nil {
		return err
	}

	entries, err := source.Readdir(0)
	if err != nil {
		return err
	}

	for _, entry := range entries {
		currentPath := path.Join(sourceDir, entry.Name())
		newPath := path.Join(targetDir, entry.Name())
		log.Info("Moving FDB binary file", "currentPath", currentPath, "newPath", newPath)
		if err := os.Rename(currentPath, newPath); err != nil {
			return err
		}
	}

	return nil
}
// setupLogger will return a MultiWriter if the operator should log to a file and stdout otherwise only the stdout
// io.Writer is returned. If the operator should log to a file the operator will make sure that the file exists with
// the expected permissions.
//
// The permission is taken from operatorOpts.LogFilePermission (octal notation) and defaults to 0644 when the option
// is empty. An error is returned if the permission cannot be parsed, or if the log file cannot be inspected, created
// or changed to the expected permission.
func setupLogger(operatorOpts Options) (io.Writer, error) {
	if operatorOpts.LogFile == "" {
		return os.Stdout, nil
	}

	expectedPermission := fs.FileMode(0644)
	if operatorOpts.LogFilePermission != "" {
		parsedPermission, err := strconv.ParseUint(operatorOpts.LogFilePermission, 8, 32)
		if err != nil {
			return nil, fmt.Errorf("parsing log file permission %q: %w", operatorOpts.LogFilePermission, err)
		}

		expectedPermission = fs.FileMode(parsedPermission)
	}

	// We have to create the original file by ourself since lumberjack doesn't support to pass down the expected permissions
	// see: https://github.com/natefinch/lumberjack/issues/82#issuecomment-482143273.
	stat, err := os.Stat(operatorOpts.LogFile)
	switch {
	case err == nil:
		// Only the permission bits are compared, the file type bits of the mode are irrelevant here.
		if stat.Mode().Perm() != expectedPermission.Perm() {
			if err := os.Chmod(operatorOpts.LogFile, expectedPermission); err != nil {
				return nil, err
			}
		}
	case os.IsNotExist(err):
		// File doesn't exist and must be created with the expected permission.
		if err := os.WriteFile(operatorOpts.LogFile, nil, expectedPermission); err != nil {
			return nil, err
		}

		// The mode passed to os.WriteFile is filtered by the process umask, so enforce the
		// expected permission explicitly on the newly created file.
		if err := os.Chmod(operatorOpts.LogFile, expectedPermission); err != nil {
			return nil, err
		}
	default:
		// Any other error (e.g. missing access to the parent directory) would otherwise only
		// surface later as failing log writes.
		return nil, err
	}

	lumberjackLogger := &lumberjack.Logger{
		Filename:   operatorOpts.LogFile,
		MaxSize:    operatorOpts.LogFileMaxSize,
		MaxAge:     operatorOpts.LogFileMaxAge,
		MaxBackups: operatorOpts.MaxNumberOfOldLogFiles,
		Compress:   operatorOpts.CompressOldFiles,
	}

	return io.MultiWriter(os.Stdout, lumberjackLogger), nil
}