-
Notifications
You must be signed in to change notification settings - Fork 81
/
replace_failed_process_groups.go
81 lines (69 loc) · 3.13 KB
/
replace_failed_process_groups.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
/*
* replace_failed_process_groups.go
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2021 Apple Inc. and the FoundationDB project authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package controllers
import (
"context"
"time"
"github.com/FoundationDB/fdb-kubernetes-operator/pkg/fdbstatus"
"github.com/go-logr/logr"
fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/api/v1beta2"
"github.com/FoundationDB/fdb-kubernetes-operator/internal/replacements"
)
// replaceFailedProcessGroups identifies process groups that have failed and need to be
// replaced. The type carries no state; its behaviour lives in the reconcile method.
type replaceFailedProcessGroups struct{}
// reconcile hands the cluster and its status to replacements.ReplaceFailedProcessGroups
// and translates the outcome into a requeue decision.
//
// The result is:
//   - nil when EmptyMonitorConf is set, or when nothing was replaced and no further
//     failed process groups were reported;
//   - a delayed requeue carrying the error when the admin client or the status cannot
//     be obtained;
//   - a requeue (with the error, if any) after a replacement triggered updateOrApply;
//   - a requeue delayed by five minutes when more failed process groups remain.
func (c replaceFailedProcessGroups) reconcile(ctx context.Context, r *FoundationDBClusterReconciler, cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus, logger logr.Logger) *requeue {
	// With EmptyMonitorConf set, all fdb processes in this part of the cluster are expected
	// to be missing. Bail out so the operator does not replace any process groups.
	if cluster.Spec.Buggify.EmptyMonitorConf {
		logger.V(1).Info("Skipping because EmptyMonitorConf is set to true")
		return nil
	}

	// The caller did not hand in a cached status, so it has to be fetched here.
	if status == nil {
		adminClient, clientErr := r.DatabaseClientProvider.GetAdminClient(cluster, r)
		if clientErr != nil {
			return &requeue{curError: clientErr, delayedRequeue: true}
		}
		// Keep the client open until reconcile returns.
		defer adminClient.Close()

		fetchedStatus, statusErr := adminClient.GetStatus()
		if statusErr != nil {
			return &requeue{curError: statusErr, delayedRequeue: true}
		}
		status = fetchedStatus
	}

	// The fault tolerance result is passed on so that process groups without an address
	// are only replaced if the cluster has the desired fault tolerance and is available.
	faultTolerant := fdbstatus.HasDesiredFaultToleranceFromStatus(logger, status, cluster)
	replaced, moreFailed := replacements.ReplaceFailedProcessGroups(logger, cluster, status, faultTolerant)

	switch {
	case replaced:
		// At least one process group was replaced: persist the change and requeue.
		if updateErr := r.updateOrApply(ctx, cluster); updateErr != nil {
			return &requeue{curError: updateErr}
		}
		return &requeue{message: "Removals have been updated in the cluster status"}
	case moreFailed:
		// Failed process groups remain that were not automatically replaced yet. Requeue
		// so the controller takes another attempt at the faulty process group(s).
		return &requeue{message: "More failed process groups are detected", delayedRequeue: true, delay: 5 * time.Minute}
	default:
		return nil
	}
}