diff --git a/e2e/test_operator_plugin/operator_plugin_test.go b/e2e/test_operator_plugin/operator_plugin_test.go index 3363065b9..71f37c6eb 100644 --- a/e2e/test_operator_plugin/operator_plugin_test.go +++ b/e2e/test_operator_plugin/operator_plugin_test.go @@ -28,6 +28,7 @@ import ( "context" "fmt" "log" + "strings" "time" fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/v2/api/v1beta2" @@ -249,6 +250,11 @@ var _ = Describe("Operator Plugin", Label("e2e", "pr"), func() { false, ) log.Println("stdout:", stdout, "stderr:", stderr) + if strings.Contains(stderr, "Error determining public address") { + Skip( + "plugin was not able to determine public address, this means that all coordinators are probably gone", + ) + } Expect(err).NotTo(HaveOccurred()) // Ensure the cluster is available again. diff --git a/kubectl-fdb/cmd/recover_multi_region_cluster.go b/kubectl-fdb/cmd/recover_multi_region_cluster.go index 9c4a60c7e..a15380c8a 100644 --- a/kubectl-fdb/cmd/recover_multi_region_cluster.go +++ b/kubectl-fdb/cmd/recover_multi_region_cluster.go @@ -732,6 +732,30 @@ func checkIfClusterIsUnavailableAndMajorityOfCoordinatorsAreUnreachable( } log.Println("Getting the status from:", clientPod.Name) + for retry := 0; retry < 5; retry++ { + err = getStatusAndCheckIfClusterShouldBeRecovered(ctx, kubeClient, config, clientPod) + if err == nil { + break + } + + time.Sleep(5 * time.Second) + } + + // If DNS is used for the cluster file, we could hit cases where no DNS entry can be resolved, in this case we could + // assume that the cluster is also down. The error from the client side is the following: + // Error: error getting status: Error determining public address. + // ERROR: Unable to bind to network (1512) + if err != nil && strings.Contains(err.Error(), "Error determining public address") { + return err + } + + return err +} + +func getStatusAndCheckIfClusterShouldBeRecovered(ctx context.Context, + kubeClient client.Client, + config *rest.Config, + clientPod *corev1.Pod) error { status, err := getStatus(ctx, kubeClient, config, clientPod) if err != nil { return err