From bfb4c67002928843ab05b14ed49a3482224be53c Mon Sep 17 00:00:00 2001 From: "Johannes M. Scheuermann" Date: Tue, 4 Nov 2025 10:39:48 +0100 Subject: [PATCH 1/3] If the cluster has no entries in the cluster file that can be resolved, assume in the multi-region recover command that the cluster should be recovered --- .../cmd/recover_multi_region_cluster.go | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/kubectl-fdb/cmd/recover_multi_region_cluster.go b/kubectl-fdb/cmd/recover_multi_region_cluster.go index 9c4a60c7e..5963aa187 100644 --- a/kubectl-fdb/cmd/recover_multi_region_cluster.go +++ b/kubectl-fdb/cmd/recover_multi_region_cluster.go @@ -732,6 +732,30 @@ func checkIfClusterIsUnavailableAndMajorityOfCoordinatorsAreUnreachable( } log.Println("Getting the status from:", clientPod.Name) + for retry := 0; retry < 5; retry++ { + err = getStatusAndCheckIfClusterShouldBeRecovered(ctx, kubeClient, config, clientPod) + if err == nil { + break + } + + time.Sleep(5 * time.Second) + } + + // If DNS is used for the cluster file, we could hit cases where no DNS entry can be resolved, in this case we could + // assume that the cluster is also down. The error from the client side is the following: + // Error: error getting status: Error determining public address. + // ERROR: Unable to bind to network (1512) + if err != nil && strings.Contains(err.Error(), "Error determining public address") { + return nil + } + + return err +} + +func getStatusAndCheckIfClusterShouldBeRecovered(ctx context.Context, + kubeClient client.Client, + config *rest.Config, + clientPod *corev1.Pod) error { status, err := getStatus(ctx, kubeClient, config, clientPod) if err != nil { return err From 42480b39018cda0ab8a5005a9435ea2179dda0cc Mon Sep 17 00:00:00 2001 From: "Johannes M. Scheuermann" Date: Tue, 4 Nov 2025 10:44:17 +0100 Subject: [PATCH 2/3] Skip test case if no entries can be resolved --- e2e/test_operator_plugin/operator_plugin_test.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/e2e/test_operator_plugin/operator_plugin_test.go b/e2e/test_operator_plugin/operator_plugin_test.go index 3363065b9..30a57d88f 100644 --- a/e2e/test_operator_plugin/operator_plugin_test.go +++ b/e2e/test_operator_plugin/operator_plugin_test.go @@ -28,6 +28,7 @@ import ( "context" "fmt" "log" + "strings" "time" fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/v2/api/v1beta2" @@ -198,6 +199,11 @@ var _ = Describe("Operator Plugin", Label("e2e", "pr"), func() { false, ) log.Println("stdout:", stdout, "stderr:", stderr) + if strings.Contains(stderr, "Error determining public address") { + Skip( + "plugin was not able to determine public address, this means that all coordinators are probably gone", + ) + } Expect(err).NotTo(HaveOccurred()) // Ensure the cluster is available again. From f030788aed55e35cd6fd2ca3001cb05282bebd51 Mon Sep 17 00:00:00 2001 From: "Johannes M. Scheuermann" Date: Thu, 6 Nov 2025 13:22:12 +0100 Subject: [PATCH 3/3] Skip correct e2e test case --- e2e/test_operator_plugin/operator_plugin_test.go | 10 +++++----- kubectl-fdb/cmd/recover_multi_region_cluster.go | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/e2e/test_operator_plugin/operator_plugin_test.go b/e2e/test_operator_plugin/operator_plugin_test.go index 30a57d88f..71f37c6eb 100644 --- a/e2e/test_operator_plugin/operator_plugin_test.go +++ b/e2e/test_operator_plugin/operator_plugin_test.go @@ -199,11 +199,6 @@ var _ = Describe("Operator Plugin", Label("e2e", "pr"), func() { false, ) log.Println("stdout:", stdout, "stderr:", stderr) - if strings.Contains(stderr, "Error determining public address") { - Skip( - "plugin was not able to determine public address, this means that all coordinators are probably gone", - ) - } Expect(err).NotTo(HaveOccurred()) // Ensure the cluster is available again. @@ -255,6 +250,11 @@ var _ = Describe("Operator Plugin", Label("e2e", "pr"), func() { false, ) log.Println("stdout:", stdout, "stderr:", stderr) + if strings.Contains(stderr, "Error determining public address") { + Skip( + "plugin was not able to determine public address, this means that all coordinators are probably gone", + ) + } Expect(err).NotTo(HaveOccurred()) // Ensure the cluster is available again. diff --git a/kubectl-fdb/cmd/recover_multi_region_cluster.go b/kubectl-fdb/cmd/recover_multi_region_cluster.go index 5963aa187..a15380c8a 100644 --- a/kubectl-fdb/cmd/recover_multi_region_cluster.go +++ b/kubectl-fdb/cmd/recover_multi_region_cluster.go @@ -746,7 +746,7 @@ func checkIfClusterIsUnavailableAndMajorityOfCoordinatorsAreUnreachable( // Error: error getting status: Error determining public address. // ERROR: Unable to bind to network (1512) if err != nil && strings.Contains(err.Error(), "Error determining public address") { - return nil + return err } return err