Skip to content

Commit

Permalink
Merge pull request #123 from Aiven-Open/rdunklau/use_that_new_state
Browse files Browse the repository at this point in the history
Wait for a recheck after config reload and detection of primary not being available
  • Loading branch information
alexole committed Apr 11, 2024
2 parents 8210a4d + 9ffc8c6 commit a0252a8
Showing 1 changed file with 27 additions and 17 deletions.
44 changes: 27 additions & 17 deletions pglookout/pglookout.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,11 +494,17 @@ def consider_failover(self, own_state, master_node, standby_nodes):
self.log.warning("Not considering failover, because it's not enabled by configuration")
elif self.current_master:
self.cluster_monitor_check_queue.put("Master is missing, ask for immediate state check")
# Refresh the standby nodes list, and check that we still don't have a master node
self.failover_decision_queue.get(timeout=self.missing_master_from_config_timeout)
cluster_state = copy.deepcopy(self.cluster_state)
observer_state = copy.deepcopy(self.observer_state)
_, master_node, standby_nodes = self.create_node_map(cluster_state, observer_state)
# We seem to have a master node after all
if master_node and master_node.get("connection"):
return
master_known_to_be_gone = self.current_master in self.known_gone_nodes
now = time.monotonic()
config_timeout_exceeded = (now - self.cluster_nodes_change_time) >= self.missing_master_from_config_timeout

if master_known_to_be_gone or config_timeout_exceeded:
# we've seen a master at some point in time, but now it's
# not reachable or removed from configuration, perform an
Expand Down Expand Up @@ -860,29 +866,33 @@ def _get_check_interval(self) -> float:

def main_loop(self):
while self.running:
new_config = False
if self.config_reload_pending:
self.config_reload_pending = False
try:
self.load_config()
new_config = True
except:
self.config_reload_pending = True
raise
try:
self._apply_latest_config_version()
except Exception as ex: # pylint: disable=broad-except
self.log.exception("Failed to update configuration")
self.stats.unexpected_exception(ex, where="main_loop_writer_cluster_state")
try:
self.check_cluster_state()
self._check_cluster_monitor_thread_health(now=time.monotonic())
except Exception as ex: # pylint: disable=broad-except
self.log.exception("Failed to check cluster state")
self.stats.unexpected_exception(ex, where="main_loop_check_cluster_state")
try:
self.write_cluster_state_to_json_file()
except Exception as ex: # pylint: disable=broad-except
self.log.exception("Failed to write cluster state")
self.stats.unexpected_exception(ex, where="main_loop_writer_cluster_state")
# If we have a new config, wait for the requested check to be completed before we try anything else.
if not new_config:
try:
self._apply_latest_config_version()
except Exception as ex: # pylint: disable=broad-except
self.log.exception("Failed to update configuration")
self.stats.unexpected_exception(ex, where="main_loop_writer_cluster_state")
try:
self.check_cluster_state()
self._check_cluster_monitor_thread_health(now=time.monotonic())
except Exception as ex: # pylint: disable=broad-except
self.log.exception("Failed to check cluster state")
self.stats.unexpected_exception(ex, where="main_loop_check_cluster_state")
try:
self.write_cluster_state_to_json_file()
except Exception as ex: # pylint: disable=broad-except
self.log.exception("Failed to write cluster state")
self.stats.unexpected_exception(ex, where="main_loop_writer_cluster_state")
try:
self.failover_decision_queue.get(timeout=self._get_check_interval())
q = self.failover_decision_queue
Expand Down

0 comments on commit a0252a8

Please sign in to comment.