Skip to content

Commit

Permalink
Merge pull request #63 from sjamgade/warning-should-be-lower-than-tim…
Browse files Browse the repository at this point in the history
…eout

fix replication lag warning to be always lower than failover timeout

#63
  • Loading branch information
rikonen authored Feb 24, 2021
2 parents bd37c79 + 3422690 commit 181b0d3
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 1 deletion.
10 changes: 10 additions & 0 deletions pglookout/pglookout.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,14 @@ def load_config(self, _signal=None, _frame=None):
self.replication_lag_failover_timeout = self.config.get("max_failover_replication_time_lag", 120.0)
self.replication_catchup_timeout = self.config.get("replication_catchup_timeout", 300.0)
self.missing_master_from_config_timeout = self.config.get("missing_master_from_config_timeout", 15.0)

if self.replication_lag_warning_boundary >= self.replication_lag_failover_timeout:
msg = "Replication lag warning boundary (%s) is not lower than its failover timeout (%s)"
self.log.warning(msg, self.replication_lag_warning_boundary, self.replication_lag_failover_timeout)
if self.replication_lag_warning_boundary > self.replication_lag_failover_timeout:
msg = "Replication lag warning boundary set to %s"
self.log.warning(msg, self.replication_lag_warning_boundary)
self.replication_lag_warning_boundary = self.replication_lag_failover_timeout
self.log.debug("Loaded config: %r from: %r", self.config, self.config_path)
self.cluster_monitor_check_queue.put("new config came, recheck")

Expand Down Expand Up @@ -376,6 +384,8 @@ def check_replication_lag(self, own_state, standby_nodes):
self.over_warning_limit_command, return_code)
else:
self.log.warning("No over_warning_limit_command set")
# force looping one more time since we just passed the warning limit
return
elif self.replication_lag_over_warning_limit:
self.replication_lag_over_warning_limit = False
self.delete_alert_file("replication_delay_warning")
Expand Down
11 changes: 10 additions & 1 deletion test/test_lookout.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,9 @@ def test_check_cluster_do_failover_one_slave(pgl):
pgl.execute_external_command.return_value = 0
pgl.replication_lag_over_warning_limit = False
pgl.check_cluster_state()
assert pgl.execute_external_command.call_count == 0
assert pgl.replication_lag_over_warning_limit is True
pgl.check_cluster_state()
assert pgl.execute_external_command.call_count == 1
assert pgl.replication_lag_over_warning_limit is False

Expand Down Expand Up @@ -177,6 +180,9 @@ def test_check_cluster_do_failover_one_slave_one_observer(pgl):
pgl.execute_external_command.return_value = 0
pgl.replication_lag_over_warning_limit = False
pgl.check_cluster_state()
assert pgl.execute_external_command.call_count == 0
assert pgl.replication_lag_over_warning_limit is True
pgl.check_cluster_state()
assert pgl.execute_external_command.call_count == 1
assert pgl.replication_lag_over_warning_limit is False

Expand Down Expand Up @@ -348,9 +354,12 @@ def test_failover_master_two_slaves_one_observer_no_connection_between_slaves(pg
pg_is_in_recovery=True, connection=True, replication_time_lag=130.0)
pgl.execute_external_command.return_value = 0
pgl.check_cluster_state()
assert pgl.execute_external_command.call_count == 0
assert pgl.replication_lag_over_warning_limit is True
pgl.check_cluster_state()
assert pgl.execute_external_command.call_count == 1

assert pgl.replication_lag_over_warning_limit is False # we keep the warning on
assert pgl.replication_lag_over_warning_limit is False


def test_failover_master_one_slave_one_observer_no_connections(pgl):
Expand Down

0 comments on commit 181b0d3

Please sign in to comment.