Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix replication lag warning to be always lower than failover timeout #63

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions pglookout/pglookout.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,14 @@ def load_config(self, _signal=None, _frame=None):
self.replication_lag_failover_timeout = self.config.get("max_failover_replication_time_lag", 120.0)
self.replication_catchup_timeout = self.config.get("replication_catchup_timeout", 300.0)
self.missing_master_from_config_timeout = self.config.get("missing_master_from_config_timeout", 15.0)

if self.replication_lag_warning_boundary >= self.replication_lag_failover_timeout:
msg = "Replication lag warning boundary (%s) is not lower than its failover timeout (%s)"
self.log.warning(msg, self.replication_lag_warning_boundary, self.replication_lag_failover_timeout)
if self.replication_lag_warning_boundary > self.replication_lag_failover_timeout:
msg = "Replication lag warning boundary set to %s"
self.log.warning(msg, self.replication_lag_warning_boundary)
self.replication_lag_warning_boundary = self.replication_lag_failover_timeout
self.log.debug("Loaded config: %r from: %r", self.config, self.config_path)
self.cluster_monitor_check_queue.put("new config came, recheck")

Expand Down Expand Up @@ -376,6 +384,8 @@ def check_replication_lag(self, own_state, standby_nodes):
self.over_warning_limit_command, return_code)
else:
self.log.warning("No over_warning_limit_command set")
# force looping one more time since we just passed the warning limit
return
elif self.replication_lag_over_warning_limit:
self.replication_lag_over_warning_limit = False
self.delete_alert_file("replication_delay_warning")
Expand Down
11 changes: 10 additions & 1 deletion test/test_lookout.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,9 @@ def test_check_cluster_do_failover_one_slave(pgl):
pgl.execute_external_command.return_value = 0
pgl.replication_lag_over_warning_limit = False
pgl.check_cluster_state()
assert pgl.execute_external_command.call_count == 0
assert pgl.replication_lag_over_warning_limit is True
pgl.check_cluster_state()
assert pgl.execute_external_command.call_count == 1
assert pgl.replication_lag_over_warning_limit is False

Expand Down Expand Up @@ -177,6 +180,9 @@ def test_check_cluster_do_failover_one_slave_one_observer(pgl):
pgl.execute_external_command.return_value = 0
pgl.replication_lag_over_warning_limit = False
pgl.check_cluster_state()
assert pgl.execute_external_command.call_count == 0
assert pgl.replication_lag_over_warning_limit is True
pgl.check_cluster_state()
assert pgl.execute_external_command.call_count == 1
assert pgl.replication_lag_over_warning_limit is False

Expand Down Expand Up @@ -348,9 +354,12 @@ def test_failover_master_two_slaves_one_observer_no_connection_between_slaves(pg
pg_is_in_recovery=True, connection=True, replication_time_lag=130.0)
pgl.execute_external_command.return_value = 0
pgl.check_cluster_state()
assert pgl.execute_external_command.call_count == 0
assert pgl.replication_lag_over_warning_limit is True
pgl.check_cluster_state()
assert pgl.execute_external_command.call_count == 1

assert pgl.replication_lag_over_warning_limit is False # we keep the warning on
assert pgl.replication_lag_over_warning_limit is False


def test_failover_master_one_slave_one_observer_no_connections(pgl):
Expand Down