Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Add tunables for connection retries to master and interval between

connection retries. These parameters, along with master_response_timeout,
determine the amount of time from failure to failover
  • Loading branch information...
commit aaf35947ed2242932b63067f72cfcd43f8c22736 1 parent 08ed0aa
Jaime2ndQuadrant Jaime2ndQuadrant authored
34 config.c
@@ -45,6 +45,10 @@ parse_config(const char *config_file, t_configuration_options *options)
45 45 /* if nothing has been provided defaults to 60 */
46 46 options->master_response_timeout = 60;
47 47
  48 + /* it defaults to 6 retries with a time between retries of 10s */
  49 + options->reconnect_attempts = 6;
  50 + options->reconnect_intvl = 10;
  51 +
48 52 /*
49 53 * Since some commands don't require a config file at all, not
50 54 * having one isn't necessarily a problem.
@@ -103,6 +107,10 @@ parse_config(const char *config_file, t_configuration_options *options)
103 107 strncpy(options->follow_command, value, MAXLEN);
104 108 else if (strcmp(name, "master_response_timeout") == 0)
105 109 options->master_response_timeout = atoi(value);
  110 + else if (strcmp(name, "reconnect_attempts") == 0)
  111 + options->reconnect_attempts = atoi(value);
  112 + else if (strcmp(name, "reconnect_interval") == 0)
  113 + options->reconnect_intvl = atoi(value);
106 114 else
107 115 log_warning(_("%s/%s: Unknown name/value pair!\n"), name, value);
108 116 }
@@ -128,6 +136,18 @@ parse_config(const char *config_file, t_configuration_options *options)
128 136 log_err(_("Master response timeout must be greater than zero. Check the configuration file.\n"));
129 137 exit(ERR_BAD_CONFIG);
130 138 }
  139 +
  140 + if (options->reconnect_attempts < 0)
  141 + {
  142 + log_err(_("Reconnect attempts must be zero or greater. Check the configuration file.\n"));
  143 + exit(ERR_BAD_CONFIG);
  144 + }
  145 +
  146 + if (options->reconnect_intvl <= 0)
  147 + {
  148 + log_err(_("Reconnect intervals must be zero or greater. Check the configuration file.\n"));
  149 + exit(ERR_BAD_CONFIG);
  150 + }
131 151 }
132 152
133 153
@@ -232,6 +252,18 @@ reload_configuration(char *config_file, t_configuration_options *orig_options)
232 252 return false;
233 253 }
234 254
  255 + if (new_options.reconnect_attempts < 0)
  256 + {
  257 + log_warning(_("\nNew value for reconnect_attempts is not valid. Should be greater or equal than zero.\n"));
  258 + return false;
  259 + }
  260 +
  261 + if (new_options.reconnect_intvl < 0)
  262 + {
  263 + log_warning(_("\nNew value for reconnect_interval is not valid. Should be greater or equal than zero.\n"));
  264 + return false;
  265 + }
  266 +
235 267 /* Test conninfo string */
236 268 conn = establishDBConnection(new_options.conninfo, false);
237 269 if (!conn || (PQstatus(conn) != CONNECTION_OK))
@@ -252,6 +284,8 @@ reload_configuration(char *config_file, t_configuration_options *orig_options)
252 284 strcpy(orig_options->follow_command, new_options.follow_command);
253 285 strcpy(orig_options->rsync_options, new_options.rsync_options);
254 286 orig_options->master_response_timeout = new_options.master_response_timeout;
  287 + orig_options->reconnect_attempts = new_options.reconnect_attempts;
  288 + orig_options->reconnect_intvl = new_options.reconnect_intvl;
255 289 /*
256 290 * XXX These ones can change with a simple SIGHUP?
257 291
2  config.h
@@ -37,6 +37,8 @@ typedef struct
37 37 char logfacility[MAXLEN];
38 38 char rsync_options[QUERY_STR_LEN];
39 39 int master_response_timeout;
  40 + int reconnect_attempts;
  41 + int reconnect_intvl;
40 42 } t_configuration_options;
41 43
42 44 void parse_config(const char *config_file, t_configuration_options *options);
4 repmgr.conf.sample
@@ -16,6 +16,10 @@ rsync_options=--archive --checksum --compress --progress --rsh=ssh
16 16 # How many seconds we wait for master response before declaring master failure
17 17 master_response_timeout=60
18 18
  19 +# How many times we try to reconnect to master, and how many seconds we wait between attempts, before starting failover procedure
  20 +reconnect_attempts=6
  21 +reconnect_interval=10
  22 +
19 23 # Autofailover options
20 24 failover=automatic
21 25 priority=-1
4 repmgr.h
@@ -69,9 +69,5 @@ typedef struct
69 69 } t_runtime_options;
70 70
71 71 #define SLEEP_MONITOR 2
72   -#define SLEEP_RETRY 3
73   -#define NUM_RETRY 40
74   -
75   -
76 72
77 73 #endif
16 repmgrd.c
@@ -345,7 +345,7 @@ WitnessMonitor(void)
345 345 * Check if the master is still available, if after 5 minutes of retries
346 346 * we cannot reconnect, return false.
347 347 */
348   - CheckPrimaryConnection(); // this take up to NUM_RETRY * SLEEP_RETRY seconds
  348 + CheckPrimaryConnection(); // this take up to local_options.reconnect_attempts * local_options.reconnect_intvl seconds
349 349
350 350 if (PQstatus(primaryConn) != CONNECTION_OK)
351 351 {
@@ -429,7 +429,7 @@ StandbyMonitor(void)
429 429 * Check if the master is still available, if after 5 minutes of retries
430 430 * we cannot reconnect, try to get a new master.
431 431 */
432   - CheckPrimaryConnection(); // this take up to NUM_RETRY * SLEEP_RETRY seconds
  432 + CheckPrimaryConnection(); // this take up to local_options.reconnect_attempts * local_options.reconnect_intvl seconds
433 433
434 434 if (PQstatus(primaryConn) != CONNECTION_OK)
435 435 {
@@ -762,17 +762,19 @@ CheckPrimaryConnection(void)
762 762
763 763 /*
764 764 * Check if the master is still available
765   - * if after NUM_RETRY * SLEEP_RETRY seconds of retries
  765 + * if after local_options.reconnect_attempts * local_options.reconnect_intvl seconds of retries
766 766 * we cannot reconnect
767 767 * return false
768 768 */
769   - for (connection_retries = 0; connection_retries < NUM_RETRY; connection_retries++)
  769 + for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
770 770 {
771 771 if (!is_pgup(primaryConn, local_options.master_response_timeout))
772 772 {
773   - log_warning(_("%s: Connection to master has been lost, trying to recover... %i seconds before failover decision\n"), progname, (SLEEP_RETRY*(NUM_RETRY-connection_retries)));
774   - /* wait SLEEP_RETRY seconds between retries */
775   - sleep(SLEEP_RETRY);
  773 + log_warning(_("%s: Connection to master has been lost, trying to recover... %i seconds before failover decision\n"),
  774 + progname,
  775 + (local_options.reconnect_intvl * (local_options.reconnect_attempts - connection_retries)));
  776 + /* wait local_options.reconnect_intvl seconds between retries */
  777 + sleep(local_options.reconnect_intvl);
776 778 }
777 779 else
778 780 {

0 comments on commit aaf3594

Please sign in to comment.
Something went wrong with that request. Please try again.