Browse files

Add tunables for connection retries to master and interval between

connection retries; these parameters, along with master_response_timeout,
determine the amount of time from failure to failover
  • Loading branch information...
1 parent 08ed0aa commit aaf35947ed2242932b63067f72cfcd43f8c22736 @Jaime2ndQuadrant Jaime2ndQuadrant committed Jul 21, 2012
Showing with 49 additions and 11 deletions.
  1. +34 −0 config.c
  2. +2 −0 config.h
  3. +4 −0 repmgr.conf.sample
  4. +0 −4 repmgr.h
  5. +9 −7 repmgrd.c
View
34 config.c
@@ -45,6 +45,10 @@ parse_config(const char *config_file, t_configuration_options *options)
/* if nothing has been provided defaults to 60 */
options->master_response_timeout = 60;
+ /* it defaults to 6 retries with a time between retries of 10s */
+ options->reconnect_attempts = 6;
+ options->reconnect_intvl = 10;
+
/*
* Since some commands don't require a config file at all, not
* having one isn't necessarily a problem.
@@ -103,6 +107,10 @@ parse_config(const char *config_file, t_configuration_options *options)
strncpy(options->follow_command, value, MAXLEN);
else if (strcmp(name, "master_response_timeout") == 0)
options->master_response_timeout = atoi(value);
+ else if (strcmp(name, "reconnect_attempts") == 0)
+ options->reconnect_attempts = atoi(value);
+ else if (strcmp(name, "reconnect_interval") == 0)
+ options->reconnect_intvl = atoi(value);
else
log_warning(_("%s/%s: Unknown name/value pair!\n"), name, value);
}
@@ -128,6 +136,18 @@ parse_config(const char *config_file, t_configuration_options *options)
log_err(_("Master response timeout must be greater than zero. Check the configuration file.\n"));
exit(ERR_BAD_CONFIG);
}
+
+ if (options->reconnect_attempts < 0)
+ {
+ log_err(_("Reconnect attempts must be zero or greater. Check the configuration file.\n"));
+ exit(ERR_BAD_CONFIG);
+ }
+
+ if (options->reconnect_intvl <= 0)
+ {
+ log_err(_("Reconnect intervals must be zero or greater. Check the configuration file.\n"));
+ exit(ERR_BAD_CONFIG);
+ }
}
@@ -232,6 +252,18 @@ reload_configuration(char *config_file, t_configuration_options *orig_options)
return false;
}
+ if (new_options.reconnect_attempts < 0)
+ {
+ log_warning(_("\nNew value for reconnect_attempts is not valid. Should be greater or equal than zero.\n"));
+ return false;
+ }
+
+ if (new_options.reconnect_intvl < 0)
+ {
+ log_warning(_("\nNew value for reconnect_interval is not valid. Should be greater or equal than zero.\n"));
+ return false;
+ }
+
/* Test conninfo string */
conn = establishDBConnection(new_options.conninfo, false);
if (!conn || (PQstatus(conn) != CONNECTION_OK))
@@ -252,6 +284,8 @@ reload_configuration(char *config_file, t_configuration_options *orig_options)
strcpy(orig_options->follow_command, new_options.follow_command);
strcpy(orig_options->rsync_options, new_options.rsync_options);
orig_options->master_response_timeout = new_options.master_response_timeout;
+ orig_options->reconnect_attempts = new_options.reconnect_attempts;
+ orig_options->reconnect_intvl = new_options.reconnect_intvl;
/*
* XXX These ones can change with a simple SIGHUP?
View
2 config.h
@@ -37,6 +37,8 @@ typedef struct
char logfacility[MAXLEN];
char rsync_options[QUERY_STR_LEN];
int master_response_timeout;
+ int reconnect_attempts;
+ int reconnect_intvl;
} t_configuration_options;
void parse_config(const char *config_file, t_configuration_options *options);
View
4 repmgr.conf.sample
@@ -16,6 +16,10 @@ rsync_options=--archive --checksum --compress --progress --rsh=ssh
# How many seconds we wait for master response before declaring master failure
master_response_timeout=60
+# How many times we try to reconnect to master before starting the failover procedure
+reconnect_attempts=6
+reconnect_interval=10
+
# Autofailover options
failover=automatic
priority=-1
View
4 repmgr.h
@@ -69,9 +69,5 @@ typedef struct
} t_runtime_options;
#define SLEEP_MONITOR 2
-#define SLEEP_RETRY 3
-#define NUM_RETRY 40
-
-
#endif
View
16 repmgrd.c
@@ -345,7 +345,7 @@ WitnessMonitor(void)
* Check if the master is still available, if after 5 minutes of retries
* we cannot reconnect, return false.
*/
- CheckPrimaryConnection(); // this take up to NUM_RETRY * SLEEP_RETRY seconds
+ CheckPrimaryConnection(); // this take up to local_options.reconnect_attempts * local_options.reconnect_intvl seconds
if (PQstatus(primaryConn) != CONNECTION_OK)
{
@@ -429,7 +429,7 @@ StandbyMonitor(void)
* Check if the master is still available, if after 5 minutes of retries
* we cannot reconnect, try to get a new master.
*/
- CheckPrimaryConnection(); // this take up to NUM_RETRY * SLEEP_RETRY seconds
+ CheckPrimaryConnection(); // this take up to local_options.reconnect_attempts * local_options.reconnect_intvl seconds
if (PQstatus(primaryConn) != CONNECTION_OK)
{
@@ -762,17 +762,19 @@ CheckPrimaryConnection(void)
/*
* Check if the master is still available
- * if after NUM_RETRY * SLEEP_RETRY seconds of retries
+ * if after local_options.reconnect_attempts * local_options.reconnect_intvl seconds of retries
* we cannot reconnect
* return false
*/
- for (connection_retries = 0; connection_retries < NUM_RETRY; connection_retries++)
+ for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
{
if (!is_pgup(primaryConn, local_options.master_response_timeout))
{
- log_warning(_("%s: Connection to master has been lost, trying to recover... %i seconds before failover decision\n"), progname, (SLEEP_RETRY*(NUM_RETRY-connection_retries)));
- /* wait SLEEP_RETRY seconds between retries */
- sleep(SLEEP_RETRY);
+ log_warning(_("%s: Connection to master has been lost, trying to recover... %i seconds before failover decision\n"),
+ progname,
+ (local_options.reconnect_intvl * (local_options.reconnect_attempts - connection_retries)));
+ /* wait local_options.reconnect_intvl seconds between retries */
+ sleep(local_options.reconnect_intvl);
}
else
{

0 comments on commit aaf3594

Please sign in to comment.