Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Add tunables for connection retries to master and interval between

connection retries; these parameters, along with master_response_timeout,
determine the amount of time from failure to failover
  • Loading branch information...
commit aaf35947ed2242932b63067f72cfcd43f8c22736 1 parent 08ed0aa
Jaime2ndQuadrant Jaime2ndQuadrant authored
34 config.c
View
@@ -45,6 +45,10 @@ parse_config(const char *config_file, t_configuration_options *options)
/* if nothing has been provided defaults to 60 */
options->master_response_timeout = 60;
+ /* it defaults to 6 retries with a time between retries of 10s */
+ options->reconnect_attempts = 6;
+ options->reconnect_intvl = 10;
+
/*
* Since some commands don't require a config file at all, not
* having one isn't necessarily a problem.
@@ -103,6 +107,10 @@ parse_config(const char *config_file, t_configuration_options *options)
strncpy(options->follow_command, value, MAXLEN);
else if (strcmp(name, "master_response_timeout") == 0)
options->master_response_timeout = atoi(value);
+ else if (strcmp(name, "reconnect_attempts") == 0)
+ options->reconnect_attempts = atoi(value);
+ else if (strcmp(name, "reconnect_interval") == 0)
+ options->reconnect_intvl = atoi(value);
else
log_warning(_("%s/%s: Unknown name/value pair!\n"), name, value);
}
@@ -128,6 +136,18 @@ parse_config(const char *config_file, t_configuration_options *options)
log_err(_("Master response timeout must be greater than zero. Check the configuration file.\n"));
exit(ERR_BAD_CONFIG);
}
+
+ if (options->reconnect_attempts < 0)
+ {
+ log_err(_("Reconnect attempts must be zero or greater. Check the configuration file.\n"));
+ exit(ERR_BAD_CONFIG);
+ }
+
+ if (options->reconnect_intvl <= 0)
+ {
+ log_err(_("Reconnect intervals must be zero or greater. Check the configuration file.\n"));
+ exit(ERR_BAD_CONFIG);
+ }
}
@@ -232,6 +252,18 @@ reload_configuration(char *config_file, t_configuration_options *orig_options)
return false;
}
+ if (new_options.reconnect_attempts < 0)
+ {
+ log_warning(_("\nNew value for reconnect_attempts is not valid. Should be greater or equal than zero.\n"));
+ return false;
+ }
+
+ if (new_options.reconnect_intvl < 0)
+ {
+ log_warning(_("\nNew value for reconnect_interval is not valid. Should be greater or equal than zero.\n"));
+ return false;
+ }
+
/* Test conninfo string */
conn = establishDBConnection(new_options.conninfo, false);
if (!conn || (PQstatus(conn) != CONNECTION_OK))
@@ -252,6 +284,8 @@ reload_configuration(char *config_file, t_configuration_options *orig_options)
strcpy(orig_options->follow_command, new_options.follow_command);
strcpy(orig_options->rsync_options, new_options.rsync_options);
orig_options->master_response_timeout = new_options.master_response_timeout;
+ orig_options->reconnect_attempts = new_options.reconnect_attempts;
+ orig_options->reconnect_intvl = new_options.reconnect_intvl;
/*
* XXX These ones can change with a simple SIGHUP?
2  config.h
View
@@ -37,6 +37,8 @@ typedef struct
char logfacility[MAXLEN];
char rsync_options[QUERY_STR_LEN];
int master_response_timeout;
+ int reconnect_attempts;
+ int reconnect_intvl;
} t_configuration_options;
void parse_config(const char *config_file, t_configuration_options *options);
4 repmgr.conf.sample
View
@@ -16,6 +16,10 @@ rsync_options=--archive --checksum --compress --progress --rsh=ssh
# How many seconds we wait for master response before declaring master failure
master_response_timeout=60
+# How many times we try to reconnect to master before starting the failover procedure
+reconnect_attempts=6
+reconnect_interval=10
+
# Autofailover options
failover=automatic
priority=-1
4 repmgr.h
View
@@ -69,9 +69,5 @@ typedef struct
} t_runtime_options;
#define SLEEP_MONITOR 2
-#define SLEEP_RETRY 3
-#define NUM_RETRY 40
-
-
#endif
16 repmgrd.c
View
@@ -345,7 +345,7 @@ WitnessMonitor(void)
* Check if the master is still available, if after 5 minutes of retries
* we cannot reconnect, return false.
*/
- CheckPrimaryConnection(); // this take up to NUM_RETRY * SLEEP_RETRY seconds
+ CheckPrimaryConnection(); // this takes up to local_options.reconnect_attempts * local_options.reconnect_intvl seconds
if (PQstatus(primaryConn) != CONNECTION_OK)
{
@@ -429,7 +429,7 @@ StandbyMonitor(void)
* Check if the master is still available, if after 5 minutes of retries
* we cannot reconnect, try to get a new master.
*/
- CheckPrimaryConnection(); // this take up to NUM_RETRY * SLEEP_RETRY seconds
+ CheckPrimaryConnection(); // this takes up to local_options.reconnect_attempts * local_options.reconnect_intvl seconds
if (PQstatus(primaryConn) != CONNECTION_OK)
{
@@ -762,17 +762,19 @@ CheckPrimaryConnection(void)
/*
* Check if the master is still available
- * if after NUM_RETRY * SLEEP_RETRY seconds of retries
+ * if after local_options.reconnect_attempts * local_options.reconnect_intvl seconds of retries
* we cannot reconnect
* return false
*/
- for (connection_retries = 0; connection_retries < NUM_RETRY; connection_retries++)
+ for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
{
if (!is_pgup(primaryConn, local_options.master_response_timeout))
{
- log_warning(_("%s: Connection to master has been lost, trying to recover... %i seconds before failover decision\n"), progname, (SLEEP_RETRY*(NUM_RETRY-connection_retries)));
- /* wait SLEEP_RETRY seconds between retries */
- sleep(SLEEP_RETRY);
+ log_warning(_("%s: Connection to master has been lost, trying to recover... %i seconds before failover decision\n"),
+ progname,
+ (local_options.reconnect_intvl * (local_options.reconnect_attempts - connection_retries)));
+ /* wait local_options.reconnect_intvl seconds between retries */
+ sleep(local_options.reconnect_intvl);
}
else
{
Please sign in to comment.
Something went wrong with that request. Please try again.