Skip to content

commits following field experience during the last few months #166

Open
wants to merge 26 commits into from
Select commit
+119 −41
View
160 heartbeat/mysql
@@ -45,6 +45,7 @@
# OCF_RESKEY_max_slave_lag
# OCF_RESKEY_evict_outdated_slaves
# OCF_RESKEY_reader_attribute
+# OCF_RESKEY_reader_failcount
#######################################################################
# Initialization:
@@ -85,6 +86,8 @@ OCF_RESKEY_replication_port_default="3306"
OCF_RESKEY_max_slave_lag_default="3600"
OCF_RESKEY_evict_outdated_slaves_default="false"
OCF_RESKEY_reader_attribute_default="readable"
+OCF_RESKEY_reader_failcount_default="1"
+
: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}
MYSQL_BINDIR=`dirname ${OCF_RESKEY_binary}`
@@ -116,12 +119,15 @@ MYSQL_BINDIR=`dirname ${OCF_RESKEY_binary}`
: ${OCF_RESKEY_evict_outdated_slaves=${OCF_RESKEY_evict_outdated_slaves_default}}
: ${OCF_RESKEY_reader_attribute=${OCF_RESKEY_reader_attribute_default}}
+: ${OCF_RESKEY_reader_failcount=${OCF_RESKEY_reader_failcount_default}}
+
#######################################################################
# Convenience variables
MYSQL=$OCF_RESKEY_client_binary
-MYSQL_OPTIONS_LOCAL="-S $OCF_RESKEY_socket --connect_timeout=10"
+
+MYSQL_OPTIONS_LOCAL="-A -S $OCF_RESKEY_socket --connect_timeout=10"
MYSQL_OPTIONS_REPL="$MYSQL_OPTIONS_LOCAL --user=$OCF_RESKEY_replication_user --password=$OCF_RESKEY_replication_passwd"
MYSQL_OPTIONS_TEST="$MYSQL_OPTIONS_LOCAL --user=$OCF_RESKEY_test_user --password=$OCF_RESKEY_test_passwd"
MYSQL_TOO_MANY_CONN_ERR=1040
@@ -131,6 +137,7 @@ HOSTNAME=`uname -n`
CRM_ATTR="${HA_SBIN_DIR}/crm_attribute -N $HOSTNAME "
INSTANCE_ATTR_NAME=`echo ${OCF_RESOURCE_INSTANCE}| awk -F : '{print $1}'`
CRM_ATTR_REPL_INFO="${HA_SBIN_DIR}/crm_attribute --type crm_config --name ${INSTANCE_ATTR_NAME}_REPL_INFO -s mysql_replication"
+CRM="${HA_SBIN_DIR}/crm"
#######################################################################
@@ -358,6 +365,15 @@ This parameter is only meaningful in master/slave set configurations.
whether a node is usable for clients to read from.</shortdesc>
<content type="string" default="${OCF_RESKEY_reader_attribute_default}" />
</parameter>
+<parameter name="reader_failcount" unique="1" required="0">
+<longdesc lang="en">
+The number of times a monitor operation can find the slave
+to be unsuitable for reader VIP before failing. Useful if
+there are short intermittent issues like clock adjustments in VMs.
+</longdesc>
+<shortdesc lang="en">Allowed failcount for reader</shortdesc>
+<content type="integer" default="${OCF_RESKEY_reader_failcount_default}" />
+</parameter>
</parameters>
<actions>
@@ -448,8 +464,8 @@ parse_slave_info() {
}
get_slave_info() {
- # Warning: this sets $tmpfile and LEAVE this file! You must delete it after use!
- local mysql_options
+
+ local mysql_options tmpfile
if [ "$master_log_file" -a "$master_host" ]; then
# variables are already defined, get_slave_info has been run before
@@ -470,14 +486,15 @@ get_slave_info() {
slave_io=`parse_slave_info Slave_IO_Running $tmpfile`
last_errno=`parse_slave_info Last_Errno $tmpfile`
secs_behind=`parse_slave_info Seconds_Behind_Master $tmpfile`
- ocf_log debug "MySQL instance running as a replication slave"
+ ocf_log debug "MySQL instance has a non empty slave status"
else
# Instance produced an empty "SHOW SLAVE STATUS" output --
# instance is not a slave
ocf_log err "check_slave invoked on an instance that is not a replication slave."
+ rm -f $tmpfile
return $OCF_ERR_GENERIC
fi
-
+ rm -f $tmpfile
return $OCF_SUCCESS
fi
}
@@ -496,17 +513,19 @@ check_slave() {
# diverged from its master. Make sure this resource
# doesn't restart in place.
ocf_log err "MySQL instance configured for replication, but replication has failed."
- ocf_log err "See $tmpfile for details"
# Just pull the reader VIP away, killing MySQL here would be pretty evil
# on a loaded server
-
set_reader_attr 0
+
+ # Since replication is broken, this node is not suitable to be a master
+ $CRM_MASTER -v 0
+
exit $OCF_SUCCESS
fi
- # If we got max_connections, let's remove the vip
+ # If we got max_connections, let's only remove the vip
if [ $last_errno -eq "$MYSQL_TOO_MANY_CONN_ERR" ]; then
set_reader_attr 0
exit $OCF_SUCCESS
@@ -524,7 +543,6 @@ check_slave() {
if [ "$master_host" != "$new_master" ]; then
# Not pointing to the right master, not good, removing the VIPs
set_reader_attr 0
-
exit $OCF_SUCCESS
fi
@@ -535,10 +553,10 @@ check_slave() {
# good thing. Try to recover by restarting the SQL thread
# and remove reader vip. Prevent MySQL restart.
ocf_log err "MySQL Slave SQL threads currently not running."
- ocf_log err "See $tmpfile for details"
# Remove reader vip
set_reader_attr 0
+ $CRM_MASTER -v 0
# try to restart slave
ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
@@ -553,7 +571,6 @@ check_slave() {
# behind. Let's check our lag.
if [ $secs_behind -gt $OCF_RESKEY_max_slave_lag ]; then
ocf_log err "MySQL Slave is $secs_behind seconds behind master (allowed maximum: $OCF_RESKEY_max_slave_lag)."
- ocf_log err "See $tmpfile for details"
# Remove reader vip
set_reader_attr 0
@@ -574,19 +591,22 @@ check_slave() {
fi
# is the slave ok to have a VIP on it
- if [ $secs_behind -gt $OCF_RESKEY_max_slave_lag ]; then
+ test $secs_behind -eq 0 2>/dev/null
+ if [ $? -eq 2 ]; then
set_reader_attr 0
else
- set_reader_attr 1
+ if [ $secs_behind -gt $OCF_RESKEY_max_slave_lag ]; then
+ set_reader_attr 0
+ else
+ set_reader_attr 1
+ fi
fi
ocf_log debug "MySQL instance running as a replication slave"
- rm -f $tmpfile
else
# Instance produced an empty "SHOW SLAVE STATUS" output --
# instance is not a slave
# TODO: Handle the case where get_slave_info returns a too-many-connections error
- rm -f $tmpfile
ocf_log err "check_slave invoked on an instance that is not a replication slave."
exit $OCF_ERR_GENERIC
fi
@@ -605,7 +625,6 @@ set_master() {
# master_params=", MASTER_LOG_FILE='$master_log_file', \
# MASTER_LOG_POS=$master_log_pos"
ocf_log info "Kept master pos for $master_host : $master_log_file:$master_log_pos"
- rm -f $tmpfile
return
else
master_log_file=`$CRM_ATTR_REPL_INFO --query -q | cut -d'|' -f2`
@@ -624,10 +643,9 @@ set_master() {
# reset with RESET MASTER.
ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
- -e "CHANGE MASTER TO MASTER_HOST='$new_master', \
+ -e "STOP SLAVE;CHANGE MASTER TO MASTER_HOST='$new_master', \
MASTER_USER='$OCF_RESKEY_replication_user', \
MASTER_PASSWORD='$OCF_RESKEY_replication_passwd' $master_params"
- rm -f $tmpfile
}
unset_master(){
@@ -638,7 +656,10 @@ unset_master(){
# host, then there's nothing to do. But we do log a warning as
# no-one but the CRM should be touching the MySQL master/slave
# configuration.
- if ! is_slave; then
+
+ is_slave
+ rc=$?
+ if [ $rc -ne 0 ]; then
ocf_log warn "Attempted to unset the replication master on an instance that is not configured as a replication slave"
return $OCF_SUCCESS
fi
@@ -677,6 +698,7 @@ unset_master(){
-e "STOP SLAVE IO_THREAD"
if [ $? -gt 0 ]; then
ocf_log err "Error stopping slave IO thread"
+ rm -f $tmpfile
exit $OCF_ERR_GENERIC
fi
@@ -705,7 +727,7 @@ unset_master(){
fi
ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
- -e "RESET SLAVE;"
+ -e "RESET SLAVE /*!50516 ALL */;"
if [ $? -gt 0 ]; then
ocf_log err "Failed to reset slave"
exit $OCF_ERR_GENERIC
@@ -714,7 +736,6 @@ unset_master(){
# Start replication as slave
start_slave() {
-
ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
-e "START SLAVE"
}
@@ -724,9 +745,18 @@ set_reader_attr() {
local curr_attr_value
curr_attr_value=$(get_reader_attr)
-
- if [ "$curr_attr_value" -ne "$1" ]; then
- $CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} -v $1
+
+ if [ "$1" -eq "0" ]; then
+ if [ "$curr_attr_value" -gt "0" ]; then
+ curr_attr_value=$((${curr_attr_value}-1))
+ $CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} -v $curr_attr_value
+ else
+ $CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} -v 0
+ fi
+ else
+ if [ "$curr_attr_value" -ne "$OCF_RESKEY_reader_failcount" ]; then
+ $CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} -v $OCF_RESKEY_reader_failcount
+ fi
fi
}
@@ -838,6 +868,8 @@ mysql_status() {
mysql_monitor() {
local rc
local status_loglevel="err"
+ local master_resource
+ local master_exists
# Set loglevel to info during probe
if ocf_is_probe; then
@@ -859,18 +891,30 @@ mysql_monitor() {
if [ $OCF_CHECK_LEVEL -gt 0 -a -n "$OCF_RESKEY_test_table" ]; then
# Check if this instance is configured as a slave, and if so
# check slave status
- if is_slave; then
- check_slave
+
+ master_resource=`$CRM resource list | grep p_mysql | awk '{print $3}' | head -n 1`
+ master_exists=`$CRM resource list $master_resource | egrep -c 'Master$'`
# Do we currently have a master?
+ if [ "$master_exists" -ne "0" ]; then
+ is_slave
+ rc=$?
+ if [ $rc -eq 0 -o "$OCF_RESKEY_CRM_meta_role" = "Slave" ]; then
+ check_slave
+ fi
fi
# Check for test table
ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \
-e "SELECT COUNT(*) FROM $OCF_RESKEY_test_table"
rc=$?
- if [ $rc -ne 0 ]; then
- ocf_log err "Failed to select from $test_table";
- return $OCF_ERR_GENERIC;
+ if [ $rc -ne "$MYSQL_TOO_MANY_CONN_ERR" ]; then
+ if [ $rc -ne 0 ]; then
+ ocf_log err "Failed to select from $test_table";
+ return $OCF_ERR_GENERIC;
+ fi
+ else
+ ocf_log info "Master hit max_connections"
fi
fi
@@ -887,6 +931,10 @@ mysql_start() {
if ocf_is_ms; then
# Initialize the ReaderVIP attribute, monitor will enable it
set_reader_attr 0
+
+ # set master_score to 0 in case mysql crashes on startup
+ $CRM_MASTER -v 0
+
fi
mysql_status info
@@ -943,7 +991,7 @@ mysql_start() {
#chgrp -R $OCF_RESKEY_group $OCF_RESKEY_datadir
mysql_extra_params=
if ocf_is_ms; then
- mysql_extra_params="--skip-slave-start"
+ mysql_extra_params="$mysql_extra_params --skip-slave-start --read-only"
fi
${OCF_RESKEY_binary} --defaults-file=$OCF_RESKEY_config \
@@ -979,7 +1027,7 @@ mysql_start() {
# We're configured as a stateful resource. We must start as
# slave by default. At this point we don't know if the CRM has
# already promoted a master. So, we simply start in read only
- # mode.
+ # mode. It should already be read-only from the command line.
set_read_only on
# Now, let's see whether there is a master. We might be a new
@@ -1080,8 +1128,9 @@ mysql_promote() {
if ( ! mysql_status err ); then
return $OCF_NOT_RUNNING
fi
- ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
- -e "STOP SLAVE"
+
+ unset_master
+
# Set Master Info in CIB, cluster level attribute
update_data_master_status
@@ -1094,7 +1143,10 @@ mysql_promote() {
# Existing master gets a higher-than-default master preference, so
# the cluster manager does not shuffle the master role around
# unnecessarily
- $CRM_MASTER -v $((${OCF_RESKEY_max_slave_lag}+1))
+ $CRM_MASTER -v $((${OCF_RESKEY_max_slave_lag}+1000))
+
+ # A master can accept reads
+ set_reader_attr 1
# A master can accept reads
set_reader_attr 1
@@ -1104,12 +1156,12 @@ mysql_promote() {
mysql_demote() {
if ! mysql_status err; then
- return $OCF_NOT_RUNNING
+ $CRM_MASTER -v 0
+ else
+ # Return master preference to default, so the cluster manager gets
+ # a chance to select a new master
+ $CRM_MASTER -v 1
fi
-
- # Return master preference to default, so the cluster manager gets
- # a chance to select a new master
- $CRM_MASTER -v 1
}
mysql_notify() {
@@ -1134,6 +1186,15 @@ mysql_notify() {
# The master has completed its promotion. Now is a good
# time to check whether our replication slave is working
# correctly.
+
+ # Is the notification for our set?
+ notify_resource=`echo $OCF_RESKEY_CRM_meta_notify_promote_resource|cut -d: -f1`
+ my_resource=`echo $OCF_RESOURCE_INSTANCE|cut -d: -f1`
+ if [ $notify_resource != ${my_resource} ]; then
+ ocf_log debug "Notification is not for us"
+ return $OCF_SUCCESS
+ fi
+
master_host=`echo $OCF_RESKEY_CRM_meta_notify_promote_uname|tr -d " "`
if [ "$master_host" = ${HOSTNAME} ]; then
ocf_log info "This will be the new master, ignoring post-promote notification."
@@ -1159,6 +1220,14 @@ mysql_notify() {
return $OCF_SUCCESS
;;
'pre-demote')
+ # Is the notification for our set?
+ notify_resource=`echo $OCF_RESKEY_CRM_meta_notify_demote_resource|cut -d: -f1`
+ my_resource=`echo $OCF_RESOURCE_INSTANCE|cut -d: -f1`
+ if [ $notify_resource != ${my_resource} ]; then
+ ocf_log debug "Notification is not for us"
+ return $OCF_SUCCESS
+ fi
+
demote_host=`echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " "`
if [ $demote_host = ${HOSTNAME} ]; then
ocf_log info "post-demote notification for $demote_host"
@@ -1180,16 +1249,25 @@ mysql_notify() {
ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
-e "KILL ${thread}"
done
+ rm -f $tmpfile
else
ocf_log info "Ignoring post-demote notification execpt for my own demotion."
fi
return $OCF_SUCCESS
;;
'post-demote')
+ # Is the notification for our set?
+ notify_resource=`echo $OCF_RESKEY_CRM_meta_notify_demote_resource|cut -d: -f1`
+ my_resource=`echo $OCF_RESOURCE_INSTANCE|cut -d: -f1`
+ if [ $notify_resource != ${my_resource} ]; then
+ ocf_log debug "Notification is not for us"
+ return $OCF_SUCCESS
+ fi
+
demote_host=`echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " "`
if [ $demote_host = ${HOSTNAME} ]; then
- ocf_log info "Ignoring post-demote notification for my own demotion."
- return $OCF_SUCCESS
+ ocf_log info "Ignoring post-demote notification for my own demotion."
+ return $OCF_SUCCESS
fi
ocf_log info "post-demote notification for $demote_host."
# The former master has just been gracefully demoted.
Something went wrong with that request. Please try again.