Browse files

Merge pull request #148 from jbrassow/master

LVM agents:  Deactivate remotely active LVs before attempting EX activation
  • Loading branch information...
2 parents beb344b + f8a488e commit 8ee9185ce6463ec6ae23d9f6c63481828a71f39f @feist feist committed Oct 10, 2012
Showing with 114 additions and 11 deletions.
  1. +73 −10 rgmanager/src/resources/lvm_by_lv.sh
  2. +41 −1 rgmanager/src/resources/lvm_by_vg.sh
View
83 rgmanager/src/resources/lvm_by_lv.sh
@@ -29,6 +29,27 @@ lv_verify()
return $OCF_SUCCESS
}
# restore_transient_failed_pvs
#
# Scan the PVs of $OCF_RESKEY_vg_name and, for any PV whose attr field
# carries the 'm' (missing) flag but whose device has come back (i.e. it
# is not reported as 'unknown device'), attempt to re-add it to the VG
# with 'vgextend --restoremissing'.  Failures are logged and skipped;
# the function itself always returns 0.
restore_transient_failed_pvs()
{
	local a=0
	local -a results

	# pvs emits one (pv_name, vg_name, attr) triplet per PV.  grep is only
	# a coarse pre-filter (substring match); the exact VG-name comparison
	# happens inside the loop.  Word-splitting into the array is intended.
	results=( $(pvs -o name,vg_name,attr --noheadings 2> /dev/null | \
		grep "$OCF_RESKEY_vg_name" | grep -v 'unknown device') )

	while [ ! -z "${results[$a]}" ] ; do
		# Third char of pv_attr == 'm'  =>  PV is flagged missing.
		if [[ ${results[$(($a + 2))]} =~ ..m ]] &&
		   [ "$OCF_RESKEY_vg_name" == "${results[$(($a + 1))]}" ]; then
			ocf_log notice "Attempting to restore missing PV, ${results[$a]} in $OCF_RESKEY_vg_name"
			if ! vgextend --restoremissing "$OCF_RESKEY_vg_name" "${results[$a]}"; then
				ocf_log notice "Failed to restore ${results[$a]}"
			else
				ocf_log notice " ${results[$a]} restored"
			fi
		fi
		# Advance to the next (name, vg_name, attr) triplet.
		a=$(($a + 3))
	done
}
+
# lv_exec_resilient
#
# Sometimes, devices can come back. Their metadata will conflict
@@ -91,6 +112,11 @@ lv_activate_resilient()
if [ $action != "start" ]; then
op="-an"
+ elif [[ "$(lvs -o attr --noheadings $lv_path)" =~ r.......p ]] ||
+ [[ "$(lvs -o attr --noheadings $lv_path)" =~ R.......p ]]; then
+ # We can activate partial RAID LVs and run just fine.
+ ocf_log notice "Attempting activation of partial RAID LV, $lv_path"
+ op="-ay --partial"
fi
if ! lv_exec_resilient "lvchange $op $lv_path" ; then
@@ -317,6 +343,13 @@ lv_activate()
fi
fi
+ # If this is a partial VG, attempt to
+ # restore any transiently failed PVs
+ if [[ $(vgs -o attr --noheadings $OCF_RESKEY_vg_name) =~ ...p ]]; then
+ ocf_log err "Volume group \"$OCF_RESKEY_vg_name\" has PVs marked as missing"
+ restore_transient_failed_pvs
+ fi
+
if ! lv_activate_and_tag $1 $my_name $lv_path; then
ocf_log err "Failed to $1 $lv_path"
@@ -365,23 +398,53 @@ lv_activate()
# lv_start_clustered
#
# Exclusively activate $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name in a
# clustered VG.  If plain exclusive activation fails (commonly because
# the LV is active on a remote node), try a cluster-wide deactivation
# followed by a second exclusive attempt before resorting to
# 'lvconvert --repair'.
# Returns: $OCF_SUCCESS on activation, $OCF_ERR_GENERIC otherwise.
function lv_start_clustered
{
	# Fast path: we already can take the lock exclusively.
	if lvchange -aey $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name; then
		return $OCF_SUCCESS
	fi

	# FAILED exclusive activation:
	# This can be caused by an LV being active remotely.
	# Before attempting a repair effort, we should attempt
	# to deactivate the LV cluster-wide; but only if the LV
	# is not open.  Otherwise, it is senseless to attempt.
	# (lv_attr chars 5-6 "ao" => active and open.)
	if ! [[ "$(lvs -o attr --noheadings $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name)" =~ ....ao ]]; then
		# We'll wait a small amount of time for some settling before
		# attempting to deactivate.  Then the deactivate will be
		# immediately followed by another exclusive activation attempt.
		sleep 5
		if ! lvchange -an $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name; then
			# Someone could have the device open.
			# We can't do anything about that.
			ocf_log err "Unable to perform required deactivation of $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name before starting"
			return $OCF_ERR_GENERIC
		fi

		if lvchange -aey $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name; then
			# Second attempt after deactivation was successful, we now
			# have the lock exclusively
			return $OCF_SUCCESS
		fi
	fi

	# Failed to activate:
	# This could be due to a device failure (or another machine could
	# have snuck in between the deactivation/activation).  We don't yet
	# have a mechanism to check for remote activation, so we will proceed
	# with repair action.
	ocf_log err "Failed to activate logical volume, $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name"
	ocf_log notice "Attempting cleanup of $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name"

	if ! lvconvert --repair --use-policies $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name; then
		ocf_log err "Failed to cleanup $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name"
		return $OCF_ERR_GENERIC
	fi

	if ! lvchange -aey $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name; then
		ocf_log err "Failed second attempt to activate $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name"
		return $OCF_ERR_GENERIC
	fi

	ocf_log notice "Second attempt to activate $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name successful"
	return $OCF_SUCCESS
}
View
42 rgmanager/src/resources/lvm_by_vg.sh
@@ -194,10 +194,37 @@ function vg_start_clustered
local results
local all_pvs
local resilience
+ local try_again=false
ocf_log info "Starting volume group, $OCF_RESKEY_vg_name"
if ! vgchange -aey $OCF_RESKEY_vg_name; then
+ try_again=true
+
+ # Failure to activate:
+ # This could be caused by a remotely active LV. Before
+ # attempting any repair of the VG, we will first attempt
+ # to deactivate the VG cluster-wide.
+ # We must check for open LVs though, since these cannot
+ # be deactivated. We have no choice but to go one-by-one.
+
+ # Allow for some settling
+ sleep 5
+
+ results=(`lvs -o name,attr --noheadings $OCF_RESKEY_vg_name 2> /dev/null`)
+ a=0
+ while [ ! -z "${results[$a]}" ]; do
+ if [[ ! ${results[$(($a + 1))]} =~ ....ao ]]; then
+ if ! lvchange -an $OCF_RESKEY_vg_name/${results[$a]}; then
+ ocf_log err "Unable to perform required deactivation of $OCF_RESKEY_vg_name before starting"
+ return $OCF_ERR_GENERIC
+ fi
+ fi
+ a=$(($a + 2))
+ done
+ fi
+
+ if $try_again && ! vgchange -aey $OCF_RESKEY_vg_name; then
ocf_log err "Failed to activate volume group, $OCF_RESKEY_vg_name"
ocf_log notice "Attempting cleanup of $OCF_RESKEY_vg_name"
@@ -218,7 +245,7 @@ function vg_start_clustered
# Make sure all the logical volumes are active
results=(`lvs -o name,attr --noheadings 2> /dev/null $OCF_RESKEY_vg_name`)
a=0
- while [ ! -z ${results[$a]} ]; do
+ while [ ! -z "${results[$a]}" ]; do
if [[ ! ${results[$(($a + 1))]} =~ ....a. ]]; then
all_pvs=(`pvs --noheadings -o name 2> /dev/null`)
resilience=" --config devices{filter=["
@@ -340,6 +367,19 @@ function vg_start_single
##
function vg_start
{
+ local a=0
+ local results
+
+ results=(`lvs -o name,attr --noheadings $OCF_RESKEY_vg_name 2> /dev/null`)
+ while [ ! -z "${results[$a]}" ]; do
+ if [[ ${results[$(($a + 1))]} =~ ^r ]] ||
+ [[ ${results[$(($a + 1))]} =~ ^R ]]; then
+ ocf_log err "RAID LVs are not supported without an 'lv_name' specification"
+ return $OCF_ERR_GENERIC
+ fi
+ a=$(($a + 2))
+ done
+
if [[ "$(vgs -o attr --noheadings $OCF_RESKEY_vg_name)" =~ .....c ]]; then
vg_start_clustered
else

0 comments on commit 8ee9185

Please sign in to comment.