Skip to content

Commit

Permalink
Merge pull request #1263 from kgaillot/fixes
Browse files Browse the repository at this point in the history
Better stonith failure handling in crmd, plus a few unrelated fixes
  • Loading branch information
kgaillot committed Apr 18, 2017
2 parents 2124e04 + 0c15dfd commit 0ea7a77
Show file tree
Hide file tree
Showing 21 changed files with 528 additions and 100 deletions.
5 changes: 4 additions & 1 deletion crmd/crmd_utils.h
Expand Up @@ -19,6 +19,7 @@
# define CRMD_UTILS__H

# include <crm/crm.h>
# include <crm/transition.h>
# include <crm/common/xml.h>
# include <crm/cib/internal.h> /* For CIB_OP_MODIFY */
# include "crmd_alerts.h"
Expand Down Expand Up @@ -100,8 +101,10 @@ int crmd_join_phase_count(enum crm_join_phase phase);
void crmd_join_phase_log(int level);

const char *get_timer_desc(fsa_timer_t * timer);
gboolean too_many_st_failures(void);
void st_fail_count_reset(const char * target);
void st_fail_count_increment(const char *target);
void abort_for_stonith_failure(enum transition_action abort_action,
const char *target, xmlNode *reason);
void crmd_peer_down(crm_node_t *peer, bool full);

/* Convenience macro for registering a CIB callback
Expand Down
24 changes: 24 additions & 0 deletions crmd/join_client.c
Expand Up @@ -30,6 +30,26 @@ void join_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, v

extern ha_msg_input_t *copy_ha_msg_input(ha_msg_input_t * orig);

/*!
 * \internal
 * \brief Remember if DC is shutting down as we join
 *
 * If we're joining while the current DC is shutting down, update its expected
 * state, so we don't fence it if we become the new DC. (We weren't a peer
 * when it broadcast its shutdown request.)
 *
 * \param[in] msg  A join message from the DC
 */
static void
update_dc_expected(xmlNode *msg)
{
    const char *dc_leaving = NULL;

    if (fsa_our_dc == NULL) {
        return; /* no known DC whose expected state we could record */
    }

    dc_leaving = crm_element_value(msg, F_CRM_DC_LEAVING);
    if (crm_is_true(dc_leaving)) {
        crm_update_peer_expected(__FUNCTION__, crm_get_peer(0, fsa_our_dc),
                                 CRMD_JOINSTATE_DOWN);
    }
}

/* A_CL_JOIN_QUERY */
/* is there a DC out there? */
void
Expand Down Expand Up @@ -128,6 +148,8 @@ do_cl_join_offer_respond(long long action,
return;
}

update_dc_expected(input->msg);

CRM_LOG_ASSERT(input != NULL);
query_call_id =
fsa_cib_conn->cmds->query(fsa_cib_conn, NULL, NULL, cib_scope_local | cib_no_children);
Expand Down Expand Up @@ -250,6 +272,8 @@ do_cl_join_finalize_respond(long long action,
return;
}

update_dc_expected(input->msg);

/* send our status section to the DC */
tmp1 = do_lrm_query(TRUE, fsa_our_uname);
if (tmp1 != NULL) {
Expand Down
38 changes: 30 additions & 8 deletions crmd/join_dc.c
Expand Up @@ -106,6 +106,30 @@ initialize_join(gboolean before)
}
}

/*!
* \internal
* \brief Create a join message from the DC
*
* \param[in] join_op Join operation name
* \param[in] host_to Recipient of message
*/
static xmlNode *
create_dc_message(const char *join_op, const char *host_to)
{
xmlNode *msg = create_request(join_op, NULL, host_to, CRM_SYSTEM_CRMD,
CRM_SYSTEM_DC, NULL);

/* Identify which election this is a part of */
crm_xml_add_int(msg, F_CRM_JOIN_ID, current_join_id);

/* Add a field specifying whether the DC is shutting down. This keeps the
* joining node from fencing the old DC if it becomes the new DC.
*/
crm_xml_add_boolean(msg, F_CRM_DC_LEAVING,
is_set(fsa_input_register, R_SHUTDOWN));
return msg;
}

static void
join_make_offer(gpointer key, gpointer value, gpointer user_data)
{
Expand Down Expand Up @@ -147,10 +171,8 @@ join_make_offer(gpointer key, gpointer value, gpointer user_data)

crm_update_peer_join(__FUNCTION__, (crm_node_t*)member, crm_join_none);

offer = create_request(CRM_OP_JOIN_OFFER, NULL, member->uname,
CRM_SYSTEM_CRMD, CRM_SYSTEM_DC, NULL);
offer = create_dc_message(CRM_OP_JOIN_OFFER, member->uname);

crm_xml_add_int(offer, F_CRM_JOIN_ID, current_join_id);
/* send the welcome */
crm_info("join-%d: Sending offer to %s", current_join_id, member->uname);

Expand Down Expand Up @@ -242,8 +264,10 @@ do_dc_join_offer_one(long long action,
/* always offer to the DC (ourselves)
* this ensures the correct value for max_generation_from
*/
member = crm_get_peer(0, fsa_our_uname);
join_make_offer(NULL, member, NULL);
if (strcmp(join_to, fsa_our_uname) != 0) {
member = crm_get_peer(0, fsa_our_uname);
join_make_offer(NULL, member, NULL);
}

/* this was a genuine join request, cancel any existing
* transition and invoke the PE
Expand Down Expand Up @@ -586,9 +610,7 @@ finalize_join_for(gpointer key, gpointer value, gpointer user_data)
}

/* send the ack/nack to the node */
acknak = create_request(CRM_OP_JOIN_ACKNAK, NULL, join_to,
CRM_SYSTEM_CRMD, CRM_SYSTEM_DC, NULL);
crm_xml_add_int(acknak, F_CRM_JOIN_ID, current_join_id);
acknak = create_dc_message(CRM_OP_JOIN_ACKNAK, join_to);

crm_debug("join-%d: ACK'ing join request from %s",
current_join_id, join_to);
Expand Down
6 changes: 6 additions & 0 deletions crmd/messages.c
Expand Up @@ -870,6 +870,12 @@ handle_request(xmlNode * stored_msg, enum crmd_fsa_cause cause)

} else {
reap_crm_member(id, name);

/* If we're forgetting this node, also forget any failures to fence
* it, so we don't carry that over to any node added later with the
* same name.
*/
st_fail_count_reset(name);
}

} else if (strcmp(op, CRM_OP_MAINTENANCE_NODES) == 0) {
Expand Down
12 changes: 4 additions & 8 deletions crmd/te_actions.c
Expand Up @@ -726,15 +726,11 @@ notify_crmd(crm_graph_t * graph)
case tg_restart:
type = "restart";
if (fsa_state == S_TRANSITION_ENGINE) {
if (too_many_st_failures() == FALSE) {
if (transition_timer->period_ms > 0) {
crm_timer_stop(transition_timer);
crm_timer_start(transition_timer);
} else {
event = I_PE_CALC;
}
if (transition_timer->period_ms > 0) {
crm_timer_stop(transition_timer);
crm_timer_start(transition_timer);
} else {
event = I_TE_SUCCESS;
event = I_PE_CALC;
}

} else if (fsa_state == S_POLICY_ENGINE) {
Expand Down
96 changes: 79 additions & 17 deletions crmd/te_callbacks.c
Expand Up @@ -635,8 +635,8 @@ struct st_fail_rec {
int count;
};

gboolean
too_many_st_failures(void)
static gboolean
too_many_st_failures(const char *target)
{
GHashTableIter iter;
const char *key = NULL;
Expand All @@ -646,32 +646,63 @@ too_many_st_failures(void)
return FALSE;
}

g_hash_table_iter_init(&iter, stonith_failures);
while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
if (value->count > stonith_max_attempts ) {
crm_warn("Too many failures to fence %s (%d), giving up", key, value->count);
return TRUE;
if (target == NULL) {
g_hash_table_iter_init(&iter, stonith_failures);
while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
if (value->count >= stonith_max_attempts) {
target = (const char*)key;
goto too_many;
}
}
} else {
value = g_hash_table_lookup(stonith_failures, target);
if ((value != NULL) && (value->count >= stonith_max_attempts)) {
goto too_many;
}
}
return FALSE;

too_many:
crm_warn("Too many failures (%d) to fence %s, giving up",
value->count, target);
return TRUE;
}

/*!
* \internal
* \brief Reset a stonith fail count
*
* \param[in] target Name of node to reset, or NULL for all
*/
void
st_fail_count_reset(const char *target)
{
struct st_fail_rec *rec = NULL;

if (stonith_failures) {
rec = g_hash_table_lookup(stonith_failures, target);
if (stonith_failures == NULL) {
return;
}

if (rec) {
rec->count = 0;
if (target) {
struct st_fail_rec *rec = NULL;

rec = g_hash_table_lookup(stonith_failures, target);
if (rec) {
rec->count = 0;
}
} else {
GHashTableIter iter;
const char *key = NULL;
struct st_fail_rec *rec = NULL;

g_hash_table_iter_init(&iter, stonith_failures);
while (g_hash_table_iter_next(&iter, (gpointer *) &key,
(gpointer *) &rec)) {
rec->count = 0;
}
}
}

static void
st_fail_count_increment(const char *target, int rc)
void
st_fail_count_increment(const char *target)
{
struct st_fail_rec *rec = NULL;

Expand All @@ -694,6 +725,27 @@ st_fail_count_increment(const char *target, int rc)
}
}

/*!
 * \internal
 * \brief Abort transition due to stonith failure
 *
 * \param[in] abort_action  Whether to restart or stop transition
 * \param[in] target        Don't restart if this node (NULL for any node) has
 *                          too many fencing failures
 * \param[in] reason        Log this stonith action XML as abort reason (or NULL)
 */
void
abort_for_stonith_failure(enum transition_action abort_action,
                          const char *target, xmlNode *reason)
{
    enum transition_action action = abort_action;

    /* Once fencing has failed too often for the target, stop scheduling new
     * transitions for that reason instead of retrying indefinitely.
     */
    if ((action != tg_stop) && too_many_st_failures(target)) {
        action = tg_stop;
    }
    abort_transition(INFINITY, action, "Stonith failed", reason);
}

void
tengine_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)
{
Expand Down Expand Up @@ -755,12 +807,22 @@ tengine_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)

} else {
const char *target = crm_element_value_const(action->xml, XML_LRM_ATTR_TARGET);
enum transition_action abort_action = tg_restart;

action->failed = TRUE;
crm_notice("Stonith operation %d for %s failed (%s): aborting transition.",
call_id, target, pcmk_strerror(rc));
abort_transition(INFINITY, tg_restart, "Stonith failed", NULL);
st_fail_count_increment(target, rc);

/* If no fence devices were available, there's no use in immediately
* checking again, so don't start a new transition in that case.
*/
if (rc == -ENODEV) {
crm_warn("No devices found in cluster to fence %s, giving up",
target);
abort_action = tg_stop;
}

abort_for_stonith_failure(abort_action, target, NULL);
}

update_graph(transition_graph, action);
Expand Down
11 changes: 7 additions & 4 deletions crmd/te_utils.c
Expand Up @@ -162,7 +162,7 @@ fail_incompletable_stonith(crm_graph_t * graph)

if (last_action != NULL) {
crm_warn("STONITHd failure resulted in un-runnable actions");
abort_transition(INFINITY, tg_restart, "Stonith failure", last_action);
abort_for_stonith_failure(tg_restart, NULL, last_action);
return TRUE;
}

Expand Down Expand Up @@ -259,9 +259,12 @@ tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event)
return;
}

if (st_event->result == pcmk_ok &&
safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) {
st_fail_count_reset(st_event->target);
if (safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) {
if (st_event->result == pcmk_ok) {
st_fail_count_reset(st_event->target);
} else {
st_fail_count_increment(st_event->target);
}
}

crm_notice("Peer %s was%s terminated (%s) by %s for %s: %s (ref=%s) by client %s",
Expand Down
4 changes: 2 additions & 2 deletions doc/Pacemaker_Explained/en-US/Ch-Options.txt
Expand Up @@ -230,8 +230,8 @@ How long to wait for STONITH actions (reboot, on, off) to complete
| stonith-max-attempts | 10 |
indexterm:[stonith-max-attempts,Cluster Option]
indexterm:[Cluster,Option,stonith-max-attempts]
How many times stonith can fail before it will no longer be attempted on a target.
Positive non-zero values are allowed. '(since 1.1.17)'
How many times fencing can fail for a target before the cluster will no longer
immediately re-attempt it. '(since 1.1.17)'

| concurrent-fencing | FALSE |
indexterm:[concurrent-fencing,Cluster Option]
Expand Down

0 comments on commit 0ea7a77

Please sign in to comment.