From f643935fa7544e602f862d61f01b73b261aa2276 Mon Sep 17 00:00:00 2001
From: Andrew Beekhof
Date: Tue, 16 Jan 2018 19:05:31 +1100
Subject: [PATCH 1/5] Bug rhbz#1519812 - Prevent notify actions from causing
 --wait to hang

---
 tools/crm_resource_runtime.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c
index 22bdebf8e32..189d1b3e070 100644
--- a/tools/crm_resource_runtime.c
+++ b/tools/crm_resource_runtime.c
@@ -1343,10 +1343,19 @@ cli_resource_restart(resource_t * rsc, const char *host, int timeout_ms, cib_t *
     return rc;
 }
 
-#define action_is_pending(action) \
-    ((is_set((action)->flags, pe_action_optional) == FALSE) \
-     && (is_set((action)->flags, pe_action_runnable) == TRUE) \
-     && (is_set((action)->flags, pe_action_pseudo) == FALSE))
+static inline int action_is_pending(action_t *action)
+{
+    if(is_set(action->flags, pe_action_optional)) {
+        return FALSE;
+    } else if(is_set(action->flags, pe_action_runnable) == FALSE) {
+        return FALSE;
+    } else if(is_set(action->flags, pe_action_pseudo)) {
+        return FALSE;
+    } else if(safe_str_eq("notify", action->task)) {
+        return FALSE;
+    }
+    return TRUE;
+}
 
 /*!
  * \internal
@@ -1362,7 +1371,9 @@ actions_are_pending(GListPtr actions)
     GListPtr action;
 
     for (action = actions; action != NULL; action = action->next) {
-        if (action_is_pending((action_t *) action->data)) {
+        action_t *a = (action_t *)action->data;
+        if (action_is_pending(a)) {
+            crm_notice("Waiting for %s (flags=0x%.8x)", a->uuid, a->flags);
             return TRUE;
         }
     }
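For context on why a leftover notify action matters here: crm_resource --wait
repeatedly recomputes the transition graph from the live CIB and polls until
actions_are_pending() reports nothing left to do, so any action that can never
leave the pending set keeps the loop spinning until its timeout. A minimal
sketch of that polling shape, assuming the pengine do_calculations() entry
point and a hypothetical refresh_live_cib() helper (not the tool's actual
code):

/* sketch only; assumes pacemaker's scheduler headers are available */
static int
wait_loop_sketch(int timeout_s, pe_working_set_t *data_set)
{
    while (timeout_s-- > 0) {
        refresh_live_cib(data_set);     /* hypothetical: re-read live CIB */
        do_calculations(data_set, data_set->input, NULL);

        /* with the change above, notify-only actions no longer count */
        if (actions_are_pending(data_set->actions) == FALSE) {
            return pcmk_ok;             /* cluster has settled */
        }
        sleep(1);
    }
    return -ETIME;                      /* still pending at timeout */
}

Treating notifications as never pending is reasonable for this purpose, since
they only accompany other resource activity that is still counted.
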
From cd8f984c7a15cdde46d36dbcf0636a04c341b96d Mon Sep 17 00:00:00 2001
From: Andrew Beekhof
Date: Mon, 22 Jan 2018 21:18:46 +1100
Subject: [PATCH 2/5] Fix: rhbz#1527072 - Correctly observe colocation
 constraints with bundles in the Master role

---
 pengine/container.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/pengine/container.c b/pengine/container.c
index f5d916c805c..15d094db604 100644
--- a/pengine/container.c
+++ b/pengine/container.c
@@ -486,10 +486,18 @@ container_rsc_colocation_rh(resource_t * rsc_lh, resource_t * rsc, rsc_colocatio
         } else {
             node_t *chosen = tuple->docker->fns->location(tuple->docker, NULL, FALSE);
 
-            if (chosen != NULL && is_set_recursive(tuple->docker, pe_rsc_block, TRUE) == FALSE) {
-                pe_rsc_trace(rsc, "Allowing %s: %s %d", constraint->id, chosen->details->uname, chosen->weight);
-                allocated_rhs = g_list_prepend(allocated_rhs, chosen);
+            if (chosen == NULL || is_set_recursive(tuple->docker, pe_rsc_block, TRUE)) {
+                continue;
+            }
+            if(constraint->role_rh >= RSC_ROLE_MASTER && tuple->child == NULL) {
+                continue;
             }
+            if(constraint->role_rh >= RSC_ROLE_MASTER && tuple->child->next_role < RSC_ROLE_MASTER) {
+                continue;
+            }
+
+            pe_rsc_trace(rsc, "Allowing %s: %s %d", constraint->id, chosen->details->uname, chosen->weight);
+            allocated_rhs = g_list_prepend(allocated_rhs, chosen);
         }
     }
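The three continue guards added above amount to one predicate: a bundle
replica only satisfies the right-hand side of a Master-role colocation if its
container is actually placed, is not blocked, and has a child instance that
will be promoted. The same logic folded into a single hypothetical helper for
readability (a sketch, not project code; type names follow pengine/container.c):

static gboolean
replica_allows_master_colocation(container_grouping_t *tuple,
                                 rsc_colocation_t *constraint)
{
    node_t *chosen = tuple->docker->fns->location(tuple->docker, NULL, FALSE);

    if (chosen == NULL || is_set_recursive(tuple->docker, pe_rsc_block, TRUE)) {
        return FALSE;   /* container not placed anywhere, or blocked */
    }
    if (constraint->role_rh >= RSC_ROLE_MASTER
        && (tuple->child == NULL || tuple->child->next_role < RSC_ROLE_MASTER)) {
        return FALSE;   /* Master required, but this replica won't be promoted */
    }
    return TRUE;        /* this replica's node may satisfy the constraint */
}
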
From 93b77f40c6d24fc44ff4858584ad0ee74e57e35e Mon Sep 17 00:00:00 2001
From: Ken Gaillot
Date: Mon, 22 Jan 2018 11:38:22 -0600
Subject: [PATCH 3/5] Low: crmd: quorum gain should always cause new transition

0b689055 aborted the transition on quorum loss, but quorum can also be
acquired without triggering a new transition, if corosync gives quorum
without a node joining (e.g. forced via corosync-cmapctl, or perhaps via
heuristics).

This aborts the transition when quorum is gained, but only after a 5-second
delay, if the transition has not been aborted in that time. This avoids an
unnecessary abort in the vast majority of cases where an abort is already
done, and it allows some time for all nodes to connect when quorum is
gained, rather than immediately fencing remaining unseen nodes.
---
 crmd/membership.c | 22 +++++++++++++++++-----
 crmd/te_utils.c   | 48 +++++++++++++++++++++++++++++++++++++++++++++--
 crmd/tengine.h    |  2 ++
 3 files changed, 65 insertions(+), 7 deletions(-)

diff --git a/crmd/membership.c b/crmd/membership.c
index c36dbedcf14..4f2fa8a81fc 100644
--- a/crmd/membership.c
+++ b/crmd/membership.c
@@ -438,12 +438,24 @@ crm_update_quorum(gboolean quorum, gboolean force_update)
         fsa_register_cib_callback(call_id, FALSE, NULL, cib_quorum_update_complete);
         free_xml(update);
 
-        /* If a node not running any resources is cleanly shut down and drops us
-         * below quorum, we won't necessarily abort the transition, so abort it
-         * here to be safe.
+        /* Quorum changes usually cause a new transition via other activity:
+         * quorum gained via a node joining will abort via the node join,
+         * and quorum lost via a node leaving will usually abort via resource
+         * activity and/or fencing.
+         *
+         * However, it is possible that nothing else causes a transition (e.g.
+         * someone forces quorum via corosync-cmapctl, or quorum is lost due to
+         * a node in standby shutting down cleanly), so here ensure a new
+         * transition is triggered.
          */
-        if (quorum == FALSE) {
-            abort_transition(INFINITY, tg_restart, "Quorum loss", NULL);
+        if (quorum) {
+            /* If quorum was gained, abort after a short delay, in case multiple
+             * nodes are joining around the same time, so the one that brings us
+             * to quorum doesn't cause all the remaining ones to be fenced.
+             */
+            abort_after_delay(INFINITY, tg_restart, "Quorum gained", 5000);
+        } else {
+            abort_transition(INFINITY, tg_restart, "Quorum lost", NULL);
         }
     }
     fsa_has_quorum = quorum;
diff --git a/crmd/te_utils.c b/crmd/te_utils.c
index dab02d36088..8d105dc31b5 100644
--- a/crmd/te_utils.c
+++ b/crmd/te_utils.c
@@ -530,6 +530,46 @@ trigger_graph_processing(const char *fn, int line)
     mainloop_set_trigger(transition_trigger);
 }
 
+static struct abort_timer_s {
+    bool aborted;
+    guint id;
+    int priority;
+    enum transition_action action;
+    const char *text;
+} abort_timer = { 0, };
+
+static gboolean
+abort_timer_popped(gpointer data)
+{
+    if (abort_timer.aborted == FALSE) {
+        abort_transition(abort_timer.priority, abort_timer.action,
+                         abort_timer.text, NULL);
+    }
+    abort_timer.id = 0;
+    return FALSE; // do not immediately reschedule timer
+}
+
+/*!
+ * \internal
+ * \brief Abort transition after delay, if not already aborted in that time
+ *
+ * \param[in] abort_text  Must be literal string
+ */
+void
+abort_after_delay(int abort_priority, enum transition_action abort_action,
+                  const char *abort_text, guint delay_ms)
+{
+    if (abort_timer.id) {
+        // Timer already in progress, stop and reschedule
+        g_source_remove(abort_timer.id);
+    }
+    abort_timer.aborted = FALSE;
+    abort_timer.priority = abort_priority;
+    abort_timer.action = abort_action;
+    abort_timer.text = abort_text;
+    abort_timer.id = g_timeout_add(delay_ms, abort_timer_popped, NULL);
+}
+
 void
 abort_transition_graph(int abort_priority, enum transition_action abort_action,
                        const char *abort_text, xmlNode * reason, const char *fn, int line)
@@ -557,6 +597,8 @@ abort_transition_graph(int abort_priority, enum transition_action abort_action,
         break;
     }
 
+    abort_timer.aborted = TRUE;
+
    /* Make sure any queued calculations are discarded ASAP */
     free(fsa_pe_ref);
     fsa_pe_ref = NULL;
@@ -660,10 +702,12 @@ abort_transition_graph(int abort_priority, enum transition_action abort_action,
                    (transition_graph->complete? "true" : "false"));
 
     } else {
+        const char *id = ID(reason);
+
         do_crm_log(level, "Transition aborted by %s.%s '%s': %s "
                    CRM_XS " cib=%d.%d.%d source=%s:%d path=%s complete=%s",
-                   TYPE(reason), ID(reason), (op? op : "change"), abort_text,
-                   add[0], add[1], add[2], fn, line, path,
+                   TYPE(reason), (id? id : ""), (op? op : "change"),
+                   abort_text, add[0], add[1], add[2], fn, line, path,
                    (transition_graph->complete? "true" : "false"));
     }
 }
diff --git a/crmd/tengine.h b/crmd/tengine.h
index 7205c16cc44..6a75a08c5cf 100644
--- a/crmd/tengine.h
+++ b/crmd/tengine.h
@@ -59,6 +59,8 @@ extern void notify_crmd(crm_graph_t * graph);
 
 # include
 extern void trigger_graph_processing(const char *fn, int line);
+void abort_after_delay(int abort_priority, enum transition_action abort_action,
+                       const char *abort_text, guint delay_ms);
 extern void abort_transition_graph(int abort_priority, enum transition_action abort_action,
                                    const char *abort_text, xmlNode * reason, const char *fn, int line);
 
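The abort_timer added above is a one-shot debounce: the delayed abort fires
only if no ordinary abort_transition_graph() call lands inside the window,
since every real abort sets abort_timer.aborted. The same idea as a
self-contained GLib snippet (illustrative only, not crmd code):

#include <glib.h>

static struct {
    gboolean handled;   /* set when something else handled the event */
    guint id;           /* nonzero while the timer is armed */
} debounce = { FALSE, 0 };

static gboolean
debounce_popped(gpointer data)
{
    if (!debounce.handled) {
        g_print("nothing else triggered; firing delayed action\n");
    }
    debounce.id = 0;
    return G_SOURCE_REMOVE;     /* one-shot: do not reschedule */
}

static void
debounce_schedule(guint delay_ms)
{
    if (debounce.id) {
        g_source_remove(debounce.id);   /* restart the delay window */
    }
    debounce.handled = FALSE;
    debounce.id = g_timeout_add(delay_ms, debounce_popped, NULL);
}

Keeping the timer id and zeroing it inside the callback is what makes the
reschedule safe: g_source_remove() is only ever called on a source that is
still pending, exactly as abort_after_delay() does above.
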
From 30eb9a980db152f6c803a35d3b261a563ad4ee75 Mon Sep 17 00:00:00 2001
From: Ken Gaillot
Date: Wed, 24 Jan 2018 10:51:34 -0600
Subject: [PATCH 4/5] Low: tools: crm_resource --refresh should ignore
 --operation and --interval

It already did when a resource was not specified. Also update help text
to clarify cleanup vs refresh.
---
 tools/crm_resource.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/tools/crm_resource.c b/tools/crm_resource.c
index f3eb190c3ff..f44defbfa5b 100644
--- a/tools/crm_resource.c
+++ b/tools/crm_resource.c
@@ -212,14 +212,16 @@ static struct crm_option long_options[] = {
     },
     {
         "cleanup", no_argument, NULL, 'C',
-        "\t\tDelete failed operations from a resource's history allowing its current state to be rechecked.\n"
+        "\t\tIf resource has any past failures, clear its history and fail count.\n"
         "\t\t\t\tOptionally filtered by --resource, --node, --operation, and --interval (otherwise all).\n"
+        "\t\t\t\t--operation and --interval apply to fail counts, but entire history is always cleared,\n"
+        "\t\t\t\tto allow current state to be rechecked.\n"
     },
     {
         "refresh", no_argument, NULL, 'R',
         "\t\tDelete resource's history (including failures) so its current state is rechecked.\n"
-        "\t\t\t\tOptionally filtered by --resource, --node, --operation, and --interval (otherwise all).\n"
-        "\t\t\t\tUnless --force is specified, resource's group or clone (if any) will also be cleaned"
+        "\t\t\t\tOptionally filtered by --resource and --node (otherwise all).\n"
+        "\t\t\t\tUnless --force is specified, resource's group or clone (if any) will also be refreshed."
     },
     {
         "set-parameter", required_argument, NULL, 'p',
@@ -438,7 +440,6 @@ main(int argc, char **argv)
     bool require_resource = TRUE; /* whether command requires that resource be specified */
     bool require_dataset = TRUE;  /* whether command requires populated dataset instance */
     bool require_crmd = FALSE;    /* whether command requires connection to CRMd */
-    bool just_errors = TRUE;      /* whether cleanup command deletes all history or just errors */
 
     int rc = pcmk_ok;
     int is_ocf_rc = 0;
@@ -630,8 +631,7 @@ main(int argc, char **argv)
             if (cib_file == NULL) {
                 require_crmd = TRUE;
             }
-            just_errors = FALSE;
-            rsc_cmd = 'C';
+            rsc_cmd = 'R';
             find_flags = pe_find_renamed|pe_find_anon;
             break;
 
@@ -641,7 +641,6 @@ main(int argc, char **argv)
             if (cib_file == NULL) {
                 require_crmd = TRUE;
             }
-            just_errors = TRUE;
             rsc_cmd = 'C';
             find_flags = pe_find_renamed|pe_find_anon;
             break;
@@ -1092,7 +1091,7 @@ main(int argc, char **argv)
         rc = cli_resource_delete_attribute(rsc, rsc_id, prop_set, prop_id,
                                            prop_name, cib_conn, &data_set);
 
-    } else if (rsc_cmd == 'C' && just_errors) {
+    } else if (rsc_cmd == 'C') {
         crmd_replies_needed = 0;
         for (xmlNode *xml_op = __xml_first_child(data_set.failed); xml_op != NULL;
              xml_op = __xml_next(xml_op)) {
@@ -1129,7 +1128,7 @@ main(int argc, char **argv)
             start_mainloop();
         }
 
-    } else if ((rsc_cmd == 'C') && rsc) {
+    } else if ((rsc_cmd == 'R') && rsc) {
         if(do_force == FALSE) {
             rsc = uber_parent(rsc);
         }
@@ -1137,8 +1136,8 @@ main(int argc, char **argv)
         crm_debug("Re-checking the state of %s (%s requested) on %s",
                   rsc->id, rsc_id, host_uname);
         crmd_replies_needed = 0;
-        rc = cli_resource_delete(crmd_channel, host_uname, rsc, operation,
-                                 interval, &data_set);
+        rc = cli_resource_delete(crmd_channel, host_uname, rsc, NULL, 0,
+                                 &data_set);
 
         if(rc == pcmk_ok && BE_QUIET == FALSE) {
             /* Now check XML_RSC_ATTR_TARGET_ROLE and XML_RSC_ATTR_MANAGED */
@@ -1149,7 +1148,7 @@ main(int argc, char **argv)
             start_mainloop();
         }
 
-    } else if (rsc_cmd == 'R') {
+    } else if (rsc_cmd == 'R') {
 #if HAVE_ATOMIC_ATTRD
         const char *router_node = host_uname;
         xmlNode *msg_data = NULL;
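With this dispatch change, the two commands differ mainly in what they hand to
cli_resource_delete(): refresh ('R') always passes a NULL operation and
interval, erasing the resource's entire operation history, while cleanup ('C')
walks the failed operations in data_set.failed and deletes matching entries
individually. Side by side (the cleanup call's trailing arguments are taken
from the surrounding loop variables and are an assumption, since the final
hunk below is truncated):

/* --refresh: whole history; --operation/--interval deliberately ignored */
rc = cli_resource_delete(crmd_channel, host_uname, rsc, NULL, 0, &data_set);

/* --cleanup: per-failure deletion, honoring --operation/--interval */
rc = cli_resource_delete(crmd_channel, node, rsc, task, task_interval,
                         &data_set);
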
From aecab580390f4facdc2968b4fa41f448125099f5 Mon Sep 17 00:00:00 2001
From: Ken Gaillot
Date: Wed, 24 Jan 2018 11:46:51 -0600
Subject: [PATCH 5/5] Fix: tools: crm_resource --cleanup couldn't match
 clone instances

regression introduced by 047a6611 + e3b825a7
---
 tools/crm_resource.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/tools/crm_resource.c b/tools/crm_resource.c
index f44defbfa5b..c1b8892d504 100644
--- a/tools/crm_resource.c
+++ b/tools/crm_resource.c
@@ -1101,18 +1101,26 @@ main(int argc, char **argv)
             const char *task_interval = crm_element_value(xml_op, XML_LRM_ATTR_INTERVAL);
             const char *resource_name = crm_element_value(xml_op, XML_LRM_ATTR_RSCID);
 
-            if(resource_name == NULL) {
+            if (resource_name == NULL) {
                 continue;
             } else if(host_uname && safe_str_neq(host_uname, node)) {
                 continue;
-            } else if(rsc_id && safe_str_neq(rsc_id, resource_name)) {
-                continue;
             } else if(operation && safe_str_neq(operation, task)) {
                 continue;
             } else if(interval && safe_str_neq(interval, task_interval)) {
                 continue;
             }
 
+            if (rsc_id) {
+                resource_t *fail_rsc = pe_find_resource_with_flags(data_set.resources,
+                                                                   resource_name,
+                                                                   find_flags);
+
+                if (!fail_rsc || safe_str_neq(rsc->id, fail_rsc->id)) {
+                    continue;
+                }
+            }
+
             crm_debug("Erasing %s failure for %s (%s detected) on %s",
                       task, rsc->id, resource_name, node);
             rc = cli_resource_delete(crmd_channel, node, rsc, task,