Skip to content

Commit

Permalink
Merge pull request #1409 from kgaillot/fixes
Browse files Browse the repository at this point in the history
Bug fixes
  • Loading branch information
kgaillot committed Jan 24, 2018
2 parents 0d8c767 + aecab58 commit e9741ca
Show file tree
Hide file tree
Showing 6 changed files with 114 additions and 30 deletions.
22 changes: 17 additions & 5 deletions crmd/membership.c
Expand Up @@ -438,12 +438,24 @@ crm_update_quorum(gboolean quorum, gboolean force_update)
fsa_register_cib_callback(call_id, FALSE, NULL, cib_quorum_update_complete);
free_xml(update);

/* If a node not running any resources is cleanly shut down and drops us
* below quorum, we won't necessarily abort the transition, so abort it
* here to be safe.
/* Quorum changes usually cause a new transition via other activity:
* quorum gained via a node joining will abort via the node join,
* and quorum lost via a node leaving will usually abort via resource
* activity and/or fencing.
*
* However, it is possible that nothing else causes a transition (e.g.
* someone forces quorum via corosync-cmapctl, or quorum is lost due to
* a node in standby shutting down cleanly), so here ensure a new
* transition is triggered.
*/
if (quorum == FALSE) {
abort_transition(INFINITY, tg_restart, "Quorum loss", NULL);
if (quorum) {
/* If quorum was gained, abort after a short delay, in case multiple
* nodes are joining around the same time, so the one that brings us
* to quorum doesn't cause all the remaining ones to be fenced.
*/
abort_after_delay(INFINITY, tg_restart, "Quorum gained", 5000);
} else {
abort_transition(INFINITY, tg_restart, "Quorum lost", NULL);
}
}
fsa_has_quorum = quorum;
Expand Down
48 changes: 46 additions & 2 deletions crmd/te_utils.c
Expand Up @@ -530,6 +530,46 @@ trigger_graph_processing(const char *fn, int line)
mainloop_set_trigger(transition_trigger);
}

/* State of the single delayed transition abort that may be pending.
 * Written by abort_after_delay(), consumed by abort_timer_popped().
 */
static struct abort_timer_s {
// Cleared when a delayed abort is scheduled; set once a transition abort
// is processed, so the timer callback can skip a redundant abort
bool aborted;
// ID returned by g_timeout_add(); 0 means no timer is currently pending
guint id;
// Priority handed to abort_transition() when the timer pops
int priority;
// Action handed to abort_transition() when the timer pops
enum transition_action action;
// Abort reason; the pointer is stored as-is (not copied)
const char *text;
} abort_timer = { 0, };

/*!
 * \internal
 * \brief Timeout callback that carries out a delayed transition abort
 *
 * Aborts the transition with the saved priority, action and text, unless an
 * abort has already been recorded since the timer was scheduled. In either
 * case the saved timer ID is reset to 0.
 *
 * \param[in] data  Unused
 *
 * \return FALSE always
 */
static gboolean
abort_timer_popped(gpointer data)
{
    if (!abort_timer.aborted) {
        // Nothing else aborted the transition while we waited, so do it now
        abort_transition(abort_timer.priority, abort_timer.action,
                         abort_timer.text, NULL);
    }

    // Mark the timer as no longer pending
    abort_timer.id = 0;

    return FALSE; // do not immediately reschedule timer
}

/*!
 * \internal
 * \brief Schedule a transition abort to happen after a delay
 *
 * The abort is skipped when the timer pops if a transition abort has already
 * been recorded in the meantime. Only one delayed abort is tracked: calling
 * this while a timer is pending cancels that timer and starts a new one with
 * the new arguments.
 *
 * \param[in] abort_priority  Priority to abort the transition with
 * \param[in] abort_action    Action to abort the transition with
 * \param[in] abort_text      Abort reason; must be literal string, because
 *                            only the pointer is saved (no copy is made)
 * \param[in] delay_ms        Delay before aborting, in milliseconds
 */
void
abort_after_delay(int abort_priority, enum transition_action abort_action,
                  const char *abort_text, guint delay_ms)
{
    if (abort_timer.id != 0) {
        // A delayed abort is already pending: cancel it, it gets rescheduled below
        g_source_remove(abort_timer.id);
    }

    // Remember what to abort with once the delay has elapsed
    abort_timer.text = abort_text;
    abort_timer.action = abort_action;
    abort_timer.priority = abort_priority;

    // Start from a clean slate so only aborts from now on suppress this one
    abort_timer.aborted = FALSE;

    abort_timer.id = g_timeout_add(delay_ms, abort_timer_popped, NULL);
}

void
abort_transition_graph(int abort_priority, enum transition_action abort_action,
const char *abort_text, xmlNode * reason, const char *fn, int line)
Expand Down Expand Up @@ -557,6 +597,8 @@ abort_transition_graph(int abort_priority, enum transition_action abort_action,
break;
}

abort_timer.aborted = TRUE;

/* Make sure any queued calculations are discarded ASAP */
free(fsa_pe_ref);
fsa_pe_ref = NULL;
Expand Down Expand Up @@ -660,10 +702,12 @@ abort_transition_graph(int abort_priority, enum transition_action abort_action,
(transition_graph->complete? "true" : "false"));

} else {
const char *id = ID(reason);

do_crm_log(level, "Transition aborted by %s.%s '%s': %s "
CRM_XS " cib=%d.%d.%d source=%s:%d path=%s complete=%s",
TYPE(reason), ID(reason), (op? op : "change"), abort_text,
add[0], add[1], add[2], fn, line, path,
TYPE(reason), (id? id : ""), (op? op : "change"),
abort_text, add[0], add[1], add[2], fn, line, path,
(transition_graph->complete? "true" : "false"));
}
}
Expand Down
2 changes: 2 additions & 0 deletions crmd/tengine.h
Expand Up @@ -59,6 +59,8 @@ extern void notify_crmd(crm_graph_t * graph);
# include <te_callbacks.h>

extern void trigger_graph_processing(const char *fn, int line);
void abort_after_delay(int abort_priority, enum transition_action abort_action,
const char *abort_text, guint delay_ms);
extern void abort_transition_graph(int abort_priority, enum transition_action abort_action,
const char *abort_text, xmlNode * reason, const char *fn,
int line);
Expand Down
14 changes: 11 additions & 3 deletions pengine/container.c
Expand Up @@ -486,10 +486,18 @@ container_rsc_colocation_rh(resource_t * rsc_lh, resource_t * rsc, rsc_colocatio
} else {
node_t *chosen = tuple->docker->fns->location(tuple->docker, NULL, FALSE);

if (chosen != NULL && is_set_recursive(tuple->docker, pe_rsc_block, TRUE) == FALSE) {
pe_rsc_trace(rsc, "Allowing %s: %s %d", constraint->id, chosen->details->uname, chosen->weight);
allocated_rhs = g_list_prepend(allocated_rhs, chosen);
if (chosen == NULL || is_set_recursive(tuple->docker, pe_rsc_block, TRUE)) {
continue;
}
if(constraint->role_rh >= RSC_ROLE_MASTER && tuple->child == NULL) {
continue;
}
if(constraint->role_rh >= RSC_ROLE_MASTER && tuple->child->next_role < RSC_ROLE_MASTER) {
continue;
}

pe_rsc_trace(rsc, "Allowing %s: %s %d", constraint->id, chosen->details->uname, chosen->weight);
allocated_rhs = g_list_prepend(allocated_rhs, chosen);
}
}

Expand Down
37 changes: 22 additions & 15 deletions tools/crm_resource.c
Expand Up @@ -212,14 +212,16 @@ static struct crm_option long_options[] = {
},
{
"cleanup", no_argument, NULL, 'C',
"\t\tDelete failed operations from a resource's history allowing its current state to be rechecked.\n"
"\t\tIf resource has any past failures, clear its history and fail count.\n"
"\t\t\t\tOptionally filtered by --resource, --node, --operation, and --interval (otherwise all).\n"
"\t\t\t\t--operation and --interval apply to fail counts, but entire history is always cleared,\n"
"\t\t\t\tto allow current state to be rechecked.\n"
},
{
"refresh", no_argument, NULL, 'R',
"\t\tDelete resource's history (including failures) so its current state is rechecked.\n"
"\t\t\t\tOptionally filtered by --resource, --node, --operation, and --interval (otherwise all).\n"
"\t\t\t\tUnless --force is specified, resource's group or clone (if any) will also be cleaned"
"\t\t\t\tOptionally filtered by --resource and --node (otherwise all).\n"
"\t\t\t\tUnless --force is specified, resource's group or clone (if any) will also be refreshed."
},
{
"set-parameter", required_argument, NULL, 'p',
Expand Down Expand Up @@ -438,7 +440,6 @@ main(int argc, char **argv)
bool require_resource = TRUE; /* whether command requires that resource be specified */
bool require_dataset = TRUE; /* whether command requires populated dataset instance */
bool require_crmd = FALSE; /* whether command requires connection to CRMd */
bool just_errors = TRUE; /* whether cleanup command deletes all history or just errors */

int rc = pcmk_ok;
int is_ocf_rc = 0;
Expand Down Expand Up @@ -630,8 +631,7 @@ main(int argc, char **argv)
if (cib_file == NULL) {
require_crmd = TRUE;
}
just_errors = FALSE;
rsc_cmd = 'C';
rsc_cmd = 'R';
find_flags = pe_find_renamed|pe_find_anon;
break;

Expand All @@ -641,7 +641,6 @@ main(int argc, char **argv)
if (cib_file == NULL) {
require_crmd = TRUE;
}
just_errors = TRUE;
rsc_cmd = 'C';
find_flags = pe_find_renamed|pe_find_anon;
break;
Expand Down Expand Up @@ -1092,7 +1091,7 @@ main(int argc, char **argv)
rc = cli_resource_delete_attribute(rsc, rsc_id, prop_set, prop_id,
prop_name, cib_conn, &data_set);

} else if (rsc_cmd == 'C' && just_errors) {
} else if (rsc_cmd == 'C') {
crmd_replies_needed = 0;
for (xmlNode *xml_op = __xml_first_child(data_set.failed); xml_op != NULL;
xml_op = __xml_next(xml_op)) {
Expand All @@ -1102,18 +1101,26 @@ main(int argc, char **argv)
const char *task_interval = crm_element_value(xml_op, XML_LRM_ATTR_INTERVAL);
const char *resource_name = crm_element_value(xml_op, XML_LRM_ATTR_RSCID);

if(resource_name == NULL) {
if (resource_name == NULL) {
continue;
} else if(host_uname && safe_str_neq(host_uname, node)) {
continue;
} else if(rsc_id && safe_str_neq(rsc_id, resource_name)) {
continue;
} else if(operation && safe_str_neq(operation, task)) {
continue;
} else if(interval && safe_str_neq(interval, task_interval)) {
continue;
}

if (rsc_id) {
resource_t *fail_rsc = pe_find_resource_with_flags(data_set.resources,
resource_name,
find_flags);

if (!fail_rsc || safe_str_neq(rsc->id, fail_rsc->id)) {
continue;
}
}

crm_debug("Erasing %s failure for %s (%s detected) on %s",
task, rsc->id, resource_name, node);
rc = cli_resource_delete(crmd_channel, node, rsc, task,
Expand All @@ -1129,16 +1136,16 @@ main(int argc, char **argv)
start_mainloop();
}

} else if ((rsc_cmd == 'C') && rsc) {
} else if ((rsc_cmd == 'R') && rsc) {
if(do_force == FALSE) {
rsc = uber_parent(rsc);
}

crm_debug("Re-checking the state of %s (%s requested) on %s",
rsc->id, rsc_id, host_uname);
crmd_replies_needed = 0;
rc = cli_resource_delete(crmd_channel, host_uname, rsc, operation,
interval, &data_set);
rc = cli_resource_delete(crmd_channel, host_uname, rsc, NULL, 0,
&data_set);

if(rc == pcmk_ok && BE_QUIET == FALSE) {
/* Now check XML_RSC_ATTR_TARGET_ROLE and XML_RSC_ATTR_MANAGED */
Expand All @@ -1149,7 +1156,7 @@ main(int argc, char **argv)
start_mainloop();
}

} else if (rsc_cmd == 'C') {
} else if (rsc_cmd == 'R') {
#if HAVE_ATOMIC_ATTRD
const char *router_node = host_uname;
xmlNode *msg_data = NULL;
Expand Down
21 changes: 16 additions & 5 deletions tools/crm_resource_runtime.c
Expand Up @@ -1343,10 +1343,19 @@ cli_resource_restart(resource_t * rsc, const char *host, int timeout_ms, cib_t *
return rc;
}

/* Evaluates to nonzero when an action still has to be waited for: its flags
 * must not have pe_action_optional set, must have pe_action_runnable set,
 * and must not have pe_action_pseudo set.
 *
 * NOTE: the argument is expanded three times, so it must not have side
 * effects.
 */
#define action_is_pending(action) \
((is_set((action)->flags, pe_action_optional) == FALSE) \
&& (is_set((action)->flags, pe_action_runnable) == TRUE) \
&& (is_set((action)->flags, pe_action_pseudo) == FALSE))
/*!
 * \internal
 * \brief Check whether an action still needs to be waited for
 *
 * An action is not considered pending if it is optional, not runnable,
 * a pseudo-action, or a "notify" task.
 *
 * \param[in] action  Action to check
 *
 * \return TRUE if the action is pending, FALSE otherwise
 */
static inline int action_is_pending(action_t *action)
{
    // Checks are evaluated in the same order as the exclusion list above
    if (is_set(action->flags, pe_action_optional)
        || !is_set(action->flags, pe_action_runnable)
        || is_set(action->flags, pe_action_pseudo)
        || safe_str_eq("notify", action->task)) {
        return FALSE;
    }
    return TRUE;
}

/*!
* \internal
Expand All @@ -1362,7 +1371,9 @@ actions_are_pending(GListPtr actions)
GListPtr action;

for (action = actions; action != NULL; action = action->next) {
if (action_is_pending((action_t *) action->data)) {
action_t *a = (action_t *)action->data;
if (action_is_pending(a)) {
crm_notice("Waiting for %s (flags=0x%.8x)", a->uuid, a->flags);
return TRUE;
}
}
Expand Down

0 comments on commit e9741ca

Please sign in to comment.