Skip to content

Commit e9741ca

Browse files
authored
Merge pull request #1409 from kgaillot/fixes
Bug fixes
2 parents 0d8c767 + aecab58 commit e9741ca

File tree

6 files changed

+114
-30
lines changed

6 files changed

+114
-30
lines changed

crmd/membership.c

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -438,12 +438,24 @@ crm_update_quorum(gboolean quorum, gboolean force_update)
438438
fsa_register_cib_callback(call_id, FALSE, NULL, cib_quorum_update_complete);
439439
free_xml(update);
440440

441-
/* If a node not running any resources is cleanly shut down and drops us
442-
* below quorum, we won't necessarily abort the transition, so abort it
443-
* here to be safe.
441+
/* Quorum changes usually cause a new transition via other activity:
442+
* quorum gained via a node joining will abort via the node join,
443+
* and quorum lost via a node leaving will usually abort via resource
444+
* activity and/or fencing.
445+
*
446+
* However, it is possible that nothing else causes a transition (e.g.
447+
* someone forces quorum via corosync-cmapctl, or quorum is lost due to
448+
* a node in standby shutting down cleanly), so here ensure a new
449+
* transition is triggered.
444450
*/
445-
if (quorum == FALSE) {
446-
abort_transition(INFINITY, tg_restart, "Quorum loss", NULL);
451+
if (quorum) {
452+
/* If quorum was gained, abort after a short delay, in case multiple
453+
* nodes are joining around the same time, so the one that brings us
454+
* to quorum doesn't cause all the remaining ones to be fenced.
455+
*/
456+
abort_after_delay(INFINITY, tg_restart, "Quorum gained", 5000);
457+
} else {
458+
abort_transition(INFINITY, tg_restart, "Quorum lost", NULL);
447459
}
448460
}
449461
fsa_has_quorum = quorum;

crmd/te_utils.c

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,46 @@ trigger_graph_processing(const char *fn, int line)
530530
mainloop_set_trigger(transition_trigger);
531531
}
532532

533+
static struct abort_timer_s {
534+
bool aborted;
535+
guint id;
536+
int priority;
537+
enum transition_action action;
538+
const char *text;
539+
} abort_timer = { 0, };
540+
541+
static gboolean
542+
abort_timer_popped(gpointer data)
543+
{
544+
if (abort_timer.aborted == FALSE) {
545+
abort_transition(abort_timer.priority, abort_timer.action,
546+
abort_timer.text, NULL);
547+
}
548+
abort_timer.id = 0;
549+
return FALSE; // do not immediately reschedule timer
550+
}
551+
552+
/*!
553+
* \internal
554+
* \brief Abort transition after delay, if not already aborted in that time
555+
*
556+
* \param[in] abort_text Must be a string literal (the pointer is stored, not copied)
557+
*/
558+
void
559+
abort_after_delay(int abort_priority, enum transition_action abort_action,
560+
const char *abort_text, guint delay_ms)
561+
{
562+
if (abort_timer.id) {
563+
// Timer already in progress, stop and reschedule
564+
g_source_remove(abort_timer.id);
565+
}
566+
abort_timer.aborted = FALSE;
567+
abort_timer.priority = abort_priority;
568+
abort_timer.action = abort_action;
569+
abort_timer.text = abort_text;
570+
abort_timer.id = g_timeout_add(delay_ms, abort_timer_popped, NULL);
571+
}
572+
533573
void
534574
abort_transition_graph(int abort_priority, enum transition_action abort_action,
535575
const char *abort_text, xmlNode * reason, const char *fn, int line)
@@ -557,6 +597,8 @@ abort_transition_graph(int abort_priority, enum transition_action abort_action,
557597
break;
558598
}
559599

600+
abort_timer.aborted = TRUE;
601+
560602
/* Make sure any queued calculations are discarded ASAP */
561603
free(fsa_pe_ref);
562604
fsa_pe_ref = NULL;
@@ -660,10 +702,12 @@ abort_transition_graph(int abort_priority, enum transition_action abort_action,
660702
(transition_graph->complete? "true" : "false"));
661703

662704
} else {
705+
const char *id = ID(reason);
706+
663707
do_crm_log(level, "Transition aborted by %s.%s '%s': %s "
664708
CRM_XS " cib=%d.%d.%d source=%s:%d path=%s complete=%s",
665-
TYPE(reason), ID(reason), (op? op : "change"), abort_text,
666-
add[0], add[1], add[2], fn, line, path,
709+
TYPE(reason), (id? id : ""), (op? op : "change"),
710+
abort_text, add[0], add[1], add[2], fn, line, path,
667711
(transition_graph->complete? "true" : "false"));
668712
}
669713
}

crmd/tengine.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ extern void notify_crmd(crm_graph_t * graph);
5959
# include <te_callbacks.h>
6060

6161
extern void trigger_graph_processing(const char *fn, int line);
62+
void abort_after_delay(int abort_priority, enum transition_action abort_action,
63+
const char *abort_text, guint delay_ms);
6264
extern void abort_transition_graph(int abort_priority, enum transition_action abort_action,
6365
const char *abort_text, xmlNode * reason, const char *fn,
6466
int line);

pengine/container.c

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -486,10 +486,18 @@ container_rsc_colocation_rh(resource_t * rsc_lh, resource_t * rsc, rsc_colocatio
486486
} else {
487487
node_t *chosen = tuple->docker->fns->location(tuple->docker, NULL, FALSE);
488488

489-
if (chosen != NULL && is_set_recursive(tuple->docker, pe_rsc_block, TRUE) == FALSE) {
490-
pe_rsc_trace(rsc, "Allowing %s: %s %d", constraint->id, chosen->details->uname, chosen->weight);
491-
allocated_rhs = g_list_prepend(allocated_rhs, chosen);
489+
if (chosen == NULL || is_set_recursive(tuple->docker, pe_rsc_block, TRUE)) {
490+
continue;
491+
}
492+
if(constraint->role_rh >= RSC_ROLE_MASTER && tuple->child == NULL) {
493+
continue;
492494
}
495+
if(constraint->role_rh >= RSC_ROLE_MASTER && tuple->child->next_role < RSC_ROLE_MASTER) {
496+
continue;
497+
}
498+
499+
pe_rsc_trace(rsc, "Allowing %s: %s %d", constraint->id, chosen->details->uname, chosen->weight);
500+
allocated_rhs = g_list_prepend(allocated_rhs, chosen);
493501
}
494502
}
495503

tools/crm_resource.c

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -212,14 +212,16 @@ static struct crm_option long_options[] = {
212212
},
213213
{
214214
"cleanup", no_argument, NULL, 'C',
215-
"\t\tDelete failed operations from a resource's history allowing its current state to be rechecked.\n"
215+
"\t\tIf resource has any past failures, clear its history and fail count.\n"
216216
"\t\t\t\tOptionally filtered by --resource, --node, --operation, and --interval (otherwise all).\n"
217+
"\t\t\t\t--operation and --interval apply to fail counts, but entire history is always cleared,\n"
218+
"\t\t\t\tto allow current state to be rechecked.\n"
217219
},
218220
{
219221
"refresh", no_argument, NULL, 'R',
220222
"\t\tDelete resource's history (including failures) so its current state is rechecked.\n"
221-
"\t\t\t\tOptionally filtered by --resource, --node, --operation, and --interval (otherwise all).\n"
222-
"\t\t\t\tUnless --force is specified, resource's group or clone (if any) will also be cleaned"
223+
"\t\t\t\tOptionally filtered by --resource and --node (otherwise all).\n"
224+
"\t\t\t\tUnless --force is specified, resource's group or clone (if any) will also be refreshed."
223225
},
224226
{
225227
"set-parameter", required_argument, NULL, 'p',
@@ -438,7 +440,6 @@ main(int argc, char **argv)
438440
bool require_resource = TRUE; /* whether command requires that resource be specified */
439441
bool require_dataset = TRUE; /* whether command requires populated dataset instance */
440442
bool require_crmd = FALSE; /* whether command requires connection to CRMd */
441-
bool just_errors = TRUE; /* whether cleanup command deletes all history or just errors */
442443

443444
int rc = pcmk_ok;
444445
int is_ocf_rc = 0;
@@ -630,8 +631,7 @@ main(int argc, char **argv)
630631
if (cib_file == NULL) {
631632
require_crmd = TRUE;
632633
}
633-
just_errors = FALSE;
634-
rsc_cmd = 'C';
634+
rsc_cmd = 'R';
635635
find_flags = pe_find_renamed|pe_find_anon;
636636
break;
637637

@@ -641,7 +641,6 @@ main(int argc, char **argv)
641641
if (cib_file == NULL) {
642642
require_crmd = TRUE;
643643
}
644-
just_errors = TRUE;
645644
rsc_cmd = 'C';
646645
find_flags = pe_find_renamed|pe_find_anon;
647646
break;
@@ -1092,7 +1091,7 @@ main(int argc, char **argv)
10921091
rc = cli_resource_delete_attribute(rsc, rsc_id, prop_set, prop_id,
10931092
prop_name, cib_conn, &data_set);
10941093

1095-
} else if (rsc_cmd == 'C' && just_errors) {
1094+
} else if (rsc_cmd == 'C') {
10961095
crmd_replies_needed = 0;
10971096
for (xmlNode *xml_op = __xml_first_child(data_set.failed); xml_op != NULL;
10981097
xml_op = __xml_next(xml_op)) {
@@ -1102,18 +1101,26 @@ main(int argc, char **argv)
11021101
const char *task_interval = crm_element_value(xml_op, XML_LRM_ATTR_INTERVAL);
11031102
const char *resource_name = crm_element_value(xml_op, XML_LRM_ATTR_RSCID);
11041103

1105-
if(resource_name == NULL) {
1104+
if (resource_name == NULL) {
11061105
continue;
11071106
} else if(host_uname && safe_str_neq(host_uname, node)) {
11081107
continue;
1109-
} else if(rsc_id && safe_str_neq(rsc_id, resource_name)) {
1110-
continue;
11111108
} else if(operation && safe_str_neq(operation, task)) {
11121109
continue;
11131110
} else if(interval && safe_str_neq(interval, task_interval)) {
11141111
continue;
11151112
}
11161113

1114+
if (rsc_id) {
1115+
resource_t *fail_rsc = pe_find_resource_with_flags(data_set.resources,
1116+
resource_name,
1117+
find_flags);
1118+
1119+
if (!fail_rsc || safe_str_neq(rsc->id, fail_rsc->id)) {
1120+
continue;
1121+
}
1122+
}
1123+
11171124
crm_debug("Erasing %s failure for %s (%s detected) on %s",
11181125
task, rsc->id, resource_name, node);
11191126
rc = cli_resource_delete(crmd_channel, node, rsc, task,
@@ -1129,16 +1136,16 @@ main(int argc, char **argv)
11291136
start_mainloop();
11301137
}
11311138

1132-
} else if ((rsc_cmd == 'C') && rsc) {
1139+
} else if ((rsc_cmd == 'R') && rsc) {
11331140
if(do_force == FALSE) {
11341141
rsc = uber_parent(rsc);
11351142
}
11361143

11371144
crm_debug("Re-checking the state of %s (%s requested) on %s",
11381145
rsc->id, rsc_id, host_uname);
11391146
crmd_replies_needed = 0;
1140-
rc = cli_resource_delete(crmd_channel, host_uname, rsc, operation,
1141-
interval, &data_set);
1147+
rc = cli_resource_delete(crmd_channel, host_uname, rsc, NULL, 0,
1148+
&data_set);
11421149

11431150
if(rc == pcmk_ok && BE_QUIET == FALSE) {
11441151
/* Now check XML_RSC_ATTR_TARGET_ROLE and XML_RSC_ATTR_MANAGED */
@@ -1149,7 +1156,7 @@ main(int argc, char **argv)
11491156
start_mainloop();
11501157
}
11511158

1152-
} else if (rsc_cmd == 'C') {
1159+
} else if (rsc_cmd == 'R') {
11531160
#if HAVE_ATOMIC_ATTRD
11541161
const char *router_node = host_uname;
11551162
xmlNode *msg_data = NULL;

tools/crm_resource_runtime.c

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1343,10 +1343,19 @@ cli_resource_restart(resource_t * rsc, const char *host, int timeout_ms, cib_t *
13431343
return rc;
13441344
}
13451345

1346-
#define action_is_pending(action) \
1347-
((is_set((action)->flags, pe_action_optional) == FALSE) \
1348-
&& (is_set((action)->flags, pe_action_runnable) == TRUE) \
1349-
&& (is_set((action)->flags, pe_action_pseudo) == FALSE))
1346+
static inline int action_is_pending(action_t *action)
1347+
{
1348+
if(is_set(action->flags, pe_action_optional)) {
1349+
return FALSE;
1350+
} else if(is_set(action->flags, pe_action_runnable) == FALSE) {
1351+
return FALSE;
1352+
} else if(is_set(action->flags, pe_action_pseudo)) {
1353+
return FALSE;
1354+
} else if(safe_str_eq("notify", action->task)) {
1355+
return FALSE;
1356+
}
1357+
return TRUE;
1358+
}
13501359

13511360
/*!
13521361
* \internal
@@ -1362,7 +1371,9 @@ actions_are_pending(GListPtr actions)
13621371
GListPtr action;
13631372

13641373
for (action = actions; action != NULL; action = action->next) {
1365-
if (action_is_pending((action_t *) action->data)) {
1374+
action_t *a = (action_t *)action->data;
1375+
if (action_is_pending(a)) {
1376+
crm_notice("Waiting for %s (flags=0x%.8x)", a->uuid, a->flags);
13661377
return TRUE;
13671378
}
13681379
}

0 commit comments

Comments
 (0)