Skip to content

Commit e29d2f9

Browse files
committed
High: crmd: Block after 10 failed fencing attempts for a node
1 parent 1ae5fec commit e29d2f9

File tree

5 files changed

+58
-11
lines changed

5 files changed

+58
-11
lines changed

TODO.markdown

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# Semi-random collection of tasks we'd like to get done
22

33
## Targeted for 1.2
4-
- Block after N stonith failures (per node)
54
- Promote any necessary items from the 1.1 schema
65
- Get fencing/test.c working again (with and without stonithd -s)
76
- Avoid the use of xmlNode in fencing register_callback() call types

crmd/crmd_utils.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ extern void update_attrd(const char *host, const char *name, const char *value,
7474
const char *user_name);
7575

7676
extern const char *get_timer_desc(fsa_timer_t * timer);
77+
gboolean too_many_st_failures(void);
7778

7879
# define start_transition(state) do { \
7980
switch(state) { \

crmd/te_actions.c

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -124,9 +124,8 @@ te_fence_node(crm_graph_t * graph, crm_action_t * action)
124124
return FALSE;
125125
}
126126

127-
crm_notice(
128-
"Executing %s fencing operation (%s) on %s (timeout=%d)",
129-
type, id, target, transition_graph->stonith_timeout);
127+
crm_notice("Executing %s fencing operation (%s) on %s (timeout=%d)",
128+
type, id, target, transition_graph->stonith_timeout);
130129

131130
/* Passing NULL means block until we can connect... */
132131
te_connect_stonith(NULL);
@@ -138,10 +137,10 @@ te_fence_node(crm_graph_t * graph, crm_action_t * action)
138137
rc = stonith_api->cmds->fence(stonith_api, options, target, type,
139138
transition_graph->stonith_timeout / 1000);
140139

141-
stonith_api->cmds->register_callback(stonith_api, rc, transition_graph->stonith_timeout / 1000,
142-
FALSE, generate_transition_key(transition_graph->id,
143-
action->id, 0, te_uuid),
144-
"tengine_stonith_callback", tengine_stonith_callback);
140+
stonith_api->cmds->register_callback(
141+
stonith_api, rc, transition_graph->stonith_timeout / 1000,
142+
FALSE, generate_transition_key(transition_graph->id, action->id, 0, te_uuid),
143+
"tengine_stonith_callback", tengine_stonith_callback);
145144

146145
return TRUE;
147146
}
@@ -481,7 +480,7 @@ notify_crmd(crm_graph_t * graph)
481480
if (transition_timer->period_ms > 0) {
482481
crm_timer_stop(transition_timer);
483482
crm_timer_start(transition_timer);
484-
} else {
483+
} else if(too_many_st_failures() == FALSE) {
485484
event = I_PE_CALC;
486485
}
487486

crmd/te_callbacks.c

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,34 @@ process_te_message(xmlNode * msg, xmlNode * xml_data)
393393
return TRUE;
394394
}
395395

396+
/* Table of fencing failures, keyed by target name.  It starts out NULL and is
 * created on demand by tengine_stonith_callback(), which stores a copy of the
 * target name as the key and a heap-allocated struct st_fail_rec as the value.
 */
GHashTable *stonith_failures = NULL;
397+
/* Failure record kept for one fencing target: 'count' is incremented each time
 * a stonith operation against the target fails and is reset to 0 when one
 * succeeds.
 */
struct st_fail_rec
398+
{
399+
int count;
400+
};
401+
402+
gboolean too_many_st_failures(void)
403+
{
404+
GHashTableIter iter;
405+
const char *key = NULL;
406+
struct st_fail_rec *value = NULL;
407+
408+
if(stonith_failures == NULL) {
409+
return FALSE;
410+
}
411+
412+
g_hash_table_iter_init(&iter, stonith_failures);
413+
while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
414+
if(value->count > 10) {
415+
crm_notice("Too many failures to fence %s (%d), giving up",
416+
key, value->count);
417+
return TRUE;
418+
}
419+
}
420+
return FALSE;
421+
}
422+
423+
396424
void
397425
tengine_stonith_callback(stonith_t * stonith, const xmlNode * msg, int call_id, int rc,
398426
xmlNode * output, void *userdata)
@@ -402,6 +430,7 @@ tengine_stonith_callback(stonith_t * stonith, const xmlNode * msg, int call_id,
402430
int stonith_id = -1;
403431
int transition_id = -1;
404432
crm_action_t *action = NULL;
433+
struct st_fail_rec *rec = NULL;
405434

406435
CRM_CHECK(userdata != NULL, return);
407436
crm_log_xml_trace(output, "StonithOp");
@@ -411,6 +440,7 @@ tengine_stonith_callback(stonith_t * stonith, const xmlNode * msg, int call_id,
411440
if (AM_I_DC == FALSE) {
412441
return;
413442
}
443+
414444
/* crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", */
415445
/* op->call_id, op->optype, op->node_name, op->op_result, */
416446
/* (char *)op->node_list, op->private_data); */
@@ -435,6 +465,11 @@ tengine_stonith_callback(stonith_t * stonith, const xmlNode * msg, int call_id,
435465
}
436466

437467
stop_te_timer(action->timer);
468+
if(stonith_failures == NULL) {
469+
stonith_failures = g_hash_table_new_full(
470+
crm_str_hash, g_str_equal, g_hash_destroy_str, free);
471+
}
472+
438473
if (rc == pcmk_ok) {
439474
const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
440475
const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
@@ -444,7 +479,11 @@ tengine_stonith_callback(stonith_t * stonith, const xmlNode * msg, int call_id,
444479
action->confirmed = TRUE;
445480
send_stonith_update(action, target, uuid);
446481
}
447-
482+
rec = g_hash_table_lookup(stonith_failures, target);
483+
if(rec) {
484+
rec->count = 0;
485+
}
486+
448487
} else {
449488
const char *target = crm_element_value_const(action->xml, XML_LRM_ATTR_TARGET);
450489
const char *allow_fail = crm_meta_value(action->params, XML_ATTR_TE_ALLOWFAIL);
@@ -454,6 +493,15 @@ tengine_stonith_callback(stonith_t * stonith, const xmlNode * msg, int call_id,
454493
crm_notice("Stonith operation %d for %s failed (%s): aborting transition.", call_id, target, pcmk_strerror(rc));
455494
abort_transition(INFINITY, tg_restart, "Stonith failed", NULL);
456495
}
496+
497+
rec = g_hash_table_lookup(stonith_failures, target);
498+
if(rec) {
499+
rec->count++;
500+
} else {
501+
rec = malloc(sizeof(struct st_fail_rec));
502+
rec->count = 1;
503+
g_hash_table_insert(stonith_failures, strdup(target), rec);
504+
}
457505
}
458506

459507
update_graph(transition_graph, action);

crmd/te_utils.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -386,7 +386,7 @@ abort_transition_graph(int abort_priority, enum transition_action abort_action,
386386
if (transition_timer->period_ms > 0) {
387387
crm_timer_stop(transition_timer);
388388
crm_timer_start(transition_timer);
389-
} else {
389+
} else if(too_many_st_failures() == FALSE) {
390390
register_fsa_input(C_FSA_INTERNAL, I_PE_CALC, NULL);
391391
}
392392
return;

0 commit comments

Comments
 (0)