@@ -393,6 +393,34 @@ process_te_message(xmlNode * msg, xmlNode * xml_data)
393
393
return TRUE;
394
394
}
395
395
396
+ GHashTable * stonith_failures = NULL ; /* Fencing failure records: key is a copy of the fencing target string, value is a heap-allocated struct st_fail_rec; stays NULL until tengine_stonith_callback() creates the table */
397
+ struct st_fail_rec /* Per-target record stored as the value in stonith_failures */
398
+ {
399
+ int count ; /* Set to 1 on the first failed fencing of the target, incremented on each later failure, reset to 0 when fencing of the target succeeds */
400
+ };
401
+
402
+ gboolean too_many_st_failures (void )
403
+ {
404
+ GHashTableIter iter ;
405
+ const char * key = NULL ;
406
+ struct st_fail_rec * value = NULL ;
407
+
408
+ if (stonith_failures == NULL ) {
409
+ return FALSE;
410
+ }
411
+
412
+ g_hash_table_iter_init (& iter , stonith_failures );
413
+ while (g_hash_table_iter_next (& iter , (gpointer * ) & key , (gpointer * ) & value )) {
414
+ if (value -> count > 10 ) {
415
+ crm_notice ("Too many failures to fence %s (%d), giving up" ,
416
+ key , value -> count );
417
+ return TRUE;
418
+ }
419
+ }
420
+ return FALSE;
421
+ }
422
+
423
+
396
424
void
397
425
tengine_stonith_callback (stonith_t * stonith , const xmlNode * msg , int call_id , int rc ,
398
426
xmlNode * output , void * userdata )
@@ -402,6 +430,7 @@ tengine_stonith_callback(stonith_t * stonith, const xmlNode * msg, int call_id,
402
430
int stonith_id = -1 ;
403
431
int transition_id = -1 ;
404
432
crm_action_t * action = NULL ;
433
+ struct st_fail_rec * rec = NULL ;
405
434
406
435
CRM_CHECK (userdata != NULL , return );
407
436
crm_log_xml_trace (output , "StonithOp" );
@@ -411,6 +440,7 @@ tengine_stonith_callback(stonith_t * stonith, const xmlNode * msg, int call_id,
411
440
if (AM_I_DC == FALSE) {
412
441
return ;
413
442
}
443
+
414
444
/* crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", */
415
445
/* op->call_id, op->optype, op->node_name, op->op_result, */
416
446
/* (char *)op->node_list, op->private_data); */
@@ -435,6 +465,11 @@ tengine_stonith_callback(stonith_t * stonith, const xmlNode * msg, int call_id,
435
465
}
436
466
437
467
stop_te_timer (action -> timer );
468
+ if (stonith_failures == NULL ) {
469
+ stonith_failures = g_hash_table_new_full (
470
+ crm_str_hash , g_str_equal , g_hash_destroy_str , free );
471
+ }
472
+
438
473
if (rc == pcmk_ok ) {
439
474
const char * target = crm_element_value (action -> xml , XML_LRM_ATTR_TARGET );
440
475
const char * uuid = crm_element_value (action -> xml , XML_LRM_ATTR_TARGET_UUID );
@@ -444,7 +479,11 @@ tengine_stonith_callback(stonith_t * stonith, const xmlNode * msg, int call_id,
444
479
action -> confirmed = TRUE;
445
480
send_stonith_update (action , target , uuid );
446
481
}
447
-
482
+ rec = g_hash_table_lookup (stonith_failures , target );
483
+ if (rec ) {
484
+ rec -> count = 0 ;
485
+ }
486
+
448
487
} else {
449
488
const char * target = crm_element_value_const (action -> xml , XML_LRM_ATTR_TARGET );
450
489
const char * allow_fail = crm_meta_value (action -> params , XML_ATTR_TE_ALLOWFAIL );
@@ -454,6 +493,15 @@ tengine_stonith_callback(stonith_t * stonith, const xmlNode * msg, int call_id,
454
493
crm_notice ("Stonith operation %d for %s failed (%s): aborting transition." , call_id , target , pcmk_strerror (rc ));
455
494
abort_transition (INFINITY , tg_restart , "Stonith failed" , NULL );
456
495
}
496
+
497
+ rec = g_hash_table_lookup (stonith_failures , target );
498
+ if (rec ) {
499
+ rec -> count ++ ;
500
+ } else {
501
+ rec = malloc (sizeof (struct st_fail_rec ));
502
+ rec -> count = 1 ;
503
+ g_hash_table_insert (stonith_failures , strdup (target ), rec );
504
+ }
457
505
}
458
506
459
507
update_graph (transition_graph , action );
0 commit comments