1212from crmsh import sh
1313from crmsh import xmlutil
1414from crmsh import constants
15+ from crmsh import cibquery
1516from crmsh .service_manager import ServiceManager
1617
1718
@@ -88,30 +89,37 @@ class SBD(command.UI):
8889 - sbd purge
8990 '''
9091 name = "sbd"
91- TIMEOUT_TYPES = ("watchdog" , "allocate" , "loop" , "msgwait" )
92- DISKLESS_TIMEOUT_TYPES = ("watchdog" ,)
92+ TIMEOUT_TYPES = ("watchdog" , "allocate" , "loop" , "msgwait" , "crashdump-watchdog" )
93+ DISKLESS_TIMEOUT_TYPES = ("watchdog" , "crashdump-watchdog" )
9394 SHOW_TYPES = ("disk_metadata" , "sysconfig" , "property" )
9495 DISKLESS_SHOW_TYPES = ("sysconfig" , "property" )
9596 PCMK_ATTRS = (
9697 "have-watchdog" ,
9798 "stonith-timeout" ,
98- "stonith-enabled" ,
99- "priority-fencing-delay" ,
100- "pcmk_delay_max"
99+ "stonith-enabled"
101100 )
102101 PCMK_ATTRS_DISKLESS = ('stonith-watchdog-timeout' ,)
103102 PARSE_RE = re .compile (
104- # Match keys with non-empty values, capturing possible suffix
105- r'(\w+)(?:-(\w+))?=("[^"]+"|[\w/\d;]+)'
103+ # To extract key, suffix and value from these possible arguments:
104+ # watchdog-timeout=30
105+ # crashdump-watchdog-timeout=120
106+ # watchdog-device=/dev/watchdog
107+ r'([\w-]+)-([\w]+)=([\w/]+)'
106108 )
109+ # re pattern to match "-C <number>" or "-C <number> -Z"
110+ SBD_OPTS_RE = r'-C\s+\d+(\s+-Z)?'
107111
class SyntaxError(Exception):
    '''
    Raised when the arguments of an sbd sub-command are not acceptable,
    e.g. "configure" called without any argument
    '''
110114
class MissingRequiredException(Exception):
    '''
    Raised when something the requested operation depends on is missing,
    e.g. no fence_sbd resource exists while the crashdump option has to be set
    '''
117+
111118 def __init__ (self ):
112119 self .device_list_from_config : list [str ] = None
113120 self .device_meta_dict_runtime : dict [str , int ] = None
114121 self .watchdog_timeout_from_config : int = None
122+ self .crashdump_watchdog_timeout_from_config : int = None
115123 self .watchdog_device_from_config : str = None
116124 self .service_manager : ServiceManager = None
117125 self .cluster_shell : sh .cluster_shell = None
@@ -130,6 +138,7 @@ def _load_attributes(self):
130138 except Exception :
131139 self .watchdog_timeout_from_config = None
132140 self .watchdog_device_from_config = watchdog .Watchdog .get_watchdog_device_from_sbd_config ()
141+ self .crashdump_watchdog_timeout_from_config = sbd .SBDUtils .get_crashdump_watchdog_timeout ()
133142
134143 self .service_manager = ServiceManager ()
135144 self .cluster_shell = sh .cluster_shell ()
@@ -217,6 +226,13 @@ def _show_property(self) -> None:
217226 for match in matches :
218227 print (f"{ match [0 ]} ={ match [1 ]} " )
219228
229+ cmd = "crm configure show related:fence_sbd"
230+ out = self .cluster_shell .get_stdout_or_raise_error (cmd )
231+ if out :
232+ print ()
233+ logger .info ('%s' , cmd )
234+ print (out )
235+
220236 print ()
221237 logger .info ('%s' , sbd .SBDTimeout .SHOW_SBD_START_TIMEOUT_CMD )
222238 systemd_start_timeout = sbd .SBDTimeout .get_sbd_systemd_start_timeout ()
@@ -287,6 +303,93 @@ def _adjust_timeout_dict(timeout_dict: dict) -> dict:
287303 timeout_dict ["watchdog" ] = watchdog_timeout
288304 logger .info ("No watchdog timeout specified, use msgwait timeout/2: %s" , watchdog_timeout )
289305 return timeout_dict
306+ return timeout_dict
307+
def _set_crashdump_option(self, delete=False):
    '''
    Set or delete the "crashdump" parameter of the fence_sbd resource

    Only the first fence_sbd primitive found in the CIB is touched.

    delete=False: set crashdump to 1 when utils.is_boolean_false() reports
                  the current value as false; raise MissingRequiredException
                  when there is no fence_sbd resource at all
    delete=True:  delete the parameter when it is currently not false;
                  silently do nothing when there is no fence_sbd resource
    '''
    cib_text = self.cluster_shell.get_stdout_or_raise_error('crm configure show xml')
    cib = xmlutil.text2elem(cib_text)
    fence_sbd_agent = cibquery.ResourceAgent("stonith", "", "fence_sbd")
    fence_sbd_ids = cibquery.get_primitives_with_ra(cib, fence_sbd_agent)

    if not fence_sbd_ids:
        if not delete:
            logger.error("No fence_sbd resource found")
            raise self.MissingRequiredException
        # nothing to clean up
        return

    res_id = fence_sbd_ids[0]
    current_value = cibquery.get_parameter_value(cib, res_id, "crashdump")
    crashdump_is_off = utils.is_boolean_false(current_value)

    if crashdump_is_off and delete:
        # already off, no parameter to remove
        return
    if not crashdump_is_off and not delete:
        # already on, nothing to set
        return

    if crashdump_is_off:
        cmd = f"crm resource param {res_id} set crashdump 1"
        logger.info("Set crashdump option for fence_sbd resource")
    else:
        cmd = f"crm resource param {res_id} delete crashdump"
        logger.info("Delete crashdump option for fence_sbd resource")
    self.cluster_shell.get_stdout_or_raise_error(cmd)
def _set_crashdump_in_sysconfig(self, crashdump_watchdog_timeout=None, restore=False, diskless=False) -> dict:
    '''
    Work out the sysconfig changes for enabling or removing crashdump

    restore=True:
      - if SBD_TIMEOUT_ACTION currently is "flush,crashdump", delete that
        line and re-activate the line that was commented out with the
        "#__sbd_crashdump_backup__" marker (done right away through sed)
      - strip "-C <number>" / "-C <number> -Z" from SBD_OPTS
    crashdump_watchdog_timeout given (and restore is False):
      - SBD_TIMEOUT_ACTION not configured: request "flush,crashdump"
      - configured with another value: comment the existing line out with
        the backup marker and append the crashdump one (done right away
        through sed)
      - replace any crashdump option in SBD_OPTS with
        "-C <crashdump_watchdog_timeout>", plus " -Z" when diskless is True

    Return the dict of sysconfig entries which still have to be written by
    the caller; it is empty when nothing has to change
    '''
    sysconfig_path = sbd.SBDManager.SYSCONFIG_SBD
    crashdump_action = "flush,crashdump"
    update_dict = {}

    configured_action = sbd.SBDUtils.get_sbd_value_from_config("SBD_TIMEOUT_ACTION")

    if restore:
        if configured_action == crashdump_action:
            delete_action_line = f"sed -i '/^SBD_TIMEOUT_ACTION/d' {sysconfig_path}"
            uncomment_action_line = (
                "sed -i 's/^#__sbd_crashdump_backup__ SBD_TIMEOUT_ACTION/SBD_TIMEOUT_ACTION/' "
                f"{sysconfig_path}"
            )
            logger.info("Delete SBD_TIMEOUT_ACTION: %s and restore original value", crashdump_action)
            self.cluster_shell.get_stdout_or_raise_error(f"{delete_action_line} && {uncomment_action_line}")

        # An unset SBD_OPTS is treated as empty: nothing to strip
        sbd_opts = sbd.SBDUtils.get_sbd_value_from_config("SBD_OPTS") or ""
        if re.search(self.SBD_OPTS_RE, sbd_opts):
            update_dict["SBD_OPTS"] = ' '.join(re.sub(self.SBD_OPTS_RE, '', sbd_opts).split())

    elif crashdump_watchdog_timeout:
        if not configured_action:
            update_dict["SBD_TIMEOUT_ACTION"] = crashdump_action
        elif configured_action != crashdump_action:
            # Keep the previous value as a commented backup so that restore can bring it back
            comment_action_line = (
                "sed -i '/^SBD_TIMEOUT_ACTION/s/^/#__sbd_crashdump_backup__ /' "
                f"{sysconfig_path}"
            )
            add_action_line = (
                f"sed -i '/^#__sbd_crashdump_backup__/a SBD_TIMEOUT_ACTION={crashdump_action}' "
                f"{sysconfig_path}"
            )
            self.cluster_shell.get_stdout_or_raise_error(f"{comment_action_line} && {add_action_line}")
            logger.info("Update SBD_TIMEOUT_ACTION in %s: %s", sysconfig_path, crashdump_action)

        crashdump_opts = f"-C {crashdump_watchdog_timeout}"
        if diskless:
            crashdump_opts += " -Z"
        # re.sub() rejects None, so an unset SBD_OPTS must be turned into ""
        sbd_opts = sbd.SBDUtils.get_sbd_value_from_config("SBD_OPTS") or ""
        # Normalise whitespace before testing for emptiness, otherwise a
        # whitespace-only remainder would produce a value with a leading space
        remaining_opts = ' '.join(re.sub(self.SBD_OPTS_RE, '', sbd_opts).split())
        update_dict["SBD_OPTS"] = f"{remaining_opts} {crashdump_opts}" if remaining_opts else crashdump_opts

    return update_dict
369+
def _check_kdump_service(self):
    '''
    Warn about every cluster node on which kdump.service is not active

    Nothing is raised or returned; one warning is logged per affected
    node, followed by a single summary warning if any node was affected
    '''
    inactive_count = 0
    for node in self.cluster_nodes:
        if self.service_manager.service_is_active("kdump.service", node):
            continue
        logger.warning("Kdump service is not active on %s", node)
        inactive_count += 1
    if inactive_count > 0:
        logger.warning("Kdump service is required for crashdump")
378+
def _should_configure_crashdump(
    self,
    crashdump_watchdog_timeout,
    watchdog_timeout,
    diskless=False
) -> bool:
    '''
    Tell whether the crashdump related settings have to be (re)written

    crashdump_watchdog_timeout: requested crashdump watchdog timeout, may be None
    watchdog_timeout: requested watchdog timeout, may be None
    diskless: if True, compare watchdog_timeout against
              self.watchdog_timeout_from_config, otherwise against the
              "watchdog" entry of self.device_meta_dict_runtime

    Return False when crashdump is neither requested nor already
    configured; otherwise return True if either requested timeout is set
    and differs from the configured one. The result is always a real bool.
    '''
    configured_crashdump_timeout = self.crashdump_watchdog_timeout_from_config
    if not crashdump_watchdog_timeout and not configured_crashdump_timeout:
        return False

    if diskless:
        configured_watchdog_timeout = self.watchdog_timeout_from_config
    else:
        # The runtime metadata dict is None until it has been loaded
        configured_watchdog_timeout = (self.device_meta_dict_runtime or {}).get("watchdog")

    ct_updated = bool(crashdump_watchdog_timeout) and \
        crashdump_watchdog_timeout != configured_crashdump_timeout
    wt_updated = bool(watchdog_timeout) and \
        watchdog_timeout != configured_watchdog_timeout
    return ct_updated or wt_updated
290393
291394 def _configure_diskbase (self , parameter_dict : dict ):
292395 '''
@@ -296,18 +399,28 @@ def _configure_diskbase(self, parameter_dict: dict):
296399 watchdog_device = parameter_dict .get ("watchdog-device" )
297400 if watchdog_device != self .watchdog_device_from_config :
298401 update_dict ["SBD_WATCHDOG_DEV" ] = watchdog_device
299- timeout_dict = {k : v for k , v in parameter_dict .items () if k in self .TIMEOUT_TYPES }
300- is_subdict_timeout = utils .is_subdict (timeout_dict , self .device_meta_dict_runtime )
301402
302- if is_subdict_timeout and not update_dict :
403+ timeout_dict = {
404+ k : v for k , v in parameter_dict .items ()
405+ if k in self .TIMEOUT_TYPES and k != "crashdump-watchdog"
406+ }
407+ timeout_dict = self ._adjust_timeout_dict (timeout_dict )
408+ # merge runtime timeout dict into parameter timeout dict without overwriting
409+ timeout_dict = {** self .device_meta_dict_runtime , ** timeout_dict }
410+
411+ crashdump_watchdog_timeout = parameter_dict .get ("crashdump-watchdog" , self .crashdump_watchdog_timeout_from_config )
412+ if self ._should_configure_crashdump (crashdump_watchdog_timeout , timeout_dict .get ("watchdog" )):
413+ self ._check_kdump_service ()
414+ self ._set_crashdump_option ()
415+ timeout_dict ["msgwait" ] = 2 * timeout_dict ["watchdog" ] + crashdump_watchdog_timeout
416+ logger .info ("Set msgwait-timeout to 2*watchdog-timeout + crashdump-watchdog-timeout: %s" , timeout_dict ["msgwait" ])
417+ result_dict = self ._set_crashdump_in_sysconfig (crashdump_watchdog_timeout )
418+ update_dict = {** update_dict , ** result_dict }
419+
420+ if timeout_dict == self .device_meta_dict_runtime and not update_dict :
303421 logger .info ("No change in SBD configuration" )
304422 return
305423
306- if not is_subdict_timeout :
307- timeout_dict = self ._adjust_timeout_dict (timeout_dict )
308- # merge runtime timeout dict into parameter timeout dict without overwriting
309- timeout_dict = {** self .device_meta_dict_runtime , ** timeout_dict }
310-
311424 sbd_manager = sbd .SBDManager (
312425 device_list_to_init = self .device_list_from_config ,
313426 timeout_dict = timeout_dict ,
@@ -320,17 +433,30 @@ def _configure_diskless(self, parameter_dict: dict):
320433 Configure diskless SBD based on input parameters and runtime config
321434 '''
322435 update_dict = {}
436+ timeout_dict = {}
437+
323438 watchdog_timeout = parameter_dict .get ("watchdog" )
324439 if watchdog_timeout and watchdog_timeout != self .watchdog_timeout_from_config :
325440 update_dict ["SBD_WATCHDOG_TIMEOUT" ] = str (watchdog_timeout )
326441 watchdog_device = parameter_dict .get ("watchdog-device" )
327442 if watchdog_device != self .watchdog_device_from_config :
328443 update_dict ["SBD_WATCHDOG_DEV" ] = watchdog_device
444+
445+ crashdump_watchdog_timeout = parameter_dict .get ("crashdump-watchdog" , self .crashdump_watchdog_timeout_from_config )
446+ if self ._should_configure_crashdump (crashdump_watchdog_timeout , watchdog_timeout , diskless = True ):
447+ self ._check_kdump_service ()
448+ result_dict = self ._set_crashdump_in_sysconfig (crashdump_watchdog_timeout , diskless = True )
449+ update_dict = {** update_dict , ** result_dict }
450+ sbd_watchdog_timeout = watchdog_timeout or self .watchdog_timeout_from_config
451+ stonith_watchdog_timeout = sbd_watchdog_timeout + crashdump_watchdog_timeout
452+ logger .info ("Set stonith-watchdog-timeout to SBD_WATCHDOG_TIMEOUT + crashdump-watchdog-timeout: %s" , stonith_watchdog_timeout )
453+ timeout_dict ["stonith-watchdog" ] = stonith_watchdog_timeout
329454 if not update_dict :
330455 logger .info ("No change in SBD configuration" )
331456 return
332457
333458 sbd_manager = sbd .SBDManager (
459+ timeout_dict = timeout_dict ,
334460 update_dict = update_dict ,
335461 diskless_sbd = True
336462 )
@@ -379,6 +505,7 @@ def do_device(self, context, *args) -> bool:
379505 '''
380506 Implement sbd device command
381507 '''
508+ self ._load_attributes ()
382509 if not self .service_is_active (constants .PCMK_SERVICE ):
383510 return False
384511 if not sbd .SBDUtils .is_using_disk_based_sbd ():
@@ -417,15 +544,16 @@ def do_configure(self, context, *args) -> bool:
417544 Implement sbd configure command
418545 '''
419546 try :
420- for service in (constants .PCMK_SERVICE , constants .SBD_SERVICE ):
421- if not self .service_is_active (service ):
422- return False
547+ self ._load_attributes ()
423548 if not args :
424549 raise self .SyntaxError ("No argument" )
425-
426550 if args [0 ] == "show" :
427551 self ._configure_show (args )
428552 return True
553+ for service in (constants .PCMK_SERVICE , constants .SBD_SERVICE ):
554+ if not self .service_is_active (service ):
555+ return False
556+
429557 parameter_dict = self ._parse_args (args )
430558 if sbd .SBDUtils .is_using_disk_based_sbd ():
431559 self ._configure_diskbase (parameter_dict )
@@ -439,13 +567,26 @@ def do_configure(self, context, *args) -> bool:
439567 if usage :
440568 print (usage )
441569 return False
570+ except self .MissingRequiredException :
571+ return False
442572
443- def do_purge (self , context ) -> bool :
@command.completers(completers.choice(['crashdump']))
def do_purge(self, context, *args) -> bool:
    '''
    Implement sbd purge command

    Without argument, purge sbd from the cluster. With "crashdump" as the
    first argument, only remove the crashdump settings: the fence_sbd
    crashdump parameter and the crashdump entries in the sbd sysconfig.
    Either way a cluster restart is attempted afterwards.

    Return False if the sbd service is not active, True otherwise
    '''
    self._load_attributes()
    if not self.service_is_active(constants.SBD_SERVICE):
        return False

    crashdump_only = bool(args) and args[0] == "crashdump"
    if crashdump_only:
        self._set_crashdump_option(delete=True)
        sysconfig_changes = self._set_crashdump_in_sysconfig(restore=True)
        if sysconfig_changes:
            sbd.SBDManager.update_sbd_configuration(sysconfig_changes)
    else:
        sbd.purge_sbd_from_cluster()

    sbd.SBDManager.restart_cluster_if_possible()
    return True
@@ -544,6 +685,7 @@ def do_status(self, context) -> bool:
544685 '''
545686 Implement sbd status command
546687 '''
688+ self ._load_attributes ()
547689 self ._print_sbd_type ()
548690 self ._print_sbd_status ()
549691 self ._print_sbd_cgroup_status ()
0 commit comments