Skip to content

Commit 367bbfa

Browse files
authored
Dev: ui_sbd: Configure crashdump watchdog timeout (#1732)
This PR supports configuring the crashdump watchdog timeout by using `crm sbd configure crashdump-watchdog-timeout=<timeout>`, for both disk-based and diskless SBD. - disk-based SBD case ``` # crm sbd configure crashdump-watchdog-timeout=60 WARNING: Kdump service is not active on alp-1 WARNING: Kdump service is not active on alp-2 INFO: Set crashdump option for fence_sbd resource INFO: Set msgwait-timeout to 2*watchdog-timeout + crashdump-watchdog-timeout: 90 INFO: Configuring disk-based SBD INFO: Initializing SBD device /dev/sda8 INFO: Update SBD_TIMEOUT_ACTION in /etc/sysconfig/sbd: flush,crashdump INFO: Update SBD_OPTS in /etc/sysconfig/sbd: -C 60 INFO: Already synced /etc/sysconfig/sbd to all nodes WARNING: Resource is running, need to restart cluster service manually on each node INFO: Update SBD_DELAY_START in /etc/sysconfig/sbd: 131 INFO: Already synced /etc/sysconfig/sbd to all nodes WARNING: "stonith-timeout" in crm_config is set to 155, it was 83 ``` - diskless SBD case ``` # crm sbd configure crashdump-watchdog-timeout=60 WARNING: Kdump service is not active on alp-1 WARNING: Kdump service is not active on alp-2 INFO: Set stonith-watchdog-timeout to SBD_WATCHDOG_TIMEOUT + crashdump-watchdog-timeout: 75 INFO: Configuring diskless SBD WARNING: Diskless SBD requires cluster with three or more nodes. If you want to use diskless SBD for 2-node cluster, should be combined with QDevice. INFO: Update SBD_TIMEOUT_ACTION in /etc/sysconfig/sbd: flush,crashdump INFO: Update SBD_OPTS in /etc/sysconfig/sbd: -C 60 -Z INFO: Already synced /etc/sysconfig/sbd to all nodes INFO: Restarting cluster service INFO: BEGIN Waiting for cluster ........... 
INFO: END Waiting for cluster WARNING: "stonith-watchdog-timeout" in crm_config is set to 75, it was -1 WARNING: "stonith-timeout" in crm_config is set to 101, it was 71 ``` - To clean up crashdump-related options and configuration ``` # crm sbd purge crashdump INFO: Delete crashdump option for fence_sbd resource INFO: Delete SBD_TIMEOUT_ACTION: flush,crashdump and restore original value INFO: Update SBD_OPTS in /etc/sysconfig/sbd: INFO: Already synced /etc/sysconfig/sbd to all nodes WARNING: Resource is running, need to restart cluster service manually on each node ```
2 parents 1442a04 + 964c262 commit 367bbfa

File tree

10 files changed

+372
-74
lines changed

10 files changed

+372
-74
lines changed

crmsh/bootstrap.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2725,12 +2725,12 @@ def adjust_pcmk_delay_max(is_2node_wo_qdevice):
27252725
for res in cib_factory.fence_id_list_without_pcmk_delay():
27262726
cmd = "crm resource param {} set pcmk_delay_max {}s".format(res, PCMK_DELAY_MAX)
27272727
shell.get_stdout_or_raise_error(cmd)
2728-
logger.debug("Add parameter 'pcmk_delay_max={}s' for resource '{}'".format(PCMK_DELAY_MAX, res))
2728+
logger.info("Add parameter 'pcmk_delay_max={}s' for resource '{}'".format(PCMK_DELAY_MAX, res))
27292729
else:
27302730
for res in cib_factory.fence_id_list_with_pcmk_delay():
27312731
cmd = "crm resource param {} delete pcmk_delay_max".format(res)
27322732
shell.get_stdout_or_raise_error(cmd)
2733-
logger.debug("Delete parameter 'pcmk_delay_max' for resource '{}'".format(res))
2733+
logger.info("Delete parameter 'pcmk_delay_max' for resource '{}'".format(res))
27342734

27352735

27362736
def adjust_stonith_timeout():

crmsh/cibquery.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,26 @@ def has_primitive_filesystem_with_fstype(cib: lxml.etree.Element, fstype: str) -
3333
f'/instance_attributes/nvpair[@name="fstype" and @value="{fstype}"]'
3434
))
3535

36+
37+
def get_primitives_with_ra(cib: lxml.etree.Element, ra: ResourceAgent) -> list[str]:
    """
    Return the ids of all primitives in the cib whose resource agent matches ra

    The class and type of ra must match; the provider is only compared
    when ra.m_provider is non-empty.
    """
    predicates = [f'@class="{ra.m_class}"']
    if ra.m_provider:
        predicates.append(f'@provider="{ra.m_provider}"')
    predicates.append(f'@type="{ra.m_type}"')
    query = f'/cib/configuration/resources//primitive[{" and ".join(predicates)}]/@id'
    return cib.xpath(query)
46+
47+
48+
def get_parameter_value(cib: lxml.etree.Element, res_id: str, param_name: str) -> typing.Optional[str]:
    """
    Return the value of instance attribute param_name of primitive res_id

    When several nvpairs match, the first one is returned; None is returned
    when nothing matches.
    """
    query = (
        f'/cib/configuration/resources//primitive[@id="{res_id}"]'
        f'/instance_attributes/nvpair[@name="{param_name}"]/@value'
    )
    values = cib.xpath(query)
    if not values:
        return None
    return values[0]
54+
55+
3656
def get_cluster_nodes(cib: lxml.etree.Element) -> list[ClusterNode]:
3757
"""Return a list of cluster nodes, excluding pacemaker-remote nodes"""
3858
result = list()

crmsh/sbd.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,14 @@ def get_sbd_value_from_config(key):
8080
'''
8181
return utils.parse_sysconfig(SBDManager.SYSCONFIG_SBD).get(key)
8282

83+
@staticmethod
def get_crashdump_watchdog_timeout() -> typing.Optional[int]:
    '''
    Get the crashdump watchdog timeout from the "-C <number>" part of SBD_OPTS

    Return None when SBD_OPTS is unset/empty or carries no "-C <number>"
    '''
    sbd_opts = SBDUtils.get_sbd_value_from_config("SBD_OPTS")
    if sbd_opts:
        found = re.search(r"-C\s+(\d+)", sbd_opts)
        if found:
            return int(found.group(1))
    return None
90+
8391
@staticmethod
8492
def get_sbd_device_from_config():
8593
'''
@@ -559,7 +567,8 @@ def configure_sbd(self):
559567
Configure fence_sbd resource and related properties
560568
'''
561569
if self.diskless_sbd:
562-
utils.set_property("stonith-watchdog-timeout", SBDTimeout.STONITH_WATCHDOG_TIMEOUT_DEFAULT)
570+
swt_value = self.timeout_dict.get("stonith-watchdog", SBDTimeout.STONITH_WATCHDOG_TIMEOUT_DEFAULT)
571+
utils.set_property("stonith-watchdog-timeout", swt_value)
563572
else:
564573
if utils.get_property("stonith-watchdog-timeout", get_default=False):
565574
utils.delete_property("stonith-watchdog-timeout")

crmsh/ui_sbd.py

Lines changed: 162 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from crmsh import sh
1313
from crmsh import xmlutil
1414
from crmsh import constants
15+
from crmsh import cibquery
1516
from crmsh.service_manager import ServiceManager
1617

1718

@@ -88,30 +89,37 @@ class SBD(command.UI):
8889
- sbd purge
8990
'''
9091
name = "sbd"
91-
TIMEOUT_TYPES = ("watchdog", "allocate", "loop", "msgwait")
92-
DISKLESS_TIMEOUT_TYPES = ("watchdog",)
92+
TIMEOUT_TYPES = ("watchdog", "allocate", "loop", "msgwait", "crashdump-watchdog")
93+
DISKLESS_TIMEOUT_TYPES = ("watchdog", "crashdump-watchdog")
9394
SHOW_TYPES = ("disk_metadata", "sysconfig", "property")
9495
DISKLESS_SHOW_TYPES = ("sysconfig", "property")
9596
PCMK_ATTRS = (
9697
"have-watchdog",
9798
"stonith-timeout",
98-
"stonith-enabled",
99-
"priority-fencing-delay",
100-
"pcmk_delay_max"
99+
"stonith-enabled"
101100
)
102101
PCMK_ATTRS_DISKLESS = ('stonith-watchdog-timeout',)
103102
PARSE_RE = re.compile(
104-
# Match keys with non-empty values, capturing possible suffix
105-
r'(\w+)(?:-(\w+))?=("[^"]+"|[\w/\d;]+)'
103+
# To extract key, suffix and value from these possible arguments:
104+
# watchdog-timeout=30
105+
# crashdump-watchdog-timeout=120
106+
# watchdog-device=/dev/watchdog
107+
r'([\w-]+)-([\w]+)=([\w/]+)'
106108
)
109+
# re pattern to match "-C <number>" or "-C <number> -Z"
110+
SBD_OPTS_RE = r'-C\s+\d+(\s+-Z)?'
107111

108112
class SyntaxError(Exception):
109113
pass
110114

115+
class MissingRequiredException(Exception):
116+
pass
117+
111118
def __init__(self):
112119
self.device_list_from_config: list[str] = None
113120
self.device_meta_dict_runtime: dict[str, int] = None
114121
self.watchdog_timeout_from_config: int = None
122+
self.crashdump_watchdog_timeout_from_config: int = None
115123
self.watchdog_device_from_config: str = None
116124
self.service_manager: ServiceManager = None
117125
self.cluster_shell: sh.cluster_shell = None
@@ -130,6 +138,7 @@ def _load_attributes(self):
130138
except Exception:
131139
self.watchdog_timeout_from_config = None
132140
self.watchdog_device_from_config = watchdog.Watchdog.get_watchdog_device_from_sbd_config()
141+
self.crashdump_watchdog_timeout_from_config = sbd.SBDUtils.get_crashdump_watchdog_timeout()
133142

134143
self.service_manager = ServiceManager()
135144
self.cluster_shell = sh.cluster_shell()
@@ -217,6 +226,13 @@ def _show_property(self) -> None:
217226
for match in matches:
218227
print(f"{match[0]}={match[1]}")
219228

229+
cmd = "crm configure show related:fence_sbd"
230+
out = self.cluster_shell.get_stdout_or_raise_error(cmd)
231+
if out:
232+
print()
233+
logger.info('%s', cmd)
234+
print(out)
235+
220236
print()
221237
logger.info('%s', sbd.SBDTimeout.SHOW_SBD_START_TIMEOUT_CMD)
222238
systemd_start_timeout = sbd.SBDTimeout.get_sbd_systemd_start_timeout()
@@ -287,6 +303,93 @@ def _adjust_timeout_dict(timeout_dict: dict) -> dict:
287303
timeout_dict["watchdog"] = watchdog_timeout
288304
logger.info("No watchdog timeout specified, use msgwait timeout/2: %s", watchdog_timeout)
289305
return timeout_dict
306+
return timeout_dict
307+
308+
def _set_crashdump_option(self, delete=False):
    '''
    Set, or with delete=True remove, the crashdump option of the fence_sbd resource

    Only the first fence_sbd primitive found in the CIB is handled.
    Raise MissingRequiredException when the option should be set but no
    fence_sbd resource exists; when deleting, a missing resource or an
    option that is unset/false is silently ignored.
    '''
    cib_xml = self.cluster_shell.get_stdout_or_raise_error('crm configure show xml')
    cib = xmlutil.text2elem(cib_xml)
    fence_sbd_ra = cibquery.ResourceAgent("stonith", "", "fence_sbd")
    fence_sbd_ids = cibquery.get_primitives_with_ra(cib, fence_sbd_ra)
    if not fence_sbd_ids and delete:
        return
    if not fence_sbd_ids:
        logger.error("No fence_sbd resource found")
        raise self.MissingRequiredException

    res_id = fence_sbd_ids[0]
    current_value = cibquery.get_parameter_value(cib, res_id, "crashdump")
    if utils.is_boolean_false(current_value):
        # Option is not enabled: nothing to delete, otherwise enable it
        if delete:
            return
        logger.info("Set crashdump option for fence_sbd resource")
        self.cluster_shell.get_stdout_or_raise_error(f"crm resource param {res_id} set crashdump 1")
    elif delete:
        logger.info("Delete crashdump option for fence_sbd resource")
        self.cluster_shell.get_stdout_or_raise_error(f"crm resource param {res_id} delete crashdump")
333+
334+
def _set_crashdump_in_sysconfig(self, crashdump_watchdog_timeout=None, restore=False, diskless=False) -> dict:
    '''
    Prepare the sysconfig changes needed to enable or remove crashdump support

    SBD_TIMEOUT_ACTION is edited in place with sed when an existing value
    has to be backed up (as a "#__sbd_crashdump_backup__" comment line) or
    restored from that backup. Everything else is returned as a dict of
    sysconfig keys to new values and is NOT written by this method.

    crashdump_watchdog_timeout: timeout to put into SBD_OPTS as "-C <timeout>"
    restore: remove the crashdump settings; takes precedence over
             crashdump_watchdog_timeout
    diskless: append "-Z" after "-C <timeout>" in SBD_OPTS

    Return the dict of pending sysconfig updates, possibly empty
    '''
    sysconfig = sbd.SBDManager.SYSCONFIG_SBD
    update_dict = {}
    sbd_timeout_action_for_crashdump = "flush,crashdump"
    # sed snippets: back up the current action line as a comment, add the
    # crashdump action after the backup, restore the backup, drop the active line
    comment_action_line = f"sed -i '/^SBD_TIMEOUT_ACTION/s/^/#__sbd_crashdump_backup__ /' {sysconfig}"
    add_action_line = f"sed -i '/^#__sbd_crashdump_backup__/a SBD_TIMEOUT_ACTION={sbd_timeout_action_for_crashdump}' {sysconfig}"
    uncomment_action_line = f"sed -i 's/^#__sbd_crashdump_backup__ SBD_TIMEOUT_ACTION/SBD_TIMEOUT_ACTION/' {sysconfig}"
    delete_action_line = f"sed -i '/^SBD_TIMEOUT_ACTION/d' {sysconfig}"

    sbd_timeout_action_configured = sbd.SBDUtils.get_sbd_value_from_config("SBD_TIMEOUT_ACTION")
    # SBD_OPTS may be missing from sysconfig; treat that like an empty value
    # so the regex calls below never receive None
    sbd_opts = sbd.SBDUtils.get_sbd_value_from_config("SBD_OPTS") or ""

    if restore:
        if sbd_timeout_action_configured == sbd_timeout_action_for_crashdump:
            logger.info("Delete SBD_TIMEOUT_ACTION: %s and restore original value", sbd_timeout_action_for_crashdump)
            self.cluster_shell.get_stdout_or_raise_error(f"{delete_action_line} && {uncomment_action_line}")

        if re.search(self.SBD_OPTS_RE, sbd_opts):
            remaining_opts = re.sub(self.SBD_OPTS_RE, '', sbd_opts)
            update_dict["SBD_OPTS"] = ' '.join(remaining_opts.split())

    elif crashdump_watchdog_timeout:
        if not sbd_timeout_action_configured:
            update_dict["SBD_TIMEOUT_ACTION"] = sbd_timeout_action_for_crashdump
        elif sbd_timeout_action_configured != sbd_timeout_action_for_crashdump:
            self.cluster_shell.get_stdout_or_raise_error(f"{comment_action_line} && {add_action_line}")
            logger.info("Update SBD_TIMEOUT_ACTION in %s: %s", sysconfig, sbd_timeout_action_for_crashdump)

        value_for_diskless = " -Z" if diskless else ""
        value_for_sbd_opts = f"-C {crashdump_watchdog_timeout}{value_for_diskless}"
        # Drop any previous "-C <n> [-Z]" and normalize whitespace before
        # deciding whether other options remain, to avoid a leading space
        remaining_opts = ' '.join(re.sub(self.SBD_OPTS_RE, '', sbd_opts).split())
        update_dict["SBD_OPTS"] = f"{remaining_opts} {value_for_sbd_opts}" if remaining_opts else value_for_sbd_opts

    return update_dict
369+
370+
def _check_kdump_service(self):
    '''
    Warn about every cluster node on which kdump.service is not active

    Only warnings are logged; nothing is returned or raised here.
    '''
    inactive_count = 0
    for node in self.cluster_nodes:
        if self.service_manager.service_is_active("kdump.service", node):
            continue
        logger.warning("Kdump service is not active on %s", node)
        inactive_count += 1
    if inactive_count:
        logger.warning("Kdump service is required for crashdump")
378+
379+
def _should_configure_crashdump(
    self,
    crashdump_watchdog_timeout,
    watchdog_timeout,
    diskless=False
) -> bool:
    '''
    Tell whether the crashdump related configuration has to be (re)applied

    Return False when no crashdump watchdog timeout is given and none is
    configured. Otherwise return True when either the given crashdump
    watchdog timeout or the given watchdog timeout differs from the
    configured one. The configured watchdog timeout is taken from
    watchdog_timeout_from_config for diskless SBD, and from the
    "watchdog" entry of device_meta_dict_runtime otherwise.
    '''
    if not crashdump_watchdog_timeout and not self.crashdump_watchdog_timeout_from_config:
        return False
    # bool() keeps the result a real boolean, as annotated, instead of
    # leaking None or an int out of the and/or chain
    ct_updated = bool(crashdump_watchdog_timeout) and \
        crashdump_watchdog_timeout != self.crashdump_watchdog_timeout_from_config
    if diskless:
        watchdog_timeout_configured = self.watchdog_timeout_from_config
    else:
        watchdog_timeout_configured = self.device_meta_dict_runtime.get("watchdog")
    wt_updated = bool(watchdog_timeout) and watchdog_timeout != watchdog_timeout_configured
    return ct_updated or wt_updated
290393

291394
def _configure_diskbase(self, parameter_dict: dict):
292395
'''
@@ -296,18 +399,28 @@ def _configure_diskbase(self, parameter_dict: dict):
296399
watchdog_device = parameter_dict.get("watchdog-device")
297400
if watchdog_device != self.watchdog_device_from_config:
298401
update_dict["SBD_WATCHDOG_DEV"] = watchdog_device
299-
timeout_dict = {k: v for k, v in parameter_dict.items() if k in self.TIMEOUT_TYPES}
300-
is_subdict_timeout = utils.is_subdict(timeout_dict, self.device_meta_dict_runtime)
301402

302-
if is_subdict_timeout and not update_dict:
403+
timeout_dict = {
404+
k: v for k, v in parameter_dict.items()
405+
if k in self.TIMEOUT_TYPES and k != "crashdump-watchdog"
406+
}
407+
timeout_dict = self._adjust_timeout_dict(timeout_dict)
408+
# merge runtime timeout dict into parameter timeout dict without overwriting
409+
timeout_dict = {**self.device_meta_dict_runtime, **timeout_dict}
410+
411+
crashdump_watchdog_timeout = parameter_dict.get("crashdump-watchdog", self.crashdump_watchdog_timeout_from_config)
412+
if self._should_configure_crashdump(crashdump_watchdog_timeout, timeout_dict.get("watchdog")):
413+
self._check_kdump_service()
414+
self._set_crashdump_option()
415+
timeout_dict["msgwait"] = 2*timeout_dict["watchdog"] + crashdump_watchdog_timeout
416+
logger.info("Set msgwait-timeout to 2*watchdog-timeout + crashdump-watchdog-timeout: %s", timeout_dict["msgwait"])
417+
result_dict = self._set_crashdump_in_sysconfig(crashdump_watchdog_timeout)
418+
update_dict = {**update_dict, **result_dict}
419+
420+
if timeout_dict == self.device_meta_dict_runtime and not update_dict:
303421
logger.info("No change in SBD configuration")
304422
return
305423

306-
if not is_subdict_timeout:
307-
timeout_dict = self._adjust_timeout_dict(timeout_dict)
308-
# merge runtime timeout dict into parameter timeout dict without overwriting
309-
timeout_dict = {**self.device_meta_dict_runtime, **timeout_dict}
310-
311424
sbd_manager = sbd.SBDManager(
312425
device_list_to_init=self.device_list_from_config,
313426
timeout_dict=timeout_dict,
@@ -320,17 +433,30 @@ def _configure_diskless(self, parameter_dict: dict):
320433
Configure diskless SBD based on input parameters and runtime config
321434
'''
322435
update_dict = {}
436+
timeout_dict = {}
437+
323438
watchdog_timeout = parameter_dict.get("watchdog")
324439
if watchdog_timeout and watchdog_timeout != self.watchdog_timeout_from_config:
325440
update_dict["SBD_WATCHDOG_TIMEOUT"] = str(watchdog_timeout)
326441
watchdog_device = parameter_dict.get("watchdog-device")
327442
if watchdog_device != self.watchdog_device_from_config:
328443
update_dict["SBD_WATCHDOG_DEV"] = watchdog_device
444+
445+
crashdump_watchdog_timeout = parameter_dict.get("crashdump-watchdog", self.crashdump_watchdog_timeout_from_config)
446+
if self._should_configure_crashdump(crashdump_watchdog_timeout, watchdog_timeout, diskless=True):
447+
self._check_kdump_service()
448+
result_dict = self._set_crashdump_in_sysconfig(crashdump_watchdog_timeout, diskless=True)
449+
update_dict = {**update_dict, **result_dict}
450+
sbd_watchdog_timeout = watchdog_timeout or self.watchdog_timeout_from_config
451+
stonith_watchdog_timeout = sbd_watchdog_timeout + crashdump_watchdog_timeout
452+
logger.info("Set stonith-watchdog-timeout to SBD_WATCHDOG_TIMEOUT + crashdump-watchdog-timeout: %s", stonith_watchdog_timeout)
453+
timeout_dict["stonith-watchdog"] = stonith_watchdog_timeout
329454
if not update_dict:
330455
logger.info("No change in SBD configuration")
331456
return
332457

333458
sbd_manager = sbd.SBDManager(
459+
timeout_dict=timeout_dict,
334460
update_dict=update_dict,
335461
diskless_sbd=True
336462
)
@@ -379,6 +505,7 @@ def do_device(self, context, *args) -> bool:
379505
'''
380506
Implement sbd device command
381507
'''
508+
self._load_attributes()
382509
if not self.service_is_active(constants.PCMK_SERVICE):
383510
return False
384511
if not sbd.SBDUtils.is_using_disk_based_sbd():
@@ -417,15 +544,16 @@ def do_configure(self, context, *args) -> bool:
417544
Implement sbd configure command
418545
'''
419546
try:
420-
for service in (constants.PCMK_SERVICE, constants.SBD_SERVICE):
421-
if not self.service_is_active(service):
422-
return False
547+
self._load_attributes()
423548
if not args:
424549
raise self.SyntaxError("No argument")
425-
426550
if args[0] == "show":
427551
self._configure_show(args)
428552
return True
553+
for service in (constants.PCMK_SERVICE, constants.SBD_SERVICE):
554+
if not self.service_is_active(service):
555+
return False
556+
429557
parameter_dict = self._parse_args(args)
430558
if sbd.SBDUtils.is_using_disk_based_sbd():
431559
self._configure_diskbase(parameter_dict)
@@ -439,13 +567,26 @@ def do_configure(self, context, *args) -> bool:
439567
if usage:
440568
print(usage)
441569
return False
570+
except self.MissingRequiredException:
571+
return False
442572

443-
def do_purge(self, context) -> bool:
573+
@command.completers(completers.choice(['crashdump']))
def do_purge(self, context, *args) -> bool:
    '''
    Implement sbd purge command

    Without argument, purge SBD from the cluster.
    With the argument "crashdump", only remove the crashdump option of
    the fence_sbd resource and the crashdump settings in sysconfig.
    Any other argument is rejected, so that a mistyped sub-option can
    never trigger a full purge.

    Return True on success, False when the sbd service is not active
    or the argument is not supported.
    '''
    self._load_attributes()
    if not self.service_is_active(constants.SBD_SERVICE):
        return False

    if args:
        if args[0] != "crashdump":
            logger.error("Invalid argument: %s", args[0])
            return False
        self._set_crashdump_option(delete=True)
        update_dict = self._set_crashdump_in_sysconfig(restore=True)
        if update_dict:
            sbd.SBDManager.update_sbd_configuration(update_dict)
            sbd.SBDManager.restart_cluster_if_possible()
        return True

    sbd.purge_sbd_from_cluster()
    sbd.SBDManager.restart_cluster_if_possible()
    return True
@@ -544,6 +685,7 @@ def do_status(self, context) -> bool:
544685
'''
545686
Implement sbd status command
546687
'''
688+
self._load_attributes()
547689
self._print_sbd_type()
548690
self._print_sbd_status()
549691
self._print_sbd_cgroup_status()

crmsh/utils.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3193,11 +3193,4 @@ def strip_ansi_escape_sequences(text):
31933193
"""
31943194
ansi_escape_pattern = re.compile(r'\x1B\[[0-?]*[ -/]*[@-~]')
31953195
return ansi_escape_pattern.sub('', text)
3196-
3197-
3198-
def is_subdict(sub_dict, main_dict):
3199-
"""
3200-
Check if sub_dict is a sub-dictionary of main_dict
3201-
"""
3202-
return all(main_dict.get(k) == v for k, v in sub_dict.items())
32033196
# vim:ts=4:sw=4:et:

0 commit comments

Comments
 (0)