Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 60 additions & 1 deletion crmsh/sbd.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from . import corosync
from . import xmlutil
from . import watchdog
from . import cibquery
from .service_manager import ServiceManager
from .sh import ShellUtils

Expand Down Expand Up @@ -489,6 +490,7 @@ class SBDManager:
SBD_RA = "stonith:fence_sbd"
SBD_RA_ID = "stonith-sbd"
SBD_DEVICE_MAX = 3
SBD_CRASHDUMP_ACTION = "flush,crashdump"

class NotConfigSBD(Exception):
pass
Expand All @@ -499,7 +501,8 @@ def __init__(
timeout_dict: typing.Dict[str, int] | None = None,
update_dict: typing.Dict[str, str] | None = None,
diskless_sbd: bool = False,
bootstrap_context: 'bootstrap.Context | None' = None
bootstrap_context: 'bootstrap.Context | None' = None,
crashdump: str | None = None
):
'''
Init function which can be called from crm sbd subcommand or bootstrap
Expand All @@ -511,6 +514,7 @@ def __init__(
self.cluster_is_running = ServiceManager().service_is_active(constants.PCMK_SERVICE)
self.bootstrap_context = bootstrap_context
self.overwrite_sysconfig = False
self.crashdump = crashdump

# From bootstrap init or join process, override the values
if self.bootstrap_context:
Expand Down Expand Up @@ -603,6 +607,7 @@ def configure_sbd(self):
if not xmlutil.CrmMonXmlParser().is_resource_configured(self.SBD_RA):
cmd = f"crm configure primitive {self.SBD_RA_ID} {self.SBD_RA}"
sh.cluster_shell().get_stdout_or_raise_error(cmd)
self.set_crashdump_option_in_fence_sbd()
else:
swt_value = self.timeout_dict.get("stonith-watchdog", 2*SBDTimeout.get_sbd_watchdog_timeout())
utils.set_property("stonith-watchdog-timeout", swt_value)
Expand Down Expand Up @@ -734,6 +739,7 @@ def init_and_deploy_sbd(self, restart_first=False):
return

self.initialize_sbd()
self.set_crashdump_action()
self.update_configuration()
self.enable_sbd_service()

Expand Down Expand Up @@ -787,6 +793,59 @@ def join_sbd(self, remote_user, peer_host):
logger.info("Got {}SBD configuration".format("" if dev_list else "diskless "))
self.enable_sbd_service()

def set_crashdump_action(self):
    '''
    Set or restore the crashdump timeout action in the SBD sysconfig file

    Does nothing unless self.crashdump is "set" or "restore".

    "set":
      - no SBD_TIMEOUT_ACTION configured: queue the crashdump action in
        self.update_dict
      - a different action configured: comment the existing line out with a
        backup marker and append the crashdump action right after it

    "restore":
      - only when the configured action is the crashdump action: delete that
        line and uncomment the previously backed-up one

    The shell commands are run through sh.cluster_shell() and raise on failure.
    '''
    # A single membership test also rejects None and the empty string
    if self.crashdump not in ("set", "restore"):
        return

    sysconfig = self.SYSCONFIG_SBD
    crashdump_action = self.SBD_CRASHDUMP_ACTION
    configured_action = SBDUtils.get_sbd_value_from_config("SBD_TIMEOUT_ACTION")
    shell = sh.cluster_shell()

    if self.crashdump == "set":
        if not configured_action:
            logger.info("Set SBD_TIMEOUT_ACTION in %s: %s", sysconfig, crashdump_action)
            self.update_dict["SBD_TIMEOUT_ACTION"] = crashdump_action
        elif configured_action != crashdump_action:
            logger.info("Update SBD_TIMEOUT_ACTION in %s: %s", sysconfig, crashdump_action)
            # Keep the previous value as a tagged comment so "restore" can bring it back
            comment_action_line = f"sed -i '/^SBD_TIMEOUT_ACTION/s/^/#__sbd_crashdump_backup__ /' {sysconfig}"
            add_action_line = f"sed -i '/^#__sbd_crashdump_backup__/a SBD_TIMEOUT_ACTION={crashdump_action}' {sysconfig}"
            shell.get_stdout_or_raise_error(f"{comment_action_line} && {add_action_line}")
        return

    # self.crashdump == "restore"
    if configured_action == crashdump_action:
        logger.info("Restore SBD_TIMEOUT_ACTION in %s", sysconfig)
        # Drop the crashdump line first, then re-activate the backed-up line (if any)
        delete_action_line = f"sed -i '/^SBD_TIMEOUT_ACTION/d' {sysconfig}"
        comment_out_action_line = f"sed -i 's/^#__sbd_crashdump_backup__ SBD_TIMEOUT_ACTION/SBD_TIMEOUT_ACTION/' {sysconfig}"
        shell.get_stdout_or_raise_error(f"{delete_action_line} && {comment_out_action_line}")

def set_crashdump_option_in_fence_sbd(self):
    '''
    Set or delete the crashdump parameter on every fence_sbd resource

    Does nothing unless self.crashdump is "set" or "restore".

    The configuration is read once via 'crm configure show xml'. For each
    stonith fence_sbd primitive found:
      - "set": when the crashdump parameter is currently false, set it to 1
      - "restore": when the crashdump parameter is currently not false,
        delete it

    Commands are run through sh.cluster_shell() and raise on failure.
    '''
    # A single membership test also rejects None and the empty string
    if self.crashdump not in ("set", "restore"):
        return

    shell = sh.cluster_shell()
    cib = xmlutil.text2elem(shell.get_stdout_or_raise_error('crm configure show xml'))
    fence_sbd_ra = cibquery.ResourceAgent("stonith", "", "fence_sbd")
    res_id_list = cibquery.get_primitives_with_ra(cib, fence_sbd_ra)

    # An empty (or missing) result simply means there is nothing to adjust
    for res in res_id_list or ():
        crashdump_value = cibquery.get_parameter_value(cib, res, "crashdump")
        crashdump_is_off = utils.is_boolean_false(crashdump_value)

        # Only touch resources whose current value disagrees with the requested mode
        if crashdump_is_off and self.crashdump == "set":
            logger.info("Set crashdump option for fence_sbd resource '%s'", res)
            shell.get_stdout_or_raise_error(f"crm resource param {res} set crashdump 1")
        elif not crashdump_is_off and self.crashdump == "restore":
            logger.info("Delete crashdump option for fence_sbd resource '%s'", res)
            shell.get_stdout_or_raise_error(f"crm resource param {res} delete crashdump")


def cleanup_existing_sbd_resource():
if xmlutil.CrmMonXmlParser().is_resource_configured(SBDManager.SBD_RA):
Expand Down
103 changes: 41 additions & 62 deletions crmsh/ui_sbd.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from crmsh import sh
from crmsh import xmlutil
from crmsh import constants
from crmsh import cibquery
from crmsh.service_manager import ServiceManager


Expand Down Expand Up @@ -313,60 +312,16 @@ def _adjust_timeout_dict(timeout_dict: dict) -> dict:
return timeout_dict
return timeout_dict

def _set_crashdump_option(self, delete=False):
    '''
    Enable, or with delete=True remove, the crashdump parameter of the
    first fence_sbd resource found in the cluster configuration

    Raises utils.TerminateSubCommand when no fence_sbd resource exists and
    the option was supposed to be set; a delete request in that situation
    is silently ignored.
    '''
    show_xml = self.cluster_shell.get_stdout_or_raise_error('crm configure show xml')
    cib_elem = xmlutil.text2elem(show_xml)
    fence_sbd_ra = cibquery.ResourceAgent("stonith", "", "fence_sbd")
    fence_sbd_ids = cibquery.get_primitives_with_ra(cib_elem, fence_sbd_ra)

    if not fence_sbd_ids:
        if not delete:
            logger.error("No fence_sbd resource found")
            raise utils.TerminateSubCommand
        return

    target = fence_sbd_ids[0]
    currently_off = utils.is_boolean_false(cibquery.get_parameter_value(cib_elem, target, "crashdump"))

    # Already in the requested state: off while deleting, or on while setting
    if bool(currently_off) == bool(delete):
        return

    if delete:
        cmd = f"crm resource param {target} delete crashdump"
        logger.info("Delete crashdump option for fence_sbd resource")
    else:
        cmd = f"crm resource param {target} set crashdump 1"
        logger.info("Set crashdump option for fence_sbd resource")
    self.cluster_shell.get_stdout_or_raise_error(cmd)

def _set_crashdump_in_sysconfig(self, crashdump_watchdog_timeout=None, restore=False, diskless=False) -> dict:
def _set_sbd_opts(self, crashdump_watchdog_timeout=None, restore=False, diskless=False) -> dict:
update_dict = {}
sbd_timeout_action_for_crashdump = "flush,crashdump"
comment_action_line = f"sed -i '/^SBD_TIMEOUT_ACTION/s/^/#__sbd_crashdump_backup__ /' {sbd.SBDManager.SYSCONFIG_SBD}"
add_action_line = f"sed -i '/^#__sbd_crashdump_backup__/a SBD_TIMEOUT_ACTION={sbd_timeout_action_for_crashdump}' {sbd.SBDManager.SYSCONFIG_SBD}"
comment_out_action_line = f"sed -i 's/^#__sbd_crashdump_backup__ SBD_TIMEOUT_ACTION/SBD_TIMEOUT_ACTION/' {sbd.SBDManager.SYSCONFIG_SBD}"
delete_action_line = f"sed -i '/^SBD_TIMEOUT_ACTION/d' {sbd.SBDManager.SYSCONFIG_SBD}"

sbd_timeout_action_configured = sbd.SBDUtils.get_sbd_value_from_config("SBD_TIMEOUT_ACTION")
if restore:
if sbd_timeout_action_configured and sbd_timeout_action_configured == sbd_timeout_action_for_crashdump:
cmd_delete_and_comment_out = f"{delete_action_line} && {comment_out_action_line}"
logger.info("Delete SBD_TIMEOUT_ACTION: %s and restore original value", sbd_timeout_action_for_crashdump)
self.cluster_shell.get_stdout_or_raise_error(cmd_delete_and_comment_out)

sbd_opts = sbd.SBDUtils.get_sbd_value_from_config("SBD_OPTS")
if sbd_opts and re.search(self.SBD_OPTS_RE, sbd_opts):
sbd_opts = re.sub(self.SBD_OPTS_RE, '', sbd_opts)
update_dict["SBD_OPTS"] = ' '.join(sbd_opts.split())

elif crashdump_watchdog_timeout:
if not sbd_timeout_action_configured:
update_dict["SBD_TIMEOUT_ACTION"] = sbd_timeout_action_for_crashdump
elif sbd_timeout_action_configured != sbd_timeout_action_for_crashdump:
cmd_comment_and_add = f"{comment_action_line} && {add_action_line}"
self.cluster_shell.get_stdout_or_raise_error(cmd_comment_and_add)
logger.info("Update SBD_TIMEOUT_ACTION in %s: %s", sbd.SBDManager.SYSCONFIG_SBD, sbd_timeout_action_for_crashdump)

value_for_diskless = " -Z" if diskless else ""
value_for_sbd_opts = f"-C {crashdump_watchdog_timeout}{value_for_diskless}"
sbd_opts = sbd.SBDUtils.get_sbd_value_from_config("SBD_OPTS")
Expand Down Expand Up @@ -421,13 +376,14 @@ def _configure_diskbase(self, parameter_dict: dict):
# merge runtime timeout dict into parameter timeout dict without overwriting
timeout_dict = {**self.device_meta_dict_runtime, **timeout_dict}

configure_crashdump = False
crashdump_watchdog_timeout = parameter_dict.get("crashdump-watchdog", self.crashdump_watchdog_timeout_from_config)
if self._should_configure_crashdump(crashdump_watchdog_timeout, timeout_dict.get("watchdog")):
configure_crashdump = True
self._check_kdump_service()
self._set_crashdump_option()
timeout_dict["msgwait"] = 2*timeout_dict["watchdog"] + crashdump_watchdog_timeout
logger.info("Set msgwait-timeout to 2*watchdog-timeout + crashdump-watchdog-timeout: %s", timeout_dict["msgwait"])
result_dict = self._set_crashdump_in_sysconfig(crashdump_watchdog_timeout)
result_dict = self._set_sbd_opts(crashdump_watchdog_timeout)
update_dict = {**update_dict, **result_dict}

if timeout_dict == self.device_meta_dict_runtime and not update_dict:
Expand All @@ -437,7 +393,8 @@ def _configure_diskbase(self, parameter_dict: dict):
sbd_manager = sbd.SBDManager(
device_list_to_init=self.device_list_from_config,
timeout_dict=timeout_dict,
update_dict=update_dict
update_dict=update_dict,
crashdump="set" if configure_crashdump else None
)
sbd_manager.init_and_deploy_sbd()

Expand All @@ -455,10 +412,12 @@ def _configure_diskless(self, parameter_dict: dict):
if watchdog_device != self.watchdog_device_from_config:
update_dict["SBD_WATCHDOG_DEV"] = watchdog_device

configure_crashdump = False
crashdump_watchdog_timeout = parameter_dict.get("crashdump-watchdog", self.crashdump_watchdog_timeout_from_config)
if self._should_configure_crashdump(crashdump_watchdog_timeout, watchdog_timeout, diskless=True):
configure_crashdump = True
self._check_kdump_service()
result_dict = self._set_crashdump_in_sysconfig(crashdump_watchdog_timeout, diskless=True)
result_dict = self._set_sbd_opts(crashdump_watchdog_timeout, diskless=True)
update_dict = {**update_dict, **result_dict}
sbd_watchdog_timeout = watchdog_timeout or self.watchdog_timeout_from_config
stonith_watchdog_timeout = sbd_watchdog_timeout + crashdump_watchdog_timeout
Expand All @@ -478,7 +437,8 @@ def _configure_diskless(self, parameter_dict: dict):
sbd_manager = sbd.SBDManager(
timeout_dict=timeout_dict,
update_dict=update_dict,
diskless_sbd=True
diskless_sbd=True,
crashdump="set" if configure_crashdump else None
)
sbd_manager.init_and_deploy_sbd(restart_first)

Expand Down Expand Up @@ -595,6 +555,28 @@ def do_configure(self, context, *args) -> bool:
print(usage)
return False

def _purge_crashdump(self):
    '''
    Purge crashdump configuration from SBD

    Recomputes the timeouts without the crashdump share, strips the
    crashdump settings from SBD_OPTS via _set_sbd_opts(restore=True), and
    redeploys SBD through sbd.SBDManager in "restore" crashdump mode.

    Disk-based SBD (devices configured):
      msgwait is reset to 2 * the runtime watchdog timeout
    Diskless SBD (no devices configured):
      stonith-watchdog is reset to 2 * the configured watchdog timeout,
      and SBDManager is told it runs diskless, the same way
      _configure_diskless builds it
    '''
    device_list = self.device_list_from_config
    timeout_dict = {}

    if device_list:
        watchdog_timeout = self.device_meta_dict_runtime.get("watchdog")
        timeout_dict["watchdog"] = watchdog_timeout
        timeout_dict["msgwait"] = 2 * watchdog_timeout
        logger.info("Set msgwait-timeout to 2*watchdog-timeout: %s", timeout_dict["msgwait"])
    else:
        timeout_dict["stonith-watchdog"] = 2 * self.watchdog_timeout_from_config
        logger.info("Set stonith-watchdog-timeout to 2*SBD_WATCHDOG_TIMEOUT: %s", timeout_dict["stonith-watchdog"])

    update_dict = self._set_sbd_opts(restore=True)

    sbd_manager = sbd.SBDManager(
        device_list_to_init=device_list or None,
        timeout_dict=timeout_dict,
        update_dict=update_dict,
        # Without devices this is a diskless setup; the manager must know
        # so the stonith-watchdog timeout computed above is the one applied
        diskless_sbd=not device_list,
        crashdump="restore"
    )
    sbd_manager.init_and_deploy_sbd()

@command.completers(completers.choice(['crashdump']))
def do_purge(self, context, *args) -> bool:
'''
Expand All @@ -618,20 +600,17 @@ def do_purge(self, context, *args) -> bool:

utils.check_all_nodes_reachable("purging SBD")

with utils.leverage_maintenance_mode() as enabled:
if not utils.able_to_restart_cluster(enabled):
return False
if purge_crashdump:
self._purge_crashdump()
else: # purge sbd from cluster
with utils.leverage_maintenance_mode() as enabled:
if not utils.able_to_restart_cluster(enabled):
return False

if purge_crashdump:
self._set_crashdump_option(delete=True)
update_dict = self._set_crashdump_in_sysconfig(restore=True)
if update_dict:
sbd.SBDManager.update_sbd_configuration(update_dict)
else:
sbd.purge_sbd_from_cluster()
bootstrap.restart_cluster()

bootstrap.restart_cluster()
return True
return True

def _print_sbd_type(self):
if not self.service_manager.service_is_active(constants.SBD_SERVICE):
Expand Down
Loading