From 91bc180f1020984909a24ab8225f8c55e99b132b Mon Sep 17 00:00:00 2001 From: mgrimesix <126630154+mgrimesix@users.noreply.github.com> Date: Fri, 26 Apr 2024 13:18:10 -0700 Subject: [PATCH] NAS-128399 / 24.10 / More nfs failover fixes (#13598) Changes in support of NFS failover in runtime code: * Changes to allow NFS clients using locks to survive failover * Add code to support moving the NFS status directory to the system dataset. Changes to runtime code to improve NFS resiliency: * Increase timeout on NFS service call from 5 to 10 seconds. NFS takes about 5 sec to start. Changes to CI tests: * Add check for nfs in the boot_pool. * Fix flake8 complaint. * Improve error reporting for service state check. * Added check for NFS dataset in first boot check in test_001_ssh.py. * Some simple refactoring in test_300_nfs.py. --------- Co-authored-by: Caleb St. John <30729806+yocalebo@users.noreply.github.com> --- .../system/nfsdcld.service.d/override.conf | 3 + .../etc_files/nfs.conf.d/local.conf.mako | 13 +++ src/middlewared/middlewared/plugins/nfs.py | 98 ++++++++++++++++++- .../middlewared/plugins/nfs_/status.py | 3 +- .../plugins/service_/services/nfs.py | 1 + .../plugins/system_dataset/hierarchy.py | 19 ++++ tests/api2/test_001_ssh.py | 7 +- tests/api2/test_300_nfs.py | 62 ++++++++++-- 8 files changed, 193 insertions(+), 13 deletions(-) create mode 100644 src/freenas/etc/systemd/system/nfsdcld.service.d/override.conf diff --git a/src/freenas/etc/systemd/system/nfsdcld.service.d/override.conf b/src/freenas/etc/systemd/system/nfsdcld.service.d/override.conf new file mode 100644 index 000000000000..d8174906d15c --- /dev/null +++ b/src/freenas/etc/systemd/system/nfsdcld.service.d/override.conf @@ -0,0 +1,3 @@ +[Unit] +BindTo=nfs-server.service + diff --git a/src/middlewared/middlewared/etc_files/nfs.conf.d/local.conf.mako b/src/middlewared/middlewared/etc_files/nfs.conf.d/local.conf.mako index cc75b9b78f67..d24f4f3e1f05 100644 --- a/src/middlewared/middlewared/etc_files/nfs.conf.d/local.conf.mako +++ b/src/middlewared/middlewared/etc_files/nfs.conf.d/local.conf.mako @@ -1,4 +1,8 @@ <% + from middlewared.plugins.nfs import NFSServicePathInfo + state_path = NFSServicePathInfo.STATEDIR.path() + cld_storedir = NFSServicePathInfo.CLDDIR.path() + cltrack_storedir = NFSServicePathInfo.CLDTRKDIR.path() config = render_ctx["nfs.config"] # Fail-safe setting is two nfsd @@ -28,7 +32,15 @@ vers4 = n host = ${','.join(config['bindip'])} % endif +[exportd] +state-directory-path = ${state_path} +[nfsdcld] +storagedir = ${cld_storedir} +[nfsdcltrack] +storagedir = ${cltrack_storedir} + [mountd] +state-directory-path = ${state_path} threads = ${num_mountd} % if config['mountd_port']: port = ${config['mountd_port']} @@ -36,6 +48,7 @@ port = ${config['mountd_port']} manage-gids = ${manage_gids} [statd] +state-directory-path = ${state_path} % if config['rpcstatd_port']: port = ${config['rpcstatd_port']} % endif diff --git a/src/middlewared/middlewared/plugins/nfs.py b/src/middlewared/middlewared/plugins/nfs.py index 43ede948c9fb..6893499884f7 100644 --- a/src/middlewared/middlewared/plugins/nfs.py +++ b/src/middlewared/middlewared/plugins/nfs.py @@ -2,8 +2,9 @@ import errno import ipaddress import itertools -import os.path +import os import pathlib +import shutil from middlewared.common.attachment import LockableFSAttachmentDelegate from middlewared.common.listen import SystemServiceListenMultipleDelegate @@ -15,6 +16,30 @@ import middlewared.sqlalchemy as sa from middlewared.utils.asyncio_ import asyncio_map from middlewared.plugins.nfs_.utils import get_domain, leftmost_has_wildcards, get_wildcard_domain +from middlewared.plugins.system_dataset.utils import SYSDATASET_PATH + + +class NFSServicePathInfo(enum.Enum): + # nfs conf sections that use STATEDIR: exportd, mountd, statd + STATEDIR = (os.path.join(SYSDATASET_PATH, 'nfs'), 0o755, True, {'uid': 0, 'gid': 0}) + CLDDIR = (os.path.join(SYSDATASET_PATH, 'nfs', 'nfsdcld'), 0o700, True, {'uid': 0, 'gid': 0}) + CLDTRKDIR = (os.path.join(SYSDATASET_PATH, 'nfs', 'nfsdcltrack'), 0o700, True, {'uid': 0, 'gid': 0}) + # Fix up the uid and gid in setup_directories + SMDIR = (os.path.join(SYSDATASET_PATH, 'nfs', 'sm'), 0o755, True, {'uid': 'statd', 'gid': 'nogroup'}) + SMBAKDIR = (os.path.join(SYSDATASET_PATH, 'nfs', 'sm.bak'), 0o755, True, {'uid': 'statd', 'gid': 'nogroup'}) + V4RECOVERYDIR = (os.path.join(SYSDATASET_PATH, 'nfs', 'v4recovery'), 0o755, True, {'uid': 0, 'gid': 0}) + + def path(self): + return self.value[0] + + def mode(self): + return self.value[1] + + def is_dir(self): + return self.value[2] + + def owner(self): + return self.value[3] class NFSProtocol(str, enum.Enum): @@ -77,6 +102,77 @@ class Config: Bool('managed_nfsd', default=True), ) + @private + def name_to_id_conversion(self, name, name_type='user'): + ''' Convert built-in user or group name to associated UID or GID ''' + if any((not isinstance(name, str), isinstance(name, int))): + # it's not a string (NoneType, float, w/e) or it's an int + # so there is nothing to do + return name + + if name_type == 'user': + method = 'user.get_builtin_user_id' + elif name_type == 'group': + method = 'group.get_builtin_group_id' + else: + self.logger.error('Unexpected name_type (%r)', name_type) + return name + try: + return self.middleware.call_sync(method, name) + except Exception as e: + if hasattr(e, 'errno') and e.errno == errno.ENOENT: + self.logger.error('Failed to resolve builtin %s %r', name_type, name) + else: + self.logger.error('Unexpected error resolving builtin %s %r', name_type, name, exc_info=True) + return name + + @private + def setup_directories(self): + ''' + We are moving the NFS state directory from /var/lib/nfs to + the system dataset: /var/db/system/nfs. + When setup_directories is called /var/db/system/nfs is expected to exist. + + If STATEDIR is empty, then this might be an initialization + and there might be current info in /var/lib/nfs. + + We always make sure the expected directories are present + ''' + + # Initialize the system dataset NFS state directory + state_dir = NFSServicePathInfo.STATEDIR.path() + try: + shutil.copytree('/var/lib/nfs', state_dir) + except FileExistsError: + # destination file/dir already exists so ignore error + pass + except Exception: + self.logger.error('Unexpected error initializing %r', state_dir, exc_info=True) + + # Make sure we have the necessary directories + for i in NFSServicePathInfo: + uid = self.name_to_id_conversion(i.owner()['uid'], name_type='user') + gid = self.name_to_id_conversion(i.owner()['gid'], name_type='group') + path = i.path() + if i.is_dir(): + os.makedirs(path, exist_ok=True) + + try: + os.chmod(path, i.mode()) + os.chown(path, uid, gid) + except Exception: + self.logger.error('Unexpected failure initializing %r', path, exc_info=True) + + procfs_path = '/proc/fs/nfsd/nfsv4recoverydir' + try: + with open(procfs_path, 'r+') as fp: + fp.write(f'{NFSServicePathInfo.V4RECOVERYDIR.path()}\n') + except FileNotFoundError: + # When this is removed from the kernel we will have a gentle reminder + self.logger.info("%r: Proc file has been removed", procfs_path) + except Exception: + self.logger.error("Unexpected failure updating %r", procfs_path, exc_info=True) + @private async def nfs_extend(self, nfs): keytab_has_nfs = await self.middleware.call("kerberos.keytab.has_nfs_principal") diff --git a/src/middlewared/middlewared/plugins/nfs_/status.py b/src/middlewared/middlewared/plugins/nfs_/status.py index a9f0859e0ead..cfe8dda47011 100644 --- a/src/middlewared/middlewared/plugins/nfs_/status.py +++ b/src/middlewared/middlewared/plugins/nfs_/status.py @@ -1,3 +1,4 @@ +from middlewared.plugins.nfs import NFSServicePathInfo from middlewared.schema import accepts, Int, returns, Str, Dict from middlewared.service import Service, private, filterable, filterable_returns from middlewared.utils import filter_list @@ -18,7 +19,7 @@ def get_rmtab(self): """ entries = [] with suppress(FileNotFoundError): - with open("/var/lib/nfs/rmtab", "r") as f: + with open(os.path.join(NFSServicePathInfo.STATEDIR.path(), "rmtab"), "r") as f: for line in f: ip, data = line.split(":", 1) export, refcnt = line.rsplit(":", 1) diff --git a/src/middlewared/middlewared/plugins/service_/services/nfs.py b/src/middlewared/middlewared/plugins/service_/services/nfs.py index 7dbed671fadc..9fa9b76a0529 100644 --- a/src/middlewared/middlewared/plugins/service_/services/nfs.py +++ b/src/middlewared/middlewared/plugins/service_/services/nfs.py @@ -6,6 +6,7 @@ class NFSService(SimpleService): name = "nfs" reloadable = True + systemd_unit_timeout = 10 etc = ["nfsd"] diff --git a/src/middlewared/middlewared/plugins/system_dataset/hierarchy.py b/src/middlewared/middlewared/plugins/system_dataset/hierarchy.py index e328ecf97d9a..715ad1325ecd 100644 --- a/src/middlewared/middlewared/plugins/system_dataset/hierarchy.py +++ b/src/middlewared/middlewared/plugins/system_dataset/hierarchy.py @@ -106,6 +106,25 @@ def get_system_dataset_spec(pool_name: str, uuid: str) -> list: 'mode': 0o775, }, }, + { + 'name': os.path.join(pool_name, '.system/nfs'), + 'props': { + 'mountpoint': 'legacy', + 'readonly': 'off', + 'snapdir': 'hidden', + }, + 'chown_config': { + 'uid': 0, + 'gid': 0, + 'mode': 0o755, + }, + 'post_mount_actions': [ + { + 'method': 'nfs.setup_directories', + 'args': [], + } + ] + }, { 'name': os.path.join(pool_name, '.system/samba4'), 'props': { diff --git a/tests/api2/test_001_ssh.py b/tests/api2/test_001_ssh.py index 6efce6b596b4..1357a48f49c9 100644 --- a/tests/api2/test_001_ssh.py +++ b/tests/api2/test_001_ssh.py @@ -8,7 +8,7 @@ from functions import if_key_listed, SSH_TEST from auto_config import ip, sshKey, user, password -from middlewared.test.integration.utils import call, fail +from middlewared.test.integration.utils import fail from middlewared.test.integration.utils.client import client @@ -34,6 +34,7 @@ def test_002_firstboot_checks(ws_client): expected_ds = [ 'boot-pool/.system', 'boot-pool/.system/cores', + 'boot-pool/.system/nfs', 'boot-pool/.system/samba4', 'boot-pool/grub' ] @@ -50,8 +51,8 @@ def test_002_firstboot_checks(ws_client): # always start in all circumstances (even if there is an invalid (or empty) config) ignore = ('smartd',) for srv in filter(lambda x: x['service'] not in ignore, ws_client.call('service.query')): - assert srv['enable'] is False - assert srv['state'] == 'STOPPED' + assert srv['enable'] is False, f"{srv['service']} service is unexpectedly enabled" + assert srv['state'] == 'STOPPED', f"{srv['service']} service expected STOPPED, but found {srv['state']}" # verify posix mode, uid and gid for standard users stat_info = { diff --git a/tests/api2/test_300_nfs.py b/tests/api2/test_300_nfs.py index db8ba9a22f0b..1cfcaa7e9703 100644 --- a/tests/api2/test_300_nfs.py +++ b/tests/api2/test_300_nfs.py @@ -38,7 +38,14 @@ conf_file = { "nfs": { "pname": "/etc/nfs.conf.d/local.conf", - "sections": {'nfsd': {}, 'mountd': {}, 'statd': {}, 'lockd': {}} + "sections": { + 'nfsd': {}, + 'exportd': {}, + 'nfsdcld': {}, + 'nfsdcltrack': {}, + 'mountd': {}, + 'statd': {}, + 'lockd': {}} }, "idmapd": { "pname": "/etc/idmapd.conf", @@ -134,6 +141,11 @@ def parse_rpcbind_config(): return rv +def get_nfs_service_state(): + nfs_service = call('service.query', [['service', '=', 'nfs']], {'get': True}) + return nfs_service['state'] + + def set_nfs_service_state(do_what=None, expect_to_pass=True, fail_check=None): ''' Start or Stop NFS service @@ -321,21 +333,29 @@ def nfs_config(options=None): # Enable NFS server -def test_01_creating_the_nfs_server(): +def test_01_init_the_nfs_config(): # initialize default_nfs_config for later restore save_nfs_config() + # Confirm NFS is not running + nfs_state = get_nfs_service_state() + assert nfs_state == 'STOPPED', f'Before update, expected NFS to be STOPPED, but found {nfs_state}' + payload = { - "servers": 10, "mountd_port": 618, "allow_nonroot": False, "rpcstatd_port": 871, "rpclockd_port": 32803, "protocols": ["NFSV3", "NFSV4"] } - results = PUT("/nfs/", payload) - assert results.status_code == 200, results.text - # The service is not yet enabled, so we cannot yet confirm the settings + nfs_conf = call("nfs.update", payload) + assert nfs_conf['mountd_port'] == 618 + assert nfs_conf['rpcstatd_port'] == 871 + assert nfs_conf['rpclockd_port'] == 32803 + + # Confirm NFS remains not running + nfs_state = get_nfs_service_state() + assert nfs_state == 'STOPPED', f'After update, xpected NFS to be STOPPED, but found {nfs_state}' @pytest.mark.dependency(name='NFS_DATASET_CREATED') @@ -396,8 +416,33 @@ def test_09_checking_to_see_if_nfs_service_is_running(request): assert results.json()[0]["state"] == "RUNNING", results.text +def test_10_confirm_state_directory(request): + """ + By default, the NFS state directory is at /var/lib/nfs. + To support HA systems, we moved this to the system dataset + at /var/db/system/nfs. In support of this we updated the + NFS conf file settings + """ + depends(request, ["NFS_SERVICE_STARTED"], scope="session") + + # Make sure the conf file has the expected settings + nfs_state_dir = '/var/db/system/nfs' + s = parse_server_config() + assert s['exportd']['state-directory-path'] == nfs_state_dir, str(s) + assert s['nfsdcld']['storagedir'] == os.path.join(nfs_state_dir, 'nfsdcld'), str(s) + assert s['nfsdcltrack']['storagedir'] == os.path.join(nfs_state_dir, 'nfsdcltrack'), str(s) + assert s['nfsdcld']['storagedir'] == os.path.join(nfs_state_dir, 'nfsdcld'), str(s) + assert s['mountd']['state-directory-path'] == nfs_state_dir, str(s) + assert s['statd']['state-directory-path'] == nfs_state_dir, str(s) + # Confirm we have the mount point in the system dataset + # ---------------------------------------------------------------------- + # NOTE: Update test_001_ssh.py: test_002_first_boot_checks. + # NOTE: Test fresh-install and upgrade. + # ---------------------------------------------------------------------- + + @pytest.mark.parametrize('vers', [3, 4]) -def test_10_perform_basic_nfs_ops(request, vers): +def test_11_perform_basic_nfs_ops(request, vers): with SSH_NFS(ip, NFS_PATH, vers=vers, user=user, password=password, ip=ip) as n: n.create('testfile') n.mkdir('testdir') @@ -412,7 +457,7 @@ def test_10_perform_basic_nfs_ops(request, vers): assert 'testfile' not in contents -def test_11_perform_server_side_copy(request): +def test_12_perform_server_side_copy(request): with SSH_NFS(ip, NFS_PATH, vers=4, user=user, password=password, ip=ip) as n: n.server_side_copy('ssc1', 'ssc2') @@ -1208,6 +1253,7 @@ def test_43_check_nfsv4_acl_support(request): 4) For NFSv4.1 or NFSv4.2, repeat same process for each of the supported acl_flags. """ + depends(request, ["NFS_SERVICE_STARTED"], scope="session") acl_nfs_path = f'/mnt/{pool_name}/test_nfs4_acl' test_perms = { "READ_DATA": True,