From 40b2b48fa5dc460f945ae4eded49e63993597d3a Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 18 Nov 2023 08:25:53 +1100 Subject: [PATCH 01/32] Consider `dnode_t` allocations in dbuf cache size accounting Entries in the dbuf cache contribute only the size of the dbuf data to the cache size. Attached "user" data is not counted. This can lead to the data currently "owned" by the cache consuming more memory accounting appears to show. In some cases (eg a metadnode data block with all child dnode_t slots allocated), the actual size can be as much as 3x as what the cache believes it to be. This is arguably correct behaviour, as the cache is only tracking the size of the dbuf data, not even the overhead of the dbuf_t. On the other hand, in the above case of dnodes, evicting cached metadnode dbufs is the only current way to reclaim the dnode objects, and can lead to the situation where the dbuf cache appears to be comfortably within its target memory window and yet is holding enormous amounts of slab memory that cannot be reclaimed. This commit adds a facility for a dbuf user to artificially inflate the apparent size of the dbuf for caching purposes. This at least allows for cache tuning to be adjusted to match something closer to the real memory overhead. metadnode dbufs carry a >1KiB allocation per dnode in their user data. This informs the dbuf cache machinery of that fact, allowing it to make better decisions when evicting dbufs. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #15511 (cherry picked from commit 92dc4ad83d95b88c87b03682f50bbbde20513244) --- cmd/dbufstat/dbufstat.in | 20 +++++++------ include/sys/dmu.h | 13 +++++++++ module/zfs/dbuf.c | 63 +++++++++++++++++++++++++++++++++++----- module/zfs/dbuf_stats.c | 13 +++++---- module/zfs/dnode.c | 19 ++++++++++-- 5 files changed, 102 insertions(+), 26 deletions(-) diff --git a/cmd/dbufstat/dbufstat.in b/cmd/dbufstat/dbufstat.in index 82250353f5eb..4b61a04798ae 100755 --- a/cmd/dbufstat/dbufstat.in +++ b/cmd/dbufstat/dbufstat.in @@ -37,7 +37,7 @@ import re bhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize"] bxhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize", - "meta", "state", "dbholds", "dbc", "list", "atype", "flags", + "usize", "meta", "state", "dbholds", "dbc", "list", "atype", "flags", "count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"] @@ -47,17 +47,17 @@ dhdr = ["pool", "objset", "object", "dtype", "cached"] dxhdr = ["pool", "objset", "object", "dtype", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize", "cached", "direct", "indirect", "bonus", "spill"] -dincompat = ["level", "blkid", "offset", "dbsize", "meta", "state", "dbholds", - "dbc", "list", "atype", "flags", "count", "asize", "access", - "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", - "l2_comp", "aholds"] +dincompat = ["level", "blkid", "offset", "dbsize", "usize", "meta", "state", + "dbholds", "dbc", "list", "atype", "flags", "count", "asize", + "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", + "l2_asize", "l2_comp", "aholds"] thdr = ["pool", "objset", "dtype", "cached"] txhdr = ["pool", "objset", "dtype", "cached", "direct", "indirect", "bonus", "spill"] -tincompat = ["object", "level", "blkid", "offset", "dbsize", "meta", "state", - 
"dbc", "dbholds", "list", "atype", "flags", "count", "asize", - "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", +tincompat = ["object", "level", "blkid", "offset", "dbsize", "usize", "meta", + "state", "dbc", "dbholds", "list", "atype", "flags", "count", + "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"] @@ -70,6 +70,7 @@ cols = { "blkid": [8, -1, "block number of buffer"], "offset": [12, 1024, "offset in object of buffer"], "dbsize": [7, 1024, "size of buffer"], + "usize": [7, 1024, "size of attached user data"], "meta": [4, -1, "is this buffer metadata?"], "state": [5, -1, "state of buffer (read, cached, etc)"], "dbholds": [7, 1000, "number of holds on buffer"], @@ -399,6 +400,7 @@ def update_dict(d, k, line, labels): key = line[labels[k]] dbsize = int(line[labels['dbsize']]) + usize = int(line[labels['usize']]) blkid = int(line[labels['blkid']]) level = int(line[labels['level']]) @@ -416,7 +418,7 @@ def update_dict(d, k, line, labels): d[pool][objset][key]['indirect'] = 0 d[pool][objset][key]['spill'] = 0 - d[pool][objset][key]['cached'] += dbsize + d[pool][objset][key]['cached'] += dbsize + usize if blkid == -1: d[pool][objset][key]['bonus'] += dbsize diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 0f80764a26a6..d8b44c4f3604 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -639,6 +639,9 @@ typedef struct dmu_buf_user { */ taskq_ent_t dbu_tqent; + /* Size of user data, for inclusion in dbuf_cache accounting. */ + uint64_t dbu_size; + /* * This instance's eviction function pointers. * @@ -721,6 +724,16 @@ void *dmu_buf_replace_user(dmu_buf_t *db, */ void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); +/* + * User data size accounting. This can be used to artifically inflate the size + * of the dbuf during cache accounting, so that dbuf_evict_thread evicts enough + * to satisfy memory reclaim requests. It's not used for anything else, and + * defaults to 0. + */ +uint64_t dmu_buf_user_size(dmu_buf_t *db); +void dmu_buf_add_user_size(dmu_buf_t *db, uint64_t nadd); +void dmu_buf_sub_user_size(dmu_buf_t *db, uint64_t nsub); + /* * Returns the user data (dmu_buf_user_t *) associated with this dbuf. */ diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index ba9e0aa7b3f9..10cbfce9f5c0 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -554,6 +554,21 @@ dbuf_evict_user(dmu_buf_impl_t *db) *dbu->dbu_clear_on_evict_dbufp = NULL; #endif + if (db->db_caching_status != DB_NO_CACHE) { + /* + * This is a cached dbuf, so the size of the user data is + * included in its cached amount. We adjust it here because the + * user data has already been detached from the dbuf, and the + * sync functions are not supposed to touch it (the dbuf might + * not exist anymore by the time the sync functions run. + */ + uint64_t size = dbu->dbu_size; + (void) zfs_refcount_remove_many( + &dbuf_caches[db->db_caching_status].size, size, db); + if (db->db_caching_status == DB_DBUF_CACHE) + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size); + } + /* * There are two eviction callbacks - one that we call synchronously * and one that we invoke via a taskq. 
The async one is useful for @@ -693,12 +708,12 @@ dbuf_evict_one(void) if (db != NULL) { multilist_sublist_remove(mls, db); multilist_sublist_unlock(mls); + uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db); (void) zfs_refcount_remove_many( - &dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db); + &dbuf_caches[DB_DBUF_CACHE].size, size, db); DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[db->db_level], - db->db.db_size); + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size); ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); db->db_caching_status = DB_NO_CACHE; dbuf_destroy(db); @@ -2808,6 +2823,8 @@ dbuf_destroy(dmu_buf_impl_t *db) db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove(&dbuf_caches[db->db_caching_status].cache, db); + + ASSERT0(dmu_buf_user_size(&db->db)); (void) zfs_refcount_remove_many( &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); @@ -3540,17 +3557,17 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove(&dbuf_caches[db->db_caching_status].cache, db); + + uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db); (void) zfs_refcount_remove_many( - &dbuf_caches[db->db_caching_status].size, - db->db.db_size, db); + &dbuf_caches[db->db_caching_status].size, size, db); if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMPDOWN(metadata_cache_count); } else { DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[db->db_level], - db->db.db_size); + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size); } db->db_caching_status = DB_NO_CACHE; } @@ -3782,7 +3799,8 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) db->db_caching_status = dcs; multilist_insert(&dbuf_caches[dcs].cache, db); - uint64_t db_size = db->db.db_size; + uint64_t db_size = db->db.db_size + + dmu_buf_user_size(&db->db); size = zfs_refcount_add_many( &dbuf_caches[dcs].size, db_size, db); uint8_t db_level = db->db_level; @@ -3885,6 +3903,35 @@ dmu_buf_get_user(dmu_buf_t *db_fake) return (db->db_user); } +uint64_t +dmu_buf_user_size(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + if (db->db_user == NULL) + return (0); + return (atomic_load_64(&db->db_user->dbu_size)); +} + +void +dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); + ASSERT3P(db->db_user, !=, NULL); + ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd); + atomic_add_64(&db->db_user->dbu_size, nadd); +} + +void +dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); + ASSERT3P(db->db_user, !=, NULL); + ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub); + atomic_sub_64(&db->db_user->dbu_size, nsub); +} + void dmu_buf_user_evict_wait(void) { diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c index 037190a81bb3..6586947b1708 100644 --- a/module/zfs/dbuf_stats.c +++ b/module/zfs/dbuf_stats.c @@ -46,14 +46,14 @@ static int dbuf_stats_hash_table_headers(char *buf, size_t size) { (void) snprintf(buf, size, - "%-96s | %-119s | %s\n" - "%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-5s %-5s %-7s %3s | " + "%-105s | %-119s | %s\n" + "%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-8s %-5s %-5s %-7s %3s | 
" "%-5s %-5s %-9s %-6s %-8s %-12s " "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-6s | " "%-6s %-6s %-8s %-8s %-6s %-6s %-6s %-8s %-8s\n", "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level", - "blkid", "offset", "dbsize", "meta", "state", "dbholds", "dbc", - "list", "atype", "flags", "count", "asize", "access", + "blkid", "offset", "dbsize", "usize", "meta", "state", "dbholds", + "dbc", "list", "atype", "flags", "count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"); @@ -75,8 +75,8 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) __dmu_object_info_from_dnode(dn, &doi); nwritten = snprintf(buf, size, - "%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-5d %-5d " - "%-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu " + "%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-8llu " + "%-5d %-5d %-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu " "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-6lu | " "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-6lu %-8llu %-8llu\n", /* dmu_buf_impl_t */ @@ -87,6 +87,7 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) (longlong_t)db->db_blkid, (u_longlong_t)db->db.db_offset, (u_longlong_t)db->db.db_size, + (u_longlong_t)dmu_buf_user_size(&db->db), !!dbuf_is_metadata(db), db->db_state, (ulong_t)zfs_refcount_count(&db->db_holds), diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index c0a2b36857be..54b8475293e9 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1120,9 +1120,11 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots) return (B_TRUE); } -static void +static uint_t dnode_reclaim_slots(dnode_children_t *children, int idx, int slots) { + uint_t reclaimed = 0; + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); for (int i = idx; i < idx + slots; i++) { @@ -1134,8 +1136,11 @@ dnode_reclaim_slots(dnode_children_t *children, int idx, int slots) ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE); dnode_destroy(dnh->dnh_dnode); dnh->dnh_dnode = DN_SLOT_FREE; + reclaimed++; } } + + return (reclaimed); } void @@ -1448,6 +1453,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, } else { dn = dnode_create(os, dn_block + idx, db, object, dnh); + dmu_buf_add_user_size(&db->db, + sizeof (dnode_t)); } } @@ -1505,8 +1512,13 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, * to be freed. Single slot dnodes can be safely * re-purposed as a performance optimization. */ - if (slots > 1) - dnode_reclaim_slots(dnc, idx + 1, slots - 1); + if (slots > 1) { + uint_t reclaimed = + dnode_reclaim_slots(dnc, idx + 1, slots - 1); + if (reclaimed > 0) + dmu_buf_sub_user_size(&db->db, + reclaimed * sizeof (dnode_t)); + } dnh = &dnc->dnc_children[idx]; if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { @@ -1514,6 +1526,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, } else { dn = dnode_create(os, dn_block + idx, db, object, dnh); + dmu_buf_add_user_size(&db->db, sizeof (dnode_t)); } mutex_enter(&dn->dn_mtx); From 671c597be6b0a9740ec2046e8ed75117ea8b1057 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 13 Nov 2023 17:55:29 +1100 Subject: [PATCH 02/32] linux 5.4 compat: page_size() Before 5.4 we have to do a little math. Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
(cherry picked from commit 93fb150fa4de5c1b25c261c762da2f95338005eb) --- config/kernel-mm-page-size.m4 | 17 +++++++++++ config/kernel.m4 | 2 ++ include/os/linux/kernel/linux/Makefile.am | 1 + include/os/linux/kernel/linux/mm_compat.h | 36 +++++++++++++++++++++++ 4 files changed, 56 insertions(+) create mode 100644 config/kernel-mm-page-size.m4 create mode 100644 include/os/linux/kernel/linux/mm_compat.h diff --git a/config/kernel-mm-page-size.m4 b/config/kernel-mm-page-size.m4 new file mode 100644 index 000000000000..d5ebd926986a --- /dev/null +++ b/config/kernel-mm-page-size.m4 @@ -0,0 +1,17 @@ +AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [ + ZFS_LINUX_TEST_SRC([page_size], [ + #include + ],[ + unsigned long s; + s = page_size(NULL); + ]) +]) +AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [ + AC_MSG_CHECKING([whether page_size() is available]) + ZFS_LINUX_TEST_RESULT([page_size], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 41492c19d8a5..9c27ffcf3c7c 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -144,6 +144,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_KTHREAD ZFS_AC_KERNEL_SRC_ZERO_PAGE ZFS_AC_KERNEL_SRC___COPY_FROM_USER_INATOMIC + ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE AC_MSG_CHECKING([for available kernel interfaces]) ZFS_LINUX_TEST_COMPILE_ALL([kabi]) @@ -261,6 +262,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_KTHREAD ZFS_AC_KERNEL_ZERO_PAGE ZFS_AC_KERNEL___COPY_FROM_USER_INATOMIC + ZFS_AC_KERNEL_MM_PAGE_SIZE ]) dnl # diff --git a/include/os/linux/kernel/linux/Makefile.am b/include/os/linux/kernel/linux/Makefile.am index 6ff0df506d9c..fc2bde013f22 100644 --- a/include/os/linux/kernel/linux/Makefile.am +++ b/include/os/linux/kernel/linux/Makefile.am @@ -10,6 +10,7 @@ KERNEL_H = \ simd_x86.h \ simd_aarch64.h \ simd_powerpc.h \ + mm_compat.h \ mod_compat.h \ page_compat.h \ compiler_compat.h diff --git a/include/os/linux/kernel/linux/mm_compat.h b/include/os/linux/kernel/linux/mm_compat.h new file mode 100644 index 000000000000..40056c68d6dd --- /dev/null +++ b/include/os/linux/kernel/linux/mm_compat.h @@ -0,0 +1,36 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2023, 2024, Klara Inc. + */ + +#ifndef _ZFS_MM_COMPAT_H +#define _ZFS_MM_COMPAT_H + +#include + +/* 5.4 introduced page_size(). 
Older kernels can use a trivial macro instead */ +#ifndef HAVE_MM_PAGE_SIZE +#define page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p))) +#endif + +#endif /* _ZFS_MM_COMPAT_H */ From edf4cf0ce7b834f4e1f1c19be6ba9ab15deb6721 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 11 Dec 2023 16:05:54 +1100 Subject: [PATCH 03/32] abd: add page iterator The regular ABD iterators yield data buffers, so they have to map and unmap pages into kernel memory. If the caller only wants to count chunks, or can use page pointers directly, then the map/unmap is just unnecessary overhead. This adds adb_iterate_page_func, which yields unmapped struct page instead. Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. (cherry picked from commit 930b785c61e9724f0a3a0e09571032ed397f368c) --- include/sys/abd.h | 7 +++ include/sys/abd_impl.h | 26 ++++++++- module/os/freebsd/zfs/abd_os.c | 4 +- module/os/linux/zfs/abd_os.c | 104 ++++++++++++++++++++++++++++++--- module/zfs/abd.c | 42 +++++++++++++ 5 files changed, 169 insertions(+), 14 deletions(-) diff --git a/include/sys/abd.h b/include/sys/abd.h index 6903e0c0e713..3d23581fce88 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -79,6 +79,9 @@ typedef struct abd { typedef int abd_iter_func_t(void *buf, size_t len, void *priv); typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv); +#if defined(__linux__) && defined(_KERNEL) +typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *); +#endif extern int zfs_abd_scatter_enabled; @@ -119,6 +122,10 @@ void abd_release_ownership_of_buf(abd_t *); int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, abd_iter_func2_t *, void *); +#if defined(__linux__) && defined(_KERNEL) +int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, + void *); +#endif void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h index 113700cd72b1..189b743c076a 100644 --- a/include/sys/abd_impl.h +++ b/include/sys/abd_impl.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. 
*/ #ifndef _ABD_IMPL_H @@ -38,12 +39,30 @@ typedef enum abd_stats_op { ABDSTAT_DECR /* Decrease abdstat values */ } abd_stats_op_t; -struct scatterlist; /* forward declaration */ +/* forward declarations */ +struct scatterlist; +struct page; struct abd_iter { /* public interface */ - void *iter_mapaddr; /* addr corresponding to iter_pos */ - size_t iter_mapsize; /* length of data valid at mapaddr */ + union { + /* for abd_iter_map()/abd_iter_unmap() */ + struct { + /* addr corresponding to iter_pos */ + void *iter_mapaddr; + /* length of data valid at mapaddr */ + size_t iter_mapsize; + }; + /* for abd_iter_page() */ + struct { + /* current page */ + struct page *iter_page; + /* offset of data in page */ + size_t iter_page_doff; + /* size of data in page */ + size_t iter_page_dsize; + }; + }; /* private */ abd_t *iter_abd; /* ABD being iterated through */ @@ -79,6 +98,7 @@ boolean_t abd_iter_at_end(struct abd_iter *); void abd_iter_advance(struct abd_iter *, size_t); void abd_iter_map(struct abd_iter *); void abd_iter_unmap(struct abd_iter *); +void abd_iter_page(struct abd_iter *); /* * Helper macros diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c index ddd6d68b361c..2a7fa273deaa 100644 --- a/module/os/freebsd/zfs/abd_os.c +++ b/module/os/freebsd/zfs/abd_os.c @@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) { ASSERT(!abd_is_gang(abd)); abd_verify(abd); + memset(aiter, 0, sizeof (struct abd_iter)); aiter->iter_abd = abd; - aiter->iter_pos = 0; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; } /* diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 254df32410f1..6297b559941b 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. */ /* @@ -59,6 +60,7 @@ #include #ifdef _KERNEL #include +#include #include #else #define MAX_ORDER 1 @@ -884,14 +886,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) { ASSERT(!abd_is_gang(abd)); abd_verify(abd); + memset(aiter, 0, sizeof (struct abd_iter)); aiter->iter_abd = abd; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; - aiter->iter_pos = 0; - if (abd_is_linear(abd)) { - aiter->iter_offset = 0; - aiter->iter_sg = NULL; - } else { + if (!abd_is_linear(abd)) { aiter->iter_offset = ABD_SCATTER(abd).abd_offset; aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; } @@ -904,6 +901,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) boolean_t abd_iter_at_end(struct abd_iter *aiter) { + ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size); return (aiter->iter_pos == aiter->iter_abd->abd_size); } @@ -915,8 +913,15 @@ abd_iter_at_end(struct abd_iter *aiter) void abd_iter_advance(struct abd_iter *aiter, size_t amount) { + /* + * Ensure that last chunk is not in use. abd_iterate_*() must clear + * this state (directly or abd_iter_unmap()) before advancing. + */ ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); + ASSERT3P(aiter->iter_page, ==, NULL); + ASSERT0(aiter->iter_page_doff); + ASSERT0(aiter->iter_page_dsize); /* There's nothing left to advance to, so do nothing */ if (abd_iter_at_end(aiter)) @@ -998,6 +1003,88 @@ abd_cache_reap_now(void) } #if defined(_KERNEL) +/* + * Yield the next page struct and data offset and size within it, without + * mapping it into the address space. 
+ */ +void +abd_iter_page(struct abd_iter *aiter) +{ + if (abd_iter_at_end(aiter)) { + aiter->iter_page = NULL; + aiter->iter_page_doff = 0; + aiter->iter_page_dsize = 0; + return; + } + + struct page *page; + size_t doff, dsize; + + if (abd_is_linear(aiter->iter_abd)) { + ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); + + /* memory address at iter_pos */ + void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos; + + /* struct page for address */ + page = is_vmalloc_addr(paddr) ? + vmalloc_to_page(paddr) : virt_to_page(paddr); + + /* offset of address within the page */ + doff = offset_in_page(paddr); + + /* total data remaining in abd from this position */ + dsize = aiter->iter_abd->abd_size - aiter->iter_offset; + } else { + ASSERT(!abd_is_gang(aiter->iter_abd)); + + /* current scatter page */ + page = sg_page(aiter->iter_sg); + + /* position within page */ + doff = aiter->iter_offset; + + /* remaining data in scatterlist */ + dsize = MIN(aiter->iter_sg->length - aiter->iter_offset, + aiter->iter_abd->abd_size - aiter->iter_pos); + } + ASSERT(page); + + if (PageTail(page)) { + /* + * This page is part of a "compound page", which is a group of + * pages that can be referenced from a single struct page *. + * Its organised as a "head" page, followed by a series of + * "tail" pages. + * + * In OpenZFS, compound pages are allocated using the + * __GFP_COMP flag, which we get from scatter ABDs and SPL + * vmalloc slabs (ie >16K allocations). So a great many of the + * IO buffers we get are going to be of this type. + * + * The tail pages are just regular PAGE_SIZE pages, and can be + * safely used as-is. However, the head page has length + * covering itself and all the tail pages. If this ABD chunk + * spans multiple pages, then we can use the head page and a + * >PAGE_SIZE length, which is far more efficient. + * + * To do this, we need to adjust the offset to be counted from + * the head page. struct page for compound pages are stored + * contiguously, so we can just adjust by a simple offset. + */ + struct page *head = compound_head(page); + doff += ((page - head) * PAGESIZE); + page = head; + } + + /* final page and position within it */ + aiter->iter_page = page; + aiter->iter_page_doff = doff; + + /* amount of data in the chunk, up to the end of the page */ + aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff); +} + /* * bio_nr_pages for ABD. 
* @off is the offset in @abd @@ -1220,4 +1307,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size, module_param(zfs_abd_scatter_max_order, uint, 0644); MODULE_PARM_DESC(zfs_abd_scatter_max_order, "Maximum order allocation used for a scatter ABD."); -#endif + +#endif /* _KERNEL */ diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 42bf3e3036f9..79de6378f813 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -816,6 +816,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, return (ret); } +#if defined(__linux__) && defined(_KERNEL) +int +abd_iterate_page_func(abd_t *abd, size_t off, size_t size, + abd_iter_page_func_t *func, void *private) +{ + struct abd_iter aiter; + int ret = 0; + + if (size == 0) + return (0); + + abd_verify(abd); + ASSERT3U(off + size, <=, abd->abd_size); + + abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); + + while (size > 0) { + IMPLY(abd_is_gang(abd), c_abd != NULL); + + abd_iter_page(&aiter); + + size_t len = MIN(aiter.iter_page_dsize, size); + ASSERT3U(len, >, 0); + + ret = func(aiter.iter_page, aiter.iter_page_doff, + len, private); + + aiter.iter_page = NULL; + aiter.iter_page_doff = 0; + aiter.iter_page_dsize = 0; + + if (ret != 0) + break; + + size -= len; + c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len); + } + + return (ret); +} +#endif + struct buf_arg { void *arg_buf; }; From d00ab549d432c0b6d99ced503c63c6622eae85eb Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 9 Jan 2024 12:12:56 +1100 Subject: [PATCH 04/32] vdev_disk: rename existing functions to vdev_classic_* This is just renaming the existing functions we're about to replace and grouping them together to make the next commits easier to follow. Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. (cherry picked from commit 9bf6a7c8c3bdcc4e5975fa5baf6e9ff6f279a553) --- include/sys/abd.h | 2 + module/os/linux/zfs/abd_os.c | 5 + module/os/linux/zfs/vdev_disk.c | 247 +++++++++++++++++--------------- 3 files changed, 139 insertions(+), 115 deletions(-) diff --git a/include/sys/abd.h b/include/sys/abd.h index 3d23581fce88..86868e5a08cf 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -214,6 +214,8 @@ void abd_fini(void); /* * Linux ABD bio functions + * Note: these are only needed to support vdev_classic. See comment in + * vdev_disk.c. */ #if defined(__linux__) && defined(_KERNEL) unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 6297b559941b..6ea1e45cf8bb 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -1085,6 +1085,11 @@ abd_iter_page(struct abd_iter *aiter) aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff); } +/* + * Note: ABD BIO functions only needed to support vdev_classic. See comments in + * vdev_disk.c. + */ + /* * bio_nr_pages for ABD. * @off is the offset in @abd diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 935cdc039d5e..235a3e5e962a 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -72,20 +72,22 @@ static unsigned zfs_vdev_open_timeout_ms = 1000; */ #define EFI_MIN_RESV_SIZE (16 * 1024) -/* - * Virtual device vector for disks. 
- */ -typedef struct dio_request { - zio_t *dr_zio; /* Parent ZIO */ - atomic_t dr_ref; /* References */ - int dr_error; /* Bio error */ - int dr_bio_count; /* Count of bio's */ - struct bio *dr_bio[0]; /* Attached bio's */ -} dio_request_t; - +#ifdef HAVE_BLK_MODE_T +static blk_mode_t +#else static fmode_t +#endif vdev_bdev_mode(spa_mode_t spa_mode) { +#ifdef HAVE_BLK_MODE_T + blk_mode_t mode = 0; + + if (spa_mode & SPA_MODE_READ) + mode |= BLK_OPEN_READ; + + if (spa_mode & SPA_MODE_WRITE) + mode |= BLK_OPEN_WRITE; +#else fmode_t mode = 0; if (spa_mode & SPA_MODE_READ) @@ -93,6 +95,7 @@ vdev_bdev_mode(spa_mode_t spa_mode) if (spa_mode & SPA_MODE_WRITE) mode |= FMODE_WRITE; +#endif return (mode); } @@ -365,88 +368,6 @@ vdev_disk_close(vdev_t *v) v->vdev_tsd = NULL; } -static dio_request_t * -vdev_disk_dio_alloc(int bio_count) -{ - dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + - sizeof (struct bio *) * bio_count, KM_SLEEP); - atomic_set(&dr->dr_ref, 0); - dr->dr_bio_count = bio_count; - dr->dr_error = 0; - - for (int i = 0; i < dr->dr_bio_count; i++) - dr->dr_bio[i] = NULL; - - return (dr); -} - -static void -vdev_disk_dio_free(dio_request_t *dr) -{ - int i; - - for (i = 0; i < dr->dr_bio_count; i++) - if (dr->dr_bio[i]) - bio_put(dr->dr_bio[i]); - - kmem_free(dr, sizeof (dio_request_t) + - sizeof (struct bio *) * dr->dr_bio_count); -} - -static void -vdev_disk_dio_get(dio_request_t *dr) -{ - atomic_inc(&dr->dr_ref); -} - -static int -vdev_disk_dio_put(dio_request_t *dr) -{ - int rc = atomic_dec_return(&dr->dr_ref); - - /* - * Free the dio_request when the last reference is dropped and - * ensure zio_interpret is called only once with the correct zio - */ - if (rc == 0) { - zio_t *zio = dr->dr_zio; - int error = dr->dr_error; - - vdev_disk_dio_free(dr); - - if (zio) { - zio->io_error = error; - ASSERT3S(zio->io_error, >=, 0); - if (zio->io_error) - vdev_disk_error(zio); - - zio_delay_interrupt(zio); - } - } - - return (rc); -} - -BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) -{ - dio_request_t *dr = bio->bi_private; - int rc; - - if (dr->dr_error == 0) { -#ifdef HAVE_1ARG_BIO_END_IO_T - dr->dr_error = BIO_END_IO_ERROR(bio); -#else - if (error) - dr->dr_error = -(error); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - dr->dr_error = EIO; -#endif - } - - /* Drop reference acquired by __vdev_disk_physio */ - rc = vdev_disk_dio_put(dr); -} - static inline void vdev_submit_bio_impl(struct bio *bio) { @@ -598,23 +519,120 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, return (bio); } +/* ========== */ + +/* + * This is the classic, battle-tested BIO submission code. + * + * These functions have been renamed to vdev_classic_* to make it clear what + * they belong to, but their implementations are unchanged. + */ + +/* + * Virtual device vector for disks. 
+ */ +typedef struct dio_request { + zio_t *dr_zio; /* Parent ZIO */ + atomic_t dr_ref; /* References */ + int dr_error; /* Bio error */ + int dr_bio_count; /* Count of bio's */ + struct bio *dr_bio[]; /* Attached bio's */ +} dio_request_t; + +static dio_request_t * +vdev_classic_dio_alloc(int bio_count) +{ + dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + + sizeof (struct bio *) * bio_count, KM_SLEEP); + atomic_set(&dr->dr_ref, 0); + dr->dr_bio_count = bio_count; + dr->dr_error = 0; + + for (int i = 0; i < dr->dr_bio_count; i++) + dr->dr_bio[i] = NULL; + + return (dr); +} + +static void +vdev_classic_dio_free(dio_request_t *dr) +{ + int i; + + for (i = 0; i < dr->dr_bio_count; i++) + if (dr->dr_bio[i]) + bio_put(dr->dr_bio[i]); + + kmem_free(dr, sizeof (dio_request_t) + + sizeof (struct bio *) * dr->dr_bio_count); +} + +static void +vdev_classic_dio_get(dio_request_t *dr) +{ + atomic_inc(&dr->dr_ref); +} + +static void +vdev_classic_dio_put(dio_request_t *dr) +{ + int rc = atomic_dec_return(&dr->dr_ref); + + /* + * Free the dio_request when the last reference is dropped and + * ensure zio_interpret is called only once with the correct zio + */ + if (rc == 0) { + zio_t *zio = dr->dr_zio; + int error = dr->dr_error; + + vdev_classic_dio_free(dr); + + if (zio) { + zio->io_error = error; + ASSERT3S(zio->io_error, >=, 0); + if (zio->io_error) + vdev_disk_error(zio); + + zio_delay_interrupt(zio); + } + } +} + +BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error) +{ + dio_request_t *dr = bio->bi_private; + + if (dr->dr_error == 0) { +#ifdef HAVE_1ARG_BIO_END_IO_T + dr->dr_error = BIO_END_IO_ERROR(bio); +#else + if (error) + dr->dr_error = -(error); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + dr->dr_error = EIO; +#endif + } + + /* Drop reference acquired by vdev_classic_physio */ + vdev_classic_dio_put(dr); +} + static inline unsigned int -vdev_bio_max_segs(struct block_device *bdev) { - const unsigned long tune_max_segs = - vdev_disk_max_segs > 0 ? vdev_disk_max_segs : ULONG_MAX; - const unsigned long dev_max_segs = - queue_max_segments(bdev_get_queue(bdev)); - const unsigned long max_segs = MIN(tune_max_segs, dev_max_segs); +vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) +{ + unsigned long nr_segs = abd_nr_pages_off(zio->io_abd, + bio_size, abd_offset); #ifdef HAVE_BIO_MAX_SEGS - return (bio_max_segs(max_segs)); + return (bio_max_segs(nr_segs)); #else - return (MIN(max_segs, BIO_MAX_PAGES)); + return (MIN(nr_segs, BIO_MAX_PAGES)); #endif } static int -__vdev_disk_physio(struct block_device *bdev, zio_t *zio, +vdev_classic_physio(struct block_device *bdev, zio_t *zio, size_t io_size, uint64_t io_offset, int rw, int flags) { dio_request_t *dr; @@ -637,7 +655,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, } retry: - dr = vdev_disk_dio_alloc(bio_count); + dr = vdev_classic_dio_alloc(bio_count); if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) bio_set_flags_failfast(bdev, &flags); @@ -669,23 +687,23 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, * this should be rare - see the comment above. 
*/ if (dr->dr_bio_count == i) { - vdev_disk_dio_free(dr); + vdev_classic_dio_free(dr); bio_count *= 2; goto retry; } - nr_vecs = vdev_bio_max_segs(bdev); + nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset); dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs); if (unlikely(dr->dr_bio[i] == NULL)) { - vdev_disk_dio_free(dr); + vdev_classic_dio_free(dr); return (SET_ERROR(ENOMEM)); } - /* Matching put called by vdev_disk_physio_completion */ - vdev_disk_dio_get(dr); + /* Matching put called by vdev_classic_physio_completion */ + vdev_classic_dio_get(dr); BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; - dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; + dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion; dr->dr_bio[i]->bi_private = dr; bio_set_op_attrs(dr->dr_bio[i], rw, flags); @@ -707,7 +725,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, } /* Extra reference to protect dio_request during vdev_submit_bio */ - vdev_disk_dio_get(dr); + vdev_classic_dio_get(dr); if (dr->dr_bio_count > 1) blk_start_plug(&plug); @@ -721,11 +739,13 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, if (dr->dr_bio_count > 1) blk_finish_plug(&plug); - (void) vdev_disk_dio_put(dr); + (void) vdev_classic_dio_put(dr); return (error); } +/* ========== */ + BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) { zio_t *zio = bio->bi_private; @@ -901,7 +921,7 @@ vdev_disk_io_start(zio_t *zio) } zio->io_target_timestamp = zio_handle_io_delay(zio); - error = __vdev_disk_physio(vd->vd_bdev, zio, + error = vdev_classic_physio(vd->vd_bdev, zio, zio->io_size, zio->io_offset, rw, 0); rw_exit(&vd->vd_lock); @@ -1048,6 +1068,3 @@ param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp) ZFS_MODULE_PARAM(zfs_zio, zio_, suppress_zero_writes, INT, ZMOD_RW, "Do not send zero byte writes to hardware"); - -ZFS_MODULE_PARAM(zfs_vdev_disk, vdev_disk_, max_segs, ULONG, ZMOD_RW, - "Maximum number of data segments to add to an IO request"); From 1feddf1ed6d3af9243dd57775c82f3b075a9d006 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 9 Jan 2024 12:23:30 +1100 Subject: [PATCH 05/32] vdev_disk: reorganise vdev_disk_io_start Light reshuffle to make it a bit more linear to read and get rid of a bunch of args that aren't needed in all cases. Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. (cherry picked from commit ad847ff6acb77fbba0f3ab2e864784225fd41007) --- module/os/linux/zfs/vdev_disk.c | 51 ++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 235a3e5e962a..0b8ff74920b8 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -632,9 +632,16 @@ vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) } static int -vdev_classic_physio(struct block_device *bdev, zio_t *zio, - size_t io_size, uint64_t io_offset, int rw, int flags) +vdev_classic_physio(zio_t *zio) { + vdev_t *v = zio->io_vd; + vdev_disk_t *vd = v->vdev_tsd; + struct block_device *bdev = vd->vd_bdev; + size_t io_size = zio->io_size; + uint64_t io_offset = zio->io_offset; + int rw = zio->io_type == ZIO_TYPE_READ ? 
READ : WRITE; + int flags = 0; + dio_request_t *dr; uint64_t abd_offset; uint64_t bio_offset; @@ -820,7 +827,7 @@ vdev_disk_io_start(zio_t *zio) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; - int rw, error; + int error; /* * If the vdev is closed, it's likely in the REMOVED or FAULTED state. @@ -899,13 +906,6 @@ vdev_disk_io_start(zio_t *zio) rw_exit(&vd->vd_lock); zio_execute(zio); return; - case ZIO_TYPE_WRITE: - rw = WRITE; - break; - - case ZIO_TYPE_READ: - rw = READ; - break; case ZIO_TYPE_TRIM: zio->io_error = vdev_disk_io_trim(zio); @@ -913,23 +913,34 @@ vdev_disk_io_start(zio_t *zio) zio_interrupt(zio); return; - default: + case ZIO_TYPE_READ: + case ZIO_TYPE_WRITE: + zio->io_target_timestamp = zio_handle_io_delay(zio); + error = vdev_classic_physio(zio); rw_exit(&vd->vd_lock); - zio->io_error = SET_ERROR(ENOTSUP); - zio_interrupt(zio); + if (error) { + zio->io_error = error; + zio_interrupt(zio); + } return; - } - zio->io_target_timestamp = zio_handle_io_delay(zio); - error = vdev_classic_physio(vd->vd_bdev, zio, - zio->io_size, zio->io_offset, rw, 0); - rw_exit(&vd->vd_lock); + default: + /* + * Getting here means our parent vdev has made a very strange + * request of us, and shouldn't happen. Assert here to force a + * crash in dev builds, but in production return the IO + * unhandled. The pool will likely suspend anyway but that's + * nicer than crashing the kernel. + */ + ASSERT3S(zio->io_type, ==, -1); - if (error) { - zio->io_error = error; + rw_exit(&vd->vd_lock); + zio->io_error = SET_ERROR(ENOTSUP); zio_interrupt(zio); return; } + + __builtin_unreachable(); } static void From 21f680878000dcf1311c6ff3396715af5b784f48 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 9 Jan 2024 12:29:19 +1100 Subject: [PATCH 06/32] vdev_disk: make read/write IO function configurable This is just setting up for the next couple of commits, which will add a new IO function and a parameter to select it. Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. (cherry picked from commit 7ee83696cffac172eea89844ccc5e6b6899781ac) --- module/os/linux/zfs/vdev_disk.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 0b8ff74920b8..9dfb2653d2a1 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -822,6 +822,8 @@ vdev_disk_io_trim(zio_t *zio) #endif } +int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL; + static void vdev_disk_io_start(zio_t *zio) { @@ -916,7 +918,7 @@ vdev_disk_io_start(zio_t *zio) case ZIO_TYPE_READ: case ZIO_TYPE_WRITE: zio->io_target_timestamp = zio_handle_io_delay(zio); - error = vdev_classic_physio(zio); + error = vdev_disk_io_rw_fn(zio); rw_exit(&vd->vd_lock); if (error) { zio->io_error = error; @@ -989,8 +991,25 @@ vdev_disk_rele(vdev_t *vd) /* XXX: Implement me as a vnode rele for the device */ } +/* + * At first use vdev use, set the submission function from the default value if + * it hasn't been set already. 
+ */ +static int +vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) +{ + (void) spa; + (void) nv; + (void) tsd; + + if (vdev_disk_io_rw_fn == NULL) + vdev_disk_io_rw_fn = vdev_classic_physio; + + return (0); +} + vdev_ops_t vdev_disk_ops = { - .vdev_op_init = NULL, + .vdev_op_init = vdev_disk_init, .vdev_op_fini = NULL, .vdev_op_open = vdev_disk_open, .vdev_op_close = vdev_disk_close, From 1b16b90ae14bf096126e840765a666dcdb2d38a1 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 18 Jul 2023 11:11:29 +1000 Subject: [PATCH 07/32] vdev_disk: rewrite BIO filling machinery to avoid split pages This commit tackles a number of issues in the way BIOs (`struct bio`) are constructed for submission to the Linux block layer. ### BIO segment limits are set incorrectly The kernel has a hard upper limit on the number of pages/segments that can be added to a BIO, as well as a separate limit for each device (related to its queue depth and other scheduling characteristics). ZFS counts the number of memory pages in the request ABD (`abd_nr_pages_off()`, and then uses that as the number of segments to put into the BIO, up to the hard upper limit. If it requires more than the limit, it will create multiple BIOs. Leaving aside the fact that page count method is wrong (see below), not limiting to the device segment max means that the device driver will need to split the BIO in half. This is alone is not necessarily a problem, but it interacts with another issue to cause a much larger problem. ### BIOs are filled inefficiently The kernel function to add a segment to a BIO (`bio_add_page()`) takes a `struct page` pointer, and offset+len within it. `struct page` can represent a run of contiguous memory pages (known as a "compound page"). In can be of arbitrary length. The ZFS functions that count ABD pages and load them into the BIO (`abd_nr_pages_off()`, `bio_map()` and `abd_bio_map_off()`) will never consider a page to be more than `PAGE_SIZE` (4K), even if the `struct page` is for multiple pages. In this case, it will load the same `struct page` into the BIO multiple times, with the offset adjusted each time. With a sufficiently large ABD, this can easily lead to the BIO being entirely filled much earlier than it could have been. This is also further contributes to the problem caused by the incorrect segment limit calculation, as its much easier to go past the device limit, and so require a split. Again, this is not a problem on its own. ### Incomplete pages are submitted to BIOs The logic for "never submit more than `PAGE_SIZE`" is actually a little more subtle. It will actually never submit a buffer that crosses a 4K page boundary. In practice, this is fine, as most ABDs are scattered, that is a list of complete 4K pages, and so are loaded in as such. Linear ABDs are typically allocated from slabs, and for small sizes they are frequently not aligned to page boundaries. For example, a 12K allocation can span four pages, eg: -- 4K -- -- 4K -- -- 4K -- -- 4K -- | | | | | :## ######## ######## ######: [1K, 4K, 4K, 3K] Such an allocation would be loaded into a BIO as you see: [1K, 4K, 4K, 3K] This tends not to be a problem in practice, because even if the BIO were filled and needed to be split, each half would still have either a start or end aligned to the logical block size of the device (assuming 4K at least). --- In ideal circumstances, these shortcomings don't cause any particular problems. Its when they start to interact with other ZFS features that things get interesting. 
### Aggregation Aggregation will create a "gang" ABD, which is simply a list of other ABDs. Iterating over a gang ABD is just iterating over each ABD within it in turn. Because the segments are simply loaded in order, we can end up with uneven segments either side of the "gap" between the two ABDs. For example, two 12K ABDs might be aggregated and then loaded as: [1K, 4K, 4K, 3K, 2K, 4K, 4K, 2K] Should a split occur, each individual BIO can end up either having an start or end offset that is not aligned to the logical block size, which some drivers (eg SCSI) will reject. However, this tends not to happen because the default aggregation limit usually keeps the BIO small enough to not require more than one split, and most pages are actually full 4K pages, so hitting an uneven gap is very rare anyway. ### Gang blocks If the pool is under particular memory pressure, then an IO can be broken down into a "gang block", a 512-byte block composed of a header and up to three block pointers. Each points to a fragment of the original write, or in turn, another gang block, breaking the original data up over and over until space can be found in the pool for each of them. Each gang header is a separate 512-byte memory allocation from a slab, that needs to be written down to disk. When the gang header is added to the BIO, its a single 512-byte segment. ### Aggregation with gang blocks Pulling all this together, consider a large aggregated write of gang blocks. This results a BIO containing lots of 512-byte segments. Given our tendency to overfill the BIO, a split is likely, and most possible split points will yield a pair of BIOs that are misaligned. Drivers that care, like the SCSI driver, will reject them. --- This commit is a substantial refactor and rewrite of much of `vdev_disk` to sort all this out. ### Configure maximum segment size for device `vdev_bio_max_segs()` now returns the ideal maximum size for the device, if available. There's also a tuneable `zfs_vdev_disk_max_segs` to override this, to assist with testing. ### ABDs checked up front for page count and alignment We scan the ABD up front to count the number of pages within it, and to confirm that if we submitted all those pages to one or more BIOs, it could be split at any point with creating a misaligned BIO. Along the way, we determine how many BIO segments we'll need to handle the entire ABD, accounting for BIO fill limits (including segment and byte limits). If the pages in the BIO are not usable (as in any of the above situations), the ABD is linearised, and then checked again. This is the same technique used in `vdev_geom` on FreeBSD, adjusted for Linux's variable page size and allocator quirks. In the end, a count of segments is produced, which is then used to determine how many BIOs will be allocated. ### Virtual block IO object `vbio_t` is a cleanup and enhancement of the old `dio_request_t`. The idea is simply that it can hold all the state needed to create, submit and return multiple BIOs, including all the refcounts, the ABD copy if it was needed, and so on. Apart from what I hope is a clearer interface, the major difference is that because we know how many BIOs we'll need up front, we don't need the old overflow logic that would grow the BIO array, throw away all the old work and restart. We can get it right from the start. Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
(cherry picked from commit 588a6a2d38f20cd6e0d458042feda1831b302207) --- man/man4/zfs.4 | 10 +- module/os/linux/zfs/vdev_disk.c | 438 +++++++++++++++++++++++++++++++- 2 files changed, 442 insertions(+), 6 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index ea48b54587cf..d840ae9a1625 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2,6 +2,7 @@ .\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. .\" Copyright (c) 2019, 2021 by Delphix. All rights reserved. .\" Copyright (c) 2019 Datto Inc. +.\" Copyright (c) 2023, 2024 Klara, Inc. .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at @@ -15,7 +16,7 @@ .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" -.Dd January 10, 2023 +.Dd January 9, 2024 .Dt ZFS 4 .Os . @@ -1305,6 +1306,13 @@ as fuller devices will tend to be slower than empty devices. Also see .Sy zio_dva_throttle_enabled . . +.It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint +Maximum number of segments to add to a BIO (min 4). +If this is higher than the maximum allowed by the device queue or the kernel +itself, it will be clamped. +Setting it to zero will cause the kernel's ideal size to be used. +This parameter only applies on Linux. +. .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int Time before expiring .Pa .zfs/snapshot . diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 9dfb2653d2a1..5902904918fb 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -24,6 +24,7 @@ * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. */ #include @@ -49,11 +50,11 @@ typedef struct vdev_disk { int zio_suppress_zero_writes = B_TRUE; /* - * Maximum number of segments to add to a bio. If this is higher than the - * maximum allowed by the device queue or the kernel itself, it will be + * Maximum number of segments to add to a bio (min 4). If this is higher than + * the maximum allowed by the device queue or the kernel itself, it will be * clamped. Setting it to zero will cause the kernel's ideal size to be used. */ -unsigned long vdev_disk_max_segs = 0; +uint_t zfs_vdev_disk_max_segs = 0; /* * Unique identifier for the exclusive vdev holder. @@ -519,10 +520,430 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, return (bio); } +static inline uint_t +vdev_bio_max_segs(struct block_device *bdev) +{ + /* + * Smallest of the device max segs and the tuneable max segs. Minimum + * 4, so there's room to finish split pages if they come up. + */ + const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev)); + const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ? + MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs; + const uint_t max_segs = MIN(tune_max_segs, dev_max_segs); + +#ifdef HAVE_BIO_MAX_SEGS + return (bio_max_segs(max_segs)); +#else + return (MIN(max_segs, BIO_MAX_PAGES)); +#endif +} + +static inline uint_t +vdev_bio_max_bytes(struct block_device *bdev) +{ + return (queue_max_sectors(bdev_get_queue(bdev)) << 9); +} + + +/* + * Virtual block IO object (VBIO) + * + * Linux block IO (BIO) objects have a limit on how many data segments (pages) + * they can hold. 
Depending on how they're allocated and structured, a large + * ZIO can require more than one BIO to be submitted to the kernel, which then + * all have to complete before we can return the completed ZIO back to ZFS. + * + * A VBIO is a wrapper around multiple BIOs, carrying everything needed to + * translate a ZIO down into the kernel block layer and back again. + * + * Note that these are only used for data ZIOs (read/write). Meta-operations + * (flush/trim) don't need multiple BIOs and so can just make the call + * directly. + */ +typedef struct { + zio_t *vbio_zio; /* parent zio */ + + struct block_device *vbio_bdev; /* blockdev to submit bios to */ + + abd_t *vbio_abd; /* abd carrying borrowed linear buf */ + + atomic_t vbio_ref; /* bio refcount */ + int vbio_error; /* error from failed bio */ + + uint_t vbio_max_segs; /* max segs per bio */ + + uint_t vbio_max_bytes; /* max bytes per bio */ + uint_t vbio_lbs_mask; /* logical block size mask */ + + uint64_t vbio_offset; /* start offset of next bio */ + + struct bio *vbio_bio; /* pointer to the current bio */ + struct bio *vbio_bios; /* list of all bios */ +} vbio_t; + +static vbio_t * +vbio_alloc(zio_t *zio, struct block_device *bdev) +{ + vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP); + + vbio->vbio_zio = zio; + vbio->vbio_bdev = bdev; + atomic_set(&vbio->vbio_ref, 0); + vbio->vbio_max_segs = vdev_bio_max_segs(bdev); + vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev); + vbio->vbio_lbs_mask = bdev_logical_block_size(bdev)-1; + vbio->vbio_offset = zio->io_offset; + + return (vbio); +} + +static int +vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) +{ + struct bio *bio; + uint_t ssize; + + while (size > 0) { + bio = vbio->vbio_bio; + if (bio == NULL) { + /* New BIO, allocate and set up */ + bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO, + vbio->vbio_max_segs); + if (unlikely(bio == NULL)) + return (SET_ERROR(ENOMEM)); + BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9; + + bio->bi_next = vbio->vbio_bios; + vbio->vbio_bios = vbio->vbio_bio = bio; + } + + /* + * Only load as much of the current page data as will fit in + * the space left in the BIO, respecting lbs alignment. Older + * kernels will error if we try to overfill the BIO, while + * newer ones will accept it and split the BIO. This ensures + * everything works on older kernels, and avoids an additional + * overhead on the new. + */ + ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) & + ~(vbio->vbio_lbs_mask)); + if (ssize > 0 && + bio_add_page(bio, page, ssize, offset) == ssize) { + /* Accepted, adjust and load any remaining. */ + size -= ssize; + offset += ssize; + continue; + } + + /* No room, set up for a new BIO and loop */ + vbio->vbio_offset += BIO_BI_SIZE(bio); + + /* Signal new BIO allocation wanted */ + vbio->vbio_bio = NULL; + } + + return (0); +} + +BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error); +static void vbio_put(vbio_t *vbio); + +static void +vbio_submit(vbio_t *vbio, int flags) +{ + ASSERT(vbio->vbio_bios); + struct bio *bio = vbio->vbio_bios; + vbio->vbio_bio = vbio->vbio_bios = NULL; + + /* + * We take a reference for each BIO as we submit it, plus one to + * protect us from BIOs completing before we're done submitting them + * all, causing vbio_put() to free vbio out from under us and/or the + * zio to be returned before all its IO has completed. + */ + atomic_set(&vbio->vbio_ref, 1); + + /* + * If we're submitting more than one BIO, inform the block layer so + * it can batch them if it wants. 
+ */ + struct blk_plug plug; + boolean_t do_plug = (bio->bi_next != NULL); + if (do_plug) + blk_start_plug(&plug); + + /* Submit all the BIOs */ + while (bio != NULL) { + atomic_inc(&vbio->vbio_ref); + + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + + bio->bi_end_io = vdev_disk_io_rw_completion; + bio->bi_private = vbio; + bio_set_op_attrs(bio, + vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ? + WRITE : READ, flags); + + vdev_submit_bio(bio); + + bio = next; + } + + /* Finish the batch */ + if (do_plug) + blk_finish_plug(&plug); + + /* Release the extra reference */ + vbio_put(vbio); +} + +static void +vbio_return_abd(vbio_t *vbio) +{ + zio_t *zio = vbio->vbio_zio; + if (vbio->vbio_abd == NULL) + return; + + /* + * If we copied the ABD before issuing it, clean up and return the copy + * to the ADB, with changes if appropriate. + */ + void *buf = abd_to_buf(vbio->vbio_abd); + abd_free(vbio->vbio_abd); + vbio->vbio_abd = NULL; + + if (zio->io_type == ZIO_TYPE_READ) + abd_return_buf_copy(zio->io_abd, buf, zio->io_size); + else + abd_return_buf(zio->io_abd, buf, zio->io_size); +} + +static void +vbio_free(vbio_t *vbio) +{ + VERIFY0(atomic_read(&vbio->vbio_ref)); + + vbio_return_abd(vbio); + + kmem_free(vbio, sizeof (vbio_t)); +} + +static void +vbio_put(vbio_t *vbio) +{ + if (atomic_dec_return(&vbio->vbio_ref) > 0) + return; + + /* + * This was the last reference, so the entire IO is completed. Clean + * up and submit it for processing. + */ + + /* + * Get any data buf back to the original ABD, if necessary. We do this + * now so we can get the ZIO into the pipeline as quickly as possible, + * and then do the remaining cleanup after. + */ + vbio_return_abd(vbio); + + zio_t *zio = vbio->vbio_zio; + + /* + * Set the overall error. If multiple BIOs returned an error, only the + * first will be taken; the others are dropped (see + * vdev_disk_io_rw_completion()). Its pretty much impossible for + * multiple IOs to the same device to fail with different errors, so + * there's no real risk. + */ + zio->io_error = vbio->vbio_error; + if (zio->io_error) + vdev_disk_error(zio); + + /* All done, submit for processing */ + zio_delay_interrupt(zio); + + /* Finish cleanup */ + vbio_free(vbio); +} + +BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error) +{ + vbio_t *vbio = bio->bi_private; + + if (vbio->vbio_error == 0) { +#ifdef HAVE_1ARG_BIO_END_IO_T + vbio->vbio_error = BIO_END_IO_ERROR(bio); +#else + if (error) + vbio->vbio_error = -(error); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + vbio->vbio_error = EIO; +#endif + } + + /* + * Destroy the BIO. This is safe to do; the vbio owns its data and the + * kernel won't touch it again after the completion function runs. + */ + bio_put(bio); + + /* Drop this BIOs reference acquired by vbio_submit() */ + vbio_put(vbio); +} + +/* + * Iterator callback to count ABD pages and check their size & alignment. + * + * On Linux, each BIO segment can take a page pointer, and an offset+length of + * the data within that page. A page can be arbitrarily large ("compound" + * pages) but we still have to ensure the data portion is correctly sized and + * aligned to the logical block size, to ensure that if the kernel wants to + * split the BIO, the two halves will still be properly aligned. 
+ */ +typedef struct { + uint_t bmask; + uint_t npages; + uint_t end; +} vdev_disk_check_pages_t; + +static int +vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) +{ + vdev_disk_check_pages_t *s = priv; + + /* + * If we didn't finish on a block size boundary last time, then there + * would be a gap if we tried to use this ABD as-is, so abort. + */ + if (s->end != 0) + return (1); + + /* + * Note if we're taking less than a full block, so we can check it + * above on the next call. + */ + s->end = len & s->bmask; + + /* All blocks after the first must start on a block size boundary. */ + if (s->npages != 0 && (off & s->bmask) != 0) + return (1); + + s->npages++; + return (0); +} + +/* + * Check if we can submit the pages in this ABD to the kernel as-is. Returns + * the number of pages, or 0 if it can't be submitted like this. + */ +static boolean_t +vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev) +{ + vdev_disk_check_pages_t s = { + .bmask = bdev_logical_block_size(bdev)-1, + .npages = 0, + .end = 0, + }; + + if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s)) + return (B_FALSE); + + return (B_TRUE); +} + +/* Iterator callback to submit ABD pages to the vbio. */ +static int +vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv) +{ + vbio_t *vbio = priv; + return (vbio_add_page(vbio, page, len, off)); +} + +static int +vdev_disk_io_rw(zio_t *zio) +{ + vdev_t *v = zio->io_vd; + vdev_disk_t *vd = v->vdev_tsd; + struct block_device *bdev = vd->vd_bdev; + int flags = 0; + + /* + * Accessing outside the block device is never allowed. + */ + if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) { + vdev_dbgmsg(zio->io_vd, + "Illegal access %llu size %llu, device size %llu", + (u_longlong_t)zio->io_offset, + (u_longlong_t)zio->io_size, + (u_longlong_t)i_size_read(bdev->bd_inode)); + return (SET_ERROR(EIO)); + } + + if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) + bio_set_flags_failfast(bdev, &flags); + + /* + * Check alignment of the incoming ABD. If any part of it would require + * submitting a page that is not aligned to the logical block size, + * then we take a copy into a linear buffer and submit that instead. + * This should be impossible on a 512b LBS, and fairly rare on 4K, + * usually requiring abnormally-small data blocks (eg gang blocks) + * mixed into the same ABD as larger ones (eg aggregated). + */ + abd_t *abd = zio->io_abd; + if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) { + void *buf; + if (zio->io_type == ZIO_TYPE_READ) + buf = abd_borrow_buf(zio->io_abd, zio->io_size); + else + buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); + + /* + * Wrap the copy in an abd_t, so we can use the same iterators + * to count and fill the vbio later. + */ + abd = abd_get_from_buf(buf, zio->io_size); + + /* + * False here would mean the borrowed copy has an invalid + * alignment too, which would mean we've somehow been passed a + * linear ABD with an interior page that has a non-zero offset + * or a size not a multiple of PAGE_SIZE. This is not possible. + * It would mean either zio_buf_alloc() or its underlying + * allocators have done something extremely strange, or our + * math in vdev_disk_check_pages() is wrong. In either case, + * something in seriously wrong and its not safe to continue. 
+ */ + VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev)); + } + + /* Allocate vbio, with a pointer to the borrowed ABD if necessary */ + int error = 0; + vbio_t *vbio = vbio_alloc(zio, bdev); + if (abd != zio->io_abd) + vbio->vbio_abd = abd; + + /* Fill it with pages */ + error = abd_iterate_page_func(abd, 0, zio->io_size, + vdev_disk_fill_vbio_cb, vbio); + if (error != 0) { + vbio_free(vbio); + return (error); + } + + vbio_submit(vbio, flags); + return (0); +} + /* ========== */ /* - * This is the classic, battle-tested BIO submission code. + * This is the classic, battle-tested BIO submission code. Until we're totally + * sure that the new code is safe and correct in all cases, this will remain + * available and can be enabled by setting zfs_vdev_disk_classic=1 at module + * load time. * * These functions have been renamed to vdev_classic_* to make it clear what * they belong to, but their implementations are unchanged. @@ -1003,7 +1424,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) (void) tsd; if (vdev_disk_io_rw_fn == NULL) - vdev_disk_io_rw_fn = vdev_classic_physio; + /* XXX make configurable */ + vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw; return (0); } @@ -1098,3 +1520,9 @@ param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp) ZFS_MODULE_PARAM(zfs_zio, zio_, suppress_zero_writes, INT, ZMOD_RW, "Do not send zero byte writes to hardware"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW, + "Timeout before determining that a device is missing"); + +ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW, + "Maximum number of data segments to add to an IO request (min 4)"); From ba6d0fb06039a75cbadef4a08cd42a259c54b8ef Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 9 Jan 2024 13:28:57 +1100 Subject: [PATCH 08/32] vdev_disk: add module parameter to select BIO submission method This makes the submission method selectable at module load time via the `zfs_vdev_disk_classic` parameter, allowing this change to be backported to 2.2 safely, and disabled in favour of the "classic" submission method if new problems come up. Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. (cherry picked from commit 2382fdb0a83a5a3c6cf3860695d3f29281773170) --- man/man4/zfs.4 | 16 ++++++++++++++++ module/os/linux/zfs/vdev_disk.c | 31 +++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index d840ae9a1625..55580c937681 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1312,6 +1312,22 @@ If this is higher than the maximum allowed by the device queue or the kernel itself, it will be clamped. Setting it to zero will cause the kernel's ideal size to be used. This parameter only applies on Linux. +This parameter is ignored if +.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 . +. +.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint +If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2 +and earlier. +This "classic" method has known issues with highly fragmented IO requests and +is slower on many workloads, but it has been in use for many years and is known +to be very stable. +If you set this parameter, please also open a bug report why you did so, +including the workload involved and any error messages. +.Pp +This parameter and the classic submission method will be removed once we have +total confidence in the new method. 
+.Pp +This parameter only applies on Linux, and can only be set at module load time. . .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int Time before expiring diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 5902904918fb..808b561b9999 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -1412,6 +1412,29 @@ vdev_disk_rele(vdev_t *vd) /* XXX: Implement me as a vnode rele for the device */ } +/* + * BIO submission method. See comment above about vdev_classic. + * Set zfs_vdev_disk_classic=0 for new, =1 for classic + */ +static uint_t zfs_vdev_disk_classic = 0; /* default new */ + +/* Set submission function from module parameter */ +static int +vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp) +{ + int err = param_set_uint(buf, kp); + if (err < 0) + return (SET_ERROR(err)); + + vdev_disk_io_rw_fn = + zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw; + + printk(KERN_INFO "ZFS: forcing %s BIO submission\n", + zfs_vdev_disk_classic ? "classic" : "new"); + + return (0); +} + /* * At first use vdev use, set the submission function from the default value if * it hasn't been set already. @@ -1424,8 +1447,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) (void) tsd; if (vdev_disk_io_rw_fn == NULL) - /* XXX make configurable */ - vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw; + vdev_disk_io_rw_fn = zfs_vdev_disk_classic ? + vdev_classic_physio : vdev_disk_io_rw; return (0); } @@ -1526,3 +1549,7 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW, "Maximum number of data segments to add to an IO request (min 4)"); + +ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic, + vdev_disk_param_set_classic, param_get_uint, ZMOD_RD, + "Use classic BIO submission method"); From ce5719554f7fcb16f927d8f772d5104a2dea62bd Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 21 Feb 2024 11:07:21 +1100 Subject: [PATCH 09/32] vdev_disk: use bio_chain() to submit multiple BIOs Simplifies our code a lot, so we don't have to wait for each and reassemble them. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Closes #15533 Closes #15588 (cherry picked from commit 72fd834c47558cb10d847948d1a4615e894c77c3) --- module/os/linux/zfs/vdev_disk.c | 231 +++++++++++--------------------- 1 file changed, 80 insertions(+), 151 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 808b561b9999..ebf656a9b5ac 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -359,10 +359,9 @@ vdev_disk_close(vdev_t *v) if (v->vdev_reopening || vd == NULL) return; - if (vd->vd_bdev != NULL) { + if (vd->vd_bdev != NULL) blkdev_put(vd->vd_bdev, vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL); - } rw_destroy(&vd->vd_lock); kmem_free(vd, sizeof (vdev_disk_t)); @@ -568,9 +567,6 @@ typedef struct { abd_t *vbio_abd; /* abd carrying borrowed linear buf */ - atomic_t vbio_ref; /* bio refcount */ - int vbio_error; /* error from failed bio */ - uint_t vbio_max_segs; /* max segs per bio */ uint_t vbio_max_bytes; /* max bytes per bio */ @@ -579,43 +575,52 @@ typedef struct { uint64_t vbio_offset; /* start offset of next bio */ struct bio *vbio_bio; /* pointer to the current bio */ - struct bio *vbio_bios; /* list of all bios */ + int vbio_flags; /* bio flags */ } vbio_t; static vbio_t * -vbio_alloc(zio_t *zio, struct block_device *bdev) +vbio_alloc(zio_t *zio, struct block_device *bdev, int flags) { vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP); vbio->vbio_zio = zio; vbio->vbio_bdev = bdev; - atomic_set(&vbio->vbio_ref, 0); + vbio->vbio_abd = NULL; vbio->vbio_max_segs = vdev_bio_max_segs(bdev); vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev); vbio->vbio_lbs_mask = bdev_logical_block_size(bdev)-1; vbio->vbio_offset = zio->io_offset; + vbio->vbio_bio = NULL; + vbio->vbio_flags = flags; return (vbio); } +BIO_END_IO_PROTO(vbio_completion, bio, error); + static int vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) { - struct bio *bio; + struct bio *bio = vbio->vbio_bio; uint_t ssize; while (size > 0) { - bio = vbio->vbio_bio; if (bio == NULL) { /* New BIO, allocate and set up */ bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO, vbio->vbio_max_segs); - if (unlikely(bio == NULL)) - return (SET_ERROR(ENOMEM)); + VERIFY(bio); + BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9; + bio_set_op_attrs(bio, + vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ? + WRITE : READ, vbio->vbio_flags); - bio->bi_next = vbio->vbio_bios; - vbio->vbio_bios = vbio->vbio_bio = bio; + if (vbio->vbio_bio) { + bio_chain(vbio->vbio_bio, bio); + vdev_submit_bio(vbio->vbio_bio); + } + vbio->vbio_bio = bio; } /* @@ -640,157 +645,97 @@ vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) vbio->vbio_offset += BIO_BI_SIZE(bio); /* Signal new BIO allocation wanted */ - vbio->vbio_bio = NULL; + bio = NULL; } return (0); } -BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error); -static void vbio_put(vbio_t *vbio); +/* Iterator callback to submit ABD pages to the vbio. 
*/ +static int +vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv) +{ + vbio_t *vbio = priv; + return (vbio_add_page(vbio, page, len, off)); +} +/* Create some BIOs, fill them with data and submit them */ static void -vbio_submit(vbio_t *vbio, int flags) +vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size) { - ASSERT(vbio->vbio_bios); - struct bio *bio = vbio->vbio_bios; - vbio->vbio_bio = vbio->vbio_bios = NULL; - - /* - * We take a reference for each BIO as we submit it, plus one to - * protect us from BIOs completing before we're done submitting them - * all, causing vbio_put() to free vbio out from under us and/or the - * zio to be returned before all its IO has completed. - */ - atomic_set(&vbio->vbio_ref, 1); + ASSERT(vbio->vbio_bdev); /* - * If we're submitting more than one BIO, inform the block layer so - * it can batch them if it wants. + * We plug so we can submit the BIOs as we go and only unplug them when + * they are fully created and submitted. This is important; if we don't + * plug, then the kernel may start executing earlier BIOs while we're + * still creating and executing later ones, and if the device goes + * away while that's happening, older kernels can get confused and + * trample memory. */ struct blk_plug plug; - boolean_t do_plug = (bio->bi_next != NULL); - if (do_plug) - blk_start_plug(&plug); + blk_start_plug(&plug); - /* Submit all the BIOs */ - while (bio != NULL) { - atomic_inc(&vbio->vbio_ref); + (void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio); + ASSERT(vbio->vbio_bio); - struct bio *next = bio->bi_next; - bio->bi_next = NULL; + vbio->vbio_bio->bi_end_io = vbio_completion; + vbio->vbio_bio->bi_private = vbio; - bio->bi_end_io = vdev_disk_io_rw_completion; - bio->bi_private = vbio; - bio_set_op_attrs(bio, - vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ? - WRITE : READ, flags); + vdev_submit_bio(vbio->vbio_bio); - vdev_submit_bio(bio); + blk_finish_plug(&plug); - bio = next; - } - - /* Finish the batch */ - if (do_plug) - blk_finish_plug(&plug); - - /* Release the extra reference */ - vbio_put(vbio); + vbio->vbio_bio = NULL; + vbio->vbio_bdev = NULL; } -static void -vbio_return_abd(vbio_t *vbio) +/* IO completion callback */ +BIO_END_IO_PROTO(vbio_completion, bio, error) { + vbio_t *vbio = bio->bi_private; zio_t *zio = vbio->vbio_zio; - if (vbio->vbio_abd == NULL) - return; - - /* - * If we copied the ABD before issuing it, clean up and return the copy - * to the ADB, with changes if appropriate. - */ - void *buf = abd_to_buf(vbio->vbio_abd); - abd_free(vbio->vbio_abd); - vbio->vbio_abd = NULL; - - if (zio->io_type == ZIO_TYPE_READ) - abd_return_buf_copy(zio->io_abd, buf, zio->io_size); - else - abd_return_buf(zio->io_abd, buf, zio->io_size); -} - -static void -vbio_free(vbio_t *vbio) -{ - VERIFY0(atomic_read(&vbio->vbio_ref)); - vbio_return_abd(vbio); + ASSERT(zio); - kmem_free(vbio, sizeof (vbio_t)); -} + /* Capture and log any errors */ +#ifdef HAVE_1ARG_BIO_END_IO_T + zio->io_error = BIO_END_IO_ERROR(bio); +#else + zio->io_error = 0; + if (error) + zio->io_error = -(error); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + zio->io_error = EIO; +#endif + ASSERT3U(zio->io_error, >=, 0); -static void -vbio_put(vbio_t *vbio) -{ - if (atomic_dec_return(&vbio->vbio_ref) > 0) - return; + if (zio->io_error) + vdev_disk_error(zio); - /* - * This was the last reference, so the entire IO is completed. Clean - * up and submit it for processing. 
- */ + /* Return the BIO to the kernel */ + bio_put(bio); /* - * Get any data buf back to the original ABD, if necessary. We do this - * now so we can get the ZIO into the pipeline as quickly as possible, - * and then do the remaining cleanup after. + * If we copied the ABD before issuing it, clean up and return the copy + * to the ADB, with changes if appropriate. */ - vbio_return_abd(vbio); + if (vbio->vbio_abd != NULL) { + void *buf = abd_to_buf(vbio->vbio_abd); + abd_free(vbio->vbio_abd); + vbio->vbio_abd = NULL; - zio_t *zio = vbio->vbio_zio; + if (zio->io_type == ZIO_TYPE_READ) + abd_return_buf_copy(zio->io_abd, buf, zio->io_size); + else + abd_return_buf(zio->io_abd, buf, zio->io_size); + } - /* - * Set the overall error. If multiple BIOs returned an error, only the - * first will be taken; the others are dropped (see - * vdev_disk_io_rw_completion()). Its pretty much impossible for - * multiple IOs to the same device to fail with different errors, so - * there's no real risk. - */ - zio->io_error = vbio->vbio_error; - if (zio->io_error) - vdev_disk_error(zio); + /* Final cleanup */ + kmem_free(vbio, sizeof (vbio_t)); /* All done, submit for processing */ zio_delay_interrupt(zio); - - /* Finish cleanup */ - vbio_free(vbio); -} - -BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error) -{ - vbio_t *vbio = bio->bi_private; - - if (vbio->vbio_error == 0) { -#ifdef HAVE_1ARG_BIO_END_IO_T - vbio->vbio_error = BIO_END_IO_ERROR(bio); -#else - if (error) - vbio->vbio_error = -(error); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - vbio->vbio_error = EIO; -#endif - } - - /* - * Destroy the BIO. This is safe to do; the vbio owns its data and the - * kernel won't touch it again after the completion function runs. - */ - bio_put(bio); - - /* Drop this BIOs reference acquired by vbio_submit() */ - vbio_put(vbio); } /* @@ -853,14 +798,6 @@ vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev) return (B_TRUE); } -/* Iterator callback to submit ABD pages to the vbio. */ -static int -vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv) -{ - vbio_t *vbio = priv; - return (vbio_add_page(vbio, page, len, off)); -} - static int vdev_disk_io_rw(zio_t *zio) { @@ -920,20 +857,12 @@ vdev_disk_io_rw(zio_t *zio) } /* Allocate vbio, with a pointer to the borrowed ABD if necessary */ - int error = 0; - vbio_t *vbio = vbio_alloc(zio, bdev); + vbio_t *vbio = vbio_alloc(zio, bdev, flags); if (abd != zio->io_abd) vbio->vbio_abd = abd; - /* Fill it with pages */ - error = abd_iterate_page_func(abd, 0, zio->io_size, - vdev_disk_fill_vbio_cb, vbio); - if (error != 0) { - vbio_free(vbio); - return (error); - } - - vbio_submit(vbio, flags); + /* Fill it with data pages and submit it to the kernel */ + vbio_submit(vbio, abd, zio->io_size); return (0); } From 9e1afd0d91a5bf6460127a9ab8d90a2b781445f4 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 14 Mar 2024 10:57:30 +1100 Subject: [PATCH 10/32] abd_iter_page: don't use compound heads on Linux <4.5 Before 4.5 (specifically, torvalds/linux@ddc58f2), head and tail pages in a compound page were refcounted separately. This means that using the head page without taking a reference to it could see it cleaned up later before we're finished with it. Specifically, bio_add_page() would take a reference, and drop its reference after the bio completion callback returns. 
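
(Editorial aside: the guard this patch eventually adds can be sketched in isolation, which may help keep the page relationships straight while reading the failure scenario below. This is an illustrative sketch only, not code from the patch, and abd_page_for_bio() is a hypothetical helper name used purely for illustration.)

#include <linux/mm.h>
#include <linux/version.h>

/*
 * Sketch only: choose the page to hand to bio_add_page(). On 4.5+
 * kernels the head and tail pages of a compound page share a refcount,
 * so stepping back to the head is safe. On older kernels the head is
 * refcounted separately, so we would have to pin it ourselves and
 * release it much later; rather than do that, keep using the tail page
 * the ABD already references.
 */
static inline struct page *
abd_page_for_bio(struct page *page)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
	if (PageTail(page))
		page = compound_head(page);
#endif
	return (page);
}
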
If the zio is executed immediately from the completion callback, this is usually ok, as any data is referenced through the tail page referenced by the ABD, and so becomes "live" that way. If there's a delay in zio execution (high load, error injection), then the head page can be freed, along with any dirty flags or other indicators that the underlying memory is used. Later, when the zio completes and that memory is accessed, its either unmapped and an unhandled fault takes down the entire system, or it is mapped and we end up messing around in someone else's memory. Both of these are very bad. The solution on these older kernels is to take a reference to the head page when we use it, and release it when we're done. There's not really a sensible way under our current structure to do this; the "best" would be to keep a list of head page references in the ABD, and release them when the ABD is freed. Since this additional overhead is totally unnecessary on 4.5+, where head and tail pages share refcounts, I've opted to simply not use the compound head in ABD page iteration there. This is theoretically less efficient (though cleaning up head page references would add overhead), but its safe, and we still get the other benefits of not mapping pages before adding them to a bio and not mis-splitting pages. There doesn't appear to be an obvious symbol name or config option we can match on to discover this behaviour in configure (and the mm/page APIs have changed a lot since then anyway), so I've gone with a simple version check. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15533 Closes #15588 (cherry picked from commit c6be6ce1755a3d9a3cbe70256cd8958ef83d8542) --- module/os/linux/zfs/abd_os.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 6ea1e45cf8bb..f82c4ce6b82f 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -62,6 +62,7 @@ #include #include #include +#include #else #define MAX_ORDER 1 #endif @@ -1050,6 +1051,7 @@ abd_iter_page(struct abd_iter *aiter) } ASSERT(page); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) if (PageTail(page)) { /* * This page is part of a "compound page", which is a group of @@ -1071,11 +1073,23 @@ abd_iter_page(struct abd_iter *aiter) * To do this, we need to adjust the offset to be counted from * the head page. struct page for compound pages are stored * contiguously, so we can just adjust by a simple offset. + * + * Before kernel 4.5, compound page heads were refcounted + * separately, such that moving back to the head page would + * require us to take a reference to it and releasing it once + * we're completely finished with it. In practice, that means + * when our caller is done with the ABD, which we have no + * insight into from here. Rather than contort this API to + * track head page references on such ancient kernels, we just + * compile this block out and use the tail pages directly. This + * is slightly less efficient, but makes everything far + * simpler. 
 		 */
 		struct page *head = compound_head(page);
 		doff += ((page - head) * PAGESIZE);
 		page = head;
 	}
+#endif
 
 	/* final page and position within it */
 	aiter->iter_page = page;

From 657c1f75ad29b02fe3d4eefcba7efe087eff62b3 Mon Sep 17 00:00:00 2001
From: Rob Norris
Date: Tue, 2 Apr 2024 15:14:54 +1100
Subject: [PATCH 11/32] vdev_disk: don't touch vbio after its handed off to
 the kernel

After IO is unplugged, it may complete immediately and vbio_completion
be called on interrupt context. That may interrupt or deschedule our
task. If it's the last bio, the vbio will be freed. Then, we get
rescheduled, and try to write to freed memory through vbio->.

This patch just removes the cleanup, and the corresponding assert. These
were leftovers from a previous iteration of vbio_submit() and were
always "belt and suspenders" ops anyway, never strictly required.

Reported-by: Rich Ercolani
Signed-off-by: Rob Norris
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
(cherry picked from commit 34f662ad22206af6852020fd923ceccd836a855f)
---
 module/os/linux/zfs/vdev_disk.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index ebf656a9b5ac..90a4dc135d53 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -663,8 +663,6 @@ vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
 static void
 vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
 {
-	ASSERT(vbio->vbio_bdev);
-
 	/*
 	 * We plug so we can submit the BIOs as we go and only unplug them when
 	 * they are fully created and submitted. This is important; if we don't
 	 * plug, then the kernel may start executing earlier BIOs while we're
 	 * still creating and executing later ones, and if the device goes
 	 * away while that's happening, older kernels can get confused and
 	 * trample memory.
 	 */
@@ -682,12 +680,15 @@ vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
 	vbio->vbio_bio->bi_end_io = vbio_completion;
 	vbio->vbio_bio->bi_private = vbio;
 
+	/*
+	 * Once submitted, vbio_bio now owns vbio (through bi_private) and we
+	 * can't touch it again. The bio may complete and vbio_completion() be
+	 * called and free the vbio before this task is run again, so we must
+	 * consider it invalid from this point.
+	 */
 	vdev_submit_bio(vbio->vbio_bio);
 
 	blk_finish_plug(&plug);
-
-	vbio->vbio_bio = NULL;
-	vbio->vbio_bdev = NULL;
 }
 
 /* IO completion callback */

From 1339b7e0ac7c926ad1a5b9de203f2aeb4e461859 Mon Sep 17 00:00:00 2001
From: Rob Norris
Date: Wed, 25 Oct 2023 15:11:37 +1100
Subject: [PATCH 12/32] spa: make read/write queues configurable

We are finding that as customers get larger and faster machines
(hundreds of cores, large NVMe-backed pools) they keep hitting
relatively low performance ceilings. Our profiling work almost always
finds that they're running into bottlenecks on the SPA IO taskqs.
Unfortunately there's often little we can advise at that point, because
there are very few ways to change behaviour without patching.

This commit adds two load-time parameters `zio_taskq_read` and
`zio_taskq_write` that can configure the READ and WRITE IO taskqs
directly.

This achieves two goals: it gives operators (and those that support
them) a way to tune things without requiring a custom build of OpenZFS,
which is often not possible, and it lets us easily try different config
variations in a variety of environments to inform the development of
better defaults for these kinds of systems.

Because tuning the IO taskqs really requires a fairly deep understanding
of how IO in ZFS works, and generally isn't needed without a pretty
serious workload and an ability to identify bottlenecks, only minimal
documentation is provided.
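
For a concrete sense of the format (these are just the defaults, restated
from the man page entries and the spa.c comment added below; they do not
change behaviour):

    zio_taskq_read='fixed,1,8 null scale null'
    zio_taskq_write='batch fixed,1,5 scale fixed,1,5'

Each word configures one of the four taskq slots for that row (issue,
issue_high, interrupt, interrupt_high, in that order), and `fixed,Q,T`
requests Q taskqs of T threads each.
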
Its expected that anyone using this is going to have the source code there as well. Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. (cherry picked from commit 12a031a3f50d6b20fcac60c392d117dc781e6f07) --- man/man4/zfs.4 | 10 ++ module/zfs/spa.c | 297 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 306 insertions(+), 1 deletion(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 55580c937681..1a3d23b816d8 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2191,6 +2191,16 @@ If .Sy 0 , generate a system-dependent value close to 6 threads per taskq. . +.It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp +Set the queue and thread configuration for the IO read queues. +This is an advanced debugging parameter. +Don't change this unless you understand what it does. +. +.It Sy zio_taskq_write Ns = Ns Sy batch fixed,1,5 scale fixed,1,5 Pq charp +Set the queue and thread configuration for the IO write queues. +This is an advanced debugging parameter. +Don't change this unless you understand what it does. +. .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint Do not create zvol device nodes. This may slightly improve startup time on diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 9a176781c942..922643b5ceb5 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -150,7 +150,7 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that * need to be handled with minimum delay. */ -const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { +static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ @@ -1114,6 +1114,292 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) tqs->stqs_taskq = NULL; } +#ifdef _KERNEL +/* + * The READ and WRITE rows of zio_taskqs are configurable at module load time + * by setting zio_taskq_read or zio_taskq_write. + * + * Example (the defaults for READ and WRITE) + * zio_taskq_read='fixed,1,8 null scale null' + * zio_taskq_write='batch fixed,1,5 scale fixed,1,5' + * + * Each sets the entire row at a time. + * + * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number + * of threads per taskq. + * + * 'null' can only be set on the high-priority queues (queue selection for + * high-priority queues will fall back to the regular queue if the high-pri + * is NULL. + */ +static const char *const modes[ZTI_NMODES] = { + "fixed", "batch", "scale", "null" +}; + +/* Parse the incoming config string. Modifies cfg */ +static int +spa_taskq_param_set(zio_type_t t, char *cfg) +{ + int err = 0; + + zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}}; + + char *next = cfg, *tok, *c; + + /* + * Parse out each element from the string and fill `row`. The entire + * row has to be set at once, so any errors are flagged by just + * breaking out of this loop early. 
+ */ + uint_t q; + for (q = 0; q < ZIO_TASKQ_TYPES; q++) { + /* `next` is the start of the config */ + if (next == NULL) + break; + + /* Eat up leading space */ + while (isspace(*next)) + next++; + if (*next == '\0') + break; + + /* Mode ends at space or end of string */ + tok = next; + next = strchr(tok, ' '); + if (next != NULL) *next++ = '\0'; + + /* Parameters start after a comma */ + c = strchr(tok, ','); + if (c != NULL) *c++ = '\0'; + + /* Match mode string */ + uint_t mode; + for (mode = 0; mode < ZTI_NMODES; mode++) + if (strcmp(tok, modes[mode]) == 0) + break; + if (mode == ZTI_NMODES) + break; + + /* Invalid canary */ + row[q].zti_mode = ZTI_NMODES; + + /* Per-mode setup */ + switch (mode) { + + /* + * FIXED is parameterised: number of queues, and number of + * threads per queue. + */ + case ZTI_MODE_FIXED: { + /* No parameters? */ + if (c == NULL || *c == '\0') + break; + + /* Find next parameter */ + tok = c; + c = strchr(tok, ','); + if (c == NULL) + break; + + /* Take digits and convert */ + unsigned long long nq; + if (!(isdigit(*tok))) + break; + err = ddi_strtoull(tok, &tok, 10, &nq); + /* Must succeed and also end at the next param sep */ + if (err != 0 || tok != c) + break; + + /* Move past the comma */ + tok++; + /* Need another number */ + if (!(isdigit(*tok))) + break; + /* Remember start to make sure we moved */ + c = tok; + + /* Take digits */ + unsigned long long ntpq; + err = ddi_strtoull(tok, &tok, 10, &ntpq); + /* Must succeed, and moved forward */ + if (err != 0 || tok == c || *tok != '\0') + break; + + /* + * sanity; zero queues/threads make no sense, and + * 16K is almost certainly more than anyone will ever + * need and avoids silly numbers like UINT32_MAX + */ + if (nq == 0 || nq >= 16384 || + ntpq == 0 || ntpq >= 16384) + break; + + const zio_taskq_info_t zti = ZTI_P(ntpq, nq); + row[q] = zti; + break; + } + + case ZTI_MODE_BATCH: { + const zio_taskq_info_t zti = ZTI_BATCH; + row[q] = zti; + break; + } + + case ZTI_MODE_SCALE: { + const zio_taskq_info_t zti = ZTI_SCALE; + row[q] = zti; + break; + } + + case ZTI_MODE_NULL: { + /* + * Can only null the high-priority queues; the general- + * purpose ones have to exist. + */ + if (q != ZIO_TASKQ_ISSUE_HIGH && + q != ZIO_TASKQ_INTERRUPT_HIGH) + break; + + const zio_taskq_info_t zti = ZTI_NULL; + row[q] = zti; + break; + } + + default: + break; + } + + /* Ensure we set a mode */ + if (row[q].zti_mode == ZTI_NMODES) + break; + } + + /* Didn't get a full row, fail */ + if (q < ZIO_TASKQ_TYPES) + return (SET_ERROR(EINVAL)); + + /* Eat trailing space */ + if (next != NULL) + while (isspace(*next)) + next++; + + /* If there's anything left over then fail */ + if (next != NULL && *next != '\0') + return (SET_ERROR(EINVAL)); + + /* Success! 
Copy it into the real config */ + for (q = 0; q < ZIO_TASKQ_TYPES; q++) + zio_taskqs[t][q] = row[q]; + + return (0); +} + +static int +spa_taskq_param_get(zio_type_t t, char *buf) +{ + int pos = 0; + + /* Build paramater string from live config */ + const char *sep = ""; + for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) { + const zio_taskq_info_t *zti = &zio_taskqs[t][q]; + if (zti->zti_mode == ZTI_MODE_FIXED) + pos += sprintf(&buf[pos], "%s%s,%u,%u", sep, + modes[zti->zti_mode], zti->zti_count, + zti->zti_value); + else + pos += sprintf(&buf[pos], "%s%s", sep, + modes[zti->zti_mode]); + sep = " "; + } + + buf[pos++] = '\n'; + buf[pos] = '\0'; + + return (pos); +} + +#ifdef __linux__ +static int +spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp) +{ + char *cfg = kmem_strdup(val); + int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg); + kmem_free(cfg, strlen(val)+1); + return (-err); +} +static int +spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp) +{ + return (spa_taskq_param_get(ZIO_TYPE_READ, buf)); +} + +static int +spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp) +{ + char *cfg = kmem_strdup(val); + int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg); + kmem_free(cfg, strlen(val)+1); + return (-err); +} +static int +spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) +{ + return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf)); +} +#else +#include + +/* + * On FreeBSD load-time parameters can be set up before malloc() is available, + * so we have to do all the parsing work on the stack. + */ +#define SPA_TASKQ_PARAM_MAX (128) + +static int +spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) +{ + char buf[SPA_TASKQ_PARAM_MAX]; + int err = 0; + + if (req->newptr == NULL) { + int len = spa_taskq_param_get(ZIO_TYPE_READ, buf); + struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req); + sbuf_cpy(s, buf); + err = sbuf_finish(s); + sbuf_delete(s); + return (err); + } + + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err) + return (err); + return (spa_taskq_param_set(ZIO_TYPE_READ, buf)); +} + +static int +spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) +{ + char buf[SPA_TASKQ_PARAM_MAX]; + int err = 0; + + if (req->newptr == NULL) { + int len = spa_taskq_param_get(ZIO_TYPE_WRITE, buf); + struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req); + sbuf_cpy(s, buf); + err = sbuf_finish(s); + sbuf_delete(s); + return (err); + } + + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err) + return (err); + return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf)); +} +#endif +#endif /* _KERNEL */ + /* * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. 
* Note that a type may have multiple discrete taskqs to avoid lock contention @@ -10218,4 +10504,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); + +#ifdef _KERNEL +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, + spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD, + "Configure IO queues for read IO"); +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, + spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD, + "Configure IO queues for write IO"); +#endif /* END CSTYLED */ From c3b78e7b868bc1155ee389eccaa05e2c1dbebd3b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 11 Jan 2024 19:43:38 +1100 Subject: [PATCH 13/32] freebsd: fix compile for spa_taskq_read/spa_taskq_write params Missed in #15696, backporting #15675. Signed-off-by: Rob Norris (cherry picked from commit 437d598fa31cd77db30421c5d1bdaf7dafad8a71) --- include/os/freebsd/spl/sys/mod_os.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h index 5695abee7b85..d46f636b23dc 100644 --- a/include/os/freebsd/spl/sys/mod_os.h +++ b/include/os/freebsd/spl/sys/mod_os.h @@ -92,6 +92,12 @@ #define param_set_max_auto_ashift_args(var) \ CTLTYPE_U64, &var, 0, param_set_max_auto_ashift, "QU" +#define spa_taskq_read_param_set_args(var) \ + CTLTYPE_STRING, NULL, 0, spa_taskq_read_param, "A" + +#define spa_taskq_write_param_set_args(var) \ + CTLTYPE_STRING, NULL, 0, spa_taskq_write_param, "A" + #define fletcher_4_param_set_args(var) \ CTLTYPE_STRING, NULL, 0, fletcher_4_param, "A" From 92063ad7d4e9dff8d335e6dae879398a7ac7ba82 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Fri, 29 Dec 2023 10:22:58 -0500 Subject: [PATCH 14/32] spa: Fix FreeBSD sysctl handlers sbuf_cpy() resets the sbuf state, which is wrong for sbufs allocated by sbuf_new_for_sysctl(). In particular, this code triggers an assertion failure in sbuf_clear(). Simplify by just using sysctl_handle_string() for both reading and setting the tunable. Fixes: 6930ecbb7 ("spa: make read/write queues configurable") Reported-by: Peter Holm Signed-off-by: Mark Johnston (cherry picked from commit e5230fb9d2c3670feda07b8a0759ac446749c247) --- module/zfs/spa.c | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 922643b5ceb5..7d8253e383a4 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1348,8 +1348,6 @@ spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf)); } #else -#include - /* * On FreeBSD load-time parameters can be set up before malloc() is available, * so we have to do all the parsing work on the stack. 
@@ -1360,19 +1358,11 @@ static int spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) { char buf[SPA_TASKQ_PARAM_MAX]; - int err = 0; - - if (req->newptr == NULL) { - int len = spa_taskq_param_get(ZIO_TYPE_READ, buf); - struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req); - sbuf_cpy(s, buf); - err = sbuf_finish(s); - sbuf_delete(s); - return (err); - } + int err; + (void) spa_taskq_param_get(ZIO_TYPE_READ, buf); err = sysctl_handle_string(oidp, buf, sizeof (buf), req); - if (err) + if (err || req->newptr == NULL) return (err); return (spa_taskq_param_set(ZIO_TYPE_READ, buf)); } @@ -1381,19 +1371,11 @@ static int spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) { char buf[SPA_TASKQ_PARAM_MAX]; - int err = 0; - - if (req->newptr == NULL) { - int len = spa_taskq_param_get(ZIO_TYPE_WRITE, buf); - struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req); - sbuf_cpy(s, buf); - err = sbuf_finish(s); - sbuf_delete(s); - return (err); - } + int err; + (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf); err = sysctl_handle_string(oidp, buf, sizeof (buf), req); - if (err) + if (err || req->newptr == NULL) return (err); return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf)); } From 79d092987b8af92f564c78022e81daacbb548ddd Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Fri, 29 Dec 2023 12:56:35 -0500 Subject: [PATCH 15/32] spa: Let spa_taskq_param_get()'s addition of a newline be optional For FreeBSD sysctls, we don't want the extra newline, since the sysctl(8) utility will format strings appropriately. Signed-off-by: Mark Johnston (cherry picked from commit 4cf943769d3decd73b379eb069cebe0c148e3f39) --- module/zfs/spa.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 7d8253e383a4..d790d663de1d 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1295,7 +1295,7 @@ spa_taskq_param_set(zio_type_t t, char *cfg) } static int -spa_taskq_param_get(zio_type_t t, char *buf) +spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline) { int pos = 0; @@ -1313,7 +1313,8 @@ spa_taskq_param_get(zio_type_t t, char *buf) sep = " "; } - buf[pos++] = '\n'; + if (add_newline) + buf[pos++] = '\n'; buf[pos] = '\0'; return (pos); @@ -1331,7 +1332,7 @@ spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp) static int spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp) { - return (spa_taskq_param_get(ZIO_TYPE_READ, buf)); + return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE)); } static int @@ -1345,7 +1346,7 @@ spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp) static int spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) { - return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf)); + return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE)); } #else /* @@ -1360,7 +1361,7 @@ spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) char buf[SPA_TASKQ_PARAM_MAX]; int err; - (void) spa_taskq_param_get(ZIO_TYPE_READ, buf); + (void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE); err = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (err || req->newptr == NULL) return (err); @@ -1373,7 +1374,7 @@ spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) char buf[SPA_TASKQ_PARAM_MAX]; int err; - (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf); + (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE); err = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (err || req->newptr == NULL) return (err); From 238392bc956f4917a36dd2f68c595710fc66a610 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 13 Jul 2023 18:06:57 +0200 
Subject: [PATCH 16/32] FreeBSD: catch up to __FreeBSD_version 1400093 Reviewed-by: Alexander Motin Signed-off-by: Mateusz Guzik Closes #15036 (cherry picked from commit d78d49ce8aff4d40b1cf363f43f2eae088e36b69) --- include/os/freebsd/spl/sys/vnode.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/os/freebsd/spl/sys/vnode.h b/include/os/freebsd/spl/sys/vnode.h index bcfc177a4751..a23b11d60a7b 100644 --- a/include/os/freebsd/spl/sys/vnode.h +++ b/include/os/freebsd/spl/sys/vnode.h @@ -36,7 +36,11 @@ struct xucred; typedef struct flock flock64_t; typedef struct vnode vnode_t; typedef struct vattr vattr_t; +#if __FreeBSD_version < 1400093 typedef enum vtype vtype_t; +#else +#define vtype_t __enum_uint8(vtype) +#endif #include #include From 2fec57bf0339142a832a94d289c470161f0b81d6 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 23 Oct 2023 14:34:56 +1100 Subject: [PATCH 17/32] trace: spa_taskqs_ent trace class, dispatch and dispatched tracepoints Signed-off-by: Rob Norris --- include/os/linux/zfs/sys/Makefile.am | 1 + include/os/linux/zfs/sys/trace_spa_taskqs.h | 74 +++++++++++++++++++++ include/os/linux/zfs/sys/trace_zfs.h | 1 + include/sys/spa_impl.h | 2 + module/os/linux/zfs/trace.c | 1 + module/zfs/spa.c | 10 +++ 6 files changed, 89 insertions(+) create mode 100644 include/os/linux/zfs/sys/trace_spa_taskqs.h diff --git a/include/os/linux/zfs/sys/Makefile.am b/include/os/linux/zfs/sys/Makefile.am index a075db476e40..71c1dbda39c9 100644 --- a/include/os/linux/zfs/sys/Makefile.am +++ b/include/os/linux/zfs/sys/Makefile.am @@ -11,6 +11,7 @@ KERNEL_H = \ trace_dnode.h \ trace_multilist.h \ trace_rrwlock.h \ + trace_spa_taskqs.h \ trace_txg.h \ trace_vdev.h \ trace_zil.h \ diff --git a/include/os/linux/zfs/sys/trace_spa_taskqs.h b/include/os/linux/zfs/sys/trace_spa_taskqs.h new file mode 100644 index 000000000000..77e2b8ea1fd1 --- /dev/null +++ b/include/os/linux/zfs/sys/trace_spa_taskqs.h @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zfs + +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR zfs_spa_taskqs + +#if !defined(_TRACE_SPA_TASKQS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SPA_TASKQS_H + +#include +#include + +/* + * Generic support for two argument tracepoints of the form: + * + * DTRACE_PROBE2(..., + * spa_taskqs_t *stqs, ..., + * taskq_ent_t *ent, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_spa_taskqs_ent_class, + TP_PROTO(spa_taskqs_t *stqs, taskq_ent_t *ent), + TP_ARGS(stqs, ent), +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_SPA_TASKQS_ENT_EVENT(name) \ +DEFINE_EVENT(zfs_spa_taskqs_ent_class, name, \ + TP_PROTO(spa_taskqs_t *stqs, taskq_ent_t *ent), \ + TP_ARGS(stqs, ent)) +/* END CSTYLED */ +DEFINE_SPA_TASKQS_ENT_EVENT(zfs_spa_taskqs_ent__dispatch); +DEFINE_SPA_TASKQS_ENT_EVENT(zfs_spa_taskqs_ent__dispatched); + +#endif /* _TRACE_SPA_TASKQS_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH sys +#define TRACE_INCLUDE_FILE trace_spa_taskqs +#include + +#else + +DEFINE_DTRACE_PROBE2(spa_taskqs_ent__dispatch); +DEFINE_DTRACE_PROBE2(spa_taskqs_ent__dispatched); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/os/linux/zfs/sys/trace_zfs.h b/include/os/linux/zfs/sys/trace_zfs.h index 0e19f8d186d0..0b0ac94e01f0 100644 --- a/include/os/linux/zfs/sys/trace_zfs.h +++ b/include/os/linux/zfs/sys/trace_zfs.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index b2a718af0f88..d9f1b5c4f327 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -183,6 +183,8 @@ typedef enum spa_proc_state { } spa_proc_state_t; typedef struct spa_taskqs { + zio_taskq_type_t stqs_type; + zio_type_t stqs_zio_type; uint_t stqs_count; taskq_t **stqs_taskq; } spa_taskqs_t; diff --git a/module/os/linux/zfs/trace.c b/module/os/linux/zfs/trace.c index a690822ae14c..9c09baee0378 100644 --- a/module/os/linux/zfs/trace.c +++ b/module/os/linux/zfs/trace.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include diff --git a/module/zfs/spa.c b/module/zfs/spa.c index d790d663de1d..8d93088518b2 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -87,6 +87,7 @@ #include #include #include +#include #ifdef _KERNEL #include @@ -982,6 +983,9 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) uint_t cpus, flags = TASKQ_DYNAMIC; boolean_t batch = B_FALSE; + tqs->stqs_type = q; + tqs->stqs_zio_type = t; + switch (mode) { case ZTI_MODE_FIXED: ASSERT3U(value, >, 0); @@ -1399,6 +1403,9 @@ spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); + DTRACE_PROBE2(spa_taskqs_ent__dispatch, + spa_taskqs_t *, tqs, taskq_ent_t *, ent); + if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else { @@ -1406,6 +1413,9 @@ spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, } taskq_dispatch_ent(tq, func, arg, flags, ent); + + DTRACE_PROBE2(spa_taskqs_ent__dispatched, + spa_taskqs_t *, tqs, taskq_ent_t *, ent); } /* From c85d31a88b8880cac1701ce4d4f2f075583e82ea Mon Sep 17 
00:00:00 2001 From: Rob Norris Date: Mon, 23 Oct 2023 12:26:26 +1100 Subject: [PATCH 18/32] taskq: add taskq_try_dispatch_ent() Non-blocking form of taskq_dispatch_ent(), returns false if it can't acquire the taskq lock. Signed-off-by: Rob Norris --- include/os/freebsd/spl/sys/taskq.h | 2 ++ include/os/linux/spl/sys/taskq.h | 2 ++ include/sys/zfs_context.h | 2 ++ lib/libzpool/taskq.c | 24 ++++++++++++++-- module/os/freebsd/spl/spl_taskq.c | 9 ++++++ module/os/linux/spl/spl-taskq.c | 46 ++++++++++++++++++++++++------ 6 files changed, 74 insertions(+), 11 deletions(-) diff --git a/include/os/freebsd/spl/sys/taskq.h b/include/os/freebsd/spl/sys/taskq.h index 3040549e043d..74243ba903e7 100644 --- a/include/os/freebsd/spl/sys/taskq.h +++ b/include/os/freebsd/spl/sys/taskq.h @@ -91,6 +91,8 @@ extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t, clock_t); extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, taskq_ent_t *); +extern boolean_t taskq_try_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, + taskq_ent_t *); extern int taskq_empty_ent(taskq_ent_t *); taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t); diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h index b50175a10873..cdcaf8cbff85 100644 --- a/include/os/linux/spl/sys/taskq.h +++ b/include/os/linux/spl/sys/taskq.h @@ -146,6 +146,8 @@ extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t, clock_t); extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, taskq_ent_t *); +extern boolean_t taskq_try_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, + taskq_ent_t *); extern int taskq_empty_ent(taskq_ent_t *); extern void taskq_init_ent(taskq_ent_t *); extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 2c7ae3003f46..2ec9636da278 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -503,6 +503,8 @@ extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t, clock_t); extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, taskq_ent_t *); +extern boolean_t taskq_try_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, + taskq_ent_t *); extern int taskq_empty_ent(taskq_ent_t *); extern void taskq_init_ent(taskq_ent_t *); extern void taskq_destroy(taskq_t *); diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c index 456080f7f247..28bead7930e3 100644 --- a/lib/libzpool/taskq.c +++ b/lib/libzpool/taskq.c @@ -156,8 +156,8 @@ taskq_init_ent(taskq_ent_t *t) t->tqent_flags = 0; } -void -taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, +static void +taskq_dispatch_ent_impl(taskq_t *tq, task_func_t func, void *arg, uint_t flags, taskq_ent_t *t) { ASSERT(func != NULL); @@ -170,7 +170,6 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, /* * Enqueue the task to the underlying queue. 
 	 */
-	mutex_enter(&tq->tq_lock);
 
 	if (flags & TQ_FRONT) {
 		t->tqent_next = tq->tq_task.tqent_next;
@@ -184,7 +183,26 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
 	t->tqent_func = func;
 	t->tqent_arg = arg;
 	cv_signal(&tq->tq_dispatch_cv);
+}
+
+void
+taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
+    taskq_ent_t *t)
+{
+	mutex_enter(&tq->tq_lock);
+	taskq_dispatch_ent_impl(tq, func, arg, flags, t);
+	mutex_exit(&tq->tq_lock);
+}
+
+boolean_t
+taskq_try_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
+    taskq_ent_t *t)
+{
+	if (!mutex_tryenter(&tq->tq_lock))
+		return (B_FALSE);
+	taskq_dispatch_ent_impl(tq, func, arg, flags, t);
 	mutex_exit(&tq->tq_lock);
+	return (B_TRUE);
 }
 
 void
diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c
index 3fa7939bdb3c..ba9ab33227bb 100644
--- a/module/os/freebsd/spl/spl_taskq.c
+++ b/module/os/freebsd/spl/spl_taskq.c
@@ -411,6 +411,15 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags,
 	taskqueue_enqueue(tq->tq_queue, &task->tqent_task);
 }
 
+boolean_t
+taskq_try_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags,
+    taskq_ent_t *task)
+{
+	/* XXX: implement me -- robn, 2023-10-23 */
+	taskq_dispatch_ent(tq, func, arg, flags, task);
+	return (B_TRUE);
+}
+
 void
 taskq_wait(taskq_t *tq)
 {
diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c
index fb25a4154485..d02ad6a4450d 100644
--- a/module/os/linux/spl/spl-taskq.c
+++ b/module/os/linux/spl/spl-taskq.c
@@ -673,17 +673,13 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
 }
 EXPORT_SYMBOL(taskq_dispatch_delay);
 
-void
-taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
+static void
+taskq_dispatch_ent_impl(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
 	taskq_ent_t *t)
 {
-	unsigned long irqflags;
 	ASSERT(tq);
 	ASSERT(func);
 
-	spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
-	    tq->tq_lock_class);
-
 	/* Taskq being destroyed and all tasks drained */
 	if (!(tq->tq_flags & TASKQ_ACTIVE)) {
 		t->tqent_id = TASKQID_INVALID;
@@ -694,7 +690,7 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
 		/* Dynamic taskq may be able to spawn another thread */
 		if (!(tq->tq_flags & TASKQ_DYNAMIC) ||
 		    taskq_thread_spawn(tq) == 0)
-			goto out2;
+			return;
 
 		flags |= TQ_FRONT;
 	}
@@ -734,11 +730,45 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
 	/* Spawn additional taskq threads if required.
*/ if (tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); -out2: +} + +void +taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, + taskq_ent_t *t) +{ + unsigned long irqflags; + + spin_lock_irqsave_nested(&tq->tq_lock, irqflags, + tq->tq_lock_class); + + taskq_dispatch_ent_impl(tq, func, arg, flags, t); + spin_unlock_irqrestore(&tq->tq_lock, irqflags); } EXPORT_SYMBOL(taskq_dispatch_ent); +boolean_t +taskq_try_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, + taskq_ent_t *t) +{ + unsigned long irqflags; + + /* + * XXX I don't _think_ losing _nested matters, because I think its + * only related to lockdep, and we don't have access to that anyway + * -- robn, 2023-10-23 + */ + if (!spin_trylock_irqsave(&tq->tq_lock, irqflags)) + return (B_FALSE); + + taskq_dispatch_ent_impl(tq, func, arg, flags, t); + + spin_unlock_irqrestore(&tq->tq_lock, irqflags); + + return (B_TRUE); +} +EXPORT_SYMBOL(taskq_try_dispatch_ent); + int taskq_empty_ent(taskq_ent_t *t) { From 7d74fc7a68324207b10d7b7e6bcaef1013877048 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 23 Oct 2023 12:42:28 +1100 Subject: [PATCH 19/32] spa_taskq_dispatch_ent: look for an unlocked taskq before waiting Signed-off-by: Rob Norris --- module/zfs/spa.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 8d93088518b2..93213e0436fd 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1390,15 +1390,14 @@ spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) /* * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. * Note that a type may have multiple discrete taskqs to avoid lock contention - * on the taskq itself. In that case we choose which taskq at random by using - * the low bits of gethrtime(). + * on the taskq itself. In that case we try each one until it goes in, before + * falling back to waiting on a lock. 
*/ void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; - taskq_t *tq; ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); @@ -1407,13 +1406,21 @@ spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, spa_taskqs_t *, tqs, taskq_ent_t *, ent); if (tqs->stqs_count == 1) { - tq = tqs->stqs_taskq[0]; - } else { - tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; + taskq_dispatch_ent(tqs->stqs_taskq[0], func, arg, flags, ent); + goto out; } - taskq_dispatch_ent(tq, func, arg, flags, ent); + int select = ((uint64_t)gethrtime()) % tqs->stqs_count; + for (int i = 0; i < tqs->stqs_count; i++) { + if (taskq_try_dispatch_ent( + tqs->stqs_taskq[select], func, arg, flags, ent)) + goto out; + select = (select+1) % tqs->stqs_count; + } + taskq_dispatch_ent(tqs->stqs_taskq[select], func, arg, flags, ent); + +out: DTRACE_PROBE2(spa_taskqs_ent__dispatched, spa_taskqs_t *, tqs, taskq_ent_t *, ent); } From 4e9cbd04a4c95ea9ed8afbee8ea2202489c1e187 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 22 Dec 2023 14:42:34 +1100 Subject: [PATCH 20/32] spa: add zio_taskq_trylock tuneable Signed-off-by: Rob Norris --- module/zfs/spa.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 93213e0436fd..3b7e375a5811 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -173,6 +173,14 @@ uint_t zio_taskq_batch_tpq; /* threads per taskq */ boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ uint_t zio_taskq_basedc = 80; /* base duty cycle */ +/* + * If enabled, try to find an unlocked IO taskq to dispatch an IO onto before + * falling back to waiting on a lock. This should only be enabled in + * conjunction with careful performance testing, and will likely require + * zio_taskq_read/zio_taskq_write to be adjusted as well. + */ +boolean_t zio_taskq_trylock = B_FALSE; + boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ /* @@ -1411,11 +1419,13 @@ spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, } int select = ((uint64_t)gethrtime()) % tqs->stqs_count; - for (int i = 0; i < tqs->stqs_count; i++) { - if (taskq_try_dispatch_ent( - tqs->stqs_taskq[select], func, arg, flags, ent)) - goto out; - select = (select+1) % tqs->stqs_count; + if (zio_taskq_trylock) { + for (int i = 0; i < tqs->stqs_count; i++) { + if (taskq_try_dispatch_ent( + tqs->stqs_taskq[select], func, arg, flags, ent)) + goto out; + select = (select+1) % tqs->stqs_count; + } } taskq_dispatch_ent(tqs->stqs_taskq[select], func, arg, flags, ent); @@ -10485,6 +10495,9 @@ ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD, ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD, "Number of threads per IO worker taskqueue"); +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_trylock, UINT, ZMOD_RD, + "Try to dispatch IO to an unlocked IO taskqueue before sleeping"); + ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW, "Allow importing pool with up to this number of missing top-level " "vdevs (in read-only mode)"); From 014ff864d8f104a73cfca7f85ff85c242ebf608e Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 12 Jan 2024 15:30:49 +1100 Subject: [PATCH 21/32] slack: fix decompression Turns out decompression never worked at all; likely an oversight converting the original "transparent" versions to a true compression option. 
This makes it work. Signed-off-by: Rob Norris --- include/sys/zio_compress.h | 2 ++ module/zfs/slack.c | 8 ++++++++ module/zfs/zio_compress.c | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h index 4c0d920fc2c5..6e96ed6cb3a0 100644 --- a/include/sys/zio_compress.h +++ b/include/sys/zio_compress.h @@ -182,6 +182,8 @@ extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len, int level); extern size_t slack_compress(void *src, void *dst, size_t s_len, size_t d_len, int level); +extern int slack_decompress(void *src, void *dst, size_t s_len, size_t d_len, + int level); /* * Compress and decompress data if necessary. diff --git a/module/zfs/slack.c b/module/zfs/slack.c index 6c9a0418068b..cbf797dd9c74 100644 --- a/module/zfs/slack.c +++ b/module/zfs/slack.c @@ -54,3 +54,11 @@ slack_compress(void *src, void *dst, size_t s_len, size_t d_len, int level) memcpy(dst, src, c_len); return (c_len); } + +int +slack_decompress(void *src, void *dst, size_t s_len, size_t d_len, int level) +{ + ASSERT3U(d_len, >=, s_len); + memcpy(dst, src, s_len); + return (0); +} diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c index 7c8188f7d7fe..cafcc8879e1a 100644 --- a/module/zfs/zio_compress.c +++ b/module/zfs/zio_compress.c @@ -68,7 +68,7 @@ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { {"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs, NULL}, {"zstd", ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress, zfs_zstd_decompress, zfs_zstd_decompress_level}, - {"slack", 0, slack_compress, NULL, NULL }, + {"slack", 0, slack_compress, slack_decompress, NULL }, }; uint8_t From 831fdad1b081473f5daeaf000ec3838790256ea0 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 15 Jan 2024 12:58:12 +1100 Subject: [PATCH 22/32] slack: require module param before enabling slack compression Just an extra explicit opt-in, since it changes the compression method isn't upstreamed yet. --- module/zfs/zfs_ioctl.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index b78cb802199f..49e1c3f6ccdc 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -241,6 +241,11 @@ unsigned long zfs_max_nvlist_src_size = 0; */ unsigned long zfs_history_output_max = 1024 * 1024; +/* + * Whether or not to allow compression=slack to be set on a dataset. + */ +int zfs_slack_compress_enabled = 0; + uint_t zfs_fsyncer_key; uint_t zfs_allow_log_key; @@ -4573,6 +4578,9 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) if (compval == ZIO_COMPRESS_SLACK) { spa_t *spa; + if (!zfs_slack_compress_enabled) + return (SET_ERROR(ENOTSUP)); + if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); @@ -7770,4 +7778,7 @@ ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, ULONG, ZMOD_RW, "Maximum size in bytes of ZFS ioctl output that will be logged"); + +ZFS_MODULE_PARAM(zfs, zfs_, slack_compress_enabled, INT, ZMOD_RW, + "Allow slack compression feature to be set on a dataset"); /* END CSTYLED */ From bef9815829e116e1320350b5fcb575931189ca4a Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 18 Sep 2023 11:07:32 +1000 Subject: [PATCH 23/32] tests: add tests for zpool import behaviour when hostid changes Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Closes #15290 (cherry picked from commit 8f5aa8cb00fb591c7dc9a91f0f2ff5b42de5f9c1) --- tests/runfiles/common.run | 4 ++ .../cli_root/zpool_import/Makefile.am | 4 ++ .../cli_root/zpool_import/zpool_import.cfg | 5 ++ .../cli_root/zpool_import/zpool_import.kshlib | 1 + .../zpool_import_hostid_changed.ksh | 59 +++++++++++++++ .../zpool_import_hostid_changed_cachefile.ksh | 65 +++++++++++++++++ ...ostid_changed_cachefile_unclean_export.ksh | 71 +++++++++++++++++++ ...l_import_hostid_changed_unclean_export.ksh | 70 ++++++++++++++++++ 8 files changed, 279 insertions(+) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 421933364fd1..67493ca958ef 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -401,6 +401,10 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'zpool_import_rename_001_pos', 'zpool_import_all_001_pos', 'zpool_import_encrypted', 'zpool_import_encrypted_load', 'zpool_import_errata3', 'zpool_import_errata4', + 'zpool_import_hostid_changed', + 'zpool_import_hostid_changed_unclean_export', + 'zpool_import_hostid_changed_cachefile', + 'zpool_import_hostid_changed_cachefile_unclean_export', 'import_cachefile_device_added', 'import_cachefile_device_removed', 'import_cachefile_device_replaced', diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am index a8c9a31dcfdc..226e804db38a 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am @@ -36,6 +36,10 @@ dist_pkgdata_SCRIPTS = \ zpool_import_features_001_pos.ksh \ zpool_import_features_002_neg.ksh \ zpool_import_features_003_pos.ksh \ + zpool_import_hostid_changed.ksh \ + zpool_import_hostid_changed_unclean_export.ksh \ + zpool_import_hostid_changed_cachefile.ksh \ + zpool_import_hostid_changed_cachefile_unclean_export.ksh \ zpool_import_missing_001_pos.ksh \ zpool_import_missing_002_pos.ksh \ zpool_import_missing_003_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg index 25f541ebf185..91fd7edf8529 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg @@ -26,6 +26,7 @@ # # Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. # . 
$STF_SUITE/include/libtest.shlib @@ -63,3 +64,7 @@ export VDEV4=$DEVICE_DIR/${DEVICE_FILE}4 export VDEV5=$DEVICE_DIR/${DEVICE_FILE}5 export ALTER_ROOT=/alter_import-test + +export HOSTID_FILE="/etc/hostid" +export HOSTID1=01234567 +export HOSTID2=89abcdef diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib index 8bbd668a9317..d534ee69ece8 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib @@ -11,6 +11,7 @@ # # Copyright (c) 2016 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. # . $STF_SUITE/include/libtest.shlib diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh new file mode 100755 index 000000000000..bc82b7cc1ee8 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh @@ -0,0 +1,59 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool that was cleanly exported should be importable without force even if +# the local hostid doesn't match the on-disk hostid. +# +# STRATEGY: +# 1. Set a hostid. +# 2. Create a pool. +# 3. Export the pool. +# 4. Change the hostid. +# 5. Verify that importing the pool without force succeeds. +# + +verify_runnable "global" + +function custom_cleanup +{ + rm -f $HOSTID_FILE + cleanup +} + +log_onexit custom_cleanup + +# 1. Set a hostid. +log_must zgenhostid -f $HOSTID1 + +# 2. Create a pool. +log_must zpool create $TESTPOOL1 $VDEV0 + +# 3. Export the pool. +log_must zpool export $TESTPOOL1 + +# 4. Change the hostid. +log_must zgenhostid -f $HOSTID2 + +# 5. Verify that importing the pool without force succeeds. +log_must zpool import -d $DEVICE_DIR $TESTPOOL1 + +log_pass "zpool import can import cleanly exported pool when hostid changes." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh new file mode 100755 index 000000000000..07c43482d68f --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh @@ -0,0 +1,65 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. +# + +. 
$STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool that was cleanly exported should be importable from a cachefile +# without force even if the local hostid doesn't match the on-disk hostid. +# +# STRATEGY: +# 1. Set a hostid. +# 2. Create a pool with a cachefile. +# 3. Backup the cachfile. +# 4. Export the pool. +# 5. Change the hostid. +# 6. Verify that importing the pool from the cachefile succeeds +# without force. +# + +verify_runnable "global" + +function custom_cleanup +{ + rm -f $HOSTID_FILE $CPATH $CPATHBKP + cleanup +} + +log_onexit custom_cleanup + +# 1. Set a hostid. +log_must zgenhostid -f $HOSTID1 + +# 2. Create a pool. +log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $VDEV0 + +# 3. Backup the cachfile. +log_must cp $CPATH $CPATHBKP + +# 4. Export the pool. +log_must zpool export $TESTPOOL1 + +# 5. Change the hostid. +log_must zgenhostid -f $HOSTID2 + +# 6. Verify that importing the pool from the cachefile succeeds without force. +log_must zpool import -c $CPATHBKP $TESTPOOL1 + +log_pass "zpool import can import cleanly exported pool from cachefile " \ + "when hostid changes." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh new file mode 100755 index 000000000000..8362d915f0cf --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh @@ -0,0 +1,71 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool that wasn't cleanly exported should be importable from a cachefile +# without force even if the local hostid doesn't match the on-disk hostid. +# +# STRATEGY: +# 1. Set a hostid. +# 2. Create a pool. +# 3. Backup the cachefile. +# 4. Simulate the pool being torn down without export: +# 4.1. Copy the underlying device state. +# 4.2. Export the pool. +# 4.3. Restore the device state from the copy. +# 5. Change the hostid. +# 6. Verify that importing the pool from the cachefile succeeds +# without force. +# + +verify_runnable "global" + +function custom_cleanup +{ + rm -f $HOSTID_FILE $CPATH $CPATHBKP $VDEV0.bak + cleanup +} + +log_onexit custom_cleanup + +# 1. Set a hostid. +log_must zgenhostid -f $HOSTID1 + +# 2. Create a pool. +log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $VDEV0 + +# 3. Backup the cachfile. +log_must cp $CPATH $CPATHBKP + +# 4. Simulate the pool being torn down without export. +log_must cp $VDEV0 $VDEV0.bak +log_must zpool export $TESTPOOL1 +log_must cp -f $VDEV0.bak $VDEV0 +log_must rm -f $VDEV0.bak + +# 5. Change the hostid. +log_must zgenhostid -f $HOSTID2 + +# 6. Verify that importing the pool from the cachefile succeeds without force. 
+log_must zpool import -c $CPATHBKP $TESTPOOL1 + +log_pass "zpool import can import pool from cachefile if not cleanly " \ + "exported when hostid changes." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh new file mode 100755 index 000000000000..ad8cca642dbc --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh @@ -0,0 +1,70 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool that wasn't cleanly exported should not be importable without force if +# the local hostid doesn't match the on-disk hostid. +# +# STRATEGY: +# 1. Set a hostid. +# 2. Create a pool. +# 3. Simulate the pool being torn down without export: +# 3.1. Copy the underlying device state. +# 3.2. Export the pool. +# 3.3. Restore the device state from the copy. +# 4. Change the hostid. +# 5. Verify that importing the pool fails. +# 6. Verify that importing the pool with force succeeds. +# + +verify_runnable "global" + +function custom_cleanup +{ + rm -f $HOSTID_FILE $VDEV0.bak + cleanup +} + +log_onexit custom_cleanup + +# 1. Set a hostid. +log_must zgenhostid -f $HOSTID1 + +# 2. Create a pool. +log_must zpool create $TESTPOOL1 $VDEV0 + +# 3. Simulate the pool being torn down without export. +log_must cp $VDEV0 $VDEV0.bak +log_must zpool export $TESTPOOL1 +log_must cp -f $VDEV0.bak $VDEV0 +log_must rm -f $VDEV0.bak + +# 4. Change the hostid. +log_must zgenhostid -f $HOSTID2 + +# 5. Verify that importing the pool fails. +log_mustnot zpool import -d $DEVICE_DIR $TESTPOOL1 + +# 6. Verify that importing the pool with force succeeds. +log_must zpool import -d $DEVICE_DIR -f $TESTPOOL1 + +log_pass "zpool import requires force if not cleanly exported " \ + "and hostid changed." From 0ab4172e4c7c4afee1df6ff04f505b04b11dcb76 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 16 Sep 2023 17:02:02 +1000 Subject: [PATCH 24/32] import: require force when cachefile hostid doesn't match on-disk Previously, if a cachefile is passed to zpool import, the cached config is mostly offered as-is to ZFS_IOC_POOL_TRYIMPORT->spa_tryimport(), and the results are taken as the canonical pool config and handed back to ZFS_IOC_POOL_IMPORT. In the course of its operation, spa_load() will inspect the pool and build a new config from what it finds on disk. However, it then regenerates a new config ready to import, and so rightly sets the hostid and hostname for the local host in the config it returns. Because of this, the "require force" checks always decide the pool is exported and last touched by the local host, even if this is not true, which is possible in a HA environment when MMP is not enabled. The pool may be imported on another head, but the import checks still pass here, so the pool ends up imported on both. 
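In outline, the pre-change userspace check in zfs_force_import_required() amounted to
the following (a simplified illustrative sketch only, not the exact code; the helper
name here is invented for the example):

    static boolean_t
    force_required_sketch(nvlist_t *config)
    {
    	uint64_t state, hostid = 0;

    	state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE);
    	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);

    	/*
    	 * For cachefile imports, 'config' is the one regenerated by
    	 * spa_tryimport(), which already carries the local hostid, so
    	 * this comparison can never demand force even when another
    	 * head last wrote the pool.
    	 */
    	return (state != POOL_STATE_EXPORTED &&
    	    hostid != get_system_hostid());
    }
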
(This doesn't happen when a cachefile isn't used, because the pool config is discovered in userspace in zpool_find_import(), and that does find the on-disk hostid and hostname correctly). Since the systemd zfs-import-cache.service unit uses cachefile imports, this can lead to a system returning after a crash with a "valid" cachefile on disk and automatically, quietly, importing a pool that has already been taken up by a secondary head. This commit causes the on-disk hostid and hostname to be included in the ZPOOL_CONFIG_LOAD_INFO item in the returned config, and then changes the "force" checks for zpool import to use them if present. This method should give no change in behaviour for old userspace on new kernels (they won't know to look for the new config items) and for new userspace on old kernels (the won't find the new config items). Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15290 (cherry picked from commit 54b1b1d893cff26417fa8042776d7865d798d765) --- cmd/zpool/zpool_main.c | 23 +++++++++++++++---- module/zfs/spa.c | 18 +++++++++++++++ ...ostid_changed_cachefile_unclean_export.ksh | 20 +++++++++------- 3 files changed, 49 insertions(+), 12 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 91aac5566cc5..fc8f35c5eff4 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -3068,12 +3068,21 @@ zfs_force_import_required(nvlist_t *config) nvlist_t *nvinfo; state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE); - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); + nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); + + /* + * The hostid on LOAD_INFO comes from the MOS label via + * spa_tryimport(). If its not there then we're likely talking to an + * older kernel, so use the top one, which will be from the label + * discovered in zpool_find_import(), or if a cachefile is in use, the + * local hostid. 
+ */ + if (nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_HOSTID, &hostid) != 0) + nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); if (state != POOL_STATE_EXPORTED && hostid != get_system_hostid()) return (B_TRUE); - nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) { mmp_state_t mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); @@ -3143,7 +3152,10 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, uint64_t timestamp = 0; uint64_t hostid = 0; - if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME)) + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_HOSTNAME)) + hostname = fnvlist_lookup_string(nvinfo, + ZPOOL_CONFIG_HOSTNAME); + else if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME)) hostname = fnvlist_lookup_string(config, ZPOOL_CONFIG_HOSTNAME); @@ -3151,7 +3163,10 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, timestamp = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP); - if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID)) + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_HOSTID)) + hostid = fnvlist_lookup_uint64(nvinfo, + ZPOOL_CONFIG_HOSTID); + else if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 3b7e375a5811..bec3b5cbb8d0 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -4200,6 +4200,24 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, rvd = mrvd; spa_config_exit(spa, SCL_ALL, FTAG); + /* + * If 'zpool import' used a cached config, then the on-disk hostid and + * hostname may be different to the cached config in ways that should + * prevent import. Userspace can't discover this without a scan, but + * we know, so we add these values to LOAD_INFO so the caller can know + * the difference. + * + * Note that we have to do this before the config is regenerated, + * because the new config will have the hostid and hostname for this + * host, in readiness for import. + */ + if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID)) + fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID, + fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID)); + if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME)) + fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME, + fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME)); + /* * We will use spa_config if we decide to reload the spa or if spa_load * fails and we rewind. We must thus regenerate the config using the diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh index 8362d915f0cf..dcb1ac1ab69f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh @@ -20,8 +20,8 @@ # # DESCRIPTION: -# A pool that wasn't cleanly exported should be importable from a cachefile -# without force even if the local hostid doesn't match the on-disk hostid. +# A pool that wasn't cleanly exported should not be importable from a cachefile +# without force if the local hostid doesn't match the on-disk hostid. # # STRATEGY: # 1. Set a hostid. @@ -32,8 +32,9 @@ # 4.2. Export the pool. # 4.3. Restore the device state from the copy. # 5. Change the hostid. 
-# 6. Verify that importing the pool from the cachefile succeeds -# without force. +# 6. Verify that importing the pool from the cachefile fails. +# 7. Verify that importing the pool from the cachefile with force +# succeeds. # verify_runnable "global" @@ -64,8 +65,11 @@ log_must rm -f $VDEV0.bak # 5. Change the hostid. log_must zgenhostid -f $HOSTID2 -# 6. Verify that importing the pool from the cachefile succeeds without force. -log_must zpool import -c $CPATHBKP $TESTPOOL1 +# 6. Verify that importing the pool from the cachefile fails. +log_mustnot zpool import -c $CPATHBKP $TESTPOOL1 -log_pass "zpool import can import pool from cachefile if not cleanly " \ - "exported when hostid changes." +# 7. Verify that importing the pool from the cachefile with force succeeds. +log_must zpool import -f -c $CPATHBKP $TESTPOOL1 + +log_pass "zpool import from cachefile requires force if not cleanly " \ + "exported and hostid changes." From 4296626ab046a2892133764c65a9e40e67276d96 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Tue, 5 Dec 2023 15:27:56 -0700 Subject: [PATCH 25/32] Extend import_progress kstat with a notes field Detail the import progress of log spacemaps as they can take a very long time. Also grab the spa_note() messages to, as they provide insight into what is happening Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Don Brady Co-authored-by: Allan Jude Closes #15539 --- include/sys/spa.h | 4 + module/zfs/spa.c | 40 +++++- module/zfs/spa_log_spacemap.c | 12 +- module/zfs/spa_misc.c | 74 +++++++++- tests/runfiles/common.run | 3 +- .../zpool_import/zpool_import_status.ksh | 132 ++++++++++++++++++ 6 files changed, 255 insertions(+), 10 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh diff --git a/include/sys/spa.h b/include/sys/spa.h index 262f7d482802..63057ed2a2d1 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1004,6 +1004,10 @@ extern int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t max_txg); extern int spa_import_progress_set_state(uint64_t pool_guid, spa_load_state_t spa_load_state); +extern void spa_import_progress_set_notes(spa_t *spa, + const char *fmt, ...) __printflike(2, 3); +extern void spa_import_progress_set_notes_nolog(spa_t *spa, + const char *fmt, ...) __printflike(2, 3); /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, const void *tag, diff --git a/module/zfs/spa.c b/module/zfs/spa.c index bec3b5cbb8d0..d3b0cbde7078 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -3331,6 +3331,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) spa->spa_load_state = state; (void) spa_import_progress_set_state(spa_guid(spa), spa_load_state(spa)); + spa_import_progress_set_notes(spa, "spa_load()"); gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, type, &ereport); @@ -3551,7 +3552,7 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) uint64_t mmp_config = ub->ub_mmp_config; uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? 
MMP_SEQ(ub) : 0; uint64_t import_delay; - hrtime_t import_expire; + hrtime_t import_expire, now; nvlist_t *mmp_label = NULL; vdev_t *rvd = spa->spa_root_vdev; kcondvar_t cv; @@ -3589,7 +3590,17 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) import_expire = gethrtime() + import_delay; - while (gethrtime() < import_expire) { + spa_import_progress_set_notes(spa, "Checking MMP activity, waiting " + "%llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); + + int interations = 0; + while ((now = gethrtime()) < import_expire) { + if (interations++ % 30 == 0) { + spa_import_progress_set_notes(spa, "Checking MMP " + "activity, %llu ms remaining", + (u_longlong_t)NSEC2MSEC(import_expire - now)); + } + (void) spa_import_progress_set_mmp_check(spa_guid(spa), NSEC2SEC(import_expire - gethrtime())); @@ -5188,6 +5199,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) /* * Retrieve the checkpoint txg if the pool has a checkpoint. */ + spa_import_progress_set_notes(spa, "Loading checkpoint txg"); error = spa_ld_read_checkpoint_txg(spa); if (error != 0) return (error); @@ -5200,6 +5212,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * initiated. Otherwise we could be reading from indirect vdevs before * we have loaded their mappings. */ + spa_import_progress_set_notes(spa, "Loading indirect vdev metadata"); error = spa_ld_open_indirect_vdev_metadata(spa); if (error != 0) return (error); @@ -5208,6 +5221,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * Retrieve the full list of active features from the MOS and check if * they are all supported. */ + spa_import_progress_set_notes(spa, "Checking feature flags"); error = spa_ld_check_features(spa, &missing_feat_write); if (error != 0) return (error); @@ -5216,6 +5230,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * Load several special directories from the MOS needed by the dsl_pool * layer. */ + spa_import_progress_set_notes(spa, "Loading special MOS directories"); error = spa_ld_load_special_directories(spa); if (error != 0) return (error); @@ -5223,6 +5238,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) /* * Retrieve pool properties from the MOS. */ + spa_import_progress_set_notes(spa, "Loading properties"); error = spa_ld_get_props(spa); if (error != 0) return (error); @@ -5231,6 +5247,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * Retrieve the list of auxiliary devices - cache devices and spares - * and open them. */ + spa_import_progress_set_notes(spa, "Loading AUX vdevs"); error = spa_ld_open_aux_vdevs(spa, type); if (error != 0) return (error); @@ -5239,10 +5256,12 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * Load the metadata for all vdevs. Also check if unopenable devices * should be autoreplaced. */ + spa_import_progress_set_notes(spa, "Loading vdev metadata"); error = spa_ld_load_vdev_metadata(spa); if (error != 0) return (error); + spa_import_progress_set_notes(spa, "Loading dedup tables"); error = spa_ld_load_dedup_tables(spa); if (error != 0) return (error); @@ -5251,6 +5270,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * Verify the logs now to make sure we don't have any unexpected errors * when we claim log blocks later. 
*/ + spa_import_progress_set_notes(spa, "Verifying Log Devices"); error = spa_ld_verify_logs(spa, type, ereport); if (error != 0) return (error); @@ -5272,6 +5292,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * state. When performing an extreme rewind, we verify the whole pool, * which can take a very long time. */ + spa_import_progress_set_notes(spa, "Verifying pool data"); error = spa_ld_verify_pool_data(spa); if (error != 0) return (error); @@ -5281,6 +5302,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * we write anything to the pool because we'd need to update the space * accounting using the deflated sizes. */ + spa_import_progress_set_notes(spa, "Calculating deflated space"); spa_update_dspace(spa); /* @@ -5288,6 +5310,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * pool. If we are importing the pool in read-write mode, a few * additional steps must be performed to finish the import. */ + spa_import_progress_set_notes(spa, "Starting import"); if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { uint64_t config_cache_txg = spa->spa_config_txg; @@ -5304,6 +5327,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); } + spa_import_progress_set_notes(spa, "Claiming ZIL blocks"); /* * Traverse the ZIL and claim all blocks. */ @@ -5323,6 +5347,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * will have been set for us by ZIL traversal operations * performed above. */ + spa_import_progress_set_notes(spa, "Syncing ZIL claims"); txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* @@ -5330,6 +5355,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * next sync, we would update the config stored in vdev labels * and the cachefile (by default /etc/zfs/zpool.cache). */ + spa_import_progress_set_notes(spa, "Updating configs"); spa_ld_check_for_config_update(spa, config_cache_txg, update_config_cache); @@ -5338,6 +5364,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * Then check all DTLs to see if anything needs resilvering. * The resilver will be deferred if a rebuild was started. */ + spa_import_progress_set_notes(spa, "Starting resilvers"); if (vdev_rebuild_active(spa->spa_root_vdev)) { vdev_rebuild_restart(spa); } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && @@ -5351,6 +5378,8 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) */ spa_history_log_version(spa, "open", NULL); + spa_import_progress_set_notes(spa, + "Restarting device removals"); spa_restart_removal(spa); spa_spawn_aux_threads(spa); @@ -5363,19 +5392,26 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * auxiliary threads above (from which the livelist * deletion zthr is part of). */ + spa_import_progress_set_notes(spa, + "Cleaning up inconsistent objsets"); (void) dmu_objset_find(spa_name(spa), dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); /* * Clean up any stale temporary dataset userrefs. 
*/ + spa_import_progress_set_notes(spa, + "Cleaning up temporary userrefs"); dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + spa_import_progress_set_notes(spa, "Restarting initialize"); vdev_initialize_restart(spa->spa_root_vdev); + spa_import_progress_set_notes(spa, "Restarting TRIM"); vdev_trim_restart(spa->spa_root_vdev); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); + spa_import_progress_set_notes(spa, "Finished importing"); } spa_import_progress_remove(spa_guid(spa)); diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c index 9044d3c1668b..325a533bf098 100644 --- a/module/zfs/spa_log_spacemap.c +++ b/module/zfs/spa_log_spacemap.c @@ -1155,6 +1155,7 @@ spa_ld_log_sm_data(spa_t *spa) uint_t pn = 0; uint64_t ps = 0; + uint64_t nsm = 0; psls = sls = avl_first(&spa->spa_sm_logs_by_txg); while (sls != NULL) { /* Prefetch log spacemaps up to 16 TXGs or MBs ahead. */ @@ -1187,6 +1188,10 @@ spa_ld_log_sm_data(spa_t *spa) summary_add_data(spa, sls->sls_txg, sls->sls_mscount, 0, sls->sls_nblocks); + spa_import_progress_set_notes_nolog(spa, + "Read %llu of %lu log space maps", (u_longlong_t)nsm, + avl_numnodes(&spa->spa_sm_logs_by_txg)); + struct spa_ld_log_sm_arg vla = { .slls_spa = spa, .slls_txg = sls->sls_txg @@ -1202,6 +1207,7 @@ spa_ld_log_sm_data(spa_t *spa) pn--; ps -= space_map_length(sls->sls_sm); + nsm++; space_map_close(sls->sls_sm); sls->sls_sm = NULL; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls); @@ -1212,11 +1218,11 @@ spa_ld_log_sm_data(spa_t *spa) hrtime_t read_logs_endtime = gethrtime(); spa_load_note(spa, - "read %llu log space maps (%llu total blocks - blksz = %llu bytes) " - "in %lld ms", (u_longlong_t)avl_numnodes(&spa->spa_sm_logs_by_txg), + "Read %lu log space maps (%llu total blocks - blksz = %llu bytes) " + "in %lld ms", avl_numnodes(&spa->spa_sm_logs_by_txg), (u_longlong_t)spa_log_sm_nblocks(spa), (u_longlong_t)zfs_log_sm_blksz, - (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000)); + (longlong_t)NSEC2MSEC(read_logs_endtime - read_logs_starttime)); out: if (error != 0) { diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index ae95e8b64c1e..357f22ea56a2 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -417,6 +417,8 @@ spa_load_note(spa_t *spa, const char *fmt, ...) zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name, spa->spa_trust_config ? 
"trusted" : "untrusted", buf); + + spa_import_progress_set_notes_nolog(spa, "%s", buf); } /* @@ -2215,6 +2217,7 @@ typedef struct spa_import_progress { uint64_t pool_guid; /* unique id for updates */ char *pool_name; spa_load_state_t spa_load_state; + char *spa_load_notes; uint64_t mmp_sec_remaining; /* MMP activity check */ uint64_t spa_load_max_txg; /* rewind txg */ procfs_list_node_t smh_node; @@ -2225,9 +2228,9 @@ spa_history_list_t *spa_import_progress_list = NULL; static int spa_import_progress_show_header(struct seq_file *f) { - seq_printf(f, "%-20s %-14s %-14s %-12s %s\n", "pool_guid", + seq_printf(f, "%-20s %-14s %-14s %-12s %-16s %s\n", "pool_guid", "load_state", "multihost_secs", "max_txg", - "pool_name"); + "pool_name", "notes"); return (0); } @@ -2236,11 +2239,12 @@ spa_import_progress_show(struct seq_file *f, void *data) { spa_import_progress_t *sip = (spa_import_progress_t *)data; - seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %s\n", + seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %-16s %s\n", (u_longlong_t)sip->pool_guid, (u_longlong_t)sip->spa_load_state, (u_longlong_t)sip->mmp_sec_remaining, (u_longlong_t)sip->spa_load_max_txg, - (sip->pool_name ? sip->pool_name : "-")); + (sip->pool_name ? sip->pool_name : "-"), + (sip->spa_load_notes ? sip->spa_load_notes : "-")); return (0); } @@ -2254,6 +2258,8 @@ spa_import_progress_truncate(spa_history_list_t *shl, unsigned int size) sip = list_remove_head(&shl->procfs_list.pl_list); if (sip->pool_name) spa_strfree(sip->pool_name); + if (sip->spa_load_notes) + kmem_strfree(sip->spa_load_notes); kmem_free(sip, sizeof (spa_import_progress_t)); shl->size--; } @@ -2309,6 +2315,10 @@ spa_import_progress_set_state(uint64_t pool_guid, sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->spa_load_state = load_state; + if (sip->spa_load_notes != NULL) { + kmem_strfree(sip->spa_load_notes); + sip->spa_load_notes = NULL; + } error = 0; break; } @@ -2318,6 +2328,59 @@ spa_import_progress_set_state(uint64_t pool_guid, return (error); } +static void +spa_import_progress_set_notes_impl(spa_t *spa, boolean_t log_dbgmsg, + const char *fmt, va_list adx) +{ + spa_history_list_t *shl = spa_import_progress_list; + spa_import_progress_t *sip; + uint64_t pool_guid = spa_guid(spa); + + if (shl->size == 0) + return; + + char *notes = kmem_vasprintf(fmt, adx); + + mutex_enter(&shl->procfs_list.pl_lock); + for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; + sip = list_prev(&shl->procfs_list.pl_list, sip)) { + if (sip->pool_guid == pool_guid) { + if (sip->spa_load_notes != NULL) { + kmem_strfree(sip->spa_load_notes); + sip->spa_load_notes = NULL; + } + sip->spa_load_notes = notes; + if (log_dbgmsg) + zfs_dbgmsg("'%s' %s", sip->pool_name, notes); + notes = NULL; + break; + } + } + mutex_exit(&shl->procfs_list.pl_lock); + if (notes != NULL) + kmem_strfree(notes); +} + +void +spa_import_progress_set_notes(spa_t *spa, const char *fmt, ...) +{ + va_list adx; + + va_start(adx, fmt); + spa_import_progress_set_notes_impl(spa, B_TRUE, fmt, adx); + va_end(adx); +} + +void +spa_import_progress_set_notes_nolog(spa_t *spa, const char *fmt, ...) 
+{ + va_list adx; + + va_start(adx, fmt); + spa_import_progress_set_notes_impl(spa, B_FALSE, fmt, adx); + va_end(adx); +} + int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t load_max_txg) { @@ -2386,6 +2449,7 @@ spa_import_progress_add(spa_t *spa) poolname = spa_name(spa); sip->pool_name = spa_strdup(poolname); sip->spa_load_state = spa_load_state(spa); + sip->spa_load_notes = NULL; mutex_enter(&shl->procfs_list.pl_lock); procfs_list_add(&shl->procfs_list, sip); @@ -2405,6 +2469,8 @@ spa_import_progress_remove(uint64_t pool_guid) if (sip->pool_guid == pool_guid) { if (sip->pool_name) spa_strfree(sip->pool_name); + if (sip->spa_load_notes) + spa_strfree(sip->spa_load_notes); list_remove(&shl->procfs_list.pl_list, sip); shl->size--; kmem_free(sip, sizeof (spa_import_progress_t)); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 67493ca958ef..ae0ddcfb2fd4 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -415,7 +415,8 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'import_devices_missing', 'import_paths_changed', 'import_rewind_config_changed', - 'import_rewind_device_replaced'] + 'import_rewind_device_replaced', + 'zpool_import_status'] tags = ['functional', 'cli_root', 'zpool_import'] timeout = 1200 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh new file mode 100755 index 000000000000..c96961bf6419 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh @@ -0,0 +1,132 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2023 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg + +# +# DESCRIPTION: +# During a pool import, the 'import_progress' kstat contains details +# on the import progress. +# +# STRATEGY: +# 1. Create test pool with several devices +# 2. Generate some ZIL records and spacemap logs +# 3. Export the pool +# 4. Import the pool in the background and monitor the kstat content +# 5. 
Check the zfs debug messages for import progress +# + +verify_runnable "global" + +function cleanup +{ + log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 0 + log_must set_tunable64 METASLAB_DEBUG_LOAD 0 + + destroy_pool $TESTPOOL1 +} + +log_assert "During a pool import, the 'import_progress' kstat contains " \ + "notes on the progress" + +log_onexit cleanup + +log_must zpool create $TESTPOOL1 $VDEV0 $VDEV1 $VDEV2 +typeset guid=$(zpool get -H -o value guid $TESTPOOL1) + +log_must zfs create -o recordsize=8k $TESTPOOL1/fs +# +# This dd command works around an issue where ZIL records aren't created +# after freezing the pool unless a ZIL header already exists. Create a file +# synchronously to force ZFS to write one out. +# +log_must dd if=/dev/zero of=/$TESTPOOL1/fs/sync conv=fsync bs=1 count=1 + +# +# Overwrite some blocks to populate spacemap logs +# +log_must dd if=/dev/urandom of=/$TESTPOOL1/fs/00 bs=1M count=200 +sync_all_pools +log_must dd if=/dev/urandom of=/$TESTPOOL1/fs/00 bs=1M count=200 +sync_all_pools + +# +# Freeze the pool to retain intent log records +# +log_must zpool freeze $TESTPOOL1 + +# fill_fs [destdir] [dirnum] [filenum] [bytes] [num_writes] [data] +log_must fill_fs /$TESTPOOL1/fs 1 2000 100 1024 R + +log_must zpool list -v $TESTPOOL1 + +# +# Unmount filesystem and export the pool +# +# At this stage the zfs intent log contains +# a set of records to replay. +# +log_must zfs unmount /$TESTPOOL1/fs + +log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 1 +log_must zpool export $TESTPOOL1 + +log_must set_tunable64 METASLAB_DEBUG_LOAD 1 +log_note "Starting zpool import in background at" $(date +'%H:%M:%S') +zpool import -d $DEVICE_DIR -f $guid & +pid=$! + +# +# capture progress until import is finished +# +log_note waiting for pid $pid to exit +kstat import_progress +while [[ -d /proc/"$pid" ]]; do + line=$(kstat import_progress | grep -v pool_guid) + if [[ -n $line ]]; then + echo $line + fi + if [[ -f /$TESTPOOL1/fs/00 ]]; then + break; + fi + sleep 0.0001 +done +log_note "zpool import completed at" $(date +'%H:%M:%S') + +entries=$(kstat dbgmsg | grep "spa_import_progress_set_notes_impl(): 'testpool1'" | wc -l) +log_note "found $entries progress notes in dbgmsg" +log_must test $entries -gt 20 + +log_must zpool status $TESTPOOL1 + +log_pass "During a pool import, the 'import_progress' kstat contains " \ + "notes on the progress" From dbf9aed3cf97df8a4e53c52e39a1513a414428bb Mon Sep 17 00:00:00 2001 From: George Wilson Date: Mon, 22 Apr 2024 12:42:38 -0400 Subject: [PATCH 26/32] Parallel pool import This commit allow spa_load() to drop the spa_namespace_lock so that imports can happen concurrently. Prior to dropping the spa_namespace_lock, the import logic will set the spa_load_thread value to track the thread which is doing the import. Consumers of spa_lookup() retain the same behavior by blocking when either a thread is holding the spa_namespace_lock or the spa_load_thread value is set. This will ensure that critical concurrent operations cannot take place while a pool is being imported. The zpool command is also enhanced to provide multi-threaded support when invoking zpool import -a. Lastly, zinject provides a mechanism to insert artificial delays when importing a pool and new zfs tests are added to verify parallel import functionality. 
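The core of the locking change is a small handoff inside spa_load_impl(), roughly as
follows (a simplified sketch; the actual hunks are in the diff below):

    	/* once enough state is loaded, hand off the namespace lock */
    	spa->spa_load_thread = curthread;	/* record the importing thread */
    	mutex_exit(&spa_namespace_lock);	/* let other imports proceed */

    	/* ... long-running load steps run without the namespace lock ... */

    	/*
    	 * Consumers of spa_lookup() keep the same behaviour: they block
    	 * while either the namespace lock is held or spa_load_thread is
    	 * set, so critical operations still cannot race an in-progress
    	 * import.
    	 */
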
Contributions-by: Don Brady Reviewed-by: Brian Behlendorf Signed-off-by: George Wilson Closes #16093 --- cmd/zinject/zinject.c | 114 +++++++++++- cmd/zpool/zpool_main.c | 72 ++++++-- include/libzutil.h | 4 +- include/sys/spa.h | 2 + include/sys/spa_impl.h | 3 +- include/sys/zfs_ioctl.h | 4 +- include/sys/zio.h | 4 +- man/man8/zinject.8 | 8 + module/zfs/spa.c | 56 ++++-- module/zfs/spa_misc.c | 26 ++- module/zfs/vdev_initialize.c | 5 +- module/zfs/vdev_rebuild.c | 4 +- module/zfs/vdev_trim.c | 9 +- module/zfs/zio_inject.c | 138 ++++++++++++++- tests/runfiles/common.run | 3 +- .../cli_root/zpool_import/Makefile.am | 3 + .../zpool_import_parallel_admin.ksh | 165 ++++++++++++++++++ .../zpool_import_parallel_neg.ksh | 130 ++++++++++++++ .../zpool_import_parallel_pos.ksh | 137 +++++++++++++++ 19 files changed, 817 insertions(+), 70 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index bf97b0d68713..cfec120e3fb9 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2023-2024, Klara Inc. */ /* @@ -276,6 +277,11 @@ usage(void) "\t\tcreate 3 lanes on the device; one lane with a latency\n" "\t\tof 10 ms and two lanes with a 25 ms latency.\n" "\n" + "\tzinject -P import|export -s pool\n" + "\t\tAdd an artificial delay to a future pool import or export,\n" + "\t\tsuch that the operation takes a minimum of supplied seconds\n" + "\t\tto complete.\n" + "\n" "\tzinject -I [-s | -g ] pool\n" "\t\tCause the pool to stop writing blocks yet not\n" "\t\treport errors for a duration. Simulates buggy hardware\n" @@ -358,8 +364,10 @@ print_data_handler(int id, const char *pool, zinject_record_t *record, { int *count = data; - if (record->zi_guid != 0 || record->zi_func[0] != '\0') + if (record->zi_guid != 0 || record->zi_func[0] != '\0' || + record->zi_duration != 0) { return (0); + } if (*count == 0) { (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-4s " @@ -462,6 +470,33 @@ print_panic_handler(int id, const char *pool, zinject_record_t *record, return (0); } +static int +print_pool_delay_handler(int id, const char *pool, zinject_record_t *record, + void *data) +{ + int *count = data; + + if (record->zi_cmd != ZINJECT_DELAY_IMPORT && + record->zi_cmd != ZINJECT_DELAY_EXPORT) { + return (0); + } + + if (*count == 0) { + (void) printf("%3s %-19s %-11s %s\n", + "ID", "POOL", "DELAY (sec)", "COMMAND"); + (void) printf("--- ------------------- -----------" + " -------\n"); + } + + *count += 1; + + (void) printf("%3d %-19s %-11llu %s\n", + id, pool, (u_longlong_t)record->zi_duration, + record->zi_cmd == ZINJECT_DELAY_IMPORT ? "import": "export"); + + return (0); +} + /* * Print all registered error handlers. Returns the number of handlers * registered. 
@@ -492,6 +527,13 @@ print_all_handlers(void) count = 0; } + (void) iter_handlers(print_pool_delay_handler, &count); + if (count > 0) { + total += count; + (void) printf("\n"); + count = 0; + } + (void) iter_handlers(print_panic_handler, &count); return (count + total); @@ -564,9 +606,27 @@ register_handler(const char *pool, int flags, zinject_record_t *record, zc.zc_guid = flags; if (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_FAULT, &zc) != 0) { - (void) fprintf(stderr, "failed to add handler: %s\n", - errno == EDOM ? "block level exceeds max level of object" : - strerror(errno)); + const char *errmsg = strerror(errno); + + switch (errno) { + case EDOM: + errmsg = "block level exceeds max level of object"; + break; + case EEXIST: + if (record->zi_cmd == ZINJECT_DELAY_IMPORT) + errmsg = "pool already imported"; + if (record->zi_cmd == ZINJECT_DELAY_EXPORT) + errmsg = "a handler already exists"; + break; + case ENOENT: + /* import delay injector running on older zfs module */ + if (record->zi_cmd == ZINJECT_DELAY_IMPORT) + errmsg = "import delay injector not supported"; + break; + default: + break; + } + (void) fprintf(stderr, "failed to add handler: %s\n", errmsg); return (1); } @@ -591,6 +651,9 @@ register_handler(const char *pool, int flags, zinject_record_t *record, } else if (record->zi_duration < 0) { (void) printf(" txgs: %lld \n", (u_longlong_t)-record->zi_duration); + } else if (record->zi_timer > 0) { + (void) printf(" timer: %lld ms\n", + (u_longlong_t)NSEC2MSEC(record->zi_timer)); } else { (void) printf("objset: %llu\n", (u_longlong_t)record->zi_objset); @@ -789,7 +852,7 @@ main(int argc, char **argv) } while ((c = getopt(argc, argv, - ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) { + ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:P:")) != -1) { switch (c) { case 'a': flags |= ZINJECT_FLUSH_ARC; @@ -919,6 +982,19 @@ main(int argc, char **argv) sizeof (record.zi_func)); record.zi_cmd = ZINJECT_PANIC; break; + case 'P': + if (strcasecmp(optarg, "import") == 0) { + record.zi_cmd = ZINJECT_DELAY_IMPORT; + } else if (strcasecmp(optarg, "export") == 0) { + record.zi_cmd = ZINJECT_DELAY_EXPORT; + } else { + (void) fprintf(stderr, "invalid command '%s': " + "must be 'import' or 'export'\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; case 'q': quiet = 1; break; @@ -998,7 +1074,7 @@ main(int argc, char **argv) argc -= optind; argv += optind; - if (record.zi_duration != 0) + if (record.zi_duration != 0 && record.zi_cmd == 0) record.zi_cmd = ZINJECT_IGNORED_WRITES; if (cancel != NULL) { @@ -1128,8 +1204,8 @@ main(int argc, char **argv) if (raw != NULL || range != NULL || type != TYPE_INVAL || level != 0 || device != NULL || record.zi_freq > 0 || dvas != 0) { - (void) fprintf(stderr, "panic (-p) incompatible with " - "other options\n"); + (void) fprintf(stderr, "%s incompatible with other " + "options\n", "import|export delay (-P)"); usage(); libzfs_fini(g_zfs); return (2); @@ -1147,6 +1223,28 @@ main(int argc, char **argv) if (argv[1] != NULL) record.zi_type = atoi(argv[1]); dataset[0] = '\0'; + } else if (record.zi_cmd == ZINJECT_DELAY_IMPORT || + record.zi_cmd == ZINJECT_DELAY_EXPORT) { + if (raw != NULL || range != NULL || type != TYPE_INVAL || + level != 0 || device != NULL || record.zi_freq > 0 || + dvas != 0) { + (void) fprintf(stderr, "%s incompatible with other " + "options\n", "import|export delay (-P)"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + if (argc != 1 || record.zi_duration <= 0) { + (void) fprintf(stderr, "import|export delay (-P) " + "injection 
requires a duration (-s) and a single " + "pool name\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + (void) strlcpy(pool, argv[0], sizeof (pool)); } else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) { if (raw != NULL || range != NULL || type != TYPE_INVAL || level != 0 || record.zi_freq > 0 || dvas != 0) { diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index fc8f35c5eff4..bb8803db32b6 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -3211,15 +3212,40 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, return (ret); } +typedef struct import_parameters { + nvlist_t *ip_config; + const char *ip_mntopts; + nvlist_t *ip_props; + int ip_flags; + int *ip_err; +} import_parameters_t; + +static void +do_import_task(void *arg) +{ + import_parameters_t *ip = arg; + *ip->ip_err |= do_import(ip->ip_config, NULL, ip->ip_mntopts, + ip->ip_props, ip->ip_flags); + free(ip); +} + + static int import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, - char *orig_name, char *new_name, - boolean_t do_destroyed, boolean_t pool_specified, boolean_t do_all, - importargs_t *import) + char *orig_name, char *new_name, importargs_t *import) { nvlist_t *config = NULL; nvlist_t *found_config = NULL; uint64_t pool_state; + boolean_t pool_specified = (import->poolname != NULL || + import->guid != 0); + + + tpool_t *tp = NULL; + if (import->do_all) { + tp = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN), + 0, NULL); + } /* * At this point we have a list of import candidate configs. Even if @@ -3236,9 +3262,11 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); - if (!do_destroyed && pool_state == POOL_STATE_DESTROYED) + if (!import->do_destroyed && + pool_state == POOL_STATE_DESTROYED) continue; - if (do_destroyed && pool_state != POOL_STATE_DESTROYED) + if (import->do_destroyed && + pool_state != POOL_STATE_DESTROYED) continue; verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, @@ -3247,12 +3275,21 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, if (!pool_specified) { if (first) first = B_FALSE; - else if (!do_all) + else if (!import->do_all) (void) printf("\n"); - if (do_all) { - err |= do_import(config, NULL, mntopts, - props, flags); + if (import->do_all) { + import_parameters_t *ip = safe_malloc( + sizeof (import_parameters_t)); + + ip->ip_config = config; + ip->ip_mntopts = mntopts; + ip->ip_props = props; + ip->ip_flags = flags; + ip->ip_err = &err; + + (void) tpool_dispatch(tp, do_import_task, + (void *)ip); } else { /* * If we're importing from cachefile, then @@ -3300,6 +3337,10 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, found_config = config; } } + if (import->do_all) { + tpool_wait(tp); + tpool_destroy(tp); + } /* * If we were searching for a specific pool, verify that we found a @@ -3529,7 +3570,6 @@ zpool_do_import(int argc, char **argv) boolean_t xtreme_rewind = B_FALSE; boolean_t do_scan = B_FALSE; boolean_t pool_exists = B_FALSE; - boolean_t pool_specified = B_FALSE; uint64_t txg = -1ULL; char *cachefile = NULL; importargs_t idata = { 0 }; @@ -3737,7 +3777,6 @@ zpool_do_import(int argc, char **argv) searchname = argv[0]; searchguid = 0; } - pool_specified = B_TRUE; /* * User specified a name or guid. Ensure it's unique. 
@@ -3778,6 +3817,8 @@ zpool_do_import(int argc, char **argv) idata.cachefile = cachefile; idata.scan = do_scan; idata.policy = policy; + idata.do_destroyed = do_destroyed; + idata.do_all = do_all; pools = zpool_search_import(g_zfs, &idata, &libzfs_config_ops); @@ -3817,9 +3858,7 @@ zpool_do_import(int argc, char **argv) } err = import_pools(pools, props, mntopts, flags, - argc >= 1 ? argv[0] : NULL, - argc >= 2 ? argv[1] : NULL, - do_destroyed, pool_specified, do_all, &idata); + argc >= 1 ? argv[0] : NULL, argc >= 2 ? argv[1] : NULL, &idata); /* * If we're using the cachefile and we failed to import, then @@ -3840,9 +3879,8 @@ zpool_do_import(int argc, char **argv) pools = zpool_search_import(g_zfs, &idata, &libzfs_config_ops); err = import_pools(pools, props, mntopts, flags, - argc >= 1 ? argv[0] : NULL, - argc >= 2 ? argv[1] : NULL, - do_destroyed, pool_specified, do_all, &idata); + argc >= 1 ? argv[0] : NULL, argc >= 2 ? argv[1] : NULL, + &idata); } error: diff --git a/include/libzutil.h b/include/libzutil.h index 6b9facdf9cbe..fca0646d7f9c 100644 --- a/include/libzutil.h +++ b/include/libzutil.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018 by Delphix. All rights reserved. + * Copyright (c) 2018, 2024 by Delphix. All rights reserved. */ #ifndef _LIBZUTIL_H @@ -68,6 +68,8 @@ typedef struct importargs { boolean_t can_be_active; /* can the pool be active? */ boolean_t scan; /* prefer scanning to libblkid cache */ nvlist_t *policy; /* load policy (max txg, rewind, etc.) */ + boolean_t do_destroyed; + boolean_t do_all; } importargs_t; extern nvlist_t *zpool_search_import(void *, importargs_t *, diff --git a/include/sys/spa.h b/include/sys/spa.h index 63057ed2a2d1..ab7d42603dde 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -854,6 +854,8 @@ extern int zfs_sync_pass_deferred_free; /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; +extern avl_tree_t spa_namespace_avl; +extern kcondvar_t spa_namespace_cv; /* * SPA configuration functions in spa_config.c diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index d9f1b5c4f327..6354fee7f005 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -231,6 +231,7 @@ struct spa { dsl_pool_t *spa_dsl_pool; boolean_t spa_is_initializing; /* true while opening pool */ boolean_t spa_is_exporting; /* true while exporting pool */ + kthread_t *spa_load_thread; /* loading, no namespace lock */ metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_log_class; /* intent log data class */ metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */ diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index 1ca3f211b56d..a78f5cf7fc59 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2020 by Delphix. All rights reserved. + * Copyright (c) 2012, 2024 by Delphix. All rights reserved. * Copyright 2016 RackTop Systems. 
* Copyright (c) 2017, Intel Corporation. */ @@ -447,6 +447,8 @@ typedef enum zinject_type { ZINJECT_PANIC, ZINJECT_DELAY_IO, ZINJECT_DECRYPT_FAULT, + ZINJECT_DELAY_IMPORT, + ZINJECT_DELAY_EXPORT, } zinject_type_t; typedef struct zfs_share { diff --git a/include/sys/zio.h b/include/sys/zio.h index 9adeb5f1e8e8..2d65a777f202 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2020 by Delphix. All rights reserved. + * Copyright (c) 2012, 2024 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright 2016 Toomas Soome @@ -685,6 +685,8 @@ extern int zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, extern int zio_handle_label_injection(zio_t *zio, int error); extern void zio_handle_ignored_writes(zio_t *zio); extern hrtime_t zio_handle_io_delay(zio_t *zio); +extern void zio_handle_import_delay(spa_t *spa, hrtime_t elapsed); +extern void zio_handle_export_delay(spa_t *spa, hrtime_t elapsed); /* * Checksum ereport functions diff --git a/man/man8/zinject.8 b/man/man8/zinject.8 index a29346929988..93083e1fbf9a 100644 --- a/man/man8/zinject.8 +++ b/man/man8/zinject.8 @@ -127,6 +127,14 @@ Force a vdev error. . .It Xo .Nm zinject +.Fl i Ar seconds +.Ar pool +.Xc +Add an artificial delay during the future import of a pool. +This injector is automatically cleared after the import is finished. +. +.It Xo +.Nm zinject .Fl I .Op Fl s Ar seconds Ns | Ns Fl g Ar txgs .Ar pool diff --git a/module/zfs/spa.c b/module/zfs/spa.c index d3b0cbde7078..41fce777c1a5 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -3227,8 +3227,6 @@ spa_spawn_aux_threads(spa_t *spa) { ASSERT(spa_writeable(spa)); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - spa_start_indirect_condensing_thread(spa); spa_start_livelist_destroy_thread(spa); spa_start_livelist_condensing_thread(spa); @@ -4905,7 +4903,8 @@ spa_ld_read_checkpoint_txg(spa_t *spa) int error = 0; ASSERT0(spa->spa_checkpoint_txg); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), @@ -5152,6 +5151,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) boolean_t checkpoint_rewind = (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); boolean_t update_config_cache = B_FALSE; + hrtime_t load_start = gethrtime(); ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); @@ -5196,13 +5196,19 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) return (error); } + /* + * Drop the namespace lock for the rest of the function. + */ + spa->spa_load_thread = curthread; + mutex_exit(&spa_namespace_lock); + /* * Retrieve the checkpoint txg if the pool has a checkpoint. */ spa_import_progress_set_notes(spa, "Loading checkpoint txg"); error = spa_ld_read_checkpoint_txg(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve the mapping of indirect vdevs. 
Those vdevs were removed @@ -5215,7 +5221,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) spa_import_progress_set_notes(spa, "Loading indirect vdev metadata"); error = spa_ld_open_indirect_vdev_metadata(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve the full list of active features from the MOS and check if @@ -5224,7 +5230,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) spa_import_progress_set_notes(spa, "Checking feature flags"); error = spa_ld_check_features(spa, &missing_feat_write); if (error != 0) - return (error); + goto fail; /* * Load several special directories from the MOS needed by the dsl_pool @@ -5233,7 +5239,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) spa_import_progress_set_notes(spa, "Loading special MOS directories"); error = spa_ld_load_special_directories(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve pool properties from the MOS. @@ -5241,7 +5247,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) spa_import_progress_set_notes(spa, "Loading properties"); error = spa_ld_get_props(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve the list of auxiliary devices - cache devices and spares - @@ -5250,7 +5256,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) spa_import_progress_set_notes(spa, "Loading AUX vdevs"); error = spa_ld_open_aux_vdevs(spa, type); if (error != 0) - return (error); + goto fail; /* * Load the metadata for all vdevs. Also check if unopenable devices @@ -5259,12 +5265,12 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) spa_import_progress_set_notes(spa, "Loading vdev metadata"); error = spa_ld_load_vdev_metadata(spa); if (error != 0) - return (error); + goto fail; spa_import_progress_set_notes(spa, "Loading dedup tables"); error = spa_ld_load_dedup_tables(spa); if (error != 0) - return (error); + goto fail; /* * Verify the logs now to make sure we don't have any unexpected errors @@ -5273,7 +5279,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) spa_import_progress_set_notes(spa, "Verifying Log Devices"); error = spa_ld_verify_logs(spa, type, ereport); if (error != 0) - return (error); + goto fail; if (missing_feat_write) { ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); @@ -5283,8 +5289,9 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * read-only mode but not read-write mode. We now have enough * information and can return to userland. */ - return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, - ENOTSUP)); + error = spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, + ENOTSUP); + goto fail; } /* @@ -5295,7 +5302,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) spa_import_progress_set_notes(spa, "Verifying pool data"); error = spa_ld_verify_pool_data(spa); if (error != 0) - return (error); + goto fail; /* * Calculate the deflated space for the pool. 
This must be done before @@ -5413,13 +5420,19 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) spa_config_exit(spa, SCL_CONFIG, FTAG); spa_import_progress_set_notes(spa, "Finished importing"); } + zio_handle_import_delay(spa, gethrtime() - load_start); spa_import_progress_remove(spa_guid(spa)); spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_load_note(spa, "LOADED"); +fail: + mutex_enter(&spa_namespace_lock); + spa->spa_load_thread = NULL; + cv_broadcast(&spa_namespace_cv); + + return (error); - return (0); } static int @@ -6687,9 +6700,14 @@ spa_tryimport(nvlist_t *tryconfig) /* * Create and initialize the spa structure. */ + char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP); + (void) snprintf(name, MAXPATHLEN, "%s-%llx-%s", + TRYIMPORT_NAME, (u_longlong_t)curthread, poolname); + mutex_enter(&spa_namespace_lock); - spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); + spa = spa_add(name, tryconfig, NULL); spa_activate(spa, SPA_MODE_READ); + kmem_free(name, MAXPATHLEN); /* * Rewind pool if a max txg was provided. @@ -6829,6 +6847,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, int error; spa_t *spa; boolean_t force_removal, modifying; + hrtime_t export_start = gethrtime(); if (oldconfig) *oldconfig = NULL; @@ -7040,6 +7059,9 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, spa->spa_is_exporting = B_FALSE; } + if (new_state == POOL_STATE_EXPORTED) + zio_handle_export_delay(spa, gethrtime() - export_start); + mutex_exit(&spa_namespace_lock); return (0); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 357f22ea56a2..b87c69bb57c1 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -79,7 +79,8 @@ * - Check if spa_refcount is zero * - Rename a spa_t * - add/remove/attach/detach devices - * - Held for the duration of create/destroy/import/export + * - Held for the duration of create/destroy/export + * - Held at the start and end of import * * It does not need to handle recursion. A create or destroy may * reference objects (files or zvols) in other pools, but by @@ -232,9 +233,9 @@ * locking is, always, based on spa_namespace_lock and spa_config_lock[]. 
*/ -static avl_tree_t spa_namespace_avl; +avl_tree_t spa_namespace_avl; kmutex_t spa_namespace_lock; -static kcondvar_t spa_namespace_cv; +kcondvar_t spa_namespace_cv; int spa_max_replication_override = SPA_DVAS_PER_BP; static kmutex_t spa_spare_lock; @@ -606,6 +607,7 @@ spa_lookup(const char *name) ASSERT(MUTEX_HELD(&spa_namespace_lock)); +retry: (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); /* @@ -617,6 +619,14 @@ spa_lookup(const char *name) *cp = '\0'; spa = avl_find(&spa_namespace_avl, &search, &where); + if (spa == NULL) + return (NULL); + + if (spa->spa_load_thread != NULL && + spa->spa_load_thread != curthread) { + cv_wait(&spa_namespace_cv, &spa_namespace_lock); + goto retry; + } return (spa); } @@ -714,6 +724,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa_config_lock_init(spa); spa_stats_init(spa); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); avl_add(&spa_namespace_avl, spa); /* @@ -808,7 +819,6 @@ spa_remove(spa_t *spa) nvlist_free(spa->spa_config_splitting); avl_remove(&spa_namespace_avl, spa); - cv_broadcast(&spa_namespace_cv); if (spa->spa_root) spa_strfree(spa->spa_root); @@ -903,7 +913,8 @@ void spa_open_ref(spa_t *spa, void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref || - MUTEX_HELD(&spa_namespace_lock)); + MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); (void) zfs_refcount_add(&spa->spa_refcount, tag); } @@ -929,7 +940,8 @@ void spa_close(spa_t *spa, void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref || - MUTEX_HELD(&spa_namespace_lock)); + MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); spa_close_common(spa, tag); } diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index d4e08f1b7724..a88f16201743 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + * Copyright (c) 2016, 2024 by Delphix. All rights reserved. */ #include @@ -729,7 +729,8 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) void vdev_initialize_restart(vdev_t *vd) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_load_thread == curthread); ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_leaf_zap != 0) { diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 59dcfc80c7a3..0ac062797687 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -22,6 +22,7 @@ * * Copyright (c) 2018, Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + * Copyright (c) 2024 by Delphix. All rights reserved. */ #include @@ -1067,7 +1068,8 @@ vdev_rebuild_restart_impl(vdev_t *vd) void vdev_rebuild_restart(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); vdev_rebuild_restart_impl(spa->spa_root_vdev); } diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index f07c747d1f69..6b3599703d79 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2016, 2024 by Delphix. All rights reserved. * Copyright (c) 2019 by Lawrence Livermore National Security, LLC. 
* Copyright (c) 2021 Hewlett Packard Enterprise Development LP */ @@ -1129,7 +1129,8 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) void vdev_trim_restart(vdev_t *vd) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_load_thread == curthread); ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_leaf_zap != 0) { @@ -1523,8 +1524,8 @@ vdev_autotrim_stop_all(spa_t *spa) void vdev_autotrim_restart(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); if (spa->spa_autotrim) vdev_autotrim(spa); } diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index feaf41dc65e3..ed502d3e2d52 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2024, Klara Inc. */ /* @@ -59,6 +60,7 @@ uint32_t zio_injection_enabled = 0; typedef struct inject_handler { int zi_id; spa_t *zi_spa; + char *zi_spa_name; /* ZINJECT_DELAY_IMPORT only */ zinject_record_t zi_record; uint64_t *zi_lanes; int zi_next_lane; @@ -699,6 +701,63 @@ zio_handle_io_delay(zio_t *zio) return (min_target); } +static void +zio_handle_pool_delay(spa_t *spa, hrtime_t elapsed, zinject_type_t command) +{ + inject_handler_t *handler; + hrtime_t delay = 0; + int id = 0; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); + handler != NULL && handler->zi_record.zi_cmd == command; + handler = list_next(&inject_handlers, handler)) { + ASSERT3P(handler->zi_spa_name, !=, NULL); + if (strcmp(spa_name(spa), handler->zi_spa_name) == 0) { + uint64_t pause = + SEC2NSEC(handler->zi_record.zi_duration); + if (pause > elapsed) { + delay = pause - elapsed; + } + id = handler->zi_id; + break; + } + } + + rw_exit(&inject_lock); + + if (delay) { + if (command == ZINJECT_DELAY_IMPORT) { + spa_import_progress_set_notes(spa, "injecting %llu " + "sec delay", (u_longlong_t)NSEC2SEC(delay)); + } + zfs_sleep_until(gethrtime() + delay); + } + if (id) { + /* all done with this one-shot handler */ + zio_clear_fault(id); + } +} + +/* + * For testing, inject a delay during an import + */ +void +zio_handle_import_delay(spa_t *spa, hrtime_t elapsed) +{ + zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_IMPORT); +} + +/* + * For testing, inject a delay during an export + */ +void +zio_handle_export_delay(spa_t *spa, hrtime_t elapsed) +{ + zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT); +} + static int zio_calculate_range(const char *pool, zinject_record_t *record) { @@ -756,6 +815,28 @@ zio_calculate_range(const char *pool, zinject_record_t *record) return (0); } +static boolean_t +zio_pool_handler_exists(const char *name, zinject_type_t command) +{ + boolean_t exists = B_FALSE; + + rw_enter(&inject_lock, RW_READER); + for (inject_handler_t *handler = list_head(&inject_handlers); + handler != NULL; handler = list_next(&inject_handlers, handler)) { + if (command != handler->zi_record.zi_cmd) + continue; + + const char *pool = (handler->zi_spa_name != NULL) ? + handler->zi_spa_name : spa_name(handler->zi_spa); + if (strcmp(name, pool) == 0) { + exists = B_TRUE; + break; + } + } + rw_exit(&inject_lock); + + return (exists); +} /* * Create a new handler for the given record. 
We add it to the list, adding * a reference to the spa_t in the process. We increment zio_injection_enabled, @@ -806,16 +887,42 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) if (!(flags & ZINJECT_NULL)) { /* - * spa_inject_ref() will add an injection reference, which will - * prevent the pool from being removed from the namespace while - * still allowing it to be unloaded. + * Pool delays for import or export don't take an + * injection reference on the spa. Instead they + * rely on matching by name. */ - if ((spa = spa_inject_addref(name)) == NULL) - return (SET_ERROR(ENOENT)); + if (record->zi_cmd == ZINJECT_DELAY_IMPORT || + record->zi_cmd == ZINJECT_DELAY_EXPORT) { + if (record->zi_duration <= 0) + return (SET_ERROR(EINVAL)); + /* + * Only one import | export delay handler per pool. + */ + if (zio_pool_handler_exists(name, record->zi_cmd)) + return (SET_ERROR(EEXIST)); + + mutex_enter(&spa_namespace_lock); + boolean_t has_spa = spa_lookup(name) != NULL; + mutex_exit(&spa_namespace_lock); + + if (record->zi_cmd == ZINJECT_DELAY_IMPORT && has_spa) + return (SET_ERROR(EEXIST)); + if (record->zi_cmd == ZINJECT_DELAY_EXPORT && !has_spa) + return (SET_ERROR(ENOENT)); + spa = NULL; + } else { + /* + * spa_inject_ref() will add an injection reference, + * which will prevent the pool from being removed + * from the namespace while still allowing it to be + * unloaded. + */ + if ((spa = spa_inject_addref(name)) == NULL) + return (SET_ERROR(ENOENT)); + } handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); - - handler->zi_spa = spa; + handler->zi_spa = spa; /* note: can be NULL */ handler->zi_record = *record; if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { @@ -828,6 +935,11 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) handler->zi_next_lane = 0; } + if (handler->zi_spa == NULL) + handler->zi_spa_name = spa_strdup(name); + else + handler->zi_spa_name = NULL; + rw_enter(&inject_lock, RW_WRITER); /* @@ -887,7 +999,11 @@ zio_inject_list_next(int *id, char *name, size_t buflen, if (handler) { *record = handler->zi_record; *id = handler->zi_id; - (void) strncpy(name, spa_name(handler->zi_spa), buflen); + ASSERT(handler->zi_spa || handler->zi_spa_name); + if (handler->zi_spa != NULL) + (void) strncpy(name, spa_name(handler->zi_spa), buflen); + else + (void) strncpy(name, handler->zi_spa_name, buflen); ret = 0; } else { ret = SET_ERROR(ENOENT); @@ -937,7 +1053,11 @@ zio_clear_fault(int id) ASSERT3P(handler->zi_lanes, ==, NULL); } - spa_inject_delref(handler->zi_spa); + if (handler->zi_spa_name != NULL) + spa_strfree(handler->zi_spa_name); + + if (handler->zi_spa != NULL) + spa_inject_delref(handler->zi_spa); kmem_free(handler, sizeof (inject_handler_t)); atomic_dec_32(&zio_injection_enabled); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index ae0ddcfb2fd4..f1c064472e0b 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -416,7 +416,8 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'import_paths_changed', 'import_rewind_config_changed', 'import_rewind_device_replaced', - 'zpool_import_status'] + 'zpool_import_status', 'zpool_import_parallel_pos', + 'zpool_import_parallel_neg', 'zpool_import_parallel_admin'] tags = ['functional', 'cli_root', 'zpool_import'] timeout = 1200 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am index 226e804db38a..c9486a57b1d4 100644 --- 
a/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am @@ -44,6 +44,9 @@ dist_pkgdata_SCRIPTS = \ zpool_import_missing_002_pos.ksh \ zpool_import_missing_003_pos.ksh \ zpool_import_rename_001_pos.ksh \ + zpool_import_parallel_admin.ksh \ + zpool_import_parallel_neg.ksh \ + zpool_import_parallel_pos.ksh \ zpool_import_encrypted.ksh \ zpool_import_encrypted_load.ksh \ zpool_import_errata3.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh new file mode 100755 index 000000000000..c681d1b7dd23 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh @@ -0,0 +1,165 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2023 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# Verify that admin commands to different pool are not blocked by import +# +# STRATEGY: +# 1. Create 2 pools +# 2. Export one of the pools +# 4. Import the pool with an injected delay +# 5. Execute some admin commands against both pools +# 6. Verify that the admin commands to the non-imported pool don't stall +# + +verify_runnable "global" + +function cleanup +{ + zinject -c all + destroy_pool $TESTPOOL1 + destroy_pool $TESTPOOL2 +} + +function pool_import +{ + typeset dir=$1 + typeset pool=$2 + + SECONDS=0 + errmsg=$(zpool import -d $dir -f $pool 2>&1 > /dev/null) + if [[ $? -eq 0 ]]; then + echo ${pool}: imported in $SECONDS secs + echo $SECONDS > ${DEVICE_DIR}/${pool}-import + else + echo ${pool}: import failed $errmsg in $SECONDS secs + fi +} + +function pool_add_device +{ + typeset pool=$1 + typeset device=$2 + typeset devtype=$3 + + SECONDS=0 + errmsg=$(zpool add $pool $devtype $device 2>&1 > /dev/null) + if [[ $? -eq 0 ]]; then + echo ${pool}: added $devtype vdev in $SECONDS secs + echo $SECONDS > ${DEVICE_DIR}/${pool}-add + else + echo ${pool}: add $devtype vdev failed ${errmsg}, in $SECONDS secs + fi +} + +function pool_stats +{ + typeset stats=$1 + typeset pool=$2 + + SECONDS=0 + errmsg=$(zpool $stats $pool 2>&1 > /dev/null) + if [[ $? 
-eq 0 ]]; then + echo ${pool}: $stats in $SECONDS secs + echo $SECONDS > ${DEVICE_DIR}/${pool}-${stats} + else + echo ${pool}: $stats failed ${errmsg}, in $SECONDS secs + fi +} + +function pool_create +{ + typeset pool=$1 + typeset device=$2 + + SECONDS=0 + errmsg=$(zpool create $pool $device 2>&1 > /dev/null) + if [[ $? -eq 0 ]]; then + echo ${pool}: created in $SECONDS secs + echo $SECONDS > ${DEVICE_DIR}/${pool}-create + else + echo ${pool}: create failed ${errmsg}, in $SECONDS secs + fi +} + +log_assert "Simple admin commands to different pool not blocked by import" + +log_onexit cleanup + +# +# create two pools and export one +# +log_must zpool create $TESTPOOL1 $VDEV0 +log_must zpool export $TESTPOOL1 +log_must zpool create $TESTPOOL2 $VDEV1 + +# +# import pool asyncronously with an injected 10 second delay +# +log_must zinject -P import -s 10 $TESTPOOL1 +pool_import $DEVICE_DIR $TESTPOOL1 & + +sleep 2 + +# +# run some admin commands on the pools while the import is in progress +# + +pool_add_device $TESTPOOL1 $VDEV2 "log" & +pool_add_device $TESTPOOL2 $VDEV3 "cache" & +pool_stats "status" $TESTPOOL1 & +pool_stats "status" $TESTPOOL2 & +pool_stats "list" $TESTPOOL1 & +pool_stats "list" $TESTPOOL2 & +pool_create $TESTPOOL1 $VDEV4 & +wait + +log_must zpool sync $TESTPOOL1 $TESTPOOL2 + +zpool history $TESTPOOL1 +zpool history $TESTPOOL2 + +log_must test "5" -lt $(<${DEVICE_DIR}/${TESTPOOL1}-import) + +# +# verify that commands to second pool did not wait for import to finish +# +log_must test "2" -gt $(<${DEVICE_DIR}/${TESTPOOL2}-status) +log_must test "2" -gt $(<${DEVICE_DIR}/${TESTPOOL2}-list) +log_must test "2" -gt $(<${DEVICE_DIR}/${TESTPOOL2}-add) +[[ -e ${DEVICE_DIR}/${TESTPOOL1}-create ]] && log_fail "unexpected pool create" + +log_pass "Simple admin commands to different pool not blocked by import" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh new file mode 100755 index 000000000000..339dc2575ede --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh @@ -0,0 +1,130 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2023 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# Verify that pool imports by same name only have one winner +# +# STRATEGY: +# 1. Create 4 single disk pools with the same name +# 2. 
Generate some ZIL records (for a longer import) +# 3. Export the pools +# 4. Import the pools in parallel +# 5. Repeat with using matching guids +# + +verify_runnable "global" + +POOLNAME="import_pool" +DEV_DIR_PREFIX="$DEVICE_DIR/$POOLNAME" +VDEVSIZE=$((512 * 1024 * 1024)) + +log_assert "parallel pool imports by same name only have one winner" + +# each pool has its own device directory +for i in {0..3}; do + log_must mkdir -p ${DEV_DIR_PREFIX}$i + log_must truncate -s $VDEVSIZE ${DEV_DIR_PREFIX}$i/${DEVICE_FILE}$i +done + +function cleanup +{ + zinject -c all + log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 0 + log_must set_tunable64 METASLAB_DEBUG_LOAD 0 + + destroy_pool $POOLNAME + + log_must rm -rf $DEV_DIR_PREFIX* +} + +log_onexit cleanup + +log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 1 +log_must set_tunable64 METASLAB_DEBUG_LOAD 1 + +function import_pool +{ + typeset dir=$1 + typeset pool=$2 + typeset newname=$3 + + SECONDS=0 + errmsg=$(zpool import -N -d $dir -f $pool $newname 2>&1 > /dev/null) + if [[ $? -eq 0 ]]; then + touch $dir/imported + echo "imported $pool in $SECONDS secs" + elif [[ $errmsg == *"cannot import"* ]]; then + echo "pool import failed: $errmsg, waited $SECONDS secs" + touch $dir/failed + fi +} + +# +# create four exported pools with the same name +# +for i in {0..3}; do + log_must zpool create $POOLNAME ${DEV_DIR_PREFIX}$i/${DEVICE_FILE}$i + log_must zpool export $POOLNAME +done +log_must zinject -P import -s 10 $POOLNAME + +# +# import the pools in parallel, expecting only one winner +# +for i in {0..3}; do + import_pool ${DEV_DIR_PREFIX}$i $POOLNAME & +done +wait + +# check the result of background imports +typeset num_imports=0 +typeset num_cannot=0 +for i in {0..3}; do + if [[ -f ${DEV_DIR_PREFIX}$i/imported ]]; then + ((num_imports += 1)) + fi + if [[ -f ${DEV_DIR_PREFIX}$i/failed ]]; then + ((num_cannot += 1)) + loser=$i + fi +done +[[ $num_imports -eq "1" ]] || log_fail "expecting an import" +[[ $num_cannot -eq "3" ]] || \ + log_fail "expecting 3 pool exists errors, found $num_cannot" + +log_note "$num_imports imported and $num_cannot failed (expected)" + +log_pass "parallel pool imports by same name only have one winner" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh new file mode 100755 index 000000000000..71b2437a37ec --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh @@ -0,0 +1,137 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2023 Klara, Inc. 
+# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# test uses 8 vdevs +export MAX_NUM=8 + +# +# DESCRIPTION: +# Verify that pool imports can occur in parallel +# +# STRATEGY: +# 1. Create 8 pools +# 2. Generate some ZIL records +# 3. Export the pools +# 4. Import half of the pools synchronously to baseline sequential cost +# 5. Import the other half asynchronously to demonstrate parallel savings +# 6. Export 4 pools +# 7. Test zpool import -a +# + +verify_runnable "global" + +# +# override the minimum sized vdevs +# +VDEVSIZE=$((512 * 1024 * 1024)) +increase_device_sizes $VDEVSIZE + +POOLNAME="import_pool" + +function cleanup +{ + zinject -c all + log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 0 + log_must set_tunable64 METASLAB_DEBUG_LOAD 0 + + for i in {0..$(($MAX_NUM - 1))}; do + destroy_pool $POOLNAME-$i + done + # reset the devices + increase_device_sizes 0 + increase_device_sizes $FILE_SIZE +} + +log_assert "Pool imports can occur in parallel" + +log_onexit cleanup + +log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 1 +log_must set_tunable64 METASLAB_DEBUG_LOAD 1 + + +# +# create some exported pools with import delay injectors +# +for i in {0..$(($MAX_NUM - 1))}; do + log_must zpool create $POOLNAME-$i $DEVICE_DIR/${DEVICE_FILE}$i + log_must zpool export $POOLNAME-$i + log_must zinject -P import -s 12 $POOLNAME-$i +done +wait + +# +# import half of the pools synchronously +# +SECONDS=0 +for i in {0..3}; do + log_must zpool import -d $DEVICE_DIR -f $POOLNAME-$i +done +sequential_time=$SECONDS +log_note "sequentially imported 4 pools in $sequential_time seconds" + +# +# import half of the pools in parallel +# +SECONDS=0 +for i in {4..7}; do + log_must zpool import -d $DEVICE_DIR -f $POOLNAME-$i & +done +wait +parallel_time=$SECONDS +log_note "asyncronously imported 4 pools in $parallel_time seconds" + +log_must test $parallel_time -lt $(($sequential_time / 3)) + +# +# export pools with import delay injectors +# +for i in {4..7}; do + log_must zpool export $POOLNAME-$i + log_must zinject -P import -s 12 $POOLNAME-$i +done +wait + +# +# now test zpool import -a +# +SECONDS=0 +log_must zpool import -a -d $DEVICE_DIR -f +parallel_time=$SECONDS +log_note "asyncronously imported 4 pools in $parallel_time seconds" + +log_must test $parallel_time -lt $(($sequential_time / 3)) + +log_pass "Pool imports occur in parallel" From f2d25f56f5e18f6a5d84df19d4dc2b277d783168 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Mon, 29 Apr 2024 15:35:53 -0600 Subject: [PATCH 27/32] vdev probe to slow disk can stall mmp write checker Simplify vdev probes in the zio_vdev_io_done context to avoid holding the spa config lock for a long duration. Also allow zpool clear if no evidence of another host is using the pool. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Olaf Faaland Reviewed-by: Brian Behlendorf Signed-off-by: Don Brady Closes #15839 --- cmd/zpool/zpool_main.c | 2 +- include/sys/spa.h | 4 +- include/sys/uberblock_impl.h | 16 +-- include/sys/vdev_impl.h | 2 +- man/man8/zpool-clear.8 | 7 +- module/zfs/mmp.c | 5 +- module/zfs/spa.c | 102 ++++++++++++++---- module/zfs/txg.c | 9 ++ module/zfs/vdev.c | 22 ++-- module/zfs/vdev_label.c | 4 +- module/zfs/zfs_ioctl.c | 9 +- module/zfs/zio.c | 7 +- tests/runfiles/linux.run | 2 +- .../tests/functional/mmp/Makefile.am | 1 + .../functional/mmp/mmp_write_slow_disk.ksh | 97 +++++++++++++++++ 15 files changed, 239 insertions(+), 50 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index bb8803db32b6..90ca02482591 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -8464,7 +8464,7 @@ status_callback(zpool_handle_t *zhp, void *data) printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices" " are connected, then reboot your system and\n\timport the " - "pool.\n")); + "pool or run 'zpool clear' to resume the pool.\n")); break; case ZPOOL_STATUS_IO_FAILURE_WAIT: diff --git a/include/sys/spa.h b/include/sys/spa.h index ab7d42603dde..5f8d5f21b6a1 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -798,7 +798,7 @@ extern void spa_add_feature_stats(spa_t *spa, nvlist_t *config); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 -#define SPA_ASYNC_PROBE 0x04 +#define SPA_ASYNC_FAULT_VDEV 0x04 #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 #define SPA_ASYNC_AUTOEXPAND 0x20 @@ -1153,6 +1153,8 @@ extern uint32_t spa_get_hostid(spa_t *spa); extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); extern boolean_t spa_livelist_delete_check(spa_t *spa); +extern boolean_t spa_mmp_remote_host_activity(spa_t *spa); + extern spa_mode_t spa_mode(spa_t *spa); extern uint64_t zfs_strtonum(const char *str, char **nptr); diff --git a/include/sys/uberblock_impl.h b/include/sys/uberblock_impl.h index 91699e65131a..a14d5cf8e11e 100644 --- a/include/sys/uberblock_impl.h +++ b/include/sys/uberblock_impl.h @@ -50,20 +50,20 @@ extern "C" { #define MMP_SEQ_VALID_BIT 0x02 #define MMP_FAIL_INT_VALID_BIT 0x04 -#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \ - ubp->ub_mmp_magic == MMP_MAGIC) -#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ +#define MMP_VALID(ubp) ((ubp)->ub_magic == UBERBLOCK_MAGIC && \ + (ubp)->ub_mmp_magic == MMP_MAGIC) +#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_INTERVAL_VALID_BIT)) -#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ +#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_SEQ_VALID_BIT)) -#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ +#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_FAIL_INT_VALID_BIT)) -#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \ +#define MMP_INTERVAL(ubp) (((ubp)->ub_mmp_config & 0x00000000FFFFFF00) \ >> 8) -#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \ +#define MMP_SEQ(ubp) (((ubp)->ub_mmp_config & 0x0000FFFF00000000) \ >> 32) -#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \ +#define MMP_FAIL_INT(ubp) (((ubp)->ub_mmp_config & 0xFFFF000000000000) \ >> 48) #define MMP_INTERVAL_SET(write) \ diff --git 
a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 3cfde40a77fe..762e41fa40ce 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -290,7 +290,7 @@ struct vdev { txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ boolean_t vdev_remove_wanted; /* async remove wanted? */ - boolean_t vdev_probe_wanted; /* async probe wanted? */ + boolean_t vdev_fault_wanted; /* async faulted wanted? */ list_node_t vdev_config_dirty_node; /* config dirty list */ list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ diff --git a/man/man8/zpool-clear.8 b/man/man8/zpool-clear.8 index 0b256b28bd21..2a1222e8f1b1 100644 --- a/man/man8/zpool-clear.8 +++ b/man/man8/zpool-clear.8 @@ -49,9 +49,10 @@ If the pool was suspended it will be brought back online provided the devices can be accessed. Pools with .Sy multihost -enabled which have been suspended cannot be resumed. -While the pool was suspended, it may have been imported on -another host, and resuming I/O could result in pool damage. +enabled which have been suspended cannot be resumed when there is evidence +that the pool was imported by another host. +The same checks performed during an import will be applied before the clear +proceeds. . .Sh SEE ALSO .Xr zdb 8 , diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index f67a4eb22a2d..73de01a0bad3 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -662,12 +662,13 @@ mmp_thread(void *arg) (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) { zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu " "mmp_last_write %llu mmp_interval %llu " - "mmp_fail_intervals %llu mmp_fail_ns %llu", + "mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu", spa_name(spa), (u_longlong_t)gethrtime(), (u_longlong_t)mmp->mmp_last_write, (u_longlong_t)mmp_interval, (u_longlong_t)mmp_fail_intervals, - (u_longlong_t)mmp_fail_ns); + (u_longlong_t)mmp_fail_ns, + (u_longlong_t)spa->spa_uberblock.ub_txg); cmn_err(CE_WARN, "MMP writes to pool '%s' have not " "succeeded in over %llu ms; suspending pool. " "Hrtime %llu", diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 41fce777c1a5..97d1328fae9e 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -3539,11 +3539,16 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub) } /* - * Perform the import activity check. If the user canceled the import or - * we detected activity then fail. + * Remote host activity check. 
+ * + * error results: + * 0 - no activity detected + * EREMOTEIO - remote activity detected + * EINTR - user canceled the operation */ static int -spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) +spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config, + boolean_t importing) { uint64_t txg = ub->ub_txg; uint64_t timestamp = ub->ub_timestamp; @@ -3588,19 +3593,23 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) import_expire = gethrtime() + import_delay; - spa_import_progress_set_notes(spa, "Checking MMP activity, waiting " - "%llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); + if (importing) { + spa_import_progress_set_notes(spa, "Checking MMP activity, " + "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); + } - int interations = 0; + int iterations = 0; while ((now = gethrtime()) < import_expire) { - if (interations++ % 30 == 0) { + if (importing && iterations++ % 30 == 0) { spa_import_progress_set_notes(spa, "Checking MMP " "activity, %llu ms remaining", (u_longlong_t)NSEC2MSEC(import_expire - now)); } - (void) spa_import_progress_set_mmp_check(spa_guid(spa), - NSEC2SEC(import_expire - gethrtime())); + if (importing) { + (void) spa_import_progress_set_mmp_check(spa_guid(spa), + NSEC2SEC(import_expire - gethrtime())); + } vdev_uberblock_load(rvd, ub, &mmp_label); @@ -3682,6 +3691,61 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) return (error); } +/* + * Called from zfs_ioc_clear for a pool that was suspended + * after failing mmp write checks. + */ +boolean_t +spa_mmp_remote_host_activity(spa_t *spa) +{ + ASSERT(spa_multihost(spa) && spa_suspended(spa)); + + nvlist_t *best_label; + uberblock_t best_ub; + + /* + * Locate the best uberblock on disk + */ + vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label); + if (best_label) { + /* + * confirm that the best hostid matches our hostid + */ + if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) && + spa_get_hostid(spa) != + fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) { + nvlist_free(best_label); + return (B_TRUE); + } + nvlist_free(best_label); + } else { + return (B_TRUE); + } + + if (!MMP_VALID(&best_ub) || + !MMP_FAIL_INT_VALID(&best_ub) || + MMP_FAIL_INT(&best_ub) == 0) { + return (B_TRUE); + } + + if (best_ub.ub_txg != spa->spa_uberblock.ub_txg || + best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) { + zfs_dbgmsg("txg mismatch detected during pool clear " + "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu", + (u_longlong_t)spa->spa_uberblock.ub_txg, + (u_longlong_t)best_ub.ub_txg, + (u_longlong_t)spa->spa_uberblock.ub_timestamp, + (u_longlong_t)best_ub.ub_timestamp); + return (B_TRUE); + } + + /* + * Perform an activity check looking for any remote writer + */ + return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config, + B_FALSE) != 0); +} + static int spa_verify_host(spa_t *spa, nvlist_t *mos_config) { @@ -4002,7 +4066,8 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } - int error = spa_activity_check(spa, ub, spa->spa_config); + int error = + spa_activity_check(spa, ub, spa->spa_config, B_TRUE); if (error) { nvlist_free(label); return (error); @@ -8683,15 +8748,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd) } static void -spa_async_probe(spa_t *spa, vdev_t *vd) +spa_async_fault_vdev(spa_t *spa, vdev_t *vd) { - if (vd->vdev_probe_wanted) { - vd->vdev_probe_wanted = B_FALSE; - vdev_reopen(vd); /* vdev_open() does the actual probe */ + if 
(vd->vdev_fault_wanted) { + vd->vdev_fault_wanted = B_FALSE; + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); } for (int c = 0; c < vd->vdev_children; c++) - spa_async_probe(spa, vd->vdev_child[c]); + spa_async_fault_vdev(spa, vd->vdev_child[c]); } static void @@ -8780,11 +8846,11 @@ spa_async_thread(void *arg) } /* - * See if any devices need to be probed. + * See if any devices need to be marked faulted. */ - if (tasks & SPA_ASYNC_PROBE) { + if (tasks & SPA_ASYNC_FAULT_VDEV) { spa_vdev_state_enter(spa, SCL_NONE); - spa_async_probe(spa, spa->spa_root_vdev); + spa_async_fault_vdev(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); } diff --git a/module/zfs/txg.c b/module/zfs/txg.c index dbca5913348d..12ce303a34d6 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -585,6 +585,15 @@ txg_sync_thread(void *arg) timer = (delta > timeout ? 0 : timeout - delta); } + /* + * When we're suspended, nothing should be changing and for + * MMP we don't want to bump anything that would make it + * harder to detect if another host is changing it when + * resuming after a MMP suspend. + */ + if (spa_suspended(spa)) + continue; + /* * Wait until the quiesce thread hands off a txg to us, * prompting it to do so if necessary. diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index fc1b62a39ac3..b2434821ef4a 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1584,6 +1584,7 @@ vdev_metaslab_fini(vdev_t *vd) typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; + boolean_t vps_zio_done_probe; int vps_flags; } vdev_probe_stats_t; @@ -1627,6 +1628,17 @@ vdev_probe_done(zio_t *zio) (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); + + /* + * If this probe was initiated from zio pipeline, then + * change the state in a spa_async_request. Probes that + * were initiated from a vdev_open can change the state + * as part of the open call. + */ + if (vps->vps_zio_done_probe) { + vd->vdev_fault_wanted = B_TRUE; + spa_async_request(spa, SPA_ASYNC_FAULT_VDEV); + } } mutex_enter(&vd->vdev_probe_lock); @@ -1678,6 +1690,7 @@ vdev_probe(vdev_t *vd, zio_t *zio) vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; + vps->vps_zio_done_probe = (zio != NULL); if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* @@ -1704,15 +1717,6 @@ vdev_probe(vdev_t *vd, zio_t *zio) vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); - - /* - * We can't change the vdev state in this context, so we - * kick off an async task to do it on our behalf. - */ - if (zio != NULL) { - vd->vdev_probe_wanted = B_TRUE; - spa_async_request(spa, SPA_ASYNC_PROBE); - } } if (zio != NULL) diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 7ea2f84951e4..58755cd1822b 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1894,6 +1894,7 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) /* * If this isn't a resync due to I/O errors, * and nothing changed in this transaction group, + * and multihost protection isn't enabled, * and the vdev configuration hasn't changed, * then there's nothing to do. 
*/ @@ -1901,7 +1902,8 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) boolean_t changed = uberblock_update(ub, spa->spa_root_vdev, txg, spa->spa_mmp.mmp_delay); - if (!changed && list_is_empty(&spa->spa_config_dirty_list)) + if (!changed && list_is_empty(&spa->spa_config_dirty_list) && + !spa_multihost(spa)) return (0); } diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 49e1c3f6ccdc..aa10e98fed24 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -5723,10 +5723,13 @@ zfs_ioc_clear(zfs_cmd_t *zc) /* * If multihost is enabled, resuming I/O is unsafe as another - * host may have imported the pool. + * host may have imported the pool. Check for remote activity. */ - if (spa_multihost(spa) && spa_suspended(spa)) - return (SET_ERROR(EINVAL)); + if (spa_multihost(spa) && spa_suspended(spa) && + spa_mmp_remote_host_activity(spa)) { + spa_close(spa, FTAG); + return (SET_ERROR(EREMOTEIO)); + } spa_vdev_state_enter(spa, SCL_NONE); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index bee032457840..3a46675f0da0 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2535,8 +2535,11 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) "is set to panic.", spa_name(spa)); if (!spa_suspended(spa)) { - cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable " - "I/O failure and has been suspended.\n", spa_name(spa)); + if (reason != ZIO_SUSPEND_MMP) { + cmn_err(CE_WARN, "Pool '%s' has encountered an " + "uncorrectable I/O failure and has been " + "suspended.\n", spa_name(spa)); + } (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, NULL, 0); diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 94c1cbbc3f9f..2d1b21a884ff 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -127,7 +127,7 @@ tags = ['functional', 'mmap'] tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval', 'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import', 'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history', - 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid'] + 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid', 'mmp_write_slow_disk'] tags = ['functional', 'mmp'] [tests/functional/mount:Linux] diff --git a/tests/zfs-tests/tests/functional/mmp/Makefile.am b/tests/zfs-tests/tests/functional/mmp/Makefile.am index 2848fd4ce692..211e026a87aa 100644 --- a/tests/zfs-tests/tests/functional/mmp/Makefile.am +++ b/tests/zfs-tests/tests/functional/mmp/Makefile.am @@ -8,6 +8,7 @@ dist_pkgdata_SCRIPTS = \ mmp_active_import.ksh \ mmp_inactive_import.ksh \ mmp_exported_import.ksh \ + mmp_write_slow_disk.ksh \ mmp_write_uberblocks.ksh \ mmp_reset_interval.ksh \ mmp_on_zdb.ksh \ diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh new file mode 100755 index 000000000000..8b118684aa7f --- /dev/null +++ b/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh @@ -0,0 +1,97 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. 
+# +# CDDL HEADER END +# + +# +# Copyright (c) 2024, Klara Inc +# + +# DESCRIPTION: +# Verify that long VDEV probes do not cause MMP checks to suspend pool +# Note: without PR-15839 fix, this test will suspend the pool. +# +# A device that is returning unexpected errors will trigger a vdev_probe. +# When the device additionally has slow response times, the probe can hold +# the spa config lock as a writer for a long period of time such that the +# mmp uberblock updates stall when trying to acquire the spa config lock. +# +# STRATEGY: +# 1. Create a pool with multiple leaf vdevs +# 2. Enable multihost and multihost_history +# 3. Delay for MMP writes to occur +# 4. Verify that a long VDEV probe didn't cause MMP check to suspend pool +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/mmp/mmp.cfg +. $STF_SUITE/tests/functional/mmp/mmp.kshlib + +verify_runnable "both" + +function cleanup +{ + log_must zinject -c all + + if [[ $(zpool list -H -o health $MMP_POOL) == "SUSPENDED" ]]; then + log_must zpool clear $MMP_POOL + zpool get state $MMP_POOL $MMP_DIR/file.3 + zpool events | grep ".fs.zfs." | grep -v "history_event" + fi + + poolexists $MMP_POOL && destroy_pool $MMP_POOL + log_must rm -r $MMP_DIR + log_must mmp_clear_hostid +} + +log_assert "A long VDEV probe doesn't cause a MMP check suspend" +log_onexit cleanup + +MMP_HISTORY_URL=/proc/spl/kstat/zfs/$MMP_POOL/multihost + +# Create a multiple drive pool +log_must zpool events -c +log_must mkdir -p $MMP_DIR +log_must truncate -s 128M $MMP_DIR/file.{0,1,2,3,4,5} +log_must zpool create -f $MMP_POOL \ + mirror $MMP_DIR/file.{0,1,2} \ + mirror $MMP_DIR/file.{3,4,5} + +# Enable MMP +log_must mmp_set_hostid $HOSTID1 +log_must zpool set multihost=on $MMP_POOL +clear_mmp_history + +# Inject vdev write error along with a delay +log_must zinject -f 33 -e io -L pad2 -T write -d $MMP_DIR/file.3 $MMP_POOL +log_must zinject -f 50 -e io -L uber -T write -d $MMP_DIR/file.3 $MMP_POOL +log_must zinject -D 2000:4 -T write -d $MMP_DIR/file.3 $MMP_POOL + +log_must dd if=/dev/urandom of=/$MMP_POOL/data bs=1M count=5 +sleep 10 +sync_pool $MMP_POOL + +# Confirm mmp writes to the non-slow disks have taken place +for x in {0,1,2,4}; do + write_count=$(grep -c file.${x} $MMP_HISTORY_URL) + [[ $write_count -gt 0 ]] || log_fail "expecting mmp writes" +done + +# Expect that the pool was not suspended +log_must check_state $MMP_POOL "" "ONLINE" +health=$(zpool list -H -o health $MMP_POOL) +log_note "$MMP_POOL health is $health" +[[ "$health" == "SUSPENDED" ]] && log_fail "$MMP_POOL $health unexpected" + +log_pass "A long VDEV probe doesn't cause a MMP check suspend" From 006d506547688c7ab990fe48669778cf53108cc9 Mon Sep 17 00:00:00 2001 From: Rob N Date: Thu, 21 Sep 2023 09:56:45 +1000 Subject: [PATCH 28/32] status: report pool suspension state under failmode=continue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When failmode=continue is set and the pool suspends, both 'zpool status' and the 'zfs/pool/state' kstat ignore it and report the normal vdev tree state. There's no clear indicator that the pool is suspended. This is unlike suspend in failmode=wait, or suspend due to MMP check failure, which both report "SUSPENDED" explicitly. This commit changes it so SUSPENDED is reported for failmode=continue the same as for other modes. Rationale: The historical behaviour of failmode=continue is roughly, "press on as though all is well". 
To this end, the fact that the pool had suspended was not shown, to maintain the façade that all is well. Its unclear why hiding this information was considered appropriate. One possibility is that it was expected that a true pool fault would always be reported as DEGRADED or FAULTED, and that the pool could not suspend without these happening. That is not necessarily true, as vdev health and suspend state are only loosely connected, such that a pool in (apparent) good health can be suspended for good reasons, and of course a degraded pool does not lead to suspension. Even if that expectation were true, there's still a difference in urgency - a degraded pool may not need to be attended to for hours, while a suspended pool is most often unusable until an operator intervenes. An operator that has set failmode=continue has presumably done so because their workload is one that can continue to operate in a useful way when the pool suspends. In this case the operator still needs a clear indicator that there is a problem that needs attending to. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #15297 --- lib/libzfs/libzfs_pool.c | 2 ++ module/zfs/spa_misc.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index ed705a012730..0595c4407e8d 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -29,6 +29,7 @@ * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2018, loli10K * Copyright (c) 2021, Colm Buckley + * Copyright (c) 2021, 2023, Klara Inc. */ #include @@ -265,6 +266,7 @@ zpool_get_state_str(zpool_handle_t *zhp) } else if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { str = gettext("FAULTED"); } else if (status == ZPOOL_STATUS_IO_FAILURE_WAIT || + status == ZPOOL_STATUS_IO_FAILURE_CONTINUE || status == ZPOOL_STATUS_IO_FAILURE_MMP) { str = gettext("SUSPENDED"); } else { diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index b87c69bb57c1..61447f6b5be3 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -27,6 +27,7 @@ * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K . All rights reserved. + * Copyright (c) 2023, Klara Inc. */ #include @@ -2879,8 +2880,7 @@ spa_state_to_name(spa_t *spa) vdev_state_t state = rvd->vdev_state; vdev_aux_t aux = rvd->vdev_stat.vs_aux; - if (spa_suspended(spa) && - (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)) + if (spa_suspended(spa)) return ("SUSPENDED"); switch (state) { From 6b9de042e6c33d98eaedd19fec8c1fa1b0947fb8 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 10 Apr 2024 13:14:13 +1000 Subject: [PATCH 29/32] vdev_disk: fix alignment check when buffer has non-zero starting offset If a linear buffer spans multiple pages, and the first page has a non-zero starting offset, the checker would not include the offset, and so would think there was an alignment gap at the end of the first page, rather than at the start. That is, for a 16K buffer spread across five pages with an initial 512B offset: [.XXXXXXX][XXXXXXXX][XXXXXXXX][XXXXXXXX][XXXXXXX.] It would be interpreted as: [XXXXXXX.][XXXXXXXX]... And be rejected as misaligned. Since it's already a linear ABD, the "linearising" copy would just reuse the buffer as-is, and the second check would failing, tripping the VERIFY in vdev_disk_io_rw(). This commit fixes all this by including the offset in the check for end-of-page alignment. Sponsored-by: Klara, Inc. 
Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris (cherry picked from commit 52f60c29d41dec9e209e8553936b7753119e74df) --- module/os/linux/zfs/vdev_disk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 90a4dc135d53..743b6f23f9ac 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -770,7 +770,7 @@ vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) * Note if we're taking less than a full block, so we can check it * above on the next call. */ - s->end = len & s->bmask; + s->end = (off+len) & s->bmask; /* All blocks after the first must start on a block size boundary. */ if (s->npages != 0 && (off & s->bmask) != 0) From f20ad6fb108b0f80d4f4b98edd20390b15108f68 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 15 Apr 2024 11:29:47 +1000 Subject: [PATCH 30/32] zil_commit_impl: don't assert ZIL is not suspended zil_suspend() forces a full ZIL commit before suspending the ZIL, so has to be allowed to call it! --- module/zfs/zil.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index edc5999b957c..b8477b2f9dfc 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -3607,7 +3607,7 @@ zil_commit(zilog_t *zilog, uint64_t foid) int zil_commit_impl(zilog_t *zilog, uint64_t foid) { - ASSERT0(zil_failed(zilog) || zilog->zl_suspend > 0); + ASSERT0(zil_failed(zilog)); ZIL_STAT_BUMP(zil_commit_count); From 4319de28a920cda18b2394d1e15e146280aaf343 Mon Sep 17 00:00:00 2001 From: Mateusz Piotrowski Date: Fri, 19 Apr 2024 16:23:59 +0200 Subject: [PATCH 31/32] tests: l2arc_arcstats_pos: Add a timeout Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. --- .../tests/functional/l2arc/l2arc_arcstats_pos.ksh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/zfs-tests/tests/functional/l2arc/l2arc_arcstats_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/l2arc_arcstats_pos.ksh index 3e76347b029a..a3a422b99fd1 100755 --- a/tests/zfs-tests/tests/functional/l2arc/l2arc_arcstats_pos.ksh +++ b/tests/zfs-tests/tests/functional/l2arc/l2arc_arcstats_pos.ksh @@ -67,7 +67,15 @@ log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE log_must fio $FIO_SCRIPTS/mkfiles.fio log_must fio $FIO_SCRIPTS/random_reads.fio +timeout_handler() { + log_fail "${TIMEOUT_MESSAGE}" +} + +TIMEOUT_MESSAGE="Time out arcstat_quiescence_noecho l2_size before zpool offline" +trap timeout_handler USR1 +ppid="$$" && (sleep 600 && kill -USR1 "$ppid") & timeout_pid="$!" arcstat_quiescence_noecho l2_size +trap - USR1 log_must zpool offline $TESTPOOL $VDEV_CACHE arcstat_quiescence_noecho l2_size From ce2a365f563f5ba0f7e5587fb04d50e3b90fb0d3 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Thu, 2 May 2024 19:28:10 +0000 Subject: [PATCH 32/32] Add support for parallel pool exports Changed spa_export_common() such that it no longer holds the spa_namespace_lock for the entire duration and instead sets spa_export_thread to indicate an import is in progress on the spa. This allows for an export to a diffent pool to proceed in parallel while an export is still processing potentially long operations like spa_unload_log_sm_flush_all(). Calls like spa_lookup() and spa_vdev_enter() that rely on the spa_namespace_lock to serialize them against a concurrent export, now wait for any in-progress export thread to complete before proceeding. 
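For illustration, the waiting mirrors the spa_load_thread handling added for parallel imports earlier in this series. A minimal sketch (not the exact hunk from this patch) of the check in spa_lookup(), assuming the existing retry label and the now-exported spa_namespace_cv:

	/* sketch: called with spa_namespace_lock held */
	if (spa->spa_export_thread != NULL &&
	    spa->spa_export_thread != curthread) {
		cv_wait(&spa_namespace_cv, &spa_namespace_lock);
		goto retry;	/* redo the AVL lookup */
	}

When the export finishes, spa_export_common() would clear spa_export_thread and broadcast spa_namespace_cv, analogous to how the import path wakes waiters once spa_load_thread is cleared.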
The 'zpool export -a' sub-command also provides multi-threaded support, using a thread pool to submit the exports in parallel. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Don Brady --- cmd/zpool/zpool_main.c | 88 +++++++++++- include/sys/spa_impl.h | 1 + module/zfs/arc.c | 7 +- module/zfs/spa.c | 50 +++++-- module/zfs/spa_misc.c | 33 +++-- module/zfs/vdev_initialize.c | 9 +- module/zfs/vdev_rebuild.c | 3 +- module/zfs/vdev_trim.c | 9 +- tests/runfiles/common.run | 3 +- .../cli_root/zpool_export/Makefile.am | 4 +- .../zpool_export_parallel_admin.ksh | 72 ++++++++++ .../zpool_export_parallel_pos.ksh | 129 ++++++++++++++++++ 12 files changed, 370 insertions(+), 38 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 90ca02482591..27598eba9477 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -1849,10 +1849,19 @@ zpool_do_destroy(int argc, char **argv) } typedef struct export_cbdata { + tpool_t *tpool; + pthread_mutex_t mnttab_lock; boolean_t force; boolean_t hardforce; + int retval; } export_cbdata_t; + +typedef struct { + char *aea_poolname; + export_cbdata_t *aea_cbdata; +} async_export_args_t; + /* * Export one pool */ @@ -1861,11 +1870,20 @@ zpool_export_one(zpool_handle_t *zhp, void *data) { export_cbdata_t *cb = data; - if (zpool_disable_datasets(zhp, cb->force, cb->hardforce) != 0) - return (1); + /* + * zpool_disable_datasets() is not thread-safe for mnttab access. + * So we serialize access here for 'zpool export -a' parallel case. + */ + if (cb->tpool != NULL) + pthread_mutex_lock(&cb->mnttab_lock); - /* The history must be logged as part of the export */ - log_history = B_FALSE; + int retval = zpool_disable_datasets(zhp, cb->force, cb->hardforce); + + if (cb->tpool != NULL) + pthread_mutex_unlock(&cb->mnttab_lock); + + if (retval) + return (1); if (cb->hardforce) { if (zpool_export_force(zhp, history_str) != 0) @@ -1877,6 +1895,48 @@ zpool_export_one(zpool_handle_t *zhp, void *data) return (0); } +/* + * Asynchronous export request + */ +static void +zpool_export_task(void *arg) +{ + async_export_args_t *aea = arg; + + zpool_handle_t *zhp = zpool_open(g_zfs, aea->aea_poolname); + if (zhp != NULL) { + int ret = zpool_export_one(zhp, aea->aea_cbdata); + if (ret != 0) + aea->aea_cbdata->retval = ret; + zpool_close(zhp); + } else { + aea->aea_cbdata->retval = 1; + } + + free(aea->aea_poolname); + free(aea); +} + +/* + * Process an export request in parallel + */ +static int +zpool_export_one_async(zpool_handle_t *zhp, void *data) +{ + tpool_t *tpool = ((export_cbdata_t *)data)->tpool; + async_export_args_t *aea = safe_malloc(sizeof (async_export_args_t)); + + /* save pool name since zhp will go out of scope */ + aea->aea_poolname = strdup(zpool_get_name(zhp)); + aea->aea_cbdata = data; + + /* ship off actual export to another thread */ + if (tpool_dispatch(tpool, zpool_export_task, (void *)aea) != 0) + return (errno); /* unlikely */ + else + return (0); +} + /* * zpool export [-f] ...
* @@ -1920,17 +1980,33 @@ zpool_do_export(int argc, char **argv) cb.force = force; cb.hardforce = hardforce; + cb.tpool = NULL; + cb.retval = 0; argc -= optind; argv += optind; + /* The history will be logged as part of the export itself */ + log_history = B_FALSE; + if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } - return (for_each_pool(argc, argv, B_TRUE, NULL, - B_FALSE, zpool_export_one, &cb)); + cb.tpool = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN), + 0, NULL); + pthread_mutex_init(&cb.mnttab_lock, NULL); + + /* Asynchronously call zpool_export_one using thread pool */ + ret = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, + zpool_export_one_async, &cb); + + tpool_wait(cb.tpool); + tpool_destroy(cb.tpool); + (void) pthread_mutex_destroy(&cb.mnttab_lock); + + return (ret | cb.retval); } /* check arguments */ diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 6354fee7f005..d84e2e4e1bc3 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -231,6 +231,7 @@ struct spa { dsl_pool_t *spa_dsl_pool; boolean_t spa_is_initializing; /* true while opening pool */ boolean_t spa_is_exporting; /* true while exporting pool */ + kthread_t *spa_export_thread; /* valid during pool export */ kthread_t *spa_load_thread; /* loading, no namespace lock */ metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_log_class; /* intent log data class */ diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 1db68669a9f5..a690a7e6ebf1 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -8491,11 +8491,11 @@ l2arc_dev_get_next(void) break; } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || - next->l2ad_trim_all); + next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting); /* if we were unable to find any usable vdevs, return NULL */ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || - next->l2ad_trim_all) + next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting) next = NULL; l2arc_dev_last = next; @@ -10145,7 +10145,8 @@ l2arc_spa_rebuild_start(spa_t *spa) void l2arc_spa_rebuild_stop(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); /* * Locate the spa's l2arc devices and kick off rebuild threads. diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 97d1328fae9e..cb7d7908c668 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -33,6 +33,7 @@ * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2021, Colm Buckley + * Copyright (c) 2024, Klara Inc. */ /* @@ -1915,7 +1916,8 @@ spa_unload(spa_t *spa, txg_wait_flag_t txg_how) vdev_t *vd; uint64_t t, txg; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); spa_import_progress_remove(spa_guid(spa)); @@ -6909,7 +6911,7 @@ static int spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { - int error; + int error = 0; spa_t *spa; boolean_t force_removal, modifying; hrtime_t export_start = gethrtime(); @@ -6943,8 +6945,8 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, new_state == POOL_STATE_EXPORTED); /* - * Put a hold on the pool, drop the namespace lock, stop async tasks, - * reacquire the namespace lock, and see if we can export. 
+ * Put a hold on the pool, drop the namespace lock, stop async tasks + * and see if we can export. */ spa_open_ref(spa, FTAG); @@ -6981,10 +6983,13 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, taskq_wait(spa->spa_zvol_taskq); } mutex_enter(&spa_namespace_lock); + spa->spa_export_thread = curthread; spa_close(spa, FTAG); - if (spa->spa_state == POOL_STATE_UNINITIALIZED) + if (spa->spa_state == POOL_STATE_UNINITIALIZED) { + mutex_exit(&spa_namespace_lock); goto export_spa; + } /* * The pool will be in core if it's openable, in which case we can @@ -7028,6 +7033,14 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, goto fail; } + mutex_exit(&spa_namespace_lock); + /* + * At this point we no longer hold the spa_namespace_lock and + * there were no references on the spa. Future spa_lookups will + * notice the spa->spa_export_thread and wait until we signal + * that we are finshed. + */ + if (spa->spa_sync_on) { /* * A pool cannot be exported if it has an active shared spare. @@ -7038,7 +7051,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { error = SET_ERROR(EXDEV); - goto fail; + goto fail_unlocked; } /* @@ -7104,13 +7117,20 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, error = spa_unload(spa, hardforce ? TXG_WAIT_F_FORCE_EXPORT : TXG_WAIT_F_NOSUSPEND); if (error != 0) - goto fail; + goto fail_unlocked; spa_deactivate(spa); } if (oldconfig && spa->spa_config) VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); + if (new_state == POOL_STATE_EXPORTED) + zio_handle_export_delay(spa, gethrtime() - export_start); + + /* + * Take the namewspace lock for the actual spa_t removal + */ + mutex_enter(&spa_namespace_lock); if (new_state != POOL_STATE_UNINITIALIZED) { if (!force_removal) spa_write_cachefile(spa, B_TRUE, B_TRUE); @@ -7122,19 +7142,29 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, * we make sure to reset the exporting flag. */ spa->spa_is_exporting = B_FALSE; + spa->spa_export_thread = NULL; } - if (new_state == POOL_STATE_EXPORTED) - zio_handle_export_delay(spa, gethrtime() - export_start); - + /* + * Wake up any waiters in spa_lookup() + */ + cv_broadcast(&spa_namespace_cv); mutex_exit(&spa_namespace_lock); return (0); +fail_unlocked: + mutex_enter(&spa_namespace_lock); fail: if (force_removal) spa_set_export_initiator(spa, NULL); spa->spa_is_exporting = B_FALSE; + spa->spa_export_thread = NULL; + spa_async_resume(spa); + /* + * Wake up any waiters in spa_lookup() + */ + cv_broadcast(&spa_namespace_cv); mutex_exit(&spa_namespace_lock); return (error); } diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 61447f6b5be3..3e9ccda6f0c7 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -27,7 +27,7 @@ * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K . All rights reserved. - * Copyright (c) 2023, Klara Inc. + * Copyright (c) 2023, 2024, Klara Inc. */ #include @@ -80,8 +80,8 @@ * - Check if spa_refcount is zero * - Rename a spa_t * - add/remove/attach/detach devices - * - Held for the duration of create/destroy/export - * - Held at the start and end of import + * - Held for the duration of create/destroy + * - Held at the start and end of import and export * * It does not need to handle recursion. 
A create or destroy may * reference objects (files or zvols) in other pools, but by @@ -623,8 +623,14 @@ spa_lookup(const char *name) if (spa == NULL) return (NULL); - if (spa->spa_load_thread != NULL && - spa->spa_load_thread != curthread) { + /* + * Avoid racing with import/export, which don't hold the namespace + * lock for their entire duration. + */ + if ((spa->spa_load_thread != NULL && + spa->spa_load_thread != curthread) || + (spa->spa_export_thread != NULL && + spa->spa_export_thread != curthread)) { cv_wait(&spa_namespace_cv, &spa_namespace_lock); goto retry; } @@ -935,14 +941,15 @@ spa_close_common(spa_t *spa, const void *tag) /* * Remove a reference to the given spa_t. Must have at least one reference, or - * have the namespace lock held. + * have the namespace lock held or be part of a pool import/export. */ void spa_close(spa_t *spa, void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref || MUTEX_HELD(&spa_namespace_lock) || - spa->spa_load_thread == curthread); + spa->spa_load_thread == curthread || + spa->spa_export_thread == curthread); spa_close_common(spa, tag); } @@ -962,13 +969,15 @@ spa_async_close(spa_t *spa, void *tag) /* * Check to see if the spa refcount is zero. Must be called with - * spa_namespace_lock held. We really compare against spa_minref, which is the - * number of references acquired when opening a pool + * spa_namespace_lock held or be the spa export thread. We really + * compare against spa_minref, which is the number of references + * acquired when opening a pool */ boolean_t spa_refcount_zero(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref); } @@ -1216,6 +1225,8 @@ spa_vdev_enter(spa_t *spa) mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); + ASSERT0(spa->spa_export_thread); + vdev_autotrim_stop_all(spa); return (spa_vdev_config_enter(spa)); @@ -1233,6 +1244,8 @@ spa_vdev_detach_enter(spa_t *spa, uint64_t guid) mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); + ASSERT0(spa->spa_export_thread); + vdev_autotrim_stop_all(spa); if (guid != 0) { diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index a88f16201743..bff572d7d90c 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -636,7 +636,8 @@ vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list) (void) spa; vdev_t *vd; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); while ((vd = list_remove_head(vd_list)) != NULL) { mutex_enter(&vd->vdev_initialize_lock); @@ -678,7 +679,8 @@ vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state, if (vd_list == NULL) { vdev_initialize_stop_wait_impl(vd); } else { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_export_thread == curthread); list_insert_tail(vd_list, vd); } } @@ -710,7 +712,8 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) spa_t *spa = vd->vdev_spa; list_t vd_list; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_initialize_node)); diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 0ac062797687..6a23ce6b2416 100644 --- a/module/zfs/vdev_rebuild.c 
+++ b/module/zfs/vdev_rebuild.c @@ -1083,7 +1083,8 @@ vdev_rebuild_stop_wait(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); if (vd == spa->spa_root_vdev) { for (uint64_t i = 0; i < vd->vdev_children; i++) diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 6b3599703d79..b71af92893d8 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -1021,7 +1021,8 @@ vdev_trim_stop_wait(spa_t *spa, list_t *vd_list) (void) spa; vdev_t *vd; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); while ((vd = list_remove_head(vd_list)) != NULL) { mutex_enter(&vd->vdev_trim_lock); @@ -1060,7 +1061,8 @@ vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list) if (vd_list == NULL) { vdev_trim_stop_wait_impl(vd); } else { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_export_thread == curthread); list_insert_tail(vd_list, vd); } } @@ -1096,7 +1098,8 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) list_t vd_list; vdev_t *vd_l2cache; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index f1c064472e0b..4faa48ba1bad 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -376,7 +376,8 @@ tags = ['functional', 'cli_root', 'zpool_events'] [tests/functional/cli_root/zpool_export] tests = ['zpool_export_001_pos', 'zpool_export_002_pos', 'zpool_export_003_neg', 'zpool_export_004_pos', 'zpool_export_005_pos', - 'zpool_export_006_pos', 'zpool_export_007_pos'] + 'zpool_export_006_pos', 'zpool_export_007_pos', + 'zpool_export_parallel_pos', 'zpool_export_parallel_admin'] tags = ['functional', 'cli_root', 'zpool_export'] [tests/functional/cli_root/zpool_get] diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_export/Makefile.am index 23c864db19ac..21ecc8c057de 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_export/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/Makefile.am @@ -8,7 +8,9 @@ dist_pkgdata_SCRIPTS = \ zpool_export_004_pos.ksh \ zpool_export_005_pos.ksh \ zpool_export_006_pos.ksh \ - zpool_export_007_pos.ksh + zpool_export_007_pos.ksh \ + zpool_export_parallel_admin.ksh \ + zpool_export_parallel_pos.ksh dist_pkgdata_DATA = \ zpool_export.cfg \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh new file mode 100755 index 000000000000..cab8fc2b4239 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh @@ -0,0 +1,72 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that admin commands cannot race a pool export +# +# STRATEGY: +# 1. Create a pool +# 2. Import the pool with an injected delay in the background +# 3. Execute some admin commands against the pool +# + +verify_runnable "global" + +DEVICE_DIR=$TEST_BASE_DIR/dev_export-test + +function cleanup +{ + zinject -c all + poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 + [[ -d $DEVICE_DIR ]] && log_must rm -rf $DEVICE_DIR +} + +log_assert "admin commands cannot race a pool export" + +log_onexit cleanup + +[[ ! -d $DEVICE_DIR ]] && log_must mkdir -p $DEVICE_DIR +log_must truncate -s $MINVDEVSIZE ${DEVICE_DIR}/disk0 ${DEVICE_DIR}/disk1 + +log_must zpool create -f $TESTPOOL1 mirror ${DEVICE_DIR}/disk0 ${DEVICE_DIR}/disk1 + +log_must zinject -P export -s 10 $TESTPOOL1 + +log_must zpool export $TESTPOOL1 & + +zpool set comment=hello $TESTPOOL1 +zpool reguid $TESTPOOL1 & +zpool split $TESTPOOL1 & + +log_pass "admin commands cannot race a pool export" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh new file mode 100755 index 000000000000..037d17d082bd --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh @@ -0,0 +1,129 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# test uses 8 vdevs +MAX_NUM=8 +DEVICE_DIR=$TEST_BASE_DIR/dev_import-test + + +# +# DESCRIPTION: +# Verify that pool exports can occur in parallel +# +# STRATEGY: +# 1. Create 8 pools +# 2. Inject an export delay using zinject +# 3. Export half of the pools synchronously to baseline sequential cost +# 4. Export the other half asynchronously to demonstrate parallel savings +# 6. Import 4 pools +# 7. 
Test zpool export -a +# + +verify_runnable "global" + +# +# override the minimum sized vdevs +# + +POOLNAME="test_pool" + +function cleanup +{ + zinject -c all + + for i in {0..$(($MAX_NUM - 1))}; do + poolexists $POOLNAME-$i && destroy_pool $POOLNAME-$i + done + + [[ -d $DEVICE_DIR ]] && log_must rm -rf $DEVICE_DIR +} + +log_assert "Pool exports can occur in parallel" + +log_onexit cleanup + +[[ ! -d $DEVICE_DIR ]] && log_must mkdir -p $DEVICE_DIR + +# +# Create some pools with export delay injectors +# +for i in {0..$(($MAX_NUM - 1))}; do + log_must truncate -s $MINVDEVSIZE ${DEVICE_DIR}/disk$i + log_must zpool create $POOLNAME-$i $DEVICE_DIR/disk$i + log_must zinject -P export -s 8 $POOLNAME-$i +done + +# +# Export half of the pools synchronously +# +SECONDS=0 +for i in {0..3}; do + log_must zpool export $POOLNAME-$i +done +sequential_time=$SECONDS +log_note "sequentially exported 4 pools in $sequential_time seconds" + +# +# Export half of the pools in parallel +# +SECONDS=0 +for i in {4..7}; do + log_must zpool export $POOLNAME-$i & +done +wait +parallel_time=$SECONDS +log_note "asyncronously exported 4 pools in $parallel_time seconds" + +log_must test $parallel_time -lt $(($sequential_time / 3)) + +# +# import 4 pools with export delay injectors +# +for i in {4..7}; do + log_must zpool import -d $DEVICE_DIR/disk$i $POOLNAME-$i + log_must zinject -P export -s 8 $POOLNAME-$i +done + +# +# now test zpool export -a +# +SECONDS=0 +log_must zpool export -a +parallel_time=$SECONDS +log_note "asyncronously exported 4 pools, using '-a', in $parallel_time seconds" + +log_must test $parallel_time -lt $(($sequential_time / 3)) + +log_pass "Pool exports occur in parallel"
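For context on the timing bound asserted in both halves of this test: assuming the '-s 8' argument to zinject holds each export for roughly eight seconds, exporting four pools one after another should take on the order of 4 x 8 = 32 seconds, while exporting the same four pools concurrently (either as backgrounded 'zpool export' invocations or via the thread pool behind 'zpool export -a') should complete in little more than a single eight-second window. Requiring the parallel time to come in under a third of the sequential time therefore leaves comfortable headroom for scheduler and I/O noise while still failing if the exports were actually serialized.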