Skip to content

Commit

Permalink
3956 ::vdev -r should work with pipelines
Browse files Browse the repository at this point in the history
3957 ztest should update the cachefile before killing itself
3958 multiple scans can lead to partial resilvering
3959 ddt entries are not always resilvered
3960 dsl_scan can skip over dedup-ed blocks if physical birth != logical birth
3961 freed gang blocks are not resilvered and can cause pool to suspend
3962 ztest should print out zfs debug buffer before exiting
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
  • Loading branch information
grwilson authored and Christopher Siden committed Aug 7, 2013
1 parent be9000c commit b4952e1
Show file tree
Hide file tree
Showing 12 changed files with 231 additions and 97 deletions.
2 changes: 1 addition & 1 deletion usr/src/cmd/mdb/common/modules/zfs/zfs.c
Expand Up @@ -1140,7 +1140,7 @@ do_print_vdev(uintptr_t addr, int flags, int depth, int stats,
}

if (flags & DCMD_PIPE_OUT) {
mdb_printf("%#lr", addr);
mdb_printf("%#lr\n", addr);
} else {
if (vdev.vdev_path != NULL) {
if (mdb_readstr(desc, sizeof (desc),
Expand Down
21 changes: 16 additions & 5 deletions usr/src/cmd/ztest/ztest.c
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
*/
Expand Down Expand Up @@ -767,6 +767,16 @@ ztest_kill(ztest_shared_t *zs)
{
zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa));
zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa));

/*
* Before we kill off ztest, make sure that the config is updated.
* See comment above spa_config_sync().
*/
mutex_enter(&spa_namespace_lock);
spa_config_sync(ztest_spa, B_FALSE, B_FALSE);
mutex_exit(&spa_namespace_lock);

zfs_dbgmsg_print(FTAG);
(void) kill(getpid(), SIGKILL);
}

Expand Down Expand Up @@ -2731,7 +2741,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
uint64_t leaf, top;
uint64_t ashift = ztest_get_ashift();
uint64_t oldguid, pguid;
size_t oldsize, newsize;
uint64_t oldsize, newsize;
char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
int replacing;
int oldvd_has_siblings = B_FALSE;
Expand Down Expand Up @@ -2890,8 +2900,8 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
if (error != expected_error && expected_error != EBUSY) {
fatal(0, "attach (%s %llu, %s %llu, %d) "
"returned %d, expected %d",
oldpath, (longlong_t)oldsize, newpath,
(longlong_t)newsize, replacing, error, expected_error);
oldpath, oldsize, newpath,
newsize, replacing, error, expected_error);
}

VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
Expand Down Expand Up @@ -4801,7 +4811,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
*/
if (vd0 != NULL && maxfaults != 1 &&
(!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) ||
vd0->vdev_resilvering)) {
vd0->vdev_resilver_txg != 0)) {
/*
* Make vd0 explicitly claim to be unreadable,
* or unwriteable, or reach behind its back
Expand Down Expand Up @@ -5651,6 +5661,7 @@ ztest_run(ztest_shared_t *zs)

zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
zfs_dbgmsg_print(FTAG);

umem_free(tid, ztest_opts.zo_threads * sizeof (thread_t));

Expand Down
21 changes: 16 additions & 5 deletions usr/src/uts/common/fs/zfs/dsl_scan.c
Expand Up @@ -194,6 +194,7 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
scn->scn_phys.scn_errors = 0;
scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
scn->scn_restart_txg = 0;
scn->scn_done_txg = 0;
spa_scan_stat_init(spa);

if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
Expand Down Expand Up @@ -769,7 +770,7 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
* Don't scan it now unless we need to because something
* under it was modified.
*/
if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) {
if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) {
scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
}
if (buf)
Expand Down Expand Up @@ -1214,7 +1215,7 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,

for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0 ||
ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg)
ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
continue;
ddt_bp_create(checksum, ddk, ddp, &bp);

Expand Down Expand Up @@ -1457,6 +1458,16 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
if (scn->scn_phys.scn_state != DSS_SCANNING)
return;

if (scn->scn_done_txg == tx->tx_txg) {
ASSERT(!scn->scn_pausing);
/* finished with scan. */
zfs_dbgmsg("txg %llu scan complete", tx->tx_txg);
dsl_scan_done(scn, B_TRUE, tx);
ASSERT3U(spa->spa_scrub_inflight, ==, 0);
dsl_scan_sync_state(scn, tx);
return;
}

if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
scn->scn_phys.scn_ddt_class_max) {
zfs_dbgmsg("doing scan sync txg %llu; "
Expand Down Expand Up @@ -1492,9 +1503,9 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
(longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));

if (!scn->scn_pausing) {
/* finished with scan. */
zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg);
dsl_scan_done(scn, B_TRUE, tx);
scn->scn_done_txg = tx->tx_txg + 1;
zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu",
tx->tx_txg, scn->scn_done_txg);
}

if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
Expand Down
11 changes: 3 additions & 8 deletions usr/src/uts/common/fs/zfs/spa.c
Expand Up @@ -4446,7 +4446,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
}

/* mark the device being resilvered */
newvd->vdev_resilvering = B_TRUE;
newvd->vdev_resilver_txg = txg;

/*
* If the parent is not a mirror, or if we're replacing, insert the new
Expand Down Expand Up @@ -5303,13 +5303,6 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
return (oldvd);
}

if (vd->vdev_resilvering && vdev_dtl_empty(vd, DTL_MISSING) &&
vdev_dtl_empty(vd, DTL_OUTAGE)) {
ASSERT(vd->vdev_ops->vdev_op_leaf);
vd->vdev_resilvering = B_FALSE;
vdev_config_dirty(vd->vdev_top);
}

/*
* Check for a completed replacement. We always consider the first
* vdev in the list to be the oldest vdev, and the last one to be
Expand Down Expand Up @@ -5399,6 +5392,8 @@ spa_vdev_resilver_done(spa_t *spa)
ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
sguid = ppvd->vdev_child[1]->vdev_guid;
}
ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));

spa_config_exit(spa, SCL_ALL, FTAG);
if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
return;
Expand Down
9 changes: 7 additions & 2 deletions usr/src/uts/common/fs/zfs/spa_config.c
Expand Up @@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
*/

#include <sys/spa.h>
Expand Down Expand Up @@ -198,7 +198,12 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)

/*
* Synchronize pool configuration to disk. This must be called with the
* namespace lock held.
* namespace lock held. Synchronizing the pool cache is typically done after
* the configuration has been synced to the MOS. This exposes a window where
* the MOS config will have been updated but the cache file has not. If
* the system were to crash at that instant then the cached config may not
* contain the correct information to open the pool and an explicity import
* would be required.
*/
void
spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent)
Expand Down
31 changes: 31 additions & 0 deletions usr/src/uts/common/fs/zfs/sys/dsl_scan.h
Expand Up @@ -72,11 +72,42 @@ typedef enum dsl_scan_flags {
DSF_VISIT_DS_AGAIN = 1<<0,
} dsl_scan_flags_t;

/*
* Every pool will have one dsl_scan_t and this structure will contain
* in-memory information about the scan and a pointer to the on-disk
* representation (i.e. dsl_scan_phys_t). Most of the state of the scan
* is contained on-disk to allow the scan to resume in the event of a reboot
* or panic. This structure maintains information about the behavior of a
* running scan, some caching information, and how it should traverse the pool.
*
* The following members of this structure direct the behavior of the scan:
*
* scn_pausing - a scan that cannot be completed in a single txg or
* has exceeded its allotted time will need to pause.
* When this flag is set the scanner will stop traversing
* the pool and write out the current state to disk.
*
scn_restart_txg - directs the scanner to either restart or start
a scan at the specified txg value.
*
* scn_done_txg - when a scan completes its traversal it will set
* the completion txg to the next txg. This is necessary
* to ensure that any blocks that were freed during
the scan but have not yet been processed (i.e. deferred
* frees) are accounted for.
*
* This structure also maintains information about deferred frees which are
a special kind of traversal. Deferred frees can exist in either a bptree or
* a bpobj structure. The scn_is_bptree flag will indicate the type of
* deferred free that is in progress. If the deferred free is part of an
* asynchronous destroy then the scn_async_destroying flag will be set.
*/
typedef struct dsl_scan {
struct dsl_pool *scn_dp;

boolean_t scn_pausing;
uint64_t scn_restart_txg;
uint64_t scn_done_txg;
uint64_t scn_sync_start_time;
zio_t *scn_zio_root;

Expand Down
4 changes: 2 additions & 2 deletions usr/src/uts/common/fs/zfs/sys/vdev_impl.h
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
*/

#ifndef _SYS_VDEV_IMPL_H
Expand Down Expand Up @@ -173,7 +173,7 @@ struct vdev {
uint64_t vdev_faulted; /* persistent faulted state */
uint64_t vdev_degraded; /* persistent degraded state */
uint64_t vdev_removed; /* persistent removed state */
uint64_t vdev_resilvering; /* persistent resilvering state */
uint64_t vdev_resilver_txg; /* persistent resilvering state */
uint64_t vdev_nparity; /* number of parity devices for raidz */
char *vdev_path; /* vdev path (if any) */
char *vdev_devid; /* vdev devid (if any) */
Expand Down
3 changes: 2 additions & 1 deletion usr/src/uts/common/fs/zfs/sys/zfs_debug.h
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
*/

#ifndef _SYS_ZFS_DEBUG_H
Expand Down Expand Up @@ -77,6 +77,7 @@ typedef struct zfs_dbgmsg {
extern void zfs_dbgmsg_init(void);
extern void zfs_dbgmsg_fini(void);
extern void zfs_dbgmsg(const char *fmt, ...);
extern void zfs_dbgmsg_print(const char *tag);

#ifndef _KERNEL
extern int dprintf_find_string(const char *string);
Expand Down
100 changes: 92 additions & 8 deletions usr/src/uts/common/fs/zfs/vdev.c
Expand Up @@ -521,8 +521,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
&vd->vdev_offline);

(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING,
&vd->vdev_resilvering);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
&vd->vdev_resilver_txg);

/*
* When importing a pool, we want to ignore the persistent fault
Expand Down Expand Up @@ -1662,6 +1662,75 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
return (empty);
}

/*
 * Returns the lowest txg in the DTL range.
 */
static uint64_t
vdev_dtl_min(vdev_t *vd)
{
	space_seg_t *first;

	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
	ASSERT3U(vd->vdev_dtl[DTL_MISSING].sm_space, !=, 0);
	ASSERT0(vd->vdev_children);

	first = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);

	/*
	 * NOTE(review): the -1 mirrors the historical computation in
	 * vdev_resilver_needed(); ss_start appears to be one past the
	 * lowest missing txg — confirm against the space-map encoding.
	 */
	return (first->ss_start - 1);
}

/*
 * Returns the highest txg in the DTL.
 */
static uint64_t
vdev_dtl_max(vdev_t *vd)
{
	space_seg_t *last;

	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
	ASSERT3U(vd->vdev_dtl[DTL_MISSING].sm_space, !=, 0);
	ASSERT0(vd->vdev_children);

	last = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);

	return (last->ss_end);
}

/*
 * Determine if a resilvering vdev should remove any DTL entries from
 * its range. If the vdev was resilvering for the entire duration of the
 * scan then it should excise that range from its DTLs. Otherwise, this
 * vdev is considered partially resilvered and should leave its DTL
 * entries intact. The comment in vdev_dtl_reassess() describes how we
 * excise the DTLs.
 */
static boolean_t
vdev_dtl_should_excise(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;

	/* Only called after a clean scan, and only on leaf vdevs. */
	ASSERT0(scn->scn_phys.scn_errors);
	ASSERT0(vd->vdev_children);

	/*
	 * A vdev that is not resilvering (vdev_resilver_txg == 0), or one
	 * with no missing data, may always excise.
	 */
	if (vd->vdev_resilver_txg == 0 ||
	    vd->vdev_dtl[DTL_MISSING].sm_space == 0)
		return (B_TRUE);

	/*
	 * When a resilver is initiated the scan will assign the scn_max_txg
	 * value to the highest txg value that exists in all DTLs. If this
	 * device's max DTL is not part of this scan (i.e. it is not in
	 * the range (scn_min_txg, scn_max_txg]) then it is not eligible
	 * for excision.
	 */
	if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
		ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
		ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
		ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
* Reassess DTLs after a config change or scrub completion.
*/
Expand All @@ -1685,9 +1754,17 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;

mutex_enter(&vd->vdev_dtl_lock);

/*
* If we've completed a scan cleanly then determine
* if this vdev should remove any DTLs. We only want to
* excise regions on vdevs that were available during
* the entire duration of this scan.
*/
if (scrub_txg != 0 &&
(spa->spa_scrub_started ||
(scn && scn->scn_phys.scn_errors == 0))) {
(scn != NULL && scn->scn_phys.scn_errors == 0)) &&
vdev_dtl_should_excise(vd)) {
/*
* We completed a scrub up to scrub_txg. If we
* did it without rebooting, then the scrub dtl
Expand Down Expand Up @@ -1726,6 +1803,16 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
else
space_map_walk(&vd->vdev_dtl[DTL_MISSING],
space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);

/*
* If the vdev was resilvering and no longer has any
* DTLs then reset its resilvering flag.
*/
if (vd->vdev_resilver_txg != 0 &&
vd->vdev_dtl[DTL_MISSING].sm_space == 0 &&
vd->vdev_dtl[DTL_OUTAGE].sm_space == 0)
vd->vdev_resilver_txg = 0;

mutex_exit(&vd->vdev_dtl_lock);

if (txg != 0)
Expand Down Expand Up @@ -1902,12 +1989,9 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
mutex_enter(&vd->vdev_dtl_lock);
if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
vdev_writeable(vd)) {
space_seg_t *ss;

ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
thismin = ss->ss_start - 1;
ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
thismax = ss->ss_end;
thismin = vdev_dtl_min(vd);
thismax = vdev_dtl_max(vd);
needed = B_TRUE;
}
mutex_exit(&vd->vdev_dtl_lock);
Expand Down

0 comments on commit b4952e1

Please sign in to comment.