Skip to content

Commit

Permalink
btrfs: add numdevs= mount option.
Browse files Browse the repository at this point in the history
btrfs currently allocates multiple anonymous bdev numbers to hide the
fact that inode numbers are not unique across "subvolumes".
Each subvol gets a different device number.

As described in a previous patch, this is incomplete, doesn't scale, and
should be deprecated.  This patch is another step to deprecation.

With mount option "-o numdevs=many", which is the default, the current
behaviour is preserved.

With mount option "-o numdevs=1", the st_dev reported by stat() is
exactly the number that appears in /proc/$PID/mountinfo (i.e.
sb->s_dev).  This will prevent "du -x", "find -xdev" and similar tools
from keeping within a subvol, but is otherwise quite functional.

If numdevs=1 and inumbits=0, then there will often be inode number
reuse, so that combination is forbidden and the default fo inumbits
changes to BITS_PER_LONG*7/8.  With larger inumbits (close to
BITS_PER_LONG), inode number reuse is still possible, but only with
large or old filesystems.

With mount option "-o numdevs=2", precisely two anon device numbers are
allocated.  Each subvol gets the number that its parent isn't using.
When subvols are moved, the device number reported will change if needed
to differentiate from its parent.
If a subvol with dependent subvols is moved and the device numbers need
to change, the numbers in dependent subvols that are currently in cache
will NOT change.  Fixing this is a stretch goal.

Using numdevs=2 removes any problems with exhausting the number of
available anon devs, and preserves the functionality of "du -x" and
similar.  It may be a useful option for sites that experience exhaustion
problems.

numdevs=1 is, at this stage, most useful for exploring the consequences
of fully deprecating the use of multiple device numbers.  It may also be
useful for site that find they have no dependency on multiple device
numbers.

Signed-off-by: NeilBrown <neilb@suse.de>
  • Loading branch information
neilbrown authored and intel-lab-lkp committed Aug 9, 2021
1 parent 735f073 commit c5bae87
Show file tree
Hide file tree
Showing 5 changed files with 98 additions and 8 deletions.
17 changes: 15 additions & 2 deletions fs/btrfs/ctree.h
Expand Up @@ -989,6 +989,14 @@ struct btrfs_fs_info {
u32 stripesize;

unsigned short inumbits;
/* num_devs can be:
* 1 - all files in all trees use sb->s_dev
* 2 - file trees alternate between using sb->s_dev and
* secondary_anon_dev.
* 3 (BTTSF_MANY_DEVS) - Each subtree uses a unique ->anon_dev
*/
unsigned short num_devs;
dev_t secondary_anon_dev;

/* Block groups and devices containing active swapfiles. */
spinlock_t swapfile_pins_lock;
Expand Down Expand Up @@ -1034,6 +1042,8 @@ struct btrfs_fs_info {
#endif
};

#define BTRFS_MANY_DEVS (3)

static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
return sb->s_fs_info;
Expand Down Expand Up @@ -1175,10 +1185,13 @@ struct btrfs_root {
*/
struct radix_tree_root delayed_nodes_tree;
/*
* right now this just gets used so that a root has its own devid
* for stat. It may be used for more later
* If fs_info->num_devs == 3 (BTRFS_MANY_DEVS) anon_dev holds a device
* number to be reported by ->getattr().
* If fs_info->num_devs == 2, anon_dev is 0 and use_secondary_dev
* is true when this root uses the secondary, not primary, dev.
*/
dev_t anon_dev;
bool use_secondary_dev;

spinlock_t root_item_lock;
refcount_t refs;
Expand Down
24 changes: 21 additions & 3 deletions fs/btrfs/disk-io.c
Expand Up @@ -1519,7 +1519,8 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
* userspace, the id pool is limited to 1M
*/
if (is_fstree(root->root_key.objectid) &&
btrfs_root_refs(&root->root_item) > 0) {
btrfs_root_refs(&root->root_item) > 0 &&
root->fs_info->num_devs == BTRFS_MANY_DEVS) {
if (!anon_dev) {
ret = get_anon_bdev(&root->anon_dev);
if (ret)
Expand Down Expand Up @@ -3334,8 +3335,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
* "-o inumbits" can over-ride this default.
* BITS_PER_LONG * 7 / 8 is a good value to use
*/
if (fs_info->inumbits > BITS_PER_LONG)
fs_info->inumbits = 0;
if (fs_info->inumbits > BITS_PER_LONG) {
if (fs_info->num_devs == 1)
fs_info->inumbits = BITS_PER_LONG * 7 / 8;
else
fs_info->inumbits = 0;
}

features = btrfs_super_incompat_flags(disk_super) &
~BTRFS_FEATURE_INCOMPAT_SUPP;
Expand Down Expand Up @@ -3381,6 +3386,15 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;

if (fs_info->num_devs == 0)
/* set default value */
fs_info->num_devs = BTRFS_MANY_DEVS;

if (fs_info->num_devs == 2) {
err = get_anon_bdev(&fs_info->secondary_anon_dev);
if (err)
goto fail_alloc;
}
/*
* mixed block groups end up with duplicate but slightly offset
* extent buffers for the same range. It leads to corruptions
Expand Down Expand Up @@ -4452,6 +4466,10 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)

btrfs_mapping_tree_free(&fs_info->mapping_tree);
btrfs_close_devices(fs_info->fs_devices);

if (fs_info->secondary_anon_dev)
free_anon_bdev(fs_info->secondary_anon_dev);
fs_info->secondary_anon_dev = 0;
}

int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
Expand Down
29 changes: 28 additions & 1 deletion fs/btrfs/inode.c
Expand Up @@ -6018,6 +6018,8 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
iput(inode);
inode = ERR_PTR(ret);
}
if (fs_info->num_devs == 2)
sub_root->use_secondary_dev = !root->use_secondary_dev;
}

return inode;
Expand Down Expand Up @@ -9297,7 +9299,15 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,
STATX_ATTR_NODUMP);

generic_fillattr(&init_user_ns, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
/* If we don't set stat->dev here, sb->s_dev will be used */
switch (btrfs_sb(inode->i_sb)->num_devs) {
case 2:
if (BTRFS_I(inode)->root->use_secondary_dev)
stat->dev = btrfs_sb(inode->i_sb)->secondary_anon_dev;
break;
case BTRFS_MANY_DEVS:
stat->dev = BTRFS_I(inode)->root->anon_dev;
}

spin_lock(&BTRFS_I(inode)->lock);
delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
Expand Down Expand Up @@ -9500,6 +9510,15 @@ static int btrfs_rename_exchange(struct inode *old_dir,
if (new_inode->i_nlink == 1)
BTRFS_I(new_inode)->dir_index = new_idx;

if (fs_info->num_devs == 2 &&
root->use_secondary_dev != dest->use_secondary_dev) {
BTRFS_I(old_inode)->root->use_secondary_dev =
!dest->use_secondary_dev;
BTRFS_I(new_inode)->root->use_secondary_dev =
!root->use_secondary_dev;
// FIXME any subvols beneeath 'old_inode' or 'new_inode'
// that are in cache are now wrong.
}
if (root_log_pinned) {
btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
new_dentry->d_parent);
Expand Down Expand Up @@ -9783,6 +9802,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out_fail;
}

if (fs_info->num_devs == 2 &&
root->use_secondary_dev != dest->use_secondary_dev) {
BTRFS_I(old_inode)->root->use_secondary_dev =
!dest->use_secondary_dev;
// FIXME any subvols beneeath 'old_inode' that are
// in cache are now wrong.
}

if (old_inode->i_nlink == 1)
BTRFS_I(old_inode)->dir_index = index;

Expand Down
6 changes: 4 additions & 2 deletions fs/btrfs/ioctl.c
Expand Up @@ -529,7 +529,8 @@ static noinline int create_subvol(struct inode *dir,
if (ret)
goto fail_free;

ret = get_anon_bdev(&anon_dev);
if (fs_info->num_devs == BTRFS_MANY_DEVS)
ret = get_anon_bdev(&anon_dev);
if (ret < 0)
goto fail_free;

Expand Down Expand Up @@ -736,7 +737,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (!pending_snapshot)
return -ENOMEM;

ret = get_anon_bdev(&pending_snapshot->anon_dev);
if (fs_info->num_devs == BTRFS_MANY_DEVS)
ret = get_anon_bdev(&pending_snapshot->anon_dev);
if (ret < 0)
goto free_pending;
pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
Expand Down
30 changes: 30 additions & 0 deletions fs/btrfs/super.c
Expand Up @@ -361,6 +361,7 @@ enum {
Opt_discard_mode,
Opt_inumbits,
Opt_norecovery,
Opt_numdevs,
Opt_ratio,
Opt_rescan_uuid_tree,
Opt_skip_balance,
Expand Down Expand Up @@ -431,6 +432,7 @@ static const match_table_t tokens = {
{Opt_inumbits, "inumbits=%u"},
{Opt_nodiscard, "nodiscard"},
{Opt_norecovery, "norecovery"},
{Opt_numdevs, "numdevs=%s"},
{Opt_ratio, "metadata_ratio=%u"},
{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
{Opt_skip_balance, "skip_balance"},
Expand Down Expand Up @@ -849,8 +851,35 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
ret = -EINVAL;
goto out;
}
if (intarg == 0 && info->num_devs == 1) {
btrfs_err(info,
"inumbits=0 not permitted when numdevs=1");
ret = -EINVAL;
goto out;
}
info->inumbits = intarg;
break;
case Opt_numdevs:
if (info->num_devs) {
; /* silently ignore attempts to change this */
} else if (strcmp(args[0].from, "many") == 0) {
info->num_devs = BTRFS_MANY_DEVS;
} else if (strcmp(args[0].from, "1") == 0) {
if (info->inumbits == 0) {
btrfs_err(info,
"numdevs=1 not permitted with inumbits=0");
ret = -EINVAL;
}
info->num_devs = 1;
} else if (strcmp(args[0].from, "2") == 0) {
info->num_devs = 2;
} else {
btrfs_err(info,
"numdevs must be \"1\", \"2\", or \"many\".");
ret = -EINVAL;
goto out;
}
break;
case Opt_ratio:
ret = match_int(&args[0], &intarg);
if (ret)
Expand Down Expand Up @@ -1548,6 +1577,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
if (btrfs_test_opt(info, REF_VERIFY))
seq_puts(seq, ",ref_verify");
seq_printf(seq, ",inumbits=%u", info->inumbits);
seq_printf(seq, ",numdevs=%u", info->num_devs);
seq_printf(seq, ",subvolid=%llu",
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
subvol_name = btrfs_get_subvol_name_from_objectid(info,
Expand Down

0 comments on commit c5bae87

Please sign in to comment.