From d244b7a0c6eb4a57a424297d4293184dff28b94c Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Fri, 17 Jun 2016 11:53:32 -0400 Subject: [PATCH] mds: add maximum fragment size constraint This commit adds a new config option mds_bal_fragment_size_max = 10000*10, which is an order of magnitude larger than mds_bal_split_size. This limit prevents a fragment from growing too large, which would otherwise result in large omap directories. Right now the limit is enforced only in the RPC paths and in stray directory entry creation. Fixes: http://tracker.ceph.com/issues/16164 Signed-off-by: Patrick Donnelly (cherry picked from commit 60af83c80910070d8fb10ac7a4f6f24d49521c1b) --- src/common/config_opts.h | 1 + src/mds/MDCache.cc | 10 ++++++- src/mds/MDCache.h | 1 + src/mds/Server.cc | 59 ++++++++++++++++++++++++++++++++++++---- src/mds/Server.h | 1 + 5 files changed, 65 insertions(+), 7 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 5d0ff42735a2fe..1e790870272f34 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -482,6 +482,7 @@ OPTION(mds_bal_merge_rd, OPT_FLOAT, 1000) OPTION(mds_bal_merge_wr, OPT_FLOAT, 1000) OPTION(mds_bal_interval, OPT_INT, 10) // seconds OPTION(mds_bal_fragment_interval, OPT_INT, 5) // seconds +OPTION(mds_bal_fragment_size_max, OPT_INT, 10000*10) // order of magnitude higher than split size OPTION(mds_bal_idle_threshold, OPT_FLOAT, 0) OPTION(mds_bal_max, OPT_INT, -1) OPTION(mds_bal_max_until, OPT_INT, -1) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index ddf46842276fba..1e5ded7ee1651e 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -720,7 +720,7 @@ void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSInternalContextBase *fin) discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1))); } -CDentry *MDCache::get_or_create_stray_dentry(CInode *in) +CDir *MDCache::get_stray_dir(CInode *in) { string straydname; in->name_stray_dentry(straydname); @@ -730,6 +730,14 @@ CDentry 
*MDCache::get_or_create_stray_dentry(CInode *in) frag_t fg = strayi->pick_dirfrag(straydname); CDir *straydir = strayi->get_dirfrag(fg); assert(straydir); + return straydir; +} + +CDentry *MDCache::get_or_create_stray_dentry(CInode *in) +{ + CDir *straydir = get_stray_dir(in); + string straydname; + in->name_stray_dentry(straydname); CDentry *straydn = straydir->lookup(straydname); if (!straydn) { straydn = straydir->add_null_dentry(straydname); diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 5aadc13981ee34..a25ad2762cd98a 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -825,6 +825,7 @@ class MDCache { version_t dpv, MDSInternalContextBase *fin); void open_foreign_mdsdir(inodeno_t ino, MDSInternalContextBase *c); + CDir *get_stray_dir(CInode *in); CDentry *get_or_create_stray_dentry(CInode *in); MDSInternalContextBase *_get_waiter(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 313744718b0b18..3cb0715f088149 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -2147,6 +2147,23 @@ bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask) return true; } +/** + * check whether fragment has reached maximum size + * + */ +bool Server::check_fragment_space(MDRequestRef &mdr, CDir *in) +{ + const auto size = in->get_frag_size(); + if (size >= g_conf->mds_bal_fragment_size_max) { + dout(10) << "fragment " << *in << " size exceeds " << g_conf->mds_bal_fragment_size_max << " (ENOSPC)" << dendl; + respond_to_request(mdr, -ENOSPC); + return false; + } + + return true; +} + + /** validate_dentry_dir * * verify that the dir exists and would own the dname. 
@@ -2231,15 +2248,20 @@ CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in) { CDentry *straydn = mdr->straydn; if (straydn) { - string name; - in->name_stray_dentry(name); - if (straydn->get_name() == name) + string straydname; + in->name_stray_dentry(straydname); + if (straydn->get_name() == straydname) return straydn; assert(!mdr->done_locking); mdr->unpin(straydn); } + CDir *straydir = mdcache->get_stray_dir(in); + + if (!check_fragment_space(mdr, straydir)) + return NULL; + straydn = mdcache->get_or_create_stray_dentry(in); mdr->straydn = straydn; mdr->pin(straydn); @@ -3168,7 +3190,8 @@ void Server::handle_client_openc(MDRequestRef& mdr) return; } - CInode *diri = dn->get_dir()->get_inode(); + CDir *dir = dn->get_dir(); + CInode *diri = dir->get_inode(); rdlocks.insert(&diri->authlock); if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; @@ -3176,6 +3199,9 @@ void Server::handle_client_openc(MDRequestRef& mdr) if (!check_access(mdr, diri, access)) return; + if (!check_fragment_space(mdr, dir)) + return; + CDentry::linkage_t *dnl = dn->get_projected_linkage(); if (!dnl->is_null()) { @@ -4577,6 +4603,9 @@ void Server::handle_client_mknod(MDRequestRef& mdr) if (!check_access(mdr, diri, MAY_WRITE)) return; + if (!check_fragment_space(mdr, dn->get_dir())) + return; + unsigned mode = req->head.args.mknod.mode; if ((mode & S_IFMT) == 0) mode |= S_IFREG; @@ -4660,7 +4689,8 @@ void Server::handle_client_mkdir(MDRequestRef& mdr) respond_to_request(mdr, -EROFS); return; } - CInode *diri = dn->get_dir()->get_inode(); + CDir *dir = dn->get_dir(); + CInode *diri = dir->get_inode(); rdlocks.insert(&diri->authlock); if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; @@ -4669,6 +4699,9 @@ void Server::handle_client_mkdir(MDRequestRef& mdr) if (!check_access(mdr, diri, MAY_WRITE)) return; + if (!check_fragment_space(mdr, dir)) + return; + // new inode SnapRealm *realm = dn->get_dir()->inode->find_snaprealm(); snapid_t 
follows = realm->get_newest_seq(); @@ -4740,7 +4773,8 @@ void Server::handle_client_symlink(MDRequestRef& mdr) respond_to_request(mdr, -EROFS); return; } - CInode *diri = dn->get_dir()->get_inode(); + CDir *dir = dn->get_dir(); + CInode *diri = dir->get_inode(); rdlocks.insert(&diri->authlock); if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; @@ -4748,6 +4782,9 @@ void Server::handle_client_symlink(MDRequestRef& mdr) if (!check_access(mdr, diri, MAY_WRITE)) return; + if (!check_fragment_space(mdr, dir)) + return; + unsigned mode = S_IFLNK | 0777; CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode); assert(newi); @@ -4821,6 +4858,9 @@ void Server::handle_client_link(MDRequestRef& mdr) if (!check_access(mdr, dir->get_inode(), MAY_WRITE)) return; + if (!check_fragment_space(mdr, dir)) + return; + // go! assert(g_conf->mds_kill_link_at != 1); @@ -5392,6 +5432,8 @@ void Server::handle_client_unlink(MDRequestRef& mdr) CDentry *straydn = NULL; if (dnl->is_primary()) { straydn = prepare_stray_dentry(mdr, dnl->get_inode()); + if (!straydn) + return; dout(10) << " straydn is " << *straydn << dendl; } else if (mdr->straydn) { mdr->unpin(mdr->straydn); @@ -6171,6 +6213,8 @@ void Server::handle_client_rename(MDRequestRef& mdr) CDentry *straydn = NULL; if (destdnl->is_primary() && !linkmerge) { straydn = prepare_stray_dentry(mdr, destdnl->get_inode()); + if (!straydn) + return; dout(10) << " straydn is " << *straydn << dendl; } else if (mdr->straydn) { mdr->unpin(mdr->straydn); @@ -6281,6 +6325,9 @@ void Server::handle_client_rename(MDRequestRef& mdr) if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE)) return; + if (!check_fragment_space(mdr, destdn->get_dir())) + return; + if (!check_access(mdr, srci, MAY_WRITE)) return; diff --git a/src/mds/Server.h b/src/mds/Server.h index 66aa6b9e4449de..0e871031cd2f00 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -134,6 +134,7 @@ class Server { void 
handle_slave_auth_pin_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack); // some helpers + bool check_fragment_space(MDRequestRef& mdr, CDir *in); bool check_access(MDRequestRef& mdr, CInode *in, unsigned mask); bool _check_access(Session *session, CInode *in, unsigned mask, int caller_uid, int caller_gid, int setattr_uid, int setattr_gid); CDir *validate_dentry_dir(MDRequestRef& mdr, CInode *diri, const string& dname);