From c419878f8f85e4015421245d0f14630565a6e96c Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 4 Aug 2016 12:25:39 +0100 Subject: [PATCH] mds: trim null dentries proactively Instead of leaving null dentries (e.g. left behind from unlinks) in the cache until they fall out of the LRU, actively push them to the bottom of the LRU and then consume all nulls at the bottom in trim() even if the cache is not oversized yet. This fixes the case where standby replay daemons would otherwise accumulate a cache full of null dentries resulting from unlinks, and it makes the behaviour of active daemons more deterministic. Fixes: http://tracker.ceph.com/issues/16919 Signed-off-by: John Spray --- src/mds/MDCache.cc | 15 ++++++++++++--- src/mds/MDLog.cc | 1 + src/mds/journal.cc | 20 ++++++++++++++++++-- 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index c5b0ebae13cc3..6919cc888db1a 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -6350,10 +6350,19 @@ bool MDCache::trim(int max, int count) bool is_standby_replay = mds->is_standby_replay(); int unexpirable = 0; list unexpirables; - // trim dentries from the LRU - while (lru.lru_get_size() + unexpirable > (unsigned)max) { + + // trim dentries from the LRU: only enough to satisfy `max`, + // unless we see null dentries at the bottom of the LRU, + // in which case trim all those. 
+ bool trimming_nulls = true; + while (trimming_nulls || lru.lru_get_size() + unexpirable > (unsigned)max) { CDentry *dn = static_cast<CDentry*>(lru.lru_expire()); - if (!dn) break; + if (!dn) { + break; + } + if (!dn->get_linkage()->is_null()) { + trimming_nulls = false; + } if ((is_standby_replay && dn->get_linkage()->inode && dn->get_linkage()->inode->item_open_file.is_on_list()) || trim_dentry(dn, expiremap)) { diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc index bc018fabcc6e1..ccb02a02a15f3 100644 --- a/src/mds/MDLog.cc +++ b/src/mds/MDLog.cc @@ -850,6 +850,7 @@ void MDLog::replay(MDSInternalContextBase *c) // empty? if (journaler->get_read_pos() == journaler->get_write_pos()) { dout(10) << "replay - journal empty, done." << dendl; + mds->mdcache->trim(-1); if (c) { c->complete(0); } diff --git a/src/mds/journal.cc b/src/mds/journal.cc index 3954b612b0817..70a843be1df2e 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -1276,7 +1276,7 @@ void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) dn = dir->add_null_dentry(p->dn, p->dnfirst, p->dnlast); dn->set_version(p->dnv); if (p->is_dirty()) dn->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay added " << *dn << dendl; + dout(10) << "EMetaBlob.replay added (full) " << *dn << dendl; } else { dn->set_version(p->dnv); if (p->is_dirty()) dn->_mark_dirty(logseg); @@ -1302,6 +1302,7 @@ void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) mds->clog->warn(ss); } dir->unlink_inode(dn); + mds->mdcache->touch_dentry_bottom(dn); } if (unlinked.count(in)) linked.insert(in); @@ -1313,7 +1314,9 @@ void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) if (dn->get_linkage()->get_inode() != in && in->get_parent_dn()) { dout(10) << "EMetaBlob.replay unlinking " << *in << dendl; unlinked[in] = in->get_parent_dir(); + CDentry *unlinked_dn = in->get_parent_dn(); in->get_parent_dir()->unlink_inode(in->get_parent_dn()); + 
mds->mdcache->touch_dentry_bottom(unlinked_dn); } if (dn->get_linkage()->get_inode() != in) { if (!dn->get_linkage()->is_null()) { // note: might be remote. as with stray reintegration. @@ -1326,6 +1329,7 @@ void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) mds->clog->warn(ss); } dir->unlink_inode(dn); + mds->mdcache->touch_dentry_bottom(dn); } if (unlinked.count(in)) linked.insert(in); @@ -1371,6 +1375,7 @@ void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) dout(0) << ss.str() << dendl; } dir->unlink_inode(dn); + mds->mdcache->touch_dentry_bottom(dn); } dir->link_remote_inode(dn, p->ino, p->d_type); dn->set_version(p->dnv); @@ -1392,7 +1397,7 @@ void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) dn = dir->add_null_dentry(p->dn, p->dnfirst, p->dnlast); dn->set_version(p->dnv); if (p->dirty) dn->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay added " << *dn << dendl; + dout(10) << "EMetaBlob.replay added (nullbit) " << *dn << dendl; } else { dn->first = p->dnfirst; if (!dn->get_linkage()->is_null()) { @@ -1405,6 +1410,7 @@ void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) if (dn->get_linkage()->is_primary()) unlinked[in] = dir; dir->unlink_inode(dn); + mds->mdcache->touch_dentry_bottom(dn); } } dn->set_version(p->dnv); @@ -1415,6 +1421,10 @@ void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) olddir = dir; if (lump.is_importing()) dn->state_set(CDentry::STATE_AUTH); + + // Make null dentries the first things we trim + dout(10) << "EMetaBlob.replay pushing to bottom of lru " << *dn << dendl; + mds->mdcache->touch_dentry_bottom(dn); } } @@ -1622,7 +1632,13 @@ void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) CInode *in = mds->mdcache->get_inode(*p); if (in) { dout(10) << "EMetaBlob.replay destroyed " << *p << ", dropping " << *in << dendl; + CDentry *parent = in->get_parent_dn(); 
mds->mdcache->remove_inode(in); + if (parent) { + dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent << dendl; + assert(parent->get_linkage()->is_null()); + mds->mdcache->touch_dentry_bottom(parent); + } } else { dout(10) << "EMetaBlob.replay destroyed " << *p << ", not in cache" << dendl; }