From c5c58ce12a209f285fab844d816cab25f7db4273 Mon Sep 17 00:00:00 2001 From: Theofilos Date: Tue, 3 Nov 2020 15:26:59 +0000 Subject: [PATCH 1/2] cephfs: mtime should not update rctime mtime is a user modifiable field and someone can alter it to reflect a modification time in the future or past. rctime should be depended only on ctime which itself is system depended (based on how POSIX works). By setting an mtime in the future, we render the usage of rctime redundant until the future timestamp is met. Signed-off-by: Theofilos Mouratidis --- src/mds/Locker.cc | 34 +++++++++++++++++++--------------- src/mds/MDCache.cc | 9 ++------- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index bacc79a10381d..b467e3184d370 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -2852,11 +2852,9 @@ bool Locker::check_inode_max_size(CInode *in, bool force_wrlock, pi.inode->rstat.rbytes = new_size; dout(10) << "check_inode_max_size mtime " << pi.inode->mtime << " -> " << new_mtime << dendl; pi.inode->mtime = new_mtime; - if (new_mtime > pi.inode->ctime) { - pi.inode->ctime = new_mtime; - if (new_mtime > pi.inode->rstat.rctime) - pi.inode->rstat.rctime = new_mtime; - } + pi.inode->ctime = ceph_clock_now(); + if (pi.inode->ctime > pi.inode->rstat.rctime) + pi.inode->rstat.rctime = pi.inode->ctime; } // use EOpen if the file is still open; otherwise, use EUpdate. @@ -3657,22 +3655,16 @@ void Locker::_update_cap_fields(CInode *in, int dirty, const cref_t return; /* m must be valid if there are dirty caps */ + bool changed = false; ceph_assert(m); uint64_t features = m->get_connection()->get_features(); - if (m->get_ctime() > pi->ctime) { - dout(7) << " ctime " << pi->ctime << " -> " << m->get_ctime() - << " for " << *in << dendl; - pi->ctime = m->get_ctime(); - if (m->get_ctime() > pi->rstat.rctime) - pi->rstat.rctime = m->get_ctime(); - } - if ((features & CEPH_FEATURE_FS_CHANGE_ATTR) && m->get_change_attr() > pi->change_attr) { dout(7) << " change_attr " << pi->change_attr << " -> " << m->get_change_attr() << " for " << *in << dendl; pi->change_attr = m->get_change_attr(); + changed = true; } // file @@ -3687,8 +3679,7 @@ void Locker::_update_cap_fields(CInode *in, int dirty, const cref_t dout(7) << " mtime " << pi->mtime << " -> " << mtime << " for " << *in << dendl; pi->mtime = mtime; - if (mtime > pi->rstat.rctime) - pi->rstat.rctime = mtime; + changed = true; } if (in->is_file() && // ONLY if regular file size > pi->size) { @@ -3696,11 +3687,13 @@ void Locker::_update_cap_fields(CInode *in, int dirty, const cref_t << " for " << *in << dendl; pi->size = size; pi->rstat.rbytes = size; + changed = true; } if (in->is_file() && (dirty & CEPH_CAP_FILE_WR) && inline_version > pi->inline_data.version) { pi->inline_data.version = inline_version; + changed = true; if (inline_version != CEPH_INLINE_NONE && m->inline_data.length() > 0) pi->inline_data.set_data(m->inline_data); else @@ -3710,12 +3703,14 @@ void Locker::_update_cap_fields(CInode *in, int dirty, const cref_t dout(7) << " atime " << pi->atime << " -> " << atime << " for " << *in << dendl; pi->atime = atime; + changed = true; } if ((dirty & CEPH_CAP_FILE_EXCL) && ceph_seq_cmp(pi->time_warp_seq, m->get_time_warp_seq()) < 0) { dout(7) << " time_warp_seq " << pi->time_warp_seq << " -> " << m->get_time_warp_seq() << " for " << *in << dendl; pi->time_warp_seq = m->get_time_warp_seq(); + changed = true; } } // auth @@ -3725,26 +3720,35 @@ void Locker::_update_cap_fields(CInode *in, int dirty, const cref_t << " -> " << m->head.uid << " for " << *in << dendl; pi->uid = m->head.uid; + changed = true; } if (m->head.gid != pi->gid) { dout(7) << " gid " << pi->gid << " -> " << m->head.gid << " for " << *in << dendl; pi->gid = m->head.gid; + changed = true; } if (m->head.mode != pi->mode) { dout(7) << " mode " << oct << pi->mode << " -> " << m->head.mode << dec << " for " << *in << dendl; pi->mode = m->head.mode; + changed = true; } if ((features & CEPH_FEATURE_FS_BTIME) && m->get_btime() != pi->btime) { dout(7) << " btime " << oct << pi->btime << " -> " << m->get_btime() << dec << " for " << *in << dendl; pi->btime = m->get_btime(); + changed = true; } } + + if (changed) { + pi->ctime = std::max(pi->ctime, ceph_clock_now()); + pi->rstat.rctime = std::max(pi->rstat.rctime, pi->ctime); + } } /* diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 9438925ffd65d..22614487cf851 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -2183,12 +2183,7 @@ void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob, pf->fragstat.mtime = mut->get_op_stamp(); pf->fragstat.change_attr++; dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl; - if (pf->fragstat.mtime > pf->rstat.rctime) { - dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl; - pf->rstat.rctime = pf->fragstat.mtime; - } else { - dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl; - } + pf->rstat.rctime = std::max(pf->rstat.rctime, ceph_clock_now()); } if (linkunlink) { dout(10) << "predirty_journal_parents updating size on " << *parent << dendl; @@ -2306,7 +2301,7 @@ void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob, pi.inode->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr); pf->accounted_fragstat = pf->fragstat; if (touched_mtime) - pi.inode->mtime = pi.inode->ctime = pi.inode->dirstat.mtime; + pi.inode->mtime = pi.inode->dirstat.mtime; if (touched_chattr) pi.inode->change_attr = pi.inode->dirstat.change_attr; dout(20) << "predirty_journal_parents gives " << pi.inode->dirstat << " on " << *pin << dendl; From 83d8f09870cbbb410d5cad0c368d1a7e64804d7e Mon Sep 17 00:00:00 2001 From: Theofilos Date: Tue, 3 Nov 2020 15:25:03 +0000 Subject: [PATCH 2/2] cephfs: make vxattr ceph.dir.rctime writable It is observed that clients have files with invalid ctimes on cephfs. Even if those files can be fixed, the parent directories up to the root have their rctime set to an invalid value. Make rctime modifiable and let sysadmins fix those invalid rctimes recusively (bottom-up). Signed-off-by: Theofilos Mouratidis --- qa/workunits/fs/misc/rstats.sh | 5 ++++ src/client/Client.cc | 8 ++++++- src/mds/Server.cc | 43 ++++++++++++++++++++++++++++++++++ src/mds/Server.h | 4 +++- 4 files changed, 58 insertions(+), 2 deletions(-) diff --git a/qa/workunits/fs/misc/rstats.sh b/qa/workunits/fs/misc/rstats.sh index e6b3eddf287e8..951cd274df7d5 100755 --- a/qa/workunits/fs/misc/rstats.sh +++ b/qa/workunits/fs/misc/rstats.sh @@ -75,6 +75,11 @@ touch d1/d2/f3 # create new file wait_until_changed rctime check_rctime +old_value=`getfattr --only-value -n ceph.dir.rctime .` +setfattr -n "ceph.dir.rctime" -v `date -d 'now+1hour' +%s` . +wait_until_changed rctime +check_rctime + cd .. rm -rf rstats_testdir echo OK diff --git a/src/client/Client.cc b/src/client/Client.cc index 86e01944f2e6b..2298d0074678b 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -12918,7 +12918,13 @@ const Client::VXattr Client::_dir_vxattrs[] = { XATTR_NAME_CEPH(dir, rsubdirs, VXATTR_RSTAT), XATTR_NAME_CEPH(dir, rsnaps, VXATTR_RSTAT), XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT), - XATTR_NAME_CEPH(dir, rctime, VXATTR_RSTAT), + { + name: "ceph.dir.rctime", + getxattr_cb: &Client::_vxattrcb_dir_rctime, + readonly: false, + exists_cb: NULL, + flags: VXATTR_RSTAT, + }, { name: "ceph.quota", getxattr_cb: &Client::_vxattrcb_quota, diff --git a/src/mds/Server.cc b/src/mds/Server.cc index b0113d73fb50f..86b73b27ef97f 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -5324,6 +5324,21 @@ bool Server::xlock_policylock(MDRequestRef& mdr, CInode *in, bool want_layout, b return true; } +bool Server::wrlock_nestlock(MDRequestRef& mdr, CInode *in) +{ + if (mdr->locking_state & MutationImpl::ALL_LOCKED) + return true; + + MutationImpl::LockOpVec lov; + lov.add_wrlock(&in->nestlock); + + if (!mds->locker->acquire_locks(mdr, lov)) + return false; + + mdr->locking_state |= MutationImpl::ALL_LOCKED; + return true; +} + CInode* Server::try_get_auth_inode(MDRequestRef& mdr, inodeno_t ino) { CInode *in = mdcache->get_inode(ino); @@ -5861,6 +5876,34 @@ void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur) auto pi = cur->project_inode(mdr); cur->setxattr_ephemeral_dist(val); pip = pi.inode.get(); + } else if (name == "ceph.dir.rctime") { + if (!cur->is_dir()) { + respond_to_request(mdr, -EINVAL); + return; + } + + uint64_t sec, nsec = 0; + try { + if (auto pos = value.find('.', 0); pos != string::npos) { + sec = boost::lexical_cast(value.substr(0, pos)); + nsec = boost::lexical_cast(value.substr(pos+1)); + } else { + sec = boost::lexical_cast(value); + } + } catch (boost::bad_lexical_cast const&) { + dout(10) << "bad vxattr value, unable to parse utime_t for " << name << dendl; + respond_to_request(mdr, -EINVAL); + return; + } + utime_t val(sec, nsec); + + if (!wrlock_nestlock(mdr, cur)) + return; + + auto pi = cur->project_inode(mdr); + pi.inode->rstat.rctime = val; + mdr->no_early_reply = true; + pip = pi.inode.get(); } else { dout(10) << " unknown vxattr " << name << dendl; respond_to_request(mdr, -CEPHFS_EINVAL); diff --git a/src/mds/Server.h b/src/mds/Server.h index 76f38717c7265..b0d5161b10c36 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -206,6 +206,7 @@ class Server { bool xlock_policylock(MDRequestRef& mdr, CInode *in, bool want_layout=false, bool xlock_snaplock=false); + bool wrlock_nestlock(MDRequestRef& mdr, CInode *in); CInode* try_get_auth_inode(MDRequestRef& mdr, inodeno_t ino); void handle_client_setattr(MDRequestRef& mdr); void handle_client_setlayout(MDRequestRef& mdr); @@ -418,7 +419,8 @@ class Server { xattr_name == "ceph.dir.subvolume" || xattr_name == "ceph.dir.pin" || xattr_name == "ceph.dir.pin.random" || - xattr_name == "ceph.dir.pin.distributed"; + xattr_name == "ceph.dir.pin.distributed" || + xattr_name == "ceph.dir.rctime"; } static bool is_allowed_ceph_xattr(std::string_view xattr_name) {