From 81376b6a1a5db9f8b45c58ea318ec924e932d990 Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Tue, 3 Feb 2015 14:46:39 +0800 Subject: [PATCH 1/8] osdc: In _readx() only no error can tidy read result. Signed-off-by: Jianpeng Ma (cherry picked from commit 540346d4a901d8041c9fd74641c98cdfd2e1ab32) --- src/osdc/ObjectCacher.cc | 98 ++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 48 deletions(-) diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc index e7fa908e13d4b..3f52f55bdffcf 100644 --- a/src/osdc/ObjectCacher.cc +++ b/src/osdc/ObjectCacher.cc @@ -1210,56 +1210,58 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish, } } - // create reverse map of buffer offset -> object for the eventual result. - // this is over a single ObjectExtent, so we know that - // - the bh's are contiguous - // - the buffer frags need not be (and almost certainly aren't) - loff_t opos = ex_it->offset; - map::iterator bh_it = hits.begin(); - assert(bh_it->second->start() <= opos); - uint64_t bhoff = opos - bh_it->second->start(); - vector >::iterator f_it = ex_it->buffer_extents.begin(); - uint64_t foff = 0; - while (1) { - BufferHead *bh = bh_it->second; - assert(opos == (loff_t)(bh->start() + bhoff)); - - uint64_t len = MIN(f_it->second - foff, bh->length() - bhoff); - ldout(cct, 10) << "readx rmap opos " << opos - << ": " << *bh << " +" << bhoff - << " frag " << f_it->first << "~" << f_it->second << " +" << foff << "~" << len - << dendl; + if (!error) { + // create reverse map of buffer offset -> object for the eventual result. + // this is over a single ObjectExtent, so we know that + // - the bh's are contiguous + // - the buffer frags need not be (and almost certainly aren't) + loff_t opos = ex_it->offset; + map::iterator bh_it = hits.begin(); + assert(bh_it->second->start() <= opos); + uint64_t bhoff = opos - bh_it->second->start(); + vector >::iterator f_it = ex_it->buffer_extents.begin(); + uint64_t foff = 0; + while (1) { + BufferHead *bh = bh_it->second; + assert(opos == (loff_t)(bh->start() + bhoff)); + + uint64_t len = MIN(f_it->second - foff, bh->length() - bhoff); + ldout(cct, 10) << "readx rmap opos " << opos + << ": " << *bh << " +" << bhoff + << " frag " << f_it->first << "~" << f_it->second << " +" << foff << "~" << len + << dendl; + + bufferlist bit; // put substr here first, since substr_of clobbers, and + // we may get multiple bh's at this stripe_map position + if (bh->is_zero()) { + bufferptr bp(len); + bp.zero(); + stripe_map[f_it->first].push_back(bp); + } else { + bit.substr_of(bh->bl, + opos - bh->start(), + len); + stripe_map[f_it->first].claim_append(bit); + } - bufferlist bit; // put substr here first, since substr_of clobbers, and - // we may get multiple bh's at this stripe_map position - if (bh->is_zero()) { - bufferptr bp(len); - bp.zero(); - stripe_map[f_it->first].push_back(bp); - } else { - bit.substr_of(bh->bl, - opos - bh->start(), - len); - stripe_map[f_it->first].claim_append(bit); + opos += len; + bhoff += len; + foff += len; + if (opos == bh->end()) { + ++bh_it; + bhoff = 0; + } + if (foff == f_it->second) { + ++f_it; + foff = 0; + } + if (bh_it == hits.end()) break; + if (f_it == ex_it->buffer_extents.end()) + break; } - - opos += len; - bhoff += len; - foff += len; - if (opos == bh->end()) { - ++bh_it; - bhoff = 0; - } - if (foff == f_it->second) { - ++f_it; - foff = 0; - } - if (bh_it == hits.end()) break; - if (f_it == ex_it->buffer_extents.end()) - break; + assert(f_it == ex_it->buffer_extents.end()); + assert(opos == (loff_t)ex_it->offset + (loff_t)ex_it->length); } - assert(f_it == ex_it->buffer_extents.end()); - assert(opos == (loff_t)ex_it->offset + (loff_t)ex_it->length); if (dontneed && o->include_all_cached_data(ex_it->offset, ex_it->length)) bottouch_ob(o); @@ -1303,7 +1305,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish, assert(rd->bl->length() == pos); } ldout(cct, 10) << "readx result is " << rd->bl->length() << dendl; - } else { + } else if (!error) { ldout(cct, 10) << "readx no bufferlist ptr (readahead?), done." << dendl; map::reverse_iterator i = stripe_map.rbegin(); pos = i->first + i->second.length(); From 4135b9a2d199583685a1bae3713347dcc1b872e6 Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Fri, 6 Feb 2015 09:53:36 +0800 Subject: [PATCH 2/8] osdc: For trust_enoent is true, there is only one extent. Now the judgement only in conditon which will return -ENOENT. But o->exists don't depend on the extent size. It only depend on trust_enoent. So move this judgement at the first of _readx(). Make this bug ASAP occur. Signed-off-by: Jianpeng Ma (cherry picked from commit 2449fddc13b5ce8bbf1bb4ecaa5d6937f54e54d1) --- src/osdc/ObjectCacher.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc index 3f52f55bdffcf..ec4650d8a1787 100644 --- a/src/osdc/ObjectCacher.cc +++ b/src/osdc/ObjectCacher.cc @@ -1060,6 +1060,13 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish, map stripe_map; // final buffer offset -> substring bool dontneed = rd->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_DONTNEED; + /* + * WARNING: we can only meaningfully return ENOENT if the read request + * passed in a single ObjectExtent. Any caller who wants ENOENT instead of + * zeroed buffers needs to feed single extents into readx(). + */ + assert(!oset->return_enoent || rd->extents.size() == 1); + for (vector::iterator ex_it = rd->extents.begin(); ex_it != rd->extents.end(); ++ex_it) { @@ -1075,10 +1082,6 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish, // does not exist and no hits? if (oset->return_enoent && !o->exists) { - // WARNING: we can only meaningfully return ENOENT if the read request - // passed in a single ObjectExtent. Any caller who wants ENOENT instead of - // zeroed buffers needs to feed single extents into readx(). - assert(rd->extents.size() == 1); ldout(cct, 10) << "readx object !exists, 1 extent..." << dendl; // should we worry about COW underneaeth us? From c96541ad19fb142ed31ff3006f1dc25e0c1de86c Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Mon, 2 Mar 2015 11:36:24 +0800 Subject: [PATCH 3/8] osdc: Make last missing bh to wake up the reader. Avoid wakeup early and wait again. Signed-off-by: Jianpeng Ma (cherry picked from commit d582bda090b3339d03e25b0e6d0971ad0267f476) --- src/osdc/ObjectCacher.cc | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc index ec4650d8a1787..5d54ed8a88f9a 100644 --- a/src/osdc/ObjectCacher.cc +++ b/src/osdc/ObjectCacher.cc @@ -1142,6 +1142,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish, if (!missing.empty() || !rx.empty()) { // read missing + map::iterator last = missing.end(); for (map::iterator bh_it = missing.begin(); bh_it != missing.end(); ++bh_it) { @@ -1163,15 +1164,20 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish, delete bh_it->second; } else { bh_read(bh_it->second, rd->fadvise_flags); - if (success && onfinish) { - ldout(cct, 10) << "readx missed, waiting on " << *bh_it->second - << " off " << bh_it->first << dendl; - bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, oset, onfinish) ); - } + if ((success && onfinish) || last != missing.end()) + last = bh_it; } success = false; } + //add wait in last bh avoid wakeup early. Because read is order + if (last != missing.end()) { + ldout(cct, 10) << "readx missed, waiting on " << *last->second + << " off " << last->first << dendl; + last->second->waitfor_read[last->first].push_back( new C_RetryRead(this, rd, oset, onfinish) ); + + } + // bump rx for (map::iterator bh_it = rx.begin(); bh_it != rx.end(); From 86e7698a19745c26b9d3e7a12a16c87ea9a5d565 Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Mon, 2 Mar 2015 11:23:44 +0800 Subject: [PATCH 4/8] osdc: After write try merge bh. Signed-off-by: Jianpeng Ma (cherry picked from commit 1a48a8a2b222e41236341cb1241f0885a1b0b9d8) --- src/osdc/ObjectCacher.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc index 5d54ed8a88f9a..995b37bdc6225 100644 --- a/src/osdc/ObjectCacher.cc +++ b/src/osdc/ObjectCacher.cc @@ -886,6 +886,7 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, loff_t start, } } + list hit; // apply to bh's! for (map::iterator p = ob->data_lower_bound(start); p != ob->data.end(); @@ -917,6 +918,7 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, loff_t start, if (r >= 0) { // ok! mark bh clean and error-free mark_clean(bh); + hit.push_back(bh); ldout(cct, 10) << "bh_write_commit clean " << *bh << dendl; } else { mark_dirty(bh); @@ -926,6 +928,13 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, loff_t start, } } + for (list::iterator bh = hit.begin(); + bh != hit.end(); + ++bh) { + assert(*bh); + ob->try_merge_bh(*bh); + } + // update last_commit. assert(ob->last_commit_tid < tid); ob->last_commit_tid = tid; From 5c4f152efa8e8f57c59ea7decc05ae1a34f2a9ee Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Thu, 5 Feb 2015 10:13:06 +0800 Subject: [PATCH 5/8] osdc: Don't pass mutex into ObjectCacher::_wait_for_write. Because the mutex is the same as ObjectCacher::lock. Signed-off-by: Jianpeng Ma (cherry picked from commit d7cf7aeea5cba1ffa8e51ff1ad2424b1ec161a12) --- src/client/Client.cc | 4 +--- src/librbd/ImageCtx.cc | 2 +- src/osdc/ObjectCacher.cc | 7 +++---- src/osdc/ObjectCacher.h | 11 ++++------- src/test/osdc/object_cacher_stress.cc | 2 +- 5 files changed, 10 insertions(+), 16 deletions(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index 446f0d1260889..ab42545a614b4 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -7365,9 +7365,7 @@ int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf) // async, caching, non-blocking. r = objectcacher->file_write(&in->oset, &in->layout, in->snaprealm->get_snap_context(), - offset, size, bl, ceph_clock_now(cct), 0, - client_lock); - + offset, size, bl, ceph_clock_now(cct), 0); put_cap_ref(in, CEPH_CAP_FILE_BUFFER); if (r < 0) diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc index 5a88adb10348d..8f3c8fbd89a6f 100644 --- a/src/librbd/ImageCtx.cc +++ b/src/librbd/ImageCtx.cc @@ -632,7 +632,7 @@ class ThreadPoolSingleton : public ThreadPool { wr->extents.push_back(extent); { Mutex::Locker l(cache_lock); - object_cacher->writex(wr, object_set, cache_lock, onfinish); + object_cacher->writex(wr, object_set, onfinish); } } diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc index 995b37bdc6225..705814892e6a1 100644 --- a/src/osdc/ObjectCacher.cc +++ b/src/osdc/ObjectCacher.cc @@ -1354,8 +1354,7 @@ void ObjectCacher::retry_waiting_reads() waitfor_read.splice(waitfor_read.end(), ls); } -int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Mutex& wait_on_lock, - Context *onfreespace) +int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace) { assert(lock.is_locked()); utime_t now = ceph_clock_now(cct); @@ -1428,7 +1427,7 @@ int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Mutex& wait_on_lock, } } - int r = _wait_for_write(wr, bytes_written, oset, wait_on_lock, onfreespace); + int r = _wait_for_write(wr, bytes_written, oset, onfreespace); delete wr; //verify_stats(); @@ -1476,7 +1475,7 @@ void ObjectCacher::maybe_wait_for_writeback(uint64_t len) } // blocking wait for write. -int ObjectCacher::_wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset, Mutex& lock, Context *onfreespace) +int ObjectCacher::_wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset, Context *onfreespace) { assert(lock.is_locked()); int ret = 0; diff --git a/src/osdc/ObjectCacher.h b/src/osdc/ObjectCacher.h index ca23549ceac95..0bef597fdf65b 100644 --- a/src/osdc/ObjectCacher.h +++ b/src/osdc/ObjectCacher.h @@ -602,14 +602,12 @@ class ObjectCacher { * the return value is total bytes read */ int readx(OSDRead *rd, ObjectSet *oset, Context *onfinish); - int writex(OSDWrite *wr, ObjectSet *oset, Mutex& wait_on_lock, - Context *onfreespace); + int writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace); bool is_cached(ObjectSet *oset, vector& extents, snapid_t snapid); private: // write blocking - int _wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset, Mutex& lock, - Context *onfreespace); + int _wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset, Context *onfreespace); void maybe_wait_for_writeback(uint64_t len); bool _flush_set_finish(C_GatherBuilder *gather, Context *onfinish); @@ -678,11 +676,10 @@ class ObjectCacher { int file_write(ObjectSet *oset, ceph_file_layout *layout, const SnapContext& snapc, loff_t offset, uint64_t len, - bufferlist& bl, utime_t mtime, int flags, - Mutex& wait_on_lock) { + bufferlist& bl, utime_t mtime, int flags) { OSDWrite *wr = prepare_write(snapc, bl, mtime, flags); Striper::file_to_extents(cct, oset->ino, layout, offset, len, oset->truncate_size, wr->extents); - return writex(wr, oset, wait_on_lock, NULL); + return writex(wr, oset, NULL); } bool file_flush(ObjectSet *oset, ceph_file_layout *layout, const SnapContext& snapc, diff --git a/src/test/osdc/object_cacher_stress.cc b/src/test/osdc/object_cacher_stress.cc index 4f6fffe05c998..ec5f926082743 100644 --- a/src/test/osdc/object_cacher_stress.cc +++ b/src/test/osdc/object_cacher_stress.cc @@ -112,7 +112,7 @@ int stress_test(uint64_t num_ops, uint64_t num_objs, ObjectCacher::OSDWrite *wr = obc.prepare_write(snapc, bl, utime_t(), 0); wr->extents.push_back(op->extent); lock.Lock(); - obc.writex(wr, &object_set, lock, NULL); + obc.writex(wr, &object_set, NULL); lock.Unlock(); } } From 28838f20a722b72a6e926e53f104342d3b9f4791 Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Thu, 5 Feb 2015 11:03:10 +0800 Subject: [PATCH 6/8] osdc: clean up code in ObjectCacher::Object::map_write Signed-off-by: Jianpeng Ma (cherry picked from commit 9f80c2909ace09cd51c24b49c98a093e0e864dca) --- src/osdc/ObjectCacher.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc index 705814892e6a1..d21292e08fc06 100644 --- a/src/osdc/ObjectCacher.cc +++ b/src/osdc/ObjectCacher.cc @@ -379,7 +379,7 @@ ObjectCacher::BufferHead *ObjectCacher::Object::map_write(OSDWrite *wr) if (p->first < cur) { assert(final == 0); - if (cur + max >= p->first + p->second->length()) { + if (cur + max >= bh->end()) { // we want right bit (one splice) final = split(bh, cur); // just split it, take right half. ++p; @@ -393,7 +393,7 @@ ObjectCacher::BufferHead *ObjectCacher::Object::map_write(OSDWrite *wr) } } else { assert(p->first == cur); - if (p->second->length() <= max) { + if (bh->length() <= max) { // whole bufferhead, piece of cake. } else { // we want left bit (one splice) From 05734916ab119c6d3879c2ce2bc9f9581907861a Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Thu, 5 Feb 2015 11:28:50 +0800 Subject: [PATCH 7/8] librbd: Remvoe unused func ImageCtx::read_from_cache. Signed-off-by: Jianpeng Ma (cherry picked from commit 101440a41253680770f94bc380af7072c7adaebf) --- src/librbd/ImageCtx.cc | 15 --------------- src/librbd/ImageCtx.h | 2 -- 2 files changed, 17 deletions(-) diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc index 8f3c8fbd89a6f..ebdd184616bdb 100644 --- a/src/librbd/ImageCtx.cc +++ b/src/librbd/ImageCtx.cc @@ -636,21 +636,6 @@ class ThreadPoolSingleton : public ThreadPool { } } - int ImageCtx::read_from_cache(object_t o, uint64_t object_no, bufferlist *bl, - size_t len, uint64_t off) { - int r; - Mutex mylock("librbd::ImageCtx::read_from_cache"); - Cond cond; - bool done; - Context *onfinish = new C_SafeCond(&mylock, &cond, &done, &r); - aio_read_from_cache(o, object_no, bl, len, off, onfinish, 0); - mylock.Lock(); - while (!done) - cond.Wait(mylock); - mylock.Unlock(); - return r; - } - void ImageCtx::user_flushed() { if (object_cacher && cct->_conf->rbd_cache_writethrough_until_flush) { md_lock.get_read(); diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h index f9a88784e18f9..1022474e8a5b7 100644 --- a/src/librbd/ImageCtx.h +++ b/src/librbd/ImageCtx.h @@ -191,8 +191,6 @@ namespace librbd { int fadvise_flags); void write_to_cache(object_t o, const bufferlist& bl, size_t len, uint64_t off, Context *onfinish, int fadvise_flags); - int read_from_cache(object_t o, uint64_t object_no, bufferlist *bl, - size_t len, uint64_t off); void user_flushed(); void flush_cache_aio(Context *onfinish); int flush_cache(); From 6c4ccc854fa8a8403b03785b06cb35a7174f4f42 Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Mon, 9 Mar 2015 14:23:23 +0800 Subject: [PATCH 8/8] librbd: Add a paramter:purge_on_error in ImageCtx::invalidate_cache(). If bh_write met error, it will try again. For closing image, if met this issue, it will trigger a assert: >>2015-02-03 15:22:49.198292 7ff62d537800 -1 osdc/ObjectCacher.cc: In function 'ObjectCacher::~ObjectCacher()' thread 7ff62d537800 time >>2015-02-03 15:22:49.195927osdc/ObjectCacher.cc: 551: FAILED >>assert(i->empty()) Now add purge_on_error, when shutdown_cache it set true. In ImageCtx::invalidate_cache, if met error and purge_on_error is true, purge the dirty bh. Signed-off-by: Jianpeng Ma (cherry picked from commit 35def5c81f7fc83d55d18320e4860c6a63d4c7f5) Conflicts: src/librbd/ImageCtx.cc : trivial resolution --- src/librbd/ImageCtx.cc | 18 +++++++++++++++--- src/librbd/ImageCtx.h | 2 +- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc index ebdd184616bdb..0f5d46a56f468 100644 --- a/src/librbd/ImageCtx.cc +++ b/src/librbd/ImageCtx.cc @@ -683,14 +683,26 @@ class ThreadPoolSingleton : public ThreadPool { flush_async_operations(); RWLock::RLocker owner_locker(owner_lock); - invalidate_cache(); + invalidate_cache(true); object_cacher->stop(); } - int ImageCtx::invalidate_cache() { + int ImageCtx::invalidate_cache(bool purge_on_error) { + int result; C_SaferCond ctx; invalidate_cache(&ctx); - return ctx.wait(); + result = ctx.wait(); + + if (result && purge_on_error) { + cache_lock.Lock(); + if (object_cacher != NULL) { + lderr(cct) << "invalidate cache met error " << cpp_strerror(result) << " !Purging cache..." << dendl; + object_cacher->purge_set(object_set); + } + cache_lock.Unlock(); + } + + return result; } void ImageCtx::invalidate_cache(Context *on_finish) { diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h index 1022474e8a5b7..238b0ab6bd857 100644 --- a/src/librbd/ImageCtx.h +++ b/src/librbd/ImageCtx.h @@ -195,7 +195,7 @@ namespace librbd { void flush_cache_aio(Context *onfinish); int flush_cache(); void shutdown_cache(); - int invalidate_cache(); + int invalidate_cache(bool purge_on_error=false); void invalidate_cache(Context *on_finish); void invalidate_cache_completion(int r, Context *on_finish); void clear_nonexistence_cache();