From 115545afa2de6a6cd8495d6a233e7303808c29ea Mon Sep 17 00:00:00 2001 From: Michal Jarzabek Date: Wed, 20 May 2015 22:20:38 +0100 Subject: [PATCH 001/654] ceph-detect-init/debian/__init__: improved syntax Signed-off-by: Michal Jarzabek --- src/ceph-detect-init/ceph_detect_init/debian/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ceph-detect-init/ceph_detect_init/debian/__init__.py b/src/ceph-detect-init/ceph_detect_init/debian/__init__.py index 7518562e2f31c..7209ff6ba192a 100644 --- a/src/ceph-detect-init/ceph_detect_init/debian/__init__.py +++ b/src/ceph-detect-init/ceph_detect_init/debian/__init__.py @@ -8,6 +8,6 @@ def choose_init(): Returns the name of a init system (upstart, sysvinit ...). """ - if distro.lower() == 'ubuntu' or distro.lower() == 'linuxmint': + if distro.lower() in ('ubuntu', 'linuxmint'): return 'upstart' return 'sysvinit' From 51862e3d057559aa8ad39627d149dfbeb68160af Mon Sep 17 00:00:00 2001 From: Min Chen Date: Mon, 15 Jun 2015 09:48:51 +0800 Subject: [PATCH 002/654] bug fix: librados segmentation fault, when two read ops share one AioCompletionImpl This is a serious BUG: In librados, use two read ops to read one object twice with the same completion will cause segmentation fault. reproduce test code as bellow: rados_read_op_read(read_op, 0, len, buf, &bytes_read, &ret); rados_read_op_read(read_op2, 0, len, buf2, &bytes_read2, &ret2); ret = rados_aio_read_op_operate(read_op, ioctx, read_completion, object, 0); ret = rados_aio_read_op_operate(read_op2, ioctx, read_completion, object, 0); ret = rados_aio_wait_for_complete(read_completion); In order to fix it, we just need an assert() to make sure there is only one IoCtx on a single AioCompletionImpl. 
Signed-off-by: Min Chen --- src/librados/IoCtxImpl.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/librados/IoCtxImpl.cc b/src/librados/IoCtxImpl.cc index 15ebc6fd8dc25..50a600beb1b0e 100644 --- a/src/librados/IoCtxImpl.cc +++ b/src/librados/IoCtxImpl.cc @@ -1299,6 +1299,7 @@ void librados::IoCtxImpl::set_notify_timeout(uint32_t timeout) librados::IoCtxImpl::C_aio_Ack::C_aio_Ack(AioCompletionImpl *_c) : c(_c) { + assert(!c->io); c->get(); } @@ -1331,6 +1332,7 @@ librados::IoCtxImpl::C_aio_stat_Ack::C_aio_stat_Ack(AioCompletionImpl *_c, time_t *pm) : c(_c), pmtime(pm) { + assert(!c->io); c->get(); } From 0f1d7aed4f5fa354fefd45416e459cce0bea5780 Mon Sep 17 00:00:00 2001 From: Ruben Kerkhof Date: Mon, 6 Jul 2015 20:14:08 +0200 Subject: [PATCH 003/654] Fix indentation Signed-off-by: Ruben Kerkhof --- src/client/Client.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index 4e70756d4c062..333c10f76ec77 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -7133,7 +7133,7 @@ int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset) { if (iovcnt < 0) return EINVAL; - return _preadv_pwritev(fd, iov, iovcnt, offset, false); + return _preadv_pwritev(fd, iov, iovcnt, offset, false); } int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl) @@ -7462,7 +7462,7 @@ int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset) { if (iovcnt < 0) return EINVAL; - return _preadv_pwritev(fd, iov, iovcnt, offset, true); + return _preadv_pwritev(fd, iov, iovcnt, offset, true); } int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write) From 67de12bf9b67c29bf613e831f4146ff9809e42f7 Mon Sep 17 00:00:00 2001 From: xinxin shu Date: Wed, 17 Jun 2015 06:49:58 +0800 Subject: [PATCH 004/654] Fixes : #12018 osd/OSD.cc : drop write if pool is full Signed-off-by: xinxin shu --- src/osd/OSD.cc | 33 
++++++++++++++++++++++++++++++--- src/osd/OSD.h | 1 + src/osd/osd_types.cc | 5 ++++- src/osd/osd_types.h | 1 + 4 files changed, 36 insertions(+), 4 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index f6be098442c09..fb9f8a7d99021 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -6119,6 +6119,7 @@ void OSD::handle_osd_map(MOSDMap *m) o->decode(bl); if (o->test_flag(CEPH_OSDMAP_FULL)) last_marked_full = e; + set_pool_last_map_marked_full(o, e); hobject_t fulloid = get_osdmap_pobject_name(e); t.write(META_COLL, fulloid, 0, bl.length(), bl); @@ -6152,6 +6153,7 @@ void OSD::handle_osd_map(MOSDMap *m) if (o->test_flag(CEPH_OSDMAP_FULL)) last_marked_full = e; + set_pool_last_map_marked_full(o, e); bufferlist fbl; o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED); @@ -8050,6 +8052,9 @@ void OSD::handle_op(OpRequestRef& op, OSDMapRef& osdmap) } } + // calc actual pgid + pg_t _pgid = m->get_pg(); + int64_t pool = _pgid.pool(); if (op->may_write()) { // full? if ((service.check_failsafe_full() || @@ -8061,6 +8066,17 @@ void OSD::handle_op(OpRequestRef& op, OSDMapRef& osdmap) return; } + const pg_pool_t *pi = osdmap->get_pg_pool(pool); + if (!pi) { + return; + } + // pool is full ? + map &pool_last_map_marked_full = superblock.pool_last_map_marked_full; + if (pi->has_flag(pg_pool_t::FLAG_FULL) || + (pool_last_map_marked_full.count(pool) && (m->get_map_epoch() < pool_last_map_marked_full[pool]))) { + return; + } + // invalid? 
if (m->get_snapid() != CEPH_NOSNAP) { service.reply_op_error(op, -EINVAL); @@ -8079,9 +8095,6 @@ void OSD::handle_op(OpRequestRef& op, OSDMapRef& osdmap) } } - // calc actual pgid - pg_t _pgid = m->get_pg(); - int64_t pool = _pgid.pool(); if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0 && osdmap->have_pg_pool(pool)) _pgid = osdmap->raw_pg_to_pg(_pgid); @@ -8744,3 +8757,17 @@ void OSD::PeeringWQ::_dequeue(list *out) { } in_use.insert(got.begin(), got.end()); } + +void OSD::set_pool_last_map_marked_full(OSDMap *o, epoch_t &e) +{ + map &pool_last_map_marked_full = superblock.pool_last_map_marked_full; + for (map::const_iterator it = o->get_pools().begin(); + it != o->get_pools().end(); it++) { + bool exist = pool_last_map_marked_full.count(it->first); + if (it->second.has_flag(pg_pool_t::FLAG_FULL) && !exist) + pool_last_map_marked_full[it->first] = e; + if (it->second.has_flag(pg_pool_t::FLAG_FULL) && + (exist && pool_last_map_marked_full.count(it->first) < e)) + pool_last_map_marked_full[it->first] = e; + } +} diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 85452057ca380..82fa3f881dc36 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1775,6 +1775,7 @@ class OSD : public Dispatcher, void handle_osd_map(class MOSDMap *m); void note_down_osd(int osd); void note_up_osd(int osd); + void set_pool_last_map_marked_full(OSDMap *o, epoch_t &e); bool advance_pg( epoch_t advance_to, PG *pg, diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index eeaab4db0ebb9..23885931bffa8 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -3882,7 +3882,7 @@ ostream& operator<<(ostream& out, const osd_peer_stat_t &stat) void OSDSuperblock::encode(bufferlist &bl) const { - ENCODE_START(6, 5, bl); + ENCODE_START(7, 5, bl); ::encode(cluster_fsid, bl); ::encode(whoami, bl); ::encode(current_epoch, bl); @@ -3894,6 +3894,7 @@ void OSDSuperblock::encode(bufferlist &bl) const ::encode(mounted, bl); ::encode(osd_fsid, bl); ::encode(last_map_marked_full, bl); + 
::encode(pool_last_map_marked_full, bl); ENCODE_FINISH(bl); } @@ -3921,6 +3922,8 @@ void OSDSuperblock::decode(bufferlist::iterator &bl) ::decode(osd_fsid, bl); if (struct_v >= 6) ::decode(last_map_marked_full, bl); + if (struct_v >= 7) + ::decode(pool_last_map_marked_full, bl); DECODE_FINISH(bl); } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index c477f1d404c22..6b4fbe3329e02 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -2684,6 +2684,7 @@ class OSDSuperblock { epoch_t mounted; // last epoch i mounted epoch_t clean_thru; // epoch i was active and clean thru epoch_t last_map_marked_full; // last epoch osdmap was marked full + map pool_last_map_marked_full; // last epoch pool was marked full OSDSuperblock() : whoami(-1), From dbcf2e40d3e8a92f280879c74b7ca954a902b2d1 Mon Sep 17 00:00:00 2001 From: xinxin shu Date: Thu, 18 Jun 2015 09:44:47 +0800 Subject: [PATCH 005/654] Fixes : #12018 resend writes after pool loses full flag Signed-off-by: xinxin shu --- src/osdc/Objecter.cc | 77 ++++++++++++++++++++++++++++++++++---------- src/osdc/Objecter.h | 6 +++- 2 files changed, 65 insertions(+), 18 deletions(-) diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index f82f6c7d064ae..d20c4fb8ff5f5 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -884,7 +884,8 @@ bool Objecter::ms_dispatch(Message *m) void Objecter::_scan_requests(OSDSession *s, bool force_resend, - bool force_resend_writes, + bool cluster_full, + map *pool_full_map, map& need_resend, list& need_resend_linger, map& need_resend_command) @@ -904,8 +905,10 @@ void Objecter::_scan_requests(OSDSession *s, assert(op->session == s); ++lp; // check_linger_pool_dne() may touch linger_ops; prevent iterator invalidation ldout(cct, 10) << " checking linger op " << op->linger_id << dendl; - bool unregister; + bool unregister, force_resend_writes = cluster_full; int r = _recalc_linger_op_target(op, lc); + if (pool_full_map) + force_resend_writes = force_resend_writes || 
(*pool_full_map)[op->target.base_oloc.pool]; switch (r) { case RECALC_OP_TARGET_NO_ACTION: if (!force_resend && !force_resend_writes) @@ -933,6 +936,9 @@ void Objecter::_scan_requests(OSDSession *s, Op *op = p->second; ++p; // check_op_pool_dne() may touch ops; prevent iterator invalidation ldout(cct, 10) << " checking op " << op->tid << dendl; + bool force_resend_writes = cluster_full; + if (pool_full_map) + force_resend_writes = force_resend_writes || (*pool_full_map)[op->target.base_oloc.pool]; int r = _calc_target(&op->target, &op->last_force_resend); switch (r) { case RECALC_OP_TARGET_NO_ACTION: @@ -959,6 +965,9 @@ void Objecter::_scan_requests(OSDSession *s, CommandOp *c = cp->second; ++cp; ldout(cct, 10) << " checking command " << c->tid << dendl; + bool force_resend_writes = cluster_full; + if (pool_full_map) + force_resend_writes = force_resend_writes || (*pool_full_map)[c->target_pg.pool()]; int r = _calc_command_target(c); switch (r) { case RECALC_OP_TARGET_NO_ACTION: @@ -1006,9 +1015,14 @@ void Objecter::handle_osd_map(MOSDMap *m) } bool was_pauserd = osdmap->test_flag(CEPH_OSDMAP_PAUSERD); - bool was_full = _osdmap_full_flag(); - bool was_pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || was_full; + bool cluster_full = _osdmap_full_flag(); + bool was_pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || cluster_full || _osdmap_has_pool_full(); + map pool_full_map; + for (map::const_iterator it = osdmap->get_pools().begin(); + it != osdmap->get_pools().end(); it++) + pool_full_map[it->first] = it->second.has_flag(pg_pool_t::FLAG_FULL); + list need_resend_linger; map need_resend; map need_resend_command; @@ -1059,18 +1073,19 @@ void Objecter::handle_osd_map(MOSDMap *m) } logger->set(l_osdc_map_epoch, osdmap->get_epoch()); - was_full = was_full || _osdmap_full_flag(); - _scan_requests(homeless_session, skipped_map, was_full, - need_resend, need_resend_linger, - need_resend_command); + cluster_full = cluster_full || _osdmap_full_flag(); + 
update_pool_full_map(pool_full_map); + _scan_requests(homeless_session, skipped_map, cluster_full, + &pool_full_map, need_resend, + need_resend_linger, need_resend_command); // osd addr changes? for (map::iterator p = osd_sessions.begin(); p != osd_sessions.end(); ) { OSDSession *s = p->second; - _scan_requests(s, skipped_map, was_full, - need_resend, need_resend_linger, - need_resend_command); + _scan_requests(s, skipped_map, cluster_full, + &pool_full_map, need_resend, + need_resend_linger, need_resend_command); ++p; if (!osdmap->is_up(s->osd) || (s->con && @@ -1088,14 +1103,14 @@ void Objecter::handle_osd_map(MOSDMap *m) for (map::iterator p = osd_sessions.begin(); p != osd_sessions.end(); ++p) { OSDSession *s = p->second; - _scan_requests(s, false, false, need_resend, need_resend_linger, - need_resend_command); + _scan_requests(s, false, false, NULL, need_resend, + need_resend_linger, need_resend_command); } ldout(cct, 3) << "handle_osd_map decoding full epoch " << m->get_last() << dendl; osdmap->decode(m->maps[m->get_last()]); - _scan_requests(homeless_session, false, false, + _scan_requests(homeless_session, false, false, NULL, need_resend, need_resend_linger, need_resend_command); } else { @@ -1108,7 +1123,7 @@ void Objecter::handle_osd_map(MOSDMap *m) } bool pauserd = osdmap->test_flag(CEPH_OSDMAP_PAUSERD); - bool pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || _osdmap_full_flag(); + bool pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || _osdmap_full_flag() || _osdmap_has_pool_full(); // was/is paused? 
if (was_pauserd || was_pausewr || pauserd || pausewr || osdmap->get_epoch() < epoch_barrier) { @@ -2162,7 +2177,8 @@ ceph_tid_t Objecter::_op_submit(Op *op, RWLock::Context& lc) ldout(cct, 10) << " paused read " << op << " tid " << last_tid.read() << dendl; op->target.paused = true; _maybe_request_map(); - } else if ((op->target.flags & CEPH_OSD_FLAG_WRITE) && _osdmap_full_flag()) { + } else if ((op->target.flags & CEPH_OSD_FLAG_WRITE) && + (_osdmap_full_flag() || _osdmap_pool_full(op->target.base_oloc.pool))) { ldout(cct, 0) << " FULL, paused modify " << op << " tid " << last_tid.read() << dendl; op->target.paused = true; _maybe_request_map(); @@ -2353,8 +2369,9 @@ bool Objecter::is_pg_changed( bool Objecter::target_should_be_paused(op_target_t *t) { + const pg_pool_t *pi = osdmap->get_pg_pool(t->base_oloc.pool); bool pauserd = osdmap->test_flag(CEPH_OSDMAP_PAUSERD); - bool pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || _osdmap_full_flag(); + bool pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || _osdmap_full_flag() || pi->has_flag(pg_pool_t::FLAG_FULL); return (t->flags & CEPH_OSD_FLAG_READ && pauserd) || (t->flags & CEPH_OSD_FLAG_WRITE && pausewr) || @@ -2379,6 +2396,11 @@ bool Objecter::osdmap_pool_full(const int64_t pool_id) const return true; } + return _osdmap_pool_full(pool_id); +} + +bool Objecter::_osdmap_pool_full(const int64_t pool_id) const +{ const pg_pool_t *pool = osdmap->get_pg_pool(pool_id); if (pool == NULL) { ldout(cct, 4) << __func__ << ": DNE pool " << pool_id << dendl; @@ -2388,6 +2410,16 @@ bool Objecter::osdmap_pool_full(const int64_t pool_id) const return pool->has_flag(pg_pool_t::FLAG_FULL); } +bool Objecter::_osdmap_has_pool_full() const +{ + for (map::const_iterator it = osdmap->get_pools().begin(); + it != osdmap->get_pools().end(); it++) { + if (it->second.has_flag(pg_pool_t::FLAG_FULL)) + return true; + } + return false; +} + /** * Wrapper around osdmap->test_flag for special handling of the FULL flag. 
*/ @@ -2397,6 +2429,17 @@ bool Objecter::_osdmap_full_flag() const return osdmap->test_flag(CEPH_OSDMAP_FULL) && honor_osdmap_full; } +void Objecter::update_pool_full_map(map& pool_full_map) +{ + for (map::const_iterator it = osdmap->get_pools().begin(); + it != osdmap->get_pools().end(); it++) { + if (pool_full_map.find(it->first) == pool_full_map.end()) { + pool_full_map[it->first] = it->second.has_flag(pg_pool_t::FLAG_FULL); + } else { + pool_full_map[it->first] = it->second.has_flag(pg_pool_t::FLAG_FULL) || pool_full_map[it->first]; + } + } +} int64_t Objecter::get_object_hash_position(int64_t pool, const string& key, const string& ns) diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index 76cadf6827444..2c98b6b23cc3a 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -1703,6 +1703,8 @@ class Objecter : public md_config_obs_t, public Dispatcher { * the global full flag is set, else false */ bool osdmap_pool_full(const int64_t pool_id) const; + bool _osdmap_pool_full(const int64_t pool_id) const; + void update_pool_full_map(map& pool_full_map); private: map linger_ops; @@ -1749,6 +1751,7 @@ class Objecter : public md_config_obs_t, public Dispatcher { RECALC_OP_TARGET_OSD_DOWN, }; bool _osdmap_full_flag() const; + bool _osdmap_has_pool_full() const; bool target_should_be_paused(op_target_t *op); int _calc_target(op_target_t *t, epoch_t *last_force_resend=0, bool any_change=false); @@ -1912,7 +1915,8 @@ class Objecter : public md_config_obs_t, public Dispatcher { void _scan_requests(OSDSession *s, bool force_resend, - bool force_resend_writes, + bool cluster_full, + map *pool_full_map, map& need_resend, list& need_resend_linger, map& need_resend_command); From 16ead95daa3d1309e8e76e57416b4201e71d0449 Mon Sep 17 00:00:00 2001 From: xinxin shu Date: Tue, 7 Jul 2015 05:06:27 +0800 Subject: [PATCH 006/654] qa: update pool quota test for internal retries Signed-off-by: xinxin shu --- qa/workunits/rados/test_pool_quota.sh | 38 
+++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/qa/workunits/rados/test_pool_quota.sh b/qa/workunits/rados/test_pool_quota.sh index 146b677a05fcd..71a9e52fe0cfc 100755 --- a/qa/workunits/rados/test_pool_quota.sh +++ b/qa/workunits/rados/test_pool_quota.sh @@ -12,27 +12,55 @@ done sleep 30 -rados -p $p put onemore /etc/passwd && exit 1 || true +rados -p $p put onemore /etc/passwd & +pid=$! ceph osd pool set-quota $p max_objects 100 -sleep 30 +wait $pid +[ $? -ne 0 ] && exit 1 || true -rados -p $p put onemore /etc/passwd +rados -p $p put twomore /etc/passwd # bytes ceph osd pool set-quota $p max_bytes 100 sleep 30 -rados -p $p put two /etc/passwd && exit 1 || true +rados -p $p put two /etc/passwd & +pid=$! ceph osd pool set-quota $p max_bytes 0 ceph osd pool set-quota $p max_objects 0 -sleep 30 +wait $pid +[ $? -ne 0 ] && exit 1 || true rados -p $p put three /etc/passwd + +#one pool being full does not block a different pool + +pp=`uuidgen` + +ceph osd pool create $pp 12 + +# set objects quota +ceph osd pool set-quota $pp max_objects 10 +sleep 30 + +for f in `seq 1 10` ; do + rados -p $pp put obj$f /etc/passwd +done + +sleep 30 + +rados -p $p put threemore /etc/passwd + +ceph osd pool set-quota $p max_bytes 0 +ceph osd pool set-quota $p max_objects 0 + +sleep 30 # done ceph osd pool delete $p $p --yes-i-really-really-mean-it +ceph osd pool delete $pp $pp --yes-i-really-really-mean-it echo OK From 6e0498da19406acc96eca39d86991e1e559908d2 Mon Sep 17 00:00:00 2001 From: xinxin shu Date: Mon, 8 Jun 2015 08:30:08 +0800 Subject: [PATCH 007/654] MonitorDBStore : make monitor transaction more readable on dump Signed-off-by: xinxin shu --- src/common/config_opts.h | 1 + src/mon/MonitorDBStore.h | 55 +++++++++++++++++++++++++++++----------- 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index d762a2c9de366..a89b4411820f7 100644 --- a/src/common/config_opts.h +++ 
b/src/common/config_opts.h @@ -258,6 +258,7 @@ OPTION(mon_mds_force_trim_to, OPT_INT, 0) // force mon to trim mdsmaps to this // dump transactions OPTION(mon_debug_dump_transactions, OPT_BOOL, false) +OPTION(mon_debug_dump_json, OPT_BOOL, false) OPTION(mon_debug_dump_location, OPT_STR, "/var/log/ceph/$cluster-$name.tdump") OPTION(mon_inject_transaction_delay_max, OPT_DOUBLE, 10.0) // seconds OPTION(mon_inject_transaction_delay_probability, OPT_DOUBLE, 0) // range [0, 1] diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h index d00d3607960b0..81bb556609466 100644 --- a/src/mon/MonitorDBStore.h +++ b/src/mon/MonitorDBStore.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "os/KeyValueDB.h" #include "include/assert.h" @@ -31,7 +32,10 @@ class MonitorDBStore { boost::scoped_ptr db; bool do_dump; - int dump_fd; + int dump_fd_binary; + std::ofstream dump_fd_json; + JSONFormatter dump_fmt; + Finisher io_work; @@ -255,9 +259,15 @@ class MonitorDBStore KeyValueDB::Transaction dbt = db->get_transaction(); if (do_dump) { - bufferlist bl; - t->encode(bl); - bl.write_fd(dump_fd); + if (!g_conf->mon_debug_dump_json) { + bufferlist bl; + t->encode(bl); + bl.write_fd(dump_fd_binary); + } else { + t->dump(&dump_fmt, true); + dump_fmt.flush(dump_fd_json); + dump_fd_json.flush(); + } } list > > compact; @@ -614,7 +624,8 @@ class MonitorDBStore MonitorDBStore(const string& path) : db(0), do_dump(false), - dump_fd(-1), + dump_fd_binary(-1), + dump_fmt(true), io_work(g_ceph_context, "monstore"), is_open(false) { string::const_reverse_iterator rit; @@ -639,21 +650,35 @@ class MonitorDBStore db.reset(db_ptr); if (g_conf->mon_debug_dump_transactions) { - do_dump = true; - dump_fd = ::open( - g_conf->mon_debug_dump_location.c_str(), - O_CREAT|O_APPEND|O_WRONLY, 0644); - if (!dump_fd) { - dump_fd = -errno; - derr << "Could not open log file, got " - << cpp_strerror(dump_fd) << dendl; + if (!g_conf->mon_debug_dump_json) { + dump_fd_binary = ::open( + 
g_conf->mon_debug_dump_location.c_str(), + O_CREAT|O_APPEND|O_WRONLY, 0644); + if (!dump_fd_binary) { + dump_fd_binary = -errno; + derr << "Could not open log file, got " + << cpp_strerror(dump_fd_binary) << dendl; + } + } else { + dump_fmt.reset(); + dump_fmt.open_array_section("dump"); + dump_fd_json.open(g_conf->mon_debug_dump_location.c_str()); } + do_dump = true; } } ~MonitorDBStore() { assert(!is_open); - if (do_dump) - ::close(dump_fd); + if (do_dump) { + if (!g_conf->mon_debug_dump_json) { + ::close(dump_fd_binary); + } else { + dump_fmt.close_section(); + dump_fmt.flush(dump_fd_json); + dump_fd_json.flush(); + dump_fd_json.close(); + } + } } }; From 1b2e70fa6b08c40f5530449141030d9452daa7e7 Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Fri, 17 Jul 2015 15:57:04 +0800 Subject: [PATCH 008/654] pybind/ceph_argparse: do not choke on non-ascii prefix * add a test for it * add the comments for utf-8 encoding, which is needed by python module loader. because the new test has a non-ascii string in it. it's the Chinese translation of "octopus and squid". Fixes: #12287 Signed-off-by: Kefu Chai --- src/pybind/ceph_argparse.py | 7 +++++++ src/test/pybind/test_ceph_argparse.py | 13 +++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/pybind/ceph_argparse.py b/src/pybind/ceph_argparse.py index 762ba604e88a7..9a830575d4485 100644 --- a/src/pybind/ceph_argparse.py +++ b/src/pybind/ceph_argparse.py @@ -513,6 +513,13 @@ def __init__(self, prefix=''): self.prefix = prefix def valid(self, s, partial=False): + try: + # `prefix` can always be converted into unicode when being compared, + # but `s` could be anything passed by user. 
+ s = unicode(s) + except UnicodeDecodeError: + raise ArgumentPrefix("no match for {0}".format(s)) + if partial: if self.prefix.startswith(s): self.val = s diff --git a/src/test/pybind/test_ceph_argparse.py b/src/test/pybind/test_ceph_argparse.py index 6bd2b08352636..62a9a2497b2c7 100755 --- a/src/test/pybind/test_ceph_argparse.py +++ b/src/test/pybind/test_ceph_argparse.py @@ -1,6 +1,6 @@ #!/usr/bin/nosetests --nocapture -# -*- mode:python; tab-width:4; indent-tabs-mode:t -*- -# vim: ts=4 sw=4 smarttab expandtab +# -*- mode:python; tab-width:4; indent-tabs-mode:t; coding:utf-8 -*- +# vim: ts=4 sw=4 smarttab expandtab fileencoding=utf-8 # # Ceph - scalable distributed file system # @@ -86,6 +86,15 @@ def check_no_arg(self, prefix, command): 'toomany'])) +class TestBasic: + + def test_non_ascii_in_non_options(self): + # unicode() is not able to convert this str parameter into unicode + # using the default encoding 'ascii'. and validate_command() should + # not choke on it. + assert_is_none(validate_command(sigdict, ['章鱼和鱿鱼'])) + + class TestPG(TestArgparse): def test_stat(self): From f9dd1ecd9392f5f31f12736d368393ce2feee58f Mon Sep 17 00:00:00 2001 From: huangjun Date: Tue, 21 Jul 2015 14:42:33 +0800 Subject: [PATCH 009/654] mon: added const to dump_* functions in PGMonitor Signed-off-by: huangjun --- src/mon/PGMonitor.cc | 8 ++++---- src/mon/PGMonitor.h | 8 ++++---- src/mon/PaxosService.h | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index a3eb9340df8e1..3e2e0cd39f2ab 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -1258,7 +1258,7 @@ inline string percentify(const float& a) { //void PGMonitor::dump_object_stat_sum(stringstream& ss, Formatter *f, void PGMonitor::dump_object_stat_sum(TextTable &tbl, Formatter *f, object_stat_sum_t &sum, uint64_t avail, - bool verbose) + bool verbose) const { if (f) { f->dump_int("kb_used", SHIFT_ROUND_UP(sum.num_bytes, 10)); @@ -1289,7 +1289,7 @@ 
void PGMonitor::dump_object_stat_sum(TextTable &tbl, Formatter *f, } } -int64_t PGMonitor::get_rule_avail(OSDMap& osdmap, int ruleno) +int64_t PGMonitor::get_rule_avail(OSDMap& osdmap, int ruleno) const { map wm; int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm); @@ -1412,7 +1412,7 @@ void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose) } } -void PGMonitor::dump_fs_stats(stringstream &ss, Formatter *f, bool verbose) +void PGMonitor::dump_fs_stats(stringstream &ss, Formatter *f, bool verbose) const { if (f) { f->open_object_section("stats"); @@ -1452,7 +1452,7 @@ void PGMonitor::dump_fs_stats(stringstream &ss, Formatter *f, bool verbose) } -void PGMonitor::dump_info(Formatter *f) +void PGMonitor::dump_info(Formatter *f) const { f->open_object_section("pgmap"); pg_map.dump(f); diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h index 7e1602522ee5e..cb725a67a03c5 100644 --- a/src/mon/PGMonitor.h +++ b/src/mon/PGMonitor.h @@ -148,9 +148,9 @@ class PGMonitor : public PaxosService { void dump_object_stat_sum(TextTable &tbl, Formatter *f, object_stat_sum_t &sum, uint64_t avail, - bool verbose); + bool verbose) const; - int64_t get_rule_avail(OSDMap& osdmap, int ruleno); + int64_t get_rule_avail(OSDMap& osdmap, int ruleno) const; public: PGMonitor(Monitor *mn, Paxos *p, const string& service_name) @@ -190,9 +190,9 @@ class PGMonitor : public PaxosService { void check_osd_map(epoch_t epoch); void dump_pool_stats(stringstream &ss, Formatter *f, bool verbose); - void dump_fs_stats(stringstream &ss, Formatter *f, bool verbose); + void dump_fs_stats(stringstream &ss, Formatter *f, bool verbose) const; - void dump_info(Formatter *f); + void dump_info(Formatter *f) const; int _warn_slow_request_histogram(const pow2_hist_t& h, string suffix, list >& summary, diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h index c7f6cf919f932..acfea20fb6c5b 100644 --- a/src/mon/PaxosService.h +++ b/src/mon/PaxosService.h @@ -869,7 +869,7 @@ class 
PaxosService { * * @returns Our first committed version (that is available) */ - version_t get_first_committed() { + version_t get_first_committed() const{ return cached_first_committed; } /** @@ -877,7 +877,7 @@ class PaxosService { * * @returns Our last committed version */ - version_t get_last_committed() { + version_t get_last_committed() const{ return cached_last_committed; } From a140085f467889f2743294a3c150f13b62fcdf51 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 23 Jul 2015 16:36:19 -0700 Subject: [PATCH 010/654] osd: Keep a reference count on Connection while calling send_message() Fixes: #12437 Signed-off-by: David Zafman --- src/osd/OSD.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 7c2d8501a969b..94d9295996d56 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -696,8 +696,8 @@ void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epo return; } const entity_inst_t& peer_inst = next_map->get_cluster_inst(peer); - Connection *peer_con = osd->cluster_messenger->get_connection(peer_inst).get(); - share_map_peer(peer, peer_con, next_map); + ConnectionRef peer_con = osd->cluster_messenger->get_connection(peer_inst); + share_map_peer(peer, peer_con.get(), next_map); peer_con->send_message(m); release_map(next_map); } From 8652a37039d60ac28d5809ff7fb44778ff0e5850 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Sat, 1 Aug 2015 12:14:41 +0800 Subject: [PATCH 011/654] osd: avoid unnecessary calculation in agent_choose_mode() Signed-off-by: Li Wang Reviewed-by: Yunchuan Wen --- src/osd/ReplicatedPG.cc | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 4b8c2eb0316c8..07168b390d08b 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -10997,6 +10997,17 @@ bool ReplicatedPG::agent_choose_mode(bool restart, OpRequestRef op) return requeued; } + 
TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE; + TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE; + unsigned evict_effort = 0; + + if (info.stats.stats_invalid) { + // idle; stats can't be trusted until we scrub. + dout(20) << __func__ << " stats invalid (post-split), idle" << dendl; + goto skip_calc; + } + + { uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid); assert(divisor > 0); @@ -11075,7 +11086,6 @@ bool ReplicatedPG::agent_choose_mode(bool restart, OpRequestRef op) << dendl; // flush mode - TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE; uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro; uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro; uint64_t flush_slop = (float)flush_target * g_conf->osd_agent_slop; @@ -11087,18 +11097,13 @@ bool ReplicatedPG::agent_choose_mode(bool restart, OpRequestRef op) flush_high_target -= MIN(flush_high_target, flush_slop); } - if (info.stats.stats_invalid) { - // idle; stats can't be trusted until we scrub. - dout(20) << __func__ << " stats invalid (post-split), idle" << dendl; - } else if (dirty_micro > flush_high_target) { + if (dirty_micro > flush_high_target) { flush_mode = TierAgentState::FLUSH_MODE_HIGH; } else if (dirty_micro > flush_target) { flush_mode = TierAgentState::FLUSH_MODE_LOW; } // evict mode - TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE; - unsigned evict_effort = 0; uint64_t evict_target = pool.info.cache_target_full_ratio_micro; uint64_t evict_slop = (float)evict_target * g_conf->osd_agent_slop; if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) @@ -11106,9 +11111,7 @@ bool ReplicatedPG::agent_choose_mode(bool restart, OpRequestRef op) else evict_target -= MIN(evict_target, evict_slop); - if (info.stats.stats_invalid) { - // idle; stats can't be trusted until we scrub. 
- } else if (full_micro > 1000000) { + if (full_micro > 1000000) { // evict anything clean evict_mode = TierAgentState::EVICT_MODE_FULL; evict_effort = 1000000; @@ -11130,7 +11133,9 @@ bool ReplicatedPG::agent_choose_mode(bool restart, OpRequestRef op) assert(evict_effort >= inc && evict_effort <= 1000000); dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl; } + } + skip_calc: bool old_idle = agent_state->is_idle(); if (flush_mode != agent_state->flush_mode) { dout(5) << __func__ << " flush_mode " From 199352df103daf081c48d80daa0536452d2884d2 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Mon, 3 Aug 2015 15:35:03 +0800 Subject: [PATCH 012/654] osd/recover_primary: remove the unfound check when recovering an object The function recovering_missing checks if the object is unfound, there is no need to double check it in recover_primary. Signed-off-by: Zhiqiang Wang --- src/osd/ReplicatedPG.cc | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 4b8c2eb0316c8..b1030492b3451 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -9303,11 +9303,8 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle) eversion_t need = item.need; - bool unfound = missing_loc.is_unfound(soid); - dout(10) << "recover_primary " << soid << " " << item.need - << (unfound ? " (unfound)":"") << (missing.is_missing(soid) ? " (missing)":"") << (missing.is_missing(head) ? " (missing head)":"") << (recovering.count(soid) ? 
" (recovering)":"") @@ -9383,7 +9380,6 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle) dout(10) << " will pull " << alternate_need << " or " << need << " from one of " << missing_loc.get_locations(soid) << dendl; - unfound = false; } } break; @@ -9393,8 +9389,6 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle) if (!recovering.count(soid)) { if (recovering.count(head)) { ++skipped; - } else if (unfound) { - ++skipped; } else { int r = recover_missing( soid, need, cct->_conf->osd_recovery_op_priority, h); From 6b17e210f5b9598816500fd40c0c3c48ee7e7470 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Mon, 3 Aug 2015 15:41:35 +0800 Subject: [PATCH 013/654] osd/recover_backfill: assert(obc) when adding pg stat for backfill objects The obc should always be on the primary. Signed-off-by: Zhiqiang Wang --- src/osd/ReplicatedPG.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index b1030492b3451..489618bbef541 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -9854,6 +9854,7 @@ int ReplicatedPG::recover_backfill( i != add_to_stat.end(); ++i) { ObjectContextRef obc = get_object_context(*i, false); + assert(obc); pg_stat_t stat; add_object_context_to_pg_stat(obc, &stat); pending_backfill_updates[*i] = stat; From f68553e7e06b3f06757b88b7fe9bdc1c1cadc75e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Da=C5=82ek?= Date: Thu, 30 Jul 2015 11:36:43 +0200 Subject: [PATCH 014/654] osd/osd_types.cc: get rid of str concat when making hash key MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When namespaces are used, during hash key calculation there's temporary string involved which is generated by concatenating string with a char and another string. Get rid of that and of allocation of string, and keep all on stack. Improves object access performance when namespaces are used. 
Signed-off-by: Piotr Dałek --- src/osd/osd_types.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index dbcbd3dab357a..1b5285faf6f15 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -1084,17 +1084,17 @@ SnapContext pg_pool_t::get_snap_context() const return SnapContext(get_snap_seq(), s); } -static string make_hash_str(const string &inkey, const string &nspace) -{ - if (nspace.empty()) - return inkey; - return nspace + '\037' + inkey; -} - uint32_t pg_pool_t::hash_key(const string& key, const string& ns) const { - string n = make_hash_str(key, ns); - return ceph_str_hash(object_hash, n.c_str(), n.length()); + if (ns.empty()) + return ceph_str_hash(object_hash, key.data(), key.length()); + int nsl = ns.length(); + int len = key.length() + nsl + 1; + char buf[len]; + memcpy(&buf[0], ns.data(), nsl); + buf[nsl] = '\037'; + memcpy(&buf[nsl+1], key.data(), key.length()); + return ceph_str_hash(object_hash, &buf[0], len); } uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v) const From 10a336fe41d8835df6a27c97f4067a13c1311efe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Da=C5=82ek?= Date: Wed, 1 Jul 2015 11:40:10 +0200 Subject: [PATCH 015/654] ObjectStore: partially remove op_ptr.zero() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Partially remove zeroing of op_ptr buffer, it's not necessary because it is overwritten later anyway, and bufferptr::zero() adds a bit of latency to the pipeline (not only memset, but also mutex (un)locking and CRC invalidation). 
Signed-off-by: Piotr Dałek --- src/os/ObjectStore.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h index 3a14a60f0588b..663448b2fcd7a 100644 --- a/src/os/ObjectStore.h +++ b/src/os/ObjectStore.h @@ -872,7 +872,6 @@ class ObjectStore { Op* _get_next_op() { if (op_ptr.length() == 0 || op_ptr.offset() >= op_ptr.length()) { op_ptr = bufferptr(sizeof(Op) * OPS_PER_PTR); - op_ptr.zero(); } bufferptr ptr(op_ptr, 0, sizeof(Op)); op_bl.append(ptr); @@ -880,6 +879,7 @@ class ObjectStore { op_ptr.set_offset(op_ptr.offset() + sizeof(Op)); char* p = ptr.c_str(); + memset(p, 0, sizeof(Op)); return reinterpret_cast(p); } __le32 _get_coll_id(const coll_t& coll) { From 35e45694463ea4451c2dfad870094508df332805 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Da=C5=82ek?= Date: Wed, 1 Jul 2015 14:18:30 +0200 Subject: [PATCH 016/654] FileJournal: reduce time wasted by bufferptr::zero MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bufferptr::zero(), apart from actual memset() call, also invalidates its internal CRC cache, which is empty at the time, so no point to lock its mutex, clear CRC cache, unlock mutex and memset() entire bufferptr. Also, not entire bufferptr needs to be zeroed, so clear out just unused parts of it. 
Signed-off-by: Piotr Dałek --- src/os/FileJournal.cc | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc index cf26e5d4f8e71..b4162f85dd26e 100644 --- a/src/os/FileJournal.cc +++ b/src/os/FileJournal.cc @@ -746,8 +746,8 @@ int FileJournal::read_header(header_t *hdr) const bufferlist bl; buffer::ptr bp = buffer::create_page_aligned(block_size); - bp.zero(); - int r = ::pread(fd, bp.c_str(), bp.length(), 0); + char* bpdata = bp.c_str(); + int r = ::pread(fd, bpdata, bp.length(), 0); if (r < 0) { int err = errno; @@ -755,6 +755,14 @@ int FileJournal::read_header(header_t *hdr) const return -err; } + // don't use bp.zero() here, because it also invalidates + // crc cache (which is not yet populated anyway) + if (bp.length() != (size_t)r) { + // r will be always less or equal than bp.length + bpdata += r; + memset(bpdata, 0, bp.length() - r); + } + bl.push_back(bp); try { @@ -793,8 +801,12 @@ bufferptr FileJournal::prepare_header() } ::encode(header, bl); bufferptr bp = buffer::create_page_aligned(get_top()); - bp.zero(); - memcpy(bp.c_str(), bl.c_str(), bl.length()); + // don't use bp.zero() here, because it also invalidates + // crc cache (which is not yet populated anyway) + char* data = bp.c_str(); + memcpy(data, bl.c_str(), bl.length()); + data += bl.length(); + memset(data, 0, bp.length()-bl.length()); return bp; } From caf98c83d2c42dce5c38c1c984076830df970c75 Mon Sep 17 00:00:00 2001 From: Yunchuan Wen Date: Tue, 4 Aug 2015 09:10:01 +0000 Subject: [PATCH 017/654] add agent_start/finish_evict_op to control agent_ops Signed-off-by: Yunchuan Wen Reviewed-by: Li Wang Reviewed-by: Mingxin Liu --- src/osd/OSD.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/osd/OSD.h b/src/osd/OSD.h index a5d429fabb44c..3cb62c3481192 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -708,6 +708,20 @@ class OSDService { _dequeue(pg, old_priority); } + /// note start of an async 
(evict) op + void agent_start_evict_op() { + Mutex::Locker l(agent_lock); + ++agent_ops; + } + + /// note finish or cancellation of an async (evict) op + void agent_finish_evict_op() { + Mutex::Locker l(agent_lock); + assert(agent_ops > 0); + --agent_ops; + agent_cond.Signal(); + } + /// note start of an async (flush) op void agent_start_op(const hobject_t& oid) { Mutex::Locker l(agent_lock); From 09839d00c0e1c5bc0e9ee2e1a1d24cb2cf9a1118 Mon Sep 17 00:00:00 2001 From: Yunchuan Wen Date: Tue, 4 Aug 2015 08:47:05 +0000 Subject: [PATCH 018/654] Allow evict operations to be throttled In agent_work(), it did not touch agent_ops when it started an evict operation, so evict operations were not throttled. This patch fixes it. Signed-off-by: Yunchuan Wen Reviewed-by: Li Wang Reviewed-by: Mingxin Liu --- src/osd/ReplicatedPG.cc | 12 ++++++++++++ src/osd/ReplicatedPG.h | 1 + 2 files changed, 13 insertions(+) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 4b8c2eb0316c8..9dbb30c358292 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -10855,6 +10855,16 @@ bool ReplicatedPG::agent_maybe_flush(ObjectContextRef& obc) return true; } +struct C_AgentEvictStartStop : public Context { + ReplicatedPGRef pg; + C_AgentEvictStartStop(ReplicatedPG *p) : pg(p) { + pg->osd->agent_start_evict_op(); + } + void finish(int r) { + pg->osd->agent_finish_evict_op(); + } +}; + bool ReplicatedPG::agent_maybe_evict(ObjectContextRef& obc) { const hobject_t& soid = obc->obs.oi.soid; @@ -10943,6 +10953,8 @@ bool ReplicatedPG::agent_maybe_evict(ObjectContextRef& obc) dout(10) << __func__ << " evicting " << obc->obs.oi << dendl; RepGather *repop = simple_repop_create(obc); OpContext *ctx = repop->ctx; + Context *on_evict = new C_AgentEvictStartStop(this); + ctx->on_finish = on_evict; ctx->lock_to_release = OpContext::W_LOCK; ctx->at_version = get_next_version(); assert(ctx->new_obs.exists); diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 
9c280365da27f..690da03a0dc21 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -922,6 +922,7 @@ class ReplicatedPG : public PG, public PGBackend::Listener { boost::scoped_ptr agent_state; friend struct C_AgentFlushStartStop; + friend struct C_AgentEvictStartStop; friend struct C_HitSetFlushing; void agent_setup(); ///< initialize agent state From aebb9e79744f09a1735c4766b35aa530c7ddb730 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Tue, 4 Aug 2015 16:06:23 +0200 Subject: [PATCH 019/654] tools: ceph-release-notes unicode handling Prefix the print string with u so that it handles unicode gracefully. Add the cli prefix (for ceph.in related pull requests) Signed-off-by: Loic Dachary --- src/script/ceph-release-notes | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/script/ceph-release-notes b/src/script/ceph-release-notes index e59efd353cac5..007054d52f02c 100755 --- a/src/script/ceph-release-notes +++ b/src/script/ceph-release-notes @@ -64,7 +64,7 @@ def make_release_notes(gh, repo, ref, plaintext): title = pr['title'] - title_re = '^(common|mon|osd|fs|librbd|rbd|fs|mds|objecter|rgw|build/ops|tests|tools|doc|crush|librados):' + title_re = '^(cli|common|mon|osd|fs|librbd|rbd|fs|mds|objecter|rgw|build/ops|tests|tools|doc|crush|librados):' if not re.match(title_re, title): print ("ERROR: http://github.com/ceph/ceph/pull/" + str(number) + " title " + title + " does not match " + title_re) # Big assumption, do a sanity check in the end, we are @@ -80,9 +80,9 @@ def make_release_notes(gh, repo, ref, plaintext): print (">>>>>>> " + str(len(prs)) + " pr for issue " + issue) for (author, title, number) in prs: if plaintext: - print ("* {title} (#{issue}, {author})".format(title=title, issue=issue, author=author)) + print (u"* {title} (#{issue}, {author})".format(title=title, issue=issue, author=author)) else: - print ("* {title} (`issue#{issue} `_, `pr#{number} `_, {author})".format(title=title, issue=issue, author=author, 
number=number)) + print (u"* {title} (`issue#{issue} `_, `pr#{number} `_, {author})".format(title=title, issue=issue, author=author, number=number)) if __name__ == "__main__": From e1f58feb9b1d20b72f2eb2eefdea5982e0cddccd Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Thu, 13 Aug 2015 13:47:24 +0200 Subject: [PATCH 020/654] osd: trigger the cache agent after a promotion When a proxy read happens, the object promotion is done in parallel. The agent_choose_mode function must be called to reconsider the situation to protect against the following scenario: * proxy read * agent_choose_mode finds no object exists and the agent goes idle * object promotion happens * the agent does not reconsider and eviction does not happen although it should http://tracker.ceph.com/issues/12673 Fixes: #12673 Signed-off-by: Loic Dachary --- src/osd/ReplicatedPG.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 4f5382cffce8b..2bb268ba43ec5 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -6775,6 +6775,10 @@ void ReplicatedPG::finish_promote(int r, CopyResults *results, simple_repop_submit(repop); osd->logger->inc(l_osd_tier_promote); + + assert(agent_state); + if (agent_state->is_idle()) + agent_choose_mode(); } void ReplicatedPG::cancel_copy(CopyOpRef cop, bool requeue) From 7924231930732bd297d3bd034c8295e96cb81088 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Thu, 13 Aug 2015 19:41:47 +0200 Subject: [PATCH 021/654] tests: tiering agent and proxy read Verify that an object promoted to a cache tier because of a proxy read is evicted as expected. 
http://tracker.ceph.com/issues/12673 Refs: #12673 Signed-off-by: Loic Dachary --- qa/workunits/cephtool/test.sh | 44 +++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh index f84031aa44f07..01814ae4bec12 100755 --- a/qa/workunits/cephtool/test.sh +++ b/qa/workunits/cephtool/test.sh @@ -221,6 +221,49 @@ function test_mon_injectargs_SI() $SUDO ceph daemon mon.a config set mon_pg_warn_min_objects $initial_value } +function test_tiering_agent() +{ + local slow=slow_eviction + local fast=fast_eviction + ceph osd pool create $slow 1 1 + ceph osd pool create $fast 1 1 + ceph osd tier add $slow $fast + ceph osd tier cache-mode $fast writeback + ceph osd tier set-overlay $slow $fast + ceph osd pool set $fast hit_set_type bloom + rados -p $slow put obj1 /etc/group + ceph osd pool set $fast target_max_objects 1 + ceph osd pool set $fast hit_set_count 1 + ceph osd pool set $fast hit_set_period 5 + # wait for the object to be evicted from the cache + local evicted + evicted=false + for i in 1 2 4 8 16 32 64 128 256 ; do + if ! rados -p $fast ls | grep obj1 ; then + evicted=true + break + fi + sleep $i + done + $evicted # assert + # the object is proxy read and promoted to the cache + rados -p $slow get obj1 /tmp/obj1 + # wait for the promoted object to be evicted again + evicted=false + for i in 1 2 4 8 16 32 64 128 256 ; do + if ! 
rados -p $fast ls | grep obj1 ; then + evicted=true + break + fi + sleep $i + done + $evicted # assert + ceph osd tier remove-overlay $slow + ceph osd tier remove $slow $fast + ceph osd pool delete $fast $fast --yes-i-really-really-mean-it + ceph osd pool delete $slow $slow --yes-i-really-really-mean-it +} + function test_tiering() { # tiering @@ -1686,6 +1729,7 @@ MON_TESTS+=" mon_ping" MON_TESTS+=" mon_deprecated_commands" OSD_TESTS+=" osd_bench" +OSD_TESTS+=" tiering_agent" MDS_TESTS+=" mds_tell" MDS_TESTS+=" mon_mds" From 97c66e3f3f17eea0a40d20619d171c85aec3a9ed Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Fri, 7 Aug 2015 11:51:27 +0800 Subject: [PATCH 022/654] erasure-code: Update ISA-L to 2.14 Ceph requires all .s files to annotated to assert that stack should not be executable. see bug #10114, f60da6b and 06a245a. In ISA-L 2.14 this issue gets fixed. (see src/erasure-code/isa/isa-l/include/reg_sizes.asm:L102-L106) Signed-off-by: Yuan Zhou --- src/erasure-code/isa/CMakeLists.txt | 2 +- src/erasure-code/isa/Makefile.am | 2 +- .../isa-l/erasure_code/ec_multibinary.asm.s | 21 ++++----------- .../erasure_code/gf_2vect_dot_prod_avx.asm.s | 14 +++------- .../erasure_code/gf_2vect_dot_prod_avx2.asm.s | 14 +++------- .../erasure_code/gf_2vect_dot_prod_sse.asm.s | 14 +++------- .../isa-l/erasure_code/gf_2vect_mad_avx.asm.s | 14 +++------- .../erasure_code/gf_2vect_mad_avx2.asm.s | 14 +++------- .../isa-l/erasure_code/gf_2vect_mad_sse.asm.s | 14 +++------- .../erasure_code/gf_3vect_dot_prod_avx.asm.s | 14 +++------- .../erasure_code/gf_3vect_dot_prod_avx2.asm.s | 14 +++------- .../erasure_code/gf_3vect_dot_prod_sse.asm.s | 14 +++------- .../isa-l/erasure_code/gf_3vect_mad_avx.asm.s | 14 +++------- .../erasure_code/gf_3vect_mad_avx2.asm.s | 14 +++------- .../isa-l/erasure_code/gf_3vect_mad_sse.asm.s | 14 +++------- .../erasure_code/gf_4vect_dot_prod_avx.asm.s | 14 +++------- .../erasure_code/gf_4vect_dot_prod_avx2.asm.s | 14 +++------- 
.../erasure_code/gf_4vect_dot_prod_sse.asm.s | 14 +++------- .../isa-l/erasure_code/gf_4vect_mad_avx.asm.s | 15 +++-------- .../erasure_code/gf_4vect_mad_avx2.asm.s | 14 +++------- .../isa-l/erasure_code/gf_4vect_mad_sse.asm.s | 14 +++------- .../erasure_code/gf_5vect_dot_prod_avx.asm.s | 14 +++------- .../erasure_code/gf_5vect_dot_prod_avx2.asm.s | 14 +++------- .../erasure_code/gf_5vect_dot_prod_sse.asm.s | 14 +++------- .../isa-l/erasure_code/gf_5vect_mad_avx.asm.s | 14 +++------- .../erasure_code/gf_5vect_mad_avx2.asm.s | 14 +++------- .../isa-l/erasure_code/gf_5vect_mad_sse.asm.s | 14 +++------- .../erasure_code/gf_6vect_dot_prod_avx.asm.s | 14 +++------- .../erasure_code/gf_6vect_dot_prod_avx2.asm.s | 14 +++------- .../erasure_code/gf_6vect_dot_prod_sse.asm.s | 14 +++------- .../isa-l/erasure_code/gf_6vect_mad_avx.asm.s | 14 +++------- .../erasure_code/gf_6vect_mad_avx2.asm.s | 13 ++------- .../isa-l/erasure_code/gf_6vect_mad_sse.asm.s | 14 +++------- .../erasure_code/gf_vect_dot_prod_avx.asm.s | 14 +++------- .../erasure_code/gf_vect_dot_prod_avx2.asm.s | 14 +++------- .../erasure_code/gf_vect_dot_prod_sse.asm.s | 14 +++------- .../isa-l/erasure_code/gf_vect_mad_avx.asm.s | 14 +++------- .../isa-l/erasure_code/gf_vect_mad_avx2.asm.s | 14 +++------- .../isa-l/erasure_code/gf_vect_mad_sse.asm.s | 14 +++------- .../isa-l/erasure_code/gf_vect_mul_avx.asm.s | 14 +++------- .../isa-l/erasure_code/gf_vect_mul_sse.asm.s | 14 +++------- .../isa/isa-l/include/reg_sizes.asm | 27 +++++++++++++++++++ src/erasure-code/isa/isa-l/include/types.h | 16 ++++++++--- 43 files changed, 159 insertions(+), 441 deletions(-) diff --git a/src/erasure-code/isa/CMakeLists.txt b/src/erasure-code/isa/CMakeLists.txt index 48605c1d431b4..099cc727322a0 100644 --- a/src/erasure-code/isa/CMakeLists.txt +++ b/src/erasure-code/isa/CMakeLists.txt @@ -54,5 +54,5 @@ set(isa_srcs add_library(ec_isa SHARED ${isa_srcs}) add_dependencies(ec_isa ${CMAKE_SOURCE_DIR}/src/ceph_ver.h) 
target_link_libraries(ec_isa ${EXTRALIBS}) -set_target_properties(ec_isa PROPERTIES VERSION 2.10.0 SOVERSION 2) +set_target_properties(ec_isa PROPERTIES VERSION 2.14.0 SOVERSION 2) install(TARGETS ec_isa DESTINATION lib/erasure-code) diff --git a/src/erasure-code/isa/Makefile.am b/src/erasure-code/isa/Makefile.am index b36b8a6daf736..67725dd51572c 100644 --- a/src/erasure-code/isa/Makefile.am +++ b/src/erasure-code/isa/Makefile.am @@ -67,7 +67,7 @@ libec_isa_la_CXXFLAGS = ${AM_CXXFLAGS} -I $(srcdir)/erasure-code/isa/isa-l/inclu libec_isa_la_CCASFLAGS = ${AM_CCASFLAGS} -I $(abs_srcdir)/erasure-code/isa/isa-l/include/ libec_isa_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS) -libec_isa_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:13:0 +libec_isa_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:14:0 if LINUX libec_isa_la_LDFLAGS += -export-symbols-regex '.*__erasure_code_.*' endif diff --git a/src/erasure-code/isa/isa-l/erasure_code/ec_multibinary.asm.s b/src/erasure-code/isa/isa-l/erasure_code/ec_multibinary.asm.s index 62414a969c352..03f501a1bb9d2 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/ec_multibinary.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/ec_multibinary.asm.s @@ -387,20 +387,9 @@ _done_gf_vect_mad_init: pop arg1 ret -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro - ;;; func core, ver, snum -slversion ec_encode_data, 00, 03, 0133 -slversion gf_vect_mul, 00, 02, 0134 -slversion ec_encode_data_update, 00, 02, 0212 -slversion gf_vect_dot_prod, 00, 02, 0138 -slversion gf_vect_mad, 00, 01, 0213 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion ec_encode_data, 00, 04, 0133 +slversion gf_vect_mul, 00, 03, 0134 +slversion ec_encode_data_update, 00, 03, 0212 +slversion gf_vect_dot_prod, 00, 03, 0138 +slversion gf_vect_mad, 00, 02, 0213 diff --git 
a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s index 964d05aadf676..db1e1c550cd1b 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s @@ -31,6 +31,8 @@ ;;; gf_2vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests); ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi %define arg1 rsi @@ -331,15 +333,5 @@ section .data align 16 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_2vect_dot_prod_avx, 02, 04, 0191 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_2vect_dot_prod_avx, 02, 05, 0191 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm.s index 7b60b54eaeede..0387893fcd0ae 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm.s @@ -31,6 +31,8 @@ ;;; gf_2vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests); ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi %define arg1 rsi @@ -350,15 +352,5 @@ endproc_frame section .data -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_2vect_dot_prod_avx2, 04, 04, 0196 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_2vect_dot_prod_avx2, 04, 05, 0196 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s 
b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s index a9c54d5f47258..95dd92ad8d575 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s @@ -31,6 +31,8 @@ ;;; gf_2vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests); ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi %define arg1 rsi @@ -333,15 +335,5 @@ section .data align 16 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_2vect_dot_prod_sse, 00, 03, 0062 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_2vect_dot_prod_sse, 00, 04, 0062 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s index f057182fcfac9..e18238130a4a0 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s @@ -31,6 +31,8 @@ ;;; gf_2vect_mad_avx(len, vec, vec_i, mul_array, src, dest); ;;; +%include "reg_sizes.asm" + %define PS 8 %ifidn __OUTPUT_FORMAT__, win64 @@ -230,15 +232,5 @@ section .data align 16 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_2vect_mad_avx, 02, 00, 0204 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_2vect_mad_avx, 02, 01, 0204 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s index b692084acf903..03902f4744ec3 100644 
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s @@ -31,6 +31,8 @@ ;;; gf_2vect_mad_avx2(len, vec, vec_i, mul_array, src, dest); ;;; +%include "reg_sizes.asm" + %define PS 8 %ifidn __OUTPUT_FORMAT__, win64 @@ -241,15 +243,5 @@ endproc_frame section .data -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_2vect_mad_avx2, 04, 00, 0205 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_2vect_mad_avx2, 04, 01, 0205 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s index a7753e7130e45..2e82c5a665710 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s @@ -31,6 +31,8 @@ ;;; gf_2vect_mad_sse(len, vec, vec_i, mul_array, src, dest); ;;; +%include "reg_sizes.asm" + %define PS 8 %ifidn __OUTPUT_FORMAT__, win64 @@ -233,15 +235,5 @@ align 16 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_2vect_mad_sse, 00, 00, 0203 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_2vect_mad_sse, 00, 01, 0203 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s index 5ebcc4b42ac99..33fc1983762e6 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s @@ -31,6 +31,8 @@ ;;; gf_3vect_dot_prod_avx(len, 
vec, *g_tbls, **buffs, **dests); ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi %define arg1 rsi @@ -371,15 +373,5 @@ section .data align 16 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_3vect_dot_prod_avx, 02, 04, 0192 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_3vect_dot_prod_avx, 02, 05, 0192 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm.s index 01ba72a921053..23c46a79816cc 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm.s @@ -31,6 +31,8 @@ ;;; gf_3vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests); ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi %define arg1 rsi @@ -391,15 +393,5 @@ endproc_frame section .data -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_3vect_dot_prod_avx2, 04, 04, 0197 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_3vect_dot_prod_avx2, 04, 05, 0197 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s index 86999023d7b1f..a082fb8516c19 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s @@ -31,6 +31,8 @@ ;;; gf_3vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests); ;;; +%include "reg_sizes.asm" + %ifidn 
__OUTPUT_FORMAT__, elf64 %define arg0 rdi %define arg1 rsi @@ -372,15 +374,5 @@ section .data align 16 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_3vect_dot_prod_sse, 00, 05, 0063 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_3vect_dot_prod_sse, 00, 06, 0063 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s index 1bc03316182db..ed25d6a67102a 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s @@ -31,6 +31,8 @@ ;;; gf_3vect_mad_avx(len, vec, vec_i, mul_array, src, dest); ;;; +%include "reg_sizes.asm" + %define PS 8 %ifidn __OUTPUT_FORMAT__, win64 @@ -282,15 +284,5 @@ mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f constip16: ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_3vect_mad_avx, 02, 00, 0207 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_3vect_mad_avx, 02, 01, 0207 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s index 4f1e34d4d457e..d0b9272cdff3c 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s @@ -31,6 +31,8 @@ ;;; gf_3vect_mad_avx2(len, vec, vec_i, mul_array, src, dest); ;;; +%include "reg_sizes.asm" + %define PS 8 %ifidn __OUTPUT_FORMAT__, win64 @@ -311,15 +313,5 @@ constip32: ddq 
0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_3vect_mad_avx2, 04, 00, 0208 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_3vect_mad_avx2, 04, 01, 0208 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s index 73d33a2d67f8a..a06eb3d1407f6 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s @@ -31,6 +31,8 @@ ;;; gf_3vect_mad_sse(len, vec, vec_i, mul_array, src, dest); ;;; +%include "reg_sizes.asm" + %define PS 8 %ifidn __OUTPUT_FORMAT__, win64 %define arg0 rcx @@ -292,15 +294,5 @@ mask0f: constip16: ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_3vect_mad_sse, 00, 00, 0206 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_3vect_mad_sse, 00, 01, 0206 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s index c85c12d64075f..9863012bcf8e3 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s @@ -31,6 +31,8 @@ ;;; gf_4vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests); ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi %define arg1 rsi @@ -435,15 +437,5 @@ section .data align 16 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f -%macro slversion 4 -global 
%1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_4vect_dot_prod_avx, 02, 04, 0193 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_4vect_dot_prod_avx, 02, 05, 0193 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s index a3b73e1761bc3..95aa8eb7c6273 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s @@ -31,6 +31,8 @@ ;;; gf_4vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests); ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi %define arg1 rsi @@ -454,15 +456,5 @@ endproc_frame section .data -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_4vect_dot_prod_avx2, 04, 04, 0198 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_4vect_dot_prod_avx2, 04, 05, 0198 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s index 67b709ab5bb2b..2867cca0ea2ab 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s @@ -31,6 +31,8 @@ ;;; gf_4vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests); ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi %define arg1 rsi @@ -437,15 +439,5 @@ section .data align 16 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 
0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_4vect_dot_prod_sse, 00, 05, 0064 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_4vect_dot_prod_sse, 00, 06, 0064 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s index 03b69feefe7b2..5b2891628656d 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s @@ -31,6 +31,8 @@ ;;; gf_4vect_mad_avx(len, vec, vec_i, mul_array, src, dest); ;;; +%include "reg_sizes.asm" + %define PS 8 %ifidn __OUTPUT_FORMAT__, win64 @@ -330,16 +332,5 @@ mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f constip16: ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff - -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_4vect_mad_avx, 02, 00, 020a -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_4vect_mad_avx, 02, 01, 020a diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s index 93a3eca3582b3..5df1f83852581 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s @@ -31,6 +31,8 @@ ;;; gf_4vect_mad_avx2(len, vec, vec_i, mul_array, src, dest); ;;; +%include "reg_sizes.asm" + %define PS 8 %ifidn __OUTPUT_FORMAT__, win64 @@ -336,15 +338,5 @@ constip32: ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_4vect_mad_avx2, 04, 00, 020b -; inform 
linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_4vect_mad_avx2, 04, 01, 020b diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s index 47c7df39314e4..f753c1309f5ca 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s @@ -31,6 +31,8 @@ ;;; gf_4vect_mad_sse(len, vec, vec_i, mul_array, src, dest); ;;; +%include "reg_sizes.asm" + %define PS 8 %ifidn __OUTPUT_FORMAT__, win64 @@ -336,15 +338,5 @@ mask0f: constip16: ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_4vect_mad_sse, 00, 00, 0209 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_4vect_mad_sse, 00, 01, 0209 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s index a28d8115660fc..41fd301d300a8 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s @@ -31,6 +31,8 @@ ;;; gf_5vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests); ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi %define arg1 rsi @@ -297,15 +299,5 @@ section .data align 16 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_5vect_dot_prod_avx, 02, 03, 0194 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion 
gf_5vect_dot_prod_avx, 02, 04, 0194 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s index 0c5e113830546..2698addea5794 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s @@ -31,6 +31,8 @@ ;;; gf_5vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests); ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi %define arg1 rsi @@ -309,15 +311,5 @@ endproc_frame section .data -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_5vect_dot_prod_avx2, 04, 03, 0199 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_5vect_dot_prod_avx2, 04, 04, 0199 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s index 2f527e54bdbee..5c8c90359ab0d 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s @@ -31,6 +31,8 @@ ;;; gf_5vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests); ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi %define arg1 rsi @@ -298,15 +300,5 @@ section .data align 16 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_5vect_dot_prod_sse, 00, 04, 0065 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_5vect_dot_prod_sse, 00, 05, 0065 diff --git 
a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s index 28e0097b12e75..6b534a353e5f7 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s @@ -31,6 +31,8 @@ ;;; gf_5vect_mad_avx(len, vec, vec_i, mul_array, src, dest); ;;; +%include "reg_sizes.asm" + %define PS 8 %ifidn __OUTPUT_FORMAT__, win64 @@ -359,15 +361,5 @@ mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f constip16: ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_5vect_mad_avx, 02, 00, 020d -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_5vect_mad_avx, 02, 01, 020d diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s index 603c314d5d9db..b495c2189a1e4 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s @@ -31,6 +31,8 @@ ;;; gf_5vect_mad_avx2(len, vec, vec_i, mul_array, src, dest); ;;; +%include "reg_sizes.asm" + %define PS 8 %ifidn __OUTPUT_FORMAT__, win64 @@ -357,15 +359,5 @@ constip32: ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_5vect_mad_avx2, 04, 00, 020e -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_5vect_mad_avx2, 04, 01, 020e diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s 
b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s index 9484a3d498504..b26d4bcadebbb 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s @@ -31,6 +31,8 @@ ;;; gf_5vect_mad_sse(len, vec, vec_i, mul_array, src, dest); ;;; +%include "reg_sizes.asm" + %define PS 8 %ifidn __OUTPUT_FORMAT__, win64 @@ -367,15 +369,5 @@ mask0f: constip16: ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_5vect_mad_sse, 00, 00, 020c -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_5vect_mad_sse, 00, 01, 020c diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s index 56d1b96cb54d8..fb29f76f1ef26 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s @@ -31,6 +31,8 @@ ;;; gf_6vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests); ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi %define arg1 rsi @@ -309,15 +311,5 @@ section .data align 16 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_6vect_dot_prod_avx, 02, 03, 0195 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_6vect_dot_prod_avx, 02, 04, 0195 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm.s index e464ca27ffeff..85bb78a3d497f 
100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm.s @@ -31,6 +31,8 @@ ;;; gf_6vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests); ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi %define arg1 rsi @@ -320,15 +322,5 @@ endproc_frame section .data -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_6vect_dot_prod_avx2, 04, 03, 019a -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_6vect_dot_prod_avx2, 04, 04, 019a diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s index 5fa00fb851e8a..34f7b8731e3d3 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s @@ -31,6 +31,8 @@ ;;; gf_6vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests); ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi %define arg1 rsi @@ -309,15 +311,5 @@ section .data align 16 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_6vect_dot_prod_sse, 00, 04, 0066 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_6vect_dot_prod_sse, 00, 05, 0066 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s index 821ff0cd9b54b..3c60d0a3ec0c6 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s +++ 
b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s @@ -31,6 +31,8 @@ ;;; gf_6vect_mad_avx(len, vec, vec_i, mul_array, src, dest); ;;; +%include "reg_sizes.asm" + %define PS 8 %ifidn __OUTPUT_FORMAT__, win64 @@ -388,15 +390,5 @@ mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f constip16: ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_6vect_mad_avx, 02, 00, 0210 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_6vect_mad_avx, 02, 01, 0210 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s index b9cbf3902ee83..e1804578eb9ef 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s @@ -31,6 +31,7 @@ ;;; gf_6vect_mad_avx2(len, vec, vec_i, mul_array, src, dest); ;;; +%include "reg_sizes.asm" %define PS 8 @@ -395,15 +396,5 @@ constip32: ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_6vect_mad_avx2, 04, 00, 0211 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_6vect_mad_avx2, 04, 01, 0211 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s index 15d5f47797a55..574c8e525b98c 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s @@ -31,6 +31,8 @@ ;;; gf_6vect_mad_sse(len, vec, vec_i, mul_array, src, 
dest); ;;; +%include "reg_sizes.asm" + %define PS 8 %ifidn __OUTPUT_FORMAT__, win64 @@ -400,15 +402,5 @@ mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f constip16: ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_6vect_mad_sse, 00, 00, 020f -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_6vect_mad_sse, 00, 01, 020f diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s index 809c2ee07c99d..4f06b124b8fe5 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s @@ -31,6 +31,8 @@ ;;; gf_vect_dot_prod_avx(len, vec, *g_tbls, **buffs, *dest); ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi %define arg1 rsi @@ -265,15 +267,5 @@ align 16 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_vect_dot_prod_avx, 02, 04, 0061 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_vect_dot_prod_avx, 02, 05, 0061 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s index 648f35a1816ed..47bb38cc6cb10 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s @@ -31,6 +31,8 @@ ;;; gf_vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, *dest); ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, elf64 %define 
arg0 rdi %define arg1 rsi @@ -274,15 +276,5 @@ endproc_frame section .data -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_vect_dot_prod_avx2, 04, 04, 0190 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_vect_dot_prod_avx2, 04, 05, 0190 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s index 9ffe6cf012914..f7699c111c484 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s @@ -31,6 +31,8 @@ ;;; gf_vect_dot_prod_sse(len, vec, *g_tbls, **buffs, *dest); ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi %define arg1 rsi @@ -265,15 +267,5 @@ align 16 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_vect_dot_prod_sse, 00, 04, 0060 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_vect_dot_prod_sse, 00, 05, 0060 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s index d9686b134637c..f0fd91ac14c19 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s @@ -31,6 +31,8 @@ ;;; gf_vect_mad_avx(len, vec, vec_i, mul_array, src, dest); ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, win64 %define arg0 rcx %define arg0.w ecx @@ -190,15 +192,5 @@ align 16 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f -%macro slversion 4 -global 
%1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_vect_mad_avx, 02, 00, 0201 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_vect_mad_avx, 02, 01, 0201 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s index d4eb8cfcc7e79..5fa5da4d15b98 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s @@ -31,6 +31,8 @@ ;;; gf_vect_mad_avx2(len, vec, vec_i, mul_array, src, dest); ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, win64 %define arg0 rcx %define arg0.w ecx @@ -197,15 +199,5 @@ endproc_frame section .data -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_vect_mad_avx2, 04, 00, 0202 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_vect_mad_avx2, 04, 01, 0202 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s index 5d0d0badf0f99..b3ebc977630fb 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s @@ -31,6 +31,8 @@ ;;; gf_vect_mad_sse(len, vec, vec_i, mul_array, src, dest); ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, win64 %define arg0 rcx %define arg0.w ecx @@ -191,15 +193,5 @@ align 16 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_vect_mad_sse, 00, 00, 0200 -; inform 
linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_vect_mad_sse, 00, 01, 0200 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s index 5056c891d0184..c9438b11a1cb1 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s @@ -31,6 +31,8 @@ ;;; gf_vect_mul_avx(len, mul_array, src, dest) ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi %define arg1 rsi @@ -158,15 +160,5 @@ align 16 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_vect_mul_avx, 01, 02, 0036 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_vect_mul_avx, 01, 03, 0036 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s index f5eaa08c41389..2a14cc9d81d82 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s @@ -31,6 +31,8 @@ ;;; gf_vect_mul_sse(len, mul_array, src, dest) ;;; +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi %define arg1 rsi @@ -164,15 +166,5 @@ align 16 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f -%macro slversion 4 -global %1_slver_%2%3%4 -global %1_slver -%1_slver: -%1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro ;;; func core, ver, snum -slversion gf_vect_mul_sse, 00, 02, 0034 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_vect_mul_sse, 00, 03, 0034 diff --git 
a/src/erasure-code/isa/isa-l/include/reg_sizes.asm b/src/erasure-code/isa/isa-l/include/reg_sizes.asm index 219ba069ebdbc..650c1fe3e0b49 100644 --- a/src/erasure-code/isa/isa-l/include/reg_sizes.asm +++ b/src/erasure-code/isa/isa-l/include/reg_sizes.asm @@ -27,6 +27,9 @@ ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifndef _REG_SIZES_ASM_ +%define _REG_SIZES_ASM_ + %define EFLAGS_HAS_CPUID (1<<21) %define FLAG_CPUID1_ECX_CLMUL (1<<1) %define FLAG_CPUID1_EDX_SSE2 (1<<26) @@ -94,3 +97,27 @@ %define BYTE(reg) reg %+ b %define XWORD(reg) reg %+ x + +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +section .text +%endif +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +section .text +%endif +%ifidn __OUTPUT_FORMAT__, macho64 +%define elf64 macho64 +%endif + +%macro slversion 4 + section .text + global %1_slver_%2%3%4 + global %1_slver + %1_slver: + %1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro + +%endif ; ifndef _REG_SIZES_ASM_ diff --git a/src/erasure-code/isa/isa-l/include/types.h b/src/erasure-code/isa/isa-l/include/types.h index 695d94eefa784..f5775efaaf194 100644 --- a/src/erasure-code/isa/isa-l/include/types.h +++ b/src/erasure-code/isa/isa-l/include/types.h @@ -41,7 +41,7 @@ extern "C" { #endif -#ifndef __unix__ +#ifdef __WIN32__ #ifdef __MINGW32__ # include <_mingw.h> #endif @@ -59,12 +59,20 @@ typedef unsigned char UINT8; #endif -#ifdef __unix__ +#if defined __unix__ || defined __APPLE__ # define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval))) # define __forceinline static inline +# define aligned_free(x) free(x) #else -# define DECLARE_ALIGNED(decl, alignval) __declspec(align(alignval)) decl -# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn))) +# ifdef __MINGW32__ +# define DECLARE_ALIGNED(decl, 
alignval) decl __attribute__((aligned(alignval))) +# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn))) +# define aligned_free(x) _aligned_free(x) +# else +# define DECLARE_ALIGNED(decl, alignval) __declspec(align(alignval)) decl +# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn))) +# define aligned_free(x) _aligned_free(x) +# endif #endif #ifdef DEBUG From 87f6b73ee51be842e220e4e31c2ead67af099dc8 Mon Sep 17 00:00:00 2001 From: Joaquim Rocha Date: Thu, 4 Jun 2015 15:22:12 +0200 Subject: [PATCH 023/654] Add new cls_numops class for numeric operations Signed-off-by: Joaquim Rocha --- src/cls/Makefile-client.am | 5 ++ src/cls/Makefile-server.am | 4 + src/cls/numops/cls_numops.cc | 164 +++++++++++++++++++++++++++++++++++ 3 files changed, 173 insertions(+) create mode 100644 src/cls/numops/cls_numops.cc diff --git a/src/cls/Makefile-client.am b/src/cls/Makefile-client.am index aa4a4e6054b6d..5a45a61e001e0 100644 --- a/src/cls/Makefile-client.am +++ b/src/cls/Makefile-client.am @@ -51,10 +51,15 @@ noinst_LIBRARIES += libcls_user_client.a libcls_cephfs_client_la_SOURCES = cls/cephfs/cls_cephfs_client.cc noinst_LTLIBRARIES += libcls_cephfs_client.la +libcls_numops_client_la_SOURCES = cls/numops/cls_numops_client.cc +noinst_LTLIBRARIES += libcls_numops_client.la +DENCODER_DEPS += libcls_numops_client.la + noinst_HEADERS += \ cls/lock/cls_lock_types.h \ cls/lock/cls_lock_ops.h \ cls/lock/cls_lock_client.h \ + cls/numops/cls_numops_client.h \ cls/rbd/cls_rbd.h \ cls/rbd/cls_rbd_client.h \ cls/refcount/cls_refcount_ops.h \ diff --git a/src/cls/Makefile-server.am b/src/cls/Makefile-server.am index 7af69ba18dbc0..5ac657352498f 100644 --- a/src/cls/Makefile-server.am +++ b/src/cls/Makefile-server.am @@ -6,6 +6,10 @@ libcls_hello_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) libcls_hello_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' 
radoslib_LTLIBRARIES += libcls_hello.la +libcls_numops_la_SOURCES = cls/numops/cls_numops.cc +libcls_numops_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' +radoslib_LTLIBRARIES += libcls_numops.la + libcls_rbd_la_SOURCES = cls/rbd/cls_rbd.cc libcls_rbd_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) libcls_rbd_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' diff --git a/src/cls/numops/cls_numops.cc b/src/cls/numops/cls_numops.cc new file mode 100644 index 0000000000000..37464592bbde1 --- /dev/null +++ b/src/cls/numops/cls_numops.cc @@ -0,0 +1,164 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 CERN + * + * Author: Joaquim Rocha + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + */ + +/** \file + * + * This is an OSD class that implements methods for object numeric options on + * its omap values. 
+ * + */ + +#include "objclass/objclass.h" +#include +#include +#include +#include +#include +#include + +CLS_VER(1,0) +CLS_NAME(numops) + +cls_handle_t h_class; +cls_method_handle_t h_add; +cls_method_handle_t h_mul; + +static int add(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + string key, diff_str; + + bufferlist::iterator iter = in->begin(); + try { + ::decode(key, iter); + ::decode(diff_str, iter); + } catch (const buffer::error &err) { + CLS_LOG(20, "add: invalid decode of input"); + return -EINVAL; + } + + char *end_ptr = 0; + double difference = strtod(diff_str.c_str(), &end_ptr); + + if (end_ptr && *end_ptr != '\0') { + CLS_ERR("add: invalid input value: %s", diff_str.c_str()); + return -EINVAL; + } + + bufferlist bl; + int ret = cls_cxx_map_get_val(hctx, key, &bl); + + double value; + + if (ret == -ENODATA || bl.length() == 0) { + value = 0; + } else if (ret < 0) { + if (ret != -ENOENT) { + CLS_ERR("add: error reading omap key %s: %d", key.c_str(), ret); + } + return ret; + } else { + std::string stored_value(bl.c_str(), bl.length()); + end_ptr = 0; + value = strtod(stored_value.c_str(), &end_ptr); + + if (end_ptr && *end_ptr != '\0') { + CLS_ERR("add: invalid stored value: %s", stored_value.c_str()); + return -EBADMSG; + } + } + + value += difference; + + std::stringstream stream; + + stream.str(""); + stream << std::setprecision(10) << value; + + bufferlist new_value; + new_value.append(stream.str()); + + return cls_cxx_map_set_val(hctx, key, &new_value); +} + +static int mul(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + string key, diff_str; + + bufferlist::iterator iter = in->begin(); + try { + ::decode(key, iter); + ::decode(diff_str, iter); + } catch (const buffer::error &err) { + CLS_LOG(20, "add: invalid decode of input"); + return -EINVAL; + } + + char *end_ptr = 0; + double difference = strtod(diff_str.c_str(), &end_ptr); + + if (end_ptr && *end_ptr != '\0') { + CLS_ERR("add: invalid input value: %s", 
diff_str.c_str()); + return -EINVAL; + } + + bufferlist bl; + int ret = cls_cxx_map_get_val(hctx, key, &bl); + + double value; + + if (ret == -ENODATA || bl.length() == 0) { + value = 0; + } else if (ret < 0) { + if (ret != -ENOENT) { + CLS_ERR("add: error reading omap key %s: %d", key.c_str(), ret); + } + return ret; + } else { + std::string stored_value(bl.c_str(), bl.length()); + end_ptr = 0; + value = strtod(stored_value.c_str(), &end_ptr); + + if (end_ptr && *end_ptr != '\0') { + CLS_ERR("add: invalid stored value: %s", stored_value.c_str()); + return -EBADMSG; + } + } + + value *= difference; + + std::stringstream stream; + + stream.str(""); + stream << std::setprecision(10) << value; + + bufferlist new_value; + new_value.append(stream.str()); + + return cls_cxx_map_set_val(hctx, key, &new_value); +} + +void __cls_init() +{ + CLS_LOG(20, "loading cls_numops"); + + cls_register("numops", &h_class); + + cls_register_cxx_method(h_class, "add", + CLS_METHOD_RD | CLS_METHOD_WR, + add, &h_add); + + cls_register_cxx_method(h_class, "mul", + CLS_METHOD_RD | CLS_METHOD_WR, + mul, &h_mul); +} From d17f158ef312d066f02587c10243cde6373288f2 Mon Sep 17 00:00:00 2001 From: Joaquim Rocha Date: Thu, 4 Jun 2015 16:23:55 +0200 Subject: [PATCH 024/654] cls_numops: Add cls_numops client Signed-off-by: Joaquim Rocha --- src/cls/numops/cls_numops.cc | 10 ++-- src/cls/numops/cls_numops_client.cc | 80 +++++++++++++++++++++++++++++ src/cls/numops/cls_numops_client.h | 49 ++++++++++++++++++ 3 files changed, 133 insertions(+), 6 deletions(-) create mode 100644 src/cls/numops/cls_numops_client.cc create mode 100644 src/cls/numops/cls_numops_client.h diff --git a/src/cls/numops/cls_numops.cc b/src/cls/numops/cls_numops.cc index 37464592bbde1..a632e8ecffa4c 100644 --- a/src/cls/numops/cls_numops.cc +++ b/src/cls/numops/cls_numops.cc @@ -27,6 +27,8 @@ #include #include +#define DECIMAL_PRECISION 10 + CLS_VER(1,0) CLS_NAME(numops) @@ -81,9 +83,7 @@ static int add(cls_method_context_t hctx, 
bufferlist *in, bufferlist *out) value += difference; std::stringstream stream; - - stream.str(""); - stream << std::setprecision(10) << value; + stream << std::setprecision(DECIMAL_PRECISION) << value; bufferlist new_value; new_value.append(stream.str()); @@ -138,9 +138,7 @@ static int mul(cls_method_context_t hctx, bufferlist *in, bufferlist *out) value *= difference; std::stringstream stream; - - stream.str(""); - stream << std::setprecision(10) << value; + stream << std::setprecision(DECIMAL_PRECISION) << value; bufferlist new_value; new_value.append(stream.str()); diff --git a/src/cls/numops/cls_numops_client.cc b/src/cls/numops/cls_numops_client.cc new file mode 100644 index 0000000000000..689282063351f --- /dev/null +++ b/src/cls/numops/cls_numops_client.cc @@ -0,0 +1,80 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 CERN + * + * Author: Joaquim Rocha + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + */ + +#include "objclass/objclass.h" +#include "cls/numops/cls_numops_client.h" +#include "include/encoding.h" + +#include +#include +#include + +namespace rados { + namespace cls { + namespace numops { + + int add(librados::IoCtx *ioctx, + const std::string& oid, + const std::string& key, + double value_to_add) + { + bufferlist in, out; + ::encode(key, in); + + std::stringstream stream; + stream << value_to_add; + + ::encode(stream.str(), in); + + return ioctx->exec(oid, "numops", "add", in, out); + } + + int sub(librados::IoCtx *ioctx, + const std::string& oid, + const std::string& key, + double value_to_subtract) + { + return add(ioctx, oid, key, -value_to_subtract); + } + + int mul(librados::IoCtx *ioctx, + const std::string& oid, + const std::string& key, + double value_to_multiply) + { + bufferlist in, out; + ::encode(key, in); + + std::stringstream stream; + stream << value_to_multiply; + + ::encode(stream.str(), in); + + return ioctx->exec(oid, "numops", "mul", in, out); + } + + int div(librados::IoCtx *ioctx, + const std::string& oid, + const std::string& key, + double value_to_divide) + { + if (value_to_divide == 0) + return -EINVAL; + + return mul(ioctx, oid, key, 1 / value_to_divide); + } + + } // namespace numops + } // namespace cls +} // namespace rados diff --git a/src/cls/numops/cls_numops_client.h b/src/cls/numops/cls_numops_client.h new file mode 100644 index 0000000000000..8d776bffcaf73 --- /dev/null +++ b/src/cls/numops/cls_numops_client.h @@ -0,0 +1,49 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 CERN + * + * Author: Joaquim Rocha + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + */ + +#ifndef CEPH_LIBRBD_CLS_NUMOPS_CLIENT_H +#define CEPH_LIBRBD_CLS_NUMOPS_CLIENT_H + +#include "include/rados/librados.hpp" + +namespace rados { + namespace cls { + namespace numops { + + extern int add(librados::IoCtx *ioctx, + const std::string& oid, + const std::string& key, + double value_to_add); + + extern int sub(librados::IoCtx *ioctx, + const std::string& oid, + const std::string& key, + double value_to_subtract); + + extern int mul(librados::IoCtx *ioctx, + const std::string& oid, + const std::string& key, + double value_to_multiply); + + extern int div(librados::IoCtx *ioctx, + const std::string& oid, + const std::string& key, + double value_to_divide); + + } // namespace numops + } // namespace cls +} // namespace rados + +#endif // CEPH_LIBRBD_CLS_NUMOPS_CLIENT_H + From d742e797ef7ed556f8cec2950c64b7f28ef100a7 Mon Sep 17 00:00:00 2001 From: Joaquim Rocha Date: Fri, 5 Jun 2015 13:58:41 +0200 Subject: [PATCH 025/654] tests: Add unit tests for CLS numops class Signed-off-by: Joaquim Rocha --- src/test/Makefile-client.am | 7 + src/test/cls_numops/test_cls_numops.cc | 414 +++++++++++++++++++++++++ 2 files changed, 421 insertions(+) create mode 100644 src/test/cls_numops/test_cls_numops.cc diff --git a/src/test/Makefile-client.am b/src/test/Makefile-client.am index 7e70dc9eddeea..bea01d4e97ddb 100644 --- a/src/test/Makefile-client.am +++ b/src/test/Makefile-client.am @@ -173,6 +173,13 @@ ceph_test_cls_hello_LDADD = \ ceph_test_cls_hello_CXXFLAGS = $(UNITTEST_CXXFLAGS) bin_DEBUGPROGRAMS += ceph_test_cls_hello +ceph_test_cls_numops_SOURCES = test/cls_numops/test_cls_numops.cc +ceph_test_cls_numops_LDADD = \ + $(LIBRADOS) libcls_numops_client.la \ + $(UNITTEST_LDADD) $(CEPH_GLOBAL) $(RADOS_TEST_LDADD) +ceph_test_cls_numops_CXXFLAGS = $(UNITTEST_CXXFLAGS) +bin_DEBUGPROGRAMS += ceph_test_cls_numops + ceph_test_rados_api_cmd_SOURCES = test/librados/cmd.cc ceph_test_rados_api_cmd_LDADD = \ $(LIBCOMMON) $(LIBRADOS) $(CRYPTO_LIBS) \ diff --git 
a/src/test/cls_numops/test_cls_numops.cc b/src/test/cls_numops/test_cls_numops.cc new file mode 100644 index 0000000000000..8abf110581e0f --- /dev/null +++ b/src/test/cls_numops/test_cls_numops.cc @@ -0,0 +1,414 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 CERN + * + * Author: Joaquim Rocha + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include + +#include "cls/numops/cls_numops_client.h" +#include "gtest/gtest.h" +#include "include/rados/librados.hpp" +#include "test/librados/test.h" + +using namespace librados; + +TEST(ClsNumOps, Add) { + Rados cluster; + std::string pool_name = get_temp_pool_name(); + ASSERT_EQ("", create_one_pool_pp(pool_name, cluster)); + IoCtx ioctx; + cluster.ioctx_create(pool_name.c_str(), ioctx); + + // exec numops add method with an empty bufferlist + + bufferlist in, out; + + ASSERT_EQ(-EINVAL, ioctx.exec("myobject", "numops", "add", in, out)); + + // add a number to a non-existing key + + std::string key = "my-key"; + double value_in = 0.5; + + std::stringstream stream; + stream << value_in; + + ASSERT_EQ(0, rados::cls::numops::add(&ioctx, "myobject", key, value_in)); + + // check that the omap entry was set and the value matches + + std::set keys; + std::map omap; + keys.insert(key); + + ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap)); + + std::map::iterator it = omap.find(key); + + ASSERT_NE(omap.end(), it); + + bufferlist bl = (*it).second; + std::string value_out(bl.c_str(), bl.length()); + + EXPECT_EQ(stream.str(), value_out); + + // add another value to the existing one + + double new_value_in = 3.001; + + ASSERT_EQ(0, rados::cls::numops::add(&ioctx, "myobject", key, new_value_in)); + + // check that the omap 
entry's value matches + + omap.clear(); + + ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap)); + + omap.find(key); + + ASSERT_NE(omap.end(), it); + + bl = (*it).second; + value_out.assign(bl.c_str(), bl.length()); + + stream.str(""); + stream << (value_in + new_value_in); + + EXPECT_EQ(stream.str(), value_out); + + // set the omap entry with some non-numeric value + + omap.clear(); + + std::string non_numeric_value("some-non-numeric-text"); + omap[key].append(non_numeric_value); + + ASSERT_EQ(0, ioctx.omap_set("myobject", omap)); + + // check that adding a number does not succeed + + omap.clear(); + + ASSERT_EQ(-EBADMSG, rados::cls::numops::add(&ioctx, "myobject", key, 2.0)); + + // check that the omap entry was not changed + + ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap)); + + it = omap.find(key); + + ASSERT_NE(omap.end(), it); + + bl = (*it).second; + value_out.assign(bl.c_str(), bl.length()); + + EXPECT_EQ(non_numeric_value, value_out); + + ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster)); +} + +TEST(ClsNumOps, Sub) { + Rados cluster; + std::string pool_name = get_temp_pool_name(); + ASSERT_EQ("", create_one_pool_pp(pool_name, cluster)); + IoCtx ioctx; + cluster.ioctx_create(pool_name.c_str(), ioctx); + + // subtract a number from a non-existing key + + std::string key = "my-key"; + double value_in = 0.5; + + std::stringstream stream; + stream << value_in; + + ASSERT_EQ(0, rados::cls::numops::sub(&ioctx, "myobject", key, value_in)); + + // check that the omap entry was set and the value matches + + std::set keys; + std::map omap; + keys.insert(key); + + ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap)); + + std::map::iterator it = omap.find(key); + + ASSERT_NE(omap.end(), it); + + bufferlist bl = (*it).second; + std::string value_out(bl.c_str(), bl.length()); + + EXPECT_EQ("-" + stream.str(), value_out); + + // subtract another value to the existing one + + double new_value_in = 3.001; + + ASSERT_EQ(0, 
rados::cls::numops::sub(&ioctx, "myobject", key, new_value_in)); + + // check that the omap entry's value matches + + omap.clear(); + + ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap)); + + omap.find(key); + + ASSERT_NE(omap.end(), it); + + bl = (*it).second; + value_out.assign(bl.c_str(), bl.length()); + + stream.str(""); + stream << -(value_in + new_value_in); + + EXPECT_EQ(stream.str(), value_out); + + // set the omap entry with some non-numeric value + + omap.clear(); + + std::string non_numeric_value("some-non-numeric-text"); + omap[key].append(non_numeric_value); + + ASSERT_EQ(0, ioctx.omap_set("myobject", omap)); + + // check that subtracting a number does not succeed + + omap.clear(); + + ASSERT_EQ(-EBADMSG, rados::cls::numops::sub(&ioctx, "myobject", key, 2.0)); + + // check that the omap entry was not changed + + ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap)); + + it = omap.find(key); + + ASSERT_NE(omap.end(), it); + + bl = (*it).second; + value_out.assign(bl.c_str(), bl.length()); + + EXPECT_EQ(non_numeric_value, value_out); + + ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster)); +} + +TEST(ClsNumOps, Mul) { + Rados cluster; + std::string pool_name = get_temp_pool_name(); + ASSERT_EQ("", create_one_pool_pp(pool_name, cluster)); + IoCtx ioctx; + cluster.ioctx_create(pool_name.c_str(), ioctx); + + // exec numops mul method with an empty bufferlist + + bufferlist in, out; + + ASSERT_EQ(-EINVAL, ioctx.exec("myobject", "numops", "mul", in, out)); + + // multiply a number to a non-existing key + + std::string key = "my-key"; + double value_in = 0.5; + + std::stringstream stream; + stream << value_in; + + ASSERT_EQ(0, rados::cls::numops::mul(&ioctx, "myobject", key, value_in)); + + // check that the omap entry was set and the value is zero + + std::set keys; + std::map omap; + keys.insert(key); + + ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap)); + + std::map::iterator it = omap.find(key); + + 
ASSERT_NE(omap.end(), it); + + bufferlist bl = (*it).second; + std::string value_out(bl.c_str(), bl.length()); + + EXPECT_EQ("0", value_out); + + // set a non-zero value so we can effectively test multiplications + + omap.clear(); + + omap[key].append(stream.str()); + + ASSERT_EQ(0, ioctx.omap_set("myobject", omap)); + + // multiply another value to the existing one + + double new_value_in = 3.001; + + ASSERT_EQ(0, rados::cls::numops::mul(&ioctx, "myobject", key, new_value_in)); + + // check that the omap entry's value matches + + omap.clear(); + + ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap)); + + omap.find(key); + + ASSERT_NE(omap.end(), it); + + bl = (*it).second; + value_out.assign(bl.c_str(), bl.length()); + + stream.str(""); + stream << (value_in * new_value_in); + + EXPECT_EQ(stream.str(), value_out); + + // set the omap entry with some non-numeric value + + omap.clear(); + + std::string non_numeric_value("some-non-numeric-text"); + omap[key].append(non_numeric_value); + + ASSERT_EQ(0, ioctx.omap_set("myobject", omap)); + + // check that adding a number does not succeed + + ASSERT_EQ(-EBADMSG, rados::cls::numops::mul(&ioctx, "myobject", key, 2.0)); + + // check that the omap entry was not changed + + omap.clear(); + + ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap)); + + it = omap.find(key); + + ASSERT_NE(omap.end(), it); + + bl = (*it).second; + value_out.assign(bl.c_str(), bl.length()); + + EXPECT_EQ(non_numeric_value, value_out); + + ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster)); +} + +TEST(ClsNumOps, Div) { + Rados cluster; + std::string pool_name = get_temp_pool_name(); + ASSERT_EQ("", create_one_pool_pp(pool_name, cluster)); + IoCtx ioctx; + cluster.ioctx_create(pool_name.c_str(), ioctx); + + // divide a non-existing key by a number + + std::string key = "my-key"; + double value_in = 0.5; + + std::stringstream stream; + stream << value_in; + + ASSERT_EQ(0, rados::cls::numops::div(&ioctx, "myobject", key, 
value_in)); + + // check that the omap entry was set and the value is zero + + std::set keys; + std::map omap; + keys.insert(key); + + ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap)); + + std::map::iterator it = omap.find(key); + + ASSERT_NE(omap.end(), it); + + bufferlist bl = (*it).second; + std::string value_out(bl.c_str(), bl.length()); + + EXPECT_EQ("0", value_out); + + // check that division by zero is not allowed + + ASSERT_EQ(-EINVAL, rados::cls::numops::div(&ioctx, "myobject", key, 0)); + + // set a non-zero value so we can effectively test divisions + + omap.clear(); + + omap[key].append(stream.str()); + + ASSERT_EQ(0, ioctx.omap_set("myobject", omap)); + + // divide another value to the existing one + + double new_value_in = 3.001; + + ASSERT_EQ(0, rados::cls::numops::div(&ioctx, "myobject", key, new_value_in)); + + // check that the omap entry's value matches + + omap.clear(); + + ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap)); + + omap.find(key); + + ASSERT_NE(omap.end(), it); + + bl = (*it).second; + value_out.assign(bl.c_str(), bl.length()); + + stream.str(""); + stream << (value_in / new_value_in); + + EXPECT_EQ(stream.str(), value_out); + + omap.clear(); + + // set the omap entry with some non-numeric value + + std::string non_numeric_value("some-non-numeric-text"); + omap[key].append(non_numeric_value); + + ASSERT_EQ(0, ioctx.omap_set("myobject", omap)); + + // check that adding a number does not succeed + + ASSERT_EQ(-EBADMSG, rados::cls::numops::div(&ioctx, "myobject", key, 2.0)); + + // check that the omap entry was not changed + + omap.clear(); + + ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap)); + + it = omap.find(key); + + ASSERT_NE(omap.end(), it); + + bl = (*it).second; + value_out.assign(bl.c_str(), bl.length()); + + EXPECT_EQ(non_numeric_value, value_out); + + ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster)); +} From 63a0cf2922f8b41d82626cb3d4e99bd426c000ef Mon Sep 17 
00:00:00 2001 From: Joaquim Rocha Date: Mon, 17 Aug 2015 22:17:53 +0200 Subject: [PATCH 026/654] qa/workunits/cls: add workunit for cls_numops tests Signed-off-by: Joaquim Rocha --- qa/workunits/cls/test_cls_numops.sh | 5 +++++ 1 file changed, 5 insertions(+) create mode 100755 qa/workunits/cls/test_cls_numops.sh diff --git a/qa/workunits/cls/test_cls_numops.sh b/qa/workunits/cls/test_cls_numops.sh new file mode 100755 index 0000000000000..dcbafcab2e047 --- /dev/null +++ b/qa/workunits/cls/test_cls_numops.sh @@ -0,0 +1,5 @@ +#!/bin/sh -e + +ceph_test_cls_numops + +exit 0 From 5fa03e9983aa1748388ff6a375e07ac46e42f208 Mon Sep 17 00:00:00 2001 From: Guang Yang Date: Tue, 11 Aug 2015 21:24:08 +0000 Subject: [PATCH 027/654] osd: expose the number of unhealthy threads from heartbeat map Signed-off-by: Guang Yang --- src/common/HeartbeatMap.cc | 25 +++++++++++++++++++++++-- src/common/HeartbeatMap.h | 8 ++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/src/common/HeartbeatMap.cc b/src/common/HeartbeatMap.cc index 2c58276196ab4..f2bf02d31f09f 100644 --- a/src/common/HeartbeatMap.cc +++ b/src/common/HeartbeatMap.cc @@ -32,7 +32,9 @@ namespace ceph { HeartbeatMap::HeartbeatMap(CephContext *cct) : m_cct(cct), m_rwlock("HeartbeatMap::m_rwlock"), - m_inject_unhealthy_until(0) + m_inject_unhealthy_until(0), + m_unhealthy_workers(0), + m_total_workers(0) { } @@ -109,6 +111,8 @@ void HeartbeatMap::clear_timeout(heartbeat_handle_d *h) bool HeartbeatMap::is_healthy() { + int unhealthy = 0; + int total = 0; m_rwlock.get_read(); time_t now = time(NULL); if (m_cct->_conf->heartbeat_inject_failure) { @@ -129,13 +133,30 @@ bool HeartbeatMap::is_healthy() heartbeat_handle_d *h = *p; if (!_check(h, "is_healthy", now)) { healthy = false; + unhealthy++; } + total++; } m_rwlock.put_read(); - ldout(m_cct, 20) << "is_healthy = " << (healthy ? 
"healthy" : "NOT HEALTHY") << dendl; + + m_unhealthy_workers.set(unhealthy); + m_total_workers.set(total); + + ldout(m_cct, 20) << "is_healthy = " << (healthy ? "healthy" : "NOT HEALTHY") + << ", total workers: " << total << ", number of unhealthy: " << unhealthy << dendl; return healthy; } +int HeartbeatMap::get_unhealthy_workers() const +{ + return m_unhealthy_workers.read(); +} + +int HeartbeatMap::get_total_workers() const +{ + return m_total_workers.read(); +} + void HeartbeatMap::check_touch_file() { if (is_healthy()) { diff --git a/src/common/HeartbeatMap.h b/src/common/HeartbeatMap.h index 5513e186c2ad3..61c2f9048d835 100644 --- a/src/common/HeartbeatMap.h +++ b/src/common/HeartbeatMap.h @@ -68,6 +68,12 @@ class HeartbeatMap { // touch cct->_conf->heartbeat_file if is_healthy() void check_touch_file(); + // get the number of unhealthy workers + int get_unhealthy_workers() const; + + // get the number of total workers + int get_total_workers() const; + HeartbeatMap(CephContext *cct); ~HeartbeatMap(); @@ -76,6 +82,8 @@ class HeartbeatMap { RWLock m_rwlock; time_t m_inject_unhealthy_until; std::list m_workers; + atomic_t m_unhealthy_workers; + atomic_t m_total_workers; bool _check(const heartbeat_handle_d *h, const char *who, time_t now); }; From 5d109e97c06b9234f0fb5bb66e8357e9332a07f3 Mon Sep 17 00:00:00 2001 From: Guang Yang Date: Wed, 12 Aug 2015 20:59:37 +0000 Subject: [PATCH 028/654] common: support perf counter (for unhealthy workers) on CephContext Signed-off-by: Guang Yang --- src/common/ceph_context.cc | 50 +++++++++++++++++++++++++++++++++++++- src/common/ceph_context.h | 26 ++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/src/common/ceph_context.cc b/src/common/ceph_context.cc index 7a5c52bb0621f..b871a94b3d423 100644 --- a/src/common/ceph_context.cc +++ b/src/common/ceph_context.cc @@ -106,6 +106,9 @@ class CephContextServiceThread : public Thread _reopen_logs = false; } _cct->_heartbeat_map->check_touch_file(); + 
+ // refresh the perf counters + _cct->refresh_perf_values(); + } return NULL; } @@ -407,11 +410,13 @@ CephContext::CephContext(uint32_t module_type_) _heartbeat_map(NULL), _crypto_none(NULL), _crypto_aes(NULL), - _lockdep_obs(NULL) + _lockdep_obs(NULL), + _cct_perf(NULL) { ceph_spin_init(&_service_thread_lock); ceph_spin_init(&_associated_objs_lock); ceph_spin_init(&_feature_lock); + ceph_spin_init(&_cct_perf_lock); _log = new ceph::log::Log(&_conf->subsys); _log->start(); @@ -426,6 +431,7 @@ CephContext::CephContext(uint32_t module_type_) _conf->add_observer(_lockdep_obs); _perf_counters_collection = new PerfCountersCollection(this); + _admin_socket = new AdminSocket(this); _heartbeat_map = new HeartbeatMap(this); @@ -459,6 +465,12 @@ CephContext::~CephContext() it != _associated_objs.end(); ++it) delete it->second; + if (_cct_perf) { + _perf_counters_collection->remove(_cct_perf); + delete _cct_perf; + _cct_perf = NULL; + } + _admin_socket->unregister_command("perfcounters_dump"); _admin_socket->unregister_command("perf dump"); _admin_socket->unregister_command("1"); @@ -504,6 +516,7 @@ CephContext::~CephContext() ceph_spin_destroy(&_service_thread_lock); ceph_spin_destroy(&_associated_objs_lock); ceph_spin_destroy(&_feature_lock); + ceph_spin_destroy(&_cct_perf_lock); delete _crypto_none; delete _crypto_aes; @@ -569,6 +582,41 @@ PerfCountersCollection *CephContext::get_perfcounters_collection() return _perf_counters_collection; } +void CephContext::enable_perf_counter() +{ + PerfCountersBuilder plb(this, "cct", l_cct_first, l_cct_last); + plb.add_u64_counter(l_cct_total_workers, "total_workers", "Total workers"); + plb.add_u64_counter(l_cct_unhealthy_workers, "unhealthy_workers", "Unhealthy workers"); + PerfCounters *perf_tmp = plb.create_perf_counters(); + + ceph_spin_lock(&_cct_perf_lock); + assert(_cct_perf == NULL); + _cct_perf = perf_tmp; + ceph_spin_unlock(&_cct_perf_lock); + + _perf_counters_collection->add(_cct_perf); +} + +void 
CephContext::disable_perf_counter() +{ + _perf_counters_collection->remove(_cct_perf); + + ceph_spin_lock(&_cct_perf_lock); + delete _cct_perf; + _cct_perf = NULL; + ceph_spin_unlock(&_cct_perf_lock); +} + +void CephContext::refresh_perf_values() +{ + ceph_spin_lock(&_cct_perf_lock); + if (_cct_perf) { + _cct_perf->set(l_cct_total_workers, _heartbeat_map->get_total_workers()); + _cct_perf->set(l_cct_unhealthy_workers, _heartbeat_map->get_unhealthy_workers()); + } + ceph_spin_unlock(&_cct_perf_lock); +} + AdminSocket *CephContext::get_admin_socket() { return _admin_socket; diff --git a/src/common/ceph_context.h b/src/common/ceph_context.h index 95f6ea59bda3f..a867782de2399 100644 --- a/src/common/ceph_context.h +++ b/src/common/ceph_context.h @@ -28,6 +28,7 @@ class AdminSocket; class CephContextServiceThread; class PerfCountersCollection; +class PerfCounters; class md_config_obs_t; struct md_config_t; class CephContextHook; @@ -91,6 +92,22 @@ class CephContext { return _heartbeat_map; } + /** + * Enable the performance counter, currently we only have counter for the + * number of total/unhealthy workers. + */ + void enable_perf_counter(); + + /** + * Disable the performance counter. + */ + void disable_perf_counter(); + + /** + * Refresh perf counter values. + */ + void refresh_perf_values(); + /** * Get the admin socket associated with this CephContext. 
* @@ -173,6 +190,15 @@ class CephContext { md_config_obs_t *_lockdep_obs; + enum { + l_cct_first, + l_cct_total_workers, + l_cct_unhealthy_workers, + l_cct_last + }; + PerfCounters *_cct_perf; + ceph_spinlock_t _cct_perf_lock; + friend class CephContextObs; }; From de0b66abc17f10f56f4085a91065f8f376b90218 Mon Sep 17 00:00:00 2001 From: Guang Yang Date: Fri, 14 Aug 2015 18:20:03 +0000 Subject: [PATCH 029/654] test: add test for the perf counter of CephContext Signed-off-by: Guang Yang --- src/test/perf_counters.cc | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/test/perf_counters.cc b/src/test/perf_counters.cc index 4c75ee8f5c572..a916cf08da3b3 100644 --- a/src/test/perf_counters.cc +++ b/src/test/perf_counters.cc @@ -187,3 +187,16 @@ TEST(PerfCounters, MultiplePerfCounters) { ASSERT_EQ("", client.do_request("{ \"prefix\": \"perf dump\", \"format\": \"json\" }", &msg)); ASSERT_EQ("{}", msg); } + +TEST(PerfCounters, CephContextPerfCounters) { + // Enable the perf counter + g_ceph_context->enable_perf_counter(); + AdminSocketClient client(get_rand_socket_path()); + std::string msg; + + ASSERT_EQ("", client.do_request("{ \"prefix\": \"perf dump\", \"format\": \"json\" }", &msg)); + ASSERT_EQ(sd("{\"cct\":{\"total_workers\":0,\"unhealthy_workers\":0}}"), msg); + + // Restore to avoid impact to other test cases + g_ceph_context->disable_perf_counter(); +} From 15a3e866e805279d5ed5b76cd55c72346179e0e1 Mon Sep 17 00:00:00 2001 From: Guang Yang Date: Fri, 14 Aug 2015 18:21:15 +0000 Subject: [PATCH 030/654] rgw: enable perf counter for unhealthy workers Fixes: #12666 Signed-off-by: Guang Yang --- src/rgw/rgw_main.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc index bbf2ed6e6d488..6333b5f2253ba 100644 --- a/src/rgw/rgw_main.cc +++ b/src/rgw/rgw_main.cc @@ -1056,6 +1056,9 @@ int main(int argc, const char **argv) init_timer.add_event_after(g_conf->rgw_init_timeout, new C_InitTimeout); mutex.Unlock(); + // 
Enable the perf counter before starting the service thread + g_ceph_context->enable_perf_counter(); + common_init_finish(g_ceph_context); rgw_tools_init(g_ceph_context); From 10cb507d9d52464b2fd28b3cc84a92328944e462 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 17 Jun 2015 14:48:43 -0700 Subject: [PATCH 031/654] os/FileStore: require upgrade to hammer before moving beyond Signed-off-by: Sage Weil --- src/os/FileStore.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index 57997d291dbd7..458ecbebd6a7d 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -1202,8 +1202,8 @@ int FileStore::upgrade() if (r == 1) return 0; - if (version < 3) { - derr << "ObjectStore is old at version " << version << ". Please upgrade to firefly v0.80.x, convert your store, and then upgrade." << dendl; + if (version < 4) { + derr << "ObjectStore is old at version " << version << ". Please upgrade to hammer v0.94.x first." << dendl; return -EINVAL; } From c0f83df412c962a6b3a2f3cdd1d7c8c795ef1146 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 17 Jun 2015 14:46:36 -0700 Subject: [PATCH 032/654] osd: require an upgrade to hammer first We must prevent a user from upgrading past hammer and then being both unable to join the cluster and unable to downgrade again. Signed-off-by: Sage Weil --- src/osd/OSD.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index bff5339455e26..90ac5589bf25f 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1795,6 +1795,13 @@ int OSD::init() goto out; } + if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_PGMETA)) { + derr << "OSD store does not have PGMETA feature." << dendl; + derr << "You must first upgrade to hammer." << dendl; + r = -EINVAL; + goto out; + } + if (osd_compat.compare(superblock.compat_features) < 0) { derr << "The disk uses features unsupported by the executable." 
<< dendl; derr << " ondisk features " << superblock.compat_features << dendl; From b474991df74ab4d31b011bb6c5973fcd3c2e0179 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 17 Jun 2015 15:10:00 -0700 Subject: [PATCH 033/654] ceph-objectstore-tool: require hammer upgrade Do not work on a store that has not been upgraded to hammer. Signed-off-by: Sage Weil --- src/tools/ceph_objectstore_tool.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc index 1a767fb0a6279..caf82ae9aa40a 100644 --- a/src/tools/ceph_objectstore_tool.cc +++ b/src/tools/ceph_objectstore_tool.cc @@ -2211,6 +2211,12 @@ int main(int argc, char **argv) ret = -EINVAL; goto out; } + if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_PGMETA)) { + derr << "OSD store does not have PGMETA feature." << dendl; + derr << "You must first upgrade to hammer, or use an older ceph-objectstore-tool." << dendl; + ret = -EINVAL; + goto out; + } if (op != "list" && vm.count("object")) { // Special case: Create pgmeta_oid if empty string specified From cd4e676e6d45c8166290ef834d73c2a0bda98fa2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 17 Jun 2015 15:10:33 -0700 Subject: [PATCH 034/654] osd: drop support for pre-hammer pg metadata Users need to upgrade to hammer before moving beyond. 
Signed-off-by: Sage Weil --- src/osd/PG.cc | 93 +++++++-------------------------------------------- src/osd/PG.h | 5 ++- 2 files changed, 15 insertions(+), 83 deletions(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 9746104a69441..b2657e49f694a 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2634,47 +2634,14 @@ void PG::init( write_if_dirty(*t); } -#pragma GCC diagnostic ignored "-Wpragmas" -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - void PG::upgrade(ObjectStore *store) { assert(info_struct_v <= 8); ObjectStore::Transaction t; - assert(info_struct_v == 7); - - // 7 -> 8 - pg_log.mark_log_for_rewrite(); - ghobject_t log_oid(OSD::make_pg_log_oid(pg_id)); - ghobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id)); - t.remove(coll_t::meta(), log_oid); - t.remove(coll_t::meta(), biginfo_oid); - t.collection_rmattr(coll, "info"); - - t.touch(coll, pgmeta_oid); - map v; - __u8 ver = cur_struct_v; - ::encode(ver, v[infover_key]); - t.omap_setkeys(coll, pgmeta_oid, v); - - dirty_info = true; - dirty_big_info = true; - write_if_dirty(t); - - int r = store->apply_transaction(t); - if (r != 0) { - derr << __func__ << ": apply_transaction returned " - << cpp_strerror(r) << dendl; - assert(0); - } - assert(r == 0); + assert(0 == "no support for pre v8"); } -#pragma GCC diagnostic pop -#pragma GCC diagnostic warning "-Wpragmas" - int PG::_prepare_write_info(map *km, epoch_t epoch, pg_info_t &info, coll_t coll, @@ -2744,10 +2711,6 @@ void PG::prepare_write_info(map *km) dirty_big_info = false; } -#pragma GCC diagnostic ignored "-Wpragmas" -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - bool PG::_has_removal_flag(ObjectStore *store, spg_t pgid) { @@ -2762,10 +2725,6 @@ bool PG::_has_removal_flag(ObjectStore *store, values.size() == 1) return true; - // try old way. tolerate EOPNOTSUPP. 
- char val; - if (store->collection_getattr(coll, "remove", &val, 1) > 0) - return true; return false; } @@ -2790,50 +2749,24 @@ epoch_t PG::peek_map_epoch(ObjectStore *store, keys.insert(epoch_key); map values; int r = store->omap_get_values(coll, pgmeta_oid, keys, &values); - if (r == 0) { - assert(values.size() == 2); + if (r != 0) { + assert(0 == "unable to open pg metadata"); + } + assert(values.size() == 2); - // sanity check version - bufferlist::iterator bp = values[infover_key].begin(); - __u8 struct_v = 0; - ::decode(struct_v, bp); - assert(struct_v >= 8); + // sanity check version + bufferlist::iterator bp = values[infover_key].begin(); + __u8 struct_v = 0; + ::decode(struct_v, bp); + assert(struct_v >= 8); - // get epoch - bp = values[epoch_key].begin(); - ::decode(cur_epoch, bp); - } else if (r == -ENOENT) { - // legacy: try v7 or older - r = store->collection_getattr(coll, "info", *bl); - assert(r > 0); - bufferlist::iterator bp = bl->begin(); - __u8 struct_v = 0; - ::decode(struct_v, bp); - if (struct_v < 5) - return 0; - if (struct_v < 6) { - ::decode(cur_epoch, bp); - return cur_epoch; - } + // get epoch + bp = values[epoch_key].begin(); + ::decode(cur_epoch, bp); - // get epoch out of leveldb - string ek = get_epoch_key(pgid); - keys.clear(); - values.clear(); - keys.insert(ek); - store->omap_get_values(coll_t::meta(), legacy_infos_oid, keys, &values); - assert(values.size() == 1); - bufferlist::iterator p = values[ek].begin(); - ::decode(cur_epoch, p); - } else { - assert(0 == "unable to open pg metadata"); - } return cur_epoch; } -#pragma GCC diagnostic pop -#pragma GCC diagnostic warning "-Wpragmas" - void PG::write_if_dirty(ObjectStore::Transaction& t) { map km; diff --git a/src/osd/PG.h b/src/osd/PG.h index 7cace9ca69f59..7447ad12a86ec 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -286,10 +286,9 @@ class PG { // pg state pg_info_t info; __u8 info_struct_v; + // v8 was pgmeta, first appeared in hammer. 
static const __u8 cur_struct_v = 8; - // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad - // (first appeared in cuttlefish). - static const __u8 compat_struct_v = 7; + static const __u8 compat_struct_v = 8; bool must_upgrade() { return info_struct_v < cur_struct_v; } From b297e6d2b206c0da733ae9d4ac5fb321f113ae5d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 17 Jun 2015 15:10:46 -0700 Subject: [PATCH 035/654] ceph-objectstore-tool: drop support for pre-pgmeta PGs Signed-off-by: Sage Weil --- src/tools/ceph_objectstore_tool.cc | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc index caf82ae9aa40a..76633d3d9cb31 100644 --- a/src/tools/ceph_objectstore_tool.cc +++ b/src/tools/ceph_objectstore_tool.cc @@ -471,10 +471,6 @@ int finish_remove_pgs(ObjectStore *store) return 0; } -#pragma GCC diagnostic ignored "-Wpragmas" -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - int mark_pg_for_removal(ObjectStore *fs, spg_t pgid, ObjectStore::Transaction *t) { pg_info_t info(pgid); @@ -490,29 +486,14 @@ int mark_pg_for_removal(ObjectStore *fs, spg_t pgid, ObjectStore::Transaction *t cerr << __func__ << " error on read_info " << cpp_strerror(r) << std::endl; return r; } - if (struct_v < 8) { - // old xattr - cout << "setting legacy 'remove' xattr flag" << std::endl; - bufferlist one; - one.append('1'); - t->collection_setattr(coll, "remove", one); - cout << "remove " << coll_t::meta() << " " << log_oid << std::endl; - t->remove(coll_t::meta(), log_oid); - cout << "remove " << coll_t::meta() << " " << biginfo_oid << std::endl; - t->remove(coll_t::meta(), biginfo_oid); - } else { - // new omap key - cout << "setting '_remove' omap key" << std::endl; - map values; - ::encode((char)1, values["_remove"]); - t->omap_setkeys(coll, pgmeta_oid, values); - } + assert(struct_v >= 8); + cout << "setting 
'_remove' omap key" << std::endl; + map values; + ::encode((char)1, values["_remove"]); + t->omap_setkeys(coll, pgmeta_oid, values); return 0; } -#pragma GCC diagnostic pop -#pragma GCC diagnostic warning "-Wpragmas" - int initiate_new_remove_pg(ObjectStore *store, spg_t r_pgid) { if (!dry_run) From 71deb4b9ed4f9bfc4c9150bd3d343b0a4b1ecb46 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 23 Jul 2015 17:14:26 -0400 Subject: [PATCH 036/654] test/objectstore/FilestoreDiff: don't diff coll attrs Signed-off-by: Sage Weil --- src/test/objectstore/FileStoreDiff.cc | 31 --------------------------- src/test/objectstore/FileStoreDiff.h | 1 - 2 files changed, 32 deletions(-) diff --git a/src/test/objectstore/FileStoreDiff.cc b/src/test/objectstore/FileStoreDiff.cc index 521e56a5e2e39..a49e4af374370 100644 --- a/src/test/objectstore/FileStoreDiff.cc +++ b/src/test/objectstore/FileStoreDiff.cc @@ -253,34 +253,6 @@ bool FileStoreDiff::diff_objects(FileStore *a_store, FileStore *b_store, coll_t return ret; } -bool FileStoreDiff::diff_coll_attrs(FileStore *a_store, FileStore *b_store, coll_t coll) -{ - bool ret = false; - - int err; - std::map b_coll_attrs, a_coll_attrs; - err = b_store->collection_getattrs(coll, b_coll_attrs); - if (err < 0 && err != -EOPNOTSUPP) { - dout(0) << "diff_attrs getattrs on verify coll " << coll.to_str() - << "returns " << err << dendl; - ret = true; - } - err = a_store->collection_getattrs(coll, a_coll_attrs); - if (err < 0 && err != -EOPNOTSUPP) { - dout(0) << "diff_attrs getattrs on A coll " << coll.to_str() - << "returns " << err << dendl; - ret = true; - } - - if (b_coll_attrs.size() != a_coll_attrs.size()) { - dout(0) << "diff_attrs size mismatch (A: " << a_coll_attrs.size() - << ", B: " << a_coll_attrs.size() << ")" << dendl; - ret = true; - } - - return diff_attrs(b_coll_attrs, a_coll_attrs) || ret; -} - bool FileStoreDiff::diff() { bool ret = false; @@ -305,9 +277,6 @@ bool FileStoreDiff::diff() } } - if (diff_coll_attrs(a_store, b_store, 
b_coll)) - ret = true; - if (diff_objects(a_store, b_store, b_coll)) ret = true; } diff --git a/src/test/objectstore/FileStoreDiff.h b/src/test/objectstore/FileStoreDiff.h index cacd3ce84747a..f7aedeee2e682 100644 --- a/src/test/objectstore/FileStoreDiff.h +++ b/src/test/objectstore/FileStoreDiff.h @@ -27,7 +27,6 @@ class FileStoreDiff { FileStore *a_store; FileStore *b_store; - bool diff_coll_attrs(FileStore *a_store, FileStore *b_store, coll_t coll); bool diff_objects(FileStore *a_store, FileStore *b_store, coll_t coll); bool diff_objects_stat(struct stat& a, struct stat& b); bool diff_attrs(std::map& b, From 893e00bc0fd793090a35d275919ca0348e9f05b0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 23 Jul 2015 16:54:17 -0400 Subject: [PATCH 037/654] os: drop deprecated collection_* attr methods We no longer need these for upgrades. Signed-off-by: Sage Weil --- src/os/FileStore.cc | 158 +--------------------------------------- src/os/FileStore.h | 7 -- src/os/KeyValueStore.cc | 9 +-- src/os/KeyValueStore.h | 6 -- src/os/ObjectStore.h | 115 +---------------------------- src/os/Transaction.cc | 50 +------------ 6 files changed, 6 insertions(+), 339 deletions(-) diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index 458ecbebd6a7d..5f2f0b04074e5 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -2633,27 +2633,8 @@ unsigned FileStore::_do_transaction( break; case Transaction::OP_COLL_SETATTR: - { - coll_t cid = i.get_cid(op->cid); - string name = i.decode_string(); - bufferlist bl; - i.decode_bl(bl); - tracepoint(objectstore, coll_setattr_enter, osr_name); - if (_check_replay_guard(cid, spos) > 0) - r = _collection_setattr(cid, name.c_str(), bl.c_str(), bl.length()); - tracepoint(objectstore, coll_setattr_exit, r); - } - break; - case Transaction::OP_COLL_RMATTR: - { - coll_t cid = i.get_cid(op->cid); - string name = i.decode_string(); - tracepoint(objectstore, coll_rmattr_enter, osr_name); - if (_check_replay_guard(cid, spos) > 0) - r = 
_collection_rmattr(cid, name.c_str()); - tracepoint(objectstore, coll_rmattr_exit, r); - } + assert(0 == "coll attributes no longer supported"); break; case Transaction::OP_STARTSYNC: @@ -4326,143 +4307,6 @@ int FileStore::_rmattrs(coll_t cid, const ghobject_t& oid, return r; } - - -// collections - -int FileStore::collection_getattr(coll_t c, const char *name, - void *value, size_t size) -{ - char fn[PATH_MAX]; - get_cdir(c, fn, sizeof(fn)); - dout(15) << "collection_getattr " << fn << " '" << name << "' len " << size << dendl; - int r; - int fd = ::open(fn, O_RDONLY); - if (fd < 0) { - r = -errno; - goto out; - } - char n[PATH_MAX]; - get_attrname(name, n, PATH_MAX); - r = chain_fgetxattr(fd, n, value, size); - VOID_TEMP_FAILURE_RETRY(::close(fd)); - out: - dout(10) << "collection_getattr " << fn << " '" << name << "' len " << size << " = " << r << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - return r; -} - -int FileStore::collection_getattr(coll_t c, const char *name, bufferlist& bl) -{ - char fn[PATH_MAX]; - get_cdir(c, fn, sizeof(fn)); - dout(15) << "collection_getattr " << fn << " '" << name << "'" << dendl; - char n[PATH_MAX]; - get_attrname(name, n, PATH_MAX); - buffer::ptr bp; - int r; - int fd = ::open(fn, O_RDONLY); - if (fd < 0) { - r = -errno; - goto out; - } - r = _fgetattr(fd, n, bp); - bl.push_back(bp); - VOID_TEMP_FAILURE_RETRY(::close(fd)); - out: - dout(10) << "collection_getattr " << fn << " '" << name << "' = " << r << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - return r; -} - -int FileStore::collection_getattrs(coll_t cid, map& aset) -{ - char fn[PATH_MAX]; - get_cdir(cid, fn, sizeof(fn)); - dout(10) << "collection_getattrs " << fn << dendl; - int r = 0; - int fd = ::open(fn, O_RDONLY); - if (fd < 0) { - r = -errno; - goto out; - } - r = _fgetattrs(fd, aset); - VOID_TEMP_FAILURE_RETRY(::close(fd)); - out: - dout(10) << "collection_getattrs " << fn << " = " << r << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - 
return r; -} - - -int FileStore::_collection_setattr(coll_t c, const char *name, - const void *value, size_t size) -{ - char fn[PATH_MAX]; - get_cdir(c, fn, sizeof(fn)); - dout(10) << "collection_setattr " << fn << " '" << name << "' len " << size << dendl; - char n[PATH_MAX]; - int r; - int fd = ::open(fn, O_RDONLY); - if (fd < 0) { - r = -errno; - goto out; - } - get_attrname(name, n, PATH_MAX); - r = chain_fsetxattr(fd, n, value, size); - VOID_TEMP_FAILURE_RETRY(::close(fd)); - out: - dout(10) << "collection_setattr " << fn << " '" << name << "' len " << size << " = " << r << dendl; - return r; -} - -int FileStore::_collection_rmattr(coll_t c, const char *name) -{ - char fn[PATH_MAX]; - get_cdir(c, fn, sizeof(fn)); - dout(15) << "collection_rmattr " << fn << dendl; - char n[PATH_MAX]; - get_attrname(name, n, PATH_MAX); - int r; - int fd = ::open(fn, O_RDONLY); - if (fd < 0) { - r = -errno; - goto out; - } - r = chain_fremovexattr(fd, n); - VOID_TEMP_FAILURE_RETRY(::close(fd)); - out: - dout(10) << "collection_rmattr " << fn << " = " << r << dendl; - return r; -} - - -int FileStore::_collection_setattrs(coll_t cid, map& aset) -{ - char fn[PATH_MAX]; - get_cdir(cid, fn, sizeof(fn)); - dout(15) << "collection_setattrs " << fn << dendl; - int r = 0; - int fd = ::open(fn, O_RDONLY); - if (fd < 0) { - r = -errno; - goto out; - } - for (map::iterator p = aset.begin(); - p != aset.end(); - ++p) { - char n[PATH_MAX]; - get_attrname(p->first.c_str(), n, PATH_MAX); - r = chain_fsetxattr(fd, n, p->second.c_str(), p->second.length()); - if (r < 0) - break; - } - VOID_TEMP_FAILURE_RETRY(::close(fd)); - out: - dout(10) << "collection_setattrs " << fn << " = " << r << dendl; - return r; -} - int FileStore::_collection_remove_recursive(const coll_t &cid, const SequencerPosition &spos) { diff --git a/src/os/FileStore.h b/src/os/FileStore.h index 3775fbac3c71d..5692963b3c1d5 100644 --- a/src/os/FileStore.h +++ b/src/os/FileStore.h @@ -608,13 +608,6 @@ class FileStore : public 
JournalingObjectStore, int _rmattrs(coll_t cid, const ghobject_t& oid, const SequencerPosition &spos); - int collection_getattr(coll_t c, const char *name, void *value, size_t size); - int collection_getattr(coll_t c, const char *name, bufferlist& bl); - int collection_getattrs(coll_t cid, map &aset); - - int _collection_setattr(coll_t c, const char *name, const void *value, size_t size); - int _collection_rmattr(coll_t c, const char *name); - int _collection_setattrs(coll_t cid, map &aset); int _collection_remove_recursive(const coll_t &cid, const SequencerPosition &spos); diff --git a/src/os/KeyValueStore.cc b/src/os/KeyValueStore.cc index 3bf8f33a4a68a..a93c46678b6ed 100644 --- a/src/os/KeyValueStore.cc +++ b/src/os/KeyValueStore.cc @@ -1458,15 +1458,8 @@ unsigned KeyValueStore::_do_transaction(Transaction& transaction, break; case Transaction::OP_COLL_SETATTR: - { - assert(0 == "not implemented"); - } - break; - case Transaction::OP_COLL_RMATTR: - { - assert(0 == "not implemented"); - } + assert(0 == "coll attrs no longer supported"); break; case Transaction::OP_STARTSYNC: diff --git a/src/os/KeyValueStore.h b/src/os/KeyValueStore.h index 48135f142c3fd..3f8a430991520 100644 --- a/src/os/KeyValueStore.h +++ b/src/os/KeyValueStore.h @@ -611,12 +611,6 @@ class KeyValueStore : public ObjectStore, BufferTransaction &t); int _rmattrs(coll_t cid, const ghobject_t& oid, BufferTransaction &t); - int _collection_setattr(coll_t c, const char *name, const void *value, - size_t size, BufferTransaction &t); - int _collection_rmattr(coll_t c, const char *name, BufferTransaction &t); - int _collection_setattrs(coll_t cid, map &aset, - BufferTransaction &t); - // collections int _collection_hint_expected_num_objs(coll_t cid, uint32_t pg_num, uint64_t num_objs) const { return 0; } diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h index 9651a697b7872..96eb099fa8c94 100644 --- a/src/os/ObjectStore.h +++ b/src/os/ObjectStore.h @@ -358,9 +358,9 @@ class ObjectStore { 
OP_RMCOLL = 21, // cid OP_COLL_ADD = 22, // cid, oldcid, oid OP_COLL_REMOVE = 23, // cid, oid - OP_COLL_SETATTR = 24, // cid, attrname, bl - OP_COLL_RMATTR = 25, // cid, attrname - OP_COLL_SETATTRS = 26, // cid, attrset + OP_COLL_SETATTR = 24, // cid, attrname, bl **DEPRECATED** + OP_COLL_RMATTR = 25, // cid, attrname **DEPRECATED** + OP_COLL_SETATTRS = 26, // cid, attrset **DEPRECATED** OP_COLL_MOVE = 8, // newcid, oldcid, oid OP_STARTSYNC = 27, // start a sync @@ -1293,76 +1293,6 @@ class ObjectStore { data.ops++; } - // NOTE: Collection attr operations are all DEPRECATED. new - // backends need not implement these at all. - - /// Set an xattr on a collection - void collection_setattr(coll_t cid, const string& name, bufferlist& val) - __attribute__ ((deprecated)) { - if (use_tbl) { - __u32 op = OP_COLL_SETATTR; - ::encode(op, tbl); - ::encode(cid, tbl); - ::encode(name, tbl); - ::encode(val, tbl); - } else { - Op* _op = _get_next_op(); - _op->op = OP_COLL_SETATTR; - _op->cid = _get_coll_id(cid); - ::encode(name, data_bl); - ::encode(val, data_bl); - } - data.ops++; - } - - /// Remove an xattr from a collection - void collection_rmattr(coll_t cid, const string& name) - __attribute__ ((deprecated)) { - if (use_tbl) { - __u32 op = OP_COLL_RMATTR; - ::encode(op, tbl); - ::encode(cid, tbl); - ::encode(name, tbl); - } else { - Op* _op = _get_next_op(); - _op->op = OP_COLL_RMATTR; - _op->cid = _get_coll_id(cid); - ::encode(name, data_bl); - } - data.ops++; - } - /// Set multiple xattrs on a collection - void collection_setattrs(coll_t cid, map& aset) - __attribute__ ((deprecated)) { - if (use_tbl) { - __u32 op = OP_COLL_SETATTRS; - ::encode(op, tbl); - ::encode(cid, tbl); - ::encode(aset, tbl); - } else { - Op* _op = _get_next_op(); - _op->op = OP_COLL_SETATTRS; - _op->cid = _get_coll_id(cid); - ::encode(aset, data_bl); - } - data.ops++; - } - /// Set multiple xattrs on a collection - void collection_setattrs(coll_t cid, map& aset) - __attribute__ ((deprecated)) { - if 
(use_tbl) { - __u32 op = OP_COLL_SETATTRS; - ::encode(op, tbl); - ::encode(cid, tbl); - ::encode(aset, tbl); - } else { - Op* _op = _get_next_op(); - _op->op = OP_COLL_SETATTRS; - _op->cid = _get_coll_id(cid); - ::encode(aset, data_bl); - } - data.ops++; - } /// Remove omap from oid void omap_clear( coll_t cid, ///< [in] Collection containing oid @@ -1977,45 +1907,6 @@ class ObjectStore { * @returns true if it exists, false otherwise */ virtual bool collection_exists(coll_t c) = 0; - /** - * collection_getattr - get an xattr of a collection - * - * @param cid collection name - * @param name xattr name - * @param value pointer of buffer to receive value - * @param size size of buffer to receive value - * @returns 0 on success, negative error code on failure - */ - virtual int collection_getattr(coll_t cid, const char *name, - void *value, size_t size) - __attribute__ ((deprecated)) { - return -EOPNOTSUPP; - } - - /** - * collection_getattr - get an xattr of a collection - * - * @param cid collection name - * @param name xattr name - * @param bl buffer to receive value - * @returns 0 on success, negative error code on failure - */ - virtual int collection_getattr(coll_t cid, const char *name, bufferlist& bl) - __attribute__ ((deprecated)) { - return -EOPNOTSUPP; - } - - /** - * collection_getattrs - get all xattrs of a collection - * - * @param cid collection name - * @param aset map of keys and buffers that contain the values - * @returns 0 on success, negative error code on failure - */ - virtual int collection_getattrs(coll_t cid, map &aset) - __attribute__ ((deprecated)) { - return -EOPNOTSUPP; - } /** * is a collection empty? 
diff --git a/src/os/Transaction.cc b/src/os/Transaction.cc index a5e6b515f3dbf..aab9cea139ede 100644 --- a/src/os/Transaction.cc +++ b/src/os/Transaction.cc @@ -4,10 +4,6 @@ #include "ObjectStore.h" #include "common/Formatter.h" -#pragma GCC diagnostic ignored "-Wpragmas" -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - void ObjectStore::Transaction::_build_actions_from_tbl() { //used only for tbl encode @@ -324,41 +320,9 @@ void ObjectStore::Transaction::_build_actions_from_tbl() break; case Transaction::OP_COLL_SETATTR: - { - coll_t cid; - string name; - bufferlist bl; - - ::decode(cid, p); - ::decode(name, p); - ::decode(bl, p); - - collection_setattr(cid, name, bl); - } - break; - case Transaction::OP_COLL_SETATTRS: - { - coll_t cid; - map aset; - - ::decode(cid, p); - ::decode(aset, p); - - collection_setattrs(cid, aset); - } - break; - case Transaction::OP_COLL_RMATTR: - { - coll_t cid; - string name; - - ::decode(cid, p); - ::decode(name, p); - - collection_rmattr(cid, name); - } + assert(0 == "collection attrs no longer supported"); break; case Transaction::OP_STARTSYNC: @@ -504,9 +468,6 @@ void ObjectStore::Transaction::_build_actions_from_tbl() assert(ops == data.ops); } -#pragma GCC diagnostic pop -#pragma GCC diagnostic warning "-Wpragmas" - void ObjectStore::Transaction::dump(ceph::Formatter *f) { f->open_array_section("ops"); @@ -934,10 +895,6 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f) f->close_section(); } -#pragma GCC diagnostic ignored "-Wpragmas" -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - void ObjectStore::Transaction::generate_test_instances(list& o) { o.push_back(new Transaction); @@ -977,11 +934,6 @@ void ObjectStore::Transaction::generate_test_instances(listcreate_collection(c); t->collection_move_rename(c, o2, c2, o3); t->remove_collection(c); - t->collection_setattr(c, string("this"), bl); - t->collection_rmattr(c, string("foo")); - 
t->collection_setattrs(c, m); o.push_back(t); } -#pragma GCC diagnostic pop -#pragma GCC diagnostic warning "-Wpragmas" From ffa224a25bd7b9026c307cbf1a7fc3f3c744f5a9 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 4 Aug 2015 13:19:34 -0400 Subject: [PATCH 038/654] mon: disallow post-hammer OSDs if there are up pre-hammer OSDs Force *all* OSDs to upgrade to hammer before allowing post-hammer OSDs to join. This prevents any pre-hammer OSDs from running at the same time as a post-hammer OSD. This commit, as well as the definition of the sentinal post-hammer feature, should get backported to hammer stable series. Backport: hammer Signed-off-by: Sage Weil --- src/mon/OSDMonitor.cc | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 5e147f2478d3a..6c6ebc6fd3b11 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -1845,6 +1845,26 @@ bool OSDMonitor::preprocess_boot(MonOpRequestRef op) goto ignore; } + // make sure upgrades stop at hammer + // * OSD_PROXY_FEATURES is the last pre-hammer feature + // * MON_METADATA is the first post-hammer feature + if (osdmap.get_num_up_osds() > 0) { + if ((m->osd_features & CEPH_FEATURE_MON_METADATA) && + !(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_PROXY_FEATURES)) { + mon->clog->info() << "disallowing boot of post-hammer OSD " + << m->get_orig_source_inst() + << " because one or more up OSDs is pre-hammer\n"; + goto ignore; + } + if (!(m->osd_features & CEPH_FEATURE_OSD_PROXY_FEATURES) && + (osdmap.get_up_osd_features() & CEPH_FEATURE_MON_METADATA)) { + mon->clog->info() << "disallowing boot of pre-hammer OSD " + << m->get_orig_source_inst() + << " because all up OSDs are post-hammer\n"; + goto ignore; + } + } + // already booted? 
if (osdmap.is_up(from) && osdmap.get_inst(from) == m->get_orig_source_inst()) { From e7e4aefb95f015442e9fcb07b1a02c30d15e440f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 6 Aug 2015 15:43:51 -0400 Subject: [PATCH 039/654] osd: some debug output during store version upgrades Signed-off-by: Sage Weil --- src/os/FileStore.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index 5f2f0b04074e5..f2c28be969b78 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -1178,6 +1178,8 @@ int FileStore::version_stamp_is_valid(uint32_t *version) bl.push_back(bp); bufferlist::iterator i = bl.begin(); ::decode(*version, i); + dout(10) << __func__ << " was " << *version << " vs target " + << target_version << dendl; if (*version == target_version) return 1; else @@ -1186,6 +1188,7 @@ int FileStore::version_stamp_is_valid(uint32_t *version) int FileStore::write_version_stamp() { + dout(1) << __func__ << " " << target_version << dendl; bufferlist bl; ::encode(target_version, bl); @@ -1195,6 +1198,7 @@ int FileStore::write_version_stamp() int FileStore::upgrade() { + dout(1) << "upgrade" << dendl; uint32_t version; int r = version_stamp_is_valid(&version); if (r < 0) From 14d3a2b82bea668ee2e9898d0b612812d8f7e8cc Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 13 Aug 2015 12:53:06 -0400 Subject: [PATCH 040/654] os/FileStore: fix version check We need to do this when we first read the version, before we proceed with the mount. By the time we get to upgrade() it is too late (the DBObjectMap may have already tried a conversion, journal may have replayed, etc.). 
Signed-off-by: Sage Weil --- src/os/FileStore.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index f2c28be969b78..717aa17c24d55 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -1206,10 +1206,7 @@ int FileStore::upgrade() if (r == 1) return 0; - if (version < 4) { - derr << "ObjectStore is old at version " << version << ". Please upgrade to hammer v0.94.x first." << dendl; - return -EINVAL; - } + assert(version >= 4); // upgrade to hammer first // nothing necessary in FileStore for v3 -> v4 upgrade; we just need to // open up DBObjectMap with the do_upgrade flag, which we already did. @@ -1301,6 +1298,12 @@ int FileStore::mount() << cpp_strerror(ret) << dendl; goto close_fsid_fd; } else if (ret == 0) { + if (version_stamp < 4) { + derr << "FileStore is old at version " << version_stamp + << ". Please upgrade to hammer v0.94.x first." << dendl; + ret = -EINVAL; + goto close_fsid_fd; + } if (do_update || (int)version_stamp < g_conf->filestore_update_to) { derr << "FileStore::mount : stale version stamp detected: " << version_stamp From de00f6d0df159d51bd9dcec6a111e47b04a97c96 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Wed, 19 Aug 2015 11:43:13 +0800 Subject: [PATCH 041/654] doc: add the description for min_read_recency_for_promote This cache tiering option is introduced in proxy read. 
Signed-off-by: Zhiqiang Wang --- doc/dev/cache-pool.rst | 12 ++++++++++++ doc/rados/operations/cache-tiering.rst | 11 +++++++++++ 2 files changed, 23 insertions(+) diff --git a/doc/dev/cache-pool.rst b/doc/dev/cache-pool.rst index 5fa910f98355f..718a2b325408a 100644 --- a/doc/dev/cache-pool.rst +++ b/doc/dev/cache-pool.rst @@ -55,6 +55,7 @@ Set the target size and enable the tiering agent for foo-hot:: ceph osd pool set foo-hot hit_set_count 1 ceph osd pool set foo-hot hit_set_period 3600 # 1 hour ceph osd pool set foo-hot target_max_bytes 1000000000000 # 1 TB + ceph osd pool set foo-hot min_read_recency_for_promote 1 Drain the cache in preparation for turning it off:: @@ -112,6 +113,17 @@ evict cache objects, all hit_set_count HitSets are loaded into RAM. Currently there is minimal benefit for hit_set_count > 1 since the agent does not yet act intelligently on that information. +The ``min_read_recency_for_promote`` defines how many HitSets to check for the +existence of an object when handling a read operation. The checking result is +used to decide whether to promote the object asynchronously. Its value should be +between 0 and ``hit_set_count``. If it's set to 0, the object is always promoted. +If it's set to 1, the current HitSet is checked. And if this object is in the +current HitSet, it's promoted. Otherwise not. For the other values, the exact +number of archive HitSets are checked. The object is promoted if the object is +found in any of the most recent ``min_read_recency_for_promote`` HitSets. 
:: + + ceph osd pool set {cachepool} min_read_recency_for_promote 1 + Cache mode ~~~~~~~~~~ diff --git a/doc/rados/operations/cache-tiering.rst b/doc/rados/operations/cache-tiering.rst index 355d9369b6e92..e77137e7dc87c 100644 --- a/doc/rados/operations/cache-tiering.rst +++ b/doc/rados/operations/cache-tiering.rst @@ -186,6 +186,17 @@ Binning accesses over time allows Ceph to determine whether a Ceph client accessed an object at least once, or more than once over a time period ("age" vs "temperature"). +The ``min_read_recency_for_promote`` defines how many HitSets to check for the +existence of an object when handling a read operation. The checking result is +used to decide whether to promote the object asynchronously. Its value should be +between 0 and ``hit_set_count``. If it's set to 0, the object is always promoted. +If it's set to 1, the current HitSet is checked. And if this object is in the +current HitSet, it's promoted. Otherwise not. For the other values, the exact +number of archive HitSets are checked. The object is promoted if the object is +found in any of the most recent ``min_read_recency_for_promote`` HitSets. :: + + ceph osd pool set {cachepool} min_read_recency_for_promote 1 + .. note:: The longer the period and the higher the count, the more RAM the ``ceph-osd`` daemon consumes. 
In particular, when the agent is active to flush or evict cache objects, all ``hit_set_count`` HitSets are loaded From d5a56c497a844da8401661f24e68bb325be551e7 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Thu, 20 Aug 2015 10:58:45 +0800 Subject: [PATCH 042/654] doc: update some of the outdated cache tiering doc Signed-off-by: Zhiqiang Wang --- doc/dev/cache-pool.rst | 13 ++++++------- doc/rados/operations/cache-tiering.rst | 8 ++++---- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/doc/dev/cache-pool.rst b/doc/dev/cache-pool.rst index 718a2b325408a..2d22f1f3b7df0 100644 --- a/doc/dev/cache-pool.rst +++ b/doc/dev/cache-pool.rst @@ -105,13 +105,7 @@ The hit_set_count and hit_set_period define how much time each HitSet should cover, and how many such HitSets to store. Binning accesses over time allows Ceph to independently determine whether an object was accessed at least once and whether it was accessed more than once over -some time period ("age" vs "temperature"). Note that the longer the -period and the higher the count the more RAM will be consumed by the -ceph-osd process. In particular, when the agent is active to flush or -evict cache objects, all hit_set_count HitSets are loaded into RAM. - -Currently there is minimal benefit for hit_set_count > 1 since the -agent does not yet act intelligently on that information. +some time period ("age" vs "temperature"). The ``min_read_recency_for_promote`` defines how many HitSets to check for the existence of an object when handling a read operation. The checking result is @@ -124,6 +118,11 @@ found in any of the most recent ``min_read_recency_for_promote`` HitSets. :: ceph osd pool set {cachepool} min_read_recency_for_promote 1 +Note that the longer the ``hit_set_period`` and the higher the +``min_read_recency_for_promote`` the more RAM will be consumed by the ceph-osd +process. 
In particular, when the agent is active to flush or evict cache objects, +all hit_set_count HitSets are loaded into RAM. + Cache mode ~~~~~~~~~~ diff --git a/doc/rados/operations/cache-tiering.rst b/doc/rados/operations/cache-tiering.rst index e77137e7dc87c..f53526600d308 100644 --- a/doc/rados/operations/cache-tiering.rst +++ b/doc/rados/operations/cache-tiering.rst @@ -197,10 +197,10 @@ found in any of the most recent ``min_read_recency_for_promote`` HitSets. :: ceph osd pool set {cachepool} min_read_recency_for_promote 1 -.. note:: The longer the period and the higher the count, the more RAM the - ``ceph-osd`` daemon consumes. In particular, when the agent is active to - flush or evict cache objects, all ``hit_set_count`` HitSets are loaded - into RAM. +.. note:: The longer the period and the higher the min_read_recency_for_promote, + the more RAM the ``ceph-osd`` daemon consumes. In particular, when the agent + is active to flush or evict cache objects, all ``hit_set_count`` HitSets are + loaded into RAM. Cache Sizing From cdaf9974de6438ca64bb78936599c36bd0f3a525 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 20 Aug 2015 16:26:03 -0400 Subject: [PATCH 043/654] unittest_bufferlist: benchmark buffer::ptr::append, copy_in, copy_out Signed-off-by: Sage Weil --- src/test/bufferlist.cc | 56 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/src/test/bufferlist.cc b/src/test/bufferlist.cc index c660099b2dbbb..1221371599b81 100644 --- a/src/test/bufferlist.cc +++ b/src/test/bufferlist.cc @@ -633,6 +633,25 @@ TEST(BufferPtr, copy_out) { } } +TEST(BufferPtr, copy_out_bench) { + for (int s=1; s<=8; s*=2) { + utime_t start = ceph_clock_now(NULL); + int buflen = 1048576; + int count = 1000; + uint64_t v; + for (int i=0; i Date: Thu, 20 Aug 2015 15:50:24 -0400 Subject: [PATCH 044/654] buffer: move inline memory ops to inline_memory.h; gcc + x86_64 only Keep the architecture-sensitive code in a separate header. 
Avoid duplicating the unrolled memcpy in each buffer.cc method. Signed-off-by: Sage Weil --- src/common/buffer.cc | 162 ++---------------------------------- src/include/Makefile.am | 1 + src/include/inline_memory.h | 139 +++++++++++++++++++++++++++++++ 3 files changed, 147 insertions(+), 155 deletions(-) create mode 100644 src/include/inline_memory.h diff --git a/src/common/buffer.cc b/src/common/buffer.cc index f2148fdcdb509..eabc9e978a00b 100644 --- a/src/common/buffer.cc +++ b/src/common/buffer.cc @@ -24,6 +24,7 @@ #include "common/RWLock.h" #include "include/types.h" #include "include/compat.h" +#include "include/inline_memory.h" #if defined(HAVE_XIO) #include "msg/xio/XioMsg.h" #endif @@ -37,10 +38,6 @@ #include namespace ceph { -#if defined(__GNUC__) && defined(__x86_64__) - typedef unsigned uint128_t __attribute__ ((mode (TI))); -#endif - #ifdef BUFFER_DEBUG static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; # define bdout { simple_spin_lock(&buffer_debug_lock); std::cout @@ -786,32 +783,8 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; if (o+l > _len) throw end_of_buffer(); char* src = _raw->data + _off + o; - if (l > 8) { - memcpy(dest, src, l); - return; - } - switch (l) { - case 8: - *((uint64_t*)(dest)) = *((uint64_t*)(src)); - return; - case 4: - *((uint32_t*)(dest)) = *((uint32_t*)(src)); - return; - case 3: - *((uint16_t*)(dest)) = *((uint16_t*)(src)); - *((uint8_t*)(dest+2)) = *((uint8_t*)(src+2)); - return; - case 2: - *((uint16_t*)(dest)) = *((uint16_t*)(src)); - return; - case 1: - *((uint8_t*)(dest)) = *((uint8_t*)(src)); - return; - default: - memcpy(dest, src, l); - return; - } - } + maybe_inline_memcpy(dest, src, l, 8); + } unsigned buffer::ptr::wasted() { @@ -836,50 +809,7 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; bool buffer::ptr::is_zero() const { - const char* data = c_str(); - const char* max = data + _len; - const char* max32 = data + (_len / 
sizeof(uint32_t))*sizeof(uint32_t); -#if defined(__GNUC__) && defined(__x86_64__) - // we do have XMM registers in x86-64, so if we need to check at least - // 16 bytes, make use of them - int left = _len; - if (left / sizeof(uint128_t) > 0) { - // align data pointer to 16 bytes, otherwise it'll segfault due to bug - // in (at least some) GCC versions (using MOVAPS instead of MOVUPS). - // check up to 15 first bytes while at it. - while (((unsigned long long)data) & 15) { - if (*(uint8_t*)data != 0) { - return false; - } - data += sizeof(uint8_t); - left--; - } - - const char* max128 = data + (left / sizeof(uint128_t))*sizeof(uint128_t); - - while (data < max128) { - if (*(uint128_t*)data != 0) { - return false; - } - data += sizeof(uint128_t); - } - } -#endif - while (data < max32) { - if (*(uint32_t*)data != 0) { - return false; - } - data += sizeof(uint32_t); - } - - while (data < max) { - if (*(uint8_t*)data != 0) { - return false; - } - data += sizeof(uint8_t); - } - - return true; + return mem_is_zero(c_str(), _len); } unsigned buffer::ptr::append(char c) @@ -897,47 +827,8 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; assert(_raw); assert(l <= unused_tail_length()); char* c = _raw->data + _off + _len; - if (l <= 32) { - _len += l; - switch (l) { - case 16: - *((uint64_t*)(c)) = *((uint64_t*)(p)); - *((uint64_t*)(c+sizeof(uint64_t))) = *((uint64_t*)(p+sizeof(uint64_t))); - return _len + _off; - case 8: - *((uint64_t*)(c)) = *((uint64_t*)(p)); - return _len + _off; - case 4: - *((uint32_t*)(c)) = *((uint32_t*)(p)); - return _len + _off; - case 2: - *((uint16_t*)(c)) = *((uint16_t*)(p)); - return _len + _off; - case 1: - *((uint8_t*)(c)) = *((uint8_t*)(p)); - return _len + _off; - } - int cursor = 0; - while (l >= sizeof(uint64_t)) { - *((uint64_t*)(c + cursor)) = *((uint64_t*)(p + cursor)); - cursor += sizeof(uint64_t); - l -= sizeof(uint64_t); - } - while (l >= sizeof(uint32_t)) { - *((uint32_t*)(c + cursor)) = *((uint32_t*)(p + 
cursor)); - cursor += sizeof(uint32_t); - l -= sizeof(uint32_t); - } - while (l > 0) { - *(c+cursor) = *(p+cursor); - cursor++; - l--; - } - } - else { - memcpy(c, p, l); - _len += l; - } + maybe_inline_memcpy(c, p, l, 32); + _len += l; return _len + _off; } @@ -949,46 +840,7 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; char* dest = _raw->data + _off + o; if (crc_reset) _raw->invalidate_crc(); - if (l < 64) { - switch (l) { - case 1: - *((uint8_t*)(dest)) = *((uint8_t*)(src)); - return; - case 2: - *((uint16_t*)(dest)) = *((uint16_t*)(src)); - return; - case 3: - *((uint16_t*)(dest)) = *((uint16_t*)(src)); - *((uint8_t*)(dest+2)) = *((uint8_t*)(src+2)); - return; - case 4: - *((uint32_t*)(dest)) = *((uint32_t*)(src)); - return; - case 8: - *((uint64_t*)(dest)) = *((uint64_t*)(src)); - return; - default: - int cursor = 0; - while (l >= sizeof(uint64_t)) { - *((uint64_t*)(dest + cursor)) = *((uint64_t*)(src + cursor)); - cursor += sizeof(uint64_t); - l -= sizeof(uint64_t); - } - while (l >= sizeof(uint32_t)) { - *((uint32_t*)(dest + cursor)) = *((uint32_t*)(src + cursor)); - cursor += sizeof(uint32_t); - l -= sizeof(uint32_t); - } - while (l > 0) { - *(dest + cursor) = *(src + cursor); - cursor++; - l--; - } - return; - } - } else { - memcpy(dest, src, l); - } + maybe_inline_memcpy(dest, src, l, 64); } void buffer::ptr::zero(bool crc_reset) diff --git a/src/include/Makefile.am b/src/include/Makefile.am index b3ceb24bf1a58..56fa49ecf8d0e 100644 --- a/src/include/Makefile.am +++ b/src/include/Makefile.am @@ -68,6 +68,7 @@ noinst_HEADERS += \ include/filepath.h \ include/frag.h \ include/hash.h \ + include/inline_memory.h \ include/intarith.h \ include/interval_set.h \ include/int_types.h \ diff --git a/src/include/inline_memory.h b/src/include/inline_memory.h new file mode 100644 index 0000000000000..6e08e420e535b --- /dev/null +++ b/src/include/inline_memory.h @@ -0,0 +1,139 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef CEPH_INLINE_MEMORY_H +#define CEPH_INLINE_MEMORY_H + +// only define these for x86_64 for now. + +#if defined(__GNUC__) && defined(__x86_64__) + +typedef unsigned uint128_t __attribute__ ((mode (TI))); + +// optimize for the common case, which is very small copies +static inline void maybe_inline_memcpy(char *dest, const char *src, size_t l, + size_t inline_len) + __attribute__((always_inline)); + +void maybe_inline_memcpy(char *dest, const char *src, size_t l, + size_t inline_len) +{ + if (l > inline_len) { + memcpy(dest, src, l); + return; + } + switch (l) { + case 8: + *((uint64_t*)(dest)) = *((uint64_t*)(src)); + return; + case 4: + *((uint32_t*)(dest)) = *((uint32_t*)(src)); + return; + case 3: + *((uint16_t*)(dest)) = *((uint16_t*)(src)); + *((uint8_t*)(dest+2)) = *((uint8_t*)(src+2)); + return; + case 2: + *((uint16_t*)(dest)) = *((uint16_t*)(src)); + return; + case 1: + *((uint8_t*)(dest)) = *((uint8_t*)(src)); + return; + default: + int cursor = 0; + while (l >= sizeof(uint64_t)) { + *((uint64_t*)(dest + cursor)) = *((uint64_t*)(src + cursor)); + cursor += sizeof(uint64_t); + l -= sizeof(uint64_t); + } + while (l >= sizeof(uint32_t)) { + *((uint32_t*)(dest + cursor)) = *((uint32_t*)(src + cursor)); + cursor += sizeof(uint32_t); + l -= sizeof(uint32_t); + } + while (l > 0) { + *(dest + cursor) = *(src + cursor); + cursor++; + l--; + } + } +} + +static inline bool mem_is_zero(const char *data, size_t len) + __attribute__((always_inline)); + +bool mem_is_zero(const char *data, size_t len) +{ + const char *max = data + len; + const char* max32 = data + (len / 
sizeof(uint32_t))*sizeof(uint32_t); +#if defined(__GNUC__) && defined(__x86_64__) + // we do have XMM registers in x86-64, so if we need to check at least + // 16 bytes, make use of them + int left = len; + if (left / sizeof(uint128_t) > 0) { + // align data pointer to 16 bytes, otherwise it'll segfault due to bug + // in (at least some) GCC versions (using MOVAPS instead of MOVUPS). + // check up to 15 first bytes while at it. + while (((unsigned long long)data) & 15) { + if (*(uint8_t*)data != 0) { + return false; + } + data += sizeof(uint8_t); + left--; + } + + const char* max128 = data + (left / sizeof(uint128_t))*sizeof(uint128_t); + + while (data < max128) { + if (*(uint128_t*)data != 0) { + return false; + } + data += sizeof(uint128_t); + } + } +#endif + while (data < max32) { + if (*(uint32_t*)data != 0) { + return false; + } + data += sizeof(uint32_t); + } + while (data < max) { + if (*(uint8_t*)data != 0) { + return false; + } + data += sizeof(uint8_t); + } + return true; +} + +#else // x86_64 + +// on other architectures, default to something simple. + +#define maybe_inline_memcpy(d, s, l, x) memcpy(d, s, l) + +static inline bool mem_is_zero(const char *data, size_t len) { + const char *end = data + len; + while (data < end) { + if (*data != 0) { + return false; + } + ++data; + } + return true; +} + +#endif // !x86_64 + +#endif From d164ec151877445138ec47cc746e90c43a87a806 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 20 Aug 2015 17:18:48 -0400 Subject: [PATCH 045/654] perf_serialize: fix i386 build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test/perf_local.cc: In function ‘double perf_serialize()’: error: test/perf_local.cc:683:20: inconsistent operand constraints in an ‘asm’ : "a" (1U)); ^ on ubuntu 14.04. 
Signed-off-by: Sage Weil --- src/test/perf_local.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/perf_local.cc b/src/test/perf_local.cc index 7dc8e8e28ef69..9672be2553b0b 100644 --- a/src/test/perf_local.cc +++ b/src/test/perf_local.cc @@ -666,7 +666,7 @@ double perf_prefetch() #endif } -#if defined(__i386__) || defined(__x86_64__) +#if defined(__x86_64__) /** * This function is used to seralize machine instructions so that no * instructions that appear after it in the current thread can run before any @@ -686,7 +686,7 @@ static inline void serialize() { // Measure the cost of cpuid double perf_serialize() { -#if defined(__i386__) || defined(__x86_64__) +#if defined(__x86_64__) int count = 1000000; uint64_t start = Cycles::rdtsc(); for (int i = 0; i < count; i++) { From 5eb80b81c8d635934f0c18ca0b61794550801f12 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Fri, 21 Aug 2015 10:51:05 +0800 Subject: [PATCH 046/654] osd: do evict before flush in agent_work This is to avoid the extreme case that the agent continuously does flush, but not evict. This may lead to the cache pool to be full. 
Signed-off-by: Zhiqiang Wang --- src/osd/ReplicatedPG.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index c6958664b194c..7064b3a23d80c 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -10691,14 +10691,14 @@ bool ReplicatedPG::agent_work(int start_max, int agent_flush_quota) continue; } + if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE && + agent_maybe_evict(obc)) + ++started; if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE && agent_flush_quota > 0 && agent_maybe_flush(obc)) { ++started; --agent_flush_quota; } - if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE && - agent_maybe_evict(obc)) - ++started; if (started >= start_max) { // If finishing early, set "next" to the next object if (++p != ls.end()) From 1546d57d71e5ce65c8c0b000882176724ce1a5b2 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Fri, 21 Aug 2015 10:56:51 +0800 Subject: [PATCH 047/654] osd: do either flush or evict but not both in agent_work An cache pool object is either dirty or not. It's unlikely the agent will do both flush and evict at the same time for an object. 
Signed-off-by: Zhiqiang Wang --- src/osd/ReplicatedPG.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 7064b3a23d80c..c8d0dc1e8cb55 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -10694,8 +10694,8 @@ bool ReplicatedPG::agent_work(int start_max, int agent_flush_quota) if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE && agent_maybe_evict(obc)) ++started; - if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE && - agent_flush_quota > 0 && agent_maybe_flush(obc)) { + else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE && + agent_flush_quota > 0 && agent_maybe_flush(obc)) { ++started; --agent_flush_quota; } From 5e99a578050976ca22b549812ac80d494fe7041d Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Mon, 10 Aug 2015 04:25:03 -0700 Subject: [PATCH 048/654] mon: add a cache layer over MonitorDBStore the cache of of leveldb does not perform well under some condition, so we need a cache in our own stack. * add an option "mon_osd_cache_size" to control the size of cache size of MonitorDBStore. 
Fixes: #12638 Signed-off-by: Kefu Chai --- src/common/config_opts.h | 2 ++ src/mon/OSDMonitor.cc | 31 +++++++++++++++++++++++++++++++ src/mon/OSDMonitor.h | 14 ++++++++------ src/mon/PaxosService.h | 4 ++-- 4 files changed, 43 insertions(+), 8 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index d30e15e3428a9..f7ef5bc6a44a4 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -186,6 +186,8 @@ OPTION(mon_sync_fs_threshold, OPT_INT, 5) // sync() when writing this many obj OPTION(mon_compact_on_start, OPT_BOOL, false) // compact leveldb on ceph-mon start OPTION(mon_compact_on_bootstrap, OPT_BOOL, false) // trigger leveldb compaction on bootstrap OPTION(mon_compact_on_trim, OPT_BOOL, true) // compact (a prefix) when we trim old states +OPTION(mon_osd_cache_size, OPT_INT, 10) // the size of osdmaps cache, not to rely on underlying store's cache + OPTION(mon_tick_interval, OPT_INT, 5) OPTION(mon_subscribe_interval, OPT_DOUBLE, 300) OPTION(mon_delta_reset_interval, OPT_DOUBLE, 10) // seconds of inactivity before we reset the pg delta to 0 diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 6c2893d0f503e..7730670b84e55 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -71,6 +71,14 @@ static ostream& _prefix(std::ostream *_dout, Monitor *mon, OSDMap& osdmap) { << ").osd e" << osdmap.get_epoch() << " "; } +OSDMonitor::OSDMonitor(CephContext *cct, Monitor *mn, Paxos *p, const string& service_name) + : PaxosService(mn, p, service_name), + inc_osd_cache(g_conf->mon_osd_cache_size), + full_osd_cache(g_conf->mon_osd_cache_size), + thrash_map(0), thrash_last_up_osd(-1), + op_tracker(cct, true, 1) +{} + bool OSDMonitor::_have_pending_crush() { return pending_inc.crush.length(); @@ -2409,6 +2417,29 @@ void OSDMonitor::send_incremental(epoch_t first, } } +int OSDMonitor::get_version(version_t ver, bufferlist& bl) +{ + if (inc_osd_cache.lookup(ver, &bl)) { + return 0; + } + int ret = 
PaxosService::get_version(ver, bl); + if (!ret) { + inc_osd_cache.add(ver, bl); + } + return ret; +} + +int OSDMonitor::get_version_full(version_t ver, bufferlist& bl) +{ + if (full_osd_cache.lookup(ver, &bl)) { + return 0; + } + int ret = PaxosService::get_version_full(ver, bl); + if (!ret) { + full_osd_cache.add(ver, bl); + } + return ret; +} epoch_t OSDMonitor::blacklist(const entity_addr_t& a, utime_t until) { diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index ef9f15779c021..a185954e11880 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -26,6 +26,7 @@ using namespace std; #include "include/types.h" +#include "common/simple_cache.hpp" #include "msg/Messenger.h" #include "osd/OSDMap.h" @@ -136,6 +137,9 @@ class OSDMonitor : public PaxosService { map osd_weight; + SimpleLRU inc_osd_cache; + SimpleLRU full_osd_cache; + void check_failures(utime_t now); bool check_failure(utime_t now, int target_osd, failure_info_t& fi); @@ -157,7 +161,6 @@ class OSDMonitor : public PaxosService { void encode_pending(MonitorDBStore::TransactionRef t); void on_active(); void on_shutdown(); - /** * we haven't delegated full version stashing to paxosservice for some time * now, making this function useless in current context. 
@@ -387,11 +390,7 @@ class OSDMonitor : public PaxosService { int load_metadata(int osd, map& m, ostream *err); public: - OSDMonitor(CephContext *cct, Monitor *mn, Paxos *p, string service_name) - : PaxosService(mn, p, service_name), - thrash_map(0), thrash_last_up_osd(-1), - op_tracker(cct, true, 1) - { } + OSDMonitor(CephContext *cct, Monitor *mn, Paxos *p, const string& service_name); void tick(); // check state, take actions @@ -417,6 +416,9 @@ class OSDMonitor : public PaxosService { send_incremental(op, start); } + int get_version(version_t ver, bufferlist& bl) override; + int get_version_full(version_t ver, bufferlist& bl) override; + epoch_t blacklist(const entity_addr_t& a, utime_t until); void dump_info(Formatter *f); diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h index c7f6cf919f932..54d26a69501df 100644 --- a/src/mon/PaxosService.h +++ b/src/mon/PaxosService.h @@ -892,7 +892,7 @@ class PaxosService { * @param bl The bufferlist to be populated * @return 0 on success; <0 otherwise */ - int get_version(version_t ver, bufferlist& bl) { + virtual int get_version(version_t ver, bufferlist& bl) { return mon->store->get(get_service_name(), ver, bl); } /** @@ -902,7 +902,7 @@ class PaxosService { * @param bl The bufferlist to be populated * @returns 0 on success; <0 otherwise */ - int get_version_full(version_t ver, bufferlist& bl) { + virtual int get_version_full(version_t ver, bufferlist& bl) { string key = mon->store->combine_strings(full_prefix_name, ver); return mon->store->get(get_service_name(), key, bl); } From 7c08c54fec0f95d309242a04270e76adce2bfc5f Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Fri, 21 Aug 2015 21:14:31 +0200 Subject: [PATCH 049/654] rgw: fix dangerous removal from STL map in filter_out_temp_url(). 
Fixes: #12750 Signed-off-by: Radoslaw Zarzynski --- src/rgw/rgw_op.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index c3827714d0f1e..d5e167935121d 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -2181,17 +2181,17 @@ void RGWPutMetadataAccount::filter_out_temp_url(map& add_att map& temp_url_keys) { map::iterator iter; - for (iter = add_attrs.begin(); iter != add_attrs.end(); ++iter) { - const string name = iter->first; - if (name.compare(RGW_ATTR_TEMPURL_KEY1) == 0) { - temp_url_keys[0] = iter->second.c_str(); - add_attrs.erase(name); - } - if (name.compare(RGW_ATTR_TEMPURL_KEY2) == 0) { - temp_url_keys[1] = iter->second.c_str(); - add_attrs.erase(name); - } + iter = add_attrs.find(RGW_ATTR_TEMPURL_KEY1); + if (iter != add_attrs.end()) { + temp_url_keys[0] = iter->second.c_str(); + add_attrs.erase(iter); + } + + iter = add_attrs.find(RGW_ATTR_TEMPURL_KEY2); + if (iter != add_attrs.end()) { + temp_url_keys[1] = iter->second.c_str(); + add_attrs.erase(iter); } set::const_iterator riter; From ea8609b25a2775ab997459978a8e871915eba853 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 21 Aug 2015 13:16:43 -0400 Subject: [PATCH 050/654] mon/OSDMonitor: debug why pool creation fails Signed-off-by: Sage Weil --- src/mon/OSDMonitor.cc | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 6c6ebc6fd3b11..6e2f14c20171a 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -4451,24 +4451,32 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, int r; r = prepare_pool_crush_ruleset(pool_type, erasure_code_profile, crush_ruleset_name, &crush_ruleset, ss); - if (r) + if (r) { + dout(10) << " prepare_pool_crush_ruleset returns " << r << dendl; return r; + } CrushWrapper newcrush; _get_pending_crush(newcrush); CrushTester tester(newcrush, *ss); r = 
tester.test_with_crushtool(g_conf->crushtool.c_str(), osdmap.get_max_osd(), g_conf->mon_lease); - if (r) + if (r) { + dout(10) << " tester.test_with_crushtool returns " << r << dendl; return r; + } unsigned size, min_size; r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss); - if (r) + if (r) { + dout(10) << " prepare_pool_size returns " << r << dendl; return r; + } uint32_t stripe_width = 0; r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss); - if (r) + if (r) { + dout(10) << " prepare_pool_stripe_width returns " << r << dendl; return r; + } for (map::iterator p = pending_inc.new_pool_names.begin(); p != pending_inc.new_pool_names.end(); From 0f82f461b33d93d868e185912a2c7e4074d06900 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 21 Aug 2015 16:40:34 -0400 Subject: [PATCH 051/654] crush/CrushTester: test fewer inputs when running crushtool If there are a lot of crush rules (say, 100) then the test can take a long time. 100 values per rule should be enough to catch most issues. 
Signed-off-by: Sage Weil --- src/crush/CrushTester.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/crush/CrushTester.cc b/src/crush/CrushTester.cc index bf303cf074062..209d03a9f1019 100644 --- a/src/crush/CrushTester.cc +++ b/src/crush/CrushTester.cc @@ -359,7 +359,12 @@ int CrushTester::test_with_crushtool(const char *crushtool_cmd, int max_id, int { SubProcessTimed crushtool(crushtool_cmd, true, false, true, timeout); string opt_max_id = boost::lexical_cast(max_id); - crushtool.add_cmd_args("-i", "-", "--test", "--check", opt_max_id.c_str(), NULL); + crushtool.add_cmd_args( + "-i", "-", + "--test", "--check", opt_max_id.c_str(), + "--min-x", "1", + "--max-x", "50", + NULL); int ret = crushtool.spawn(); if (ret != 0) { err << "failed run crushtool: " << crushtool.err(); From 42f8c5daad16aa849a0b99871d50161673c0c370 Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Fri, 5 Jun 2015 21:06:48 +0800 Subject: [PATCH 052/654] osd: use GMT time for the object name of hitsets * bump the encoding version of pg_hit_set_info_t to 2, so we can tell if the corresponding hit_set is named using localtime or GMT * bump the encoding version of pg_pool_t to 20, so we can know if a pool is using GMT to name the hit_set archive or not. and we can tell if current cluster allows OSDs not support GMT mode or not. * add an option named `osd_pool_use_gmt_hitset`. if enabled, the cluster will try to use GMT mode when creating a new pool if all the the up OSDs support GMT mode. if any of the pools in the cluster is using GMT mode, then only OSDs supporting GMT mode are allowed to join the cluster. 
Fixes: #9732 Signed-off-by: Kefu Chai --- src/common/config_opts.h | 1 + src/include/ceph_features.h | 1 + src/mon/OSDMonitor.cc | 19 ++++++++++++++++++- src/osd/ReplicatedPG.cc | 30 ++++++++++++++++++++---------- src/osd/ReplicatedPG.h | 4 +++- src/osd/osd_types.cc | 15 +++++++++++++-- src/osd/osd_types.h | 11 +++++++---- 7 files changed, 63 insertions(+), 18 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index d30e15e3428a9..4d25c5abe4110 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -533,6 +533,7 @@ OPTION(osd_client_message_cap, OPT_U64, 100) // num client messages OPTION(osd_pg_bits, OPT_INT, 6) // bits per osd OPTION(osd_pgp_bits, OPT_INT, 6) // bits per osd OPTION(osd_crush_chooseleaf_type, OPT_INT, 1) // 1 = host +OPTION(osd_pool_use_gmt_hitset, OPT_BOOL, true) // try to use gmt for hitset archive names if all osds in cluster support it. OPTION(osd_pool_default_crush_rule, OPT_INT, -1) // deprecated for osd_pool_default_crush_replicated_ruleset OPTION(osd_pool_default_crush_replicated_ruleset, OPT_INT, CEPH_DEFAULT_CRUSH_REPLICATED_RULESET) OPTION(osd_pool_erasure_code_stripe_width, OPT_U32, OSD_POOL_ERASURE_CODE_STRIPE_WIDTH) // in bytes diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h index 0cfc20add7aeb..81ff511c23250 100644 --- a/src/include/ceph_features.h +++ b/src/include/ceph_features.h @@ -65,6 +65,7 @@ #define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */ #define CEPH_FEATURE_MON_METADATA (1ULL<<50) #define CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT (1ULL<<51) /* can sort objs bitwise */ +#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<51) /* overlap with bitwise sort */ #define CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES (1ULL<<52) #define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 95d57dbf52424..1e67c221f90dc 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -16,6 +16,7 @@ * */ 
+#include #include #include @@ -1847,6 +1848,20 @@ bool OSDMonitor::preprocess_boot(MonOpRequestRef op) goto ignore; } + if (any_of(osdmap.get_pools().begin(), + osdmap.get_pools().end(), + [](const std::pair& pool) + { return pool.second.use_gmt_hitset; })) { + assert(osdmap.get_num_up_osds() == 0 || + osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT); + if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) { + dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at " + << m->get_orig_source_inst() + << " doesn't announce support -- ignore" << dendl; + goto ignore; + } + } + // already booted? if (osdmap.is_up(from) && osdmap.get_inst(from) == m->get_orig_source_inst()) { @@ -4421,7 +4436,9 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE); if (g_conf->osd_pool_default_flag_nosizechange) pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE); - + if (g_conf->osd_pool_use_gmt_hitset && + (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)) + pi->use_gmt_hitset = true; pi->size = size; pi->min_size = min_size; pi->crush_ruleset = crush_ruleset; diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 08a29c735af7a..eeb1f2998d548 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -1156,7 +1156,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op) p != info.hit_set.history.end(); ++p) { if (stamp >= p->begin && stamp <= p->end) { - oid = get_hit_set_archive_object(p->begin, p->end); + oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); break; } } @@ -10535,10 +10535,19 @@ hobject_t ReplicatedPG::get_hit_set_current_object(utime_t stamp) return hoid; } -hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start, utime_t end) +hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start, + utime_t end, + bool using_gmt) { ostringstream ss; - ss << "hit_set_" << info.pgid.pgid << "_archive_" << start << "_" << end; + ss << "hit_set_" << 
info.pgid.pgid << "_archive_"; + if (using_gmt) { + start.gmtime(ss) << "_"; + end.gmtime(ss); + } else { + start.localtime(ss) << "_"; + end.localtime(ss); + } hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "", info.pgid.ps(), info.pgid.pool(), cct->_conf->osd_hit_set_namespace); @@ -10583,7 +10592,7 @@ void ReplicatedPG::hit_set_remove_all() for (list::iterator p = info.hit_set.history.begin(); p != info.hit_set.history.end(); ++p) { - hobject_t aoid = get_hit_set_archive_object(p->begin, p->end); + hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); // Once we hit a degraded object just skip if (is_degraded_or_backfilling_object(aoid)) @@ -10595,7 +10604,7 @@ void ReplicatedPG::hit_set_remove_all() if (!info.hit_set.history.empty()) { list::reverse_iterator p = info.hit_set.history.rbegin(); assert(p != info.hit_set.history.rend()); - hobject_t oid = get_hit_set_archive_object(p->begin, p->end); + hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); assert(!is_degraded_or_backfilling_object(oid)); ObjectContextRef obc = get_object_context(oid, false); assert(obc); @@ -10713,7 +10722,7 @@ void ReplicatedPG::hit_set_persist() for (list::iterator p = info.hit_set.history.begin(); p != info.hit_set.history.end(); ++p) { - hobject_t aoid = get_hit_set_archive_object(p->begin, p->end); + hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); // Once we hit a degraded object just skip further trim if (is_degraded_or_backfilling_object(aoid)) @@ -10725,7 +10734,8 @@ void ReplicatedPG::hit_set_persist() utime_t start = info.hit_set.current_info.begin; if (!start) start = hit_set_start_stamp; - oid = get_hit_set_archive_object(start, now); + oid = get_hit_set_archive_object(start, now, pool.info.use_gmt_hitset); + // If the current object is degraded we skip this persist request if (scrubber.write_blocked_by_scrub(oid, get_sort_bitwise())) return; @@ -10816,7 +10826,7 @@ void 
ReplicatedPG::hit_set_persist() updated_hit_set_hist.history.push_back(updated_hit_set_hist.current_info); hit_set_create(); - updated_hit_set_hist.current_info = pg_hit_set_info_t(); + updated_hit_set_hist.current_info = pg_hit_set_info_t(pool.info.use_gmt_hitset); updated_hit_set_hist.current_last_stamp = utime_t(); // fabricate an object_info_t and SnapSet @@ -10879,7 +10889,7 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max) for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) { list::iterator p = updated_hit_set_hist.history.begin(); assert(p != updated_hit_set_hist.history.end()); - hobject_t oid = get_hit_set_archive_object(p->begin, p->end); + hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); assert(!is_degraded_or_backfilling_object(oid)); @@ -11168,7 +11178,7 @@ void ReplicatedPG::agent_load_hit_sets() continue; } - hobject_t oid = get_hit_set_archive_object(p->begin, p->end); + hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); if (is_unreadable_object(oid)) { dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl; break; diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index ab24da4199f1c..efd81d2704248 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -959,7 +959,9 @@ class ReplicatedPG : public PG, public PGBackend::Listener { void hit_set_remove_all(); hobject_t get_hit_set_current_object(utime_t stamp); - hobject_t get_hit_set_archive_object(utime_t start, utime_t end); + hobject_t get_hit_set_archive_object(utime_t start, + utime_t end, + bool using_gmt); // agent boost::scoped_ptr agent_state; diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index ad68118fd41ff..975d51391cd8e 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -942,6 +942,7 @@ void pg_pool_t::dump(Formatter *f) const f->close_section(); // hit_set_params f->dump_unsigned("hit_set_period", hit_set_period); 
f->dump_unsigned("hit_set_count", hit_set_count); + f->dump_bool("use_gmt_hitset", use_gmt_hitset); f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote); f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote); f->dump_unsigned("stripe_width", get_stripe_width()); @@ -1299,6 +1300,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const ::encode(expected_num_objects, bl); ::encode(cache_target_dirty_high_ratio_micro, bl); ::encode(min_write_recency_for_promote, bl); + ::encode(use_gmt_hitset, bl); ENCODE_FINISH(bl); } @@ -1423,8 +1425,10 @@ void pg_pool_t::decode(bufferlist::iterator& bl) } if (struct_v >= 20) { ::decode(min_write_recency_for_promote, bl); + ::decode(use_gmt_hitset, bl); } else { min_write_recency_for_promote = 1; + use_gmt_hitset = false; } DECODE_FINISH(bl); calc_pg_masks(); @@ -3931,19 +3935,25 @@ void pg_create_t::generate_test_instances(list& o) void pg_hit_set_info_t::encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); ::encode(begin, bl); ::encode(end, bl); ::encode(version, bl); + ::encode(using_gmt, bl); ENCODE_FINISH(bl); } void pg_hit_set_info_t::decode(bufferlist::iterator& p) { - DECODE_START(1, p); + DECODE_START(2, p); ::decode(begin, p); ::decode(end, p); ::decode(version, p); + if (struct_v >= 2) { + ::decode(using_gmt, p); + } else { + using_gmt = false; + } DECODE_FINISH(p); } @@ -3952,6 +3962,7 @@ void pg_hit_set_info_t::dump(Formatter *f) const f->dump_stream("begin") << begin; f->dump_stream("end") << end; f->dump_stream("version") << version; + f->dump_stream("using_gmt") << using_gmt; } void pg_hit_set_info_t::generate_test_instances(list& ls) diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index b94f547cea71d..0d1327ed4ca7d 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -1111,6 +1111,7 @@ struct pg_pool_t { HitSet::Params hit_set_params; ///< The HitSet params to use on this pool uint32_t hit_set_period; ///< 
periodicity of HitSet segments (seconds) uint32_t hit_set_count; ///< number of periods to retain + bool use_gmt_hitset; ///< use gmt to name the hitset archive object uint32_t min_read_recency_for_promote; ///< minimum number of HitSet to check before promote on read uint32_t min_write_recency_for_promote; ///< minimum number of HitSet to check before promote on write @@ -1141,6 +1142,7 @@ struct pg_pool_t { hit_set_params(), hit_set_period(0), hit_set_count(0), + use_gmt_hitset(true), min_read_recency_for_promote(0), min_write_recency_for_promote(0), stripe_width(0), @@ -1717,10 +1719,11 @@ WRITE_CLASS_ENCODER_FEATURES(pool_stat_t) struct pg_hit_set_info_t { utime_t begin, end; ///< time interval eversion_t version; ///< version this HitSet object was written - - pg_hit_set_info_t() {} - pg_hit_set_info_t(utime_t b) - : begin(b) {} + bool using_gmt; ///< use gmt for creating the hit_set archive object name + pg_hit_set_info_t(bool using_gmt = true) + : using_gmt(using_gmt) {} + pg_hit_set_info_t(utime_t b, bool using_gmt) + : begin(b), using_gmt(using_gmt) {} void encode(bufferlist &bl) const; void decode(bufferlist::iterator &bl); From 03a1a3cf023a9aeb2fa26820e49e5efe3f3b3789 Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Tue, 18 Aug 2015 00:04:23 +0800 Subject: [PATCH 053/654] mon: add "ceph osd pool set $pool use_gmt_hitset true" cmd allow "ceph osd pool set $pool use_gmt_hitset " as long as the cluster supports gmt hitset. 
Fixes: #9732 Signed-off-by: Kefu Chai --- src/mon/MonCommands.h | 2 +- src/mon/OSDMonitor.cc | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index a9d0b38dca3f4..1bf238787b9b6 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -678,7 +678,7 @@ COMMAND("osd pool get " \ "get pool parameter ", "osd", "r", "cli,rest") COMMAND("osd pool set " \ "name=pool,type=CephPoolname " \ - "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|write_fadvise_dontneed|min_write_recency_for_promote " \ + "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|write_fadvise_dontneed|min_write_recency_for_promote " \ "name=val,type=CephString " \ "name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \ "set pool parameter to ", "osd", "rw", "cli,rest") diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 1e67c221f90dc..2d07e311372b7 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -4786,6 +4786,17 @@ int OSDMonitor::prepare_command_pool_set(map &cmdmap, } BloomHitSet::Params *bloomp = static_cast(p.hit_set_params.impl.get()); bloomp->set_fpp(f); + } else if (var == "use_gmt_hitset") { + if (val == "true" || (interr.empty() && n == 1)) { + if 
(!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)) { + ss << "not all OSDs support GMT hit set."; + return -EINVAL; + } + p.use_gmt_hitset = true; + } else { + ss << "expecting value 'true' or '1'"; + return -EINVAL; + } } else if (var == "debug_fake_ec_pool") { if (val == "true" || (interr.empty() && n == 1)) { p.flags |= pg_pool_t::FLAG_DEBUG_FAKE_EC_POOL; From cc2bcf760f2d2f20fc4a2fa78ba52475a23e55bf Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Tue, 18 Aug 2015 16:53:50 +0800 Subject: [PATCH 054/654] mon: print use_gmt_hitset in "ceph osd pool get" Signed-off-by: Kefu Chai --- src/mon/OSDMonitor.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 2d07e311372b7..5d720277fd8b9 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -2824,7 +2824,7 @@ namespace { enum osd_pool_get_choices { SIZE, MIN_SIZE, CRASH_REPLAY_INTERVAL, PG_NUM, PGP_NUM, CRUSH_RULESET, HIT_SET_TYPE, - HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP, + HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP, USE_GMT_HITSET, AUID, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO, CACHE_TARGET_FULL_RATIO, @@ -3272,6 +3272,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op) ("pg_num", PG_NUM)("pgp_num", PGP_NUM)("crush_ruleset", CRUSH_RULESET) ("hit_set_type", HIT_SET_TYPE)("hit_set_period", HIT_SET_PERIOD) ("hit_set_count", HIT_SET_COUNT)("hit_set_fpp", HIT_SET_FPP) + ("use_gmt_hitset", USE_GMT_HITSET) ("auid", AUID)("target_max_objects", TARGET_MAX_OBJECTS) ("target_max_bytes", TARGET_MAX_BYTES) ("cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO) @@ -3389,6 +3390,9 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op) } } break; + case USE_GMT_HITSET: + f->dump_bool("use_gmt_hitset", p->use_gmt_hitset); + break; case TARGET_MAX_OBJECTS: f->dump_unsigned("target_max_objects", p->target_max_objects); break; @@ -3490,6 +3494,9 @@ bool 
OSDMonitor::preprocess_command(MonOpRequestRef op) } } break; + case USE_GMT_HITSET: + ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n"; + break; case TARGET_MAX_OBJECTS: ss << "target_max_objects: " << p->target_max_objects << "\n"; break; From ff087f9e01b3cb4a3cee1232b5d8096b99a8100b Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Fri, 12 Jun 2015 20:53:44 +0800 Subject: [PATCH 055/654] osd: remove useless hitset code * we do not persist current hit set using get_hit_set_current_object() anymore, instead we always append current hitset into archive even !hitset.is_full(), see fbd9c15. so it's not necessary to remove the current hitset object before putting it to disk. Signed-off-by: Kefu Chai --- src/osd/ReplicatedPG.cc | 37 +------------------------------------ 1 file changed, 1 insertion(+), 36 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index eeb1f2998d548..13e32bae9c90d 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -10785,43 +10785,8 @@ void ReplicatedPG::hit_set_persist() ctx->updated_hset_history = info.hit_set; pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history); - if (updated_hit_set_hist.current_last_stamp != utime_t()) { - // FIXME: we cheat slightly here by bundling in a remove on a object - // other the RepGather object. we aren't carrying an ObjectContext for - // the deleted object over this period. 
- hobject_t old_obj = - get_hit_set_current_object(updated_hit_set_hist.current_last_stamp); - ctx->log.push_back( - pg_log_entry_t(pg_log_entry_t::DELETE, - old_obj, - ctx->at_version, - updated_hit_set_hist.current_last_update, - 0, - osd_reqid_t(), - ctx->mtime)); - if (pool.info.require_rollback()) { - if (ctx->log.back().mod_desc.rmobject(ctx->at_version.version)) { - ctx->op_t->stash(old_obj, ctx->at_version.version); - } else { - ctx->op_t->remove(old_obj); - } - } else { - ctx->op_t->remove(old_obj); - ctx->log.back().mod_desc.mark_unrollbackable(); - } - ++ctx->at_version.version; - - struct stat st; - int r = osd->store->stat( - coll, - ghobject_t(old_obj, ghobject_t::NO_GEN, pg_whoami.shard), - &st); - assert(r == 0); - --ctx->delta_stats.num_objects; - ctx->delta_stats.num_bytes -= st.st_size; - } - updated_hit_set_hist.current_last_update = info.last_update; // *after* above remove! + updated_hit_set_hist.current_last_update = info.last_update; updated_hit_set_hist.current_info.version = ctx->at_version; updated_hit_set_hist.history.push_back(updated_hit_set_hist.current_info); From 2c7fe2bc2d248830115369816aa029f4351375b4 Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Sat, 15 Aug 2015 10:08:38 -0700 Subject: [PATCH 056/654] osd: remove unused ctor Signed-off-by: Kefu Chai --- src/osd/osd_types.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 0d1327ed4ca7d..cf67153c91168 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -1722,8 +1722,6 @@ struct pg_hit_set_info_t { bool using_gmt; ///< use gmt for creating the hit_set archive object name pg_hit_set_info_t(bool using_gmt = true) : using_gmt(using_gmt) {} - pg_hit_set_info_t(utime_t b, bool using_gmt) - : begin(b), using_gmt(using_gmt) {} void encode(bufferlist &bl) const; void decode(bufferlist::iterator &bl); From 151c0511050c196003d0737858e0bf3ea9b665a4 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 23 Aug 2015 14:02:59 -0400 
Subject: [PATCH 057/654] common/Mutex: avoid trylock on lock if instrumentation is not enabled Benchmarks have shown that the trylock in the lock path has a high latency cost. Only pay the penalty if instrumentation is actually enabled. While we are at it, avoid the duplicate conditional check so that the fast path is faster. Signed-off-by: Sage Weil --- src/common/Mutex.cc | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/common/Mutex.cc b/src/common/Mutex.cc index cedba098128cb..5e9b590a5395e 100644 --- a/src/common/Mutex.cc +++ b/src/common/Mutex.cc @@ -82,21 +82,26 @@ Mutex::~Mutex() { } void Mutex::Lock(bool no_lockdep) { - utime_t start; int r; if (lockdep && g_lockdep && !no_lockdep) _will_lock(); - if (TryLock()) { - goto out; - } - - if (logger && cct && cct->_conf->mutex_perf_counter) + if (logger && cct && cct->_conf->mutex_perf_counter) { + utime_t start; + // instrumented mutex enabled start = ceph_clock_now(cct); - r = pthread_mutex_lock(&_m); - if (logger && cct && cct->_conf->mutex_perf_counter) + if (TryLock()) { + goto out; + } + + r = pthread_mutex_lock(&_m); + logger->tinc(l_mutex_wait, ceph_clock_now(cct) - start); + } else { + r = pthread_mutex_lock(&_m); + } + assert(r == 0); if (lockdep && g_lockdep) _locked(); _post_lock(); From d63508f72719c769922a4ebbdca4c4a8ee924e87 Mon Sep 17 00:00:00 2001 From: "Robin H. Johnson" Date: Mon, 24 Aug 2015 03:37:32 +0000 Subject: [PATCH 058/654] make-check: support MAKEOPTS overrides. It is useful in some cases to exercise more control over the parameters passed to make for make check. I am using it to passing a load limit parameter to make. Signed-off-by: Robin H. 
Johnson --- run-make-check.sh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/run-make-check.sh b/run-make-check.sh index 0d9a2943af22b..4c07ec1e3a2f7 100755 --- a/run-make-check.sh +++ b/run-make-check.sh @@ -41,6 +41,14 @@ function get_processors() { fi } +DEFAULT_MAKEOPTS=${DEFAULT_MAKEOPTS:--j$(get_processors)} +BUILD_MAKEOPTS=${BUILD_MAKEOPTS:-$DEFAULT_MAKEOPTS} +if can_parallel_make_check ; then + CHECK_MAKEOPTS=${CHECK_MAKEOPTS:-$DEFAULT_MAKEOPTS} +else + CHECK_MAKEOPTS="" +fi + function run() { # Same logic as install-deps.sh for finding package installer local install_cmd @@ -60,8 +68,8 @@ function run() { $DRY_RUN ./autogen.sh || return 1 $DRY_RUN ./configure "$@" --disable-static --with-radosgw --with-debug --without-lttng \ CC="ccache gcc" CXX="ccache g++" CFLAGS="-Wall -g" CXXFLAGS="-Wall -g" || return 1 - $DRY_RUN make -j$(get_processors) || return 1 - $DRY_RUN make $(maybe_parallel_make_check) check || return 1 + $DRY_RUN make $BUILD_MAKEOPTS || return 1 + $DRY_RUN make $CHECK_MAKEOPTS check || return 1 $DRY_RUN make dist || return 1 } From c5895d3fad9da0ab7f05f134c49e22795d5c61f3 Mon Sep 17 00:00:00 2001 From: Xinze Chi Date: Sun, 2 Aug 2015 18:36:40 +0800 Subject: [PATCH 059/654] bug fix: osd: do not cache unused buffer in attrs attrs only reference the origin bufferlist (decode from MOSDPGPush or ECSubReadReply message) whose size is much greater than attrs in recovery. If obc cache it (get_obc maybe cache the attr), this causes the whole origin bufferlist would not be free until obc is evicted from obc cache. So rebuild the bufferlist before cache it. 
Fixes: #12565 Signed-off-by: Ning Yao Signed-off-by: Xinze Chi --- src/osd/ECBackend.cc | 9 +++++++++ src/osd/ReplicatedBackend.cc | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index 47f326efaf176..0899bc86c1c82 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -372,6 +372,15 @@ void ECBackend::handle_recovery_read_complete( op.xattrs.swap(*attrs); if (!op.obc) { + // attrs only reference the origin bufferlist (decode from ECSubReadReply message) + // whose size is much greater than attrs in recovery. If obc cache it (get_obc maybe + // cache the attr), this causes the whole origin bufferlist would not be free until + // obc is evicted from obc cache. So rebuild the bufferlist before cache it. + for (map::iterator it = op.xattrs.begin(); + it != op.xattrs.end(); + ++it) { + it->second.rebuild(); + } op.obc = get_parent()->get_obc(hoid, op.xattrs); op.recovery_info.size = op.obc->obs.oi.size; op.recovery_info.oi = op.obc->obs.oi; diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc index 5ddc9fd311efc..e1cf238b2b42a 100644 --- a/src/osd/ReplicatedBackend.cc +++ b/src/osd/ReplicatedBackend.cc @@ -1811,6 +1811,15 @@ bool ReplicatedBackend::handle_pull_response( bool first = pi.recovery_progress.first; if (first) { + // attrs only reference the origin bufferlist (decode from MOSDPGPush message) + // whose size is much greater than attrs in recovery. If obc cache it (get_obc maybe + // cache the attr), this causes the whole origin bufferlist would not be free until + // obc is evicted from obc cache. So rebuild the bufferlist before cache it. 
+ for (map::iterator it = pop.attrset.begin(); + it != pop.attrset.end(); + ++it) { + it->second.rebuild(); + } pi.obc = get_parent()->get_obc(pi.recovery_info.soid, pop.attrset); pi.recovery_info.oi = pi.obc->obs.oi; pi.recovery_info = recalc_subsets(pi.recovery_info, pi.obc->ssc); From 136242b5612b8bbf260910b1678389361e86d22a Mon Sep 17 00:00:00 2001 From: Abhishek Lekshmanan Date: Mon, 24 Aug 2015 23:11:35 +0530 Subject: [PATCH 060/654] rgw: be more flexible with iso8601 timestamps make parsing 8601 more flexible by not restricting the length of seconds to 5, this allows timestamp to be specified both as ms or us. Newer keystone backends such as fernet token backend default to microseconds when publishing iso8601 timestamps, so this allows these timestamps to be allowed when specifying the token expiry time. Fixes: #12761 Reported-by: Ian Unruh Signed-off-by: Abhishek Lekshmanan --- src/rgw/rgw_common.cc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc index c06e5e17ef4ba..b76c328c3343a 100644 --- a/src/rgw/rgw_common.cc +++ b/src/rgw/rgw_common.cc @@ -357,18 +357,17 @@ bool parse_iso8601(const char *s, struct tm *t) } string str; trim_whitespace(p, str); - if (str.size() == 1 && str[0] == 'Z') + int len = str.size(); + + if (len == 1 && str[0] == 'Z') return true; - if (str.size() != 5) { - return false; - } if (str[0] != '.' || - str[str.size() - 1] != 'Z') + str[len - 1] != 'Z') return false; uint32_t ms; - int r = stringtoul(str.substr(1, 3), &ms); + int r = stringtoul(str.substr(1, len - 2), &ms); if (r < 0) return false; From 8ef2c9654e64c35f94e4fc4b4c75f3b8d833f758 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Thu, 20 Aug 2015 16:31:25 -0700 Subject: [PATCH 061/654] buffer: modify inline memory ops to use packed structs packed structs can be used to ensure alignment is not an issue. 
Signed-off-by: Yehuda Sadeh --- src/include/inline_memory.h | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/src/include/inline_memory.h b/src/include/inline_memory.h index 6e08e420e535b..65996f69bd676 100644 --- a/src/include/inline_memory.h +++ b/src/include/inline_memory.h @@ -14,11 +14,11 @@ #ifndef CEPH_INLINE_MEMORY_H #define CEPH_INLINE_MEMORY_H -// only define these for x86_64 for now. +#if defined(__GNUC__) -#if defined(__GNUC__) && defined(__x86_64__) - -typedef unsigned uint128_t __attribute__ ((mode (TI))); +typedef struct __attribute__((__packed__)) { uint16_t val; } packed_uint16_t; +typedef struct __attribute__((__packed__)) { uint32_t val; } packed_uint32_t; +typedef struct __attribute__((__packed__)) { uint64_t val; } packed_uint64_t; // optimize for the common case, which is very small copies static inline void maybe_inline_memcpy(char *dest, const char *src, size_t l, @@ -34,17 +34,17 @@ void maybe_inline_memcpy(char *dest, const char *src, size_t l, } switch (l) { case 8: - *((uint64_t*)(dest)) = *((uint64_t*)(src)); + ((packed_uint64_t*)dest)->val = ((packed_uint64_t*)src)->val; return; case 4: - *((uint32_t*)(dest)) = *((uint32_t*)(src)); + ((packed_uint32_t*)dest)->val = ((packed_uint32_t*)src)->val; return; case 3: - *((uint16_t*)(dest)) = *((uint16_t*)(src)); + ((packed_uint16_t*)dest)->val = ((packed_uint16_t*)src)->val; *((uint8_t*)(dest+2)) = *((uint8_t*)(src+2)); return; case 2: - *((uint16_t*)(dest)) = *((uint16_t*)(src)); + ((packed_uint16_t*)dest)->val = ((packed_uint16_t*)src)->val; return; case 1: *((uint8_t*)(dest)) = *((uint8_t*)(src)); @@ -69,6 +69,17 @@ void maybe_inline_memcpy(char *dest, const char *src, size_t l, } } +#else + +#define maybe_inline_memcpy(d, s, l, x) memcpy(d, s, l) + +#endif + + +#if defined(__GNUC__) && defined(__x86_64__) + +typedef unsigned uint128_t __attribute__ ((mode (TI))); + static inline bool mem_is_zero(const char *data, size_t len) 
__attribute__((always_inline)); @@ -117,11 +128,7 @@ bool mem_is_zero(const char *data, size_t len) return true; } -#else // x86_64 - -// on other architectures, default to something simple. - -#define maybe_inline_memcpy(d, s, l, x) memcpy(d, s, l) +#else // gcc and x86_64 static inline bool mem_is_zero(const char *data, size_t len) { const char *end = data + len; From f77949fedce3449befd74efeb5270579f5085b16 Mon Sep 17 00:00:00 2001 From: Xinze Chi Date: Wed, 29 Jul 2015 16:28:33 +0800 Subject: [PATCH 062/654] bug fix: osd: requeue_scrub when kick_object_context_blocked when read miss in writeback cache pool, osd do_proxy_read first and maybe promote it. but in this case, the op is not added to waiting_for_blocked_object. pg scrub maybe block by this object (_range_available_for_scrub). so after promote it, we should requeue_scrub. Fixes: 12515 Signed-off-by: Xinze Chi --- src/osd/ReplicatedPG.cc | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 08a29c735af7a..67a8a88118512 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -8467,19 +8467,18 @@ void ReplicatedPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t void ReplicatedPG::kick_object_context_blocked(ObjectContextRef obc) { const hobject_t& soid = obc->obs.oi.soid; - map, hobject_t::BitwiseComparator>::iterator p = waiting_for_blocked_object.find(soid); - if (p == waiting_for_blocked_object.end()) - return; - if (obc->is_blocked()) { dout(10) << __func__ << " " << soid << " still blocked" << dendl; return; } - list& ls = p->second; - dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl; - requeue_ops(ls); - waiting_for_blocked_object.erase(p); + map, hobject_t::BitwiseComparator>::iterator p = waiting_for_blocked_object.find(soid); + if (p != waiting_for_blocked_object.end()) { + list& ls = p->second; + dout(10) << __func__ << " " << soid << " 
requeuing " << ls.size() << " requests" << dendl; + requeue_ops(ls); + waiting_for_blocked_object.erase(p); + } map::iterator i = objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head()); From 178d4d5c0ca590cf4d1848173bd4031e6956d801 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Wed, 29 Jul 2015 15:48:04 +0800 Subject: [PATCH 063/654] osd: remove unused parameter of start_recovery_ops The RecoveryCtx parameter is not needed. Signed-off-by: Zhiqiang Wang --- src/osd/OSD.cc | 2 +- src/osd/PG.h | 3 +-- src/osd/ReplicatedPG.cc | 3 +-- src/osd/ReplicatedPG.h | 3 +-- 4 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index f8660c07c7635..b9f307c4cadc4 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -7843,7 +7843,7 @@ void OSD::do_recovery(PG *pg, ThreadPool::TPHandle &handle) rctx.handle = &handle; int started; - bool more = pg->start_recovery_ops(max, &rctx, handle, &started); + bool more = pg->start_recovery_ops(max, handle, &started); dout(10) << "do_recovery started " << started << "/" << max << " on " << *pg << dendl; /* diff --git a/src/osd/PG.h b/src/osd/PG.h index 6b8578aefee87..aa509e4f577cc 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1032,8 +1032,7 @@ class PG { * @returns true if any useful work was accomplished; false otherwise */ virtual bool start_recovery_ops( - int max, RecoveryCtx *prctx, - ThreadPool::TPHandle &handle, + int max, ThreadPool::TPHandle &handle, int *ops_begun) = 0; void purge_strays(); diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 08a29c735af7a..5c39143ed2ea0 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -9450,8 +9450,7 @@ void PG::MissingLoc::check_recovery_sources(const OSDMapRef osdmap) bool ReplicatedPG::start_recovery_ops( - int max, RecoveryCtx *prctx, - ThreadPool::TPHandle &handle, + int max, ThreadPool::TPHandle &handle, int *ops_started) { int& started = *ops_started; diff --git a/src/osd/ReplicatedPG.h 
b/src/osd/ReplicatedPG.h index ab24da4199f1c..338abb5aa25d4 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -1253,8 +1253,7 @@ class ReplicatedPG : public PG, public PGBackend::Listener { void queue_for_recovery(); bool start_recovery_ops( - int max, RecoveryCtx *prctx, - ThreadPool::TPHandle &handle, int *started); + int max, ThreadPool::TPHandle &handle, int *started); int recover_primary(int max, ThreadPool::TPHandle &handle); int recover_replicas(int max, ThreadPool::TPHandle &handle); From 8c2f8bf54b90337421b99a1bc100e2ae896ddd99 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Wed, 29 Jul 2015 16:20:00 +0800 Subject: [PATCH 064/654] osd: no bother to create RecoveryCtx if no recovery op is started If there is no recovery op started, there is no need to create a RecoveryCtx and later delete all of the things in it. Signed-off-by: Zhiqiang Wang --- src/osd/OSD.cc | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index b9f307c4cadc4..0c092a60ac014 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -7839,12 +7839,17 @@ void OSD::do_recovery(PG *pg, ThreadPool::TPHandle &handle) dout(20) << " active was " << recovery_oids[pg->info.pgid] << dendl; #endif - PG::RecoveryCtx rctx = create_context(); - rctx.handle = &handle; - int started; bool more = pg->start_recovery_ops(max, handle, &started); dout(10) << "do_recovery started " << started << "/" << max << " on " << *pg << dendl; + // If no recovery op is started, don't bother to manipulate the RecoveryCtx + if (!started && (more || !pg->have_unfound())) { + pg->unlock(); + goto out; + } + + PG::RecoveryCtx rctx = create_context(); + rctx.handle = &handle; /* * if we couldn't start any recovery ops and things are still From 855ae1fd6e4557adba1cbd8ab532488b867cee2a Mon Sep 17 00:00:00 2001 From: Xinze Chi Date: Mon, 8 Jun 2015 16:54:08 +0800 Subject: [PATCH 065/654] bug fix: osd: avoid multi set osd_op.outdata in tier pool There are 
two read op on the same object for ec pool. First op read miss happend, calling do_proxy_read and promote_object, The second op only do_proxy_read. but before first op process_copy_chunk finish, the second op finish_proxy_read. first op receive reply from base pool first and then second received. so the second op set the field "outdata" in m->ops first. And then first op requeue_ops in process_copy_chunk, At last in do_osd_ops, it append outdata field. Fixes: 12540 Signed-off-by: Xinze Chi --- src/osd/ReplicatedPG.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 08a29c735af7a..f05646b591a62 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -2240,6 +2240,9 @@ void ReplicatedPG::cancel_proxy_read(ProxyReadOpRef prdop) // cancel objecter op, if we can if (prdop->objecter_tid) { osd->objecter->op_cancel(prdop->objecter_tid, -ECANCELED); + for (uint32_t i = 0; i < prdop->ops.size(); i++) { + prdop->ops[i].outdata.clear(); + } proxyread_ops.erase(prdop->objecter_tid); prdop->objecter_tid = 0; } @@ -6697,7 +6700,6 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r) // cancel and requeue proxy ops on this object if (!r) { - kick_proxy_ops_blocked(cobc->obs.oi.soid); for (map::iterator it = proxyread_ops.begin(); it != proxyread_ops.end(); ++it) { if (it->second->soid == cobc->obs.oi.soid) { @@ -6710,6 +6712,7 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r) cancel_proxy_write(it->second); } } + kick_proxy_ops_blocked(cobc->obs.oi.soid); } kick_object_context_blocked(cobc); From da6d5cfca5b99b1907600c0e7e0e532fe4ad7948 Mon Sep 17 00:00:00 2001 From: Xinze Chi Date: Thu, 11 Jun 2015 10:36:33 +0800 Subject: [PATCH 066/654] osd: bug fix hit_set_map size for tier pool Signed-off-by: Xinze Chi --- src/osd/ReplicatedPG.cc | 23 +++++++++++------------ src/osd/ReplicatedPG.h | 2 +- 2 files changed, 12 insertions(+), 13 
deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 08a29c735af7a..2d9b9b24e4dfd 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -2053,7 +2053,10 @@ bool ReplicatedPG::maybe_promote(ObjectContextRef obc, // Check if in other hit sets map::iterator itor; bool in_other_hit_sets = false; - for (itor = agent_state->hit_set_map.begin(); itor != agent_state->hit_set_map.end(); ++itor) { + unsigned max_in_memory_read = pool.info.min_read_recency_for_promote > 0 ? pool.info.min_read_recency_for_promote - 1 : 0; + unsigned max_in_memory_write = pool.info.min_write_recency_for_promote > 0 ? pool.info.min_write_recency_for_promote - 1 : 0; + unsigned max_in_memory = MAX(max_in_memory_read, max_in_memory_write); + for (itor = agent_state->hit_set_map.begin(); itor != agent_state->hit_set_map.end() && max_in_memory--; ++itor) { if (obc.get()) { if (obc->obs.oi.soid != hobject_t() && itor->second->contains(obc->obs.oi.soid)) { in_other_hit_sets = true; @@ -10759,7 +10762,11 @@ void ReplicatedPG::hit_set_persist() if (agent_state) { agent_state->add_hit_set(info.hit_set.current_info.begin, hit_set); - hit_set_in_memory_trim(); + uint32_t size = agent_state->hit_set_map.size(); + if (size >= pool.info.hit_set_count) { + size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0; + } + hit_set_in_memory_trim(size); } // hold a ref until it is flushed to disk @@ -10915,16 +10922,8 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max) } } -void ReplicatedPG::hit_set_in_memory_trim() +void ReplicatedPG::hit_set_in_memory_trim(uint32_t max_in_memory) { - unsigned max = pool.info.hit_set_count; - unsigned max_in_memory_read = pool.info.min_read_recency_for_promote > 0 ? pool.info.min_read_recency_for_promote - 1 : 0; - unsigned max_in_memory_write = pool.info.min_write_recency_for_promote > 0 ? 
pool.info.min_write_recency_for_promote - 1 : 0; - unsigned max_in_memory = MAX(max_in_memory_read, max_in_memory_write); - - if (max_in_memory > max) { - max_in_memory = max; - } while (agent_state->hit_set_map.size() > max_in_memory) { agent_state->remove_oldest_hit_set(); } @@ -11130,7 +11129,7 @@ bool ReplicatedPG::agent_work(int start_max, int agent_flush_quota) agent_state->position = next; // Discard old in memory HitSets - hit_set_in_memory_trim(); + hit_set_in_memory_trim(pool.info.hit_set_count); if (need_delay) { assert(agent_state->delaying == false); diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index ab24da4199f1c..b848de11c9ba5 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -955,7 +955,7 @@ class ReplicatedPG : public PG, public PGBackend::Listener { void hit_set_persist(); ///< persist hit info bool hit_set_apply_log(); ///< apply log entries to update in-memory HitSet void hit_set_trim(RepGather *repop, unsigned max); ///< discard old HitSets - void hit_set_in_memory_trim(); ///< discard old in memory HitSets + void hit_set_in_memory_trim(uint32_t max_in_memory); ///< discard old in memory HitSets void hit_set_remove_all(); hobject_t get_hit_set_current_object(utime_t stamp); From fc02a8adfb42c02727c3f9fb4b7c48215caf5dce Mon Sep 17 00:00:00 2001 From: Rohan Mars Date: Wed, 19 Aug 2015 09:07:28 -0700 Subject: [PATCH 067/654] added boost timegm impl for cross platform support Signed-off-by: Rohan Mars --- COPYING | 5 +++ src/include/Makefile.am | 3 +- src/include/timegm.h | 79 +++++++++++++++++++++++++++++++++++++++++ src/include/utime.h | 3 +- 4 files changed, 88 insertions(+), 2 deletions(-) create mode 100644 src/include/timegm.h diff --git a/COPYING b/COPYING index 1b88923155cc8..5efc838319c48 100644 --- a/COPYING +++ b/COPYING @@ -144,3 +144,8 @@ File: qa/workunits/erasure-code/jquery.js Files: qa/workunits/erasure-code/jquery.{flot.categories,flot}.js Copyright (c) 2007-2014 IOLA and Ole Laursen. 
Licensed under the MIT license. + +Files: src/include/timegm.h + Copyright (C) Copyright Howard Hinnant + Copyright (C) Copyright 2010-2011 Vicente J. Botet Escriba + License: Boost Software License, Version 1.0 diff --git a/src/include/Makefile.am b/src/include/Makefile.am index b3ceb24bf1a58..f09bad928c24c 100644 --- a/src/include/Makefile.am +++ b/src/include/Makefile.am @@ -112,4 +112,5 @@ noinst_HEADERS += \ include/memory.h \ include/rados/memory.h \ include/unordered_set.h \ - include/unordered_map.h + include/unordered_map.h \ + include/timegm.h diff --git a/src/include/timegm.h b/src/include/timegm.h new file mode 100644 index 0000000000000..fb970432df83f --- /dev/null +++ b/src/include/timegm.h @@ -0,0 +1,79 @@ +// (C) Copyright Howard Hinnant +// (C) Copyright 2010-2011 Vicente J. Botet Escriba +// Use, modification and distribution are subject to the Boost Software License, +// Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt). + +//===-------------------------- locale ------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +// This code was adapted by Vicente from Howard Hinnant's experimental work +// on chrono i/o to Boost and some functions from libc++/locale to emulate the missing time_get::get() + +#ifndef BOOST_CHRONO_IO_TIME_POINT_IO_H +#define BOOST_CHRONO_IO_TIME_POINT_IO_H + +#include + +static int32_t is_leap(int32_t year) { + if(year % 400 == 0) + return 1; + if(year % 100 == 0) + return 0; + if(year % 4 == 0) + return 1; + return 0; +} + +static int32_t days_from_0(int32_t year) { + year--; + return 365 * year + (year / 400) - (year/100) + (year / 4); +} + +int32_t static days_from_1970(int32_t year) { + static const int days_from_0_to_1970 = days_from_0(1970); + return days_from_0(year) - days_from_0_to_1970; +} + +static int32_t days_from_1jan(int32_t year,int32_t month,int32_t day) { + static const int32_t days[2][12] = + { + { 0,31,59,90,120,151,181,212,243,273,304,334}, + { 0,31,60,91,121,152,182,213,244,274,305,335} + }; + + return days[is_leap(year)][month-1] + day - 1; +} + +static time_t internal_timegm(tm const *t) { + int year = t->tm_year + 1900; + int month = t->tm_mon; + if(month > 11) + { + year += month/12; + month %= 12; + } + else if(month < 0) + { + int years_diff = (-month + 11)/12; + year -= years_diff; + month+=12 * years_diff; + } + month++; + int day = t->tm_mday; + int day_of_year = days_from_1jan(year,month,day); + int days_since_epoch = days_from_1970(year) + day_of_year ; + + time_t seconds_in_day = 3600 * 24; + time_t result = seconds_in_day * days_since_epoch + 3600 * t->tm_hour + 60 * t->tm_min + t->tm_sec; + + return result; +} + +#endif diff --git a/src/include/utime.h b/src/include/utime.h index 9f1007be8a7ba..30780d1af3937 100644 --- a/src/include/utime.h +++ b/src/include/utime.h @@ -21,6 +21,7 @@ #include #include "include/types.h" +#include "include/timegm.h" #include "common/strtol.h" @@ -291,7 +292,7 @@ class utime_t { *nsec = (uint64_t)usec * 
1000; } } - time_t t = timegm(&tm); + time_t t = internal_timegm(&tm); if (epoch) *epoch = (uint64_t)t; From 98c0606c69cffa657a4dfaf6422e63fb3c407761 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 25 Aug 2015 16:40:23 -0400 Subject: [PATCH 068/654] include/inline_memcpy: make prototype resemble memcpy's Suggested-by: Steve Capper Signed-off-by: Sage Weil --- src/include/inline_memory.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/include/inline_memory.h b/src/include/inline_memory.h index 65996f69bd676..3cdcae8859a0d 100644 --- a/src/include/inline_memory.h +++ b/src/include/inline_memory.h @@ -21,52 +21,52 @@ typedef struct __attribute__((__packed__)) { uint32_t val; } packed_uint32_t; typedef struct __attribute__((__packed__)) { uint64_t val; } packed_uint64_t; // optimize for the common case, which is very small copies -static inline void maybe_inline_memcpy(char *dest, const char *src, size_t l, +static inline void *maybe_inline_memcpy(void *dest, const void *src, size_t l, size_t inline_len) __attribute__((always_inline)); -void maybe_inline_memcpy(char *dest, const char *src, size_t l, +void *maybe_inline_memcpy(void *dest, const void *src, size_t l, size_t inline_len) { if (l > inline_len) { - memcpy(dest, src, l); - return; + return memcpy(dest, src, l); } switch (l) { case 8: ((packed_uint64_t*)dest)->val = ((packed_uint64_t*)src)->val; - return; + return dest; case 4: ((packed_uint32_t*)dest)->val = ((packed_uint32_t*)src)->val; - return; + return dest; case 3: ((packed_uint16_t*)dest)->val = ((packed_uint16_t*)src)->val; - *((uint8_t*)(dest+2)) = *((uint8_t*)(src+2)); - return; + *((uint8_t*)((char*)dest+2)) = *((uint8_t*)((char*)src+2)); + return dest; case 2: ((packed_uint16_t*)dest)->val = ((packed_uint16_t*)src)->val; - return; + return dest; case 1: *((uint8_t*)(dest)) = *((uint8_t*)(src)); - return; + return dest; default: int cursor = 0; while (l >= sizeof(uint64_t)) { - *((uint64_t*)(dest 
+ cursor)) = *((uint64_t*)(src + cursor)); + *((uint64_t*)((char*)dest + cursor)) = *((uint64_t*)((char*)src + cursor)); cursor += sizeof(uint64_t); l -= sizeof(uint64_t); } while (l >= sizeof(uint32_t)) { - *((uint32_t*)(dest + cursor)) = *((uint32_t*)(src + cursor)); + *((uint32_t*)((char*)dest + cursor)) = *((uint32_t*)((char*)src + cursor)); cursor += sizeof(uint32_t); l -= sizeof(uint32_t); } while (l > 0) { - *(dest + cursor) = *(src + cursor); + *((char*)dest + cursor) = *((char*)src + cursor); cursor++; l--; } } + return dest; } #else From dfd142f2aa2c2b7790fa944edcd8abef3918f614 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 25 Aug 2015 16:40:01 -0400 Subject: [PATCH 069/654] include/inline_memcpy: use __builtin_memcpy instead of explicit ptr copies The gcc version of this is smart enough to handle any alignment issues that exist and *should* generate the fastest code that is general and correct. On x86_64 this performs identically for the uint64_t microbenchmark. Suggested-by: Steve Capper Signed-off-by: Sage Weil --- src/include/inline_memory.h | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/src/include/inline_memory.h b/src/include/inline_memory.h index 3cdcae8859a0d..33c6bc0ea0dce 100644 --- a/src/include/inline_memory.h +++ b/src/include/inline_memory.h @@ -16,10 +16,6 @@ #if defined(__GNUC__) -typedef struct __attribute__((__packed__)) { uint16_t val; } packed_uint16_t; -typedef struct __attribute__((__packed__)) { uint32_t val; } packed_uint32_t; -typedef struct __attribute__((__packed__)) { uint64_t val; } packed_uint64_t; - // optimize for the common case, which is very small copies static inline void *maybe_inline_memcpy(void *dest, const void *src, size_t l, size_t inline_len) @@ -33,30 +29,26 @@ void *maybe_inline_memcpy(void *dest, const void *src, size_t l, } switch (l) { case 8: - ((packed_uint64_t*)dest)->val = ((packed_uint64_t*)src)->val; - return dest; + return __builtin_memcpy(dest, 
src, 8); case 4: - ((packed_uint32_t*)dest)->val = ((packed_uint32_t*)src)->val; - return dest; + return __builtin_memcpy(dest, src, 4); case 3: - ((packed_uint16_t*)dest)->val = ((packed_uint16_t*)src)->val; - *((uint8_t*)((char*)dest+2)) = *((uint8_t*)((char*)src+2)); - return dest; + return __builtin_memcpy(dest, src, 3); case 2: - ((packed_uint16_t*)dest)->val = ((packed_uint16_t*)src)->val; - return dest; + return __builtin_memcpy(dest, src, 2); case 1: - *((uint8_t*)(dest)) = *((uint8_t*)(src)); - return dest; + return __builtin_memcpy(dest, src, 1); default: int cursor = 0; while (l >= sizeof(uint64_t)) { - *((uint64_t*)((char*)dest + cursor)) = *((uint64_t*)((char*)src + cursor)); + __builtin_memcpy((char*)dest + cursor, (char*)src + cursor, + sizeof(uint64_t)); cursor += sizeof(uint64_t); l -= sizeof(uint64_t); } while (l >= sizeof(uint32_t)) { - *((uint32_t*)((char*)dest + cursor)) = *((uint32_t*)((char*)src + cursor)); + __builtin_memcpy((char*)dest + cursor, (char*)src + cursor, + sizeof(uint32_t)); cursor += sizeof(uint32_t); l -= sizeof(uint32_t); } From 7cc8d86cbba92905c5373fb43abe3b2854cf84af Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 25 Aug 2015 12:06:49 -0400 Subject: [PATCH 070/654] ceph_test_msgr: parse CEPH_ARGS Signed-off-by: Sage Weil --- src/test/msgr/test_msgr.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/test/msgr/test_msgr.cc b/src/test/msgr/test_msgr.cc index f8f1928de2e07..e3a27467a9fa2 100644 --- a/src/test/msgr/test_msgr.cc +++ b/src/test/msgr/test_msgr.cc @@ -1370,6 +1370,7 @@ TEST(DummyTest, ValueParameterizedTestsAreNotSupportedOnThisPlatform) {} int main(int argc, char **argv) { vector args; argv_to_vec(argc, (const char **)argv, args); + env_to_vec(args); global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0); g_ceph_context->_conf->set_val("auth_cluster_required", "none"); From 8a08acce376cb37345f0f45f127c18d3189fc951 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 25 Aug 2015 
17:43:35 -0400 Subject: [PATCH 071/654] common/hobject_t: fix is_temp() off-by-one pool 0 -> -2 for its temp objects. Fixes: #12785 Signed-off-by: Sage Weil --- src/common/hobject.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/hobject.h b/src/common/hobject.h index e92cc6e182dd4..4698756515556 100644 --- a/src/common/hobject.h +++ b/src/common/hobject.h @@ -82,7 +82,7 @@ struct hobject_t { } bool is_temp() const { - return pool < POOL_TEMP_START && pool != INT64_MIN; + return pool <= POOL_TEMP_START && pool != INT64_MIN; } bool is_meta() const { return pool == POOL_META; From 347ac0f80f1362f4c06fb8695d8fd9f40bbf89d9 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 25 Aug 2015 18:20:29 -0400 Subject: [PATCH 072/654] ceph_test_rados_api_tier: make PromoteOn2ndRead tolerate thrashing Repeate the test up to 20 times until we get a read that doesn't trigger promote. Fixes: #9221 (again) Signed-off-by: Sage Weil --- src/test/librados/tier.cc | 71 ++++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 24 deletions(-) diff --git a/src/test/librados/tier.cc b/src/test/librados/tier.cc index 0bef57a91ffef..eb2db6bf973e8 100644 --- a/src/test/librados/tier.cc +++ b/src/test/librados/tier.cc @@ -2238,12 +2238,12 @@ TEST_F(LibRadosTwoPoolsPP, HitSetTrim) { TEST_F(LibRadosTwoPoolsPP, PromoteOn2ndRead) { // create object - { + for (int i=0; i<20; ++i) { bufferlist bl; bl.append("hi there"); ObjectWriteOperation op; op.write_full(bl); - ASSERT_EQ(0, ioctx.operate("foo", &op)); + ASSERT_EQ(0, ioctx.operate("foo" + stringify(i), &op)); } // configure cache @@ -2279,40 +2279,63 @@ TEST_F(LibRadosTwoPoolsPP, PromoteOn2ndRead) { // wait for maps to settle cluster.wait_for_latest_osdmap(); - // 1st read, don't trigger a promote - utime_t start = ceph_clock_now(NULL); - { - bufferlist bl; - ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0)); - } - utime_t end = ceph_clock_now(NULL); - float dur = end - start; - cout << "duration " << dur << 
std::endl; + int fake = 0; // set this to non-zero to test spurious promotion, + // e.g. from thrashing + int attempt = 0; + string obj; + while (true) { + // 1st read, don't trigger a promote + obj = "foo" + stringify(attempt); + cout << obj << std::endl; + { + bufferlist bl; + ASSERT_EQ(1, ioctx.read(obj.c_str(), bl, 1, 0)); + if (--fake >= 0) { + sleep(1); + ASSERT_EQ(1, ioctx.read(obj.c_str(), bl, 1, 0)); + sleep(1); + } + } - // verify the object is NOT present in the cache tier - { - NObjectIterator it = cache_ioctx.nobjects_begin(); - if (it != cache_ioctx.nobjects_end()) { - if (dur > 1.0) { - cout << " object got promoted, but read was slow, ignoring" << std::endl; - } else { - ASSERT_TRUE(it == cache_ioctx.nobjects_end()); + // verify the object is NOT present in the cache tier + { + bool found = false; + NObjectIterator it = cache_ioctx.nobjects_begin(); + while (it != cache_ioctx.nobjects_end()) { + cout << " see " << it->get_oid() << std::endl; + if (it->get_oid() == string(obj.c_str())) { + found = true; + break; + } + ++it; } + if (!found) + break; } + + ++attempt; + ASSERT_LE(attempt, 20); + cout << "hrm, object is present in cache on attempt " << attempt + << ", retrying" << std::endl; } // Read until the object is present in the cache tier + cout << "verifying " << obj << " is eventually promoted" << std::endl; while (true) { bufferlist bl; - ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0)); + ASSERT_EQ(1, ioctx.read(obj.c_str(), bl, 1, 0)); + bool there = false; NObjectIterator it = cache_ioctx.nobjects_begin(); - if (it != cache_ioctx.nobjects_end()) { - ASSERT_TRUE(it->get_oid() == string("foo")); + while (it != cache_ioctx.nobjects_end()) { + if (it->get_oid() == string(obj.c_str())) { + there = true; + break; + } ++it; - ASSERT_TRUE(it == cache_ioctx.nobjects_end()); - break; } + if (there) + break; sleep(1); } From fb1b6ddd7bfe40d4a38d5ec59b1c3f59f2a83f23 Mon Sep 17 00:00:00 2001 From: Xinze Chi Date: Mon, 24 Aug 2015 23:01:27 +0800 Subject: [PATCH 
073/654] common: fix insert empty ptr when bufferlist rebuild Fixes: #12775 Signed-off-by: Xinze Chi --- src/common/buffer.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/common/buffer.cc b/src/common/buffer.cc index 051137e0178a8..f1b203524f08b 100644 --- a/src/common/buffer.cc +++ b/src/common/buffer.cc @@ -1375,6 +1375,10 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; void buffer::list::rebuild() { + if (_len == 0) { + _buffers.clear(); + return; + } ptr nb; if ((_len & ~CEPH_PAGE_MASK) == 0) nb = buffer::create_page_aligned(_len); @@ -1394,7 +1398,8 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; } _memcopy_count += pos; _buffers.clear(); - _buffers.push_back(nb); + if (nb.length()) + _buffers.push_back(nb); invalidate_crc(); } From 2d2f0eb338906742d516a2f8e6d5d05505be8f47 Mon Sep 17 00:00:00 2001 From: Xinze Chi Date: Mon, 24 Aug 2015 22:59:40 +0800 Subject: [PATCH 074/654] test: add test case for insert empty ptr when buffer rebuild Signed-off-by: Xinze Chi --- src/test/bufferlist.cc | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/test/bufferlist.cc b/src/test/bufferlist.cc index c660099b2dbbb..089711ee431af 100644 --- a/src/test/bufferlist.cc +++ b/src/test/bufferlist.cc @@ -1451,6 +1451,19 @@ TEST(BufferList, rebuild) { EXPECT_TRUE(bl.is_page_aligned()); EXPECT_EQ((unsigned)1, bl.buffers().size()); } + { + bufferlist bl; + char t1[] = "X"; + bufferlist a2; + a2.append(t1, 1); + bl.rebuild(); + bl.append(a2); + EXPECT_EQ((unsigned)1, bl.length()); + bufferlist::iterator p = bl.begin(); + char dst[1]; + p.copy(1, dst); + EXPECT_EQ(0, memcmp(dst, "X", 1)); + } } TEST(BufferList, rebuild_page_aligned) { From 15e5ebe14787bbbc6b906d72ccd2d418d762cfad Mon Sep 17 00:00:00 2001 From: Xinze Chi Date: Mon, 24 Aug 2015 23:02:10 +0800 Subject: [PATCH 075/654] common: fix code format Signed-off-by: Xinze Chi --- src/common/buffer.cc | 98 
++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/src/common/buffer.cc b/src/common/buffer.cc index f1b203524f08b..3d7f446f7a011 100644 --- a/src/common/buffer.cc +++ b/src/common/buffer.cc @@ -1403,57 +1403,57 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; invalidate_crc(); } -void buffer::list::rebuild_aligned(unsigned align) -{ - rebuild_aligned_size_and_memory(align, align); -} - -void buffer::list::rebuild_aligned_size_and_memory(unsigned align_size, - unsigned align_memory) -{ - std::list::iterator p = _buffers.begin(); - while (p != _buffers.end()) { - // keep anything that's already align and sized aligned - if (p->is_aligned(align_memory) && p->is_n_align_sized(align_size)) { - /*cout << " segment " << (void*)p->c_str() - << " offset " << ((unsigned long)p->c_str() & (align - 1)) - << " length " << p->length() - << " " << (p->length() & (align - 1)) << " ok" << std::endl; - */ - ++p; - continue; + void buffer::list::rebuild_aligned(unsigned align) + { + rebuild_aligned_size_and_memory(align, align); + } + + void buffer::list::rebuild_aligned_size_and_memory(unsigned align_size, + unsigned align_memory) + { + std::list::iterator p = _buffers.begin(); + while (p != _buffers.end()) { + // keep anything that's already align and sized aligned + if (p->is_aligned(align_memory) && p->is_n_align_sized(align_size)) { + /*cout << " segment " << (void*)p->c_str() + << " offset " << ((unsigned long)p->c_str() & (align - 1)) + << " length " << p->length() + << " " << (p->length() & (align - 1)) << " ok" << std::endl; + */ + ++p; + continue; + } + + // consolidate unaligned items, until we get something that is sized+aligned + list unaligned; + unsigned offset = 0; + do { + /*cout << " segment " << (void*)p->c_str() + << " offset " << ((unsigned long)p->c_str() & (align - 1)) + << " length " << p->length() << " " << (p->length() & (align - 1)) + << " overall offset " << offset << " " 
<< (offset & (align - 1)) + << " not ok" << std::endl; + */ + offset += p->length(); + unaligned.push_back(*p); + _buffers.erase(p++); + } while (p != _buffers.end() && + (!p->is_aligned(align_memory) || + !p->is_n_align_sized(align_size) || + (offset % align_size))); + if (!(unaligned.is_contiguous() && unaligned._buffers.front().is_aligned(align_memory))) { + ptr nb(buffer::create_aligned(unaligned._len, align_memory)); + unaligned.rebuild(nb); + _memcopy_count += unaligned._len; + } + _buffers.insert(p, unaligned._buffers.front()); } - - // consolidate unaligned items, until we get something that is sized+aligned - list unaligned; - unsigned offset = 0; - do { - /*cout << " segment " << (void*)p->c_str() - << " offset " << ((unsigned long)p->c_str() & (align - 1)) - << " length " << p->length() << " " << (p->length() & (align - 1)) - << " overall offset " << offset << " " << (offset & (align - 1)) - << " not ok" << std::endl; - */ - offset += p->length(); - unaligned.push_back(*p); - _buffers.erase(p++); - } while (p != _buffers.end() && - (!p->is_aligned(align_memory) || - !p->is_n_align_sized(align_size) || - (offset % align_size))); - if (!(unaligned.is_contiguous() && unaligned._buffers.front().is_aligned(align_memory))) { - ptr nb(buffer::create_aligned(unaligned._len, align_memory)); - unaligned.rebuild(nb); - _memcopy_count += unaligned._len; - } - _buffers.insert(p, unaligned._buffers.front()); } -} - -void buffer::list::rebuild_page_aligned() -{ - rebuild_aligned(CEPH_PAGE_SIZE); -} + + void buffer::list::rebuild_page_aligned() + { + rebuild_aligned(CEPH_PAGE_SIZE); + } // sort-of-like-assignment-op void buffer::list::claim(list& bl, unsigned int flags) From 55cec07ba92e181cedd105a46ca45536bfc19f3d Mon Sep 17 00:00:00 2001 From: Haomai Wang Date: Wed, 26 Aug 2015 12:21:15 +0800 Subject: [PATCH 076/654] Messenger: Fix rand() generate the same sequence numbers Signed-off-by: Haomai Wang --- src/msg/Messenger.cc | 1 + 1 file changed, 1 insertion(+) diff 
--git a/src/msg/Messenger.cc b/src/msg/Messenger.cc index 43e66c985f7c0..0d38c139fac37 100644 --- a/src/msg/Messenger.cc +++ b/src/msg/Messenger.cc @@ -15,6 +15,7 @@ Messenger *Messenger::create(CephContext *cct, const string &type, uint64_t nonce, uint64_t features) { int r = -1; + srand(time(NULL)); if (type == "random") r = rand() % 2; // random does not include xio if (r == 0 || type == "simple") From c318129ba1889942341d26d3614c6ab2a794faba Mon Sep 17 00:00:00 2001 From: Boris Ranto Date: Wed, 26 Aug 2015 13:53:38 +0200 Subject: [PATCH 077/654] ceph.spec.in: Restart services only if they are running Signed-off-by: Boris Ranto --- ceph.spec.in | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/ceph.spec.in b/ceph.spec.in index 0dd64221ba04c..dfae17854d301 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -1119,10 +1119,19 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1 %post selinux %if 0%{?_with_systemd} - /usr/bin/systemctl stop ceph.target > /dev/null 2>&1 || : + /usr/bin/systemctl status ceph.target > /dev/null 2>&1 %else - /sbin/service ceph stop >/dev/null 2>&1 || : + /sbin/service ceph status >/dev/null 2>&1 %endif +STATUS=$? 
+ +if test $STATUS -eq 0; then +%if 0%{?_with_systemd} + /usr/bin/systemctl stop ceph.target > /dev/null 2>&1 +%else + /sbin/service ceph stop >/dev/null 2>&1 +%endif +fi OLD_POLVER=$(%{_sbindir}/semodule -l | grep -P '^ceph[\t ]' | awk '{print $2}') %{_sbindir}/semodule -n -i %{_datadir}/selinux/packages/ceph.pp @@ -1134,34 +1143,51 @@ if %{_sbindir}/selinuxenabled; then fi fi +# Start iff it was started before +if test $STATUS -eq 0; then %if 0%{?_with_systemd} /usr/bin/systemctl start ceph.target > /dev/null 2>&1 || : %else /sbin/service ceph start >/dev/null 2>&1 || : %endif +fi + exit 0 %postun selinux if [ $1 -eq 0 ]; then %if 0%{?_with_systemd} - /usr/bin/systemctl stop ceph.target > /dev/null 2>&1 || : + /usr/bin/systemctl status ceph.target > /dev/null 2>&1 %else - /sbin/service ceph stop >/dev/null 2>&1 || : + /sbin/service ceph status >/dev/null 2>&1 %endif + STATUS=$? + + if test $STATUS -eq 0; then + %if 0%{?_with_systemd} + /usr/bin/systemctl stop ceph.target > /dev/null 2>&1 + %else + /sbin/service ceph stop >/dev/null 2>&1 + %endif + fi + %{_sbindir}/semodule -n -r ceph if %{_sbindir}/selinuxenabled ; then %{_sbindir}/load_policy %relabel_files fi; + + if test $STATUS -eq 0; then %if 0%{?_with_systemd} /usr/bin/systemctl start ceph.target > /dev/null 2>&1 || : %else /sbin/service ceph start >/dev/null 2>&1 || : %endif -fi; + fi +fi exit 0 -%endif +%endif # with selinux ################################################################################# %if 0%{with libs_compat} From df21a6e212a99599bbeafa2fb6c3ec97deac6739 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 11 Aug 2015 13:16:57 +0100 Subject: [PATCH 078/654] osd: expose PGLSFilter in objclass interface Signed-off-by: John Spray --- src/objclass/class_api.cc | 8 ++++++ src/objclass/objclass.h | 33 +++++++++++++++++++++++++ src/osd/ClassHandler.cc | 7 ++++++ src/osd/ClassHandler.h | 11 +++++++++ src/osd/ReplicatedPG.cc | 52 ++++++++++++++++++++++++++++++++++++++- src/osd/ReplicatedPG.h | 
48 +----------------------------------- 6 files changed, 111 insertions(+), 48 deletions(-) diff --git a/src/objclass/class_api.cc b/src/objclass/class_api.cc index 5137448292b9c..09b7f4f93c218 100644 --- a/src/objclass/class_api.cc +++ b/src/objclass/class_api.cc @@ -82,6 +82,14 @@ int cls_unregister_method(cls_method_handle_t handle) return 1; } +int cls_register_cxx_filter(cls_handle_t hclass, const std::string &filter_name, + cls_cxx_filter_factory_t fn) +{ + ClassHandler::ClassData *cls = (ClassHandler::ClassData *)hclass; + cls->register_cxx_filter(filter_name, fn); + return 0; +} + int cls_call(cls_method_context_t hctx, const char *cls, const char *method, char *indata, int datalen, char **outdata, int *outdatalen) diff --git a/src/objclass/objclass.h b/src/objclass/objclass.h index 6f0de2825c253..e14d6a6dc4b58 100644 --- a/src/objclass/objclass.h +++ b/src/objclass/objclass.h @@ -8,6 +8,7 @@ #include "../include/types.h" #include "msg/msg_types.h" +#include "common/hobject.h" extern "C" { #endif @@ -94,9 +95,41 @@ extern void class_fini(void); typedef int (*cls_method_cxx_call_t)(cls_method_context_t ctx, class buffer::list *inbl, class buffer::list *outbl); +class PGLSFilter { +protected: + string xattr; +public: + PGLSFilter(); + virtual ~PGLSFilter(); + virtual bool filter(const hobject_t &obj, bufferlist& xattr_data, + bufferlist& outdata) = 0; + + /** + * xattr key, or empty string. 
If non-empty, this xattr will be fetched + * and the value passed into ::filter + */ + virtual string& get_xattr() { return xattr; } + + /** + * If true, objects without the named xattr (if xattr name is not empty) + * will be rejected without calling ::filter + */ + virtual bool reject_empty_xattr() { return true; } +}; + +// Classes expose a filter constructor that returns a subclass of PGLSFilter +typedef PGLSFilter* (*cls_cxx_filter_factory_t)( + bufferlist::iterator *args); + + + extern int cls_register_cxx_method(cls_handle_t hclass, const char *method, int flags, cls_method_cxx_call_t class_call, cls_method_handle_t *handle); +extern int cls_register_cxx_filter(cls_handle_t hclass, + const std::string &filter_name, + cls_cxx_filter_factory_t fn); + extern int cls_cxx_create(cls_method_context_t hctx, bool exclusive); extern int cls_cxx_remove(cls_method_context_t hctx); extern int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime); diff --git a/src/osd/ClassHandler.cc b/src/osd/ClassHandler.cc index c52177b7e1690..ea6fed89fd6f3 100644 --- a/src/osd/ClassHandler.cc +++ b/src/osd/ClassHandler.cc @@ -221,6 +221,13 @@ ClassHandler::ClassMethod *ClassHandler::ClassData::register_cxx_method(const ch return &method; } +void ClassHandler::ClassData::register_cxx_filter( + const std::string &filter_name, + cls_cxx_filter_factory_t fn) +{ + filters_map[filter_name] = fn; +} + ClassHandler::ClassMethod *ClassHandler::ClassData::_get_method(const char *mname) { map::iterator iter = methods_map.find(mname); diff --git a/src/osd/ClassHandler.h b/src/osd/ClassHandler.h index 93cf3c07fbcb9..843214d8d527d 100644 --- a/src/osd/ClassHandler.h +++ b/src/osd/ClassHandler.h @@ -49,6 +49,7 @@ class ClassHandler void *handle; map methods_map; + map filters_map; set dependencies; /* our dependencies */ set missing_dependencies; /* only missing dependencies */ @@ -64,11 +65,21 @@ class ClassHandler ClassMethod *register_cxx_method(const char *mname, int flags, 
cls_method_cxx_call_t func); void unregister_method(ClassMethod *method); + void register_cxx_filter( + const std::string &filter_name, + cls_cxx_filter_factory_t fn); + ClassMethod *get_method(const char *mname) { Mutex::Locker l(handler->mutex); return _get_method(mname); } int get_method_flags(const char *mname); + + cls_cxx_filter_factory_t get_filter(const std::string &filter_name) + { + Mutex::Locker l(handler->mutex); + return filters_map[filter_name]; + } }; private: diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 08a29c735af7a..20087794d4fdc 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -20,6 +20,7 @@ #include "ReplicatedPG.h" #include "OSD.h" #include "OpRequest.h" +#include "objclass/objclass.h" #include "common/errno.h" #include "common/perf_counters.h" @@ -508,6 +509,31 @@ void ReplicatedPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef o op->mark_delayed("waiting for blocked object"); } +class PGLSPlainFilter : public PGLSFilter { + string val; +public: + PGLSPlainFilter(bufferlist::iterator& params) { + ::decode(xattr, params); + ::decode(val, params); + } + virtual ~PGLSPlainFilter() {} + virtual bool filter(const hobject_t &obj, bufferlist& xattr_data, + bufferlist& outdata); +}; + +class PGLSParentFilter : public PGLSFilter { + inodeno_t parent_ino; +public: + PGLSParentFilter(bufferlist::iterator& params) { + xattr = "_parent"; + ::decode(parent_ino, params); + generic_dout(0) << "parent_ino=" << parent_ino << dendl; + } + virtual ~PGLSParentFilter() {} + virtual bool filter(const hobject_t &obj, bufferlist& xattr_data, + bufferlist& outdata); +}; + bool PGLSParentFilter::filter(const hobject_t &obj, bufferlist& xattr_data, bufferlist& outdata) { @@ -580,7 +606,31 @@ int ReplicatedPG::get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilt } else if (type.compare("plain") == 0) { filter = new PGLSPlainFilter(iter); } else { - return -EINVAL; + std::size_t dot = type.find("."); + if 
(dot == std::string::npos || dot == 0 || dot == type.size() - 1) { + return -EINVAL; + } + + const std::string class_name = type.substr(0, dot); + const std::string filter_name = type.substr(dot + 1); + ClassHandler::ClassData *cls = NULL; + int r = osd->class_handler->open_class(class_name, &cls); + if (r != 0) { + derr << "Error opening class '" << class_name << "': " + << cpp_strerror(r) << dendl; + return -EINVAL; + } else { + assert(cls); + } + + cls_cxx_filter_factory_t fn = cls->get_filter(filter_name); + if (fn == NULL) { + derr << "Error finding filter '" << filter_name << "' in class " + << class_name << dendl; + return -EINVAL; + } + filter = fn(&iter); + assert(filter); } *pfilter = filter; diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index ab24da4199f1c..8fd426fea01ed 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -46,6 +46,7 @@ class CopyFromCallback; class PromoteCallback; class ReplicatedPG; +class PGLSFilter; void intrusive_ptr_add_ref(ReplicatedPG *pg); void intrusive_ptr_release(ReplicatedPG *pg); uint64_t get_with_id(ReplicatedPG *pg); @@ -57,53 +58,6 @@ void put_with_id(ReplicatedPG *pg, uint64_t id); typedef boost::intrusive_ptr ReplicatedPGRef; #endif -class PGLSFilter { -protected: - string xattr; -public: - PGLSFilter(); - virtual ~PGLSFilter(); - virtual bool filter(const hobject_t &obj, bufferlist& xattr_data, - bufferlist& outdata) = 0; - - /** - * xattr key, or empty string. 
If non-empty, this xattr will be fetched - * and the value passed into ::filter - */ - virtual string& get_xattr() { return xattr; } - - /** - * If true, objects without the named xattr (if xattr name is not empty) - * will be rejected without calling ::filter - */ - virtual bool reject_empty_xattr() { return true; } -}; - -class PGLSPlainFilter : public PGLSFilter { - string val; -public: - PGLSPlainFilter(bufferlist::iterator& params) { - ::decode(xattr, params); - ::decode(val, params); - } - virtual ~PGLSPlainFilter() {} - virtual bool filter(const hobject_t &obj, bufferlist& xattr_data, - bufferlist& outdata); -}; - -class PGLSParentFilter : public PGLSFilter { - inodeno_t parent_ino; -public: - PGLSParentFilter(bufferlist::iterator& params) { - xattr = "_parent"; - ::decode(parent_ino, params); - generic_dout(0) << "parent_ino=" << parent_ino << dendl; - } - virtual ~PGLSParentFilter() {} - virtual bool filter(const hobject_t &obj, bufferlist& xattr_data, - bufferlist& outdata); -}; - class ReplicatedPG : public PG, public PGBackend::Listener { friend class OSD; friend class Watch; From b610588bf451b00a09b40784fd06117a5f37db80 Mon Sep 17 00:00:00 2001 From: Nathan Cutler Date: Wed, 26 Aug 2015 16:32:57 +0200 Subject: [PATCH 079/654] ceph.spec.in: remove obsolete SUSE-specific code http://tracker.ceph.com/issues/12791 Fixes: #12791 Signed-off-by: Nathan Cutler --- ceph.spec.in | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/ceph.spec.in b/ceph.spec.in index 0dd64221ba04c..37e543901b121 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -491,10 +491,6 @@ python-rados, python-rbd and python-cephfs. Packages still depending on python-ceph should be fixed to depend on python-rados, python-rbd or python-cephfs instead. 
-%if 0%{?opensuse} || 0%{?suse_version} -%debug_package -%endif - ################################################################################# # common ################################################################################# @@ -544,7 +540,7 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'` --with-rgw-user=root \ --with-rgw-group=root \ %endif -%if 0%{?opensuse} || 0%{?suse_version} +%if 0%{?suse_version} --with-systemd-libexec-dir=/usr/lib/ceph/ \ --with-rgw-user=wwwrun \ --with-rgw-group=www \ @@ -653,7 +649,7 @@ rm -rf $RPM_BUILD_ROOT %pre %if 0%{?_with_systemd} - %if 0%{?opensuse} || 0%{?suse_version} + %if 0%{?suse_version} # service_add_pre and friends don't work with parameterized systemd service # instances, only with single services or targets, so we always pass # ceph.target to these macros @@ -665,7 +661,7 @@ rm -rf $RPM_BUILD_ROOT %post /sbin/ldconfig %if 0%{?_with_systemd} - %if 0%{?opensuse} || 0%{?suse_version} + %if 0%{?suse_version} %service_add_post ceph.target %endif %else @@ -675,7 +671,7 @@ mkdir -p %{_localstatedir}/run/ceph/ %preun %if 0%{?_with_systemd} - %if 0%{?opensuse} || 0%{?suse_version} + %if 0%{?suse_version} %service_del_preun ceph.target %endif # Need a special case here when removing the RPM to disable specific @@ -693,9 +689,6 @@ mkdir -p %{_localstatedir}/run/ceph/ fi fi %else - %if 0%{?opensuse} || 0%{?suse_version} - %stop_on_removal ceph - %endif %if 0%{?rhel} || 0%{?fedora} if [ $1 = 0 ] ; then /sbin/service ceph stop >/dev/null 2>&1 @@ -899,7 +892,7 @@ fi %post radosgw /sbin/ldconfig -%if 0%{?opensuse} || 0%{?suse_version} +%if 0%{?suse_version} # TODO: find out what exactly this systemd-tmpfiles inovcation is for systemd-tmpfiles --create /%{_tmpfilesdir}/ceph-rgw.conf # explicit systemctl daemon-reload (that's the only relevant bit of From d7bf8cb594e5276d1c80544f5ec954d52b159750 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Wed, 26 Aug 2015 14:34:30 -0700 Subject: [PATCH 080/654] 
rgw: init some manifest fields when handling explicit objs Fixes: #11455 When dealing with old manifest that has explicit objs, we also need to set the head size and head object correctly so that code that relies on this info doesn't break. Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_rados.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index 25caf2ca12342..9d5de6df9162a 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -327,6 +327,12 @@ class RGWObjManifest { ::decode(rules, bl); } else { explicit_objs = true; + if (!objs.empty()) { + map::iterator iter = objs.begin(); + head_obj = iter->second.loc; + head_size = iter->second.size; + max_head_size = head_size; + } } if (struct_v >= 4) { From 01a9a792a7032593b32609acdf3e4c2279ddc666 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Mon, 10 Aug 2015 13:40:43 -0400 Subject: [PATCH 081/654] osbench: add multithreaded objectstore benchmark usage: ceph_objectstore_bench [flags] --size total size in bytes --block-size block size in bytes for each write --repeats number of times to repeat the write cycle --threads number of threads to carry out this workload --multi-object have each thread write to a separate object Signed-off-by: Casey Bodley --- ceph.spec.in | 1 + src/.gitignore | 1 + src/test/CMakeLists.txt | 5 + src/test/Makefile-client.am | 4 + src/test/objectstore_bench.cc | 286 ++++++++++++++++++++++++++++++++++ 5 files changed, 297 insertions(+) create mode 100644 src/test/objectstore_bench.cc diff --git a/ceph.spec.in b/ceph.spec.in index 37e543901b121..6e80f999991fb 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -1046,6 +1046,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1 %{_bindir}/ceph_erasure_code %{_bindir}/ceph_erasure_code_benchmark %{_bindir}/ceph_omapbench +%{_bindir}/ceph_objectstore_bench %{_bindir}/ceph_perf_objectstore %{_bindir}/ceph_perf_local %{_bindir}/ceph_perf_msgr_client diff --git a/src/.gitignore 
b/src/.gitignore index f657ab3122ebf..c7b9dd5a8e0c9 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -48,6 +48,7 @@ Makefile /ceph_xattr_bench /ceph_kvstorebench /ceph_omapbench +/ceph_objectstore_bench /ceph_smalliobench /ceph_smalliobenchdumb /ceph_smalliobenchfs diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 8763f972f1cfe..29151a40c8a70 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -223,6 +223,11 @@ add_executable(kvstorebench target_link_libraries(kvstorebench librados global ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS}) +# ceph_objectstore_bench +add_executable(ceph_objectstore_bench objectstore_bench.cc + $) +target_link_libraries(ceph_objectstore_bench global os ${TCMALLOC_LIBS}) + ## System tests # systest diff --git a/src/test/Makefile-client.am b/src/test/Makefile-client.am index 01aaa0e4dc58d..2d87a17bef0b2 100644 --- a/src/test/Makefile-client.am +++ b/src/test/Makefile-client.am @@ -68,6 +68,10 @@ ceph_omapbench_SOURCES = test/omap_bench.cc ceph_omapbench_LDADD = $(LIBRADOS) $(CEPH_GLOBAL) bin_DEBUGPROGRAMS += ceph_omapbench +ceph_objectstore_bench_SOURCES = test/objectstore_bench.cc +ceph_objectstore_bench_LDADD = $(LIBOS) $(CEPH_GLOBAL) +bin_DEBUGPROGRAMS += ceph_objectstore_bench + if LINUX ceph_kvstorebench_SOURCES = \ test/kv_store_bench.cc \ diff --git a/src/test/objectstore_bench.cc b/src/test/objectstore_bench.cc new file mode 100644 index 0000000000000..ae1f79fc4a611 --- /dev/null +++ b/src/test/objectstore_bench.cc @@ -0,0 +1,286 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include +#include +#include +#include + +#include "os/ObjectStore.h" + +#include "global/global_init.h" + +#include "common/strtol.h" +#include "common/ceph_argparse.h" + +#define dout_subsys ceph_subsys_filestore + +static void usage() +{ + derr << "usage: ceph_objectstore_bench [flags]\n" + " --size\n" + " total size in bytes\n" + " 
--block-size\n" + " block size in bytes for each write\n" + " --repeats\n" + " number of times to repeat the write cycle\n" + " --threads\n" + " number of threads to carry out this workload\n" + " --multi-object\n" + " have each thread write to a separate object\n" << dendl; + generic_server_usage(); +} + +// helper class for bytes with units +struct byte_units { + size_t v; + byte_units(size_t v) : v(v) {} + + bool parse(const std::string &val, std::string *err); + + operator size_t() const { return v; } +}; + +bool byte_units::parse(const std::string &val, std::string *err) +{ + v = strict_sistrtoll(val.c_str(), err); + return err->empty(); +} + +std::ostream& operator<<(std::ostream &out, const byte_units &amount) +{ + static const char* units[] = { "B", "KB", "MB", "GB", "TB", "PB", "EB" }; + static const int max_units = sizeof(units)/sizeof(*units); + + int unit = 0; + auto v = amount.v; + while (v >= 1024 && unit < max_units) { + // preserve significant bytes + if (v < 1048576 && (v % 1024 != 0)) + break; + v >>= 10; + unit++; + } + return out << v << ' ' << units[unit]; +} + +struct Config { + byte_units size; + byte_units block_size; + int repeats; + int threads; + bool multi_object; + Config() + : size(1048576), block_size(4096), + repeats(1), threads(1), + multi_object(false) {} +}; + +class C_NotifyCond : public Context { + std::mutex *mutex; + std::condition_variable *cond; + bool *done; +public: + C_NotifyCond(std::mutex *mutex, std::condition_variable *cond, bool *done) + : mutex(mutex), cond(cond), done(done) {} + void finish(int r) { + std::lock_guard lock(*mutex); + *done = true; + cond->notify_one(); + } +}; + +void osbench_worker(ObjectStore *os, const Config &cfg, + const coll_t cid, const ghobject_t oid, + uint64_t starting_offset) +{ + bufferlist data; + data.append(buffer::create(cfg.block_size)); + + dout(0) << "Writing " << cfg.size + << " in blocks of " << cfg.block_size << dendl; + + assert(starting_offset < cfg.size); + 
assert(starting_offset % cfg.block_size == 0); + + ObjectStore::Sequencer sequencer("osbench"); + + for (int i = 0; i < cfg.repeats; ++i) { + uint64_t offset = starting_offset; + size_t len = cfg.size; + + list tls; + + std::cout << "Write cycle " << i << std::endl; + while (len) { + size_t count = len < cfg.block_size ? len : (size_t)cfg.block_size; + + auto t = new ObjectStore::Transaction; + t->write(cid, oid, offset, count, data); + tls.push_back(t); + + offset += count; + if (offset > cfg.size) + offset -= cfg.size; + len -= count; + } + + // set up the finisher + std::mutex mutex; + std::condition_variable cond; + bool done = false; + + os->queue_transactions(&sequencer, tls, nullptr, + new C_NotifyCond(&mutex, &cond, &done)); + + std::unique_lock lock(mutex); + cond.wait(lock, [&done](){ return done; }); + lock.unlock(); + + while (!tls.empty()) { + auto t = tls.front(); + tls.pop_front(); + delete t; + } + } +} + +int main(int argc, const char *argv[]) +{ + Config cfg; + + // command-line arguments + vector args; + argv_to_vec(argc, argv, args); + env_to_vec(args); + + global_init(nullptr, args, CEPH_ENTITY_TYPE_OSD, CODE_ENVIRONMENT_UTILITY, 0); + + std::string val; + vector::iterator i = args.begin(); + while (i != args.end()) { + if (ceph_argparse_double_dash(args, i)) + break; + + if (ceph_argparse_witharg(args, i, &val, "--size", (char*)nullptr)) { + std::string err; + if (!cfg.size.parse(val, &err)) { + derr << "error parsing size: " << err << dendl; + usage(); + } + } else if (ceph_argparse_witharg(args, i, &val, "--block-size", (char*)nullptr)) { + std::string err; + if (!cfg.block_size.parse(val, &err)) { + derr << "error parsing block-size: " << err << dendl; + usage(); + } + } else if (ceph_argparse_witharg(args, i, &val, "--repeats", (char*)nullptr)) { + cfg.repeats = atoi(val.c_str()); + } else if (ceph_argparse_witharg(args, i, &val, "--threads", (char*)nullptr)) { + cfg.threads = atoi(val.c_str()); + } else if (ceph_argparse_flag(args, i, 
"--multi-object", (char*)nullptr)) { + cfg.multi_object = true; + } else { + derr << "Error: can't understand argument: " << *i << "\n" << dendl; + usage(); + } + } + + common_init_finish(g_ceph_context); + + // create object store + dout(0) << "objectstore " << g_conf->osd_objectstore << dendl; + dout(0) << "data " << g_conf->osd_data << dendl; + dout(0) << "journal " << g_conf->osd_journal << dendl; + dout(0) << "size " << cfg.size << dendl; + dout(0) << "block-size " << cfg.block_size << dendl; + dout(0) << "repeats " << cfg.repeats << dendl; + dout(0) << "threads " << cfg.threads << dendl; + + auto os = std::unique_ptr( + ObjectStore::create(g_ceph_context, + g_conf->osd_objectstore, + g_conf->osd_data, + g_conf->osd_journal)); + if (!os) { + derr << "bad objectstore type " << g_conf->osd_objectstore << dendl; + return 1; + } + if (os->mkfs() < 0) { + derr << "mkfs failed" << dendl; + return 1; + } + if (os->mount() < 0) { + derr << "mount failed" << dendl; + return 1; + } + + dout(10) << "created objectstore " << os.get() << dendl; + + // create a collection + spg_t pg; + const coll_t cid(pg); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + os->apply_transaction(t); + } + + // create the objects + std::vector oids; + if (cfg.multi_object) { + oids.reserve(cfg.threads); + for (int i = 0; i < cfg.threads; i++) { + std::stringstream oss; + oss << "osbench-thread-" << i; + oids.emplace_back(pg.make_temp_object(oss.str())); + + ObjectStore::Transaction t; + t.touch(cid, oids[i]); + int r = os->apply_transaction(t); + assert(r == 0); + } + } else { + oids.emplace_back(pg.make_temp_object("osbench")); + + ObjectStore::Transaction t; + t.touch(cid, oids.back()); + int r = os->apply_transaction(t); + assert(r == 0); + } + + // run the worker threads + std::vector workers; + workers.reserve(cfg.threads); + + using namespace std::chrono; + auto t1 = high_resolution_clock::now(); + for (int i = 0; i < cfg.threads; i++) { + const auto &oid = 
cfg.multi_object ? oids[i] : oids[0]; + workers.emplace_back(osbench_worker, os.get(), std::ref(cfg), + cid, oid, i * cfg.size / cfg.threads); + } + for (auto &worker : workers) + worker.join(); + auto t2 = high_resolution_clock::now(); + workers.clear(); + + auto duration = duration_cast(t2 - t1); + byte_units total = cfg.size * cfg.repeats * cfg.threads; + byte_units rate = (1000000LL * total) / duration.count(); + size_t iops = (1000000LL * total / cfg.block_size) / duration.count(); + dout(0) << "Wrote " << total << " in " + << duration.count() << "us, at a rate of " << rate << "/s and " + << iops << " iops" << dendl; + + // remove the objects + ObjectStore::Transaction t; + for (const auto &oid : oids) + t.remove(cid, oid); + os->apply_transaction(t); + + os->umount(); + return 0; +} From 26f716e949a0e522b17085faf324c3fe6af69092 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Tue, 11 Aug 2015 14:38:06 -0400 Subject: [PATCH 082/654] memstore: use intrusive_ptr instead of shared_ptr Signed-off-by: Casey Bodley --- src/os/MemStore.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/os/MemStore.h b/src/os/MemStore.h index 477e8fde63ed1..a3bed1cbad1be 100644 --- a/src/os/MemStore.h +++ b/src/os/MemStore.h @@ -16,21 +16,28 @@ #ifndef CEPH_MEMSTORE_H #define CEPH_MEMSTORE_H +#include + #include "include/assert.h" #include "include/unordered_map.h" #include "include/memory.h" #include "common/Finisher.h" +#include "common/RefCountedObj.h" #include "common/RWLock.h" #include "ObjectStore.h" class MemStore : public ObjectStore { public: - struct Object { + struct Object : public RefCountedObject { bufferlist data; map xattr; bufferlist omap_header; map omap; + typedef boost::intrusive_ptr Ref; + friend void intrusive_ptr_add_ref(Object *o) { o->get(); } + friend void intrusive_ptr_release(Object *o) { o->put(); } + void encode(bufferlist& bl) const { ENCODE_START(1, 1, bl); ::encode(data, bl); @@ -74,14 +81,18 @@ class MemStore 
: public ObjectStore { f->close_section(); } }; - typedef ceph::shared_ptr ObjectRef; + typedef Object::Ref ObjectRef; - struct Collection { + struct Collection : public RefCountedObject { ceph::unordered_map object_hash; ///< for lookup map object_map; ///< for iteration map xattr; RWLock lock; ///< for object_{map,hash} + typedef boost::intrusive_ptr Ref; + friend void intrusive_ptr_add_ref(Collection *c) { c->get(); } + friend void intrusive_ptr_release(Collection *c) { c->put(); } + // NOTE: The lock only needs to protect the object_map/hash, not the // contents of individual objects. The osd is already sequencing // reads and writes, so we will never see them concurrently at this @@ -136,7 +147,7 @@ class MemStore : public ObjectStore { Collection() : lock("MemStore::Collection::lock") {} }; - typedef ceph::shared_ptr CollectionRef; + typedef Collection::Ref CollectionRef; private: class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl { From 5d8307a10fd074635d66dbd41d7938e6dca105a0 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Tue, 11 Aug 2015 14:41:17 -0400 Subject: [PATCH 083/654] memstore: add Object interface to hide bufferlist this prepares MemStore for a new object data implementation that replaces bufferlist Signed-off-by: Casey Bodley --- src/os/MemStore.cc | 167 ++++++++++++++++++++++++++------------------- src/os/MemStore.h | 53 ++++++++++---- 2 files changed, 138 insertions(+), 82 deletions(-) diff --git a/src/os/MemStore.cc b/src/os/MemStore.cc index 0683d6ab3daa9..788589d22cb12 100644 --- a/src/os/MemStore.cc +++ b/src/os/MemStore.cc @@ -293,7 +293,7 @@ int MemStore::stat( ObjectRef o = c->get_object(oid); if (!o) return -ENOENT; - st->st_size = o->data.length(); + st->st_size = o->get_size(); st->st_blksize = 4096; st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize; st->st_nlink = 1; @@ -319,16 +319,15 @@ int MemStore::read( ObjectRef o = c->get_object(oid); if (!o) return -ENOENT; - if (offset >= o->data.length()) 
+ if (offset >= o->get_size()) return 0; size_t l = len; if (l == 0) // note: len == 0 means read the entire object - l = o->data.length(); - else if (offset + l > o->data.length()) - l = o->data.length() - offset; + l = o->get_size(); + else if (offset + l > o->get_size()) + l = o->get_size() - offset; bl.clear(); - bl.substr_of(o->data, offset, l); - return bl.length(); + return o->read(offset, l, bl); } int MemStore::fiemap(coll_t cid, const ghobject_t& oid, @@ -344,11 +343,11 @@ int MemStore::fiemap(coll_t cid, const ghobject_t& oid, ObjectRef o = c->get_object(oid); if (!o) return -ENOENT; - if (offset >= o->data.length()) + if (offset >= o->get_size()) return 0; size_t l = len; - if (offset + l > o->data.length()) - l = o->data.length() - offset; + if (offset + l > o->get_size()) + l = o->get_size() - offset; map m; m[offset] = l; ::encode(m, bl); @@ -957,7 +956,7 @@ int MemStore::_touch(coll_t cid, const ghobject_t& oid) ObjectRef o = c->get_object(oid); if (!o) { - o.reset(new Object); + o.reset(new BufferlistObject); c->object_map[oid] = o; c->object_hash[oid] = o; } @@ -980,46 +979,18 @@ int MemStore::_write(coll_t cid, const ghobject_t& oid, ObjectRef o = c->get_object(oid); if (!o) { // write implicitly creates a missing object - o.reset(new Object); + o.reset(new BufferlistObject); c->object_map[oid] = o; c->object_hash[oid] = o; } - int old_size = o->data.length(); - _write_into_bl(bl, offset, &o->data); - used_bytes += (o->data.length() - old_size); + const ssize_t old_size = o->get_size(); + o->write(offset, bl); + used_bytes += (o->get_size() - old_size); return 0; } -void MemStore::_write_into_bl(const bufferlist& src, unsigned offset, - bufferlist *dst) -{ - unsigned len = src.length(); - - // before - bufferlist newdata; - if (dst->length() >= offset) { - newdata.substr_of(*dst, 0, offset); - } else { - newdata.substr_of(*dst, 0, dst->length()); - bufferptr bp(offset - dst->length()); - bp.zero(); - newdata.append(bp); - } - - 
newdata.append(src); - - // after - if (dst->length() > offset + len) { - bufferlist tail; - tail.substr_of(*dst, offset + len, dst->length() - (offset + len)); - newdata.append(tail); - } - - dst->claim(newdata); -} - int MemStore::_zero(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len) { @@ -1043,20 +1014,10 @@ int MemStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size) ObjectRef o = c->get_object(oid); if (!o) return -ENOENT; - if (o->data.length() > size) { - bufferlist bl; - bl.substr_of(o->data, 0, size); - used_bytes -= o->data.length() - size; - o->data.claim(bl); - } else if (o->data.length() == size) { - // do nothing - } else { - bufferptr bp(size - o->data.length()); - bp.zero(); - used_bytes += bp.length(); - o->data.append(bp); - } - return 0; + const ssize_t old_size = o->get_size(); + int r = o->truncate(size); + used_bytes += (o->get_size() - old_size); + return r; } int MemStore::_remove(coll_t cid, const ghobject_t& oid) @@ -1073,7 +1034,7 @@ int MemStore::_remove(coll_t cid, const ghobject_t& oid) c->object_map.erase(oid); c->object_hash.erase(oid); - used_bytes -= o->data.length(); + used_bytes -= o->get_size(); return 0; } @@ -1142,12 +1103,12 @@ int MemStore::_clone(coll_t cid, const ghobject_t& oldoid, return -ENOENT; ObjectRef no = c->get_object(newoid); if (!no) { - no.reset(new Object); + no.reset(new BufferlistObject); c->object_map[newoid] = no; c->object_hash[newoid] = no; } - used_bytes += oo->data.length() - no->data.length(); - no->data = oo->data; + used_bytes += oo->get_size() - no->get_size(); + no->clone(oo.get(), 0, oo->get_size(), 0); no->omap_header = oo->omap_header; no->omap = oo->omap; no->xattr = oo->xattr; @@ -1172,20 +1133,18 @@ int MemStore::_clone_range(coll_t cid, const ghobject_t& oldoid, return -ENOENT; ObjectRef no = c->get_object(newoid); if (!no) { - no.reset(new Object); + no.reset(new BufferlistObject); c->object_map[newoid] = no; c->object_hash[newoid] = no; } - if (srcoff >= 
oo->data.length()) + if (srcoff >= oo->get_size()) return 0; - if (srcoff + len >= oo->data.length()) - len = oo->data.length() - srcoff; - bufferlist bl; - bl.substr_of(oo->data, srcoff, len); + if (srcoff + len >= oo->get_size()) + len = oo->get_size() - srcoff; - int old_size = no->data.length(); - _write_into_bl(bl, dstoff, &no->data); - used_bytes += (no->data.length() - old_size); + const ssize_t old_size = no->get_size(); + no->clone(oo.get(), srcoff, len, dstoff); + used_bytes += (no->get_size() - old_size); return len; } @@ -1399,3 +1358,71 @@ int MemStore::_split_collection(coll_t cid, uint32_t bits, uint32_t match, return 0; } + +// BufferlistObject +int MemStore::BufferlistObject::read(uint64_t offset, uint64_t len, + bufferlist &bl) +{ + bl.substr_of(data, offset, len); + return bl.length(); +} + +int MemStore::BufferlistObject::write(uint64_t offset, const bufferlist &src) +{ + unsigned len = src.length(); + + // before + bufferlist newdata; + if (get_size() >= offset) { + newdata.substr_of(data, 0, offset); + } else { + newdata.substr_of(data, 0, get_size()); + bufferptr bp(offset - get_size()); + bp.zero(); + newdata.append(bp); + } + + newdata.append(src); + + // after + if (get_size() > offset + len) { + bufferlist tail; + tail.substr_of(data, offset + len, get_size() - (offset + len)); + newdata.append(tail); + } + + data.claim(newdata); + return 0; +} + +int MemStore::BufferlistObject::clone(Object *src, uint64_t srcoff, + uint64_t len, uint64_t dstoff) +{ + auto srcbl = dynamic_cast(src); + if (srcbl == nullptr) + return -ENOTSUP; + + if (srcoff == dstoff && len == src->get_size()) { + data = srcbl->data; + return 0; + } + bufferlist bl; + bl.substr_of(srcbl->data, srcoff, len); + return write(dstoff, bl); +} + +int MemStore::BufferlistObject::truncate(uint64_t size) +{ + if (get_size() > size) { + bufferlist bl; + bl.substr_of(data, 0, size); + data.claim(bl); + } else if (get_size() == size) { + // do nothing + } else { + bufferptr bp(size - 
get_size()); + bp.zero(); + data.append(bp); + } + return 0; +} diff --git a/src/os/MemStore.h b/src/os/MemStore.h index a3bed1cbad1be..38159e9b7ce82 100644 --- a/src/os/MemStore.h +++ b/src/os/MemStore.h @@ -29,7 +29,6 @@ class MemStore : public ObjectStore { public: struct Object : public RefCountedObject { - bufferlist data; map xattr; bufferlist omap_header; map omap; @@ -38,24 +37,29 @@ class MemStore : public ObjectStore { friend void intrusive_ptr_add_ref(Object *o) { o->get(); } friend void intrusive_ptr_release(Object *o) { o->put(); } - void encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); - ::encode(data, bl); + // interface for object data + virtual size_t get_size() const = 0; + virtual int read(uint64_t offset, uint64_t len, bufferlist &bl) = 0; + virtual int write(uint64_t offset, const bufferlist &bl) = 0; + virtual int clone(Object *src, uint64_t srcoff, uint64_t len, + uint64_t dstoff) = 0; + virtual int truncate(uint64_t offset) = 0; + virtual void encode(bufferlist& bl) const = 0; + virtual void decode(bufferlist::iterator& p) = 0; + + void encode_base(bufferlist& bl) const { ::encode(xattr, bl); ::encode(omap_header, bl); ::encode(omap, bl); - ENCODE_FINISH(bl); } - void decode(bufferlist::iterator& p) { - DECODE_START(1, p); - ::decode(data, p); + void decode_base(bufferlist::iterator& p) { ::decode(xattr, p); ::decode(omap_header, p); ::decode(omap, p); - DECODE_FINISH(p); } + void dump(Formatter *f) const { - f->dump_int("data_len", data.length()); + f->dump_int("data_len", get_size()); f->dump_int("omap_header_len", omap_header.length()); f->open_array_section("xattrs"); @@ -83,6 +87,31 @@ class MemStore : public ObjectStore { }; typedef Object::Ref ObjectRef; + struct BufferlistObject : public Object { + bufferlist data; + + size_t get_size() const override { return data.length(); } + + int read(uint64_t offset, uint64_t len, bufferlist &bl) override; + int write(uint64_t offset, const bufferlist &bl) override; + int clone(Object 
*src, uint64_t srcoff, uint64_t len, + uint64_t dstoff) override; + int truncate(uint64_t offset) override; + + void encode(bufferlist& bl) const override { + ENCODE_START(1, 1, bl); + ::encode(data, bl); + encode_base(bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::iterator& p) override { + DECODE_START(1, p); + ::decode(data, p); + decode_base(p); + DECODE_FINISH(p); + } + }; + struct Collection : public RefCountedObject { ceph::unordered_map object_hash; ///< for lookup map object_map; ///< for iteration @@ -126,7 +155,7 @@ class MemStore : public ObjectStore { while (s--) { ghobject_t k; ::decode(k, p); - ObjectRef o(new Object); + ObjectRef o(new BufferlistObject); o->decode(p); object_map.insert(make_pair(k, o)); object_hash.insert(make_pair(k, o)); @@ -139,7 +168,7 @@ class MemStore : public ObjectStore { for (map::const_iterator p = object_map.begin(); p != object_map.end(); ++p) { - result += p->second->data.length(); + result += p->second->get_size(); } return result; From 54739a59abaf1ddf14b066f69deca2afd486283f Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Thu, 19 Jun 2014 13:04:27 -0400 Subject: [PATCH 084/654] memstore: protect object omap with a mutex Signed-off-by: Casey Bodley --- src/os/MemStore.cc | 19 +++++++++++++++++-- src/os/MemStore.h | 16 +++++++++------- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/src/os/MemStore.cc b/src/os/MemStore.cc index 788589d22cb12..d0615c536b432 100644 --- a/src/os/MemStore.cc +++ b/src/os/MemStore.cc @@ -463,6 +463,7 @@ int MemStore::omap_get( ObjectRef o = c->get_object(oid); if (!o) return -ENOENT; + std::lock_guard lock(o->omap_mutex); *header = o->omap_header; *out = o->omap; return 0; @@ -484,6 +485,7 @@ int MemStore::omap_get_header( ObjectRef o = c->get_object(oid); if (!o) return -ENOENT; + std::lock_guard lock(o->omap_mutex); *header = o->omap_header; return 0; } @@ -503,6 +505,7 @@ int MemStore::omap_get_keys( ObjectRef o = c->get_object(oid); if (!o) return -ENOENT; + 
std::lock_guard lock(o->omap_mutex); for (map::iterator p = o->omap.begin(); p != o->omap.end(); ++p) @@ -526,6 +529,7 @@ int MemStore::omap_get_values( ObjectRef o = c->get_object(oid); if (!o) return -ENOENT; + std::lock_guard lock(o->omap_mutex); for (set::const_iterator p = keys.begin(); p != keys.end(); ++p) { @@ -552,6 +556,7 @@ int MemStore::omap_check_keys( ObjectRef o = c->get_object(oid); if (!o) return -ENOENT; + std::lock_guard lock(o->omap_mutex); for (set::const_iterator p = keys.begin(); p != keys.end(); ++p) { @@ -1109,6 +1114,12 @@ int MemStore::_clone(coll_t cid, const ghobject_t& oldoid, } used_bytes += oo->get_size() - no->get_size(); no->clone(oo.get(), 0, oo->get_size(), 0); + + // take both omap locks with std::lock() + std::unique_lock oo_lock(oo->omap_mutex, std::defer_lock), + no_lock(no->omap_mutex, std::defer_lock); + std::lock(oo_lock, no_lock); + no->omap_header = oo->omap_header; no->omap = oo->omap; no->xattr = oo->xattr; @@ -1160,6 +1171,7 @@ int MemStore::_omap_clear(coll_t cid, const ghobject_t &oid) ObjectRef o = c->get_object(oid); if (!o) return -ENOENT; + std::lock_guard lock(o->omap_mutex); o->omap.clear(); o->omap_header.clear(); return 0; @@ -1177,6 +1189,7 @@ int MemStore::_omap_setkeys(coll_t cid, const ghobject_t &oid, ObjectRef o = c->get_object(oid); if (!o) return -ENOENT; + std::lock_guard lock(o->omap_mutex); for (map::const_iterator p = aset.begin(); p != aset.end(); ++p) o->omap[p->first] = p->second; return 0; @@ -1194,6 +1207,7 @@ int MemStore::_omap_rmkeys(coll_t cid, const ghobject_t &oid, ObjectRef o = c->get_object(oid); if (!o) return -ENOENT; + std::lock_guard lock(o->omap_mutex); for (set::const_iterator p = keys.begin(); p != keys.end(); ++p) o->omap.erase(*p); return 0; @@ -1212,10 +1226,10 @@ int MemStore::_omap_rmkeyrange(coll_t cid, const ghobject_t &oid, ObjectRef o = c->get_object(oid); if (!o) return -ENOENT; + std::lock_guard lock(o->omap_mutex); map::iterator p = o->omap.lower_bound(first); 
map::iterator e = o->omap.lower_bound(last); - while (p != e) - o->omap.erase(p++); + o->omap.erase(p, e); return 0; } @@ -1231,6 +1245,7 @@ int MemStore::_omap_setheader(coll_t cid, const ghobject_t &oid, ObjectRef o = c->get_object(oid); if (!o) return -ENOENT; + std::lock_guard lock(o->omap_mutex); o->omap_header = bl; return 0; } diff --git a/src/os/MemStore.h b/src/os/MemStore.h index 38159e9b7ce82..d06f331c0687b 100644 --- a/src/os/MemStore.h +++ b/src/os/MemStore.h @@ -16,6 +16,7 @@ #ifndef CEPH_MEMSTORE_H #define CEPH_MEMSTORE_H +#include #include #include "include/assert.h" @@ -29,6 +30,7 @@ class MemStore : public ObjectStore { public: struct Object : public RefCountedObject { + std::mutex omap_mutex; map xattr; bufferlist omap_header; map omap; @@ -188,35 +190,35 @@ class MemStore : public ObjectStore { : c(c), o(o), it(o->omap.begin()) {} int seek_to_first() { - RWLock::RLocker l(c->lock); + std::lock_guard(o->omap_mutex); it = o->omap.begin(); return 0; } int upper_bound(const string &after) { - RWLock::RLocker l(c->lock); + std::lock_guard(o->omap_mutex); it = o->omap.upper_bound(after); return 0; } int lower_bound(const string &to) { - RWLock::RLocker l(c->lock); + std::lock_guard(o->omap_mutex); it = o->omap.lower_bound(to); return 0; } bool valid() { - RWLock::RLocker l(c->lock); + std::lock_guard(o->omap_mutex); return it != o->omap.end(); } int next() { - RWLock::RLocker l(c->lock); + std::lock_guard(o->omap_mutex); ++it; return 0; } string key() { - RWLock::RLocker l(c->lock); + std::lock_guard(o->omap_mutex); return it->first; } bufferlist value() { - RWLock::RLocker l(c->lock); + std::lock_guard(o->omap_mutex); return it->second; } int status() { From 61cd2da27e1618d99383f3fd9c8a9e1d11bcfcf7 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Tue, 18 Aug 2015 16:21:28 -0400 Subject: [PATCH 085/654] memstore: protect object xattrs with a mutex Signed-off-by: Casey Bodley --- src/os/MemStore.cc | 19 ++++++++++++++----- src/os/MemStore.h | 1 + 2 
files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/os/MemStore.cc b/src/os/MemStore.cc index d0615c536b432..24470dcc4318b 100644 --- a/src/os/MemStore.cc +++ b/src/os/MemStore.cc @@ -367,6 +367,7 @@ int MemStore::getattr(coll_t cid, const ghobject_t& oid, if (!o) return -ENOENT; string k(name); + std::lock_guard lock(o->xattr_mutex); if (!o->xattr.count(k)) { return -ENODATA; } @@ -386,6 +387,7 @@ int MemStore::getattrs(coll_t cid, const ghobject_t& oid, ObjectRef o = c->get_object(oid); if (!o) return -ENOENT; + std::lock_guard lock(o->xattr_mutex); aset = o->xattr; return 0; } @@ -1056,6 +1058,7 @@ int MemStore::_setattrs(coll_t cid, const ghobject_t& oid, ObjectRef o = c->get_object(oid); if (!o) return -ENOENT; + std::lock_guard lock(o->xattr_mutex); for (map::const_iterator p = aset.begin(); p != aset.end(); ++p) o->xattr[p->first] = p->second; return 0; @@ -1072,9 +1075,11 @@ int MemStore::_rmattr(coll_t cid, const ghobject_t& oid, const char *name) ObjectRef o = c->get_object(oid); if (!o) return -ENOENT; - if (!o->xattr.count(name)) + std::lock_guard lock(o->xattr_mutex); + auto i = o->xattr.find(name); + if (i == o->xattr.end()) return -ENODATA; - o->xattr.erase(name); + o->xattr.erase(i); return 0; } @@ -1089,6 +1094,7 @@ int MemStore::_rmattrs(coll_t cid, const ghobject_t& oid) ObjectRef o = c->get_object(oid); if (!o) return -ENOENT; + std::lock_guard lock(o->xattr_mutex); o->xattr.clear(); return 0; } @@ -1115,10 +1121,13 @@ int MemStore::_clone(coll_t cid, const ghobject_t& oldoid, used_bytes += oo->get_size() - no->get_size(); no->clone(oo.get(), 0, oo->get_size(), 0); - // take both omap locks with std::lock() - std::unique_lock oo_lock(oo->omap_mutex, std::defer_lock), + // take xattr and omap locks with std::lock() + std::unique_lock + ox_lock(oo->xattr_mutex, std::defer_lock), + nx_lock(no->xattr_mutex, std::defer_lock), + oo_lock(oo->omap_mutex, std::defer_lock), no_lock(no->omap_mutex, std::defer_lock); - std::lock(oo_lock, 
no_lock); + std::lock(ox_lock, nx_lock, oo_lock, no_lock); no->omap_header = oo->omap_header; no->omap = oo->omap; diff --git a/src/os/MemStore.h b/src/os/MemStore.h index d06f331c0687b..0aeacca4d18d2 100644 --- a/src/os/MemStore.h +++ b/src/os/MemStore.h @@ -30,6 +30,7 @@ class MemStore : public ObjectStore { public: struct Object : public RefCountedObject { + std::mutex xattr_mutex; std::mutex omap_mutex; map xattr; bufferlist omap_header; From 46f92f0d57cdae7c2fb105ac7d07b063e48c80f0 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Wed, 19 Aug 2015 10:06:04 -0400 Subject: [PATCH 086/654] memstore: BufferlistObject uses spinlock for data Signed-off-by: Casey Bodley --- src/os/MemStore.cc | 19 +++++++++++++------ src/os/MemStore.h | 2 ++ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/os/MemStore.cc b/src/os/MemStore.cc index 24470dcc4318b..68db70569ec5d 100644 --- a/src/os/MemStore.cc +++ b/src/os/MemStore.cc @@ -1387,6 +1387,7 @@ int MemStore::_split_collection(coll_t cid, uint32_t bits, uint32_t match, int MemStore::BufferlistObject::read(uint64_t offset, uint64_t len, bufferlist &bl) { + std::lock_guard lock(mutex); bl.substr_of(data, offset, len); return bl.length(); } @@ -1395,6 +1396,8 @@ int MemStore::BufferlistObject::write(uint64_t offset, const bufferlist &src) { unsigned len = src.length(); + std::lock_guard lock(mutex); + // before bufferlist newdata; if (get_size() >= offset) { @@ -1422,21 +1425,25 @@ int MemStore::BufferlistObject::write(uint64_t offset, const bufferlist &src) int MemStore::BufferlistObject::clone(Object *src, uint64_t srcoff, uint64_t len, uint64_t dstoff) { - auto srcbl = dynamic_cast(src); + auto srcbl = dynamic_cast(src); if (srcbl == nullptr) return -ENOTSUP; - if (srcoff == dstoff && len == src->get_size()) { - data = srcbl->data; - return 0; - } bufferlist bl; - bl.substr_of(srcbl->data, srcoff, len); + { + std::lock_guard lock(srcbl->mutex); + if (srcoff == dstoff && len == src->get_size()) { + data = 
srcbl->data; + return 0; + } + bl.substr_of(srcbl->data, srcoff, len); + } return write(dstoff, bl); } int MemStore::BufferlistObject::truncate(uint64_t size) { + std::lock_guard lock(mutex); if (get_size() > size) { bufferlist bl; bl.substr_of(data, 0, size); diff --git a/src/os/MemStore.h b/src/os/MemStore.h index 0aeacca4d18d2..21da3121d3a28 100644 --- a/src/os/MemStore.h +++ b/src/os/MemStore.h @@ -22,6 +22,7 @@ #include "include/assert.h" #include "include/unordered_map.h" #include "include/memory.h" +#include "include/Spinlock.h" #include "common/Finisher.h" #include "common/RefCountedObj.h" #include "common/RWLock.h" @@ -91,6 +92,7 @@ class MemStore : public ObjectStore { typedef Object::Ref ObjectRef; struct BufferlistObject : public Object { + Spinlock mutex; bufferlist data; size_t get_size() const override { return data.length(); } From 79454822d7665dd5ded97dda6b651546f0faf9dc Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Tue, 18 Aug 2015 16:22:12 -0400 Subject: [PATCH 087/654] memstore: move collection lock into get_object only hold the collection lock while accessing the hash/object maps, don't use it to serialize entire operations Signed-off-by: Casey Bodley --- src/os/MemStore.cc | 34 ++++------------------------------ src/os/MemStore.h | 3 ++- 2 files changed, 6 insertions(+), 31 deletions(-) diff --git a/src/os/MemStore.cc b/src/os/MemStore.cc index 68db70569ec5d..1f51f0e50f34c 100644 --- a/src/os/MemStore.cc +++ b/src/os/MemStore.cc @@ -271,7 +271,6 @@ bool MemStore::exists(coll_t cid, const ghobject_t& oid) CollectionRef c = get_collection(cid); if (!c) return false; - RWLock::RLocker l(c->lock); // Perform equivalent of c->get_object_(oid) != NULL. In C++11 the // shared_ptr needs to be compared to nullptr. 
@@ -288,7 +287,6 @@ int MemStore::stat( CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::RLocker l(c->lock); ObjectRef o = c->get_object(oid); if (!o) @@ -314,7 +312,6 @@ int MemStore::read( CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::RLocker lc(c->lock); ObjectRef o = c->get_object(oid); if (!o) @@ -338,7 +335,6 @@ int MemStore::fiemap(coll_t cid, const ghobject_t& oid, CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::RLocker lc(c->lock); ObjectRef o = c->get_object(oid); if (!o) @@ -361,7 +357,6 @@ int MemStore::getattr(coll_t cid, const ghobject_t& oid, CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::RLocker l(c->lock); ObjectRef o = c->get_object(oid); if (!o) @@ -382,7 +377,6 @@ int MemStore::getattrs(coll_t cid, const ghobject_t& oid, CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::RLocker l(c->lock); ObjectRef o = c->get_object(oid); if (!o) @@ -460,7 +454,6 @@ int MemStore::omap_get( CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::RLocker l(c->lock); ObjectRef o = c->get_object(oid); if (!o) @@ -482,7 +475,6 @@ int MemStore::omap_get_header( CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::RLocker l(c->lock); ObjectRef o = c->get_object(oid); if (!o) @@ -502,7 +494,6 @@ int MemStore::omap_get_keys( CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::RLocker l(c->lock); ObjectRef o = c->get_object(oid); if (!o) @@ -526,7 +517,6 @@ int MemStore::omap_get_values( CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::RLocker l(c->lock); ObjectRef o = c->get_object(oid); if (!o) @@ -553,7 +543,6 @@ int MemStore::omap_check_keys( CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::RLocker l(c->lock); ObjectRef o = c->get_object(oid); if (!o) @@ -576,7 +565,6 @@ ObjectMap::ObjectMapIterator MemStore::get_omap_iterator(coll_t cid, 
CollectionRef c = get_collection(cid); if (!c) return ObjectMap::ObjectMapIterator(); - RWLock::RLocker l(c->lock); ObjectRef o = c->get_object(oid); if (!o) @@ -959,7 +947,6 @@ int MemStore::_touch(coll_t cid, const ghobject_t& oid) CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::WLocker l(c->lock); ObjectRef o = c->get_object(oid); if (!o) { @@ -981,7 +968,6 @@ int MemStore::_write(coll_t cid, const ghobject_t& oid, CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::WLocker l(c->lock); ObjectRef o = c->get_object(oid); if (!o) { @@ -1016,7 +1002,6 @@ int MemStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size) CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::WLocker l(c->lock); ObjectRef o = c->get_object(oid); if (!o) @@ -1035,13 +1020,12 @@ int MemStore::_remove(coll_t cid, const ghobject_t& oid) return -ENOENT; RWLock::WLocker l(c->lock); - ObjectRef o = c->get_object(oid); - if (!o) + auto i = c->object_hash.find(oid); + if (i == c->object_hash.end()) return -ENOENT; + c->object_hash.erase(i); c->object_map.erase(oid); - c->object_hash.erase(oid); - - used_bytes -= o->get_size(); + used_bytes -= i->second->get_size(); return 0; } @@ -1053,7 +1037,6 @@ int MemStore::_setattrs(coll_t cid, const ghobject_t& oid, CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::WLocker l(c->lock); ObjectRef o = c->get_object(oid); if (!o) @@ -1070,7 +1053,6 @@ int MemStore::_rmattr(coll_t cid, const ghobject_t& oid, const char *name) CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::WLocker l(c->lock); ObjectRef o = c->get_object(oid); if (!o) @@ -1089,7 +1071,6 @@ int MemStore::_rmattrs(coll_t cid, const ghobject_t& oid) CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::WLocker l(c->lock); ObjectRef o = c->get_object(oid); if (!o) @@ -1107,7 +1088,6 @@ int MemStore::_clone(coll_t cid, const ghobject_t& oldoid, CollectionRef c = 
get_collection(cid); if (!c) return -ENOENT; - RWLock::WLocker l(c->lock); ObjectRef oo = c->get_object(oldoid); if (!oo) @@ -1146,7 +1126,6 @@ int MemStore::_clone_range(coll_t cid, const ghobject_t& oldoid, CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::WLocker l(c->lock); ObjectRef oo = c->get_object(oldoid); if (!oo) @@ -1175,7 +1154,6 @@ int MemStore::_omap_clear(coll_t cid, const ghobject_t &oid) CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::WLocker l(c->lock); ObjectRef o = c->get_object(oid); if (!o) @@ -1193,7 +1171,6 @@ int MemStore::_omap_setkeys(coll_t cid, const ghobject_t &oid, CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::WLocker l(c->lock); ObjectRef o = c->get_object(oid); if (!o) @@ -1211,7 +1188,6 @@ int MemStore::_omap_rmkeys(coll_t cid, const ghobject_t &oid, CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::WLocker l(c->lock); ObjectRef o = c->get_object(oid); if (!o) @@ -1230,7 +1206,6 @@ int MemStore::_omap_rmkeyrange(coll_t cid, const ghobject_t &oid, CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::WLocker l(c->lock); ObjectRef o = c->get_object(oid); if (!o) @@ -1249,7 +1224,6 @@ int MemStore::_omap_setheader(coll_t cid, const ghobject_t &oid, CollectionRef c = get_collection(cid); if (!c) return -ENOENT; - RWLock::WLocker l(c->lock); ObjectRef o = c->get_object(oid); if (!o) diff --git a/src/os/MemStore.h b/src/os/MemStore.h index 21da3121d3a28..d1edc2acf330c 100644 --- a/src/os/MemStore.h +++ b/src/os/MemStore.h @@ -133,7 +133,8 @@ class MemStore : public ObjectStore { // level. 
ObjectRef get_object(ghobject_t oid) { - ceph::unordered_map::iterator o = object_hash.find(oid); + RWLock::RLocker l(lock); + auto o = object_hash.find(oid); if (o == object_hash.end()) return ObjectRef(); return o->second; From b0882fb682d2fc2a3fded46188eb5779e2c783d9 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Wed, 19 Aug 2015 09:59:59 -0400 Subject: [PATCH 088/654] memstore: replace apply_lock with sequencer Signed-off-by: Casey Bodley --- src/os/MemStore.cc | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/os/MemStore.cc b/src/os/MemStore.cc index 1f51f0e50f34c..0fff54506ce9b 100644 --- a/src/os/MemStore.cc +++ b/src/os/MemStore.cc @@ -581,8 +581,22 @@ int MemStore::queue_transactions(Sequencer *osr, TrackedOpRef op, ThreadPool::TPHandle *handle) { - // fixme: ignore the Sequencer and serialize everything. - Mutex::Locker l(apply_lock); + // because memstore operations are synchronous, we can implement the + // Sequencer with a mutex. this guarantees ordering on a given sequencer, + // while allowing operations on different sequencers to happen in parallel + struct OpSequencer : public Sequencer_impl { + std::mutex mutex; + void flush() override {} + bool flush_commit(Context*) override { return true; } + }; + + std::unique_lock lock; + if (osr) { + auto seq = reinterpret_cast(&osr->p); + if (*seq == nullptr) + *seq = new OpSequencer; + lock = std::unique_lock((*seq)->mutex); + } for (list::iterator p = tls.begin(); p != tls.end(); ++p) { // poke the TPHandle heartbeat just to exercise that code path From 51d2553f5818788d88f664e610cb848681113ee8 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Mon, 3 Aug 2015 15:00:03 -0400 Subject: [PATCH 089/654] memstore: add PageSet for MemStore object data introduce class PageSet as an alternative to bufferlist for storing object data PageSet uses an avl set to manage its buffers, enabling lookups in logarithmic time. 
this approach also allows for sparse objects the main PageSet operations are get_range(), alloc_range(), and free_pages_after(). get_range() returns a vector containing all allocated pages that intersect the given range. alloc_range() does the same, but allocates pages for any holes in the range. free_pages_after() is for the truncate operation, and frees all allocated pages after the page containing the given offset PageSet uses a spinlock to provide mutual exclusion on the avl set itself, but not for i/o operations on its pages; that's why the interface returns pages in a vector, rather than as iterators into its internal avl set. the pages themselves are reference counted to avoid races between get/alloc_range() and free_pages_after() Signed-off-by: Casey Bodley --- src/os/Makefile.am | 1 + src/os/PageSet.h | 227 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 228 insertions(+) create mode 100644 src/os/PageSet.h diff --git a/src/os/Makefile.am b/src/os/Makefile.am index ba80fd356db83..769a976deeecf 100644 --- a/src/os/Makefile.am +++ b/src/os/Makefile.am @@ -70,6 +70,7 @@ noinst_HEADERS += \ os/KeyValueStore.h \ os/ObjectMap.h \ os/ObjectStore.h \ + os/PageSet.h \ os/SequencerPosition.h \ os/WBThrottle.h \ os/XfsFileStoreBackend.h \ diff --git a/src/os/PageSet.h b/src/os/PageSet.h new file mode 100644 index 0000000000000..f011fbdd6bf00 --- /dev/null +++ b/src/os/PageSet.h @@ -0,0 +1,227 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013- Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_PAGESET_H +#define CEPH_PAGESET_H + +#include +#include +#include +#include +#include +#include +#include + +#include "include/encoding.h" +#include "include/Spinlock.h" + + +struct Page { + char *const data; + boost::intrusive::avl_set_member_hook<> hook; + uint64_t offset; + + // avoid RefCountedObject because it has a virtual destructor + std::atomic nrefs; + void get() { ++nrefs; } + void put() { if (--nrefs == 0) delete this; } + + typedef boost::intrusive_ptr Ref; + friend void intrusive_ptr_add_ref(Page *p) { p->get(); } + friend void intrusive_ptr_release(Page *p) { p->put(); } + + // key-value comparison functor for avl + struct Less { + bool operator()(uint64_t offset, const Page &page) const { + return offset < page.offset; + } + bool operator()(const Page &page, uint64_t offset) const { + return page.offset < offset; + } + bool operator()(const Page &lhs, const Page &rhs) const { + return lhs.offset < rhs.offset; + } + }; + void encode(bufferlist &bl, size_t page_size) const { + bl.append(buffer::copy(data, page_size)); + ::encode(offset, bl); + } + void decode(bufferlist::iterator &p, size_t page_size) { + ::decode_array_nohead(data, page_size, p); + ::decode(offset, p); + } + + static Ref create(size_t page_size, uint64_t offset = 0) { + // allocate the Page and its data in a single buffer + auto buffer = new char[page_size + sizeof(Page)]; + // place the Page structure at the end of the buffer + return new (buffer + page_size) Page(buffer, offset); + } + + // copy disabled + Page(const Page&) = delete; + const Page& operator=(const Page&) = delete; + + private: // private constructor, use create() instead + Page(char *data, uint64_t offset) : data(data), offset(offset), nrefs(1) {} + + static void operator delete(void *p) { + delete[] reinterpret_cast(p)->data; + } +}; + +class PageSet { + public: + // alloc_range() and get_range() return page refs in a vector + typedef std::vector page_vector; + + private: + // store pages 
in a boost intrusive avl_set + typedef Page::Less page_cmp; + typedef boost::intrusive::member_hook, + &Page::hook> member_option; + typedef boost::intrusive::avl_set, member_option> page_set; + + typedef typename page_set::iterator iterator; + + page_set pages; + size_t page_size; + + typedef Spinlock lock_type; + lock_type mutex; + + void free_pages(iterator cur, iterator end) { + while (cur != end) { + Page *page = &*cur; + cur = pages.erase(cur); + page->put(); + } + } + + int count_pages(uint64_t offset, uint64_t len) const { + // count the overlapping pages + int count = 0; + if (offset % page_size) { + count++; + size_t rem = page_size - offset % page_size; + len = len <= rem ? 0 : len - rem; + } + count += len / page_size; + if (len % page_size) + count++; + return count; + } + + public: + PageSet(size_t page_size) : page_size(page_size) {} + PageSet(PageSet &&rhs) + : pages(std::move(rhs.pages)), page_size(rhs.page_size) {} + ~PageSet() { + free_pages(pages.begin(), pages.end()); + } + + // disable copy + PageSet(const PageSet&) = delete; + const PageSet& operator=(const PageSet&) = delete; + + bool empty() const { return pages.empty(); } + size_t size() const { return pages.size(); } + size_t get_page_size() const { return page_size; } + + // allocate all pages that intersect the range [offset,length) + void alloc_range(uint64_t offset, uint64_t length, page_vector &range) { + // loop in reverse so we can provide hints to avl_set::insert_check() + // and get O(1) insertions after the first + uint64_t position = offset + length - 1; + + range.resize(count_pages(offset, length)); + auto out = range.rbegin(); + + std::lock_guard lock(mutex); + iterator cur = pages.end(); + while (length) { + const uint64_t page_offset = position & ~(page_size-1); + + typename page_set::insert_commit_data commit; + auto insert = pages.insert_check(cur, page_offset, page_cmp(), commit); + if (insert.second) { + auto page = Page::create(page_size, page_offset); + cur = 
pages.insert_commit(*page, commit); + + // assume that the caller will write to the range [offset,length), + // so we only need to zero memory outside of this range + + // zero end of page past offset + length + if (offset + length < page->offset + page_size) + std::fill(page->data + offset + length - page->offset, + page->data + page_size, 0); + // zero front of page between page_offset and offset + if (offset > page->offset) + std::fill(page->data, page->data + offset - page->offset, 0); + } else { // exists + cur = insert.first; + } + // add a reference to output vector + out->reset(&*cur); + ++out; + + auto c = std::min(length, (position & (page_size-1)) + 1); + position -= c; + length -= c; + } + // make sure we sized the vector correctly + assert(out == range.rend()); + } + + // return all allocated pages that intersect the range [offset,length) + void get_range(uint64_t offset, uint64_t length, page_vector &range) { + auto cur = pages.lower_bound(offset & ~(page_size-1), page_cmp()); + while (cur != pages.end() && cur->offset < offset + length) + range.push_back(&*cur++); + } + + void free_pages_after(uint64_t offset) { + std::lock_guard lock(mutex); + auto cur = pages.lower_bound(offset & ~(page_size-1), page_cmp()); + if (cur == pages.end()) + return; + if (cur->offset < offset) + cur++; + free_pages(cur, pages.end()); + } + + void encode(bufferlist &bl) const { + ::encode(page_size, bl); + unsigned count = pages.size(); + ::encode(count, bl); + for (auto p = pages.rbegin(); p != pages.rend(); ++p) + p->encode(bl, page_size); + } + void decode(bufferlist::iterator &p) { + assert(empty()); + ::decode(page_size, p); + unsigned count; + ::decode(count, p); + auto cur = pages.end(); + for (unsigned i = 0; i < count; i++) { + auto page = Page::create(page_size); + page->decode(p, page_size); + cur = pages.insert_before(cur, *page); + } + } +}; + +#endif // CEPH_PAGESET_H From dd7fe61e61fc6b44edb3735471c96a1e17fb644e Mon Sep 17 00:00:00 2001 From: Casey Bodley 
Date: Mon, 3 Aug 2015 16:45:07 -0400 Subject: [PATCH 090/654] memstore: add unit test for PageSet Signed-off-by: Casey Bodley --- src/test/CMakeLists.txt | 6 + src/test/Makefile-server.am | 5 + src/test/test_pageset.cc | 271 ++++++++++++++++++++++++++++++++++++ 3 files changed, 282 insertions(+) create mode 100644 src/test/test_pageset.cc diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 29151a40c8a70..3ffbc29f21171 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -905,6 +905,12 @@ target_link_libraries(unittest_subprocess set_target_properties(unittest_subprocess PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) +# unittest_pageset +add_executable(unittest_pageset test_pageset.cc) +target_link_libraries(unittest_pageset ${UNITTEST_LIBS}) +set_target_properties(unittest_pageset PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + if(${WITH_RADOSGW}) # test_cors set(test_cors_srcs test_cors.cc) diff --git a/src/test/Makefile-server.am b/src/test/Makefile-server.am index a10f146f80ddd..bb3ce82fe7ee5 100644 --- a/src/test/Makefile-server.am +++ b/src/test/Makefile-server.am @@ -200,6 +200,11 @@ ceph_test_snap_mapper_LDADD = $(LIBOSD) $(UNITTEST_LDADD) $(CEPH_GLOBAL) ceph_test_snap_mapper_CXXFLAGS = $(UNITTEST_CXXFLAGS) bin_DEBUGPROGRAMS += ceph_test_snap_mapper +unittest_pageset_SOURCES = test/test_pageset.cc +unittest_pageset_LDADD = $(UNITTEST_LDADD) +unittest_pageset_CXXFLAGS = $(UNITTEST_CXXFLAGS) +check_TESTPROGRAMS += unittest_pageset + endif # WITH_OSD if WITH_SLIBROCKSDB diff --git a/src/test/test_pageset.cc b/src/test/test_pageset.cc new file mode 100644 index 0000000000000..c105af74165c2 --- /dev/null +++ b/src/test/test_pageset.cc @@ -0,0 +1,271 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include "gtest/gtest.h" + +#include "os/PageSet.h" + +TEST(PageSet, AllocAligned) +{ + PageSet pages(1); + PageSet::page_vector range; + + pages.alloc_range(0, 4, range); + 
ASSERT_EQ(4u, range.size()); + ASSERT_EQ(0u, range[0]->offset); + ASSERT_EQ(1u, range[1]->offset); + ASSERT_EQ(2u, range[2]->offset); + ASSERT_EQ(3u, range[3]->offset); +} + +TEST(PageSet, AllocUnaligned) +{ + PageSet pages(2); + PageSet::page_vector range; + + // front of first page + pages.alloc_range(0, 1, range); + ASSERT_EQ(1u, range.size()); + ASSERT_EQ(0u, range[0]->offset); + range.clear(); + + // back of first page + pages.alloc_range(1, 1, range); + ASSERT_EQ(1u, range.size()); + ASSERT_EQ(0u, range[0]->offset); + range.clear(); + + // back of first page and front of second + pages.alloc_range(1, 2, range); + ASSERT_EQ(2u, range.size()); + ASSERT_EQ(0u, range[0]->offset); + ASSERT_EQ(2u, range[1]->offset); + range.clear(); + + // back of first page and all of second + pages.alloc_range(1, 3, range); + ASSERT_EQ(2u, range.size()); + ASSERT_EQ(0u, range[0]->offset); + ASSERT_EQ(2u, range[1]->offset); + range.clear(); + + // back of first page, all of second, and front of third + pages.alloc_range(1, 4, range); + ASSERT_EQ(3u, range.size()); + ASSERT_EQ(0u, range[0]->offset); + ASSERT_EQ(2u, range[1]->offset); + ASSERT_EQ(4u, range[2]->offset); +} + +TEST(PageSet, GetAligned) +{ + // allocate 4 pages + PageSet pages(1); + PageSet::page_vector range; + pages.alloc_range(0, 4, range); + range.clear(); + + // get first page + pages.get_range(0, 1, range); + ASSERT_EQ(1u, range.size()); + ASSERT_EQ(0u, range[0]->offset); + range.clear(); + + // get second and third pages + pages.get_range(1, 2, range); + ASSERT_EQ(2u, range.size()); + ASSERT_EQ(1u, range[0]->offset); + ASSERT_EQ(2u, range[1]->offset); + range.clear(); + + // get all four pages + pages.get_range(0, 4, range); + ASSERT_EQ(4u, range.size()); + ASSERT_EQ(0u, range[0]->offset); + ASSERT_EQ(1u, range[1]->offset); + ASSERT_EQ(2u, range[2]->offset); + ASSERT_EQ(3u, range[3]->offset); + range.clear(); +} + +TEST(PageSet, GetUnaligned) +{ + // allocate 3 pages + PageSet pages(2); + PageSet::page_vector 
range; + pages.alloc_range(0, 6, range); + range.clear(); + + // front of first page + pages.get_range(0, 1, range); + ASSERT_EQ(1u, range.size()); + ASSERT_EQ(0u, range[0]->offset); + range.clear(); + + // back of first page + pages.get_range(1, 1, range); + ASSERT_EQ(1u, range.size()); + ASSERT_EQ(0u, range[0]->offset); + range.clear(); + + // back of first page and front of second + pages.get_range(1, 2, range); + ASSERT_EQ(2u, range.size()); + ASSERT_EQ(0u, range[0]->offset); + ASSERT_EQ(2u, range[1]->offset); + range.clear(); + + // back of first page and all of second + pages.get_range(1, 3, range); + ASSERT_EQ(2u, range.size()); + ASSERT_EQ(0u, range[0]->offset); + ASSERT_EQ(2u, range[1]->offset); + range.clear(); + + // back of first page, all of second, and front of third + pages.get_range(1, 4, range); + ASSERT_EQ(3u, range.size()); + ASSERT_EQ(0u, range[0]->offset); + ASSERT_EQ(2u, range[1]->offset); + ASSERT_EQ(4u, range[2]->offset); + range.clear(); + + // back of third page with nothing beyond + pages.get_range(5, 999, range); + ASSERT_EQ(1u, range.size()); + ASSERT_EQ(4u, range[0]->offset); + range.clear(); +} + +TEST(PageSet, GetHoles) +{ + // allocate pages at offsets 1, 2, 5, and 7 + PageSet pages(1); + PageSet::page_vector range; + for (uint64_t i : {1, 2, 5, 7}) + pages.alloc_range(i, 1, range); + range.clear(); + + // nothing at offset 0, page at offset 1 + pages.get_range(0, 2, range); + ASSERT_EQ(1u, range.size()); + ASSERT_EQ(1u, range[0]->offset); + range.clear(); + + // nothing at offset 0, pages at offset 1 and 2, nothing at offset 3 + pages.get_range(0, 4, range); + ASSERT_EQ(2u, range.size()); + ASSERT_EQ(1u, range[0]->offset); + ASSERT_EQ(2u, range[1]->offset); + range.clear(); + + // page at offset 2, nothing at offset 3 or 4 + pages.get_range(2, 3, range); + ASSERT_EQ(1u, range.size()); + ASSERT_EQ(2u, range[0]->offset); + range.clear(); + + // get the full range + pages.get_range(0, 999, range); + ASSERT_EQ(4u, range.size()); + 
ASSERT_EQ(1u, range[0]->offset); + ASSERT_EQ(2u, range[1]->offset); + ASSERT_EQ(5u, range[2]->offset); + ASSERT_EQ(7u, range[3]->offset); + range.clear(); +} + +TEST(PageSet, FreeAligned) +{ + // allocate 4 pages + PageSet pages(1); + PageSet::page_vector range; + pages.alloc_range(0, 4, range); + range.clear(); + + // get the full range + pages.get_range(0, 4, range); + ASSERT_EQ(4u, range.size()); + range.clear(); + + // free after offset 4 has no effect + pages.free_pages_after(4); + pages.get_range(0, 4, range); + ASSERT_EQ(4u, range.size()); + range.clear(); + + // free page 4 + pages.free_pages_after(3); + pages.get_range(0, 4, range); + ASSERT_EQ(3u, range.size()); + range.clear(); + + // free pages 2 and 3 + pages.free_pages_after(1); + pages.get_range(0, 4, range); + ASSERT_EQ(1u, range.size()); + range.clear(); +} + +TEST(PageSet, FreeUnaligned) +{ + // allocate 4 pages + PageSet pages(2); + PageSet::page_vector range; + pages.alloc_range(0, 8, range); + range.clear(); + + // get the full range + pages.get_range(0, 8, range); + ASSERT_EQ(4u, range.size()); + range.clear(); + + // free after offset 7 has no effect + pages.free_pages_after(7); + pages.get_range(0, 8, range); + ASSERT_EQ(4u, range.size()); + range.clear(); + + // free page 4 + pages.free_pages_after(5); + pages.get_range(0, 8, range); + ASSERT_EQ(3u, range.size()); + range.clear(); + + // free pages 2 and 3 + pages.free_pages_after(1); + pages.get_range(0, 8, range); + ASSERT_EQ(1u, range.size()); + range.clear(); +} + +TEST(PageSet, FreeHoles) +{ + // allocate pages at offsets 1, 2, 5, and 7 + PageSet pages(1); + PageSet::page_vector range; + for (uint64_t i : {1, 2, 5, 7}) + pages.alloc_range(i, 1, range); + range.clear(); + + // get the full range + pages.get_range(0, 8, range); + ASSERT_EQ(4u, range.size()); + range.clear(); + + // free page 7 + pages.free_pages_after(6); + pages.get_range(0, 8, range); + ASSERT_EQ(3u, range.size()); + range.clear(); + + // free page 5 + 
pages.free_pages_after(3); + pages.get_range(0, 8, range); + ASSERT_EQ(2u, range.size()); + range.clear(); + + // free pages 1 and 2 + pages.free_pages_after(0); + pages.get_range(0, 8, range); + ASSERT_EQ(0u, range.size()); +} From 97aed59fb9e865d30d31d2b7f4e93fc9727c96fa Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Wed, 26 Aug 2015 15:08:18 -0700 Subject: [PATCH 091/654] rgw: delete finisher only after finalizing watches Fixes: #12208 The watch error path might try to schedule a finisher work, delete finisher only after watch destruction. Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_rados.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index b5c359fa941e1..cc0481d45452c 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -1432,11 +1432,17 @@ void RGWRados::finalize() { if (finisher) { finisher->stop(); - delete finisher; } if (need_watch_notify()) { finalize_watch(); } + if (finisher) { + /* delete finisher only after cleaning up watches, as watch error path might call + * into finisher. We stop finisher before finalizing watch to make sure we don't + * actually handle any racing work + */ + delete finisher; + } delete meta_mgr; delete data_log; if (use_gc_thread) { From 71a0a029f8839b69470f9d1f082ec837b34a3dd4 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 23 Apr 2015 16:45:50 -0700 Subject: [PATCH 092/654] debian: create ceph user and group Use Debian base-passwd allocated UID/GID pair. 
Signed-off-by: Sage Weil --- debian/ceph-common.postinst | 90 ++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 debian/ceph-common.postinst diff --git a/debian/ceph-common.postinst b/debian/ceph-common.postinst new file mode 100644 index 0000000000000..896dbeb7ccb89 --- /dev/null +++ b/debian/ceph-common.postinst @@ -0,0 +1,90 @@ +#!/bin/sh +# vim: set noet ts=8: +# postinst script for ceph-mds +# +# see: dh_installdeb(1) + +set -e + +# summary of how this script can be called: +# +# postinst configure +# old-postinst abort-upgrade +# conflictor's-postinst abort-remove in-favour +# postinst abort-remove +# deconfigured's-postinst abort-deconfigure in-favour [ ] +# + +# for details, see http://www.debian.org/doc/debian-policy/ or +# the debian-policy package + + +# Let the admin override these distro-specified defaults. This is NOT +# recommended! +[ -f "/etc/default/ceph" ] && . /etc/default/ceph + +[ -z "$SERVER_HOME" ] && SERVER_HOME=/var/lib/ceph +[ -z "$SERVER_USER" ] && SERVER_USER=ceph +[ -z "$SERVER_NAME" ] && SERVER_NAME="Ceph storage service" +[ -z "$SERVER_GROUP" ] && SERVER_GROUP=ceph +[ -z "$SERVER_UID" ] && SERVER_UID=64045 # alloc by Debian base-passwd maintainer + +# Groups that the user will be added to, if undefined, then none. +[ -z "$SERVER_ADDGROUP" ] && SERVER_ADDGROUP= + +case "$1" in + configure) + # create user to avoid running server as root + # 1. create group if not existing + if ! getent group | grep -q "^$SERVER_GROUP:" ; then + echo -n "Adding group $SERVER_GROUP.." + addgroup --quiet --system --gid $SERVER_GID \ + $SERVER_GROUP 2>/dev/null ||true + echo "..done" + fi + # 2. create user if not existing + if ! getent passwd | grep -q "^$SERVER_USER:"; then + echo -n "Adding system user $SERVER_USER.." 
+ adduser --quiet \ + --system \ + --ingroup $SERVER_GROUP \ + --no-create-home \ + --disabled-password \ + --uid $SERVER_UID \ + --gid $SERVER_GID \ + $SERVER_USER 2>/dev/null || true + echo "..done" + fi + # 3. adjust passwd entry + usermod -c "$SERVER_NAME" \ + -d $SERVER_HOME \ + -g $SERVER_GROUP \ + $SERVER_USER + # 4. Add the user to extra groups + if test -n $SERVER_ADDGROUP + then + if ! groups $SERVER_USER | cut -d: -f2 | \ + grep -qw $SERVER_ADDGROUP; then + echo -n "Adding user $SERVER_USER to groups $SERVER_ADDGROUP.." + adduser $SERVER_USER $SERVER_ADDGROUP + echo "..done" + fi + fi + + ;; + abort-upgrade|abort-remove|abort-deconfigure) + : + ;; + + *) + echo "postinst called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +# dh_installdeb will replace this with shell code automatically +# generated by other debhelper scripts. + +#DEBHELPER# + +exit 0 From 2ba3d61d276f7af421eea0d01765464956b73e8d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 23 Apr 2015 17:03:02 -0700 Subject: [PATCH 093/654] debian: chown ceph:ceph /var/llib/ceph Do not do it recursively--there may already be huge amounts of data here. Signed-off-by: Sage Weil --- debian/ceph-common.postinst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/debian/ceph-common.postinst b/debian/ceph-common.postinst index 896dbeb7ccb89..8670ddb29319d 100644 --- a/debian/ceph-common.postinst +++ b/debian/ceph-common.postinst @@ -71,6 +71,13 @@ case "$1" in fi fi + # 5. adjust file and directory permissions + if ! dpkg-statoverride --list $SERVER_HOME >/dev/null + then + chown $SERVER_USER:$SERVER_GROUP $SERVER_HOME + chmod u=rwx,g=rx,o= $SERVER_HOME + fi + ;; abort-upgrade|abort-remove|abort-deconfigure) : From 7522650939ce5bfe4acb9c5f7196c9f8c6906dd6 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 23 Apr 2015 17:03:39 -0700 Subject: [PATCH 094/654] debian: chown -R ceph:ceph /var/log/ceph The number of log files is generally bounded; safe to chown these. 
Allow ceph group members to write to this dir. Signed-off-by: Sage Weil --- debian/ceph-common.postinst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/debian/ceph-common.postinst b/debian/ceph-common.postinst index 8670ddb29319d..8b9e01f453d6c 100644 --- a/debian/ceph-common.postinst +++ b/debian/ceph-common.postinst @@ -77,6 +77,11 @@ case "$1" in chown $SERVER_USER:$SERVER_GROUP $SERVER_HOME chmod u=rwx,g=rx,o= $SERVER_HOME fi + if ! dpkg-statoverride --list /var/log/ceph >/dev/null + then + chown -R $SERVER_USER:$SERVER_GROUP /var/log/ceph + chmod u=rwx,g=rxs,o= /var/log/ceph + fi ;; abort-upgrade|abort-remove|abort-deconfigure) From 3c569382808f8a38328c95d037f53ebee8475a29 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 23 Apr 2015 17:06:12 -0700 Subject: [PATCH 095/654] ceph.spec: chown and chmod /var/lib/ceph and /var/log/ceph Signed-off-by: Sage Weil --- ceph.spec.in | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ceph.spec.in b/ceph.spec.in index 37e543901b121..7b0fc3bdf0ccc 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -847,6 +847,8 @@ mkdir -p %{_localstatedir}/run/ceph/ %{python_sitelib}/ceph_argparse.py* %{python_sitelib}/ceph_daemon.py* %{_udevrulesdir}/50-rbd.rules +%attr(770,ceph,ceph) %dir %{_localstatedir}/lib/ceph/ +%attr(2750,ceph,ceph) %dir %{_localstatedir}/log/ceph/ %postun -n ceph-common # Package removal cleanup From ec1ee5e90101a3bcc6356a8a8d2973db838bc9e3 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 23 Apr 2015 17:12:23 -0700 Subject: [PATCH 096/654] systemd: run mon and mds as ceph:ceph Signed-off-by: Sage Weil --- systemd/ceph-mds@.service | 2 ++ systemd/ceph-mon@.service | 2 ++ 2 files changed, 4 insertions(+) diff --git a/systemd/ceph-mds@.service b/systemd/ceph-mds@.service index aec46fd4eee87..e045ebba0aaf9 100644 --- a/systemd/ceph-mds@.service +++ b/systemd/ceph-mds@.service @@ -7,6 +7,8 @@ PartOf=ceph.target [Service] EnvironmentFile=-/etc/sysconfig/ceph Environment=CLUSTER=ceph +User=ceph +Group=ceph 
ExecStart=/usr/bin/ceph-mds -f --cluster ${CLUSTER} --id %i ExecReload=/bin/kill -HUP $MAINPID diff --git a/systemd/ceph-mon@.service b/systemd/ceph-mon@.service index 2e884507e8e9b..396cb84a18c72 100644 --- a/systemd/ceph-mon@.service +++ b/systemd/ceph-mon@.service @@ -13,6 +13,8 @@ PartOf=ceph.target [Service] EnvironmentFile=-/etc/sysconfig/ceph Environment=CLUSTER=ceph +User=ceph +Group=ceph ExecStart=/usr/bin/ceph-mon -f --cluster ${CLUSTER} --id %i ExecReload=/bin/kill -HUP $MAINPID From b8893f6b8a5b11d5bcc35b56986ea65b5e7ab81f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 23 Apr 2015 17:12:34 -0700 Subject: [PATCH 097/654] systemd: chown ceph:ceph /var/run/ceph Signed-off-by: Sage Weil --- systemd/ceph.tmpfiles.d | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/systemd/ceph.tmpfiles.d b/systemd/ceph.tmpfiles.d index 871de3392e821..d2a7aa1b5136b 100644 --- a/systemd/ceph.tmpfiles.d +++ b/systemd/ceph.tmpfiles.d @@ -1 +1 @@ -d /var/run/ceph 0755 root root - +d /var/run/ceph 0755 ceph ceph - From 6532e1c48656b37c59185cb7bd3840fac008050f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 24 Apr 2015 10:28:31 -0700 Subject: [PATCH 098/654] debian: fix /var/lib/ceph/* directory ownership These dirs are owned by the package; make sure they are owend by the ceph user. Signed-off-by: Sage Weil --- debian/ceph-mds.postinst | 8 ++++++++ debian/ceph.postinst | 11 +++++++++++ debian/radosgw.postinst | 8 ++++++++ 3 files changed, 27 insertions(+) diff --git a/debian/ceph-mds.postinst b/debian/ceph-mds.postinst index 66b3b5fc2d71e..b69efedaafb01 100644 --- a/debian/ceph-mds.postinst +++ b/debian/ceph-mds.postinst @@ -18,10 +18,18 @@ set -e # for details, see http://www.debian.org/doc/debian-policy/ or # the debian-policy package +[ -f "/etc/default/ceph" ] && . /etc/default/ceph +[ -z "$SERVER_USER" ] && SERVER_USER=ceph +[ -z "$SERVER_GROUP" ] && SERVER_GROUP=ceph case "$1" in configure) [ -x /sbin/start ] && start ceph-mds-all || : + + if ! 
dpkg-statoverride --list /var/lib/ceph/mds >/dev/null + then + chown $SERVER_USER:$SERVER_GROUP /var/lib/ceph/mds + fi ;; abort-upgrade|abort-remove|abort-deconfigure) : diff --git a/debian/ceph.postinst b/debian/ceph.postinst index 5d64f640b9946..75eeb59c6246c 100644 --- a/debian/ceph.postinst +++ b/debian/ceph.postinst @@ -24,11 +24,22 @@ set -e # for details, see http://www.debian.org/doc/debian-policy/ or # the debian-policy package +[ -f "/etc/default/ceph" ] && . /etc/default/ceph +[ -z "$SERVER_USER" ] && SERVER_USER=ceph +[ -z "$SERVER_GROUP" ] && SERVER_GROUP=ceph case "$1" in configure) rm -f /etc/init/ceph.conf [ -x /sbin/start ] && start ceph-all || : + + # adjust file and directory permissions + for DIR in /var/lib/ceph/* ; do + if ! dpkg-statoverride --list $DIR >/dev/null + then + chown $SERVER_USER:$SERVER_GROUP $DIR + fi + done ;; abort-upgrade|abort-remove|abort-deconfigure) : diff --git a/debian/radosgw.postinst b/debian/radosgw.postinst index f3468bc60bf2c..07e3ec30b6d3e 100644 --- a/debian/radosgw.postinst +++ b/debian/radosgw.postinst @@ -24,10 +24,18 @@ set -e # for details, see http://www.debian.org/doc/debian-policy/ or # the debian-policy package +[ -f "/etc/default/ceph" ] && . /etc/default/ceph +[ -z "$SERVER_USER" ] && SERVER_USER=ceph +[ -z "$SERVER_GROUP" ] && SERVER_GROUP=ceph case "$1" in configure) [ -x /sbin/start ] && start radosgw-all || : + + if ! dpkg-statoverride --list /var/lib/ceph/radosgw >/dev/null + then + chown $SERVER_USER:$SERVER_GROUP /var/lib/ceph/radosgw + fi ;; abort-upgrade|abort-remove|abort-deconfigure) : From 4dfe0a8a4b958aab154ee14729c8444fbcb4b798 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 24 Apr 2015 14:57:46 -0700 Subject: [PATCH 099/654] global: add --setuser and --setgroup options These are done after reading config files/environment and before log files are opened. Allow a name or id to be specified. In the case of --setuser, also switch to that user's gid, unless --setgroup is also specified. 
Signed-off-by: Sage Weil --- doc/man/8/ceph-mds.rst | 11 ++++++ doc/man/8/ceph-mon.rst | 11 ++++++ doc/man/8/ceph-osd.rst | 11 ++++++ src/common/ceph_argparse.cc | 2 ++ src/common/config_opts.h | 2 ++ src/global/global_init.cc | 56 ++++++++++++++++++++++++++++++- src/test/cli/radosgw-admin/help.t | 2 ++ 7 files changed, 94 insertions(+), 1 deletion(-) diff --git a/doc/man/8/ceph-mds.rst b/doc/man/8/ceph-mds.rst index d2ae92292256b..af1f3c7cb52a7 100644 --- a/doc/man/8/ceph-mds.rst +++ b/doc/man/8/ceph-mds.rst @@ -47,6 +47,17 @@ Options Debug mode: like ``-f``, but also send all log output to stderr. +.. option:: --setuser userorgid + + Set uid after starting. If a username is specified, the user + record is looked up to get a uid and a gid, and the gid is also set + as well, unless --setgroup is also specified. + +.. option:: --setgroup grouporgid + + Set gid after starting. If a group name is specified the group + record is looked up to get a gid. + .. option:: -c ceph.conf, --conf=ceph.conf Use *ceph.conf* configuration file instead of the default diff --git a/doc/man/8/ceph-mon.rst b/doc/man/8/ceph-mon.rst index 287c668349bb9..7a2cd032c4814 100644 --- a/doc/man/8/ceph-mon.rst +++ b/doc/man/8/ceph-mon.rst @@ -36,6 +36,17 @@ Options Debug mode: like ``-f``, but also send all log output to stderr. +.. option:: --setuser userorgid + + Set uid after starting. If a username is specified, the user + record is looked up to get a uid and a gid, and the gid is also set + as well, unless --setgroup is also specified. + +.. option:: --setgroup grouporgid + + Set gid after starting. If a group name is specified the group + record is looked up to get a gid. + .. 
option:: -c ceph.conf, --conf=ceph.conf Use *ceph.conf* configuration file instead of the default diff --git a/doc/man/8/ceph-osd.rst b/doc/man/8/ceph-osd.rst index 9a1e6afdd88d0..e8b2805b14e7f 100644 --- a/doc/man/8/ceph-osd.rst +++ b/doc/man/8/ceph-osd.rst @@ -38,6 +38,17 @@ Options Debug mode: like ``-f``, but also send all log output to stderr. +.. option:: --setuser userorgid + + Set uid after starting. If a username is specified, the user + record is looked up to get a uid and a gid, and the gid is also set + as well, unless --setgroup is also specified. + +.. option:: --setgroup grouporgid + + Set gid after starting. If a group name is specified the group + record is looked up to get a gid. + .. option:: --osd-data osddata Use object store at *osddata*. diff --git a/src/common/ceph_argparse.cc b/src/common/ceph_argparse.cc index 858882baf0efd..1a60f2e1a8351 100644 --- a/src/common/ceph_argparse.cc +++ b/src/common/ceph_argparse.cc @@ -521,6 +521,8 @@ static void generic_usage(bool is_server) --id/-i ID set ID portion of my name\n\ --name/-n TYPE.ID set name\n\ --cluster NAME set cluster name (default: ceph)\n\ + --setuser USER set uid to user or uid (and gid to user's gid)\n\ + --setgroup GROUP set gid to group or gid\n\ --version show version and quit\n\ " << std::endl; diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 15c8ed57592a8..5e26ac1fb9ba4 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -29,6 +29,8 @@ OPTION(admin_socket, OPT_STR, "$run_dir/$cluster-$name.asok") // default changed OPTION(crushtool, OPT_STR, "crushtool") // crushtool utility path OPTION(daemonize, OPT_BOOL, false) // default changed by common_preinit() +OPTION(setuser, OPT_STR, "") // uid or user name +OPTION(setgroup, OPT_STR, "") // gid or group name OPTION(pid_file, OPT_STR, "") // default changed by common_preinit() OPTION(chdir, OPT_STR, "/") OPTION(max_open_files, OPT_LONGLONG, 0) diff --git a/src/global/global_init.cc 
b/src/global/global_init.cc index 06a7c2eaff588..23be38c5f4f24 100644 --- a/src/global/global_init.cc +++ b/src/global/global_init.cc @@ -29,6 +29,9 @@ #include "include/compat.h" #include "include/color.h" +#include +#include + #include #include #ifdef WITH_LTTNG @@ -130,13 +133,64 @@ void global_init(std::vector < const char * > *alt_def_args, if (g_conf->log_flush_on_exit) g_ceph_context->_log->set_flush_on_exit(); + // drop privileges? + if (g_conf->setgroup.length() || + g_conf->setuser.length()) { + uid_t uid = 0; // zero means no change; we can only drop privs here. + gid_t gid = 0; + if (g_conf->setuser.length()) { + uid = atoi(g_conf->setuser.c_str()); + if (!uid) { + char buf[4096]; + struct passwd pa; + struct passwd *p = 0; + getpwnam_r(g_conf->setuser.c_str(), &pa, buf, sizeof(buf), &p); + if (!p) { + cerr << "unable to look up user '" << g_conf->setuser << "'" + << std::endl; + exit(1); + } + uid = p->pw_uid; + gid = p->pw_gid; + } + } + if (g_conf->setgroup.length() > 0) { + gid = atoi(g_conf->setgroup.c_str()); + if (!gid) { + char buf[4096]; + struct group gr; + struct group *g = 0; + getgrnam_r(g_conf->setgroup.c_str(), &gr, buf, sizeof(buf), &g); + if (!g) { + cerr << "unable to look up group '" << g_conf->setgroup << "'" + << std::endl; + exit(1); + } + gid = g->gr_gid; + } + } + if (setgid(gid) != 0) { + int r = errno; + cerr << "unable to setgid " << gid << ": " << cpp_strerror(r) + << std::endl; + exit(1); + } + if (setuid(uid) != 0) { + int r = errno; + cerr << "unable to setuid " << uid << ": " << cpp_strerror(r) + << std::endl; + exit(1); + } + dout(0) << "set uid:gid to " << uid << ":" << gid << dendl; + } + if (g_conf->run_dir.length() && code_env == CODE_ENVIRONMENT_DAEMON && !(flags & CINIT_FLAG_NO_DAEMON_ACTIONS)) { int r = ::mkdir(g_conf->run_dir.c_str(), 0755); if (r < 0 && errno != EEXIST) { r = -errno; - derr << "warning: unable to create " << g_conf->run_dir << ": " << cpp_strerror(r) << dendl; + cerr << "warning: unable to 
create " << g_conf->run_dir << ": " << cpp_strerror(r) << std::endl; } } diff --git a/src/test/cli/radosgw-admin/help.t b/src/test/cli/radosgw-admin/help.t index 33aee1d5eb332..fec8737541e4b 100644 --- a/src/test/cli/radosgw-admin/help.t +++ b/src/test/cli/radosgw-admin/help.t @@ -135,6 +135,8 @@ --id/-i ID set ID portion of my name --name/-n TYPE.ID set name --cluster NAME set cluster name (default: ceph) + --setuser USER set uid to user or uid (and gid to user's gid) + --setgroup GROUP set gid to group or gid --version show version and quit [1] From 7c9fdf44f2c18659a0bcc03f7b98dafdf9f54448 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 23 Apr 2015 17:15:14 -0700 Subject: [PATCH 100/654] systemd: make ceph-osd setuid/gid to ceph:ceph Signed-off-by: Sage Weil --- systemd/ceph-osd@.service.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/systemd/ceph-osd@.service.in b/systemd/ceph-osd@.service.in index 72d21f432d11c..69ab8c358a5ac 100644 --- a/systemd/ceph-osd@.service.in +++ b/systemd/ceph-osd@.service.in @@ -7,8 +7,8 @@ PartOf=ceph.target [Service] EnvironmentFile=-/etc/sysconfig/ceph Environment=CLUSTER=ceph -ExecStart=/usr/bin/ceph-osd -f --cluster ${CLUSTER} --id %i -ExecStartPre=@systemd_libexec_dir@/ceph-osd-prestart.sh --cluster ${CLUSTER} --id %i +ExecStart=/usr/bin/ceph-osd -f --cluster ${CLUSTER} --id %i --setuser ceph --setgroup ceph +ExecStartPre=/usr/libexec/ceph/ceph-osd-prestart.sh --cluster ${CLUSTER} --setuser ceph --setgroup ceph --id %i LimitNOFILE=131072 ExecReload=/bin/kill -HUP $MAINPID From c7ee798a0f9ddb79f799fe19dba0873efa4fdcfa Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 6 Aug 2015 11:37:30 -0400 Subject: [PATCH 101/654] set nofile ulimit in /etc/security/limits.d/ceph only Specify the nofile ulimit in one standard place, where everyone expects it to be. Drop it from the ceph-osd unit file. Leave upstart and sysvinit untouched for the time being to avoid compat issues. 
Signed-off-by: Sage Weil --- Makefile.am | 1 + ceph.spec.in | 1 + debian/ceph-common.install | 1 + debian/rules | 1 + etc/ceph.limits.d | 9 +++++++++ systemd/ceph-osd@.service.in | 1 - 6 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 etc/ceph.limits.d diff --git a/Makefile.am b/Makefile.am index d6f7bbdf19edf..fcf40707d45c3 100644 --- a/Makefile.am +++ b/Makefile.am @@ -10,6 +10,7 @@ EXTRA_DIST += \ src/test/cli \ src/test/downloads \ systemd/ceph.tmpfiles.d \ + etc/ceph.limits.d \ udev/50-rbd.rules \ udev/60-ceph-partuuid-workaround.rules \ udev/95-ceph-osd.rules \ diff --git a/ceph.spec.in b/ceph.spec.in index 7b0fc3bdf0ccc..bc0e981c0b5bc 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -577,6 +577,7 @@ find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';' find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';' install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap +install -D ceph.limits.d $RPM_BUILD_ROOT%{_sysconfdir}/security/limits.d/ceph %if 0%{?_with_systemd} install -m 0644 -D systemd/ceph.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/%{name}.conf install -m 0644 -D systemd/ceph-rgw.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/%{name}-rgw.conf diff --git a/debian/ceph-common.install b/debian/ceph-common.install index 4e21adff9c474..1fa4c1309f086 100644 --- a/debian/ceph-common.install +++ b/debian/ceph-common.install @@ -25,5 +25,6 @@ usr/share/ceph/id_dsa_drop.ceph.com usr/share/ceph/id_dsa_drop.ceph.com.pub etc/ceph/rbdmap etc/init.d/rbdmap +etc/security/limits.d/ceph lib/udev/rules.d/50-rbd.rules usr/lib/python*/dist-packages/ceph_argparse.py* diff --git a/debian/rules b/debian/rules index bb0aeaf3da18e..5521d05bb2fd4 100755 --- a/debian/rules +++ b/debian/rules @@ -85,6 +85,7 @@ install: build install -D -m 644 udev/95-ceph-osd.rules $(DESTDIR)/lib/udev/rules.d/95-ceph-osd.rules install -D -m 644 src/rbdmap $(DESTDIR)/etc/ceph/rbdmap install -D -m 755 
src/init-rbdmap $(DESTDIR)/etc/init.d/rbdmap + install -D -m 644 etc/ceph.limits.d $(DESTDIR)/etc/security/limits.d/ceph # Add here commands to install the package into debian/testpack. # Build architecture-independent files here. diff --git a/etc/ceph.limits.d b/etc/ceph.limits.d new file mode 100644 index 0000000000000..702aa0332ecb7 --- /dev/null +++ b/etc/ceph.limits.d @@ -0,0 +1,9 @@ +# /etc/security/limits.d/ceph +# +# +# + +# We want a very large value for nofile for the ceph user as the ceph +# clients and daemons consume lots and lots of file descriptors. + +ceph - nofile 4194304 diff --git a/systemd/ceph-osd@.service.in b/systemd/ceph-osd@.service.in index 69ab8c358a5ac..5c7f77c7fe721 100644 --- a/systemd/ceph-osd@.service.in +++ b/systemd/ceph-osd@.service.in @@ -9,7 +9,6 @@ EnvironmentFile=-/etc/sysconfig/ceph Environment=CLUSTER=ceph ExecStart=/usr/bin/ceph-osd -f --cluster ${CLUSTER} --id %i --setuser ceph --setgroup ceph ExecStartPre=/usr/libexec/ceph/ceph-osd-prestart.sh --cluster ${CLUSTER} --setuser ceph --setgroup ceph --id %i -LimitNOFILE=131072 ExecReload=/bin/kill -HUP $MAINPID [Install] From ed0cd42d244ca2bd68e929f5c9e77ba71526d86a Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Sat, 25 Apr 2015 10:04:52 +0200 Subject: [PATCH 102/654] ceph.spec.in: add ceph user/group Add support of create ceph user/group. TODO: Add correct number for SUSE. 
Signed-off-by: Danny Al-Gaaf --- ceph.spec.in | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/ceph.spec.in b/ceph.spec.in index bc0e981c0b5bc..6dffda868fcdc 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -37,6 +37,21 @@ restorecon -R /var/log/ceph > /dev/null 2>&1; %global _with_systemd 1 %endif +CEPH_GROUP_ID="" +CEPH_USER_ID="" +%if 0%{?rhel} || 0%{?centos} || 0%{?fedora} +CEPH_GROUP_ID="-g 167" +CEPH_USER_ID="-u 167" +%endif +# %if 0%{?suse_version} +# CEPH_GROUP_ID="-g " +# CEPH_USER_ID="-u " +# %endif + + +################################################################################# +# common +################################################################################# Name: ceph Version: @VERSION@ Release: @RPM_RELEASE@%{?dist} @@ -645,6 +660,11 @@ mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-rgw mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw +%pre +%{_sbindir}/groupadd $CEPH_GROUP_ID -o -r ceph 2>/dev/null || : +%{_sbindir}/useradd -r -o -g ceph $CEPH_USER_ID -s /bin/false -c "Ceph daemons" -d %{_localstatedir}/lib/ceph ceph 2> /dev/null || : +%{_sbindir}/usermod -g ceph ceph 2>/dev/null || : + %clean rm -rf $RPM_BUILD_ROOT From 596c9b68c83af70f2be6afede0df459846a78b7e Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Thu, 30 Apr 2015 12:34:08 +0200 Subject: [PATCH 103/654] ceph.spec.in: install ceph.limits.d limits.d expect the file to end with *.conf. Add the installed file to package list of the ceph package. 
Signed-off-by: Danny Al-Gaaf --- ceph.spec.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ceph.spec.in b/ceph.spec.in index 6dffda868fcdc..c15e41682b301 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -592,7 +592,7 @@ find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';' find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';' install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap -install -D ceph.limits.d $RPM_BUILD_ROOT%{_sysconfdir}/security/limits.d/ceph +install -D ceph.limits.d $RPM_BUILD_ROOT%{_sysconfdir}/security/limits.d/ceph.conf %if 0%{?_with_systemd} install -m 0644 -D systemd/ceph.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/%{name}.conf install -m 0644 -D systemd/ceph-rgw.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/%{name}-rgw.conf From 2d4f3a9199bf074706ee01f1069d1036dbd3f206 Mon Sep 17 00:00:00 2001 From: Boris Ranto Date: Wed, 17 Jun 2015 17:55:59 +0200 Subject: [PATCH 104/654] ceph.spec.in: Fixup uid/gid setting We need to set the variables in the %pre phase, otherwise they are not properly evaluated. Also use /sbin/nologin instead of /bin/false and set the default uid/gid pair for fedora and rhel (these were already allocated). We can also use them for older fedora releases as they are guaranteed to be free. 
Signed-off-by: Boris Ranto --- ceph.spec.in | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/ceph.spec.in b/ceph.spec.in index c15e41682b301..e304ae6bdb952 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -37,17 +37,6 @@ restorecon -R /var/log/ceph > /dev/null 2>&1; %global _with_systemd 1 %endif -CEPH_GROUP_ID="" -CEPH_USER_ID="" -%if 0%{?rhel} || 0%{?centos} || 0%{?fedora} -CEPH_GROUP_ID="-g 167" -CEPH_USER_ID="-u 167" -%endif -# %if 0%{?suse_version} -# CEPH_GROUP_ID="-g " -# CEPH_USER_ID="-u " -# %endif - ################################################################################# # common @@ -661,9 +650,20 @@ mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-rgw mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw %pre -%{_sbindir}/groupadd $CEPH_GROUP_ID -o -r ceph 2>/dev/null || : -%{_sbindir}/useradd -r -o -g ceph $CEPH_USER_ID -s /bin/false -c "Ceph daemons" -d %{_localstatedir}/lib/ceph ceph 2> /dev/null || : -%{_sbindir}/usermod -g ceph ceph 2>/dev/null || : +CEPH_GROUP_ID="" +CEPH_USER_ID="" +# disabled for now until we have the numbers +%if 0%{?rhel} || 0%{?centos} || 0%{?fedora} +CEPH_GROUP_ID="-g 167" +CEPH_USER_ID="-u 167" +%endif +# %if 0%{?suse_version} +# CEPH_GROUP_ID="-g " +# CEPH_USER_ID="-u " +# %endif + +%{_sbindir}/groupadd ceph $CEPH_GROUP_ID -o -r 2>/dev/null || : +%{_sbindir}/useradd ceph $CEPH_USER_ID -o -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2> /dev/null || : %clean rm -rf $RPM_BUILD_ROOT From d9df52b49a96b5aaf2ecf383441b2c429427bc4e Mon Sep 17 00:00:00 2001 From: Boris Ranto Date: Wed, 17 Jun 2015 20:06:40 +0200 Subject: [PATCH 105/654] ceph.spec.in: Fix ceph.limits.d path Signed-off-by: Boris Ranto --- ceph.spec.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ceph.spec.in b/ceph.spec.in index e304ae6bdb952..94a3af198d99c 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -581,7 +581,7 @@ find $RPM_BUILD_ROOT -type f 
-name "*.la" -exec rm -f {} ';' find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';' install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap -install -D ceph.limits.d $RPM_BUILD_ROOT%{_sysconfdir}/security/limits.d/ceph.conf +install -D etc/ceph.limits.d $RPM_BUILD_ROOT%{_sysconfdir}/security/limits.d/ceph.conf %if 0%{?_with_systemd} install -m 0644 -D systemd/ceph.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/%{name}.conf install -m 0644 -D systemd/ceph-rgw.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/%{name}-rgw.conf From e95904f7040bbdd6c710ec52ee901e319386d50a Mon Sep 17 00:00:00 2001 From: Boris Ranto Date: Thu, 18 Jun 2015 13:57:47 +0200 Subject: [PATCH 106/654] ceph.spec.in: /var/lib/ceph is owned by ceph package Signed-off-by: Boris Ranto --- ceph.spec.in | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/ceph.spec.in b/ceph.spec.in index 94a3af198d99c..937da890c68ee 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -820,17 +820,15 @@ mkdir -p %{_localstatedir}/run/ceph/ %{_mandir}/man8/ceph-clsinfo.8* %{_mandir}/man8/librados-config.8* #set up placeholder directories -%dir %{_localstatedir}/lib/ceph/ -%dir %{_localstatedir}/lib/ceph/tmp -%dir %{_localstatedir}/lib/ceph/mon -%dir %{_localstatedir}/lib/ceph/osd -%dir %{_localstatedir}/lib/ceph/mds -%dir %{_localstatedir}/lib/ceph/bootstrap-osd -%dir %{_localstatedir}/lib/ceph/bootstrap-mds -%dir %{_localstatedir}/lib/ceph/bootstrap-rgw -%if (! 0%{?suse_version}) || ( 0%{?suse_version} && (! 
0%{?_with_systemd}) ) -%dir %{_localstatedir}/run/ceph/ -%endif +%attr(770,ceph,ceph) %dir %{_localstatedir}/lib/ceph/ +%attr(770,ceph,ceph) %dir %{_localstatedir}/lib/ceph/tmp +%attr(770,ceph,ceph) %dir %{_localstatedir}/lib/ceph/mon +%attr(770,ceph,ceph) %dir %{_localstatedir}/lib/ceph/osd +%attr(770,ceph,ceph) %dir %{_localstatedir}/lib/ceph/mds +%attr(770,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-osd +%attr(770,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-mds +%attr(770,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-rgw +%attr(770,ceph,ceph) %dir %{_localstatedir}/run/ceph/ ################################################################################# %files -n ceph-common @@ -869,7 +867,6 @@ mkdir -p %{_localstatedir}/run/ceph/ %{python_sitelib}/ceph_argparse.py* %{python_sitelib}/ceph_daemon.py* %{_udevrulesdir}/50-rbd.rules -%attr(770,ceph,ceph) %dir %{_localstatedir}/lib/ceph/ %attr(2750,ceph,ceph) %dir %{_localstatedir}/log/ceph/ %postun -n ceph-common From ceb93e8e69e125c9358f63b0099e7509b6624bcf Mon Sep 17 00:00:00 2001 From: Boris Ranto Date: Thu, 18 Jun 2015 13:42:04 +0200 Subject: [PATCH 107/654] ceph.spec.in: User and group must be created in ceph-common pre-install script The package ceph-common uses ceph user and group but they are both created in ceph package %pre phase. We need to move the script to ceph-common %pre phase. 
Signed-off-by: Boris Ranto --- ceph.spec.in | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/ceph.spec.in b/ceph.spec.in index 937da890c68ee..d00b910493887 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -649,21 +649,11 @@ mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-rgw mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw -%pre -CEPH_GROUP_ID="" -CEPH_USER_ID="" -# disabled for now until we have the numbers -%if 0%{?rhel} || 0%{?centos} || 0%{?fedora} -CEPH_GROUP_ID="-g 167" -CEPH_USER_ID="-u 167" +%if %{defined suse_version} +# Fedora seems to have some problems with this macro, use it only on SUSE +%fdupes -s $RPM_BUILD_ROOT/%{python_sitelib} +%fdupes %buildroot %endif -# %if 0%{?suse_version} -# CEPH_GROUP_ID="-g " -# CEPH_USER_ID="-u " -# %endif - -%{_sbindir}/groupadd ceph $CEPH_GROUP_ID -o -r 2>/dev/null || : -%{_sbindir}/useradd ceph $CEPH_USER_ID -o -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2> /dev/null || : %clean rm -rf $RPM_BUILD_ROOT @@ -869,6 +859,22 @@ mkdir -p %{_localstatedir}/run/ceph/ %{_udevrulesdir}/50-rbd.rules %attr(2750,ceph,ceph) %dir %{_localstatedir}/log/ceph/ +%pre -n ceph-common +CEPH_GROUP_ID="" +CEPH_USER_ID="" +# disabled for now until we have the numbers +%if 0%{?rhel} || 0%{?centos} || 0%{?fedora} +CEPH_GROUP_ID="-g 167" +CEPH_USER_ID="-u 167" +%endif +# %if 0%{?suse_version} +# CEPH_GROUP_ID="-g " +# CEPH_USER_ID="-u " +# %endif + +%{_sbindir}/groupadd ceph $CEPH_GROUP_ID -o -r 2>/dev/null || : +%{_sbindir}/useradd ceph $CEPH_USER_ID -o -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2> /dev/null || : + %postun -n ceph-common # Package removal cleanup if [ "$1" -eq "0" ] ; then From 25f68aee020245b23352462c3cb791d155b12df7 Mon Sep 17 00:00:00 2001 From: Boris Ranto Date: Thu, 18 Jun 2015 17:02:17 +0200 Subject: [PATCH 108/654] 
init-ceph.in: Set ceph user and group when running the daemons Signed-off-by: Boris Ranto --- src/init-ceph.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/init-ceph.in b/src/init-ceph.in index d676ac72f2cbc..acb57d4a9594b 100755 --- a/src/init-ceph.in +++ b/src/init-ceph.in @@ -304,9 +304,9 @@ for name in $what; do [ -n "$max_open_files" ] && files="ulimit -n $max_open_files;" if [ -n "$SYSTEMD_RUN" ]; then - cmd="$SYSTEMD_RUN -r bash -c '$files $cmd --cluster $cluster -f'" + cmd="$SYSTEMD_RUN -r bash -c '$files $cmd --cluster $cluster --setuser ceph --setgroup ceph -f'" else - cmd="$files $wrap $cmd --cluster $cluster $runmode" + cmd="$files $wrap $cmd --cluster $cluster --setuser ceph --setgroup ceph $runmode" fi if [ $dofsmount -eq 1 ] && [ -n "$fs_devs" ]; then From bbedc8e57e74f7c336e5e547ddf5166d8f65cdb0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 23 Apr 2015 17:14:27 -0700 Subject: [PATCH 109/654] ceph-osd-prestart.sh: ensure data dir is root or ceph before start Signed-off-by: Sage Weil --- src/ceph-osd-prestart.sh | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/ceph-osd-prestart.sh b/src/ceph-osd-prestart.sh index 79f2c132d9fc9..a76747b232d5b 100644 --- a/src/ceph-osd-prestart.sh +++ b/src/ceph-osd-prestart.sh @@ -17,6 +17,9 @@ if [ -z "$id" ]; then exit 1; fi +data="/var/lib/ceph/osd/${cluster:-ceph}-$id" +journal="$data/journal" + update="$(ceph-conf --cluster=${cluster:-ceph} --name=osd.$id --lookup osd_crush_update_on_start || :)" if [ "${update:-1}" = "1" -o "${update:-1}" = "true" ]; then @@ -27,11 +30,11 @@ if [ "${update:-1}" = "1" -o "${update:-1}" = "true" ]; then fi location="$($hook --cluster ${cluster:-ceph} --id $id --type osd)" weight="$(ceph-conf --cluster=${cluster:-ceph} --name=osd.$id --lookup osd_crush_initial_weight || :)" - defaultweight=`df -P -k /var/lib/ceph/osd/${cluster:-ceph}-$id/ | tail -1 | awk '{ d= $2/1073741824 ; r = sprintf("%.4f", d); print r 
}'` + defaultweight=`df -P -k $data/ | tail -1 | awk '{ d= $2/1073741824 ; r = sprintf("%.4f", d); print r }'` ceph \ --cluster="${cluster:-ceph}" \ --name="osd.$id" \ - --keyring="/var/lib/ceph/osd/${cluster:-ceph}-$id/keyring" \ + --keyring="$data/keyring" \ osd crush create-or-move \ -- \ "$id" \ @@ -39,7 +42,6 @@ if [ "${update:-1}" = "1" -o "${update:-1}" = "true" ]; then $location fi -journal="/var/lib/ceph/osd/${cluster:-ceph}-$id/journal" if [ -L "$journal" -a ! -e "$journal" ]; then udevadm settle --timeout=5 || : if [ -L "$journal" -a ! -e "$journal" ]; then @@ -48,3 +50,14 @@ if [ -L "$journal" -a ! -e "$journal" ]; then exit 0 fi fi + + +# ensure ownership is correct +owner=`stat -c %U $data/.` +if [ $owner -ne 'ceph' -a $owner -ne 'root' ]; then + echo "ceph-osd data dir $data is not owned by 'ceph' or 'root'" + echo "you must 'ceph-disk chown ...' or similar to fix ownership" + exit 1 +fi + +exit 0 From 8bd35bd6079b1251d7c0d72ba32e14047209bb7b Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 5 Aug 2015 14:25:07 +0200 Subject: [PATCH 110/654] Set Ceph device partitions owner to ceph user in udev. 
Signed-off-by: Milan Broz --- udev/95-ceph-osd.rules.systemd | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/udev/95-ceph-osd.rules.systemd b/udev/95-ceph-osd.rules.systemd index dccfe34f84b85..235c25509c7d3 100644 --- a/udev/95-ceph-osd.rules.systemd +++ b/udev/95-ceph-osd.rules.systemd @@ -2,6 +2,7 @@ ACTION=="add", SUBSYSTEM=="block", \ ENV{DEVTYPE}=="partition", \ ENV{ID_PART_ENTRY_TYPE}=="4fbd7e29-9d25-41b8-afd0-062c0ceff05d", \ + OWNER:="ceph", GROUP:="ceph", MODE:="660", \ TAG+="systemd", \ ENV{SYSTEMD_WANTS}+="ceph-disk-activate@/dev/$name.service" @@ -9,6 +10,7 @@ ACTION=="add", SUBSYSTEM=="block", \ ACTION=="add", SUBSYSTEM=="block", \ ENV{DEVTYPE}=="partition", \ ENV{ID_PART_ENTRY_TYPE}=="45b0969e-9b03-4f30-b4c6-b4b80ceff106", \ + OWNER:="ceph", GROUP:="ceph", MODE:="660", \ TAG+="systemd", \ ENV{SYSTEMD_WANTS}+="ceph-disk-activate-journal@/dev/$name.service" @@ -16,6 +18,7 @@ ACTION=="add", SUBSYSTEM=="block", \ ACTION=="add" SUBSYSTEM=="block", \ ENV{DEVTYPE}=="partition", \ ENV{ID_PART_ENTRY_TYPE}=="45b0969e-9b03-4f30-b4c6-5ec00ceff106", \ + OWNER:="ceph", GROUP:="ceph", MODE:="660", \ RUN+="/sbin/cryptsetup --key-file /etc/ceph/dmcrypt-keys/$env{ID_PART_ENTRY_UUID} --key-size 256 create $env{ID_PART_ENTRY_UUID} /dev/$name" # Map data device and @@ -24,5 +27,6 @@ ACTION=="add" SUBSYSTEM=="block", \ ACTION=="add" SUBSYSTEM=="block", \ ENV{DEVTYPE}=="partition", \ ENV{ID_PART_ENTRY_TYPE}=="4fbd7e29-9d25-41b8-afd0-5ec00ceff05d", \ + OWNER:="ceph", GROUP:="ceph", MODE:="660", \ TAG+="systemd", \ ENV{SYSTEMD_WANTS}+="ceph-disk-dmcrypt-activate@/dev/$name.service" From 52e978e4b3660baa9f50b1bb8247909b672142e7 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Tue, 23 Jun 2015 16:45:21 +0200 Subject: [PATCH 111/654] Set keys owner to ceph user if exists. Also fix directory access rigths. 
Signed-off-by: Milan Broz --- src/ceph-create-keys | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/ceph-create-keys b/src/ceph-create-keys index 57eaf1744c1a6..1ccd98f9b91c8 100755 --- a/src/ceph-create-keys +++ b/src/ceph-create-keys @@ -7,12 +7,28 @@ import os import subprocess import sys import time +import pwd +import grp LOG = logging.getLogger(os.path.basename(sys.argv[0])) QUORUM_STATES = ['leader', 'peon'] +def get_ceph_uid(): + try: + uid = pwd.getpwnam('ceph').pw_uid + except: + uid = -1 + return uid + +def get_ceph_gid(): + try: + gid = grp.getgrnam('ceph').gr_gid + except: + gid = -1 + return gid + def wait_for_quorum(cluster, mon_id): while True: p = subprocess.Popen( @@ -68,10 +84,13 @@ def get_key(cluster, mon_id): pathdir = os.path.dirname(path) if not os.path.exists(pathdir): os.makedirs(pathdir) + os.chmod(pathdir, 0770) + os.chown(pathdir, get_ceph_uid(), get_ceph_gid()) while True: try: with file(tmp, 'w') as f: os.fchmod(f.fileno(), 0600) + os.fchown(f.fileno(), get_ceph_uid(), get_ceph_gid()) LOG.info('Talking to monitor...') returncode = subprocess.call( args=[ @@ -137,11 +156,14 @@ def bootstrap_key(cluster, type_): pathdir = os.path.dirname(path) if not os.path.exists(pathdir): os.makedirs(pathdir) + os.chmod(pathdir, 0770) + os.chown(pathdir, get_ceph_uid(), get_ceph_gid()) while True: try: with file(tmp, 'w') as f: os.fchmod(f.fileno(), 0600) + os.fchown(f.fileno(), get_ceph_uid(), get_ceph_gid()) LOG.info('Talking to monitor...') returncode = subprocess.call( args=args, From 09db67fdb5d6aacff27794550b0df3f90bd21df4 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 5 Aug 2015 14:49:57 +0200 Subject: [PATCH 112/654] ceph-disk: set owner of created files to ceph Signed-off-by: Milan Broz --- src/ceph-disk | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/ceph-disk b/src/ceph-disk index 85196f94c2359..20be6673ff3df 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -32,6 +32,8 @@ 
import tempfile import uuid import time import shlex +import pwd +import grp """ Prepare: @@ -710,6 +712,12 @@ def get_osd_id(path): check_osd_id(osd_id) return osd_id +def get_ceph_user(): + if pwd.getpwnam('ceph') and grp.getgrnam('ceph'): + return 'ceph' + else: + return 'root' + def path_set_context(path): # restore selinux context to default policy values if which('restorecon'): @@ -720,6 +728,15 @@ def path_set_context(path): ], ) + # if ceph user exists, set owner to ceph + if get_ceph_user() == 'ceph': + command( + [ + 'chown', '-R', 'ceph:ceph', + path, + ], + ) + def _check_output(args=None, **kwargs): out, ret = command(args, **kwargs) if ret: @@ -1806,6 +1823,8 @@ def mkfs( '--osd-journal', os.path.join(path, 'journal'), '--osd-uuid', fsid, '--keyring', os.path.join(path, 'keyring'), + '--setuser', get_ceph_user(), + '--setgroup', get_ceph_user(), ], ) # TODO ceph-osd --mkfs removes the monmap file? From 28fdac32e7b22a9eda43a610610abd9b5b242269 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 6 Aug 2015 11:57:48 -0400 Subject: [PATCH 113/654] global: implement setuser_match_path Allow the --setuser and --setgroup to be conditional on the specified user/group matching the ownership of a given path. This allows the ceph daemons to switch to user ceph for newly deployed instances or stay as root depending on the ownership of the data directory. 
Signed-off-by: Sage Weil Reviewed-by: Boris Ranto --- src/common/config_opts.h | 1 + src/global/global_init.cc | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 5e26ac1fb9ba4..0abf9b418f5f6 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -31,6 +31,7 @@ OPTION(crushtool, OPT_STR, "crushtool") // crushtool utility path OPTION(daemonize, OPT_BOOL, false) // default changed by common_preinit() OPTION(setuser, OPT_STR, "") // uid or user name OPTION(setgroup, OPT_STR, "") // gid or group name +OPTION(setuser_match_path, OPT_STR, "") // make setuser/group conditional on this patch matching ownership OPTION(pid_file, OPT_STR, "") // default changed by common_preinit() OPTION(chdir, OPT_STR, "/") OPTION(max_open_files, OPT_LONGLONG, 0) diff --git a/src/global/global_init.cc b/src/global/global_init.cc index 23be38c5f4f24..a073613fa67c4 100644 --- a/src/global/global_init.cc +++ b/src/global/global_init.cc @@ -169,6 +169,32 @@ void global_init(std::vector < const char * > *alt_def_args, gid = g->gr_gid; } } + if ((uid || gid) && + g_conf->setuser_match_path.length()) { + struct stat st; + int r = ::stat(g_conf->setuser_match_path.c_str(), &st); + if (r < 0) { + r = -errno; + cerr << "unable to stat setuser_match_path " + << g_conf->setuser_match_path + << ": " << cpp_strerror(r) << std::endl; + exit(1); + } + if ((uid && uid != st.st_uid) || + (gid && gid != st.st_gid)) { + cerr << "WARNING: will not setuid/gid: " << g_conf->setuser_match_path + << " owned by " << st.st_uid << ":" << st.st_gid + << " and not requested " << uid << ":" << gid + << std::endl; + uid = 0; + gid = 0; + } else { + dout(10) << "setuser_match_path " + << g_conf->setuser_match_path << " owned by " + << st.st_uid << ":" << st.st_gid << ", doing setuid/gid" + << dendl; + } + } if (setgid(gid) != 0) { int r = errno; cerr << "unable to setgid " << gid << ": " << cpp_strerror(r) From 
8f3185bade4b67876ca305e2ce9238626796fb11 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 6 Aug 2015 11:57:57 -0400 Subject: [PATCH 114/654] systemd: use --setuser and --setgroup for all daemons Allow all daemons drop privilege themselves, instead of letting systemd do it. Among other things, this means that admins can conditionally not drop prives by setting setuser match path = /var/lib/ceph/$type/$cluster-$id in their ceph.conf to ease the pain of upgrade. Signed-off-by: Sage Weil Reviewed-by: Boris Ranto --- systemd/ceph-mds@.service | 4 +--- systemd/ceph-mon@.service | 4 +--- systemd/ceph-osd@.service.in | 2 +- systemd/ceph-radosgw@.service | 2 +- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/systemd/ceph-mds@.service b/systemd/ceph-mds@.service index e045ebba0aaf9..7e5a95e8c4ebe 100644 --- a/systemd/ceph-mds@.service +++ b/systemd/ceph-mds@.service @@ -7,9 +7,7 @@ PartOf=ceph.target [Service] EnvironmentFile=-/etc/sysconfig/ceph Environment=CLUSTER=ceph -User=ceph -Group=ceph -ExecStart=/usr/bin/ceph-mds -f --cluster ${CLUSTER} --id %i +ExecStart=/usr/bin/ceph-mds -f --cluster ${CLUSTER} --id %i --setuser ceph --setgroup ceph ExecReload=/bin/kill -HUP $MAINPID [Install] diff --git a/systemd/ceph-mon@.service b/systemd/ceph-mon@.service index 396cb84a18c72..7ac9b8f2ec769 100644 --- a/systemd/ceph-mon@.service +++ b/systemd/ceph-mon@.service @@ -13,9 +13,7 @@ PartOf=ceph.target [Service] EnvironmentFile=-/etc/sysconfig/ceph Environment=CLUSTER=ceph -User=ceph -Group=ceph -ExecStart=/usr/bin/ceph-mon -f --cluster ${CLUSTER} --id %i +ExecStart=/usr/bin/ceph-mon -f --cluster ${CLUSTER} --id %i --setuser ceph --setgroup ceph ExecReload=/bin/kill -HUP $MAINPID [Install] diff --git a/systemd/ceph-osd@.service.in b/systemd/ceph-osd@.service.in index 5c7f77c7fe721..fac1932f58f22 100644 --- a/systemd/ceph-osd@.service.in +++ b/systemd/ceph-osd@.service.in @@ -8,7 +8,7 @@ PartOf=ceph.target EnvironmentFile=-/etc/sysconfig/ceph Environment=CLUSTER=ceph 
ExecStart=/usr/bin/ceph-osd -f --cluster ${CLUSTER} --id %i --setuser ceph --setgroup ceph -ExecStartPre=/usr/libexec/ceph/ceph-osd-prestart.sh --cluster ${CLUSTER} --setuser ceph --setgroup ceph --id %i +ExecStartPre=/usr/libexec/ceph/ceph-osd-prestart.sh --cluster ${CLUSTER} --id %i --setuser ceph --setgroup ceph ExecReload=/bin/kill -HUP $MAINPID [Install] diff --git a/systemd/ceph-radosgw@.service b/systemd/ceph-radosgw@.service index 8a520aca30d65..486cef889cb44 100644 --- a/systemd/ceph-radosgw@.service +++ b/systemd/ceph-radosgw@.service @@ -7,7 +7,7 @@ PartOf=ceph.target [Service] EnvironmentFile=-/etc/sysconfig/ceph Environment=CLUSTER=ceph -ExecStart=/usr/bin/radosgw -f --cluster ${CLUSTER} --name client.%i +ExecStart=/usr/bin/radosgw -f --cluster ${CLUSTER} --name client.%i --setuser ceph --setgroup ceph [Install] WantedBy=ceph.target From 960139edba9fee804ff60f0a0b71a7d361953858 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 6 Aug 2015 10:49:13 -0400 Subject: [PATCH 115/654] PendingReleaseNotes: some notes about upgrade and ceph user Signed-off-by: Sage Weil --- PendingReleaseNotes | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 95e563e214e19..6108ee9eb63d2 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -4,6 +4,34 @@ v9.1.0 v9.0.4 ====== +Upgrading +--------- + +* Ceph daemons now run as user and group ceph by default. During upgrade, + administrators have two options: + + #. Add the following line to ``ceph.conf`` on all hosts:: + + setuser match path = /var/lib/ceph/$type/$cluster-$id + + This will make the daemon remain root (i.e., not drop privileges and + switch to user ceph) if the daemon's data directory is still owned by + root. Newly deployed daemons will be created with data owned by user + ceph and will run with reduced privileges, but upgraded daemons will + continue to run as root. + + #. Fix the data ownership during the upgrade. 
This is the preferred option, + but is more work. The process for each host would be to: + + #. Upgrade the ceph package. This creates the ceph user and group. + #. Stop the daemon(s) + #. Fix the ownership. E.g.,:: + + chown -R ceph:ceph /var/lib/ceph/mon/ceph-foo + ceph-disk chown /dev/sdb1 + + #. Restart the daemon(s) + * The on-disk format for the experimental KeyValueStore OSD backend has changed. You will need to remove any OSDs using that backend before you upgrade any test clusters that use it. From aef00eb891db6febd548857f28f59ff241e5142f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 12 Aug 2015 16:43:38 -0400 Subject: [PATCH 116/654] ceph-disk: fix get_ceph_user getpwnam throws an exception Signed-off-by: Sage Weil --- src/ceph-disk | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ceph-disk b/src/ceph-disk index 20be6673ff3df..d7b3233cff5c7 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -713,9 +713,11 @@ def get_osd_id(path): return osd_id def get_ceph_user(): - if pwd.getpwnam('ceph') and grp.getgrnam('ceph'): + try: + pwd.getpwnam('ceph') + grp.getgrnam('ceph') return 'ceph' - else: + except KeyError: return 'root' def path_set_context(path): From b89d7521816990b3865fa025dc5408b63d118d8d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 12 Aug 2015 16:50:31 -0400 Subject: [PATCH 117/654] global_init: ignore --set{user,group} if not root Assume these options can only drop privileges; normal users can't setuid root (and even if they could we wouldn't want to). This makes ceph-disk behave when it sees the ceph user as 'root' and invokes things with --setuser root but runs as a non-root user. 
Signed-off-by: Sage Weil --- src/global/global_init.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/global/global_init.cc b/src/global/global_init.cc index a073613fa67c4..ce044078fbe89 100644 --- a/src/global/global_init.cc +++ b/src/global/global_init.cc @@ -133,6 +133,20 @@ void global_init(std::vector < const char * > *alt_def_args, if (g_conf->log_flush_on_exit) g_ceph_context->_log->set_flush_on_exit(); + // consider --setuser root a no-op, even if we're not root + if (getuid() != 0) { + if (g_conf->setuser.length()) { + cerr << "ignoring --setuser " << g_conf->setuser << " since I am not root" + << std::endl; + g_conf->set_val("setuser", "", false, false); + } + if (g_conf->setgroup.length()) { + cerr << "ignoring --setgroup " << g_conf->setgroup + << " since I am not root" << std::endl; + g_conf->set_val("setgroup", "", false, false); + } + } + // drop privileges? if (g_conf->setgroup.length() || g_conf->setuser.length()) { From 7cd07490f219925efcbbfbd66625cd89ab9be19c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 12 Aug 2015 16:54:55 -0400 Subject: [PATCH 118/654] PendingReleaseNotes: more notes about the 'ceph' user Signed-off-by: Sage Weil --- PendingReleaseNotes | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 6108ee9eb63d2..e91ded961dbac 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -7,18 +7,27 @@ v9.0.4 Upgrading --------- -* Ceph daemons now run as user and group ceph by default. During upgrade, - administrators have two options: +* Ceph daemons now run as user and group ceph by default. The ceph user has a + static UID assigned by Fedora and Debian (also used by derivative distributions + like RHEL/CentOS and Ubuntu). On SUSE the ceph user will currently get a + dynamically assigned UID when the user is created. + + If your systems already have a ceph user, upgrading the package will cause + problems. 
We suggest you first remove or rename the existing 'ceph' user + before upgrading. + + When upgrading, administrators have two options: #. Add the following line to ``ceph.conf`` on all hosts:: setuser match path = /var/lib/ceph/$type/$cluster-$id - This will make the daemon remain root (i.e., not drop privileges and - switch to user ceph) if the daemon's data directory is still owned by - root. Newly deployed daemons will be created with data owned by user - ceph and will run with reduced privileges, but upgraded daemons will - continue to run as root. + This will make the Ceph daemons run as root (i.e., not drop + privileges and switch to user ceph) if the daemon's data + directory is still owned by root. Newly deployed daemons will + be created with data owned by user ceph and will run with + reduced privileges, but upgraded daemons will continue to run as + root. #. Fix the data ownership during the upgrade. This is the preferred option, but is more work. The process for each host would be to: From 18e0c7778a39c570c55da4bba1afece275384ff2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Aug 2015 09:12:51 -0400 Subject: [PATCH 119/654] debian: rename ceph-common.postinst There was a trailing space! 
Signed-off-by: Sage Weil --- debian/{ceph-common.postinst => ceph-common.postinst} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename debian/{ceph-common.postinst => ceph-common.postinst} (98%) diff --git a/debian/ceph-common.postinst b/debian/ceph-common.postinst similarity index 98% rename from debian/ceph-common.postinst rename to debian/ceph-common.postinst index 8b9e01f453d6c..f6e0d7a04c225 100644 --- a/debian/ceph-common.postinst +++ b/debian/ceph-common.postinst @@ -1,6 +1,6 @@ #!/bin/sh # vim: set noet ts=8: -# postinst script for ceph-mds +# postinst script for ceph-commont # # see: dh_installdeb(1) From 7c96016876cb86943e4683f61ac1ab35ada8c6bf Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Aug 2015 13:46:34 -0400 Subject: [PATCH 120/654] debian/ceph-common.postinst: fix /var/log/ceph permissions Signed-off-by: Sage Weil --- debian/ceph-common.postinst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/debian/ceph-common.postinst b/debian/ceph-common.postinst index f6e0d7a04c225..6a14f1ec1aeb3 100644 --- a/debian/ceph-common.postinst +++ b/debian/ceph-common.postinst @@ -80,7 +80,9 @@ case "$1" in if ! dpkg-statoverride --list /var/log/ceph >/dev/null then chown -R $SERVER_USER:$SERVER_GROUP /var/log/ceph - chmod u=rwx,g=rxs,o= /var/log/ceph + # members of group ceph can log here, but cannot remove + # others' files. non-members cannot read any logs. + chmod u=rwx,g=rwxs,o=t /var/log/ceph fi ;; From 94da8c1e7a1d3374f44da3ee95154b882d64e0ad Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Aug 2015 14:50:42 -0400 Subject: [PATCH 121/654] debian/ceph-common.postinst: fix adduser, addgroup Drop the extra groups thing--it's broken anyway. 
Signed-off-by: Sage Weil --- debian/ceph-common.postinst | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/debian/ceph-common.postinst b/debian/ceph-common.postinst index 6a14f1ec1aeb3..647693417593b 100644 --- a/debian/ceph-common.postinst +++ b/debian/ceph-common.postinst @@ -1,6 +1,6 @@ #!/bin/sh # vim: set noet ts=8: -# postinst script for ceph-commont +# postinst script for ceph-mds # # see: dh_installdeb(1) @@ -28,6 +28,8 @@ set -e [ -z "$SERVER_NAME" ] && SERVER_NAME="Ceph storage service" [ -z "$SERVER_GROUP" ] && SERVER_GROUP=ceph [ -z "$SERVER_UID" ] && SERVER_UID=64045 # alloc by Debian base-passwd maintainer +[ -z "$SERVER_GID" ] && SERVER_GID=$SERVER_UID + # Groups that the user will be added to, if undefined, then none. [ -z "$SERVER_ADDGROUP" ] && SERVER_ADDGROUP= @@ -47,7 +49,6 @@ case "$1" in echo -n "Adding system user $SERVER_USER.." adduser --quiet \ --system \ - --ingroup $SERVER_GROUP \ --no-create-home \ --disabled-password \ --uid $SERVER_UID \ @@ -56,20 +57,12 @@ case "$1" in echo "..done" fi # 3. adjust passwd entry + echo -n "Setting system user $SERVER_USER properties.." usermod -c "$SERVER_NAME" \ -d $SERVER_HOME \ -g $SERVER_GROUP \ - $SERVER_USER - # 4. Add the user to extra groups - if test -n $SERVER_ADDGROUP - then - if ! groups $SERVER_USER | cut -d: -f2 | \ - grep -qw $SERVER_ADDGROUP; then - echo -n "Adding user $SERVER_USER to groups $SERVER_ADDGROUP.." - adduser $SERVER_USER $SERVER_ADDGROUP - echo "..done" - fi - fi + $SERVER_USER + echo "..done" # 5. adjust file and directory permissions if ! dpkg-statoverride --list $SERVER_HOME >/dev/null From c86055398c9b054fccfabee6033745f86ec84025 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 19 Aug 2015 17:20:13 -0400 Subject: [PATCH 122/654] debian/ceph-common.dirs: install /var/lib/ceph It is the ceph user's home dir and we need to set the ownership. 
Signed-off-by: Sage Weil --- debian/ceph-common.dirs | 1 + 1 file changed, 1 insertion(+) diff --git a/debian/ceph-common.dirs b/debian/ceph-common.dirs index 4987b42ac97bd..a52e86d9727e7 100644 --- a/debian/ceph-common.dirs +++ b/debian/ceph-common.dirs @@ -1,2 +1,3 @@ etc/ceph var/log/ceph +var/lib/ceph From 48f98e1d7c06bd545e0bc35a84e4dc2accae8e83 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 19 Aug 2015 19:20:38 -0400 Subject: [PATCH 123/654] upstart: setuser ceph Signed-off-by: Sage Weil --- src/upstart/ceph-mds.conf | 2 +- src/upstart/ceph-mon.conf | 2 +- src/upstart/ceph-osd.conf | 2 +- src/upstart/radosgw.conf | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/upstart/ceph-mds.conf b/src/upstart/ceph-mds.conf index 4063d9116ebce..a392aaee6e16c 100644 --- a/src/upstart/ceph-mds.conf +++ b/src/upstart/ceph-mds.conf @@ -23,4 +23,4 @@ export id # this breaks oneiric #usage "cluster = name of cluster (defaults to 'ceph'); id = mds instance id" -exec /usr/bin/ceph-mds --cluster="${cluster:-ceph}" -i "$id" -f +exec /usr/bin/ceph-mds --cluster="${cluster:-ceph}" -i "$id" -f --setuser ceph --setgroup ceph diff --git a/src/upstart/ceph-mon.conf b/src/upstart/ceph-mon.conf index 83c98583c5d69..c266f6cc9f1ef 100644 --- a/src/upstart/ceph-mon.conf +++ b/src/upstart/ceph-mon.conf @@ -23,7 +23,7 @@ export id # this breaks oneiric #usage "cluster = name of cluster (defaults to 'ceph'); id = monitor instance id" -exec /usr/bin/ceph-mon --cluster="${cluster:-ceph}" -i "$id" -f +exec /usr/bin/ceph-mon --cluster="${cluster:-ceph}" -i "$id" -f --setuser ceph --setgroup ceph post-stop script # Cleanup socket in case of segfault diff --git a/src/upstart/ceph-osd.conf b/src/upstart/ceph-osd.conf index 2438c206f292b..f02f46571a055 100644 --- a/src/upstart/ceph-osd.conf +++ b/src/upstart/ceph-osd.conf @@ -22,4 +22,4 @@ instance ${cluster:-ceph}/$id export cluster export id -exec /usr/bin/ceph-osd --cluster="${cluster:-ceph}" -i "$id" -f +exec 
/usr/bin/ceph-osd --cluster="${cluster:-ceph}" -i "$id" -f --setuser ceph --setgroup ceph diff --git a/src/upstart/radosgw.conf b/src/upstart/radosgw.conf index d1b5bc3b83f06..fdc438006a93a 100644 --- a/src/upstart/radosgw.conf +++ b/src/upstart/radosgw.conf @@ -23,4 +23,4 @@ export id # this breaks oneiric #usage "cluster = name of cluster (defaults to 'ceph'); id = mds instance id" -exec /usr/bin/radosgw --cluster="${cluster:-ceph}" --id "$id" -f +exec /usr/bin/radosgw --cluster="${cluster:-ceph}" --id "$id" -f --setuser ceph --setgroup ceph From 743b15b412478e5c332e052cb94615270de87251 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 25 Aug 2015 12:32:22 -0400 Subject: [PATCH 124/654] debian: /var/run/ceph should be owned by ceph:ceph Fix upstart and sysvinit scripts to create /var/run/ceph properly. Chown existing dir on upgrade. Signed-off-by: Sage Weil --- debian/ceph-common.postinst | 7 +++++++ src/init-ceph.in | 5 ++++- src/upstart/ceph-mds.conf | 2 +- src/upstart/ceph-mon.conf | 2 +- src/upstart/ceph-osd.conf | 2 +- src/upstart/radosgw.conf | 2 +- 6 files changed, 15 insertions(+), 5 deletions(-) diff --git a/debian/ceph-common.postinst b/debian/ceph-common.postinst index 647693417593b..36410a3b630b6 100644 --- a/debian/ceph-common.postinst +++ b/debian/ceph-common.postinst @@ -78,6 +78,13 @@ case "$1" in chmod u=rwx,g=rwxs,o=t /var/log/ceph fi + # 6. fix /var/run/ceph + if [ -d /var/run/ceph ]; then + echo -n "Fixing /var/run/ceph ownership.." + chown $SERVER_USER:$SERVER_GROUP /var/run/ceph + echo "..done" + fi + ;; abort-upgrade|abort-remove|abort-deconfigure) : diff --git a/src/init-ceph.in b/src/init-ceph.in index acb57d4a9594b..3e3b3a44cd5d3 100755 --- a/src/init-ceph.in +++ b/src/init-ceph.in @@ -376,7 +376,10 @@ for name in $what; do fi echo Starting Ceph $name on $host... - mkdir -p $run_dir + if [ ! 
-d $run_dir ]; then + # assume /var/run exists + install -d -m0755 -o ceph -g ceph /var/run/ceph + fi get_conf pre_start_eval "" "pre start eval" [ -n "$pre_start_eval" ] && $pre_start_eval get_conf pre_start "" "pre start command" diff --git a/src/upstart/ceph-mds.conf b/src/upstart/ceph-mds.conf index a392aaee6e16c..5c74fc16b2e0f 100644 --- a/src/upstart/ceph-mds.conf +++ b/src/upstart/ceph-mds.conf @@ -13,7 +13,7 @@ pre-start script test -x /usr/bin/ceph-mds || { stop; exit 0; } test -d "/var/lib/ceph/mds/${cluster:-ceph}-$id" || { stop; exit 0; } - install -d -m0755 /var/run/ceph + install -d -m0755 -o ceph -g ceph /var/run/ceph end script instance ${cluster:-ceph}/$id diff --git a/src/upstart/ceph-mon.conf b/src/upstart/ceph-mon.conf index c266f6cc9f1ef..be4e0efad60f7 100644 --- a/src/upstart/ceph-mon.conf +++ b/src/upstart/ceph-mon.conf @@ -13,7 +13,7 @@ pre-start script test -x /usr/bin/ceph-mon || { stop; exit 0; } test -d "/var/lib/ceph/mon/${cluster:-ceph}-$id" || { stop; exit 0; } - install -d -m0755 /var/run/ceph + install -d -m0755 -o ceph -g ceph /var/run/ceph end script instance ${cluster:-ceph}/$id diff --git a/src/upstart/ceph-osd.conf b/src/upstart/ceph-osd.conf index f02f46571a055..a508b4126274c 100644 --- a/src/upstart/ceph-osd.conf +++ b/src/upstart/ceph-osd.conf @@ -13,7 +13,7 @@ pre-start script test -x /usr/bin/ceph-osd || { stop; exit 0; } test -d "/var/lib/ceph/osd/${cluster:-ceph}-$id" || { stop; exit 0; } - install -d -m0755 /var/run/ceph + install -d -m0755 -o ceph -g ceph /var/run/ceph /usr/libexec/ceph/ceph-osd-prestart.sh --cluster="${cluster:-ceph}" -i "$id" end script diff --git a/src/upstart/radosgw.conf b/src/upstart/radosgw.conf index fdc438006a93a..4cb6b5d08db25 100644 --- a/src/upstart/radosgw.conf +++ b/src/upstart/radosgw.conf @@ -13,7 +13,7 @@ pre-start script test -x /usr/bin/radosgw || { stop; exit 0; } test -d "/var/lib/ceph/radosgw/${cluster:-ceph}-$id" || { stop; exit 0; } - install -d -m0755 /var/run/ceph + install -d 
-m0755 -o ceph -g ceph /var/run/ceph end script instance ${cluster:-ceph}/$id From 6a735d625729f16ccb533a128097a0f623b05b14 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Thu, 27 Aug 2015 11:05:58 +0800 Subject: [PATCH 125/654] test: disable newstore test until it's merged Newstore hasn't been merged. It leads to a segment fault in one of the teuthology testing job since ObjectStore::create() returns NULL. Signed-off-by: Zhiqiang Wang --- src/test/objectstore/store_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index e6b4d8f55c721..170a965e2bfeb 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -2413,7 +2413,7 @@ TEST_P(StoreTest, SetAllocHint) { INSTANTIATE_TEST_CASE_P( ObjectStore, StoreTest, - ::testing::Values("memstore", "filestore", "keyvaluestore", "newstore")); + ::testing::Values("memstore", "filestore", "keyvaluestore"/*, "newstore" */)); #else From 83ba59780e0ed7ac7d4c4047d01dc15f879252d1 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Thu, 27 Aug 2015 11:17:17 +0800 Subject: [PATCH 126/654] test: handle the case when ObjectStore::create returns NULL When the objectstore type doesn't exist, NULL is returned in SetUp. Handle the NULL return code to avoid a segment fault. Signed-off-by: Zhiqiang Wang --- src/test/objectstore/store_test.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index 170a965e2bfeb..94c4bb3ce7cb3 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -55,6 +55,10 @@ class StoreTest : public ::testing::TestWithParam { string(GetParam()), string("store_test_temp_dir"), string("store_test_temp_journal")); + if (!store_) { + cerr << __func__ << ": objectstore type " << string(GetParam()) << " doesn't exist yet!" 
<< std::endl; + return; + } store.reset(store_); EXPECT_EQ(store->mkfs(), 0); EXPECT_EQ(store->mount(), 0); From 10fd1a560fb97bcc357b60992aa3d9e7d7bf11dc Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Thu, 27 Aug 2015 12:27:52 +0800 Subject: [PATCH 127/654] test: don't unmount when no store is created When store is not created in SetUp for some reasons, unmounting it would lead to segment fault. Signed-off-by: Zhiqiang Wang --- src/test/objectstore/store_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index 94c4bb3ce7cb3..7cb07664e1fd0 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -65,7 +65,8 @@ class StoreTest : public ::testing::TestWithParam { } virtual void TearDown() { - store->umount(); + if (store) + store->umount(); } }; From 24f4d22d215dda9f7264849be7a6384f4dad8f2c Mon Sep 17 00:00:00 2001 From: Haomai Wang Date: Wed, 26 Aug 2015 10:36:10 +0800 Subject: [PATCH 128/654] TestMsgr: Fix forever hang under lossless policy and one is WAIT another down Signed-off-by: Haomai Wang --- src/test/msgr/test_msgr.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/test/msgr/test_msgr.cc b/src/test/msgr/test_msgr.cc index f8f1928de2e07..03692eb288a19 100644 --- a/src/test/msgr/test_msgr.cc +++ b/src/test/msgr/test_msgr.cc @@ -983,6 +983,15 @@ class SyntheticWorkload { ConnectionRef conn = _get_random_connection(); dispatcher.clear_pending(conn); conn->mark_down(); + pair &p = available_connections[conn]; + // it's a lossless policy, so we need to mark down each side + if (!p.first->get_default_policy().server && !p.second->get_default_policy().server) { + ASSERT_EQ(conn->get_messenger(), p.first); + ConnectionRef peer = p.second->get_connection(p.first->get_myinst()); + peer->mark_down(); + dispatcher.clear_pending(peer); + available_connections.erase(peer); + } ASSERT_EQ(available_connections.erase(conn), 
1U); } From 328d30cb7859b2dc1a58c472ba17ea3356acc131 Mon Sep 17 00:00:00 2001 From: Haomai Wang Date: Wed, 26 Aug 2015 10:49:32 +0800 Subject: [PATCH 129/654] AsyncConnection: Fix uninitialized variable compile warning Signed-off-by: Haomai Wang --- src/msg/async/AsyncConnection.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc index aaf9413415c9f..36a19f80cccce 100644 --- a/src/msg/async/AsyncConnection.cc +++ b/src/msg/async/AsyncConnection.cc @@ -536,7 +536,7 @@ void AsyncConnection::process() ldout(async_msgr->cct, 20) << __func__ << " begin MSG" << dendl; ceph_msg_header header; ceph_msg_header_old oldheader; - __u32 header_crc; + __u32 header_crc = 0; int len; if (has_feature(CEPH_FEATURE_NOSRCADDR)) len = sizeof(header); From e2c3c8638d6cd1bacfb2a3c659e4e3d8db74b6d6 Mon Sep 17 00:00:00 2001 From: guce Date: Thu, 27 Aug 2015 17:32:26 +0800 Subject: [PATCH 130/654] rgw: add key parameter conflict check for radosgw-admin command line. 1.--access-key & --gen-access-key radosgw-admin key create --uid=aaa --access-key='111' --gen-access-key --gen-secret --access-key effective 2.--secret & --gen-secret radosgw-admin key create --uid=aaa --access-key='222' --secret='222' --gen-secret --gen-secret effective test fix: before: conflict parameters can be configured. after changes, do the same procedure, can check for conflicts. 
Signed-off-by: Ce Gu --- src/rgw/rgw_admin.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc index 1140cbdbbc5f8..f8fd27b028c6d 100644 --- a/src/rgw/rgw_admin.cc +++ b/src/rgw/rgw_admin.cc @@ -1385,6 +1385,16 @@ int main(int argc, char **argv) break; } } + + /* check key parameter conflict */ + if ((!access_key.empty()) && gen_access_key) { + cerr << "ERROR: key parameter conflict, --access-key & --gen-access-key" << std::endl; + return -EINVAL; + } + if ((!secret_key.empty()) && gen_secret_key) { + cerr << "ERROR: key parameter conflict, --secret & --gen-secret" << std::endl; + return -EINVAL; + } } // default to pretty json From e749b214e7eb70aa56b61c12de3d988bdf9b3413 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 11 Aug 2015 15:56:59 +0100 Subject: [PATCH 131/654] objclass: enable unregistering filter factory Signed-off-by: John Spray --- src/objclass/class_api.cc | 19 +++++++++++++++---- src/objclass/objclass.h | 5 ++++- src/osd/ClassHandler.cc | 22 ++++++++++++++++++++-- src/osd/ClassHandler.h | 25 +++++++++++++++++++++---- src/osd/ReplicatedPG.cc | 6 +++--- 5 files changed, 63 insertions(+), 14 deletions(-) diff --git a/src/objclass/class_api.cc b/src/objclass/class_api.cc index 09b7f4f93c218..a3c065cc726ce 100644 --- a/src/objclass/class_api.cc +++ b/src/objclass/class_api.cc @@ -82,12 +82,23 @@ int cls_unregister_method(cls_method_handle_t handle) return 1; } -int cls_register_cxx_filter(cls_handle_t hclass, const std::string &filter_name, - cls_cxx_filter_factory_t fn) +int cls_register_cxx_filter(cls_handle_t hclass, + const std::string &filter_name, + cls_cxx_filter_factory_t fn, + cls_filter_handle_t *handle) { ClassHandler::ClassData *cls = (ClassHandler::ClassData *)hclass; - cls->register_cxx_filter(filter_name, fn); - return 0; + cls_filter_handle_t hfilter = (cls_filter_handle_t)cls->register_cxx_filter(filter_name, fn); + if (handle) { + *handle = hfilter; + } + return (hfilter != 
NULL); +} + +void cls_unregister_filter(cls_filter_handle_t handle) +{ + ClassHandler::ClassFilter *filter = (ClassHandler::ClassFilter *)handle; + filter->unregister(); } int cls_call(cls_method_context_t hctx, const char *cls, const char *method, diff --git a/src/objclass/objclass.h b/src/objclass/objclass.h index e14d6a6dc4b58..9275ad98b5f1d 100644 --- a/src/objclass/objclass.h +++ b/src/objclass/objclass.h @@ -35,6 +35,7 @@ void __cls_init(); typedef void *cls_handle_t; typedef void *cls_method_handle_t; +typedef void *cls_filter_handle_t; typedef void *cls_method_context_t; typedef int (*cls_method_call_t)(cls_method_context_t ctx, char *indata, int datalen, @@ -71,6 +72,7 @@ extern int cls_unregister(cls_handle_t); extern int cls_register_method(cls_handle_t hclass, const char *method, int flags, cls_method_call_t class_call, cls_method_handle_t *handle); extern int cls_unregister_method(cls_method_handle_t handle); +extern void cls_unregister_filter(cls_filter_handle_t handle); @@ -128,7 +130,8 @@ extern int cls_register_cxx_method(cls_handle_t hclass, const char *method, int extern int cls_register_cxx_filter(cls_handle_t hclass, const std::string &filter_name, - cls_cxx_filter_factory_t fn); + cls_cxx_filter_factory_t fn, + cls_filter_handle_t *handle=NULL); extern int cls_cxx_create(cls_method_context_t hctx, bool exclusive); extern int cls_cxx_remove(cls_method_context_t hctx); diff --git a/src/osd/ClassHandler.cc b/src/osd/ClassHandler.cc index ea6fed89fd6f3..87d5a75a30122 100644 --- a/src/osd/ClassHandler.cc +++ b/src/osd/ClassHandler.cc @@ -221,11 +221,15 @@ ClassHandler::ClassMethod *ClassHandler::ClassData::register_cxx_method(const ch return &method; } -void ClassHandler::ClassData::register_cxx_filter( +ClassHandler::ClassFilter *ClassHandler::ClassData::register_cxx_filter( const std::string &filter_name, cls_cxx_filter_factory_t fn) { - filters_map[filter_name] = fn; + ClassFilter &filter = filters_map[filter_name]; + filter.fn = fn; + 
filter.name = filter_name; + filter.cls = this; + return &filter; } ClassHandler::ClassMethod *ClassHandler::ClassData::_get_method(const char *mname) @@ -259,6 +263,20 @@ void ClassHandler::ClassMethod::unregister() cls->unregister_method(this); } +void ClassHandler::ClassData::unregister_filter(ClassHandler::ClassFilter *filter) +{ + /* no need for locking, called under the class_init mutex */ + map::iterator iter = filters_map.find(filter->name); + if (iter == filters_map.end()) + return; + filters_map.erase(iter); +} + +void ClassHandler::ClassFilter::unregister() +{ + cls->unregister_filter(this); +} + int ClassHandler::ClassMethod::exec(cls_method_context_t ctx, bufferlist& indata, bufferlist& outdata) { int ret; diff --git a/src/osd/ClassHandler.h b/src/osd/ClassHandler.h index 843214d8d527d..e4bb9991e1205 100644 --- a/src/osd/ClassHandler.h +++ b/src/osd/ClassHandler.h @@ -35,6 +35,17 @@ class ClassHandler ClassMethod() : cls(0), flags(0), func(0), cxx_func(0) {} }; + struct ClassFilter { + struct ClassHandler::ClassData *cls; + std::string name; + cls_cxx_filter_factory_t fn; + + void unregister(); + + ClassFilter() : fn(0) + {} + }; + struct ClassData { enum Status { CLASS_UNKNOWN, @@ -49,7 +60,7 @@ class ClassHandler void *handle; map methods_map; - map filters_map; + map filters_map; set dependencies; /* our dependencies */ set missing_dependencies; /* only missing dependencies */ @@ -65,9 +76,10 @@ class ClassHandler ClassMethod *register_cxx_method(const char *mname, int flags, cls_method_cxx_call_t func); void unregister_method(ClassMethod *method); - void register_cxx_filter( + ClassFilter *register_cxx_filter( const std::string &filter_name, cls_cxx_filter_factory_t fn); + void unregister_filter(ClassFilter *method); ClassMethod *get_method(const char *mname) { Mutex::Locker l(handler->mutex); @@ -75,10 +87,15 @@ class ClassHandler } int get_method_flags(const char *mname); - cls_cxx_filter_factory_t get_filter(const std::string &filter_name) + 
ClassFilter *get_filter(const std::string &filter_name) { Mutex::Locker l(handler->mutex); - return filters_map[filter_name]; + std::map::iterator i = filters_map.find(name); + if (i == filters_map.end()) { + return NULL; + } else { + return &(i->second); + } } }; diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 20087794d4fdc..e7df89023e86b 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -623,13 +623,13 @@ int ReplicatedPG::get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilt assert(cls); } - cls_cxx_filter_factory_t fn = cls->get_filter(filter_name); - if (fn == NULL) { + ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name); + if (class_filter == NULL) { derr << "Error finding filter '" << filter_name << "' in class " << class_name << dendl; return -EINVAL; } - filter = fn(&iter); + filter = class_filter->fn(&iter); assert(filter); } From 60d51fc4a987cc9d92be9f0691c47f44f45fed6a Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 11 Aug 2015 13:17:22 +0100 Subject: [PATCH 132/654] cls: add a filter to the hello class for testing Signed-off-by: John Spray --- src/cls/hello/cls_hello.cc | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/cls/hello/cls_hello.cc b/src/cls/hello/cls_hello.cc index 0d5c78b161762..a7bc9e865ff0c 100644 --- a/src/cls/hello/cls_hello.cc +++ b/src/cls/hello/cls_hello.cc @@ -258,6 +258,35 @@ static int bad_writer(cls_method_context_t hctx, bufferlist *in, bufferlist *out } +class PGLSHelloFilter : public PGLSFilter { + string val; +public: + PGLSHelloFilter(bufferlist::iterator& params) { + ::decode(xattr, params); + ::decode(val, params); + } + virtual ~PGLSHelloFilter() {} + virtual bool filter(const hobject_t &obj, bufferlist& xattr_data, + bufferlist& outdata) + { + if (val.size() != xattr_data.length()) + return false; + + if (memcmp(val.c_str(), xattr_data.c_str(), val.size())) + return false; + + return true; + } +}; + + 
+PGLSFilter *hello_filter(bufferlist::iterator *q) +{ + assert(q); + return new PGLSHelloFilter(*q); +} + + /** * initialize class * @@ -304,4 +333,7 @@ void __cls_init() bad_reader, &h_bad_reader); cls_register_cxx_method(h_class, "bad_writer", CLS_METHOD_RD, bad_writer, &h_bad_writer); + + // A PGLS filter + cls_register_cxx_filter(h_class, "hello", hello_filter); } From 2777438b362bb36295618a1fff252b9e1b12adc5 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 11 Aug 2015 13:30:19 +0100 Subject: [PATCH 133/654] test: add a test for filter in cls hello Signed-off-by: John Spray --- src/test/cls_hello/test_cls_hello.cc | 52 ++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/src/test/cls_hello/test_cls_hello.cc b/src/test/cls_hello/test_cls_hello.cc index 58ecb97f9f6d6..efd9fc749f5c0 100644 --- a/src/test/cls_hello/test_cls_hello.cc +++ b/src/test/cls_hello/test_cls_hello.cc @@ -16,6 +16,7 @@ #include #include "include/rados/librados.hpp" +#include "include/encoding.h" #include "test/librados/test.h" #include "gtest/gtest.h" @@ -131,3 +132,54 @@ TEST(ClsHello, BadMethods) { ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster)); } + +TEST(ClsHello, Filter) { + Rados cluster; + std::string pool_name = get_temp_pool_name(); + ASSERT_EQ("", create_one_pool_pp(pool_name, cluster)); + IoCtx ioctx; + cluster.ioctx_create(pool_name.c_str(), ioctx); + + char buf[128]; + memset(buf, 0xcc, sizeof(buf)); + bufferlist obj_content; + obj_content.append(buf, sizeof(buf)); + + std::string target_str = "content"; + + // Write xattr bare, no ::encod'ing + bufferlist target_val; + target_val.append(target_str); + bufferlist nontarget_val; + nontarget_val.append("rhubarb"); + + ASSERT_EQ(0, ioctx.write("has_xattr", obj_content, obj_content.length(), 0)); + ASSERT_EQ(0, ioctx.write("has_wrong_xattr", obj_content, obj_content.length(), 0)); + ASSERT_EQ(0, ioctx.write("no_xattr", obj_content, obj_content.length(), 0)); + + ASSERT_EQ(0, 
ioctx.setxattr("has_xattr", "theattr", target_val)); + ASSERT_EQ(0, ioctx.setxattr("has_wrong_xattr", "theattr", nontarget_val)); + + bufferlist filter_bl; + std::string filter_name = "hello.hello"; + ::encode(filter_name, filter_bl); + ::encode("_theattr", filter_bl); + ::encode(target_str, filter_bl); + + NObjectIterator iter(ioctx.nobjects_begin(filter_bl)); + bool foundit = false; + int k = 0; + while (iter != ioctx.nobjects_end()) { + foundit = true; + // We should only see the object that matches the filter + ASSERT_EQ((*iter).get_oid(), "has_xattr"); + // We should only see it once + ASSERT_EQ(k, 0); + ++iter; + ++k; + } + ASSERT_TRUE(foundit); + + ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster)); +} + From f5df1e4884997516459b35a9fa03cc3f69a3c019 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 27 Aug 2015 14:12:59 +0100 Subject: [PATCH 134/654] osd: separate filter init from construction ...so that implementations can readily handle decode errors in client args and return an error code. 
Signed-off-by: John Spray --- src/cls/hello/cls_hello.cc | 16 ++++++----- src/objclass/objclass.h | 11 +++++--- src/osd/ReplicatedPG.cc | 55 +++++++++++++++++++++++++++++--------- 3 files changed, 61 insertions(+), 21 deletions(-) diff --git a/src/cls/hello/cls_hello.cc b/src/cls/hello/cls_hello.cc index a7bc9e865ff0c..d1adbd4a4cf7c 100644 --- a/src/cls/hello/cls_hello.cc +++ b/src/cls/hello/cls_hello.cc @@ -261,10 +261,15 @@ static int bad_writer(cls_method_context_t hctx, bufferlist *in, bufferlist *out class PGLSHelloFilter : public PGLSFilter { string val; public: - PGLSHelloFilter(bufferlist::iterator& params) { - ::decode(xattr, params); - ::decode(val, params); + int init(bufferlist::iterator& params) { + try { + ::decode(xattr, params); + ::decode(val, params); + } catch (buffer::error &e) { + return -EINVAL; + } } + virtual ~PGLSHelloFilter() {} virtual bool filter(const hobject_t &obj, bufferlist& xattr_data, bufferlist& outdata) @@ -280,10 +285,9 @@ class PGLSHelloFilter : public PGLSFilter { }; -PGLSFilter *hello_filter(bufferlist::iterator *q) +PGLSFilter *hello_filter() { - assert(q); - return new PGLSHelloFilter(*q); + return new PGLSHelloFilter(); } diff --git a/src/objclass/objclass.h b/src/objclass/objclass.h index 9275ad98b5f1d..58777a00d897c 100644 --- a/src/objclass/objclass.h +++ b/src/objclass/objclass.h @@ -106,6 +106,13 @@ class PGLSFilter { virtual bool filter(const hobject_t &obj, bufferlist& xattr_data, bufferlist& outdata) = 0; + /** + * Arguments passed from the RADOS client. Implementations must + * handle any encoding errors, and return an appropriate error code, + * or 0 on valid input. + */ + virtual int init(bufferlist::iterator ¶ms) = 0; + /** * xattr key, or empty string. 
If non-empty, this xattr will be fetched * and the value passed into ::filter @@ -120,9 +127,7 @@ class PGLSFilter { }; // Classes expose a filter constructor that returns a subclass of PGLSFilter -typedef PGLSFilter* (*cls_cxx_filter_factory_t)( - bufferlist::iterator *args); - +typedef PGLSFilter* (*cls_cxx_filter_factory_t)(); extern int cls_register_cxx_method(cls_handle_t hclass, const char *method, int flags, diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index e7df89023e86b..ac777d15fa91c 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -512,9 +512,16 @@ void ReplicatedPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef o class PGLSPlainFilter : public PGLSFilter { string val; public: - PGLSPlainFilter(bufferlist::iterator& params) { - ::decode(xattr, params); - ::decode(val, params); + virtual int init(bufferlist::iterator ¶ms) + { + try { + ::decode(xattr, params); + ::decode(val, params); + } catch (buffer::error &e) { + return -EINVAL; + } + + return 0; } virtual ~PGLSPlainFilter() {} virtual bool filter(const hobject_t &obj, bufferlist& xattr_data, @@ -524,10 +531,19 @@ class PGLSPlainFilter : public PGLSFilter { class PGLSParentFilter : public PGLSFilter { inodeno_t parent_ino; public: - PGLSParentFilter(bufferlist::iterator& params) { + PGLSParentFilter() { xattr = "_parent"; - ::decode(parent_ino, params); + } + virtual int init(bufferlist::iterator ¶ms) + { + try { + ::decode(parent_ino, params); + } catch (buffer::error &e) { + return -EINVAL; + } generic_dout(0) << "parent_ino=" << parent_ino << dendl; + + return 0; } virtual ~PGLSParentFilter() {} virtual bool filter(const hobject_t &obj, bufferlist& xattr_data, @@ -602,9 +618,9 @@ int ReplicatedPG::get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilt } if (type.compare("parent") == 0) { - filter = new PGLSParentFilter(iter); + filter = new PGLSParentFilter(); } else if (type.compare("plain") == 0) { - filter = new PGLSPlainFilter(iter); 
+ filter = new PGLSPlainFilter(); } else { std::size_t dot = type.find("."); if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) { @@ -629,13 +645,28 @@ int ReplicatedPG::get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilt << class_name << dendl; return -EINVAL; } - filter = class_filter->fn(&iter); - assert(filter); + filter = class_filter->fn(); + if (!filter) { + // Object classes are obliged to return us something, but let's + // give an error rather than asserting out. + derr << "Buggy class " << class_name << " failed to construct " + "filter " << filter_name << dendl; + return -EINVAL; + } } - *pfilter = filter; - - return 0; + assert(filter); + int r = filter->init(iter); + if (r < 0) { + derr << "Error initializing filter " << type << ": " + << cpp_strerror(r) << dendl; + delete filter; + return -EINVAL; + } else { + // Successfully constructed and initialized, return it. + *pfilter = filter; + return 0; + } } From 3eb36fc732609b4faae2d4484c953da36892eee5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 27 Aug 2015 10:06:07 -0400 Subject: [PATCH 135/654] doc/release-notes: v0.94.3 Fix up the release timeline link for v9.0.3 too. 
Signed-off-by: Sage Weil --- doc/changelog/v0.94.3.txt | 2660 +++++++++++++++++++++++++++++++++++++ doc/release-notes.rst | 23 +- doc/releases.rst | 4 +- 3 files changed, 2682 insertions(+), 5 deletions(-) create mode 100644 doc/changelog/v0.94.3.txt diff --git a/doc/changelog/v0.94.3.txt b/doc/changelog/v0.94.3.txt new file mode 100644 index 0000000000000..6078325657be2 --- /dev/null +++ b/doc/changelog/v0.94.3.txt @@ -0,0 +1,2660 @@ +commit 95cefea9fd9ab740263bf8bb4796fd864d9afe2b (tag: refs/tags/v0.94.3, refs/remotes/gh/hammer) +Author: Jenkins +Date: Wed Aug 26 10:39:37 2015 -0700 + + 0.94.3 + +commit 697101e4dfd9822050ce401b5f6212bfd81fea89 +Merge: 88e7ee7 81a311a +Author: Gregory Farnum +Date: Tue Aug 18 12:43:57 2015 +0100 + + Merge pull request #5589 from ceph/hammer-12709 + + Workunits : fs/misc/chmod.sh : Include ACL characters in permission check + + Reviewed-by: Greg Farnum + +commit 81a311a744987564b70852fdacfd915523c73b5d +Author: Yazen Ghannam +Date: Mon May 4 12:33:16 2015 -0400 + + Workunits : fs/misc/chmod.sh : Include ACL characters in permission check. 
+ + Signed-off-by: Yazen Ghannam + (cherry picked from commit d3dbfffefb0ae53583350f53258dc902670da659) + +commit 88e7ee716fdd7bcf81845087021a677de5a50da8 +Merge: bb12f92 1a32379 +Author: Loic Dachary +Date: Tue Aug 4 13:02:17 2015 +0200 + + Merge pull request #5160 from theanalyst/wip-11910-hammer + + mon: pg ls is broken + + Reviewed-by: Kefu Chai + +commit bb12f925cf0d78f97eefc2e271e73596050b9919 +Merge: e801d4c e19f928 +Author: Loic Dachary +Date: Tue Aug 4 12:34:18 2015 +0200 + + Merge pull request #5384 from dachary/wip-12502-hammer + + rgw: need conversion tool to handle fixes following #11974 + + Reviewed-by: Yehuda Sadeh + +commit e801d4c943b8004ef613345505df91057913cd39 +Merge: 78a4024 154f18c +Author: Loic Dachary +Date: Tue Aug 4 11:22:08 2015 +0200 + + Merge pull request #5405 from ceph/wip-12465-hammer + + Log::reopen_log_file: take m_flush_mutex + + Reviewed-by: Loic Dachary + +commit 78a4024c14253503b770070aa36a090c6b8f1eaf +Merge: a451e88 7034720 +Author: Sage Weil +Date: Fri Jul 31 15:18:52 2015 -0400 + + Merge pull request #5121 from theanalyst/wip-11983-hammer + + FAILED assert(!old_value.deleted()) in upgrade:giant-x-hammer-distro-basic-multi run + + Reviewed-by: Sage Weil + +commit a451e882ab9a929d240747b4b09786ca4b4ce377 +Merge: 218f537 1063f52 +Author: Sage Weil +Date: Fri Jul 31 15:17:00 2015 -0400 + + Merge pull request #5269 from dachary/wip-12362-hammer + + stuck incomplete + + Reviewed-by: Sage Weil + +commit 218f537491a46a0251bef7690a7f5a86b988ee63 +Merge: 07fa83a 8abc46a +Author: Loic Dachary +Date: Fri Jul 31 20:55:48 2015 +0200 + + Merge pull request #5117 from theanalyst/wip-12099-hammer + + rgw: rados objects wronly deleted + + Reviewed-by: Yehuda Sadeh + +commit 07fa83aefc9bd4ea1495fa1e117a438b2c460e46 +Merge: a69d431 56c2688 +Author: Loic Dachary +Date: Fri Jul 31 20:55:29 2015 +0200 + + Merge pull request #5118 from theanalyst/wip-12042-hammer + + DragonDisk fails to create directories via S3: MissingContentLength + + 
Reviewed-by: Yehuda Sadeh + +commit a69d431ea52b5de7fc5cfe142ff3b69ff64e8048 +Merge: 5353480 c78cc00 +Author: Loic Dachary +Date: Fri Jul 31 20:55:14 2015 +0200 + + Merge pull request #5214 from SUSE/wip-12299-hammer + + RGW Swift API: support for 202 Accepted response code on container creation + + Reviewed-by: Yehuda Sadeh + +commit 5353480f1df297ad9dd1bd3154887ed2564f0280 +Merge: fb9156f a5dbcbb +Author: Loic Dachary +Date: Fri Jul 31 20:54:57 2015 +0200 + + Merge pull request #5226 from SUSE/wip-12322-hammer + + rgw: keystone does not support chunked input + + Reviewed-by: Yehuda Sadeh + +commit fb9156f412f83c0ce1fe4c5c9ccd57fd79f0c992 +Merge: 7193c16 b1618a9 +Author: Loic Dachary +Date: Fri Jul 31 20:54:41 2015 +0200 + + Merge pull request #5227 from SUSE/wip-12323-hammer + + RGW Swift API: XML document generated in response for GET on account does not contain account name + + Reviewed-by: Yehuda Sadeh + +commit 7193c16b65fdc1694b968899d23eae0638d89f11 +Merge: ac86490 e39dce7 +Author: Loic Dachary +Date: Fri Jul 31 20:54:26 2015 +0200 + + Merge pull request #5228 from theanalyst/wip-11872-hammer + + RGW does not send Date HTTP header when civetweb frontend is used + + Reviewed-by: Yehuda Sadeh + +commit ac86490821336ce024940d48d82f7a5ff7a302b1 +Merge: 33dbfc6 557865c +Author: Loic Dachary +Date: Fri Jul 31 20:54:02 2015 +0200 + + Merge pull request #5229 from theanalyst/wip-12242-hammer + + Fix tool for #11442 does not correctly fix objects created via multipart uploads + + Reviewed-by: Yehuda Sadeh + +commit 33dbfc6919840882c6cbc10dad2fc24cf0720bf9 +Merge: 99ca62f e50caab +Author: Loic Dachary +Date: Fri Jul 31 20:53:41 2015 +0200 + + Merge pull request #5237 from theanalyst/wip-12245-hammer + + rgw: empty json response when getting user quota + + Reviewed-by: Yehuda Sadeh + +commit 99ca62f2bf1e21a41cb7b6ecdb8a8731a18de195 +Merge: 1f5f319 2357b6c +Author: Loic Dachary +Date: Fri Jul 31 20:53:18 2015 +0200 + + Merge pull request #5284 from 
SUSE/wip-12398-hammer + + rgw: Properly respond to the Connection header with Civetweb + + Reviewed-by: Yehuda Sadeh + +commit 1f5f31905bb5f499a2db4a02993dbc6efa1c4251 +Merge: 5cbb6cf 9458b84 +Author: Loic Dachary +Date: Fri Jul 31 20:53:04 2015 +0200 + + Merge pull request #5285 from SUSE/wip-12399-hammer + + rgw: multipart list part response returns incorrect field + + Reviewed-by: Yehuda Sadeh + +commit 5cbb6cfb69aad0db470f99e39e33f4b4b1abfb95 +Merge: 1df93e1 e4b55b3 +Author: Loic Dachary +Date: Fri Jul 31 20:52:43 2015 +0200 + + Merge pull request #5286 from SUSE/wip-12400-hammer + + rgw: radosgw-admin dumps user info twice + + Reviewed-by: Yehuda Sadeh + +commit 1df93e19a0275ed218c8f83bc674f16d1856f241 +Merge: a48cbc0 2ecb3b7 +Author: Yan, Zheng +Date: Fri Jul 31 09:50:22 2015 +0800 + + Merge pull request #5427 from dachary/wip-12088-hammer-part-2 + + Fh ref count will leak if readahead does not need to do read from osd + +commit 2ecb3b7f4a49c574bc178a106c6bf0d8247f2a5e +Author: Zhi Zhang +Date: Wed Jul 22 10:54:53 2015 +0800 + + Fh ref count will leak if readahead does not need to do read from osd + + The 3c8cdeacf46ae4031189d2ef6948aa3b6ab4ae43 backport introduced a leak. 
+ + http://tracker.ceph.com/issues/12319 Fixes: #12319 + + Signed-off-by: Zhi Zhang + +commit a48cbc0a847f19ea613b76a479acc831e9316c62 +Merge: 06c27cd 5ef0846 +Author: Loic Dachary +Date: Thu Jul 30 21:43:48 2015 +0200 + + Merge pull request #5120 from theanalyst/wip-11999-hammer + + cephfs Dumper tries to load whole journal into memory at once + + Reviewed-by: Greg Farnum + +commit 06c27cdd420598c497766ee5879335942a0acc09 +Merge: 19abe5e 408880b +Author: Loic Dachary +Date: Thu Jul 30 21:43:21 2015 +0200 + + Merge pull request #5119 from theanalyst/wip-12098-hammer + + kernel_untar_build fails on EL7 + + Reviewed-by: Greg Farnum + +commit 19abe5ee35c099c67b56ac268710fcd20bec60d3 +Merge: e3d17e4 4c199bf +Author: Loic Dachary +Date: Thu Jul 30 17:00:14 2015 +0200 + + Merge pull request #5417 from dachary/wip-11998-hammer + + debian/control: ceph-common (>> 0.94.2) must be >= 0.94.2-2 + + Reviewed-by: Sage Weil + +commit 4c199bf57dc54dc5e5f45cd9b34878a8459d434e +Author: Loic Dachary +Date: Thu Jul 30 09:43:20 2015 +0200 + + debian/control: ceph-common (>> 0.94.2) must be >= 0.94.2-2 + + The d8733be2ef8874b9a858a7ffddfb81b9b656e9a6 backport introduced a + regression by adding an incorrect Depends / Break combo supposed to + reflect the fact that ceph_argparse moved from ceph to ceph-common after + v0.94.2. It assumed the package is released under the 0.94.2 version + where in reality it is released under the 0.94.2-1xxx version (where xxx + is trusty, jessie etc.). + + The Depends / Break combo is changed to use 0.94.2-2 instead. + + See also http://tracker.ceph.com/issues/12529 for a larger discussion. 
+ + http://tracker.ceph.com/issues/11998 Fixes: #11998 + + Signed-off-by: Loic Dachary + +commit e3d17e49731569ea92917f574d42d93258c77189 +Merge: cbba706 89aa8ff +Author: Loic Dachary +Date: Wed Jul 29 16:22:36 2015 +0200 + + Merge pull request #5248 from ceph/wip-11833-hammer + + mon: add an "osd crush tree" command + + Reviewed-by: Kefu Chai + +commit cbba7064c6cc4cde3e8a49c25ce671e91d31b9c7 +Merge: 8355bda 3c8cdea +Author: Loic Dachary +Date: Wed Jul 29 10:49:08 2015 +0200 + + Merge pull request #5222 from ceph/hammer-12088 + + client: reference counting 'struct Fh' + + Reviewed-by: John Spray + +commit 8355bdab56bc4e5ce4d20ba3486c082f06d8dcd1 +Merge: 52d0e5d ec70533 +Author: Orit Wasserman +Date: Tue Jul 28 23:33:18 2015 +0200 + + Merge pull request #5231 from theanalyst/wip-12243-hammer + + Civetweb RGW appears to report full size of object as downloaded when only partially downloaded + +commit 52d0e5da5ebad7fe42c2e469cea9773c7714c2b5 +Merge: 7fd31b1 03c07d7 +Author: Loic Dachary +Date: Tue Jul 28 22:40:23 2015 +0200 + + Merge pull request #5243 from theanalyst/wip-12239-hammer + + librbd/internal.cc: 1967: FAILED assert(watchers.size() == 1) + + Reviewed-by: Josh Durgin + +commit 7fd31b1b3c2c8e9dd3d9e5464775422215f7a4bc +Merge: 7230de3 5c812c1 +Author: Loic Dachary +Date: Tue Jul 28 22:40:03 2015 +0200 + + Merge pull request #5241 from theanalyst/wip-12238-hammer + + [ FAILED ] TestLibRBD.ExclusiveLockTransition + + Reviewed-by: Josh Durgin + +commit 7230de317736a71a5764cf224bd1309da1c7b3c6 +Merge: 6b6228f 7132277 +Author: Loic Dachary +Date: Tue Jul 28 22:30:23 2015 +0200 + + Merge pull request #5265 from SUSE/wip-12368-hammer + + linking ceph to tcmalloc causes segfault on SUSE SLE11-SP3 + + Reviewed-by: Loic Dachary + +commit 6b6228f8949e975cac763513898ea9704cb8baf1 +Merge: d62c3ea f99f312 +Author: Loic Dachary +Date: Tue Jul 28 22:27:40 2015 +0200 + + Merge pull request #5280 from ceph/wip-12384-hammer + + librbd: add valgrind memory checks for unit tests 
+ + Reviewed-by: Loic Dachary + +commit d62c3ea344d9e49e9586867e872e8d5b3f019948 +Merge: 7b57ff8 b872882 +Author: Loic Dachary +Date: Tue Jul 28 22:26:25 2015 +0200 + + Merge pull request #5279 from ceph/wip-12237-hammer + + A client opening an image mid-resize can result in the object map being invalidated + + Reviewed-by: Loic Dachary + +commit 7b57ff8a9ced6c2f22456ed034cc83d07f82fbb3 +Merge: 481728a f819332 +Author: Loic Dachary +Date: Tue Jul 28 22:10:03 2015 +0200 + + Merge pull request #5283 from SUSE/wip-12397-hammer + + ceph.spec.in: 95-ceph-osd.rules, mount.ceph, and mount.fuse.ceph not installed properly on SUSE + + Reviewed-by: Loic Dachary + +commit 481728a04dd2c85096c3bc01cc37da9642b038ca +Merge: 54bb924 d8733be +Author: Loic Dachary +Date: Tue Jul 28 21:54:33 2015 +0200 + + Merge pull request #5206 from SUSE/wip-11998-hammer + + /usr/bin/ceph from ceph-common is broken without installing ceph + + Reviewed-by: Loic Dachary + +commit 54bb924e68ae2b4df65576a5d788d593b9d9e722 +Merge: e099058 c5c627f +Author: Loic Dachary +Date: Tue Jul 28 21:47:29 2015 +0200 + + Merge pull request #5055 from SUSE/wip-12044-hammer + + rgw/logrotate.conf calls service with wrong init script name + + Reviewed-by: Loic Dachary + +commit e0990583298277f1c631f7c2d2260d6c3fa64c9f +Merge: 8b93978 e149916 +Author: Loic Dachary +Date: Tue Jul 28 21:46:11 2015 +0200 + + Merge pull request #5040 from SUSE/wip-11964-hammer + + systemd: Increase max files open limit for OSD daemon + + Reviewed-by: Loic Dachary + +commit 8b93978881375d063fe2df8f40406ea650dda766 +Merge: 5a7cab2 22f58ce +Author: Loic Dachary +Date: Tue Jul 28 21:45:44 2015 +0200 + + Merge pull request #5038 from SUSE/wip-11876-hammer + + ceph-post-file fails on rhel7 + + Reviewed-by: Loic Dachary + +commit 5a7cab205bb1b3fdbf49a852cb978fc28eba8212 +Merge: 5218eff 38d36b1 +Author: Loic Dachary +Date: Tue Jul 28 21:45:25 2015 +0200 + + Merge pull request #5030 from SUSE/wip-12092-hammer + + packaging: add SuSEfirewall2 
service files + + Reviewed-by: Loic Dachary + +commit 5218eff07c303fb2762ea9f38b9a9c23c24efcae +Merge: 0b54d50 8acfb99 +Author: Loic Dachary +Date: Tue Jul 28 21:38:14 2015 +0200 + + Merge pull request #5028 from SUSE/wip-12090-hammer + + rcceph script is buggy + + Reviewed-by: Loic Dachary + +commit 0b54d50ecd1445dfc1a46552adb83b9dae9210d9 +Merge: 45beb86 37d77d3 +Author: Loic Dachary +Date: Tue Jul 28 21:37:44 2015 +0200 + + Merge pull request #5026 from SUSE/wip-12087-hammer + + max files open limit for OSD daemon is too low + + Reviewed-by: Loic Dachary + +commit e19f928bd770a37f2f631c4cd796e2e30a494234 +Author: Yehuda Sadeh +Date: Fri Jun 26 16:56:28 2015 -0700 + + rgw: conversion tool to fix broken multipart objects + + Fixes: #12079 + + Broken multipart objects: multipart objects that created on 0.94.2 + and that start with underscore have a bad locator on their tail objects. + This extends the tool that was needed for older issue we've had with + hammer multipart objects (that start with underscore). The same usage + applies: + + $ radosgw-admin bucket check --check-head-obj-locator \ + --bucket= [--fix] + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit f02ca6107172cecd80a490df9f0d66204e62326c) + +commit 28d32f6090724d62b6168d64031454f44eb4cc88 +Author: Yehuda Sadeh +Date: Fri Jun 26 13:49:55 2015 -0700 + + rgw: only scan for objects not in namespace + + Fixes: #11984 + The tool should only work on the head objects, and these are not inside + any namespace. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 8103908548bf7d6c9fa47fb181cd450670bae8d6) + +commit e22e2b43b4039a44f5f8fbbe59edc21fbe118bdc +Author: Yehuda Sadeh +Date: Wed Apr 22 16:04:35 2015 -0700 + + rgw_admin: add --remove-bad flag to bucket check + + Add this flag so that the bad object will be removed (should be called + only after user has verified that objects content is correct). 
+ + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 06d67d9139a95b704b80de527381fd1bbf7981ce) + +commit 154f18ce3e52094fe84b058565a865ed97b079d6 (refs/remotes/gh/wip-12465-hammer) +Author: Samuel Just +Date: Fri Jul 24 15:38:18 2015 -0700 + + Log::reopen_log_file: take m_flush_mutex + + Otherwise, _flush() might continue to write to m_fd after it's closed. + This might cause log data to go to a data object if the filestore then + reuses the fd during that time. + + Fixes: #12465 + Backport: firefly, hammer + Signed-off-by: Samuel Just + (cherry picked from commit 8778ab3a1ced7fab07662248af0c773df759653d) + +commit b8728823493b9dfde0333fb41725002fc50e4d9b (refs/remotes/gh/wip-12237-hammer) +Author: Jason Dillaman +Date: Sun Jul 5 10:47:38 2015 -0400 + + librados_test_stub: read op should return number of bytes read + + Signed-off-by: Jason Dillaman + (cherry picked from commit f8a7b507983e31399831e802e99429b95386ed41) + +commit 7d9fce3aa3832a1b8bd7f18abd4745dbc0033582 +Author: Jason Dillaman +Date: Sun Jul 5 10:35:28 2015 -0400 + + tests: fixed TestObjectMap.InvalidateFlagInMemoryOnly + + librados and librados_test_stub return different result codes + for a read full object operation. + + Signed-off-by: Jason Dillaman + (cherry picked from commit 2ace2b77f8ed83e753fe4a48bcc997f5d1dd465f) + +commit 4a77be0a65c8b4ec3dc437721f8c321737b260de +Author: Jason Dillaman +Date: Sun Jul 5 11:09:09 2015 -0400 + + librbd: don't attempt to invalidate an object map in R/O mode + + The ImageWatcher is not initialized when in R/O mode, which + resulted in a NULL pointer dereference. 
+ + Signed-off-by: Jason Dillaman + (cherry picked from commit 64d740f8fa10ba872e324ec2580a4d8c3f99a9ce) + +commit 0aea70f68b299441e692efdce6d5e7ff18b78c39 +Author: Jason Dillaman +Date: Tue Jun 23 11:17:12 2015 -0400 + + tests: add new unit tests for object map invalidation + + Signed-off-by: Jason Dillaman + (cherry picked from commit 0215e9753c09460f6fc84ded9397e36a209f2e32) + +commit c732cb889b4a61254d06703bf032082e56b196de +Author: Jason Dillaman +Date: Wed Mar 25 09:41:13 2015 -0400 + + librbd: move object map codes to common location + + These codes will need to be accessible from cls_rbd and librbd. + + Signed-off-by: Jason Dillaman + (cherry picked from commit 4ac584c34d576b489ed4c4862703b8fb427b3bc2) + +commit 27c99ea972a7b218ea591b208d0d1dd51eef6f95 +Author: Jason Dillaman +Date: Tue Jun 23 11:14:51 2015 -0400 + + librbd: only update image flags when holding exclusive lock + + It was possible for a client to open an image while another client + was shrinking an image. This would result in the former invalidating + the object map on-disk if it openned the image between updating the + image header and resizing the object map. + + Fixes: #11791 + Backport: hammer + Signed-off-by: Jason Dillaman + (cherry picked from commit eb81a6a7e391327ac993fd406443b206a7f7bffc) + +commit ef453630200ab72373f08357ca6b5ac5c5bbb397 +Author: Jason Dillaman +Date: Fri Jul 17 12:43:46 2015 -0400 + + librbd: new ImageWatcher::is_lock_supported method + + The new version does not attempt to acquire the snap_lock, to avoid + cases where a recursive lock would result. + + Signed-off-by: Jason Dillaman + +commit e4b55b398e68e870a7cf21276e63da2c4c6e3faa +Author: guce +Date: Sat Jul 11 14:08:33 2015 +0800 + + Fixes: #12286 radosgw-admin: after subuser modify print only once user info. + + remove rgw_admin.cc OPT_SUBUSER_MODIFY, show_user_info code block. + + switch (opt_cmd) { + ... 
+ case OPT_SUBUSER_MODIFY: + show_user_info(info, formatter); //show first time (remove this) + break; + ... + } + + // output the result of a user operation + if (output_user_info) { + ... + show_user_info(info, formatter); //show second time + } + + test fix: + before: after subuser modify print twice user info. + after changes, do the same procedure, print only once user info. + + Signed-off-by: guce guce@h3c.com + (cherry picked from commit c604dd97fc179e5c2f640818c0f6e7cf99701947) + +commit 9458b845bf863ccf878873c4f0b089ddf84c7203 +Author: Henry Chang +Date: Wed Apr 22 18:26:45 2015 +0800 + + rgw: fix ListParts response + + The response XML element name should be 'ListPartsResult'. + + Fixes: #11494 + + Signed-off-by: Henry Chang + (cherry picked from commit caa9f0e461f1eed526fc43ee74699a7243aef9b8) + +commit 2357b6c808f4f7c5997af48149585a6051c04b8f +Author: Wido den Hollander +Date: Sat Jul 11 00:01:52 2015 +0200 + + rgw: If the client sends a Connection: close header respond accordingly. + + HTTP/1.1 assumes Keep-Alive by default, but if a Connection: close header is send + the server should respond with it as well. + + This makes the client close the connection after the request. 
+ + Fixes: #12298 + (cherry picked from commit 79197d3711edc4b04a7ea4335b6e1b65754996d5) + +commit f819332e2826eae14849c5e68a380d1d87039d22 +Author: Nathan Cutler +Date: Thu Jul 9 21:38:46 2015 +0200 + + ceph.spec.in: install 95-ceph-osd.rules, mount.ceph, and mount.fuse.ceph properly on SUSE + + http://tracker.ceph.com/issues/12261 Fixes: #12261 + + Signed-off-by: Nathan Cutler + (cherry picked from commit 5ce38b9536efabf99a236c7a9d15c149fa4c16a6) + +commit d8733be2ef8874b9a858a7ffddfb81b9b656e9a6 +Author: Ken Dreyer +Date: Tue Apr 14 07:58:17 2015 -0600 + + debian: move ceph_argparse into ceph-common + + Prior to this commit, if a user installed the "ceph-common" Debian + package without installing "ceph", then /usr/bin/ceph would crash + because it was missing the ceph_argparse library. + + Ship the ceph_argparse library in "ceph-common" instead of "ceph". (This + was the intention of the original commit that moved argparse to "ceph", + 2a23eac54957e596d99985bb9e187a668251a9ec) + + http://tracker.ceph.com/issues/11388 Refs: #11388 + + Reported-by: Jens Rosenboom + Signed-off-by: Ken Dreyer + (cherry picked from commit 110608e5bdd9e2f03020ad41f0c2d756684d4417) + + Conflicts: + debian/ceph.install + There is no ceph_daemon.py in hammer + debian/control + Depends/Replaces/Breaks version adapted (from 9.0.0 to 0.94.2) + also adapted ceph-dbg Replaces/Breaks + +commit f99f3125ff76628e2525dca00bb7b983f941a08b (refs/remotes/gh/wip-12384-hammer) +Author: Zhiqiang Wang +Date: Fri Mar 20 16:15:42 2015 +0800 + + test: potential memory leak in FlushAioPP + + Should call the release function instead of deleting it to free + librbd::RBD::AioCompletion and librbd::AioCompletion. Otherwise there is + a potential memory leak. 
+ + Signed-off-by: Zhiqiang Wang + (cherry picked from commit ada7ec860cb7901c560c12a5af36dc7c23051b76) + +commit a4fc63af630e77586e3ba2f17df3b6be4a1e2055 +Author: Jason Dillaman +Date: Tue Apr 28 15:25:49 2015 -0400 + + pybind: fix valgrind warning on rbd_get_parent_info call + + Signed-off-by: Jason Dillaman + (cherry picked from commit 2586e3ba1e20603a87c833513e09dae9281beb4d) + +commit aa3eb28f6be62991bc790de5c19cb7b6e30fa189 +Author: Jason Dillaman +Date: Tue Apr 28 11:12:00 2015 -0400 + + osdc: invalid read of freed memory + + The bytes not in cache stat was potentially reading the bh length + from a deleted bufferhead. + + Signed-off-by: Jason Dillaman + (cherry picked from commit 5ccc4422d6172376bd6f1be8d3a99c0a54eab807) + +commit 18ede754388372cf210d7db87fa46f3536cf0e44 +Author: Jason Dillaman +Date: Tue Apr 28 10:56:15 2015 -0400 + + krbd: fix incorrect types in the krbd API + + The C API functions were referencing the C++ CephContext + instead of the C rados_config_t. Additionally, the ceph + namespace was missing on the Formatter class. + + Signed-off-by: Jason Dillaman + (cherry picked from commit 740fd275a60630e60b3bcf41637a2ca486885d9c) + +commit 488578c1d557ebec7e50d53e45ed46f42984f4f8 +Author: Jason Dillaman +Date: Tue Apr 28 10:54:47 2015 -0400 + + fsx: cleanup crypto library at exit + + Also made small tweaks so that it can be compiled under + a C++ compiler. 
+ + Signed-off-by: Jason Dillaman + (cherry picked from commit c44f8e7fbc19924a9453d8c032c624ebb6c0296f) + +commit 97ff6cb2f8fdd4d946eeab338ec225450e3ad8f3 +Author: Jason Dillaman +Date: Fri Apr 24 14:29:59 2015 -0400 + + tests: add run-rbd-valgrind-unit-tests.sh + + Signed-off-by: Jason Dillaman + (cherry picked from commit 5534faaa469b8a6a4c9687aad1a6723f3e859353) + +commit e690907cbb3b229f84f1e996d58636d00f823e8f +Author: Jason Dillaman +Date: Fri Apr 24 00:23:03 2015 -0400 + + valgrind: update valgrind suppressions for lttng-ust + + Signed-off-by: Jason Dillaman + (cherry picked from commit 8d87bdf597aad3d6be47aedd216a673bd9093a24) + +commit fe013e0a813c5697e917da642143388de60e8528 +Author: Jason Dillaman +Date: Fri Apr 24 00:21:15 2015 -0400 + + librbd: TaskFinisher should finish all queued tasks + + The destructor wasn't waiting for all Finisher tasks + to complete before stopping the thread. + + Signed-off-by: Jason Dillaman + (cherry picked from commit 8e20240e4155e2f0398e79f4c0095d2d6ba1d4cb) + +commit 43cd3ac923c9accfb81acf41f5bd12b8a05322c7 +Author: Jason Dillaman +Date: Thu Apr 23 23:10:23 2015 -0400 + + tests: fix valgrind errors with librbd unit test + + Signed-off-by: Jason Dillaman + (cherry picked from commit ed5472a10eb515e2a177a640c3f6ed929db9ee4f) + +commit 5d8d6a1a776f833847edc80d2a9b31ecb440ade5 +Author: Jason Dillaman +Date: Thu Apr 23 23:09:45 2015 -0400 + + tests: librbd should release global data before exit + + Signed-off-by: Jason Dillaman + (cherry picked from commit 6ab1bb5614a5d257a82cf8ea280eef5c90cf765b) + +commit 13f926e4e96d0b7178a9762bbbf589961dba47b7 +Author: Jason Dillaman +Date: Thu Apr 23 23:08:51 2015 -0400 + + librados_test_stub: cleanup singleton memory allocation + + Signed-off-by: Jason Dillaman + (cherry picked from commit 54c88255b74741d882b88f791497862635357634) + +commit 45beb86423c3bd74dbafd36c6822e71ad9680e17 +Merge: 5e399b0 582cf73 +Author: Loic Dachary +Date: Fri Jul 17 19:48:05 2015 +0200 + + Merge pull request 
#5046 from ceph/wip-12109-hammer + + librbd: new QA client upgrade tests + + Reviewed-by: Loic Dachary + +commit 1063f5275d1031812d564a1bd8ada64bed561026 +Author: Samuel Just +Date: Wed May 20 12:08:15 2015 -0700 + + PG::find_best_info: ignore info.les for incomplete peer + + See included update to doc/dev/osd_internals/last_epoch_started.rst + + Fixes: 11687 + Signed-off-by: Samuel Just + (cherry picked from commit 371d9baa120dc0302e9e61d3bc0e309dfaa773a0) + +commit 5e399b035d7cf861cf66a8ead00b388c4857cbb6 +Merge: 706b1c7 ad5745b +Author: Samuel Just +Date: Thu Jul 16 14:58:49 2015 -0700 + + Merge pull request #5159 from theanalyst/wip-11701-hammer + + make the all osd/filestore thread pool suicide timeouts separately configurable + + Reviewed-by: Samuel Just + +commit 713227791ab28c5e09073acb7b2c3c83ca0f0d6a +Author: Thorsten Behrens +Date: Mon Mar 16 00:13:38 2015 +0100 + + Conditional-compile against minimal tcmalloc. + + Certain older systems (SLE11 in this case) do not provide the full + tcmalloc functionality, due to e.g. incomplete libunwind + pieces. Use --with-tcmalloc-minimal to enable the cut-down + version. + + Here's how the various mem allocator switches interact now: + + --with-jemalloc: overrides --with-tcmalloc & --with-tcmalloc-minimal + --with-tcmalloc-minimal: overrides --with-tcmalloc + --with-tcmalloc: the default. 
use --without-tcmalloc to disable + + Signed-off-by: Thorsten Behrens + (cherry picked from commit c6f1c07113ca19547fdac10cd9b817a60142aee2) + +commit 706b1c7c5bcaaff96aa6950302b7aef097918d30 +Merge: daf5450 5e72479 +Author: Sage Weil +Date: Thu Jul 16 11:04:52 2015 -0400 + + Merge pull request #5252 from ceph/wip-12021-hammer + + OSDMonitor: allow addition of cache pool with non-empty snaps with co… + + Reviewed-by: Sage Weil + +commit daf5450765684b0b2ed049320d7463b637321e5a +Merge: d20f513 bd91fb0 +Author: Loic Dachary +Date: Thu Jul 16 01:15:51 2015 +0200 + + Merge pull request #4891 from theanalyst/wip-11740-hammer + + crush: take crashes due to invalid arg + + Reviewed-by: Loic Dachary + +commit ad5745bfd768b52ae6a766391232becad8587641 +Author: Samuel Just +Date: Wed May 6 10:49:00 2015 -0700 + + OSD: add command_wq suicide timeout + + Signed-off-by: Samuel Just + (cherry picked from commit df4e5de819c30003cfbe50a071c49039cf534419) + + Conflicts: + src/common/config_opts.h + Trivial merge conflict + +commit 059a579c02b312bbd32fa41485c361ae3847a3ba +Author: Samuel Just +Date: Wed May 6 10:54:31 2015 -0700 + + OSD: add remove_wq suicide timeout + + Signed-off-by: Samuel Just + (cherry picked from commit f2fbfa32a16666be46359f0eab7b04ca80a753f5) + +commit b8826bc4e3da6fcb9338ad6c01af1a88e6585a4d +Author: Samuel Just +Date: Wed May 6 10:52:40 2015 -0700 + + OSD: add scrub_wq suicide timeout + + Signed-off-by: Samuel Just + (cherry picked from commit 547a7041edc833f3cc8e04d388574809e30a8af6) + +commit 878dd403930a2058656a99c14b465358e134843c +Author: Samuel Just +Date: Wed May 6 10:51:28 2015 -0700 + + OSD: add snap_trim_wq suicide timeout + + Signed-off-by: Samuel Just + (cherry picked from commit e1073a4a577211672148a4112bd633831552d66f) + +commit 11575832a37ea247a8febe912b3058f51a464ab6 +Author: Samuel Just +Date: Wed May 6 10:50:19 2015 -0700 + + OSD: add recovery_wq suicide timeout + + Signed-off-by: Samuel Just + (cherry picked from commit 
85311b656852af75bfbbc6699f92fc6aa233c316) + + Conflicts: src/common/config_opts.h + + There was a merge conflict due to introduction of `osd_recovery_sleep` + which was introduced in #3829 + +commit a82b4505848c09ad0094768c886f2015bdaa1148 +Author: Samuel Just +Date: Wed May 6 11:02:19 2015 -0700 + + OSD: add op_wq suicide timeout + + Signed-off-by: Samuel Just + +commit 89aa8ff9855ae868d59bd10fe3a3aab8517e90fc +Author: Kefu Chai +Date: Tue Jun 2 23:52:22 2015 +0800 + + mon: add an "osd crush tree" command + + * to print crush buckets/items in a tree + + Fixes: #11833 + Signed-off-by: Kefu Chai + (cherry picked from commit 5436c290f3622feb8d4b279ed6552b2510e0cee9) + + Conflicts: + src/test/mon/osd-crush.sh: + do not start mon as run() takes care of it already + +commit d20f513d9b185eff82bee2ca719b5453358e740b +Merge: 8753b2b 3d74164 +Author: Loic Dachary +Date: Tue Jul 14 20:43:02 2015 +0200 + + Merge pull request #4899 from theanalyst/wip-11911-hammer + + start_flush: filter out removed snaps before determining snapc's + + Reviewed-by: Samuel Just + +commit 8753b2b14536c34a7b6dec927c7a5b8100de7f68 +Merge: 3d72652 ecac1a4 +Author: Loic Dachary +Date: Tue Jul 14 20:42:45 2015 +0200 + + Merge pull request #4868 from SUSE/wip-11879-hammer + + Clock skew causes missing summary and confuses Calamari + + Reviewed-by: Samuel Just + +commit 3d72652d7ba6b2fff3d39ea7965c3c61d5fa0a04 +Merge: 9a79e8e fdb43eb +Author: Loic Dachary +Date: Tue Jul 14 16:42:12 2015 +0200 + + Merge pull request #4883 from SUSE/wip-11638-hammer + + ceph.spec.in: ceph-common subpackage def needs tweaking for SUSE/openSUSE + + Reviewed-by: Ken Dreyer + +commit 03c07d76ac8361ddd302f5bc0575aee7fb5edc99 +Author: Jason Dillaman +Date: Fri Jun 26 09:59:36 2015 -0400 + + librbd: assertion failure race condition if watch disconnected + + It's possible for librbd's watch of the header object to be reset by + connection issues just prior to the image being removed. 
This will + causes an assertion failure which assumes at least one watcher on the + image. + + Fixes: #12176 + Backport: hammer, firefly + Signed-off-by: Jason Dillaman + (cherry picked from commit af276de4f299960e43761904c043924cec5fef11) + +commit 5c812c1552d954f2c91c000332ddc74c9e91825a +Author: Jason Dillaman +Date: Thu Jun 25 16:51:31 2015 -0400 + + librbd: prevent object map updates from being interrupted + + Object map updates were being canceled in-flight when the exclusive lock + is released. This resulted in an ERESTART error code bubbling up to + AioRequest. + + Fixes: 12165 + Backport: hammer + Signed-off-by: Jason Dillaman + (cherry picked from commit 590cdc90edaf4f4ff06c97eb2f43b92ab9b60084) + + Conflicts: + src/librbd/ObjectMap.h + conflict due to a variable `m_snap_id' which was introduced in PR #4140 + which is dropped as we are not backporting that feature + +commit e50caab2251bb68fea1adbd17acc43aa98ab1206 +Author: wuxingyi +Date: Tue Jun 23 01:46:48 2015 +0000 + + rgw: fix empty json response when getting user quota + + Fixes: #12117 + Signed-off-by: wuxingyi + (cherry picked from commit 64fceed2202c94edf28b8315fe14c9affa8c0116) + +commit ec705336551436517c16bffdc6bf5467899ae4bb +Author: Yehuda Sadeh +Date: Thu Jun 25 14:31:03 2015 -0700 + + rgw: error out if frontend did not send all data + + Fixes: #11851 + The civetweb mg_write() doesn't return error when it can't flush all data + to the user, it just sends the total number of bytes written. Modified the + client io to return total number of bytes and return an error if didn't + send anything. 
+ + Signed-off-by: Yehuda Sadeh + (cherry picked from commit daa679c3dd3770a6d5421e2cc9a36924f4550439) + +commit 557865c85bb907fe69248c4f1acb88320a7c1bb5 +Author: Yehuda Sadeh +Date: Wed Jun 17 15:11:28 2015 -0700 + + rgw: fix reset_loc() + + Fixes: #11974 + + Only need to set locator for underscore if namespace is empty + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit d3bd27f4855df6bb207b656527138026af1a36a2) + +commit b1618a97fef644dc3dced502d600de6a5d55d085 +Author: Radoslaw Zarzynski +Date: Mon Apr 20 14:55:00 2015 +0200 + + rgw: fix lack of account name in XML listing of Swift account. + + Fixes: #11431 + Backport: hammer + Signed-off-by: Radoslaw Zarzynski + (cherry picked from commit 837388bbc39a1bf9019302c3a4d3a3fe22caeeb4) + +commit e39dce7935dd513b77ce34bc79d70a2c23437cbb +Author: Radoslaw Zarzynski +Date: Wed Feb 18 15:48:43 2015 +0100 + + rgw: generate the "Date" HTTP header for civetweb. + + Fixes: #10873 + Backport: hammer + Signed-off-by: Radoslaw Zarzynski + (cherry picked from commit ea384f83b601f60e135c3d3f960fdb75a919dd84) + +commit a5dbcbbdddce6cdeccb1e6f5641601d673cd1896 +Author: Hervé Rousseau +Date: Mon Apr 27 17:54:30 2015 +0200 + + Swift: Set Content-Length when requesting/checking Keystone tokens + + Running Keystone with WSGIChunkedRequest=On is not supported. + + We have to make sure that we set the Content-Length header when getting + an admin token and checking revoked tokens, otherwise Keystone returns + a HTTP 411 error. + + Same applies when checking revoked tickets. + + Fixes: #11473 + Backport: Hammer, Firefly + Signed-off-by: Hervé Rousseau + (cherry picked from commit 24f477417fdac9d68902fa211c8edf92a2e8729f) + +commit 3c8cdeacf46ae4031189d2ef6948aa3b6ab4ae43 +Author: Yan, Zheng +Date: Wed Jul 8 10:11:43 2015 +0800 + + client: reference counting 'struct Fh' + + The async readahead finisher needs to reference 'struct Fh'. 
But + it's possible user closes FD and free the corresponding 'struct Fh' + before async readahead finishes. + + Fixes: #12088 + Signed-off-by: Yan, Zheng + (cherry picked from commit 34b939a81d38173b882c429b28dedce778504ba8) + +commit c78cc00afb6deb8022db60dbe8649335f61bd345 +Author: Radoslaw Zarzynski +Date: Thu Mar 19 14:52:18 2015 +0100 + + rgw: rectify 202 Accepted in response for PUT on existing bucket. + + Fixes: #11148 + Backport: hammer + Signed-off-by: Radoslaw Zarzynski + (cherry picked from commit 3998fe7e02a6c25a3302c80a9c9907357fd3a23e) + +commit 9a79e8e7da5f34f1adaf6137e01bcd42766ae677 +Merge: 5527720 7f1c0cc +Author: Kefu Chai +Date: Sun Jul 12 02:19:57 2015 +0800 + + Merge pull request #5208 from tchaikov/wip-11975-hammer + + tests: TEST_crush_reject_empty must not run a mon + + Reviewed-by: Loic Dachary + +commit 7f1c0cc9cd3deab925440b56d82c3e61a8ba5ab1 +Author: Kefu Chai +Date: Sat Jul 11 23:04:33 2015 +0800 + + crush/CrushTester: return EINVAL if crushtool returns non-zero + + this backports a tiny part of ec02441, otherwise + CrushTester will return 1, and "ceph" cli will take it + as EPERM, which is miss leading, and fails + osd-crush.sh:TEST_crush_reject_empty. + + Signed-off-by: Kefu Chai + +commit 2aaeea145b24b972a0b98549c3527ccf98f4c96f +Author: Loic Dachary +Date: Fri Jul 10 16:23:47 2015 +0200 + + tests: TEST_crush_reject_empty must not run a mon + + * Back in Hammer, the osd-crush.sh individual tests did not run the + monitor, it was taken care of by the run() function. 
An attempt to run + another mon fails with: + + error: IO lock testdir/osd-crush/a/store.db/LOCK: Resource temporarily + unavailable + + This problem was introduced by cc1cc033930e8690a57674e842a003f6bbc7a242 + from https://github.com/ceph/ceph/pull/4936 + * replace test/mon/mon-test-helpers.sh with test/ceph-helpers.sh as + we need run_osd() in this newly added test + * update the run-dir of commands: ceph-helpers.sh use the different + convention for the run-dir of daemons. + + http://tracker.ceph.com/issues/11975 Refs: #11975 + + Signed-off-by: Loic Dachary + +commit 80afb81124a0d2ef25a23a12c86617ab1da3a4bd +Author: Loic Dachary +Date: Tue Apr 21 16:11:33 2015 +0200 + + ceph-helpers: implement test_expect_failure + + To display the output in case the command did not fail with the expected + output. + + Signed-off-by: Loic Dachary + (cherry picked from commit 5871781b10ff0b26c731b70d1898c474006cbee3) + +commit 6b5e9a1df7dfb3a971e40aec35119ec019515b69 +Author: Loic Dachary +Date: Wed Jun 10 23:16:01 2015 +0200 + + tests: display the output of failed make check runs + + After a make check fails, it shows a summary but not the output of the + failed tests although they contain information to diagnose the problem. + + Set the VERBOSE=true automake variable which is documented to collect + and display the failed script output at the end of a run (the content of + the test-suite.log file (valid from automake-1.11 up). + + http://www.gnu.org/software/automake/manual/automake.html#index-VERBOSE + + Also remove the run-make-check.sh that did the same in a way that is not + compatible with automake-1.11. 
+ + Signed-off-by: Loic Dachary + (cherry picked from commit 3a55cb029bb7db9542d2b14f2deda90feb0ae0f6) + +commit 552772025cb8d5f51ffb3a069d1bd93bc73f1123 +Merge: f4d77c2 1440122 +Author: Loic Dachary +Date: Fri Jul 10 10:17:24 2015 +0200 + + Merge pull request #4889 from theanalyst/wip-11484-hammer + + OPT_INT option interprets 3221225472 as -1073741824, and crashes in Throttle::Throttle() + + Reviewed-by: Kefu Chai + +commit f4d77c22aa51edb45211e080f3fdf28a7a0cfdd4 +Merge: 5088105 a62c3aa +Author: Loic Dachary +Date: Fri Jul 10 10:16:25 2015 +0200 + + Merge pull request #4776 from tchaikov/wip-11279-hammer + + ceph: cli interactive mode does not understand quotes + + Reviewed-by: Kefu Chai + +commit 5088105300c013b1b804c938a30ac63ba710556d +Merge: e3b1f7b 0b6d442 +Author: Loic Dachary +Date: Fri Jul 10 10:14:42 2015 +0200 + + Merge pull request #4657 from ceph/wip-hammer-11535-admin-socket + + common/admin_socket: close socket descriptor in destructor + + Reviewed-by: Loic Dachary + +commit e3b1f7be9e8474fbec98076790ff683bccd44ce9 +Merge: dd29a86 558d639 +Author: Kefu Chai +Date: Fri Jul 10 16:07:48 2015 +0800 + + Merge pull request #4687 from SUSE/wip-7387-hammer + + utf8 and old gcc breakage on RHEL6.5 + + Reviewed-by: Kefu Chai + +commit dd29a869db5503fc9e2c6d1d44ee4311d95af20c +Merge: 7f1fb57 0e5e7e1 +Author: Kefu Chai +Date: Fri Jul 10 16:00:00 2015 +0800 + + Merge pull request #5122 from theanalyst/wip-11982-hammer + + ceph fails to compile with boost 1.58 + + Reviewed-by: Kefu Chai + +commit 7f1fb574608800c3e6aa12df6c7888acbf397a52 +Merge: adc7016 5141301 +Author: Kefu Chai +Date: Fri Jul 10 15:59:35 2015 +0800 + + Merge pull request #4936 from ceph/wip-11975-hammer + + mon crashes when "ceph osd tree 85 --format json" + + Reviewed-by: Kefu Chai + +commit adc70161d14fc2b51e6c6f38580f76ff0067717a +Merge: 2d68db8 82988d6 +Author: Loic Dachary +Date: Fri Jul 10 09:48:44 2015 +0200 + + Merge pull request #4892 from theanalyst/wip-11760-hammer + + ceph-disk: 
get_partition_type fails on /dev/cciss... + + Reviewed-by: Loic Dachary + +commit 2d68db8371263645642cf28473deea4456ca7021 +Merge: 1cffe8c ba1a016 +Author: Loic Dachary +Date: Fri Jul 10 09:43:57 2015 +0200 + + Merge pull request #4877 from SUSE/wip-11902-hammer + + admin/build-doc: script fails silently under certain circumstances + + Reviewed-by: Loic Dachary + +commit 51413011417b76f5ad2830d9f93fbfe78c77e467 +Author: Kefu Chai +Date: Tue May 26 18:11:59 2015 +0800 + + mon: add "--check" to CrushTester::test_with_crushtool() + + so we don't need to call CrushTester::check_name_maps() in OSDMonitor.cc + anymore. + + Fixes: #11680 + Signed-off-by: Kefu Chai + (cherry picked from commit c6e634875316cf17368d497e6dc4f6f4b5dd65d2) + +commit 5ec27cf589b4535f07e28a86bd304f7a46427ac4 +Author: Kefu Chai +Date: Tue May 26 17:51:50 2015 +0800 + + crushtool: rename "--check-names" to "--check" + + * because "--check" also checks for the max_id + + Note: edited since we do not have the fix introduced in 46103b2 in + hammer. + + Signed-off-by: Kefu Chai + (cherry picked from commit 9381d53acdce85fcbff828926b911e050ba36e51) + +commit 2a8fe8862a15342cc5716c146487d0b42af0fbf6 +Author: Kefu Chai +Date: Tue May 26 16:58:23 2015 +0800 + + mon: check the new crush map against osdmap.max_osd + + Fixes: #11680 + Signed-off-by: Kefu Chai + (cherry picked from commit 22e6bd6e01d5df3f3e897562597e22ca1737f8c8) + +commit c0b0f52ddbd4e22998a36addfb32f27614183e19 +Author: Kefu Chai +Date: Tue May 26 15:35:10 2015 +0800 + + crushtool: enable check against max_id + + add an argument "max_id" for "--check-names" to check if any item + has an id greater or equal to given "max_id" in crush map. + + Note: edited since we do not have the fix introduced in 46103b2 in + hammer. 
+ + Signed-off-by: Kefu Chai + (cherry picked from commit d0658dd3cdf072b2a7c2a1986f8785a697c591ee) + +commit f041bbebf98a2aff2ad542e8d0c12c46af427573 +Author: Kefu Chai +Date: Tue May 26 15:34:33 2015 +0800 + + crush/CrushTester: check if any item id is too large + + Signed-off-by: Kefu Chai + (cherry picked from commit e640d89240017956b8c7411babb86be0f1e2b172) + +commit cc1cc033930e8690a57674e842a003f6bbc7a242 +Author: Kefu Chai +Date: Mon May 25 20:14:32 2015 +0800 + + mon: validate new crush for unknown names + + * the "osd tree dump" command enumerates all buckets/osds found in either the + crush map or the osd map. but the newly set crushmap is not validated for + the dangling references, so we need to check to see if any item in new crush + map is referencing unknown type/name when a new crush map is sent to + monitor, reject it if any. + + Fixes: #11680 + Signed-off-by: Kefu Chai + (cherry picked from commit a955f36a509e5412b1f72632a1a956d99e768e35) + +commit ff29a7f9dd21505c681881e609183aed9ac3250e +Author: Kefu Chai +Date: Tue May 26 12:08:36 2015 +0800 + + crushtool: add the "--check-names" option + + * so one is able to verify that the "ceph osd tree" won't chock on the + new crush map because of dangling name/type references + + Signed-off-by: Kefu Chai + (cherry picked from commit d6b46d4c7b722945ce24ac2930381a109b1e3dda) + +commit 960ea49699f421ceb89c9e0c9430378a35f09a9a +Author: Kefu Chai +Date: Tue May 26 12:08:09 2015 +0800 + + crush/CrushTester: add check_name_maps() method + + * check for dangling bucket name or type names referenced by the + buckets/items in the crush map. + * also check for the references from Item(0, 0, 0) which does not + necessarily exist in the crush map under testing. the rationale + behind this is: the "ceph osd tree" will also print stray OSDs + whose id is greater or equal to 0. 
so it would be useful to + check if the crush map offers the type name indexed by "0" + (the name of OSDs is always "OSD.{id}", so we don't need to + look up the name of an OSD item in the crushmap). + + Signed-off-by: Kefu Chai + (cherry picked from commit b75384d73958faf81d45847a7dfa56f4fa347e6f) + +commit 1cffe8c46a39142ee0da4e2279eda7276df262e1 +Merge: 6ffb1c4 ef6641c +Author: Loic Dachary +Date: Fri Jul 10 09:39:15 2015 +0200 + + Merge pull request #4667 from SUSE/wip-11611-hammer + + ceph.spec: update OpenSUSE BuildRequires + + Reviewed-by: Loic Dachary + +commit 5e72479b0882ac13597d7a613698e583dcb2b932 +Author: Samuel Just +Date: Tue Jul 7 11:43:01 2015 -0700 + + OSDMonitor: allow addition of cache pool with non-empty snaps with config + + We need to be able to allow the version of ceph_test_* from earlier + versions of ceph to continue to work. This patch also adjusts the + work unit to use a single rados snap to test the condition without + --force-nonempty to ensure that we don't need to be careful about + the config value when running that script. + + Signed-off-by: Samuel Just + +commit 6ffb1c4ae43bcde9f5fde40dd97959399135ed86 +Merge: c7ebf96 524f4a5 +Author: Gregory Farnum +Date: Wed Jul 8 16:52:12 2015 +0100 + + Merge pull request #5123 from theanalyst/wip-11979-hammer + + MDSMonitor: handle MDSBeacon messages properly + + Reviewed-by: Kefu Chai + +commit ecac1a458bc7331ed8d667f20ba31995d74892d3 +Author: Thorsten Behrens +Date: Fri Feb 6 01:26:40 2015 +0100 + + Always provide summary for non-healthy cluster + + This fixes a problem, wherein calamari does not provide + popup drill-downs for warnings or errors, should the summary + be missing. + + Calamari gets health info from /api/v1/cluster/$FSID/health. + If the data here has a summary field, this summary is provided + in a popup window: + + /api/v1/cluster/$FSID/health is populated (ultimately) with + status obtained via librados python bindings from the ceph + cluster. 
In the case where there's clock skew, the summary + field supplied by the ceph cluster is empty. + + No summary field, no popup window with more health details. + + Signed-off-by: Thorsten Behrens + (cherry picked from commit eaf6e0cf48488fe604d0ef0db164d44948d4e8d4) + +commit c7ebf96a9a4a6143b112c8606d5ee346fb800cec +Merge: b163728 1a321e4 +Author: Loic Dachary +Date: Wed Jul 8 15:36:38 2015 +0200 + + Merge pull request #4862 from SUSE/wip-11874-hammer + + Bucket header is enclosed by quotes + + Reviewed-by: Yehuda Sadeh + +commit b1637289ec4ff99d923457577893b4c4a8d2e9fe +Merge: e33af22 54f4e7d +Author: Loic Dachary +Date: Wed Jul 8 15:35:53 2015 +0200 + + Merge pull request #4885 from theanalyst/wip-11755-hammer + + Object copy bug + + Reviewed-by: Yehuda Sadeh + +commit e33af22dec32467f229ca2dc1dc9668702a44ce8 +Merge: 5696b0f 9dfef60 +Author: Loic Dachary +Date: Wed Jul 8 15:34:45 2015 +0200 + + Merge pull request #4884 from theanalyst/wip-11722-hammer + + Keystone PKI token expiration is not enforced + + Reviewed-by: Yehuda Sadeh + +commit 5696b0ff2a981ae8afe72df796ba7d7da47bb7d1 +Merge: 72ecd52 ed5442b +Author: Loic Dachary +Date: Wed Jul 8 15:30:48 2015 +0200 + + Merge pull request #4875 from ceph/wip-11770-hammer + + librbd: aio calls may block + + Reviewed-by: Jason Dillaman + Reviewed-by: Loic Dachary + +commit 1a32379dd6cb56ed69b8f448d3612506c8131fbe +Author: Kefu Chai +Date: Mon May 11 19:30:30 2015 +0800 + + mon/PGMap: add more constness + + Signed-off-by: Kefu Chai + (cherry picked from commit e1f1c56ce88ef3ad05e881d650fac637931ce3fe) + +commit 84ebc3d320bdf871ccf85e555951cea421b56021 +Author: Kefu Chai +Date: Mon May 11 19:29:13 2015 +0800 + + mon/PGMap: sort pg states by the states in "pg ls" spec + + Signed-off-by: Kefu Chai + (cherry picked from commit 990dfb6cd45438bc8293ac37882daa413860a2f8) + +commit e310461aecc667cf26806bc5a3bbabb05696bdfc +Author: Kefu Chai +Date: Mon May 11 17:02:41 2015 +0800 + + mon: s/recovery/recoverying/ in "pg ls*" 
commands' spec + + * also translate "repair" if specified as "states" + * update test_mon_pg in cephtool-test-mon.sh + + Fixes: #11569 + Signed-off-by: Kefu Chai + (cherry picked from commit 89f89ca3477eddcae11a05fbd58a8f3658eb1fc1) + +commit 524f4a52d115ecda8cd7793d0f8bea148eff92af +Author: Kefu Chai +Date: Fri May 15 22:50:36 2015 +0800 + + mon: always reply mdsbeacon + + the MDS (Beacon) is always expecting the reply for the mdsbeacon messages from + the lead mon, and it uses the delay as a metric for the laggy-ness of the + Beacon. when it comes to the MDSMonitor on a peon, it will remove the route + session at seeing a reply (route message) from leader, so a reply to + mdsbeacon will stop the peon from resending the mdsbeacon request to the + leader. + + if the MDSMonitor re-forwards the unreplied requests after they are + outdated, there are chances that the requests reflecting old and even wrong + state of the MDSs mislead the lead monitor. for example, the MDSs which sent + the outdated messages could be dead. + + Fixes: #11590 + Signed-off-by: Kefu Chai + (cherry picked from commit b3555e9c328633c9e1fbc27d652c004b30535e5b) + +commit 413e407dea446bebb9c36abb3025ada450dd4fe9 +Author: Kefu Chai +Date: Tue Jun 2 23:20:21 2015 -0700 + + mon/MDSMonitor: rename labels to a better name + + * s/ignore/reply/ + * s/out/ignore/ + + Signed-off-by: Kefu Chai + (cherry picked from commit f00ecb8b3df73ce6337985bc6d43bce5143ee537) + +commit a03968ad584a3ff8e351cc5dce053e535fcdc454 +Author: Kefu Chai +Date: Tue Jun 2 12:55:06 2015 +0800 + + mon: send no_reply() to peon to drop ignored mdsbeacon + + so the peon can remove the ignored mdsbeacon request from the + routed_requets at seeing this reply, and hence no longer resend the + request. 
+ + Fixes: #11590 + Signed-off-by: Kefu Chai + (cherry picked from commit 72a37b3a8e145d8522ea67fc14ce2c5510b6852b) + +commit 39f34596b0ec6f769f507e2b372204f8551f7ee0 +Author: Kefu Chai +Date: Tue Jun 2 12:22:26 2015 +0800 + + mon: remove unnecessary error handling + + msg.get_session() should always return a non-zero pointer in + Monitor.dispatch() + + Signed-off-by: Kefu Chai + (cherry picked from commit 16e8e2cc82a90c49cd8aa3d0e3acc4694ba659a0) + +commit 0e5e7e1d259579571c1fc05660f6af8e295e733b +Author: Kefu Chai +Date: Fri May 8 15:21:20 2015 +0800 + + mon: remove unused variable + + * as a side effect, this change silences + http://tracker.ceph.com/issues/11576 + + Fixes: #11576 + Signed-off-by: Kefu Chai + (cherry picked from commit e7b196a4a091c0ea258866559ba06e7ed0cc4247) + +commit 70347209260688f1a067354c744569499adb6920 +Author: Samuel Just +Date: Wed Apr 1 16:37:51 2015 -0700 + + ReplicatedPG::finish_promote: handle results->snaps is empty case + + If results->snaps winds up empty after filtering removed snaps, + we need to treat the object as if we had gotten an ENOENT. + + PartialFix: #11296 + Backport: firefly, hammer + Signed-off-by: Samuel Just + (cherry picked from commit 6150757dbe0fa11cceb14460865b859a7c8164c7) + +commit 3e44dc16ed3fbda053996e44826aa3d90042a234 +Author: Samuel Just +Date: Wed Apr 1 16:25:22 2015 -0700 + + ReplicatedPG::finish_promote: fix snap promote head snaps + + If the snaps vector is: 10=[9,5,2]:[4]+head, the backing pool's snaps + vector is 3=[2]:[]+head, and we request clone 4 from the backing pool, + the backing pool will send us head with an empty results->snaps vector. + Actually, clone 4 should be trimmed, but the cache pool does not know + that. Thus, we should construct an empty snaps vector for that clone. 
+ + PartialFix: #11296 + Backport: firefly, hammer + Signed-off-by: Samuel Just + (cherry picked from commit a45a698372def1623323470c6a1c4eb70e0bb79f) + +commit 5ef08466abf1b3934fcad0a0ca46f3a4380d6dbd +Author: John Spray +Date: Wed Jun 3 10:04:26 2015 +0100 + + tools: chunk reads in Dumper + + Previously tried to read entire journal + into memory in one go, which was problematic + for large journals. + + Fixes: #11746 + Signed-off-by: John Spray + (cherry picked from commit e3ddcb894ad09326698999d42de0ce3feb69f28e) + +commit 408880bed296e5cbf05864fa6744a5b00a245272 +Author: Greg Farnum +Date: Tue Jun 16 08:13:41 2015 -0700 + + qa: update to newer Linux tarball + + This should make newer gcc releases happier in their default configuration. + kernel.org is now distributing tarballs as .xz files so we change to that + as well when decompressing (it is supported by Ubuntu Precise so we should + be all good). + + Fixes: #11758 + + Signed-off-by: Greg Farnum + (cherry picked from commit 1ea3f47ab806d48ca7b045c2731d344eae3900e1) + +commit 56c2688b87d7d78831f8e147fc67cc0651ab644c +Author: Yehuda Sadeh +Date: Fri Apr 24 14:45:40 2015 -0700 + + rgw: simplify content length handling + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit e97fd5052cab83c5f699531a8c960b93437a8f9f) + +commit d9bbef3e470c6406bb65dc40e7e9c08c5d599f73 +Author: Robin H. Johnson +Date: Fri Apr 24 10:49:16 2015 -0700 + + rgw: make compatability deconfliction optional. + + Per request from Yehuda, the deconfliction for having both + HTTP_CONTENT_LENGTH and CONTENT_LENGTH set is now optional, and + controlled by new configuration boolean, which defaults to false. + rgw content length compat + + X-URL: https://github.com/ceph/ceph/pull/4436#issuecomment-95994887 + Signed-off-by: Robin H. Johnson + (cherry picked from commit 79d17af1a1ec0659884f768945a7bac6282b5e0b) + +commit 0260abd5d265bd63ea9c89f4082c31ba1c5ae8fa +Author: Robin H. 
Johnson +Date: Wed Apr 22 12:53:06 2015 -0700 + + rgw: improve content-length env var handling + + The FastCGI specification, section 6.3 on Authorizers, describes a case + where HTTP_CONTENT_LENGTH will be set in the environment and + CONTENT_LENGTH will NOT be set. + + Further documention in the code. + + Fixes: #11419 + Signed-off-by: Robin H. Johnson + (cherry picked from commit 3e38eab44bfb082fdd2b6f29b8b0357f8f5c11bb) + +commit 8abc46a157e4c1431a92a1e52ab694dccff5d514 +Author: wuxingyi +Date: Wed Jun 10 06:57:57 2015 +0000 + + rgw: fix data corruption when race condition + + We should delete the object in the multipart namespace lastly to prevent a previous upload + wrongly deleting objects belong to the following upload. + + Fixes: #11749 + Signed-off-by: wuxingyi + (cherry picked from commit ac1e729a75b5d995028bbc223bcf5ecce0d112cc) + +commit 72ecd522941156c8a7e5303531944b0735dcbeb8 +Merge: 59f37a9 d723e11 +Author: Abhishek L +Date: Wed Jul 1 18:09:46 2015 +0530 + + Merge pull request #4886 from theanalyst/wip-11737-hammer + + Reviewed-by: Greg Farnum + +commit 59f37a9bafc095181b3f41ec5d93ac58e2cda604 +Merge: 53a2143 89d0266 +Author: Kefu Chai +Date: Sat Jun 27 17:11:11 2015 +0800 + + Merge pull request #5095 from ceph/wip-fix-doc-hammer + + doc: fix doc build + + Reviewed-by: Loic Dachary + +commit 89d0266a60729d5d9747867e8c30abf71a891231 +Author: Kefu Chai +Date: Sat Jun 27 14:44:55 2015 +0800 + + doc: add the corresponding @endcond command for @cond + + * they are used to applease asphyxiate, as it + is not able to handle "enum" sections + + Signed-off-by: Kefu Chai + +commit 2aa77b33a97e5a3ca134c9c555aa6e7a69ef50f7 +Author: Kefu Chai +Date: Sat Jun 27 14:43:01 2015 +0800 + + doc: remove orphan: directive in ceph-create-keys.rst + + * it is used to silence the sphinx warning, but conf.py + does not like it. 
+ + Signed-off-by: Kefu Chai + +commit ad66e40e8bd598da7c9738cb44abb543008c90a3 +Author: Kefu Chai +Date: Sat Jun 27 14:41:59 2015 +0800 + + doc: let doxygen ignore src/tracing + + Signed-off-by: Kefu Chai + +commit 53a2143eb7e549de1185b01ed0bde841ffa5235a +Merge: 6f7cd04 bfb1442 +Author: Samuel Just +Date: Fri Jun 26 14:19:40 2015 -0700 + + Merge pull request #4902 from theanalyst/wip-11908-hammer + + Fixes for rados ops with snaps + + Reviewed-by: Samuel Just + +commit 6f7cd0460d5729c15966119e0177ddc56a361d8e +Merge: 78d894a 356bd2c +Author: Loic Dachary +Date: Thu Jun 25 20:03:42 2015 -0400 + + Merge pull request #5069 from dachary/wip-11806-hammer + + ceph / ceph-dbg steal ceph-objecstore-tool from ceph-test / ceph-test-dbg + + Reviewed-by: Ken Dreyer + +commit 356bd2c68ca730e766d06c46a0364784f5d72275 +Author: Loic Dachary +Date: Wed Jun 24 14:58:47 2015 -0400 + + debian: ceph-dbg steals ceph-objectstore-tool from ceph-test-dbg (take 2) + + 968573b8930a7c8485bf53e3a989ce2f7d0a2fff incorrectly backported + 6f11fbf41fab10924b1e0e41fcf27864779d4073. It should instead reflect that + ceph-dbg in 0.94.2 and above will break ceph-test-dbg in all versions + prior to 0.94.2. + + In other words, 0.94-XXXX being lower than 0.94.1, upgrading from + 0.94.1 to 0.94.2 will not notice that ceph-dbg breaks ceph-test-dbg. + + $ dpkg --compare-versions 0.94-XXXX lt 0.94.1 && echo yes || echo no + yes + $ dpkg --compare-versions 0.94.2 lt 0.94.1-xxx && echo yes || echo no + no + + http://tracker.ceph.com/issues/11806 Fixes: #11806 + + Signed-off-by: Loic Dachary + +commit c5c627fdd3de669ee94ab7ecd6939c29a4a68011 +Author: wuxingyi +Date: Wed Mar 11 17:34:40 2015 +0800 + + rgw/logrotate.conf: Rename service name + + The service name for ceph rados gateway was changed to "ceph-radosgw", + the previous version of service name "radosgw" would cause a failed reload, + and finally make it impossible to write any log data to the log file. 
+ + Signed-off-by: wuxingyi + (cherry picked from commit 9df3f798179481fe8ae6ae873dcb793de7d8f367) + +commit 582cf731e05baabd2cd79567af89d7f005c24925 (refs/remotes/gh/wip-12109-hammer) +Author: Jason Dillaman +Date: Tue May 5 13:08:21 2015 -0400 + + tests: add librbd watch/notify version compatibility test + + Fixes: #11405 + Backport: hammer + Signed-off-by: Jason Dillaman + (cherry picked from commit 41e4cbe032e32762e3a9e8bc3eff8ece19f91a54) + +commit 43b9aef11c1281d8b83b659a523dba365d79add4 +Author: Jason Dillaman +Date: Tue May 5 11:22:55 2015 -0400 + + qa/workunits/rbd: add notify_master/slave bootstrap scripts + + Backport: hammer + Signed-off-by: Jason Dillaman + (cherry picked from commit 124b1d3d2d0dbd685bbd717856d29b316e62e660) + +commit f995fb50d3d7734161fa498db9e204eaded651b6 +Author: Jason Dillaman +Date: Tue May 5 10:34:48 2015 -0400 + + qa/workunits/rbd: add new test_librbd_api workunit + + This only tests the public librbd API for use during upgrade tests. + + Backport: hammer + Signed-off-by: Jason Dillaman + (cherry picked from commit 9039955f42d01044cfcf20c56ca2181e51c317ee) + +commit a09da2a2c6c12c6bf2f8e17f2096b3ab24587007 +Author: Jason Dillaman +Date: Tue May 5 10:27:38 2015 -0400 + + tests: create librbd API-only integration test suite + + The QA client-upgrade test suite requires a librbd test + that is dynamically linked to librbd. Since the current + ceph_test_librbd includes tests against the non-public API, + it is statically linked against librbd and thus cannot be + used to test a client upgrade scenario. + + Backport: hammer + Signed-off-by: Jason Dillaman + (cherry picked from commit 6fe94c8cbb924c31c1007e2d0c76f28ce9dbca57) + +commit e149916cd40a58ce5db5f63a2b4efd82a8c4e1ba +Author: Owen Synge +Date: Tue Apr 21 11:31:24 2015 +0200 + + Increase max files open limit for OSD daemon. + + Under heavy load the number of file descriptors opened + by the OSD can go beyond the 64K file limit. This patch + increases the default to 128K. 
+ + Signed-off-by: Owen Synge + (cherry picked from commit ebda4ba1c67172852587e47a8e6fb538809a1b1c) + +commit 22f58ce2665f1ea6b38f6016dc585202a4595322 +Author: Sage Weil +Date: Mon Jun 1 15:10:14 2015 -0700 + + Makefile: install ceph-post-file keys with mode 600 + + Otherwise ssh (may) prompt for a password. + + Signed-off-by: Sage Weil + (cherry picked from commit 106a1c3081d02446aa3d8e13865da0c3393bae90) + +commit 3e65a10bd2351744da199a4b076659191b4378a2 +Author: Joseph McDonald +Date: Mon Jun 1 15:05:32 2015 -0700 + + ceph-post-file: improve check for a source install + + Signed-off-by: Joseph McDonald + (cherry picked from commit ee170eadcdcb4b54d36a7d474558484de9d917eb) + +commit c1f6743940250b04ae6dbea30d8805571add39b6 +Author: Joseph McDonald +Date: Mon Jun 1 15:00:39 2015 -0700 + + ceph-post-file: behave when sftp doesn't take -i + + Fixes: #11836 + Signed-off-by: Joseph McDonald + (cherry picked from commit b84031ed5eaace1222e14d3c4076a3ab1155da96) + +commit 38d36b1174160ad104704aaa7ff5290d1ec3e782 +Author: Tim Serong +Date: Thu Apr 30 10:55:38 2015 +1000 + + packaging: move SuSEfirewall2 templates out of src + + Better to have static config like this that needs to be installed in + /etc in a separate subdirectory of the project. + + Signed-off-by: Tim Serong + (cherry picked from commit 70292658d5febb6c5f94af4df9c3e93551772d12) + +commit 24bc9f2c58ca78c8023935b8b69f8b01adbc11af +Author: Tim Serong +Date: Wed Apr 29 13:12:38 2015 +1000 + + packaging: add SuSEfirewall2 service files + + This adds SuSEfirewall2 service files for Ceph MON, OSD and MDS, for use + on SLES and openSUSE. 
The MON template opens port 6789 and the OSD/MDS + template opens the range 6800-7300 as per + http://ceph.com/docs/master/rados/configuration/network-config-ref/ + + Signed-off-by: Tim Serong + (cherry picked from commit 77685f5b787c56bcb1c4d9f1e058e25312fa62fe) + +commit 8acfb994f22efa07beeecccda300cbd50d683566 +Author: Owen Synge +Date: Thu May 7 12:02:41 2015 +0200 + + Bug fix to ceph systemV compatability script. + + Was failing with more than one OSD / MON deamon on a single node. + Fixes suse bugzilla #927862 + + Signed-off-by: Owen Synge + (cherry picked from commit dfda3ff8741fcdbac3150456ca7614cf75ef1776) + +commit bd3fd929e6f95e6d9840317aa3ac02f52711c94b +Author: Owen Synge +Date: Thu Jun 18 14:16:03 2015 +0200 + + Fixes to rcceph script + + - only start OSDs if mon daemons are also present + - adds support for mask and unmask + - removes support for cluster with non default cluster name, + as this was very limited and inconsistent + - Reapplied from a patch as could not cherry-pick + 66cb46c411d874be009c225450eea5021cf1219b from Mon Jan 12 + as this produced issues with src/gmock + + Signed-off-by: Owen Synge + (cherry picked from commit bfa0c4a626fdbb2bf978ccfab783ac428156144b) + +commit 37d77d3680d2c8adda35eddf9625a331a45ece11 +Author: Owen Synge +Date: Tue Apr 21 11:31:24 2015 +0200 + + Increase max files open limit for OSD daemon. + + Under heavy load the number of file descriptors opened + by the OSD can go beyond the 64K file limit. This patch + increases the default to 128K. + + Signed-off-by: Owen Synge + (cherry picked from commit ebda4ba1c67172852587e47a8e6fb538809a1b1c) + +commit 78d894a634d727a9367f809a1f57234e5e6935be +Author: Sage Weil +Date: Wed Jun 17 09:35:28 2015 -0700 + + qa/workunits/rados/test-upgarde-v9.0.1: fix exclude syntax + + It's -, then a list of all exclusions separated by :. There are just 2. 
+ + Signed-off-by: Sage Weil + +commit 3e8d60a80ce31860eac76a1f6489a35e1795a0c0 +Author: Sage Weil +Date: Tue Jun 16 21:05:29 2015 -0700 + + qa/workunits/rados/test-upgrade-v9.0.1: skip one more evict test + + Signed-off-by: Sage Weil + +commit 348a3d3c9880e7d022e71a2faafe51c8f771406e +Author: Josh Durgin +Date: Mon Jun 15 15:12:43 2015 -0700 + + qa: add compatibility filtered rados api tests for upgrades + + Post-9.0.1, the evict op returns success when an object doesn't exist + in the cache tier. Skip the tests that are incompatible across + versions. + + Fixes: #11548 + Signed-off-by: Josh Durgin + +commit d2b80966b8f74de818a671c90b4c821a4b0782db +Merge: 70bba62 f68bf94 +Author: Sage Weil +Date: Mon Jun 15 12:38:57 2015 -0700 + + Merge pull request #4961 from ceph/wip-11493-hammer + + backport 11493 fixes, and test, prevetning ec cache pools + + Reviewed-by: Samuel Just + +commit f68bf94e370fb11a3047ec2762a972a0b7a7c0bb +Author: Samuel Just +Date: Fri May 15 13:05:40 2015 -0700 + + OSDMonitor: disallow ec pools as tiers + + Fixes: 11650 + Signed-off-by: Samuel Just + (cherry picked from commit 11b7801bb57cb25cd2d26d58722d49691747725b) + +commit 13c8d58da1303cc68d99da19f79d625f91f99d43 +Author: Sage Weil +Date: Wed Apr 29 12:34:25 2015 -0700 + + mon: prevent pool with snapshot state from being used as a tier + + If we add a pool with snap state as a tier the snap state gets clobbered + by OSDMap::Incremental::propogate_snaps_to_tiers(), and may prevent OSDs + from starting. Disallow this. + + Include a test. + + Fixes: #11493 + Backport: hammer, giant, firefly + Signed-off-by: Sage Weil + (cherry picked from commit bbec53edf9e585af4e20bbc9ba9057d6fdfda342) + +commit 58e62662f6ef04ac76470090d1d958467e34194a +Author: Samuel Just +Date: Fri May 8 10:26:48 2015 -0700 + + test/librados/tier.cc: destroy and recreate cache pool on every test + + Namespaces are not sufficient with the checks for 11493 in the mon. 
+ + Signed-off-by: Samuel Just + (cherry picked from commit bef09e0cdb274cb1c87335a2af9ee532d14a4596) + +commit 70bba6226a64090dcf41cd90b23fdf5aed8cd0ca (refs/remotes/jashan/hammer) +Merge: 3b6977b 5a60a03 +Author: Loic Dachary +Date: Mon Jun 15 17:26:07 2015 +0200 + + Merge pull request #4846 from SUSE/wip-11862-hammer + + missing man pages for ceph-create-keys, ceph-disk-* + + Reviewed-by: Kefu Chai + Reviewed-by: Ken Dreyer + +commit 3b6977b706dbc99cac25bec1b71a628c398c6ff1 +Merge: 5fb8561 3db1026 +Author: Sage Weil +Date: Thu Jun 11 13:54:35 2015 -0700 + + Merge pull request #4934 from dachary/wip-releases-hammer + + doc/release-notes: v0.94.2 + +commit 3db1026f3706e6f5a5c25013cb6646a0298057d8 +Author: Sage Weil +Date: Wed Jun 10 12:48:41 2015 -0700 + + doc/release-notes: v0.94.2 + + Signed-off-by: Sage Weil + (cherry picked from commit 306345b29c259ab04a58ed5d40f801645485b29d) + + Conflicts: + doc/release-notes.rst + +commit ed5442b1057dcc4fb1f9404c805dabe2bbde2252 (refs/remotes/jashan/wip-11770-hammer, refs/remotes/gh/wip-11770-hammer) +Author: Jason Dillaman +Date: Tue Jun 2 10:33:35 2015 -0400 + + tests: verify librbd blocking aio code path + + Signed-off-by: Jason Dillaman + (cherry picked from commit 4cf41486e9c9e1efcb863960a8f3e0326ffca7e5) + + Conflicts: + src/test/librbd/test_librbd.cc: trival resolution + +commit 20e104869f3d17ce672438144700a4d984d487b4 +Author: Jason Dillaman +Date: Mon Jun 1 22:56:11 2015 -0400 + + librbd: new rbd_non_blocking_aio config option + + Setting this option to false reverts librbd to legacy behavior + where AIO operations could potentially block. 
+ + Signed-off-by: Jason Dillaman + (cherry picked from commit 769cad12716b85d87eacc1069dd9f5c21cad3915) + +commit b4571b3e238efc39767f753e0ec1622c8bd6d6e6 +Author: Jason Dillaman +Date: Thu Apr 9 20:34:28 2015 -0400 + + PendingReleaseNotes: document changes to librbd's aio_read methods + + Signed-off-by: Jason Dillaman + +commit 9ea1edd0ca9e385f823ad04b05bc887d77aa5136 +Author: Jason Dillaman +Date: Thu Apr 9 13:33:09 2015 -0400 + + librbd: AioRequest::send no longer returns a result + + The librados calls used by AioRequest::send should always return + zero unless there is a bug. + + Signed-off-by: Jason Dillaman + (cherry picked from commit c77bce3311ab62892eb8c1d883263ba7ed663b20) + + Conflicts: + src/librbd/AioRequest.cc: trivial resolution + src/librbd/AsyncFlattenRequest.cc: trivial resolution + +commit 272df2aed79a95dd9c45db4e0953e9b321f7b0f5 +Author: Jason Dillaman +Date: Wed Apr 8 21:55:36 2015 -0400 + + tests: update librbd AIO tests to remove result code + + Signed-off-by: Jason Dillaman + (cherry picked from commit 948b15eb52fd5d9ce842fa12ee0cecda17353b01) + + Conflicts: + src/test/librbd/test_internal.cc: trivial resolution + src/test/librbd/test_librbd.cc: trivial resolution + +commit dd2e4c13ff6d88edb25f90af62af16ba825c15c9 +Author: Jason Dillaman +Date: Wed Apr 8 21:37:50 2015 -0400 + + librbd: internal AIO methods no longer return result + + All failures should be returned via the AioCompletion. + + Signed-off-by: Jason Dillaman + (cherry picked from commit 9ab42d613128ab08c688ddbea93df4c95068b9cd) + + Conflicts: + src/librbd/AioRequest.cc: trivial resolution + src/librbd/internal.cc: trivial resolution + +commit dbd4e293d7124c89a22148e8fa5f425a995c900c +Author: Jason Dillaman +Date: Wed Apr 8 21:48:21 2015 -0400 + + Throttle: added pending_error method to SimpleThrottle + + Allow the client of SimpleThrottle to detect an async error + so that it can exit early. 
+ + Signed-off-by: Jason Dillaman + (cherry picked from commit b88b88c5df91325fb713c2031a56bffe421268e0) + +commit 7df6091a30b1b94d764240262195e971175554b3 +Author: Jason Dillaman +Date: Wed Apr 8 20:18:50 2015 -0400 + + librbd: add new fail method to AioCompletion + + Helper method to handle passing fatal errors generated within + librbd (not from the OSDs) back to the client. + + Signed-off-by: Jason Dillaman + (cherry picked from commit 6d1d0c867855a96bee4c13a0c0a39a0e002ccd12) + +commit cf6e1f50ea7b5c2fd6298be77c06ed4765d66611 +Author: Jason Dillaman +Date: Wed Apr 8 19:06:52 2015 -0400 + + librbd: avoid blocking AIO API methods + + Enqueue all AIO API methods within the new librbd thread pool to + reduce the possibility of any blocking operations. To maintain + backwards compatibility with the legacy return codes of the API's + AIO methods, it's still possible to block attempting to acquire + the snap_lock. + + Fixes: #11056 + Signed-off-by: Jason Dillaman + (cherry picked from commit 3a7b5e30efdb21aa1a0aeb68a5d02a1ac2a5faf3) + +commit e61974aed09a3f81e1f65a4bbaed43e3f22b27b4 +Author: Jason Dillaman +Date: Wed Apr 8 17:24:08 2015 -0400 + + librbd: add task pool / work queue for requests + + Signed-off-by: Jason Dillaman + (cherry picked from commit afb896d91f886b647baf38f7ec94cc3739f6d2a9) + + Conflicts: + src/librbd/ImageCtx.cc: trivial resolution + src/librbd/ImageCtx.h: trivial resolution + +commit bfb144268b803340efad29cd6c627b170ea32402 +Author: Samuel Just +Date: Wed May 20 16:10:02 2015 -0700 + + ReplicatedPG::release_op_ctx_locks: requeue in scrub queue if blocked + + Otherwise we can reorder an op around another op which got blocked by a + scrub which started after the first blocked on an obc. 
+ + Fixes: #11691 + Signed-off-by: Samuel Just + (cherry picked from commit be873eb8da7b29ecefaa5a99b88de7ddcca711ee) + +commit c7b6a6370a69149ea94f9e35d536aa90f06e7659 +Author: Samuel Just +Date: Tue May 19 10:56:11 2015 -0700 + + ReplicatedPG::finish_ctx: take excl lock if operation is rw + + Fixes: #11677 + Signed-off-by: Samuel Just + (cherry picked from commit 5c2b795724423ed484ab451de855ddcfc085342b) + +commit 1550a569dab120ce28396fe365565e8e4acd9801 +Author: Samuel Just +Date: Thu May 21 12:13:43 2015 -0700 + + RadosModel: randomly prefix delete with assert_exists + + Signed-off-by: Samuel Just + (cherry picked from commit 4fe7d2abdff2fce359e5e992206644cc03825ee0) + +commit 4cdc5f7d6b3ec488c79c09cb44a43d4d9398b74c +Author: Samuel Just +Date: Thu May 21 11:36:42 2015 -0700 + + RadosModel: assert exists on subsequent writes + + Signed-off-by: Samuel Just + (cherry picked from commit 121aa3bc612b86281535ac3bcfe98bc99bc99ace) + +commit 25c730bda74b94f2c894c508ab09988dbd528c4e +Author: Samuel Just +Date: Tue May 19 10:23:01 2015 -0700 + + test/librados/snapshots.cc: add test for 11677 + + Signed-off-by: Samuel Just + (cherry picked from commit c2d17b927f8a222164b3bf2922a4ff337696f566) + +commit 3d74164d3d6caaa5099abd9a1d1920482d3e05c2 +Author: Samuel Just +Date: Wed May 27 11:14:15 2015 -0700 + + ReplicatedPG::trim_object: write filtered snapset while we're at it + + If we trimmed an object, we might as well remove the obsolete snaps + as well. + + Signed-off-by: Samuel Just + (cherry picked from commit 90eb7768f99ea249952df195a844a3a7c9a59b78) + +commit a1161540bc0094a951021d4ca651b95ec045213e +Author: Samuel Just +Date: Wed May 27 11:00:54 2015 -0700 + + ReplicatedPG: start_flush: use filtered snapset + + Otherwise, we might send our deletes based on deleted snaps. This is + problematic since we may have trimmed the clones to which those snaps + belong, causing us to send them at an earlier snap than we used before. 
+ + The specific situation was + + 78:[78, 70, 63, 5a, 58, 57]:[64(63), 58(58, 57)] + + with 58 already clean. To flush 64, we send: + + delete@58 + delete@59 + copyfrom@62 + + Then, snap 63 is trimmed leaving us with a snapset of: + + 78:[78, 70, 63, 5a, 58, 57]:[58(58, 57)] + + since trim_object doesn't filter the head object snapset snaps. This + isn't really a bug since in general all snapset users must be aware + that there may be trimmed snaps in snapset::snaps. However, here + it becomes a problem when we go to flush head: + + delete@58 -- ignored due to snapc + delete@59 -- ignored due to snapc + copyfrom@78 -- not ignored + + The base pool head is at snap seq 62, so it clones that value into + clone 78(78, 70) instead of forgetting it. What should have happened + is that we should have based our flushes on filtered snapset: + + 78:[78, 70, 58, 57]:[58(58, 57)] + + Causing us to instead send: + + delete@58 -- ignored due to snapc + delete@69 -- not ignored, causes no clone to be made + copyfrom@78 -- not ignored, updates head such that a subsequent clone + will leave 70 out of the clone snaps vector. + + Fixes: 11787 + Signed-off-by: Samuel Just + (cherry picked from commit 6051e255ac062985ada1989edb7f23cd750915e2) + +commit 82988d611bad6226138b94590275faadbca3554a +Author: islepnev +Date: Fri Apr 17 22:33:01 2015 +0300 + + ceph-disk: support NVMe device partitions + + Linux nvme kernel module v0.9 enumerate devices as following: + + /dev/nvme0 - characted revice + /dev/nvme0n1 - whole block device + /dev/nvme0n1p1 - first partition + /dev/nvme0n1p2 - second partition + + http://tracker.ceph.com/issues/11612 Fixes: #11612 + + Signed-off-by: Ilja Slepnev + (cherry picked from commit 9b62cf254d02d30609793be8b1cb8a94f38891f1) + +commit bd91fb027ab91d487b1d61d25516c13590735d89 +Author: Sage Weil +Date: Tue May 12 16:37:56 2015 -0700 + + mon: prevent bucket deletion when referenced by a rule + + If a rule references a bucket with 'take', prevent deletion. 
+ + Fixes: #11602 + Signed-off-by: Sage Weil + (cherry picked from commit 3d591afef90b0601572c748f13faac029d05f5a0) + +commit 56565ee1cdb06a7705d1c3f26f5592b10399324a +Author: Sage Weil +Date: Tue May 12 14:03:49 2015 -0700 + + crush: fix crash from invalid 'take' argument + + Verify that the 'take' argument is a valid device or bucket. Otherwise, + ignore it (do not add the value to the working vector). + + Backport: hammer, firefly + Fixes: #11602 + Reported-by: shiva rkreddy + Signed-off-by: Sage Weil + (cherry picked from commit 9324d0a1af61e1c234cc48e2175b4e6320fff8f4) + +commit 1440122d61d5b0a3f8360f4e2101db1018109799 +Author: Kefu Chai +Date: Wed Apr 29 03:28:18 2015 -0700 + + common/config: detect overflow of float values + + Signed-off-by: Kefu Chai + (cherry picked from commit 1ff409ef8d022a1a84d034bd3db976c4d769e993) + +commit 9b947fa320b77e0055a581005353c2561a12a198 +Author: Kefu Chai +Date: Wed Apr 29 15:41:08 2015 +0800 + + common/config: detect overflow of int values + + * #include "strtol.h" in strtol.cc, to ensure the function defintions + are consistent. + * add a test accordingly + * fix the testcase of converting 1024E. + * do not accept integers overflow after adding SI suffix + * do not accept integers underflow (i.e. negative values) + + Fixes: #11484 + Signed-off-by: Kefu Chai + (cherry picked from commit d62f80dc7b25d312ff05b65b7be854aae15b66a8) + +commit d723e1156e70a492d633e43b86e7c373e5750065 +Author: Yan, Zheng +Date: Tue May 12 14:52:30 2015 +0800 + + mds: clear CDir::STATE_REJOINUNDEF after fetching dirfrag + + Fixes: #11541 + Signed-off-by: Yan, Zheng + (cherry picked from commit ab1e5394dc778f6799472bd79a4d9ba7197107c2) + +commit 54f4e7d4a534448293c74612f8140cf34b9cf9f8 +Author: Javier M. Mellid +Date: Fri May 15 14:22:29 2015 +0200 + + rgw: Use attrs from source bucket on copy + + On copy objects, when bucket source is the same as the destination, use attrs + from source bucket. + + Fixes: #11639 + + Signed-off-by: Javier M. 
Mellid + (cherry picked from commit 1dac80df1d4a2364154ed8b404d13609936c257b) + +commit 9dfef6004d208af3730634796aad199391707826 +Author: Anton Aksola +Date: Fri Apr 10 13:25:21 2015 +0300 + + rgw: always check if token is expired + + Fixes: #11367 + + Currently token expiration is only checked by the token cache. With PKI + tokens no expiration check is done after decoding the token. This causes + PKI tokens to be valid indefinitely. UUID tokens are validated by + keystone after cache miss so they are not affected by this bug. + + This commit adds explicit token expiration check to + RGWSwift::validate_keystone_token() + + Signed-off-by: Anton Aksola + Reported-by: Riku Lehto + (cherry picked from commit 2df069390ea3bbcfbab5022750e89f51d197cc11) + +commit fdb43ebe2b1211acdb454836a64dbd589feeef45 +Author: Nathan Cutler +Date: Fri May 15 21:43:34 2015 +0200 + + ceph.spec.in: tweak ceph-common for SUSE/openSUSE + + ceph-common needs python-argparse in SUSE/openSUSE and + needs redhat-lsb-core only in RHEL/CentOS/Fedora. 
+ + http://tracker.ceph.com/issues/11638 Fixes: #11638 + + Signed-off-by: Nathan Cutler + (cherry picked from commit 363d957d8fdd15a1674befbd8e485fd89b76d716) + +commit ba1a0167cc809081eda7e6cc9ecfb971e439d696 +Author: John Spray +Date: Wed Jun 3 10:09:09 2015 +0100 + + admin/build-doc: fix dependency checks + + http://tracker.ceph.com/issues/11857 Fixes: #11857 + + Signed-off-by: John Spray + (cherry picked from commit 539c1ba7211f579bad4f59ae824f1e68e620ecbd) + + Conflicts: + admin/build-doc + Insert lines at appropriate spot + +commit 5a60a034bf3015eaf468e5e3f9d8feb08b8fdd95 +Author: Nathan Cutler +Date: Wed Jun 3 10:41:27 2015 +0200 + + man/ceph-create-keys.8: add missing file + + This is the generated manpage, taken from a local build of master + + Signed-off-by: Nathan Cutler + (cherry picked from commit bcda61fcbe07ee36cd5172d80018f287591660ec) + +commit 19305b897ac147602049752a2dfbe1fd39ba562c +Author: Kefu Chai +Date: Tue Apr 21 14:59:32 2015 +0800 + + doc: add ceph-create-keys.8 + + Fixes: #10725 + Signed-off-by: Kefu Chai + (cherry picked from commit 27cee2f1f46a9f47cda9dfeb56ff1259e982960c) + + Conflicts: + doc/man/8/ceph-create-keys.rst + Includes fixes from https://github.com/ceph/ceph/pull/4855 + +commit ffd0933dcc790d7cedc1048b664bf4e8c40464a3 +Author: Jason Dillaman +Date: Mon May 11 17:05:49 2015 -0400 + + WorkQueue: added virtual destructor + + Signed-off-by: Jason Dillaman + (cherry picked from commit b3f5a75332c058816dc39b71e9d2b36e752159f4) + +commit a28adfbdd8abc86e7766c303bc610c0c252910f7 +Author: Jason Dillaman +Date: Wed Apr 8 16:46:34 2015 -0400 + + WorkQueue: add new ContextWQ work queue + + The queue holds a collection of Context pointers that will + be completed by the thread pool. 
+ + Signed-off-by: Jason Dillaman + (cherry picked from commit 24a33e977f7b71962adeeb48f75d488a76e70fa9) + +commit 1a321e477effa23bffbb6cb057aa955586197345 +Author: Wido den Hollander +Date: Wed Jun 3 13:13:33 2015 +0200 + + rgw: Do not enclose the Bucket header in quotes + + Fixes: #11860 + Signed-off-by: Wido den Hollander + (cherry picked from commit 8af25faed93fe02d3dad585b8579ce8b041cc4e6) + +commit a62c3aa1df9e0f79ac75d94083d10b902dbba382 +Author: Kefu Chai +Date: Fri Apr 24 14:04:30 2015 +0800 + + ceph.in: handle unknown Exception correctly + + * in case parse_cmdargs() throws + + Signed-off-by: Kefu Chai + (cherry picked from commit 7789eefc01deb9ca7fe90f5521aece3e36c3c350) + +commit cc7f7441c1d0457e840bfdc3413f9bc6d5f5134a +Author: Kefu Chai +Date: Fri Apr 24 01:27:44 2015 +0800 + + ceph.in: improve the interactive mode + + * if ceph is not reading from a tty, expect EOF instead of "quit" + as the end of input. + * do not panic at seeing the EOF + * update the test case test_mon_injectargs_SI(). since we disables + "ceph injectargs " in a458bd83, in which the arguments + of "injectargs" are supposed to be consumed by "tell" instead. + so "ceph injectargs ..." is taken as an incomplete command, and + this command will bring ceph cli into the interactive mode, + redirecting its stdin to /dev/null helps ceph cli quit the loop, + but in a way of throwing EOFError exception. this change handles + the EOF, so the "ceph injectargs ..." does not throws anymore. + but the side effect is that the test fails since it expects a + non-zero return code. so replace it with an equivalent "tell" + command which also fails but due to the non-SI postfix. 
+ + Signed-off-by: Kefu Chai + (cherry picked from commit da9d2b4077ab1dceeed979ab71f0d9ed59b14266) + +commit eb26388cb985456e8f6e07a52a20fa912cf4efaa +Author: Kefu Chai +Date: Fri Apr 24 00:50:37 2015 +0800 + + ceph.in: parse quote correctly in interactive mode + + Fixes: #11279 + Signed-off-by: Kefu Chai + (cherry picked from commit bc7d8c99d2e16a141a8b575281ba12c67628dac3) + +commit 558d6391ecdfd5d716558341ce5d84ce1f7ec9c3 +Author: Kefu Chai +Date: Fri May 8 12:01:16 2015 +0800 + + json_sprit: fix the FTBFS on old gcc + + Fixes: #11574 + Signed-off-by: Kefu Chai + (cherry picked from commit 6b68b27146852f057a3373cd04b08cd6917f3eea) + +commit 678b3e6082729698ce3575ba70313dd8399b1aed +Author: Tim Serong +Date: Sat May 2 01:59:53 2015 +1000 + + json_spirit: use utf8 intenally when parsing \uHHHH + + When the python CLI is given non-ASCII characters, it converts them to + \uHHHH escapes in JSON. json_spirit parses these internally into 16 bit + characters, which could only work if json_spirit were built to use + std::wstring, which it isn't; it's using std::string, so the high byte + ends up being zero'd, leaving the low byte which is effectively garbage. + + This hack^H^H^H^H change makes json_spirit convert to utf8 internally + instead, which can be stored just fine inside a std::string. + + Note that this implementation still assumes \uHHHH escapes are four hex + digits, so it'll only cope with characters in the Basic Multilingual + Plane. Still, that's rather a lot more characters than it could cope + with before ;) + + (For characters outside the BMP, Python seems to generate escapes in the + form \uHHHHHHHH, i.e. 
8 hex digits, which the current implementation + doesn't expect to see) + + Fixes: #7387 + + Signed-off-by: Tim Serong + (cherry picked from commit 8add15b86e7aaef41397ab8fa9e77ee7957eb607) + +commit ef6641c0aa47c7f559aa56d7c35a5815afc2ba49 +Author: Loic Dachary +Date: Mon Mar 23 21:26:23 2015 +0100 + + ceph.spec: update OpenSUSE BuildRequires + + OpenSUSE 13.2 needs libsnappy-devel but not bzip2-devel. + + Signed-off-by: Loic Dachary + (cherry picked from commit bdac3dc3fb189688af52f60b0b34339dd8fea251) + +commit 0b6d4427ac7234fd509a9541adf45e303e40cb16 (refs/remotes/jashan/wip-hammer-11535-admin-socket, refs/remotes/gh/wip-hammer-11535-admin-socket) +Author: Jon Bernard +Date: Fri May 8 11:54:06 2015 -0400 + + common/admin_socket: close socket descriptor in destructor + + Long-running processes that do not reuse a single client connection will + see accumulating file descriptors as a result of not closing the + listening socket. In this case, eventually the system will reach + file-max and subsequent connections will fail. + + Fixes: #11535 + + Signed-off-by: Jon Bernard + (cherry picked from commit 88fabb1ee6cd3c7d211ca46919055195e32245db) diff --git a/doc/release-notes.rst b/doc/release-notes.rst index 5891c0a65bf10..555f1a6e8dd1c 100644 --- a/doc/release-notes.rst +++ b/doc/release-notes.rst @@ -508,11 +508,22 @@ Notable Changes * rocksdb: update to latest (Xiaoxi Chen) * rpm: loosen ceph-test dependencies (Ken Dreyer) -v0.94.3 hammer (draft) -====================== +v0.94.3 Hammer +============== -* The commands of "pg ls-by-{pool,primary,osd}" and "pg ls" now take "recovering" -instead of "recovery", to include the recovering pgs in the listed pgs. +This Hammer point release fixes a critical (though rare) data +corruption bug that could be triggered when logs are rotated via +SIGHUP. It also fixes a range of other important bugs in the OSD, +monitor, RGW, and CephFS. + +All v0.94.x Hammer users are strongly encouraged to upgrade. 
+ +Upgrading +--------- + +* The ``pg ls-by-{pool,primary,osd}`` commands and ``pg ls`` now take + the argument ``recovering`` instead of ``recovery`` in order to + include the recovering pgs in the listed pgs. Notable Changes --------------- @@ -582,6 +593,10 @@ Notable Changes * fs: client: reference counting 'struct Fh' (`issue#12088 `_, `pr#5222 `_, Yan, Zheng) * build/ops: ceph.spec: update OpenSUSE BuildRequires (`issue#11611 `_, `pr#4667 `_, Loic Dachary) +For more detailed information, see :download:`the complete changelog `. + + + v0.94.2 Hammer ============== diff --git a/doc/releases.rst b/doc/releases.rst index 8857c733ce43e..8c6c51cadab3d 100644 --- a/doc/releases.rst +++ b/doc/releases.rst @@ -23,7 +23,7 @@ Timeline | |Development|`Dumpling`_|`Emperor`_ |`Firefly`_ |`Giant`_ |`Hammer`_ | | |Testing |LTS |Stable |LTS |Stable |LTS | +----------------+-----------+-----------+-----------+-----------+-----------+-----------+ -| August 2015 |`9.0.3`_ | | | | | | +| August 2015 |`9.0.3`_ | | | | |`0.94.3`_ | +----------------+-----------+-----------+-----------+-----------+-----------+-----------+ | July 2015 |`9.0.2`_ | | |`0.80.10`_ | | | +----------------+-----------+-----------+-----------+-----------+-----------+-----------+ @@ -108,10 +108,12 @@ Timeline | | |`0.67`_ | | | | | +----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +.. _9.0.3: ../release-notes#v9-0-3 .. _9.0.2: ../release-notes#v9-0-2 .. _9.0.1: ../release-notes#v9-0-1 .. _9.0.0: ../release-notes#v9-0-0 +.. _0.94.3: ../release-notes#v0-94-3-hammer .. _0.94.2: ../release-notes#v0-94-2-hammer .. _0.94.1: ../release-notes#v0-94-1-hammer .. 
_0.94: ../release-notes#v0-94-hammer From 4e6548ca750601b01c4c1d035ca81505dca94fbf Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Wed, 5 Aug 2015 16:18:01 -0400 Subject: [PATCH 136/654] memstore: PageSetObject for MemStore integration adds config variable memstore_page_set=true to switch between the PageSet and bufferlist implementations for object data benchmark results: test command: ./ceph_objectstore_bench --osd_objectstore=memstore --osd_data=/home/casey/osbench --size=1g --block-size=X --threads=Y --repeats=32 test hardware: 2x Intel(R) Xeon(R) CPU E5-2620 0 @ 2.00GHz (12 cores/24 threads), 64GB memory bufferlist (sequential write bandwidth in MB/s): threads bs=4k bs=64k bs=256k bs=1m bs=4m 1 x 63 340 6177 81861 8 x 17 304 5910 87708 16 x x 320 5959 86578 x: bandwidth approached 0 as size grew PageSet (sequential write bandwidth in MB/s): threads bs=4k bs=64k bs=256k bs=1m bs=4m 1 197 2122 3085 4471 4712 8 626 8808 15518 20389 18926 16 726 9581 18997 22330 21983 bufferlist performs extremely well in constrained workloads (large block size and total file size below ~10G), because its writes are zero-copy. 
but it breaks down at smaller block sizes due to the linear property of the bufferlist PageSet trades the zero-copy property for a logarithmic data structure in order to achieve more reliable performance across file and block sizes, in addition to better thread scaling note: because each PG operates on a different collection and uses the Sequencer to serialize the operations therein, we're not likely to see the benefits from multiple threads in the OSD Signed-off-by: Casey Bodley --- src/common/config_opts.h | 2 + src/os/MemStore.cc | 181 ++++++++++++++++++++++++++++++++------- src/os/MemStore.h | 63 ++++++++++++-- 3 files changed, 211 insertions(+), 35 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 15c8ed57592a8..e420f8c898fd5 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -781,6 +781,8 @@ OPTION(osd_bench_max_block_size, OPT_U64, 64 << 20) // cap the block size at 64M OPTION(osd_bench_duration, OPT_U32, 30) // duration of 'osd bench', capped at 30s to avoid triggering timeouts OPTION(memstore_device_bytes, OPT_U64, 1024*1024*1024) +OPTION(memstore_page_set, OPT_BOOL, true) +OPTION(memstore_page_size, OPT_U64, 64 << 10) OPTION(filestore_omap_backend, OPT_STR, "leveldb") diff --git a/src/os/MemStore.cc b/src/os/MemStore.cc index 0fff54506ce9b..efa2b8aefcdca 100644 --- a/src/os/MemStore.cc +++ b/src/os/MemStore.cc @@ -170,7 +170,7 @@ int MemStore::_load() int r = cbl.read_file(fn.c_str(), &err); if (r < 0) return r; - CollectionRef c(new Collection); + CollectionRef c(new Collection(cct)); bufferlist::iterator p = cbl.begin(); c->decode(p); coll_map[*q] = c; @@ -327,6 +327,7 @@ int MemStore::read( return o->read(offset, l, bl); } + int MemStore::fiemap(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl) { @@ -962,12 +963,7 @@ int MemStore::_touch(coll_t cid, const ghobject_t& oid) if (!c) return -ENOENT; - ObjectRef o = c->get_object(oid); - if (!o) { - o.reset(new 
BufferlistObject); - c->object_map[oid] = o; - c->object_hash[oid] = o; - } + c->get_or_create_object(oid); return 0; } @@ -983,14 +979,7 @@ int MemStore::_write(coll_t cid, const ghobject_t& oid, if (!c) return -ENOENT; - ObjectRef o = c->get_object(oid); - if (!o) { - // write implicitly creates a missing object - o.reset(new BufferlistObject); - c->object_map[oid] = o; - c->object_hash[oid] = o; - } - + ObjectRef o = c->get_or_create_object(oid); const ssize_t old_size = o->get_size(); o->write(offset, bl); used_bytes += (o->get_size() - old_size); @@ -1106,12 +1095,7 @@ int MemStore::_clone(coll_t cid, const ghobject_t& oldoid, ObjectRef oo = c->get_object(oldoid); if (!oo) return -ENOENT; - ObjectRef no = c->get_object(newoid); - if (!no) { - no.reset(new BufferlistObject); - c->object_map[newoid] = no; - c->object_hash[newoid] = no; - } + ObjectRef no = c->get_or_create_object(newoid); used_bytes += oo->get_size() - no->get_size(); no->clone(oo.get(), 0, oo->get_size(), 0); @@ -1144,12 +1128,7 @@ int MemStore::_clone_range(coll_t cid, const ghobject_t& oldoid, ObjectRef oo = c->get_object(oldoid); if (!oo) return -ENOENT; - ObjectRef no = c->get_object(newoid); - if (!no) { - no.reset(new BufferlistObject); - c->object_map[newoid] = no; - c->object_hash[newoid] = no; - } + ObjectRef no = c->get_or_create_object(newoid); if (srcoff >= oo->get_size()) return 0; if (srcoff + len >= oo->get_size()) @@ -1251,10 +1230,10 @@ int MemStore::_create_collection(coll_t cid) { dout(10) << __func__ << " " << cid << dendl; RWLock::WLocker l(coll_lock); - ceph::unordered_map::iterator cp = coll_map.find(cid); - if (cp != coll_map.end()) + auto result = coll_map.insert(std::make_pair(cid, CollectionRef())); + if (!result.second) return -EEXIST; - coll_map[cid].reset(new Collection); + result.first->second.reset(new Collection(cct)); return 0; } @@ -1445,3 +1424,145 @@ int MemStore::BufferlistObject::truncate(uint64_t size) } return 0; } + +// PageSetObject +int 
MemStore::PageSetObject::read(uint64_t offset, uint64_t len, bufferlist& bl) +{ + const auto start = offset; + const auto end = offset + len; + auto remaining = len; + + PageSet::page_vector pages; + data.get_range(offset, len, pages); + + // allocate a buffer for the data + buffer::ptr buf(len); + + auto p = pages.begin(); + while (remaining) { + // no more pages in range + if (p == pages.end() || (*p)->offset >= end) { + buf.zero(offset - start, remaining); + break; + } + auto page = *p; + + // fill any holes between pages with zeroes + if (page->offset > offset) { + const auto count = std::min(remaining, page->offset - offset); + buf.zero(offset - start, count); + remaining -= count; + offset = page->offset; + if (!remaining) + break; + } + + // read from page + const auto page_offset = offset - page->offset; + const auto count = min(remaining, data.get_page_size() - page_offset); + + buf.copy_in(offset - start, count, page->data + page_offset); + + remaining -= count; + offset += count; + + ++p; + } + + bl.append(buf); + return len; +} + +int MemStore::PageSetObject::write(uint64_t offset, const bufferlist &src) +{ + unsigned len = src.length(); + + // make sure the page range is allocated + PageSet::page_vector pages; + data.alloc_range(offset, src.length(), pages); + + auto page = pages.begin(); + + // XXX: cast away the const because bufferlist doesn't have a const_iterator + auto p = const_cast(src).begin(); + while (len > 0) { + unsigned page_offset = offset - (*page)->offset; + unsigned pageoff = data.get_page_size() - page_offset; + unsigned count = min(len, pageoff); + p.copy(count, (*page)->data + page_offset); + offset += count; + len -= count; + if (count == pageoff) + ++page; + } + if (data_len < offset) + data_len = offset; + return 0; +} + +int MemStore::PageSetObject::clone(Object *src, uint64_t srcoff, + uint64_t len, uint64_t dstoff) +{ + const int64_t delta = dstoff - srcoff; + + auto &src_data = static_cast(src)->data; + const auto 
src_page_size = src_data.get_page_size(); + + auto &dst_data = data; + const auto dst_page_size = dst_data.get_page_size(); + + PageSet::page_vector src_pages, dst_pages; + + while (len) { + const auto count = std::min(len, src_page_size * 16); + src_data.get_range(srcoff, count, src_pages); + + for (auto &src_page : src_pages) { + auto sbegin = std::max(srcoff, src_page->offset); + auto send = std::min(srcoff + count, src_page->offset + src_page_size); + dst_data.alloc_range(sbegin + delta, send - sbegin, dst_pages); + + // copy data from src page to dst pages + for (auto &dst_page : dst_pages) { + auto dbegin = std::max(sbegin + delta, dst_page->offset); + auto dend = std::min(send + delta, dst_page->offset + dst_page_size); + + std::copy(src_page->data + (dbegin - delta) - src_page->offset, + src_page->data + (dend - delta) - src_page->offset, + dst_page->data + dbegin - dst_page->offset); + } + dst_pages.clear(); // drop page refs + srcoff += count; + dstoff += count; + len -= count; + } + src_pages.clear(); // drop page refs + } + + // update object size + if (data_len < dstoff + len) + data_len = dstoff + len; + return 0; +} + +int MemStore::PageSetObject::truncate(uint64_t size) +{ + data.free_pages_after(size); + data_len = size; + + const auto page_size = data.get_page_size(); + const auto page_offset = size & ~(page_size-1); + if (page_offset == size) + return 0; + + // write zeroes to the rest of the last page + PageSet::page_vector pages; + data.get_range(page_offset, page_size, pages); + if (pages.empty()) + return 0; + + auto page = pages.begin(); + auto data = (*page)->data; + std::fill(data + (size - page_offset), data + page_size, 0); + return 0; +} diff --git a/src/os/MemStore.h b/src/os/MemStore.h index d1edc2acf330c..7ffd100e26a91 100644 --- a/src/os/MemStore.h +++ b/src/os/MemStore.h @@ -19,7 +19,6 @@ #include #include -#include "include/assert.h" #include "include/unordered_map.h" #include "include/memory.h" #include "include/Spinlock.h" @@ 
-27,8 +26,13 @@ #include "common/RefCountedObj.h" #include "common/RWLock.h" #include "ObjectStore.h" +#include "PageSet.h" +#include "include/assert.h" class MemStore : public ObjectStore { +private: + CephContext *const cct; + public: struct Object : public RefCountedObject { std::mutex xattr_mutex; @@ -117,7 +121,39 @@ class MemStore : public ObjectStore { } }; + struct PageSetObject : public Object { + PageSet data; + size_t data_len; + + PageSetObject(size_t page_size) : data(page_size), data_len(0) {} + + size_t get_size() const override { return data_len; } + + int read(uint64_t offset, uint64_t len, bufferlist &bl) override; + int write(uint64_t offset, const bufferlist &bl) override; + int clone(Object *src, uint64_t srcoff, uint64_t len, + uint64_t dstoff) override; + int truncate(uint64_t offset) override; + + void encode(bufferlist& bl) const override { + ENCODE_START(1, 1, bl); + ::encode(data_len, bl); + data.encode(bl); + encode_base(bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::iterator& p) override { + DECODE_START(1, p); + ::decode(data_len, p); + data.decode(p); + decode_base(p); + DECODE_FINISH(p); + } + }; + struct Collection : public RefCountedObject { + CephContext *cct; + bool use_page_set; ceph::unordered_map object_hash; ///< for lookup map object_map; ///< for iteration map xattr; @@ -127,6 +163,12 @@ class MemStore : public ObjectStore { friend void intrusive_ptr_add_ref(Collection *c) { c->get(); } friend void intrusive_ptr_release(Collection *c) { c->put(); } + ObjectRef create_object() const { + if (use_page_set) + return new PageSetObject(cct->_conf->memstore_page_size); + return new BufferlistObject(); + } + // NOTE: The lock only needs to protect the object_map/hash, not the // contents of individual objects. 
The osd is already sequencing // reads and writes, so we will never see them concurrently at this @@ -140,9 +182,18 @@ class MemStore : public ObjectStore { return o->second; } + ObjectRef get_or_create_object(ghobject_t oid) { + RWLock::WLocker l(lock); + auto result = object_hash.emplace(oid, ObjectRef()); + if (result.second) + object_map[oid] = result.first->second = create_object(); + return result.first->second; + } + void encode(bufferlist& bl) const { ENCODE_START(1, 1, bl); ::encode(xattr, bl); + ::encode(use_page_set, bl); uint32_t s = object_map.size(); ::encode(s, bl); for (map::const_iterator p = object_map.begin(); @@ -156,12 +207,13 @@ class MemStore : public ObjectStore { void decode(bufferlist::iterator& p) { DECODE_START(1, p); ::decode(xattr, p); + ::decode(use_page_set, p); uint32_t s; ::decode(s, p); while (s--) { ghobject_t k; ::decode(k, p); - ObjectRef o(new BufferlistObject); + auto o = create_object(); o->decode(p); object_map.insert(make_pair(k, o)); object_hash.insert(make_pair(k, o)); @@ -180,7 +232,9 @@ class MemStore : public ObjectStore { return result; } - Collection() : lock("MemStore::Collection::lock") {} + Collection(CephContext *cct) + : cct(cct), use_page_set(cct->_conf->memstore_page_set), + lock("MemStore::Collection::lock") {} }; typedef Collection::Ref CollectionRef; @@ -243,8 +297,6 @@ class MemStore : public ObjectStore { void _do_transaction(Transaction& t); - void _write_into_bl(const bufferlist& src, unsigned offset, bufferlist *dst); - int _touch(coll_t cid, const ghobject_t& oid); int _write(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, const bufferlist& bl, uint32_t fadvsie_flags = 0); @@ -284,6 +336,7 @@ class MemStore : public ObjectStore { public: MemStore(CephContext *cct, const string& path) : ObjectStore(path), + cct(cct), coll_lock("MemStore::coll_lock"), apply_lock("MemStore::apply_lock"), finisher(cct), From d43c10e0af00c638a07f7501f4926e4bc3b24dfd Mon Sep 17 00:00:00 2001 From: Casey 
Bodley Date: Wed, 5 Aug 2015 16:20:38 -0400 Subject: [PATCH 137/654] memstore: use thread_local page vector avoid allocating page_vector storage for every read/write request. this improves the sequential write bandwidth in osbench by 5-10% Signed-off-by: Casey Bodley --- src/os/MemStore.cc | 37 +++++++++++++++++++++---------------- src/os/MemStore.h | 4 ++++ 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/os/MemStore.cc b/src/os/MemStore.cc index efa2b8aefcdca..a098f315994f7 100644 --- a/src/os/MemStore.cc +++ b/src/os/MemStore.cc @@ -327,7 +327,6 @@ int MemStore::read( return o->read(offset, l, bl); } - int MemStore::fiemap(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl) { @@ -1426,22 +1425,26 @@ int MemStore::BufferlistObject::truncate(uint64_t size) } // PageSetObject + +// use a thread-local vector for the pages returned by PageSet, so we +// can avoid allocations in read/write() +thread_local PageSet::page_vector MemStore::PageSetObject::tls_pages; + int MemStore::PageSetObject::read(uint64_t offset, uint64_t len, bufferlist& bl) { const auto start = offset; const auto end = offset + len; auto remaining = len; - PageSet::page_vector pages; - data.get_range(offset, len, pages); + data.get_range(offset, len, tls_pages); // allocate a buffer for the data buffer::ptr buf(len); - auto p = pages.begin(); + auto p = tls_pages.begin(); while (remaining) { // no more pages in range - if (p == pages.end() || (*p)->offset >= end) { + if (p == tls_pages.end() || (*p)->offset >= end) { buf.zero(offset - start, remaining); break; } @@ -1469,6 +1472,8 @@ int MemStore::PageSetObject::read(uint64_t offset, uint64_t len, bufferlist& bl) ++p; } + tls_pages.clear(); // drop page refs + bl.append(buf); return len; } @@ -1478,10 +1483,9 @@ int MemStore::PageSetObject::write(uint64_t offset, const bufferlist &src) unsigned len = src.length(); // make sure the page range is allocated - PageSet::page_vector pages; - 
data.alloc_range(offset, src.length(), pages); + data.alloc_range(offset, src.length(), tls_pages); - auto page = pages.begin(); + auto page = tls_pages.begin(); // XXX: cast away the const because bufferlist doesn't have a const_iterator auto p = const_cast(src).begin(); @@ -1497,6 +1501,7 @@ int MemStore::PageSetObject::write(uint64_t offset, const bufferlist &src) } if (data_len < offset) data_len = offset; + tls_pages.clear(); // drop page refs return 0; } @@ -1511,13 +1516,13 @@ int MemStore::PageSetObject::clone(Object *src, uint64_t srcoff, auto &dst_data = data; const auto dst_page_size = dst_data.get_page_size(); - PageSet::page_vector src_pages, dst_pages; + PageSet::page_vector dst_pages; while (len) { const auto count = std::min(len, src_page_size * 16); - src_data.get_range(srcoff, count, src_pages); + src_data.get_range(srcoff, count, tls_pages); - for (auto &src_page : src_pages) { + for (auto &src_page : tls_pages) { auto sbegin = std::max(srcoff, src_page->offset); auto send = std::min(srcoff + count, src_page->offset + src_page_size); dst_data.alloc_range(sbegin + delta, send - sbegin, dst_pages); @@ -1536,7 +1541,7 @@ int MemStore::PageSetObject::clone(Object *src, uint64_t srcoff, dstoff += count; len -= count; } - src_pages.clear(); // drop page refs + tls_pages.clear(); // drop page refs } // update object size @@ -1556,13 +1561,13 @@ int MemStore::PageSetObject::truncate(uint64_t size) return 0; // write zeroes to the rest of the last page - PageSet::page_vector pages; - data.get_range(page_offset, page_size, pages); - if (pages.empty()) + data.get_range(page_offset, page_size, tls_pages); + if (tls_pages.empty()) return 0; - auto page = pages.begin(); + auto page = tls_pages.begin(); auto data = (*page)->data; std::fill(data + (size - page_offset), data + page_size, 0); + tls_pages.clear(); // drop page ref return 0; } diff --git a/src/os/MemStore.h b/src/os/MemStore.h index 7ffd100e26a91..4725c3445bf13 100644 --- a/src/os/MemStore.h +++ 
b/src/os/MemStore.h @@ -125,6 +125,10 @@ class MemStore : public ObjectStore { PageSet data; size_t data_len; + // use a thread-local vector for the pages returned by PageSet, so we + // can avoid allocations in read/write() + static thread_local PageSet::page_vector tls_pages; + PageSetObject(size_t page_size) : data(page_size), data_len(0) {} size_t get_size() const override { return data_len; } From 5a4f6a866bb675195ad83c38fd23238d078c78ed Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Fri, 28 Aug 2015 00:32:38 +0800 Subject: [PATCH 138/654] osd: do not let OSD_HITSET_GMT reuse the feature bit * to ease the backport to hammer Signed-off-by: Kefu Chai --- src/include/ceph_features.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h index 81ff511c23250..78e0fbef09a63 100644 --- a/src/include/ceph_features.h +++ b/src/include/ceph_features.h @@ -65,9 +65,9 @@ #define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */ #define CEPH_FEATURE_MON_METADATA (1ULL<<50) #define CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT (1ULL<<51) /* can sort objs bitwise */ -#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<51) /* overlap with bitwise sort */ #define CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES (1ULL<<52) #define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53) +#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54) #define CEPH_FEATURE_RESERVED2 (1ULL<<61) /* slow down, we are almost out... */ #define CEPH_FEATURE_RESERVED (1ULL<<62) /* DO NOT USE THIS ... last bit! 
*/ @@ -157,6 +157,7 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) { CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT | \ CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 | \ CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES | \ + CEPH_FEATURE_OSD_HITSET_GMT | \ 0ULL) #define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL From f078a675a7a7cb575cbf717e4c088f8b22962ff6 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 27 Aug 2015 13:14:25 -0400 Subject: [PATCH 139/654] ceph-object-corpus: add 0.94.2-207-g88e7ee7 hammer objects Signed-off-by: Sage Weil --- ceph-object-corpus | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ceph-object-corpus b/ceph-object-corpus index 20351c6bae6dd..ef8a3bd8f5d35 160000 --- a/ceph-object-corpus +++ b/ceph-object-corpus @@ -1 +1 @@ -Subproject commit 20351c6bae6dd4802936a5a9fd76e41b8ce2bad0 +Subproject commit ef8a3bd8f5d358bdb40a8d61abd25b24b033ed5b From 4a1cb825362029956cca5dbb88e4e6bba636fd5e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 27 Aug 2015 13:27:57 -0400 Subject: [PATCH 140/654] test/encoding/readable: handle nondeterministic items in corpus too In particular, 0.94.2-207-g88e7ee7/objects/ExplicitHashHitSet Signed-off-by: Sage Weil --- src/test/encoding/readable.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/test/encoding/readable.sh b/src/test/encoding/readable.sh index a0d41dc811e7b..f387bd1180917 100755 --- a/src/test/encoding/readable.sh +++ b/src/test/encoding/readable.sh @@ -59,6 +59,20 @@ do failed=$(($failed + 1)) continue fi + + # nondeterministic classes may dump + # nondeterministically. compare the sorted json + # output. this is a weaker test, but is better than + # nothing. + if ! ./ceph-dencoder type $type is_deterministic + then + echo " sorting json output for nondeterministic object" + for f in $tmp1 $tmp2; do + sort $f | sed 's/,$//' > $f.new + mv $f.new $f + done + fi + if !
cmp $tmp1 $tmp2; then echo "**** reencode of $vdir/objects/$type/$f resulted in a different dump ****" diff $tmp1 $tmp2 From 20c76523b92ddfba25cdfd201ae6150ffc397583 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 27 Aug 2015 13:29:07 -0400 Subject: [PATCH 141/654] ceph-object-corpus: remove hammer foo and bar coll_t's Signed-off-by: Sage Weil --- ceph-object-corpus | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ceph-object-corpus b/ceph-object-corpus index ef8a3bd8f5d35..67383cc060dd9 160000 --- a/ceph-object-corpus +++ b/ceph-object-corpus @@ -1 +1 @@ -Subproject commit ef8a3bd8f5d358bdb40a8d61abd25b24b033ed5b +Subproject commit 67383cc060dd9f90d398eed5a00e31eb70845dd8 From 9f476136a8ca8e9b14277a1fe787b1d7a60fbb98 Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Mon, 18 May 2015 16:53:07 +0200 Subject: [PATCH 142/654] rgw: define attribute for storing object expiration info. Fixes: #4099 Signed-off-by: Radoslaw Zarzynski --- src/rgw/rgw_common.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h index 9582c8c23c206..8ad0733a9a65a 100644 --- a/src/rgw/rgw_common.h +++ b/src/rgw/rgw_common.h @@ -65,6 +65,7 @@ using ceph::crypto::MD5; #define RGW_ATTR_CONTENT_ENC RGW_ATTR_PREFIX "content_encoding" #define RGW_ATTR_CONTENT_LANG RGW_ATTR_PREFIX "content_language" #define RGW_ATTR_EXPIRES RGW_ATTR_PREFIX "expires" +#define RGW_ATTR_DELETE_AT RGW_ATTR_PREFIX "delete_at" #define RGW_ATTR_ID_TAG RGW_ATTR_PREFIX "idtag" #define RGW_ATTR_SHADOW_OBJ RGW_ATTR_PREFIX "shadow_name" #define RGW_ATTR_MANIFEST RGW_ATTR_PREFIX "manifest" From 7675aca5d5d0b0d243574d34819321a301818204 Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Tue, 12 May 2015 14:13:45 +0200 Subject: [PATCH 143/654] cls: add timeindex class for radosgw's objects expiration. 
Fixes: #4099 Signed-off-by: Radoslaw Zarzynski --- src/cls/Makefile-client.am | 6 + src/cls/Makefile-server.am | 5 + src/cls/timeindex/cls_timeindex.cc | 272 ++++++++++++++++++++++ src/cls/timeindex/cls_timeindex_client.cc | 156 +++++++++++++ src/cls/timeindex/cls_timeindex_client.h | 52 +++++ src/cls/timeindex/cls_timeindex_ops.h | 116 +++++++++ src/cls/timeindex/cls_timeindex_types.h | 43 ++++ src/rgw/Makefile.am | 1 + src/test/Makefile-client.am | 2 +- 9 files changed, 652 insertions(+), 1 deletion(-) create mode 100644 src/cls/timeindex/cls_timeindex.cc create mode 100644 src/cls/timeindex/cls_timeindex_client.cc create mode 100644 src/cls/timeindex/cls_timeindex_client.h create mode 100644 src/cls/timeindex/cls_timeindex_ops.h create mode 100644 src/cls/timeindex/cls_timeindex_types.h diff --git a/src/cls/Makefile-client.am b/src/cls/Makefile-client.am index aa4a4e6054b6d..21bd7e546bfcc 100644 --- a/src/cls/Makefile-client.am +++ b/src/cls/Makefile-client.am @@ -24,6 +24,9 @@ noinst_LIBRARIES += libcls_log_client.a libcls_statelog_client_a_SOURCES = cls/statelog/cls_statelog_client.cc noinst_LIBRARIES += libcls_statelog_client.a +libcls_timeindex_client_a_SOURCES = cls/timeindex/cls_timeindex_client.cc +noinst_LIBRARIES += libcls_timeindex_client.a + libcls_replica_log_client_a_SOURCES = \ cls/replica_log/cls_replica_log_types.cc \ cls/replica_log/cls_replica_log_ops.cc \ @@ -68,6 +71,9 @@ noinst_HEADERS += \ cls/statelog/cls_statelog_types.h \ cls/statelog/cls_statelog_ops.h \ cls/statelog/cls_statelog_client.h \ + cls/timeindex/cls_timeindex_types.h \ + cls/timeindex/cls_timeindex_ops.h \ + cls/timeindex/cls_timeindex_client.h \ cls/replica_log/cls_replica_log_types.h \ cls/replica_log/cls_replica_log_ops.h \ cls/replica_log/cls_replica_log_client.h \ diff --git a/src/cls/Makefile-server.am b/src/cls/Makefile-server.am index 7af69ba18dbc0..05900350d38ef 100644 --- a/src/cls/Makefile-server.am +++ b/src/cls/Makefile-server.am @@ -39,6 +39,11 @@ 
libcls_statelog_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) libcls_statelog_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' radoslib_LTLIBRARIES += libcls_statelog.la +libcls_timeindex_la_SOURCES = cls/timeindex/cls_timeindex.cc +libcls_timeindex_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) +libcls_timeindex_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' +radoslib_LTLIBRARIES += libcls_timeindex.la + libcls_replica_log_la_SOURCES = cls/replica_log/cls_replica_log.cc libcls_replica_log_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) libcls_replica_log_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' diff --git a/src/cls/timeindex/cls_timeindex.cc b/src/cls/timeindex/cls_timeindex.cc new file mode 100644 index 0000000000000..4d2384c2b1ace --- /dev/null +++ b/src/cls/timeindex/cls_timeindex.cc @@ -0,0 +1,272 @@ +// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include +#include +#include + +#include "include/types.h" +#include "include/utime.h" +#include "objclass/objclass.h" + +#include "cls_timeindex_types.h" +#include "cls_timeindex_ops.h" + +#include "global/global_context.h" + +CLS_VER(1,0) +CLS_NAME(timeindex) + +cls_handle_t h_class; +cls_method_handle_t h_timeindex_add; +cls_method_handle_t h_timeindex_list; +cls_method_handle_t h_timeindex_trim; + +static const size_t MAX_LIST_ENTRIES = 1000; +static const size_t MAX_TRIM_ENTRIES = 1000; + +static const string TIMEINDEX_PREFIX = "1_"; + +static void get_index_time_prefix(const utime_t& ts, + string& index) +{ + char buf[32]; + + snprintf(buf, sizeof(buf), "%s%010ld.%06ld_", TIMEINDEX_PREFIX.c_str(), + (long)ts.sec(), (long)ts.usec()); + buf[sizeof(buf) - 1] = '\0'; + + index = buf; +} + +static void get_index(cls_method_context_t hctx, + const utime_t& key_ts, + const string& key_ext, + string& index) +{ + 
get_index_time_prefix(key_ts, index); + index.append(key_ext); +} + +static int parse_index(const string& index, + utime_t& key_ts, + string& key_ext) +{ + int sec, usec; + char keyext[256]; + + int ret = sscanf(index.c_str(), "1_%d.%d_%255s", &sec, &usec, keyext); + + key_ts = utime_t(sec, usec); + key_ext = string(keyext); + return ret; +} + +static int cls_timeindex_add(cls_method_context_t hctx, + bufferlist * const in, + bufferlist * const out) +{ + bufferlist::iterator in_iter = in->begin(); + + cls_timeindex_add_op op; + try { + ::decode(op, in_iter); + } catch (buffer::error& err) { + CLS_LOG(1, "ERROR: cls_timeindex_add_op(): failed to decode op"); + return -EINVAL; + } + + for (list::iterator iter = op.entries.begin(); + iter != op.entries.end(); + ++iter) { + cls_timeindex_entry& entry = *iter; + + string index; + get_index(hctx, entry.key_ts, entry.key_ext, index); + + CLS_LOG(20, "storing entry at %s", index.c_str()); + + int ret = cls_cxx_map_set_val(hctx, index, &entry.value); + if (ret < 0) { + return ret; + } + } + + return 0; +} + +static int cls_timeindex_list(cls_method_context_t hctx, + bufferlist * const in, + bufferlist * const out) +{ + bufferlist::iterator in_iter = in->begin(); + + cls_timeindex_list_op op; + try { + ::decode(op, in_iter); + } catch (buffer::error& err) { + CLS_LOG(1, "ERROR: cls_timeindex_list_op(): failed to decode op"); + return -EINVAL; + } + + map keys; + + string from_index; + string to_index; + + if (op.marker.empty()) { + get_index_time_prefix(op.from_time, from_index); + } else { + from_index = op.marker; + } + const bool use_time_boundary = (op.to_time >= op.from_time); + + if (use_time_boundary) { + get_index_time_prefix(op.to_time, to_index); + } + + size_t max_entries = op.max_entries; + if (max_entries > MAX_LIST_ENTRIES) { + max_entries = MAX_LIST_ENTRIES; + } + + int rc = cls_cxx_map_get_vals(hctx, from_index, TIMEINDEX_PREFIX, + max_entries + 1, &keys); + if (rc < 0) { + return rc; + } + + 
cls_timeindex_list_ret ret; + + list& entries = ret.entries; + map::iterator iter = keys.begin(); + + bool done = false; + string marker; + + for (size_t i = 0; i < max_entries && iter != keys.end(); ++i, ++iter) { + const string& index = iter->first; + bufferlist& bl = iter->second; + + marker = index; + if (use_time_boundary && index.compare(0, to_index.size(), to_index) >= 0) { + CLS_LOG(20, "DEBUG: cls_timeindex_list: finishing on to_index=%s", + to_index.c_str()); + done = true; + break; + } + + cls_timeindex_entry e; + + if (parse_index(index, e.key_ts, e.key_ext) < 0) { + CLS_LOG(1, "ERROR: cls_timeindex_list: could not parse index=%s", + index.c_str()); + } else { + CLS_LOG(20, "DEBUG: cls_timeindex_list: index=%s, key_ext=%s, bl.len = %d", + index.c_str(), e.key_ext.c_str(), bl.length()); + e.value = bl; + entries.push_back(e); + } + } + + if (iter == keys.end()) { + done = true; + } + + ret.marker = marker; + ret.truncated = !done; + + ::encode(ret, *out); + + return 0; +} + + +static int cls_timeindex_trim(cls_method_context_t hctx, + bufferlist * const in, + bufferlist * const out) +{ + bufferlist::iterator in_iter = in->begin(); + + cls_timeindex_trim_op op; + try { + ::decode(op, in_iter); + } catch (buffer::error& err) { + CLS_LOG(1, "ERROR: cls_timeindex_trim: failed to decode entry"); + return -EINVAL; + } + + map keys; + + string from_index; + string to_index; + + if (op.from_marker.empty()) { + get_index_time_prefix(op.from_time, from_index); + } else { + from_index = op.from_marker; + } + + if (op.to_marker.empty()) { + get_index_time_prefix(op.to_time, to_index); + } else { + to_index = op.to_marker; + } + + int rc = cls_cxx_map_get_vals(hctx, from_index, TIMEINDEX_PREFIX, + MAX_TRIM_ENTRIES, &keys); + if (rc < 0) { + return rc; + } + + map::iterator iter = keys.begin(); + + bool removed = false; + for (size_t i = 0; i < MAX_TRIM_ENTRIES && iter != keys.end(); ++i, ++iter) { + const string& index = iter->first; + + CLS_LOG(20, "index=%s 
to_index=%s", index.c_str(), to_index.c_str()); + + if (index.compare(0, to_index.size(), to_index) > 0) { + CLS_LOG(20, "DEBUG: cls_timeindex_trim: finishing on to_index=%s", + to_index.c_str()); + break; + } + + CLS_LOG(20, "removing key: index=%s", index.c_str()); + + int rc = cls_cxx_map_remove_key(hctx, index); + if (rc < 0) { + CLS_LOG(1, "ERROR: cls_cxx_map_remove_key failed rc=%d", rc); + return rc; + } + + removed = true; + } + + if (!removed) { + return -ENODATA; + } + + return 0; +} + +void __cls_init() +{ + CLS_LOG(1, "Loaded timeindex class!"); + + cls_register("timeindex", &h_class); + + /* timeindex */ + cls_register_cxx_method(h_class, "add", CLS_METHOD_RD | CLS_METHOD_WR, + cls_timeindex_add, &h_timeindex_add); + cls_register_cxx_method(h_class, "list", CLS_METHOD_RD, + cls_timeindex_list, &h_timeindex_list); + cls_register_cxx_method(h_class, "trim", CLS_METHOD_RD | CLS_METHOD_WR, + cls_timeindex_trim, &h_timeindex_trim); + + return; +} + diff --git a/src/cls/timeindex/cls_timeindex_client.cc b/src/cls/timeindex/cls_timeindex_client.cc new file mode 100644 index 0000000000000..c3e7dc6187ad5 --- /dev/null +++ b/src/cls/timeindex/cls_timeindex_client.cc @@ -0,0 +1,156 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include "include/types.h" +#include "cls/timeindex/cls_timeindex_ops.h" +#include "include/rados/librados.hpp" + + +using namespace librados; + + +void cls_timeindex_add(librados::ObjectWriteOperation& op, list& entries) +{ + bufferlist in; + cls_timeindex_add_op call; + + call.entries = entries; + + ::encode(call, in); + op.exec("timeindex", "add", in); +} + +void cls_timeindex_add(librados::ObjectWriteOperation& op, cls_timeindex_entry& entry) +{ + bufferlist in; + cls_timeindex_add_op call; + + call.entries.push_back(entry); + + ::encode(call, in); + op.exec("timeindex", "add", in); +} + +void cls_timeindex_add_prepare_entry(cls_timeindex_entry& entry, + const 
utime_t& key_timestamp, + const string& key_ext, + const bufferlist& bl) +{ + entry.key_ts = key_timestamp; + entry.key_ext = key_ext; + entry.value = bl; +} + +void cls_timeindex_add(librados::ObjectWriteOperation& op, + const utime_t& key_timestamp, + const string& key_ext, + const bufferlist& bl) +{ + cls_timeindex_entry entry; + + cls_timeindex_add_prepare_entry(entry, key_timestamp, key_ext, bl); + cls_timeindex_add(op, entry); +} + +void cls_timeindex_trim(librados::ObjectWriteOperation& op, + const utime_t& from_time, + const utime_t& to_time, + const string& from_marker, + const string& to_marker) +{ + bufferlist in; + cls_timeindex_trim_op call; + + call.from_time = from_time; + call.to_time = to_time; + call.from_marker = from_marker; + call.to_marker = to_marker; + + ::encode(call, in); + + op.exec("timeindex", "trim", in); +} + +int cls_timeindex_trim(librados::IoCtx& io_ctx, + const string& oid, + const utime_t& from_time, + const utime_t& to_time, + const string& from_marker, + const string& to_marker) +{ + bool done = false; + + do { + ObjectWriteOperation op; + + cls_timeindex_trim(op, from_time, to_time, from_marker, to_marker); + + int r = io_ctx.operate(oid, &op); + if (r == -ENODATA) { + done = true; + } else if (r < 0) { + return r; + } + + } while (!done); + + return 0; +} + +class TimeindexListCtx : public ObjectOperationCompletion { + list *entries; + string *marker; + bool *truncated; + +public: + TimeindexListCtx(list *_entries, + string *_marker, + bool *_truncated) + : entries(_entries), marker(_marker), truncated(_truncated) {} + + void handle_completion(int r, bufferlist& outbl) { + if (r >= 0) { + cls_timeindex_list_ret ret; + try { + bufferlist::iterator iter = outbl.begin(); + ::decode(ret, iter); + if (entries) { + *entries = ret.entries; + } + if (truncated) { + *truncated = ret.truncated; + } + if (marker) { + *marker = ret.marker; + } + } catch (buffer::error& err) { + // nothing we can do about it atm + } + } + } +}; + +void 
cls_timeindex_list(librados::ObjectReadOperation& op, + const utime_t& from, + const utime_t& to, + const string& in_marker, + const int max_entries, + list& entries, + string *out_marker, + bool *truncated) +{ + bufferlist inbl; + cls_timeindex_list_op call; + + call.from_time = from; + call.to_time = to; + call.marker = in_marker; + call.max_entries = max_entries; + + ::encode(call, inbl); + + op.exec("timeindex", "list", inbl, + new TimeindexListCtx(&entries, out_marker, truncated)); +} diff --git a/src/cls/timeindex/cls_timeindex_client.h b/src/cls/timeindex/cls_timeindex_client.h new file mode 100644 index 0000000000000..09a420f4937ad --- /dev/null +++ b/src/cls/timeindex/cls_timeindex_client.h @@ -0,0 +1,52 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_CLS_TIMEINDEX_CLIENT_H +#define CEPH_CLS_TIMEINDEX_CLIENT_H + +#include "include/types.h" +#include "include/rados/librados.hpp" +#include "cls_timeindex_types.h" + +/* + * timeindex objclass + */ + +void cls_timeindex_add_prepare_entry(cls_timeindex_entry& entry, + const utime_t& key_timestamp, + const string& key_ext, + bufferlist& bl); + +void cls_timeindex_add(librados::ObjectWriteOperation& op, + const list& entry); + +void cls_timeindex_add(librados::ObjectWriteOperation& op, + const cls_timeindex_entry& entry); + +void cls_timeindex_add(librados::ObjectWriteOperation& op, + const utime_t& timestamp, + const string& name, + const bufferlist& bl); + +void cls_timeindex_list(librados::ObjectReadOperation& op, + const utime_t& from, + const utime_t& to, + const string& in_marker, + const int max_entries, + list& entries, + string *out_marker, + bool *truncated); + +void cls_timeindex_trim(librados::ObjectWriteOperation& op, + const utime_t& from_time, + const utime_t& to_time, + const string& from_marker = std::string(), + const string& to_marker = std::string()); + +int cls_timeindex_trim(librados::IoCtx& io_ctx, + const string& oid, + 
const utime_t& from_time, + const utime_t& to_time, + const string& from_marker = std::string(), + const string& to_marker = std::string()); +#endif diff --git a/src/cls/timeindex/cls_timeindex_ops.h b/src/cls/timeindex/cls_timeindex_ops.h new file mode 100644 index 0000000000000..e5498f741aa96 --- /dev/null +++ b/src/cls/timeindex/cls_timeindex_ops.h @@ -0,0 +1,116 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_CLS_TIMEINDEX_OPS_H +#define CEPH_CLS_TIMEINDEX_OPS_H + +#include "include/types.h" +#include "cls_timeindex_types.h" + +struct cls_timeindex_add_op { + list entries; + + cls_timeindex_add_op() {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode(entries, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator& bl) { + DECODE_START(1, bl); + ::decode(entries, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(cls_timeindex_add_op) + +struct cls_timeindex_list_op { + utime_t from_time; + string marker; /* if not empty, overrides from_time */ + utime_t to_time; /* not inclusive */ + int max_entries; /* upperbound to returned num of entries + might return less than that and still be truncated */ + + cls_timeindex_list_op() : max_entries(0) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode(from_time, bl); + ::encode(marker, bl); + ::encode(to_time, bl); + ::encode(max_entries, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator& bl) { + DECODE_START(1, bl); + ::decode(from_time, bl); + ::decode(marker, bl); + ::decode(to_time, bl); + ::decode(max_entries, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(cls_timeindex_list_op) + +struct cls_timeindex_list_ret { + list entries; + string marker; + bool truncated; + + cls_timeindex_list_ret() : truncated(false) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode(entries, bl); + ::encode(marker, bl); + ::encode(truncated, bl); 
+ ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator& bl) { + DECODE_START(1, bl); + ::decode(entries, bl); + ::decode(marker, bl); + ::decode(truncated, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(cls_timeindex_list_ret) + + +/* + * operation will return 0 when successfully removed but not done. Will return + * -ENODATA when done, so caller needs to repeat sending request until that. + */ +struct cls_timeindex_trim_op { + utime_t from_time; + utime_t to_time; /* inclusive */ + string from_marker; + string to_marker; + + cls_timeindex_trim_op() {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode(from_time, bl); + ::encode(to_time, bl); + ::encode(from_marker, bl); + ::encode(to_marker, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator& bl) { + DECODE_START(1, bl); + ::decode(from_time, bl); + ::decode(to_time, bl); + ::decode(from_marker, bl); + ::decode(to_marker, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(cls_timeindex_trim_op) + +#endif /* CEPH_CLS_TIMEINDEX_OPS_H */ diff --git a/src/cls/timeindex/cls_timeindex_types.h b/src/cls/timeindex/cls_timeindex_types.h new file mode 100644 index 0000000000000..afb7d07d7d65f --- /dev/null +++ b/src/cls/timeindex/cls_timeindex_types.h @@ -0,0 +1,43 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_CLS_TIMEINDEX_TYPES_H +#define CEPH_CLS_TIMEINDEX_TYPES_H + +#include "include/encoding.h" +#include "include/types.h" + +#include "include/utime.h" + +class JSONObj; + +struct cls_timeindex_entry { + /* Mandatory timestamp. Will be part of the key. */ + utime_t key_ts; + /* Not mandatory. The name_ext field, if not empty, will form second + * part of the key. */ + string key_ext; + /* Become value of OMAP-based mapping. 
*/ + bufferlist value; + + cls_timeindex_entry() {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode(key_ts, bl); + ::encode(key_ext, bl); + ::encode(value, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator& bl) { + DECODE_START(1, bl); + ::decode(key_ts, bl); + ::decode(key_ext, bl); + ::decode(value, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(cls_timeindex_entry) + +#endif /* CEPH_CLS_TIMEINDEX_TYPES_H */ diff --git a/src/rgw/Makefile.am b/src/rgw/Makefile.am index f3dd92aec99a3..8ef7f8d3040d0 100644 --- a/src/rgw/Makefile.am +++ b/src/rgw/Makefile.am @@ -54,6 +54,7 @@ LIBRGW_DEPS += \ libcls_rgw_client.la \ libcls_log_client.a \ libcls_statelog_client.a \ + libcls_timeindex_client.a \ libcls_user_client.a \ libcls_replica_log_client.a \ libcls_lock_client.la \ diff --git a/src/test/Makefile-client.am b/src/test/Makefile-client.am index 01aaa0e4dc58d..00d9938f7a063 100644 --- a/src/test/Makefile-client.am +++ b/src/test/Makefile-client.am @@ -534,7 +534,7 @@ ceph_test_cls_rgw_opstate_LDADD = \ $(LIBRADOS) $(LIBRGW) $(CEPH_GLOBAL) \ $(UNITTEST_LDADD) $(CRYPTO_LIBS) \ -lcurl -lexpat \ - libcls_version_client.a libcls_log_client.a \ + libcls_version_client.a libcls_log_client.a libcls_timeindex_client.a \ libcls_statelog_client.a libcls_refcount_client.la \ libcls_rgw_client.la libcls_user_client.a libcls_lock_client.la \ $(LIBRADOS) From 0d792c9dcc1fae8029f07a59e63dedb34eafcc6d Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Thu, 21 May 2015 17:18:53 +0200 Subject: [PATCH 144/654] rgw: add support for object expiration in rgw_rados.cc. 
Fixes: #4099 Signed-off-by: Radoslaw Zarzynski --- src/common/config_opts.h | 3 + src/rgw/rgw_rados.cc | 200 ++++++++++++++++++++++++++++++++++++++- src/rgw/rgw_rados.h | 63 +++++++++++- 3 files changed, 262 insertions(+), 4 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index d30e15e3428a9..f38ffd4864af9 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -1085,6 +1085,9 @@ OPTION(rgw_multipart_part_upload_limit, OPT_INT, 10000) // parts limit in multip OPTION(rgw_olh_pending_timeout_sec, OPT_INT, 3600) // time until we retire a pending olh change +OPTION(rgw_objexp_time_step_exp, OPT_U32, 12) // exponent value (2 is the base) for rounding the timestamps +OPTION(rgw_objexp_hints_num_shards, OPT_U32, 127) // maximum number of parts in which the hint index is stored in + OPTION(mutex_perf_counter, OPT_BOOL, false) // enable/disable mutex perf counter OPTION(throttler_perf_counter, OPT_BOOL, true) // enable/disable throttler perf counter diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index c23a2db3ee7f0..19bc923b1986c 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -27,6 +27,7 @@ #include "cls/version/cls_version_client.h" #include "cls/log/cls_log_client.h" #include "cls/statelog/cls_statelog_client.h" +#include "cls/timeindex/cls_timeindex_client.h" #include "cls/lock/cls_lock_client.h" #include "cls/user/cls_user_client.h" @@ -2313,6 +2314,168 @@ int RGWRados::time_log_trim(const string& oid, const utime_t& start_time, const return cls_log_trim(io_ctx, oid, start_time, end_time, from_marker, to_marker); } +string RGWRados::objexp_hint_get_shardname(const utime_t &ts) +{ + const time_t roundedts = ts.sec() >> cct->_conf->rgw_objexp_time_step_exp; + const unsigned int shnum = roundedts % cct->_conf->rgw_objexp_hints_num_shards; + + char buf[32]; + snprintf(buf, sizeof(buf), "%010u", shnum); + + string objname("time_index_hint."); + return objname + buf; +} + +static string 
objexp_hint_get_keyext(const string& bucket_name, + const string& bucket_id, + const rgw_obj_key& obj_key) +{ + return bucket_name + ":" + bucket_id + ":" + obj_key.name + ":" + obj_key.instance; +} + +int RGWRados::objexp_hint_add(const utime_t& delete_at, + const string& bucket_name, + const string& bucket_id, + const rgw_obj_key& obj_key) +{ + librados::IoCtx io_ctx; + + const char * const log_pool = zone.log_pool.name.c_str(); + int r = rados->ioctx_create(log_pool, io_ctx); + if (r == -ENOENT) { + rgw_bucket pool(log_pool); + r = create_pool(pool); + if (r < 0) { + return r; + } else { + /* retry */ + r = rados->ioctx_create(log_pool, io_ctx); + } + } + if (r < 0) { + return r; + } + + const string keyext = objexp_hint_get_keyext(bucket_name, + bucket_id, obj_key); + objexp_hint_entry he = { + .bucket_name = bucket_name, + .bucket_id = bucket_id, + .obj_key = obj_key, + .exp_time = delete_at }; + bufferlist hebl; + ::encode(he, hebl); + ObjectWriteOperation op; + cls_timeindex_add(op, delete_at, keyext, hebl); + + string shard_name = objexp_hint_get_shardname(delete_at); + r = io_ctx.operate(shard_name, &op); + return r; +} + +void RGWRados::objexp_get_shard(const utime_t& start_time, + const utime_t& end_time, + utime_t &marker, /* in/out */ + string& shard, /* out */ + bool& truncated) /* out */ +{ + if (marker.is_zero()) { + marker = start_time; + } + + const uint32_t time_step_exp = cct->_conf->rgw_objexp_time_step_exp; + const uint32_t num_shards = cct->_conf->rgw_objexp_hints_num_shards; + const time_t time_step = 1 << time_step_exp; + + const time_t sts = start_time.sec() >> time_step_exp; + const time_t ets = end_time.sec() >> time_step_exp; + const time_t mts = marker.sec() >> time_step_exp; + + const uint32_t periods = (ets - sts) / time_step; + const uint32_t iters = min(periods, num_shards); + + shard = objexp_hint_get_shardname(marker); + + if (mts % num_shards < (sts + iters) % num_shards) { + truncated = true; + marker += utime_t(time_step, 0); 
+ } else { + truncated = false; + } + + return; +} + +int RGWRados::objexp_hint_list(const string& oid, + const utime_t& start_time, + const utime_t& end_time, + const int max_entries, + const string& marker, + list& entries, /* out */ + string *out_marker, /* out */ + bool *truncated) /* out */ +{ + librados::IoCtx io_ctx; + + const char * const log_pool = zone.log_pool.name.c_str(); + int ret = rados->ioctx_create(log_pool, io_ctx); + if (ret < 0) { + return ret; + } + + librados::ObjectReadOperation op; + cls_timeindex_list(op, start_time, end_time, marker, max_entries, entries, + out_marker, truncated); + + bufferlist obl; + ret = io_ctx.operate(oid, &op, &obl); + + if ((ret < 0 ) && (ret != -ENOENT)) { + return ret; + } + + if (ret == -ENOENT && truncated) { + *truncated = false; + } + + return 0; +} + +int RGWRados::objexp_hint_parse(cls_timeindex_entry &ti_entry, /* in */ + objexp_hint_entry& hint_entry) /* out */ +{ + try { + bufferlist::iterator iter = ti_entry.value.begin(); + ::decode(hint_entry, iter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl; + } + + return 0; +} + +int RGWRados::objexp_hint_trim(const string& oid, + const utime_t& start_time, + const utime_t& end_time, + const string& from_marker, + const string& to_marker) +{ + librados::IoCtx io_ctx; + + const char * const log_pool = zone.log_pool.name.c_str(); + int ret = rados->ioctx_create(log_pool, io_ctx); + if (ret < 0) { + return ret; + } + + ret = cls_timeindex_trim(io_ctx, oid, start_time, end_time, + from_marker, to_marker); + if ((ret < 0 ) && (ret != -ENOENT)) { + return ret; + } + + return 0; +} int RGWRados::lock_exclusive(rgw_bucket& pool, const string& oid, utime_t& duration, string& zone_id, string& owner_id) { @@ -4739,6 +4902,24 @@ int RGWRados::Object::Delete::delete_obj() uint64_t obj_size = state->size; + if (!params.expiration_time.is_zero()) { + bufferlist bl; + utime_t delete_at; + + if 
(state->get_attr(RGW_ATTR_DELETE_AT, bl)) { + try { + bufferlist::iterator iter = bl.begin(); + ::decode(delete_at, iter); + } catch (buffer::error& err) { + dout(5) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl; + } + + if (params.expiration_time != delete_at) { + return -ERR_PRECONDITION_FAILED; + } + } + } + ObjectWriteOperation op; r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true); @@ -4799,8 +4980,12 @@ int RGWRados::Object::Delete::delete_obj() return 0; } -int RGWRados::delete_obj(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, rgw_obj& obj, - int versioning_status, uint16_t bilog_flags) +int RGWRados::delete_obj(RGWObjectCtx& obj_ctx, + RGWBucketInfo& bucket_info, + rgw_obj& obj, + int versioning_status, + uint16_t bilog_flags, + const utime_t& expiration_time) { RGWRados::Object del_target(this, bucket_info, obj_ctx, obj); RGWRados::Object::Delete del_op(&del_target); @@ -4808,6 +4993,7 @@ int RGWRados::delete_obj(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, rgw_ del_op.params.bucket_owner = bucket_info.owner; del_op.params.versioning_status = versioning_status; del_op.params.bilog_flags = bilog_flags; + del_op.params.expiration_time = expiration_time; return del_op.delete_obj(); } @@ -5316,6 +5502,16 @@ int RGWRados::set_attrs(void *ctx, rgw_obj& obj, continue; op.setxattr(name.c_str(), bl); + + if (name.compare(RGW_ATTR_DELETE_AT) == 0) { + utime_t ts; + ::decode(ts, bl); + + rgw_obj_key obj_key; + obj.get_index_key(&obj_key); + + objexp_hint_add(ts, bucket.name, bucket.bucket_id, obj_key); + } } if (!op.size()) diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index c69ca9c929911..0ee187409d592 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -13,6 +13,7 @@ #include "cls/version/cls_version_types.h" #include "cls/log/cls_log_types.h" #include "cls/statelog/cls_statelog_types.h" +#include "cls/timeindex/cls_timeindex_types.h" #include "rgw_log.h" #include "rgw_metadata.h" #include 
"rgw_rest_conn.h" @@ -993,6 +994,32 @@ struct RGWRegionMap { }; WRITE_CLASS_ENCODER(RGWRegionMap) +struct objexp_hint_entry { + string bucket_name; + string bucket_id; + rgw_obj_key obj_key; + utime_t exp_time; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode(bucket_name, bl); + ::encode(bucket_id, bl); + ::encode(obj_key, bl); + ::encode(exp_time, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator& bl) { + DECODE_START(1, bl); + ::decode(bucket_name, bl); + ::decode(bucket_id, bl); + ::decode(obj_key, bl); + ::decode(exp_time, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(objexp_hint_entry) + class RGWDataChangesLog; class RGWReplicaLogger; @@ -1597,6 +1624,7 @@ class RGWRados string marker_version_id; uint32_t bilog_flags; list *remove_objs; + utime_t expiration_time; DeleteParams() : versioning_status(0), olh_epoch(0), bilog_flags(0), remove_objs(NULL) {} } params; @@ -1862,8 +1890,12 @@ class RGWRados int bucket_suspended(rgw_bucket& bucket, bool *suspended); /** Delete an object.*/ - virtual int delete_obj(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_owner, rgw_obj& src_obj, - int versioning_status, uint16_t bilog_flags = 0); + virtual int delete_obj(RGWObjectCtx& obj_ctx, + RGWBucketInfo& bucket_owner, + rgw_obj& src_obj, + int versioning_status, + uint16_t bilog_flags = 0, + const utime_t& expiration_time = utime_t()); /* Delete a system object */ virtual int delete_system_obj(rgw_obj& src_obj, RGWObjVersionTracker *objv_tracker = NULL); @@ -2071,6 +2103,33 @@ class RGWRados int time_log_info(const string& oid, cls_log_header *header); int time_log_trim(const string& oid, const utime_t& start_time, const utime_t& end_time, const string& from_marker, const string& to_marker); + + string objexp_hint_get_shardname(const utime_t &ts); + void objexp_get_shard(const utime_t& start_time, + const utime_t& end_time, + utime_t &marker, /* out */ + string& shard, /* out */ + bool& truncated); /* out */ + int 
objexp_hint_add(const utime_t& delete_at, + const string& bucket_name, + const string& bucket_id, + const rgw_obj_key& obj_key); + int objexp_hint_list(const string& oid, + const utime_t& start_time, + const utime_t& end_time, + const int max_entries, + const string& marker, + list& entries, /* out */ + string *out_marker, /* out */ + bool *truncated); /* out */ + int objexp_hint_parse(cls_timeindex_entry &ti_entry, + objexp_hint_entry& hint_entry); /* out */ + int objexp_hint_trim(const string& oid, + const utime_t& start_time, + const utime_t& end_time, + const string& from_marker = std::string(), + const string& to_marker = std::string()); + int lock_exclusive(rgw_bucket& pool, const string& oid, utime_t& duration, string& zone_id, string& owner_id); int unlock(rgw_bucket& pool, const string& oid, string& zone_id, string& owner_id); From db27ea92494201e127d81498f5918834e04e3d02 Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Thu, 21 May 2015 17:19:38 +0200 Subject: [PATCH 145/654] rgw: add garbage collector daemon for expired objects. 
Fixes: #4099 Signed-off-by: Radoslaw Zarzynski --- src/common/config_opts.h | 1 + src/rgw/Makefile.am | 4 + src/rgw/rgw_object_expirer.cc | 255 ++++++++++++++++++++++++++++++++++ 3 files changed, 260 insertions(+) create mode 100644 src/rgw/rgw_object_expirer.cc diff --git a/src/common/config_opts.h b/src/common/config_opts.h index f38ffd4864af9..47a4ea6411071 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -1085,6 +1085,7 @@ OPTION(rgw_multipart_part_upload_limit, OPT_INT, 10000) // parts limit in multip OPTION(rgw_olh_pending_timeout_sec, OPT_INT, 3600) // time until we retire a pending olh change +OPTION(rgw_objexp_gc_interval, OPT_U32, 60 * 10) // maximum time between round of expired objects garbage collecting OPTION(rgw_objexp_time_step_exp, OPT_U32, 12) // exponent value (2 is the base) for rounding the timestamps OPTION(rgw_objexp_hints_num_shards, OPT_U32, 127) // maximum number of parts in which the hint index is stored in diff --git a/src/rgw/Makefile.am b/src/rgw/Makefile.am index 8ef7f8d3040d0..235ddadfd66db 100644 --- a/src/rgw/Makefile.am +++ b/src/rgw/Makefile.am @@ -105,6 +105,10 @@ radosgw_admin_SOURCES = rgw/rgw_admin.cc rgw/rgw_orphan.cc radosgw_admin_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL) bin_PROGRAMS += radosgw-admin +radosgw_object_expirer_SOURCES = rgw/rgw_object_expirer.cc +radosgw_object_expirer_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL) +bin_PROGRAMS += radosgw-object-expirer + ceph_rgw_multiparser_SOURCES = rgw/rgw_multiparser.cc ceph_rgw_multiparser_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL) bin_DEBUGPROGRAMS += ceph_rgw_multiparser diff --git a/src/rgw/rgw_object_expirer.cc b/src/rgw/rgw_object_expirer.cc new file mode 100644 index 0000000000000..5f29afcb14deb --- /dev/null +++ b/src/rgw/rgw_object_expirer.cc @@ -0,0 +1,255 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include +#include + +using namespace std; + 
+#include "auth/Crypto.h" + +#include "common/armor.h" +#include "common/ceph_json.h" +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/Formatter.h" +#include "common/errno.h" + +#include "global/global_init.h" + +#include "include/utime.h" +#include "include/str_list.h" + +#include "rgw_user.h" +#include "rgw_bucket.h" +#include "rgw_rados.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" +#include "rgw_log.h" +#include "rgw_formats.h" +#include "rgw_usage.h" +#include "rgw_replica_log.h" + +#define dout_subsys ceph_subsys_rgw + +static RGWRados *store = NULL; + +class StoreDestructor { + RGWRados *store; + +public: + StoreDestructor(RGWRados *_s) : store(_s) {} + ~StoreDestructor() { + if (store) { + RGWStoreManager::close_storage(store); + } + } +}; + +static void usage() +{ + generic_server_usage(); +} + +static inline utime_t get_last_run_time(void) +{ + return utime_t(); +} + +static int init_bucket_info(const string& bucket_name, + const string& bucket_id, + RGWBucketInfo& bucket_info) +{ + RGWObjectCtx obj_ctx(store); + const string bucket_instance_id = bucket_name + ":" + bucket_id; + + int ret = store->get_bucket_instance_info(obj_ctx, bucket_instance_id, + bucket_info, NULL, NULL); + + return ret; +} + +static int garbage_single_object(objexp_hint_entry& hint) +{ + RGWBucketInfo bucket_info; + + int ret = init_bucket_info(hint.bucket_name, hint.bucket_id, bucket_info); + if (ret < 0) { + dout(1) << "ERROR: could not init bucket: " << cpp_strerror(-ret) << dendl; + return ret; + } + + /* TODO: check whether the hint is actual. 
*/ + RGWObjectCtx rctx(store); + + rgw_obj_key key = hint.obj_key; + if (key.instance.empty()) { + key.instance = "null"; + } + + rgw_obj obj(bucket_info.bucket, key); + ret = store->delete_obj(rctx, bucket_info, obj, + bucket_info.versioning_status(), 0, hint.exp_time); + + return ret; +} + +static void garbage_chunk(list& entries, /* in */ + bool& need_trim) /* out */ +{ + need_trim = false; + + for (list::iterator iter = entries.begin(); + iter != entries.end(); + ++iter) + { + objexp_hint_entry hint; + dout(15) << "===== got removal hint for: " << iter->key_ts.sec() << " - " << iter->key_ext << dendl; + + int ret = store->objexp_hint_parse(*iter, hint); + if (ret < 0) { + dout(1) << "cannot parse removal hint for " << hint.obj_key << dendl; + continue; + } + + /* PRECOND_FAILED simply means that our hint is not valid. + * We can silently ignore that and move forward. */ + ret = garbage_single_object(hint); + if (ret == -ERR_PRECONDITION_FAILED) { + dout(15) << "not actual hint for object: " << hint.obj_key << dendl; + } else if (ret < 0) { + dout(1) << "cannot remove expired object: " << hint.obj_key << dendl; + } + + need_trim = true; + } + + return; +} + +static void trim_chunk(const string& shard, + const utime_t& from, + const utime_t& to) +{ + dout(20) << "trying to trim removal hints to " << to << dendl; + + int ret = store->objexp_hint_trim(shard, from, to); + if (ret < 0) { + dout(0) << "ERROR during trim: " << ret << dendl; + } + + return; +} + +static void proceed_single_shard(const string& shard, + const utime_t& last_run, + const utime_t& round_start) +{ + string marker; + string out_marker; + bool truncated = false; + + do { + list entries; + int ret = store->objexp_hint_list(shard, last_run, round_start, + 1000, marker, entries, + &out_marker, &truncated); + if (ret < 0) { + dout(10) << "cannot get removal hints from shard: " << shard << dendl; + continue; + } + + bool need_trim; + garbage_chunk(entries, need_trim); + + if (need_trim) { + 
trim_chunk(shard, last_run, round_start); + } + + marker = out_marker; + } while (truncated); + + return; +} + +static void inspect_all_shards(const utime_t& last_run, + const utime_t& round_start) +{ + bool is_next_available; + utime_t shard_marker; + + do { + string shard; + store->objexp_get_shard(last_run, round_start, shard_marker, shard, + is_next_available); + + dout(20) << "proceeding shard = " << shard << dendl; + + proceed_single_shard(shard, last_run, round_start); + } while (is_next_available); + + return; +} + +int main(const int argc, const char **argv) +{ + vector args; + argv_to_vec(argc, argv, args); + env_to_vec(args); + + global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_DAEMON, + CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS); + + for (std::vector::iterator i = args.begin(); i != args.end(); ) { + if (ceph_argparse_double_dash(args, i)) { + break; + } else if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) { + usage(); + return 0; + } + } + + if (g_conf->daemonize) { + global_init_daemonize(g_ceph_context, 0); + } + + common_init_finish(g_ceph_context); + + store = RGWStoreManager::get_storage(g_ceph_context, false, false); + if (!store) { + std::cerr << "couldn't init storage provider" << std::endl; + return EIO; + } + + rgw_user_init(store); + rgw_bucket_init(store->meta_mgr); + + /* Guard to not forget about closing the rados store. */ + StoreDestructor store_dtor(store); + + utime_t last_run = get_last_run_time(); + while (true) { + const utime_t round_start = ceph_clock_now(g_ceph_context); + inspect_all_shards(last_run, round_start); + + last_run = round_start; + + /* End of the real work for now. Prepare for sleep. */ + const utime_t round_time = ceph_clock_now(g_ceph_context) - round_start; + const utime_t interval(g_ceph_context->_conf->rgw_objexp_gc_interval, 0); + + if (round_time < interval) { + /* This should be the main path of execution. 
All currently expired + * objects have been removed and we need go sleep waiting for the next + * turn. If the check isn't true, it means we have to much hints + * in relation to interval time. */ + const utime_t sleep_period = interval - round_time; + dout(20) << "sleeping for " << sleep_period << dendl; + sleep_period.sleep(); + } + } + + return EXIT_SUCCESS; +} From f572430b008e1a126ad22cb62d4a393a2e7f47a5 Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Mon, 25 May 2015 14:28:01 +0200 Subject: [PATCH 146/654] rgw: move objexp pool creation into separate function. Fixes: #4099 Signed-off-by: Radoslaw Zarzynski --- src/rgw/rgw_rados.cc | 66 ++++++++++++++++++-------------------------- src/rgw/rgw_rados.h | 2 ++ 2 files changed, 29 insertions(+), 39 deletions(-) diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 19bc923b1986c..e58bbbdfefd1e 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -1615,6 +1615,10 @@ int RGWRados::init_complete() if (ret < 0) return ret; + ret = open_objexp_pool_ctx(); + if (ret < 0) + return ret; + pools_initialized = true; gc = new RGWGC(); @@ -1756,6 +1760,25 @@ int RGWRados::open_gc_pool_ctx() return r; } +int RGWRados::open_objexp_pool_ctx() +{ + const char * const pool_name = zone.log_pool.name.c_str(); + librados::Rados * const rad = get_rados_handle(); + int r = rad->ioctx_create(pool_name, objexp_pool_ctx); + if (r == -ENOENT) { + r = rad->pool_create(pool_name); + if (r == -EEXIST) { + r = 0; + } else if (r < 0) { + return r; + } + + r = rad->ioctx_create(pool_name, objexp_pool_ctx); + } + + return r; +} + int RGWRados::init_watch() { const char *control_pool = zone.control_pool.name.c_str(); @@ -2338,24 +2361,6 @@ int RGWRados::objexp_hint_add(const utime_t& delete_at, const string& bucket_id, const rgw_obj_key& obj_key) { - librados::IoCtx io_ctx; - - const char * const log_pool = zone.log_pool.name.c_str(); - int r = rados->ioctx_create(log_pool, io_ctx); - if (r == -ENOENT) { - rgw_bucket 
pool(log_pool); - r = create_pool(pool); - if (r < 0) { - return r; - } else { - /* retry */ - r = rados->ioctx_create(log_pool, io_ctx); - } - } - if (r < 0) { - return r; - } - const string keyext = objexp_hint_get_keyext(bucket_name, bucket_id, obj_key); objexp_hint_entry he = { @@ -2369,8 +2374,7 @@ int RGWRados::objexp_hint_add(const utime_t& delete_at, cls_timeindex_add(op, delete_at, keyext, hebl); string shard_name = objexp_hint_get_shardname(delete_at); - r = io_ctx.operate(shard_name, &op); - return r; + return objexp_pool_ctx.operate(shard_name, &op); } void RGWRados::objexp_get_shard(const utime_t& start_time, @@ -2415,26 +2419,18 @@ int RGWRados::objexp_hint_list(const string& oid, string *out_marker, /* out */ bool *truncated) /* out */ { - librados::IoCtx io_ctx; - - const char * const log_pool = zone.log_pool.name.c_str(); - int ret = rados->ioctx_create(log_pool, io_ctx); - if (ret < 0) { - return ret; - } - librados::ObjectReadOperation op; cls_timeindex_list(op, start_time, end_time, marker, max_entries, entries, out_marker, truncated); bufferlist obl; - ret = io_ctx.operate(oid, &op, &obl); + int ret = objexp_pool_ctx.operate(oid, &op, &obl); if ((ret < 0 ) && (ret != -ENOENT)) { return ret; } - if (ret == -ENOENT && truncated) { + if ((ret == -ENOENT) && truncated) { *truncated = false; } @@ -2460,15 +2456,7 @@ int RGWRados::objexp_hint_trim(const string& oid, const string& from_marker, const string& to_marker) { - librados::IoCtx io_ctx; - - const char * const log_pool = zone.log_pool.name.c_str(); - int ret = rados->ioctx_create(log_pool, io_ctx); - if (ret < 0) { - return ret; - } - - ret = cls_timeindex_trim(io_ctx, oid, start_time, end_time, + int ret = cls_timeindex_trim(objexp_pool_ctx, oid, start_time, end_time, from_marker, to_marker); if ((ret < 0 ) && (ret != -ENOENT)) { return ret; diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index 0ee187409d592..68faada74bd28 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ 
-1204,6 +1204,7 @@ class RGWRados /** Open the pool used as root for this gateway */ int open_root_pool_ctx(); int open_gc_pool_ctx(); + int open_objexp_pool_ctx(); int open_bucket_pool_ctx(const string& bucket_name, const string& pool, librados::IoCtx& io_ctx); int open_bucket_index_ctx(rgw_bucket& bucket, librados::IoCtx& index_ctx); @@ -1282,6 +1283,7 @@ class RGWRados std::map rados_map; librados::IoCtx gc_pool_ctx; // .rgw.gc + librados::IoCtx objexp_pool_ctx; bool pools_initialized; From 3b0636e9a3548a286444141ff2d3ba9705640af6 Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Mon, 25 May 2015 17:15:54 +0200 Subject: [PATCH 147/654] rgw: make the rgw-object-expirer's options more human readable. Fixes: #4096 Signed-off-by: Radoslaw Zarzynski --- src/common/config_opts.h | 2 +- src/rgw/rgw_rados.cc | 17 ++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 47a4ea6411071..b634f41d180c4 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -1086,7 +1086,7 @@ OPTION(rgw_multipart_part_upload_limit, OPT_INT, 10000) // parts limit in multip OPTION(rgw_olh_pending_timeout_sec, OPT_INT, 3600) // time until we retire a pending olh change OPTION(rgw_objexp_gc_interval, OPT_U32, 60 * 10) // maximum time between round of expired objects garbage collecting -OPTION(rgw_objexp_time_step_exp, OPT_U32, 12) // exponent value (2 is the base) for rounding the timestamps +OPTION(rgw_objexp_time_step, OPT_U32, 4096) // number of seconds for rounding the timestamps OPTION(rgw_objexp_hints_num_shards, OPT_U32, 127) // maximum number of parts in which the hint index is stored in OPTION(mutex_perf_counter, OPT_BOOL, false) // enable/disable mutex perf counter diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index e58bbbdfefd1e..a172d8bfd0172 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -2339,7 +2339,7 @@ int RGWRados::time_log_trim(const string& oid, 
const utime_t& start_time, const string RGWRados::objexp_hint_get_shardname(const utime_t &ts) { - const time_t roundedts = ts.sec() >> cct->_conf->rgw_objexp_time_step_exp; + const time_t roundedts = ts.sec() / cct->_conf->rgw_objexp_time_step; const unsigned int shnum = roundedts % cct->_conf->rgw_objexp_hints_num_shards; char buf[32]; @@ -2387,20 +2387,19 @@ void RGWRados::objexp_get_shard(const utime_t& start_time, marker = start_time; } - const uint32_t time_step_exp = cct->_conf->rgw_objexp_time_step_exp; const uint32_t num_shards = cct->_conf->rgw_objexp_hints_num_shards; - const time_t time_step = 1 << time_step_exp; + const time_t time_step = cct->_conf->rgw_objexp_time_step; - const time_t sts = start_time.sec() >> time_step_exp; - const time_t ets = end_time.sec() >> time_step_exp; - const time_t mts = marker.sec() >> time_step_exp; + const time_t sts = start_time.sec() / time_step; + const time_t ets = end_time.sec() / time_step; + const time_t mts = marker.sec() / time_step; - const uint32_t periods = (ets - sts) / time_step; - const uint32_t iters = min(periods, num_shards); + const uint32_t periods = ets - sts; + const uint32_t iters = min(periods, num_shards - 1); shard = objexp_hint_get_shardname(marker); - if (mts % num_shards < (sts + iters) % num_shards) { + if (mts - sts < iters) { truncated = true; marker += utime_t(time_step, 0); } else { From 1fa376c248018c54200da4511be723bcee298d90 Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Mon, 25 May 2015 17:55:09 +0200 Subject: [PATCH 148/654] rgw: make object removal atomic in rgw-object-expirer. 
Fixes: #4099 Signed-off-by: Radoslaw Zarzynski --- src/rgw/rgw_object_expirer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rgw/rgw_object_expirer.cc b/src/rgw/rgw_object_expirer.cc index 5f29afcb14deb..97b5a27e58d73 100644 --- a/src/rgw/rgw_object_expirer.cc +++ b/src/rgw/rgw_object_expirer.cc @@ -81,7 +81,6 @@ static int garbage_single_object(objexp_hint_entry& hint) return ret; } - /* TODO: check whether the hint is actual. */ RGWObjectCtx rctx(store); rgw_obj_key key = hint.obj_key; @@ -90,6 +89,7 @@ static int garbage_single_object(objexp_hint_entry& hint) } rgw_obj obj(bucket_info.bucket, key); + store->set_atomic(&rctx, obj); ret = store->delete_obj(rctx, bucket_info, obj, bucket_info.versioning_status(), 0, hint.exp_time); From 05c90e68c33af9ba2bf1f9690a0a6da2c01f6c0b Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Fri, 29 May 2015 14:17:28 +0200 Subject: [PATCH 149/654] rgw: split rgw-object-expirer. Fixes: #4099 Signed-off-by: Radoslaw Zarzynski --- src/rgw/Makefile.am | 3 +- src/rgw/rgw_object_expirer.cc | 139 +---------------------- src/rgw/rgw_object_expirer_core.cc | 172 +++++++++++++++++++++++++++++ src/rgw/rgw_object_expirer_core.h | 65 +++++++++++ 4 files changed, 242 insertions(+), 137 deletions(-) create mode 100644 src/rgw/rgw_object_expirer_core.cc create mode 100644 src/rgw/rgw_object_expirer_core.h diff --git a/src/rgw/Makefile.am b/src/rgw/Makefile.am index 235ddadfd66db..c1baa6a3490da 100644 --- a/src/rgw/Makefile.am +++ b/src/rgw/Makefile.am @@ -45,7 +45,8 @@ librgw_la_SOURCES = \ rgw/rgw_replica_log.cc \ rgw/rgw_keystone.cc \ rgw/rgw_quota.cc \ - rgw/rgw_dencoder.cc + rgw/rgw_dencoder.cc \ + rgw/rgw_object_expirer_core.cc librgw_la_CXXFLAGS = -Woverloaded-virtual ${AM_CXXFLAGS} noinst_LTLIBRARIES += librgw.la diff --git a/src/rgw/rgw_object_expirer.cc b/src/rgw/rgw_object_expirer.cc index 97b5a27e58d73..b492e82eadc87 100644 --- a/src/rgw/rgw_object_expirer.cc +++ b/src/rgw/rgw_object_expirer.cc @@ -31,6 
+31,7 @@ using namespace std; #include "rgw_formats.h" #include "rgw_usage.h" #include "rgw_replica_log.h" +#include "rgw_object_expirer_core.h" #define dout_subsys ceph_subsys_rgw @@ -58,141 +59,6 @@ static inline utime_t get_last_run_time(void) return utime_t(); } -static int init_bucket_info(const string& bucket_name, - const string& bucket_id, - RGWBucketInfo& bucket_info) -{ - RGWObjectCtx obj_ctx(store); - const string bucket_instance_id = bucket_name + ":" + bucket_id; - - int ret = store->get_bucket_instance_info(obj_ctx, bucket_instance_id, - bucket_info, NULL, NULL); - - return ret; -} - -static int garbage_single_object(objexp_hint_entry& hint) -{ - RGWBucketInfo bucket_info; - - int ret = init_bucket_info(hint.bucket_name, hint.bucket_id, bucket_info); - if (ret < 0) { - dout(1) << "ERROR: could not init bucket: " << cpp_strerror(-ret) << dendl; - return ret; - } - - RGWObjectCtx rctx(store); - - rgw_obj_key key = hint.obj_key; - if (key.instance.empty()) { - key.instance = "null"; - } - - rgw_obj obj(bucket_info.bucket, key); - store->set_atomic(&rctx, obj); - ret = store->delete_obj(rctx, bucket_info, obj, - bucket_info.versioning_status(), 0, hint.exp_time); - - return ret; -} - -static void garbage_chunk(list& entries, /* in */ - bool& need_trim) /* out */ -{ - need_trim = false; - - for (list::iterator iter = entries.begin(); - iter != entries.end(); - ++iter) - { - objexp_hint_entry hint; - dout(15) << "===== got removal hint for: " << iter->key_ts.sec() << " - " << iter->key_ext << dendl; - - int ret = store->objexp_hint_parse(*iter, hint); - if (ret < 0) { - dout(1) << "cannot parse removal hint for " << hint.obj_key << dendl; - continue; - } - - /* PRECOND_FAILED simply means that our hint is not valid. - * We can silently ignore that and move forward. 
*/ - ret = garbage_single_object(hint); - if (ret == -ERR_PRECONDITION_FAILED) { - dout(15) << "not actual hint for object: " << hint.obj_key << dendl; - } else if (ret < 0) { - dout(1) << "cannot remove expired object: " << hint.obj_key << dendl; - } - - need_trim = true; - } - - return; -} - -static void trim_chunk(const string& shard, - const utime_t& from, - const utime_t& to) -{ - dout(20) << "trying to trim removal hints to " << to << dendl; - - int ret = store->objexp_hint_trim(shard, from, to); - if (ret < 0) { - dout(0) << "ERROR during trim: " << ret << dendl; - } - - return; -} - -static void proceed_single_shard(const string& shard, - const utime_t& last_run, - const utime_t& round_start) -{ - string marker; - string out_marker; - bool truncated = false; - - do { - list entries; - int ret = store->objexp_hint_list(shard, last_run, round_start, - 1000, marker, entries, - &out_marker, &truncated); - if (ret < 0) { - dout(10) << "cannot get removal hints from shard: " << shard << dendl; - continue; - } - - bool need_trim; - garbage_chunk(entries, need_trim); - - if (need_trim) { - trim_chunk(shard, last_run, round_start); - } - - marker = out_marker; - } while (truncated); - - return; -} - -static void inspect_all_shards(const utime_t& last_run, - const utime_t& round_start) -{ - bool is_next_available; - utime_t shard_marker; - - do { - string shard; - store->objexp_get_shard(last_run, round_start, shard_marker, shard, - is_next_available); - - dout(20) << "proceeding shard = " << shard << dendl; - - proceed_single_shard(shard, last_run, round_start); - } while (is_next_available); - - return; -} - int main(const int argc, const char **argv) { vector args; @@ -230,9 +96,10 @@ int main(const int argc, const char **argv) StoreDestructor store_dtor(store); utime_t last_run = get_last_run_time(); + ObjectExpirer objexp(store); while (true) { const utime_t round_start = ceph_clock_now(g_ceph_context); - inspect_all_shards(last_run, round_start); + 
objexp.inspect_all_shards(last_run, round_start); last_run = round_start; diff --git a/src/rgw/rgw_object_expirer_core.cc b/src/rgw/rgw_object_expirer_core.cc new file mode 100644 index 0000000000000..e2cc6b9c206c4 --- /dev/null +++ b/src/rgw/rgw_object_expirer_core.cc @@ -0,0 +1,172 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include +#include + +using namespace std; + +#include "auth/Crypto.h" + +#include "common/armor.h" +#include "common/ceph_json.h" +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/Formatter.h" +#include "common/errno.h" + +#include "global/global_init.h" + +#include "include/utime.h" +#include "include/str_list.h" + +#include "rgw_user.h" +#include "rgw_bucket.h" +#include "rgw_rados.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" +#include "rgw_log.h" +#include "rgw_formats.h" +#include "rgw_usage.h" +#include "rgw_replica_log.h" +#include "rgw_object_expirer_core.h" + +#define dout_subsys ceph_subsys_rgw + +int ObjectExpirer::init_bucket_info(const string& bucket_name, + const string& bucket_id, + RGWBucketInfo& bucket_info) +{ + RGWObjectCtx obj_ctx(store); + const string bucket_instance_id = bucket_name + ":" + bucket_id; + + int ret = store->get_bucket_instance_info(obj_ctx, bucket_instance_id, + bucket_info, NULL, NULL); + + return ret; +} + +int ObjectExpirer::garbage_single_object(objexp_hint_entry& hint) +{ + RGWBucketInfo bucket_info; + + int ret = init_bucket_info(hint.bucket_name, hint.bucket_id, bucket_info); + if (ret < 0) { + dout(1) << "ERROR: could not init bucket: " << cpp_strerror(-ret) << dendl; + return ret; + } + + RGWObjectCtx rctx(store); + + rgw_obj_key key = hint.obj_key; + if (key.instance.empty()) { + key.instance = "null"; + } + + rgw_obj obj(bucket_info.bucket, key); + store->set_atomic(&rctx, obj); + ret = store->delete_obj(rctx, bucket_info, obj, + bucket_info.versioning_status(), 0, 
hint.exp_time); + + return ret; +} + +void ObjectExpirer::garbage_chunk(list& entries, /* in */ + bool& need_trim) /* out */ +{ + need_trim = false; + + for (list::iterator iter = entries.begin(); + iter != entries.end(); + ++iter) + { + objexp_hint_entry hint; + dout(15) << "got removal hint for: " << iter->key_ts.sec() \ + << " - " << iter->key_ext << dendl; + + int ret = store->objexp_hint_parse(*iter, hint); + if (ret < 0) { + dout(1) << "cannot parse removal hint for " << hint.obj_key << dendl; + continue; + } + + /* PRECOND_FAILED simply means that our hint is not valid. + * We can silently ignore that and move forward. */ + ret = garbage_single_object(hint); + if (ret == -ERR_PRECONDITION_FAILED) { + dout(15) << "not actual hint for object: " << hint.obj_key << dendl; + } else if (ret < 0) { + dout(1) << "cannot remove expired object: " << hint.obj_key << dendl; + } + + need_trim = true; + } + + return; +} + +void ObjectExpirer::trim_chunk(const string& shard, + const utime_t& from, + const utime_t& to) +{ + dout(20) << "trying to trim removal hints to " << to << dendl; + + int ret = store->objexp_hint_trim(shard, from, to); + if (ret < 0) { + dout(0) << "ERROR during trim: " << ret << dendl; + } + + return; +} + +void ObjectExpirer::proceed_single_shard(const string& shard, + const utime_t& last_run, + const utime_t& round_start) +{ + string marker; + string out_marker; + bool truncated = false; + + do { + list entries; + int ret = store->objexp_hint_list(shard, last_run, round_start, + 1000, marker, entries, + &out_marker, &truncated); + if (ret < 0) { + dout(10) << "cannot get removal hints from shard: " << shard << dendl; + continue; + } + + bool need_trim; + garbage_chunk(entries, need_trim); + + if (need_trim) { + trim_chunk(shard, last_run, round_start); + } + + marker = out_marker; + } while (truncated); + + return; +} + +void ObjectExpirer::inspect_all_shards(const utime_t& last_run, + const utime_t& round_start) +{ + bool is_next_available; + 
utime_t shard_marker; + + do { + string shard; + store->objexp_get_shard(last_run, round_start, shard_marker, shard, + is_next_available); + + dout(20) << "proceeding shard = " << shard << dendl; + + proceed_single_shard(shard, last_run, round_start); + } while (is_next_available); + + return; +} diff --git a/src/rgw/rgw_object_expirer_core.h b/src/rgw/rgw_object_expirer_core.h new file mode 100644 index 0000000000000..a1701057a6bce --- /dev/null +++ b/src/rgw/rgw_object_expirer_core.h @@ -0,0 +1,65 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_OBJEXP_H +#define CEPH_OBJEXP_H + +#include +#include +#include +#include + +#include "auth/Crypto.h" + +#include "common/armor.h" +#include "common/ceph_json.h" +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/Formatter.h" +#include "common/errno.h" + +#include "global/global_init.h" + +#include "include/utime.h" +#include "include/str_list.h" + +#include "rgw_user.h" +#include "rgw_bucket.h" +#include "rgw_rados.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" +#include "rgw_log.h" +#include "rgw_formats.h" +#include "rgw_usage.h" +#include "rgw_replica_log.h" + +class ObjectExpirer { +protected: + RGWRados * const store; + + int init_bucket_info(const string& bucket_name, + const string& bucket_id, + RGWBucketInfo& bucket_info); + +public: + ObjectExpirer(RGWRados * const _store) + : store(_store) + {} + + int garbage_single_object(objexp_hint_entry& hint); + + void garbage_chunk(list& entries, /* in */ + bool& need_trim); /* out */ + + void trim_chunk(const string& shard, + const utime_t& from, + const utime_t& to); + + void proceed_single_shard(const string& shard, + const utime_t& last_run, + const utime_t& round_start); + + void inspect_all_shards(const utime_t& last_run, + const utime_t& round_start); +}; +#endif /* CEPH_OBJEXP_H */ From cdce7a21f915bb2f22a6d21bfafc538e4bde7271 Mon Sep 17 00:00:00 2001 From: 
Radoslaw Zarzynski Date: Tue, 18 Aug 2015 13:10:02 +0200 Subject: [PATCH 150/654] rgw: integrate Swift object expiration-related things with CMake. Fixes: #4099 Signed-off-by: Radoslaw Zarzynski --- src/CMakeLists.txt | 22 +++++++++++++++++----- src/cls/CMakeLists.txt | 7 +++++++ src/test/CMakeLists.txt | 2 ++ 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5b8298e130830..402d8a6dc1ba0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -898,7 +898,8 @@ if(${WITH_RADOSGW}) rgw/rgw_replica_log.cc rgw/rgw_keystone.cc rgw/rgw_quota.cc - rgw/rgw_dencoder.cc) + rgw/rgw_dencoder.cc + rgw/rgw_object_expirer_core.cc) add_library(rgw_a STATIC ${rgw_a_srcs}) @@ -931,19 +932,30 @@ if(${WITH_RADOSGW}) rgw/rgw_admin.cc rgw/rgw_orphan.cc) + set(radosgw_object_expirer_srcs + rgw/rgw_object_expirer.cc) + add_executable(radosgw ${radosgw_srcs} $) target_link_libraries(radosgw rgw_a librados cls_rgw_client cls_lock_client cls_refcount_client - cls_log_client cls_statelog_client cls_version_client - cls_replica_log_client cls_user_client + cls_log_client cls_statelog_client cls_timeindex_client + cls_version_client cls_replica_log_client cls_user_client curl expat global fcgi resolv ${BLKID_LIBRARIES} ${TCMALLOC_LIBS}) install(TARGETS radosgw DESTINATION bin) add_executable(radosgw-admin ${radosgw_admin_srcs} $) target_link_libraries(radosgw-admin rgw_a librados cls_rgw_client cls_lock_client cls_refcount_client - cls_log_client cls_statelog_client cls_version_client - cls_replica_log_client cls_user_client + cls_log_client cls_statelog_client cls_timeindex_client + cls_version_client cls_replica_log_client cls_user_client curl expat global fcgi resolv ${BLKID_LIBRARIES} ${TCMALLOC_LIBS}) install(TARGETS radosgw-admin DESTINATION bin) + + add_executable(radosgw-object-expirer ${radosgw_object_expirer_srcs} $) + target_link_libraries(radosgw-object-expirer rgw_a librados + cls_rgw_client cls_lock_client 
cls_refcount_client + cls_log_client cls_statelog_client cls_timeindex_client + cls_version_client cls_replica_log_client cls_user_client + curl expat global fcgi resolv ${TCMALLOC_LIBS}) + install(TARGETS radosgw-object-expirer DESTINATION bin) endif(${WITH_RADOSGW}) diff --git a/src/cls/CMakeLists.txt b/src/cls/CMakeLists.txt index c6abc1f664be7..c8befd05c2a51 100644 --- a/src/cls/CMakeLists.txt +++ b/src/cls/CMakeLists.txt @@ -60,6 +60,13 @@ install(TARGETS cls_statelog DESTINATION lib/rados-classes) add_library(cls_statelog_client statelog/cls_statelog_client.cc) +# cls_timeindex +add_library(cls_timeindex SHARED timeindex/cls_timeindex.cc) +set_target_properties(cls_timeindex PROPERTIES VERSION "1.0.0" SOVERSION "1") +install(TARGETS cls_timeindex DESTINATION lib/rados-classes) + +add_library(cls_timeindex_client timeindex/cls_timeindex_client.cc) + # cls_replica_log add_library(cls_replica_log SHARED replica_log/cls_replica_log.cc) set_target_properties(cls_replica_log PROPERTIES VERSION "1.0.0" SOVERSION "1") diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 8763f972f1cfe..a2f216c5caa33 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -931,6 +931,7 @@ if(${WITH_RADOSGW}) cls_refcount_client cls_log_client cls_statelog_client + cls_timeindex_client cls_version_client cls_replica_log_client cls_kvs @@ -1017,6 +1018,7 @@ if(${WITH_RADOSGW}) cls_version_client cls_log_client cls_statelog_client + cls_timeindex_client cls_refcount_client cls_rgw_client cls_user_client From faac0b1f164e43ba6bc8c11b7c84734fde30a58b Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Thu, 27 Aug 2015 11:05:47 -0700 Subject: [PATCH 151/654] rgw: create a worker thread for object expiration Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_object_expirer.cc | 32 +++----------- src/rgw/rgw_object_expirer_core.cc | 71 +++++++++++++++++++++++++++--- src/rgw/rgw_object_expirer_core.h | 27 +++++++++++- 3 files changed, 97 insertions(+), 33 deletions(-) diff 
--git a/src/rgw/rgw_object_expirer.cc b/src/rgw/rgw_object_expirer.cc index b492e82eadc87..63f4e967db47a 100644 --- a/src/rgw/rgw_object_expirer.cc +++ b/src/rgw/rgw_object_expirer.cc @@ -54,11 +54,6 @@ static void usage() generic_server_usage(); } -static inline utime_t get_last_run_time(void) -{ - return utime_t(); -} - int main(const int argc, const char **argv) { vector args; @@ -95,28 +90,15 @@ int main(const int argc, const char **argv) /* Guard to not forget about closing the rados store. */ StoreDestructor store_dtor(store); - utime_t last_run = get_last_run_time(); - ObjectExpirer objexp(store); + RGWObjectExpirer objexp(store); + objexp.start_processor(); + + const utime_t interval(g_ceph_context->_conf->rgw_objexp_gc_interval, 0); while (true) { - const utime_t round_start = ceph_clock_now(g_ceph_context); - objexp.inspect_all_shards(last_run, round_start); - - last_run = round_start; - - /* End of the real work for now. Prepare for sleep. */ - const utime_t round_time = ceph_clock_now(g_ceph_context) - round_start; - const utime_t interval(g_ceph_context->_conf->rgw_objexp_gc_interval, 0); - - if (round_time < interval) { - /* This should be the main path of execution. All currently expired - * objects have been removed and we need go sleep waiting for the next - * turn. If the check isn't true, it means we have to much hints - * in relation to interval time. 
*/ - const utime_t sleep_period = interval - round_time; - dout(20) << "sleeping for " << sleep_period << dendl; - sleep_period.sleep(); - } + interval.sleep(); } + /* unreachable */ + return EXIT_SUCCESS; } diff --git a/src/rgw/rgw_object_expirer_core.cc b/src/rgw/rgw_object_expirer_core.cc index e2cc6b9c206c4..5ae562ae27c72 100644 --- a/src/rgw/rgw_object_expirer_core.cc +++ b/src/rgw/rgw_object_expirer_core.cc @@ -35,7 +35,7 @@ using namespace std; #define dout_subsys ceph_subsys_rgw -int ObjectExpirer::init_bucket_info(const string& bucket_name, +int RGWObjectExpirer::init_bucket_info(const string& bucket_name, const string& bucket_id, RGWBucketInfo& bucket_info) { @@ -48,7 +48,7 @@ int ObjectExpirer::init_bucket_info(const string& bucket_name, return ret; } -int ObjectExpirer::garbage_single_object(objexp_hint_entry& hint) +int RGWObjectExpirer::garbage_single_object(objexp_hint_entry& hint) { RGWBucketInfo bucket_info; @@ -73,7 +73,7 @@ int ObjectExpirer::garbage_single_object(objexp_hint_entry& hint) return ret; } -void ObjectExpirer::garbage_chunk(list& entries, /* in */ +void RGWObjectExpirer::garbage_chunk(list& entries, /* in */ bool& need_trim) /* out */ { need_trim = false; @@ -107,7 +107,7 @@ void ObjectExpirer::garbage_chunk(list& entries, /* in return; } -void ObjectExpirer::trim_chunk(const string& shard, +void RGWObjectExpirer::trim_chunk(const string& shard, const utime_t& from, const utime_t& to) { @@ -121,7 +121,7 @@ void ObjectExpirer::trim_chunk(const string& shard, return; } -void ObjectExpirer::proceed_single_shard(const string& shard, +void RGWObjectExpirer::proceed_single_shard(const string& shard, const utime_t& last_run, const utime_t& round_start) { @@ -152,7 +152,7 @@ void ObjectExpirer::proceed_single_shard(const string& shard, return; } -void ObjectExpirer::inspect_all_shards(const utime_t& last_run, +void RGWObjectExpirer::inspect_all_shards(const utime_t& last_run, const utime_t& round_start) { bool is_next_available; @@ -170,3 
+170,62 @@ void ObjectExpirer::inspect_all_shards(const utime_t& last_run, return; } + +bool RGWObjectExpirer::going_down() +{ + return (down_flag.read() != 0); +} + +void RGWObjectExpirer::start_processor() +{ + worker = new OEWorker(store->ctx(), this); + worker->create(); +} + +void RGWObjectExpirer::stop_processor() +{ + down_flag.set(1); + if (worker) { + worker->stop(); + worker->join(); + } + delete worker; + worker = NULL; +} + +void *RGWObjectExpirer::OEWorker::entry() { + utime_t last_run; + do { + utime_t start = ceph_clock_now(cct); + dout(2) << "object expiration: start" << dendl; + oe->inspect_all_shards(last_run, start); + dout(2) << "object expiration: stop" << dendl; + + last_run = start; + + if (oe->going_down()) + break; + + utime_t end = ceph_clock_now(cct); + end -= start; + int secs = cct->_conf->rgw_objexp_gc_interval; + + if (secs <= end.sec()) + continue; // next round + + secs -= end.sec(); + + lock.Lock(); + cond.WaitInterval(cct, lock, utime_t(secs, 0)); + lock.Unlock(); + } while (!oe->going_down()); + + return NULL; +} + +void RGWObjectExpirer::OEWorker::stop() +{ + Mutex::Locker l(lock); + cond.Signal(); +} + diff --git a/src/rgw/rgw_object_expirer_core.h b/src/rgw/rgw_object_expirer_core.h index a1701057a6bce..47ef376b33bb8 100644 --- a/src/rgw/rgw_object_expirer_core.h +++ b/src/rgw/rgw_object_expirer_core.h @@ -18,6 +18,10 @@ #include "common/Formatter.h" #include "common/errno.h" +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/Thread.h" + #include "global/global_init.h" #include "include/utime.h" @@ -33,7 +37,7 @@ #include "rgw_usage.h" #include "rgw_replica_log.h" -class ObjectExpirer { +class RGWObjectExpirer { protected: RGWRados * const store; @@ -41,8 +45,23 @@ class ObjectExpirer { const string& bucket_id, RGWBucketInfo& bucket_info); + class OEWorker : public Thread { + CephContext *cct; + RGWObjectExpirer *oe; + Mutex lock; + Cond cond; + + public: + OEWorker(CephContext *_cct, RGWObjectExpirer *_oe) 
: cct(_cct), oe(_oe), lock("OEWorker") {} + void *entry(); + void stop(); + }; + + OEWorker *worker; + atomic_t down_flag; + public: - ObjectExpirer(RGWRados * const _store) + RGWObjectExpirer(RGWRados * const _store) : store(_store) {} @@ -61,5 +80,9 @@ class ObjectExpirer { void inspect_all_shards(const utime_t& last_run, const utime_t& round_start); + + bool going_down(); + void start_processor(); + void stop_processor(); }; #endif /* CEPH_OBJEXP_H */ From 14e02bc90a463805f4c3e2de210892067a52514b Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 27 Aug 2015 11:08:33 -0700 Subject: [PATCH 152/654] PG::handle_advance_map: on_pool_change after handling the map change Otherwise, the is_active() checks in the hitset code can erroneously return true firing off repops stamped with the new epoch which then get cleared in the map change code. The filestore callbacks then pass the interval check and call into a destroyed repop structure. Fixes: 12809 Backport: hammer,firefly Signed-off-by: Samuel Just --- src/osd/PG.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 36c664129a536..cf8faf696ec92 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -5279,12 +5279,12 @@ void PG::handle_advance_map( << dendl; update_osdmap_ref(osdmap); pool.update(osdmap); - if (pool.info.last_change == osdmap_ref->get_epoch()) - on_pool_change(); AdvMap evt( osdmap, lastmap, newup, up_primary, newacting, acting_primary); recovery_state.handle_event(evt, rctx); + if (pool.info.last_change == osdmap_ref->get_epoch()) + on_pool_change(); } void PG::handle_activate_map(RecoveryCtx *rctx) From 470f970488fc5e45afba6fef1ca05ffaa0add036 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 27 Aug 2015 13:57:19 -0400 Subject: [PATCH 153/654] include/ceph_features: define HAMMER_0_94_4 feature This is to constrain upgrades past hammer to version that include the appropriate compatibility fixes (e.g., hobject_t encoding). 
Signed-off-by: Sage Weil --- src/include/ceph_features.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h index 78e0fbef09a63..4857b0a8eb124 100644 --- a/src/include/ceph_features.h +++ b/src/include/ceph_features.h @@ -68,6 +68,7 @@ #define CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES (1ULL<<52) #define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53) #define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54) +#define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55) #define CEPH_FEATURE_RESERVED2 (1ULL<<61) /* slow down, we are almost out... */ #define CEPH_FEATURE_RESERVED (1ULL<<62) /* DO NOT USE THIS ... last bit! */ @@ -157,7 +158,8 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) { CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT | \ CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 | \ CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES | \ - CEPH_FEATURE_OSD_HITSET_GMT | \ + CEPH_FEATURE_OSD_HITSET_GMT | \ + CEPH_FEATURE_HAMMER_0_94_4 | \ 0ULL) #define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL From f668c6cce1b6c4ca05863e10b57507c91f28fa0f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 27 Aug 2015 14:03:49 -0400 Subject: [PATCH 154/654] mon: use HAMMER_0_94_4 feature to require sufficiently new hammer - Do not allow post-hammer OSDs to start if the running hammer is not a new enough hammer. - Do not allow pre-hammer or old hammer OSDs to start once post-hammer OSDs are in the cluster. 
Signed-off-by: Sage Weil --- src/mon/OSDMonitor.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 25cf3c10e95e4..572d9c897dcfb 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -1871,19 +1871,19 @@ bool OSDMonitor::preprocess_boot(MonOpRequestRef op) } // make sure upgrades stop at hammer - // * OSD_PROXY_FEATURES is the last pre-hammer feature + // * HAMMER_0_94_4 is the required hammer feature // * MON_METADATA is the first post-hammer feature if (osdmap.get_num_up_osds() > 0) { if ((m->osd_features & CEPH_FEATURE_MON_METADATA) && - !(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_PROXY_FEATURES)) { + !(osdmap.get_up_osd_features() & CEPH_FEATURE_HAMMER_0_94_4)) { mon->clog->info() << "disallowing boot of post-hammer OSD " << m->get_orig_source_inst() - << " because one or more up OSDs is pre-hammer\n"; + << " because one or more up OSDs is pre-hammer v0.94.4\n"; goto ignore; } - if (!(m->osd_features & CEPH_FEATURE_OSD_PROXY_FEATURES) && + if (!(m->osd_features & CEPH_FEATURE_HAMMER_0_94_4) && (osdmap.get_up_osd_features() & CEPH_FEATURE_MON_METADATA)) { - mon->clog->info() << "disallowing boot of pre-hammer OSD " + mon->clog->info() << "disallowing boot of pre-hammer v0.94.4 OSD " << m->get_orig_source_inst() << " because all up OSDs are post-hammer\n"; goto ignore; From 38465f0d7065c41070eab6b2326f8a1f0b8de332 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 27 Aug 2015 14:08:06 -0400 Subject: [PATCH 155/654] osd: refuse to boot if any pre-hammer or old hammer ( --- src/osd/OSD.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index f6a1fed1b0cfc..26a640972a62a 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -4441,6 +4441,10 @@ void OSD::_maybe_boot(epoch_t oldest, epoch_t newest) } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && !store->can_sort_nibblewise()) { dout(1) << "osdmap SORTBITWISE flag is NOT set 
but our backend does not support nibblewise sort" << dendl; + } else if (osdmap->get_num_up_osds() && + (osdmap->get_up_osd_features() & CEPH_FEATURE_HAMMER_0_94_4) == 0) { + dout(1) << "osdmap indicates one or more pre-v0.94.4 hammer OSDs is running" + << dendl; } else if (is_waiting_for_healthy() || !_is_healthy()) { // if we are not healthy, do not mark ourselves up (yet) dout(1) << "not healthy; waiting to boot" << dendl; From 2bc5a48f4c5d3667213be3a7b5a0e0f5ef9daf4f Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 27 Aug 2015 11:24:25 -0700 Subject: [PATCH 156/654] osd: Decode use_gmt_hitset with a unique version Signed-off-by: David Zafman --- src/osd/osd_types.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 975d51391cd8e..50a80b06cb1fb 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -1256,7 +1256,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const return; } - ENCODE_START(20, 5, bl); + ENCODE_START(21, 5, bl); ::encode(type, bl); ::encode(size, bl); ::encode(crush_ruleset, bl); @@ -1306,7 +1306,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const void pg_pool_t::decode(bufferlist::iterator& bl) { - DECODE_START_LEGACY_COMPAT_LEN(20, 5, 5, bl); + DECODE_START_LEGACY_COMPAT_LEN(21, 5, 5, bl); ::decode(type, bl); ::decode(size, bl); ::decode(crush_ruleset, bl); @@ -1425,9 +1425,12 @@ void pg_pool_t::decode(bufferlist::iterator& bl) } if (struct_v >= 20) { ::decode(min_write_recency_for_promote, bl); - ::decode(use_gmt_hitset, bl); } else { min_write_recency_for_promote = 1; + } + if (struct_v >= 21) { + ::decode(use_gmt_hitset, bl); + } else { use_gmt_hitset = false; } DECODE_FINISH(bl); From 4f9a84310e847f10b6c3d0fc573e6dd06bc095c6 Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Mon, 18 May 2015 16:54:34 +0200 Subject: [PATCH 157/654] rgw: add basic support for X-Delete-At header of Swift API. 
Fixes: #4099 Signed-off-by: Radoslaw Zarzynski --- src/rgw/rgw_op.cc | 20 ++++++++++++++++++++ src/rgw/rgw_op.h | 2 ++ src/rgw/rgw_rest_swift.cc | 27 +++++++++++++++++++++++++++ src/rgw/rgw_rest_swift.h | 1 + 4 files changed, 50 insertions(+) diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index c3827714d0f1e..35a49b0e2b509 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -962,6 +962,19 @@ void RGWGetObj::execute() return; } + /* Check whether the object has expired. Swift API documentation + * stands that we should return 404 Not Found in such case. */ + attr_iter = attrs.find(RGW_ATTR_DELETE_AT); + if (need_object_expiration() && attr_iter != attrs.end()) { + utime_t delete_at; + ::decode(delete_at, attr_iter->second); + + if (delete_at <= ceph_clock_now(g_ceph_context)) { + ret = -ENOENT; + goto done_err; + } + } + ofs = new_ofs; end = new_end; @@ -2339,6 +2352,13 @@ void RGWPutMetadataObject::execute() /* Filter currently existing attributes. */ prepare_add_del_attrs(orig_attrs, attrs, rmattrs); populate_with_generic_attrs(s, attrs); + + if (!delete_at.is_zero()) { + bufferlist delatbl; + ::encode(delete_at, delatbl); + attrs[RGW_ATTR_DELETE_AT] = delatbl; + } + ret = store->set_attrs(s->obj_ctx, obj, attrs, &rmattrs, NULL); } diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h index 32e8c80fe4974..61d8c019fef5a 100644 --- a/src/rgw/rgw_op.h +++ b/src/rgw/rgw_op.h @@ -180,6 +180,7 @@ class RGWGetObj : public RGWOp { virtual const string name() { return "get_obj"; } virtual RGWOpType get_type() { return RGW_OP_GET_OBJ; } virtual uint32_t op_mask() { return RGW_OP_TYPE_READ; } + virtual bool need_object_expiration() { return false; } }; #define RGW_LIST_BUCKETS_LIMIT_MAX 10000 @@ -578,6 +579,7 @@ class RGWPutMetadataObject : public RGWOp { int ret; RGWAccessControlPolicy policy; string placement_rule; + utime_t delete_at; public: RGWPutMetadataObject() diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc index 39dee7867d329..e3377d326678b 
100644 --- a/src/rgw/rgw_rest_swift.cc +++ b/src/rgw/rgw_rest_swift.cc @@ -620,6 +620,26 @@ int RGWPutMetadataObject_ObjStore_SWIFT::get_params() return -EINVAL; } + /* Handle Swift object expiration. */ + utime_t delat_proposal; + string x_delete = s->info.env->get("HTTP_X_DELETE_AT", ""); + + if (!x_delete.empty()) { + string err; + long ts = strict_strtoll(x_delete.c_str(), 10, &err); + + if (!err.empty()) { + return -EINVAL; + } + + delat_proposal += utime_t(ts, 0); + if (delat_proposal < ceph_clock_now(g_ceph_context)) { + return -EINVAL; + } + + delete_at = delat_proposal; + } + placement_rule = s->info.env->get("HTTP_X_STORAGE_POLICY", ""); return 0; } @@ -683,6 +703,13 @@ static void dump_object_metadata(struct req_state * const s, for (riter = response_attrs.begin(); riter != response_attrs.end(); ++riter) { s->cio->print("%s: %s\r\n", riter->first.c_str(), riter->second.c_str()); } + + iter = attrs.find(RGW_ATTR_DELETE_AT); + if (iter != attrs.end()) { + utime_t delete_at; + ::decode(delete_at, iter->second); + s->cio->print("X-Delete-At: %lu\r\n", delete_at.sec()); + } } int RGWCopyObj_ObjStore_SWIFT::init_dest_policy() diff --git a/src/rgw/rgw_rest_swift.h b/src/rgw/rgw_rest_swift.h index 22283e468a0d2..a2838c7847170 100644 --- a/src/rgw/rgw_rest_swift.h +++ b/src/rgw/rgw_rest_swift.h @@ -14,6 +14,7 @@ class RGWGetObj_ObjStore_SWIFT : public RGWGetObj_ObjStore { ~RGWGetObj_ObjStore_SWIFT() {} int send_response_data(bufferlist& bl, off_t ofs, off_t len); + bool need_object_expiration() { return true; } }; class RGWListBuckets_ObjStore_SWIFT : public RGWListBuckets_ObjStore { From aa5f1b8c2f3a404bf4bbe7a442d62668e9b29718 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Thu, 27 Aug 2015 11:44:35 -0700 Subject: [PATCH 158/654] rgw: a few fixes, guard bufferlist decodes Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_op.cc | 8 +++++++- src/rgw/rgw_rest_swift.cc | 8 ++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/rgw/rgw_op.cc 
b/src/rgw/rgw_op.cc index 35a49b0e2b509..5aafe137938af 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -967,7 +967,13 @@ void RGWGetObj::execute() attr_iter = attrs.find(RGW_ATTR_DELETE_AT); if (need_object_expiration() && attr_iter != attrs.end()) { utime_t delete_at; - ::decode(delete_at, attr_iter->second); + try { + ::decode(delete_at, attr_iter->second); + } catch (buffer::error& err) { + ret = -EIO; + ldout(s->cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT " attribute" << dendl; + goto done_err; + } if (delete_at <= ceph_clock_now(g_ceph_context)) { ret = -ENOENT; diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc index e3377d326678b..1d3e6c143aabf 100644 --- a/src/rgw/rgw_rest_swift.cc +++ b/src/rgw/rgw_rest_swift.cc @@ -707,8 +707,12 @@ static void dump_object_metadata(struct req_state * const s, iter = attrs.find(RGW_ATTR_DELETE_AT); if (iter != attrs.end()) { utime_t delete_at; - ::decode(delete_at, iter->second); - s->cio->print("X-Delete-At: %lu\r\n", delete_at.sec()); + try { + ::decode(delete_at, iter->second); + s->cio->print("X-Delete-At: %lu\r\n", delete_at.sec()); + } catch (buffer::error& err) { + dout(0) << "ERROR: cannot decode object's " RGW_ATTR_DELETE_AT " attr, ignoring" << dendl; + } } } From 65949bd13c8004bf2f0edfb03ef71f8ad9f0e595 Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Wed, 13 May 2015 15:55:46 +0200 Subject: [PATCH 159/654] rgw: add support for X-Delete-After HTTP header of Swift API. Signed-off-by: Radoslaw Zarzynski --- src/rgw/rgw_rest_swift.cc | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc index 1d3e6c143aabf..7dea41012f7db 100644 --- a/src/rgw/rgw_rest_swift.cc +++ b/src/rgw/rgw_rest_swift.cc @@ -622,7 +622,15 @@ int RGWPutMetadataObject_ObjStore_SWIFT::get_params() /* Handle Swift object expiration. 
*/ utime_t delat_proposal; - string x_delete = s->info.env->get("HTTP_X_DELETE_AT", ""); + string x_delete = s->info.env->get("HTTP_X_DELETE_AFTER", ""); + + if (x_delete.empty()) { + x_delete = s->info.env->get("HTTP_X_DELETE_AT", ""); + } else { + /* X-Delete-After HTTP is present. It means we need add its value + * to the current time. */ + delat_proposal = ceph_clock_now(g_ceph_context); + } if (!x_delete.empty()) { string err; From f2f23c2d6e8657916a11e2a0c0d973273de3b2ef Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Wed, 13 May 2015 16:35:51 +0200 Subject: [PATCH 160/654] rgw: implement object_is_expired function. Signed-off-by: Radoslaw Zarzynski Signed-off-by: Yehuda Sadeh Conflicts: src/rgw/rgw_op.cc --- src/rgw/rgw_op.cc | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 5aafe137938af..0c76f1581bb6b 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -911,6 +911,25 @@ void RGWGetObj::pre_exec() rgw_bucket_object_pre_exec(s); } +static bool object_is_expired(map& attrs) { + map::iterator iter = attrs.find(RGW_ATTR_DELETE_AT); + if (iter != attrs.end()) { + utime_t delete_at; + try { + ::decode(delete_at, iter->second); + } catch (buffer::error& err) { + dout(0) << "ERROR: " << __func__ << ": failed to decode " RGW_ATTR_DELETE_AT " attr" << dendl; + return false; + } + + if (delete_at <= ceph_clock_now(g_ceph_context)) { + return true; + } + } + + return false; +} + void RGWGetObj::execute() { utime_t start_time = s->time; @@ -964,21 +983,9 @@ void RGWGetObj::execute() /* Check whether the object has expired. Swift API documentation * stands that we should return 404 Not Found in such case. 
*/ - attr_iter = attrs.find(RGW_ATTR_DELETE_AT); - if (need_object_expiration() && attr_iter != attrs.end()) { - utime_t delete_at; - try { - ::decode(delete_at, attr_iter->second); - } catch (buffer::error& err) { - ret = -EIO; - ldout(s->cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT " attribute" << dendl; - goto done_err; - } - - if (delete_at <= ceph_clock_now(g_ceph_context)) { - ret = -ENOENT; - goto done_err; - } + if (need_object_expiration() && object_is_expired(attrs)) { + ret = -ENOENT; + goto done_err; } ofs = new_ofs; From 478b14ee99cf560d71cc8e4baca831955f5b196a Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Wed, 13 May 2015 16:37:46 +0200 Subject: [PATCH 161/654] rgw: verify Swift object lifetime at POST. Signed-off-by: Radoslaw Zarzynski --- src/rgw/rgw_op.cc | 7 +++++++ src/rgw/rgw_op.h | 1 + src/rgw/rgw_rest_swift.h | 1 + 3 files changed, 9 insertions(+) diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 0c76f1581bb6b..3078e4197904d 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -2362,6 +2362,13 @@ void RGWPutMetadataObject::execute() return; } + /* Check whether the object has expired. Swift API documentation + * stands that we should return 404 Not Found in such case. */ + if (need_object_expiration() && object_is_expired(orig_attrs)) { + ret = -ENOENT; + return; + } + /* Filter currently existing attributes. 
*/ prepare_add_del_attrs(orig_attrs, attrs, rmattrs); populate_with_generic_attrs(s, attrs); diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h index 61d8c019fef5a..8781faeff4613 100644 --- a/src/rgw/rgw_op.h +++ b/src/rgw/rgw_op.h @@ -599,6 +599,7 @@ class RGWPutMetadataObject : public RGWOp { virtual const string name() { return "put_obj_metadata"; } virtual RGWOpType get_type() { return RGW_OP_PUT_METADATA_OBJECT; } virtual uint32_t op_mask() { return RGW_OP_TYPE_WRITE; } + virtual bool need_object_expiration() { return false; } }; class RGWDeleteObj : public RGWOp { diff --git a/src/rgw/rgw_rest_swift.h b/src/rgw/rgw_rest_swift.h index a2838c7847170..55b41bbc4e4a0 100644 --- a/src/rgw/rgw_rest_swift.h +++ b/src/rgw/rgw_rest_swift.h @@ -115,6 +115,7 @@ class RGWPutMetadataObject_ObjStore_SWIFT : public RGWPutMetadataObject_ObjStore int get_params(); void send_response(); + bool need_object_expiration() { return true; } }; class RGWDeleteObj_ObjStore_SWIFT : public RGWDeleteObj_ObjStore { From eee424c22ff0210b7fa086d3c17b4fea3cb3e5bd Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Thu, 27 Aug 2015 12:03:40 -0700 Subject: [PATCH 162/654] rgw: init object expirer thread Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_rados.cc | 16 +++++++++++++--- src/rgw/rgw_rados.h | 4 +++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index a172d8bfd0172..00996cad3bbfd 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -48,6 +48,7 @@ using namespace librados; #include "rgw_log.h" #include "rgw_gc.h" +#include "rgw_object_expirer_core.h" #define dout_subsys ceph_subsys_rgw @@ -1447,9 +1448,14 @@ void RGWRados::finalize() delete data_log; if (use_gc_thread) { gc->stop_processor(); - delete gc; - gc = NULL; + obj_expirer->stop_processor(); } + delete gc; + gc = NULL; + + delete obj_expirer; + obj_expirer = NULL; + delete rest_master_conn; map::iterator iter; @@ -1624,8 +1630,12 @@ int RGWRados::init_complete() gc 
= new RGWGC(); gc->initialize(cct, this); - if (use_gc_thread) + obj_expirer = new RGWObjectExpirer(this); + + if (use_gc_thread) { gc->start_processor(); + obj_expirer->start_processor(); + } quota_handler = RGWQuotaHandler::generate_handler(this, quota_threads); diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index 68faada74bd28..5ee67c5983a35 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -22,6 +22,7 @@ class RGWWatcher; class SafeTimer; class ACLOwner; class RGWGC; +class RGWObjectExpirer; /* flags for put_obj_meta() */ #define PUT_OBJ_CREATE 0x01 @@ -1241,6 +1242,7 @@ class RGWRados }; RGWGC *gc; + RGWObjectExpirer *obj_expirer; bool use_gc_thread; bool quota_threads; @@ -1297,7 +1299,7 @@ class RGWRados public: RGWRados() : max_req_id(0), lock("rados_timer_lock"), watchers_lock("watchers_lock"), timer(NULL), - gc(NULL), use_gc_thread(false), quota_threads(false), + gc(NULL), obj_expirer(NULL), use_gc_thread(false), quota_threads(false), num_watchers(0), watchers(NULL), watch_initialized(false), bucket_id_lock("rados_bucket_id"), From 3dbea3c3be9723a638cf3ba5a732c973b35345ed Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Thu, 27 Aug 2015 12:05:23 -0700 Subject: [PATCH 163/654] rgw: rename obj expiration hint oids Give a more clear name Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_rados.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 00996cad3bbfd..ace339433d068 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -2355,7 +2355,7 @@ string RGWRados::objexp_hint_get_shardname(const utime_t &ts) char buf[32]; snprintf(buf, sizeof(buf), "%010u", shnum); - string objname("time_index_hint."); + string objname("obj_delete_at_hint."); return objname + buf; } From e734b0a4ac959230082a3bea5bf96b9250745dd1 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Thu, 27 Aug 2015 12:34:04 -0700 Subject: [PATCH 164/654] radosgw-admin: a new command to run objects expirer $ 
radosgw-admin objects expire Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_admin.cc | 14 ++++++++++++++ src/rgw/rgw_rados.cc | 6 ++++++ src/rgw/rgw_rados.h | 1 + src/test/cli/radosgw-admin/help.t | 1 + 4 files changed, 22 insertions(+) diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc index 1140cbdbbc5f8..e8f12cd141d45 100644 --- a/src/rgw/rgw_admin.cc +++ b/src/rgw/rgw_admin.cc @@ -67,6 +67,7 @@ void _usage() cout << " bucket check check bucket index\n"; cout << " object rm remove object\n"; cout << " object unlink unlink object from bucket index\n"; + cout << " objects expire run expired objects cleanup\n"; cout << " quota set set quota params\n"; cout << " quota enable enable quota\n"; cout << " quota disable disable quota\n"; @@ -223,6 +224,7 @@ enum { OPT_OBJECT_UNLINK, OPT_OBJECT_STAT, OPT_OBJECT_REWRITE, + OPT_OBJECTS_EXPIRE, OPT_BI_GET, OPT_BI_PUT, OPT_BI_LIST, @@ -282,6 +284,7 @@ static int get_cmd(const char *cmd, const char *prev_cmd, bool *need_more) strcmp(cmd, "mdlog") == 0 || strcmp(cmd, "metadata") == 0 || strcmp(cmd, "object") == 0 || + strcmp(cmd, "objects") == 0 || strcmp(cmd, "olh") == 0 || strcmp(cmd, "opstate") == 0 || strcmp(cmd, "orphans") == 0 || @@ -391,6 +394,9 @@ static int get_cmd(const char *cmd, const char *prev_cmd, bool *need_more) return OPT_OBJECT_STAT; if (strcmp(cmd, "rewrite") == 0) return OPT_OBJECT_REWRITE; + } else if (strcmp(prev_cmd, "objects") == 0) { + if (strcmp(cmd, "expire") == 0) + return OPT_OBJECTS_EXPIRE; } else if (strcmp(prev_cmd, "olh") == 0) { if (strcmp(cmd, "get") == 0) return OPT_OLH_GET; @@ -2344,6 +2350,14 @@ int main(int argc, char **argv) } } + if (opt_cmd == OPT_OBJECTS_EXPIRE) { + int ret = store->process_expire_objects(); + if (ret < 0) { + cerr << "ERROR: process_expire_objects() processing returned error: " << cpp_strerror(-ret) << std::endl; + return 1; + } + } + if (opt_cmd == OPT_BUCKET_REWRITE) { if (bucket_name.empty()) { cerr << "ERROR: bucket not specified" << std::endl; diff --git 
a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index ace339433d068..8ebb7a4195a7f 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -8127,6 +8127,12 @@ int RGWRados::process_gc() return gc->process(); } +int RGWRados::process_expire_objects() +{ + obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now(cct)); + return 0; +} + int RGWRados::cls_rgw_init_index(librados::IoCtx& index_ctx, librados::ObjectWriteOperation& op, string& oid) { bufferlist in; diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index 5ee67c5983a35..791fa9ce1a066 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -2145,6 +2145,7 @@ class RGWRados int list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list& result, bool *truncated); int process_gc(); + int process_expire_objects(); int defer_gc(void *ctx, rgw_obj& obj); int bucket_check_index(rgw_bucket& bucket, diff --git a/src/test/cli/radosgw-admin/help.t b/src/test/cli/radosgw-admin/help.t index 33aee1d5eb332..d455b565ff4c3 100644 --- a/src/test/cli/radosgw-admin/help.t +++ b/src/test/cli/radosgw-admin/help.t @@ -24,6 +24,7 @@ bucket check check bucket index object rm remove object object unlink unlink object from bucket index + objects expire run expired objects cleanup quota set set quota params quota enable enable quota quota disable disable quota From c4a9a4b78470d541bc72b755fc774b40a8e37951 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Thu, 27 Aug 2015 13:05:26 -0700 Subject: [PATCH 165/654] rgw: objexp related fixes minor issues following code review Signed-off-by: Yehuda Sadeh --- src/cls/timeindex/cls_timeindex.cc | 2 +- src/common/config_opts.h | 1 + src/rgw/rgw_object_expirer_core.cc | 27 +++++++++++++++------------ src/rgw/rgw_object_expirer_core.h | 4 ++-- src/rgw/rgw_rados.cc | 3 ++- 5 files changed, 21 insertions(+), 16 deletions(-) diff --git a/src/cls/timeindex/cls_timeindex.cc b/src/cls/timeindex/cls_timeindex.cc index 4d2384c2b1ace..c5c1e093d1cd2 100644 --- 
a/src/cls/timeindex/cls_timeindex.cc +++ b/src/cls/timeindex/cls_timeindex.cc @@ -161,7 +161,7 @@ static int cls_timeindex_list(cls_method_context_t hctx, cls_timeindex_entry e; if (parse_index(index, e.key_ts, e.key_ext) < 0) { - CLS_LOG(1, "ERROR: cls_timeindex_list: could not parse index=%s", + CLS_LOG(0, "ERROR: cls_timeindex_list: could not parse index=%s", index.c_str()); } else { CLS_LOG(20, "DEBUG: cls_timeindex_list: index=%s, key_ext=%s, bl.len = %d", diff --git a/src/common/config_opts.h b/src/common/config_opts.h index b634f41d180c4..edb8d4911f589 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -1088,6 +1088,7 @@ OPTION(rgw_olh_pending_timeout_sec, OPT_INT, 3600) // time until we retire a pen OPTION(rgw_objexp_gc_interval, OPT_U32, 60 * 10) // maximum time between round of expired objects garbage collecting OPTION(rgw_objexp_time_step, OPT_U32, 4096) // number of seconds for rounding the timestamps OPTION(rgw_objexp_hints_num_shards, OPT_U32, 127) // maximum number of parts in which the hint index is stored in +OPTION(rgw_objexp_chunk_size, OPT_U32, 100) // maximum number of entries in a single operation when processing objexp data OPTION(mutex_perf_counter, OPT_BOOL, false) // enable/disable mutex perf counter OPTION(throttler_perf_counter, OPT_BOOL, true) // enable/disable throttler perf counter diff --git a/src/rgw/rgw_object_expirer_core.cc b/src/rgw/rgw_object_expirer_core.cc index 5ae562ae27c72..3c12ca6c15053 100644 --- a/src/rgw/rgw_object_expirer_core.cc +++ b/src/rgw/rgw_object_expirer_core.cc @@ -54,7 +54,7 @@ int RGWObjectExpirer::garbage_single_object(objexp_hint_entry& hint) int ret = init_bucket_info(hint.bucket_name, hint.bucket_id, bucket_info); if (ret < 0) { - dout(1) << "ERROR: could not init bucket: " << cpp_strerror(-ret) << dendl; + ldout(store->ctx(), 1) << "ERROR: could not init bucket: " << cpp_strerror(-ret) << dendl; return ret; } @@ -83,12 +83,12 @@ void RGWObjectExpirer::garbage_chunk(list& entries, 
/* ++iter) { objexp_hint_entry hint; - dout(15) << "got removal hint for: " << iter->key_ts.sec() \ + ldout(store->ctx(), 15) << "got removal hint for: " << iter->key_ts.sec() \ << " - " << iter->key_ext << dendl; int ret = store->objexp_hint_parse(*iter, hint); if (ret < 0) { - dout(1) << "cannot parse removal hint for " << hint.obj_key << dendl; + ldout(store->ctx(), 1) << "cannot parse removal hint for " << hint.obj_key << dendl; continue; } @@ -96,9 +96,9 @@ void RGWObjectExpirer::garbage_chunk(list& entries, /* * We can silently ignore that and move forward. */ ret = garbage_single_object(hint); if (ret == -ERR_PRECONDITION_FAILED) { - dout(15) << "not actual hint for object: " << hint.obj_key << dendl; + ldout(store->ctx(), 15) << "not actual hint for object: " << hint.obj_key << dendl; } else if (ret < 0) { - dout(1) << "cannot remove expired object: " << hint.obj_key << dendl; + ldout(store->ctx(), 1) << "cannot remove expired object: " << hint.obj_key << dendl; } need_trim = true; @@ -111,11 +111,11 @@ void RGWObjectExpirer::trim_chunk(const string& shard, const utime_t& from, const utime_t& to) { - dout(20) << "trying to trim removal hints to " << to << dendl; + ldout(store->ctx(), 20) << "trying to trim removal hints to " << to << dendl; int ret = store->objexp_hint_trim(shard, from, to); if (ret < 0) { - dout(0) << "ERROR during trim: " << ret << dendl; + ldout(store->ctx(), 0) << "ERROR during trim: " << ret << dendl; } return; @@ -129,13 +129,16 @@ void RGWObjectExpirer::proceed_single_shard(const string& shard, string out_marker; bool truncated = false; + CephContext *cct = store->ctx(); + int num_entries = cct->_conf->rgw_objexp_chunk_size; + do { list entries; int ret = store->objexp_hint_list(shard, last_run, round_start, - 1000, marker, entries, + num_entries, marker, entries, &out_marker, &truncated); if (ret < 0) { - dout(10) << "cannot get removal hints from shard: " << shard << dendl; + ldout(cct, 10) << "cannot get removal hints from shard: 
" << shard << dendl; continue; } @@ -163,7 +166,7 @@ void RGWObjectExpirer::inspect_all_shards(const utime_t& last_run, store->objexp_get_shard(last_run, round_start, shard_marker, shard, is_next_available); - dout(20) << "proceeding shard = " << shard << dendl; + ldout(store->ctx(), 20) << "proceeding shard = " << shard << dendl; proceed_single_shard(shard, last_run, round_start); } while (is_next_available); @@ -197,9 +200,9 @@ void *RGWObjectExpirer::OEWorker::entry() { utime_t last_run; do { utime_t start = ceph_clock_now(cct); - dout(2) << "object expiration: start" << dendl; + ldout(cct, 2) << "object expiration: start" << dendl; oe->inspect_all_shards(last_run, start); - dout(2) << "object expiration: stop" << dendl; + ldout(cct, 2) << "object expiration: stop" << dendl; last_run = start; diff --git a/src/rgw/rgw_object_expirer_core.h b/src/rgw/rgw_object_expirer_core.h index 47ef376b33bb8..12bcc8e6b9ae7 100644 --- a/src/rgw/rgw_object_expirer_core.h +++ b/src/rgw/rgw_object_expirer_core.h @@ -39,7 +39,7 @@ class RGWObjectExpirer { protected: - RGWRados * const store; + RGWRados *store; int init_bucket_info(const string& bucket_name, const string& bucket_id, @@ -61,7 +61,7 @@ class RGWObjectExpirer { atomic_t down_flag; public: - RGWObjectExpirer(RGWRados * const _store) + RGWObjectExpirer(RGWRados *_store) : store(_store) {} diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 8ebb7a4195a7f..bde1cd7249f6f 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -4908,7 +4908,8 @@ int RGWRados::Object::Delete::delete_obj() bufferlist::iterator iter = bl.begin(); ::decode(delete_at, iter); } catch (buffer::error& err) { - dout(5) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl; + ldout(store->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl; + return -EIO; } if (params.expiration_time != delete_at) { From 9db8122680e7a641c06c09c6a42da8cc870194a6 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Thu, 27 Aug 2015 
13:35:07 -0700 Subject: [PATCH 166/654] rgw: lock obj expirer shards when processing to prevent multiple rgws processing the same shard concurrently Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_object_expirer_core.cc | 26 +++++++++++++++++++++++++- src/rgw/rgw_rados.h | 1 + 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/rgw/rgw_object_expirer_core.cc b/src/rgw/rgw_object_expirer_core.cc index 3c12ca6c15053..2dee9bb011a1f 100644 --- a/src/rgw/rgw_object_expirer_core.cc +++ b/src/rgw/rgw_object_expirer_core.cc @@ -33,8 +33,12 @@ using namespace std; #include "rgw_replica_log.h" #include "rgw_object_expirer_core.h" +#include "cls/lock/cls_lock_client.h" + #define dout_subsys ceph_subsys_rgw +static string objexp_lock_name = "gc_process"; + int RGWObjectExpirer::init_bucket_info(const string& bucket_name, const string& bucket_id, RGWBucketInfo& bucket_info) @@ -132,9 +136,23 @@ void RGWObjectExpirer::proceed_single_shard(const string& shard, CephContext *cct = store->ctx(); int num_entries = cct->_conf->rgw_objexp_chunk_size; + int max_secs = cct->_conf->rgw_objexp_gc_interval; + utime_t end = ceph_clock_now(cct); + end += max_secs; + + rados::cls::lock::Lock l(objexp_lock_name); + + utime_t time(max_secs, 0); + l.set_duration(time); + + int ret = l.lock_exclusive(&store->objexp_pool_ctx, shard); + if (ret == -EBUSY) { /* already locked by another processor */ + dout(5) << __func__ << "(): failed to acquire lock on " << shard << dendl; + return; + } do { list entries; - int ret = store->objexp_hint_list(shard, last_run, round_start, + ret = store->objexp_hint_list(shard, last_run, round_start, num_entries, marker, entries, &out_marker, &truncated); if (ret < 0) { @@ -149,9 +167,15 @@ void RGWObjectExpirer::proceed_single_shard(const string& shard, trim_chunk(shard, last_run, round_start); } + utime_t now = ceph_clock_now(g_ceph_context); + if (now >= end) { + break; + } + marker = out_marker; } while (truncated); + l.unlock(&store->objexp_pool_ctx, 
shard); return; } diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index 791fa9ce1a066..121bfa58c22c8 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -1199,6 +1199,7 @@ class Finisher; class RGWRados { friend class RGWGC; + friend class RGWObjectExpirer; friend class RGWStateLog; friend class RGWReplicaLogger; From 131214dc6ddeef51edf0abf7cc0e41a66baeebe1 Mon Sep 17 00:00:00 2001 From: Guang Yang Date: Wed, 15 Jul 2015 21:33:07 +0000 Subject: [PATCH 167/654] ec: add support for fast read on PGBackend/ECBackend async read Extend the PGBackend::objects_read_async interface to support *fast read*, add the implemenation for ECBackend, in which we issue redundant reads, and use the first returned (to decode) to serve clients. Signed-off-by: Guang Yang --- src/osd/ECBackend.cc | 61 ++++++++++++++++++++++++++++-------- src/osd/ECBackend.h | 19 +++++++++-- src/osd/PGBackend.h | 2 +- src/osd/ReplicatedBackend.cc | 6 +++- src/osd/ReplicatedBackend.h | 3 +- 5 files changed, 73 insertions(+), 18 deletions(-) diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index 47f326efaf176..50e8a5b33c52a 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -469,7 +469,8 @@ void ECBackend::dispatch_recovery_messages(RecoveryMessages &m, int priority) start_read_op( priority, m.reads, - OpRequestRef()); + OpRequestRef(), + false); } void ECBackend::continue_recovery_op( @@ -487,7 +488,7 @@ void ECBackend::continue_recovery_op( set to_read; uint64_t recovery_max_chunk = get_recovery_chunk_size(); int r = get_min_avail_to_read_shards( - op.hoid, want, true, &to_read); + op.hoid, want, true, false, &to_read); if (r != 0) { // we must have lost a recovery source assert(!op.recovery_progress.first); @@ -1024,11 +1025,35 @@ shard_to_read_map.find(from); assert(rop.in_progress.count(from)); rop.in_progress.erase(from); + bool is_complete = true; if (!rop.in_progress.empty()) { - dout(10) << __func__ << " readop not complete: " << rop << dendl; - } else { - 
dout(10) << __func__ << " readop complete: " << rop << dendl; + if (rop.do_redundant_reads) { + for (map::const_iterator iter = + rop.complete.begin(); + iter != rop.complete.end(); + ++iter) { + set have; + for (map::const_iterator j = + iter->second.returned.front().get<2>().begin(); + j != iter->second.returned.front().get<2>().end(); + ++j) { + have.insert(j->first.shard); + } + set want_to_read, dummy_minimum; + get_want_to_read_shards(&want_to_read); + if (ec_impl->minimum_to_decode(want_to_read, have, &dummy_minimum) < 0) { + is_complete = false; + break; + } + } + } else { + is_complete = false; + } + } + if (is_complete) { complete_read_op(rop, m); + } else { + dout(10) << __func__ << " readop not complete: " << rop << dendl; } } @@ -1317,8 +1342,12 @@ int ECBackend::get_min_avail_to_read_shards( const hobject_t &hoid, const set &want, bool for_recovery, + bool do_redundant_reads, set *to_read) { + // Make sure we don't do redundant reads for recovery + assert(!for_recovery || !do_redundant_reads); + map, hobject_t::BitwiseComparator>::const_iterator miter = get_parent()->get_missing_loc_shards().find(hoid); @@ -1380,6 +1409,10 @@ int ECBackend::get_min_avail_to_read_shards( if (r < 0) return r; + if (do_redundant_reads) { + need.swap(have); + } + if (!to_read) return 0; @@ -1395,7 +1428,8 @@ int ECBackend::get_min_avail_to_read_shards( void ECBackend::start_read_op( int priority, map &to_read, - OpRequestRef _op) + OpRequestRef _op, + bool do_redundant_reads) { ceph_tid_t tid = get_parent()->get_tid(); assert(!tid_to_read_map.count(tid)); @@ -1404,6 +1438,7 @@ void ECBackend::start_read_op( op.tid = tid; op.to_read.swap(to_read); op.op = _op; + op.do_redundant_reads = do_redundant_reads; dout(10) << __func__ << ": starting " << op << dendl; map messages; @@ -1683,7 +1718,8 @@ void ECBackend::objects_read_async( const hobject_t &hoid, const list, pair > > &to_read, - Context *on_complete) + Context *on_complete, + bool fast_read) { 
in_progress_client_reads.push_back(ClientAsyncReadStatus(on_complete)); CallClientContexts *c = new CallClientContexts( @@ -1700,17 +1736,15 @@ void ECBackend::objects_read_async( offsets.push_back(boost::make_tuple(tmp.first, tmp.second, i->first.get<2>())); } - const vector &chunk_mapping = ec_impl->get_chunk_mapping(); set want_to_read; - for (int i = 0; i < (int)ec_impl->get_data_chunk_count(); ++i) { - int chunk = (int)chunk_mapping.size() > i ? chunk_mapping[i] : i; - want_to_read.insert(chunk); - } + get_want_to_read_shards(&want_to_read); + set shards; int r = get_min_avail_to_read_shards( hoid, want_to_read, false, + fast_read, &shards); assert(r == 0); @@ -1728,7 +1762,8 @@ void ECBackend::objects_read_async( start_read_op( cct->_conf->osd_client_op_priority, for_read_op, - OpRequestRef()); + OpRequestRef(), + fast_read); return; } diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h index e3378595ba457..8f5201c5d26be 100644 --- a/src/osd/ECBackend.h +++ b/src/osd/ECBackend.h @@ -145,7 +145,8 @@ class ECBackend : public PGBackend { const hobject_t &hoid, const list, pair > > &to_read, - Context *on_complete); + Context *on_complete, + bool fast_read = false); private: friend struct ECRecoveryHandle; @@ -154,6 +155,14 @@ class ECBackend : public PGBackend { sinfo.get_stripe_width()); } + void get_want_to_read_shards(set *want_to_read) const { + const vector &chunk_mapping = ec_impl->get_chunk_mapping(); + for (int i = 0; i < (int)ec_impl->get_data_chunk_count(); ++i) { + int chunk = (int)chunk_mapping.size() > i ? 
chunk_mapping[i] : i; + want_to_read->insert(chunk); + } + } + /** * Recovery * @@ -284,6 +293,10 @@ class ECBackend : public PGBackend { int priority; ceph_tid_t tid; OpRequestRef op; // may be null if not on behalf of a client + // True if redundant reads are issued, false otherwise, + // this is useful to tradeoff some resources (redundant ops) for + // low latency read, especially on relatively idle cluster + bool do_redundant_reads; map to_read; map complete; @@ -306,7 +319,8 @@ class ECBackend : public PGBackend { void start_read_op( int priority, map &to_read, - OpRequestRef op); + OpRequestRef op, + bool do_redundant_reads); /** @@ -452,6 +466,7 @@ class ECBackend : public PGBackend { const hobject_t &hoid, ///< [in] object const set &want, ///< [in] desired shards bool for_recovery, ///< [in] true if we may use non-acting replicas + bool do_redundant_reads, ///< [in] true if we want to issue redundant reads to reduce latency set *to_read ///< [out] shards to read ); ///< @return error code, 0 on success diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index bb463ee07eb8c..52599942b439d 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -545,7 +545,7 @@ const hobject_t &hoid, const list, pair > > &to_read, - Context *on_complete) = 0; + Context *on_complete, bool fast_read = false) = 0; virtual bool scrub_supported() { return false; } void be_scan_list( diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc index 5ddc9fd311efc..7d344e15df0f3 100644 --- a/src/osd/ReplicatedBackend.cc +++ b/src/osd/ReplicatedBackend.cc @@ -278,8 +278,12 @@ void ReplicatedBackend::objects_read_async( const hobject_t &hoid, const list, pair > > &to_read, - Context *on_complete) + Context *on_complete, + bool fast_read) { + // There is no fast read implementation for replication backend yet + assert(!fast_read); + int r = 0; for (list, pair > >::const_iterator i = diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h index 
155abfb881de8..a36007d42c287 100644 --- a/src/osd/ReplicatedBackend.h +++ b/src/osd/ReplicatedBackend.h @@ -159,7 +159,8 @@ class ReplicatedBackend : public PGBackend { const hobject_t &hoid, const list, pair > > &to_read, - Context *on_complete); + Context *on_complete, + bool fast_read = false); private: // push From 5eb2a77dd770ce39e76aaeb73a1494426584df9f Mon Sep 17 00:00:00 2001 From: Guang G Yang Date: Tue, 21 Jul 2015 17:54:45 +0000 Subject: [PATCH 168/654] mon: add a new pool setting to configure fast read for EC pool Signed-off-by: Guang Yang --- src/mon/MonCommands.h | 4 ++-- src/mon/OSDMonitor.cc | 22 ++++++++++++++++++++-- src/osd/ReplicatedPG.cc | 2 +- src/osd/osd_types.cc | 14 ++++++++++++-- src/osd/osd_types.h | 4 +++- src/test/pybind/test_ceph_argparse.py | 10 +++++----- 6 files changed, 43 insertions(+), 13 deletions(-) diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 1bf238787b9b6..42bb35235ac37 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -674,11 +674,11 @@ COMMAND("osd pool rename " \ "rename to ", "osd", "rw", "cli,rest") COMMAND("osd pool get " \ "name=pool,type=CephPoolname " \ - "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|write_fadvise_dontneed|all|min_write_recency_for_promote", \ + 
"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|write_fadvise_dontneed|all|min_write_recency_for_promote|fast_read", \ "get pool parameter ", "osd", "r", "cli,rest") COMMAND("osd pool set " \ "name=pool,type=CephPoolname " \ - "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|write_fadvise_dontneed|min_write_recency_for_promote " \ + "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|write_fadvise_dontneed|min_write_recency_for_promote|fast_read " \ "name=val,type=CephString " \ "name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \ "set pool parameter to ", "osd", "rw", "cli,rest") diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 25cf3c10e95e4..04ee1a05826d6 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -2881,7 +2881,8 @@ namespace { CACHE_TARGET_FULL_RATIO, CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE, ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE, - 
WRITE_FADVISE_DONTNEED, MIN_WRITE_RECENCY_FOR_PROMOTE}; + WRITE_FADVISE_DONTNEED, MIN_WRITE_RECENCY_FOR_PROMOTE, + FAST_READ}; std::set subtract_second_from_first(const std::set& first, @@ -3334,7 +3335,8 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op) ("erasure_code_profile", ERASURE_CODE_PROFILE) ("min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE) ("write_fadvise_dontneed", WRITE_FADVISE_DONTNEED) - ("min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE); + ("min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE) + ("fast_read", FAST_READ); typedef std::set choices_set_t; @@ -3490,6 +3492,9 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op) f->dump_int("min_write_recency_for_promote", p->min_write_recency_for_promote); break; + case FAST_READ: + f->dump_int("fast_read", p->fast_read); + break; } f->close_section(); f->flush(rdata); @@ -3588,6 +3593,9 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op) ss << "min_write_recency_for_promote: " << p->min_write_recency_for_promote << "\n"; break; + case FAST_READ: + ss << "fast_read: " << p->fast_read << "\n"; + break; } rdata.append(ss.str()); ss.str(""); @@ -4942,6 +4950,16 @@ int OSDMonitor::prepare_command_pool_set(map &cmdmap, return -EINVAL; } p.min_write_recency_for_promote = n; + } else if (var == "fast_read") { + if (val == "true" || (interr.empty() && n == 1)) { + if (p.is_replicated()) { + ss << "fast read is not supported in replication pool"; + return -EINVAL; + } + p.fast_read = true; + } else if (val == "false" || (interr.empty() && n == 0)) { + p.fast_read = false; + } } else { ss << "unrecognized variable '" << var << "'"; return -EINVAL; diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index d5bfbcf79b2af..f4991fdfafce5 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -112,7 +112,7 @@ void ReplicatedPG::OpContext::start_async_reads(ReplicatedPG *pg) pg->pgbackend->objects_read_async( obc->obs.oi.soid, 
pending_async_reads, - new OnReadComplete(pg, this)); + new OnReadComplete(pg, this), pg->get_pool().fast_read); pending_async_reads.clear(); } void ReplicatedPG::OpContext::finish_read(ReplicatedPG *pg) diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 50a80b06cb1fb..0f78242c76525 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -947,6 +947,7 @@ void pg_pool_t::dump(Formatter *f) const f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote); f->dump_unsigned("stripe_width", get_stripe_width()); f->dump_unsigned("expected_num_objects", expected_num_objects); + f->dump_bool("fast_read", fast_read); } void pg_pool_t::convert_to_pg_shards(const vector &from, set* to) const { @@ -1256,7 +1257,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const return; } - ENCODE_START(21, 5, bl); + ENCODE_START(22, 5, bl); ::encode(type, bl); ::encode(size, bl); ::encode(crush_ruleset, bl); @@ -1301,12 +1302,13 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const ::encode(cache_target_dirty_high_ratio_micro, bl); ::encode(min_write_recency_for_promote, bl); ::encode(use_gmt_hitset, bl); + ::encode(fast_read, bl); ENCODE_FINISH(bl); } void pg_pool_t::decode(bufferlist::iterator& bl) { - DECODE_START_LEGACY_COMPAT_LEN(21, 5, 5, bl); + DECODE_START_LEGACY_COMPAT_LEN(22, 5, 5, bl); ::decode(type, bl); ::decode(size, bl); ::decode(crush_ruleset, bl); @@ -1433,6 +1435,11 @@ void pg_pool_t::decode(bufferlist::iterator& bl) } else { use_gmt_hitset = false; } + if (struct_v >= 22) { + ::decode(fast_read, bl); + } else { + fast_read = false; + } DECODE_FINISH(bl); calc_pg_masks(); } @@ -1490,6 +1497,7 @@ void pg_pool_t::generate_test_instances(list& o) a.cache_min_evict_age = 2321; a.erasure_code_profile = "profile in osdmap"; a.expected_num_objects = 123456; + a.fast_read = false; o.push_back(new pg_pool_t(a)); } @@ -1541,6 +1549,8 @@ ostream& operator<<(ostream& out, const pg_pool_t& p) out << " stripe_width " 
<< p.get_stripe_width(); if (p.expected_num_objects) out << " expected_num_objects " << p.expected_num_objects; + if (p.fast_read) + out << " fast_read " << p.fast_read; return out; } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index cf67153c91168..9af5db99d175b 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -1119,6 +1119,7 @@ struct pg_pool_t { uint64_t expected_num_objects; ///< expected number of objects on this pool, a value of 0 indicates ///< user does not specify any expected value + bool fast_read; ///< whether turn on fast read on the pool or not pg_pool_t() : flags(0), type(0), size(0), min_size(0), @@ -1146,7 +1147,8 @@ struct pg_pool_t { min_read_recency_for_promote(0), min_write_recency_for_promote(0), stripe_width(0), - expected_num_objects(0) + expected_num_objects(0), + fast_read(false) { } void dump(Formatter *f) const; diff --git a/src/test/pybind/test_ceph_argparse.py b/src/test/pybind/test_ceph_argparse.py index 6bd2b08352636..bf23beaef9477 100755 --- a/src/test/pybind/test_ceph_argparse.py +++ b/src/test/pybind/test_ceph_argparse.py @@ -597,7 +597,7 @@ def test_crush_dump(self): self.assert_valid_command(['osd', 'crush', 'dump']) assert_equal({}, validate_command(sigdict, ['osd', 'crush'])) assert_equal({}, validate_command(sigdict, ['osd', 'crush', - 'dump', + 'dump', 'toomany'])) def test_setcrushmap(self): @@ -982,7 +982,7 @@ def test_pool_create(self): assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'create', 'poolname', '128', '128', - 'erasure', '^^^', + 'erasure', '^^^', 'ruleset'])) assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'create', 'poolname', @@ -1026,7 +1026,7 @@ def test_pool_rename(self): def test_pool_get(self): for var in ('size', 'min_size', 'crash_replay_interval', - 'pg_num', 'pgp_num', 'crush_ruleset', 'auid'): + 'pg_num', 'pgp_num', 'crush_ruleset', 'auid', 'fast_read'): self.assert_valid_command(['osd', 'pool', 'get', 'poolname', var]) assert_equal({}, 
validate_command(sigdict, ['osd', 'pool'])) assert_equal({}, validate_command(sigdict, ['osd', 'pool', @@ -1043,7 +1043,7 @@ def test_pool_get(self): def test_pool_set(self): for var in ('size', 'min_size', 'crash_replay_interval', 'pg_num', 'pgp_num', 'crush_ruleset', - 'hashpspool', 'auid'): + 'hashpspool', 'auid', 'fast_read'): self.assert_valid_command(['osd', 'pool', 'set', 'poolname', var, 'value']) assert_equal({}, validate_command(sigdict, ['osd', 'pool', @@ -1151,7 +1151,7 @@ def test_exists(self): def test_list(self): self.check_no_arg('config-key', 'list') # Local Variables: -# compile-command: "cd ../.. ; make -j4 && +# compile-command: "cd ../.. ; make -j4 && # PYTHONPATH=pybind nosetests --stop \ # test/pybind/test_ceph_argparse.py # test_ceph_argparse.py:TestOSD.test_rm" # End: From 54090f180c2cf33a294d1c12c70d002550b79267 Mon Sep 17 00:00:00 2001 From: Guang G Yang Date: Tue, 21 Jul 2015 20:45:48 +0000 Subject: [PATCH 169/654] mon: add a configuration for default fast read setting Signed-off-by: Guang Yang --- src/common/config_opts.h | 1 + src/mon/OSDMonitor.cc | 38 ++++++++++++++++++++++++++++++++++++-- src/mon/OSDMonitor.h | 7 +++++++ 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 2bfbc581e9624..81d3291f9fc87 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -212,6 +212,7 @@ OPTION(mon_osd_allow_primary_temp, OPT_BOOL, false) // allow primary_temp to be OPTION(mon_osd_allow_primary_affinity, OPT_BOOL, false) // allow primary_affinity to be set in the osdmap OPTION(mon_osd_prime_pg_temp, OPT_BOOL, false) // prime osdmap with pg mapping changes OPTION(mon_osd_prime_pg_temp_max_time, OPT_FLOAT, .5) // max time to spend priming +OPTION(mon_osd_pool_ec_fast_read, OPT_BOOL, false) // whether turn on fast read on the pool or not OPTION(mon_stat_smooth_intervals, OPT_INT, 2) // smooth stats over last N PGMap maps OPTION(mon_lease, OPT_FLOAT, 5) // lease 
interval OPTION(mon_lease_renew_interval, OPT_FLOAT, 3) // on leader, to renew the lease diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 04ee1a05826d6..f1672e8e935bb 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -4039,12 +4039,12 @@ int OSDMonitor::prepare_new_pool(MonOpRequestRef op) return prepare_new_pool(m->name, m->auid, m->crush_rule, ruleset_name, 0, 0, erasure_code_profile, - pg_pool_t::TYPE_REPLICATED, 0, &ss); + pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss); else return prepare_new_pool(m->name, session->auid, m->crush_rule, ruleset_name, 0, 0, erasure_code_profile, - pg_pool_t::TYPE_REPLICATED, 0, &ss); + pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss); } int OSDMonitor::crush_rename_bucket(const string& srcname, @@ -4428,6 +4428,7 @@ int OSDMonitor::get_crush_ruleset(const string &ruleset_name, * @param erasure_code_profile The profile name in OSDMap to be used for erasure code * @param pool_type TYPE_ERASURE, or TYPE_REP * @param expected_num_objects expected number of objects on the pool + * @param fast_read fast read type. * @param ss human readable error message, if any. * * @return 0 on success, negative errno on failure. 
@@ -4439,6 +4440,7 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, const string &erasure_code_profile, const unsigned pool_type, const uint64_t expected_num_objects, + FastReadType fast_read, ostream *ss) { if (name.length() == 0) @@ -4458,6 +4460,10 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, << ", which in this case is " << pg_num; return -ERANGE; } + if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) { + *ss << "'fast_read' can only apply to erasure coding pool"; + return -EINVAL; + } int r; r = prepare_pool_crush_ruleset(pool_type, erasure_code_profile, crush_ruleset_name, &crush_ruleset, ss); @@ -4513,6 +4519,24 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, if (g_conf->osd_pool_use_gmt_hitset && (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)) pi->use_gmt_hitset = true; + + if (pool_type == pg_pool_t::TYPE_ERASURE) { + switch (fast_read) { + case FAST_READ_OFF: + pi->fast_read = false; + break; + case FAST_READ_ON: + pi->fast_read = true; + break; + case FAST_READ_DEFAULT: + pi->fast_read = g_conf->mon_osd_pool_ec_fast_read; + break; + default: + *ss << "invalid fast_read setting: " << fast_read; + return -EINVAL; + } + } + pi->size = size; pi->min_size = min_size; pi->crush_ruleset = crush_ruleset; @@ -6557,12 +6581,22 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, err = -EINVAL; goto reply; } + + int8_t fast_read_param; + cmd_getval(g_ceph_context, cmdmap, "fast_read", fast_read_param, int8_t(-1)); + FastReadType fast_read = FAST_READ_DEFAULT; + if (fast_read_param == 0) + fast_read = FAST_READ_OFF; + else if (fast_read_param > 0) + fast_read = FAST_READ_ON; + err = prepare_new_pool(poolstr, 0, // auid=0 for admin created pool -1, // default crush rule ruleset_name, pg_num, pgp_num, erasure_code_profile, pool_type, (uint64_t)expected_num_objects, + fast_read, &ss); if (err < 0) { switch(err) { diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h 
index a185954e11880..78e00f90e5f9c 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -152,6 +152,12 @@ class OSDMonitor : public PaxosService { CrushWrapper &_get_stable_crush(); void _get_pending_crush(CrushWrapper& newcrush); + enum FastReadType { + FAST_READ_OFF, + FAST_READ_ON, + FAST_READ_DEFAULT + }; + // svc public: void create_initial(); @@ -312,6 +318,7 @@ class OSDMonitor : public PaxosService { const string &erasure_code_profile, const unsigned pool_type, const uint64_t expected_num_objects, + FastReadType fast_read, ostream *ss); int prepare_new_pool(MonOpRequestRef op); From ae1df24d27f5ff660b34f469654e71fece7a3120 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Tue, 28 Jul 2015 15:58:38 -0700 Subject: [PATCH 170/654] osd: Fix admin socket help output Signed-off-by: David Zafman --- src/osd/OSD.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index f6a1fed1b0cfc..676a701a4969d 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -2077,7 +2077,7 @@ void OSD::final_init() "name=objname,type=CephObjectname " \ "name=shardid,type=CephInt,req=false,range=0|255", test_ops_hook, - "inject data error into omap"); + "inject data error to an object"); assert(r == 0); r = admin_socket->register_command( @@ -2087,7 +2087,7 @@ void OSD::final_init() "name=objname,type=CephObjectname " \ "name=shardid,type=CephInt,req=false,range=0|255", test_ops_hook, - "inject metadata error"); + "inject metadata error to an object"); assert(r == 0); r = admin_socket->register_command( "set_recovery_delay", From 1febe89d926803a58bbef648804dd6aa8c2ea131 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Fri, 31 Jul 2015 18:44:56 -0700 Subject: [PATCH 171/654] osd: Avoid confusion by changing EC decode total_chunk_size to total_data_size The length of the buffers being decoded may only be part of the chunk, so less than the HashInfo total_chunk_size. 
Signed-off-by: David Zafman --- src/osd/ECUtil.cc | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/osd/ECUtil.cc b/src/osd/ECUtil.cc index 1f3b45857da8a..9d2c2fb261e9d 100644 --- a/src/osd/ECUtil.cc +++ b/src/osd/ECUtil.cc @@ -10,23 +10,23 @@ int ECUtil::decode( map &to_decode, bufferlist *out) { - uint64_t total_chunk_size = to_decode.begin()->second.length(); + uint64_t total_data_size = to_decode.begin()->second.length(); assert(to_decode.size()); - assert(total_chunk_size % sinfo.get_chunk_size() == 0); + assert(total_data_size % sinfo.get_chunk_size() == 0); assert(out); assert(out->length() == 0); for (map::iterator i = to_decode.begin(); i != to_decode.end(); ++i) { - assert(i->second.length() == total_chunk_size); + assert(i->second.length() == total_data_size); } - if (total_chunk_size == 0) + if (total_data_size == 0) return 0; - for (uint64_t i = 0; i < total_chunk_size; i += sinfo.get_chunk_size()) { + for (uint64_t i = 0; i < total_data_size; i += sinfo.get_chunk_size()) { map chunks; for (map::iterator j = to_decode.begin(); j != to_decode.end(); @@ -48,18 +48,18 @@ int ECUtil::decode( map &to_decode, map &out) { - uint64_t total_chunk_size = to_decode.begin()->second.length(); + uint64_t total_data_size = to_decode.begin()->second.length(); assert(to_decode.size()); - assert(total_chunk_size % sinfo.get_chunk_size() == 0); + assert(total_data_size % sinfo.get_chunk_size() == 0); for (map::iterator i = to_decode.begin(); i != to_decode.end(); ++i) { - assert(i->second.length() == total_chunk_size); + assert(i->second.length() == total_data_size); } - if (total_chunk_size == 0) + if (total_data_size == 0) return 0; set need; @@ -71,7 +71,7 @@ int ECUtil::decode( need.insert(i->first); } - for (uint64_t i = 0; i < total_chunk_size; i += sinfo.get_chunk_size()) { + for (uint64_t i = 0; i < total_data_size; i += sinfo.get_chunk_size()) { map chunks; for (map::iterator j = to_decode.begin(); j != 
to_decode.end(); @@ -92,7 +92,7 @@ int ECUtil::decode( for (map::iterator i = out.begin(); i != out.end(); ++i) { - assert(i->second->length() == total_chunk_size); + assert(i->second->length() == total_data_size); } return 0; } From 450149213eec90b01103b47e35b9784a3ef92ab3 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 6 Aug 2015 16:15:05 -0700 Subject: [PATCH 172/654] common, osd: Remove osd_read_eio_on_bad_digest config variable In order to handle erasure coded reads where additional chunks would be able to substitute for some bad chunks, we really need to return an indication of bad digests or incorrectly sized chunks. We are simply using the error return of -EIO to convey that information for future use. Signed-off-by: David Zafman --- src/common/config_opts.h | 2 -- src/osd/ReplicatedPG.cc | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 81d3291f9fc87..811f66e47759d 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -607,8 +607,6 @@ OPTION(osd_recover_clone_overlap, OPT_BOOL, true) // preserve clone_overlap du OPTION(osd_op_num_threads_per_shard, OPT_INT, 2) OPTION(osd_op_num_shards, OPT_INT, 5) -OPTION(osd_read_eio_on_bad_digest, OPT_BOOL, true) // return EIO if object digest is bad - // Only use clone_overlap for recovery if there are fewer than // osd_recover_clone_overlap_limit entries in the overlap set OPTION(osd_recover_clone_overlap_limit, OPT_INT, 10) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index f4991fdfafce5..16e24c44642a2 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -3844,8 +3844,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) << " != expected 0x" << oi.data_digest << std::dec << " on " << soid; // FIXME fall back to replica or something? 
- if (g_conf->osd_read_eio_on_bad_digest) - result = -EIO; + result = -EIO; } } } From da2987d79db679e7b44d7886462c81d34994af26 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Tue, 4 Aug 2015 13:38:13 -0700 Subject: [PATCH 173/654] osd: Fix ECBackend to handle mismatch of total chunk size Fixes: #12200 Signed-off-by: David Zafman --- src/osd/ECBackend.cc | 17 +++++++++++------ src/osd/ECBackend.h | 2 +- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index 50e8a5b33c52a..2181634627911 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -1297,7 +1297,7 @@ void ECBackend::submit_transaction( for (set::iterator i = need_hinfos.begin(); i != need_hinfos.end(); ++i) { - ECUtil::HashInfoRef ref = get_hash_info(*i); + ECUtil::HashInfoRef ref = get_hash_info(*i, false); if (!ref) { derr << __func__ << ": get_hash_info(" << *i << ")" << " returned a null pointer and there is no " @@ -1505,7 +1505,7 @@ void ECBackend::start_read_op( } ECUtil::HashInfoRef ECBackend::get_hash_info( - const hobject_t &hoid) + const hobject_t &hoid, bool checks) { dout(10) << __func__ << ": Getting attr on " << hoid << dendl; ECUtil::HashInfoRef ref = unstable_hashinfo_registry.lookup(hoid); @@ -1517,7 +1517,8 @@ ECUtil::HashInfoRef ECBackend::get_hash_info( ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), &st); ECUtil::HashInfo hinfo(ec_impl->get_chunk_count()); - if (r >= 0 && st.st_size > 0) { + // XXX: What does it mean if there is no object on disk? 
+ if (r >= 0) { dout(10) << __func__ << ": found on disk, size " << st.st_size << dendl; bufferlist bl; r = store->getattr( @@ -1528,8 +1529,12 @@ ECUtil::HashInfoRef ECBackend::get_hash_info( if (r >= 0) { bufferlist::iterator bp = bl.begin(); ::decode(hinfo, bp); - assert(hinfo.get_total_chunk_size() == (uint64_t)st.st_size); - } else { + if (checks && hinfo.get_total_chunk_size() != (uint64_t)st.st_size) { + dout(0) << __func__ << ": Mismatch of total_chunk_size " + << hinfo.get_total_chunk_size() << dendl; + return ECUtil::HashInfoRef(); + } + } else if (st.st_size > 0) { // If empty object and no hinfo, create it return ECUtil::HashInfoRef(); } } @@ -1845,7 +1850,7 @@ void ECBackend::be_deep_scrub( o.read_error = true; } - ECUtil::HashInfoRef hinfo = get_hash_info(poid); + ECUtil::HashInfoRef hinfo = get_hash_info(poid, false); if (!hinfo) { dout(0) << "_scan_list " << poid << " could not retrieve hash info" << dendl; o.read_error = true; diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h index 8f5201c5d26be..f416e30f3f8d2 100644 --- a/src/osd/ECBackend.h +++ b/src/osd/ECBackend.h @@ -447,7 +447,7 @@ class ECBackend : public PGBackend { const ECUtil::stripe_info_t sinfo; /// If modified, ensure that the ref is held until the update is applied SharedPtrRegistry unstable_hashinfo_registry; - ECUtil::HashInfoRef get_hash_info(const hobject_t &hoid); + ECUtil::HashInfoRef get_hash_info(const hobject_t &hoid, bool checks = true); friend struct ReadCB; void check_op(Op *op); From 08f81a9b2b8aed4792e5b16b57a2fffa6fe92895 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Tue, 4 Aug 2015 13:39:28 -0700 Subject: [PATCH 174/654] osd: Check for EC decode errors, though none are possible at this time Signed-off-by: David Zafman --- src/osd/ECBackend.cc | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index 2181634627911..388a41b5494e6 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ 
-367,7 +367,8 @@ void ECBackend::handle_recovery_read_complete( from[i->first.shard].claim(i->second); } dout(10) << __func__ << ": " << from << dendl; - ECUtil::decode(sinfo, ec_impl, from, target); + int r = ECUtil::decode(sinfo, ec_impl, from, target); + assert(r == 0); if (attrs) { op.xattrs.swap(*attrs); @@ -1682,11 +1683,15 @@ struct CallClientContexts : ++j) { to_decode[j->first.shard].claim(j->second); } - ECUtil::decode( + int r = ECUtil::decode( ec->sinfo, ec->ec_impl, to_decode, &bl); + if (r < 0) { + res.r = r; + goto out; + } assert(i->second.second); assert(i->second.first); i->second.first->substr_of( From 21e9f69dd258a8c204828070cfe8b4018acdb145 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Tue, 30 Jun 2015 22:06:22 -0700 Subject: [PATCH 175/654] osd: Check CRC when able to on async read Fixes: #12000 Signed-off-by: David Zafman --- src/osd/ECBackend.cc | 53 +++++++++++++++++++++++++++++--------- src/osd/ReplicatedPG.cc | 57 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 92 insertions(+), 18 deletions(-) diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index 388a41b5494e6..1629cb8c5d332 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -196,6 +196,7 @@ struct OnRecoveryReadComplete : : pg(pg), hoid(hoid) {} void finish(pair &in) { ECBackend::read_result_t &res = in.second; + // FIXME??? 
assert(res.r == 0); assert(res.errors.empty()); assert(res.returned.size() == 1); @@ -885,28 +886,31 @@ void ECBackend::handle_sub_read( ECSubRead &op, ECSubReadReply *reply) { + shard_id_t shard = get_parent()->whoami_shard().shard; for(map >, hobject_t::BitwiseComparator>::iterator i = op.to_read.begin(); i != op.to_read.end(); ++i) { - for (list >::iterator j = i->second.begin(); - j != i->second.end(); - ++j) { + bufferhash h(-1); + uint64_t total_read = 0; + list >::iterator j; + for (j = i->second.begin(); j != i->second.end(); ++j) { bufferlist bl; int r = store->read( coll, - ghobject_t( - i->first, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + ghobject_t(i->first, ghobject_t::NO_GEN, shard), j->get<0>(), j->get<1>(), bl, j->get<2>(), - false); + true); // Allow EIO return if (r < 0) { - assert(0); reply->buffers_read.erase(i->first); reply->errors[i->first] = r; break; } else { + dout(20) << __func__ << " read request=" << j->get<1>() << " r=" << r << " len=" << bl.length() << dendl; + total_read += r; + h << bl; reply->buffers_read[i->first].push_back( make_pair( j->get<0>(), @@ -914,6 +918,29 @@ void ECBackend::handle_sub_read( ); } } + // If all reads happened then lets check digest + if (j == i->second.end()) { + dout(20) << __func__ << ": Checking hash of " << i->first << dendl; + ECUtil::HashInfoRef hinfo = get_hash_info(i->first); + // This shows that we still need deep scrub because large enough files + // are read in sections, so the digest check here won't be done here. 
+ if (!hinfo || (total_read == hinfo->get_total_chunk_size() && + h.digest() != hinfo->get_chunk_hash(shard))) { + if (!hinfo) { + get_parent()->clog_error() << __func__ << ": No hinfo for " << i->first << "\n"; + dout(5) << __func__ << ": No hinfo for " << i->first << dendl; + } else { + get_parent()->clog_error() << __func__ << ": Bad hash for " << i->first << " digest 0x" + << hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec << "\n"; + dout(5) << __func__ << ": Bad hash for " << i->first << " digest 0x" + << hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec << dendl; + } + // Do NOT check osd_read_eio_on_bad_digest here. We need to report + // the state of our chunk in case other chunks could substitute. + reply->buffers_read.erase(i->first); + reply->errors[i->first] = -EIO; + } + } } for (set::iterator i = op.attrs_to_read.begin(); i != op.attrs_to_read.end(); @@ -928,7 +955,6 @@ void ECBackend::handle_sub_read( *i, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), reply->attrs_read[*i]); if (r < 0) { - assert(0); reply->buffers_read.erase(*i); reply->errors[*i] = r; } @@ -973,7 +999,7 @@ void ECBackend::handle_sub_read_reply( op.buffers_read.begin(); i != op.buffers_read.end(); ++i) { - assert(!op.errors.count(i->first)); + assert(!op.errors.count(i->first)); // If attribute error we better not have sent a buffer if (!rop.to_read.count(i->first)) { // We canceled this read! @see filter_read_op continue; @@ -999,7 +1025,7 @@ void ECBackend::handle_sub_read_reply( for (map, hobject_t::BitwiseComparator>::iterator i = op.attrs_read.begin(); i != op.attrs_read.end(); ++i) { - assert(!op.errors.count(i->first)); + assert(!op.errors.count(i->first)); // if read error better not have sent an attribute if (!rop.to_read.count(i->first)) { // We canceled this read! 
@see filter_read_op continue; @@ -1019,7 +1045,7 @@ void ECBackend::handle_sub_read_reply( } map >::iterator siter = -shard_to_read_map.find(from); + shard_to_read_map.find(from); assert(siter != shard_to_read_map.end()); assert(siter->second.count(op.tid)); siter->second.erase(op.tid); @@ -1664,6 +1690,8 @@ struct CallClientContexts : : ec(ec), status(status), to_read(to_read) {} void finish(pair &in) { ECBackend::read_result_t &res = in.second; + if (res.r != 0) + goto out; assert(res.returned.size() == to_read.size()); assert(res.r == 0); assert(res.errors.empty()); @@ -1703,12 +1731,13 @@ struct CallClientContexts : } res.returned.pop_front(); } +out: status->complete = true; list &ip = ec->in_progress_client_reads; while (ip.size() && ip.front().complete) { if (ip.front().on_complete) { - ip.front().on_complete->complete(0); + ip.front().on_complete->complete(res.r); ip.front().on_complete = NULL; } ip.pop_front(); diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 16e24c44642a2..4c51f246dea06 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -3650,12 +3650,35 @@ static int check_offset_and_length(uint64_t offset, uint64_t length, uint64_t ma return 0; } -struct FillInExtent : public Context { +struct FillInVerifyExtent : public Context { ceph_le64 *r; - FillInExtent(ceph_le64 *r) : r(r) {} - void finish(int _r) { - if (_r >= 0) { - *r = _r; + int32_t *rval; + bufferlist *outdatap; + boost::optional maybe_crc; + uint64_t size; + OSDService *osd; + hobject_t soid; + __le32 flags; + FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp, + boost::optional mc, uint64_t size, + OSDService *osd, hobject_t soid, __le32 flags) : + r(r), rval(rv), outdatap(blp), maybe_crc(mc), + size(size), osd(osd), soid(soid), flags(flags) {} + void finish(int len) { + *rval = len; + *r = len; + // whole object? can we verify the checksum? 
+ if (maybe_crc && *r == size) { + uint32_t crc = outdatap->crc32c(-1); + if (maybe_crc != crc) { + osd->clog->error() << std::hex << " full-object read crc 0x" << crc + << " != expected 0x" << *maybe_crc + << std::dec << " on " << soid << "\n"; + if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) { + *rval = -EIO; + *r = 0; + } + } } } }; @@ -3811,15 +3834,27 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) // read into a buffer bufferlist bl; + bool async = false; if (trimmed_read && op.extent.length == 0) { // read size was trimmed to zero and it is expected to do nothing // a read operation of 0 bytes does *not* do nothing, this is why // the trimmed_read boolean is needed } else if (pool.info.require_rollback()) { + async = true; + boost::optional maybe_crc; + // If there is a data digest and it is possible we are reading + // entire object, pass the digest. FillInVerifyExtent will + // will check the oi.size again. + if (oi.is_data_digest() && op.extent.offset == 0 && + op.extent.length >= oi.size) + maybe_crc = oi.data_digest; ctx->pending_async_reads.push_back( make_pair( boost::make_tuple(op.extent.offset, op.extent.length, op.flags), - make_pair(&osd_op.outdata, new FillInExtent(&op.extent.length)))); + make_pair(&osd_op.outdata, + new FillInVerifyExtent(&op.extent.length, &osd_op.rval, + &osd_op.outdata, maybe_crc, oi.size, osd, + soid, op.flags)))); dout(10) << " async_read noted for " << soid << dendl; } else { int r = pgbackend->objects_read_sync( @@ -3852,9 +3887,15 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) first_read = false; ctx->data_off = op.extent.offset; } + // XXX the op.extent.length is the requested length for async read + // On error this length is changed to 0 after the error comes back. 
ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10); ctx->delta_stats.num_rd++; + // Skip checking the result and just proceed to the next operation + if (async) + continue; + } break; @@ -6272,6 +6313,10 @@ void ReplicatedPG::complete_read_ctx(int result, OpContext *ctx) assert(ctx->async_reads_complete()); for (vector::iterator p = ctx->ops.begin(); p != ctx->ops.end(); ++p) { + if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) { + result = p->rval; + break; + } ctx->bytes_read += p->outdata.length(); } ctx->reply->claim_op_out_data(ctx->ops); From f3eea4aaa4d0d467241c1bcd5467fd74acf2bcc5 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Wed, 29 Jul 2015 17:02:50 -0700 Subject: [PATCH 176/654] test: Fix incorrect syntax in check for subread all feature Signed-off-by: David Zafman --- src/test/erasure-code/test-erasure-eio.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/erasure-code/test-erasure-eio.sh b/src/test/erasure-code/test-erasure-eio.sh index 3c93338bbd7e5..aadc54477a687 100755 --- a/src/test/erasure-code/test-erasure-eio.sh +++ b/src/test/erasure-code/test-erasure-eio.sh @@ -44,7 +44,7 @@ function setup_osds() { for id in $(seq 0 3) ; do # TODO: the feature of "osd-pool-erasure-code-subread-all" is not yet supported. 
- if -n osd_pool_erasure_code_subread_all__is_supported; then + if [ -n "$osd_pool_erasure_code_subread_all__is_supported" ]; then run_osd $dir $id "--osd-pool-erasure-code-subread-all=$subread" || return 1 else run_osd $dir $id || return 1 From bbdae5373e0906c371ebc287f13728a41558712a Mon Sep 17 00:00:00 2001 From: David Zafman Date: Tue, 28 Jul 2015 15:48:01 -0700 Subject: [PATCH 177/654] test: Enable EIO test code but expect error instead of osd crash Signed-off-by: David Zafman --- src/test/erasure-code/test-erasure-eio.sh | 82 ++++++++++++++++------- 1 file changed, 56 insertions(+), 26 deletions(-) diff --git a/src/test/erasure-code/test-erasure-eio.sh b/src/test/erasure-code/test-erasure-eio.sh index aadc54477a687..769ff1f6c3e44 100755 --- a/src/test/erasure-code/test-erasure-eio.sh +++ b/src/test/erasure-code/test-erasure-eio.sh @@ -76,7 +76,7 @@ function delete_pool() { ./ceph osd erasure-code-profile rm myprofile } -function rados_put_get() { +function rados_put() { local dir=$1 local poolname=$2 local objname=${3:-SOMETHING} @@ -88,21 +88,57 @@ function rados_put_get() { # get and put an object, compare they are equal # ./rados --pool $poolname put $objname $dir/ORIGINAL || return 1 +} + +function rados_get() { + local dir=$1 + local poolname=$2 + local objname=${3:-SOMETHING} + local expect=${4:-0} + + # + # Expect a failure to get object + # + if [ $expect = "1" ]; + then + ! ./rados --pool $poolname get $objname $dir/COPY + return $? 
+ fi + # + # get an object, compare with $dir/ORIGINAL + # ./rados --pool $poolname get $objname $dir/COPY || return 1 diff $dir/ORIGINAL $dir/COPY || return 1 rm $dir/COPY +} + +function rados_put_get() { + local dir=$1 + local poolname=$2 + local objname=${3:-SOMETHING} + local expect=${4:-0} + local recovery=$5 + # - # take out the first OSD used to store the object and - # check the object can still be retrieved, which implies - # recovery + # get and put an object, compare they are equal # - local -a initial_osds=($(get_osds $poolname $objname)) - local last=$((${#initial_osds[@]} - 1)) - ./ceph osd out ${initial_osds[$last]} || return 1 - ! get_osds $poolname $objname | grep '\<'${initial_osds[$last]}'\>' || return 1 - ./rados --pool $poolname get $objname $dir/COPY || return 1 - diff $dir/ORIGINAL $dir/COPY || return 1 - ./ceph osd in ${initial_osds[$last]} || return 1 + rados_put $dir $poolname $objname || return 1 + rados_get $dir $poolname $objname $expect || return 1 + + if [ -n "$recovery" ]; + then + # + # take out the first OSD used to store the object and + # check the object can still be retrieved, which implies + # recovery + # + local -a initial_osds=($(get_osds $poolname $objname)) + local last=$((${#initial_osds[@]} - 1)) + ./ceph osd out ${initial_osds[$last]} || return 1 + ! 
get_osds $poolname $objname | grep '\<'${initial_osds[$last]}'\>' || return 1 + rados_get $dir $poolname $objname $expect || return 1 + ./ceph osd in ${initial_osds[$last]} || return 1 + fi rm $dir/ORIGINAL } @@ -112,33 +148,28 @@ function rados_get_data_eio() { shift local shard_id=$1 shift - local osd_state=$1 + local recovery=$1 shift # inject eio to speificied shard - # OSD with eio injection will crash at reading object # local poolname=pool-jerasure local objname=obj-eio-$$-$shard_id local -a initial_osds=($(get_osds $poolname $objname)) local osd_id=${initial_osds[$shard_id]} local last=$((${#initial_osds[@]} - 1)) - # set_config osd $osd_id filestore_debug_inject_read_err true || return 1 set_config osd $osd_id filestore_debug_inject_read_err true || return 1 CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.$osd_id.asok \ injectdataerr $poolname $objname $shard_id || return 1 - set_config osd $osd_id filestore_fail_eio false || return 1 - - rados_put_get $dir $poolname $objname || return 1 - TIMEOUT=1 wait_for_osd $osd_state $osd_id || return 1 + rados_put_get $dir $poolname $objname 1 $recovery || return 1 } # # These two test cases try to validate the following behavior: # For object on EC pool, if there is one shard having read error ( -# either primary or replica), it will trigger OSD crash. +# either primary or replica), client gets the read error back. 
# -function TEST_rados_get_without_subreadall_eio_shard_0() { +function TEST_rados_get_subread_eio_shard_0() { local dir=$1 setup_osds false || return 1 @@ -146,11 +177,11 @@ function TEST_rados_get_without_subreadall_eio_shard_0() { create_erasure_coded_pool $poolname || return 1 # inject eio on primary OSD (0) local shard_id=0 - rados_get_data_eio $dir $shard_id down || return 1 + rados_get_data_eio $dir $shard_id || return 1 delete_pool $poolname } -function TEST_rados_get_without_subreadall_eio_shard_1() { +function TEST_rados_get_subread_eio_shard_1() { local dir=$1 setup_osds false || return 1 @@ -158,11 +189,10 @@ function TEST_rados_get_without_subreadall_eio_shard_1() { create_erasure_coded_pool $poolname || return 1 # inject eio into replica OSD (1) local shard_id=1 - rados_get_data_eio $dir $shard_id down || return 1 + rados_get_data_eio $dir $shard_id || return 1 delete_pool $poolname } - : <<'DISABLED_TESTS' # this test case is aimed to test the fix of https://github.com/ceph/ceph/pull/2952 # this test case can test both client read and recovery read on EIO @@ -182,7 +212,7 @@ function TEST_rados_get_with_subreadall_eio_shard_0() { create_erasure_coded_pool $poolname || return 1 # inject eio on primary OSD (0) local shard_id=0 - rados_get_data_eio $dir $shard_id up || return 1 + rados_get_data_eio $dir $shard_id recovery || return 1 check_pg_status $pg "inconsistent" || return 1 delete_pool $poolname @@ -198,7 +228,7 @@ function TEST_rados_get_with_subreadall_eio_shard_1() { create_erasure_coded_pool $poolname || return 1 # inject eio on replica OSD (1) local shard_id=1 - rados_get_data_eio $dir $shard_id up || return 1 + rados_get_data_eio $dir $shard_id recovery || return 1 # the reason to skip this check when current shardid != 0 is that the first # k chunks returned is not always containing current shardid, so this pg may From a7c6b6a975b2d7ac6cf9a0d10c24b6c3585077f3 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Tue, 28 Jul 2015 20:27:16 -0700 
Subject: [PATCH 178/654] test: Adding testing of shard with incorrect size Signed-off-by: David Zafman --- src/test/erasure-code/test-erasure-eio.sh | 58 +++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/src/test/erasure-code/test-erasure-eio.sh b/src/test/erasure-code/test-erasure-eio.sh index 769ff1f6c3e44..7a1c0c0c171d7 100755 --- a/src/test/erasure-code/test-erasure-eio.sh +++ b/src/test/erasure-code/test-erasure-eio.sh @@ -164,6 +164,38 @@ function rados_get_data_eio() { rados_put_get $dir $poolname $objname 1 $recovery || return 1 } +function rados_get_data_bad_size() { + local dir=$1 + shift + local shard_id=$1 + shift + local bytes=$1 + shift + local mode=${1:-set} + + # inject eio to speificied shard + # + local poolname=pool-jerasure + local objname=obj-size-$$-$shard_id-$bytes + local -a initial_osds=($(get_osds $poolname $objname)) + local osd_id=${initial_osds[$shard_id]} + local last=$((${#initial_osds[@]} - 1)) + rados_put $dir $poolname $objname || return 1 + if [ "$mode" = "add" ]; + then + objectstore_tool $dir $osd_id $objname get-bytes $dir/CORRUPT || return 1 + dd if=/dev/urandom bs=$bytes count=1 >> $dir/CORRUPT + elif [ "$bytes" = "0" ]; + then + touch $dir/CORRUPT + else + dd if=/dev/urandom bs=$bytes count=1 of=$dir/CORRUPT + fi + objectstore_tool $dir $osd_id $objname set-bytes $dir/CORRUPT || return 1 + rm -f $dir/CORRUPT + rados_get $dir $poolname $objname 1 || return 1 +} + # # These two test cases try to validate the following behavior: # For object on EC pool, if there is one shard having read error ( @@ -193,6 +225,32 @@ function TEST_rados_get_subread_eio_shard_1() { delete_pool $poolname } +function TEST_rados_get_bad_size_shard_0() { + local dir=$1 + setup_osds false || return 1 + + local poolname=pool-jerasure + create_erasure_coded_pool $poolname || return 1 + local shard_id=0 + rados_get_data_bad_size $dir $shard_id 10 || return 1 + rados_get_data_bad_size $dir $shard_id 0 || return 1 + 
rados_get_data_bad_size $dir $shard_id 256 add || return 1 + delete_pool $poolname +} + +function TEST_rados_get_bad_size_shard_1() { + local dir=$1 + setup_osds false || return 1 + + local poolname=pool-jerasure + create_erasure_coded_pool $poolname || return 1 + local shard_id=1 + rados_get_data_bad_size $dir $shard_id 10 || return 1 + rados_get_data_bad_size $dir $shard_id 0 || return 1 + rados_get_data_bad_size $dir $shard_id 256 add || return 1 + delete_pool $poolname +} + : <<'DISABLED_TESTS' # this test case is aimed to test the fix of https://github.com/ceph/ceph/pull/2952 # this test case can test both client read and recovery read on EIO From 5bfa75c255fc311a50e6e38eb22aa8821521aac3 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Mon, 10 Aug 2015 21:01:52 -0700 Subject: [PATCH 179/654] osd: Drop errors if enough copies are good redundant reads come in Signed-off-by: David Zafman --- src/osd/ECBackend.cc | 64 ++++++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index 1629cb8c5d332..15c7666d97428 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -992,6 +992,7 @@ void ECBackend::handle_sub_read_reply( map::iterator iter = tid_to_read_map.find(op.tid); if (iter == tid_to_read_map.end()) { //canceled + dout(10) << __func__ << ": abort " << op << dendl; return; } ReadOp &rop = iter->second; @@ -1002,6 +1003,7 @@ void ECBackend::handle_sub_read_reply( assert(!op.errors.count(i->first)); // If attribute error we better not have sent a buffer if (!rop.to_read.count(i->first)) { // We canceled this read! @see filter_read_op +dout(0) << __func__ << " to_read skipping" << dendl; continue; } list >::const_iterator req_iter = @@ -1028,6 +1030,7 @@ void ECBackend::handle_sub_read_reply( assert(!op.errors.count(i->first)); // if read error better not have sent an attribute if (!rop.to_read.count(i->first)) { // We canceled this read! 
@see filter_read_op +dout(0) << __func__ << " to_read skipping" << dendl; continue; } rop.complete[i->first].attrs = map(); @@ -1040,7 +1043,8 @@ void ECBackend::handle_sub_read_reply( make_pair( from, i->second)); - if (rop.complete[i->first].r == 0) +dout(0) << __func__ << " shard=" << from << " error=" << i->second << dendl; + if (!rop.do_redundant_reads && rop.complete[i->first].r == 0) rop.complete[i->first].r = i->second; } @@ -1052,32 +1056,46 @@ void ECBackend::handle_sub_read_reply( assert(rop.in_progress.count(from)); rop.in_progress.erase(from); - bool is_complete = true; - if (!rop.in_progress.empty()) { - if (rop.do_redundant_reads) { - for (map::const_iterator iter = - rop.complete.begin(); - iter != rop.complete.end(); - ++iter) { - set have; - for (map::const_iterator j = - iter->second.returned.front().get<2>().begin(); - j != iter->second.returned.front().get<2>().end(); - ++j) { - have.insert(j->first.shard); - } - set want_to_read, dummy_minimum; - get_want_to_read_shards(&want_to_read); - if (ec_impl->minimum_to_decode(want_to_read, have, &dummy_minimum) < 0) { - is_complete = false; - break; - } + bool is_complete = rop.in_progress.empty(); + if (rop.do_redundant_reads) { + for (map::const_iterator iter = + rop.complete.begin(); + iter != rop.complete.end(); + ++iter) { + set have; + for (map::const_iterator j = + iter->second.returned.front().get<2>().begin(); + j != iter->second.returned.front().get<2>().end(); + ++j) { + have.insert(j->first.shard); +dout(0) << __func__ << " have shard=" << j->first.shard << dendl; + } + set want_to_read, dummy_minimum; + get_want_to_read_shards(&want_to_read); + int err; + if ((err = ec_impl->minimum_to_decode(want_to_read, have, &dummy_minimum)) < 0) { +dout(0) << __func__ << " minimum_to_decode failed" << dendl; + if (is_complete) { + if (rop.complete[iter->first].errors.empty()) { +dout(0) << __func__ << " simply not enough copies err=" << err << dendl; + } else { + // Grab the first error + err = 
rop.complete[iter->first].errors.begin()->second; +dout(0) << __func__ << ": Use one of the shard errors err=" << err << dendl; + } + rop.complete[iter->first].r = err; + } + break; + } else { +dout(0) << __func__ << " Enough copies have come in ignore errors" << dendl; + is_complete = true; + rop.complete[iter->first].errors.clear(); + assert(rop.complete[iter->first].r == 0); } - } else { - is_complete = false; } } if (is_complete) { +dout(0) << __func__ << " Complete: " << rop << dendl; complete_read_op(rop, m); } else { dout(10) << __func__ << " readop not complete: " << rop << dendl; From c09c1192dbe20e190d91efd3f58dd3951b856987 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Tue, 11 Aug 2015 12:51:39 -0700 Subject: [PATCH 180/654] osd: Send reads to other shards if erasure coded chunk reads fail Handle errors in a common way whether redundant reads or not Signed-off-by: David Zafman --- src/osd/ECBackend.cc | 176 ++++++++++++++++++++++++++++++++++++++++--- src/osd/ECBackend.h | 16 +++- 2 files changed, 179 insertions(+), 13 deletions(-) diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index 15c7666d97428..47594cb932885 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -472,7 +472,7 @@ void ECBackend::dispatch_recovery_messages(RecoveryMessages &m, int priority) priority, m.reads, OpRequestRef(), - false); + false, true); } void ECBackend::continue_recovery_op( @@ -1044,8 +1044,6 @@ dout(0) << __func__ << " to_read skipping" << dendl; from, i->second)); dout(0) << __func__ << " shard=" << from << " error=" << i->second << dendl; - if (!rop.do_redundant_reads && rop.complete[i->first].r == 0) - rop.complete[i->first].r = i->second; } map >::iterator siter = @@ -1056,8 +1054,10 @@ dout(0) << __func__ << " shard=" << from << " error=" << i->second << dendl; assert(rop.in_progress.count(from)); rop.in_progress.erase(from); - bool is_complete = rop.in_progress.empty(); - if (rop.do_redundant_reads) { + unsigned is_complete = 0; + // For 
redundant reads check for completion as each shard comes in, + // or in a non-recovery read check for completion once all the shards read. + if (rop.do_redundant_reads || (!rop.for_recovery && rop.in_progress.empty())) { for (map::const_iterator iter = rop.complete.begin(); iter != rop.complete.end(); @@ -1073,9 +1073,20 @@ dout(0) << __func__ << " have shard=" << j->first.shard << dendl; set want_to_read, dummy_minimum; get_want_to_read_shards(&want_to_read); int err; + // XXX: Could just do if (have.size < ec_impl->get_data_chunk_count()) if ((err = ec_impl->minimum_to_decode(want_to_read, have, &dummy_minimum)) < 0) { dout(0) << __func__ << " minimum_to_decode failed" << dendl; - if (is_complete) { + if (rop.in_progress.empty()) { + // If we don't have enough copies and we haven't sent reads for all shards + // we can send the rest of the reads, if any. + if (!rop.do_redundant_reads) { + int r = objects_remaining_read_async(iter->first, rop); + if (r == 0) { + // We added to in_progress and not incrementing is_complete + continue; + } + // Couldn't read any additional shards so handle as completed with errors + } if (rop.complete[iter->first].errors.empty()) { dout(0) << __func__ << " simply not enough copies err=" << err << dendl; } else { @@ -1084,17 +1095,17 @@ dout(0) << __func__ << " simply not enough copies err=" << err << dendl; dout(0) << __func__ << ": Use one of the shard errors err=" << err << dendl; } rop.complete[iter->first].r = err; + ++is_complete; } - break; } else { -dout(0) << __func__ << " Enough copies have come in ignore errors" << dendl; - is_complete = true; +dout(0) << __func__ << " Enough copies for " << iter->first << " (ignore errors)" << dendl; + ++is_complete; rop.complete[iter->first].errors.clear(); assert(rop.complete[iter->first].r == 0); } } } - if (is_complete) { + if (rop.in_progress.empty() || is_complete == rop.complete.size()) { dout(0) << __func__ << " Complete: " << rop << dendl; complete_read_op(rop, m); } else { @@ 
-1470,11 +1481,50 @@ int ECBackend::get_min_avail_to_read_shards( return 0; } +int ECBackend::get_remaining_shards( + const hobject_t &hoid, + const set &avail, + set *to_read) +{ + map >::const_iterator miter = + get_parent()->get_missing_loc_shards().find(hoid); + + set need; + map shards; + + for (set::const_iterator i = + get_parent()->get_acting_shards().begin(); + i != get_parent()->get_acting_shards().end(); + ++i) { + dout(10) << __func__ << ": checking acting " << *i << dendl; + const pg_missing_t &missing = get_parent()->get_shard_missing(*i); + if (!missing.is_missing(hoid)) { + assert(!need.count(i->shard)); + need.insert(i->shard); + assert(!shards.count(i->shard)); + shards.insert(make_pair(i->shard, *i)); + } + } + + if (!to_read) + return 0; + + for (set::iterator i = need.begin(); + i != need.end(); + ++i) { + assert(shards.count(shard_id_t(*i))); + if (avail.find(*i) == avail.end()) + to_read->insert(shards[shard_id_t(*i)]); + } + return 0; +} + void ECBackend::start_read_op( int priority, map &to_read, OpRequestRef _op, - bool do_redundant_reads) + bool do_redundant_reads, + bool for_recovery) { ceph_tid_t tid = get_parent()->get_tid(); assert(!tid_to_read_map.count(tid)); @@ -1484,6 +1534,7 @@ void ECBackend::start_read_op( op.to_read.swap(to_read); op.op = _op; op.do_redundant_reads = do_redundant_reads; + op.for_recovery = for_recovery; dout(10) << __func__ << ": starting " << op << dendl; map messages; @@ -1549,6 +1600,71 @@ void ECBackend::start_read_op( dout(10) << __func__ << ": started " << op << dendl; } +void ECBackend::start_remaining_read_op( + ReadOp &op, + map &to_read) +{ + int priority = op.priority; + ceph_tid_t tid = op.tid; + op.to_read.swap(to_read); + + dout(10) << __func__ << ": starting additional " << op << dendl; + + map messages; + for (map::iterator i = op.to_read.begin(); + i != op.to_read.end(); + ++i) { + bool need_attrs = i->second.want_attrs; + for (set::const_iterator j = i->second.need.begin(); + j != 
i->second.need.end(); + ++j) { + if (need_attrs) { + messages[*j].attrs_to_read.insert(i->first); + need_attrs = false; + } + op.obj_to_source[i->first].insert(*j); + op.source_to_obj[*j].insert(i->first); + } + for (list >::const_iterator j = + i->second.to_read.begin(); + j != i->second.to_read.end(); + ++j) { + pair chunk_off_len = + sinfo.aligned_offset_len_to_chunk(make_pair(j->get<0>(), j->get<1>())); + for (set::const_iterator k = i->second.need.begin(); + k != i->second.need.end(); + ++k) { + messages[*k].to_read[i->first].push_back(boost::make_tuple(chunk_off_len.first, + chunk_off_len.second, + j->get<2>())); + } + assert(!need_attrs); + } + } + + for (map::iterator i = messages.begin(); + i != messages.end(); + ++i) { + op.in_progress.insert(i->first); + shard_to_read_map[i->first].insert(op.tid); + i->second.tid = tid; + MOSDECSubOpRead *msg = new MOSDECSubOpRead; + msg->set_priority(priority); + msg->pgid = spg_t( + get_parent()->whoami_spg_t().pgid, + i->first.shard); + msg->map_epoch = get_parent()->get_epoch(); + msg->op = i->second; + msg->op.from = get_parent()->whoami_shard(); + msg->op.tid = tid; + get_parent()->send_message_osd_cluster( + i->first.osd, + msg, + get_parent()->get_epoch()); + } + dout(10) << __func__ << ": started additional " << op << dendl; +} + ECUtil::HashInfoRef ECBackend::get_hash_info( const hobject_t &hoid, bool checks) { @@ -1820,11 +1936,47 @@ void ECBackend::objects_read_async( cct->_conf->osd_client_op_priority, for_read_op, OpRequestRef(), - fast_read); + fast_read, false); return; } +int ECBackend::objects_remaining_read_async( + const hobject_t &hoid, + ReadOp &rop) +{ + set already_read; + set ots = rop.obj_to_source[hoid]; + for (set::iterator i = ots.begin(); i != ots.end(); ++i) + already_read.insert(i->shard); + dout(10) << __func__ << " have/error shards=" << already_read << dendl; + set shards; + int r = get_remaining_shards(hoid, already_read, &shards); + if (r) + return r; + if (shards.empty()) + return 
-EIO; + + dout(10) << __func__ << " Read remaining shards " << shards << dendl; + + list > offsets = rop.to_read.find(hoid)->second.to_read; + GenContext &> *c = rop.to_read.find(hoid)->second.cb; + + map for_read_op; + for_read_op.insert( + make_pair( + hoid, + read_request_t( + hoid, + offsets, + shards, + false, + c))); + + start_remaining_read_op(rop, for_read_op); + return 0; +} + int ECBackend::objects_get_attrs( const hobject_t &hoid, map *out) diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h index f416e30f3f8d2..a039b70c8a8dc 100644 --- a/src/osd/ECBackend.h +++ b/src/osd/ECBackend.h @@ -297,6 +297,9 @@ class ECBackend : public PGBackend { // this is useful to tradeoff some resources (redundant ops) for // low latency read, especially on relatively idle cluster bool do_redundant_reads; + // True if reading for recovery which could possibly reading only a subset + // of the available shards. + bool for_recovery; map to_read; map complete; @@ -320,7 +323,13 @@ class ECBackend : public PGBackend { int priority, map &to_read, OpRequestRef op, - bool do_redundant_reads); + bool do_redundant_reads, bool for_recovery); + + void start_remaining_read_op(ReadOp &rop, + map &to_read); + int objects_remaining_read_async( + const hobject_t &hoid, + ReadOp &rop); /** @@ -470,6 +479,11 @@ class ECBackend : public PGBackend { set *to_read ///< [out] shards to read ); ///< @return error code, 0 on success + int get_remaining_shards( + const hobject_t &hoid, + const set &avail, + set *to_read); + int objects_get_attrs( const hobject_t &hoid, map *out); From d3b06edfbe4cc2338ceffb56d398ae9ed1e673c5 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 13 Aug 2015 11:37:36 -0700 Subject: [PATCH 181/654] test: Fix comment in test-erasure-eio.sh Signed-off-by: David Zafman --- src/test/erasure-code/test-erasure-eio.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/erasure-code/test-erasure-eio.sh b/src/test/erasure-code/test-erasure-eio.sh 
index 7a1c0c0c171d7..47f97031dd357 100755 --- a/src/test/erasure-code/test-erasure-eio.sh +++ b/src/test/erasure-code/test-erasure-eio.sh @@ -173,7 +173,7 @@ function rados_get_data_bad_size() { shift local mode=${1:-set} - # inject eio to speificied shard + # Change the size of speificied shard # local poolname=pool-jerasure local objname=obj-size-$$-$shard_id-$bytes From 70e000a9a42c50eda48f5d3b9e432ffc3a70f75b Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 13 Aug 2015 12:33:44 -0700 Subject: [PATCH 182/654] test: Fix to expect no errors on 1 bad shard and errors with 2 bad shards Signed-off-by: David Zafman --- src/test/erasure-code/test-erasure-eio.sh | 84 ++++++++++++++++++----- 1 file changed, 67 insertions(+), 17 deletions(-) diff --git a/src/test/erasure-code/test-erasure-eio.sh b/src/test/erasure-code/test-erasure-eio.sh index 47f97031dd357..fe465c93bda3d 100755 --- a/src/test/erasure-code/test-erasure-eio.sh +++ b/src/test/erasure-code/test-erasure-eio.sh @@ -143,6 +143,22 @@ function rados_put_get() { rm $dir/ORIGINAL } +function inject_eio() { + local objname=$1 + shift + local dir=$1 + shift + local shard_id=$1 + shift + + local poolname=pool-jerasure + local -a initial_osds=($(get_osds $poolname $objname)) + local osd_id=${initial_osds[$shard_id]} + set_config osd $osd_id filestore_debug_inject_read_err true || return 1 + CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.$osd_id.asok \ + injectdataerr $poolname $objname $shard_id || return 1 +} + function rados_get_data_eio() { local dir=$1 shift @@ -155,32 +171,30 @@ function rados_get_data_eio() { # local poolname=pool-jerasure local objname=obj-eio-$$-$shard_id - local -a initial_osds=($(get_osds $poolname $objname)) - local osd_id=${initial_osds[$shard_id]} - local last=$((${#initial_osds[@]} - 1)) - set_config osd $osd_id filestore_debug_inject_read_err true || return 1 - CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.$osd_id.asok \ - injectdataerr $poolname $objname $shard_id || return 
1 - rados_put_get $dir $poolname $objname 1 $recovery || return 1 + inject_eio $objname $dir $shard_id || return 1 + rados_put_get $dir $poolname $objname 0 $recovery || return 1 + + shard_id=$(expr $shard_id + 1) + inject_eio $objname $dir $shard_id || return 1 + rados_get $dir $poolname $objname 1 || return 1 } -function rados_get_data_bad_size() { +# Change the size of speificied shard +# +function set_size() { + local objname=$1 + shift local dir=$1 shift local shard_id=$1 shift local bytes=$1 shift - local mode=${1:-set} + local mode=${1} - # Change the size of speificied shard - # local poolname=pool-jerasure - local objname=obj-size-$$-$shard_id-$bytes local -a initial_osds=($(get_osds $poolname $objname)) local osd_id=${initial_osds[$shard_id]} - local last=$((${#initial_osds[@]} - 1)) - rados_put $dir $poolname $objname || return 1 if [ "$mode" = "add" ]; then objectstore_tool $dir $osd_id $objname get-bytes $dir/CORRUPT || return 1 @@ -191,15 +205,42 @@ function rados_get_data_bad_size() { else dd if=/dev/urandom bs=$bytes count=1 of=$dir/CORRUPT fi + objectstore_tool $dir $osd_id --op list $objname objectstore_tool $dir $osd_id $objname set-bytes $dir/CORRUPT || return 1 rm -f $dir/CORRUPT +} + +function rados_get_data_bad_size() { + local dir=$1 + shift + local shard_id=$1 + shift + local bytes=$1 + shift + local mode=${1:-set} + + local poolname=pool-jerasure + local objname=obj-size-$$-$shard_id-$bytes + rados_put $dir $poolname $objname || return 1 + + # Change the size of speificied shard + # + set_size $objname $dir $shard_id $bytes $mode || return 1 + + rados_get $dir $poolname $objname 0 || return 1 + + # Leave objname and modify another shard + shard_id=$(expr $shard_id + 1) + set_size $objname $dir $shard_id $bytes $mode || return 1 rados_get $dir $poolname $objname 1 || return 1 } # # These two test cases try to validate the following behavior: # For object on EC pool, if there is one shard having read error ( -# either primary or replica), 
client gets the read error back. +# either primary or replica), client can still read object. +# +# If 2 shards have read errors the client will get an error. # function TEST_rados_get_subread_eio_shard_0() { local dir=$1 @@ -207,7 +248,7 @@ function TEST_rados_get_subread_eio_shard_0() { local poolname=pool-jerasure create_erasure_coded_pool $poolname || return 1 - # inject eio on primary OSD (0) + # inject eio on primary OSD (0) and replica OSD (1) local shard_id=0 rados_get_data_eio $dir $shard_id || return 1 delete_pool $poolname @@ -219,18 +260,26 @@ function TEST_rados_get_subread_eio_shard_1() { local poolname=pool-jerasure create_erasure_coded_pool $poolname || return 1 - # inject eio into replica OSD (1) + # inject eio into replicas OSD (1) and OSD (2) local shard_id=1 rados_get_data_eio $dir $shard_id || return 1 delete_pool $poolname } +# +# These two test cases try to validate that following behavior: +# For object on EC pool, if there is one shard which an incorrect +# size this will cause an internal read error, client can still read object. +# +# If 2 shards have incorrect size the client will get an error. 
+# function TEST_rados_get_bad_size_shard_0() { local dir=$1 setup_osds false || return 1 local poolname=pool-jerasure create_erasure_coded_pool $poolname || return 1 + # Set incorrect size into primary OSD (0) and replica OSD (1) local shard_id=0 rados_get_data_bad_size $dir $shard_id 10 || return 1 rados_get_data_bad_size $dir $shard_id 0 || return 1 @@ -244,6 +293,7 @@ function TEST_rados_get_bad_size_shard_1() { local poolname=pool-jerasure create_erasure_coded_pool $poolname || return 1 + # Set incorrect size into replicas OSD (1) and OSD (2) local shard_id=1 rados_get_data_bad_size $dir $shard_id 10 || return 1 rados_get_data_bad_size $dir $shard_id 0 || return 1 From a9c1601e8abd23ae4cddf8cbafb1924395b66ca8 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 27 Aug 2015 16:21:50 -0700 Subject: [PATCH 183/654] cls: Fix successful return found by compiler warning Signed-off-by: David Zafman --- src/cls/hello/cls_hello.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cls/hello/cls_hello.cc b/src/cls/hello/cls_hello.cc index d1adbd4a4cf7c..3bc78647366e2 100644 --- a/src/cls/hello/cls_hello.cc +++ b/src/cls/hello/cls_hello.cc @@ -268,6 +268,7 @@ class PGLSHelloFilter : public PGLSFilter { } catch (buffer::error &e) { return -EINVAL; } + return 0; } virtual ~PGLSHelloFilter() {} From e92d2f38c5940a4ee2fc38ea23f8b9043c46ebe7 Mon Sep 17 00:00:00 2001 From: guce Date: Fri, 28 Aug 2015 17:56:19 +0800 Subject: [PATCH 184/654] h3c mail organization map update .mailmap and .organizationmap for h3c organization Signed-off-by: Ce Gu --- .mailmap | 11 +++++++++++ .organizationmap | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/.mailmap b/.mailmap index 3e8b2788dda53..1e13f36c41605 100644 --- a/.mailmap +++ b/.mailmap @@ -242,3 +242,14 @@ Zhiqiang Wang Signed-off-by: Zhiqiang Wang Wang, Zhiqiang Zhiqiang Wang Zhiqiang Wang Zhiqiang Wang Zhiqiang Wang +Ce Gu guce +Zengran Zhang Aran85 +Weijun Duan duanweijun +Lu Shi s09816 
+Na Xie x11562 +Ruifeng Yang Ruifeng Yang <149233652@qq.com> +Xiaowei Chen shawn chen +Xuan Liu l11625 +Donghai Xu x11507 +Mingyue Zhao mingyuez +Qiankun Zheng zqkqkz diff --git a/.organizationmap b/.organizationmap index 3b64ad1b28a63..381f95299d095 100644 --- a/.organizationmap +++ b/.organizationmap @@ -380,6 +380,44 @@ Yahoo! Wei Luo Yahoo! Xihui He Yahoo! Zhi (David) Zhang YouScribe Guilhem Lettron +H3C Ce Gu +H3C Zengran Zhang +H3C Weijun Duan +H3C Lu Shi +H3C Na Xie +H3C Ruifeng Yang +H3C Xiaowei Chen +H3C Xuan Liu +H3C Donghai Xu +H3C Mingyue Zhao +H3C Qiankun Zheng +H3C Sangdi Xu +H3C Bo Cai +H3C Yue Zhu +H3C Fei Wang +H3C Ni Dang +H3C Shan Li +H3C Xiaofeng Feng +H3C Peiyang Liu +H3C Zeqiang Zhuang +H3C Ming Zou +H3C Bingxin Yang +H3C Xiangwei Wu +H3C Jie Chen +H3C Jie Li +H3C Zhanyang Chen +H3C Yehua Chen +H3C Siyuan Zhou +H3C Qiang Guo +H3C Yanbin Wu +H3C Kongming Wu +H3C Bin Zheng +H3C Chunyan Ma +H3C Yongqiang He +H3C Xudong Cao +H3C Yunhui Chen +H3C Tingting Chi +H3C Wenfeng Wang # # Local Variables: # compile-command: "git log --pretty='%aN <%aE>' | \ From c901e855a5d999eb2d0b71457b7ea772e75b8a27 Mon Sep 17 00:00:00 2001 From: Sangdi Xu Date: Fri, 28 Aug 2015 18:19:23 +0800 Subject: [PATCH 185/654] doc:radosgw: correct typos of the command removing a subuser Fix typos in the example command removing a subuser, and delete the 'The Ceph Object Gateway' tag as it should not appear there. Signed-off-by: Sangdi Xu --- doc/radosgw/admin.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/radosgw/admin.rst b/doc/radosgw/admin.rst index 849f236e3c09a..e10745ff48010 100644 --- a/doc/radosgw/admin.rst +++ b/doc/radosgw/admin.rst @@ -182,7 +182,7 @@ subuser), specify ``user rm`` and the user ID. :: To remove the subuser only, specify ``subuser rm`` and the subuser ID. 
:: - radosgw-admin subuser rm --uid=johndoe:swift + radosgw-admin subuser rm --subuser=johndoe:swift Options include: @@ -198,10 +198,10 @@ Remove a Subuser ---------------- When you remove a sub user, you are removing access to the Swift interface. -The user will remain in the system. The Ceph Object Gateway To remove the subuser, specify +The user will remain in the system. To remove the subuser, specify ``subuser rm`` and the subuser ID. :: - radosgw-admin subuser rm --uid=johndoe:swift + radosgw-admin subuser rm --subuser=johndoe:swift From 51e6b710afbf02cbe8bb277d6bc31528b3e1cb42 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Fri, 28 Aug 2015 12:31:20 +0200 Subject: [PATCH 186/654] mailmap: sort {organization,mail}map Signed-off-by: Loic Dachary --- .mailmap | 30 ++++++++-------- .organizationmap | 94 ++++++++++++++++++++++++------------------------ 2 files changed, 62 insertions(+), 62 deletions(-) diff --git a/.mailmap b/.mailmap index 1e13f36c41605..fc7a27fab218a 100644 --- a/.mailmap +++ b/.mailmap @@ -34,6 +34,7 @@ Caleb Miles caleb miles Caleb Miles Caleb Miles Carlos Maltzahn carlosm Casey Marshall rsdio +Ce Gu guce Chendi Xue Chendi.Xue Chendi Xue Chendi Xue Cheng Cheng cchengleo @@ -52,6 +53,7 @@ David Moreau Simard Dmitry Smirnov Dmitry Yatsushkevich Dominik Hannen Dominik Hannen +Donghai Xu x11507 Eric Mourgaya Erwin, Brock A Esteban Molina-Estolano eestolan @@ -68,8 +70,8 @@ Gary Lowell Gaurav Kumar Garg Gerben Meijer Greg Farnum -Greg Farnum Greg Farnum +Greg Farnum Greg Farnum Greg Farnu Greg Farnum Greg Farnum @@ -87,8 +89,8 @@ Holger Macht Huamin Chen rootfs Huang Jun huang jun Huang Jun huangjun -Ilya Dryomov Ilja Slepnev +Ilya Dryomov Ismael Serrano Jean-Charles Lopez jeanchlopez Jiang Heng jiangheng @@ -125,6 +127,7 @@ Loic Dachary Loic Dachary Loic Dachary Loic Dachary Loïc Dachary +Lu Shi s09816 Ma Jianpeng Jianpeng Ma Ma Jianpeng Ma Jianpeng Ma, Jianpeng @@ -137,19 +140,22 @@ Matthew Wodrich Michael Riederer dynamike67 Michael 
Rodriguez Michael Rodriguez +Mingyue Zhao mingyuez Mykola Golub Mykola Golub Nathan Cutler +Na Xie x11562 +Neha Ummareddy nehaummareddy Neil Levine Ning Yao Noah Watkins Noah Watkins -Neha Ummareddy nehaummareddy Pascal de Bruijn Pascal de Bruijn | Unilogic Networks B.V Patience Warnick Patrick McGarry scuttlemonkey Patrick McGarry Pavan Rallabhandi Pavan Rallabhandi Pete Zaitcev +Qiankun Zheng zqkqkz Riccardo Ferretti rferrett Roald J. van Loon Robert Jansen @@ -157,6 +163,7 @@ Robin Dehu Ron Allred rallred Ross Turk Ross Turk +Ruifeng Yang Ruifeng Yang <149233652@qq.com> Sage Weil Sage Weil Sage Weil @@ -207,18 +214,21 @@ Tommi Virtanen Tommi Virtanen Travis Rhoden Tyler Brekke Concubidated +Varada Kari Volker Assmann Volker Assmann -Varada Kari Walter Huf Walter J. Huf Wang, Yaguang ywang19 Warren Usui wusui +Weijun Duan duanweijun Wei Luo luowei Wido den Hollander Xan Peng xan Xavier Roche +Xiaowei Chen shawn chen Xie Rui <875016668@qq.com> Jerry7X <875016668@qq.com> Xingyi Wu +Xuan Liu l11625 Yan, Zheng Yan, Zheng Zheng Yan Yan, Zheng Zheng, Yan @@ -236,20 +246,10 @@ Yehuda Sadeh Yehuda Sadeh Yongyue Sun Abioy Yuan Zhou +Zengran Zhang Aran85 Zhi (David) Zhang Zhi (David) Zhang Zhi Z Zhang Zhiqiang Wang Signed-off-by: Zhiqiang Wang Zhiqiang Wang Wang, Zhiqiang Zhiqiang Wang Zhiqiang Wang Zhiqiang Wang Zhiqiang Wang -Ce Gu guce -Zengran Zhang Aran85 -Weijun Duan duanweijun -Lu Shi s09816 -Na Xie x11562 -Ruifeng Yang Ruifeng Yang <149233652@qq.com> -Xiaowei Chen shawn chen -Xuan Liu l11625 -Donghai Xu x11507 -Mingyue Zhao mingyuez -Qiankun Zheng zqkqkz diff --git a/.organizationmap b/.organizationmap index 381f95299d095..0946f00b92458 100644 --- a/.organizationmap +++ b/.organizationmap @@ -50,8 +50,8 @@ Cloudwatt Christophe Courtaut Florent Flament Cloudwatt Loic Dachary Cloudwatt Sahid Orentino Ferdjaoui -CohortFS, LLC Matt Benjamin CohortFS, LLC Casey Bodley +CohortFS, LLC Matt Benjamin Commerce Guys Nikola Kotur Corvisa LLC Walter Huf Credit Mutuel Arkea Eric 
Mourgaya @@ -92,6 +92,44 @@ gocept gmbh & co. kg Christian Theune GRNet Filippos Giannakos GRNet Stratos Psomadakis GRNet Vangelis Koukis +H3C Bingxin Yang +H3C Bin Zheng +H3C Bo Cai +H3C Ce Gu +H3C Chunyan Ma +H3C Donghai Xu +H3C Fei Wang +H3C Jie Chen +H3C Jie Li +H3C Kongming Wu +H3C Lu Shi +H3C Mingyue Zhao +H3C Ming Zou +H3C Na Xie +H3C Ni Dang +H3C Peiyang Liu +H3C Qiang Guo +H3C Qiankun Zheng +H3C Ruifeng Yang +H3C Sangdi Xu +H3C Shan Li +H3C Siyuan Zhou +H3C Tingting Chi +H3C Weijun Duan +H3C Wenfeng Wang +H3C Xiangwei Wu +H3C Xiaofeng Feng +H3C Xiaowei Chen +H3C Xuan Liu +H3C Xudong Cao +H3C Yanbin Wu +H3C Yehua Chen +H3C Yongqiang He +H3C Yue Zhu +H3C Yunhui Chen +H3C Zengran Zhang +H3C Zeqiang Zhuang +H3C Zhanyang Chen Hastexo Florian Haas HGST Kevin Dalley HGST Lluis Pamies-Juarez @@ -148,8 +186,8 @@ IWeb David Moreau Simard Karlsruhe Institute of Technology Daniel J. Hofmann Keeper Technology Wyllys Ingersoll Lebanon Evangelical School Jonathan Dieter -Linaro Yazen Ghannam Linaro Steve Capper +Linaro Yazen Ghannam Los Alamos National Laboratory Esteban Molina-Estolano Mellanox Vu Pham Mirantis Andrew Woodward @@ -160,8 +198,8 @@ Mirantis Mykola Golub Mirantis Radoslaw Zarzynski MIT Computer Science and Artificial Intelligence Laboratory Stephen Jahl MSys Technologies Rajesh Nambiar -Nebula Chris Holcombe Nebula Anton Aksola +Nebula Chris Holcombe Opower Derrick Schneider Pacific Northwest National Laboratory Brown, David M JR Pacific Northwest National Laboratory Erwin, Brock A @@ -214,8 +252,8 @@ Red Hat Tamil Muthamizhan Red Hat Tom Callaway Red Hat Travis Rhoden Red Hat Tyler Brekke -Red Hat Venky Shankar Red Hat Vasu Kulkarni +Red Hat Venky Shankar Red Hat Warren Usui Red Hat Yan, Zheng Red Hat Yehuda Sadeh @@ -292,11 +330,10 @@ Unaffiliated Florian Coste Unaffiliated Florian Marsylle Unaffiliated François Lafont Unaffiliated Frank Yu -Unaffiliated Jon Bernard Unaffiliated Gaurav Kumar Garg -Unaffiliated Huang Jun Unaffiliated Haomai Wang 
Unaffiliated Henry Chang +Unaffiliated Huang Jun Unaffiliated Ian Kelling Unaffiliated Ilja Slepnev Unaffiliated Ismael Serrano @@ -305,6 +342,7 @@ Unaffiliated Javier Guerra Unaffiliated Jiang Heng Unaffiliated Jiantao He Unaffiliated Jian Wen +Unaffiliated Jon Bernard Unaffiliated Karel Striegel Unaffiliated Kefu Chai Unaffiliated Kernel Neophyte @@ -330,23 +368,23 @@ Unaffiliated Shawn Edwards Unaffiliated Simon Guinot Unaffiliated Stephen F Taylor Unaffiliated Steve Stock -Unaffiliated Tim Freund Unaffiliated Thomas Johnson +Unaffiliated Tim Freund Unaffiliated Vartika Rai Unaffiliated Vicente Cheng Unaffiliated Viktor Suprun Unaffiliated Volker Voigt Unaffiliated VRan Liu Unaffiliated William A. Kennington III -Unaffiliated Xingyi Wu Unaffiliated Xan Peng Unaffiliated Xie Rui <875016668@qq.com> +Unaffiliated Xingyi Wu Unaffiliated Xinze Chi Unaffiliated Xiong Yiliang Unaffiliated Yann Dupont Unaffiliated Yongyue Sun -Unaffiliated Zhicheng Wei Unaffiliated Zhe Zhang +Unaffiliated Zhicheng Wei Unilogic Networks B.V Pascal de Bruijn UnitedStack Dong Yuan UnitedStack Guangliang Zhao @@ -380,44 +418,6 @@ Yahoo! Wei Luo Yahoo! Xihui He Yahoo! 
Zhi (David) Zhang YouScribe Guilhem Lettron -H3C Ce Gu -H3C Zengran Zhang -H3C Weijun Duan -H3C Lu Shi -H3C Na Xie -H3C Ruifeng Yang -H3C Xiaowei Chen -H3C Xuan Liu -H3C Donghai Xu -H3C Mingyue Zhao -H3C Qiankun Zheng -H3C Sangdi Xu -H3C Bo Cai -H3C Yue Zhu -H3C Fei Wang -H3C Ni Dang -H3C Shan Li -H3C Xiaofeng Feng -H3C Peiyang Liu -H3C Zeqiang Zhuang -H3C Ming Zou -H3C Bingxin Yang -H3C Xiangwei Wu -H3C Jie Chen -H3C Jie Li -H3C Zhanyang Chen -H3C Yehua Chen -H3C Siyuan Zhou -H3C Qiang Guo -H3C Yanbin Wu -H3C Kongming Wu -H3C Bin Zheng -H3C Chunyan Ma -H3C Yongqiang He -H3C Xudong Cao -H3C Yunhui Chen -H3C Tingting Chi -H3C Wenfeng Wang # # Local Variables: # compile-command: "git log --pretty='%aN <%aE>' | \ From fbbe5b05e8db71dcb96ad699a30d6edc4b94d830 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Fri, 28 Aug 2015 12:46:14 +0200 Subject: [PATCH 187/654] mailmap: make h3c mailmap more robust Signed-off-by: Loic Dachary --- .mailmap | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.mailmap b/.mailmap index fc7a27fab218a..a59fa711fe668 100644 --- a/.mailmap +++ b/.mailmap @@ -34,7 +34,7 @@ Caleb Miles caleb miles Caleb Miles Caleb Miles Carlos Maltzahn carlosm Casey Marshall rsdio -Ce Gu guce +Ce Gu Chendi Xue Chendi.Xue Chendi Xue Chendi Xue Cheng Cheng cchengleo @@ -53,7 +53,7 @@ David Moreau Simard Dmitry Smirnov Dmitry Yatsushkevich Dominik Hannen Dominik Hannen -Donghai Xu x11507 +Donghai Xu Eric Mourgaya Erwin, Brock A Esteban Molina-Estolano eestolan @@ -127,7 +127,7 @@ Loic Dachary Loic Dachary Loic Dachary Loic Dachary Loïc Dachary -Lu Shi s09816 +Lu Shi Ma Jianpeng Jianpeng Ma Ma Jianpeng Ma Jianpeng Ma, Jianpeng @@ -140,10 +140,10 @@ Matthew Wodrich Michael Riederer dynamike67 Michael Rodriguez Michael Rodriguez -Mingyue Zhao mingyuez +Mingyue Zhao Mykola Golub Mykola Golub Nathan Cutler -Na Xie x11562 +Na Xie Neha Ummareddy nehaummareddy Neil Levine Ning Yao @@ -155,7 +155,7 @@ Patrick McGarry scuttlemonkey 
Patrick McGarry Pavan Rallabhandi Pavan Rallabhandi Pete Zaitcev -Qiankun Zheng zqkqkz +Qiankun Zheng Riccardo Ferretti rferrett Roald J. van Loon Robert Jansen @@ -163,7 +163,7 @@ Robin Dehu Ron Allred rallred Ross Turk Ross Turk -Ruifeng Yang Ruifeng Yang <149233652@qq.com> +Ruifeng Yang <149233652@qq.com> Sage Weil Sage Weil Sage Weil @@ -220,15 +220,15 @@ Volker Assmann Walter Huf Walter J. Huf Wang, Yaguang ywang19 Warren Usui wusui -Weijun Duan duanweijun +Weijun Duan Wei Luo luowei Wido den Hollander Xan Peng xan Xavier Roche -Xiaowei Chen shawn chen +Xiaowei Chen Xie Rui <875016668@qq.com> Jerry7X <875016668@qq.com> Xingyi Wu -Xuan Liu l11625 +Xuan Liu Yan, Zheng Yan, Zheng Zheng Yan Yan, Zheng Zheng, Yan @@ -246,7 +246,7 @@ Yehuda Sadeh Yehuda Sadeh Yongyue Sun Abioy Yuan Zhou -Zengran Zhang Aran85 +Zengran Zhang Zhi (David) Zhang Zhi (David) Zhang Zhi Z Zhang Zhiqiang Wang Signed-off-by: Zhiqiang Wang From cfcacb89a195da9571a7a23fe6371cabe60ed48b Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Mon, 27 Jul 2015 21:17:31 +0100 Subject: [PATCH 188/654] mon: LogMonitor: handle boolean options consistently 'mon_cluster_log_to_syslog' gets a string of key/value pairs, values in the pair being booleans, and keys being optional (one can simply specify the value). However, we weren't being consistent with the boolean behavior when handling option values. e.g., the user expects '1' and '0' to mean 'true' and 'false' respectively, and expects 'mon_cluster_log_to_syslog' to understand both '1' and '0', alongside with 'true' and 'false'. All values not 'true' or '1' will be considered 'false'. 
Fixes: #12325 Signed-off-by: Joao Eduardo Luis --- src/mon/LogMonitor.cc | 30 ++++++++++++++++++++++++++++++ src/mon/LogMonitor.h | 5 +---- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/src/mon/LogMonitor.cc b/src/mon/LogMonitor.cc index c571c69f818fb..b5ab44712fd0b 100644 --- a/src/mon/LogMonitor.cc +++ b/src/mon/LogMonitor.cc @@ -12,6 +12,8 @@ * */ +#include + #include #include @@ -28,6 +30,7 @@ #include "osd/osd_types.h" #include "common/errno.h" #include "common/config.h" +#include "common/strtol.h" #include "include/assert.h" #include "include/str_list.h" #include "include/str_map.h" @@ -673,6 +676,33 @@ string LogMonitor::log_channel_info::expand_channel_meta( return s; } +bool LogMonitor::log_channel_info::do_log_to_syslog(const string &channel) { + string v = get_str_map_key(log_to_syslog, channel, + &CLOG_CONFIG_DEFAULT_KEY); + // We expect booleans, but they are in k/v pairs, kept + // as strings, in 'log_to_syslog'. We must ensure + // compatibility with existing boolean handling, and so + // we are here using a modified version of how + // md_config_t::set_val_raw() handles booleans. We will + // accept both 'true' and 'false', but will also check for + // '1' and '0'. The main distiction between this and the + // original code is that we will assume everything not '1', + // '0', 'true' or 'false' to be 'false'. 
+ bool ret = false; + + if (boost::iequals(v, "false")) { + ret = false; + } else if (boost::iequals(v, "true")) { + ret = true; + } else { + std::string err; + int b = strict_strtol(v.c_str(), 10, &err); + ret = (err.empty() && b == 1); + } + + return ret; +} + void LogMonitor::handle_conf_change(const struct md_config_t *conf, const std::set &changed) { diff --git a/src/mon/LogMonitor.h b/src/mon/LogMonitor.h index 6dcee8b85cbcb..4d31b66b3d13c 100644 --- a/src/mon/LogMonitor.h +++ b/src/mon/LogMonitor.h @@ -69,10 +69,7 @@ class LogMonitor : public PaxosService, string expand_channel_meta(const string &input, const string &change_to); - bool do_log_to_syslog(const string &channel) { - return (get_str_map_key(log_to_syslog, channel, - &CLOG_CONFIG_DEFAULT_KEY) == "true"); - } + bool do_log_to_syslog(const string &channel); string get_facility(const string &channel) { return get_str_map_key(syslog_facility, channel, From afa92e5743832f8fd3956bd1eac695daa07ad5ff Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Fri, 28 Aug 2015 06:01:53 -0700 Subject: [PATCH 189/654] common/SubProcess: silence compiler warnings * the STD{IN,OUT,ERR}_FILENO are closed when being dup2'ed so we can not write to cout or cerr in child processes, as these fstream are still holding the old references. * to silence every write(2) call is tedious. 
better off overwriting the raw buffer in cerr and cout ostreams Fixes: #12730 Signed-off-by: Kefu Chai --- src/common/SubProcess.h | 57 ++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/src/common/SubProcess.h b/src/common/SubProcess.h index 3d739849193d4..60dd4aef67205 100644 --- a/src/common/SubProcess.h +++ b/src/common/SubProcess.h @@ -212,6 +212,25 @@ const char* SubProcess::err() const { return errstr.str().c_str(); } +class fd_buf : public std::streambuf { + int fd; +public: + fd_buf (int fd) : fd(fd) + {} +protected: + int_type overflow (int_type c) override { + if (c == EOF) return EOF; + char buf = c; + if (write (fd, &buf, 1) != 1) { + return EOF; + } + return c; + } + std::streamsize xsputn (const char* s, std::streamsize count) override { + return write(fd, s, count); + } +}; + int SubProcess::spawn() { assert(!is_spawned()); assert(stdin_pipe_out_fd == -1); @@ -255,10 +274,14 @@ int SubProcess::spawn() { if (opipe[OUT] != -1 && opipe[OUT] != STDOUT_FILENO) { ::dup2(opipe[OUT], STDOUT_FILENO); close(opipe[OUT]); + static fd_buf buf(STDOUT_FILENO); + std::cout.rdbuf(&buf); } if (epipe[OUT] != -1 && epipe[OUT] != STDERR_FILENO) { ::dup2(epipe[OUT], STDERR_FILENO); close(epipe[OUT]); + static fd_buf buf(STDERR_FILENO); + std::cerr.rdbuf(&buf); } int maxfd = sysconf(_SC_OPEN_MAX); @@ -307,9 +330,7 @@ void SubProcess::exec() { int ret = execvp(cmd.c_str(), (char * const *)&args[0]); assert(ret == -1); - std::ostringstream err; - err << cmd << ": exec failed: " << cpp_strerror(errno) << "\n"; - write(STDERR_FILENO, err.str().c_str(), err.str().size()); + std::cerr << cmd << ": exec failed: " << cpp_strerror(errno) << "\n"; _exit(EXIT_FAILURE); } @@ -363,24 +384,23 @@ void SubProcessTimed::exec() { } sigset_t mask, oldmask; - std::ostringstream err; int pid; // Restore default action for SIGTERM in case the parent process decided // to ignore it. 
if (signal(SIGTERM, SIG_DFL) == SIG_ERR) { - err << cmd << ": signal failed: " << cpp_strerror(errno) << "\n"; + std::cerr << cmd << ": signal failed: " << cpp_strerror(errno) << "\n"; goto fail_exit; } // Because SIGCHLD is ignored by default, setup dummy handler for it, // so we can mask it. if (signal(SIGCHLD, dummy_sighandler) == SIG_ERR) { - err << cmd << ": signal failed: " << cpp_strerror(errno) << "\n"; + std::cerr << cmd << ": signal failed: " << cpp_strerror(errno) << "\n"; goto fail_exit; } // Setup timeout handler. if (signal(SIGALRM, timeout_sighandler) == SIG_ERR) { - err << cmd << ": signal failed: " << cpp_strerror(errno) << "\n"; + std::cerr << cmd << ": signal failed: " << cpp_strerror(errno) << "\n"; goto fail_exit; } // Block interesting signals. @@ -390,21 +410,21 @@ void SubProcessTimed::exec() { sigaddset(&mask, SIGCHLD); sigaddset(&mask, SIGALRM); if (sigprocmask(SIG_SETMASK, &mask, &oldmask) == -1) { - err << cmd << ": sigprocmask failed: " << cpp_strerror(errno) << "\n"; + std::cerr << cmd << ": sigprocmask failed: " << cpp_strerror(errno) << "\n"; goto fail_exit; } pid = fork(); if (pid == -1) { - err << cmd << ": fork failed: " << cpp_strerror(errno) << "\n"; + std::cerr << cmd << ": fork failed: " << cpp_strerror(errno) << "\n"; goto fail_exit; } if (pid == 0) { // Child // Restore old sigmask. if (sigprocmask(SIG_SETMASK, &oldmask, NULL) == -1) { - err << cmd << ": sigprocmask failed: " << cpp_strerror(errno) << "\n"; + std::cerr << cmd << ": sigprocmask failed: " << cpp_strerror(errno) << "\n"; goto fail_exit; } (void)setpgid(0, 0); // Become process group leader. 
@@ -418,48 +438,45 @@ void SubProcessTimed::exec() { for (;;) { int signo; if (sigwait(&mask, &signo) == -1) { - err << cmd << ": sigwait failed: " << cpp_strerror(errno) << "\n"; + std::cerr << cmd << ": sigwait failed: " << cpp_strerror(errno) << "\n"; goto fail_exit; } switch (signo) { case SIGCHLD: int status; if (waitpid(pid, &status, WNOHANG) == -1) { - err << cmd << ": waitpid failed: " << cpp_strerror(errno) << "\n"; + std::cerr << cmd << ": waitpid failed: " << cpp_strerror(errno) << "\n"; goto fail_exit; } - write(STDERR_FILENO, err.str().c_str(), err.str().size()); if (WIFEXITED(status)) _exit(WEXITSTATUS(status)); if (WIFSIGNALED(status)) _exit(128 + WTERMSIG(status)); - err << cmd << ": unknown status returned\n"; + std::cerr << cmd << ": unknown status returned\n"; goto fail_exit; case SIGINT: case SIGTERM: // Pass SIGINT and SIGTERM, which are usually used to terminate // a process, to the child. if (::kill(pid, signo) == -1) { - err << cmd << ": kill failed: " << cpp_strerror(errno) << "\n"; + std::cerr << cmd << ": kill failed: " << cpp_strerror(errno) << "\n"; goto fail_exit; } continue; case SIGALRM: - err << cmd << ": timed out (" << timeout << " sec)\n"; - write(STDERR_FILENO, err.str().c_str(), err.str().size()); + std::cerr << cmd << ": timed out (" << timeout << " sec)\n"; if (::killpg(pid, sigkill) == -1) { - err << cmd << ": kill failed: " << cpp_strerror(errno) << "\n"; + std::cerr << cmd << ": kill failed: " << cpp_strerror(errno) << "\n"; goto fail_exit; } continue; default: - err << cmd << ": sigwait: invalid signal: " << signo << "\n"; + std::cerr << cmd << ": sigwait: invalid signal: " << signo << "\n"; goto fail_exit; } } fail_exit: - write(STDERR_FILENO, err.str().c_str(), err.str().size()); _exit(EXIT_FAILURE); } From a3fc6e8356a82939dc6a536e16170431553b4b6d Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 28 Aug 2015 15:33:12 +0100 Subject: [PATCH 190/654] CMake: update for boost_random This became a dependency in dbcaa544 
Signed-off-by: John Spray --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1ad8d76e26350..59a7328572cfb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -224,7 +224,7 @@ else(${ENABLE_SHARED}) endif(${ENABLE_SHARED}) set(Boost_USE_MULTITHREADED ON) -find_package(Boost COMPONENTS thread system regex REQUIRED) +find_package(Boost COMPONENTS thread system regex random REQUIRED) include_directories(${Boost_INCLUDE_DIRS}) find_package(Threads REQUIRED) From a895982066b9e2c1045f6bbf38fed14e34512641 Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Sat, 29 Aug 2015 00:08:10 +0800 Subject: [PATCH 191/654] common: 'enable experimental data corrupting features' now understands '*' '*' allows any feature. Signed-off-by: Kefu Chai --- src/common/ceph_context.cc | 3 ++- src/test/common/test_context.cc | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/common/ceph_context.cc b/src/common/ceph_context.cc index 877dd539284ca..ff7bf2bf0e6b3 100644 --- a/src/common/ceph_context.cc +++ b/src/common/ceph_context.cc @@ -230,7 +230,8 @@ bool CephContext::check_experimental_feature_enabled(const std::string& feat, std::ostream *message) { ceph_spin_lock(&_feature_lock); - bool enabled = _experimental_features.count(feat); + bool enabled = (_experimental_features.count(feat) || + _experimental_features.count("*")); ceph_spin_unlock(&_feature_lock); if (enabled) { diff --git a/src/test/common/test_context.cc b/src/test/common/test_context.cc index c32fed30b3226..921fc90ff5fa6 100644 --- a/src/test/common/test_context.cc +++ b/src/test/common/test_context.cc @@ -82,6 +82,13 @@ TEST(CephContext, experimental_features) ASSERT_FALSE(cct->check_experimental_feature_enabled("bar")); ASSERT_TRUE(cct->check_experimental_feature_enabled("baz")); + cct->_conf->set_val("enable_experimental_unrecoverable_data_corrupting_features", + "*"); + cct->_conf->apply_changes(&cout); + 
ASSERT_TRUE(cct->check_experimental_feature_enabled("foo")); + ASSERT_TRUE(cct->check_experimental_feature_enabled("bar")); + ASSERT_TRUE(cct->check_experimental_feature_enabled("baz")); + cct->_log->flush(); } From cc21514bba98fca300cc2364ba02576f1f2fc63b Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Thu, 27 Aug 2015 22:08:46 +0200 Subject: [PATCH 192/654] ceph-disk: {CentOS,RHEL} >= 7 && Fedora >= 22 are systemd http://tracker.ceph.com/issues/12786 Fixes: #12786 Signed-off-by: Loic Dachary --- src/ceph-detect-init/ceph_detect_init/centos/__init__.py | 2 ++ src/ceph-detect-init/ceph_detect_init/fedora/__init__.py | 2 ++ src/ceph-detect-init/ceph_detect_init/rhel/__init__.py | 2 ++ src/ceph-detect-init/tests/test_all.py | 9 +++++++++ 4 files changed, 15 insertions(+) diff --git a/src/ceph-detect-init/ceph_detect_init/centos/__init__.py b/src/ceph-detect-init/ceph_detect_init/centos/__init__.py index f7bf85beda8c4..b9738a73b398a 100644 --- a/src/ceph-detect-init/ceph_detect_init/centos/__init__.py +++ b/src/ceph-detect-init/ceph_detect_init/centos/__init__.py @@ -8,4 +8,6 @@ def choose_init(): Returns the name of a init system (upstart, sysvinit ...). """ + if release and int(release.split('.')[0]) >= 7: + return 'systemd' return 'sysvinit' diff --git a/src/ceph-detect-init/ceph_detect_init/fedora/__init__.py b/src/ceph-detect-init/ceph_detect_init/fedora/__init__.py index f7bf85beda8c4..566f8e37e99c9 100644 --- a/src/ceph-detect-init/ceph_detect_init/fedora/__init__.py +++ b/src/ceph-detect-init/ceph_detect_init/fedora/__init__.py @@ -8,4 +8,6 @@ def choose_init(): Returns the name of a init system (upstart, sysvinit ...). 
""" + if release and int(release.split('.')[0]) >= 22: + return 'systemd' return 'sysvinit' diff --git a/src/ceph-detect-init/ceph_detect_init/rhel/__init__.py b/src/ceph-detect-init/ceph_detect_init/rhel/__init__.py index f7bf85beda8c4..b9738a73b398a 100644 --- a/src/ceph-detect-init/ceph_detect_init/rhel/__init__.py +++ b/src/ceph-detect-init/ceph_detect_init/rhel/__init__.py @@ -8,4 +8,6 @@ def choose_init(): Returns the name of a init system (upstart, sysvinit ...). """ + if release and int(release.split('.')[0]) >= 7: + return 'systemd' return 'sysvinit' diff --git a/src/ceph-detect-init/tests/test_all.py b/src/ceph-detect-init/tests/test_all.py index 68189bf0187b8..069a0ede1f472 100644 --- a/src/ceph-detect-init/tests/test_all.py +++ b/src/ceph-detect-init/tests/test_all.py @@ -38,6 +38,9 @@ class TestCephDetectInit(testtools.TestCase): def test_centos(self): + with mock.patch('ceph_detect_init.centos.release', + '7.0'): + self.assertEqual('systemd', centos.choose_init()) self.assertEqual('sysvinit', centos.choose_init()) def test_debian(self): @@ -49,9 +52,15 @@ def test_debian(self): self.assertEqual('upstart', debian.choose_init()) def test_fedora(self): + with mock.patch('ceph_detect_init.fedora.release', + '22'): + self.assertEqual('systemd', fedora.choose_init()) self.assertEqual('sysvinit', fedora.choose_init()) def test_rhel(self): + with mock.patch('ceph_detect_init.rhel.release', + '7.0'): + self.assertEqual('systemd', rhel.choose_init()) self.assertEqual('sysvinit', rhel.choose_init()) def test_suse(self): From 42ad86e14e352f2a3a33e774224f1789f268da83 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Sun, 16 Aug 2015 12:05:51 +0200 Subject: [PATCH 193/654] udev: add devicemapper to partuuid-workaround The dm-* devices are not excluded and will have by-partuuid symlinks etc. This will include devices managed by multipath as well as others. 
Since this is only used on partitions: # ignore partitions that span the entire disk TEST=="whole_disk", GOTO="persistent_storage_end_two" It may create symlinks for dm-* devices that are unrelated to Ceph and we assume this is going to be ok. Signed-off-by: Loic Dachary --- udev/60-ceph-partuuid-workaround.rules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udev/60-ceph-partuuid-workaround.rules b/udev/60-ceph-partuuid-workaround.rules index c41a27204406a..290596902fac6 100644 --- a/udev/60-ceph-partuuid-workaround.rules +++ b/udev/60-ceph-partuuid-workaround.rules @@ -13,7 +13,7 @@ ACTION=="remove", GOTO="persistent_storage_end_two" SUBSYSTEM!="block", GOTO="persistent_storage_end_two" # skip rules for inappropriate block devices -KERNEL=="fd*|mtd*|nbd*|gnbd*|btibm*|dm-*|md*", GOTO="persistent_storage_end_two" +KERNEL=="fd*|mtd*|nbd*|gnbd*|btibm*|md*", GOTO="persistent_storage_end_two" # ignore partitions that span the entire disk TEST=="whole_disk", GOTO="persistent_storage_end_two" From f9cbd792f8a8faa922b9fdd00ba1e07e32945706 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 17 Aug 2015 22:22:12 +0200 Subject: [PATCH 194/654] tests: ceph-disk tests may use system ceph-{mon,osd} Allow ceph-disk.sh to run to test ceph as installed from packages. When run from sources, ceph-disk.sh is expected to use the binaries from the source tree. It is enough to prepend . to the PATH. There is no need to prefix each binary with ./ The virtualenv is also only necessary when running from sources and setting it up for ceph-detect-init is only done if in the source tree. 
Signed-off-by: Loic Dachary --- src/test/ceph-disk.sh | 63 ++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/src/test/ceph-disk.sh b/src/test/ceph-disk.sh index c34600160c4f1..c568f3df6f50c 100755 --- a/src/test/ceph-disk.sh +++ b/src/test/ceph-disk.sh @@ -21,17 +21,18 @@ PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: ' export PATH=.:$PATH # make sure program from sources are prefered DIR=test-ceph-disk -virtualenv virtualenv-$DIR -. virtualenv-$DIR/bin/activate -( - # older versions of pip will not install wrap_console scripts - # when using wheel packages - pip install --upgrade 'pip >= 6.1' - if test -d ceph-detect-init/wheelhouse ; then - wheelhouse="--no-index --use-wheel --find-links=ceph-detect-init/wheelhouse" - fi - pip --log virtualenv-$DIR/log.txt install $wheelhouse --editable ceph-detect-init -) +if virtualenv virtualenv-$DIR && test -d ceph-detect-init ; then + . virtualenv-$DIR/bin/activate + ( + # older versions of pip will not install wrap_console scripts + # when using wheel packages + pip install --upgrade 'pip >= 6.1' + if test -d ceph-detect-init/wheelhouse ; then + wheelhouse="--no-index --use-wheel --find-links=ceph-detect-init/wheelhouse" + fi + pip --log virtualenv-$DIR/log.txt install $wheelhouse --editable ceph-detect-init + ) +fi OSD_DATA=$DIR/osd MON_ID=a MONA=127.0.0.1:7451 @@ -46,7 +47,9 @@ CEPH_ARGS+=" --osd-failsafe-full-ratio=.99" CEPH_ARGS+=" --mon-host=$MONA" CEPH_ARGS+=" --log-file=$DIR/\$name.log" CEPH_ARGS+=" --pid-file=$DIR/\$name.pidfile" -CEPH_ARGS+=" --erasure-code-dir=.libs" +if test -d .libs ; then + CEPH_ARGS+=" --erasure-code-dir=.libs" +fi CEPH_ARGS+=" --auth-supported=none" CEPH_ARGS+=" --osd-journal-size=100" CEPH_DISK_ARGS= @@ -85,14 +88,14 @@ function teardown() { function run_mon() { local mon_dir=$DIR/$MON_ID - ./ceph-mon \ + ceph-mon \ --id $MON_ID \ --mkfs \ --mon-data=$mon_dir \ --mon-initial-members=$MON_ID \ "$@" - ./ceph-mon \ + ceph-mon \ --id 
$MON_ID \ --mon-data=$mon_dir \ --mon-osd-full-ratio=.99 \ @@ -193,10 +196,10 @@ function test_mark_init() { $mkdir -p $OSD_DATA - ./ceph-disk $CEPH_DISK_ARGS \ + ceph-disk $CEPH_DISK_ARGS \ prepare --osd-uuid $osd_uuid $osd_data || return 1 - $timeout $TIMEOUT ./ceph-disk $CEPH_DISK_ARGS \ + $timeout $TIMEOUT ceph-disk $CEPH_DISK_ARGS \ --verbose \ activate \ --mark-init=auto \ @@ -210,7 +213,7 @@ function test_mark_init() { else expected=systemd fi - $timeout $TIMEOUT ./ceph-disk $CEPH_DISK_ARGS \ + $timeout $TIMEOUT ceph-disk $CEPH_DISK_ARGS \ --verbose \ activate \ --mark-init=$expected \ @@ -227,7 +230,7 @@ function test_zap() { local osd_data=$DIR/dir $mkdir -p $osd_data - ./ceph-disk $CEPH_DISK_ARGS zap $osd_data 2>&1 | grep -q 'not full block device' || return 1 + ceph-disk $CEPH_DISK_ARGS zap $osd_data 2>&1 | grep -q 'not full block device' || return 1 $rm -fr $osd_data } @@ -242,7 +245,7 @@ function test_activate_dir_magic() { mkdir -p $osd_data/fsid CEPH_ARGS="--fsid $uuid" \ - ./ceph-disk $CEPH_DISK_ARGS prepare $osd_data > $DIR/out 2>&1 + ceph-disk $CEPH_DISK_ARGS prepare $osd_data > $DIR/out 2>&1 grep --quiet 'Is a directory' $DIR/out || return 1 ! 
[ -f $osd_data/magic ] || return 1 rmdir $osd_data/fsid @@ -250,7 +253,7 @@ function test_activate_dir_magic() { echo successfully prepare the OSD CEPH_ARGS="--fsid $uuid" \ - ./ceph-disk $CEPH_DISK_ARGS prepare $osd_data 2>&1 | tee $DIR/out + ceph-disk $CEPH_DISK_ARGS prepare $osd_data 2>&1 | tee $DIR/out grep --quiet 'Preparing osd data dir' $DIR/out || return 1 grep --quiet $uuid $osd_data/ceph_fsid || return 1 [ -f $osd_data/magic ] || return 1 @@ -258,7 +261,7 @@ function test_activate_dir_magic() { echo will not override an existing OSD CEPH_ARGS="--fsid $($uuidgen)" \ - ./ceph-disk $CEPH_DISK_ARGS prepare $osd_data 2>&1 | tee $DIR/out + ceph-disk $CEPH_DISK_ARGS prepare $osd_data 2>&1 | tee $DIR/out grep --quiet 'ceph-disk:Data dir .* already exists' $DIR/out || return 1 grep --quiet $uuid $osd_data/ceph_fsid || return 1 } @@ -266,14 +269,14 @@ function test_activate_dir_magic() { function test_pool_read_write() { local osd_uuid=$1 - $timeout $TIMEOUT ./ceph osd pool set $TEST_POOL size 1 || return 1 + $timeout $TIMEOUT ceph osd pool set $TEST_POOL size 1 || return 1 local id=$(ceph osd create $osd_uuid) local weight=1 - ./ceph osd crush add osd.$id $weight root=default host=localhost || return 1 + ceph osd crush add osd.$id $weight root=default host=localhost || return 1 echo FOO > $DIR/BAR - $timeout $TIMEOUT ./rados --pool $TEST_POOL put BAR $DIR/BAR || return 1 - $timeout $TIMEOUT ./rados --pool $TEST_POOL get BAR $DIR/BAR.copy || return 1 + $timeout $TIMEOUT rados --pool $TEST_POOL put BAR $DIR/BAR || return 1 + $timeout $TIMEOUT rados --pool $TEST_POOL get BAR $DIR/BAR.copy || return 1 $diff $DIR/BAR $DIR/BAR.copy || return 1 } @@ -285,10 +288,10 @@ function test_activate() { $mkdir -p $OSD_DATA - ./ceph-disk $CEPH_DISK_ARGS \ + ceph-disk $CEPH_DISK_ARGS \ prepare --osd-uuid $osd_uuid $to_prepare $journal || return 1 - $timeout $TIMEOUT ./ceph-disk $CEPH_DISK_ARGS \ + $timeout $TIMEOUT ceph-disk $CEPH_DISK_ARGS \ activate \ --mark-init=none \ 
$to_activate || return 1 @@ -311,7 +314,7 @@ function test_activate_dmcrypt() { echo "osd_dmcrypt_type=plain" > $DIR/ceph.conf fi - ./ceph-disk $CEPH_DISK_ARGS \ + ceph-disk $CEPH_DISK_ARGS \ prepare --dmcrypt --dmcrypt-key-dir $DIR/keys --osd-uuid=$uuid --journal-uuid=$juuid $to_prepare $journal || return 1 if test $plain = plain ; then @@ -321,8 +324,8 @@ function test_activate_dmcrypt() { /sbin/cryptsetup --key-file $DIR/keys/$uuid.luks.key luksOpen $to_activate $uuid /sbin/cryptsetup --key-file $DIR/keys/$juuid.luks.key luksOpen ${journal}${journal_p} $juuid fi - - $timeout $TIMEOUT ./ceph-disk $CEPH_DISK_ARGS \ + + $timeout $TIMEOUT ceph-disk $CEPH_DISK_ARGS \ activate \ --mark-init=none \ /dev/mapper/$uuid || return 1 From 0e34742b968e72aa6ce4a0c95a885dced435b3bc Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 17 Aug 2015 22:52:25 +0200 Subject: [PATCH 195/654] ceph-disk: is_mpath predicate for multipath devices The is_mpath predicate returns True if a device is managed by multipath. It is based on the devicemapper uuid content which is expected to always contain the mpath- string to identify the multipath subsystem. The block_path helper is added to convert the path to a device to the /sys directory that describes it. It uses the major and minor number instead of the device name because it is more reliable. The rationale including an actual example is added as a comment for future maintainers. 
Signed-off-by: Loic Dachary --- src/ceph-disk | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/src/ceph-disk b/src/ceph-disk index d7b3233cff5c7..69f8b8c5f0d56 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -90,6 +90,7 @@ DMCRYPT_TOBE_UUID = '89c57f98-2fe5-4dc0-89c1-5ec00ceff2be' DMCRYPT_JOURNAL_TOBE_UUID = '89c57f98-2fe5-4dc0-89c1-35865ceff2be' DEFAULT_FS_TYPE = 'xfs' +SYSFS = '/sys' MOUNT_OPTIONS = dict( btrfs='noatime,user_subvol_rm_allowed', @@ -363,6 +364,56 @@ def platform_information(): str(codename).strip() ) +# +# An alternative block_path implementation would be +# +# name = basename(dev) +# return /sys/devices/virtual/block/$name +# +# It is however more fragile because it relies on the fact +# that the basename of the device the user will use always +# matches the one the driver will use. On Ubuntu 14.04, for +# instance, when multipath creates a partition table on +# +# /dev/mapper/353333330000007d0 -> ../dm-0 +# +# it will create partition devices named +# +# /dev/mapper/353333330000007d0-part1 +# +# which is the same device as /dev/dm-1 but not a symbolic +# link to it: +# +# ubuntu@other:~$ ls -l /dev/mapper /dev/dm-1 +# brw-rw---- 1 root disk 252, 1 Aug 15 17:52 /dev/dm-1 +# lrwxrwxrwx 1 root root 7 Aug 15 17:52 353333330000007d0 -> ../dm-0 +# brw-rw---- 1 root disk 252, 1 Aug 15 17:52 353333330000007d0-part1 +# +# Using the basename in this case fails. 
+# +def block_path(dev): + path = os.path.realpath(dev) + rdev = os.stat(path).st_rdev + (M, m) = (os.major(rdev), os.minor(rdev)) + return "{sysfs}/dev/block/{M}:{m}".format(sysfs=SYSFS, M=M, m=m) + +def get_dm_uuid(dev): + uuid_path = os.path.join(block_path(dev), 'dm', 'uuid') + LOG.debug("get_dm_uuid " + dev + " uuid path is " + uuid_path) + if not os.path.exists(uuid_path): + return False + uuid = open(uuid_path, 'r').read() + LOG.debug("get_dm_uuid " + dev + " uuid is " + uuid) + return uuid + +def is_mpath(dev): + """ + True if the path is managed by multipath + """ + uuid = get_dm_uuid(dev) + return (uuid and + (re.match('part\d+-mpath-', uuid) or + re.match('mpath-', uuid))) def get_dev_name(path): """ From aac89719881c2788941b74d385d95860e520ea78 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 17 Aug 2015 23:00:44 +0200 Subject: [PATCH 196/654] ceph-disk: replace partx with partprobe Older distributions that required partx (CentOS 6 and the like) are no longer supported and the partx fallback can be obsoleted. Signed-off-by: Loic Dachary --- src/ceph-disk | 43 ++++++++++++------------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/src/ceph-disk b/src/ceph-disk index 69f8b8c5f0d56..555f62c3ee3f5 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -1123,33 +1123,14 @@ def get_free_partition_index(dev): return 1 -def update_partition(action, dev, description): - # try to make sure the kernel refreshes the table. note - # that if this gets ebusy, we are probably racing with - # udev because it already updated it.. ignore failure here. - - # On RHEL and CentOS distros, calling partprobe forces a reboot of the - # server. 
Since we are not resizing partitons so we rely on calling - # partx - if platform_distro().startswith(('centos', 'red', 'scientific')): - LOG.info('calling partx on %s device %s', description, dev) - LOG.info('re-reading known partitions will display errors') - command( - [ - 'partx', - action, - dev, - ], - ) - - else: - LOG.debug('Calling partprobe on %s device %s', description, dev) - command( - [ - 'partprobe', - dev, - ], - ) +def update_partition(dev, description): + LOG.debug('Calling partprobe on %s device %s', description, dev) + command( + [ + 'partprobe', + dev, + ], + ) def zap(dev): @@ -1188,7 +1169,7 @@ def zap(dev): ], ) - update_partition('-d', dev, 'zapped') + update_partition(dev, 'zapped') except subprocess.CalledProcessError as e: raise Error(e) @@ -1305,7 +1286,7 @@ def prepare_journal_dev( ] ) - update_partition('-a', journal, 'prepared') + update_partition(journal, 'prepared') # wait for udev event queue to clear command( @@ -1521,7 +1502,7 @@ def prepare_dev( data, ], ) - update_partition('-a', data, 'created') + update_partition(data, 'created') command( [ # wait for udev event queue to clear @@ -1824,7 +1805,7 @@ def main_prepare(args): prepare_lock.release() # noqa if stat.S_ISBLK(dmode): - update_partition('-a', args.data, 'prepared') + update_partition(args.data, 'prepared') except Error as e: if journal_dm_keypath: From 2fca91eb2a84b5b92f9d0eac6ef3894ed23340ff Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 17 Aug 2015 23:02:40 +0200 Subject: [PATCH 197/654] ceph-disk: --verbose shows a stack trace on error When running with --verbose, do not hide the stack trace from the user when an exception is raised. It is most helpful to figure out when the exception actually happened. 
Signed-off-by: Loic Dachary --- src/ceph-disk | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/ceph-disk b/src/ceph-disk index 555f62c3ee3f5..65fff4f724ea8 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -3175,8 +3175,15 @@ def main(): setup_statedir(args.statedir) setup_sysconfdir(args.sysconfdir) - try: + if args.verbose: args.func(args) + else: + main_catch(args.func, args) + +def main_catch(func, args): + + try: + func(args) except Error as e: raise SystemExit( From 3bc95dfc1b88c01e16c3df04e96acced777b344a Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 17 Aug 2015 23:25:45 +0200 Subject: [PATCH 198/654] ceph-disk: multipath support for is_partition and list_partitions The is_partition predicate and the list_partitions function support devices managed by multipath. A set of helpers dedicated to multipath devices is implemented because the content of the corresponding /sys directory does not use the same conventions as regular devices regarding partitions. Instead of relying on subdirectories such as /sys/block/name/name1, the devicemapper uuid file is used and expected to start with part\d+. The holders/slaves directories provide pointers between the whole device and the partition devices. Although these structural differences reduce the opportunity for code factorization, it is easier for backward compatibility since the multipath specific logic is limited to if is_mpath(dev) branches. 
http://tracker.ceph.com/issues/11881 Refs: #11881 Signed-off-by: Loic Dachary --- src/ceph-disk | 62 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/src/ceph-disk b/src/ceph-disk index 65fff4f724ea8..38755ac5ac210 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -474,6 +474,14 @@ def get_dev_size(dev, size='megabytes'): os.close(fd) +def get_partition_mpath(dev, pnum): + part_re = "part{pnum}-mpath-".format(pnum=pnum) + partitions = list_partitions_mpath(dev, part_re) + if partitions: + return partitions[0] + else: + return None + def get_partition_dev(dev, pnum): """ get the device name for a partition @@ -484,13 +492,16 @@ def get_partition_dev(dev, pnum): sda 1 -> sda1 cciss/c0d1 1 -> cciss!c0d1p1 """ - name = get_dev_name(os.path.realpath(dev)) partname = None - for f in os.listdir(os.path.join('/sys/block', name)): - if f.startswith(name) and f.endswith(str(pnum)): - # we want the shortest name that starts with the base name and ends with the partition number - if not partname or len(f) < len(partname): - partname = f + if is_mpath(dev): + partname = get_partition_mpath(dev, pnum) + else: + name = get_dev_name(os.path.realpath(dev)) + for f in os.listdir(os.path.join('/sys/block', name)): + if f.startswith(name) and f.endswith(str(pnum)): + # we want the shortest name that starts with the base name and ends with the partition number + if not partname or len(f) < len(partname): + partname = f if partname: return get_dev_path(partname) else: @@ -503,21 +514,40 @@ def list_all_partitions(): """ dev_part_list = {} for name in os.listdir('/sys/block'): + LOG.debug("list_all_partitions: " + name) # /dev/fd0 may hang http://tracker.ceph.com/issues/6827 if re.match(r'^fd\d$', name): continue - if not os.path.exists(os.path.join('/sys/block', name, 'device')): - continue - dev_part_list[name] = list_partitions(name) + dev_part_list[name] = list_partitions(os.path.join('/dev', name)) return 
dev_part_list +def list_partitions(dev): + dev = os.path.realpath(dev) + if is_mpath(dev): + return list_partitions_mpath(dev) + else: + return list_partitions_device(dev) + +def list_partitions_mpath(dev, part_re="part\d+-mpath-"): + p = block_path(dev) + partitions = [] + holders = os.path.join(p, 'holders') + for holder in os.listdir(holders): + uuid_path = os.path.join(holders, holder, 'dm', 'uuid') + uuid = open(uuid_path, 'r').read() + LOG.debug("list_partitions_mpath: " + uuid_path + " uuid = " + uuid) + if re.match(part_re, uuid): + partitions.append(holder) + return partitions + -def list_partitions(basename): +def list_partitions_device(dev): """ Return a list of partitions on the given device name """ partitions = [] - for name in os.listdir(os.path.join('/sys/block', basename)): + basename = os.path.basename(dev) + for name in os.listdir(block_path(dev)): if name.startswith(basename): partitions.append(name) return partitions @@ -540,10 +570,17 @@ def get_partition_base(dev): return '/dev/' + basename raise Error('no parent device for partition', dev) +def is_partition_mpath(dev): + uuid = get_dm_uuid(dev) + return bool(re.match('part\d+-mpath-', uuid)) + def is_partition(dev): """ Check whether a given device path is a partition or a full disk. 
""" + if is_mpath(dev): + return is_partition_mpath(dev) + dev = os.path.realpath(dev) if not stat.S_ISBLK(os.lstat(dev).st_mode): raise Error('not a block device', dev) @@ -617,8 +654,7 @@ def verify_not_in_use(dev, check_partitions=False): raise Error('Device %s is in use by a device-mapper mapping (dm-crypt?)' % dev, ','.join(holders)) if check_partitions and not is_partition(dev): - basename = get_dev_name(os.path.realpath(dev)) - for partname in list_partitions(basename): + for partname in list_partitions(dev): partition = get_dev_path(partname) if is_mounted(partition): raise Error('Device is mounted', partition) From 77ff7c3dc6dd6861b094e5a53d329de0802f3032 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 17 Aug 2015 23:51:24 +0200 Subject: [PATCH 199/654] ceph-disk: multipath support for split_dev_base_partnum split_dev_base_partnum returns the path of the whole disk in /dev/mapper. The base variable name to designate the device for the whole disk is a misnomer since it cannot be used as a basename to rebuild the parition device name in the case of multipath. The logic of split_dev_base_partnum for devices is reworked to use /sys/dev/block/M:m/partition instead of device name parsing. 
http://tracker.ceph.com/issues/11881 Refs: #11881 Signed-off-by: Loic Dachary --- src/ceph-disk | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/ceph-disk b/src/ceph-disk index 38755ac5ac210..26c19f2e11b03 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -574,6 +574,18 @@ def is_partition_mpath(dev): uuid = get_dm_uuid(dev) return bool(re.match('part\d+-mpath-', uuid)) +def partnum_mpath(dev): + uuid = get_dm_uuid(dev) + return re.findall('part(\d+)-mpath-', uuid)[0] + +def get_partition_base_mpath(dev): + slave_path = os.path.join(block_path(dev), 'slaves') + slaves = os.listdir(slave_path) + assert slaves + name_path = os.path.join(slave_path, slaves[0], 'dm', 'name') + name = open(name_path, 'r').read().strip() + return os.path.join('/dev/mapper', name) + def is_partition(dev): """ Check whether a given device path is a partition or a full disk. @@ -2565,11 +2577,14 @@ def get_dev_fs(dev): def split_dev_base_partnum(dev): - if 'loop' in dev or 'cciss' in dev or 'nvme' in dev: - return re.match('(.*\d+)p(\d+)', dev).group(1, 2) + if is_mpath(dev): + partnum = partnum_mpath(dev) + base = get_partition_base_mpath(dev) else: - return re.match('(\D+)(\d+)', dev).group(1, 2) - + b = block_path(dev) + partnum = open(os.path.join(b, 'partition')).read().strip() + base = get_partition_base(dev) + return (base, partnum) def get_partition_type(part): """ From a10141809c41ec3cc4d89803cf346acf4913d83e Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Tue, 18 Aug 2015 00:04:32 +0200 Subject: [PATCH 200/654] ceph-disk: rework get_partition_{type,uuid} Mimic the get_partition_type implementation after get_partition_uuid and factorize them to reduce the code footprint. The get_partition_type implementation is based on blkid: it is complex and fragile. Since sgdisk is consistently used to create partitions, use it instead. It is already used for get_partition_uuid and there does not seem to be any reason for concern. 
http://tracker.ceph.com/issues/11881 Refs: #11881 Signed-off-by: Loic Dachary --- src/ceph-disk | 78 ++++----------------------------------------------- 1 file changed, 5 insertions(+), 73 deletions(-) diff --git a/src/ceph-disk b/src/ceph-disk index 26c19f2e11b03..7226dd51962d2 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -2587,84 +2587,16 @@ def split_dev_base_partnum(dev): return (base, partnum) def get_partition_type(part): - """ - Get the GPT partition type UUID. If we have an old blkid and can't - get it that way, use sgdisk and use the description instead (and hope - dmcrypt isn't being used). - """ - blkid, _ = command( - [ - 'blkid', - '-p', - '-o', 'udev', - part, - ] - ) - saw_part_entry = False - for line in blkid.splitlines(): - (key, value) = line.split('=') - if key == 'ID_PART_ENTRY_TYPE': - return value - if key == 'ID_PART_ENTRY_SCHEME': - table_type = value - if key.startswith('ID_PART_ENTRY_'): - saw_part_entry = True - - # hmm, is it in fact GPT? - table_type = None - base = get_partition_base(part) - blkid, _ = command( - [ - 'blkid', - '-p', - '-o', 'udev', - base - ] - ) - for line in blkid.splitlines(): - (key, value) = line.split('=') - if key == 'ID_PART_TABLE_TYPE': - table_type = value - if table_type != 'gpt': - return None # not even GPT - - if saw_part_entry: - return None # GPT, and blkid appears to be new, so we're done. - - # bah, fall back to sgdisk. - if 'blkid' not in warned_about: - LOG.warning('Old blkid does not support ID_PART_ENTRY_* fields, trying sgdisk; may not correctly identify ceph volumes with dmcrypt') - warned_about['blkid'] = True - (base, partnum) = split_dev_base_partnum(part) - sgdisk, _ = command( - [ - 'sgdisk', - '-p', - base, - ] - ) - - for line in sgdisk.splitlines(): - m = re.search('\s+(\d+)\s+\d+\s+\d+\s+\S+ \S+B\s+\S+\s+(.*)', line) - if m is not None: - num = m.group(1) - if num != partnum: - continue - desc = m.group(2) - # assume unencrypted ... 
blkid has failed us :( - if desc == 'ceph data': - return OSD_UUID - if desc == 'ceph journal': - return JOURNAL_UUID - - return None + return get_sgdisk_partition_info(part, 'Partition GUID code: (\S+)') +def get_partition_uuid(part): + return get_sgdisk_partition_info(part, 'Partition unique GUID: (\S+)') -def get_partition_uuid(dev): +def get_sgdisk_partition_info(dev, regexp): (base, partnum) = split_dev_base_partnum(dev) out, _ = command(['sgdisk', '-i', partnum, base]) for line in out.splitlines(): - m = re.match('Partition unique GUID: (\S+)', line) + m = re.match(regexp, line) if m: return m.group(1).lower() return None From 7e5a69bcd2ea398c78aef165ea0aaa17548b6780 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Tue, 18 Aug 2015 00:18:51 +0200 Subject: [PATCH 201/654] ceph-disk: is_held must ignore multipath devices Always return an empty list when is_held is called on a multipath device. The dmcrypt logic base decisions depending on the holders/slaves relationship. Such relationships can also exists for multipath devices but do not have the same semantic. http://tracker.ceph.com/issues/11881 Fixes: #11881 Signed-off-by: Loic Dachary --- src/ceph-disk | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ceph-disk b/src/ceph-disk index 7226dd51962d2..fd0ca4822d933 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -633,6 +633,9 @@ def is_held(dev): Check if a device is held by another device (e.g., a dm-crypt mapping) """ assert os.path.exists(dev) + if is_mpath(dev): + return [] + dev = os.path.realpath(dev) base = get_dev_name(dev) From d4869ac9e44f30fbc6bca9c0d0aeeee10cb0fb57 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Sun, 16 Aug 2015 02:37:01 +0200 Subject: [PATCH 202/654] ceph-disk: add multipath support A multipath device is detected because there is a /sys/dev/block/M:m/dm/uuid file with the mpath- prefix (or part\w+-mpath prefix). 
When ceph-disk prepares data or journal devices on a multipath device, it sets the partition typecode to MPATH_JOURNAL_UUID, MPATH_OSD_UUID and MPATH_TOBE_UUID to a) help the udev rules distinguish them from other devices in devicemapper b) allow ceph-disk to fail if an attempt is made to activate a device with this type without accessing it via a multipath device The 95-ceph-osd.rules call ceph-disk activate on partitions of type MPATH_JOURNAL_UUID, MPATH_OSD_UUID. It relies on ceph-disk to do nothing if the device is not accessed via multipath. http://tracker.ceph.com/issues/11881 Fixes: #11881 Signed-off-by: Loic Dachary --- src/ceph-disk | 40 +++++++++++++++++++++++++++++++++++----- udev/95-ceph-osd.rules | 10 ++++++++++ 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/src/ceph-disk b/src/ceph-disk index fd0ca4822d933..9dbcd53a87db8 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -80,12 +80,15 @@ knew the GPT partition type. CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026' JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-b4b80ceff106' +MPATH_JOURNAL_UUID = '45b0969e-8ae0-4982-bf9d-5a8d867af560' DMCRYPT_JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-5ec00ceff106' DMCRYPT_LUKS_JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-35865ceff106' OSD_UUID = '4fbd7e29-9d25-41b8-afd0-062c0ceff05d' +MPATH_OSD_UUID = '4fbd7e29-8ae0-4982-bf9d-5a8d867af560' DMCRYPT_OSD_UUID = '4fbd7e29-9d25-41b8-afd0-5ec00ceff05d' DMCRYPT_LUKS_OSD_UUID = '4fbd7e29-9d25-41b8-afd0-35865ceff05d' TOBE_UUID = '89c57f98-2fe5-4dc0-89c1-f3ad0ceff2be' +MPATH_TOBE_UUID = '89c57f98-8ae0-4982-bf9d-5a8d867af560' DMCRYPT_TOBE_UUID = '89c57f98-2fe5-4dc0-89c1-5ec00ceff2be' DMCRYPT_JOURNAL_TOBE_UUID = '89c57f98-2fe5-4dc0-89c1-35865ceff2be' @@ -1244,7 +1247,7 @@ def prepare_journal_dev( ' and --dmcrypt specified') LOG.debug('Journal %s is a partition', journal) LOG.warning('OSD will not be hot-swappable if journal is not the same device as the osd data') - if get_partition_type(journal) == JOURNAL_UUID: + if 
get_partition_type(journal) in (JOURNAL_UUID, MPATH_JOURNAL_UUID): LOG.debug('Journal %s was previously prepared with ceph-disk. Reusing it.', journal) reusing_partition = True # Read and reuse the partition uuid from this journal's previous life. @@ -1276,6 +1279,9 @@ def prepare_journal_dev( ptype = JOURNAL_UUID ptype_tobe = JOURNAL_UUID + if is_mpath(journal): + ptype = MPATH_JOURNAL_UUID + ptype_tobe = MPATH_JOURNAL_UUID if journal_dm_keypath: if luks: ptype = DMCRYPT_LUKS_JOURNAL_UUID @@ -1526,6 +1532,10 @@ def prepare_dev( ptype_tobe = TOBE_UUID ptype_osd = OSD_UUID + if is_mpath(data): + ptype_tobe = MPATH_TOBE_UUID + ptype_osd = MPATH_OSD_UUID + if osd_dm_keypath: ptype_tobe = DMCRYPT_TOBE_UUID if luks: @@ -2360,6 +2370,11 @@ def main_activate(args): try: mode = os.stat(args.path).st_mode if stat.S_ISBLK(mode): + if (is_partition(args.path) and + get_partition_type(args.path) == MPATH_OSD_UUID and + not is_mpath(args.path)): + raise Error('%s is not a multipath block device' % + args.path) (cluster, osd_id) = mount_activate( dev=args.path, activate_key_template=args.activate_key_template, @@ -2413,6 +2428,11 @@ def get_journal_osd_uuid(path): if not stat.S_ISBLK(mode): raise Error('%s is not a block device' % path) + if (get_partition_type(path) == MPATH_JOURNAL_UUID and + not is_mpath(path)): + raise Error('%s is not a multipath block device' % + path) + try: out = _check_output( args=[ @@ -2498,7 +2518,10 @@ def main_activate_all(args): continue (tag, uuid) = name.split('.') - if tag == OSD_UUID or tag == DMCRYPT_OSD_UUID or tag == DMCRYPT_LUKS_OSD_UUID: + if tag in (OSD_UUID, + MPATH_OSD_UUID, + DMCRYPT_OSD_UUID, + DMCRYPT_LUKS_OSD_UUID): if tag == DMCRYPT_OSD_UUID or tag == DMCRYPT_LUKS_OSD_UUID: path = os.path.join('/dev/mapper', uuid) @@ -2656,8 +2679,9 @@ def list_dev(dev, uuid_map, journal_map): ptype = get_partition_type(dev) prefix = ' ' + LOG.debug("list_dev(dev = " + dev + ", ptype = " + ptype + ")") desc = [] - if ptype == OSD_UUID: + if ptype 
in (OSD_UUID, MPATH_OSD_UUID): desc = list_dev_osd(dev, uuid_map) if desc: desc = ['ceph data'] + desc @@ -2683,7 +2707,7 @@ def list_dev(dev, uuid_map, journal_map): desc = ['ceph data (dmcrypt LUKS %s)' % holder] + fs_desc else: desc = ['ceph data (dmcrypt LUKS)', 'holders: ' + ','.join(holders)] - elif ptype == JOURNAL_UUID: + elif ptype in (JOURNAL_UUID, MPATH_JOURNAL_UUID): desc.append('ceph journal') part_uuid = get_partition_uuid(dev) if part_uuid and part_uuid in journal_map: @@ -2735,6 +2759,8 @@ def main_list(args): if part_uuid: uuid_map[part_uuid] = dev ptype = get_partition_type(dev) + LOG.debug("main_list: " + dev + " " + ptype + " " + + str(part_uuid)) if ptype == OSD_UUID: fs_type = get_dev_fs(dev) if fs_type is not None: @@ -2748,7 +2774,8 @@ def main_list(args): unmount(tpath) except MountError: pass - if ptype == DMCRYPT_OSD_UUID or ptype == DMCRYPT_LUKS_OSD_UUID: + if ptype in (DMCRYPT_OSD_UUID, + DMCRYPT_LUKS_OSD_UUID): holders = is_held(dev) if len(holders) == 1: holder = '/dev/' + holders[0] @@ -2765,6 +2792,9 @@ def main_list(args): except MountError: pass + LOG.debug("main_list: " + str(partmap) + ", " + + str(uuid_map) + ", " + str(journal_map)) + for base, parts in sorted(partmap.iteritems()): if parts: print '%s :' % get_dev_path(base) diff --git a/udev/95-ceph-osd.rules b/udev/95-ceph-osd.rules index 75f443f4b3bc6..3565f7caf1361 100644 --- a/udev/95-ceph-osd.rules +++ b/udev/95-ceph-osd.rules @@ -10,6 +10,16 @@ ACTION=="add", SUBSYSTEM=="block", \ ENV{ID_PART_ENTRY_TYPE}=="45b0969e-9b03-4f30-b4c6-b4b80ceff106", \ RUN+="/usr/sbin/ceph-disk activate-journal /dev/$name" +# activate multipath ceph-tagged partitions +ACTION=="add", SUBSYSTEM=="block", \ + ENV{ID_PART_ENTRY_TYPE}=="4fbd7e29-8ae0-4982-bf9d-5a8d867af560", \ + RUN+="/usr/sbin/ceph-disk activate /dev/$name" + +# activate multipath ceph-tagged partitions +ACTION=="add", SUBSYSTEM=="block", \ + ENV{ID_PART_ENTRY_TYPE}=="45b0969e-8ae0-4982-bf9d-5a8d867af560", \ + 
RUN+="/usr/sbin/ceph-disk activate-journal /dev/$name" + # Map journal if using dm-crypt and plain ACTION=="add" SUBSYSTEM=="block", \ ENV{DEVTYPE}=="partition", \ From 9a71816f485323301a35992318d736fc64dc9cc5 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Thu, 27 Aug 2015 12:21:48 +0200 Subject: [PATCH 203/654] tests: obsolete ceph-disk root tests They were designed to run in a docker container using loop devices instead of disks. Although this was fit for ceph-disk activate tests for regular and dmcrypt devices, a docker instance does not have its own udev instance it is not possible to run tests involving udev events without interfering with the host. Signed-off-by: Loic Dachary --- configure.ac | 6 - src/test/Makefile.am | 4 - src/test/ceph-disk-root.sh | 23 --- src/test/ceph-disk.sh | 291 ------------------------------------- 4 files changed, 324 deletions(-) delete mode 100755 src/test/ceph-disk-root.sh diff --git a/configure.ac b/configure.ac index 7447d3d98b295..3acbfe67eeda1 100644 --- a/configure.ac +++ b/configure.ac @@ -418,12 +418,6 @@ AC_ARG_ENABLE(gitversion, AM_CONDITIONAL(NO_GIT_VERSION, [test "x$enable_gitversion" = "xno"]) -AC_ARG_ENABLE([root-make-check], - [AS_HELP_STRING([--enable-root-make-check], [enable make check tests that require root privileges])], - [], - [enable_root_make_check=no]) -AM_CONDITIONAL(ENABLE_ROOT_MAKE_CHECK, test "x$enable_root_make_check" != xno) - # profiler? 
AC_ARG_WITH([profiler], [AS_HELP_STRING([--with-profiler], [build extra profiler binaries])], diff --git a/src/test/Makefile.am b/src/test/Makefile.am index 32e10851717c9..79f879683f533 100644 --- a/src/test/Makefile.am +++ b/src/test/Makefile.am @@ -84,11 +84,7 @@ check_SCRIPTS += \ test/libradosstriper/rados-striper.sh \ test/test_objectstore_memstore.sh -if ENABLE_ROOT_MAKE_CHECK -check_SCRIPTS += test/ceph-disk-root.sh -else check_SCRIPTS += test/ceph-disk.sh -endif EXTRA_DIST += \ $(srcdir)/test/python/ceph-disk/setup.py \ diff --git a/src/test/ceph-disk-root.sh b/src/test/ceph-disk-root.sh deleted file mode 100755 index 916bc9425c7e0..0000000000000 --- a/src/test/ceph-disk-root.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# -# Copyright (C) 2014 Red Hat -# -# Author: Loic Dachary -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Library Public License for more details. -# -set -e -sudo test/ceph-disk.sh test_activate_dev test_activate_dmcrypt_dev test_activate_dmcrypt_plain_dev -test/ceph-disk.sh - -# Local Variables: -# compile-command: "cd ../.. 
; make -j4 && test/ceph-disk-root.sh" -# End: diff --git a/src/test/ceph-disk.sh b/src/test/ceph-disk.sh index c568f3df6f50c..be466faeb9873 100755 --- a/src/test/ceph-disk.sh +++ b/src/test/ceph-disk.sh @@ -299,40 +299,6 @@ function test_activate() { test_pool_read_write $osd_uuid || return 1 } -function test_activate_dmcrypt() { - local to_prepare=$1 - local to_activate=$2 - local journal=$3 - local journal_p=$4 - local uuid=$5 - local juuid=$6 - local plain=$7 - - $mkdir -p $OSD_DATA - - if test $plain = plain ; then - echo "osd_dmcrypt_type=plain" > $DIR/ceph.conf - fi - - ceph-disk $CEPH_DISK_ARGS \ - prepare --dmcrypt --dmcrypt-key-dir $DIR/keys --osd-uuid=$uuid --journal-uuid=$juuid $to_prepare $journal || return 1 - - if test $plain = plain ; then - /sbin/cryptsetup --key-file $DIR/keys/$uuid --key-size 256 create $uuid $to_activate - /sbin/cryptsetup --key-file $DIR/keys/$juuid --key-size 256 create $juuid $journal - else - /sbin/cryptsetup --key-file $DIR/keys/$uuid.luks.key luksOpen $to_activate $uuid - /sbin/cryptsetup --key-file $DIR/keys/$juuid.luks.key luksOpen ${journal}${journal_p} $juuid - fi - - $timeout $TIMEOUT ceph-disk $CEPH_DISK_ARGS \ - activate \ - --mark-init=none \ - /dev/mapper/$uuid || return 1 - - test_pool_read_write $uuid || return 1 -} - function test_activate_dir() { run_mon @@ -342,263 +308,6 @@ function test_activate_dir() { $rm -fr $osd_data } -function loop_sanity_check_body() { - local dev=$1 - local guid=$2 - - # - # Check if /dev/loop is configured with max_part > 0 to handle - # partition tables and expose the partition devices in /dev - # - sgdisk --largest-new=1 --partition-guid=1:$guid $dev - if ! 
test -e ${dev}p1 ; then - if grep loop.max_part /proc/cmdline ; then - echo "the loop module max_part parameter is configured but when" - echo "creating a new partition on $dev, it the expected node" - echo "${dev}p1 does not exist" - return 1 - fi - perl -pi -e 's/$/ loop.max_part=16/ if(/kernel/ && !/max_part/)' /boot/grub/grub.conf - echo "the loop.max_part=16 was added to the kernel in /boot/grub/grub.conf" - cat /boot/grub/grub.conf - echo "you need to reboot for it to be taken into account" - return 1 - fi - - # - # Install the minimal files supporting the maintenance of /dev/disk/by-partuuid - # - udevadm trigger --sysname-match=$(basename $dev) - udevadm settle - if test ! -e /dev/disk/by-partuuid/$guid ; then - cp -a ../udev/95-ceph-osd-alt.rules /lib/udev/rules.d/95-ceph-osd.rules - cp -a ceph-disk ceph-disk-udev /usr/sbin - udevadm trigger --sysname-match=$(basename $dev) - if test ! -e /dev/disk/by-partuuid/$guid ; then - echo "/dev/disk/by-partuuid/$guid not found although the" - echo "following support files are installed: " - ls -l /lib/udev/rules.d/95-ceph-osd.rules /usr/sbin/ceph-disk{,-udev} - return 1 - fi - fi - - return 0 -} - -function loop_sanity_check() { - local id=$(lsb_release -si) - local major=$(lsb_release -rs | cut -f1 -d.) - if test $major != 6 || test $id != CentOS -a $id != RedHatEnterpriseServer ; then - echo "/dev/loop is assumed to be configured with max_part > 0" - echo "and /dev/disk/by-partuuid to be populated by udev on" - lsb_release -a - return 0 - fi - local name=$DIR/sanity.disk - dd if=/dev/zero of=$name bs=1024k count=10 > /dev/null 2>&1 - losetup --find $name - local dev=$(losetup --associated $name | cut -f1 -d:) - local guid=$($uuidgen) - - loop_sanity_check_body $dev $guid - status=$? 
- - losetup --detach $dev - rm $name - rm -f /dev/disk/by-partuuid/$guid - - return $status -} - -function reset_dev() { - local dev=$1 - - if test -z "$dev" ; then - return - fi - - grep "^$dev" < /proc/mounts | while read mounted rest ; do - umount $mounted - done - local dev_base=$(basename $dev) - ( - ls /sys/block/$dev_base/$dev_base*/holders 2> /dev/null - ls /sys/block/$dev_base/holders 2> /dev/null - ) | grep '^dm-' | while read dm ; do - dmsetup remove /dev/$dm - done - ceph-disk zap $dev > /dev/null 2>&1 -} - -function reset_leftover_dev() { - local path=$1 - - losetup --all | sed -e 's/://' | while read dev id associated_path ; do - # if $path has been deleted with a dev attached, then $associated_path - # will carry "($path (deleted))". - if test "$associated_path" = "($path)" ; then - reset_dev $dev - losetup --detach $dev - fi - done -} - -function create_dev() { - local path=$1 - - echo -n "create_dev $path ... " >&2 - reset_leftover_dev $path - dd if=/dev/zero of=$path bs=1024k count=400 > /dev/null 2>&1 - losetup --find $path - local dev=$(losetup --associated $path | cut -f1 -d:) - test "$dev" || return 1 - reset_dev $dev - echo $dev >&2 - echo $dev -} - -function destroy_dev() { - local path=$1 - local dev=$2 - - echo destroy_dev $path $dev >&2 - reset_dev $dev - losetup --detach $dev - rm -f $path -} - -function activate_dev_body() { - local disk=$1 - local journal=$2 - local newdisk=$3 - - setup - run_mon - # - # Create an OSD without a journal and an objectstore - # that does not use a journal. 
- # - ceph-disk zap $disk || return 1 - CEPH_ARGS="$CEPH_ARGS --osd-objectstore=memstore" \ - test_activate $disk ${disk}p1 || return 1 - kill_daemons - umount ${disk}p1 || return 1 - teardown - - setup - run_mon - # - # Create an OSD with data on a disk, journal on another - # - ceph-disk zap $disk || return 1 - test_activate $disk ${disk}p1 $journal || return 1 - kill_daemons - umount ${disk}p1 || return 1 - teardown - - setup - run_mon - # - # Create an OSD with data on a disk, journal on another - # This will add a new partition to $journal, the previous - # one will remain. - # - ceph-disk zap $disk || return 1 - test_activate $disk ${disk}p1 $journal || return 1 - kill_daemons - umount ${disk}p1 || return 1 - teardown - - setup - run_mon - # - # Create an OSD and reuse an existing journal partition - # - test_activate $newdisk ${newdisk}p1 ${journal}p1 || return 1 - # - # Create an OSD and get a journal partition from a disk that - # already contains a journal partition which is in use. Updates of - # the kernel partition table may behave differently when a - # partition is in use. See http://tracker.ceph.com/issues/7334 for - # more information. - # - ceph-disk zap $disk || return 1 - test_activate $disk ${disk}p1 $journal || return 1 - kill_daemons - umount ${newdisk}p1 || return 1 - umount ${disk}p1 || return 1 - teardown -} - -function test_activate_dev() { - test_setup_dev_and_run activate_dev_body -} - -function test_setup_dev_and_run() { - local action=$1 - if test $(id -u) != 0 ; then - echo "SKIP because not root" - return 0 - fi - - loop_sanity_check || return 1 - - local dir=$(pwd)/$DIR - local disk - disk=$(create_dev $dir/vdf.disk) || return 1 - local journal - journal=$(create_dev $dir/vdg.disk) || return 1 - local newdisk - newdisk=$(create_dev $dir/vdh.disk) || return 1 - - $action $disk $journal $newdisk - status=$? 
- - destroy_dev $dir/vdf.disk $disk - destroy_dev $dir/vdg.disk $journal - destroy_dev $dir/vdh.disk $newdisk - - return $status -} - -function activate_dmcrypt_dev_body() { - local disk=$1 - local journal=$2 - local newdisk=$3 - local uuid=$($uuidgen) - local juuid=$($uuidgen) - - setup - run_mon - test_activate_dmcrypt $disk ${disk}p1 $journal p1 $uuid $juuid not_plain || return 1 - kill_daemons - umount /dev/mapper/$uuid || return 1 - teardown -} - -function test_activate_dmcrypt_dev() { - test_setup_dev_and_run activate_dmcrypt_dev_body -} - -function activate_dmcrypt_plain_dev_body() { - local disk=$1 - local journal=$2 - local newdisk=$3 - local uuid=$($uuidgen) - local juuid=$($uuidgen) - - setup - run_mon - test_activate_dmcrypt $disk ${disk}p1 $journal p1 $uuid $juuid plain || return 1 - kill_daemons - umount /dev/mapper/$uuid || return 1 - teardown -} - -function test_activate_dmcrypt_plain_dev() { - test_setup_dev_and_run activate_dmcrypt_plain_dev_body -} - function test_find_cluster_by_uuid() { setup test_activate_dir 2>&1 | tee $DIR/test_find From 38d0e7bd5502030d65ea433e975ab1af77f86c6f Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Thu, 27 Aug 2015 12:37:15 +0200 Subject: [PATCH 204/654] ceph-disk: use sys.argv instead of implicit Make parse_args and main use and argument instead of relying on argparse.ArgumentParser implicit use of sys.argv. It helps with tests. 
Signed-off-by: Loic Dachary --- src/ceph-disk | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/ceph-disk b/src/ceph-disk index 9dbcd53a87db8..84a75373b1a6b 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -2905,7 +2905,7 @@ def setup_sysconfdir(dir): SYSCONFDIR = dir -def parse_args(): +def parse_args(argv): parser = argparse.ArgumentParser( 'ceph-disk', ) @@ -3164,12 +3164,11 @@ def parse_args(): func=main_zap, ) - args = parser.parse_args() + args = parser.parse_args(argv) return args - -def main(): - args = parse_args() +def main(argv): + args = parse_args(argv) loglevel = logging.WARNING if args.verbose: @@ -3221,5 +3220,5 @@ def main_catch(func, args): if __name__ == '__main__': - main() + main(sys.argv[1:]) warned_about = {} From 982591a9817cff99e743b0befe1bb1b7aa34e0b9 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Thu, 27 Aug 2015 12:41:41 +0200 Subject: [PATCH 205/654] ceph-disk: cosmetic: argparse functions Split the large parse_args function into separate functions, one for each subparser. 
Signed-off-by: Loic Dachary --- src/ceph-disk | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/src/ceph-disk b/src/ceph-disk index 84a75373b1a6b..e2c5b4430740c 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -2948,6 +2948,18 @@ def parse_args(argv): help='sub-command help', ) + make_prepare_parser(subparsers) + make_activate_parser(subparsers) + make_activate_journal_parser(subparsers) + make_activate_all_parser(subparsers) + make_list_parser(subparsers) + make_suppress_parser(subparsers) + make_zap_parser(subparsers) + + args = parser.parse_args(argv) + return args + +def make_prepare_parser(subparsers): prepare_parser = subparsers.add_parser('prepare', help='Prepare a directory or disk for a Ceph OSD') prepare_parser.add_argument( '--cluster', @@ -3025,7 +3037,9 @@ def parse_args(argv): prepare_parser.set_defaults( func=main_prepare, ) + return prepare_parser +def make_activate_parser(subparsers): activate_parser = subparsers.add_parser('activate', help='Activate a Ceph OSD') activate_parser.add_argument( '--mount', @@ -3071,7 +3085,9 @@ def parse_args(argv): activate_key_template='{statedir}/bootstrap-osd/{cluster}.keyring', func=main_activate, ) + return activate_parser +def make_activate_journal_parser(subparsers): activate_journal_parser = subparsers.add_parser('activate-journal', help='Activate an OSD via its journal device') activate_journal_parser.add_argument( 'dev', @@ -3106,7 +3122,9 @@ def parse_args(argv): activate_key_template='{statedir}/bootstrap-osd/{cluster}.keyring', func=main_activate_journal, ) + return activate_journal_parser +def make_activate_all_parser(subparsers): activate_all_parser = subparsers.add_parser('activate-all', help='Activate all tagged OSD partitions') activate_all_parser.add_argument( '--activate-key', @@ -3125,12 +3143,16 @@ def parse_args(argv): activate_key_template='{statedir}/bootstrap-osd/{cluster}.keyring', func=main_activate_all, ) + return activate_all_parser +def 
make_list_parser(subparsers): list_parser = subparsers.add_parser('list', help='List disks, partitions, and Ceph OSDs') list_parser.set_defaults( func=main_list, ) + return list_parser +def make_suppress_parser(subparsers): suppress_parser = subparsers.add_parser('suppress-activate', help='Suppress activate on a device (prefix)') suppress_parser.add_argument( 'path', @@ -3152,7 +3174,9 @@ def parse_args(argv): unsuppress_parser.set_defaults( func=main_unsuppress, ) + return suppress_parser +def make_zap_parser(subparsers): zap_parser = subparsers.add_parser('zap', help='Zap/erase/destroy a device\'s partition table (and contents)') zap_parser.add_argument( 'dev', @@ -3163,9 +3187,7 @@ def parse_args(argv): zap_parser.set_defaults( func=main_zap, ) - - args = parser.parse_args(argv) - return args + return zap_parser def main(argv): args = parse_args(argv) From 60c22a9353c0c4245ccda1dbd3416956ac713a3d Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Thu, 27 Aug 2015 12:43:58 +0200 Subject: [PATCH 206/654] ceph-disk: cosmetic: setup_logging function Split the main function to extract the verbose handling part into the setup_logging function. 
Signed-off-by: Loic Dachary --- src/ceph-disk | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/src/ceph-disk b/src/ceph-disk index e2c5b4430740c..81f0cf30a805a 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -3192,18 +3192,7 @@ def make_zap_parser(subparsers): def main(argv): args = parse_args(argv) - loglevel = logging.WARNING - if args.verbose: - loglevel = logging.DEBUG - - if args.log_stdout: - ch = logging.StreamHandler(stream=sys.stdout) - ch.setLevel(loglevel) - LOG.addHandler(ch) - else: - logging.basicConfig( - level=loglevel, - ) + setup_logging(args.verbose, args.log_stdout) if args.prepend_to_path != '': path = os.environ.get('PATH', os.defpath) @@ -3217,6 +3206,20 @@ def main(argv): else: main_catch(args.func, args) +def setup_logging(verbose, log_stdout): + loglevel = logging.WARNING + if verbose: + loglevel = logging.DEBUG + + if log_stdout: + ch = logging.StreamHandler(stream=sys.stdout) + ch.setLevel(loglevel) + LOG.addHandler(ch) + else: + logging.basicConfig( + level=loglevel, + ) + def main_catch(func, args): try: From 5fd9486c76bde3d1ce8c5d28bfc68089527982c1 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Thu, 27 Aug 2015 12:51:14 +0200 Subject: [PATCH 207/654] ceph-disk: fix dmcrypt typo Fix the typo introduced by 29431944c77adbc3464a8faeb7e052b24f821780 Signed-off-by: Loic Dachary --- src/ceph-disk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ceph-disk b/src/ceph-disk index 81f0cf30a805a..f5169719ffc18 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -2473,7 +2473,7 @@ def main_activate_journal(args): raise Error('activate-journal --dmcrypt called for invalid dev %s' % (rawdev)) part_uuid = get_partition_uuid(rawdev) dmcrypt_key_path = os.path.join(args.dmcrypt_key_dir, part_uuid) - dev = dmcrypt_map(rawdev, dmcrypt_key_path, partd_uuid) + dev = dmcrypt_map(rawdev, dmcrypt_key_path, part_uuid) else: dev = args.dev From 796a1403dc23d84f7c5477c862ca265b5d6a6802 Mon Sep 
17 00:00:00 2001 From: Loic Dachary Date: Thu, 27 Aug 2015 13:14:13 +0200 Subject: [PATCH 208/654] ceph-disk: implement list --format json The ceph-disk list command is reworked in two parts: 1) the list_devices function that build an internal structure with all the information regarding disks and partitions. 2) a function to display the internal structure in plain text or json The ceph-disk list show the plain text version and is backward compatible. The ceph-disk list --format json output has more information about each device than the plain text version and is intended for scripts. The unit tests cover all modified lines (2610 to 2849). Signed-off-by: Loic Dachary --- src/ceph-disk | 318 +++++---- .../python/ceph-disk/tests/test_ceph_disk.py | 640 +++++++++++++++++- src/test/python/ceph-disk/tox.ini | 9 +- 3 files changed, 839 insertions(+), 128 deletions(-) diff --git a/src/ceph-disk b/src/ceph-disk index f5169719ffc18..10a7b64fafd4b 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -20,6 +20,7 @@ import argparse import errno import fcntl +import json import logging import os import os.path @@ -511,12 +512,14 @@ def get_partition_dev(dev, pnum): raise Error('partition %d for %s does not appear to exist' % (pnum, dev)) -def list_all_partitions(): +def list_all_partitions(names): """ Return a list of devices and partitions """ + if not names: + names = os.listdir('/sys/block') dev_part_list = {} - for name in os.listdir('/sys/block'): + for name in names: LOG.debug("list_all_partitions: " + name) # /dev/fd0 may hang http://tracker.ceph.com/issues/6827 if re.match(r'^fd\d$', name): @@ -2627,128 +2630,178 @@ def get_sgdisk_partition_info(dev, regexp): return m.group(1).lower() return None - -def more_osd_info(path, uuid_map): - desc = [] - ceph_fsid = get_oneliner(path, 'ceph_fsid') - if ceph_fsid: - cluster = find_cluster_by_uuid(ceph_fsid) - if cluster: - desc.append('cluster ' + cluster) - else: - desc.append('unknown cluster ' + ceph_fsid) - - who = 
get_oneliner(path, 'whoami') - if who: - desc.append('osd.%s' % who) - - journal_uuid = get_oneliner(path, 'journal_uuid') - if journal_uuid: - journal_uuid = journal_uuid.lower() - if journal_uuid in uuid_map: - desc.append('journal %s' % uuid_map[journal_uuid]) - - return desc - -def list_dev_osd(dev, uuid_map): - path = is_mounted(dev) - fs_type = get_dev_fs(dev) - desc = [] - if path: - desc.append('active') - desc.extend(more_osd_info(path, uuid_map)) - elif fs_type: +def more_osd_info(path, uuid_map, desc): + desc['ceph_fsid'] = get_oneliner(path, 'ceph_fsid') + if desc['ceph_fsid']: + desc['cluster'] = find_cluster_by_uuid(desc['ceph_fsid']) + desc['whoami'] = get_oneliner(path, 'whoami') + desc['journal_uuid'] = get_oneliner(path, 'journal_uuid') + if desc['journal_uuid']: + desc['journal_uuid'] = desc['journal_uuid'].lower() + if desc['journal_uuid'] in uuid_map: + desc['journal_dev'] = uuid_map[desc['journal_uuid']] + +def list_dev_osd(dev, uuid_map, desc): + desc['mount'] = is_mounted(dev) + desc['fs_type'] = get_dev_fs(dev) + desc['state'] = 'unprepared' + if desc['mount']: + desc['state'] = 'active' + more_osd_info(desc['mount'], uuid_map, desc) + elif desc['fs_type']: try: - tpath = mount(dev=dev, fstype=fs_type, options='') + tpath = mount(dev=dev, fstype=desc['fs_type'], options='') if tpath: try: magic = get_oneliner(tpath, 'magic') if magic is not None: - desc.append('prepared') - desc.extend(more_osd_info(tpath, uuid_map)) + desc['magic'] = magic + desc['state'] = 'prepared' + more_osd_info(tpath, uuid_map, desc) finally: unmount(tpath) except MountError: pass + +def list_format_more_osd_info_plain(dev): + desc = [] + if dev.get('ceph_fsid'): + if dev.get('cluster'): + desc.append('cluster ' + dev['cluster']) + else: + desc.append('unknown cluster ' + dev['ceph_fsid']) + if dev.get('whoami'): + desc.append('osd.%s' % dev['whoami']) + if dev.get('journal_dev'): + desc.append('journal %s' % dev['journal_dev']) return desc +def 
list_format_dev_plain(dev, devices=[], prefix=''): + desc = [] + if dev['ptype'] == OSD_UUID: + desc = ['ceph data', dev['state']] + list_format_more_osd_info_plain(dev) + elif dev['ptype'] in (DMCRYPT_OSD_UUID, + DMCRYPT_LUKS_OSD_UUID): + dmcrypt = dev['dmcrypt'] + if not dmcrypt['holders']: + desc = ['ceph data (dmcrypt %s)' % dmcrypt['type'], 'not currently mapped'] + elif len(dmcrypt['holders']) == 1: + holder = '/dev/' + dmcrypt['holders'][0] + def lookup_dev(devices, path): + for device in devices: + if device['path'] == path: + return device + holder_dev = lookup_dev(devices, holder) + desc = ['ceph data (dmcrypt %s %s)' % (dmcrypt['type'], holder)] + list_format_more_osd_info_plain(holder_dev) + else: + desc = ['ceph data (dmcrypt %s)' % dmcrypt['type'], 'holders: ' + ','.join(dmcrypt['holders'])] + elif dev['ptype'] == JOURNAL_UUID: + desc.append('ceph journal') + if dev['journal_for']: + desc.append('for %s' % dev['journal_for']) + elif dev['ptype'] in (DMCRYPT_JOURNAL_UUID, + DMCRYPT_LUKS_JOURNAL_UUID): + dmcrypt = dev['dmcrypt'] + if dmcrypt['holders'] and len(dmcrypt['holders']) == 1: + desc = ['ceph journal (dmcrypt %s /dev/%s)' % (dmcrypt['type'], dmcrypt['holders'][0])] + else: + desc = ['ceph journal (dmcrypt %s)' % dmcrypt['type']] + if dev.get('journal_for'): + desc.append('for %s' % dev['journal_for']) + else: + desc.append(dev['type']) + if dev.get('fs_type'): + desc.append(dev['fs_type']) + elif dev.get('ptype'): + desc.append(dev['ptype']) + if dev.get('mount'): + desc.append('mounted on %s' % dev['mount']) + return '%s%s %s' % (prefix, dev['path'], ', '.join(desc)) + +def list_format_plain(devices): + lines = [] + for device in devices: + if device.get('partitions'): + lines.append('%s :' % device['path']) + for p in sorted(device['partitions']): + lines.append(list_format_dev_plain(dev=p, + devices=devices, + prefix=' ')) + else: + lines.append(list_format_dev_plain(dev=device, + devices=devices, + prefix='')) + return "\n".join(lines) + 
def list_dev(dev, uuid_map, journal_map): - ptype = 'unknown' - prefix = '' - if is_partition(dev): - ptype = get_partition_type(dev) - prefix = ' ' + info = { + 'path': dev, + 'dmcrypt': {}, + } - LOG.debug("list_dev(dev = " + dev + ", ptype = " + ptype + ")") - desc = [] + info['is_partition'] = is_partition(dev) + if info['is_partition']: + ptype = get_partition_type(dev) + info['uuid'] = get_partition_uuid(dev) + else: + ptype = 'unknown' + info['ptype'] = ptype + LOG.info("list_dev(dev = " + dev + ", ptype = " + str(ptype) + ")") if ptype in (OSD_UUID, MPATH_OSD_UUID): - desc = list_dev_osd(dev, uuid_map) - if desc: - desc = ['ceph data'] + desc - else: - desc = ['ceph data', 'unprepared'] + info['type'] = 'data' + if ptype == MPATH_OSD_UUID: + info['multipath'] = True + list_dev_osd(dev, uuid_map, info) elif ptype == DMCRYPT_OSD_UUID: holders = is_held(dev) - if not holders: - desc = ['ceph data (dmcrypt plain)', 'not currently mapped'] - elif len(holders) == 1: - holder = '/dev/' + holders[0] - fs_desc = list_dev_osd(holder, uuid_map) - desc = ['ceph data (dmcrypt plain %s)' % holder] + fs_desc - else: - desc = ['ceph data (dmcrypt plain)', 'holders: ' + ','.join(holders)] + info['type'] = 'data' + info['dmcrypt']['holders'] = holders + info['dmcrypt']['type'] = 'plain' + if len(holders) == 1: + list_dev_osd('/dev/' + holders[0], uuid_map, info) elif ptype == DMCRYPT_LUKS_OSD_UUID: holders = is_held(dev) - if not holders: - desc = ['ceph data (dmcrypt LUKS)', 'not currently mapped'] - elif len(holders) == 1: - holder = '/dev/' + holders[0] - fs_desc = list_dev_osd(holder, uuid_map) - desc = ['ceph data (dmcrypt LUKS %s)' % holder] + fs_desc - else: - desc = ['ceph data (dmcrypt LUKS)', 'holders: ' + ','.join(holders)] + info['type'] = 'data' + info['dmcrypt']['holders'] = holders + info['dmcrypt']['type'] = 'LUKS' + if len(holders) == 1: + list_dev_osd('/dev/' + holders[0], uuid_map, info) elif ptype in (JOURNAL_UUID, MPATH_JOURNAL_UUID): - desc.append('ceph 
journal') - part_uuid = get_partition_uuid(dev) - if part_uuid and part_uuid in journal_map: - desc.append('for %s' % journal_map[part_uuid]) + info['type'] = 'journal' + if ptype == MPATH_JOURNAL_UUID: + info['multipath'] = True + if info.get('uuid') in journal_map: + info['journal_for'] = journal_map[info['uuid']] elif ptype == DMCRYPT_JOURNAL_UUID: holders = is_held(dev) - if len(holders) == 1: - desc = ['ceph journal (dmcrypt plain /dev/%s)' % holders[0]] - else: - desc = ['ceph journal (dmcrypt plain)'] - part_uuid = get_partition_uuid(dev) - if part_uuid and part_uuid in journal_map: - desc.append('for %s' % journal_map[part_uuid]) + info['type'] = 'journal' + info['dmcrypt']['type'] = 'plain' + info['dmcrypt']['holders'] = holders + if info.get('uuid') in journal_map: + info['journal_for'] = journal_map[info['uuid']] elif ptype == DMCRYPT_LUKS_JOURNAL_UUID: holders = is_held(dev) - if len(holders) == 1: - desc = ['ceph journal (dmcrypt LUKS /dev/%s)' % holders[0]] - else: - desc = ['ceph journal (dmcrypt LUKS)'] - part_uuid = get_partition_uuid(dev) - if part_uuid and part_uuid in journal_map: - desc.append('for %s' % journal_map[part_uuid]) + info['type'] = 'journal' + info['dmcrypt']['type'] = 'LUKS' + info['dmcrypt']['holders'] = holders + if info.get('uuid') in journal_map: + info['journal_for'] = journal_map[info['uuid']] else: path = is_mounted(dev) fs_type = get_dev_fs(dev) if is_swap(dev): - desc.append('swap') + info['type'] = 'swap' else: - desc.append('other') + info['type'] = 'other' if fs_type: - desc.append(fs_type) - elif ptype: - desc.append(ptype) + info['fs_type'] = fs_type if path: - desc.append('mounted on %s' % path) + info['mount'] = path - print '%s%s %s' % (prefix, dev, ', '.join(desc)) + return info - -def main_list(args): - partmap = list_all_partitions() +def list_devices(args): + partmap = list_all_partitions(args.path) uuid_map = {} journal_map = {} @@ -2759,13 +2812,26 @@ def main_list(args): if part_uuid: uuid_map[part_uuid] = 
dev ptype = get_partition_type(dev) - LOG.debug("main_list: " + dev + " " + ptype + " " + - str(part_uuid)) - if ptype == OSD_UUID: - fs_type = get_dev_fs(dev) + LOG.debug("main_list: " + dev + + " ptype = " + str(ptype) + + " uuid = " + str(part_uuid)) + if ptype in (OSD_UUID, + DMCRYPT_OSD_UUID, + DMCRYPT_LUKS_OSD_UUID): + if ptype in (DMCRYPT_OSD_UUID, + DMCRYPT_LUKS_OSD_UUID): + holders = is_held(dev) + if len(holders) != 1: + continue + dev_to_mount = '/dev/' + holders[0] + else: + dev_to_mount = dev + + fs_type = get_dev_fs(dev_to_mount) if fs_type is not None: try: - tpath = mount(dev=dev, fstype=fs_type, options='') + tpath = mount(dev=dev_to_mount, + fstype=fs_type, options='') try: journal_uuid = get_oneliner(tpath, 'journal_uuid') if journal_uuid: @@ -2774,34 +2840,34 @@ def main_list(args): unmount(tpath) except MountError: pass - if ptype in (DMCRYPT_OSD_UUID, - DMCRYPT_LUKS_OSD_UUID): - holders = is_held(dev) - if len(holders) == 1: - holder = '/dev/' + holders[0] - fs_type = get_dev_fs(holder) - if fs_type is not None: - try: - tpath = mount(dev=holder, fstype=fs_type, options='') - try: - journal_uuid = get_oneliner(tpath, 'journal_uuid') - if journal_uuid: - journal_map[journal_uuid.lower()] = dev - finally: - unmount(tpath) - except MountError: - pass - - LOG.debug("main_list: " + str(partmap) + ", " + - str(uuid_map) + ", " + str(journal_map)) + LOG.debug("main_list: " + str(partmap) + ", uuid_map = " + + str(uuid_map) + ", journal_map = " + str(journal_map)) + + devices = [] for base, parts in sorted(partmap.iteritems()): if parts: - print '%s :' % get_dev_path(base) + disk = { 'path': get_dev_path(base) } + partitions = [] for p in sorted(parts): - list_dev(get_dev_path(p), uuid_map, journal_map) + partitions.append(list_dev(get_dev_path(p), uuid_map, journal_map)) + disk['partitions'] = partitions + devices.append(disk) else: - list_dev(get_dev_path(base), uuid_map, journal_map) + device = list_dev(get_dev_path(base), uuid_map, journal_map) + 
device['path'] = get_dev_path(base) + devices.append(device) + LOG.debug("list_devices: " + str(devices)) + return devices + +def main_list(args): + devices = list_devices(args) + if args.format == 'json': + print json.dumps(devices) + else: + output = list_format_plain(devices) + if output: + print output ########################### @@ -3147,6 +3213,18 @@ def make_activate_all_parser(subparsers): def make_list_parser(subparsers): list_parser = subparsers.add_parser('list', help='List disks, partitions, and Ceph OSDs') + list_parser.add_argument( + '--format', + help='output format', + default='plain', + choices=['json','plain'], + ) + list_parser.add_argument( + 'path', + metavar='PATH', + nargs='*', + help='path to block devices, relative to /sys/block', + ) list_parser.set_defaults( func=main_list, ) diff --git a/src/test/python/ceph-disk/tests/test_ceph_disk.py b/src/test/python/ceph-disk/tests/test_ceph_disk.py index b6484875f935f..a150dd3241040 100644 --- a/src/test/python/ceph-disk/tests/test_ceph_disk.py +++ b/src/test/python/ceph-disk/tests/test_ceph_disk.py @@ -1,10 +1,640 @@ +from mock import patch, DEFAULT, Mock +import argparse +import pytest import ceph_disk -# This file tests nothing (yet) except for being able to import ceph_disk -# correctly and thus ensuring somewhat that it will work under different Python -# versions. You must write unittests here so that code has adequate coverage. 
+def fail_to_mount(dev, fstype, options): + raise ceph_disk.MountError(dev + " mount fail") class TestCephDisk(object): - def test_basic(self): - assert True + def setup_class(self): + ceph_disk.setup_logging(verbose=True, log_stdout=False) + + def test_main_list_json(self, capsys): + args = ceph_disk.parse_args(['list', '--format', 'json']) + with patch.multiple( + ceph_disk, + list_devices=lambda args: {}): + ceph_disk.main_list(args) + out, err = capsys.readouterr() + assert '{}\n' == out + + def test_main_list_plain(self, capsys): + args = ceph_disk.parse_args(['list']) + with patch.multiple( + ceph_disk, + list_devices=lambda args: {}): + ceph_disk.main_list(args) + out, err = capsys.readouterr() + assert '' == out + + def test_list_format_more_osd_info_plain(self): + dev = { + 'ceph_fsid': 'UUID', + 'cluster': 'ceph', + 'whoami': '1234', + 'journal_dev': '/dev/Xda2', + } + out = ceph_disk.list_format_more_osd_info_plain(dev) + assert dev['cluster'] in " ".join(out) + assert dev['journal_dev'] in " ".join(out) + assert dev['whoami'] in " ".join(out) + + dev = { + 'ceph_fsid': 'UUID', + 'whoami': '1234', + 'journal_dev': '/dev/Xda2', + } + out = ceph_disk.list_format_more_osd_info_plain(dev) + assert 'unknown cluster' in " ".join(out) + + def test_list_format_plain(self): + payload = [{ + 'path': '/dev/Xda', + 'ptype': 'unknown', + 'type': 'other', + 'mount': '/somewhere', + }] + out = ceph_disk.list_format_plain(payload) + assert payload[0]['path'] in out + assert payload[0]['type'] in out + assert payload[0]['mount'] in out + + payload = [{ + 'path': '/dev/Xda1', + 'ptype': 'unknown', + 'type': 'swap', + }] + out = ceph_disk.list_format_plain(payload) + assert payload[0]['path'] in out + assert payload[0]['type'] in out + + payload = [{ + 'path': '/dev/Xda', + 'partitions': [ + { + 'dmcrypt': {}, + 'ptype': 'whatever', + 'is_partition': True, + 'fs_type': 'ext4', + 'path': '/dev/Xda1', + 'mounted': '/somewhere', + 'type': 'other', + } + ], + }] + out = 
ceph_disk.list_format_plain(payload) + assert payload[0]['path'] in out + assert payload[0]['partitions'][0]['path'] in out + + def test_list_format_dev_plain(dev): + # + # data + # + dev = { + 'path': '/dev/Xda1', + 'ptype': ceph_disk.OSD_UUID, + 'state': 'prepared', + 'whoami': '1234', + } + out = ceph_disk.list_format_dev_plain(dev) + assert 'data' in out + assert dev['whoami'] in out + assert dev['state'] in out + # + # journal + # + dev = { + 'path': '/dev/Xda2', + 'ptype': ceph_disk.JOURNAL_UUID, + 'journal_for': '/dev/Xda1', + } + out = ceph_disk.list_format_dev_plain(dev) + assert 'journal' in out + assert dev['journal_for'] in out + + # + # dmcrypt data + # + ptype2type = { + ceph_disk.DMCRYPT_OSD_UUID: 'plain', + ceph_disk.DMCRYPT_LUKS_OSD_UUID: 'LUKS', + } + for (ptype, type) in ptype2type.iteritems(): + for holders in ((), ("dm_0",), ("dm_0", "dm_1")): + devices = [{ + 'path': '/dev/dm_0', + 'whoami': '1234', + }] + dev = { + 'dmcrypt': { + 'holders': holders, + 'type': type, + }, + 'path': '/dev/Xda1', + 'ptype': ptype, + 'state': 'prepared', + } + out = ceph_disk.list_format_dev_plain(dev, devices) + assert 'data' in out + assert 'dmcrypt' in out + assert type in out + if len(holders) == 1: + assert devices[0]['whoami'] in out + for holder in holders: + assert holder in out + + # + # dmcrypt journal + # + ptype2type = { + ceph_disk.DMCRYPT_JOURNAL_UUID: 'plain', + ceph_disk.DMCRYPT_LUKS_JOURNAL_UUID: 'LUKS', + } + for (ptype, type) in ptype2type.iteritems(): + for holders in ((), ("dm_0",)): + dev = { + 'path': '/dev/Xda2', + 'ptype': ptype, + 'journal_for': '/dev/Xda1', + 'dmcrypt': { + 'holders': holders, + 'type': type, + }, + } + out = ceph_disk.list_format_dev_plain(dev, devices) + assert 'journal' in out + assert 'dmcrypt' in out + assert type in out + assert dev['journal_for'] in out + if len(holders) == 1: + assert holders[0] in out + + def test_list_dev_osd(self): + dev = "Xda" + mount_path = '/mount/path' + fs_type = 'ext4' + cluster = 
'ceph' + uuid_map = {} + def more_osd_info(path, uuid_map, desc): + desc['cluster'] = cluster + # + # mounted therefore active + # + with patch.multiple( + ceph_disk, + is_mounted=lambda dev: mount_path, + get_dev_fs=lambda dev: fs_type, + more_osd_info=more_osd_info + ): + desc = {} + ceph_disk.list_dev_osd(dev, uuid_map, desc) + assert {'cluster': 'ceph', + 'fs_type': 'ext4', + 'mount': '/mount/path', + 'state': 'active'} == desc + # + # not mounted and cannot mount: unprepared + # + mount_path = None + with patch.multiple( + ceph_disk, + is_mounted=lambda dev: mount_path, + get_dev_fs=lambda dev: fs_type, + mount=fail_to_mount, + more_osd_info=more_osd_info + ): + desc = {} + ceph_disk.list_dev_osd(dev, uuid_map, desc) + assert {'fs_type': 'ext4', + 'mount': mount_path, + 'state': 'unprepared'} == desc + # + # not mounted and magic found: prepared + # + def get_oneliner(path, what): + if what == 'magic': + return ceph_disk.CEPH_OSD_ONDISK_MAGIC + else: + raise Exception('unknown ' + what) + with patch.multiple( + ceph_disk, + is_mounted=lambda dev: mount_path, + get_dev_fs=lambda dev: fs_type, + mount=DEFAULT, + unmount=DEFAULT, + get_oneliner=get_oneliner, + more_osd_info=more_osd_info + ): + desc = {} + ceph_disk.list_dev_osd(dev, uuid_map, desc) + assert {'cluster': 'ceph', + 'fs_type': 'ext4', + 'mount': mount_path, + 'magic': ceph_disk.CEPH_OSD_ONDISK_MAGIC, + 'state': 'prepared'} == desc + + def test_list_all_partitions(self): + partition_uuid = "56244cf5-83ef-4984-888a-2d8b8e0e04b2" + disk = "Xda" + partition = "Xda1" + + with patch( + 'ceph_disk.os', + listdir=lambda path: [disk], + ), patch.multiple( + ceph_disk, + list_partitions=lambda dev: [partition], + ): + assert {disk: [partition]} == ceph_disk.list_all_partitions([]) + + with patch.multiple( + ceph_disk, + list_partitions=lambda dev: [partition], + ): + assert {disk: [partition]} == ceph_disk.list_all_partitions([disk]) + + def test_list_data(self): + args = ceph_disk.parse_args(['list']) + # + 
# a data partition that fails to mount is silently + # ignored + # + partition_uuid = "56244cf5-83ef-4984-888a-2d8b8e0e04b2" + disk = "Xda" + partition = "Xda1" + fs_type = "ext4" + + with patch.multiple( + ceph_disk, + list_all_partitions=lambda names: { disk: [partition] }, + get_partition_uuid=lambda dev: partition_uuid, + get_partition_type=lambda dev: ceph_disk.OSD_UUID, + get_dev_fs=lambda dev: fs_type, + mount=fail_to_mount, + unmount=DEFAULT, + is_partition=lambda dev: True, + ): + expect = [{'path': '/dev/' + disk, + 'partitions': [{ + 'dmcrypt': {}, + 'fs_type': fs_type, + 'is_partition': True, + 'mount': None, + 'path': '/dev/' + partition, + 'ptype': ceph_disk.OSD_UUID, + 'state': 'unprepared', + 'type': 'data', + 'uuid': partition_uuid, + }]}] + assert expect == ceph_disk.list_devices(args) + + def test_list_dmcrypt_data(self): + args = ceph_disk.parse_args(['list']) + partition_type2type = { + ceph_disk.DMCRYPT_OSD_UUID: 'plain', + ceph_disk.DMCRYPT_LUKS_OSD_UUID: 'LUKS', + } + for (partition_type, type) in partition_type2type.iteritems(): + # + # dmcrypt data partition with one holder + # + partition_uuid = "56244cf5-83ef-4984-888a-2d8b8e0e04b2" + disk = "Xda" + partition = "Xda1" + holders = ["dm-0"] + with patch.multiple( + ceph_disk, + is_held=lambda dev: holders, + list_all_partitions=lambda names: { disk: [partition] }, + get_partition_uuid=lambda dev: partition_uuid, + get_partition_type=lambda dev: partition_type, + is_partition=lambda dev: True, + ): + expect = [{'path': '/dev/' + disk, + 'partitions': [{ + 'dmcrypt': { + 'holders': holders, + 'type': type, + }, + 'fs_type': None, + 'is_partition': True, + 'mount': None, + 'path': '/dev/' + partition, + 'ptype': partition_type, + 'state': 'unprepared', + 'type': 'data', + 'uuid': partition_uuid, + }]}] + assert expect == ceph_disk.list_devices(args) + # + # dmcrypt data partition with two holders + # + partition_uuid = "56244cf5-83ef-4984-888a-2d8b8e0e04b2" + disk = "Xda" + partition = "Xda1" 
+ holders = ["dm-0","dm-1"] + with patch.multiple( + ceph_disk, + is_held=lambda dev: holders, + list_all_partitions=lambda names: { disk: [partition] }, + get_partition_uuid=lambda dev: partition_uuid, + get_partition_type=lambda dev: partition_type, + is_partition=lambda dev: True, + ): + expect = [{'path': '/dev/' + disk, + 'partitions': [{ + 'dmcrypt': { + 'holders': holders, + 'type': type, + }, + 'is_partition': True, + 'path': '/dev/' + partition, + 'ptype': partition_type, + 'type': 'data', + 'uuid': partition_uuid, + }]}] + assert expect == ceph_disk.list_devices(args) + + def test_list_multipath(self): + args = ceph_disk.parse_args(['list']) + # + # multipath data partition + # + partition_uuid = "56244cf5-83ef-4984-888a-2d8b8e0e04b2" + disk = "Xda" + partition = "Xda1" + with patch.multiple( + ceph_disk, + list_all_partitions=lambda names: { disk: [partition] }, + get_partition_uuid=lambda dev: partition_uuid, + get_partition_type=lambda dev: ceph_disk.MPATH_OSD_UUID, + is_partition=lambda dev: True, + ): + expect = [{'path': '/dev/' + disk, + 'partitions': [{ + 'dmcrypt': {}, + 'fs_type': None, + 'is_partition': True, + 'mount': None, + 'multipath': True, + 'path': '/dev/' + partition, + 'ptype': ceph_disk.MPATH_OSD_UUID, + 'state': 'unprepared', + 'type': 'data', + 'uuid': partition_uuid, + }]}] + assert expect == ceph_disk.list_devices(args) + # + # multipath journal partition + # + journal_partition_uuid = "2cc40457-259e-4542-b029-785c7cc37871" + with patch.multiple( + ceph_disk, + list_all_partitions=lambda names: { disk: [partition] }, + get_partition_uuid=lambda dev: journal_partition_uuid, + get_partition_type=lambda dev: ceph_disk.MPATH_JOURNAL_UUID, + is_partition=lambda dev: True, + ): + expect = [{'path': '/dev/' + disk, + 'partitions': [{ + 'dmcrypt': {}, + 'is_partition': True, + 'multipath': True, + 'path': '/dev/' + partition, + 'ptype': ceph_disk.MPATH_JOURNAL_UUID, + 'type': 'journal', + 'uuid': journal_partition_uuid, + }]}] + assert 
expect == ceph_disk.list_devices(args) + + def test_list_dmcrypt(self): + self.list(ceph_disk.DMCRYPT_OSD_UUID, ceph_disk.DMCRYPT_JOURNAL_UUID) + self.list(ceph_disk.DMCRYPT_LUKS_OSD_UUID, ceph_disk.DMCRYPT_LUKS_JOURNAL_UUID) + + def test_list_normal(self): + self.list(ceph_disk.OSD_UUID, ceph_disk.JOURNAL_UUID) + + def list(self, data_ptype, journal_ptype): + args = ceph_disk.parse_args(['--verbose', 'list']) + # + # a single disk has a data partition and a journal + # partition and the osd is active + # + data_uuid = "56244cf5-83ef-4984-888a-2d8b8e0e04b2" + disk = "Xda" + data = "Xda1" + data_holder = "dm-0" + journal = "Xda2" + journal_holder = "dm-0" + mount_path = '/mount/path' + fs_type = 'ext4' + journal_uuid = "7ad5e65a-0ca5-40e4-a896-62a74ca61c55" + ceph_fsid = "60a2ef70-d99b-4b9b-a83c-8a86e5e60091" + osd_id = '1234' + def get_oneliner(path, what): + if what == 'journal_uuid': + return journal_uuid + elif what == 'ceph_fsid': + return ceph_fsid + elif what == 'whoami': + return osd_id + else: + raise Exception('unknown ' + what) + def get_partition_uuid(dev): + if dev == '/dev/' + data: + return data_uuid + elif dev == '/dev/' + journal: + return journal_uuid + else: + raise Exception('unknown ' + dev) + def get_partition_type(dev): + if (dev == '/dev/' + data or + dev == '/dev/' + data_holder): + return data_ptype + elif (dev == '/dev/' + journal or + dev == '/dev/' + journal_holder): + return journal_ptype + else: + raise Exception('unknown ' + dev) + cluster = 'ceph' + if data_ptype == ceph_disk.OSD_UUID: + data_dmcrypt = {} + elif data_ptype == ceph_disk.DMCRYPT_OSD_UUID: + data_dmcrypt = { + 'type': 'plain', + 'holders': [data_holder], + } + elif data_ptype == ceph_disk.DMCRYPT_LUKS_OSD_UUID: + data_dmcrypt = { + 'type': 'LUKS', + 'holders': [data_holder], + } + else: + raise Exception('unknown ' + data_ptype) + + if journal_ptype == ceph_disk.JOURNAL_UUID: + journal_dmcrypt = {} + elif journal_ptype == ceph_disk.DMCRYPT_JOURNAL_UUID: + 
journal_dmcrypt = { + 'type': 'plain', + 'holders': [journal_holder], + } + elif journal_ptype == ceph_disk.DMCRYPT_LUKS_JOURNAL_UUID: + journal_dmcrypt = { + 'type': 'LUKS', + 'holders': [journal_holder], + } + else: + raise Exception('unknown ' + journal_ptype) + + if data_dmcrypt: + def is_held(dev): + if dev == '/dev/' + data: + return [data_holder] + elif dev == '/dev/' + journal: + return [journal_holder] + else: + raise Exception('unknown ' + dev) + else: + def is_held(dev): + return [] + + with patch.multiple( + ceph_disk, + list_all_partitions=lambda names: { disk: [data, journal] }, + get_dev_fs=lambda dev: fs_type, + is_mounted=lambda dev: mount_path, + get_partition_uuid=get_partition_uuid, + get_partition_type=get_partition_type, + find_cluster_by_uuid=lambda ceph_fsid: cluster, + is_partition=lambda dev: True, + mount=DEFAULT, + unmount=DEFAULT, + get_oneliner=get_oneliner, + is_held=is_held, + ): + expect = [{'path': '/dev/' + disk, + 'partitions': [{ + 'ceph_fsid': ceph_fsid, + 'cluster': cluster, + 'dmcrypt': data_dmcrypt, + 'fs_type': fs_type, + 'is_partition': True, + 'journal_dev': '/dev/' + journal, + 'journal_uuid': journal_uuid, + 'mount': mount_path, + 'path': '/dev/' + data, + 'ptype': data_ptype, + 'state': 'active', + 'type': 'data', + 'whoami': osd_id, + 'uuid': data_uuid, + }, { + 'dmcrypt': journal_dmcrypt, + 'is_partition': True, + 'journal_for': '/dev/' + data, + 'path': '/dev/' + journal, + 'ptype': journal_ptype, + 'type': 'journal', + 'uuid': journal_uuid, + }, + ]}] + assert expect == ceph_disk.list_devices(args) + + def test_list_other(self): + args = ceph_disk.parse_args(['list']) + # + # not swap, unknown fs type, not mounted, with uuid + # + partition_uuid = "56244cf5-83ef-4984-888a-2d8b8e0e04b2" + partition_type = "e51adfb9-e9fd-4718-9fc1-7a0cb03ea3f4" + disk = "Xda" + partition = "Xda1" + with patch.multiple( + ceph_disk, + list_all_partitions=lambda names: { disk: [partition] }, + get_partition_uuid=lambda dev: 
partition_uuid, + get_partition_type=lambda dev: partition_type, + is_partition=lambda dev: True, + ): + expect = [{'path': '/dev/' + disk, + 'partitions': [{'dmcrypt': {}, + 'is_partition': True, + 'path': '/dev/' + partition, + 'ptype': partition_type, + 'type': 'other', + 'uuid': partition_uuid}]}] + assert expect == ceph_disk.list_devices(args) + # + # not swap, mounted, ext4 fs type, with uuid + # + partition_uuid = "56244cf5-83ef-4984-888a-2d8b8e0e04b2" + partition_type = "e51adfb9-e9fd-4718-9fc1-7a0cb03ea3f4" + disk = "Xda" + partition = "Xda1" + mount_path = '/mount/path' + fs_type = 'ext4' + with patch.multiple( + ceph_disk, + list_all_partitions=lambda names: { disk: [partition] }, + get_dev_fs=lambda dev: fs_type, + is_mounted=lambda dev: mount_path, + get_partition_uuid=lambda dev: partition_uuid, + get_partition_type=lambda dev: partition_type, + is_partition=lambda dev: True, + ): + expect = [{'path': '/dev/' + disk, + 'partitions': [{'dmcrypt': {}, + 'is_partition': True, + 'mount': mount_path, + 'fs_type': fs_type, + 'path': '/dev/' + partition, + 'ptype': partition_type, + 'type': 'other', + 'uuid': partition_uuid, + }]}] + assert expect == ceph_disk.list_devices(args) + + # + # swap, with uuid + # + partition_uuid = "56244cf5-83ef-4984-888a-2d8b8e0e04b2" + partition_type = "e51adfb9-e9fd-4718-9fc1-7a0cb03ea3f4" + disk = "Xda" + partition = "Xda1" + with patch.multiple( + ceph_disk, + list_all_partitions=lambda names: { disk: [partition] }, + is_swap=lambda dev: True, + get_partition_uuid=lambda dev: partition_uuid, + get_partition_type=lambda dev: partition_type, + is_partition=lambda dev: True, + ): + expect = [{'path': '/dev/' + disk, + 'partitions': [{'dmcrypt': {}, + 'is_partition': True, + 'path': '/dev/' + partition, + 'ptype': partition_type, + 'type': 'swap', + 'uuid': partition_uuid}]}] + assert expect == ceph_disk.list_devices(args) + + # + # whole disk + # + partition_uuid = "56244cf5-83ef-4984-888a-2d8b8e0e04b2" + disk = "Xda" + 
partition = "Xda1" + with patch.multiple( + ceph_disk, + list_all_partitions=lambda names: { disk: [] }, + is_partition=lambda dev: False, + ): + expect = [{'path': '/dev/' + disk, + 'dmcrypt': {}, + 'is_partition': False, + 'ptype': 'unknown', + 'type': 'other'}] + assert expect == ceph_disk.list_devices(args) diff --git a/src/test/python/ceph-disk/tox.ini b/src/test/python/ceph-disk/tox.ini index 8017044887b15..194c0fc0b1ae4 100644 --- a/src/test/python/ceph-disk/tox.ini +++ b/src/test/python/ceph-disk/tox.ini @@ -1,16 +1,19 @@ [tox] -envlist = py26, py27, flake8 +envlist = py27, flake8 skipsdist=True [testenv] deps= pytest + mock + pytest-cov==1.6 + coverage==3.7.1 commands= python setup.py develop - py.test -v + py.test -vv --cov=ceph_disk.py --cov-report=term-missing [testenv:flake8] deps= flake8 -commands=flake8 --select=F ceph_disk.py +commands=flake8 --select=F,E9 ceph_disk.py From 8c586e63d3955a586e9e922c1b655b3cf2250e27 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Thu, 27 Aug 2015 22:08:46 +0200 Subject: [PATCH 209/654] ceph-disk: CentOS 7 is systemd http://tracker.ceph.com/issues/12786 Fixes: #12786 Signed-off-by: Loic Dachary --- src/ceph-detect-init/ceph_detect_init/centos/__init__.py | 2 ++ src/ceph-detect-init/tests/test_all.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/src/ceph-detect-init/ceph_detect_init/centos/__init__.py b/src/ceph-detect-init/ceph_detect_init/centos/__init__.py index f7bf85beda8c4..b9738a73b398a 100644 --- a/src/ceph-detect-init/ceph_detect_init/centos/__init__.py +++ b/src/ceph-detect-init/ceph_detect_init/centos/__init__.py @@ -8,4 +8,6 @@ def choose_init(): Returns the name of a init system (upstart, sysvinit ...). 
""" + if release and int(release.split('.')[0]) >= 7: + return 'systemd' return 'sysvinit' diff --git a/src/ceph-detect-init/tests/test_all.py b/src/ceph-detect-init/tests/test_all.py index 68189bf0187b8..72aa9e96f210c 100644 --- a/src/ceph-detect-init/tests/test_all.py +++ b/src/ceph-detect-init/tests/test_all.py @@ -38,6 +38,9 @@ class TestCephDetectInit(testtools.TestCase): def test_centos(self): + with mock.patch('ceph_detect_init.centos.release', + '7.0'): + self.assertEqual('systemd', centos.choose_init()) self.assertEqual('sysvinit', centos.choose_init()) def test_debian(self): From b04bfd12e2621d498619c40393621ab467d5ce31 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Thu, 27 Aug 2015 22:12:01 +0200 Subject: [PATCH 210/654] tests: remove dead scripts Revert 2d0d388162fba25af828ad2cb16560a6d00f2337 which introduced scripts that were never actually used. Signed-off-by: Loic Dachary --- src/test/Makefile.am | 8 -------- src/test/container-make-check-centos-centos7.sh | 3 --- src/test/container-make-check-ubuntu-14.04.sh | 3 --- 3 files changed, 14 deletions(-) delete mode 100755 src/test/container-make-check-centos-centos7.sh delete mode 100755 src/test/container-make-check-ubuntu-14.04.sh diff --git a/src/test/Makefile.am b/src/test/Makefile.am index 79f879683f533..036f3f346e341 100644 --- a/src/test/Makefile.am +++ b/src/test/Makefile.am @@ -117,14 +117,6 @@ EXTRA_DIST += \ $(srcdir)/test/coverage.sh \ $(patsubst %,$(srcdir)/%,$(check_SCRIPTS)) -docker-check: - $(srcdir)/test/container-make-check-ubuntu-14.04.sh - $(srcdir)/test/container-make-check-centos-centos7.sh - -EXTRA_DIST += \ - $(srcdir)/test/container-make-check-ubuntu-14.04.sh - $(srcdir)/test/container-make-check-centos-centos7.sh - # target to build but not run the unit tests unittests:: $(check_PROGRAMS) diff --git a/src/test/container-make-check-centos-centos7.sh b/src/test/container-make-check-centos-centos7.sh deleted file mode 100755 index 5e718d03e4b47..0000000000000 --- 
a/src/test/container-make-check-centos-centos7.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -source test/docker-test-helper.sh -main_docker "$@" --os-type centos --os-version centos7 --dev -- ./run-make-check.sh --enable-root-make-check diff --git a/src/test/container-make-check-ubuntu-14.04.sh b/src/test/container-make-check-ubuntu-14.04.sh deleted file mode 100755 index d9eaa2f627f5b..0000000000000 --- a/src/test/container-make-check-ubuntu-14.04.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -source test/docker-test-helper.sh -main_docker "$@" --os-type ubuntu --os-version 14.04 --dev -- ./run-make-check.sh --enable-root-make-check From d447098cfc55941f831364f1e1b0e684beaaa50d Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Thu, 27 Aug 2015 22:17:21 +0200 Subject: [PATCH 211/654] ceph-disk: implement workunit This new ceph-disk workunit re-implements the tests that previously were in the src/test/ceph-disk.sh src/test/ceph-disk-root.sh scripts and is meant to run in a virtual machine instead of docker. Signed-off-by: Loic Dachary --- qa/workunits/ceph-disk/ceph-disk-test.py | 382 +++++++++++++++++++++++ qa/workunits/ceph-disk/ceph-disk.sh | 10 + qa/workunits/ceph-helpers-root.sh | 89 ++++++ 3 files changed, 481 insertions(+) create mode 100644 qa/workunits/ceph-disk/ceph-disk-test.py create mode 100755 qa/workunits/ceph-disk/ceph-disk.sh create mode 100755 qa/workunits/ceph-helpers-root.sh diff --git a/qa/workunits/ceph-disk/ceph-disk-test.py b/qa/workunits/ceph-disk/ceph-disk-test.py new file mode 100644 index 0000000000000..c9914e93b7ea9 --- /dev/null +++ b/qa/workunits/ceph-disk/ceph-disk-test.py @@ -0,0 +1,382 @@ +# +# Copyright (C) 2015 Red Hat +# +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# +import argparse +import json +import logging +import os +import pytest +import re +import subprocess +import sys +import tempfile +import uuid + +LOG = logging.getLogger('CephDisk') + +class CephDisk: + + @staticmethod + def helper(command): + command = "ceph-helpers-root.sh " + command + return CephDisk.sh(command) + + @staticmethod + def sh(command): + output = subprocess.check_output(command, shell=True) + LOG.debug("sh: " + command + ": " + output) + return output.strip() + + def unused_disks(self, pattern='[vs]d.'): + names = filter(lambda x: re.match(pattern, x), os.listdir("/sys/block")) + if not names: + return [] + disks = json.loads(self.helper("ceph-disk list --format json " + " ".join(names))) + unused = [] + for disk in disks: + if 'partitions' not in disk: + unused.append(disk['path']) + return unused + + def ensure_sd(self): + LOG.debug(self.unused_disks('sd.')) + if self.unused_disks('sd.'): + return + modprobe = "modprobe scsi_debug vpd_use_hostno=0 add_host=1 dev_size_mb=200 ; udevadm settle" + try: + self.sh(modprobe) + except: + self.helper("install linux-image-extra-3.13.0-61-generic") + self.sh(modprobe) + + def unload_scsi_debug(self): + self.sh("rmmod scsi_debug || true") + + def get_osd_partition(self, uuid): + disks = json.loads(self.helper("ceph-disk list --format json")) + for disk in disks: + if 'partitions' in disk: + for partition in disk['partitions']: + if partition.get('uuid') == uuid: + return partition + raise Exception("uuid = " + uuid + " not found in " + str(disks)) + + def get_journal_partition(self, uuid): + data_partition = self.get_osd_partition(uuid) + journal_dev = data_partition['journal_dev'] + disks = json.loads(self.helper("ceph-disk list --format json")) + for disk in 
disks: + if 'partitions' in disk: + for partition in disk['partitions']: + if partition['path'] == journal_dev: + if 'journal_for' in partition: + assert partition['journal_for'] == data_partition['path'] + return partition + raise Exception("journal for uuid = " + uuid + " not found in " + str(disks)) + + def destroy_osd(self, uuid): + id = self.sh("ceph osd create " + uuid) + self.helper("control_osd stop " + id + " || true") + try: + partition = self.get_journal_partition(uuid) + if partition: + if partition.get('mount'): + self.sh("umount '" + partition['mount'] + "' || true") + if partition['dmcrypt']: + holder = partition['dmcrypt']['holders'][0] + self.sh("cryptsetup close $(cat /sys/block/" + holder + "/dm/name) || true") + except: + pass + try: + partition = self.get_osd_partition(uuid) + if partition.get('mount'): + self.sh("umount '" + partition['mount'] + "' || true") + if partition['dmcrypt']: + holder = partition['dmcrypt']['holders'][0] + self.sh("cryptsetup close $(cat /sys/block/" + holder + "/dm/name) || true") + except: + pass + self.sh(""" + ceph osd down {id} + ceph osd rm {id} + ceph auth del osd.{id} + ceph osd crush rm osd.{id} + """.format(id=id)) + + def run_osd(self, uuid, data, journal=None): + prepare = ("ceph-disk prepare --osd-uuid " + uuid + + " " + data) + if journal: + prepare += " " + journal + self.sh(prepare) + self.sh("ceph osd create " + uuid) + partition = self.get_osd_partition(uuid) + assert partition['type'] == 'data' + assert partition['state'] == 'active' + + @staticmethod + def augtool(command): + return CephDisk.sh(""" + augtool <<'EOF' + set /augeas/load/IniFile/lens Puppet.lns + set /augeas/load/IniFile/incl "/etc/ceph/ceph.conf" + load + {command} + save +EOF + """.format(command=command)) + +class TestCephDisk(object): + + def setup_class(self): + logging.basicConfig(level=logging.DEBUG) + c = CephDisk() + c.helper("install augeas-tools augeas") + c.augtool("set /files/etc/ceph/ceph.conf/global/osd_journal_size 
100") + + def test_destroy_osd(self): + c = CephDisk() + if c.sh("lsb_release -si") != 'Ubuntu': + pytest.skip("see issue http://tracker.ceph.com/issues/12787") + disk = c.unused_disks()[0] + osd_uuid = str(uuid.uuid1()) + c.run_osd(osd_uuid, disk) + c.destroy_osd(osd_uuid) + c.sh("ceph-disk zap " + disk) + + def test_augtool(self): + c = CephDisk() + out = c.augtool("ls /files/etc/ceph/ceph.conf") + assert 'global' in out + + def test_activate_dmcrypt_plain(self): + CephDisk.augtool("set /files/etc/ceph/ceph.conf/global/osd_dmcrypt_type plain") + self.activate_dmcrypt('plain') + CephDisk.augtool("rm /files/etc/ceph/ceph.conf/global/osd_dmcrypt_type") + + def test_activate_dmcrypt_luks(self): + CephDisk.augtool("rm /files/etc/ceph/ceph.conf/global/osd_dmcrypt_type") + self.activate_dmcrypt('luks') + + def activate_dmcrypt(self, type): + c = CephDisk() + if c.sh("lsb_release -si") != 'Ubuntu': + pytest.skip("see issue http://tracker.ceph.com/issues/12787") + disk = c.unused_disks()[0] + osd_uuid = str(uuid.uuid1()) + journal_uuid = str(uuid.uuid1()) + d = tempfile.mkdtemp() + c.sh("ceph-disk zap " + disk) + c.sh("ceph-disk prepare " + + " --dmcrypt-key-dir " + d + + " --osd-uuid " + osd_uuid + + " --journal-uuid " + journal_uuid + + " --dmcrypt " + + " " + disk) + if type == 'plain': + c.sh("cryptsetup --key-file " + d + "/" + osd_uuid + + " --key-size 256 create " + osd_uuid + + " " + disk + "1") + else: + c.sh("cryptsetup --key-file " + d + "/" + osd_uuid + ".luks.key" + + " luksOpen " + + " " + disk + "1" + + " " + osd_uuid) + if type == 'plain': + c.sh("cryptsetup --key-file " + d + "/" + journal_uuid + + " --key-size 256 create " + journal_uuid + + " " + disk + "2") + else: + c.sh("cryptsetup --key-file " + d + "/" + journal_uuid + ".luks.key" + + " luksOpen " + + " " + disk + "2" + + " " + journal_uuid) + c.sh("ceph-disk activate /dev/mapper/" + osd_uuid) + data_partition = c.get_osd_partition(osd_uuid) + assert data_partition['type'] == 'data' + assert 
data_partition['state'] == 'active' + journal_partition = c.get_journal_partition(osd_uuid) + assert journal_partition + c.destroy_osd(osd_uuid) + c.sh("ceph-disk zap " + disk) + + def test_activate_no_journal(self): + c = CephDisk() + if c.sh("lsb_release -si") != 'Ubuntu': + pytest.skip("see issue http://tracker.ceph.com/issues/12787") + disk = c.unused_disks()[0] + osd_uuid = str(uuid.uuid1()) + c.sh("ceph-disk zap " + disk) + c.augtool("set /files/etc/ceph/ceph.conf/global/osd_objectstore memstore") + c.sh("ceph-disk prepare --osd-uuid " + osd_uuid + + " " + disk) + device = json.loads(c.helper("ceph-disk list --format json " + disk))[0] + assert len(device['partitions']) == 1 + partition = device['partitions'][0] + assert partition['type'] == 'data' + assert partition['state'] == 'active' + assert 'journal_dev' not in partition + c.helper("pool_read_write") + c.destroy_osd(osd_uuid) + c.sh("ceph-disk zap " + disk) + c.augtool("rm /files/etc/ceph/ceph.conf/global/osd_objectstore") + + def test_activate_with_journal(self): + c = CephDisk() + if c.sh("lsb_release -si") != 'Ubuntu': + pytest.skip("see issue http://tracker.ceph.com/issues/12787") + disk = c.unused_disks()[0] + osd_uuid = str(uuid.uuid1()) + c.sh("ceph-disk zap " + disk) + c.sh("ceph-disk prepare --osd-uuid " + osd_uuid + + " " + disk) + device = json.loads(c.helper("ceph-disk list --format json " + disk))[0] + assert len(device['partitions']) == 2 + data_partition = c.get_osd_partition(osd_uuid) + assert data_partition['type'] == 'data' + assert data_partition['state'] == 'active' + journal_partition = c.get_journal_partition(osd_uuid) + assert journal_partition + c.helper("pool_read_write") + c.destroy_osd(osd_uuid) + c.sh("ceph-disk zap " + disk) + + def test_activate_separated_journal(self): + c = CephDisk() + if c.sh("lsb_release -si") != 'Ubuntu': + pytest.skip("see issue http://tracker.ceph.com/issues/12787") + disks = c.unused_disks() + data_disk = disks[0] + journal_disk = disks[1] + 
osd_uuid = self.activate_separated_journal(data_disk, journal_disk) + c.helper("pool_read_write 1") # 1 == pool size + c.destroy_osd(osd_uuid) + c.sh("ceph-disk zap " + data_disk + " " + journal_disk) + + def activate_separated_journal(self, data_disk, journal_disk): + c = CephDisk() + if c.sh("lsb_release -si") != 'Ubuntu': + pytest.skip("see issue http://tracker.ceph.com/issues/12787") + osd_uuid = str(uuid.uuid1()) + c.sh("ceph-disk prepare --osd-uuid " + osd_uuid + + " " + data_disk + " " + journal_disk) + device = json.loads(c.helper("ceph-disk list --format json " + data_disk))[0] + assert len(device['partitions']) == 1 + data_partition = c.get_osd_partition(osd_uuid) + assert data_partition['type'] == 'data' + assert data_partition['state'] == 'active' + journal_partition = c.get_journal_partition(osd_uuid) + assert journal_partition + return osd_uuid + + # + # Create an OSD and get a journal partition from a disk that + # already contains a journal partition which is in use. Updates of + # the kernel partition table may behave differently when a + # partition is in use. See http://tracker.ceph.com/issues/7334 for + # more information. 
+ # + def test_activate_two_separated_journal(self): + c = CephDisk() + if c.sh("lsb_release -si") != 'Ubuntu': + pytest.skip("see issue http://tracker.ceph.com/issues/12787") + disks = c.unused_disks() + data_disk = disks[0] + other_data_disk = disks[1] + journal_disk = disks[2] + osd_uuid = self.activate_separated_journal(data_disk, journal_disk) + other_osd_uuid = self.activate_separated_journal(other_data_disk, journal_disk) + # + # read/write can only succeed if the two osds are up because + # the pool needs two OSD + # + c.helper("pool_read_write 2") # 2 == pool size + c.destroy_osd(osd_uuid) + c.destroy_osd(other_osd_uuid) + c.sh("ceph-disk zap " + data_disk + " " + journal_disk + " " + other_data_disk) + + # + # Create an OSD and reuse an existing journal partition + # + def test_activate_reuse_journal(self): + c = CephDisk() + if c.sh("lsb_release -si") != 'Ubuntu': + pytest.skip("see issue http://tracker.ceph.com/issues/12787") + disks = c.unused_disks() + data_disk = disks[0] + journal_disk = disks[1] + # + # Create an OSD with a separated journal and destroy it. 
+ # + osd_uuid = self.activate_separated_journal(data_disk, journal_disk) + journal_partition = c.get_journal_partition(osd_uuid) + journal_path = journal_partition['path'] + c.destroy_osd(osd_uuid) + c.sh("ceph-disk zap " + data_disk) + osd_uuid = str(uuid.uuid1()) + # + # Create another OSD with the journal partition of the previous OSD + # + c.sh("ceph-disk prepare --osd-uuid " + osd_uuid + + " " + data_disk + " " + journal_path) + c.helper("pool_read_write 1") # 1 == pool size + device = json.loads(c.helper("ceph-disk list --format json " + data_disk))[0] + assert len(device['partitions']) == 1 + data_partition = c.get_osd_partition(osd_uuid) + assert data_partition['type'] == 'data' + assert data_partition['state'] == 'active' + journal_partition = c.get_journal_partition(osd_uuid) + # + # Verify the previous OSD partition has been reused + # + assert journal_partition['path'] == journal_path + c.destroy_osd(osd_uuid) + c.sh("ceph-disk zap " + data_disk + " " + journal_disk) + +class CephDiskTest(CephDisk): + + def main(self, argv): + parser = argparse.ArgumentParser( + 'ceph-disk-test', + ) + parser.add_argument( + '-v', '--verbose', + action='store_true', default=None, + help='be more verbose', + ) + parser.add_argument( + '--destroy-osd', + help='stop, umount and destroy', + ) + args = parser.parse_args(argv) + + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + + if args.destroy_osd: + dump = json.loads(CephDisk.sh("ceph osd dump -f json")) + osd_uuid = None + for osd in dump['osds']: + if str(osd['osd']) == args.destroy_osd: + osd_uuid = osd['uuid'] + if osd_uuid: + self.destroy_osd(osd_uuid) + else: + raise Exception("cannot find OSD " + args.destroy_osd + + " ceph osd dump -f json") + return + +if __name__ == '__main__': + sys.exit(CephDiskTest().main(sys.argv[1:])) diff --git a/qa/workunits/ceph-disk/ceph-disk.sh b/qa/workunits/ceph-disk/ceph-disk.sh new file mode 100755 index 0000000000000..d364efd4f9567 --- /dev/null +++ 
b/qa/workunits/ceph-disk/ceph-disk.sh @@ -0,0 +1,10 @@ +source $(dirname $0)/../ceph-helpers-root.sh true + +install python-pytest +install pytest +sudo env PATH=$(dirname $0)/..:$PATH py.test -v $(dirname $0)/ceph-disk-test.py +# own whatever was created as a side effect of the py.test run +# so that it can successfully be removed later on by a non privileged +# process +sudo chown -R $(id -u) $(dirname $0) + diff --git a/qa/workunits/ceph-helpers-root.sh b/qa/workunits/ceph-helpers-root.sh new file mode 100755 index 0000000000000..200452d9fe36b --- /dev/null +++ b/qa/workunits/ceph-helpers-root.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# +# Copyright (C) 2015 Red Hat +# +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +####################################################################### + +function install() { + for package in "$@" ; do + install_one $package + done + return 0 +} + +function install_one() { + case $(lsb_release -si) in + Ubuntu|Debian|Devuan) + sudo apt-get install -y "$@" + ;; + CentOS|Fedora|RedHatEnterpriseServer) + sudo yum install -y "$@" + ;; + *SUSE*) + sudo zypper --non-interactive install "$@" + ;; + *) + echo "$(lsb_release -si) is unknown, $@ will have to be installed manually." 
+ ;; + esac +} + +####################################################################### + +function control_osd() { + local action=$1 + local id=$2 + + local init=$(ceph-detect-init) + + case $init in + upstart) + sudo service ceph-osd $action id=$id + ;; + systemd) + sudo systemctl $action ceph-osd@$id + ;; + *) + echo ceph-detect-init returned an unknown init system: $init >&2 + return 1 + ;; + esac + return 0 +} + +####################################################################### + +function pool_read_write() { + local size=${1:-1} + local dir=/tmp + local timeout=360 + local test_pool=test_pool + + ceph osd pool delete $test_pool $test_pool --yes-i-really-really-mean-it || return 1 + ceph osd pool create $test_pool 4 || return 1 + ceph osd pool set $test_pool size $size || return 1 + ceph osd pool set $test_pool min_size $size || return 1 + + echo FOO > $dir/BAR + timeout $timeout rados --pool $test_pool put BAR $dir/BAR || return 1 + timeout $timeout rados --pool $test_pool get BAR $dir/BAR.copy || return 1 + diff $dir/BAR $dir/BAR.copy || return 1 + ceph osd pool delete $test_pool $test_pool --yes-i-really-really-mean-it || return 1 +} + +####################################################################### + +"$@" From 5ce7ed1bdd5c4cb3a48fb4f8b83e740703e520e0 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Thu, 27 Aug 2015 22:22:43 +0200 Subject: [PATCH 212/654] ceph-disk: integration tests for multipath Add integration tests for multipath to the ceph-disk workunit, with the following caveats: A workaround is added (explicit call to ceph-disk activate) until the CentOS activation bug http://tracker.ceph.com/issues/12786 is fixed. The tests do not run on Ubuntu because of the multipath / device mapper bug https://bugs.launchpad.net/ubuntu/+source/multipath-tools/+bug/1488688 and it has not been tested on Debian. 
http://tracker.ceph.com/issues/11881 Refs: #11881 Signed-off-by: Loic Dachary --- qa/workunits/ceph-disk/ceph-disk-test.py | 45 ++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/qa/workunits/ceph-disk/ceph-disk-test.py b/qa/workunits/ceph-disk/ceph-disk-test.py index c9914e93b7ea9..bc6d2544cc64c 100644 --- a/qa/workunits/ceph-disk/ceph-disk-test.py +++ b/qa/workunits/ceph-disk/ceph-disk-test.py @@ -144,6 +144,7 @@ def setup_class(self): logging.basicConfig(level=logging.DEBUG) c = CephDisk() c.helper("install augeas-tools augeas") + c.helper("install multipath-tools device-mapper-multipath") c.augtool("set /files/etc/ceph/ceph.conf/global/osd_journal_size 100") def test_destroy_osd(self): @@ -345,6 +346,50 @@ def test_activate_reuse_journal(self): c.destroy_osd(osd_uuid) c.sh("ceph-disk zap " + data_disk + " " + journal_disk) + def test_activate_multipath(self): + c = CephDisk() + if c.sh("lsb_release -si") != 'CentOS': + pytest.skip("see issue https://bugs.launchpad.net/ubuntu/+source/multipath-tools/+bug/1488688") + c.ensure_sd() + # + # Figure out the name of the multipath device + # + disk = c.unused_disks('sd.')[0] + c.sh("mpathconf --enable || true") + c.sh("multipath " + disk) + holders = os.listdir("/sys/block/" + os.path.basename(disk) + "/holders") + assert 1 == len(holders) + name = open("/sys/block/" + holders[0] + "/dm/name").read() + multipath = "/dev/mapper/" + name + # + # Prepare the multipath device + # + osd_uuid = str(uuid.uuid1()) + c.sh("ceph-disk zap " + multipath) + c.sh("ceph-disk prepare --osd-uuid " + osd_uuid + + " " + multipath) + device = json.loads(c.helper("ceph-disk list --format json " + multipath))[0] + assert len(device['partitions']) == 2 + data_partition = c.get_osd_partition(osd_uuid) + assert data_partition['type'] == 'data' + # + # Activate it although it should auto activate + # + if True: # remove this block when http://tracker.ceph.com/issues/12786 is fixed + c.sh("ceph-disk activate " + 
data_partition['path']) + device = json.loads(c.helper("ceph-disk list --format json " + multipath))[0] + assert len(device['partitions']) == 2 + data_partition = c.get_osd_partition(osd_uuid) + assert data_partition['state'] == 'active' + journal_partition = c.get_journal_partition(osd_uuid) + assert journal_partition + c.helper("pool_read_write") + c.destroy_osd(osd_uuid) + c.sh("ceph-disk zap " + multipath) + c.sh("udevadm settle") + c.sh("multipath -F") + c.unload_scsi_debug() + class CephDiskTest(CephDisk): def main(self, argv): From c7c59b2bf28f115876cc4dbec94656507a6cc8ba Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Sat, 29 Aug 2015 10:21:23 +0200 Subject: [PATCH 213/654] mailmap: update h3c organization mailbox Reviewed-by: Loic Dachary Signed-off-by: Ce Gu --- .organizationmap | 76 ++++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/.organizationmap b/.organizationmap index 0946f00b92458..d775b0613d003 100644 --- a/.organizationmap +++ b/.organizationmap @@ -92,44 +92,44 @@ gocept gmbh & co. 
kg Christian Theune GRNet Filippos Giannakos GRNet Stratos Psomadakis GRNet Vangelis Koukis -H3C Bingxin Yang -H3C Bin Zheng -H3C Bo Cai -H3C Ce Gu -H3C Chunyan Ma -H3C Donghai Xu -H3C Fei Wang -H3C Jie Chen -H3C Jie Li -H3C Kongming Wu -H3C Lu Shi -H3C Mingyue Zhao -H3C Ming Zou -H3C Na Xie -H3C Ni Dang -H3C Peiyang Liu -H3C Qiang Guo -H3C Qiankun Zheng -H3C Ruifeng Yang -H3C Sangdi Xu -H3C Shan Li -H3C Siyuan Zhou -H3C Tingting Chi -H3C Weijun Duan -H3C Wenfeng Wang -H3C Xiangwei Wu -H3C Xiaofeng Feng -H3C Xiaowei Chen -H3C Xuan Liu -H3C Xudong Cao -H3C Yanbin Wu -H3C Yehua Chen -H3C Yongqiang He -H3C Yue Zhu -H3C Yunhui Chen -H3C Zengran Zhang -H3C Zeqiang Zhuang -H3C Zhanyang Chen +H3C Bingxin Yang +H3C Bin Zheng +H3C Bo Cai +H3C Ce Gu +H3C Chunyan Ma +H3C Donghai Xu +H3C Fei Wang +H3C Jie Chen +H3C Jie Li +H3C Kongming Wu +H3C Lu Shi +H3C Mingyue Zhao +H3C Ming Zou +H3C Na Xie +H3C Ni Dang +H3C Peiyang Liu +H3C Qiang Guo +H3C Qiankun Zheng +H3C Ruifeng Yang +H3C Sangdi Xu +H3C Shan Li +H3C Siyuan Zhou +H3C Tingting Chi +H3C Weijun Duan +H3C Wenfeng Wang +H3C Xiangwei Wu +H3C Xiaofeng Feng +H3C Xiaowei Chen +H3C Xuan Liu +H3C Xudong Cao +H3C Yanbin Wu +H3C Yehua Chen +H3C Yongqiang He +H3C Yue Zhu +H3C Yunhui Chen +H3C Zengran Zhang +H3C Zeqiang Zhuang +H3C Zhanyang Chen Hastexo Florian Haas HGST Kevin Dalley HGST Lluis Pamies-Juarez From 384cf19d208738c583081fca369c0e4f784edd64 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:42:58 +0200 Subject: [PATCH 214/654] mailmap: Claire Massot affiliation Signed-off-by: Loic Dachary --- .mailmap | 1 + .organizationmap | 1 + 2 files changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index a59fa711fe668..5d91d3941c0e9 100644 --- a/.mailmap +++ b/.mailmap @@ -42,6 +42,7 @@ Chris Holcombe cholcombe973 Christian Brunner Christian Marie Christophe Courtaut Kri5 +Claire Massot Colin P. McCabe Colin P. 
McCabe Dan Chai danchai diff --git a/.organizationmap b/.organizationmap index d775b0613d003..47493a2ef1442 100644 --- a/.organizationmap +++ b/.organizationmap @@ -314,6 +314,7 @@ Unaffiliated BJ Lougee Unaffiliated Bosse Klykken Unaffiliated Cheng Cheng Unaffiliated Christos Stavrakakis +Unaffiliated Claire Massot Unaffiliated Colin Mattson Unaffiliated Dan Chai Unaffiliated Daniel Schepler From f4532317f23b3e42b531a6c51ae8c78960e6cbbd Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:43:28 +0200 Subject: [PATCH 215/654] mailmap: Clement Lebrun affiliation Signed-off-by: Loic Dachary --- .mailmap | 1 + .organizationmap | 1 + 2 files changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 5d91d3941c0e9..9667fc1bc2a6f 100644 --- a/.mailmap +++ b/.mailmap @@ -43,6 +43,7 @@ Christian Brunner Christian Marie Christophe Courtaut Kri5 Claire Massot +Clement Lebrun Colin P. McCabe Colin P. McCabe Dan Chai danchai diff --git a/.organizationmap b/.organizationmap index 47493a2ef1442..7d4d36bc87ca7 100644 --- a/.organizationmap +++ b/.organizationmap @@ -315,6 +315,7 @@ Unaffiliated Bosse Klykken Unaffiliated Cheng Cheng Unaffiliated Christos Stavrakakis Unaffiliated Claire Massot +Unaffiliated Clement Lebrun Unaffiliated Colin Mattson Unaffiliated Dan Chai Unaffiliated Daniel Schepler From 593b1a1e9142084ab22fdccfa780ac73f77c95bc Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:44:07 +0200 Subject: [PATCH 216/654] mailmap: Gabriel Sentucq affiliation Signed-off-by: Loic Dachary --- .mailmap | 1 + .organizationmap | 1 + 2 files changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 9667fc1bc2a6f..b8bccdff4c770 100644 --- a/.mailmap +++ b/.mailmap @@ -66,6 +66,7 @@ Florent Flament nairolf21 Florian Marsylle François Lafont +Gabriel Sentucq Gary Lowell Gary Lowelll Gary Lowell Gary Lowell diff --git a/.organizationmap b/.organizationmap index 7d4d36bc87ca7..72935a5f1b4af 100644 --- a/.organizationmap +++ 
b/.organizationmap @@ -332,6 +332,7 @@ Unaffiliated Florian Coste Unaffiliated Florian Marsylle Unaffiliated François Lafont Unaffiliated Frank Yu +Unaffiliated Gabriel Sentucq Unaffiliated Gaurav Kumar Garg Unaffiliated Haomai Wang Unaffiliated Henry Chang From 1395b51b3eac82df616aa718180b269ab4859d60 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:44:30 +0200 Subject: [PATCH 217/654] mailmap: Germain Chipaux affiliation Signed-off-by: Loic Dachary --- .mailmap | 1 + .organizationmap | 1 + 2 files changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index b8bccdff4c770..925750c0d6b4e 100644 --- a/.mailmap +++ b/.mailmap @@ -72,6 +72,7 @@ Gary Lowell Gary Lowell Gaurav Kumar Garg Gerben Meijer +Germain Chipaux Greg Farnum Greg Farnum Greg Farnum diff --git a/.organizationmap b/.organizationmap index 72935a5f1b4af..86d5495a44011 100644 --- a/.organizationmap +++ b/.organizationmap @@ -334,6 +334,7 @@ Unaffiliated François Lafont Unaffiliated Frank Yu Unaffiliated Gabriel Sentucq Unaffiliated Gaurav Kumar Garg +Unaffiliated Germain Chipaux Unaffiliated Haomai Wang Unaffiliated Henry Chang Unaffiliated Huang Jun From 7040be2459a714404e156a8b7ed955a58ab20d7a Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:44:46 +0200 Subject: [PATCH 218/654] mailmap: Guang Yang affiliation Signed-off-by: Loic Dachary --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 925750c0d6b4e..bf8a9037e6abf 100644 --- a/.mailmap +++ b/.mailmap @@ -84,6 +84,7 @@ Greg Farnum Greg Farnum Gregory Farnum Greg Farnum Reviewed-by: Greg Farnum Guang Yang Guang Yang +Guang Yang Guang Yang Guilhem Lettron Haomai Wang From bb5784d5f475cadefe6dc27d6b9780d503507d6b Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:45:21 +0200 Subject: [PATCH 219/654] mailmap: Jordan Dorne affiliation Signed-off-by: Loic Dachary --- .mailmap | 1 + .organizationmap | 1 + 2 files changed, 2 insertions(+) diff --git 
a/.mailmap b/.mailmap index bf8a9037e6abf..3bb83421db2bd 100644 --- a/.mailmap +++ b/.mailmap @@ -120,6 +120,7 @@ John Wilkins John Wilkins John Wilkins John Wilkins +Jordan Dorne Josh Durgin Josh During Josh Durgin Josh Durgin diff --git a/.organizationmap b/.organizationmap index 86d5495a44011..baf5d56398755 100644 --- a/.organizationmap +++ b/.organizationmap @@ -347,6 +347,7 @@ Unaffiliated Jiang Heng Unaffiliated Jiantao He Unaffiliated Jian Wen Unaffiliated Jon Bernard +Unaffiliated Jordan Dorne Unaffiliated Karel Striegel Unaffiliated Kefu Chai Unaffiliated Kernel Neophyte From 8bafdc595fe8e9dc082ee8ecd91f322d7b180f02 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:45:41 +0200 Subject: [PATCH 220/654] =?UTF-8?q?mailmap:=20K=C3=A9vin=20Caradant=20affi?= =?UTF-8?q?liation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Loic Dachary --- .mailmap | 1 + .organizationmap | 1 + 2 files changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 3bb83421db2bd..16a93ca6d7c5e 100644 --- a/.mailmap +++ b/.mailmap @@ -126,6 +126,7 @@ Josh Durgin Josh Durgin Kacper Kowalik Kacper Kowalik (Xarthisius) Kefu Chai +Kévin Caradant Kiseleva Alyona Ved-vampir Laszlo Boszormenyi Laszlo Boszormenyi (GCS) Lluis Pamies-Juarez diff --git a/.organizationmap b/.organizationmap index baf5d56398755..95c26684573d2 100644 --- a/.organizationmap +++ b/.organizationmap @@ -352,6 +352,7 @@ Unaffiliated Karel Striegel Unaffiliated Kefu Chai Unaffiliated Kernel Neophyte Unaffiliated Ketor Meng +Unaffiliated Kévin Caradant Unaffiliated Kevin Cox Unaffiliated Kim Vandry Unaffiliated koleosfuscus From 3d4db7e1dd20b2865065afc9629218f523920ce1 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:46:04 +0200 Subject: [PATCH 221/654] mailmap: Lucas Fantinel affiliation Signed-off-by: Loic Dachary --- .mailmap | 1 + .organizationmap | 1 + 2 files changed, 2 insertions(+) diff --git a/.mailmap 
b/.mailmap index 16a93ca6d7c5e..81d2208d16124 100644 --- a/.mailmap +++ b/.mailmap @@ -135,6 +135,7 @@ Loic Dachary Loic Dachary Loic Dachary Loïc Dachary Lu Shi +Lucas Fantinel Ma Jianpeng Jianpeng Ma Ma Jianpeng Ma Jianpeng Ma, Jianpeng diff --git a/.organizationmap b/.organizationmap index 95c26684573d2..94d1f09e08f2b 100644 --- a/.organizationmap +++ b/.organizationmap @@ -358,6 +358,7 @@ Unaffiliated Kim Vandry Unaffiliated koleosfuscus Unaffiliated Laurent Guerby Unaffiliated Lee Revell +Unaffiliated Lucas Fantinel Unaffiliated Matt Richards Unaffiliated Mehdi Abaakouk Unaffiliated Michael Nelson From 6e2cde7d59307018f8a8c4df5a53ea0e6daf11d8 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:46:22 +0200 Subject: [PATCH 222/654] mailmap: Maxime Robert affiliation Signed-off-by: Loic Dachary --- .mailmap | 1 + .organizationmap | 1 + 2 files changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 81d2208d16124..b7e7c60312353 100644 --- a/.mailmap +++ b/.mailmap @@ -145,6 +145,7 @@ Matt Benjamin Matthew Roy Matthew Roy Matthew Wodrich +Maxime Robert Michael Riederer dynamike67 Michael Rodriguez Michael Rodriguez diff --git a/.organizationmap b/.organizationmap index 94d1f09e08f2b..b953d69162bdd 100644 --- a/.organizationmap +++ b/.organizationmap @@ -360,6 +360,7 @@ Unaffiliated Laurent Guerby Unaffiliated Lee Revell Unaffiliated Lucas Fantinel Unaffiliated Matt Richards +Unaffiliated Maxime Robert Unaffiliated Mehdi Abaakouk Unaffiliated Michael Nelson Unaffiliated Michael Riederer From 6889f3522303157cc12466a4a3b0154d71ba13dd Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:46:40 +0200 Subject: [PATCH 223/654] mailmap: Nicolas Yong affiliation Signed-off-by: Loic Dachary --- .mailmap | 1 + .organizationmap | 1 + 2 files changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index b7e7c60312353..8aa97658a0d77 100644 --- a/.mailmap +++ b/.mailmap @@ -155,6 +155,7 @@ Nathan Cutler Na Xie Neha Ummareddy 
nehaummareddy Neil Levine +Nicolas Yong Ning Yao Noah Watkins Noah Watkins diff --git a/.organizationmap b/.organizationmap index b953d69162bdd..2dcadf6ef5241 100644 --- a/.organizationmap +++ b/.organizationmap @@ -366,6 +366,7 @@ Unaffiliated Michael Nelson Unaffiliated Michael Riederer Unaffiliated Michal Jarzabek Unaffiliated Neha Ummareddy +Unaffiliated Nicolas Yong Unaffiliated (no author) <(no author)@29311d96-e01e-0410-9327-a35deaab8ce9> Unaffiliated Robin Dehu Unaffiliated Rohan Mars From 805dcc9a0568ba92092a848e8f51657487f0c204 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:47:01 +0200 Subject: [PATCH 224/654] mailmap: Pierre Chaumont affiliation Signed-off-by: Loic Dachary --- .mailmap | 1 + .organizationmap | 1 + 2 files changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 8aa97658a0d77..7903dc61c5b0c 100644 --- a/.mailmap +++ b/.mailmap @@ -166,6 +166,7 @@ Patrick McGarry Pavan Rallabhandi Pavan Rallabhandi Pete Zaitcev Qiankun Zheng +Pierre Chaumont Riccardo Ferretti rferrett Roald J. 
van Loon Robert Jansen diff --git a/.organizationmap b/.organizationmap index 2dcadf6ef5241..3744fd629d3b3 100644 --- a/.organizationmap +++ b/.organizationmap @@ -368,6 +368,7 @@ Unaffiliated Michal Jarzabek Unaffiliated Neha Ummareddy Unaffiliated Nicolas Yong Unaffiliated (no author) <(no author)@29311d96-e01e-0410-9327-a35deaab8ce9> +Unaffiliated Pierre Chaumont Unaffiliated Robin Dehu Unaffiliated Rohan Mars Unaffiliated Roman Haritonov From d4f8a5bdec77005eef5d9364abc2f11ac060f2cd Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:47:20 +0200 Subject: [PATCH 225/654] mailmap: Robin Tang affiliation Signed-off-by: Loic Dachary --- .mailmap | 1 + .organizationmap | 1 + 2 files changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 7903dc61c5b0c..3c9e7a9db44b1 100644 --- a/.mailmap +++ b/.mailmap @@ -171,6 +171,7 @@ Riccardo Ferretti rferrett Robert Jansen Robin Dehu +Robin Tang Ron Allred rallred Ross Turk Ross Turk diff --git a/.organizationmap b/.organizationmap index 3744fd629d3b3..4804d93c5ae02 100644 --- a/.organizationmap +++ b/.organizationmap @@ -370,6 +370,7 @@ Unaffiliated Nicolas Yong Unaffiliated (no author) <(no author)@29311d96-e01e-0410-9327-a35deaab8ce9> Unaffiliated Pierre Chaumont Unaffiliated Robin Dehu +Unaffiliated Robin Tang Unaffiliated Rohan Mars Unaffiliated Roman Haritonov Unaffiliated Sergey Arkhipov From 8e1d9f8b27ceaef1887e942d52fb85bde0fc103a Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:48:28 +0200 Subject: [PATCH 226/654] mailmap: Sebastien Han affiliation Signed-off-by: Loic Dachary --- .mailmap | 2 +- .organizationmap | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index 3c9e7a9db44b1..8115604a8642a 100644 --- a/.mailmap +++ b/.mailmap @@ -203,7 +203,7 @@ Samuel Just Sandon Van Ness SandonV Sandon Van Ness Scott A. 
Brandt sbrandt -Sebastien Han +Sébastien Han Sebastien Ponce Sebastien Ponce Sergey Arkhipov 9seconds Shanggao Qiu qiushanggao diff --git a/.organizationmap b/.organizationmap index 4804d93c5ae02..87450e0e2074e 100644 --- a/.organizationmap +++ b/.organizationmap @@ -247,6 +247,7 @@ Red Hat Sahid Orentino Ferdjaoui Sam Lang Red Hat Samuel Just Red Hat Sandon Van Ness +Red Hat Sébastien Han Red Hat Shylesh Kumar Red Hat Tamil Muthamizhan Red Hat Tom Callaway From 0f849a861f9d8150e916ceec5bc404c404dc5cae Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:49:00 +0200 Subject: [PATCH 227/654] mailmap: Shawn Chen affiliation Signed-off-by: Loic Dachary --- .mailmap | 1 + .organizationmap | 1 + 2 files changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 8115604a8642a..595768470ef9c 100644 --- a/.mailmap +++ b/.mailmap @@ -207,6 +207,7 @@ Sébastien Han Sebastien Ponce Sebastien Ponce Sergey Arkhipov 9seconds Shanggao Qiu qiushanggao +Shawn Chen Shu, Xinxin xinxin shu Shu, Xinxin xinxinsh Stephen F Taylor diff --git a/.organizationmap b/.organizationmap index 87450e0e2074e..a7c82cff4d17e 100644 --- a/.organizationmap +++ b/.organizationmap @@ -376,6 +376,7 @@ Unaffiliated Rohan Mars Unaffiliated Roman Haritonov Unaffiliated Sergey Arkhipov Unaffiliated Shanggao Qiu +Unaffiliated Shawn Chen Unaffiliated Shawn Edwards Unaffiliated Simon Guinot Unaffiliated Stephen F Taylor From 33a75f1199193a03947595fa576afaee3a381dee Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:49:33 +0200 Subject: [PATCH 228/654] mailmap: Thomas Laumondais affiliation Signed-off-by: Loic Dachary --- .mailmap | 1 + .organizationmap | 1 + 2 files changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 595768470ef9c..61c91741357cf 100644 --- a/.mailmap +++ b/.mailmap @@ -221,6 +221,7 @@ Tamil Muthamizhan Thomas Bechtold Thomas Cantin ThomasCantin +Thomas Laumondais Thomas Johnson Tommi Virtanen Tommi Virtanen diff --git a/.organizationmap 
b/.organizationmap index a7c82cff4d17e..b7ae3f8d56528 100644 --- a/.organizationmap +++ b/.organizationmap @@ -382,6 +382,7 @@ Unaffiliated Simon Guinot Unaffiliated Stephen F Taylor Unaffiliated Steve Stock Unaffiliated Thomas Johnson +Unaffiliated Thomas Laumondais Unaffiliated Tim Freund Unaffiliated Vartika Rai Unaffiliated Vicente Cheng From c4820258395520036b725b0b52a35fea7e163525 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:50:03 +0200 Subject: [PATCH 229/654] mailmap: Valentin Arshanes affiliation Signed-off-by: Loic Dachary --- .mailmap | 1 + .organizationmap | 1 + 2 files changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 61c91741357cf..00768d0f13eae 100644 --- a/.mailmap +++ b/.mailmap @@ -231,6 +231,7 @@ Tyler Brekke Concubidated Varada Kari Volker Assmann Volker Assmann +Valentin Arshanes Thomas Walter Huf Walter J. Huf Wang, Yaguang ywang19 Warren Usui wusui diff --git a/.organizationmap b/.organizationmap index b7ae3f8d56528..27ae59d4f291e 100644 --- a/.organizationmap +++ b/.organizationmap @@ -384,6 +384,7 @@ Unaffiliated Steve Stock Unaffiliated Thomas Johnson Unaffiliated Thomas Laumondais Unaffiliated Tim Freund +Unaffiliated Valentin Arshanes Thomas Unaffiliated Vartika Rai Unaffiliated Vicente Cheng Unaffiliated Viktor Suprun From ffb36bdbc6138fc28cc22fe9992ce25a0b1ee04c Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:50:32 +0200 Subject: [PATCH 230/654] mailmap: Wu Xingyi affiliation Signed-off-by: Loic Dachary --- .mailmap | 1 + .organizationmap | 1 + 2 files changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 00768d0f13eae..90a0a18f811eb 100644 --- a/.mailmap +++ b/.mailmap @@ -238,6 +238,7 @@ Warren Usui wusui Weijun Duan Wei Luo luowei Wido den Hollander +Wu Xingyi Xan Peng xan Xavier Roche Xiaowei Chen diff --git a/.organizationmap b/.organizationmap index 27ae59d4f291e..4c5dc83d38225 100644 --- a/.organizationmap +++ b/.organizationmap @@ -186,6 +186,7 @@ IWeb David 
Moreau Simard Karlsruhe Institute of Technology Daniel J. Hofmann Keeper Technology Wyllys Ingersoll Lebanon Evangelical School Jonathan Dieter +LETV Wu Xingyi Linaro Steve Capper Linaro Yazen Ghannam Los Alamos National Laboratory Esteban Molina-Estolano From 827fbced7ae79f2dd7302492300e3d8be7391bb6 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:50:51 +0200 Subject: [PATCH 231/654] mailmap: Yannick Atchy Dalama affiliation Signed-off-by: Loic Dachary --- .mailmap | 1 + .organizationmap | 1 + 2 files changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 90a0a18f811eb..20244e046397c 100644 --- a/.mailmap +++ b/.mailmap @@ -245,6 +245,7 @@ Xiaowei Chen Xie Rui <875016668@qq.com> Jerry7X <875016668@qq.com> Xingyi Wu Xuan Liu +Yannick Atchy Dalama Yan, Zheng Yan, Zheng Zheng Yan Yan, Zheng Zheng, Yan diff --git a/.organizationmap b/.organizationmap index 4c5dc83d38225..5a5de231b08dc 100644 --- a/.organizationmap +++ b/.organizationmap @@ -398,6 +398,7 @@ Unaffiliated Xingyi Wu Unaffiliated Xinze Chi Unaffiliated Xiong Yiliang Unaffiliated Yann Dupont +Unaffiliated Yannick Atchy Dalama Unaffiliated Yongyue Sun Unaffiliated Zhe Zhang Unaffiliated Zhicheng Wei From 33f8693365208684be62b307643cd56047c752de Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:51:20 +0200 Subject: [PATCH 232/654] mailmap: Zhi Zhang affiliation Reviewed-by: Zhi Zhang Signed-off-by: Loic Dachary --- .mailmap | 1 + .organizationmap | 1 + .peoplemap | 1 + 3 files changed, 3 insertions(+) diff --git a/.mailmap b/.mailmap index 20244e046397c..03f43e632d8a2 100644 --- a/.mailmap +++ b/.mailmap @@ -266,6 +266,7 @@ Yuan Zhou Zengran Zhang Zhi (David) Zhang Zhi (David) Zhang Zhi Z Zhang +Zhi Zhang Zhiqiang Wang Signed-off-by: Zhiqiang Wang Zhiqiang Wang Wang, Zhiqiang Zhiqiang Wang Zhiqiang Wang diff --git a/.organizationmap b/.organizationmap index 5a5de231b08dc..4123940b6fb50 100644 --- a/.organizationmap +++ b/.organizationmap @@ -297,6 +297,7 @@ 
Telecom Bretagne Ahoussi Armand Baptiste Veuillez Telecom Bretagne Hazem Amara Telecom Bretagne Thomas Cantin +Tencent Zhi Zhang The Linux Box Adam C. Emerson The Linux Box Ali Maredia The Linux Box Casey Bodley diff --git a/.peoplemap b/.peoplemap index c29508a322df1..ac7934a91cfdd 100644 --- a/.peoplemap +++ b/.peoplemap @@ -44,3 +44,4 @@ Warren Usui Warren Usui Yan, Zheng Yan, Zheng Yehuda Sadeh Yehuda Sadeh Yuri Weinstein Yuri Weinstein +Zhi Zhang Zhi (David) Zhang From 94bbd139bc681aa1a1650d5c8522e47138a32f2a Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:51:48 +0200 Subject: [PATCH 233/654] mailmap: Shotaro Kawaguchi affiliation Reviewed-by: Shotaro Kawaguchi Signed-off-by: Loic Dachary --- .organizationmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.organizationmap b/.organizationmap index 4123940b6fb50..aa2b216e832b2 100644 --- a/.organizationmap +++ b/.organizationmap @@ -83,6 +83,7 @@ EPAM Andrey Kuznetsov Exalead Fairbanks Robert Jansen Fujitsu Piotr Dałek +Fujitsu Shotaro Kawaguchi Fujitsu Takeshi Miyamae GameServers.com Brian Rak Gentoo Kacper Kowalik From dbdf48a84d016cc0e6863a09bf1b901b50caec12 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:52:04 +0200 Subject: [PATCH 234/654] mailmap: Takanori Nakao affiliation Signed-off-by: Loic Dachary --- .organizationmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.organizationmap b/.organizationmap index aa2b216e832b2..3849074105392 100644 --- a/.organizationmap +++ b/.organizationmap @@ -84,6 +84,7 @@ Exalead Fairbanks Robert Jansen Fujitsu Piotr Dałek Fujitsu Shotaro Kawaguchi +Fujitsu Takanori Nakao Fujitsu Takeshi Miyamae GameServers.com Brian Rak Gentoo Kacper Kowalik From 552ad88c29450989ec181dc153cf3c00b60969b7 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:52:25 +0200 Subject: [PATCH 235/654] mailmap: Joe Handzik affiliation Signed-off-by: Loic Dachary --- .organizationmap | 1 + 1 file changed, 1 insertion(+) diff --git 
a/.organizationmap b/.organizationmap index 3849074105392..286e92ccbef70 100644 --- a/.organizationmap +++ b/.organizationmap @@ -137,6 +137,7 @@ HGST Kevin Dalley HGST Lluis Pamies-Juarez Hostplex Hosting Andras Elso HP Blaine Gardner +HP Joe Handzik Igalia Javier M. Mellid Imagination Technologies Ltd. Alistair Strachan iNic Bjørnar Ness From aab25836d742601a6d48d650dee65b28d4d5c8a6 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:52:45 +0200 Subject: [PATCH 236/654] =?UTF-8?q?mailmap:=20Krzysztof=20Kosi=C5=84ski=20?= =?UTF-8?q?affiliation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Loic Dachary --- .organizationmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.organizationmap b/.organizationmap index 286e92ccbef70..cf7787cfce54a 100644 --- a/.organizationmap +++ b/.organizationmap @@ -176,6 +176,7 @@ Inktank Warren Usui Inktank Yehuda Sadeh Inktank Yuri Weinstein Intel Chendi Xue +Intel Krzysztof Kosiński Intel Ma Jianpeng Intel Shu, Xinxin Intel Wang, Yaguang From a7004d7372c08ee3515b31304c1bc7d14c19ced7 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:52:58 +0200 Subject: [PATCH 237/654] mailmap: Ira Cooper affiliation Signed-off-by: Loic Dachary --- .organizationmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.organizationmap b/.organizationmap index cf7787cfce54a..22622dff60458 100644 --- a/.organizationmap +++ b/.organizationmap @@ -226,6 +226,7 @@ Red Hat Gregory Meno Red Hat Haïkel Guémar Red Hat Huamin Chen Red Hat Ilya Dryomov +Red Hat Ira Cooper Red Hat Jason Dillaman Red Hat Jean-Charles Lopez Red Hat João Eduardo Luís From 3368f0100f2257e7c4693590b3c7bec1044b7b4d Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:53:18 +0200 Subject: [PATCH 238/654] mailmap: Vikhyat Umrao affiliation Reviewed-by: Vikhyat Umrao Signed-off-by: Loic Dachary --- .organizationmap | 1 + 1 file changed, 1 insertion(+) diff --git 
a/.organizationmap b/.organizationmap index 22622dff60458..15de4782344f6 100644 --- a/.organizationmap +++ b/.organizationmap @@ -261,6 +261,7 @@ Red Hat Travis Rhoden Red Hat Tyler Brekke Red Hat Vasu Kulkarni Red Hat Venky Shankar +Red Hat Vikhyat Umrao Red Hat Warren Usui Red Hat Yan, Zheng Red Hat Yehuda Sadeh From 276644343b7e3ab814bada1becf409823091d6ac Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:53:33 +0200 Subject: [PATCH 239/654] mailmap: Abhishek Dixit affiliation Signed-off-by: Loic Dachary --- .organizationmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.organizationmap b/.organizationmap index 15de4782344f6..39725acac71a4 100644 --- a/.organizationmap +++ b/.organizationmap @@ -313,6 +313,7 @@ Ubuntu Kylin Li Wang Ubuntu Kylin Min Chen Ubuntu Kylin MingXin Liu Ubuntu Kylin Yunchuan Wen +Unaffiliated Abhishek Dixit Unaffiliated Accela Zhao Unaffiliated Ailing Zhang Unaffiliated Alexis Normand From 45af3da004d9645d0834d9cb64e00fab031d5226 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:53:53 +0200 Subject: [PATCH 240/654] mailmap: Arthur Gorjux affiliation Signed-off-by: Loic Dachary --- .organizationmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.organizationmap b/.organizationmap index 39725acac71a4..a1491ae96dbd5 100644 --- a/.organizationmap +++ b/.organizationmap @@ -320,6 +320,7 @@ Unaffiliated Alexis Normand Unaffiliated Andy Allan Unaffiliated Anis Ayari Unaffiliated Armando Segnini +Unaffiliated Arthur Gorjux Unaffiliated BJ Lougee Unaffiliated Bosse Klykken Unaffiliated Cheng Cheng From 2f92ebac01861cee822d00c2ab80c8253e019414 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:54:06 +0200 Subject: [PATCH 241/654] =?UTF-8?q?mailmap:=20Ga=C3=ABl=20Fenet-Garde=20af?= =?UTF-8?q?filiation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Loic Dachary --- .organizationmap | 1 + 1 file changed, 1 insertion(+) diff 
--git a/.organizationmap b/.organizationmap index a1491ae96dbd5..6ab3d7b2c9943 100644 --- a/.organizationmap +++ b/.organizationmap @@ -344,6 +344,7 @@ Unaffiliated Florian Marsylle François Lafont Unaffiliated Frank Yu Unaffiliated Gabriel Sentucq +Unaffiliated Gaël Fenet-Garde Unaffiliated Gaurav Kumar Garg Unaffiliated Germain Chipaux Unaffiliated Haomai Wang From 8bd1ac039d73e96cd97e1b4fb2e04d79b0679cea Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:54:21 +0200 Subject: [PATCH 242/654] =?UTF-8?q?mailmap:=20Jean-R=C3=A9mi=20Deveaux=20a?= =?UTF-8?q?ffiliation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Loic Dachary --- .organizationmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.organizationmap b/.organizationmap index 6ab3d7b2c9943..0a46d855ce27d 100644 --- a/.organizationmap +++ b/.organizationmap @@ -355,6 +355,7 @@ Unaffiliated Ilja Slepnev Unaffiliated Ismael Serrano Unaffiliated Janne Grunau Unaffiliated Javier Guerra +Unaffiliated Jean-Rémi Deveaux Unaffiliated Jiang Heng Unaffiliated Jiantao He Unaffiliated Jian Wen From 6f562c6777692786cd1feb0e866d82bfe8544c39 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:54:32 +0200 Subject: [PATCH 243/654] mailmap: Jiaying Ren affiliation Signed-off-by: Loic Dachary --- .organizationmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.organizationmap b/.organizationmap index 0a46d855ce27d..15d8984926817 100644 --- a/.organizationmap +++ b/.organizationmap @@ -360,6 +360,7 @@ Unaffiliated Jiang Heng Unaffiliated Jiantao He Unaffiliated Jian Wen Unaffiliated Jon Bernard +Unaffiliated Jiaying Ren Unaffiliated Jordan Dorne Unaffiliated Karel Striegel Unaffiliated Kefu Chai From 49bd8a8bffb53934c287b990e84d2c1820826ea3 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 24 Aug 2015 22:54:44 +0200 Subject: [PATCH 244/654] mailmap: Jevon Qiao affiliation Signed-off-by: Loic Dachary --- .organizationmap | 1 
+ 1 file changed, 1 insertion(+) diff --git a/.organizationmap b/.organizationmap index 15d8984926817..914fa7b0f37ce 100644 --- a/.organizationmap +++ b/.organizationmap @@ -417,6 +417,7 @@ Unaffiliated Zhicheng Wei Unilogic Networks B.V Pascal de Bruijn UnitedStack Dong Yuan UnitedStack Guangliang Zhao +UnitedStack Jevon Qiao UnitedStack Kun Huang UnitedStack Rongze Zhu University of California, Santa Cruz Adam Crume From 9b815ed894e3225a6d3f31b35ea7e3ae1383fd37 Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Sat, 29 Aug 2015 16:49:45 +0800 Subject: [PATCH 245/654] configure.ac: check for libboost_random-mt also Signed-off-by: Kefu Chai --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 7447d3d98b295..2503c87b07757 100644 --- a/configure.ac +++ b/configure.ac @@ -919,7 +919,7 @@ AC_SUBST(BOOST_THREAD_LIBS) BOOST_RANDOM_LIBS="" saved_LIBS="${LIBS}" LIBS="" -AC_CHECK_LIB(boost_random, main, [], +AC_CHECK_LIB(boost_random-mt, main, [], [AC_CHECK_LIB(boost_random, main, [], AC_MSG_FAILURE(["Boost random library not found."]))]) BOOST_RANDOM_LIBS="${LIBS}" From 3a6c2468dd7fc99571d5a18353d30426d5daf306 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Thu, 27 Aug 2015 12:51:14 +0200 Subject: [PATCH 246/654] ceph-disk: fix dmcrypt typo Fix the typo introduced by 29431944c77adbc3464a8faeb7e052b24f821780 http://tracker.ceph.com/issues/12781 Fixes: #12781 Signed-off-by: Loic Dachary --- src/ceph-disk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ceph-disk b/src/ceph-disk index d7b3233cff5c7..8e232ed476530 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -2370,7 +2370,7 @@ def main_activate_journal(args): raise Error('activate-journal --dmcrypt called for invalid dev %s' % (rawdev)) part_uuid = get_partition_uuid(rawdev) dmcrypt_key_path = os.path.join(args.dmcrypt_key_dir, part_uuid) - dev = dmcrypt_map(rawdev, dmcrypt_key_path, partd_uuid) + dev = dmcrypt_map(rawdev, dmcrypt_key_path, 
part_uuid) else: dev = args.dev From 3c8ac5409f4fcba4e3bcacf31d17bf140ad35879 Mon Sep 17 00:00:00 2001 From: minchen Date: Sun, 30 Aug 2015 17:34:12 +0800 Subject: [PATCH 247/654] bug fix: librados segmentation fault, extra modify supports aio_xxx() methods Signed-off-by: Min Chen --- src/librados/IoCtxImpl.cc | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/librados/IoCtxImpl.cc b/src/librados/IoCtxImpl.cc index 50a600beb1b0e..945dbec2126d7 100644 --- a/src/librados/IoCtxImpl.cc +++ b/src/librados/IoCtxImpl.cc @@ -694,12 +694,12 @@ int librados::IoCtxImpl::aio_write(const object_t &oid, AioCompletionImpl *c, if (snap_seq != CEPH_NOSNAP) return -EROFS; - c->io = this; - queue_aio_write(c); - Context *onack = new C_aio_Ack(c); Context *onsafe = new C_aio_Safe(c); + c->io = this; + queue_aio_write(c); + c->tid = objecter->write(oid, oloc, off, len, snapc, bl, ut, 0, onack, onsafe, &c->objver); @@ -718,12 +718,12 @@ int librados::IoCtxImpl::aio_append(const object_t &oid, AioCompletionImpl *c, if (snap_seq != CEPH_NOSNAP) return -EROFS; - c->io = this; - queue_aio_write(c); - Context *onack = new C_aio_Ack(c); Context *onsafe = new C_aio_Safe(c); + c->io = this; + queue_aio_write(c); + c->tid = objecter->append(oid, oloc, len, snapc, bl, ut, 0, onack, onsafe, &c->objver); @@ -743,12 +743,12 @@ int librados::IoCtxImpl::aio_write_full(const object_t &oid, if (snap_seq != CEPH_NOSNAP) return -EROFS; - c->io = this; - queue_aio_write(c); - Context *onack = new C_aio_Ack(c); Context *onsafe = new C_aio_Safe(c); + c->io = this; + queue_aio_write(c); + c->tid = objecter->write_full(oid, oloc, snapc, bl, ut, 0, onack, onsafe, &c->objver); @@ -764,12 +764,12 @@ int librados::IoCtxImpl::aio_remove(const object_t &oid, AioCompletionImpl *c) if (snap_seq != CEPH_NOSNAP) return -EROFS; - c->io = this; - queue_aio_write(c); - Context *onack = new C_aio_Ack(c); Context *onsafe = new C_aio_Safe(c); + c->io = this; + 
queue_aio_write(c); + c->tid = objecter->remove(oid, oloc, snapc, ut, 0, onack, onsafe, &c->objver); @@ -781,9 +781,9 @@ int librados::IoCtxImpl::aio_remove(const object_t &oid, AioCompletionImpl *c) int librados::IoCtxImpl::aio_stat(const object_t& oid, AioCompletionImpl *c, uint64_t *psize, time_t *pmtime) { - c->io = this; C_aio_stat_Ack *onack = new C_aio_stat_Ack(c, pmtime); + c->io = this; c->tid = objecter->stat(oid, oloc, snap_seq, psize, &onack->mtime, 0, onack, &c->objver); From 7841455ca63c7bb9e01f9976693c804d2e1a6439 Mon Sep 17 00:00:00 2001 From: Haomai Wang Date: Sat, 29 Aug 2015 22:45:41 +0800 Subject: [PATCH 248/654] Mon: Make ceph osd metadata support dump all osds Impl #12801 Signed-off-by: Haomai Wang --- doc/man/8/ceph.rst | 2 +- src/mon/MonCommands.h | 4 ++-- src/mon/OSDMonitor.cc | 33 ++++++++++++++++++++------- src/test/pybind/test_ceph_argparse.py | 2 +- 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/doc/man/8/ceph.rst b/doc/man/8/ceph.rst index a1d31884f7928..38d782d9f6cf3 100644 --- a/doc/man/8/ceph.rst +++ b/doc/man/8/ceph.rst @@ -829,7 +829,7 @@ Subcommand ``metadata`` fetches metadata for osd . Usage:: - ceph osd metadata + ceph osd metadata {int[0-]} (default all) Subcommand ``out`` sets osd(s) [...] out. 
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index e4da778cbd354..0c09638a22ee6 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -435,8 +435,8 @@ COMMAND("osd find " \ "find osd in the CRUSH map and show its location", \ "osd", "r", "cli,rest") COMMAND("osd metadata " \ - "name=id,type=CephInt,range=0", \ - "fetch metadata for osd ", \ + "name=id,type=CephInt,range=0,req=false", \ + "fetch metadata for osd {id} (default all)", \ "osd", "r", "cli,rest") COMMAND("osd map " \ "name=pool,type=CephPoolname " \ diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 756ff0dd07717..63b19f3652fd1 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -3058,14 +3058,15 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op) f->close_section(); f->flush(rdata); } else if (prefix == "osd metadata") { - int64_t osd; - if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) { + int64_t osd = -1; + if (cmd_vartype_stringify(cmdmap["id"]).size() && + !cmd_getval(g_ceph_context, cmdmap, "id", osd)) { ss << "unable to parse osd id value '" << cmd_vartype_stringify(cmdmap["id"]) << "'"; r = -EINVAL; goto reply; } - if (!osdmap.exists(osd)) { + if (osd >= 0 && !osdmap.exists(osd)) { ss << "osd." 
<< osd << " does not exist"; r = -ENOENT; goto reply; @@ -3073,11 +3074,27 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op) string format; cmd_getval(g_ceph_context, cmdmap, "format", format); boost::scoped_ptr f(Formatter::create(format, "json-pretty", "json-pretty")); - f->open_object_section("osd_metadata"); - r = dump_osd_metadata(osd, f.get(), &ss); - if (r < 0) - goto reply; - f->close_section(); + if (osd >= 0) { + f->open_object_section("osd_metadata"); + f->dump_unsigned("id", osd); + r = dump_osd_metadata(osd, f.get(), &ss); + if (r < 0) + goto reply; + f->close_section(); + } else { + f->open_array_section("osd_metadata"); + for (int i=0; iopen_object_section("osd"); + f->dump_unsigned("id", i); + r = dump_osd_metadata(i, f.get(), NULL); + if (r < 0) + goto reply; + f->close_section(); + } + } + f->close_section(); + } f->flush(rdata); } else if (prefix == "osd map") { string poolstr, objstr, namespacestr; diff --git a/src/test/pybind/test_ceph_argparse.py b/src/test/pybind/test_ceph_argparse.py index 6bd2b08352636..fae1c93a8adc9 100755 --- a/src/test/pybind/test_ceph_argparse.py +++ b/src/test/pybind/test_ceph_argparse.py @@ -552,7 +552,7 @@ def test_map(self): 'toomany'])) def test_metadata(self): - self.check_1_natural_arg('osd', 'metadata') + self.check_0_or_1_natural_arg('osd', 'metadata') def test_scrub(self): self.check_1_string_arg('osd', 'scrub') From 12ebb730c575b146d2d53ed7c5d2c3586b98f3d7 Mon Sep 17 00:00:00 2001 From: Haomai Wang Date: Sat, 29 Aug 2015 23:10:43 +0800 Subject: [PATCH 249/654] KeyValueStore: Fix getattrs nonexist object need return -ENOENT Signed-off-by: Haomai Wang --- src/os/KeyValueStore.cc | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/os/KeyValueStore.cc b/src/os/KeyValueStore.cc index b717279bc0551..a147c0b27ab42 100644 --- a/src/os/KeyValueStore.cc +++ b/src/os/KeyValueStore.cc @@ -270,8 +270,7 @@ int StripObjectMap::get_keys_with_header(const 
StripObjectHeaderRef header, { ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix); for (iter->seek_to_first(); iter->valid(); iter->next()) { - if (iter->status()) - return iter->status(); + assert(!iter->status()); keys->insert(iter->key()); } return 0; @@ -282,8 +281,7 @@ int StripObjectMap::get_with_header(const StripObjectHeaderRef header, { ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix); for (iter->seek_to_first(); iter->valid(); iter->next()) { - if (iter->status()) - return iter->status(); + assert(!iter->status()); out->insert(make_pair(iter->key(), iter->value())); } @@ -2152,18 +2150,22 @@ int KeyValueStore::getattr(coll_t cid, const ghobject_t& oid, const char *name, int KeyValueStore::getattrs(coll_t cid, const ghobject_t& oid, map& aset) { - int r; map attr_aset; + int r; + StripObjectMap::StripObjectHeaderRef header; - r = backend->get(cid, oid, OBJECT_XATTR, &attr_aset); - if (r < 0 && r != -ENOENT) { + r = backend->lookup_strip_header(cid, oid, &header); + if (r < 0) { + dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl; + return r; + } + + r = backend->get_with_header(header, OBJECT_XATTR, &attr_aset); + if (r < 0) { dout(10) << __func__ << " could not get attrs r = " << r << dendl; goto out; } - if (r == -ENOENT) - r = 0; - for (map::iterator i = attr_aset.begin(); i != attr_aset.end(); ++i) { string key; @@ -2249,7 +2251,7 @@ int KeyValueStore::_rmattrs(coll_t cid, const ghobject_t& oid, } r = backend->get_keys_with_header(header, OBJECT_XATTR, &attrs); - if (r < 0 && r != -ENOENT) { + if (r < 0) { dout(10) << __func__ << " could not get attrs r = " << r << dendl; return r; } @@ -2523,7 +2525,7 @@ int KeyValueStore::omap_get(coll_t c, const ghobject_t &hoid, } r = backend->get_with_header(header, OBJECT_OMAP, out); - if (r < 0 && r != -ENOENT) { + if (r < 0) { dout(10) << __func__ << " err r =" << r << dendl; return r; } @@ -2588,7 +2590,7 @@ int 
KeyValueStore::omap_get_keys(coll_t c, const ghobject_t &hoid, set * } r = backend->get_keys_with_header(header, OBJECT_OMAP, keys); - if (r < 0 && r != -ENOENT) { + if (r < 0) { return r; } return 0; @@ -2649,7 +2651,7 @@ int KeyValueStore::_omap_clear(coll_t cid, const ghobject_t &hoid, set keys; r = backend->get_keys_with_header(header, OBJECT_OMAP, &keys); - if (r < 0 && r != -ENOENT) { + if (r < 0) { dout(10) << __func__ << " could not get omap_keys r = " << r << dendl; return r; } From 62e1593cde6677eb889c5baca192caed98f2fb5d Mon Sep 17 00:00:00 2001 From: Haomai Wang Date: Sun, 30 Aug 2015 22:12:29 +0800 Subject: [PATCH 250/654] KeyValueStore: Fix broken assert statement Signed-off-by: Haomai Wang --- src/os/GenericObjectMap.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/os/GenericObjectMap.cc b/src/os/GenericObjectMap.cc index 8567d30af29e9..62f052f730db2 100644 --- a/src/os/GenericObjectMap.cc +++ b/src/os/GenericObjectMap.cc @@ -157,7 +157,6 @@ string GenericObjectMap::header_key(const coll_t &cid, const ghobject_t &oid) full_name += string(buf); if (oid.generation != ghobject_t::NO_GEN) { - assert(oid.shard_id != shard_id_t::NO_SHARD); full_name.append(GHOBJECT_KEY_SEP_S); t = buf; From e20195dfb13dc46aa93203f62ed487a18dfa0478 Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Fri, 14 Aug 2015 13:57:33 +0800 Subject: [PATCH 251/654] mds/Server: s/mds->mdcache/mdcache. Because mdcache = mds->mdcache, for replcae . 
Signed-off-by: Jianpeng Ma --- src/mds/Server.cc | 102 +++++++++++++++++++++++----------------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 5d380c188ce58..be3a9269975c4 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -791,7 +791,7 @@ void Server::handle_client_reconnect(MClientReconnect *m) dout(15) << "open cap realm " << inodeno_t(p->second.capinfo.snaprealm) << " on " << *in << dendl; in->reconnect_cap(from, p->second.capinfo, session); - mds->mdcache->add_reconnected_cap(in, from, inodeno_t(p->second.capinfo.snaprealm)); + mdcache->add_reconnected_cap(in, from, inodeno_t(p->second.capinfo.snaprealm)); recover_filelocks(in, p->second.flockbl, m->get_orig_source().num()); continue; } @@ -1691,7 +1691,7 @@ void Server::handle_slave_request_reply(MMDSSlaveRequest *m) if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) { metareqid_t r = m->get_reqid(); - mds->mdcache->committed_master_slave(r, from); + mdcache->committed_master_slave(r, from); m->put(); return; } @@ -2253,7 +2253,7 @@ CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino } else if (layout) { in->inode.layout = *layout; } else { - in->inode.layout = mds->mdcache->default_file_layout; + in->inode.layout = mdcache->default_file_layout; } in->inode.truncate_size = -1ull; // not truncated, yet! @@ -2607,7 +2607,7 @@ CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr) // invent? if (!dir) - dir = diri->get_or_open_dirfrag(mds->mdcache, fg); + dir = diri->get_or_open_dirfrag(mdcache, fg); // am i auth for the dirfrag? 
if (!dir->is_auth()) { @@ -3020,7 +3020,7 @@ void Server::handle_client_openc(MDRequestRef& mdr) if (dir_layout) layout = *dir_layout; else - layout = mds->mdcache->default_file_layout; + layout = mdcache->default_file_layout; // fill in any special params from client if (req->head.args.open.stripe_unit) @@ -3804,7 +3804,7 @@ void Server::handle_client_setdirlayout(MDRequestRef& mdr) else if (dir_layout) layout = *dir_layout; else - layout = mds->mdcache->default_file_layout; + layout = mdcache->default_file_layout; if (req->head.args.setlayout.layout.fl_object_size > 0) layout.fl_object_size = req->head.args.setlayout.layout.fl_object_size; @@ -4014,7 +4014,7 @@ void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur, else if (dir_layout) layout = *dir_layout; else - layout = mds->mdcache->default_file_layout; + layout = mdcache->default_file_layout; rest = name.substr(name.find("layout")); const OSDMap *osdmap = mds->objecter->get_osdmap_read(); @@ -4404,7 +4404,7 @@ void Server::handle_client_mknod(MDRequestRef& mdr) if (dir_layout && S_ISREG(mode)) layout = *dir_layout; else - layout = mds->mdcache->default_file_layout; + layout = mdcache->default_file_layout; SnapRealm *realm = dn->get_dir()->inode->find_snaprealm(); snapid_t follows = realm->get_newest_seq(); @@ -4505,7 +4505,7 @@ void Server::handle_client_mkdir(MDRequestRef& mdr) newi->first = dn->first; // ...and that new dir is empty. 
- CDir *newdir = newi->get_or_open_dirfrag(mds->mdcache, frag_t()); + CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t()); newdir->mark_complete(); newdir->fnode.version = newdir->pre_dirty(); @@ -4701,7 +4701,7 @@ void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti, mdr->apply(); MDRequestRef null_ref; - mds->mdcache->send_dentry_link(dn, null_ref); + mdcache->send_dentry_link(dn, null_ref); // bump target popularity mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR); @@ -4776,7 +4776,7 @@ void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targ dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl; le->reqid = mdr->reqid; le->had_slaves = true; - mds->mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed); + mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed); } if (inc) { @@ -4821,9 +4821,9 @@ void Server::_link_remote_finish(MDRequestRef& mdr, bool inc, MDRequestRef null_ref; if (inc) - mds->mdcache->send_dentry_link(dn, null_ref); + mdcache->send_dentry_link(dn, null_ref); else - mds->mdcache->send_dentry_unlink(dn, NULL, null_ref); + mdcache->send_dentry_unlink(dn, NULL, null_ref); // bump target popularity mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR); @@ -5002,7 +5002,7 @@ void Server::_committed_slave(MDRequestRef& mdr) MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_COMMITTED); mds->send_message_mds(req, mdr->slave_to_mds); - mds->mdcache->request_finish(mdr); + mdcache->request_finish(mdr); } struct C_MDS_LoggedLinkRollback : public ServerContext { @@ -5027,13 +5027,13 @@ void Server::do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& assert(g_conf->mds_kill_link_at != 9); - mds->mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes + mdcache->add_rollback(rollback.reqid, 
master); // need to finish this update before resolve finishes assert(mdr || mds->is_resolve()); MutationRef mut(new MutationImpl(rollback.reqid)); mut->ls = mds->mdlog->get_current_segment(); - CInode *in = mds->mdcache->get_inode(rollback.ino); + CInode *in = mdcache->get_inode(rollback.ino); assert(in); dout(10) << " target is " << *in << dendl; assert(!in->is_projected()); // live slave request hold versionlock xlock. @@ -5083,9 +5083,9 @@ void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr) mut->apply(); if (mdr) - mds->mdcache->request_finish(mdr); + mdcache->request_finish(mdr); - mds->mdcache->finish_rollback(mut->reqid); + mdcache->finish_rollback(mut->reqid); mut->cleanup(); } @@ -5294,7 +5294,7 @@ void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn) dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl; le->reqid = mdr->reqid; le->had_slaves = true; - mds->mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed); + mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed); } if (straydn) { @@ -5345,7 +5345,7 @@ void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn) if (in->is_dir()) { assert(straydn); - mds->mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir()); + mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir()); } journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(mds, mdr, dn, straydn)); @@ -5382,7 +5382,7 @@ void Server::_unlink_local_finish(MDRequestRef& mdr, if (snap_is_new) //only new if strayin exists mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, true); - mds->mdcache->send_dentry_unlink(dn, straydn, mdr); + mdcache->send_dentry_unlink(dn, straydn, mdr); // update subtree map? 
if (straydn && strayin->is_dir()) @@ -5526,7 +5526,7 @@ void Server::handle_slave_rmdir_prep(MDRequestRef& mdr) dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl; le->commit.renamed_dirino = in->ino(); - mds->mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir()); + mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir()); mdr->more()->slave_update_journaled = true; submit_mdlog_entry(le, new C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn), @@ -5631,17 +5631,17 @@ void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& ::decode(rollback, p); dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl; - mds->mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes + mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes assert(mdr || mds->is_resolve()); - CDir *dir = mds->mdcache->get_dirfrag(rollback.src_dir); + CDir *dir = mdcache->get_dirfrag(rollback.src_dir); if (!dir) - dir = mds->mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname); + dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname); assert(dir); CDentry *dn = dir->lookup(rollback.src_dname); assert(dn); dout(10) << " dn " << *dn << dendl; - dir = mds->mdcache->get_dirfrag(rollback.dest_dir); + dir = mdcache->get_dirfrag(rollback.dest_dir); assert(dir); CDentry *straydn = dir->lookup(rollback.dest_dname); assert(straydn); @@ -5656,8 +5656,8 @@ void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), false); - mds->mdcache->request_finish(mdr); - mds->mdcache->finish_rollback(rollback.reqid); + mdcache->request_finish(mdr); + mdcache->finish_rollback(rollback.reqid); return; } @@ -5700,9 +5700,9 @@ void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentr } if (mdr) - 
mds->mdcache->request_finish(mdr); + mdcache->request_finish(mdr); - mds->mdcache->finish_rollback(reqid); + mdcache->finish_rollback(reqid); } @@ -6091,7 +6091,7 @@ void Server::handle_client_rename(MDRequestRef& mdr) (srcrealm->get_newest_seq() + 1 > srcdn->first || destrealm->get_newest_seq() + 1 > srcdn->first)) { dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl; - mds->mdcache->snaprealm_create(mdr, srci); + mdcache->snaprealm_create(mdr, srci); return; } } @@ -6182,7 +6182,7 @@ void Server::handle_client_rename(MDRequestRef& mdr) le->reqid = mdr->reqid; le->had_slaves = true; - mds->mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed); + mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed); // no need to send frozen auth pin to recovring auth MDS of srci mdr->more()->is_remote_frozen_authpin = false; } @@ -6208,7 +6208,7 @@ void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, // apply _rename_apply(mdr, srcdn, destdn, straydn); - mds->mdcache->send_dentry_link(destdn, mdr); + mdcache->send_dentry_link(destdn, mdr); CDentry::linkage_t *destdnl = destdn->get_linkage(); CInode *in = destdnl->get_inode(); @@ -6322,7 +6322,7 @@ bool Server::_need_force_journal(CInode *diri, bool empty) } else { // see if any children of our frags are auth subtrees. 
list subtrees; - mds->mdcache->list_subtrees(subtrees); + mdcache->list_subtrees(subtrees); dout(10) << " subtrees " << subtrees << " frags " << ls << dendl; for (list::iterator p = ls.begin(); p != ls.end(); ++p) { CDir *dir = *p; @@ -6714,7 +6714,7 @@ void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, C // finish cap imports finish_force_open_sessions(mdr->more()->imported_client_map, mdr->more()->sseq_map); if (mdr->more()->cap_imports.count(destdnl->get_inode())) { - mds->mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(), + mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(), mdr->more()->srcdn_auth_mds, true, mdr->more()->cap_imports[destdnl->get_inode()], imported_caps); @@ -7192,7 +7192,7 @@ void Server::_commit_slave_rename(MDRequestRef& mdr, int r, mdr->more()->is_ambiguous_auth = false; } mds->queue_waiters(finished); - mds->mdcache->request_finish(mdr); + mdcache->request_finish(mdr); } } } @@ -7253,15 +7253,15 @@ void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef dout(10) << "do_rename_rollback on " << rollback.reqid << dendl; // need to finish this update before sending resolve to claim the subtree - mds->mdcache->add_rollback(rollback.reqid, master); + mdcache->add_rollback(rollback.reqid, master); MutationRef mut(new MutationImpl(rollback.reqid)); mut->ls = mds->mdlog->get_current_segment(); CDentry *srcdn = NULL; - CDir *srcdir = mds->mdcache->get_dirfrag(rollback.orig_src.dirfrag); + CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag); if (!srcdir) - srcdir = mds->mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname); + srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname); if (srcdir) { dout(10) << " srcdir " << *srcdir << dendl; srcdn = srcdir->lookup(rollback.orig_src.dname); @@ -7274,9 +7274,9 @@ void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef dout(10) << " 
srcdir not found" << dendl; CDentry *destdn = NULL; - CDir *destdir = mds->mdcache->get_dirfrag(rollback.orig_dest.dirfrag); + CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag); if (!destdir) - destdir = mds->mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname); + destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname); if (destdir) { dout(10) << " destdir " << *destdir << dendl; destdn = destdir->lookup(rollback.orig_dest.dname); @@ -7289,16 +7289,16 @@ void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef CInode *in = NULL; if (rollback.orig_src.ino) { - in = mds->mdcache->get_inode(rollback.orig_src.ino); + in = mdcache->get_inode(rollback.orig_src.ino); if (in && in->is_dir()) assert(srcdn && destdn); } else - in = mds->mdcache->get_inode(rollback.orig_src.remote_ino); + in = mdcache->get_inode(rollback.orig_src.remote_ino); CDir *straydir = NULL; CDentry *straydn = NULL; if (rollback.stray.dirfrag.ino) { - straydir = mds->mdcache->get_dirfrag(rollback.stray.dirfrag); + straydir = mdcache->get_dirfrag(rollback.stray.dirfrag); if (straydir) { dout(10) << "straydir " << *straydir << dendl; straydn = straydir->lookup(rollback.stray.dname); @@ -7313,11 +7313,11 @@ void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef CInode *target = NULL; if (rollback.orig_dest.ino) { - target = mds->mdcache->get_inode(rollback.orig_dest.ino); + target = mdcache->get_inode(rollback.orig_dest.ino); if (target) assert(destdn && straydn); } else if (rollback.orig_dest.remote_ino) - target = mds->mdcache->get_inode(rollback.orig_dest.remote_ino); + target = mdcache->get_inode(rollback.orig_dest.remote_ino); // can't use is_auth() in the resolve stage mds_rank_t whoami = mds->get_nodeid(); @@ -7540,10 +7540,10 @@ void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentr } mds->queue_waiters(finished); if (finish_mdr) - 
mds->mdcache->request_finish(mdr); + mdcache->request_finish(mdr); } - mds->mdcache->finish_rollback(mut->reqid); + mdcache->finish_rollback(mut->reqid); mut->cleanup(); } @@ -7769,7 +7769,7 @@ void Server::handle_client_mksnap(MDRequestRef& mdr) mds->snapclient->prepare_create(diri->ino(), snapname, mdr->get_mds_stamp(), &mdr->more()->stid, &mdr->more()->snapidbl, - new C_MDS_RetryRequest(mds->mdcache, mdr)); + new C_MDS_RetryRequest(mdcache, mdr)); return; } @@ -7902,7 +7902,7 @@ void Server::handle_client_rmsnap(MDRequestRef& mdr) if (!mdr->more()->stid) { mds->snapclient->prepare_destroy(diri->ino(), snapid, &mdr->more()->stid, &mdr->more()->snapidbl, - new C_MDS_RetryRequest(mds->mdcache, mdr)); + new C_MDS_RetryRequest(mdcache, mdr)); return; } version_t stid = mdr->more()->stid; @@ -8042,7 +8042,7 @@ void Server::handle_client_renamesnap(MDRequestRef& mdr) if (!mdr->more()->stid) { mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(), &mdr->more()->stid, &mdr->more()->snapidbl, - new C_MDS_RetryRequest(mds->mdcache, mdr)); + new C_MDS_RetryRequest(mdcache, mdr)); return; } From 90dea96f7052b0f6c1eecf1b0c4415dbb961ba03 Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Tue, 25 Aug 2015 08:29:44 +0800 Subject: [PATCH 252/654] mds: add osdmap epoch for setxattr of MClientRequest. Now we use setxattr set file/dir layout. This may need data pool info. So in mds server, it need check osdmap. At present, if mds don't find data pool, it will get the latest osdmap. Now if pass osd epoch as a parameter for setxattr. We can only check this epoch of osdmap. But for compatible, we still need old code for old client. 
Signed-off-by: Jianpeng Ma --- src/include/ceph_fs.h | 1 + src/mds/Server.cc | 28 +++++++++++++++++++--------- src/messages/MClientRequest.h | 13 ++++++++++++- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index 769f51966a4a6..08ef460bfe055 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -409,6 +409,7 @@ union ceph_mds_request_args { } __attribute__ ((packed)) open; struct { __le32 flags; + __le32 osdmap_epoch; /* use for set file/dir layout */ } __attribute__ ((packed)) setxattr; struct { struct ceph_file_layout layout; diff --git a/src/mds/Server.cc b/src/mds/Server.cc index be3a9269975c4..2b7554cf09ae8 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -4019,16 +4019,21 @@ void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur, rest = name.substr(name.find("layout")); const OSDMap *osdmap = mds->objecter->get_osdmap_read(); int r = parse_layout_vxattr(rest, value, osdmap, &layout); + epoch_t epoch = osdmap->get_epoch(); mds->objecter->put_osdmap_read(); if (r < 0) { if (r == -ENOENT) { - if (!mdr->waited_for_osdmap) { - // make sure we have the latest map. - // FIXME: we should get the client's osdmap epoch and just - // make sure we have *that*. + epoch_t req_epoch = req->get_osdmap_epoch(); + if (req_epoch > epoch) { + if (!mds->objecter->wait_for_map(req_epoch, + new C_OnFinisher(new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr)), mds->finisher))) + return; + } else if (req_epoch == 0 && !mdr->waited_for_osdmap) { + // For compatibility with client w/ old code, we still need get the latest map. + // One day if COMPACT_VERSION of MClientRequest >=3, we can remove those code. 
mdr->waited_for_osdmap = true; mds->objecter->wait_for_latest_osdmap( - new C_OnFinisher(new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr)), mds->finisher)); + new C_OnFinisher(new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr)), mds->finisher)); return; } r = -EINVAL; @@ -4057,13 +4062,18 @@ void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur, rest = name.substr(name.find("layout")); const OSDMap *osdmap = mds->objecter->get_osdmap_read(); int r = parse_layout_vxattr(rest, value, osdmap, &layout); + epoch_t epoch = osdmap->get_epoch(); mds->objecter->put_osdmap_read(); if (r < 0) { if (r == -ENOENT) { - if (!mdr->waited_for_osdmap) { - // make sure we have the latest map. - // FIXME: we should get the client's osdmap epoch and just - // make sure we have *that*. + epoch_t req_epoch = req->get_osdmap_epoch(); + if (req_epoch > epoch) { + if (!mds->objecter->wait_for_map(req_epoch, + new C_OnFinisher(new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr)), mds->finisher))) + return; + } else if (req_epoch == 0 && !mdr->waited_for_osdmap) { + // For compatibility with client w/ old code, we still need get the latest map. + // One day if COMPACT_VERSION of MClientRequest >=3, we can remove those code. mdr->waited_for_osdmap = true; mds->objecter->wait_for_latest_osdmap( new C_OnFinisher(new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr)), mds->finisher)); diff --git a/src/messages/MClientRequest.h b/src/messages/MClientRequest.h index 35dbb1757666f..1c37459105760 100644 --- a/src/messages/MClientRequest.h +++ b/src/messages/MClientRequest.h @@ -46,7 +46,7 @@ // metadata ops. 
class MClientRequest : public Message { - static const int HEAD_VERSION = 2; + static const int HEAD_VERSION = 3; static const int COMPAT_VERSION = 1; public: @@ -93,6 +93,17 @@ class MClientRequest : public Message { public: void set_mdsmap_epoch(epoch_t e) { head.mdsmap_epoch = e; } epoch_t get_mdsmap_epoch() { return head.mdsmap_epoch; } + epoch_t get_osdmap_epoch() const { + assert(head.op == CEPH_MDS_OP_SETXATTR); + if (header.version >= 3) + return head.args.setxattr.osdmap_epoch; + else + return 0; + } + void set_osdmap_epoch(epoch_t e) { + assert(head.op == CEPH_MDS_OP_SETXATTR); + head.args.setxattr.osdmap_epoch = e; + } metareqid_t get_reqid() { // FIXME: for now, assume clients always have 1 incarnation From dbfac2807c7acfaf42f0db7cc9539e71c809c523 Mon Sep 17 00:00:00 2001 From: Min Chen Date: Mon, 31 Aug 2015 15:09:58 +0800 Subject: [PATCH 253/654] bug fix: librados segmentation fault, support RadosStriperImpl::aio_read() method Signed-off-by: Min Chen --- src/libradosstriper/RadosStriperImpl.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libradosstriper/RadosStriperImpl.cc b/src/libradosstriper/RadosStriperImpl.cc index 0886f8b3403c2..3544cac1b787e 100644 --- a/src/libradosstriper/RadosStriperImpl.cc +++ b/src/libradosstriper/RadosStriperImpl.cc @@ -463,10 +463,10 @@ int libradosstriper::RadosStriperImpl::aio_read(const std::string& soid, // create a completion object and transfer ownership of extents and resultbl vector *resultbl = new vector(extents->size()); - c->is_read = true; - c->io = m_ioCtxImpl; ReadCompletionData *cdata = new ReadCompletionData(this, soid, lockCookie, c, bl, extents, resultbl); + c->is_read = true; + c->io = m_ioCtxImpl; libradosstriper::MultiAioCompletionImpl *nc = new libradosstriper::MultiAioCompletionImpl; nc->set_complete_callback(cdata, striper_read_aio_req_complete); // go through the extents From 7e1d83bb4d4b658ee93466d8382feb1c41ed4cc6 Mon Sep 17 00:00:00 2001 From: Dennis Schafroth 
Date: Tue, 30 Jun 2015 19:47:24 +0200 Subject: [PATCH 254/654] msg/simple: Move MSG_ and SO_NOSIGPIPE into porting.h Only enable SO_PRIORITY on Linux Signed-off-by: Dennis Schafroth --- src/msg/simple/Pipe.cc | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/src/msg/simple/Pipe.cc b/src/msg/simple/Pipe.cc index a9b3b54e870e1..42995840d45a0 100644 --- a/src/msg/simple/Pipe.cc +++ b/src/msg/simple/Pipe.cc @@ -32,6 +32,8 @@ #include "auth/cephx/CephxProtocol.h" #include "auth/AuthSessionHandler.h" +#include "porting.h" + // Constant to limit starting sequence number to 2^31. Nothing special about it, just a big number. PLR #define SEQ_MASK 0x7fffffff #define dout_subsys ceph_subsys_ms @@ -53,26 +55,6 @@ ostream& operator<<(ostream &out, const Pipe &pipe) { return pipe._pipe_prefix(out); } -/* - * This optimization may not be available on all platforms (e.g. OSX). - * Apparently a similar approach based on TCP_CORK can be used. - */ -#ifndef MSG_MORE -# define MSG_MORE 0 -#endif - -/* - * On BSD SO_NOSIGPIPE can be set via setsockopt to block SIGPIPE. - */ -#ifndef MSG_NOSIGNAL -# define MSG_NOSIGNAL 0 -# ifdef SO_NOSIGPIPE -# define CEPH_USE_SO_NOSIGPIPE -# else -# error "Cannot block SIGPIPE!" -# endif -#endif - /************************************** * Pipe */ @@ -854,7 +836,7 @@ void Pipe::set_socket_options() int prio = msgr->get_socket_priority(); if (prio >= 0) { - int r; + int r = -1; #ifdef IPTOS_CLASS_CS6 int iptos = IPTOS_CLASS_CS6; r = ::setsockopt(sd, IPPROTO_IP, IP_TOS, &iptos, sizeof(iptos)); @@ -866,7 +848,9 @@ void Pipe::set_socket_options() // setsockopt(IPTOS_CLASS_CS6) sets the priority of the socket as 0. // See http://goo.gl/QWhvsD and http://goo.gl/laTbjT // We need to call setsockopt(SO_PRIORITY) after it. 
+#if defined(__linux__) r = ::setsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio)); +#endif if (r < 0) { ldout(msgr->cct,0) << "couldn't set SO_PRIORITY to " << prio << ": " << cpp_strerror(errno) << dendl; From 15f8363db3e27987a07592c940dcc9a40f8ee2d9 Mon Sep 17 00:00:00 2001 From: Dennis Schafroth Date: Tue, 30 Jun 2015 19:49:04 +0200 Subject: [PATCH 255/654] msg/async: Include porting.h for MSG_* Signed-off-by: Dennis Schafroth --- src/msg/async/AsyncConnection.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc index 36a19f80cccce..a8a47e568dca1 100644 --- a/src/msg/async/AsyncConnection.cc +++ b/src/msg/async/AsyncConnection.cc @@ -23,6 +23,8 @@ #include "AsyncMessenger.h" #include "AsyncConnection.h" +#include "porting.h" + // Constant to limit starting sequence number to 2^31. Nothing special about it, just a big number. PLR #define SEQ_MASK 0x7fffffff From f02342265e853f394b4bc535977e74579f6ed860 Mon Sep 17 00:00:00 2001 From: Dennis Schafroth Date: Tue, 30 Jun 2015 19:50:27 +0200 Subject: [PATCH 256/654] ceph_fuse: Implement set/get xattr with position parameter on OSX Signed-off-by: Dennis Schafroth --- src/client/fuse_ll.cc | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc index 0c9becd6d13ef..ee9f552f5ee4a 100644 --- a/src/client/fuse_ll.cc +++ b/src/client/fuse_ll.cc @@ -172,7 +172,12 @@ static void fuse_ll_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, // XATTRS static void fuse_ll_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name, - const char *value, size_t size, int flags) + const char *value, size_t size, + int flags +#if defined(DARWIN) + ,uint32_t pos +#endif + ) { CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); const struct fuse_ctx *ctx = fuse_req_ctx(req); @@ -204,7 +209,11 @@ static void fuse_ll_listxattr(fuse_req_t req, fuse_ino_t ino, size_t 
size) } static void fuse_ll_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, - size_t size) + size_t size +#if defined(DARWIN) + ,uint32_t position +#endif + ) { CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); const struct fuse_ctx *ctx = fuse_req_ctx(req); From d549f41bcfb8c5403079fe21729b4972df8d0fdc Mon Sep 17 00:00:00 2001 From: Dennis Schafroth Date: Tue, 30 Jun 2015 19:51:32 +0200 Subject: [PATCH 257/654] common/SubProcess: fix build on DARWIN include to avoid errors/warnings on missing functions Signed-off-by: Dennis Schafroth --- src/common/SubProcess.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/common/SubProcess.h b/src/common/SubProcess.h index 3d739849193d4..fdc688721b2b6 100644 --- a/src/common/SubProcess.h +++ b/src/common/SubProcess.h @@ -20,6 +20,7 @@ #include #include +#include #include #include #include From e69c115ba703854b85ee83911050c81dc92e3846 Mon Sep 17 00:00:00 2001 From: Dennis Schafroth Date: Tue, 30 Jun 2015 19:52:17 +0200 Subject: [PATCH 258/654] common/util: include sys/param.h and mount.h on DARWIN Signed-off-by: Dennis Schafroth --- src/common/util.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/common/util.cc b/src/common/util.cc index 3c3a304c22fcb..f8959e4644fec 100644 --- a/src/common/util.cc +++ b/src/common/util.cc @@ -25,6 +25,11 @@ #include #endif +#if defined(DARWIN) +#include +#include +#endif + // test if an entire buf is zero in 8-byte chunks bool buf_is_zero(const char *buf, size_t len) { From 4cfac6c3023f740c8fddb6b41d6c8a388917c980 Mon Sep 17 00:00:00 2001 From: Dennis Schafroth Date: Tue, 30 Jun 2015 19:55:10 +0200 Subject: [PATCH 259/654] common/xattr: Split out xattr on Linux and DARWIN. 
DARWIN requires position parameter Signed-off-by: Dennis Schafroth --- src/common/xattr.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/common/xattr.c b/src/common/xattr.c index 239ee02db42f0..b2e522b880413 100644 --- a/src/common/xattr.c +++ b/src/common/xattr.c @@ -9,6 +9,7 @@ * Foundation. See file COPYING. */ +#include "acconfig.h" #if defined(__FreeBSD__) #include #include @@ -42,8 +43,10 @@ ceph_os_setxattr(const char *path, const char *name, size); if (error > 0) error = 0; -#elif defined(__linux__) || defined(DARWIN) +#elif defined(__linux__) error = setxattr(path, name, value, size, 0); +#elif defined(DARWIN) + error = setxattr(path, name, value, size, 0 /* position */, 0); #endif return (error); @@ -56,12 +59,13 @@ ceph_os_fsetxattr(int fd, const char *name, const void *value, int error = -1; #if defined(__FreeBSD__) - error = extattr_set_fd(fd, EXTATTR_NAMESPACE_USER, name, value, - size); + error = extattr_set_fd(fd, EXTATTR_NAMESPACE_USER, name, value, size); if (error > 0) error = 0; -#elif defined(__linux__) || defined(DARWIN) +#elif defined(__linux__) error = fsetxattr(fd, name, value, size, 0); +#elif defined(DARWIN) + error = fsetxattr(fd, name, value, size, 0, 0 /* no options should be indentical to Linux */ ); #endif return (error); @@ -93,7 +97,7 @@ void *value, size_t size) #elif defined(__linux__) error = getxattr(path, name, value, size); #elif defined(DARWIN) - error = getxattr(path, name, value, size, 0); + error = getxattr(path, name, value, size, 0 /* position */, 0); #endif return (error); @@ -125,7 +129,7 @@ ceph_os_fgetxattr(int fd, const char *name, void *value, #elif defined(__linux__) error = fgetxattr(fd, name, value, size); #elif defined(DARWIN) - error = fgetxattr(fd, name, value, size, 0); + error = fgetxattr(fd, name, value, size, 0, 0 /* no options */); #endif return (error); From 630da9f91191592b81a0a4035da0aad97741c376 Mon Sep 17 00:00:00 2001 From: Dennis Schafroth Date: Tue, 30 
Jun 2015 20:07:32 +0200 Subject: [PATCH 260/654] librbd: Include porting header Signed-off-by: Dennis Schafroth --- src/librbd/AsyncRequest.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/librbd/AsyncRequest.h b/src/librbd/AsyncRequest.h index 7324a224a5d03..b6594bf8898e4 100644 --- a/src/librbd/AsyncRequest.h +++ b/src/librbd/AsyncRequest.h @@ -8,6 +8,9 @@ #include "include/rados/librados.hpp" #include "include/xlist.h" +/* DARWIN Missing ERESTART */ +#include "porting.h" + namespace librbd { class ImageCtx; From 17db469e9c17b958d318a7552b92e3e8d1225866 Mon Sep 17 00:00:00 2001 From: Dennis Schafroth Date: Tue, 30 Jun 2015 20:10:19 +0200 Subject: [PATCH 261/654] os/FileStore: fail with ENOTSUP if using sparse files on DARWIN For now, fail with Not supported (ENOTSUP) if configured using sparse files on DARWIN. Signed-off-by: Dennis Schafroth --- src/os/GenericFileStoreBackend.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/os/GenericFileStoreBackend.cc b/src/os/GenericFileStoreBackend.cc index 508f533ab2f6e..9d74df769e07f 100644 --- a/src/os/GenericFileStoreBackend.cc +++ b/src/os/GenericFileStoreBackend.cc @@ -281,11 +281,15 @@ int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct f fiemap->fm_length = len + start % CEPH_PAGE_SIZE; fiemap->fm_flags = FIEMAP_FLAG_SYNC; /* flush extents to disk if needed */ +#if defined(DARWIN) + ret = -ENOTSUP; + goto done_err; +#else if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) { ret = -errno; goto done_err; } - +#endif size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents); _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) + size); @@ -301,12 +305,16 @@ int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct f fiemap->fm_extent_count = fiemap->fm_mapped_extents; fiemap->fm_mapped_extents = 0; +#if defined(DARWIN) + ret = -ENOTSUP; + goto done_err; +#else if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) { ret 
= -errno; goto done_err; } *pfiemap = fiemap; - +#endif return 0; done_err: From b3b29c0db0018907af7eea64aa2d984e1d703505 Mon Sep 17 00:00:00 2001 From: Dennis Schafroth Date: Tue, 30 Jun 2015 20:12:37 +0200 Subject: [PATCH 262/654] rbd-fuse: Add position to set/get xattr on DARWIN Signed-off-by: Dennis Schafroth --- src/rbd_fuse/rbd-fuse.cc | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/rbd_fuse/rbd-fuse.cc b/src/rbd_fuse/rbd-fuse.cc index 8da91e3fda462..4602cbfab9711 100644 --- a/src/rbd_fuse/rbd-fuse.cc +++ b/src/rbd_fuse/rbd-fuse.cc @@ -11,7 +11,12 @@ #include #include #include +#if defined(DARWIN) +#include +#include "porting.h" +#else #include +#endif #include #include #include @@ -597,7 +602,12 @@ struct rbdfuse_attr { int rbdfs_setxattr(const char *path, const char *name, const char *value, - size_t size, int flags) + size_t size, + int flags +#if defined(DARWIN) + ,uint32_t pos +#endif + ) { struct rbdfuse_attr *ap; if (strcmp(path, "/") != 0) @@ -616,7 +626,11 @@ rbdfs_setxattr(const char *path, const char *name, const char *value, int rbdfs_getxattr(const char *path, const char *name, char *value, - size_t size) + size_t size +#if defined(DARWIN) + ,uint32_t position +#endif + ) { struct rbdfuse_attr *ap; char buf[128]; From 126c327842042d11fc6591bb6b8e9780216f363b Mon Sep 17 00:00:00 2001 From: Dennis Schafroth Date: Tue, 30 Jun 2015 20:13:10 +0200 Subject: [PATCH 263/654] rbd-replay: Different location of endian.h on DARWIN Signed-off-by: Dennis Schafroth --- src/rbd_replay/Deser.cc | 7 +++++-- src/rbd_replay/Ser.cc | 6 +++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/rbd_replay/Deser.cc b/src/rbd_replay/Deser.cc index 986a18c166a66..af7992382f441 100644 --- a/src/rbd_replay/Deser.cc +++ b/src/rbd_replay/Deser.cc @@ -11,12 +11,15 @@ * Foundation. See file COPYING. 
* */ - +#include "acconfig.h" #include "Deser.hpp" #include #include +#if defined(DARWIN) +#include +#else #include - +#endif rbd_replay::Deser::Deser(std::istream &in) : m_in(in) { diff --git a/src/rbd_replay/Ser.cc b/src/rbd_replay/Ser.cc index 97a63cdcd5d81..bc3e3777b3270 100644 --- a/src/rbd_replay/Ser.cc +++ b/src/rbd_replay/Ser.cc @@ -12,11 +12,15 @@ * */ +#include "acconfig.h" #include "Ser.hpp" #include #include +#if defined(DARWIN) +#include +#else #include - +#endif rbd_replay::Ser::Ser(std::ostream &out) : m_out(out) { From 0fbacb306b993a1a61f62cd9be4ee1203d375ce2 Mon Sep 17 00:00:00 2001 From: Dennis Schafroth Date: Tue, 30 Jun 2015 20:15:23 +0200 Subject: [PATCH 264/654] test: Fix error with clang on DARWIN (LLVM 3.6.0svn) Signed-off-by: Dennis Schafroth --- src/test/librbd/test_librbd.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/librbd/test_librbd.cc b/src/test/librbd/test_librbd.cc index c491533d1c295..fb015f65338c1 100644 --- a/src/test/librbd/test_librbd.cc +++ b/src/test/librbd/test_librbd.cc @@ -1960,7 +1960,7 @@ TEST_F(TestLibRBD, FlushAioPP) int order = 0; std::string name = get_temp_image_name(); uint64_t size = 2 << 20; - size_t num_aios = 256; + const size_t num_aios = 256; ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order)); ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL)); From fb1b6bc3da39f8dc47403c8c7620bd1252668405 Mon Sep 17 00:00:00 2001 From: Dennis Schafroth Date: Tue, 30 Jun 2015 20:35:32 +0200 Subject: [PATCH 265/654] tools/ceph_objectstore_tool: Missing O_LARGEFILE on DARWIN Signed-off-by: Dennis Schafroth --- src/tools/ceph_objectstore_tool.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc index a2dbccbc5fc20..dd29779fd7b86 100644 --- a/src/tools/ceph_objectstore_tool.cc +++ b/src/tools/ceph_objectstore_tool.cc @@ -36,6 +36,7 @@ #include "json_spirit/json_spirit_reader.h" #include 
"ceph_objectstore_tool.h" +#include "porting.h" namespace po = boost::program_options; using namespace std; From cd93656e151dba7d31318c2824039bd3a3dd2db7 Mon Sep 17 00:00:00 2001 From: Dennis Schafroth Date: Tue, 30 Jun 2015 20:39:03 +0200 Subject: [PATCH 266/654] porting.h: add porting.h for porting/compatibility on DARWIN Collection of porting/compatibility defines to compile on DARWIN Signed-off-by: Dennis Schafroth --- src/porting.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 src/porting.h diff --git a/src/porting.h b/src/porting.h new file mode 100644 index 0000000000000..89274248c3171 --- /dev/null +++ b/src/porting.h @@ -0,0 +1,42 @@ + +#ifndef PORTING_H +#define PORTING_H +#include "acconfig.h" + + +#if defined(DARWIN) +#include + +/* O_LARGEFILE is not defined/required on OS X */ +#define O_LARGEFILE 0 + +/* Wonder why this is missing */ +#define PATH_MAX 1024 + +/* Could be relevant for other platforms */ +#ifndef ERESTART +#define ERESTART EINTR +#endif + +/* + * This optimization may not be available on all platforms (e.g. OSX). + * Apparently a similar approach based on TCP_CORK can be used. + */ +#ifndef MSG_MORE +# define MSG_MORE 0 +#endif + +/* + * On BSD SO_NOSIGPIPE can be set via setsockopt to block SIGPIPE. + */ +#ifndef MSG_NOSIGNAL +# define MSG_NOSIGNAL 0 +# ifdef SO_NOSIGPIPE +# define CEPH_USE_SO_NOSIGPIPE +# else +# error "Cannot block SIGPIPE!" +# endif +#endif + +#endif /* DARWIN */ +#endif /* PORTING_H */ From 11a936e3a223864e71ce9b1e1d0c3b0bda8be692 Mon Sep 17 00:00:00 2001 From: Dennis Schafroth Date: Tue, 30 Jun 2015 20:39:50 +0200 Subject: [PATCH 267/654] librbd: Need to include errno.h on DARWIN. 
The file is not really required Signed-off-by: Dennis Schafroth --- src/librbd/type.h | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 src/librbd/type.h diff --git a/src/librbd/type.h b/src/librbd/type.h new file mode 100644 index 0000000000000..20be5c2e4b7ed --- /dev/null +++ b/src/librbd/type.h @@ -0,0 +1,7 @@ + +#ifndef LIBRBD_TYPE_H +#define LIBRBD_TYPE_H + +#include + +#endif From 44458db5a782661f786fb6434d274c2b913e85ac Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 5 Jul 2015 15:31:27 +0800 Subject: [PATCH 268/654] msg: fix encoding/decoding sockaddr_storage on DARWIN/FreeBSD sockaddr_storage on DARWIN/FreeBSD is different from sockaddr_storage on Linux. sockaddr_storage on DARWIN/FreeBSD includes a ss_len field. Besides, Its ss_len and ss_family fields are 'unsigned char'. Signed-off-by: Yan, Zheng --- src/msg/msg_types.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/msg/msg_types.h b/src/msg/msg_types.h index 62deacffffa58..bf668e034731b 100644 --- a/src/msg/msg_types.h +++ b/src/msg/msg_types.h @@ -156,14 +156,21 @@ namespace std { */ static inline void encode(const sockaddr_storage& a, bufferlist& bl) { struct sockaddr_storage ss = a; -#if !defined(__FreeBSD__) +#if defined(DARWIN) || defined(__FreeBSD__) + unsigned short *ss_family = reinterpret_cast(&ss); + *ss_family = htons(a.ss_family); +#else ss.ss_family = htons(ss.ss_family); #endif ::encode_raw(ss, bl); } static inline void decode(sockaddr_storage& a, bufferlist::iterator& bl) { ::decode_raw(a, bl); -#if !defined(__FreeBSD__) +#if defined(DARWIN) || defined(__FreeBSD__) + unsigned short *ss_family = reinterpret_cast(&a); + a.ss_family = ntohs(*ss_family); + a.ss_len = 0; +#else a.ss_family = ntohs(a.ss_family); #endif } From f064e90ae554b64741284ef1cdf8a00bb7b4a312 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 5 Jul 2015 15:49:49 +0800 Subject: [PATCH 269/654] Link ceph-fuse to fuse on DARWIN Signed-off-by: Yan, Zheng --- 
configure.ac | 19 +++++-------------- src/Makefile-client.am | 3 ++- src/client/Makefile.am | 3 ++- src/client/fuse_ll.cc | 4 ++-- src/rbd_fuse/rbd-fuse.cc | 2 +- 5 files changed, 12 insertions(+), 19 deletions(-) diff --git a/configure.ac b/configure.ac index 2503c87b07757..53b7e09a16667 100644 --- a/configure.ac +++ b/configure.ac @@ -508,20 +508,11 @@ AC_ARG_WITH([fuse], [], [with_fuse=yes]) LIBFUSE= -AS_IF([test "x$with_fuse" != xno], - [AC_CHECK_LIB([fuse], [fuse_main], - [AC_SUBST([LIBFUSE], ["-lfuse"]) - AC_DEFINE([HAVE_LIBFUSE], [1], - [Define if you have fuse]) - HAVE_LIBFUSE=1 - # look for fuse_getgroups and define FUSE_GETGROUPS if found - LIBS_saved="$LIBS" - LIBS="$LIBS -lfuse" - AC_CHECK_FUNCS([fuse_getgroups]) - LIBS="$LIBS_saved" - ], - [AC_MSG_FAILURE( - [no FUSE found (use --without-fuse to disable)])])]) +AS_IF([test "x$with_fuse" != xno], [ + PKG_CHECK_MODULES([LIBFUSE], [fuse], + [HAVE_LIBFUSE=1], + [AC_MSG_FAILURE([no FUSE found (use --without-fuse to disable)])]) +]) AM_CONDITIONAL(WITH_FUSE, [test "$HAVE_LIBFUSE" = "1"]) # jemalloc? 
diff --git a/src/Makefile-client.am b/src/Makefile-client.am index a8b697c4056e0..af1a84427822f 100644 --- a/src/Makefile-client.am +++ b/src/Makefile-client.am @@ -75,7 +75,8 @@ bin_PROGRAMS += ceph-fuse if WITH_RBD rbd_fuse_SOURCES = rbd_fuse/rbd-fuse.cc -rbd_fuse_LDADD = -lfuse $(LIBRBD) $(LIBRADOS) $(CEPH_GLOBAL) +rbd_fuse_CXXFLAGS = $(AM_CXXFLAGS) $(LIBFUSE_CFLAGS) +rbd_fuse_LDADD = $(LIBFUSE_LIBS) $(LIBRBD) $(LIBRADOS) $(CEPH_GLOBAL) bin_PROGRAMS += rbd-fuse endif # WITH_RBD endif # WITH_FUSE diff --git a/src/client/Makefile.am b/src/client/Makefile.am index 5ef480b8c09bb..8e47f51b1dfba 100644 --- a/src/client/Makefile.am +++ b/src/client/Makefile.am @@ -27,7 +27,8 @@ noinst_HEADERS += \ if WITH_FUSE libclient_fuse_la_SOURCES = client/fuse_ll.cc -libclient_fuse_la_LIBADD = libclient.la -lfuse +libclient_fuse_la_LIBADD = libclient.la $(LIBFUSE_LIBS) +libclient_fuse_la_CXXFLAGS = $(AM_CXXFLAGS) $(LIBFUSE_CFLAGS) noinst_LTLIBRARIES += libclient_fuse.la noinst_HEADERS += client/fuse_ll.h endif diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc index ee9f552f5ee4a..e2d3cc1ef890d 100644 --- a/src/client/fuse_ll.cc +++ b/src/client/fuse_ll.cc @@ -14,8 +14,8 @@ #define FUSE_USE_VERSION 30 -#include -#include +#include +#include #include #include #include diff --git a/src/rbd_fuse/rbd-fuse.cc b/src/rbd_fuse/rbd-fuse.cc index 4602cbfab9711..f7ac46c767783 100644 --- a/src/rbd_fuse/rbd-fuse.cc +++ b/src/rbd_fuse/rbd-fuse.cc @@ -12,7 +12,7 @@ #include #include #if defined(DARWIN) -#include +#include #include "porting.h" #else #include From aa71c20734746e53e8c625749fa6f3203fde8fcb Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 5 Jul 2015 15:54:27 +0800 Subject: [PATCH 270/654] client: don't include unsupported mount options on DARWIN/FreeBSD Signed-off-by: Yan, Zheng --- src/client/fuse_ll.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc index e2d3cc1ef890d..551984b23be4b 100644 --- 
a/src/client/fuse_ll.cc +++ b/src/client/fuse_ll.cc @@ -14,8 +14,6 @@ #define FUSE_USE_VERSION 30 -#include -#include #include #include #include @@ -34,6 +32,8 @@ #include "common/config.h" #include "include/assert.h" +#include +#include #include "fuse_ll.h" #define FINO_INO(x) ((x) & ((1ull<<48)-1ull)) @@ -917,6 +917,7 @@ int CephFuse::Handle::init(int argc, const char *argv[]) newargv[newargc++] = "-o"; newargv[newargc++] = "default_permissions"; } +#if defined(__linux__) if (client->cct->_conf->fuse_big_writes) { newargv[newargc++] = "-o"; newargv[newargc++] = "big_writes"; @@ -925,7 +926,7 @@ int CephFuse::Handle::init(int argc, const char *argv[]) newargv[newargc++] = "-o"; newargv[newargc++] = "atomic_o_trunc"; } - +#endif if (client->cct->_conf->fuse_debug) newargv[newargc++] = "-d"; From d5c43d9c750dd717be46f40061e3aeb5682a824d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 5 Jul 2015 16:02:33 +0800 Subject: [PATCH 271/654] client: don't try trimming kernel dcache on DARWIN/FreeBSD still don't know if the methods used on linux also works on DARWIN/FreeBSD Signed-off-by: Yan, Zheng --- src/ceph_fuse.cc | 4 ++++ src/client/fuse_ll.cc | 2 ++ 2 files changed, 6 insertions(+) diff --git a/src/ceph_fuse.cc b/src/ceph_fuse.cc index 4dde41772ff76..9fd8d0e950a27 100644 --- a/src/ceph_fuse.cc +++ b/src/ceph_fuse.cc @@ -122,6 +122,7 @@ int main(int argc, const char **argv, const char *envp[]) { } virtual ~RemountTest() {} virtual void *entry() { +#if defined(__linux__) int ver = get_linux_version(); assert(ver != 0); bool can_invalidate_dentries = g_conf->client_try_dentry_invalidate && @@ -151,6 +152,9 @@ int main(int argc, const char **argv, const char *envp[]) { } } return reinterpret_cast(tr); +#else + return reinterpret_cast(0); +#endif } } tester; diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc index 551984b23be4b..410509d7c8f84 100644 --- a/src/client/fuse_ll.cc +++ b/src/client/fuse_ll.cc @@ -978,7 +978,9 @@ int CephFuse::Handle::start() 
ino_cb: client->cct->_conf->fuse_use_invalidate_cb ? ino_invalidate_cb : NULL, dentry_cb: dentry_invalidate_cb, switch_intr_cb: switch_interrupt_cb, +#if defined(__linux__) remount_cb: remount_cb, +#endif /* * this is broken: * From 760f5874ca1aed868f89b81d7d0c454dd62c33b2 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 6 Jul 2015 09:20:33 +0800 Subject: [PATCH 272/654] Don't use '--exclude-libs' linker option on DARWIN Only GNU linker supports this option Signed-off-by: Yan, Zheng --- src/Makefile-client.am | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Makefile-client.am b/src/Makefile-client.am index af1a84427822f..7e8c7165f441f 100644 --- a/src/Makefile-client.am +++ b/src/Makefile-client.am @@ -94,7 +94,9 @@ python_PYTHON += pybind/cephfs.py libcephfs_la_SOURCES = libcephfs.cc libcephfs_la_LIBADD = $(LIBCLIENT) $(LIBCOMMON) $(PTHREAD_LIBS) $(CRYPTO_LIBS) $(EXTRALIBS) libcephfs_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '^ceph_.*' +if LINUX libcephfs_la_LDFLAGS += -Xcompiler -Xlinker -Xcompiler '--exclude-libs=libcommon.a' +endif # LINUX lib_LTLIBRARIES += libcephfs.la # jni library (java source is in src/java) From 4ec4177f56385f0e76e069b3665dd85b1d9a429f Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Fri, 10 Jul 2015 11:52:51 +0800 Subject: [PATCH 273/654] Makefile: add porting.h to dist tarball Signed-off-by: Kefu Chai --- src/Makefile.am | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Makefile.am b/src/Makefile.am index 502f83a014fc4..5cf555707f3a0 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -211,7 +211,8 @@ noinst_HEADERS += \ bash_completion/radosgw-admin \ mount/canonicalize.c \ mount/mtab.c \ - objclass/objclass.h + objclass/objclass.h \ + porting.h # coverage From a55891650cda2656c33196ced0ae1905d0c2a1ff Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Fri, 10 Jul 2015 11:53:55 +0800 Subject: [PATCH 274/654] porting.h: add TODO comment Signed-off-by: Kefu Chai --- src/porting.h | 
1 + 1 file changed, 1 insertion(+) diff --git a/src/porting.h b/src/porting.h index 89274248c3171..1528ffb0f020a 100644 --- a/src/porting.h +++ b/src/porting.h @@ -3,6 +3,7 @@ #define PORTING_H #include "acconfig.h" +/* TODO: move these into include/compat.h */ #if defined(DARWIN) #include From 05fbfd1102e9fe263f3bde3b52f5eb97b196886f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 23 Jul 2015 09:52:08 +0800 Subject: [PATCH 275/654] On Darwin: subfix of dynamic library is dylib Signed-off-by: Yan, Zheng --- src/erasure-code/ErasureCodePlugin.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/erasure-code/ErasureCodePlugin.cc b/src/erasure-code/ErasureCodePlugin.cc index 74114eedff33a..b120eda3a56f0 100644 --- a/src/erasure-code/ErasureCodePlugin.cc +++ b/src/erasure-code/ErasureCodePlugin.cc @@ -24,7 +24,11 @@ #include "include/str_list.h" #define PLUGIN_PREFIX "libec_" +#if defined(DARWIN) +#define PLUGIN_SUFFIX ".dylib" +#else #define PLUGIN_SUFFIX ".so" +#endif #define PLUGIN_INIT_FUNCTION "__erasure_code_init" #define PLUGIN_VERSION_FUNCTION "__erasure_code_version" From e92aaea1bda06c28652dd9fbe8d9a47a7afbad13 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 23 Jul 2015 10:20:26 +0800 Subject: [PATCH 276/654] vstart.sh: append ceph library path to {LD,DYLD}_LIBRARY_PATH Signed-off-by: Yan, Zheng --- src/vstart.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vstart.sh b/src/vstart.sh index d48a1fbc608c9..dcb131f4f64fb 100755 --- a/src/vstart.sh +++ b/src/vstart.sh @@ -42,8 +42,8 @@ fi [ -z "$PYBIND" ] && PYBIND=./pybind export PYTHONPATH=$PYBIND -export LD_LIBRARY_PATH=$CEPH_LIB -export DYLD_LIBRARY_PATH=$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=$CEPH_LIB:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$CEPH_LIB:$DYLD_LIBRARY_PATH [ -z "$CEPH_NUM_MON" ] && CEPH_NUM_MON="$MON" [ -z "$CEPH_NUM_OSD" ] && CEPH_NUM_OSD="$OSD" From 4536cb52fefe57ece24f102000121717222260c1 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: 
Thu, 23 Jul 2015 10:24:33 +0800 Subject: [PATCH 277/654] vstart.sh: use portable way to get ip address Signed-off-by: Yan, Zheng --- src/vstart.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/vstart.sh b/src/vstart.sh index dcb131f4f64fb..8f3ca4def82cd 100755 --- a/src/vstart.sh +++ b/src/vstart.sh @@ -326,11 +326,10 @@ if [ -n "$ip" ]; then IP="$ip" else echo hostname $HOSTNAME - RAW_IP=`hostname -I` # filter out IPv6 and localhost addresses - IP="$(echo "$RAW_IP"|tr ' ' '\012'|grep -v :|grep -v '^127\.'|head -n1)" - # if that left nothing, then try to use the raw thing, it might work - if [ -z "$IP" ]; then IP="$RAW_IP"; fi + IP="$(ifconfig | sed -En 's/127.0.0.1//;s/.*inet (addr:)?(([0-9]*\.){3}[0-9]*).*/\2/p' | head -n1)" + # if nothing left, try using localhost address, it might work + if [ -z "$IP" ]; then IP="127.0.0.1"; fi echo ip $IP fi echo "ip $IP" From 69e2060f170855d1131d33786aab22287fe23d51 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 23 Jul 2015 10:58:05 +0800 Subject: [PATCH 278/654] init-ceph: replace 'echo -n' with printf 'echo -n' is no portable. 
On OSX, output of 'echo -n' can be '-n' Signed-off-by: Yan, Zheng --- src/ceph_common.sh | 4 ++-- src/init-ceph.in | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/ceph_common.sh b/src/ceph_common.sh index 07faddc2fce14..0a4ac229f5b56 100644 --- a/src/ceph_common.sh +++ b/src/ceph_common.sh @@ -220,10 +220,10 @@ get_conf() { if [ -z "$1" ]; then [ "$verbose" -eq 1 ] && echo "$CCONF -c $conf -n $type.$id \"$key\"" - eval "$var=\"`$CCONF -c $conf -n $type.$id \"$key\" || eval echo -n \"$def\"`\"" + eval "$var=\"`$CCONF -c $conf -n $type.$id \"$key\" || printf \"$def\"`\"" else [ "$verbose" -eq 1 ] && echo "$CCONF -c $conf -s $1 \"$key\"" - eval "$var=\"`$CCONF -c $conf -s $1 \"$key\" || eval echo -n \"$def\"`\"" + eval "$var=\"`$CCONF -c $conf -s $1 \"$key\" || eval printf \"$def\"`\"" fi } diff --git a/src/init-ceph.in b/src/init-ceph.in index 3e3b3a44cd5d3..82a4fd5e889c6 100755 --- a/src/init-ceph.in +++ b/src/init-ceph.in @@ -59,12 +59,12 @@ signal_daemon() { signal=$4 action=$5 [ -z "$action" ] && action="Stopping" - echo -n "$action Ceph $name on $host..." + printf "$action Ceph $name on $host..." do_cmd "if [ -e $pidfile ]; then pid=`cat $pidfile` if [ -e /proc/\$pid ] && grep -q $daemon /proc/\$pid/cmdline ; then cmd=\"kill $signal \$pid\" - echo -n \$cmd... + printf \"\$cmd...\" \$cmd fi fi" @@ -89,12 +89,12 @@ stop_daemon() { signal=$4 action=$5 [ -z "$action" ] && action="Stopping" - echo -n "$action Ceph $name on $host..." + printf "$action Ceph $name on $host..." do_cmd "if [ -e $pidfile ] ; then pid=\`cat $pidfile\` while [ -e /proc/\$pid ] && grep -q $daemon /proc/\$pid/cmdline ; do cmd=\"kill $signal \$pid\" - echo -n \$cmd... 
+ printf \"\$cmd...\" \$cmd sleep 1 continue @@ -428,7 +428,7 @@ for name in $what; do status) if daemon_is_running $name ceph-$type $id $pid_file; then - echo -n "$name: running " + printf "$name: running " do_cmd "$BINDIR/ceph --admin-daemon $asok version 2>/dev/null" || echo unknown elif [ -e "$pid_file" ]; then # daemon is dead, but pid file still exists From 4594adbc8f08a7a5f124914d401020924bdc3e05 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 23 Jul 2015 15:05:33 +0800 Subject: [PATCH 279/654] init-ceph: don't use procfs to check if daemon is running use ps(1) instead, which is portable Signed-off-by: Yan, Zheng --- src/init-ceph.in | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/init-ceph.in b/src/init-ceph.in index 82a4fd5e889c6..22f595ad2b512 100755 --- a/src/init-ceph.in +++ b/src/init-ceph.in @@ -61,8 +61,8 @@ signal_daemon() { [ -z "$action" ] && action="Stopping" printf "$action Ceph $name on $host..." do_cmd "if [ -e $pidfile ]; then - pid=`cat $pidfile` - if [ -e /proc/\$pid ] && grep -q $daemon /proc/\$pid/cmdline ; then + pid=\`cat $pidfile\` + if ps -p \$pid -o args= | grep -q $daemon; then cmd=\"kill $signal \$pid\" printf \"\$cmd...\" \$cmd @@ -78,7 +78,7 @@ daemon_is_running() { pidfile=$4 do_cmd "[ -e $pidfile ] || exit 1 # no pid, presumably not running pid=\`cat $pidfile\` - [ -e /proc/\$pid ] && grep -q $daemon /proc/\$pid/cmdline && grep -qwe -i.$daemon_id /proc/\$pid/cmdline && exit 0 # running + ps -p \$pid -o args= | grep $daemon | grep -qwe -i.$daemon_id && exit 0 # running exit 1 # pid is something else" "" "okfail" } @@ -92,7 +92,7 @@ stop_daemon() { printf "$action Ceph $name on $host..." 
do_cmd "if [ -e $pidfile ] ; then pid=\`cat $pidfile\` - while [ -e /proc/\$pid ] && grep -q $daemon /proc/\$pid/cmdline ; do + while ps -p \$pid -o args= | grep -q $daemon; do cmd=\"kill $signal \$pid\" printf \"\$cmd...\" \$cmd From bb1fa7f3993dfcd1243b2a9a0bb432c269df215f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 23 Jul 2015 15:07:45 +0800 Subject: [PATCH 280/654] init-ceph: check if /lib/lsb/init-functions exists On OSX/FreeBSD, /lib/lsb/init-functions does not exist Signed-off-by: Yan, Zheng --- src/init-ceph.in | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/init-ceph.in b/src/init-ceph.in index 22f595ad2b512..4255c550a434d 100755 --- a/src/init-ceph.in +++ b/src/init-ceph.in @@ -12,7 +12,10 @@ # Description: Enable Ceph distributed file system services. ### END INIT INFO -. /lib/lsb/init-functions +# TODO: on FreeBSD/OSX, use equivalent script file +if [ -e /lib/lsb/init-functions ]; then + . /lib/lsb/init-functions +fi # detect systemd, also check whether the systemd-run binary exists SYSTEMD_RUN=$(which systemd-run 2>/dev/null) From b800303e197a740b54c98a4d3380987fc71fe255 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 24 Jul 2015 11:32:08 +0800 Subject: [PATCH 281/654] tools: link ceph-client-debug to LIBCLIENT Signed-off-by: Yan, Zheng --- src/tools/Makefile-client.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tools/Makefile-client.am b/src/tools/Makefile-client.am index e417eb5fa3336..4cbfd5d65e0f4 100644 --- a/src/tools/Makefile-client.am +++ b/src/tools/Makefile-client.am @@ -25,7 +25,7 @@ bin_PROGRAMS += rados if WITH_CEPHFS ceph_client_debug_SOURCES = tools/ceph-client-debug.cc -ceph_client_debug_LDADD = $(LIBCEPHFS) $(CEPH_GLOBAL) $(LIBCOMMON) +ceph_client_debug_LDADD = $(LIBCEPHFS) $(LIBCLIENT) $(CEPH_GLOBAL) $(LIBCOMMON) bin_DEBUGPROGRAMS += ceph-client-debug endif # WITH_CEPHFS From bcbddab500ed601cd2574192fbd0212a6645484f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: 
Fri, 24 Jul 2015 15:30:03 +0800 Subject: [PATCH 282/654] libcephfs: define loff_t as off_t on OSX Signed-off-by: Yan, Zheng --- src/include/cephfs/libcephfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/cephfs/libcephfs.h b/src/include/cephfs/libcephfs.h index 45585f494be7e..20830933ada5f 100644 --- a/src/include/cephfs/libcephfs.h +++ b/src/include/cephfs/libcephfs.h @@ -24,7 +24,7 @@ #include // FreeBSD compatibility -#ifdef __FreeBSD__ +#if defined(__FreeBSD__) || defined(__APPLE__) typedef off_t loff_t; typedef off_t off64_t; #endif From 078691977d6703c5e34801002e139c6a8945de50 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 24 Jul 2015 16:03:48 +0800 Subject: [PATCH 283/654] test_c_headers: don't use -Wold-style-declaration option for clang clang does not support this option Signed-off-by: Yan, Zheng --- src/test/Makefile-client.am | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/test/Makefile-client.am b/src/test/Makefile-client.am index 01aaa0e4dc58d..dee4d0e8e7486 100644 --- a/src/test/Makefile-client.am +++ b/src/test/Makefile-client.am @@ -454,9 +454,11 @@ ceph_test_c_headers_CFLAGS = $(AM_CFLAGS) \ -Wformat-y2k \ -Winit-self \ -Wignored-qualifiers \ - -Wold-style-declaration \ -Wold-style-definition \ -Wtype-limits +if !CLANG +ceph_test_c_headers_CFLAGS += -Wold-style-declaration +endif # !CLANG bin_DEBUGPROGRAMS += ceph_test_c_headers endif # WITH_CEPHFS From ad056c12c05d96972621b1495f8bfaa87e286a1f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 24 Jul 2015 16:06:01 +0800 Subject: [PATCH 284/654] libradosstriper/striping.cc: include "include/types.h" include definitions of __le{32,64) Signed-off-by: Yan, Zheng --- src/test/libradosstriper/striping.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/test/libradosstriper/striping.cc b/src/test/libradosstriper/striping.cc index 404256637f7e8..e1b5801826129 100644 --- a/src/test/libradosstriper/striping.cc +++ 
b/src/test/libradosstriper/striping.cc @@ -1,5 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab + +#include "include/types.h" #include "include/rados/librados.h" #include "include/rados/librados.hpp" #include "include/radosstriper/libradosstriper.h" From 731f6aaa689f33471708696870e5b6d3ee7cb1b4 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 24 Jul 2015 18:13:22 +0800 Subject: [PATCH 285/654] test_libcephfs: disable flock test on OSX there is no sem_timedwait on OSX. For inter-threads communication, we can replace sem_t with pthread_cond_t. But for inter-processes communication, it's extremely hard to figure out how to do a timed wait. So disable the test case on OSX Signed-off-by: Yan, Zheng --- src/test/Makefile-client.am | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/test/Makefile-client.am b/src/test/Makefile-client.am index dee4d0e8e7486..aef841d156560 100644 --- a/src/test/Makefile-client.am +++ b/src/test/Makefile-client.am @@ -428,8 +428,11 @@ ceph_test_libcephfs_SOURCES = \ test/libcephfs/test.cc \ test/libcephfs/readdir_r_cb.cc \ test/libcephfs/caps.cc \ - test/libcephfs/multiclient.cc \ - test/libcephfs/flock.cc + test/libcephfs/multiclient.cc +if LINUX +ceph_test_libcephfs_SOURCES += test/libcephfs/flock.cc +endif # LINUX + ceph_test_libcephfs_LDADD = $(LIBCEPHFS) $(UNITTEST_LDADD) ceph_test_libcephfs_CXXFLAGS = $(UNITTEST_CXXFLAGS) bin_DEBUGPROGRAMS += ceph_test_libcephfs From 5db6915d80f2ad5d5bf1896d26060cfac5fdb502 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 27 Jul 2015 16:01:31 +0800 Subject: [PATCH 286/654] xattr: convert ENOATTR to ENODATA on DARWIN On Darwin, getxattr/removexattr return ENOATTR when extended attribute does not exist. But lots of ceph codes expect ENODATA is returned in that case. On Darwin, ENOATTR and ENODATA have different values, so we convert ENOATTR to ENODATA. 
Signed-off-by: Yan, Zheng --- src/common/xattr.c | 12 ++++++++++++ src/include/compat.h | 2 +- src/os/chain_xattr.cc | 12 ++++++------ 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/common/xattr.c b/src/common/xattr.c index b2e522b880413..caa31d5270456 100644 --- a/src/common/xattr.c +++ b/src/common/xattr.c @@ -98,6 +98,9 @@ void *value, size_t size) error = getxattr(path, name, value, size); #elif defined(DARWIN) error = getxattr(path, name, value, size, 0 /* position */, 0); + /* ENOATTR and ENODATA have different values */ + if (error < 0 && errno == ENOATTR) + errno = ENODATA; #endif return (error); @@ -130,6 +133,9 @@ ceph_os_fgetxattr(int fd, const char *name, void *value, error = fgetxattr(fd, name, value, size); #elif defined(DARWIN) error = fgetxattr(fd, name, value, size, 0, 0 /* no options */); + /* ENOATTR and ENODATA have different values */ + if (error < 0 && errno == ENOATTR) + errno = ENODATA; #endif return (error); @@ -244,6 +250,9 @@ ceph_os_removexattr(const char *path, const char *name) error = removexattr(path, name); #elif defined(DARWIN) error = removexattr(path, name, 0); + /* ENOATTR and ENODATA have different values */ + if (error < 0 && errno == ENOATTR) + errno = ENODATA; #endif return (error); @@ -260,6 +269,9 @@ ceph_os_fremovexattr(int fd, const char *name) error = fremovexattr(fd, name); #elif defined(DARWIN) error = fremovexattr(fd, name, 0); + /* ENOATTR and ENODATA have different values */ + if (error < 0 && errno == ENOATTR) + errno = ENODATA; #endif return (error); diff --git a/src/include/compat.h b/src/include/compat.h index 25d3d7602f19c..caabe10df0e36 100644 --- a/src/include/compat.h +++ b/src/include/compat.h @@ -13,7 +13,7 @@ #define CEPH_COMPAT_H #if defined(__FreeBSD__) -#define ENODATA 61 +#define ENODATA ENOATTR #define MSG_MORE 0 #endif /* !__FreeBSD__ */ diff --git a/src/os/chain_xattr.cc b/src/os/chain_xattr.cc index 24b463435b44f..f50a9ccf011d9 100644 --- a/src/os/chain_xattr.cc +++ 
b/src/os/chain_xattr.cc @@ -138,7 +138,7 @@ int chain_getxattr(const char *fn, const char *name, void *val, size_t size) get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); r = sys_getxattr(fn, raw_name, (char *)val + pos, chunk_size); - if (i && r == -ENOATTR) { + if (i && r == -ENODATA) { ret = pos; break; } @@ -209,7 +209,7 @@ int chain_fgetxattr(int fd, const char *name, void *val, size_t size) get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); r = sys_fgetxattr(fd, raw_name, (char *)val + pos, chunk_size); - if (i && r == -ENOATTR) { + if (i && r == -ENODATA) { ret = pos; break; } @@ -282,10 +282,10 @@ int chain_setxattr(const char *fn, const char *name, const void *val, size_t siz do { get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); r = sys_removexattr(fn, raw_name); - if (r < 0 && r != -ENOATTR) + if (r < 0 && r != -ENODATA) ret = r; i++; - } while (r != -ENOATTR); + } while (r != -ENODATA); } return ret; @@ -318,10 +318,10 @@ int chain_fsetxattr(int fd, const char *name, const void *val, size_t size) do { get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); r = sys_fremovexattr(fd, raw_name); - if (r < 0 && r != -ENOATTR) + if (r < 0 && r != -ENODATA) ret = r; i++; - } while (r != -ENOATTR); + } while (r != -ENODATA); } return ret; From 4132805d70d1ae91cba5a58475807880cf6d2588 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 27 Jul 2015 19:08:48 +0800 Subject: [PATCH 287/654] client: convert XATTR_{CREATE,REPLACE} to CEPH_XATTR_{CREATE,REPLACE} Signed-off-by: Yan, Zheng --- src/client/Client.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index 4259c6ecdf605..9fbf72ddc5e44 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -23,6 +23,7 @@ #include #include #include +#include #if defined(__linux__) #include @@ -8962,8 +8963,13 @@ int Client::_setxattr(Inode *in, const char *name, const void *value, if (vxattr && vxattr->readonly) return 
-EOPNOTSUPP; + int xattr_flags = 0; if (!value) - flags |= CEPH_XATTR_REMOVE; + xattr_flags |= CEPH_XATTR_REMOVE; + if (flags & XATTR_CREATE) + xattr_flags |= CEPH_XATTR_CREATE; + if (flags & XATTR_REPLACE) + xattr_flags |= CEPH_XATTR_REPLACE; MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR); filepath path; @@ -8971,7 +8977,7 @@ int Client::_setxattr(Inode *in, const char *name, const void *value, req->set_filepath(path); req->set_string2(name); req->set_inode(in); - req->head.args.setxattr.flags = flags; + req->head.args.setxattr.flags = xattr_flags; bufferlist bl; bl.append((const char*)value, size); From c57ef8c3ed5d8c2ed0a81edb740613ad93e972ab Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 28 Jul 2015 17:55:30 +0800 Subject: [PATCH 288/654] test/librados: replace errno -125 with -ECANCELED Signed-off-by: Yan, Zheng --- src/test/librados/c_write_operations.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/librados/c_write_operations.cc b/src/test/librados/c_write_operations.cc index 37c7450888d13..1ea950975857c 100644 --- a/src/test/librados/c_write_operations.cc +++ b/src/test/librados/c_write_operations.cc @@ -117,7 +117,7 @@ TEST(LibRadosCWriteOps, Xattrs) { ASSERT_TRUE(op); rados_write_op_cmpxattr(op, "key", LIBRADOS_CMPXATTR_OP_EQ, "value", 5); rados_write_op_setxattr(op, "key", "value", 5); - ASSERT_EQ(-125, rados_write_op_operate(op, ioctx, "test", NULL, 0)); + ASSERT_EQ(-ECANCELED, rados_write_op_operate(op, ioctx, "test", NULL, 0)); rados_release_write_op(op); rados_ioctx_destroy(ioctx); From b82ed61220ef11fd547c8297e59212e5fc84bf37 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 29 Jul 2015 17:44:52 +0800 Subject: [PATCH 289/654] buffer: make buffer::exception classes undefined in dynamic objects On OSX, if the an exception class is declared and defined in header file, but it ends up being compiled as private symbols in different binaries. 
The exception handling code will take the two compiled exception classes as different types! In our case, the one in libcls_xxx.so and the one is ceph-osd are considered as different classes, thus the try-catch statement fails to work. The fix is force buffer::exception classes undefined in libcls_xxx.so. The ibcls_xxx.so are compiled with '-undefined dynamic_lookup' option. when it is loaded into ceph-osd, buffer::exception classes in ceph-osd will be used. Signed-off-by: Yan, Zheng --- src/common/buffer.cc | 12 ++++++++++++ src/include/buffer.h | 16 ++++------------ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/common/buffer.cc b/src/common/buffer.cc index 27dc62a19e5fc..1ab62241f9c06 100644 --- a/src/common/buffer.cc +++ b/src/common/buffer.cc @@ -120,6 +120,18 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; return 65536; } + const char * buffer::error::what() const throw () { + return "buffer::exception"; + } + const char * buffer::bad_alloc::what() const throw () { + return "buffer::bad_alloc"; + } + const char * buffer::end_of_buffer::what() const throw () { + return "buffer::end_of_buffer"; + } + const char * buffer::malformed_input::what() const throw () { + return buf; + } buffer::error_code::error_code(int error) : buffer::malformed_input(cpp_strerror(error).c_str()), code(error) {} diff --git a/src/include/buffer.h b/src/include/buffer.h index a54befa11c8a3..bb6ea9356c537 100644 --- a/src/include/buffer.h +++ b/src/include/buffer.h @@ -75,27 +75,19 @@ class CEPH_BUFFER_API buffer { public: struct error : public std::exception{ - const char *what() const throw () { - return "buffer::exception"; - } + const char *what() const throw (); }; struct bad_alloc : public error { - const char *what() const throw () { - return "buffer::bad_alloc"; - } + const char *what() const throw (); }; struct end_of_buffer : public error { - const char *what() const throw () { - return "buffer::end_of_buffer"; - } + const char 
*what() const throw (); }; struct malformed_input : public error { explicit malformed_input(const std::string& w) { snprintf(buf, sizeof(buf), "buffer::malformed_input: %s", w.c_str()); } - const char *what() const throw () { - return buf; - } + const char *what() const throw (); private: char buf[256]; }; From 9e37a37908b6c780e54f28a1e727cce78b453b22 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 30 Jul 2015 14:31:20 +0800 Subject: [PATCH 290/654] test/librados: replace sem_init() with sem_open() sem_init() always fails on OSX (it's no longer supported). Signed-off-by: Yan, Zheng --- src/test/librados/aio.cc | 205 +++++++++++++++--------------- src/test/librados/watch_notify.cc | 39 +++--- src/test/libradosstriper/aio.cc | 107 ++++++++-------- src/test/test_stress_watch.cc | 10 +- 4 files changed, 180 insertions(+), 181 deletions(-) diff --git a/src/test/librados/aio.cc b/src/test/librados/aio.cc index 6754e145e45da..ed59b008bfa2e 100644 --- a/src/test/librados/aio.cc +++ b/src/test/librados/aio.cc @@ -5,6 +5,7 @@ #include "gtest/gtest.h" #include +#include #include #include #include @@ -32,31 +33,30 @@ class AioTestData if (m_init) { rados_ioctx_destroy(m_ioctx); destroy_one_pool(m_pool_name, &m_cluster); - sem_destroy(&m_sem); + sem_close(m_sem); } } std::string init() { int ret; - if (sem_init(&m_sem, 0, 0)) { + if (SEM_FAILED == (m_sem = sem_open("test_aio_sem", O_CREAT, 0644, 0))) { int err = errno; - sem_destroy(&m_sem); ostringstream oss; - oss << "sem_init failed: " << cpp_strerror(err); + oss << "sem_open failed: " << cpp_strerror(err); return oss.str(); } m_pool_name = get_temp_pool_name(); std::string err = create_one_pool(m_pool_name, &m_cluster); if (!err.empty()) { - sem_destroy(&m_sem); + sem_close(m_sem); ostringstream oss; oss << "create_one_pool(" << m_pool_name << ") failed: error " << err; return oss.str(); } ret = rados_ioctx_create(m_cluster, m_pool_name.c_str(), &m_ioctx); if (ret) { - sem_destroy(&m_sem); + sem_close(m_sem); 
destroy_one_pool(m_pool_name, &m_cluster); ostringstream oss; oss << "rados_ioctx_create failed: error " << ret; @@ -66,7 +66,7 @@ class AioTestData return ""; } - sem_t m_sem; + sem_t *m_sem; rados_t m_cluster; rados_ioctx_t m_ioctx; std::string m_pool_name; @@ -90,31 +90,30 @@ class AioTestDataPP if (m_init) { m_ioctx.close(); destroy_one_pool_pp(m_pool_name, m_cluster); - sem_destroy(&m_sem); + sem_close(m_sem); } } std::string init() { int ret; - if (sem_init(&m_sem, 0, 0)) { + if (SEM_FAILED == (m_sem = sem_open("test_aio_sem", O_CREAT, 0644, 0))) { int err = errno; - sem_destroy(&m_sem); ostringstream oss; - oss << "sem_init failed: " << cpp_strerror(err); + oss << "sem_open failed: " << cpp_strerror(err); return oss.str(); } m_pool_name = get_temp_pool_name(); std::string err = create_one_pool_pp(m_pool_name, m_cluster); if (!err.empty()) { - sem_destroy(&m_sem); + sem_close(m_sem); ostringstream oss; oss << "create_one_pool(" << m_pool_name << ") failed: error " << err; return oss.str(); } ret = m_cluster.ioctx_create(m_pool_name.c_str(), m_ioctx); if (ret) { - sem_destroy(&m_sem); + sem_close(m_sem); destroy_one_pool_pp(m_pool_name, m_cluster); ostringstream oss; oss << "rados_ioctx_create failed: error " << ret; @@ -124,7 +123,7 @@ class AioTestDataPP return ""; } - sem_t m_sem; + sem_t *m_sem; Rados m_cluster; IoCtx m_ioctx; std::string m_pool_name; @@ -137,28 +136,28 @@ void set_completion_complete(rados_completion_t cb, void *arg) { AioTestData *test = static_cast(arg); test->m_complete = true; - sem_post(&test->m_sem); + sem_post(test->m_sem); } void set_completion_safe(rados_completion_t cb, void *arg) { AioTestData *test = static_cast(arg); test->m_safe = true; - sem_post(&test->m_sem); + sem_post(test->m_sem); } void set_completion_completePP(rados_completion_t cb, void *arg) { AioTestDataPP *test = static_cast(arg); test->m_complete = true; - sem_post(&test->m_sem); + sem_post(test->m_sem); } void set_completion_safePP(rados_completion_t cb, void 
*arg) { AioTestDataPP *test = static_cast(arg); test->m_safe = true; - sem_post(&test->m_sem); + sem_post(test->m_sem); } TEST(LibRadosAio, TooBig) { @@ -203,8 +202,8 @@ TEST(LibRadosAio, SimpleWrite) { my_completion, buf, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, rados_aio_get_return_value(my_completion)); @@ -216,8 +215,8 @@ TEST(LibRadosAio, SimpleWrite) { my_completion2, buf, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, rados_aio_get_return_value(my_completion2)); rados_aio_release(my_completion); @@ -240,8 +239,8 @@ TEST(LibRadosAio, SimpleWritePP) { my_completion, bl1, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, my_completion->get_return_value()); delete my_completion; @@ -257,8 +256,8 @@ TEST(LibRadosAio, SimpleWritePP) { my_completion, bl1, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, my_completion->get_return_value()); delete my_completion; @@ -312,8 +311,8 @@ TEST(LibRadosAio, RoundTrip) { my_completion, buf, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, rados_aio_get_return_value(my_completion)); char buf2[256]; @@ -345,8 +344,8 @@ TEST(LibRadosAio, RoundTrip2) { my_completion, buf, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, rados_aio_get_return_value(my_completion)); char buf2[128]; @@ -381,8 +380,8 @@ 
TEST(LibRadosAio, RoundTripPP) { bl1, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, my_completion->get_return_value()); bufferlist bl2; @@ -417,8 +416,8 @@ TEST(LibRadosAio, RoundTripPP2) { bl1, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, my_completion->get_return_value()); bufferlist bl2; @@ -588,8 +587,8 @@ TEST(LibRadosAio, IsComplete) { my_completion, buf, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, rados_aio_get_return_value(my_completion)); char buf2[128]; @@ -631,8 +630,8 @@ TEST(LibRadosAio, IsCompletePP) { bl1, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, my_completion->get_return_value()); bufferlist bl2; @@ -1068,8 +1067,8 @@ TEST(LibRadosAio, SimpleStat) { my_completion, buf, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, rados_aio_get_return_value(my_completion)); uint64_t psize; @@ -1104,8 +1103,8 @@ TEST(LibRadosAio, SimpleStatPP) { bl1, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, my_completion->get_return_value()); uint64_t psize; @@ -1137,8 +1136,8 @@ TEST(LibRadosAio, SimpleStatNS) { my_completion, buf, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, 
rados_aio_get_return_value(my_completion)); rados_ioctx_set_namespace(test_data.m_ioctx, "nspace"); @@ -1150,8 +1149,8 @@ TEST(LibRadosAio, SimpleStatNS) { my_completion, buf2, sizeof(buf2), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, rados_aio_get_return_value(my_completion)); uint64_t psize; @@ -1202,8 +1201,8 @@ TEST(LibRadosAio, SimpleStatPPNS) { bl1, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, my_completion->get_return_value()); uint64_t psize; @@ -1235,8 +1234,8 @@ TEST(LibRadosAio, StatRemove) { my_completion, buf, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, rados_aio_get_return_value(my_completion)); uint64_t psize; @@ -1294,8 +1293,8 @@ TEST(LibRadosAio, StatRemovePP) { bl1, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, my_completion->get_return_value()); uint64_t psize; @@ -1661,31 +1660,30 @@ class AioTestDataEC if (m_init) { rados_ioctx_destroy(m_ioctx); destroy_one_ec_pool(m_pool_name, &m_cluster); - sem_destroy(&m_sem); + sem_close(m_sem); } } std::string init() { int ret; - if (sem_init(&m_sem, 0, 0)) { + if (SEM_FAILED == (m_sem = sem_open("test_aio_sem", O_CREAT, 0644, 0))) { int err = errno; - sem_destroy(&m_sem); ostringstream oss; - oss << "sem_init failed: " << cpp_strerror(err); + oss << "sem_open failed: " << cpp_strerror(err); return oss.str(); } m_pool_name = get_temp_pool_name(); std::string err = create_one_ec_pool(m_pool_name, &m_cluster); if (!err.empty()) { - sem_destroy(&m_sem); + sem_close(m_sem); ostringstream oss; oss << "create_one_ec_pool(" << 
m_pool_name << ") failed: error " << err; return oss.str(); } ret = rados_ioctx_create(m_cluster, m_pool_name.c_str(), &m_ioctx); if (ret) { - sem_destroy(&m_sem); + sem_close(m_sem); destroy_one_ec_pool(m_pool_name, &m_cluster); ostringstream oss; oss << "rados_ioctx_create failed: error " << ret; @@ -1695,7 +1693,7 @@ class AioTestDataEC return ""; } - sem_t m_sem; + sem_t *m_sem; rados_t m_cluster; rados_ioctx_t m_ioctx; std::string m_pool_name; @@ -1719,31 +1717,30 @@ class AioTestDataECPP if (m_init) { m_ioctx.close(); destroy_one_ec_pool_pp(m_pool_name, m_cluster); - sem_destroy(&m_sem); + sem_close(m_sem); } } std::string init() { int ret; - if (sem_init(&m_sem, 0, 0)) { + if (SEM_FAILED == (m_sem = sem_open("test_aio_sem", O_CREAT, 0644, 0))) { int err = errno; - sem_destroy(&m_sem); ostringstream oss; - oss << "sem_init failed: " << cpp_strerror(err); + oss << "sem_open failed: " << cpp_strerror(err); return oss.str(); } m_pool_name = get_temp_pool_name(); std::string err = create_one_ec_pool_pp(m_pool_name, m_cluster); if (!err.empty()) { - sem_destroy(&m_sem); + sem_close(m_sem); ostringstream oss; oss << "create_one_ec_pool(" << m_pool_name << ") failed: error " << err; return oss.str(); } ret = m_cluster.ioctx_create(m_pool_name.c_str(), m_ioctx); if (ret) { - sem_destroy(&m_sem); + sem_close(m_sem); destroy_one_ec_pool_pp(m_pool_name, m_cluster); ostringstream oss; oss << "rados_ioctx_create failed: error " << ret; @@ -1753,7 +1750,7 @@ class AioTestDataECPP return ""; } - sem_t m_sem; + sem_t *m_sem; Rados m_cluster; IoCtx m_ioctx; std::string m_pool_name; @@ -1766,28 +1763,28 @@ void set_completion_completeEC(rados_completion_t cb, void *arg) { AioTestDataEC *test = static_cast(arg); test->m_complete = true; - sem_post(&test->m_sem); + sem_post(test->m_sem); } void set_completion_safeEC(rados_completion_t cb, void *arg) { AioTestDataEC *test = static_cast(arg); test->m_safe = true; - sem_post(&test->m_sem); + sem_post(test->m_sem); } void 
set_completion_completeECPP(rados_completion_t cb, void *arg) { AioTestDataECPP *test = static_cast(arg); test->m_complete = true; - sem_post(&test->m_sem); + sem_post(test->m_sem); } void set_completion_safeECPP(rados_completion_t cb, void *arg) { AioTestDataECPP *test = static_cast(arg); test->m_safe = true; - sem_post(&test->m_sem); + sem_post(test->m_sem); } TEST(LibRadosAioEC, SimpleWrite) { @@ -1802,8 +1799,8 @@ TEST(LibRadosAioEC, SimpleWrite) { my_completion, buf, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, rados_aio_get_return_value(my_completion)); @@ -1815,8 +1812,8 @@ TEST(LibRadosAioEC, SimpleWrite) { my_completion2, buf, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, rados_aio_get_return_value(my_completion2)); rados_aio_release(my_completion); @@ -1839,8 +1836,8 @@ TEST(LibRadosAioEC, SimpleWritePP) { my_completion, bl1, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, my_completion->get_return_value()); delete my_completion; @@ -1856,8 +1853,8 @@ TEST(LibRadosAioEC, SimpleWritePP) { my_completion, bl1, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, my_completion->get_return_value()); delete my_completion; @@ -1911,8 +1908,8 @@ TEST(LibRadosAioEC, RoundTrip) { my_completion, buf, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, rados_aio_get_return_value(my_completion)); char buf2[256]; @@ -1944,8 +1941,8 @@ TEST(LibRadosAioEC, RoundTrip2) { 
my_completion, buf, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, rados_aio_get_return_value(my_completion)); char buf2[128]; @@ -1980,8 +1977,8 @@ TEST(LibRadosAioEC, RoundTripPP) { bl1, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, my_completion->get_return_value()); bufferlist bl2; @@ -2016,8 +2013,8 @@ TEST(LibRadosAioEC, RoundTripPP2) { bl1, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, my_completion->get_return_value()); bufferlist bl2; @@ -2232,8 +2229,8 @@ TEST(LibRadosAioEC, IsComplete) { my_completion, buf, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, rados_aio_get_return_value(my_completion)); char buf2[128]; @@ -2275,8 +2272,8 @@ TEST(LibRadosAioEC, IsCompletePP) { bl1, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, my_completion->get_return_value()); bufferlist bl2; @@ -2712,8 +2709,8 @@ TEST(LibRadosAioEC, SimpleStat) { my_completion, buf, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, rados_aio_get_return_value(my_completion)); uint64_t psize; @@ -2748,8 +2745,8 @@ TEST(LibRadosAioEC, SimpleStatPP) { bl1, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, my_completion->get_return_value()); uint64_t 
psize; @@ -2781,8 +2778,8 @@ TEST(LibRadosAioEC, SimpleStatNS) { my_completion, buf, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, rados_aio_get_return_value(my_completion)); rados_ioctx_set_namespace(test_data.m_ioctx, "nspace"); @@ -2794,8 +2791,8 @@ TEST(LibRadosAioEC, SimpleStatNS) { my_completion, buf2, sizeof(buf2), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, rados_aio_get_return_value(my_completion)); uint64_t psize; @@ -2846,8 +2843,8 @@ TEST(LibRadosAioEC, SimpleStatPPNS) { bl1, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, my_completion->get_return_value()); uint64_t psize; @@ -2879,8 +2876,8 @@ TEST(LibRadosAioEC, StatRemove) { my_completion, buf, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, rados_aio_get_return_value(my_completion)); uint64_t psize; @@ -2938,8 +2935,8 @@ TEST(LibRadosAioEC, StatRemovePP) { bl1, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } ASSERT_EQ(0, my_completion->get_return_value()); uint64_t psize; diff --git a/src/test/librados/watch_notify.cc b/src/test/librados/watch_notify.cc index 1d585e8bb6d71..c424fd84b88d5 100644 --- a/src/test/librados/watch_notify.cc +++ b/src/test/librados/watch_notify.cc @@ -5,6 +5,7 @@ #include "test/librados/TestCase.h" #include +#include #include #include "gtest/gtest.h" #include "include/encoding.h" @@ -21,12 +22,12 @@ typedef RadosTestECPP LibRadosWatchNotifyECPP; int notify_sleep = 0; // notify -static sem_t 
sem; +static sem_t *sem; static void watch_notify_test_cb(uint8_t opcode, uint64_t ver, void *arg) { std::cout << __func__ << std::endl; - sem_post(&sem); + sem_post(sem); } class WatchNotifyTestCtx : public WatchCtx @@ -35,7 +36,7 @@ class WatchNotifyTestCtx : public WatchCtx void notify(uint8_t opcode, uint64_t ver, bufferlist& bl) { std::cout << __func__ << std::endl; - sem_post(&sem); + sem_post(sem); } }; @@ -103,7 +104,7 @@ class WatchNotifyTestCtx2 : public WatchCtx2 #pragma GCC diagnostic ignored "-Wdeprecated-declarations" TEST_F(LibRadosWatchNotify, WatchNotify) { - ASSERT_EQ(0, sem_init(&sem, 0, 0)); + ASSERT_NE(SEM_FAILED, (sem = sem_open("test_watch_notify_sem", O_CREAT, 0644, 0))); char buf[128]; memset(buf, 0xcc, sizeof(buf)); ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0)); @@ -112,18 +113,18 @@ TEST_F(LibRadosWatchNotify, WatchNotify) { rados_watch(ioctx, "foo", 0, &handle, watch_notify_test_cb, NULL)); ASSERT_EQ(0, rados_notify(ioctx, "foo", 0, NULL, 0)); TestAlarm alarm; - sem_wait(&sem); + sem_wait(sem); rados_unwatch(ioctx, "foo", handle); // when dne ... 
ASSERT_EQ(-ENOENT, rados_watch(ioctx, "dne", 0, &handle, watch_notify_test_cb, NULL)); - sem_destroy(&sem); + sem_close(sem); } TEST_P(LibRadosWatchNotifyPP, WatchNotify) { - ASSERT_EQ(0, sem_init(&sem, 0, 0)); + ASSERT_NE(SEM_FAILED, (sem = sem_open("test_watch_notify_sem", O_CREAT, 0644, 0))); char buf[128]; memset(buf, 0xcc, sizeof(buf)); bufferlist bl1; @@ -138,13 +139,13 @@ TEST_P(LibRadosWatchNotifyPP, WatchNotify) { bufferlist bl2; ASSERT_EQ(0, ioctx.notify("foo", 0, bl2)); TestAlarm alarm; - sem_wait(&sem); + sem_wait(sem); ioctx.unwatch("foo", handle); - sem_destroy(&sem); + sem_close(sem); } TEST_F(LibRadosWatchNotifyEC, WatchNotify) { - ASSERT_EQ(0, sem_init(&sem, 0, 0)); + ASSERT_NE(SEM_FAILED, (sem = sem_open("test_watch_notify_sem", O_CREAT, 0644, 0))); char buf[128]; memset(buf, 0xcc, sizeof(buf)); ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0)); @@ -153,13 +154,13 @@ TEST_F(LibRadosWatchNotifyEC, WatchNotify) { rados_watch(ioctx, "foo", 0, &handle, watch_notify_test_cb, NULL)); ASSERT_EQ(0, rados_notify(ioctx, "foo", 0, NULL, 0)); TestAlarm alarm; - sem_wait(&sem); + sem_wait(sem); rados_unwatch(ioctx, "foo", handle); - sem_destroy(&sem); + sem_close(sem); } TEST_F(LibRadosWatchNotifyECPP, WatchNotify) { - ASSERT_EQ(0, sem_init(&sem, 0, 0)); + ASSERT_NE(SEM_FAILED, (sem = sem_open("test_watch_notify_sem", O_CREAT, 0644, 0))); char buf[128]; memset(buf, 0xcc, sizeof(buf)); bufferlist bl1; @@ -174,15 +175,15 @@ TEST_F(LibRadosWatchNotifyECPP, WatchNotify) { bufferlist bl2; ASSERT_EQ(0, ioctx.notify("foo", 0, bl2)); TestAlarm alarm; - sem_wait(&sem); + sem_wait(sem); ioctx.unwatch("foo", handle); - sem_destroy(&sem); + sem_close(sem); } // -- TEST_P(LibRadosWatchNotifyPP, WatchNotifyTimeout) { - ASSERT_EQ(0, sem_init(&sem, 0, 0)); + ASSERT_NE(SEM_FAILED, (sem = sem_open("test_watch_notify_sem", O_CREAT, 0644, 0))); ioctx.set_notify_timeout(1); uint64_t handle; WatchNotifyTestCtx ctx; @@ -194,12 +195,12 @@ TEST_P(LibRadosWatchNotifyPP, 
WatchNotifyTimeout) { ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0)); ASSERT_EQ(0, ioctx.watch("foo", 0, &handle, &ctx)); - sem_destroy(&sem); + sem_close(sem); ASSERT_EQ(0, ioctx.unwatch("foo", handle)); } TEST_F(LibRadosWatchNotifyECPP, WatchNotifyTimeout) { - ASSERT_EQ(0, sem_init(&sem, 0, 0)); + ASSERT_NE(SEM_FAILED, (sem = sem_open("test_watch_notify_sem", O_CREAT, 0644, 0))); ioctx.set_notify_timeout(1); uint64_t handle; WatchNotifyTestCtx ctx; @@ -211,7 +212,7 @@ TEST_F(LibRadosWatchNotifyECPP, WatchNotifyTimeout) { ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0)); ASSERT_EQ(0, ioctx.watch("foo", 0, &handle, &ctx)); - sem_destroy(&sem); + sem_close(sem); ASSERT_EQ(0, ioctx.unwatch("foo", handle)); } diff --git a/src/test/libradosstriper/aio.cc b/src/test/libradosstriper/aio.cc index 009976c85aec0..847e01141df3b 100644 --- a/src/test/libradosstriper/aio.cc +++ b/src/test/libradosstriper/aio.cc @@ -5,6 +5,7 @@ #include "test/librados/test.h" #include "test/libradosstriper/TestCase.h" +#include #include #include @@ -16,14 +17,14 @@ class AioTestData { public: AioTestData() : m_complete(false), m_safe(false) { - sem_init(&m_sem, 0, 0); + m_sem = sem_open("test_libradosstriper_aio_sem", O_CREAT, 0644, 0); } ~AioTestData() { - sem_destroy(&m_sem); + sem_close(m_sem); } - sem_t m_sem; + sem_t *m_sem; bool m_complete; bool m_safe; }; @@ -32,14 +33,14 @@ void set_completion_complete(rados_completion_t cb, void *arg) { AioTestData *test = static_cast(arg); test->m_complete = true; - sem_post(&test->m_sem); + sem_post(test->m_sem); } void set_completion_safe(rados_completion_t cb, void *arg) { AioTestData *test = static_cast(arg); test->m_safe = true; - sem_post(&test->m_sem); + sem_post(test->m_sem); } TEST_F(StriperTest, SimpleWrite) { @@ -51,8 +52,8 @@ TEST_F(StriperTest, SimpleWrite) { memset(buf, 0xcc, sizeof(buf)); ASSERT_EQ(0, rados_striper_aio_write(striper, "StriperTest", my_completion, buf, sizeof(buf), 0)); TestAlarm alarm; - 
sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); rados_aio_release(my_completion); } @@ -66,8 +67,8 @@ TEST_F(StriperTestPP, SimpleWritePP) { bl1.append(buf, sizeof(buf)); ASSERT_EQ(0, striper.aio_write("SimpleWritePP", my_completion, bl1, sizeof(buf), 0)); TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); my_completion->release(); } @@ -81,8 +82,8 @@ TEST_F(StriperTest, WaitForSafe) { ASSERT_EQ(0, rados_striper_aio_write(striper, "WaitForSafe", my_completion, buf, sizeof(buf), 0)); TestAlarm alarm; rados_aio_wait_for_safe(my_completion); - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); rados_aio_release(my_completion); } @@ -97,8 +98,8 @@ TEST_F(StriperTestPP, WaitForSafePP) { ASSERT_EQ(0, striper.aio_write("WaitForSafePP", my_completion, bl1, sizeof(buf), 0)); TestAlarm alarm; my_completion->wait_for_safe(); - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); my_completion->release(); } @@ -112,8 +113,8 @@ TEST_F(StriperTest, RoundTrip) { ASSERT_EQ(0, rados_striper_aio_write(striper, "RoundTrip", my_completion, buf, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } char buf2[128]; memset(buf2, 0, sizeof(buf2)); @@ -126,8 +127,8 @@ TEST_F(StriperTest, RoundTrip) { rados_aio_wait_for_complete(my_completion2); } ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf))); - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); rados_aio_release(my_completion); rados_aio_release(my_completion2); } @@ -142,8 +143,8 @@ TEST_F(StriperTest, RoundTrip2) { ASSERT_EQ(0, rados_striper_aio_write(striper, "RoundTrip2", my_completion, buf, 
sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } char buf2[128]; memset(buf2, 0, sizeof(buf2)); @@ -156,8 +157,8 @@ TEST_F(StriperTest, RoundTrip2) { rados_aio_wait_for_safe(my_completion2); } ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf))); - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); rados_aio_release(my_completion); rados_aio_release(my_completion2); } @@ -173,8 +174,8 @@ TEST_F(StriperTestPP, RoundTripPP) { ASSERT_EQ(0, striper.aio_write("RoundTripPP", my_completion, bl1, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } bufferlist bl2; AioCompletion *my_completion2 = librados::Rados::aio_create_completion @@ -185,8 +186,8 @@ TEST_F(StriperTestPP, RoundTripPP) { my_completion2->wait_for_complete(); } ASSERT_EQ(0, memcmp(buf, bl2.c_str(), sizeof(buf))); - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); my_completion->release(); my_completion2->release(); } @@ -202,8 +203,8 @@ TEST_F(StriperTestPP, RoundTripPP2) { ASSERT_EQ(0, striper.aio_write("RoundTripPP2", my_completion, bl1, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } bufferlist bl2; AioCompletion *my_completion2 = librados::Rados::aio_create_completion @@ -214,8 +215,8 @@ TEST_F(StriperTestPP, RoundTripPP2) { my_completion2->wait_for_safe(); } ASSERT_EQ(0, memcmp(buf, bl2.c_str(), sizeof(buf))); - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); my_completion->release(); my_completion2->release(); } @@ -230,8 +231,8 @@ TEST_F(StriperTest, IsComplete) { ASSERT_EQ(0, 
rados_striper_aio_write(striper, "IsComplete", my_completion, buf, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } char buf2[128]; memset(buf2, 0, sizeof(buf2)); @@ -250,8 +251,8 @@ TEST_F(StriperTest, IsComplete) { } } ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf))); - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); rados_aio_release(my_completion); rados_aio_release(my_completion2); } @@ -267,8 +268,8 @@ TEST_F(StriperTestPP, IsCompletePP) { ASSERT_EQ(0, striper.aio_write("IsCompletePP", my_completion, bl1, sizeof(buf), 0)); { TestAlarm alarm; - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); } bufferlist bl2; AioCompletion *my_completion2 = librados::Rados::aio_create_completion @@ -285,8 +286,8 @@ TEST_F(StriperTestPP, IsCompletePP) { } } ASSERT_EQ(0, memcmp(buf, bl2.c_str(), sizeof(buf))); - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); my_completion->release(); my_completion2->release(); } @@ -320,8 +321,8 @@ TEST_F(StriperTest, IsSafe) { rados_aio_wait_for_complete(my_completion2); } ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf))); - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); rados_aio_release(my_completion); rados_aio_release(my_completion2); } @@ -354,8 +355,8 @@ TEST_F(StriperTestPP, IsSafePP) { my_completion2->wait_for_complete(); } ASSERT_EQ(0, memcmp(buf, bl2.c_str(), sizeof(buf))); - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); my_completion->release(); my_completion2->release(); } @@ -393,8 +394,8 @@ TEST_F(StriperTest, RoundTripAppend) { ASSERT_EQ((int)(sizeof(buf) + sizeof(buf2)), 
rados_aio_get_return_value(my_completion3)); ASSERT_EQ(0, memcmp(buf3, buf, sizeof(buf))); ASSERT_EQ(0, memcmp(buf3 + sizeof(buf), buf2, sizeof(buf2))); - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); rados_aio_release(my_completion); rados_aio_release(my_completion2); rados_aio_release(my_completion3); @@ -435,8 +436,8 @@ TEST_F(StriperTestPP, RoundTripAppendPP) { ASSERT_EQ(sizeof(buf) + sizeof(buf2), (unsigned)my_completion3->get_return_value()); ASSERT_EQ(0, memcmp(bl3.c_str(), buf, sizeof(buf))); ASSERT_EQ(0, memcmp(bl3.c_str() + sizeof(buf), buf2, sizeof(buf2))); - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); my_completion->release(); my_completion2->release(); my_completion3->release(); @@ -462,8 +463,8 @@ TEST_F(StriperTest, Flush) { rados_aio_wait_for_complete(my_completion2); } ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf))); - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); rados_aio_release(my_completion); rados_aio_release(my_completion2); } @@ -487,8 +488,8 @@ TEST_F(StriperTestPP, FlushPP) { my_completion2->wait_for_complete(); } ASSERT_EQ(0, memcmp(buf, bl2.c_str(), sizeof(buf))); - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); my_completion->release(); my_completion2->release(); } @@ -525,8 +526,8 @@ TEST_F(StriperTest, RoundTripWriteFull) { } ASSERT_EQ(sizeof(buf2), (unsigned)rados_aio_get_return_value(my_completion3)); ASSERT_EQ(0, memcmp(buf3, buf2, sizeof(buf2))); - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); rados_aio_release(my_completion); rados_aio_release(my_completion2); rados_aio_release(my_completion3); @@ -566,8 +567,8 @@ TEST_F(StriperTestPP, RoundTripWriteFullPP) { } ASSERT_EQ(sizeof(buf2), 
(unsigned)my_completion3->get_return_value()); ASSERT_EQ(0, memcmp(bl3.c_str(), buf2, sizeof(buf2))); - sem_wait(&test_data.m_sem); - sem_wait(&test_data.m_sem); + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); my_completion->release(); my_completion2->release(); my_completion3->release(); diff --git a/src/test/test_stress_watch.cc b/src/test/test_stress_watch.cc index 1f9bed9be6ed7..9e66f0ecd66ee 100644 --- a/src/test/test_stress_watch.cc +++ b/src/test/test_stress_watch.cc @@ -23,7 +23,7 @@ using std::map; using std::ostringstream; using std::string; -static sem_t sem; +static sem_t *sem; static atomic_t stop_flag; class WatchNotifyTestCtx : public WatchCtx @@ -31,7 +31,7 @@ class WatchNotifyTestCtx : public WatchCtx public: void notify(uint8_t opcode, uint64_t ver, bufferlist& bl) { - sem_post(&sem); + sem_post(sem); } }; @@ -68,7 +68,7 @@ INSTANTIATE_TEST_CASE_P(WatchStressTests, WatchStress, ::testing::Values("", "cache")); TEST_P(WatchStress, Stress1) { - ASSERT_EQ(0, sem_init(&sem, 0, 0)); + ASSERT_NE(SEM_FAILED, (sem = sem_open("test_stress_watch", O_CREAT, 0644, 0))); Rados ncluster; std::string pool_name = get_temp_pool_name(); ASSERT_EQ("", create_one_pool_pp(pool_name, ncluster)); @@ -105,7 +105,7 @@ TEST_P(WatchStress, Stress1) { sleep(1); // Give a change to see an incorrect notify } else { TestAlarm alarm; - sem_wait(&sem); + sem_wait(sem); } if (do_blacklist) { @@ -119,7 +119,7 @@ TEST_P(WatchStress, Stress1) { thr->join(); nioctx.close(); ASSERT_EQ(0, destroy_one_pool_pp(pool_name, ncluster)); - sem_destroy(&sem); + sem_close(sem); } #pragma GCC diagnostic pop From e71269caf64e10d09103101f98bc5dfbd0985e1c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 30 Jul 2015 16:01:46 +0800 Subject: [PATCH 291/654] tests: disable unittest_blkdev on OSX the test uses sysfs, which does not exist on OSX Signed-off-by: Yan, Zheng --- src/test/Makefile.am | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/test/Makefile.am b/src/test/Makefile.am 
index 32e10851717c9..9d647624995ce 100644 --- a/src/test/Makefile.am +++ b/src/test/Makefile.am @@ -152,7 +152,9 @@ check_TESTPROGRAMS += unittest_addrs unittest_blkdev_SOURCES = test/common/test_blkdev.cc unittest_blkdev_CXXFLAGS = $(UNITTEST_CXXFLAGS) unittest_blkdev_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL) +if LINUX check_TESTPROGRAMS += unittest_blkdev +endif unittest_bloom_filter_SOURCES = test/common/test_bloom_filter.cc unittest_bloom_filter_CXXFLAGS = $(UNITTEST_CXXFLAGS) From 126ee7dcade528f461a5fd795854b550a3de74ec Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 30 Jul 2015 16:03:56 +0800 Subject: [PATCH 292/654] tests: replace std::tr1::shared_ptr with ceph::shared_ptr Signed-off-by: Yan, Zheng --- src/test/common/test_shared_cache.cc | 40 +++++++++--------- src/test/common/test_sharedptr_registry.cc | 48 +++++++++++----------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/src/test/common/test_shared_cache.cc b/src/test/common/test_shared_cache.cc index f54a2a36a6b50..09f6fb1dc17a0 100644 --- a/src/test/common/test_shared_cache.cc +++ b/src/test/common/test_shared_cache.cc @@ -32,7 +32,7 @@ class SharedLRUTest : public SharedLRU { public: Mutex &get_lock() { return lock; } Cond &get_cond() { return cond; } - map, int* > > &get_weak_refs() { + map, int* > > &get_weak_refs() { return weak_refs; } }; @@ -45,7 +45,7 @@ class SharedLRU_all : public ::testing::Test { SharedLRUTest &cache; unsigned int key; int value; - shared_ptr ptr; + ceph::shared_ptr ptr; enum in_method_t { LOOKUP, LOWER_BOUND } in_method; Thread_wait(SharedLRUTest& _cache, unsigned int _key, @@ -61,7 +61,7 @@ class SharedLRU_all : public ::testing::Test { ptr = cache.lower_bound(key); break; case LOOKUP: - ptr = shared_ptr(new int); + ptr = ceph::shared_ptr(new int); *ptr = value; ptr = cache.lookup(key); break; @@ -105,13 +105,13 @@ TEST_F(SharedLRU_all, add) { int value1 = 2; bool existed = false; { - shared_ptr ptr = cache.add(key, new int(value1), &existed); + 
ceph::shared_ptr ptr = cache.add(key, new int(value1), &existed); ASSERT_EQ(value1, *ptr); ASSERT_FALSE(existed); } { int value2 = 3; - shared_ptr ptr = cache.add(key, new int(value2), &existed); + ceph::shared_ptr ptr = cache.add(key, new int(value2), &existed); ASSERT_EQ(value1, *ptr); ASSERT_TRUE(existed); } @@ -124,7 +124,7 @@ TEST_F(SharedLRU_all, empty) { ASSERT_TRUE(cache.empty()); { int value1 = 2; - shared_ptr ptr = cache.add(key, new int(value1), &existed); + ceph::shared_ptr ptr = cache.add(key, new int(value1), &existed); ASSERT_EQ(value1, *ptr); ASSERT_FALSE(existed); } @@ -169,7 +169,7 @@ TEST_F(SharedLRU_all, wait_lookup) { int value = 2; { - shared_ptr ptr(new int); + ceph::shared_ptr ptr(new int); cache.get_weak_refs()[key] = make_pair(ptr, &*ptr); } EXPECT_FALSE(cache.get_weak_refs()[key].first.lock()); @@ -195,7 +195,7 @@ TEST_F(SharedLRU_all, wait_lookup_or_create) { int value = 2; { - shared_ptr ptr(new int); + ceph::shared_ptr ptr(new int); cache.get_weak_refs()[key] = make_pair(ptr, &*ptr); } EXPECT_FALSE(cache.get_weak_refs()[key].first.lock()); @@ -240,7 +240,7 @@ TEST_F(SharedLRU_all, wait_lower_bound) { ASSERT_TRUE(cache.add(other_key, new int(other_value)).get()); { - shared_ptr ptr(new int); + ceph::shared_ptr ptr(new int); cache.get_weak_refs()[key] = make_pair(ptr, &*ptr); } EXPECT_FALSE(cache.get_weak_refs()[key].first.lock()); @@ -272,15 +272,15 @@ TEST_F(SharedLRU_all, get_next) { SharedLRUTest cache; const unsigned int key2 = 333; - shared_ptr ptr2 = cache.lookup_or_create(key2); + ceph::shared_ptr ptr2 = cache.lookup_or_create(key2); const int value2 = *ptr2 = 400; // entries with expired pointers are silently ignored const unsigned int key_gone = 222; - cache.get_weak_refs()[key_gone] = make_pair(shared_ptr(), (int*)0); + cache.get_weak_refs()[key_gone] = make_pair(ceph::shared_ptr(), (int*)0); const unsigned int key1 = 111; - shared_ptr ptr1 = cache.lookup_or_create(key1); + ceph::shared_ptr ptr1 = cache.lookup_or_create(key1); 
const int value1 = *ptr1 = 800; pair i; @@ -299,11 +299,11 @@ TEST_F(SharedLRU_all, get_next) { { SharedLRUTest cache; const unsigned int key1 = 111; - shared_ptr *ptr1 = new shared_ptr(cache.lookup_or_create(key1)); + ceph::shared_ptr *ptr1 = new shared_ptr(cache.lookup_or_create(key1)); const unsigned int key2 = 222; - shared_ptr ptr2 = cache.lookup_or_create(key2); + ceph::shared_ptr ptr2 = cache.lookup_or_create(key2); - pair > i; + pair > i; EXPECT_TRUE(cache.get_next(i.first, &i)); EXPECT_EQ(key1, i.first); delete ptr1; @@ -354,7 +354,7 @@ TEST(SharedCache_all, add) { SharedLRU cache; unsigned int key = 1; int value = 2; - shared_ptr ptr = cache.add(key, new int(value)); + ceph::shared_ptr ptr = cache.add(key, new int(value)); ASSERT_EQ(ptr, cache.lookup(key)); ASSERT_EQ(value, *cache.lookup(key)); } @@ -364,11 +364,11 @@ TEST(SharedCache_all, lru) { SharedLRU cache(NULL, SIZE); bool existed = false; - shared_ptr ptr = cache.add(0, new int(0), &existed); + ceph::shared_ptr ptr = cache.add(0, new int(0), &existed); ASSERT_FALSE(existed); { int *tmpint = new int(0); - shared_ptr ptr2 = cache.add(0, tmpint, &existed); + ceph::shared_ptr ptr2 = cache.add(0, tmpint, &existed); ASSERT_TRUE(existed); delete tmpint; } @@ -387,9 +387,9 @@ TEST(SharedCache_all, lru) { cache.purge(0); ASSERT_FALSE(cache.lookup(0)); - shared_ptr ptr2 = cache.add(0, new int(0), &existed); + ceph::shared_ptr ptr2 = cache.add(0, new int(0), &existed); ASSERT_FALSE(ptr == ptr2); - ptr = shared_ptr(); + ptr = ceph::shared_ptr(); ASSERT_TRUE(cache.lookup(0).get()); } diff --git a/src/test/common/test_sharedptr_registry.cc b/src/test/common/test_sharedptr_registry.cc index 7b06b7e163f6d..42bc8e607acac 100644 --- a/src/test/common/test_sharedptr_registry.cc +++ b/src/test/common/test_sharedptr_registry.cc @@ -30,7 +30,7 @@ class SharedPtrRegistryTest : public SharedPtrRegistry { public: Mutex &get_lock() { return lock; } - map, int*> > &get_contents() { + map, int*> > &get_contents() { return 
contents; } }; @@ -43,7 +43,7 @@ class SharedPtrRegistry_all : public ::testing::Test { SharedPtrRegistryTest ®istry; unsigned int key; int value; - shared_ptr ptr; + ceph::shared_ptr ptr; enum in_method_t { LOOKUP, LOOKUP_OR_CREATE } in_method; Thread_wait(SharedPtrRegistryTest& _registry, unsigned int _key, int _value, in_method_t _in_method) : @@ -63,7 +63,7 @@ class SharedPtrRegistry_all : public ::testing::Test { ptr = registry.lookup_or_create(key); break; case LOOKUP: - ptr = shared_ptr(new int); + ptr = ceph::shared_ptr(new int); *ptr = value; ptr = registry.lookup(key); break; @@ -103,7 +103,7 @@ TEST_F(SharedPtrRegistry_all, lookup_or_create) { SharedPtrRegistryTest registry; unsigned int key = 1; int value = 2; - shared_ptr ptr = registry.lookup_or_create(key); + ceph::shared_ptr ptr = registry.lookup_or_create(key); *ptr = value; ASSERT_EQ(value, *registry.lookup_or_create(key)); } @@ -124,7 +124,7 @@ TEST_F(SharedPtrRegistry_all, wait_lookup_or_create) { { unsigned int key = 1; { - shared_ptr ptr(new int); + ceph::shared_ptr ptr(new int); registry.get_contents()[key] = make_pair(ptr, ptr.get()); } EXPECT_FALSE(registry.get_contents()[key].first.lock()); @@ -144,7 +144,7 @@ TEST_F(SharedPtrRegistry_all, wait_lookup_or_create) { unsigned int key = 2; int value = 3; { - shared_ptr ptr(new int); + ceph::shared_ptr ptr(new int); registry.get_contents()[key] = make_pair(ptr, ptr.get()); } EXPECT_FALSE(registry.get_contents()[key].first.lock()); @@ -157,7 +157,7 @@ TEST_F(SharedPtrRegistry_all, wait_lookup_or_create) { { int other_value = value + 1; unsigned int other_key = key + 1; - shared_ptr ptr = registry.lookup_or_create(other_key, other_value); + ceph::shared_ptr ptr = registry.lookup_or_create(other_key, other_value); EXPECT_TRUE(ptr.get()); EXPECT_EQ(other_value, *ptr); } @@ -173,7 +173,7 @@ TEST_F(SharedPtrRegistry_all, lookup) { SharedPtrRegistryTest registry; unsigned int key = 1; { - shared_ptr ptr = registry.lookup_or_create(key); + 
ceph::shared_ptr ptr = registry.lookup_or_create(key); int value = 2; *ptr = value; ASSERT_EQ(value, *registry.lookup(key)); @@ -187,7 +187,7 @@ TEST_F(SharedPtrRegistry_all, wait_lookup) { unsigned int key = 1; int value = 2; { - shared_ptr ptr(new int); + ceph::shared_ptr ptr(new int); registry.get_contents()[key] = make_pair(ptr, ptr.get()); } EXPECT_FALSE(registry.get_contents()[key].first.lock()); @@ -216,15 +216,15 @@ TEST_F(SharedPtrRegistry_all, get_next) { SharedPtrRegistryTest registry; const unsigned int key2 = 333; - shared_ptr ptr2 = registry.lookup_or_create(key2); + ceph::shared_ptr ptr2 = registry.lookup_or_create(key2); const int value2 = *ptr2 = 400; // entries with expired pointers are silentely ignored const unsigned int key_gone = 222; - registry.get_contents()[key_gone] = make_pair(shared_ptr(), (int*)0); + registry.get_contents()[key_gone] = make_pair(ceph::shared_ptr(), (int*)0); const unsigned int key1 = 111; - shared_ptr ptr1 = registry.lookup_or_create(key1); + ceph::shared_ptr ptr1 = registry.lookup_or_create(key1); const int value1 = *ptr1 = 800; pair i; @@ -245,11 +245,11 @@ TEST_F(SharedPtrRegistry_all, get_next) { // SharedPtrRegistryTest registry; const unsigned int key1 = 111; - shared_ptr *ptr1 = new shared_ptr(registry.lookup_or_create(key1)); + ceph::shared_ptr *ptr1 = new ceph::shared_ptr(registry.lookup_or_create(key1)); const unsigned int key2 = 222; - shared_ptr ptr2 = registry.lookup_or_create(key2); + ceph::shared_ptr ptr2 = registry.lookup_or_create(key2); - pair > i; + pair > i; EXPECT_TRUE(registry.get_next(i.first, &i)); EXPECT_EQ(key1, i.first); delete ptr1; @@ -262,15 +262,15 @@ TEST_F(SharedPtrRegistry_all, remove) { { SharedPtrRegistryTest registry; const unsigned int key1 = 1; - shared_ptr ptr1 = registry.lookup_or_create(key1); + ceph::shared_ptr ptr1 = registry.lookup_or_create(key1); *ptr1 = 400; registry.remove(key1); - shared_ptr ptr2 = registry.lookup_or_create(key1); + ceph::shared_ptr ptr2 = 
registry.lookup_or_create(key1); *ptr2 = 500; - ptr1 = shared_ptr(); - shared_ptr res = registry.lookup(key1); + ptr1 = ceph::shared_ptr(); + ceph::shared_ptr res = registry.lookup(key1); assert(res); assert(res == ptr2); assert(*res == 500); @@ -278,13 +278,13 @@ TEST_F(SharedPtrRegistry_all, remove) { { SharedPtrRegistryTest registry; const unsigned int key1 = 1; - shared_ptr ptr1 = registry.lookup_or_create(key1, 400); + ceph::shared_ptr ptr1 = registry.lookup_or_create(key1, 400); registry.remove(key1); - shared_ptr ptr2 = registry.lookup_or_create(key1, 500); + ceph::shared_ptr ptr2 = registry.lookup_or_create(key1, 500); - ptr1 = shared_ptr(); - shared_ptr res = registry.lookup(key1); + ptr1 = ceph::shared_ptr(); + ceph::shared_ptr res = registry.lookup(key1); assert(res); assert(res == ptr2); assert(*res == 500); @@ -316,7 +316,7 @@ TEST_F(SharedPtrRegistry_destructor, destructor) { EXPECT_EQ(UNDEFINED, died); int key = 101; { - shared_ptr a = registry.lookup_or_create(key); + ceph::shared_ptr a = registry.lookup_or_create(key); EXPECT_EQ(NO, died); EXPECT_TRUE(a.get()); } From 6a03fa5bbe5b2c9ae6d45cd190b26e2178bef79d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 30 Jul 2015 18:15:18 +0800 Subject: [PATCH 293/654] TestLFNIndex.cc: don't use "cp --preserve=xattr" --preserve option is not supported by OSX's cp. 
use 'cp -a' instead Signed-off-by: Yan, Zheng --- src/test/os/TestLFNIndex.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/os/TestLFNIndex.cc b/src/test/os/TestLFNIndex.cc index 37733af57c900..5e44355c1e7af 100644 --- a/src/test/os/TestLFNIndex.cc +++ b/src/test/os/TestLFNIndex.cc @@ -308,7 +308,7 @@ TEST_F(TestLFNIndex, remove_object) { std::string mangled_name_1 = mangled_name; mangled_name_1.replace(mangled_name_1.find("0_long"), 6, "1_long"); const std::string pathname_1("PATH_1/" + mangled_name_1); - const std::string cmd("cp --preserve=xattr " + pathname + " " + pathname_1); + const std::string cmd("cp -a " + pathname + " " + pathname_1); EXPECT_EQ(0, ::system(cmd.c_str())); const string ATTR = "user.MARK"; EXPECT_EQ((unsigned)1, (unsigned)chain_setxattr(pathname_1.c_str(), ATTR.c_str(), "Y", 1)); From f6fa4a28d1ef3ff8f1ed84b64fc10d083a4b336c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 31 Jul 2015 10:01:01 +0800 Subject: [PATCH 294/654] compat: move definitions in porting.h into include/compat.h Signed-off-by: Yan, Zheng --- src/Makefile.am | 3 +-- src/include/Makefile.am | 1 + src/include/compat.h | 15 ++++++++++++++ src/{porting.h => include/sock_compat.h} | 25 ++++-------------------- src/librbd/AsyncRequest.h | 2 +- src/msg/async/AsyncConnection.cc | 2 +- src/msg/simple/Pipe.cc | 2 +- src/rbd_fuse/rbd-fuse.cc | 6 +----- src/tools/ceph_objectstore_tool.cc | 1 - 9 files changed, 25 insertions(+), 32 deletions(-) rename src/{porting.h => include/sock_compat.h} (50%) diff --git a/src/Makefile.am b/src/Makefile.am index 5cf555707f3a0..502f83a014fc4 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -211,8 +211,7 @@ noinst_HEADERS += \ bash_completion/radosgw-admin \ mount/canonicalize.c \ mount/mtab.c \ - objclass/objclass.h \ - porting.h + objclass/objclass.h # coverage diff --git a/src/include/Makefile.am b/src/include/Makefile.am index 6369312d8e6f4..a364b295d6c25 100644 --- a/src/include/Makefile.am +++ 
b/src/include/Makefile.am @@ -61,6 +61,7 @@ noinst_HEADERS += \ include/cmp.h \ include/color.h \ include/compat.h \ + include/sock_compat.h \ include/crc32c.h \ include/encoding.h \ include/err.h \ diff --git a/src/include/compat.h b/src/include/compat.h index caabe10df0e36..9cb4f61b5327b 100644 --- a/src/include/compat.h +++ b/src/include/compat.h @@ -17,6 +17,21 @@ #define MSG_MORE 0 #endif /* !__FreeBSD__ */ +#if defined(__APPLE__) +/* PATH_MAX */ +#include + +/* O_LARGEFILE is not defined/required on OS X */ +#ifndef O_LARGEFILE +#define O_LARGEFILE 0 +#endif + +/* Could be relevant for other platforms */ +#ifndef ERESTART +#define ERESTART EINTR +#endif +#endif /* __APPLE__ */ + #ifndef TEMP_FAILURE_RETRY #define TEMP_FAILURE_RETRY(expression) ({ \ typeof(expression) __result; \ diff --git a/src/porting.h b/src/include/sock_compat.h similarity index 50% rename from src/porting.h rename to src/include/sock_compat.h index 1528ffb0f020a..5faacc343edc6 100644 --- a/src/porting.h +++ b/src/include/sock_compat.h @@ -1,23 +1,7 @@ +#ifndef CEPH_SOCK_COMPAT_H +#define CEPH_SOCK_COMPAT_H -#ifndef PORTING_H -#define PORTING_H -#include "acconfig.h" - -/* TODO: move these into include/compat.h */ - -#if defined(DARWIN) -#include - -/* O_LARGEFILE is not defined/required on OS X */ -#define O_LARGEFILE 0 - -/* Wonder why this is missing */ -#define PATH_MAX 1024 - -/* Could be relevant for other platforms */ -#ifndef ERESTART -#define ERESTART EINTR -#endif +#include "include/compat.h" /* * This optimization may not be available on all platforms (e.g. OSX). 
@@ -39,5 +23,4 @@ # endif #endif -#endif /* DARWIN */ -#endif /* PORTING_H */ +#endif diff --git a/src/librbd/AsyncRequest.h b/src/librbd/AsyncRequest.h index b6594bf8898e4..c0a60131106bc 100644 --- a/src/librbd/AsyncRequest.h +++ b/src/librbd/AsyncRequest.h @@ -9,7 +9,7 @@ #include "include/xlist.h" /* DARWIN Missing ERESTART */ -#include "porting.h" +#include "include/compat.h" namespace librbd { diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc index a8a47e568dca1..97bb17ab20deb 100644 --- a/src/msg/async/AsyncConnection.cc +++ b/src/msg/async/AsyncConnection.cc @@ -23,7 +23,7 @@ #include "AsyncMessenger.h" #include "AsyncConnection.h" -#include "porting.h" +#include "include/sock_compat.h" // Constant to limit starting sequence number to 2^31. Nothing special about it, just a big number. PLR #define SEQ_MASK 0x7fffffff diff --git a/src/msg/simple/Pipe.cc b/src/msg/simple/Pipe.cc index 42995840d45a0..2292af9a54dd4 100644 --- a/src/msg/simple/Pipe.cc +++ b/src/msg/simple/Pipe.cc @@ -32,7 +32,7 @@ #include "auth/cephx/CephxProtocol.h" #include "auth/AuthSessionHandler.h" -#include "porting.h" +#include "include/sock_compat.h" // Constant to limit starting sequence number to 2^31. Nothing special about it, just a big number. 
PLR #define SEQ_MASK 0x7fffffff diff --git a/src/rbd_fuse/rbd-fuse.cc b/src/rbd_fuse/rbd-fuse.cc index f7ac46c767783..5cc89002c4f95 100644 --- a/src/rbd_fuse/rbd-fuse.cc +++ b/src/rbd_fuse/rbd-fuse.cc @@ -11,12 +11,7 @@ #include #include #include -#if defined(DARWIN) #include -#include "porting.h" -#else -#include -#endif #include #include #include @@ -24,6 +19,7 @@ #include #include +#include "include/compat.h" #include "include/rbd/librbd.h" static int gotrados = 0; diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc index dd29779fd7b86..a2dbccbc5fc20 100644 --- a/src/tools/ceph_objectstore_tool.cc +++ b/src/tools/ceph_objectstore_tool.cc @@ -36,7 +36,6 @@ #include "json_spirit/json_spirit_reader.h" #include "ceph_objectstore_tool.h" -#include "porting.h" namespace po = boost::program_options; using namespace std; From b06838a09db6bfc6ced4cbb124084a064afdc564 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 31 Jul 2015 17:18:58 +0800 Subject: [PATCH 295/654] test/admin_socket: check error message according to OS Signed-off-by: Yan, Zheng --- src/test/admin_socket.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/test/admin_socket.cc b/src/test/admin_socket.cc index b26acb8908d53..cee215d2e96f6 100644 --- a/src/test/admin_socket.cc +++ b/src/test/admin_socket.cc @@ -217,7 +217,12 @@ TEST(AdminSocketClient, Ping) { { bool ok; std::string result = client.ping(&ok); - EXPECT_NE(std::string::npos, result.find("Connection refused")); +#if defined(__APPLE__) || defined(__FreeBSD__) + const char* errmsg = "Socket operation on non-socket"; +#else + const char* errmsg = "Connection refused"; +#endif + EXPECT_NE(std::string::npos, result.find(errmsg)); ASSERT_FALSE(ok); } // a daemon is connected to the socket From c092b4fcc4ca1b8d66f93ca326f3abeba85619b0 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 31 Jul 2015 17:20:14 +0800 Subject: [PATCH 296/654] os/chain_xattr: set CHAIN_XATTR_MAX_NAME_LEN 
according to max length of xattr name Signed-off-by: Yan, Zheng --- src/os/chain_xattr.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/os/chain_xattr.h b/src/os/chain_xattr.h index 2d77568012ce0..65460b2aa6e4b 100644 --- a/src/os/chain_xattr.h +++ b/src/os/chain_xattr.h @@ -8,7 +8,16 @@ #include +#if defined(__linux__) +#include +#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_NAME_MAX + 1) / 2) +#elif defined(__APPLE__) || defined(__FreeBSD__) +#include +#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_MAXNAMELEN + 1) / 2) +#else #define CHAIN_XATTR_MAX_NAME_LEN 128 +#endif + #define CHAIN_XATTR_MAX_BLOCK_LEN 2048 /* From 2cd7d4fca173e2ac6b4f10bd72a2b4c41e9612b5 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 14 Aug 2015 21:56:16 +0800 Subject: [PATCH 297/654] tests: only use posix_fadvise on linux Signed-off-by: Yan, Zheng --- src/tools/rados/PoolDump.cc | 3 ++- src/tools/rados/RadosImport.cc | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/tools/rados/PoolDump.cc b/src/tools/rados/PoolDump.cc index 69963ee264d78..5d0b3eda6d84e 100644 --- a/src/tools/rados/PoolDump.cc +++ b/src/tools/rados/PoolDump.cc @@ -161,8 +161,9 @@ int PoolDump::dump(IoCtx *io_ctx) } r = write_simple(TYPE_POOL_END, file_fd); +#if defined(__linux__) if (file_fd != STDOUT_FILENO) posix_fadvise(file_fd, 0, 0, POSIX_FADV_DONTNEED); - +#endif return r; } diff --git a/src/tools/rados/RadosImport.cc b/src/tools/rados/RadosImport.cc index 32fee3811647b..1f74af2c86cdc 100644 --- a/src/tools/rados/RadosImport.cc +++ b/src/tools/rados/RadosImport.cc @@ -110,8 +110,10 @@ int RadosImport::import(librados::IoCtx &io_ctx, bool no_overwrite) } #endif +#if defined(__linux__) if (file_fd != STDIN_FILENO) posix_fadvise(file_fd, 0, 0, POSIX_FADV_SEQUENTIAL); +#endif bool done = false; bool found_metadata = false; @@ -152,8 +154,10 @@ int RadosImport::import(librados::IoCtx &io_ctx, bool no_overwrite) cerr << "Missing metadata section!" 
<< std::endl; } +#if defined(__linux__) if (file_fd != STDIN_FILENO) posix_fadvise(file_fd, 0, 0, POSIX_FADV_DONTNEED); +#endif return 0; } From 8d527d41673e3ec33a003cf0a7155197d492dbe9 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 14 Aug 2015 22:06:11 +0800 Subject: [PATCH 298/654] common/admin_socket: fix compile error on OSX Invalid operands to binary expression ('__bind' and 'int') Without :: clang confuses C bind function and std::bind(). Signed-off-by: Yan, Zheng --- src/common/OutputDataSocket.cc | 4 ++-- src/common/admin_socket.cc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/common/OutputDataSocket.cc b/src/common/OutputDataSocket.cc index 2c4526ddaf9a9..e43f5cf95cb89 100644 --- a/src/common/OutputDataSocket.cc +++ b/src/common/OutputDataSocket.cc @@ -179,14 +179,14 @@ std::string OutputDataSocket::bind_and_listen(const std::string &sock_path, int address.sun_family = AF_UNIX; snprintf(address.sun_path, sizeof(address.sun_path), "%s", sock_path.c_str()); - if (bind(sock_fd, (struct sockaddr*)&address, + if (::bind(sock_fd, (struct sockaddr*)&address, sizeof(struct sockaddr_un)) != 0) { int err = errno; if (err == EADDRINUSE) { // The old UNIX domain socket must still be there. // Let's unlink it and try again. 
VOID_TEMP_FAILURE_RETRY(unlink(sock_path.c_str())); - if (bind(sock_fd, (struct sockaddr*)&address, + if (::bind(sock_fd, (struct sockaddr*)&address, sizeof(struct sockaddr_un)) == 0) { err = 0; } diff --git a/src/common/admin_socket.cc b/src/common/admin_socket.cc index 1df5105451373..07a2246e9e552 100644 --- a/src/common/admin_socket.cc +++ b/src/common/admin_socket.cc @@ -209,7 +209,7 @@ std::string AdminSocket::bind_and_listen(const std::string &sock_path, int *fd) address.sun_family = AF_UNIX; snprintf(address.sun_path, sizeof(address.sun_path), "%s", sock_path.c_str()); - if (bind(sock_fd, (struct sockaddr*)&address, + if (::bind(sock_fd, (struct sockaddr*)&address, sizeof(struct sockaddr_un)) != 0) { int err = errno; if (err == EADDRINUSE) { @@ -222,7 +222,7 @@ std::string AdminSocket::bind_and_listen(const std::string &sock_path, int *fd) } else { ldout(m_cct, 20) << "unlink stale file " << sock_path << dendl; VOID_TEMP_FAILURE_RETRY(unlink(sock_path.c_str())); - if (bind(sock_fd, (struct sockaddr*)&address, + if (::bind(sock_fd, (struct sockaddr*)&address, sizeof(struct sockaddr_un)) == 0) { err = 0; } else { From 0b94867a937ad08d50c9acd3eed68f2c75d13f49 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 14 Aug 2015 22:27:16 +0800 Subject: [PATCH 299/654] common/blkdev: fix complie error on OSX/FreeBSD Signed-off-by: Yan, Zheng --- src/common/blkdev.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/blkdev.cc b/src/common/blkdev.cc index f009dd13757ca..f013a7b7c88cd 100644 --- a/src/common/blkdev.cc +++ b/src/common/blkdev.cc @@ -19,10 +19,10 @@ #include #include #include "include/int_types.h" +#include "include/uuid.h" #ifdef __linux__ #include -#include "include/uuid.h" #include #define UUID_LEN 36 From 68db9f6fdaefe07314743c0caafc9935d38cc640 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 14 Aug 2015 22:52:09 +0800 Subject: [PATCH 300/654] client: fix compile error on OSX On OSX, type of the 3rd parameters of 
getgrouplist is 'int *' Signed-off-by: Yan, Zheng --- src/client/Client.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/client/Client.cc b/src/client/Client.cc index 9fbf72ddc5e44..defcf99790f6f 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -95,6 +95,7 @@ using namespace std; #if HAVE_GETGROUPLIST #include #include +#include #endif #undef dout_prefix @@ -4582,7 +4583,11 @@ int Client::check_permissions(Inode *in, int flags, int uid, int gid) return -EACCES; } while (1) { +#if defined(__APPLE__) + if (getgrouplist(pw->pw_name, gid, (int *)sgids, &sgid_count) == -1) { +#else if (getgrouplist(pw->pw_name, gid, sgids, &sgid_count) == -1) { +#endif // we need to resize the group list and try again sgids = (gid_t*)realloc(sgids, sgid_count * sizeof(gid_t)); if (sgids == NULL) { From d36e514935550a770570309f66de7e8d9b6f3241 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 17 Aug 2015 10:44:44 +0800 Subject: [PATCH 301/654] ceph: use 'sed -ie' to edit file in-place On OSX, 'sed -i script' does not work, because it considers 'script' as suffix of backup file. 'sed -ie script' works on both OSX and Linux. 
Signed-off-by: Yan, Zheng --- src/Makefile-client.am | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Makefile-client.am b/src/Makefile-client.am index 7e8c7165f441f..dcd18359962e8 100644 --- a/src/Makefile-client.am +++ b/src/Makefile-client.am @@ -20,9 +20,9 @@ bin_PROGRAMS += ceph-syn ceph: ceph.in ./ceph_ver.h Makefile rm -f $@ $@.tmp cp $@.in $@.tmp - sed -i "s|@PYTHON_EXECUTABLE@|/usr/bin/env python|" $@.tmp - grep CEPH_GIT_NICE_VER ./ceph_ver.h | cut -f 3 -d " " | sed s/\"//g | xargs -I "{}" sed -i "s/@CEPH_GIT_NICE_VER@/{}/g" $@.tmp - grep CEPH_GIT_VER ./ceph_ver.h | cut -f 3 -d " " | sed s/\"//g | xargs -I "{}" sed -i "s/@CEPH_GIT_VER@/{}/g" $@.tmp + sed -ie "s|@PYTHON_EXECUTABLE@|/usr/bin/env python|" $@.tmp + grep CEPH_GIT_NICE_VER ./ceph_ver.h | cut -f 3 -d " " | sed s/\"//g | xargs -I "{}" sed -ie "s/@CEPH_GIT_NICE_VER@/{}/g" $@.tmp + grep CEPH_GIT_VER ./ceph_ver.h | cut -f 3 -d " " | sed s/\"//g | xargs -I "{}" sed -ie "s/@CEPH_GIT_VER@/{}/g" $@.tmp cat $(srcdir)/$@.in >>$@.tmp chmod a+x $@.tmp chmod a-w $@.tmp From 491d8939bb1867fbcc87a955dc5a9241bd8a4c9b Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 17 Aug 2015 10:51:58 +0800 Subject: [PATCH 302/654] test_c_headers: don't use -Werror option for clang When using clang to compile ceph, there are lots of warnings: clang: warning: argument unused during compilation: '-Wp,-D_FORTIFY_SOURCE=2' Signed-off-by: Yan, Zheng --- src/test/Makefile-client.am | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/test/Makefile-client.am b/src/test/Makefile-client.am index aef841d156560..57eaa613a6965 100644 --- a/src/test/Makefile-client.am +++ b/src/test/Makefile-client.am @@ -442,7 +442,6 @@ unittest_encoding_SOURCES = test/encoding.cc ceph_test_c_headers_SOURCES = test/test_c_headers.c ceph_test_c_headers_LDADD = $(LIBRADOS) $(LIBCEPHFS) ceph_test_c_headers_CFLAGS = $(AM_CFLAGS) \ - -Werror \ -Wstrict-prototypes \ -Wredundant-decls \ -Wall \ @@ -460,7 +459,7 
@@ ceph_test_c_headers_CFLAGS = $(AM_CFLAGS) \ -Wold-style-definition \ -Wtype-limits if !CLANG -ceph_test_c_headers_CFLAGS += -Wold-style-declaration +ceph_test_c_headers_CFLAGS += -Werror -Wold-style-declaration endif # !CLANG bin_DEBUGPROGRAMS += ceph_test_c_headers From 8ef07420b192eb882bd96a0e2134a9e8e690a294 Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Sat, 29 Aug 2015 20:53:40 +0800 Subject: [PATCH 303/654] os/fs: include on osx Signed-off-by: Kefu Chai --- src/os/fs/FS.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/os/fs/FS.cc b/src/os/fs/FS.cc index c7412d657040a..78392f6a30767 100644 --- a/src/os/fs/FS.cc +++ b/src/os/fs/FS.cc @@ -14,7 +14,6 @@ #include #include -#include #include #include @@ -28,6 +27,12 @@ #include "XFS.h" #include "acconfig.h" +#ifdef DARWIN +#include +#else +#include +#endif +#include "include/compat.h" // --------------- From 109e5b127b1072989625e7f7c775cb900bee7d5a Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Sat, 29 Aug 2015 21:33:31 +0800 Subject: [PATCH 304/654] make: do not compile XFS.cc if --without-libxfs Signed-off-by: Kefu Chai --- src/os/Makefile.am | 5 +++-- src/os/fs/FS.cc | 5 ++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/os/Makefile.am b/src/os/Makefile.am index ba80fd356db83..f1c68431eb827 100644 --- a/src/os/Makefile.am +++ b/src/os/Makefile.am @@ -8,7 +8,6 @@ if ENABLE_SERVER libos_la_SOURCES = \ os/chain_xattr.cc \ os/fs/FS.cc \ - os/fs/XFS.cc \ os/DBObjectMap.cc \ os/GenericObjectMap.cc \ os/FileJournal.cc \ @@ -31,7 +30,9 @@ libos_la_SOURCES += os/BtrfsFileStoreBackend.cc endif if WITH_LIBXFS -libos_la_SOURCES += os/XfsFileStoreBackend.cc +libos_la_SOURCES += \ + os/fs/XFS.cc \ + os/XfsFileStoreBackend.cc endif if WITH_LIBZFS diff --git a/src/os/fs/FS.cc b/src/os/fs/FS.cc index 78392f6a30767..5df8adf7d79e3 100644 --- a/src/os/fs/FS.cc +++ b/src/os/fs/FS.cc @@ -24,9 +24,12 @@ #include "FS.h" +#include "acconfig.h" + +#ifdef HAVE_LIBXFS #include "XFS.h" 
+#endif -#include "acconfig.h" #ifdef DARWIN #include #else From fe8b1c977fb8df5af81ee0368ebd199b6e8ea65f Mon Sep 17 00:00:00 2001 From: xinxin shu Date: Sun, 16 Aug 2015 09:12:56 +0800 Subject: [PATCH 305/654] in filestore, OP_SETATTR is implemented in FileStore::_setattrs this funtion will get all attrs of object before setting new attrs merge several OP_SETATTR ops into one OP_SETATTRS in one ceph trasaction, which will reduce counts of getting all attrs Signed-off-by: xinxin shu --- src/osd/ReplicatedPG.cc | 39 +++++++++++++++++++++++++++++++-------- src/osd/ReplicatedPG.h | 5 +++++ 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index c29a2afa706d5..c721c2f75c061 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -3240,13 +3240,15 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid) ctx->snapset_obc->obs.oi.version; ctx->snapset_obc->obs.oi.version = ctx->at_version; + map attrs; bl.clear(); ::encode(snapset, bl); - setattr_maybe_cache(ctx->snapset_obc, ctx, t, SS_ATTR, bl); + attrs[SS_ATTR].claim(bl); bl.clear(); ::encode(ctx->snapset_obc->obs.oi, bl); - setattr_maybe_cache(ctx->snapset_obc, ctx, t, OI_ATTR, bl); + attrs[OI_ATTR].claim(bl); + setattrs_maybe_cache(ctx->snapset_obc, ctx, t, attrs); if (pool.info.require_rollback()) { set changing; @@ -6140,11 +6142,13 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc ctx->snapset_obc->obs.oi.mtime = ctx->mtime; ctx->snapset_obc->obs.oi.local_mtime = now; + map attrs; bufferlist bv(sizeof(ctx->new_obs.oi)); ::encode(ctx->snapset_obc->obs.oi, bv); ctx->op_t->touch(snapoid); - setattr_maybe_cache(ctx->snapset_obc, ctx, ctx->op_t, OI_ATTR, bv); - setattr_maybe_cache(ctx->snapset_obc, ctx, ctx->op_t, SS_ATTR, bss); + attrs[OI_ATTR].claim(bv); + attrs[SS_ATTR].claim(bss); + setattrs_maybe_cache(ctx->snapset_obc, ctx, ctx->op_t, attrs); if (pool.info.require_rollback()) { map > 
to_set; to_set[SS_ATTR]; @@ -6184,17 +6188,19 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl; } + map attrs; bufferlist bv(sizeof(ctx->new_obs.oi)); ::encode(ctx->new_obs.oi, bv); - setattr_maybe_cache(ctx->obc, ctx, ctx->op_t, OI_ATTR, bv); + attrs[OI_ATTR].claim(bv); if (soid.snap == CEPH_NOSNAP) { dout(10) << " final snapset " << ctx->new_snapset << " in " << soid << dendl; - setattr_maybe_cache(ctx->obc, ctx, ctx->op_t, SS_ATTR, bss); + attrs[SS_ATTR].claim(bss); } else { dout(10) << " no snapset (this is a clone)" << dendl; } + setattrs_maybe_cache(ctx->obc, ctx, ctx->op_t, attrs); if (pool.info.require_rollback()) { set changing; @@ -10899,8 +10905,10 @@ void ReplicatedPG::hit_set_persist() ::encode(ctx->new_obs.oi, boi); ctx->op_t->append(oid, 0, bl.length(), bl, 0); - setattr_maybe_cache(ctx->obc, ctx, ctx->op_t, OI_ATTR, boi); - setattr_maybe_cache(ctx->obc, ctx, ctx->op_t, SS_ATTR, bss); + map attrs; + attrs[OI_ATTR].claim(boi); + attrs[SS_ATTR].claim(bss); + setattrs_maybe_cache(ctx->obc, ctx, ctx->op_t, attrs); ctx->log.push_back( pg_log_entry_t( pg_log_entry_t::MODIFY, @@ -12258,6 +12266,21 @@ void ReplicatedPG::setattr_maybe_cache( t->setattr(obc->obs.oi.soid, key, val); } +void ReplicatedPG::setattrs_maybe_cache( + ObjectContextRef obc, + OpContext *op, + PGBackend::PGTransaction *t, + map &attrs) +{ + if (pool.info.require_rollback()) { + for (map::iterator it = attrs.begin(); + it != attrs.end(); it++ ) { + op->pending_attrs[obc][it->first] = it->second; + } + } + t->setattrs(obc->obs.oi.soid, attrs); +} + void ReplicatedPG::rmattr_maybe_cache( ObjectContextRef obc, OpContext *op, diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 4517e68730ebb..06c9b36268f45 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -1581,6 +1581,11 @@ class ReplicatedPG : public PG, public PGBackend::Listener { 
PGBackend::PGTransaction *t, const string &key, bufferlist &val); + void setattrs_maybe_cache( + ObjectContextRef obc, + OpContext *op, + PGBackend::PGTransaction *t, + map &attrs); void rmattr_maybe_cache( ObjectContextRef obc, OpContext *op, From 13668e68484c6bcf578e563f9db10bcf95722433 Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Mon, 31 Aug 2015 22:00:53 +0800 Subject: [PATCH 306/654] client: set osdmap epoch for setxattr. Signed-off-by: Jianpeng Ma --- src/client/Client.cc | 66 ++++++++++++++++++++++++++++++++++++++++++++ src/client/Client.h | 2 ++ src/mds/Server.cc | 24 ---------------- src/mds/mdstypes.h | 20 +++++++++++++- 4 files changed, 87 insertions(+), 25 deletions(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index 4259c6ecdf605..8839f95ef2fa2 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -24,6 +24,9 @@ #include #include +#include +#include + #if defined(__linux__) #include #endif @@ -1913,6 +1916,11 @@ void Client::send_request(MetaRequest *request, MetaSession *session, r->releases.swap(request->cap_releases); } r->set_mdsmap_epoch(mdsmap->get_epoch()); + if (r->head.op == CEPH_MDS_OP_SETXATTR) { + const OSDMap *osdmap = objecter->get_osdmap_read(); + r->set_osdmap_epoch(osdmap->get_epoch()); + objecter->put_osdmap_read(); + } if (request->mds == -1) { request->sent_stamp = ceph_clock_now(cct); @@ -8985,9 +8993,67 @@ int Client::_setxattr(Inode *in, const char *name, const void *value, return res; } +int Client::check_data_pool_exist(string name, string value, const OSDMap *osdmap) +{ + string tmp; + if (name == "layout") { + string::iterator begin = value.begin(); + string::iterator end = value.end(); + keys_and_values p; // create instance of parser + std::map m; // map to receive results + if (!qi::parse(begin, end, p, m)) { // returns true if successful + return -EINVAL; + } + if (begin != end) + return -EINVAL; + for (map::iterator q = m.begin(); q != m.end(); ++q) { + if (q->first == "pool") { + tmp = 
q->second; + break; + } + } + } else if (name == "layout.pool") { + tmp = value; + } + + if (tmp.length()) { + int64_t pool; + try { + pool = boost::lexical_cast(tmp); + if (!osdmap->have_pg_pool(pool)) + return -ENOENT; + } catch (boost::bad_lexical_cast const&) { + pool = osdmap->lookup_pg_pool_name(tmp); + if (pool < 0) { + return -ENOENT; + } + } + } + + return 0; +} + int Client::ll_setxattr(Inode *in, const char *name, const void *value, size_t size, int flags, int uid, int gid) { + // For setting pool of layout, MetaRequest need osdmap epoch. + // There is a race which create a new data pool but client and mds both don't have. + // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap. + if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 || + strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) { + string rest(strstr(name, "layout")); + string v((const char*)value); + const OSDMap *osdmap = objecter->get_osdmap_read(); + int r = check_data_pool_exist(rest, v, osdmap); + objecter->put_osdmap_read(); + + if (r == -ENOENT) { + C_SaferCond ctx; + objecter->wait_for_latest_osdmap(&ctx); + ctx.wait(); + } + } + Mutex::Locker lock(client_lock); vinodeno_t vino = _get_vino(in); diff --git a/src/client/Client.h b/src/client/Client.h index 78a2da7ee04a8..81137452edc26 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -705,6 +705,8 @@ class Client : public Dispatcher, public md_config_obs_t { int check_permissions(Inode *in, int flags, int uid, int gid); + int check_data_pool_exist(string name, string value, const OSDMap *osdmap); + vinodeno_t _get_vino(Inode *in); inodeno_t _get_inodeno(Inode *in); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 2b7554cf09ae8..7dfaaa24e07e4 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -16,7 +16,6 @@ #include "include/assert.h" // lexical_cast includes system assert.h #include -#include #include 
#include "MDSRank.h" @@ -3850,31 +3849,8 @@ void Server::handle_client_setdirlayout(MDRequestRef& mdr) journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(mds, mdr, cur)); } - - - // XATTRS -// parse a map of keys/values. -namespace qi = boost::spirit::qi; - -template -struct keys_and_values - : qi::grammar()> -{ - keys_and_values() - : keys_and_values::base_type(query) - { - query = pair >> *(qi::lit(' ') >> pair); - pair = key >> '=' >> value; - key = qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9"); - value = +qi::char_("a-zA-Z_0-9"); - } - qi::rule()> query; - qi::rule()> pair; - qi::rule key, value; -}; - int Server::parse_layout_vxattr(string name, string value, const OSDMap *osdmap, ceph_file_layout *layout, bool validate) { diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index 9ed51e2499389..d6f92a37eaaf4 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -23,11 +23,11 @@ #include "inode_backtrace.h" +#include #include #include "include/assert.h" #include - #define CEPH_FS_ONDISK_MAGIC "ceph fs volume v011" @@ -1604,6 +1604,24 @@ class ceph_file_layout_wrapper : public ceph_file_layout void dump(Formatter *f) const; }; +// parse a map of keys/values. 
+namespace qi = boost::spirit::qi; +template +struct keys_and_values + : qi::grammar()> +{ + keys_and_values() + : keys_and_values::base_type(query) + { + query = pair >> *(qi::lit(' ') >> pair); + pair = key >> '=' >> value; + key = qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9"); + value = +qi::char_("a-zA-Z_0-9"); + } + qi::rule()> query; + qi::rule()> pair; + qi::rule key, value; +}; #endif From d05e531dc08eaa4b43e88eecbcd585bbb034a7a0 Mon Sep 17 00:00:00 2001 From: Kadu Ribeiro Date: Mon, 24 Aug 2015 22:25:57 -0300 Subject: [PATCH 307/654] doc: update ruby doc with the aws-sdk gem usage Since I'm using ceph with the `aws-sdk` gem (https://github.com/aws/aws-sdk-ruby) instead `aws-s3` (https://github.com/marcel/aws-s3) because the aws-s3 have a trouble with the new active support (https://github.com/marcel/aws-s3/issues/98) (and the downgrade active-support wasn't a option), I proposed change the doc to receive the usage instructions with the aws-sdk gem. I used ceph with aws-sdk gem with this commands. Thanks so much Signed-off-by: Carlos E Ribeiro --- doc/radosgw/s3/ruby.rst | 191 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 189 insertions(+), 2 deletions(-) diff --git a/doc/radosgw/s3/ruby.rst b/doc/radosgw/s3/ruby.rst index 39711aac41355..0a62e3f425e02 100644 --- a/doc/radosgw/s3/ruby.rst +++ b/doc/radosgw/s3/ruby.rst @@ -1,7 +1,194 @@ .. _ruby: -Ruby S3 Examples -================ +Ruby S3 Examples (aws-sdk gem ~>2) +================================== + +Settings +--------------------- + +You can setup the connection on global way: + +.. code-block:: ruby + + Aws.config.update( + endpoint: 'https://objects.dreamhost.com.', + access_key_id: 'my-access-key', + secret_access_key: 'my-secret-key', + force_path_style: true, + region: 'us-east-1' + ) + + +and instantiate a client object: + +.. code-block:: ruby + + s3_client = Aws::S3::Client.new + +Listing Owned Buckets +--------------------- + +This gets a list of buckets that you own. 
+This also prints out the bucket name and creation date of each bucket. + +.. code-block:: ruby + + s3_client.list_buckets.buckets.each do |bucket| + puts "#{bucket.name}\t#{bucket.creation_date}" + end + +The output will look something like this:: + + mahbuckat1 2011-04-21T18:05:39.000Z + mahbuckat2 2011-04-21T18:05:48.000Z + mahbuckat3 2011-04-21T18:07:18.000Z + + +Creating a Bucket +----------------- + +This creates a new bucket called ``my-new-bucket`` + +.. code-block:: ruby + + s3_client.create_bucket(bucket: 'my-new-bucket') + +If you want a private bucket: + +`acl` option accepts: # private, public-read, public-read-write, authenticated-read + +.. code-block:: ruby + + s3_client.create_bucket(bucket: 'my-new-bucket', acl: 'private') + + +Listing a Bucket's Content +-------------------------- + +This gets a list of hashes with the contents of each object +This also prints out each object's name, the file size, and last +modified date. + +.. code-block:: ruby + + s3_client.get_objects(bucket: 'my-new-bucket').contents.each do |object| + puts "#{object.key}\t#{object.size}\t#{object.last-modified}" + end + +The output will look something like this if the bucket has some files:: + + myphoto1.jpg 251262 2011-08-08T21:35:48.000Z + myphoto2.jpg 262518 2011-08-08T21:38:01.000Z + + +Deleting a Bucket +----------------- +.. note:: + The Bucket must be empty! Otherwise it won't work! + +.. code-block:: ruby + + s3_client.delete_bucket(bucket: 'my-new-bucket') + + +Forced Delete for Non-empty Buckets +----------------------------------- +First, you need to clear the bucket: + +.. code-block:: ruby + + Aws::S3::Bucket.new('my-new-bucket', client: s3_client).clear! + +after, you can destroy the bucket + +.. code-block:: ruby + + s3_client.delete_bucket(bucket: 'my-new-bucket') + + +Creating an Object +------------------ + +This creates a file ``hello.txt`` with the string ``"Hello World!"`` + +.. 
code-block:: ruby + + s3_client.put_object( + key: 'hello.txt', + body: 'Hello World!', + bucket: 'my-new-bucket', + content_type: 'text/plain' + ) + + +Change an Object's ACL +---------------------- + +This makes the object ``hello.txt`` to be publicly readable, and ``secret_plans.txt`` +to be private. + +.. code-block:: ruby + + s3_client.put_object_acl(bucket: 'my-new-bucket', key: 'hello.txt', acl: 'public-read') + + s3_client.put_object_acl(bucket: 'my-new-bucket', key: 'private.txt', acl: 'private') + + +Download an Object (to a file) +------------------------------ + +This downloads the object ``poetry.pdf`` and saves it in +``/home/larry/documents/`` + +.. code-block:: ruby + s3_client.get_object(bucket: 'my-new-bucket', key: 'poetry.pdf', response_target: '/home/larry/documents/poetry.pdf') + + +Delete an Object +---------------- + +This deletes the object ``goodbye.txt`` + +.. code-block:: ruby + + s3_client.delete_object(key: 'goodbye.txt', bucket: 'my-new-bucket') + + +Generate Object Download URLs (signed and unsigned) +--------------------------------------------------- + +This generates an unsigned download URL for ``hello.txt``. This works +because we made ``hello.txt`` public by setting the ACL above. +This then generates a signed download URL for ``secret_plans.txt`` that +will work for 1 hour. Signed download URLs will work for the time +period even if the object is private (when the time period is up, the +URL will stop working). + +.. 
code-block:: ruby + + puts Aws::S3::Object.new( + key: 'hello.txt', + bucket_name: 'my-new-bucket', + client: s3_client + ).public_url + + puts Aws::S3::Object.new( + key: 'secret_plans.txt', + bucket_name: 'hermes_ceph_gem', + client: s3_client + ).presigned_url(:get, expires_in: 60 * 60) + +The output of this will look something like:: + + http://objects.dreamhost.com/my-bucket-name/hello.txt + http://objects.dreamhost.com/my-bucket-name/secret_plans.txt?Signature=XXXXXXXXXXXXXXXXXXXXXXXXXXX&Expires=1316027075&AWSAccessKeyId=XXXXXXXXXXXXXXXXXXX + +.. _`Aws::S3`: http://docs.aws.amazon.com/sdkforruby/api/Aws/S3/Client.html + + + +Ruby S3 Examples (aws-s3 gem) +============================= Creating a Connection --------------------- From 403144f506c7ae696cc6878b6f2d17bc7d5d837d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 28 Aug 2015 16:54:49 -0400 Subject: [PATCH 308/654] ceph.spec: package cls_numops Signed-off-by: Sage Weil --- ceph.spec.in | 1 + 1 file changed, 1 insertion(+) diff --git a/ceph.spec.in b/ceph.spec.in index 059eea42395e3..6dd925290e2de 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -771,6 +771,7 @@ mkdir -p %{_localstatedir}/run/ceph/ %{_libdir}/rados-classes/libcls_cephfs.so* %{_libdir}/rados-classes/libcls_rbd.so* %{_libdir}/rados-classes/libcls_hello.so* +%{_libdir}/rados-classes/libcls_numops.so* %{_libdir}/rados-classes/libcls_rgw.so* %{_libdir}/rados-classes/libcls_lock.so* %{_libdir}/rados-classes/libcls_kvs.so* From 6e0f0bbab941a899670a94309a3e22c78761b96d Mon Sep 17 00:00:00 2001 From: Mykola Golub Date: Mon, 17 Aug 2015 12:45:40 +0300 Subject: [PATCH 309/654] ceph-disk: use /sys/dev/block/maj:min/partition to see if partition Fixes: #12706 Signed-off-by: Mykola Golub --- src/ceph-disk | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/ceph-disk b/src/ceph-disk index 10a7b64fafd4b..b3a82c34f591b 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -600,7 +600,8 @@ def is_partition(dev): return 
is_partition_mpath(dev) dev = os.path.realpath(dev) - if not stat.S_ISBLK(os.lstat(dev).st_mode): + st = os.lstat(dev) + if not stat.S_ISBLK(st.st_mode): raise Error('not a block device', dev) name = get_dev_name(dev) @@ -608,9 +609,10 @@ def is_partition(dev): return False # make sure it is a partition of something else - for basename in os.listdir('/sys/block'): - if os.path.exists(os.path.join('/sys/block', basename, name)): - return True + major = os.major(st.st_rdev) + minor = os.minor(st.st_rdev) + if os.path.exists('/sys/dev/block/%d:%d/partition' % (major, minor)): + return True raise Error('not a disk or partition', dev) From 89f0112e001a2561f9a5cd705898d43c8909501f Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Wed, 19 Aug 2015 14:54:21 +0800 Subject: [PATCH 310/654] Objecter: Take RLocker when call is_active. Signed-off-by: Jianpeng Ma --- src/osdc/Objecter.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index d4d269dfc4f6b..e169389a99f54 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -1978,6 +1978,7 @@ class Objecter : public md_config_obs_t, public Dispatcher { public: ceph_tid_t op_submit(Op *op, int *ctx_budget = NULL); bool is_active() { + RWLock::RLocker l(rwlock); return !((!inflight_ops.read()) && linger_ops.empty() && poolstat_ops.empty() && statfs_ops.empty()); } From e4ce619fe17a7a9dfc18e6af0b84928aa2d88c00 Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Thu, 20 Aug 2015 15:38:58 +0800 Subject: [PATCH 311/654] osdc/Objecter: For func op_cancel_writes it can directly call op_cancel. Becasue we get write-lock of rwlock, so it is safe to call op_cancel rather than _op_canchel(homeless_session for this case don't met). 
Signed-off-by: Jianpeng Ma --- src/osdc/Objecter.cc | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index 53b3cdadfa3a7..22c122372a460 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -2319,6 +2319,7 @@ epoch_t Objecter::op_cancel_writes(int r, int64_t pool) rwlock.get_write(); std::vector to_cancel; + bool found = false; for (map::iterator siter = osd_sessions.begin(); siter != osd_sessions.end(); ++siter) { OSDSession *s = siter->second; @@ -2330,19 +2331,22 @@ epoch_t Objecter::op_cancel_writes(int r, int64_t pool) } } s->lock.unlock(); - } - for (std::vector::iterator titer = to_cancel.begin(); titer != to_cancel.end(); ++titer) { - int cancel_result = _op_cancel(*titer, r); - // We hold rwlock across search and cancellation, so cancels should always succeed - assert(cancel_result == 0); + for (std::vector::iterator titer = to_cancel.begin(); titer != to_cancel.end(); ++titer) { + int cancel_result = op_cancel(s, *titer, r); + // We hold rwlock across search and cancellation, so cancels should always succeed + assert(cancel_result == 0); + } + if (!found && to_cancel.size()) + found = true; + to_cancel.clear(); } const epoch_t epoch = osdmap->get_epoch(); rwlock.unlock(); - if (to_cancel.size()) { + if (found) { return epoch; } else { return -1; From 064e8585a04edb3d87b38db6bed03e965cfcb359 Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Thu, 20 Aug 2015 17:00:23 +0800 Subject: [PATCH 312/654] osdc/Objeter: When cancel op, decrease num_unacked/num_uncommitted. 
Signed-off-by: Jianpeng Ma --- src/osdc/Objecter.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index 22c122372a460..b6be13d3038f3 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -2241,7 +2241,10 @@ int Objecter::op_cancel(OSDSession *s, ceph_tid_t tid, int r) if (op->onack) { op->onack->complete(r); op->onack = NULL; + num_unacked.dec(); } + if (op->oncommit || op->oncommit_sync) + num_uncommitted.dec(); if (op->oncommit) { op->oncommit->complete(r); op->oncommit = NULL; From e47fa6716e9feebd813bf5a9f352467bd79e9163 Mon Sep 17 00:00:00 2001 From: Xiaowei Chen Date: Mon, 31 Aug 2015 20:46:59 -0400 Subject: [PATCH 313/654] vstart.sh: add --mon_num --osd_num --mds_num --rgw_port option add these options to replace shell var MON, OSD, MDS to be more convenient, and add --rgw_port option. Signed-off-by: Xiaowei Chen --- src/vstart.sh | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/vstart.sh b/src/vstart.sh index 8f3ca4def82cd..d3dc42d9b31e8 100755 --- a/src/vstart.sh +++ b/src/vstart.sh @@ -106,7 +106,10 @@ usage=$usage"\t--hitset : enable hitset tracking\n" usage=$usage"\t-e : create an erasure pool\n"; usage=$usage"\t-o config\t\t add extra config parameters to all sections\n" usage=$usage"\t-J no journal\t\tdisable filestore journal\n" - +usage=$usage"\t--mon_num specify ceph monitor count\n" +usage=$usage"\t--osd_num specify ceph osd count\n" +usage=$usage"\t--mds_num specify ceph mds count\n" +usage=$usage"\t--rgw_port specify ceph rgw http listen port\n" usage_exit() { printf "$usage" @@ -164,6 +167,23 @@ case $1 in --smallmds ) smallmds=1 ;; + --mon_num ) + echo "mon_num:$2" + CEPH_NUM_MON="$2" + shift + ;; + --osd_num ) + CEPH_NUM_OSD=$2 + shift + ;; + --mds_num ) + CEPH_NUM_MDS=$2 + shift + ;; + --rgw_port ) + CEPH_RGW_PORT=$2 + shift + ;; mon ) start_mon=1 start_all=0 From 28324fdb080c99f3ae6c4176b5eb092592970671 Mon Sep 17 00:00:00 2001 From: 
David Zafman Date: Mon, 31 Aug 2015 19:28:18 -0700 Subject: [PATCH 314/654] osd: Fix the diagnostic logging mostly to dout(20) Signed-off-by: David Zafman --- src/osd/ECBackend.cc | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index 47594cb932885..c2c4e1929bc13 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -1003,7 +1003,7 @@ void ECBackend::handle_sub_read_reply( assert(!op.errors.count(i->first)); // If attribute error we better not have sent a buffer if (!rop.to_read.count(i->first)) { // We canceled this read! @see filter_read_op -dout(0) << __func__ << " to_read skipping" << dendl; + dout(20) << __func__ << " to_read skipping" << dendl; continue; } list >::const_iterator req_iter = @@ -1030,7 +1030,7 @@ dout(0) << __func__ << " to_read skipping" << dendl; assert(!op.errors.count(i->first)); // if read error better not have sent an attribute if (!rop.to_read.count(i->first)) { // We canceled this read! 
@see filter_read_op -dout(0) << __func__ << " to_read skipping" << dendl; + dout(20) << __func__ << " to_read skipping" << dendl; continue; } rop.complete[i->first].attrs = map(); @@ -1043,7 +1043,7 @@ dout(0) << __func__ << " to_read skipping" << dendl; make_pair( from, i->second)); -dout(0) << __func__ << " shard=" << from << " error=" << i->second << dendl; + dout(20) << __func__ << " shard=" << from << " error=" << i->second << dendl; } map >::iterator siter = @@ -1068,14 +1068,14 @@ dout(0) << __func__ << " shard=" << from << " error=" << i->second << dendl; j != iter->second.returned.front().get<2>().end(); ++j) { have.insert(j->first.shard); -dout(0) << __func__ << " have shard=" << j->first.shard << dendl; + dout(20) << __func__ << " have shard=" << j->first.shard << dendl; } set want_to_read, dummy_minimum; get_want_to_read_shards(&want_to_read); int err; // XXX: Could just do if (have.size < ec_impl->get_data_chunk_count()) if ((err = ec_impl->minimum_to_decode(want_to_read, have, &dummy_minimum)) < 0) { -dout(0) << __func__ << " minimum_to_decode failed" << dendl; + dout(20) << __func__ << " minimum_to_decode failed" << dendl; if (rop.in_progress.empty()) { // If we don't have enough copies and we haven't sent reads for all shards // we can send the rest of the reads, if any. 
@@ -1088,17 +1088,18 @@ dout(0) << __func__ << " minimum_to_decode failed" << dendl; // Couldn't read any additional shards so handle as completed with errors } if (rop.complete[iter->first].errors.empty()) { -dout(0) << __func__ << " simply not enough copies err=" << err << dendl; + dout(20) << __func__ << " simply not enough copies err=" << err << dendl; } else { // Grab the first error err = rop.complete[iter->first].errors.begin()->second; -dout(0) << __func__ << ": Use one of the shard errors err=" << err << dendl; + dout(20) << __func__ << ": Use one of the shard errors err=" << err << dendl; } rop.complete[iter->first].r = err; ++is_complete; } } else { -dout(0) << __func__ << " Enough copies for " << iter->first << " (ignore errors)" << dendl; + if (!rop.complete[iter->first].errors.empty()) + dout(10) << __func__ << " Enough copies for " << iter->first << " (ignore errors)" << dendl; ++is_complete; rop.complete[iter->first].errors.clear(); assert(rop.complete[iter->first].r == 0); @@ -1106,7 +1107,7 @@ dout(0) << __func__ << " Enough copies for " << iter->first << " (ignore errors) } } if (rop.in_progress.empty() || is_complete == rop.complete.size()) { -dout(0) << __func__ << " Complete: " << rop << dendl; + dout(20) << __func__ << " Complete: " << rop << dendl; complete_read_op(rop, m); } else { dout(10) << __func__ << " readop not complete: " << rop << dendl; From 700d42ef1c82f5602249b96690ae881c1d259d54 Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Thu, 27 Aug 2015 22:57:49 +0800 Subject: [PATCH 315/654] osd: translate sparse_read to read for ecpool Fixes: #12012 Signed-off-by: Kefu Chai --- src/osd/ReplicatedPG.cc | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index c29a2afa706d5..e68fdac15b61b 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -3660,6 +3660,22 @@ struct FillInExtent : public Context { } }; +struct 
ToSparseReadResult : public Context { + bufferlist& data_bl; + ceph_le64& len; + ToSparseReadResult(bufferlist& bl, ceph_le64& len): + data_bl(bl), len(len) {} + void finish(int r) { + if (r < 0) return; + len = r; + bufferlist outdata; + map extents = {{0, r}}; + ::encode(extents, outdata); + ::encode_destructively(data_bl, outdata); + data_bl.swap(outdata); + } +}; + template static string list_keys(const map& m) { string s; @@ -3886,17 +3902,21 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) /* map extents */ case CEPH_OSD_OP_SPARSE_READ: tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq); - if (pool.info.require_rollback()) { - result = -EOPNOTSUPP; + if (op.extent.truncate_seq) { + dout(0) << "sparse_read does not support truncation sequence " << dendl; + result = -EINVAL; break; } ++ctx->num_read; - { - if (op.extent.truncate_seq) { - dout(0) << "sparse_read does not support truncation sequence " << dendl; - result = -EINVAL; - break; - } + if (pool.info.require_rollback()) { + // translate sparse read to a normal one if not supported + ctx->pending_async_reads.push_back( + make_pair( + boost::make_tuple(op.extent.offset, op.extent.length, op.flags), + make_pair(&osd_op.outdata, new ToSparseReadResult(osd_op.outdata, + op.extent.length)))); + dout(10) << " async_read (was sparse_read) noted for " << soid << dendl; + } else { // read into a buffer bufferlist bl; int total_read = 0; @@ -3963,11 +3983,10 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) osd_op.outdata.claim_append(bl); ::encode_destructively(data_bl, osd_op.outdata); - ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10); - ctx->delta_stats.num_rd++; - dout(10) << " sparse_read got " << total_read << " bytes from object " << soid << dendl; } + ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10); + 
ctx->delta_stats.num_rd++; break; case CEPH_OSD_OP_CALL: From a5bfde69a9d14de67da1e3354173ec70ba089b37 Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Fri, 28 Aug 2015 14:27:53 +0800 Subject: [PATCH 316/654] osd: should use ec_pool() when checking for an ecpool we were using pool.info.require_rollback() in do_osd_ops() when handling OP_SPARSE_READ to tell if a pool is an ecpool. should use pool.info.ec_pool() instead. Signed-off-by: Kefu Chai --- src/osd/ReplicatedPG.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index e68fdac15b61b..88a8292671876 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -3908,7 +3908,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) break; } ++ctx->num_read; - if (pool.info.require_rollback()) { + if (pool.info.ec_pool()) { // translate sparse read to a normal one if not supported ctx->pending_async_reads.push_back( make_pair( From 5ae2e7a185b5f95753a09a89d7110fc38848a083 Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Wed, 26 Aug 2015 15:41:13 +0800 Subject: [PATCH 317/654] ceph_test_rados: also send sparse_read in ReadOp Signed-off-by: Kefu Chai --- src/test/osd/Object.cc | 38 ++++++++++++++++++++++++++ src/test/osd/Object.h | 2 ++ src/test/osd/RadosModel.h | 57 ++++++++++++++++++++++++++------------- 3 files changed, 78 insertions(+), 19 deletions(-) diff --git a/src/test/osd/Object.cc b/src/test/osd/Object.cc index 37d09c3cf5669..f2bc9db401f5e 100644 --- a/src/test/osd/Object.cc +++ b/src/test/osd/Object.cc @@ -183,3 +183,41 @@ bool ObjectDesc::check(bufferlist &to_check) { } return true; } + +bool ObjectDesc::check_sparse(const std::map& extents, + bufferlist &to_check) { + auto i = begin(); + auto p = to_check.begin(); + uint64_t pos = 0; + for (auto extent : extents) { + const uint64_t start = extent.first; + const uint64_t end = start + extent.second; + for (; pos < end; ++i, ++pos) { + if (i.end()) { + std::cout << "reached end 
of iterator first" << std::endl; + return false; + } + if (pos < start) { + // check the hole + if (*i != '\0') { + std::cout << "incorrect buffer at pos " << pos << std::endl; + return false; + } + } else { + // then the extent + if (*i != *p) { + std::cout << "incorrect buffer at pos " << pos << std::endl; + return false; + } + ++p; + } + } + } + uint64_t size = layers.empty() ? 0 : + most_recent_gen()->get_length(most_recent()); + if (pos != size) { + std::cout << "only read " << pos << " out of size " << size << std::endl; + return false; + } + return true; +} diff --git a/src/test/osd/Object.h b/src/test/osd/Object.h index bffb397cfb667..feeefebe8dd99 100644 --- a/src/test/osd/Object.h +++ b/src/test/osd/Object.h @@ -358,6 +358,8 @@ class ObjectDesc { // takes ownership of gen void update(ContentsGenerator *gen, const ContDesc &next); bool check(bufferlist &to_check); + bool check_sparse(const std::map& extends, + bufferlist &to_check); const ContDesc &most_recent(); ContentsGenerator *most_recent_gen() { return layers.begin()->first.get(); diff --git a/src/test/osd/RadosModel.h b/src/test/osd/RadosModel.h index 9097c7a0b7c32..dddaa9dec4aeb 100644 --- a/src/test/osd/RadosModel.h +++ b/src/test/osd/RadosModel.h @@ -992,6 +992,8 @@ class ReadOp : public TestOp { vector results; vector retvals; + vector> extent_results; + vector is_sparse_read; uint64_t waiting_on; map attrs; @@ -1016,10 +1018,32 @@ class ReadOp : public TestOp { balance_reads(balance_reads), results(3), retvals(3), + extent_results(3), + is_sparse_read(3, false), waiting_on(0), attrretval(0) {} - + + void _do_read(librados::ObjectReadOperation& read_op, int index) { + uint64_t len = 0; + if (old_value.has_contents()) + len = old_value.most_recent_gen()->get_length(old_value.most_recent()); + if (rand() % 2) { + is_sparse_read[index] = false; + read_op.read(0, + len, + &results[index], + &retvals[index]); + } else { + is_sparse_read[index] = true; + read_op.sparse_read(0, + len, + 
&extent_results[index], + &results[index], + &retvals[index]); + } + } + void _begin() { context->state_lock.Lock(); @@ -1065,13 +1089,7 @@ class ReadOp : public TestOp { if (snap >= 0) { context->io_ctx.snap_set_read(context->snaps[snap]); } - - op.read(0, - !old_value.has_contents() ? 0 : - old_value.most_recent_gen()->get_length(old_value.most_recent()), - &results[0], - &retvals[0]); - + _do_read(op, 0); for (map::iterator i = old_value.attrs.begin(); i != old_value.attrs.end(); ++i) { @@ -1103,12 +1121,7 @@ class ReadOp : public TestOp { // OSD's read behavior in some scenarios for (uint32_t i = 1; i < 3; ++i) { librados::ObjectReadOperation pipeline_op; - - pipeline_op.read(0, - !old_value.has_contents() ? 0 : - old_value.most_recent_gen()->get_length(old_value.most_recent()), - &results[i], - &retvals[i]); + _do_read(pipeline_op, i); assert(!context->io_ctx.aio_operate(context->prefix+oid, completions[i], &pipeline_op, 0)); waiting_on++; } @@ -1182,11 +1195,17 @@ class ReadOp : public TestOp { << ", expected " << old_value.most_recent() << std::endl; context->errors++; } - for (vector::iterator it = results.begin(); - it != results.end(); ++it) { - if (!old_value.check(*it)) { - cerr << num << ": oid " << oid << " contents " << to_check << " corrupt" << std::endl; - context->errors++; + for (unsigned i = 0; i < results.size(); i++) { + if (is_sparse_read[i]) { + if (!old_value.check_sparse(extent_results[i], results[i])) { + cerr << num << ": oid " << oid << " contents " << to_check << " corrupt" << std::endl; + context->errors++; + } + } else { + if (!old_value.check(results[i])) { + cerr << num << ": oid " << oid << " contents " << to_check << " corrupt" << std::endl; + context->errors++; + } } } if (context->errors) assert(0); From 4d4920610ebfcb516630ed15678979c9e9292f5a Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Thu, 27 Aug 2015 22:57:16 +0800 Subject: [PATCH 318/654] ceph_test_rados_api_io: add tests for sparse_read Signed-off-by: Kefu Chai --- 
src/test/librados/io.cc | 38 ++++++++++++++++++++++++++++++++++++++ src/test/librados/test.cc | 25 +++++++++++++++++++++++++ src/test/librados/test.h | 4 ++++ 3 files changed, 67 insertions(+) diff --git a/src/test/librados/io.cc b/src/test/librados/io.cc index 09764223a9270..2634119a4d29e 100644 --- a/src/test/librados/io.cc +++ b/src/test/librados/io.cc @@ -223,6 +223,25 @@ TEST_F(LibRadosIoPP, ReadOpPP) { } } +TEST_F(LibRadosIoPP, SparseReadOpPP) { + char buf[128]; + memset(buf, 0xcc, sizeof(buf)); + bufferlist bl; + bl.append(buf, sizeof(buf)); + ASSERT_EQ(0, ioctx.write("foo", bl, sizeof(buf), 0)); + + { + std::map extents; + bufferlist read_bl; + int rval = -1; + ObjectReadOperation op; + op.sparse_read(0, sizeof(buf), &extents, &read_bl, &rval); + ASSERT_EQ(0, ioctx.operate("foo", &op, nullptr)); + ASSERT_EQ(0, rval); + assert_eq_sparse(bl, extents, read_bl); + } +} + TEST_F(LibRadosIo, RoundTrip) { char buf[128]; char buf2[128]; @@ -721,6 +740,25 @@ TEST_F(LibRadosIoECPP, ReadOpPP) { } } +TEST_F(LibRadosIoECPP, SparseReadOpPP) { + char buf[128]; + memset(buf, 0xcc, sizeof(buf)); + bufferlist bl; + bl.append(buf, sizeof(buf)); + ASSERT_EQ(0, ioctx.write("foo", bl, sizeof(buf), 0)); + + { + std::map extents; + bufferlist read_bl; + int rval = -1; + ObjectReadOperation op; + op.sparse_read(0, sizeof(buf), &extents, &read_bl, &rval); + ASSERT_EQ(0, ioctx.operate("foo", &op, nullptr)); + ASSERT_EQ(0, rval); + assert_eq_sparse(bl, extents, read_bl); + } +} + TEST_F(LibRadosIoEC, RoundTrip) { char buf[128]; char buf2[128]; diff --git a/src/test/librados/test.cc b/src/test/librados/test.cc index acf12276ac1c9..aac053a2e67b1 100644 --- a/src/test/librados/test.cc +++ b/src/test/librados/test.cc @@ -11,6 +11,7 @@ #include #include #include +#include "gtest/gtest.h" using namespace librados; @@ -256,3 +257,27 @@ int destroy_one_ec_pool_pp(const std::string &pool_name, Rados &cluster) cluster.shutdown(); return ret; } + +void assert_eq_sparse(bufferlist& expected, + 
const std::map& extents, + bufferlist& actual) { + auto i = expected.begin(); + auto p = actual.begin(); + uint64_t pos = 0; + for (auto extent : extents) { + const uint64_t start = extent.first; + const uint64_t end = start + extent.second; + for (; pos < end; ++i, ++pos) { + ASSERT_FALSE(i.end()); + if (pos < start) { + // check the hole + ASSERT_EQ('\0', *i); + } else { + // then the extent + ASSERT_EQ(*i, *p); + ++p; + } + } + } + ASSERT_EQ(expected.length(), pos); +} diff --git a/src/test/librados/test.h b/src/test/librados/test.h index 6cf522def31d8..cd1f981765345 100644 --- a/src/test/librados/test.h +++ b/src/test/librados/test.h @@ -18,6 +18,7 @@ #include "include/rados/librados.h" #include "include/rados/librados.hpp" +#include #include #include @@ -35,6 +36,9 @@ int destroy_one_pool(const std::string &pool_name, rados_t *cluster); int destroy_one_ec_pool(const std::string &pool_name, rados_t *cluster); int destroy_one_pool_pp(const std::string &pool_name, librados::Rados &cluster); int destroy_one_ec_pool_pp(const std::string &pool_name, librados::Rados &cluster); +void assert_eq_sparse(bufferlist& expected, + const std::map& extents, + bufferlist& actual); class TestAlarm { From 076bad955d374cbb37b77e2b0429f3c85f32abc0 Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Fri, 28 Aug 2015 11:36:49 +0800 Subject: [PATCH 319/654] ceph_test_rados_api_aio: add a test for aio_sparse_read Signed-off-by: Kefu Chai --- src/test/librados/aio.cc | 71 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/src/test/librados/aio.cc b/src/test/librados/aio.cc index ed59b008bfa2e..1ebdf170f886a 100644 --- a/src/test/librados/aio.cc +++ b/src/test/librados/aio.cc @@ -479,6 +479,41 @@ TEST(LibRadosAio, RoundTripPP3) destroy_one_pool_pp(pool_name, cluster); } +TEST(LibRadosAio, RoundTripSparseReadPP) { + AioTestDataPP test_data; + ASSERT_EQ("", test_data.init()); + AioCompletion *my_completion = test_data.m_cluster.aio_create_completion( + 
(void*)&test_data, set_completion_completePP, set_completion_safePP); + AioCompletion *my_completion_null = NULL; + ASSERT_NE(my_completion, my_completion_null); + char buf[128]; + memset(buf, 0xcc, sizeof(buf)); + bufferlist bl1; + bl1.append(buf, sizeof(buf)); + ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion, + bl1, sizeof(buf), 0)); + { + TestAlarm alarm; + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); + } + ASSERT_EQ(0, my_completion->get_return_value()); + std::map extents; + bufferlist bl2; + AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion( + (void*)&test_data, set_completion_completePP, set_completion_safePP); + ASSERT_NE(my_completion2, my_completion_null); + ASSERT_EQ(0, test_data.m_ioctx.aio_sparse_read("foo", + my_completion2, &extents, &bl2, sizeof(buf), 0)); + { + TestAlarm alarm; + ASSERT_EQ(0, my_completion2->wait_for_complete()); + } + ASSERT_EQ(0, my_completion2->get_return_value()); + assert_eq_sparse(bl1, extents, bl2); + delete my_completion; + delete my_completion2; +} TEST(LibRadosAio, RoundTripAppend) { AioTestData test_data; @@ -2077,6 +2112,42 @@ TEST(LibRadosAioEC, RoundTripPP3) destroy_one_pool_pp(pool_name, cluster); } +TEST(LibRadosAioEC, RoundTripSparseReadPP) { + AioTestDataECPP test_data; + ASSERT_EQ("", test_data.init()); + AioCompletion *my_completion = test_data.m_cluster.aio_create_completion( + (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP); + AioCompletion *my_completion_null = NULL; + ASSERT_NE(my_completion, my_completion_null); + char buf[128]; + memset(buf, 0xcc, sizeof(buf)); + bufferlist bl1; + bl1.append(buf, sizeof(buf)); + ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion, + bl1, sizeof(buf), 0)); + { + TestAlarm alarm; + sem_wait(test_data.m_sem); + sem_wait(test_data.m_sem); + } + ASSERT_EQ(0, my_completion->get_return_value()); + + map extents; + bufferlist bl2; + AioCompletion *my_completion2 = 
test_data.m_cluster.aio_create_completion( + (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP); + ASSERT_NE(my_completion2, my_completion_null); + ASSERT_EQ(0, test_data.m_ioctx.aio_sparse_read("foo", + my_completion2, &extents, &bl2, sizeof(buf), 0)); + { + TestAlarm alarm; + ASSERT_EQ(0, my_completion2->wait_for_complete()); + } + ASSERT_EQ(0, my_completion2->get_return_value()); + assert_eq_sparse(bl1, extents, bl2); + delete my_completion; + delete my_completion2; +} TEST(LibRadosAioEC, RoundTripAppend) { AioTestDataEC test_data; From 36b62710ddef0ffaee25837a92ca1ac9b353ff05 Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Tue, 1 Sep 2015 15:12:02 +0800 Subject: [PATCH 320/654] osdc/Objecter: In _cancel_linger_op, it should make num_unacked/num_committed decrease. Signed-off-by: Jianpeng Ma --- src/osdc/Objecter.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index b6be13d3038f3..ebcade71ba459 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -2771,9 +2771,15 @@ void Objecter::_cancel_linger_op(Op *op) ldout(cct, 15) << "cancel_op " << op->tid << dendl; assert(!op->should_resend); - delete op->onack; - delete op->oncommit; - delete op->oncommit_sync; + if (op->onack) { + delete op->onack; + num_unacked.dec(); + } + if (op->oncommit || op->oncommit_sync) { + delete op->oncommit; + delete op->oncommit_sync; + num_uncommitted.dec(); + } _finish_op(op); } From 7cc963b1c6ab37bf33638dc6eca7848d93f7908f Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Tue, 1 Sep 2015 15:39:29 +0800 Subject: [PATCH 321/654] osdc/Objecter: Don't forget call _op_cancel_map_check when cancel linger op. 
Signed-off-by: Jianpeng Ma --- src/osdc/Objecter.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index ebcade71ba459..4852550a207e2 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -1155,6 +1155,7 @@ void Objecter::handle_osd_map(MOSDMap *m) _send_op(op); } } else { + _op_cancel_map_check(op); _cancel_linger_op(op); } s->lock.unlock(); @@ -1809,6 +1810,7 @@ void Objecter::_kick_requests(OSDSession *session, map& lr if (!op->target.paused) resend[op->tid] = op; } else { + _op_cancel_map_check(op); _cancel_linger_op(op); } } From 80f10e3e59dadda2dca4eb62c68af972b701b316 Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Tue, 1 Sep 2015 19:32:32 +0800 Subject: [PATCH 322/654] osdc/Objecter: remove the unuseful code. Signed-off-by: Jianpeng Ma --- src/osdc/Objecter.cc | 76 +++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 40 deletions(-) diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index 4852550a207e2..a46fe49224fc1 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -1881,54 +1881,50 @@ void Objecter::tick() set toping; - int r = 0; // look for laggy requests utime_t cutoff = ceph_clock_now(cct); cutoff -= cct->_conf->objecter_timeout; // timeout - unsigned laggy_ops; - - do { - laggy_ops = 0; - for (map::iterator siter = osd_sessions.begin(); siter != osd_sessions.end(); ++siter) { - OSDSession *s = siter->second; - RWLock::RLocker l(s->lock); - for (map::iterator p = s->ops.begin(); - p != s->ops.end(); - ++p) { - Op *op = p->second; - assert(op->session); - if (op->stamp < cutoff) { - ldout(cct, 2) << " tid " << p->first << " on osd." 
<< op->session->osd << " is laggy" << dendl; - toping.insert(op->session); - ++laggy_ops; - } - } - for (map::iterator p = s->linger_ops.begin(); - p != s->linger_ops.end(); - ++p) { - LingerOp *op = p->second; - RWLock::WLocker wl(op->watch_lock); - assert(op->session); - ldout(cct, 10) << " pinging osd that serves lingering tid " << p->first << " (osd." << op->session->osd << ")" << dendl; - toping.insert(op->session); - if (op->is_watch && op->registered && !op->last_error) - _send_linger_ping(op); - } - for (map::iterator p = s->command_ops.begin(); - p != s->command_ops.end(); - ++p) { - CommandOp *op = p->second; - assert(op->session); - ldout(cct, 10) << " pinging osd that serves command tid " << p->first << " (osd." << op->session->osd << ")" << dendl; - toping.insert(op->session); + unsigned laggy_ops = 0; + + for (map::iterator siter = osd_sessions.begin(); siter != osd_sessions.end(); ++siter) { + OSDSession *s = siter->second; + RWLock::RLocker l(s->lock); + for (map::iterator p = s->ops.begin(); + p != s->ops.end(); + ++p) { + Op *op = p->second; + assert(op->session); + if (op->stamp < cutoff) { + ldout(cct, 2) << " tid " << p->first << " on osd." << op->session->osd << " is laggy" << dendl; + toping.insert(op->session); + ++laggy_ops; } } - if (num_homeless_ops.read() || !toping.empty()) { - _maybe_request_map(); + for (map::iterator p = s->linger_ops.begin(); + p != s->linger_ops.end(); + ++p) { + LingerOp *op = p->second; + RWLock::WLocker wl(op->watch_lock); + assert(op->session); + ldout(cct, 10) << " pinging osd that serves lingering tid " << p->first << " (osd." 
<< op->session->osd << ")" << dendl; + toping.insert(op->session); + if (op->is_watch && op->registered && !op->last_error) + _send_linger_ping(op); } - } while (r == -EAGAIN); + for (map::iterator p = s->command_ops.begin(); + p != s->command_ops.end(); + ++p) { + CommandOp *op = p->second; + assert(op->session); + ldout(cct, 10) << " pinging osd that serves command tid " << p->first << " (osd." << op->session->osd << ")" << dendl; + toping.insert(op->session); + } + } + if (num_homeless_ops.read() || !toping.empty()) { + _maybe_request_map(); + } logger->set(l_osdc_op_laggy, laggy_ops); logger->set(l_osdc_osd_laggy, toping.size()); From f420fe4683c81ecd6928df3ae11259bccc6cb9d0 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 28 Aug 2015 14:23:30 +0100 Subject: [PATCH 323/654] mds: fix shutdown while in standby Fixes: #12776 Signed-off-by: John Spray --- src/mds/MDSDaemon.cc | 18 ++++++++++++++++++ src/mds/MDSRank.cc | 10 ---------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc index 0ab6473c98f04..7d23722792c5a 100644 --- a/src/mds/MDSDaemon.cc +++ b/src/mds/MDSDaemon.cc @@ -993,10 +993,28 @@ void MDSDaemon::suicide() clean_up_admin_socket(); + // Inform MDS we are going away, then shut down beacon beacon.set_want_state(mdsmap, MDSMap::STATE_DNE); + if (!mdsmap->is_dne_gid(mds_gid_t(monc->get_global_id()))) { + // Notify the MDSMonitor that we're dying, so that it doesn't have to + // wait for us to go laggy. Only do this if we're actually in the + // MDSMap, because otherwise the MDSMonitor will drop our message. 
+ beacon.send_and_wait(1); + } + beacon.shutdown(); + + timer.shutdown(); if (mds_rank) { mds_rank->shutdown(); + } else { + + if (objecter->initialized.read()) { + objecter->shutdown(); + } + + monc->shutdown(); + messenger->shutdown(); } } diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 7f3ea2bd35a00..715bf832ec030 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -206,13 +206,6 @@ void MDSRankDispatcher::shutdown() dout(1) << __func__ << ": shutting down rank " << whoami << dendl; - if (!mdsmap->is_dne_gid(mds_gid_t(monc->get_global_id()))) { - // Notify the MDSMonitor that we're dying, so that it doesn't have to - // wait for us to go laggy. Only do this if we're actually in the - // MDSMap, because otherwise the MDSMonitor will drop our message. - beacon.send_and_wait(1); - } - timer.shutdown(); // MDLog has to shut down before the finisher, because some of its @@ -221,9 +214,6 @@ void MDSRankDispatcher::shutdown() finisher->stop(); // no flushing - // stop timers - beacon.shutdown(); - // shut down cache mdcache->shutdown(); From 08296dc0febc4ee3b1c5f860d8744bd04e9c2ff8 Mon Sep 17 00:00:00 2001 From: tianshan Date: Mon, 31 Aug 2015 18:37:42 +0800 Subject: [PATCH 324/654] rados: make 'rados bench' support json format output Fixes: #12864 rados bench add '[--format json]' and '[-o | --output outfile]' support. output option only take effect in json format. now we can use the bench result draw performance graph easily. 
Signed-off-by: Tianshan Qu --- src/common/obj_bencher.cc | 124 ++++++++++++++++++++++++++++++++---- src/common/obj_bencher.h | 9 +++ src/test/test_rados_tool.sh | 5 ++ src/tools/rados/rados.cc | 24 +++++++ 4 files changed, 148 insertions(+), 14 deletions(-) diff --git a/src/common/obj_bencher.cc b/src/common/obj_bencher.cc index baaff266c9aeb..3772eb865781f 100644 --- a/src/common/obj_bencher.cc +++ b/src/common/obj_bencher.cc @@ -73,6 +73,8 @@ ostream& ObjBencher::out(ostream& os) void *ObjBencher::status_printer(void *_bencher) { ObjBencher *bencher = static_cast(_bencher); bench_data& data = bencher->data; + Formatter *formatter = bencher->formatter; + ostream *outstream = bencher->outstream; Cond cond; int i = 0; int previous_writes = 0; @@ -82,10 +84,12 @@ void *ObjBencher::status_printer(void *_bencher) { utime_t ONE_SECOND; ONE_SECOND.set_from_double(1.0); bencher->lock.Lock(); + if (formatter) + formatter->open_array_section("datas"); while(!data.done) { utime_t cur_time = ceph_clock_now(bencher->cct); - if (i % 20 == 0) { + if (i % 20 == 0 && !formatter) { if (i > 0) cur_time.localtime(cout) << " min lat: " << data.min_latency << " max lat: " << data.max_latency @@ -132,13 +136,17 @@ void *ObjBencher::status_printer(void *_bencher) { data.history.iops.push_back(iops); } + + if (formatter) + formatter->open_object_section("data"); double avg_bandwidth = (double) (data.object_size) * (data.finished) / (double)(cur_time - data.start_time) / (1024*1024); if (previous_writes != data.finished) { previous_writes = data.finished; cycleSinceChange = 0; - bencher->out(cout, cur_time) << setfill(' ') + if (!formatter) { + bencher->out(cout, cur_time) << setfill(' ') << setw(5) << i << setw(8) << data.in_flight << setw(10) << data.started @@ -147,9 +155,20 @@ void *ObjBencher::status_printer(void *_bencher) { << setw(10) << bandwidth << setw(10) << (double)data.cur_latency << setw(10) << data.avg_latency << std::endl; + } else { + formatter->dump_format("sec", "%d", 
i); + formatter->dump_format("cur_ops", "%d", data.in_flight); + formatter->dump_format("started", "%d", data.started); + formatter->dump_format("finished", "%d", data.finished); + formatter->dump_format("avg_bw", "%f", avg_bandwidth); + formatter->dump_format("cur_bw", "%f", bandwidth); + formatter->dump_format("last_lat", "%f", (double)data.cur_latency); + formatter->dump_format("avg_lat", "%f", data.avg_latency); + } } else { - bencher->out(cout, cur_time) << setfill(' ') + if (!formatter) { + bencher->out(cout, cur_time) << setfill(' ') << setw(5) << i << setw(8) << data.in_flight << setw(10) << data.started @@ -158,11 +177,27 @@ void *ObjBencher::status_printer(void *_bencher) { << setw(10) << '0' << setw(10) << '-' << setw(10) << data.avg_latency << std::endl; + } else { + formatter->dump_format("sec", "%d", i); + formatter->dump_format("cur_ops", "%d", data.in_flight); + formatter->dump_format("started", "%d", data.started); + formatter->dump_format("finished", "%d", data.finished); + formatter->dump_format("avg_bw", "%f", avg_bandwidth); + formatter->dump_format("cur_bw", "%f", 0); + formatter->dump_format("last_lat", "%f", 0); + formatter->dump_format("avg_lat", "%f", data.avg_latency); + } + } + if (formatter) { + formatter->close_section(); // data + formatter->flush(*outstream); } ++i; ++cycleSinceChange; cond.WaitInterval(bencher->cct, bencher->lock, ONE_SECOND); } + if (formatter) + formatter->close_section(); //datas bencher->lock.Unlock(); return NULL; } @@ -207,6 +242,9 @@ int ObjBencher::aio_bench( //fill in contentsChars deterministically so we can check returns sanitize_object_contents(&data, data.object_size); + if (formatter) + formatter->open_object_section("bench"); + if (OP_WRITE == operation) { r = write_bench(secondsToRun, concurrentios, run_name_meta); if (r != 0) goto out; @@ -237,6 +275,11 @@ int ObjBencher::aio_bench( } out: + if (formatter) { + formatter->close_section(); // bench + formatter->flush(*outstream); + *outstream << 
std::endl; + } delete[] contentsChars; return r; } @@ -303,15 +346,24 @@ int ObjBencher::write_bench(int secondsToRun, int concurrentios, const string& run_name_meta) { if (concurrentios <= 0) return -EINVAL; - - out(cout) << "Maintaining " << concurrentios << " concurrent writes of " - << data.object_size << " bytes for up to " - << secondsToRun << " seconds" - << std::endl; + + if (!formatter) { + out(cout) << "Maintaining " << concurrentios << " concurrent writes of " + << data.object_size << " bytes for up to " + << secondsToRun << " seconds" + << std::endl; + } else { + formatter->dump_format("concurrent_ios", "%d", concurrentios); + formatter->dump_format("object_size", "%d", data.object_size); + formatter->dump_format("seconds_to_run", "%d", secondsToRun); + } bufferlist* newContents = 0; std::string prefix = generate_object_prefix(); - out(cout) << "Object prefix: " << prefix << std::endl; + if (!formatter) + out(cout) << "Object prefix: " << prefix << std::endl; + else + formatter->dump_string("object_prefix", prefix); std::vector name(concurrentios); std::string newName; @@ -461,7 +513,8 @@ int ObjBencher::write_bench(int secondsToRun, bandwidth = ((double)data.finished)*((double)data.object_size)/(double)timePassed; bandwidth = bandwidth/(1024*1024); // we want it in MB/sec - out(cout) << "Total time run: " << timePassed << std::endl + if (!formatter) { + out(cout) << "Total time run: " << timePassed << std::endl << "Total writes made: " << data.finished << std::endl << "Write size: " << data.object_size << std::endl << "Bandwidth (MB/sec): " << setprecision(3) << bandwidth << std::endl @@ -476,7 +529,23 @@ int ObjBencher::write_bench(int secondsToRun, << "Stddev Latency: " << vec_stddev(data.history.latency) << std::endl << "Max latency: " << data.max_latency << std::endl << "Min latency: " << data.min_latency << std::endl; - + } else { + formatter->dump_format("total_time_run", "%f", (double)timePassed); + formatter->dump_format("total_writes_made", 
"%d", data.finished); + formatter->dump_format("write_size", "%d", data.object_size); + formatter->dump_format("bandwidth", "%f", bandwidth); + formatter->dump_format("stddev_bandwidth", "%f", vec_stddev(data.history.bandwidth)); + formatter->dump_format("max_bandwidth", "%f", data.idata.max_bandwidth); + formatter->dump_format("min_bandwidth", "%f", data.idata.min_bandwidth); + formatter->dump_format("average_iops", "%d", (int)(data.finished/timePassed)); + formatter->dump_format("stddev_iops", "%d", vec_stddev(data.history.iops)); + formatter->dump_format("max_iops", "%d", data.idata.max_iops); + formatter->dump_format("min_iops", "%d", data.idata.min_iops); + formatter->dump_format("average_latency", "%f", data.avg_latency); + formatter->dump_format("stddev_latency", "%f", vec_stddev(data.history.latency)); + formatter->dump_format("max_latency:", "%f", data.max_latency); + formatter->dump_format("min_latency", "%f", data.min_latency); + } //write object size/number data for read benchmarks ::encode(data.object_size, b_write); ::encode(data.finished, b_write); @@ -680,7 +749,8 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre bandwidth = ((double)data.finished)*((double)data.object_size)/(double)runtime; bandwidth = bandwidth/(1024*1024); // we want it in MB/sec - out(cout) << "Total time run: " << runtime << std::endl + if (!formatter) { + out(cout) << "Total time run: " << runtime << std::endl << "Total reads made: " << data.finished << std::endl << "Read size: " << data.object_size << std::endl << "Bandwidth (MB/sec): " << setprecision(3) << bandwidth << std::endl @@ -691,6 +761,19 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre << "Average Latency: " << data.avg_latency << std::endl << "Max latency: " << data.max_latency << std::endl << "Min latency: " << data.min_latency << std::endl; + } else { + formatter->dump_format("total_time_run", "%f", (double)runtime); + 
formatter->dump_format("total_reads_made", "%d", data.finished); + formatter->dump_format("read_size", "%d", data.object_size); + formatter->dump_format("bandwidth", "%f", bandwidth); + formatter->dump_format("average_iops", "%d", (int)(data.finished/runtime)); + formatter->dump_format("stddev_iops", "%d", vec_stddev(data.history.iops)); + formatter->dump_format("max_iops", "%d", data.idata.max_iops); + formatter->dump_format("min_iops", "%d", data.idata.min_iops); + formatter->dump_format("average_latency", "%f", data.avg_latency); + formatter->dump_format("max_latency", "%f", data.max_latency); + formatter->dump_format("min_latency", "%f", data.min_latency); + } completions_done(); @@ -888,7 +971,8 @@ int ObjBencher::rand_read_bench(int seconds_to_run, int num_objects, int concurr bandwidth = ((double)data.finished)*((double)data.object_size)/(double)runtime; bandwidth = bandwidth/(1024*1024); // we want it in MB/sec - out(cout) << "Total time run: " << runtime << std::endl + if (!formatter) { + out(cout) << "Total time run: " << runtime << std::endl << "Total reads made: " << data.finished << std::endl << "Read size: " << data.object_size << std::endl << "Bandwidth (MB/sec): " << setprecision(3) << bandwidth << std::endl @@ -899,7 +983,19 @@ int ObjBencher::rand_read_bench(int seconds_to_run, int num_objects, int concurr << "Average Latency: " << data.avg_latency << std::endl << "Max latency: " << data.max_latency << std::endl << "Min latency: " << data.min_latency << std::endl; - + } else { + formatter->dump_format("total_time_run", "%f", (double)runtime); + formatter->dump_format("total_reads_made", "%d", data.finished); + formatter->dump_format("read_size", "%d", data.object_size); + formatter->dump_format("bandwidth", "%f", bandwidth); + formatter->dump_format("average_iops", "%d", (int)(data.finished/runtime)); + formatter->dump_format("stddev_iops", "%d", vec_stddev(data.history.iops)); + formatter->dump_format("max_iops", "%d", data.idata.max_iops); + 
formatter->dump_format("min_iops", "%d", data.idata.min_iops); + formatter->dump_format("average_latency", "%f", data.avg_latency); + formatter->dump_format("max_latency", "%f", data.max_latency); + formatter->dump_format("min_latency", "%f", data.min_latency); + } completions_done(); return 0; diff --git a/src/common/obj_bencher.h b/src/common/obj_bencher.h index 9d40ccb0e1809..34e22c2b599a5 100644 --- a/src/common/obj_bencher.h +++ b/src/common/obj_bencher.h @@ -18,6 +18,7 @@ #include "common/config.h" #include "common/Cond.h" #include "common/ceph_context.h" +#include "common/Formatter.h" #include struct bench_interval_data { @@ -59,6 +60,8 @@ typedef std::pair Object; class ObjBencher { bool show_time; + Formatter *formatter = NULL; + ostream *outstream = NULL; public: CephContext *cct; protected: @@ -110,6 +113,12 @@ class ObjBencher { void set_show_time(bool dt) { show_time = dt; } + void set_formatter(Formatter *f) { + formatter = f; + } + void set_outstream(ostream& os) { + outstream = &os; + } int clean_up_slow(const std::string& prefix, int concurrentios); }; diff --git a/src/test/test_rados_tool.sh b/src/test/test_rados_tool.sh index 08756ed98e7ce..bf51072f30bf3 100755 --- a/src/test/test_rados_tool.sh +++ b/src/test/test_rados_tool.sh @@ -219,6 +219,11 @@ run_expect_fail "$RADOS_TOOL" mkpool delete_me_mkpool_test3 0 0k run_expect_succ "$RADOS_TOOL" --pool "$POOL" bench 1 write run_expect_fail "$RADOS_TOOL" --pool "$POOL" bench 1k write +run_expect_succ "$RADOS_TOOL" --pool "$POOL" bench 1 write --format json --output "$TDIR/bench.json" +run_expect_fail "$RADOS_TOOL" --pool "$POOL" bench 1 write --output "$TDIR/bench.json" +run_expect_succ "$RADOS_TOOL" --pool "$POOL" bench 5 write --format json --no-cleanup +run_expect_succ "$RADOS_TOOL" --pool "$POOL" bench 1 rand --format json +run_expect_succ "$RADOS_TOOL" --pool "$POOL" bench 1 seq --format json echo "SUCCESS!" 
diff --git a/src/tools/rados/rados.cc b/src/tools/rados/rados.cc index 82b88bbd7a781..c845d30027f74 100644 --- a/src/tools/rados/rados.cc +++ b/src/tools/rados/rados.cc @@ -1208,6 +1208,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts, Formatter *formatter = NULL; bool pretty_format = false; + const char *output = NULL; Rados rados; IoCtx io_ctx; @@ -1360,6 +1361,10 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts, if (i != opts.end()) { no_verify = true; } + i = opts.find("output"); + if (i != opts.end()) { + output = i->second.c_str(); + } // open rados @@ -2406,12 +2411,29 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts, ret = -EINVAL; goto out; } + if (!formatter && output) { + cerr << "-o|--output option can be used only with '--format' option" + << std::endl; + ret = -EINVAL; + goto out; + } RadosBencher bencher(g_ceph_context, rados, io_ctx); bencher.set_show_time(show_time); + ostream *outstream = NULL; + if (formatter) { + bencher.set_formatter(formatter); + if (output) + outstream = new ofstream(output); + else + outstream = &cout; + bencher.set_outstream(*outstream); + } ret = bencher.aio_bench(operation, seconds, concurrent_ios, op_size, cleanup, run_name, no_verify); if (ret != 0) cerr << "error during benchmark: " << ret << std::endl; + if (formatter && output) + delete outstream; } else if (strcmp(nargs[0], "cleanup") == 0) { if (!pool_name) @@ -2926,6 +2948,8 @@ int main(int argc, const char **argv) opts["all"] = "true"; } else if (ceph_argparse_flag(args, i, "--default", (char*)NULL)) { opts["default"] = "true"; + } else if (ceph_argparse_witharg(args, i, &val, "-o", "--output", (char*)NULL)) { + opts["output"] = val; } else { if (val[0] == '-') usage_exit(); From ee204044d9e69ea44b533c05cec154974039264c Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Tue, 1 Sep 2015 21:35:19 +0800 Subject: [PATCH 325/654] osdc/Objecter: optimize Objecter::tick. 
Set bool value is better than insert same value into set<>. Signed-off-by: Jianpeng Ma --- src/osdc/Objecter.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index a46fe49224fc1..eeb11f6b90c55 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -1891,6 +1891,7 @@ void Objecter::tick() for (map::iterator siter = osd_sessions.begin(); siter != osd_sessions.end(); ++siter) { OSDSession *s = siter->second; RWLock::RLocker l(s->lock); + bool found = false; for (map::iterator p = s->ops.begin(); p != s->ops.end(); ++p) { @@ -1898,7 +1899,7 @@ void Objecter::tick() assert(op->session); if (op->stamp < cutoff) { ldout(cct, 2) << " tid " << p->first << " on osd." << op->session->osd << " is laggy" << dendl; - toping.insert(op->session); + found = true; ++laggy_ops; } } @@ -1909,7 +1910,7 @@ void Objecter::tick() RWLock::WLocker wl(op->watch_lock); assert(op->session); ldout(cct, 10) << " pinging osd that serves lingering tid " << p->first << " (osd." << op->session->osd << ")" << dendl; - toping.insert(op->session); + found = true; if (op->is_watch && op->registered && !op->last_error) _send_linger_ping(op); } @@ -1919,8 +1920,10 @@ void Objecter::tick() CommandOp *op = p->second; assert(op->session); ldout(cct, 10) << " pinging osd that serves command tid " << p->first << " (osd." << op->session->osd << ")" << dendl; - toping.insert(op->session); + found = true; } + if (found) + toping.insert(s); } if (num_homeless_ops.read() || !toping.empty()) { _maybe_request_map(); From f1b80e99b0f832f72b741dcac54dd6c54fabba89 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 31 Aug 2015 18:04:51 -0400 Subject: [PATCH 326/654] systemd: consolidate into a single ceph-disk@.service This simple service will 'ceph-disk trigger DEV --sync'. 
Signed-off-by: Sage Weil --- ceph.spec.in | 8 ++------ systemd/Makefile.am | 4 +--- systemd/ceph-disk-activate-journal@.service | 8 -------- systemd/ceph-disk-dmcrypt-activate@.service | 8 -------- .../{ceph-disk-activate@.service => ceph-disk@.service} | 2 +- 5 files changed, 4 insertions(+), 26 deletions(-) delete mode 100644 systemd/ceph-disk-activate-journal@.service delete mode 100644 systemd/ceph-disk-dmcrypt-activate@.service rename systemd/{ceph-disk-activate@.service => ceph-disk@.service} (54%) diff --git a/ceph.spec.in b/ceph.spec.in index 6dd925290e2de..b7a32aa44e48d 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -591,9 +591,7 @@ install -D etc/ceph.limits.d $RPM_BUILD_ROOT%{_sysconfdir}/security/limits.d/cep install -m 0644 -D systemd/ceph-mds@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-mds@.service install -m 0644 -D systemd/ceph-radosgw@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-radosgw@.service install -m 0644 -D systemd/ceph.target $RPM_BUILD_ROOT%{_unitdir}/ceph.target - install -m 0644 -D systemd/ceph-disk-activate-journal@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk-activate-journal@.service - install -m 0644 -D systemd/ceph-disk-activate@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk-activate@.service - install -m 0644 -D systemd/ceph-disk-dmcrypt-activate@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk-dmcrypt-activate@.service + install -m 0644 -D systemd/ceph-disk@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk@.service install -m 0755 -D systemd/ceph $RPM_BUILD_ROOT%{_sbindir}/rcceph %else install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph @@ -747,9 +745,7 @@ mkdir -p %{_localstatedir}/run/ceph/ %{_unitdir}/ceph-create-keys@.service %{_unitdir}/ceph-osd@.service %{_unitdir}/ceph-radosgw@.service -%{_unitdir}/ceph-disk-activate-journal@.service -%{_unitdir}/ceph-disk-activate@.service -%{_unitdir}/ceph-disk-dmcrypt-activate@.service +%{_unitdir}/ceph-disk@.service %{_unitdir}/ceph.target %else %{_initrddir}/ceph diff --git 
a/systemd/Makefile.am b/systemd/Makefile.am index e6d73e7facca6..3db6c85f5d8af 100644 --- a/systemd/Makefile.am +++ b/systemd/Makefile.am @@ -5,9 +5,7 @@ unitfiles = \ ceph-create-keys@.service \ ceph-osd@.service \ ceph-radosgw@.service \ - ceph-disk-activate-journal@.service \ - ceph-disk-activate@.service \ - ceph-disk-dmcrypt-activate@.service + ceph-disk@.service unitdir = $(systemd_unit_dir) diff --git a/systemd/ceph-disk-activate-journal@.service b/systemd/ceph-disk-activate-journal@.service deleted file mode 100644 index ac7cb46de82d6..0000000000000 --- a/systemd/ceph-disk-activate-journal@.service +++ /dev/null @@ -1,8 +0,0 @@ -[Unit] -Description=Ceph disk journal activation: %f - -[Service] -Type=oneshot -RemainAfterExit=yes -ExecStart=/usr/sbin/ceph-disk --verbose --log-stdout activate-journal --mark-init systemd %f -TimeoutSec=0 diff --git a/systemd/ceph-disk-dmcrypt-activate@.service b/systemd/ceph-disk-dmcrypt-activate@.service deleted file mode 100644 index b17df101612ea..0000000000000 --- a/systemd/ceph-disk-dmcrypt-activate@.service +++ /dev/null @@ -1,8 +0,0 @@ -[Unit] -Description=Ceph disk dmcrypt activation: %f - -[Service] -Type=oneshot -RemainAfterExit=yes -ExecStart=/usr/sbin/ceph-disk --verbose --log-stdout activate --dmcrypt --mark-init systemd %f -TimeoutSec=0 diff --git a/systemd/ceph-disk-activate@.service b/systemd/ceph-disk@.service similarity index 54% rename from systemd/ceph-disk-activate@.service rename to systemd/ceph-disk@.service index b6e75af819d6f..88e4aef44160e 100644 --- a/systemd/ceph-disk-activate@.service +++ b/systemd/ceph-disk@.service @@ -4,5 +4,5 @@ Description=Ceph disk activation: %f [Service] Type=oneshot RemainAfterExit=yes -ExecStart=/usr/sbin/ceph-disk --verbose --log-stdout activate --mark-init systemd %f +ExecStart=/usr/sbin/ceph-disk --verbose --log-stdout trigger --sync %f TimeoutSec=0 From 3662a225b8807f091809388bfb9c5a7eb15efc42 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 31 Aug 2015 14:50:40 
-0400 Subject: [PATCH 327/654] udev: use ceph-disk trigger ... with single set of udev rules Signed-off-by: Sage Weil --- Makefile.am | 1 - ceph.spec.in | 4 --- udev/95-ceph-osd.rules | 48 +++++++++++++++++----------------- udev/95-ceph-osd.rules.systemd | 32 ----------------------- 4 files changed, 24 insertions(+), 61 deletions(-) delete mode 100644 udev/95-ceph-osd.rules.systemd diff --git a/Makefile.am b/Makefile.am index fcf40707d45c3..1fa6e8faa068b 100644 --- a/Makefile.am +++ b/Makefile.am @@ -14,7 +14,6 @@ EXTRA_DIST += \ udev/50-rbd.rules \ udev/60-ceph-partuuid-workaround.rules \ udev/95-ceph-osd.rules \ - udev/95-ceph-osd.rules.systemd \ udev/95-ceph-osd-alt.rules \ share/known_hosts_drop.ceph.com \ share/id_dsa_drop.ceph.com \ diff --git a/ceph.spec.in b/ceph.spec.in index b7a32aa44e48d..a8374dc4f35eb 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -618,12 +618,8 @@ install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT%{_udev %if (0%{?rhel} && 0%{?rhel} < 7) install -m 0644 -D udev/95-ceph-osd-alt.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules %else -%if 0%{?_with_systemd} -install -m 0644 -D udev/95-ceph-osd.rules.systemd $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules -%else install -m 0644 -D udev/95-ceph-osd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules %endif -%endif %if 0%{?rhel} >= 7 || 0%{?fedora} || 0%{?suse_version} mv $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/95-ceph-osd.rules diff --git a/udev/95-ceph-osd.rules b/udev/95-ceph-osd.rules index 3565f7caf1361..5ad73956f1206 100644 --- a/udev/95-ceph-osd.rules +++ b/udev/95-ceph-osd.rules @@ -1,53 +1,53 @@ -# activate ceph-tagged partitions +# OSD_UUID ACTION=="add", SUBSYSTEM=="block", \ ENV{DEVTYPE}=="partition", \ ENV{ID_PART_ENTRY_TYPE}=="4fbd7e29-9d25-41b8-afd0-062c0ceff05d", \ - RUN+="/usr/sbin/ceph-disk activate /dev/$name" + OWNER:="ceph", GROUP:="ceph", MODE:="660", \ + RUN+="/usr/sbin/ceph-disk 
--log-stdout -v trigger /dev/$name" -# activate ceph-tagged partitions +# JOURNAL_UUID ACTION=="add", SUBSYSTEM=="block", \ ENV{DEVTYPE}=="partition", \ ENV{ID_PART_ENTRY_TYPE}=="45b0969e-9b03-4f30-b4c6-b4b80ceff106", \ - RUN+="/usr/sbin/ceph-disk activate-journal /dev/$name" + OWNER:="ceph", GROUP:="ceph", MODE:="660", \ + RUN+="/usr/sbin/ceph-disk --log-stdout -v trigger /dev/$name" -# activate multipath ceph-tagged partitions +# MPATH_JOURNAL_UUID ACTION=="add", SUBSYSTEM=="block", \ ENV{ID_PART_ENTRY_TYPE}=="4fbd7e29-8ae0-4982-bf9d-5a8d867af560", \ - RUN+="/usr/sbin/ceph-disk activate /dev/$name" + OWNER:="ceph", GROUP:="ceph", MODE:="660", \ + RUN+="/usr/sbin/ceph-disk --log-stdout -v trigger /dev/$name" -# activate multipath ceph-tagged partitions +# MATH_OSD_UUID ACTION=="add", SUBSYSTEM=="block", \ ENV{ID_PART_ENTRY_TYPE}=="45b0969e-8ae0-4982-bf9d-5a8d867af560", \ - RUN+="/usr/sbin/ceph-disk activate-journal /dev/$name" + OWNER:="ceph", GROUP:="ceph", MODE:="660", \ + RUN+="/usr/sbin/ceph-disk --log-stdout -v trigger /dev/$name" -# Map journal if using dm-crypt and plain +# DMCRYPT_JOURNAL_UUID ACTION=="add" SUBSYSTEM=="block", \ ENV{DEVTYPE}=="partition", \ ENV{ID_PART_ENTRY_TYPE}=="45b0969e-9b03-4f30-b4c6-5ec00ceff106", \ - RUN+="/sbin/cryptsetup --key-file /etc/ceph/dmcrypt-keys/$env{ID_PART_ENTRY_UUID} --key-size 256 create $env{ID_PART_ENTRY_UUID} /dev/$name" + OWNER:="ceph", GROUP:="ceph", MODE:="660", \ + RUN+="/usr/sbin/ceph-disk --log-stdout -v trigger /dev/$name" -# Map journal if using dm-crypt and luks +# DMCRYPT_LUKS_JOURNAL_UUID ACTION=="add" SUBSYSTEM=="block", \ ENV{DEVTYPE}=="partition", \ ENV{ID_PART_ENTRY_TYPE}=="45b0969e-9b03-4f30-b4c6-35865ceff106", \ - RUN+="/sbin/cryptsetup --key-file /etc/ceph/dmcrypt-keys/$env{ID_PART_ENTRY_UUID}.luks.key luksOpen /dev/$name $env{ID_PART_ENTRY_UUID}" + OWNER:="ceph", GROUP:="ceph", MODE:="660", \ + RUN+="/usr/sbin/ceph-disk --log-stdout -v trigger /dev/$name" -# Map data device and -# activate 
ceph-tagged partitions -# for dm-crypted data devices and plain +# DMCRYPT_OID_UUID ACTION=="add" SUBSYSTEM=="block", \ ENV{DEVTYPE}=="partition", \ ENV{ID_PART_ENTRY_TYPE}=="4fbd7e29-9d25-41b8-afd0-5ec00ceff05d", \ - RUN+="/sbin/cryptsetup --key-file /etc/ceph/dmcrypt-keys/$env{ID_PART_ENTRY_UUID} --key-size 256 create $env{ID_PART_ENTRY_UUID} /dev/$name", \ - RUN+="/bin/bash -c 'while [ ! -e /dev/mapper/$env{ID_PART_ENTRY_UUID} ];do sleep 1; done'", \ - RUN+="/usr/sbin/ceph-disk activate /dev/mapper/$env{ID_PART_ENTRY_UUID}" + OWNER:="ceph", GROUP:="ceph", MODE:="660", \ + RUN+="/usr/sbin/ceph-disk --log-stdout -v trigger /dev/$name" -# Map data device and -# activate ceph-tagged partitions -# for dm-crypted data devices and luks +# DMCRYPT_LUKS_OSD_UUID ACTION=="add" SUBSYSTEM=="block", \ ENV{DEVTYPE}=="partition", \ ENV{ID_PART_ENTRY_TYPE}=="4fbd7e29-9d25-41b8-afd0-35865ceff05d", \ - RUN+="/sbin/cryptsetup --key-file /etc/ceph/dmcrypt-keys/$env{ID_PART_ENTRY_UUID}.luks.key luksOpen /dev/$name $env{ID_PART_ENTRY_UUID}", \ - RUN+="/bin/bash -c 'while [ ! 
-e /dev/mapper/$env{ID_PART_ENTRY_UUID} ];do sleep 1; done'", \ - RUN+="/usr/sbin/ceph-disk activate /dev/mapper/$env{ID_PART_ENTRY_UUID}" + OWNER:="ceph", GROUP:="ceph", MODE:="660", \ + RUN+="/usr/sbin/ceph-disk --log-stdout -v trigger /dev/$name" diff --git a/udev/95-ceph-osd.rules.systemd b/udev/95-ceph-osd.rules.systemd deleted file mode 100644 index 235c25509c7d3..0000000000000 --- a/udev/95-ceph-osd.rules.systemd +++ /dev/null @@ -1,32 +0,0 @@ -# activate ceph-tagged partitions -ACTION=="add", SUBSYSTEM=="block", \ - ENV{DEVTYPE}=="partition", \ - ENV{ID_PART_ENTRY_TYPE}=="4fbd7e29-9d25-41b8-afd0-062c0ceff05d", \ - OWNER:="ceph", GROUP:="ceph", MODE:="660", \ - TAG+="systemd", \ - ENV{SYSTEMD_WANTS}+="ceph-disk-activate@/dev/$name.service" - -# activate ceph-tagged partitions -ACTION=="add", SUBSYSTEM=="block", \ - ENV{DEVTYPE}=="partition", \ - ENV{ID_PART_ENTRY_TYPE}=="45b0969e-9b03-4f30-b4c6-b4b80ceff106", \ - OWNER:="ceph", GROUP:="ceph", MODE:="660", \ - TAG+="systemd", \ - ENV{SYSTEMD_WANTS}+="ceph-disk-activate-journal@/dev/$name.service" - -# Map journal if using dm-crypt -ACTION=="add" SUBSYSTEM=="block", \ - ENV{DEVTYPE}=="partition", \ - ENV{ID_PART_ENTRY_TYPE}=="45b0969e-9b03-4f30-b4c6-5ec00ceff106", \ - OWNER:="ceph", GROUP:="ceph", MODE:="660", \ - RUN+="/sbin/cryptsetup --key-file /etc/ceph/dmcrypt-keys/$env{ID_PART_ENTRY_UUID} --key-size 256 create $env{ID_PART_ENTRY_UUID} /dev/$name" - -# Map data device and -# activate ceph-tagged partitions -# for dm-crypted data devices -ACTION=="add" SUBSYSTEM=="block", \ - ENV{DEVTYPE}=="partition", \ - ENV{ID_PART_ENTRY_TYPE}=="4fbd7e29-9d25-41b8-afd0-5ec00ceff05d", \ - OWNER:="ceph", GROUP:="ceph", MODE:="660", \ - TAG+="systemd", \ - ENV{SYSTEMD_WANTS}+="ceph-disk-dmcrypt-activate@/dev/$name.service" From c14c3172bb5f68396c594a526ff93ab889c82f52 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 31 Aug 2015 15:20:10 -0400 Subject: [PATCH 328/654] ceph-disk: add trigger subcommand Either trigger a 
systemd event, or do it synchronously if there is no systemd. Signed-off-by: Sage Weil --- src/ceph-disk | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) diff --git a/src/ceph-disk b/src/ceph-disk index 10a7b64fafd4b..c6c4c9bcef692 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -232,6 +232,15 @@ class ExecutableNotFound(CephDiskException): ####### utils +def is_systemd(): + """ + Detect whether systemd is running + """ + with file('/proc/1/comm', 'rb') as i: + for line in i: + if 'systemd' in line: + return True + return False def maybe_mkdir(*a, **kw): """ @@ -2604,6 +2613,22 @@ def get_dev_fs(dev): else: return None +def get_dev_udev_properties(dev): + out, _ = command( + [ + '/sbin/blkid', + '-o', + 'udev', + '-p', + dev, + ] + ) + p = {} + for line in out.split('\n'): + if line: + (key, value) = line.split('=') + p[key] = value + return p def split_dev_base_partnum(dev): if is_mpath(dev): @@ -2943,6 +2968,170 @@ def main_zap(args): ########################### +def main_trigger(args): + if is_systemd() and not args.sync: + service='ceph-disk@{dev}.service'.format(dev=args.dev) + LOG.info('systemd detected, triggering %s' % service) + command( + [ + 'systemctl', + '--no-block', + 'start', + service, + ] + ) + return + + p = get_dev_udev_properties(args.dev) + + if 'ID_PART_ENTRY_TYPE' not in p: + raise Error('no ID_PART_ENTRY_TYPE for %s' % args.dev) + parttype = p['ID_PART_ENTRY_TYPE'] + + if 'ID_PART_ENTRY_UUID' not in p: + raise Error('no ID_PART_ENTRY_UUID for %s' % args.dev) + partid = p['ID_PART_ENTRY_UUID'] + + if parttype in [OSD_UUID, MPATH_OSD_UUID]: + command( + [ + 'ceph-disk', + 'activate', + args.dev, + ] + ) + elif parttype in [JOURNAL_UUID, MPATH_JOURNAL_UUID]: + command( + [ + 'ceph-disk', + 'activate-journal', + args.dev, + ] + ) + + # journals are easy: map, chown, activate-journal + elif parttype == DMCRYPT_JOURNAL_UUID: + command( + [ + '/sbin/cryptsetup', + '--key-file', + 
'/etc/ceph/dmcrypt-keys/{partid}'.format(partid=partid), + '--key-size', + '256', + 'create', + partid, + args.dev, + ] + ) + newdev='/dev/mapper/' + partid + count=0 + while not os.path.exists(newdev) and count <= 10: + time.sleep(1) + count += 1 + command( + [ + '/bin/chown', + 'ceph:ceph', + newdev, + ] + ) + command( + [ + '/usr/sbin/ceph-disk', + 'activate-journal', + newdev, + ] + ) + elif parttype == DMCRYPT_LUKS_JOURNAL_UUID: + command( + [ + '/sbin/cryptsetup', + '--key-file', + '/etc/ceph/dmcrypt-keys/{partid}.luks.key'.format( + partid=partid), + 'luksOpen', + args.dev, + partid, + ] + ) + newdev='/dev/mapper/' + partid + count=0 + while not os.path.exists(newdev) and count <= 10: + time.sleep(1) + count += 1 + command( + [ + '/bin/chown', + 'ceph:ceph', + newdev, + ] + ) + command( + [ + '/usr/sbin/ceph-disk', + 'activate-journal', + newdev, + ] + ) + + # osd data: map, activate + elif parttype == DMCRYPT_OSD_UUID: + command( + [ + '/sbin/cryptsetup', + '--key-file', + '/etc/ceph/dmcrypt-keys/{partid}'.format(partid=partid), + '--key-size', + '256', + 'create', + partid, + args.dev, + ] + ) + newdev='/dev/mapper/' + partid + count=0 + while not os.path.exists(newdev) and count <= 10: + time.sleep(1) + count += 1 + command( + [ + '/usr/sbin/ceph-disk', + 'activate', + newdev, + ] + ) + + elif parttype == DMCRYPT_LUKS_OSD_UUID: + command( + [ + '/sbin/cryptsetup', + '--key-file', + '/etc/ceph/dmcrypt-keys/{partid}.luks.key'.format( + partid=partid), + 'luksOpen', + args.dev, + partid, + ] + ) + newdev='/dev/mapper/' + partid + count=0 + while not os.path.exists(newdev) and count <= 10: + time.sleep(1) + count += 1 + command( + [ + '/usr/sbin/ceph-disk', + 'activate', + newdev, + ] + ) + + else: + raise Error('unrecognized partition type %s' % parttype) + + + +########################### def setup_statedir(dir): # XXX The following use of globals makes linting @@ -3021,10 +3210,27 @@ def parse_args(argv): make_list_parser(subparsers) 
make_suppress_parser(subparsers) make_zap_parser(subparsers) + make_trigger_parser(subparsers) args = parser.parse_args(argv) return args +def make_trigger_parser(subparsers): + trigger_parser = subparsers.add_parser('trigger', help='Trigger an event (caled by udev)') + trigger_parser.add_argument( + 'dev', + help=('device'), + ) + trigger_parser.add_argument( + '--sync', + action='store_true', default=None, + help=('do operation synchronously; do not trigger systemd'), + ) + trigger_parser.set_defaults( + func=main_trigger, + ) + return trigger_parser + def make_prepare_parser(subparsers): prepare_parser = subparsers.add_parser('prepare', help='Prepare a directory or disk for a Ceph OSD') prepare_parser.add_argument( From fcae1458bfbb724772b604dc01b53758ec38d671 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Sun, 9 Aug 2015 17:52:32 +0200 Subject: [PATCH 329/654] ceph-disk: fix dmcrypt_map() usage for LUKS activate 29431944c77adbc3464a8faeb7e052b24f821780 added a call to dmcrypt_map() during disk activation. The change is not suitable for use alongside the recently added dmcrypt LUKS support, because: - The callers don't correctly provide cryptsetup_parameters or luks arguments. - dmcrypt_map() calls LuksFormat, which should never be performed during disk activation. - The key file paths don't carry the luks suffix when required. This commit addresses these issues. Corresponding tests and a udev file update will follow. Signed-off-by: David Disseldorp Conflicts: src/ceph-disk --- src/ceph-disk | 53 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 9 deletions(-) diff --git a/src/ceph-disk b/src/ceph-disk index c6c4c9bcef692..a57c3a6fb253b 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -996,7 +996,8 @@ def dmcrypt_map( keypath, _uuid, cryptsetup_parameters, - luks + luks, + format_dev=False, ): """ Maps a device to a dmcrypt device. 
@@ -1033,7 +1034,8 @@ def dmcrypt_map( try: if luks: - command_check_call(luksFormat_args) + if format_dev: + command_check_call(luksFormat_args) command_check_call(luksOpen_args) else: # Plain mode has no format function, nor any validation that the key is correct. @@ -1590,7 +1592,14 @@ def prepare_dev( dev = None if osd_dm_keypath: - dev = dmcrypt_map(rawdev, osd_dm_keypath, osd_uuid, cryptsetup_parameters, luks) + dev = dmcrypt_map( + rawdev=rawdev, + keypath=osd_dm_keypath, + _uuid=osd_uuid, + cryptsetup_parameters=cryptsetup_parameters, + luks=luks, + format_dev=True, + ) else: dev = rawdev @@ -2123,11 +2132,24 @@ def mount_activate( # proceeding. rawdev = dev ptype = get_partition_type(rawdev) - if ptype not in [DMCRYPT_OSD_UUID]: + if ptype in [DMCRYPT_OSD_UUID]: + luks = False + cryptsetup_parameters = ['--key-size', '256'] + elif ptype in [DMCRYPT_LUKS_OSD_UUID]: + luks = True + cryptsetup_parameters = [] + else: raise Error('activate --dmcrypt called for invalid dev %s' % (dev)) part_uuid = get_partition_uuid(rawdev) - dmcrypt_key_path = os.path.join(dmcrypt_key_dir, part_uuid) - dev = dmcrypt_map(rawdev, dmcrypt_key_path, part_uuid) + dmcrypt_key_path = get_dmcrypt_key_path(part_uuid, dmcrypt_key_dir, luks) + dev = dmcrypt_map( + rawdev=rawdev, + keypath=dmcrypt_key_path, + _uuid=part_uuid, + cryptsetup_parameters=cryptsetup_parameters, + luks=luks, + format_dev=False, + ) try: fstype = detect_fstype(dev=dev) @@ -2481,11 +2503,24 @@ def main_activate_journal(args): # it before proceeding. 
rawdev = args.dev ptype = get_partition_type(rawdev) - if ptype not in [DMCRYPT_JOURNAL_UUID]: + if ptype in [DMCRYPT_JOURNAL_UUID]: + luks = False + cryptsetup_parameters = ['--key-size', '256'] + elif ptype in [DMCRYPT_LUKS_JOURNAL_UUID]: + luks = True + cryptsetup_parameters = [] + else: raise Error('activate-journal --dmcrypt called for invalid dev %s' % (rawdev)) part_uuid = get_partition_uuid(rawdev) - dmcrypt_key_path = os.path.join(args.dmcrypt_key_dir, part_uuid) - dev = dmcrypt_map(rawdev, dmcrypt_key_path, part_uuid) + dmcrypt_key_path = get_dmcrypt_key_path(part_uuid, args.dmcrypt_key_dir, luks) + dev = dmcrypt_map( + rawdev=rawdev, + keypath=dmcrypt_key_path, + _uuid=part_uuid, + cryptsetup_parameters=cryptsetup_parameters, + luks=luks, + format_dev=False, + ) else: dev = args.dev From 35c9962e7a36402fc3799290e910aa0103a6878c Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Tue, 1 Sep 2015 12:59:14 +0000 Subject: [PATCH 330/654] ceph-disk: only check partition_type if partition The multipath sanity checks of get_journal_osd_uuid must not try to verify the partition type when the device is not a partition. 
Signed-off-by: Loic Dachary --- src/ceph-disk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ceph-disk b/src/ceph-disk index a57c3a6fb253b..d8a97a2e31221 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -2462,7 +2462,8 @@ def get_journal_osd_uuid(path): if not stat.S_ISBLK(mode): raise Error('%s is not a block device' % path) - if (get_partition_type(path) == MPATH_JOURNAL_UUID and + if (is_partition(path) and + get_partition_type(path) == MPATH_JOURNAL_UUID and not is_mpath(path)): raise Error('%s is not a multipath block device' % path) From 00e653440c350435b3d084a568a84d4ceb7acdb0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 1 Sep 2015 11:40:41 -0400 Subject: [PATCH 331/654] ceph-disk: be a bit more verbose Signed-off-by: Sage Weil --- src/ceph-disk | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/ceph-disk b/src/ceph-disk index d8a97a2e31221..1aa64c98cd5a0 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -3028,6 +3028,13 @@ def main_trigger(args): raise Error('no ID_PART_ENTRY_UUID for %s' % args.dev) partid = p['ID_PART_ENTRY_UUID'] + LOG.info('trigger {dev} parttype {parttype} uuid {partid}'.format( + dev=args.dev, + parttype=parttype, + partid=partid, + ) + ) + if parttype in [OSD_UUID, MPATH_OSD_UUID]: command( [ From b226fad96866d199923f5ed4535580b21965df24 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 1 Sep 2015 11:45:56 -0400 Subject: [PATCH 332/654] ceph-disk: systemctl restart the ceph-disk@ service Otherwise the second time around activating something it will do nothing. 
Signed-off-by: Sage Weil --- src/ceph-disk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ceph-disk b/src/ceph-disk index 1aa64c98cd5a0..61d2b651ed4ad 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -3012,7 +3012,7 @@ def main_trigger(args): [ 'systemctl', '--no-block', - 'start', + 'restart', service, ] ) From 32446ffb00abb58e8f15f25eae17495ea0b38857 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Tue, 1 Sep 2015 16:44:59 +0000 Subject: [PATCH 333/654] tests: ceph-disk: dmcrypt simplification * Get rid of the cryptsetup calls that are redundant with what ceph prepare already does * Do not use the --dmcrypt-key-dir option. This is less coverage but it interferes with the udev logic and is expected to be refactored soon. Signed-off-by: Loic Dachary --- qa/workunits/ceph-disk/ceph-disk-test.py | 29 +++++++----------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/qa/workunits/ceph-disk/ceph-disk-test.py b/qa/workunits/ceph-disk/ceph-disk-test.py index bc6d2544cc64c..5f42d143bbf3d 100644 --- a/qa/workunits/ceph-disk/ceph-disk-test.py +++ b/qa/workunits/ceph-disk/ceph-disk-test.py @@ -13,6 +13,12 @@ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Library Public License for more details. 
# +# When debugging these tests, here are a few useful commands: +# +# export PATH=..:$PATH +# python ceph-disk-test.py --verbose --destroy-osd 0 +# py.test -s -v -k test_activate_dmcrypt_luks ceph-disk-test.py +# import argparse import json import logging @@ -178,33 +184,14 @@ def activate_dmcrypt(self, type): disk = c.unused_disks()[0] osd_uuid = str(uuid.uuid1()) journal_uuid = str(uuid.uuid1()) - d = tempfile.mkdtemp() c.sh("ceph-disk zap " + disk) c.sh("ceph-disk prepare " + - " --dmcrypt-key-dir " + d + " --osd-uuid " + osd_uuid + " --journal-uuid " + journal_uuid + " --dmcrypt " + " " + disk) - if type == 'plain': - c.sh("cryptsetup --key-file " + d + "/" + osd_uuid + - " --key-size 256 create " + osd_uuid + - " " + disk + "1") - else: - c.sh("cryptsetup --key-file " + d + "/" + osd_uuid + ".luks.key" + - " luksOpen " + - " " + disk + "1" + - " " + osd_uuid) - if type == 'plain': - c.sh("cryptsetup --key-file " + d + "/" + journal_uuid + - " --key-size 256 create " + journal_uuid + - " " + disk + "2") - else: - c.sh("cryptsetup --key-file " + d + "/" + journal_uuid + ".luks.key" + - " luksOpen " + - " " + disk + "2" + - " " + journal_uuid) - c.sh("ceph-disk activate /dev/mapper/" + osd_uuid) + data_partition = c.get_osd_partition(osd_uuid) + c.sh("ceph-disk activate --dmcrypt " + data_partition['path']) data_partition = c.get_osd_partition(osd_uuid) assert data_partition['type'] == 'data' assert data_partition['state'] == 'active' From 32331ede41ef5b1dc4eb85304d2e86d7c027c75c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 1 Sep 2015 13:18:21 -0400 Subject: [PATCH 334/654] os/Makefile.am: add os/fs/XFS.cc Signed-off-by: Sage Weil --- src/os/Makefile.am | 1 + 1 file changed, 1 insertion(+) diff --git a/src/os/Makefile.am b/src/os/Makefile.am index 4194461e9f34d..5c6fb02ed8e11 100644 --- a/src/os/Makefile.am +++ b/src/os/Makefile.am @@ -8,6 +8,7 @@ if ENABLE_SERVER libos_la_SOURCES = \ os/chain_xattr.cc \ os/fs/FS.cc \ + os/fs/XFS.cc \ os/DBObjectMap.cc \ 
os/GenericObjectMap.cc \ os/FileJournal.cc \ From be93b09fd27e938aa8128ddca635a5a242c8c045 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 1 Sep 2015 13:37:46 -0400 Subject: [PATCH 335/654] Revert "os/Makefile.am: add os/fs/XFS.cc" This reverts commit 32331ede41ef5b1dc4eb85304d2e86d7c027c75c. Doh, this is in a conditional below. --- src/os/Makefile.am | 1 - 1 file changed, 1 deletion(-) diff --git a/src/os/Makefile.am b/src/os/Makefile.am index 5c6fb02ed8e11..4194461e9f34d 100644 --- a/src/os/Makefile.am +++ b/src/os/Makefile.am @@ -8,7 +8,6 @@ if ENABLE_SERVER libos_la_SOURCES = \ os/chain_xattr.cc \ os/fs/FS.cc \ - os/fs/XFS.cc \ os/DBObjectMap.cc \ os/GenericObjectMap.cc \ os/FileJournal.cc \ From 10c0bfeaddeee8cf760e38ee54352d8ff66e637d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 27 Mar 2015 16:07:10 -0700 Subject: [PATCH 336/654] vstart.sh: debug newstore --- src/vstart.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/vstart.sh b/src/vstart.sh index 8f3ca4def82cd..467b197c4318d 100755 --- a/src/vstart.sh +++ b/src/vstart.sh @@ -78,6 +78,7 @@ overwrite_conf=1 cephx=1 #turn cephx on by default cache="" memstore=0 +newstore=0 journal=1 MON_ADDR="" @@ -271,6 +272,7 @@ else debug monc = 20 debug journal = 20 debug filestore = 20 + debug newstore = 30 debug rgw = 20 debug objclass = 20' CMDSDEBUG=' From d0a4bbaf697a037b7c78a161199c0608b576bb4d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Aug 2015 10:09:20 -0400 Subject: [PATCH 337/654] newstore: initial version This includes a bunch of new ceph_test_objectstore tests, and a ton of fixes to existing tests so that objects actually live inside the collections they are written to. 
Signed-off-by: Sage Weil --- src/common/config_opts.h | 11 + src/os/Makefile.am | 4 + src/os/ObjectStore.cc | 5 + src/os/ObjectStore.h | 2 + src/os/newstore/NewStore.cc | 3685 +++++++++++++++++++++++++++++ src/os/newstore/NewStore.h | 720 ++++++ src/os/newstore/newstore_types.cc | 208 ++ src/os/newstore/newstore_types.h | 147 ++ src/vstart.sh | 7 + 9 files changed, 4789 insertions(+) create mode 100644 src/os/newstore/NewStore.cc create mode 100644 src/os/newstore/NewStore.h create mode 100644 src/os/newstore/newstore_types.cc create mode 100644 src/os/newstore/newstore_types.h diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 191716146e435..6316c31bd04cf 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -134,6 +134,7 @@ SUBSYS(throttle, 1, 1) SUBSYS(refs, 0, 0) SUBSYS(xio, 1, 5) SUBSYS(compressor, 1, 5) +SUBSYS(newstore, 1, 5) OPTION(key, OPT_STR, "") OPTION(keyfile, OPT_STR, "") @@ -789,6 +790,16 @@ OPTION(memstore_device_bytes, OPT_U64, 1024*1024*1024) OPTION(memstore_page_set, OPT_BOOL, true) OPTION(memstore_page_size, OPT_U64, 64 << 10) +OPTION(newstore_max_dir_size, OPT_U32, 1000000) +OPTION(newstore_onode_map_size, OPT_U32, 1024) // onodes per collection +OPTION(newstore_backend, OPT_STR, "rocksdb") +OPTION(newstore_fail_eio, OPT_BOOL, true) +OPTION(newstore_sync_queue_transaction, OPT_BOOL, false) // perform write synchronously from queue_transaction +OPTION(newstore_fsync_threads, OPT_INT, 16) // num threads calling fsync +OPTION(newstore_fsync_thread_timeout, OPT_INT, 30) // thread timeout value +OPTION(newstore_fsync_thread_suicide_timeout, OPT_INT, 120) // suicide timeout value +OPTION(newstore_fid_prealloc, OPT_INT, 1024) + OPTION(filestore_omap_backend, OPT_STR, "leveldb") OPTION(filestore_debug_disable_sharded_check, OPT_BOOL, false) diff --git a/src/os/Makefile.am b/src/os/Makefile.am index 4194461e9f34d..de11a6fd9e969 100644 --- a/src/os/Makefile.am +++ b/src/os/Makefile.am @@ -1,4 +1,5 @@ 
libos_types_la_SOURCES = \ + os/newstore/newstore_types.cc \ os/Transaction.cc libos_types_la_CXXFLAGS = ${AM_CXXFLAGS} noinst_LTLIBRARIES += libos_types.la @@ -8,6 +9,7 @@ if ENABLE_SERVER libos_la_SOURCES = \ os/chain_xattr.cc \ os/fs/FS.cc \ + os/newstore/NewStore.cc \ os/DBObjectMap.cc \ os/GenericObjectMap.cc \ os/FileJournal.cc \ @@ -50,6 +52,8 @@ noinst_LTLIBRARIES += libos.la noinst_HEADERS += \ os/btrfs_ioctl.h \ os/chain_xattr.h \ + os/newstore/newstore_types.h \ + os/newstore/NewStore.h \ os/BtrfsFileStoreBackend.h \ os/CollectionIndex.h \ os/DBObjectMap.h \ diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc index 717df74ee3c76..69a6929afc6c4 100644 --- a/src/os/ObjectStore.cc +++ b/src/os/ObjectStore.cc @@ -19,6 +19,7 @@ #include "FileStore.h" #include "MemStore.h" #include "KeyValueStore.h" +#include "newstore/NewStore.h" #include "common/safe_io.h" ObjectStore *ObjectStore::create(CephContext *cct, @@ -37,6 +38,10 @@ ObjectStore *ObjectStore::create(CephContext *cct, cct->check_experimental_feature_enabled("keyvaluestore")) { return new KeyValueStore(data); } + if (type == "newstore" && + cct->check_experimental_feature_enabled("newstore")) { + return new NewStore(cct, data); + } return NULL; } diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h index 4f0d5764be2e6..57c99d540d81f 100644 --- a/src/os/ObjectStore.h +++ b/src/os/ObjectStore.h @@ -777,9 +777,11 @@ class ObjectStore { bufferlist::iterator data_bl_p; + public: vector colls; vector objects; + private: iterator(Transaction *t) : t(t), data_bl_p(t->data_bl.begin()), diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc new file mode 100644 index 0000000000000..67c251bd4f227 --- /dev/null +++ b/src/os/newstore/NewStore.cc @@ -0,0 +1,3685 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can 
redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "NewStore.h" +#include "include/compat.h" +#include "include/stringify.h" +#include "common/errno.h" +#include "common/safe_io.h" + +#define dout_subsys ceph_subsys_newstore + +/* + + TODO: + + * hobject sorting + - backfill + - scrub + - pgnls + - tiering agent position + - ObjectStore::collection_list_range + - ObjectStore::collection_list_partial + - DBObjectMap::clone lock ordering + - HashIndex::get_path_contents_by_hash + - HashIndex::list_by_hash + * open-by-handle + * use work queue for wal fsyncs and kv record removals + * avoid mtime updates when doing open-by-handle + * abstract out fs specifics + * fid xattr backpointer + * kill collection_list_range + + */ + +const string PREFIX_SUPER = "S"; // field -> value +const string PREFIX_COLL = "C"; // collection name -> (nothing) +const string PREFIX_OBJ = "O"; // object name -> onode +const string PREFIX_OMAP = "M"; // u64 + keyname -> value +const string PREFIX_WAL = "L"; // write ahead log + + +/* + * key + * + * The key string needs to lexicographically sort the same way that + * ghobject_t does. We do this by escaping anything <= to '%' with % + * plus a 2 digit hex string, and anything >= '~' with ~ plus the two + * hex digits. + * + * We use ! as a separator for strings; this works because it is < % + * and will get escaped if it is present in the string. + * + * For the fixed length numeric fields, we just use hex and '.' as a + * convenient visual separator. Two oddities here: + * + * 1. for the -1 shard value we use --; it's the only negative value + * and it sorts < 0 that way. + * + * 2. 
for the pool value, we add 2^63 so that it sorts correctly + * + * We could do something much more compact here, but it would be less + * readable by humans. :/ + */ + +const string KEY_SEP_S = "!"; + +static void append_escaped(const string &in, string *out) +{ + char hexbyte[8]; + for (string::const_iterator i = in.begin(); i != in.end(); ++i) { + if (*i <= '%') { + snprintf(hexbyte, sizeof(hexbyte), "%%%02x", (unsigned)*i); + out->append(hexbyte); + } else if (*i >= 126) { + snprintf(hexbyte, sizeof(hexbyte), "~%02x", (unsigned)*i); + out->append(hexbyte); + } else { + out->push_back(*i); + } + } +} + +static int decode_escaped(const char *p, string *out) +{ + const char *orig_p = p; + while (*p && *p != '!') { + if (*p == '%' || *p == '~') { + unsigned hex; + int r = sscanf(++p, "%2x", &hex); + if (r < 1) + return -EINVAL; + out->push_back((char)hex); + p += 2; + } else { + out->push_back(*p++); + } + } + return p - orig_p; +} + +// here is a sample (large) key +// --.7fffffffffffffff.B9FA767A.!0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa!0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaa!fffffffffffffffe.ffffffffffffffff + +static void get_coll_key_range(const coll_t& cid, int bits, + string *temp_start, string *temp_end, + string *start, string *end) +{ + temp_start->clear(); + temp_end->clear(); + start->clear(); + end->clear(); + + spg_t pgid; + if (cid.is_pg(&pgid)) { + char buf[PATH_MAX]; + + // make field ordering match with ghobject_t compare operations + if (pgid.shard == shard_id_t::NO_SHARD) { + // otherwise ff will sort *after* 0, not before. + *start = "--"; + } else { + snprintf(buf, sizeof(buf), "%02x", (int)pgid.shard); + start->append(buf); + } + *end = *start; + *temp_start = *start; + *temp_end = *start; + + snprintf(buf, sizeof(buf), ".%016llx.%08x.", + (unsigned long long)(pgid.pool() + 0x8000000000000000ull), + (unsigned)hobject_t::_reverse_bits(pgid.ps())); + start->append(buf); + snprintf(buf, sizeof(buf), ".%016llx.%08x.", + (unsigned long long)((-1ll - pgid.pool()) + 0x8000000000000000ull), + (unsigned)hobject_t::_reverse_bits(pgid.ps())); + temp_start->append(buf); + + uint64_t end_hash = hobject_t::_reverse_bits(pgid.ps()); + end_hash += (1ull << (32-bits)); + if (end_hash > 0xffffffff) { + snprintf(buf, sizeof(buf), ".%016llx.gggggggg.", + (unsigned long long)(pgid.pool() + 0x8000000000000000ull)); + end->append(buf); + snprintf(buf, sizeof(buf), ".%016llx.gggggggg.", + (unsigned long long)((-1ll - pgid.pool()) + 0x8000000000000000ull)); + temp_end->append(buf); + } else { + snprintf(buf, sizeof(buf), ".%016llx.%08x.", + (unsigned long long)(pgid.pool() + 0x8000000000000000ull), + (unsigned)end_hash); + end->append(buf); + snprintf(buf, sizeof(buf), ".%016llx.%08x.", + (unsigned long long)((-1ll - pgid.pool()) + 0x8000000000000000ull), + (unsigned)end_hash); + temp_end->append(buf); + } + } else if (cid.is_meta()) { + *start = "--.7fffffffffffffff.00000000."; + *end = "--.7fffffffffffffff.gggggggg."; + // no separate temp section + *temp_start = *end; + *temp_end = *end; + } else { + assert(0); + } +} + 
+static int get_key_object(const string& key, ghobject_t *oid); + +static void get_object_key(const ghobject_t& oid, string *key) +{ + char buf[PATH_MAX]; + char *t = buf; + char *end = t + sizeof(buf); + + key->clear(); + + // make field ordering match with ghobject_t compare operations + if (oid.shard_id == shard_id_t::NO_SHARD) { + // otherwise ff will sort *after* 0, not before. + *key = "--"; + } else { + snprintf(buf, sizeof(buf), "%02x", (int)oid.shard_id); + key->append(buf); + } + + t += snprintf(t, end - t, ".%016llx.%.*x.", + (unsigned long long)(oid.hobj.pool + 0x8000000000000000ull), + (int)(sizeof(oid.hobj.get_hash())*2), + (uint32_t)oid.hobj.get_bitwise_key_u32()); + key->append(buf); + + append_escaped(oid.hobj.nspace, key); + key->append(KEY_SEP_S); + + append_escaped(oid.hobj.get_effective_key(), key); + key->append(KEY_SEP_S); + + append_escaped(oid.hobj.oid.name, key); + key->append(KEY_SEP_S); + + t = buf; + t += snprintf(t, end - t, "%016llx.%016llx", + (long long unsigned)oid.hobj.snap, + (long long unsigned)oid.generation); + key->append(buf); + + // sanity check + if (true) { + ghobject_t t; + int r = get_key_object(*key, &t); + if (r || t != oid) { + derr << " r " << r << dendl; + derr << "key " << *key << dendl; + derr << "oid " << oid << dendl; + derr << " t " << t << dendl; + assert(t == oid); + } + } +} + +static int get_key_object(const string& key, ghobject_t *oid) +{ + int r; + const char *p = key.c_str(); + + if (key[0] == '-') { + oid->shard_id = shard_id_t::NO_SHARD; + } else { + unsigned shard; + r = sscanf(p, "%x", &shard); + if (r < 1) + return -1; + oid->shard_id = shard_id_t(shard); + } + if (p[2] != '.' || p[19] != '.' 
|| p[28] != '.') + return -2; + + unsigned hash; + uint64_t pool; + r = sscanf(p + 3, "%llx.%x", (unsigned long long*)&pool, &hash); + if (r < 2) + return -3; + oid->hobj.pool = pool - 0x8000000000000000; + oid->hobj.set_bitwise_key_u32(hash); + p += 3 + 2 + 16 + 8; + + r = decode_escaped(p, &oid->hobj.nspace); + if (r < 0) + return -4; + p += r + 1; + string okey; + r = decode_escaped(p, &okey); + if (r < 0) + return -5; + p += r + 1; + r = decode_escaped(p, &oid->hobj.oid.name); + if (r < 0) + return -6; + p += r + 1; + + oid->hobj.set_key(okey); + + r = sscanf(p, "%llx.%llx", (unsigned long long*)&oid->hobj.snap, + (unsigned long long*)&oid->generation); + if (r < 2) + return -7; + return 0; +} + +// '-' < '.' < '~' +void get_omap_header(uint64_t id, string *out) +{ + char buf[32]; + snprintf(buf, sizeof(buf), "%016llx-", (unsigned long long)id); + *out = buf; +} + +// hmm, I don't think there's any need to escape the user key since we +// have a clean prefix. +void get_omap_key(uint64_t id, const string& key, string *out) +{ + char buf[32]; + snprintf(buf, sizeof(buf), "%016llx.", (unsigned long long)id); + *out = buf; + out->append(key); +} + +void decode_omap_key(const string& key, string *user_key) +{ + *user_key = key.substr(17); +} + +void get_omap_tail(uint64_t id, string *out) +{ + char buf[32]; + snprintf(buf, sizeof(buf), "%016llx~", (unsigned long long)id); + *out = buf; +} + +void get_wal_key(uint64_t seq, string *out) +{ + char buf[32]; + snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)seq); + *out = buf; +} + +// Onode + +NewStore::Onode::Onode(const ghobject_t& o, const string& k) + : nref(0), + oid(o), + key(k), + dirty(false), + exists(true), + flush_lock("NewStore::Onode::flush_lock") { +} + +// OnodeHashLRU + +#undef dout_prefix +#define dout_prefix *_dout << "newstore.lru(" << this << ") " + +void NewStore::OnodeHashLRU::_touch(OnodeRef o) +{ + lru_list_t::iterator p = lru.iterator_to(*o); + lru.erase(p); + lru.push_front(*o); +} + 
+void NewStore::OnodeHashLRU::add(const ghobject_t& oid, OnodeRef o) +{ + Mutex::Locker l(lock); + dout(30) << __func__ << " " << oid << " " << o << dendl; + assert(onode_map.count(oid) == 0); + onode_map[oid] = o; + lru.push_back(*o); +} + +NewStore::OnodeRef NewStore::OnodeHashLRU::lookup(const ghobject_t& oid) +{ + Mutex::Locker l(lock); + dout(30) << __func__ << dendl; + ceph::unordered_map::iterator p = onode_map.find(oid); + if (p == onode_map.end()) { + dout(30) << __func__ << " " << oid << " miss" << dendl; + return OnodeRef(); + } + dout(30) << __func__ << " " << oid << " hit " << p->second << dendl; + _touch(p->second); + return p->second; +} + +void NewStore::OnodeHashLRU::clear() +{ + Mutex::Locker l(lock); + dout(10) << __func__ << dendl; + lru.clear(); + onode_map.clear(); +} + +void NewStore::OnodeHashLRU::remove(const ghobject_t& oid) +{ + Mutex::Locker l(lock); + ceph::unordered_map::iterator p = onode_map.find(oid); + if (p == onode_map.end()) { + dout(30) << __func__ << " " << oid << " miss" << dendl; + return; + } + dout(30) << __func__ << " " << oid << " hit " << p->second << dendl; + lru_list_t::iterator pi = lru.iterator_to(*p->second); + lru.erase(pi); + onode_map.erase(p); +} + +void NewStore::OnodeHashLRU::rename(const ghobject_t& old_oid, + const ghobject_t& new_oid) +{ + Mutex::Locker l(lock); + dout(30) << __func__ << " " << old_oid << " -> " << new_oid << dendl; + ceph::unordered_map::iterator po, pn; + po = onode_map.find(old_oid); + pn = onode_map.find(new_oid); + + assert(po != onode_map.end()); + if (pn != onode_map.end()) { + lru_list_t::iterator p = lru.iterator_to(*pn->second); + lru.erase(p); + onode_map.erase(pn); + } + onode_map.insert(make_pair(new_oid, po->second)); + _touch(po->second); + onode_map.erase(po); +} + +bool NewStore::OnodeHashLRU::get_next( + const ghobject_t& after, + pair *next) +{ + Mutex::Locker l(lock); + dout(20) << __func__ << " after " << after << dendl; + + if (after == ghobject_t()) { + if 
(lru.empty()) { + return false; + } + ceph::unordered_map::iterator p = onode_map.begin(); + assert(p != onode_map.end()); + next->first = p->first; + next->second = p->second; + return true; + } + + ceph::unordered_map::iterator p = onode_map.find(after); + assert(p != onode_map.end()); // for now + lru_list_t::iterator pi = lru.iterator_to(*p->second); + ++pi; + if (pi == lru.end()) { + return false; + } + next->first = pi->oid; + next->second = onode_map[pi->oid]; + return true; +} + +int NewStore::OnodeHashLRU::trim(int max) +{ + Mutex::Locker l(lock); + dout(20) << __func__ << " max " << max + << " size " << onode_map.size() << dendl; + int trimmed = 0; + int num = onode_map.size() - max; + lru_list_t::iterator p = lru.end(); + if (num) + p--; + while (num > 0) { + Onode *o = &*p; + int refs = o->nref.read(); + if (refs > 1) { + dout(20) << __func__ << " " << o->oid << " has " << refs + << " refs; stopping with " << num << " left to trim" << dendl; + break; + } + dout(30) << __func__ << " trim " << o->oid << dendl; + if (p != lru.begin()) { + lru.erase(p--); + } else { + lru.erase(p); + assert(num == 1); + } + o->get(); // paranoia + onode_map.erase(o->oid); + o->put(); + --num; + ++trimmed; + } + return trimmed; +} + +// ======================================================= + +// Collection + +#undef dout_prefix +#define dout_prefix *_dout << "newstore(" << store->path << ").collection(" << cid << ") " + +NewStore::Collection::Collection(NewStore *ns, coll_t c) + : store(ns), + cid(c), + lock("NewStore::Collection::lock"), + onode_map() //store->cct, store->cct->_conf->newstore_onode_map_size) +#warning fixme size the lru/cache +{ +} + +NewStore::OnodeRef NewStore::Collection::get_onode( + const ghobject_t& oid, + bool create) +{ + assert(create ? 
lock.is_wlocked() : lock.is_locked()); + + spg_t pgid; + if (cid.is_pg(&pgid)) { + if (!oid.match(cnode.bits, pgid.ps())) { + derr << __func__ << " oid " << oid << " not part of " << pgid + << " bits " << cnode.bits << dendl; + assert(0); + } + } + + OnodeRef o = onode_map.lookup(oid); + if (o) + return o; + + string key; + get_object_key(oid, &key); + + dout(20) << __func__ << " oid " << oid << " key '" << key << "'" << dendl; + + bufferlist v; + int r = store->db->get(PREFIX_OBJ, key, &v); + dout(20) << " r " << r << " v.len " << v.length() << dendl; + Onode *on; + assert(r >= 0); + if (v.length() == 0) { + if (!create) + return OnodeRef(); + + // new + on = new Onode(oid, key); + on->dirty = true; + } else { + // loaded + on = new Onode(oid, key); + bufferlist::iterator p = v.begin(); + ::decode(on->onode, p); + } + o.reset(on); + onode_map.add(oid, o); + return o; +} + + + +// ======================================================= + +#undef dout_prefix +#define dout_prefix *_dout << "newstore(" << path << ") " + + +NewStore::NewStore(CephContext *cct, const string& path) + : ObjectStore(path), + cct(cct), + db(NULL), + path_fd(-1), + fsid_fd(-1), + frag_fd(-1), + fset_fd(-1), + mounted(false), + coll_lock("NewStore::coll_lock"), + fid_lock("NewStore::fid_lock"), + wal_lock("NewStore::wal_lock"), + wal_seq(0), + finisher(cct), + fsync_tp(cct, + "NewStore::fsync_tp", + cct->_conf->newstore_fsync_threads, + "newstore_fsync_threads"), + fsync_wq(this, + cct->_conf->newstore_fsync_thread_timeout, + cct->_conf->newstore_fsync_thread_suicide_timeout, + &fsync_tp), + kv_sync_thread(this), + kv_lock("NewStore::kv_lock"), + kv_stop(false), + logger(NULL), + default_osr("default"), + reap_lock("NewStore::reap_lock") +{ + _init_logger(); +} + +NewStore::~NewStore() +{ + _shutdown_logger(); + assert(!mounted); + assert(db == NULL); + assert(fsid_fd < 0); + assert(frag_fd < 0); +} + +void NewStore::_init_logger() +{ + // XXX +} + +void NewStore::_shutdown_logger() +{ + // 
XXX +} + +int NewStore::peek_journal_fsid(uuid_d *fsid) +{ + return 0; +} + +int NewStore::_open_path() +{ + assert(path_fd < 0); + path_fd = ::open(path.c_str(), O_DIRECTORY); + if (path_fd < 0) { + int r = -errno; + derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r) + << dendl; + return r; + } + return 0; +} + +void NewStore::_close_path() +{ + VOID_TEMP_FAILURE_RETRY(::close(path_fd)); + path_fd = -1; +} + +int NewStore::_open_frag() +{ + assert(frag_fd < 0); + frag_fd = ::openat(path_fd, "fragments", O_DIRECTORY); + if (frag_fd < 0) { + int r = -errno; + derr << __func__ << " cannot open " << path << "/fragments: " + << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +int NewStore::_create_frag() +{ + assert(frag_fd < 0); + frag_fd = ::openat(path_fd, "fragments", O_DIRECTORY); + if (frag_fd < 0 && errno == ENOENT) { + int r = ::mkdirat(path_fd, "fragments", 0755); + if (r < 0) { + r = -errno; + derr << __func__ << " cannot create " << path << "/fragments: " + << cpp_strerror(r) << dendl; + return r; + } + frag_fd = ::openat(path_fd, "fragments", O_DIRECTORY); + } + if (frag_fd < 0) { + int r = -errno; + derr << __func__ << " cannot open created " << path << "/fragments: " + << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +void NewStore::_close_frag() +{ + if (fset_fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(fset_fd)); + fset_fd = -1; + } + VOID_TEMP_FAILURE_RETRY(::close(frag_fd)); + frag_fd = -1; +} + +int NewStore::_open_fsid(bool create) +{ + assert(fsid_fd < 0); + int flags = O_RDWR; + if (create) + flags |= O_CREAT; + fsid_fd = ::openat(path_fd, "fsid", flags, 0644); + if (fsid_fd < 0) { + int err = -errno; + derr << __func__ << " " << cpp_strerror(err) << dendl; + return err; + } + return 0; +} + +int NewStore::_read_fsid(uuid_d *uuid) +{ + char fsid_str[40]; + int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str)); + if (ret < 0) + return ret; + if (ret > 36) + fsid_str[36] = 0; + if 
(!uuid->parse(fsid_str)) + return -EINVAL; + return 0; +} + +int NewStore::_write_fsid() +{ + int r = ::ftruncate(fsid_fd, 0); + if (r < 0) { + r = -errno; + derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl; + return r; + } + string str = stringify(fsid) + "\n"; + r = safe_write(fsid_fd, str.c_str(), str.length()); + if (r < 0) { + derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl; + return r; + } + r = ::fsync(fsid_fd); + if (r < 0) { + derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +void NewStore::_close_fsid() +{ + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; +} + +int NewStore::_lock_fsid() +{ + struct flock l; + memset(&l, 0, sizeof(l)); + l.l_type = F_WRLCK; + l.l_whence = SEEK_SET; + l.l_start = 0; + l.l_len = 0; + int r = ::fcntl(fsid_fd, F_SETLK, &l); + if (r < 0) { + int err = errno; + derr << __func__ << " failed to lock " << path << "/fsid" + << " (is another ceph-osd still running?)" + << cpp_strerror(err) << dendl; + return -err; + } + return 0; +} + +bool NewStore::test_mount_in_use() +{ + // most error conditions mean the mount is not in use (e.g., because + // it doesn't exist). only if we fail to lock do we conclude it is + // in use. 
+ bool ret = false; + int r = _open_path(); + if (r < 0) + return false; + r = _open_fsid(false); + if (r < 0) + goto out_path; + r = _lock_fsid(); + if (r < 0) + ret = true; // if we can't lock, it is in used + _close_fsid(); + out_path: + _close_path(); + return ret; +} + +int NewStore::_open_db() +{ + assert(!db); + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/db", path.c_str()); + db = KeyValueDB::create(g_ceph_context, + g_conf->newstore_backend, + fn); + if (!db) { + derr << __func__ << " error creating db" << dendl; + delete db; + db = NULL; + return -EIO; + } + db->init(); + stringstream err; + if (db->create_and_open(err)) { + derr << __func__ << " erroring opening db: " << err << dendl; + delete db; + db = NULL; + return -EIO; + } + return 0; +} + +void NewStore::_close_db() +{ + assert(db); + delete db; + db = NULL; +} + +int NewStore::_open_collections() +{ + KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL); + for (it->upper_bound(string()); + it->valid(); + it->next()) { + coll_t cid; + if (cid.parse(it->key())) { + CollectionRef c(new Collection(this, cid)); + bufferlist bl; + db->get(PREFIX_COLL, it->key(), &bl); + bufferlist::iterator p = bl.begin(); + ::decode(c->cnode, p); + dout(20) << __func__ << " opened " << cid << dendl; + coll_map[cid] = c; + } else { + dout(20) << __func__ << " unrecognized collection " << it->key() << dendl; + } + } + return 0; +} + +int NewStore::mkfs() +{ + dout(1) << __func__ << " path " << path << dendl; + int r; + uuid_d old_fsid; + + r = _open_path(); + if (r < 0) + return r; + + r = _open_fsid(true); + if (r < 0) + goto out_path_fd; + + r = _lock_fsid(); + if (r < 0) + goto out_close_fsid; + + r = _read_fsid(&old_fsid); + if (r < 0 && old_fsid.is_zero()) { + if (fsid.is_zero()) { + fsid.generate_random(); + dout(1) << __func__ << " generated fsid " << fsid << dendl; + } else { + dout(1) << __func__ << " using provided fsid " << fsid << dendl; + } + r = _write_fsid(); + if (r < 0) + goto out_close_fsid; + 
} else { + if (!fsid.is_zero() && fsid != old_fsid) { + derr << __func__ << " on-disk fsid " << old_fsid + << " != provided " << fsid << dendl; + r = -EINVAL; + goto out_close_fsid; + } + fsid = old_fsid; + dout(1) << __func__ << " fsid is already set to " << fsid << dendl; + } + + r = _create_frag(); + if (r < 0) + goto out_close_fsid; + + r = _open_db(); + if (r < 0) + goto out_close_frag; + + // FIXME: superblock + + dout(10) << __func__ << " success" << dendl; + r = 0; + _close_db(); + + out_close_frag: + _close_frag(); + out_close_fsid: + _close_fsid(); + out_path_fd: + _close_path(); + return r; +} + +int NewStore::mount() +{ + dout(1) << __func__ << " path " << path << dendl; + + int r = _open_path(); + if (r < 0) + return r; + r = _open_fsid(false); + if (r < 0) + goto out_path; + + r = _read_fsid(&fsid); + if (r < 0) + goto out_fsid; + + r = _lock_fsid(); + if (r < 0) + goto out_fsid; + + r = _open_frag(); + if (r < 0) + goto out_fsid; + + // FIXME: superblock, features + + r = _open_db(); + if (r < 0) + goto out_frag; + + r = _recover_next_fid(); + if (r < 0) + goto out_db; + + r = _recover_next_omap_id(); + if (r < 0) + goto out_db; + + r = _open_collections(); + if (r < 0) + goto out_db; + + r = _replay_wal(); + if (r < 0) + goto out_db; + + finisher.start(); + fsync_tp.start(); + kv_sync_thread.create(); + + mounted = true; + return 0; + + out_db: + _close_db(); + out_frag: + _close_frag(); + out_fsid: + _close_fsid(); + out_path: + _close_path(); + return r; +} + +int NewStore::umount() +{ + assert(mounted); + dout(1) << __func__ << dendl; + + sync_and_flush(); + _reap_collections(); + + dout(20) << __func__ << " stopping fsync_wq" << dendl; + fsync_tp.stop(); + dout(20) << __func__ << " stopping kv thread" << dendl; + _kv_stop(); + dout(20) << __func__ << " draining finisher" << dendl; + finisher.wait_for_empty(); + dout(20) << __func__ << " stopping finisher" << dendl; + finisher.stop(); + dout(20) << __func__ << " closing" << dendl; + + mounted = 
false; + if (fset_fd >= 0) + VOID_TEMP_FAILURE_RETRY(::close(fset_fd)); + _close_db(); + _close_frag(); + _close_fsid(); + _close_path(); + return 0; +} + +void NewStore::sync(Context *onsync) +{ +#warning write sync +} + +void NewStore::sync() +{ +#warning write sync +} + +void NewStore::flush() +{ +#warning write flush +} + +void NewStore::sync_and_flush() +{ + dout(10) << __func__ << dendl; + + dout(20) << " flushing fsync wq" << dendl; + fsync_wq.flush(); + + kv_lock.Lock(); + while (!kv_committing.empty() || + !kv_queue.empty()) { + dout(20) << " waiting for kv to commit" << dendl; + kv_sync_cond.Wait(kv_lock); + } + kv_lock.Unlock(); + + dout(10) << __func__ << " done" << dendl; +} + +int NewStore::statfs(struct statfs *buf) +{ + if (::statfs(path.c_str(), buf) < 0) { + int r = -errno; + assert(!g_conf->newstore_fail_eio || r != -EIO); + return r; + } + return 0; +} + +// --------------- +// cache + +NewStore::CollectionRef NewStore::_get_collection(coll_t cid) +{ + RWLock::RLocker l(coll_lock); + ceph::unordered_map::iterator cp = coll_map.find(cid); + if (cp == coll_map.end()) + return CollectionRef(); + return cp->second; +} + +void NewStore::_queue_reap_collection(CollectionRef& c) +{ + dout(10) << __func__ << " " << c->cid << dendl; + Mutex::Locker l(reap_lock); + removed_collections.push_back(c); +} + +void NewStore::_reap_collections() +{ + Mutex::Locker l(reap_lock); + for (list::iterator p = removed_collections.begin(); + p != removed_collections.end(); + ++p) { + CollectionRef c = *p; + dout(10) << __func__ << " " << c->cid << dendl; + { + pair next; + while (c->onode_map.get_next(next.first, &next)) { + assert(!next.second->exists); + if (!next.second->flush_txns.empty()) { + dout(10) << __func__ << " " << c->cid << " " << next.second->oid + << " flush_txns " << next.second->flush_txns << dendl; + return; + } + } + } + c->onode_map.clear(); + dout(10) << __func__ << " " << c->cid << " done" << dendl; + } + + dout(10) << __func__ << " all reaped" << 
dendl; + reap_cond.Signal(); +} + +// --------------- +// read operations + +bool NewStore::exists(coll_t cid, const ghobject_t& oid) +{ + dout(10) << __func__ << " " << cid << " " << oid << dendl; + CollectionRef c = _get_collection(cid); + if (!c) + return false; + RWLock::RLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) + return false; + return true; +} + +int NewStore::stat( + coll_t cid, + const ghobject_t& oid, + struct stat *st, + bool allow_eio) +{ + dout(10) << __func__ << " " << cid << " " << oid << dendl; + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) + return -ENOENT; + st->st_size = o->onode.size; + st->st_blksize = 4096; + st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize; + st->st_nlink = 1; + return 0; +} + +int NewStore::read( + coll_t cid, + const ghobject_t& oid, + uint64_t offset, + size_t length, + bufferlist& bl, + uint32_t op_flags, + bool allow_eio) +{ + dout(15) << __func__ << " " << cid << " " << oid + << " " << offset << "~" << length + << dendl; + bl.clear(); + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + + int r; + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + + if (offset == length && offset == 0) + length = o->onode.size; + + r = _do_read(o, offset, length, bl, op_flags); + + out: + dout(10) << __func__ << " " << cid << " " << oid + << " " << offset << "~" << length + << " = " << r << dendl; + return r; +} + +int NewStore::_do_read( + OnodeRef o, + uint64_t offset, + size_t length, + bufferlist& bl, + uint32_t op_flags) +{ + map::iterator p; + int r; + int fd = -1; + fid_t cur_fid; + + dout(20) << __func__ << " " << offset << "~" << length << " size " + << o->onode.size << dendl; + + if (offset > o->onode.size) { + r = 0; + goto out; + } + + if (offset + 
length > o->onode.size) { + length = o->onode.size - offset; + } + + o->flush(); + + r = 0; + + p = o->onode.data_map.begin(); // fixme + if (p->first > offset && p != o->onode.data_map.begin()) { + --p; + } + for ( ; length > 0 && p != o->onode.data_map.end(); ++p) { + assert(p->first == 0); + assert(p->second.offset == 0); + assert(p->second.length == o->onode.size); + dout(30) << __func__ << " x " << p->first << "~" << p->second.length + << " in " << p->second.fid << dendl; + if (p->first + p->second.length <= offset) { + dout(30) << __func__ << " skipping " << p->first << "~" << p->second.length + << dendl; + continue; + } + if (p->first > offset) { + unsigned l = p->first - offset; + dout(30) << __func__ << " zero " << offset << "~" << l << dendl; + bufferptr bp(l); + bp.zero(); + bl.append(bp); + length = length - l; + } + if (p->second.fid != cur_fid) { + cur_fid = p->second.fid; + if (fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + } + fd = _open_fid(cur_fid); + if (fd < 0) { + r = fd; + goto out; + } + } + unsigned x_off; + if (p->first < offset) { + x_off = offset - p->first; + } else { + x_off = 0; + } + unsigned x_len = MIN(length, p->second.length - x_off); + dout(30) << __func__ << " data " << offset << "~" << x_len + << " fid " << cur_fid << " offset " << x_off + p->second.offset + << dendl; + r = ::lseek64(fd, p->second.offset + x_off, SEEK_SET); + if (r < 0) { + r = -errno; + goto out; + } + bufferlist t; + r = t.read_fd(fd, x_len); + if (r < 0) { + goto out; + } + bl.claim_append(t); + if ((unsigned)r < x_len) { + dout(10) << __func__ << " short read " << r << " < " << x_len + << " from " << cur_fid << " offset " << p->second.offset + x_off + << dendl; + bufferptr z(x_len - r); + z.zero(); + bl.append(z); + } + offset += x_len; + length -= x_len; + } + if (length > 0 && p == o->onode.data_map.end()) { + dout(30) << __func__ << " trailing zero " << offset << "~" << length << dendl; + bufferptr bp(length); + bp.zero(); + bl.push_back(bp); + } + 
r = bl.length(); + out: + if (fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + } + return r; +} + + +int NewStore::fiemap( + coll_t cid, + const ghobject_t& oid, + uint64_t offset, + size_t len, + bufferlist& bl) +{ + assert(0); +} + +int NewStore::getattr( + coll_t cid, + const ghobject_t& oid, + const char *name, + bufferptr& value) +{ + dout(15) << __func__ << " " << cid << " " << oid << " " << name << dendl; + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + int r; + string k(name); + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + + if (!o->onode.attrs.count(k)) { + r = -ENODATA; + goto out; + } + value = o->onode.attrs[k]; + r = 0; + out: + dout(10) << __func__ << " " << cid << " " << oid << " " << name + << " = " << r << dendl; + return r; +} + +int NewStore::getattrs( + coll_t cid, + const ghobject_t& oid, + map& aset) +{ + dout(15) << __func__ << " " << cid << " " << oid << dendl; + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + int r; + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + aset = o->onode.attrs; + r = 0; + out: + dout(10) << __func__ << " " << cid << " " << oid + << " = " << r << dendl; + return r; +} + +int NewStore::list_collections(vector& ls) +{ + RWLock::RLocker l(coll_lock); + for (ceph::unordered_map::iterator p = coll_map.begin(); + p != coll_map.end(); + ++p) + ls.push_back(p->first); + return 0; +} + +bool NewStore::collection_exists(coll_t c) +{ + RWLock::RLocker l(coll_lock); + return coll_map.count(c); +} + +bool NewStore::collection_empty(coll_t cid) +{ + dout(15) << __func__ << " " << cid << dendl; + vector ls; + ghobject_t next; + int r = collection_list_partial(cid, ghobject_t(), 5, 5, CEPH_NOSNAP, + &ls, &next); + if (r < 0) + return false; // fixme? 
+ bool empty = ls.empty(); + dout(10) << __func__ << " " << cid << " = " << (int)empty << dendl; + return empty; +} + +int NewStore::collection_list(coll_t cid, vector& o) +{ + dout(15) << __func__ << " " << cid << dendl; + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + int r = 0; + KeyValueDB::Iterator it = db->get_iterator(PREFIX_OBJ); + string temp_start_key, temp_end_key; + string start_key, end_key; + bool temp = true; + const char *end; + get_coll_key_range(cid, c->cnode.bits, &temp_start_key, &temp_end_key, + &start_key, &end_key); + dout(20) << __func__ << " range " << temp_start_key << " to " << temp_end_key + << " and " << start_key << " to " << end_key << dendl; + end = temp_start_key.c_str(); + it->upper_bound(temp_start_key); + while (true) { + if (!it->valid() || strcmp(it->key().c_str(), end) > 0) { + if (!it->valid()) + dout(20) << __func__ << " iterator not valid (end of db?)" << dendl; + else + dout(20) << __func__ << " key " << it->key() << " > " << end << dendl; + if (temp) { + dout(30) << __func__ << " switch to non-temp namespace" << dendl; + temp = false; + it->upper_bound(start_key); + end = end_key.c_str(); + continue; + } + break; + } + dout(20) << __func__ << " key " << it->key() << dendl; + ghobject_t oid; + int r = get_key_object(it->key(), &oid); + assert(r == 0); + o.push_back(oid); + it->next(); + } + dout(10) << __func__ << " " << cid << " = " << r << dendl; + return r; +} + +int NewStore::collection_list_partial( + coll_t cid, ghobject_t start, + int min, int max, snapid_t snap, + vector *ls, ghobject_t *pnext) +{ + dout(15) << __func__ << " " << cid + << " start " << start << " min/max " << min << "/" << max + << " snap " << snap << dendl; + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + int r = 0; + KeyValueDB::Iterator it; + string temp_start_key, temp_end_key; + string start_key, end_key; + bool set_next = false; + const 
char *end; + bool temp; + + if (start == ghobject_t::get_max()) + goto out; + get_coll_key_range(cid, c->cnode.bits, &temp_start_key, &temp_end_key, + &start_key, &end_key); + dout(20) << __func__ << " range " << temp_start_key << " to " + << temp_end_key << " and " << start_key << " to " << end_key + << " start " << start << dendl; + it = db->get_iterator(PREFIX_OBJ); + if (start == ghobject_t()) { + it->upper_bound(temp_start_key); + temp = true; + } else { + string k; + get_object_key(start, &k); + if (start.hobj.is_temp()) { + temp = true; + assert(k >= temp_start_key && k < temp_end_key); + } else { + temp = false; + assert(k >= start_key && k < end_key); + } + it->upper_bound(k); + } + end = temp ? temp_end_key.c_str() : end_key.c_str(); + while (true) { + if (!it->valid() || strcmp(it->key().c_str(), end) > 0) { + if (!it->valid()) + dout(20) << __func__ << " iterator not valid (end of db?)" << dendl; + else + dout(20) << __func__ << " key " << it->key() << " > " << end << dendl; + if (temp) { + dout(30) << __func__ << " switch to non-temp namespace" << dendl; + temp = false; + it->upper_bound(start_key); + end = end_key.c_str(); + continue; + } + break; + } + dout(20) << __func__ << " key " << it->key() << dendl; + ghobject_t oid; + int r = get_key_object(it->key(), &oid); + assert(r == 0); + ls->push_back(oid); + if (ls->size() >= (unsigned)max) { + *pnext = oid; + set_next = true; + break; + } + it->next(); + } + if (!set_next) { + *pnext = ghobject_t::get_max(); + } + out: + dout(10) << __func__ << " " << cid + << " start " << start << " min/max " << min << "/" << max + << " snap " << snap << " = " << r << ", ls.size() = " << ls->size() + << ", next = " << *pnext << dendl; + return r; +} + +int NewStore::collection_list_range( + coll_t cid, ghobject_t start, ghobject_t end, + snapid_t seq, vector *ls) +{ + dout(15) << __func__ << " " << cid + << " start " << start << " end " << end + << " snap " << seq << dendl; + CollectionRef c = _get_collection(cid); 
+ if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + int r = 0; + KeyValueDB::Iterator it; + string temp_start_key, temp_end_key; + string start_key, end_key; + string end_str; + const char *pend; + bool temp; + + if (start == ghobject_t::get_max() || end == ghobject_t()) + goto out; + get_coll_key_range(cid, c->cnode.bits, &temp_start_key, &temp_end_key, + &start_key, &end_key); + dout(20) << __func__ << " range " << temp_start_key << " to " + << temp_end_key << " and " << start_key << " to " << end_key + << " start " << start << " end " << end << dendl; + it = db->get_iterator(PREFIX_OBJ); + if (start == ghobject_t()) { + it->upper_bound(temp_start_key); + temp = true; + } else { + string k; + get_object_key(start, &k); + if (start.hobj.is_temp()) { + temp = true; + assert(k >= temp_start_key && k < temp_end_key); + } else { + temp = false; + assert(k >= start_key && k < end_key); + } + it->upper_bound(k); + } + get_object_key(end, &end_str); + if (end.hobj.is_temp()) { + if (temp) + pend = end_str.c_str(); + else + goto out; + } else { + pend = temp ? 
temp_end_key.c_str() : end_str.c_str(); + } + while (true) { + if (!it->valid() || strcmp(it->key().c_str(), pend) > 0) { + if (!it->valid()) + dout(20) << __func__ << " iterator not valid (end of db?)" << dendl; + else + dout(20) << __func__ << " key " << it->key() << " > " << pend << dendl; + if (temp) { + if (end.hobj.is_temp()) { + break; + } + dout(30) << __func__ << " switch to non-temp namespace" << dendl; + temp = false; + it->upper_bound(start_key); + pend = end_str.c_str(); + continue; + } + break; + } + dout(20) << __func__ << " key " << it->key() << dendl; + ghobject_t oid; + int r = get_key_object(it->key(), &oid); + assert(r == 0); + ls->push_back(oid); + it->next(); + } + out: + dout(10) << __func__ << " " << cid + << " start " << start << " end " << end + << " snap " << seq << " = " << r << dendl; + return r; +} + +// omap reads + +NewStore::OmapIteratorImpl::OmapIteratorImpl(CollectionRef c, OnodeRef o) + : c(c), o(o) +{ + RWLock::RLocker l(c->lock); + if (o->onode.omap_head) { + get_omap_header(o->onode.omap_head, &head); + get_omap_tail(o->onode.omap_head, &tail); + it->upper_bound(head); + } +} + +int NewStore::OmapIteratorImpl::seek_to_first() +{ + RWLock::RLocker l(c->lock); + if (o->onode.omap_head) { + it->upper_bound(head); + } else { + it = KeyValueDB::Iterator(); + } + return 0; +} + +int NewStore::OmapIteratorImpl::upper_bound(const string& after) +{ + RWLock::RLocker l(c->lock); + if (o->onode.omap_head) { + string key; + get_omap_key(o->onode.omap_head, after, &key); + it->upper_bound(head); + } else { + it = KeyValueDB::Iterator(); + } + return 0; +} + +int NewStore::OmapIteratorImpl::lower_bound(const string& to) +{ + RWLock::RLocker l(c->lock); + if (o->onode.omap_head) { + string key; + get_omap_key(o->onode.omap_head, to, &key); + it->lower_bound(head); + } else { + it = KeyValueDB::Iterator(); + } + return 0; +} + +bool NewStore::OmapIteratorImpl::valid() +{ + RWLock::RLocker l(c->lock); + return it->valid(); +} + +int 
NewStore::OmapIteratorImpl::next() +{ + RWLock::RLocker l(c->lock); + it->next(); + if (!it->valid() || it->key() == tail) { + it = KeyValueDB::Iterator(); + } + return 0; +} + +string NewStore::OmapIteratorImpl::key() +{ + RWLock::RLocker l(c->lock); + assert(it->valid()); + return it->key(); +} + +bufferlist NewStore::OmapIteratorImpl::value() +{ + RWLock::RLocker l(c->lock); + assert(it->valid()); + return it->value(); +} + +int NewStore::omap_get( + coll_t cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + map *out /// < [out] Key to value map + ) +{ + dout(15) << __func__ << " " << cid << " oid " << oid << dendl; + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.omap_head) + goto out; + { + KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); + string head, tail; + get_omap_header(o->onode.omap_head, &head); + get_omap_tail(o->onode.omap_head, &tail); + it->lower_bound(head); + while (it->valid()) { + if (it->key() == head) { + dout(30) << __func__ << " got header" << dendl; + *header = it->value(); + } else if (it->key() == tail) { + dout(30) << __func__ << " reached tail" << dendl; + break; + } else { + string user_key; + decode_omap_key(it->key(), &user_key); + dout(30) << __func__ << " got " << it->key() << " -> " << user_key << dendl; + assert(it->key() < tail); + (*out)[user_key] = it->value(); + } + it->next(); + } + } + out: + dout(10) << __func__ << " " << cid << " oid " << oid << " = " << r << dendl; + return r; +} + +int NewStore::omap_get_header( + coll_t cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + bool allow_eio ///< [in] don't assert on eio + ) +{ + dout(15) << __func__ << " 
" << cid << " oid " << oid << dendl; + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.omap_head) + goto out; + { + string head; + get_omap_header(o->onode.omap_head, &head); + if (db->get(PREFIX_OMAP, head, header) >= 0) { + dout(30) << __func__ << " got header" << dendl; + } else { + dout(30) << __func__ << " no header" << dendl; + } + } + out: + dout(10) << __func__ << " " << cid << " oid " << oid << " = " << r << dendl; + return r; +} + +int NewStore::omap_get_keys( + coll_t cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + set *keys ///< [out] Keys defined on oid + ) +{ + dout(15) << __func__ << " " << cid << " oid " << oid << dendl; + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.omap_head) + goto out; + { + KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); + string head, tail; + get_omap_header(o->onode.omap_head, &head); + get_omap_tail(o->onode.omap_head, &tail); + it->lower_bound(head); + while (it->valid()) { + if (it->key() == head) { + dout(30) << __func__ << " skipping head" << dendl; + it->next(); + continue; + } + if (it->key() == tail) { + dout(30) << __func__ << " reached tail" << dendl; + break; + } + string user_key; + decode_omap_key(it->key(), &user_key); + dout(30) << __func__ << " got " << it->key() << " -> " << user_key << dendl; + assert(it->key() < tail); + keys->insert(user_key); + it->next(); + } + } + out: + dout(10) << __func__ << " " << cid << " oid " << oid << " = " << r << dendl; + return r; +} + +int NewStore::omap_get_values( + coll_t cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object 
containing omap + const set &keys, ///< [in] Keys to get + map *out ///< [out] Returned keys and values + ) +{ + dout(15) << __func__ << " " << cid << " oid " << oid << dendl; + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.omap_head) + goto out; + for (set::const_iterator p = keys.begin(); p != keys.end(); ++p) { + string key; + get_omap_key(o->onode.omap_head, *p, &key); + bufferlist val; + if (db->get(PREFIX_OMAP, key, &val) >= 0) { + dout(30) << __func__ << " got " << key << " -> " << *p << dendl; + out->insert(make_pair(*p, val)); + } + } + out: + dout(10) << __func__ << " " << cid << " oid " << oid << " = " << r << dendl; + return r; +} + +int NewStore::omap_check_keys( + coll_t cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set &keys, ///< [in] Keys to check + set *out ///< [out] Subset of keys defined on oid + ) +{ + dout(15) << __func__ << " " << cid << " oid " << oid << dendl; + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.omap_head) + goto out; + for (set::const_iterator p = keys.begin(); p != keys.end(); ++p) { + string key; + get_omap_key(o->onode.omap_head, *p, &key); + bufferlist val; + if (db->get(PREFIX_OMAP, key, &val) >= 0) { + dout(30) << __func__ << " have " << key << " -> " << *p << dendl; + out->insert(*p); + } else { + dout(30) << __func__ << " miss " << key << " -> " << *p << dendl; + } + } + out: + dout(10) << __func__ << " " << cid << " oid " << oid << " = " << r << dendl; + return r; +} + +ObjectMap::ObjectMapIterator NewStore::get_omap_iterator( + coll_t cid, ///< [in] collection + const ghobject_t &oid ///< [in] object + ) 
+{ + assert(0); +} + + +// ----------------- +// write helpers + +int NewStore::_recover_next_omap_id() +{ + KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); + it->lower_bound("GGGGGGGG"); + if (!it->valid()) { + omap_id.set(1); + dout(10) << __func__ << " no omap keys, starting at 1" << dendl; + return 0; + } + dout(20) << __func__ << " last key is " << it->key() << dendl; + uint64_t id; + int r = sscanf(it->key().c_str(), "%llx", (unsigned long long*)&id); + if (r < 0) { + derr << "unable to parse " << it->key() << dendl; + return -EIO; + } + omap_id.set(id); + return 0; +} + +void NewStore::_get_omap_id(TransContext *txc, OnodeRef o) +{ + if (o->onode.omap_head) + return; + + o->onode.omap_head = omap_id.inc(); + dout(10) << __func__ << " assigned " << o->oid + << " id " << o->onode.omap_head << dendl; + string tail; + get_omap_tail(o->onode.omap_head, &tail); + bufferlist empty; + txc->t->set(PREFIX_OMAP, tail, empty); +} + +int NewStore::_recover_next_fid() +{ + bufferlist bl; + db->get(PREFIX_SUPER, "fid_max", &bl); + try { + ::decode(fid_max, bl); + } catch (buffer::error& e) { + } + dout(1) << __func__ << " old fid_max " << fid_max << dendl; + fid_last = fid_max; + + if (fid_last.fset > 0) { + char s[32]; + snprintf(s, sizeof(s), "%u", fid_last.fset); + assert(fset_fd < 0); + fset_fd = ::openat(frag_fd, s, O_DIRECTORY, 0644); + if (fset_fd < 0) { + int r = -errno; + derr << __func__ << " cannot open created " << path << "/fragments/" + << s << ": " << cpp_strerror(r) << dendl; + return r; + } + } + + return 0; +} + +int NewStore::_open_fid(fid_t fid) +{ + char fn[32]; + snprintf(fn, sizeof(fn), "%u/%u", fid.fset, fid.fno); + int fd = ::openat(frag_fd, fn, O_RDWR); + if (fd < 0) { + int r = -errno; + derr << __func__ << " on " << fid << ": " << cpp_strerror(r) << dendl; + return r; + } + dout(30) << __func__ << " " << fid << " = " << fd << dendl; + return fd; +} + +int NewStore::_create_fid(TransContext *txc, fid_t *fid) +{ + { + Mutex::Locker 
l(fid_lock); + if (fid_last.fset > 0 && + fid_last.fno > 0 && + fid_last.fset == fid_max.fset && + fid_last.fno < g_conf->newstore_max_dir_size) { + ++fid_last.fno; + if (fid_last.fno >= fid_max.fno) { + // raise fid_max, same fset + fid_max.fno += g_conf->newstore_fid_prealloc; + assert(fid_max.fno >= fid_last.fno); + bufferlist bl; + ::encode(fid_max, bl); + txc->t->set(PREFIX_SUPER, "fid_max", bl); + dout(10) << __func__ << " fid_max now " << fid_max << dendl; + } + } else { + // new fset + ++fid_last.fset; + fid_last.fno = 1; + dout(10) << __func__ << " creating " << fid_last.fset << dendl; + char s[32]; + snprintf(s, sizeof(s), "%u", fid_last.fset); + int r = ::mkdirat(frag_fd, s, 0755); + if (r < 0) { + r = -errno; + derr << __func__ << " cannot create " << path << "/fragments/" + << s << ": " << cpp_strerror(r) << dendl; + return r; + } + if (fset_fd >= 0) + VOID_TEMP_FAILURE_RETRY(::close(fset_fd)); + fset_fd = ::openat(frag_fd, s, O_DIRECTORY, 0644); + if (fset_fd < 0) { + r = -errno; + derr << __func__ << " cannot open created " << path << "/fragments/" + << s << ": " << cpp_strerror(r) << dendl; + } + + fid_max = fid_last; + fid_max.fno = g_conf->newstore_fid_prealloc; + bufferlist bl; + ::encode(fid_max, bl); + txc->t->set(PREFIX_SUPER, "fid_max", bl); + dout(10) << __func__ << " fid_max now " << fid_max << dendl; + } + *fid = fid_last; + } + + dout(10) << __func__ << " " << fid_last << dendl; + char s[32]; + snprintf(s, sizeof(s), "%u", fid->fno); + int fd = ::openat(fset_fd, s, O_RDWR|O_CREAT, 0644); + if (fd < 0) { + int r = -errno; + derr << __func__ << " cannot create " << path << "/fragments/" + << *fid << ": " << cpp_strerror(r) << dendl; + return r; + } + +#if 0 + // store a handle, too + void *hp; + size_t hlen; + int r = fd_to_handle(fd, &hp, &hlen); + if (r >= 0) { + fid->handle = string((char *)hp, hlen); + } +#endif + + dout(30) << __func__ << " " << *fid << " = " << fd << dendl; + return fd; +} + +int NewStore::_remove_fid(fid_t fid) +{ + 
char fn[32]; + snprintf(fn, sizeof(fn), "%u/%u", fid.fset, fid.fno); + int r = ::unlinkat(frag_fd, fn, 0); + if (r < 0) + return -errno; + return 0; +} + +NewStore::TransContext *NewStore::_txc_create(OpSequencer *osr) +{ + TransContext *txc = new TransContext(osr); + txc->t = db->get_transaction(); + osr->queue_new(txc); + dout(20) << __func__ << " osr " << osr << " = " << txc << dendl; + return txc; +} + +void NewStore::_txc_process_fsync(fsync_item *i) +{ + dout(20) << __func__ << " txc " << i->txc << dendl; + int r = ::fdatasync(i->fd); + if (r < 0) { + r = -errno; + derr << __func__ << " error from fdatasync on " << i->fd + << " txc " << i->txc + << ": " << cpp_strerror(r) << dendl; + assert(0 == "error from fdatasync"); + } + VOID_TEMP_FAILURE_RETRY(::close(i->fd)); + if (i->txc->finish_fsync()) { + _txc_finish_fsync(i->txc); + } + dout(20) << __func__ << " txc " << i->txc << " done" << dendl; +} + +void NewStore::_txc_finish_fsync(TransContext *txc) +{ + dout(20) << __func__ << " " << txc << dendl; + + /* + * we need to preserve the order of kv transactions, + * even though fsyncs will complete in any order. 
+ */ + + OpSequencer *osr = txc->osr; + Mutex::Locker l(osr->qlock); + txc->state = TransContext::STATE_FSYNC_DONE; + + OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc); + while (p != osr->q.begin()) { + --p; + if (p->state < TransContext::STATE_FSYNC_DONE) { + dout(20) << __func__ << " " << txc << " blocked by " << &*p << " " + << p->get_state_name() << dendl; + return; + } + if (p->state > TransContext::STATE_FSYNC_DONE) { + ++p; + break; + } + } + do { + _txc_submit_kv(&*p++); + } while (p != osr->q.end() && + p->state == TransContext::STATE_FSYNC_DONE); +} + +int NewStore::_txc_finalize(OpSequencer *osr, TransContext *txc) +{ + dout(20) << __func__ << " osr " << osr << " txc " << txc + << " onodes " << txc->onodes << dendl; + + // finalize onodes + for (set::iterator p = txc->onodes.begin(); + p != txc->onodes.end(); + ++p) { + bufferlist bl; + ::encode((*p)->onode, bl); + txc->t->set(PREFIX_OBJ, (*p)->key, bl); + + Mutex::Locker l((*p)->flush_lock); + (*p)->flush_txns.insert(txc); + } + + // journal wal items + if (txc->wal_txn) { + txc->wal_txn->seq = wal_seq.inc(); + bufferlist bl; + ::encode(*txc->wal_txn, bl); + string key; + get_wal_key(txc->wal_txn->seq, &key); + txc->t->set(PREFIX_WAL, key, bl); + } + + return 0; +} + +void NewStore::_txc_queue_fsync(TransContext *txc) +{ + dout(20) << __func__ << " txc " << txc << dendl; + txc->state = TransContext::STATE_FSYNC_QUEUED; + fsync_wq.lock(); + for (list::iterator p = txc->fds.begin(); + p != txc->fds.end(); + ++p) { + fsync_wq._enqueue(&*p); + fsync_wq._wake(); + } + fsync_wq.unlock(); +} + +void NewStore::_txc_submit_kv(TransContext *txc) +{ + dout(20) << __func__ << " txc " << txc << dendl; + txc->state = TransContext::STATE_KV_QUEUED; + + Mutex::Locker l(kv_lock); + db->submit_transaction(txc->t); + kv_queue.push_back(txc); + kv_cond.SignalOne(); +} + +struct C_ApplyWAL : public Context { + NewStore *store; + NewStore::TransContext *txc; + C_ApplyWAL(NewStore *s, NewStore::TransContext *t) : 
store(s), txc(t) {} + void finish(int r) { + store->_apply_wal_transaction(txc); + } +}; + +void NewStore::_txc_finish_kv(TransContext *txc) +{ + dout(20) << __func__ << " txc " << txc << dendl; + txc->osr->qlock.Lock(); + txc->state = TransContext::STATE_KV_DONE; + + // loop in case we race with OpSequencer::flush_commit() + do { + txc->osr->qlock.Unlock(); + if (txc->onreadable_sync) { + txc->onreadable_sync->complete(0); + txc->onreadable_sync = NULL; + } + if (txc->onreadable) { + finisher.queue(txc->onreadable); + txc->onreadable = NULL; + } + if (txc->oncommit) { + txc->oncommit->complete(0); + txc->oncommit = NULL; + } + while (!txc->oncommits.empty()) { + txc->oncommits.front()->complete(0); + txc->oncommits.pop_front(); + } + txc->osr->qlock.Lock(); + } while (txc->oncommit || !txc->oncommits.empty()); + + if (txc->wal_txn) { + dout(20) << __func__ << " starting wal apply" << dendl; + txc->state = TransContext::STATE_WAL_QUEUED; + txc->osr->qlock.Unlock(); + finisher.queue(new C_ApplyWAL(this, txc)); + } else { + txc->state = TransContext::STATE_FINISHING; + txc->osr->qlock.Unlock(); + _txc_finish_apply(txc); + } +} + +void NewStore::_txc_finish_apply(TransContext *txc) +{ + dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl; + assert(txc->state == TransContext::STATE_FINISHING); + + for (set::iterator p = txc->onodes.begin(); + p != txc->onodes.end(); + ++p) { + Mutex::Locker l((*p)->flush_lock); + dout(20) << __func__ << " onode " << *p << " had " << (*p)->flush_txns + << dendl; + assert((*p)->flush_txns.count(txc)); + (*p)->flush_txns.erase(txc); + if ((*p)->flush_txns.empty()) + (*p)->flush_cond.Signal(); + } + + // clear out refs + txc->onodes.clear(); + + while (!txc->removed_collections.empty()) { + _queue_reap_collection(txc->removed_collections.front()); + txc->removed_collections.pop_front(); + } + + OpSequencer *osr = txc->osr; + osr->qlock.Lock(); + txc->state = TransContext::STATE_DONE; + osr->qlock.Unlock(); + + 
_osr_reap_done(osr); +} + +void NewStore::_osr_reap_done(OpSequencer *osr) +{ + Mutex::Locker l(osr->qlock); + dout(20) << __func__ << " osr " << osr << dendl; + while (!osr->q.empty()) { + TransContext *txc = &osr->q.front(); + dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name() + << dendl; + if (txc->state != TransContext::STATE_DONE) { + break; + } + + if (txc->first_collection) { + txc->first_collection->onode_map.trim(g_conf->newstore_onode_map_size); + } + + osr->q.pop_front(); + delete txc; + osr->qcond.Signal(); + } +} + +void NewStore::_kv_sync_thread() +{ + dout(10) << __func__ << " start" << dendl; + kv_lock.Lock(); + while (true) { + assert(kv_committing.empty()); + if (kv_queue.empty()) { + if (kv_stop) + break; + dout(20) << __func__ << " sleep" << dendl; + kv_sync_cond.Signal(); + kv_cond.Wait(kv_lock); + dout(20) << __func__ << " wake" << dendl; + } else { + dout(20) << __func__ << " committing " << kv_queue.size() << dendl; + kv_committing.swap(kv_queue); + utime_t start = ceph_clock_now(NULL); + kv_lock.Unlock(); + db->submit_transaction_sync(db->get_transaction()); + utime_t finish = ceph_clock_now(NULL); + utime_t dur = finish - start; + dout(20) << __func__ << " committed " << kv_committing.size() + << " in " << dur << dendl; + while (!kv_committing.empty()) { + _txc_finish_kv(kv_committing.front()); + kv_committing.pop_front(); + } + + // this is as good a place as any ... 
+ _reap_collections(); + + kv_lock.Lock(); + } + } + kv_lock.Unlock(); + dout(10) << __func__ << " finish" << dendl; +} + +wal_op_t *NewStore::_get_wal_op(TransContext *txc) +{ + if (!txc->wal_txn) { + txc->wal_txn = new wal_transaction_t; + } + txc->wal_txn->ops.push_back(wal_op_t()); + return &txc->wal_txn->ops.back(); +} + +int NewStore::_apply_wal_transaction(TransContext *txc) +{ + wal_transaction_t& wt = *txc->wal_txn; + dout(20) << __func__ << " txc " << txc << " seq " << wt.seq << dendl; + txc->state = TransContext::STATE_WAL_APPLYING; + + int r = _do_wal_transaction(wt); + if (r < 0) + return r; + + string key; + get_wal_key(wt.seq, &key); + KeyValueDB::Transaction cleanup = db->get_transaction(); + cleanup->rmkey(PREFIX_WAL, key); + db->submit_transaction_sync(cleanup); + + txc->osr->qlock.Lock(); + txc->state = TransContext::STATE_FINISHING; + txc->osr->qlock.Unlock(); + + _txc_finish_apply(txc); + return 0; +} + +int NewStore::_do_wal_transaction(wal_transaction_t& wt) +{ + vector sync_fds; + sync_fds.reserve(wt.ops.size()); + + for (list::iterator p = wt.ops.begin(); p != wt.ops.end(); ++p) { + switch (p->op) { + case wal_op_t::OP_WRITE: + { + dout(20) << __func__ << " write " << p->fid << " " + << p->offset << "~" << p->length << dendl; + int fd = _open_fid(p->fid); + if (fd < 0) + return fd; + int r = ::lseek64(fd, p->offset, SEEK_SET); + if (r < 0) { + r = -errno; + derr << __func__ << " lseek64 on " << fd << " got: " + << cpp_strerror(r) << dendl; + return r; + } + p->data.write_fd(fd); + sync_fds.push_back(fd); + } + break; + case wal_op_t::OP_ZERO: + { + dout(20) << __func__ << " zero " << p->fid << " " + << p->offset << "~" << p->length << dendl; + int fd = _open_fid(p->fid); + if (fd < 0) + return fd; + int r = ::lseek64(fd, p->offset, SEEK_SET); + if (r < 0) { + r = -errno; + derr << __func__ << " lseek64 on " << fd << " got: " + << cpp_strerror(r) << dendl; + return r; + } +#warning use hole punch ioctl to zero when available + bufferlist bl; 
+ bufferptr bp(p->length); + bp.zero(); + bl.append(bp); + bl.write_fd(fd); + sync_fds.push_back(fd); + } + break; + case wal_op_t::OP_TRUNCATE: + { + dout(20) << __func__ << " truncate " << p->fid << " " + << p->offset << dendl; + int fd = _open_fid(p->fid); + if (fd < 0) + return fd; + int r = ::ftruncate(fd, p->offset); + if (r < 0) { + r = -errno; + derr << __func__ << " truncate on " << fd << " got: " + << cpp_strerror(r) << dendl; + return r; + } + //sync_fds.push_back(fd); // do we care? + } + break; + + case wal_op_t::OP_REMOVE: + dout(20) << __func__ << " remove " << p->fid << dendl; + _remove_fid(p->fid); + break; + + default: + assert(0 == "unrecognized wal op"); + } + } + + for (vector::iterator p = sync_fds.begin(); + p != sync_fds.end(); + ++p) { + int r = ::fsync(*p); + assert(r == 0); + VOID_TEMP_FAILURE_RETRY(::close(*p)); + } + + return 0; +} + +int NewStore::_replay_wal() +{ + dout(10) << __func__ << " start" << dendl; + KeyValueDB::Iterator it = db->get_iterator(PREFIX_WAL); + it->lower_bound(string()); + KeyValueDB::Transaction cleanup = db->get_transaction(); + int count = 0; + while (it->valid()) { + bufferlist bl = it->value(); + bufferlist::iterator p = bl.begin(); + wal_transaction_t wt; + try { + ::decode(wt, p); + } catch (buffer::error& e) { + derr << __func__ << " failed to decode wal txn " << it->key() << dendl; + return -EIO; + } + dout(20) << __func__ << " replay " << it->key() << dendl; + int r = _do_wal_transaction(wt); + if (r < 0) + return r; + cleanup->rmkey(PREFIX_WAL, it->key()); + ++count; + it->next(); + } + if (count) { + dout(10) << __func__ << " cleanup" << dendl; + db->submit_transaction_sync(cleanup); + } + dout(10) << __func__ << " completed " << count << " events" << dendl; + return 0; +} + +// --------------------------- +// transactions + +int NewStore::queue_transactions( + Sequencer *posr, + list& tls, + TrackedOpRef op, + ThreadPool::TPHandle *handle) +{ + Context *onreadable; + Context *ondisk; + Context 
*onreadable_sync; + ObjectStore::Transaction::collect_contexts( + tls, &onreadable, &ondisk, &onreadable_sync); + int r; + + // set up the sequencer + OpSequencer *osr; + if (!posr) + posr = &default_osr; + if (posr->p) { + osr = static_cast(posr->p.get()); + dout(5) << __func__ << " existing " << *osr << "/" << osr->parent << dendl; //<< " w/ q " << osr->q << dendl; + } else { + osr = new OpSequencer; + osr->parent = posr; + posr->p = osr; + dout(5) << __func__ << " new " << *osr << "/" << osr->parent << dendl; + } + + TransContext *txc = _txc_create(osr); + + // XXX do it sync for now; this is not crash safe + for (list::iterator p = tls.begin(); p != tls.end(); ++p) { + (*p)->set_osr(osr); + _do_transaction(*p, txc, handle); + } + + txc->onreadable = onreadable; + txc->onreadable_sync = onreadable_sync; + txc->oncommit = ondisk; + + r = _txc_finalize(osr, txc); + assert(r == 0); + + if (g_conf->newstore_sync_queue_transaction) { + // do it syncrhonously. for example, if we have a *very* fast backend. 
+ + // sync + txc->state = TransContext::STATE_FSYNC_FSYNCING; + for (list::iterator p = txc->fds.begin(); + p != txc->fds.end(); ++p) { + dout(30) << __func__ << " fsync " << p->fd << dendl; + int r = ::fsync(p->fd); + if (r < 0) { + r = -errno; + derr << __func__ << " fsync: " << cpp_strerror(r) << dendl; + return r; + } + VOID_TEMP_FAILURE_RETRY(::close(p->fd)); + } + + txc->state = TransContext::STATE_KV_COMMITTING; + db->submit_transaction_sync(txc->t); + + _txc_finish_kv(txc); + } else { + // async path + if (!txc->fds.empty()) { + _txc_queue_fsync(txc); + } else { + _txc_finish_fsync(txc); + } + } + + return 0; +} + +int NewStore::_do_transaction(Transaction *t, + TransContext *txc, + ThreadPool::TPHandle *handle) +{ + Transaction::iterator i = t->begin(); + int pos = 0; + + vector cvec(i.colls.size()); + unsigned j = 0; + for (vector::iterator p = i.colls.begin(); p != i.colls.end(); + ++p, ++j) { + cvec[j] = _get_collection(*p); + + // note first collection we reference + if (!j && !txc->first_collection) + txc->first_collection = cvec[j]; + } + + while (i.have_op()) { + Transaction::Op *op = i.decode_op(); + int r = 0; + CollectionRef &c = cvec[op->cid]; + + switch (op->op) { + case Transaction::OP_NOP: + break; + case Transaction::OP_TOUCH: + { + const ghobject_t &oid = i.get_oid(op->oid); + r = _touch(txc, c, oid); + } + break; + + case Transaction::OP_WRITE: + { + const ghobject_t &oid = i.get_oid(op->oid); + uint64_t off = op->off; + uint64_t len = op->len; + uint32_t fadvise_flags = i.get_fadvise_flags(); + bufferlist bl; + i.decode_bl(bl); + r = _write(txc, c, oid, off, len, bl, fadvise_flags); + } + break; + + case Transaction::OP_ZERO: + { + const ghobject_t &oid = i.get_oid(op->oid); + uint64_t off = op->off; + uint64_t len = op->len; + r = _zero(txc, c, oid, off, len); + } + break; + + case Transaction::OP_TRIMCACHE: + { + // deprecated, no-op + } + break; + + case Transaction::OP_TRUNCATE: + { + const ghobject_t& oid = i.get_oid(op->oid); + 
uint64_t off = op->off; + r = _truncate(txc, c, oid, off); + } + break; + + case Transaction::OP_REMOVE: + { + const ghobject_t& oid = i.get_oid(op->oid); + r = _remove(txc, c, oid); + } + break; + + case Transaction::OP_SETATTR: + { + const ghobject_t &oid = i.get_oid(op->oid); + string name = i.decode_string(); + bufferlist bl; + i.decode_bl(bl); + map to_set; + to_set[name] = bufferptr(bl.c_str(), bl.length()); + r = _setattrs(txc, c, oid, to_set); + } + break; + + case Transaction::OP_SETATTRS: + { + const ghobject_t& oid = i.get_oid(op->oid); + map aset; + i.decode_attrset(aset); + r = _setattrs(txc, c, oid, aset); + } + break; + + case Transaction::OP_RMATTR: + { + const ghobject_t &oid = i.get_oid(op->oid); + string name = i.decode_string(); + r = _rmattr(txc, c, oid, name); + } + break; + + case Transaction::OP_RMATTRS: + { + const ghobject_t &oid = i.get_oid(op->oid); + r = _rmattrs(txc, c, oid); + } + break; + + case Transaction::OP_CLONE: + { + const ghobject_t& oid = i.get_oid(op->oid); + const ghobject_t& noid = i.get_oid(op->dest_oid); + r = _clone(txc, c, oid, noid); + } + break; + + case Transaction::OP_CLONERANGE: + assert(0 == "deprecated"); + break; + + case Transaction::OP_CLONERANGE2: + { + const ghobject_t &oid = i.get_oid(op->oid); + const ghobject_t &noid = i.get_oid(op->dest_oid); + uint64_t srcoff = op->off; + uint64_t len = op->len; + uint64_t dstoff = op->dest_off; + r = _clone_range(txc, c, oid, noid, srcoff, len, dstoff); + } + break; + + case Transaction::OP_MKCOLL: + { + assert(!c); + coll_t cid = i.get_cid(op->cid); + r = _create_collection(txc, cid, op->split_bits, &c); + } + break; + + case Transaction::OP_COLL_HINT: + { + coll_t cid = i.get_cid(op->cid); + uint32_t type = op->hint_type; + bufferlist hint; + i.decode_bl(hint); + bufferlist::iterator hiter = hint.begin(); + if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) { + uint32_t pg_num; + uint64_t num_objs; + ::decode(pg_num, hiter); + ::decode(num_objs, hiter); + 
dout(10) << __func__ << " collection hint objects is a no-op, " + << " pg_num " << pg_num << " num_objects " << num_objs + << dendl; + } else { + // Ignore the hint + dout(10) << __func__ << " unknown collection hint " << type << dendl; + } + } + break; + + case Transaction::OP_RMCOLL: + { + coll_t cid = i.get_cid(op->cid); + r = _remove_collection(txc, cid, &c); + } + break; + + case Transaction::OP_COLL_ADD: + assert(0 == "not implmeented"); + break; + + case Transaction::OP_COLL_REMOVE: + assert(0 == "not implmeented"); + break; + + case Transaction::OP_COLL_MOVE: + assert(0 == "deprecated"); + break; + + case Transaction::OP_COLL_MOVE_RENAME: + { + assert(op->cid == op->dest_cid); + ghobject_t oldoid = i.get_oid(op->oid); + ghobject_t newoid = i.get_oid(op->dest_oid); + r = _rename(txc, c, oldoid, newoid); + } + break; + + case Transaction::OP_COLL_SETATTR: + r = -EOPNOTSUPP; + break; + + case Transaction::OP_COLL_RMATTR: + r = -EOPNOTSUPP; + break; + + case Transaction::OP_COLL_RENAME: + assert(0 == "not implmeneted"); + break; + + case Transaction::OP_OMAP_CLEAR: + { + ghobject_t oid = i.get_oid(op->oid); + r = _omap_clear(txc, c, oid); + } + break; + case Transaction::OP_OMAP_SETKEYS: + { + ghobject_t oid = i.get_oid(op->oid); + map aset; + i.decode_attrset(aset); + r = _omap_setkeys(txc, c, oid, aset); + } + break; + case Transaction::OP_OMAP_RMKEYS: + { + ghobject_t oid = i.get_oid(op->oid); + set keys; + i.decode_keyset(keys); + r = _omap_rmkeys(txc, c, oid, keys); + } + break; + case Transaction::OP_OMAP_RMKEYRANGE: + { + ghobject_t oid = i.get_oid(op->oid); + string first, last; + first = i.decode_string(); + last = i.decode_string(); + r = _omap_rmkey_range(txc, c, oid, first, last); + } + break; + case Transaction::OP_OMAP_SETHEADER: + { + ghobject_t oid = i.get_oid(op->oid); + bufferlist bl; + i.decode_bl(bl); + r = _omap_setheader(txc, c, oid, bl); + } + break; + case Transaction::OP_SPLIT_COLLECTION: + assert(0 == "deprecated"); + break; + case 
Transaction::OP_SPLIT_COLLECTION2: + { + uint32_t bits = op->split_bits; + uint32_t rem = op->split_rem; + r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem); + } + break; + + case Transaction::OP_SETALLOCHINT: + { + ghobject_t oid = i.get_oid(op->oid); + uint64_t expected_object_size = op->expected_object_size; + uint64_t expected_write_size = op->expected_write_size; + r = _setallochint(txc, c, oid, + expected_object_size, + expected_write_size); + } + break; + + default: + derr << "bad op " << op->op << dendl; + assert(0); + } + + if (r < 0) { + bool ok = false; + + if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2 || + op->op == Transaction::OP_COLL_ADD)) + // -ENOENT is usually okay + ok = true; + if (r == -ENODATA) + ok = true; + + if (!ok) { + const char *msg = "unexpected error code"; + + if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2)) + msg = "ENOENT on clone suggests osd bug"; + + if (r == -ENOSPC) + // For now, if we hit _any_ ENOSPC, crash, before we do any damage + // by partially applying transactions. 
+ msg = "ENOSPC handling not implemented"; + + if (r == -ENOTEMPTY) { + msg = "ENOTEMPTY suggests garbage data in osd data dir"; + } + + dout(0) << " error " << cpp_strerror(r) << " not handled on operation " << op->op + << " (op " << pos << ", counting from 0)" << dendl; + dout(0) << msg << dendl; + dout(0) << " transaction dump:\n"; + JSONFormatter f(true); + f.open_object_section("transaction"); + t->dump(&f); + f.close_section(); + f.flush(*_dout); + *_dout << dendl; + assert(0 == "unexpected error"); + } + } + + ++pos; + } + + return 0; +} + + + +// ----------------- +// write operations + +int NewStore::_touch(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid) +{ + dout(15) << __func__ << " " << c->cid << " " << oid << dendl; + int r = 0; + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, true); + assert(o); + o->exists = true; + txc->write_onode(o); + dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl; + return r; +} + +int NewStore::_do_write(TransContext *txc, + OnodeRef o, + uint64_t offset, uint64_t length, + const bufferlist& bl, + uint32_t fadvise_flags) +{ + int fd = -1; + int r = 0; + + dout(20) << __func__ << " have " << o->onode.size + << " bytes in " << o->onode.data_map.size() + << " fragments" << dendl; + + o->exists = true; + + if (length == 0) { + dout(20) << __func__ << " zero-length write" << dendl; + goto out; + } + if (o->onode.size == offset || + o->onode.size == 0 || + o->onode.data_map.empty()) { + if (o->onode.data_map.empty()) { + // create + fragment_t &f = o->onode.data_map[0]; + f.offset = 0; + f.length = MAX(offset + length, o->onode.size); + fd = _create_fid(txc, &f.fid); + if (fd < 0) { + r = fd; + goto out; + } + ::lseek64(fd, offset, SEEK_SET); + dout(20) << __func__ << " create " << f.fid << " writing " + << offset << "~" << length << dendl; + } else { + // append (possibly with gap) + assert(o->onode.data_map.size() == 1); + fragment_t &f = o->onode.data_map.rbegin()->second; 
+ fd = _open_fid(f.fid); + if (fd < 0) { + r = fd; + goto out; + } + ::ftruncate(fd, f.length); // in case there is trailing crap + f.length = (offset + length) - f.offset; + ::lseek64(fd, offset - f.offset, SEEK_SET); + dout(20) << __func__ << " append " << f.fid << " writing " + << (offset - f.offset) << "~" << length << dendl; + } + if (offset + length > o->onode.size) { + o->onode.size = offset + length; + } + r = bl.write_fd(fd); + if (r < 0) { + derr << __func__ << " bl.write_fd error: " << cpp_strerror(r) << dendl; + goto out; + } + txc->sync_fd(fd); + } else { + // WAL + assert(o->onode.data_map.size() == 1); + fragment_t& f = o->onode.data_map.begin()->second; + assert(f.offset == 0); + assert(f.length == o->onode.size); + r = _clean_fid_tail(txc, f); + if (r < 0) + goto out; + wal_op_t *op = _get_wal_op(txc); + op->op = wal_op_t::OP_WRITE; + op->offset = offset - f.offset; + op->length = length; + op->fid = f.fid; + op->data = bl; + if (offset + length > o->onode.size) { + o->onode.size = offset + length; + } + if (offset + length - f.offset > f.length) { + f.length = offset + length - f.offset; + } + dout(20) << __func__ << " wal " << f.fid << " write " + << (offset - f.offset) << "~" << length << dendl; + } + r = 0; + + out: + return r; +} + +int NewStore::_clean_fid_tail(TransContext *txc, const fragment_t& f) +{ + int fd = _open_fid(f.fid); + if (fd < 0) { + return fd; + } + struct stat st; + int r = ::fstat(fd, &st); + if (r < 0) { + r = -errno; + derr << __func__ << " failed to fstat " << f.fid << ": " + << cpp_strerror(r) << dendl; + return r; + } + if (st.st_size > f.length) { + dout(20) << __func__ << " frag " << f.fid << " is long, truncating" + << dendl; + r = ::ftruncate(fd, f.length); + if (r < 0) { + derr << __func__ << " failed to ftruncate " << f.fid << ": " + << cpp_strerror(r) << dendl; + return r; + } + txc->sync_fd(fd); + } else { + // all good! 
+ VOID_TEMP_FAILURE_RETRY(::close(fd)); + } + return 0; +} + + +int NewStore::_write(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + uint64_t offset, size_t length, + const bufferlist& bl, + uint32_t fadvise_flags) +{ + dout(15) << __func__ << " " << c->cid << " " << oid + << " " << offset << "~" << length + << dendl; + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, true); + int r = _do_write(txc, o, offset, length, bl, fadvise_flags); + txc->write_onode(o); + + dout(10) << __func__ << " " << c->cid << " " << oid + << " " << offset << "~" << length + << " = " << r << dendl; + return r; +} + +int NewStore::_zero(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + uint64_t offset, size_t length) +{ + dout(15) << __func__ << " " << c->cid << " " << oid + << " " << offset << "~" << length + << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, true); + + if (o->onode.data_map.empty()) { + // we're already a big hole + if (offset + length > o->onode.size) { + o->onode.size = offset + length; + txc->write_onode(o); + } + } else { + assert(o->onode.data_map.size() == 1); + fragment_t& f = o->onode.data_map.begin()->second; + assert(f.offset == 0); + assert(f.length == o->onode.size); + + r = _clean_fid_tail(txc, f); + if (r < 0) + goto out; + + if (offset >= o->onode.size) { + // after tail + int fd = _open_fid(f.fid); + if (fd < 0) { + r = fd; + goto out; + } + f.length = (offset + length) - f.offset; + ::ftruncate(fd, f.length); + dout(20) << __func__ << " tail " << f.fid << " truncating up to " + << f.length << dendl; + o->onode.size = offset + length; + txc->write_onode(o); + } else { + // WAL + wal_op_t *op = _get_wal_op(txc); + op->op = wal_op_t::OP_ZERO; + op->offset = offset - f.offset; + op->length = length; + op->fid = f.fid; + if (offset + length > o->onode.size) { + f.length = offset + length - f.offset; + o->onode.size = offset + length; + txc->write_onode(o); + } + } + } + + out: 
+ dout(10) << __func__ << " " << c->cid << " " << oid + << " " << offset << "~" << length + << " = " << r << dendl; + return r; +} + +int NewStore::_do_truncate(TransContext *txc, OnodeRef o, uint64_t offset) +{ + if (o->onode.data_map.empty()) { + o->onode.size = offset; + } else if (offset == 0) { + while (!o->onode.data_map.empty()) { + wal_op_t *op = _get_wal_op(txc); + op->op = wal_op_t::OP_REMOVE; + op->fid = o->onode.data_map.rbegin()->second.fid; + o->onode.data_map.erase(o->onode.data_map.rbegin()->first); + } + } else if (offset < o->onode.size) { + assert(o->onode.data_map.size() == 1); + fragment_t& f = o->onode.data_map.begin()->second; + assert(f.offset == 0); + assert(f.length == o->onode.size); + f.length = offset; + wal_op_t *op = _get_wal_op(txc); + op->op = wal_op_t::OP_TRUNCATE; + op->offset = offset; + op->fid = f.fid; + assert(f.offset == 0); + } else if (offset > o->onode.size) { + // resize file up. make sure we don't have trailing bytes + assert(o->onode.data_map.size() == 1); + fragment_t& f = o->onode.data_map.begin()->second; + assert(f.offset == 0); + assert(f.length == o->onode.size); + int r = _clean_fid_tail(txc, f); + if (r < 0) + return r; + if (false) { // hmm don't bother!! + // truncate up. don't bother to fsync since it's all zeros. 
+ int fd = _open_fid(f.fid); + if (fd < 0) { + return fd; + } + r = ::ftruncate(fd, offset); + if (r < 0) { + r = -errno; + derr << "error from ftruncate on " << f.fid << " to " << offset << ": " + << cpp_strerror(r) << dendl; + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return r; + } + VOID_TEMP_FAILURE_RETRY(::close(fd)); + } + f.length = offset; + } + o->onode.size = offset; + txc->write_onode(o); + return 0; +} + +int NewStore::_truncate(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + uint64_t offset) +{ + dout(15) << __func__ << " " << c->cid << " " << oid + << " " << offset + << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, true); + if (!o->exists) { + r = -ENOENT; + goto out; + } + r = _do_truncate(txc, o, offset); + + out: + dout(10) << __func__ << " " << c->cid << " " << oid + << " " << offset + << " = " << r << dendl; + return r; +} + +int NewStore::_do_remove(TransContext *txc, + OnodeRef o) +{ + string key; + o->exists = false; + if (!o->onode.data_map.empty()) { + for (map::iterator p = o->onode.data_map.begin(); + p != o->onode.data_map.end(); + ++p) { + dout(20) << __func__ << " will wal remove " << p->second.fid << dendl; + wal_op_t *op = _get_wal_op(txc); + op->op = wal_op_t::OP_REMOVE; + op->fid = p->second.fid; + } + } + o->onode.data_map.clear(); + o->onode.size = 0; + if (o->onode.omap_head) { + _do_omap_clear(txc, o->onode.omap_head, true); + } + + get_object_key(o->oid, &key); + txc->t->rmkey(PREFIX_OBJ, key); + return 0; +} + +int NewStore::_remove(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid) +{ + dout(15) << __func__ << " " << c->cid << " " << oid << dendl; + int r; + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, true); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + r = _do_remove(txc, o); + + out: + dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl; + return r; +} + +int NewStore::_setattr(TransContext *txc, + 
CollectionRef& c, + const ghobject_t& oid, + const string& name, + bufferptr& val) +{ + dout(15) << __func__ << " " << c->cid << " " << oid + << " " << name << " (" << val.length() << " bytes)" + << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + o->onode.attrs[name] = val; + txc->write_onode(o); + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oid + << " " << name << " (" << val.length() << " bytes)" + << " = " << r << dendl; + return r; +} + +int NewStore::_setattrs(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + const map& aset) +{ + dout(15) << __func__ << " " << c->cid << " " << oid + << " " << aset.size() << " keys" + << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + for (map::const_iterator p = aset.begin(); + p != aset.end(); ++p) + o->onode.attrs[p->first] = p->second; + txc->write_onode(o); + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oid + << " " << aset.size() << " keys" + << " = " << r << dendl; + return r; +} + + +int NewStore::_rmattr(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + const string& name) +{ + dout(15) << __func__ << " " << c->cid << " " << oid + << " " << name << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + o->onode.attrs.erase(name); + txc->write_onode(o); + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oid + << " " << name << " = " << r << dendl; + return r; +} + +int NewStore::_rmattrs(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid) +{ + dout(15) << __func__ << " " << c->cid << " " << oid << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r 
= -ENOENT; + goto out; + } + o->onode.attrs.clear(); + txc->write_onode(o); + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl; + return r; +} + +void NewStore::_do_omap_clear(TransContext *txc, uint64_t id, + bool remove_tail) +{ + KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); + string prefix, tail; + get_omap_header(id, &prefix); + get_omap_tail(id, &tail); + it->lower_bound(prefix); + while (it->valid()) { + if (it->key() == tail) { + if (remove_tail) { + txc->t->rmkey(PREFIX_OMAP, it->key()); + } + dout(30) << __func__ << " stop at " << tail << dendl; + break; + } + txc->t->rmkey(PREFIX_OMAP, it->key()); + dout(30) << __func__ << " rm " << it->key() << dendl; + it->next(); + } +} + +int NewStore::_omap_clear(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid) +{ + dout(15) << __func__ << " " << c->cid << " " << oid << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (o->onode.omap_head != 0) { + _do_omap_clear(txc, o->onode.omap_head, false); + } + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl; + return r; +} + +int NewStore::_omap_setkeys(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + const map& m) +{ + dout(15) << __func__ << " " << c->cid << " " << oid << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + _get_omap_id(txc, o); + for (map::const_iterator p = m.begin(); p != m.end(); ++p) { + string key; + get_omap_key(o->onode.omap_head, p->first, &key); + dout(30) << __func__ << " " << key << " <- " << p->first << dendl; + txc->t->set(PREFIX_OMAP, key, p->second); + } + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl; + return r; +} + +int NewStore::_omap_setheader(TransContext 
*txc, + CollectionRef& c, + const ghobject_t& oid, + bufferlist& bl) +{ + dout(15) << __func__ << " " << c->cid << " " << oid << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + string key; + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + _get_omap_id(txc, o); + get_omap_header(o->onode.omap_head, &key); + txc->t->set(PREFIX_OMAP, key, bl); + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl; + return r; +} + +int NewStore::_omap_rmkeys(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + const set& m) +{ + dout(15) << __func__ << " " << c->cid << " " << oid << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.omap_head) { + r = 0; + goto out; + } + _get_omap_id(txc, o); + for (set::const_iterator p = m.begin(); p != m.end(); ++p) { + string key; + get_omap_key(o->onode.omap_head, *p, &key); + dout(30) << __func__ << " rm " << key << " <- " << *p << dendl; + txc->t->rmkey(PREFIX_OMAP, key); + } + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl; + return r; +} + +int NewStore::_omap_rmkey_range(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + const string& first, const string& last) +{ + dout(15) << __func__ << " " << c->cid << " " << oid << dendl; + int r = 0; + KeyValueDB::Iterator it; + string key_first, key_last; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.omap_head) { + r = 0; + goto out; + } + it = db->get_iterator(PREFIX_OMAP); + get_omap_key(o->onode.omap_head, first, &key_first); + get_omap_key(o->onode.omap_head, last, &key_last); + it->lower_bound(key_first); + while (it->valid()) { + if (it->key() >= key_last) { + dout(30) << __func__ << " stop at " << key_last << 
dendl; + break; + } + txc->t->rmkey(PREFIX_OMAP, it->key()); + dout(30) << __func__ << " rm " << it->key() << dendl; + it->next(); + } + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl; + return r; +} + +int NewStore::_setallochint(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + uint64_t expected_object_size, + uint64_t expected_write_size) +{ + dout(15) << __func__ << " " << c->cid << " " << oid + << " object_size " << expected_object_size + << " write_size " << expected_write_size + << dendl; + int r = 0; + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + + o->onode.expected_object_size = expected_object_size; + o->onode.expected_write_size = expected_write_size; + txc->write_onode(o); + + out: + dout(10) << __func__ << " " << c->cid << " " << oid + << " object_size " << expected_object_size + << " write_size " << expected_write_size + << " = " << r << dendl; + return r; +} + +int NewStore::_clone(TransContext *txc, + CollectionRef& c, + const ghobject_t& old_oid, + const ghobject_t& new_oid) +{ + dout(15) << __func__ << " " << c->cid << " " << old_oid << " -> " + << new_oid << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + bufferlist bl; + OnodeRef newo; + OnodeRef oldo = c->get_onode(old_oid, false); + if (!oldo || !oldo->exists) { + r = -ENOENT; + goto out; + } + newo = c->get_onode(new_oid, true); + assert(newo); + newo->exists = true; + + r = _do_read(oldo, 0, oldo->onode.size, bl, 0); + if (r < 0) + goto out; + + // truncate any old data + while (!newo->onode.data_map.empty()) { + wal_op_t *op = _get_wal_op(txc); + op->op = wal_op_t::OP_REMOVE; + op->fid = newo->onode.data_map.rbegin()->second.fid; + newo->onode.data_map.erase(newo->onode.data_map.rbegin()->first); + } + + r = _do_write(txc, newo, 0, oldo->onode.size, bl, 0); + + newo->onode.attrs = oldo->onode.attrs; + // fixme: omap + + 
txc->write_onode(newo); + + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> " + << new_oid << " = " << r << dendl; + return r; +} + +int NewStore::_clone_range(TransContext *txc, + CollectionRef& c, + const ghobject_t& old_oid, + const ghobject_t& new_oid, + uint64_t srcoff, uint64_t length, uint64_t dstoff) +{ + dout(15) << __func__ << " " << c->cid << " " << old_oid << " -> " + << new_oid << " from " << srcoff << "~" << length + << " to offset " << dstoff << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + bufferlist bl; + OnodeRef newo; + OnodeRef oldo = c->get_onode(old_oid, false); + if (!oldo || !oldo->exists) { + r = -ENOENT; + goto out; + } + newo = c->get_onode(new_oid, true); + assert(newo); + newo->exists = true; + + r = _do_read(oldo, srcoff, length, bl, 0); + if (r < 0) + goto out; + + r = _do_write(txc, newo, dstoff, bl.length(), bl, 0); + + txc->write_onode(newo); + + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> " + << new_oid << " from " << srcoff << "~" << length + << " to offset " << dstoff + << " = " << r << dendl; + return r; +} + +int NewStore::_rename(TransContext *txc, + CollectionRef& c, + const ghobject_t& old_oid, + const ghobject_t& new_oid) +{ + dout(15) << __func__ << " " << c->cid << " " << old_oid << " -> " + << new_oid << dendl; + int r; + + RWLock::WLocker l(c->lock); + bufferlist bl; + string old_key, new_key; + OnodeRef newo; + OnodeRef oldo = c->get_onode(old_oid, false); + if (!oldo || !oldo->exists) { + r = -ENOENT; + goto out; + } + newo = c->get_onode(new_oid, true); + assert(newo); + + if (newo->exists) { + r = _do_remove(txc, newo); + if (r < 0) + return r; + } + + get_object_key(old_oid, &old_key); + get_object_key(new_oid, &new_key); + + c->onode_map.rename(old_oid, new_oid); + oldo->oid = new_oid; + oldo->key = new_key; + + txc->t->rmkey(PREFIX_OBJ, old_key); + txc->write_onode(oldo); + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " 
" << old_oid << " -> " + << new_oid << " = " << r << dendl; + return r; +} + +// collections + +int NewStore::_create_collection( + TransContext *txc, + coll_t cid, + unsigned bits, + CollectionRef *c) +{ + dout(15) << __func__ << " " << cid << " bits " << bits << dendl; + int r; + bufferlist bl; + + { + RWLock::WLocker l(coll_lock); + if (*c) { + r = -EEXIST; + goto out; + } + c->reset(new Collection(this, cid)); + (*c)->cnode.bits = bits; + coll_map[cid] = *c; + } + ::encode((*c)->cnode, bl); + txc->t->set(PREFIX_COLL, stringify(cid), bl); + r = 0; + + out: + dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl; + return r; +} + +int NewStore::_remove_collection(TransContext *txc, coll_t cid, + CollectionRef *c) +{ + dout(15) << __func__ << " " << cid << dendl; + int r; + bufferlist empty; + + { + RWLock::WLocker l(coll_lock); + if (!*c) { + r = -ENOENT; + goto out; + } + pair next; + while ((*c)->onode_map.get_next(next.first, &next)) { + if (next.second->exists) { + r = -ENOTEMPTY; + goto out; + } + } + coll_map.erase(cid); + txc->removed_collections.push_back(*c); + c->reset(); + } + txc->t->rmkey(PREFIX_COLL, stringify(cid)); + r = 0; + + out: + dout(10) << __func__ << " " << cid << " = " << r << dendl; + return r; +} + +int NewStore::_split_collection(TransContext *txc, + CollectionRef& c, + CollectionRef& d, + unsigned bits, int rem) +{ + dout(15) << __func__ << " " << c->cid << " to " << d->cid << " " + << " bits " << bits << dendl; + int r; + RWLock::WLocker l(c->lock); + RWLock::WLocker l2(d->lock); + c->onode_map.clear(); + d->onode_map.clear(); + c->cnode.bits = bits; + assert(d->cnode.bits == bits); + r = 0; + + dout(10) << __func__ << " " << c->cid << " to " << d->cid << " " + << " bits " << bits << " = " << r << dendl; + return r; +} + +// =========================================== diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h new file mode 100644 index 0000000000000..88694b51cba69 --- /dev/null +++ 
b/src/os/newstore/NewStore.h @@ -0,0 +1,720 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_OSD_NEWSTORE_H +#define CEPH_OSD_NEWSTORE_H + +#include + +#include "include/assert.h" +#include "include/unordered_map.h" +#include "include/memory.h" +#include "common/Finisher.h" +#include "common/RWLock.h" +#include "common/WorkQueue.h" +#include "os/ObjectStore.h" +#include "os/KeyValueDB.h" + +#include "newstore_types.h" + +#include "boost/intrusive/list.hpp" + +class NewStore : public ObjectStore { + // ----------------------------------------------------- + // types +public: + + struct FragmentHandle { + int fd; + FragmentHandle() : fd(-1) {} + FragmentHandle(int f) : fd(f) {} + ~FragmentHandle() { + if (fd >= 0) + ::close(fd); + } + int fsync() { + return ::fsync(fd); + } + int fdatasync() { + return ::fdatasync(fd); + } + }; + typedef ceph::shared_ptr FragmentHandleRef; + + class TransContext; + + /// an in-memory object + struct Onode { + atomic_t nref; ///< reference count + + ghobject_t oid; + string key; ///< key under PREFIX_OBJ where we are stored + boost::intrusive::list_member_hook<> lru_item; + + onode_t onode; ///< metadata stored as value in kv store + bool dirty; // ??? 
+ bool exists; + + Mutex flush_lock; ///< protect unappliex_txns, num_fsyncs + Cond flush_cond; ///< wait here for unapplied txns, fsyncs + set flush_txns; ///< fsyncing or committing or wal txns + + Onode(const ghobject_t& o, const string& k); + + void flush() { + Mutex::Locker l(flush_lock); + while (!flush_txns.empty()) + flush_cond.Wait(flush_lock); + } + void get() { + nref.inc(); + } + void put() { + if (nref.dec() == 0) + delete this; + } + }; + typedef boost::intrusive_ptr OnodeRef; + + struct OnodeHashLRU { + typedef boost::intrusive::list< + Onode, + boost::intrusive::member_hook< + Onode, + boost::intrusive::list_member_hook<>, + &Onode::lru_item> > lru_list_t; + + Mutex lock; + ceph::unordered_map onode_map; ///< forward lookups + lru_list_t lru; ///< lru + + OnodeHashLRU() : lock("NewStore::OnodeHashLRU::lock") {} + + void add(const ghobject_t& oid, OnodeRef o); + void _touch(OnodeRef o); + OnodeRef lookup(const ghobject_t& o); + void remove(const ghobject_t& o); + void rename(const ghobject_t& old_oid, const ghobject_t& new_oid); + void clear(); + bool get_next(const ghobject_t& after, pair *next); + int trim(int max=-1); + }; + + struct Collection { + NewStore *store; + coll_t cid; + cnode_t cnode; + RWLock lock; + + // cache onodes on a per-collection basis to avoid lock + // contention. 
+ OnodeHashLRU onode_map; + + OnodeRef get_onode(const ghobject_t& oid, bool create); + + Collection(NewStore *ns, coll_t c); + }; + typedef ceph::shared_ptr CollectionRef; + + class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl { + CollectionRef c; + OnodeRef o; + KeyValueDB::Iterator it; + string head, tail; + public: + OmapIteratorImpl(CollectionRef c, OnodeRef o); + int seek_to_first(); + int upper_bound(const string &after); + int lower_bound(const string &to); + bool valid(); + int next(); + string key(); + bufferlist value(); + int status() { + return 0; + } + }; + + class OpSequencer; + + struct fsync_item { + boost::intrusive::list_member_hook<> queue_item; + int fd; + TransContext *txc; + fsync_item(int f, TransContext *t) : fd(f), txc(t) {} + }; + + struct TransContext { + typedef enum { + STATE_PREPARE, + STATE_FSYNC_QUEUED, + STATE_FSYNC_FSYNCING, + STATE_FSYNC_DONE, + STATE_KV_QUEUED, + STATE_KV_COMMITTING, + STATE_KV_DONE, + STATE_WAL_QUEUED, + STATE_WAL_APPLYING, + STATE_WAL_DONE, + STATE_FINISHING, + STATE_DONE, + } state_t; + + state_t state; + + const char *get_state_name() { + switch (state) { + case STATE_PREPARE: return "prepare"; + case STATE_FSYNC_QUEUED: return "fsync_queued"; + case STATE_FSYNC_FSYNCING: return "fsync_fsyncing"; + case STATE_FSYNC_DONE: return "fsync_done"; + case STATE_KV_QUEUED: return "kv_queued"; + case STATE_KV_COMMITTING: return "kv_committing"; + case STATE_KV_DONE: return "kv_done"; + case STATE_WAL_QUEUED: return "wal_queued"; + case STATE_WAL_APPLYING: return "wal_applying"; + case STATE_WAL_DONE: return "wal_done"; + case STATE_FINISHING: return "finishing"; + case STATE_DONE: return "done"; + } + return "???"; + } + + OpSequencer *osr; + boost::intrusive::list_member_hook<> sequencer_item; + + list fds; ///< these fds need to be synced + set onodes; ///< these onodes need to be updated/written + KeyValueDB::Transaction t; ///< then we will commit this + Context *oncommit; ///< signal on commit + 
Context *onreadable; ///< signal on readable + Context *onreadable_sync; ///< signal on readable + list oncommits; ///< more commit completions + list removed_collections; ///< colls we removed + + wal_transaction_t *wal_txn; ///< wal transaction (if any) + unsigned num_fsyncs_completed; + + Mutex lock; + Cond cond; + + CollectionRef first_collection; ///< first referenced collection + + TransContext(OpSequencer *o) + : state(STATE_PREPARE), + osr(o), + oncommit(NULL), + onreadable(NULL), + onreadable_sync(NULL), + wal_txn(NULL), + num_fsyncs_completed(0), + lock("NewStore::TransContext::lock") { + //cout << "txc new " << this << std::endl; + } + ~TransContext() { + delete wal_txn; + //cout << "txc del " << this << std::endl; + } + + void sync_fd(int f) { + fds.push_back(fsync_item(f, this)); + } + void write_onode(OnodeRef &o) { + onodes.insert(o); + } + + bool finish_fsync() { + Mutex::Locker l(lock); + ++num_fsyncs_completed; + if (num_fsyncs_completed == fds.size()) { + cond.Signal(); + return true; + } + return false; + } + void wait_fsync() { + Mutex::Locker l(lock); + while (num_fsyncs_completed < fds.size()) + cond.Wait(lock); + } + }; + + + class OpSequencer : public Sequencer_impl { + public: + Mutex qlock; + Cond qcond; + typedef boost::intrusive::list< + TransContext, + boost::intrusive::member_hook< + TransContext, + boost::intrusive::list_member_hook<>, + &TransContext::sequencer_item> > q_list_t; + q_list_t q; ///< transactions + + Sequencer *parent; + + OpSequencer() + : qlock("NewStore::OpSequencer::qlock", false, false), + parent(NULL) { + } + ~OpSequencer() { + assert(q.empty()); + } + + void queue_new(TransContext *txc) { + Mutex::Locker l(qlock); + q.push_back(*txc); + } + + void flush() { + Mutex::Locker l(qlock); + while (!q.empty()) + qcond.Wait(qlock); + } + + bool flush_commit(Context *c) { + Mutex::Locker l(qlock); + if (q.empty()) { + return true; + } + TransContext *txc = &q.back(); + if (txc->state > TransContext::STATE_KV_DONE) { + 
return true; + } + assert(txc->state <= TransContext::STATE_KV_DONE); + txc->oncommits.push_back(c); + return false; + } + }; + + class FsyncWQ : public ThreadPool::WorkQueue { + public: + typedef boost::intrusive::list< + fsync_item, + boost::intrusive::member_hook< + fsync_item, + boost::intrusive::list_member_hook<>, + &fsync_item::queue_item> > fsync_queue_t; + private: + NewStore *store; + fsync_queue_t fd_queue; + + public: + FsyncWQ(NewStore *s, time_t ti, time_t sti, ThreadPool *tp) + : ThreadPool::WorkQueue("NewStore::FsyncWQ", ti, sti, tp), + store(s) { + } + bool _empty() { + return fd_queue.empty(); + } + bool _enqueue(fsync_item *i) { + fd_queue.push_back(*i); + return true; + } + void _dequeue(fsync_item *p) { + assert(0 == "not needed, not implemented"); + } + fsync_item *_dequeue() { + if (fd_queue.empty()) + return NULL; + fsync_item *i = &fd_queue.front(); + fd_queue.pop_front(); + return i; + } + void _process(fsync_item *i, ThreadPool::TPHandle &handle) { + store->_txc_process_fsync(i); + } + void _clear() { + fd_queue.clear(); + } + + void flush() { + lock(); + while (!fd_queue.empty()) + _wait(); + unlock(); + drain(); + } + }; + + struct KVSyncThread : public Thread { + NewStore *store; + KVSyncThread(NewStore *s) : store(s) {} + void *entry() { + store->_kv_sync_thread(); + return NULL; + } + }; + + // -------------------------------------------------------- + // members +private: + CephContext *cct; + KeyValueDB *db; + uuid_d fsid; + int path_fd; ///< open handle to $path + int fsid_fd; ///< open handle (locked) to $path/fsid + int frag_fd; ///< open handle to $path/fragments + int fset_fd; ///< open handle to $path/fragments/$cur_fid.fset + bool mounted; + + RWLock coll_lock; ///< rwlock to protect coll_map + ceph::unordered_map coll_map; + + Mutex fid_lock; + fid_t fid_last; ///< last allocated fid + fid_t fid_max; ///< max fid we can allocate before reserving more + + atomic64_t omap_id; + + Mutex wal_lock; + atomic64_t wal_seq; + + 
Finisher finisher; + ThreadPool fsync_tp; + FsyncWQ fsync_wq; + + KVSyncThread kv_sync_thread; + Mutex kv_lock; + Cond kv_cond, kv_sync_cond; + bool kv_stop; + deque kv_queue, kv_committing; + + Logger *logger; + + Sequencer default_osr; + + Mutex reap_lock; + Cond reap_cond; + list removed_collections; + + + // -------------------------------------------------------- + // private methods + + void _init_logger(); + void _shutdown_logger(); + + int _open_path(); + void _close_path(); + int _open_fsid(bool create); + int _lock_fsid(); + int _read_fsid(uuid_d *f); + int _write_fsid(); + void _close_fsid(); + int _open_frag(); + int _create_frag(); + void _close_frag(); + int _open_db(); + void _close_db(); + int _open_collections(); + void _close_collections(); + + CollectionRef _get_collection(coll_t cid); + void _queue_reap_collection(CollectionRef& c); + void _reap_collections(); + + int _recover_next_fid(); + int _create_fid(TransContext *txc, fid_t *fid); + int _open_fid(fid_t fid); + int _remove_fid(fid_t fid); + + int _recover_next_omap_id(); + void _get_omap_id(TransContext *txc, OnodeRef o); + + int _clean_fid_tail(TransContext *txc, const fragment_t& f); + + TransContext *_txc_create(OpSequencer *osr); + int _txc_finalize(OpSequencer *osr, TransContext *txc); + void _txc_queue_fsync(TransContext *txc); + void _txc_process_fsync(fsync_item *i); + void _txc_finish_fsync(TransContext *txc); + void _txc_submit_kv(TransContext *txc); + void _txc_finish_kv(TransContext *txc); + void _txc_finish_apply(TransContext *txc); + + void _osr_reap_done(OpSequencer *osr); + + void _kv_sync_thread(); + void _kv_stop() { + { + Mutex::Locker l(kv_lock); + kv_stop = true; + kv_cond.Signal(); + } + kv_sync_thread.join(); + kv_stop = false; + } + + wal_op_t *_get_wal_op(TransContext *txc); + int _apply_wal_transaction(TransContext *txc); + int _do_wal_transaction(wal_transaction_t& wt); + void _wait_object_wal(OnodeRef onode); + int _replay_wal(); + friend class C_ApplyWAL; + 
+public: + NewStore(CephContext *cct, const string& path); + ~NewStore(); + + bool needs_journal() { return false; }; + bool wants_journal() { return false; }; + bool allows_journal() { return false; }; + + int peek_journal_fsid(uuid_d *fsid); + + bool test_mount_in_use(); + + int mount(); + int umount(); + + void sync(Context *onsync); + void sync(); + void flush(); + void sync_and_flush(); + + unsigned get_max_object_name_length() { + return 4096; + } + unsigned get_max_attr_name_length() { + return 256; // arbitrary; there is no real limit internally + } + + int mkfs(); + int mkjournal() { + return 0; + } + +private: + bool sharded; +public: + void set_allow_sharded_objects() { + sharded = true; + } + bool get_allow_sharded_objects() { + return sharded; + } + + int statfs(struct statfs *buf); + + bool exists(coll_t cid, const ghobject_t& oid); + int stat( + coll_t cid, + const ghobject_t& oid, + struct stat *st, + bool allow_eio = false); // struct stat? + int read( + coll_t cid, + const ghobject_t& oid, + uint64_t offset, + size_t len, + bufferlist& bl, + uint32_t op_flags = 0, + bool allow_eio = false); + int _do_read( + OnodeRef o, + uint64_t offset, + size_t len, + bufferlist& bl, + uint32_t op_flags = 0); + + int fiemap(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl); + int getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr& value); + int getattrs(coll_t cid, const ghobject_t& oid, map& aset); + + int list_collections(vector& ls); + bool collection_exists(coll_t c); + bool collection_empty(coll_t c); + int collection_list(coll_t cid, vector& o); + int collection_list_partial(coll_t cid, ghobject_t start, + int min, int max, snapid_t snap, + vector *ls, ghobject_t *next); + int collection_list_range(coll_t cid, ghobject_t start, ghobject_t end, + snapid_t seq, vector *ls); + + int omap_get( + coll_t cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + 
bufferlist *header, ///< [out] omap header + map *out /// < [out] Key to value map + ); + + /// Get omap header + int omap_get_header( + coll_t cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + bool allow_eio = false ///< [in] don't assert on eio + ); + + /// Get keys defined on oid + int omap_get_keys( + coll_t cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + set *keys ///< [out] Keys defined on oid + ); + + /// Get key values + int omap_get_values( + coll_t cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set &keys, ///< [in] Keys to get + map *out ///< [out] Returned keys and values + ); + + /// Filters keys into out which are defined on oid + int omap_check_keys( + coll_t cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set &keys, ///< [in] Keys to check + set *out ///< [out] Subset of keys defined on oid + ); + + ObjectMap::ObjectMapIterator get_omap_iterator( + coll_t cid, ///< [in] collection + const ghobject_t &oid ///< [in] object + ); + + void set_fsid(uuid_d u) { + fsid = u; + } + uuid_d get_fsid() { + return fsid; + } + + objectstore_perf_stat_t get_cur_stats() { + return objectstore_perf_stat_t(); + } + + int queue_transactions( + Sequencer *osr, + list& tls, + TrackedOpRef op = TrackedOpRef(), + ThreadPool::TPHandle *handle = NULL); + +private: + // -------------------------------------------------------- + // write ops + + int _do_transaction(Transaction *t, + TransContext *txc, + ThreadPool::TPHandle *handle); + + int _write(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + uint64_t offset, size_t len, + const bufferlist& bl, + uint32_t fadvise_flags); + int _do_write(TransContext *txc, + OnodeRef o, + uint64_t offset, uint64_t length, + const bufferlist& bl, + uint32_t 
fadvise_flags); + int _touch(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid); + int _zero(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + uint64_t offset, size_t len); + int _do_truncate(TransContext *txc, + OnodeRef o, + uint64_t offset); + int _truncate(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + uint64_t offset); + int _remove(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid); + int _do_remove(TransContext *txc, + OnodeRef o); + int _setattr(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + const string& name, + bufferptr& val); + int _setattrs(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + const map& aset); + int _rmattr(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + const string& name); + int _rmattrs(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid); + void _do_omap_clear(TransContext *txc, uint64_t id, bool remove_tail); + int _omap_clear(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid); + int _omap_setkeys(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + const map& m); + int _omap_setheader(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + bufferlist& header); + int _omap_rmkeys(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + const set& m); + int _omap_rmkey_range(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + const string& first, const string& last); + int _setallochint(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + uint64_t expected_object_size, + uint64_t expected_write_size); + int _clone(TransContext *txc, + CollectionRef& c, + const ghobject_t& old_oid, + const ghobject_t& new_oid); + int _clone_range(TransContext *txc, + CollectionRef& c, + const ghobject_t& old_oid, + const ghobject_t& new_oid, + uint64_t srcoff, uint64_t length, uint64_t dstoff); + int _rename(TransContext *txc, + CollectionRef& c, + const 
ghobject_t& old_oid, + const ghobject_t& new_oid); + int _create_collection(TransContext *txc, coll_t cid, unsigned bits, + CollectionRef *c); + int _remove_collection(TransContext *txc, coll_t cid, CollectionRef *c); + int _split_collection(TransContext *txc, + CollectionRef& c, + CollectionRef& d, + unsigned bits, int rem); + +}; + +inline ostream& operator<<(ostream& out, const NewStore::OpSequencer& s) { + return out << *s.parent; +} + +static inline void intrusive_ptr_add_ref(NewStore::Onode *o) { + o->get(); +} +static inline void intrusive_ptr_release(NewStore::Onode *o) { + o->put(); +} + +#endif diff --git a/src/os/newstore/newstore_types.cc b/src/os/newstore/newstore_types.cc new file mode 100644 index 0000000000000..3e5d0aca66429 --- /dev/null +++ b/src/os/newstore/newstore_types.cc @@ -0,0 +1,208 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "newstore_types.h" +#include "common/Formatter.h" + +// cnode_t + +void cnode_t::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + ::encode(bits, bl); + ENCODE_FINISH(bl); +} + +void cnode_t::decode(bufferlist::iterator& p) +{ + DECODE_START(1, p); + ::decode(bits, p); + DECODE_FINISH(p); +} + +void cnode_t::dump(Formatter *f) const +{ + f->dump_unsigned("bits", bits); +} + +void cnode_t::generate_test_instances(list& o) +{ + o.push_back(new cnode_t()); + o.push_back(new cnode_t(0)); + o.push_back(new cnode_t(123)); +} + +// fit_t + +void fid_t::dump(Formatter *f) const +{ + f->dump_unsigned("fset", fset); + f->dump_unsigned("fno", fno); +} + +void fid_t::generate_test_instances(list& o) +{ + o.push_back(new fid_t()); + o.push_back(new fid_t(0, 1)); + o.push_back(new fid_t(123, 3278)); +} + +// fragment_t + +void fragment_t::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + ::encode(offset, bl); + ::encode(length, bl); + ::encode(fid, bl); + ENCODE_FINISH(bl); +} + +void fragment_t::decode(bufferlist::iterator& p) +{ + DECODE_START(1, p); + ::decode(offset, p); + ::decode(length, p); + ::decode(fid, p); + DECODE_FINISH(p); +} + +void fragment_t::dump(Formatter *f) const +{ + f->dump_unsigned("offset", offset); + f->dump_unsigned("length", length); + f->dump_object("fid", fid); +} + +void fragment_t::generate_test_instances(list& o) +{ + o.push_back(new fragment_t()); + o.push_back(new fragment_t(123, 456)); + o.push_back(new fragment_t(789, 1024, fid_t(3, 400))); +} + +// onode_t + +void onode_t::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + ::encode(size, bl); + ::encode(attrs, bl); + ::encode(data_map, bl); + ::encode(omap_head, bl); + ::encode(expected_object_size, bl); + ::encode(expected_write_size, bl); + ENCODE_FINISH(bl); +} + +void onode_t::decode(bufferlist::iterator& p) +{ + DECODE_START(1, p); + ::decode(size, p); + ::decode(attrs, p); + ::decode(data_map, p); + ::decode(omap_head, p); + 
::decode(expected_object_size, p); + ::decode(expected_write_size, p); + DECODE_FINISH(p); +} + +void onode_t::dump(Formatter *f) const +{ + f->dump_unsigned("size", size); + f->open_object_section("attrs"); + for (map::const_iterator p = attrs.begin(); + p != attrs.end(); ++p) { + f->open_object_section("attr"); + f->dump_string("name", p->first); + f->dump_unsigned("len", p->second.length()); + f->close_section(); + } + f->open_object_section("data_map"); + for (map::const_iterator p = data_map.begin(); + p != data_map.end(); ++p) { + f->open_object_section("fragment"); + f->dump_unsigned("fragment_offset", p->first); + p->second.dump(f); + f->close_section(); + } + f->close_section(); + f->dump_unsigned("omap_head", omap_head); + f->dump_unsigned("expected_object_size", expected_object_size); + f->dump_unsigned("expected_write_size", expected_write_size); +} + +void onode_t::generate_test_instances(list& o) +{ + o.push_back(new onode_t()); + // FIXME +} + +// wal_op_t + +void wal_op_t::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + ::encode(op, bl); + ::encode(fid, bl); + ::encode(offset, bl); + ::encode(length, bl); + ::encode(data, bl); + ENCODE_FINISH(bl); +} + +void wal_op_t::decode(bufferlist::iterator& p) +{ + DECODE_START(1, p); + ::decode(op, p); + ::decode(fid, p); + ::decode(offset, p); + ::decode(length, p); + ::decode(data, p); + DECODE_FINISH(p); +} + +void wal_op_t::dump(Formatter *f) const +{ + f->dump_unsigned("op", (int)op); + f->dump_object("fid", fid); + f->dump_unsigned("offset", offset); + f->dump_unsigned("length", length); +} + +void wal_transaction_t::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + ::encode(seq, bl); + ::encode(ops, bl); + ENCODE_FINISH(bl); +} + +void wal_transaction_t::decode(bufferlist::iterator& p) +{ + DECODE_START(1, p); + ::decode(seq, p); + ::decode(ops, p); + DECODE_FINISH(p); +} + +void wal_transaction_t::dump(Formatter *f) const +{ + f->dump_unsigned("seq", seq); + 
f->open_array_section("ops"); + for (list::const_iterator p = ops.begin(); p != ops.end(); ++p) { + f->dump_object("op", *p); + } + f->close_section(); +} diff --git a/src/os/newstore/newstore_types.h b/src/os/newstore/newstore_types.h new file mode 100644 index 0000000000000..6d9ddfc778d1b --- /dev/null +++ b/src/os/newstore/newstore_types.h @@ -0,0 +1,147 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_OSD_NEWSTORE_TYPES_H +#define CEPH_OSD_NEWSTORE_TYPES_H + +#include +#include "include/types.h" + +namespace ceph { + class Formatter; +} + +/// collection metadata +struct cnode_t { + uint32_t bits; ///< how many bits of coll pgid are significant + + cnode_t(int b=0) : bits(b) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(cnode_t) + +/// unique id for a local file +struct fid_t { + uint32_t fset, fno; + string handle; + fid_t() : fset(0), fno(0) { } + fid_t(uint32_t s, uint32_t n) : fset(s), fno(n) { } + + void encode(bufferlist& bl) const { + ::encode(fset, bl); + ::encode(fno, bl); + ::encode(handle, bl); + } + void decode(bufferlist::iterator& p) { + ::decode(fset, p); + ::decode(fno, p); + ::decode(handle, p); + } + void dump(Formatter *f) const; + static void generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(fid_t) + +static inline ostream& operator<<(ostream& out, const fid_t& fid) { + out << fid.fset << "/" << fid.fno; + if (fid.handle.length()) + out << "~"; + return out; +} + +static inline bool operator==(const fid_t& a, const 
fid_t& b) { + return a.fset == b.fset && a.fno == b.fno && a.handle == b.handle; +} +static inline bool operator!=(const fid_t& a, const fid_t& b) { + return !(a == b); +} + +/// fragment: a byte extent backed by a file +struct fragment_t { + uint32_t offset; ///< offset in file to first byte of this fragment + uint32_t length; ///< length of fragment/extent + fid_t fid; ///< file backing this fragment + + fragment_t() : offset(0), length(0) {} + fragment_t(uint32_t o, uint32_t l) : offset(o), length(l) {} + fragment_t(uint32_t o, uint32_t l, fid_t f) : offset(o), length(l), fid(f) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(fragment_t) + +/// onode: per-object metadata +struct onode_t { + uint64_t size; ///< object size + map attrs; ///< attrs + map data_map; ///< data (offset to fragment mapping) + uint64_t omap_head; ///< id for omap root node + + uint32_t expected_object_size; + uint32_t expected_write_size; + + onode_t() + : size(0), omap_head(0), + expected_object_size(0), + expected_write_size(0) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(onode_t) + + +/// writeahead-logged op +struct wal_op_t { + typedef enum { + OP_WRITE = 1, + OP_TRUNCATE = 3, + OP_ZERO = 4, + OP_REMOVE = 5, + } type_t; + __u8 op; + fid_t fid; + uint64_t offset, length; + bufferlist data; + + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(wal_op_t) + + +/// writeahead-logged transaction +struct wal_transaction_t { + uint64_t seq; + list ops; + + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& p); + void dump(Formatter *f) const; + static void 
generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(wal_transaction_t) + +#endif diff --git a/src/vstart.sh b/src/vstart.sh index 467b197c4318d..70ee0f14f1bfe 100755 --- a/src/vstart.sh +++ b/src/vstart.sh @@ -197,6 +197,9 @@ case $1 in --memstore ) memstore=1 ;; + --newstore ) + newstore=1 + ;; --hitset ) hitset="$hitset $2 $3" shift @@ -295,6 +298,10 @@ if [ "$memstore" -eq 1 ]; then COSDMEMSTORE=' osd objectstore = memstore' fi +if [ "$newstore" -eq 1 ]; then + COSDMEMSTORE=' + osd objectstore = newstore' +fi # lockdep everywhere? # export CEPH_ARGS="--lockdep 1" From db87e423b6ae05a167fcb3baf755915379ba18e2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Aug 2015 10:10:25 -0400 Subject: [PATCH 338/654] os/newstore: clone omap Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 40 ++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 67c251bd4f227..1f3f62eb2daf2 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -304,6 +304,14 @@ void get_omap_key(uint64_t id, const string& key, string *out) out->append(key); } +void rewrite_omap_key(uint64_t id, string old, string *out) +{ + char buf[32]; + snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)id); + *out = buf; + out->append(old.substr(16)); +} + void decode_omap_key(const string& key, string *user_key) { *user_key = key.substr(17); @@ -486,8 +494,7 @@ NewStore::Collection::Collection(NewStore *ns, coll_t c) : store(ns), cid(c), lock("NewStore::Collection::lock"), - onode_map() //store->cct, store->cct->_conf->newstore_onode_map_size) -#warning fixme size the lru/cache + onode_map() { } @@ -3501,7 +3508,34 @@ int NewStore::_clone(TransContext *txc, r = _do_write(txc, newo, 0, oldo->onode.size, bl, 0); newo->onode.attrs = oldo->onode.attrs; - // fixme: omap + + // clone omap + if (o->onode.omap_head) { + dout(20) << __func__ << " clearing old omap 
data" << dendl; + _do_omap_clear(txc, o->onode.omap_head, true); + } + if (oldo->onode.omap_head) { + dout(20) << __func__ << " copying omap data" << dendl; + _get_omap_id(txc, o); + KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); + string head, tail; + get_omap_header(oldo->onode.omap_head, &head); + get_omap_tail(oldo->onode.omap_head, &tail); + it->lower_bound(head); + while (it->valid()) { + string key; + if (it->key() == tail) { + dout(30) << __func__ << " reached tail" << dendl; + break; + } else { + dout(30) << __func__ << " got header/data " << it->key() << dendl; + assert(it->key() < tail); + rewrite_omap_key(o->onode.omap_head, it->key(), &key); + txc->t->set(PREFIX_OMAP, key, it->value()); + } + it->next(); + } + } txc->write_onode(newo); From fbf3d5528fd464f2e2591952fc544e1e9d798f7f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 1 Apr 2015 09:46:53 -0700 Subject: [PATCH 339/654] os/newstore: send complete overwrite to a new fid Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 37 ++++++++++++++++++++++++++---- src/test/objectstore/store_test.cc | 15 ++++++++++++ 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 1f3f62eb2daf2..a0ce422f4a854 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -47,6 +47,7 @@ * abstract out fs specifics * fid xattr backpointer * kill collection_list_range + * inline first fsync_item in TransContext to void allocation? 
*/ @@ -2896,6 +2897,34 @@ int NewStore::_do_write(TransContext *txc, goto out; } txc->sync_fd(fd); + } else if (offset == 0 && + length >= o->onode.size) { + // overwrite to new fid + assert(o->onode.data_map.size() == 1); + fragment_t& f = o->onode.data_map.begin()->second; + assert(f.offset == 0); + assert(f.length == o->onode.size); + + wal_op_t *op = _get_wal_op(txc); + op->op = wal_op_t::OP_REMOVE; + op->fid = f.fid; + + f.length = length; + o->onode.size = length; + fd = _create_fid(txc, &f.fid); + if (fd < 0) { + r = fd; + goto out; + } + dout(20) << __func__ << " replace old fid " << op->fid + << " with new fid " << f.fid + << ", writing " << offset << "~" << length << dendl; + r = bl.write_fd(fd); + if (r < 0) { + derr << __func__ << " bl.write_fd error: " << cpp_strerror(r) << dendl; + goto out; + } + txc->sync_fd(fd); } else { // WAL assert(o->onode.data_map.size() == 1); @@ -3510,13 +3539,13 @@ int NewStore::_clone(TransContext *txc, newo->onode.attrs = oldo->onode.attrs; // clone omap - if (o->onode.omap_head) { + if (newo->onode.omap_head) { dout(20) << __func__ << " clearing old omap data" << dendl; - _do_omap_clear(txc, o->onode.omap_head, true); + _do_omap_clear(txc, newo->onode.omap_head, true); } if (oldo->onode.omap_head) { dout(20) << __func__ << " copying omap data" << dendl; - _get_omap_id(txc, o); + _get_omap_id(txc, newo); KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); string head, tail; get_omap_header(oldo->onode.omap_head, &head); @@ -3530,7 +3559,7 @@ int NewStore::_clone(TransContext *txc, } else { dout(30) << __func__ << " got header/data " << it->key() << dendl; assert(it->key() < tail); - rewrite_omap_key(o->onode.omap_head, it->key(), &key); + rewrite_omap_key(newo->onode.omap_head, it->key(), &key); txc->t->set(PREFIX_OMAP, key, it->value()); } it->next(); diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index 7cb07664e1fd0..8ce487823446c 100644 --- a/src/test/objectstore/store_test.cc 
+++ b/src/test/objectstore/store_test.cc @@ -397,6 +397,21 @@ TEST_P(StoreTest, SimpleObjectTest) { in.hexdump(cout); ASSERT_TRUE(in.contents_equal(exp)); } + { + ObjectStore::Transaction t; + bufferlist bl; + bl.append("abcde01234012340123401234abcde01234012340123401234abcde01234012340123401234abcde01234012340123401234"); + t.write(cid, hoid, 0, bl.length(), bl); + cerr << "larger overwrite" << std::endl; + r = store->apply_transaction(t); + ASSERT_EQ(r, 0); + + bufferlist in; + r = store->read(cid, hoid, 0, bl.length(), in); + ASSERT_EQ(bl.length(), r); + in.hexdump(cout); + ASSERT_TRUE(in.contents_equal(bl)); + } { ObjectStore::Transaction t; t.remove(cid, hoid); From d8351a8d9edb34be1c3768e8bcdb5f1354068b9b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 1 Apr 2015 14:44:45 -0700 Subject: [PATCH 340/654] os/newstore: ref count OpSequencer Our OpSequencer may live longer than the ObjectStore::Sequencer interface object does. Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 6 +++--- src/os/newstore/NewStore.h | 10 +++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index a0ce422f4a854..62a61f08e466f 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2048,7 +2048,7 @@ void NewStore::_txc_finish_fsync(TransContext *txc) * even though fsyncs will complete in any order. 
*/ - OpSequencer *osr = txc->osr; + OpSequencer *osr = txc->osr.get(); Mutex::Locker l(osr->qlock); txc->state = TransContext::STATE_FSYNC_DONE; @@ -2200,12 +2200,12 @@ void NewStore::_txc_finish_apply(TransContext *txc) txc->removed_collections.pop_front(); } - OpSequencer *osr = txc->osr; + OpSequencerRef osr = txc->osr; osr->qlock.Lock(); txc->state = TransContext::STATE_DONE; osr->qlock.Unlock(); - _osr_reap_done(osr); + _osr_reap_done(osr.get()); } void NewStore::_osr_reap_done(OpSequencer *osr) diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index 88694b51cba69..e40bf0bb6cee9 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -147,6 +147,7 @@ class NewStore : public ObjectStore { }; class OpSequencer; + typedef boost::intrusive_ptr OpSequencerRef; struct fsync_item { boost::intrusive::list_member_hook<> queue_item; @@ -191,7 +192,7 @@ class NewStore : public ObjectStore { return "???"; } - OpSequencer *osr; + OpSequencerRef osr; boost::intrusive::list_member_hook<> sequencer_item; list fds; ///< these fds need to be synced @@ -717,4 +718,11 @@ static inline void intrusive_ptr_release(NewStore::Onode *o) { o->put(); } +static inline void intrusive_ptr_add_ref(NewStore::OpSequencer *o) { + o->get(); +} +static inline void intrusive_ptr_release(NewStore::OpSequencer *o) { + o->put(); +} + #endif From a4d2a53cf6a952ee1e93c2a17e729da13644208f Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Fri, 3 Apr 2015 09:48:43 +0800 Subject: [PATCH 341/654] Clear removed_collections after reap Previous code forgot to clear the removed_collections queues after reaped the collections in _reap_collection. 
Signed-off-by: Xiaoxi Chen --- src/os/newstore/NewStore.cc | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 62a61f08e466f..e200c1f7df91a 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -1043,9 +1043,14 @@ void NewStore::_queue_reap_collection(CollectionRef& c) void NewStore::_reap_collections() { - Mutex::Locker l(reap_lock); - for (list::iterator p = removed_collections.begin(); - p != removed_collections.end(); + reap_lock.Lock(); + + list removed_colls; + removed_colls.swap(removed_collections); + reap_lock.Unlock(); + + for (list::iterator p = removed_colls.begin(); + p != removed_colls.end(); ++p) { CollectionRef c = *p; dout(10) << __func__ << " " << c->cid << dendl; From 713c69884ea90b601bd9efb6d8c0c2d7acdacb24 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 28 Jul 2015 12:59:39 -0400 Subject: [PATCH 342/654] os/newstore: consolite collection_list to a single implementation Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 161 ++++++------------------------------ src/os/newstore/NewStore.h | 10 +-- 2 files changed, 27 insertions(+), 144 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index e200c1f7df91a..d42721ddfda37 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -1345,8 +1345,8 @@ bool NewStore::collection_empty(coll_t cid) dout(15) << __func__ << " " << cid << dendl; vector ls; ghobject_t next; - int r = collection_list_partial(cid, ghobject_t(), 5, 5, CEPH_NOSNAP, - &ls, &next); + int r = collection_list(cid, ghobject_t(), ghobject_t::get_max(), true, 5, + &ls, &next); if (r < 0) return false; // fixme? 
bool empty = ls.empty(); @@ -1354,59 +1354,15 @@ bool NewStore::collection_empty(coll_t cid) return empty; } -int NewStore::collection_list(coll_t cid, vector& o) -{ - dout(15) << __func__ << " " << cid << dendl; - CollectionRef c = _get_collection(cid); - if (!c) - return -ENOENT; - RWLock::RLocker l(c->lock); - int r = 0; - KeyValueDB::Iterator it = db->get_iterator(PREFIX_OBJ); - string temp_start_key, temp_end_key; - string start_key, end_key; - bool temp = true; - const char *end; - get_coll_key_range(cid, c->cnode.bits, &temp_start_key, &temp_end_key, - &start_key, &end_key); - dout(20) << __func__ << " range " << temp_start_key << " to " << temp_end_key - << " and " << start_key << " to " << end_key << dendl; - end = temp_start_key.c_str(); - it->upper_bound(temp_start_key); - while (true) { - if (!it->valid() || strcmp(it->key().c_str(), end) > 0) { - if (!it->valid()) - dout(20) << __func__ << " iterator not valid (end of db?)" << dendl; - else - dout(20) << __func__ << " key " << it->key() << " > " << end << dendl; - if (temp) { - dout(30) << __func__ << " switch to non-temp namespace" << dendl; - temp = false; - it->upper_bound(start_key); - end = end_key.c_str(); - continue; - } - break; - } - dout(20) << __func__ << " key " << it->key() << dendl; - ghobject_t oid; - int r = get_key_object(it->key(), &oid); - assert(r == 0); - o.push_back(oid); - it->next(); - } - dout(10) << __func__ << " " << cid << " = " << r << dendl; - return r; -} - -int NewStore::collection_list_partial( - coll_t cid, ghobject_t start, - int min, int max, snapid_t snap, +int NewStore::collection_list( + coll_t cid, ghobject_t start, ghobject_t end, + bool sort_bitwise, int max, vector *ls, ghobject_t *pnext) { dout(15) << __func__ << " " << cid - << " start " << start << " min/max " << min << "/" << max - << " snap " << snap << dendl; + << " start " << start << " end " << end << " max " << max << dendl; + if (!sort_bitwise) + return -EOPNOTSUPP; CollectionRef c = 
_get_collection(cid); if (!c) return -ENOENT; @@ -1416,97 +1372,17 @@ int NewStore::collection_list_partial( string temp_start_key, temp_end_key; string start_key, end_key; bool set_next = false; - const char *end; - bool temp; - - if (start == ghobject_t::get_max()) - goto out; - get_coll_key_range(cid, c->cnode.bits, &temp_start_key, &temp_end_key, - &start_key, &end_key); - dout(20) << __func__ << " range " << temp_start_key << " to " - << temp_end_key << " and " << start_key << " to " << end_key - << " start " << start << dendl; - it = db->get_iterator(PREFIX_OBJ); - if (start == ghobject_t()) { - it->upper_bound(temp_start_key); - temp = true; - } else { - string k; - get_object_key(start, &k); - if (start.hobj.is_temp()) { - temp = true; - assert(k >= temp_start_key && k < temp_end_key); - } else { - temp = false; - assert(k >= start_key && k < end_key); - } - it->upper_bound(k); - } - end = temp ? temp_end_key.c_str() : end_key.c_str(); - while (true) { - if (!it->valid() || strcmp(it->key().c_str(), end) > 0) { - if (!it->valid()) - dout(20) << __func__ << " iterator not valid (end of db?)" << dendl; - else - dout(20) << __func__ << " key " << it->key() << " > " << end << dendl; - if (temp) { - dout(30) << __func__ << " switch to non-temp namespace" << dendl; - temp = false; - it->upper_bound(start_key); - end = end_key.c_str(); - continue; - } - break; - } - dout(20) << __func__ << " key " << it->key() << dendl; - ghobject_t oid; - int r = get_key_object(it->key(), &oid); - assert(r == 0); - ls->push_back(oid); - if (ls->size() >= (unsigned)max) { - *pnext = oid; - set_next = true; - break; - } - it->next(); - } - if (!set_next) { - *pnext = ghobject_t::get_max(); - } - out: - dout(10) << __func__ << " " << cid - << " start " << start << " min/max " << min << "/" << max - << " snap " << snap << " = " << r << ", ls.size() = " << ls->size() - << ", next = " << *pnext << dendl; - return r; -} - -int NewStore::collection_list_range( - coll_t cid, ghobject_t 
start, ghobject_t end, - snapid_t seq, vector *ls) -{ - dout(15) << __func__ << " " << cid - << " start " << start << " end " << end - << " snap " << seq << dendl; - CollectionRef c = _get_collection(cid); - if (!c) - return -ENOENT; - RWLock::RLocker l(c->lock); - int r = 0; - KeyValueDB::Iterator it; - string temp_start_key, temp_end_key; - string start_key, end_key; string end_str; const char *pend; bool temp; - if (start == ghobject_t::get_max() || end == ghobject_t()) + if (start == ghobject_t::get_max()) goto out; get_coll_key_range(cid, c->cnode.bits, &temp_start_key, &temp_end_key, &start_key, &end_key); dout(20) << __func__ << " range " << temp_start_key << " to " << temp_end_key << " and " << start_key << " to " << end_key - << " start " << start << " end " << end << dendl; + << " start " << start << dendl; it = db->get_iterator(PREFIX_OBJ); if (start == ghobject_t()) { it->upper_bound(temp_start_key); @@ -1537,7 +1413,7 @@ int NewStore::collection_list_range( if (!it->valid()) dout(20) << __func__ << " iterator not valid (end of db?)" << dendl; else - dout(20) << __func__ << " key " << it->key() << " > " << pend << dendl; + dout(20) << __func__ << " key " << it->key() << " > " << end << dendl; if (temp) { if (end.hobj.is_temp()) { break; @@ -1545,7 +1421,7 @@ int NewStore::collection_list_range( dout(30) << __func__ << " switch to non-temp namespace" << dendl; temp = false; it->upper_bound(start_key); - pend = end_str.c_str(); + pend = end_key.c_str(); continue; } break; @@ -1555,12 +1431,21 @@ int NewStore::collection_list_range( int r = get_key_object(it->key(), &oid); assert(r == 0); ls->push_back(oid); + if (ls->size() >= (unsigned)max) { + *pnext = oid; + set_next = true; + break; + } it->next(); } + if (!set_next) { + *pnext = ghobject_t::get_max(); + } out: dout(10) << __func__ << " " << cid - << " start " << start << " end " << end - << " snap " << seq << " = " << r << dendl; + << " start " << start << " end " << end << " max " << max + << " = " 
<< r << ", ls.size() = " << ls->size() + << ", next = " << *pnext << dendl; return r; } diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index e40bf0bb6cee9..a8d28e18896be 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -538,12 +538,10 @@ class NewStore : public ObjectStore { int list_collections(vector& ls); bool collection_exists(coll_t c); bool collection_empty(coll_t c); - int collection_list(coll_t cid, vector& o); - int collection_list_partial(coll_t cid, ghobject_t start, - int min, int max, snapid_t snap, - vector *ls, ghobject_t *next); - int collection_list_range(coll_t cid, ghobject_t start, ghobject_t end, - snapid_t seq, vector *ls); + + int collection_list(coll_t cid, ghobject_t start, ghobject_t end, + bool sort_bitwise, int max, + vector *ls, ghobject_t *next); int omap_get( coll_t cid, ///< [in] Collection containing oid From 2af1e37d7de399b093a300d7beda83ec74cd3699 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Aug 2015 10:10:54 -0400 Subject: [PATCH 343/654] os/newstore: assigned unique nid to each new object Use this as the key for omap (omap_head), but keep the omap_head field so that we can tell when no omap data is present. 
Signed-off-by: Sage Weil --- src/common/config_opts.h | 1 + src/os/newstore/NewStore.cc | 99 +++++++++++++++++-------------- src/os/newstore/NewStore.h | 10 ++-- src/os/newstore/newstore_types.cc | 3 + src/os/newstore/newstore_types.h | 5 +- 5 files changed, 68 insertions(+), 50 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 6316c31bd04cf..96a9f78726b6d 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -799,6 +799,7 @@ OPTION(newstore_fsync_threads, OPT_INT, 16) // num threads calling fsync OPTION(newstore_fsync_thread_timeout, OPT_INT, 30) // thread timeout value OPTION(newstore_fsync_thread_suicide_timeout, OPT_INT, 120) // suicide timeout value OPTION(newstore_fid_prealloc, OPT_INT, 1024) +OPTION(newstore_nid_prealloc, OPT_INT, 1024) OPTION(filestore_omap_backend, OPT_STR, "leveldb") diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index d42721ddfda37..87a8fbab01ad9 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -565,6 +565,8 @@ NewStore::NewStore(CephContext *cct, const string& path) mounted(false), coll_lock("NewStore::coll_lock"), fid_lock("NewStore::fid_lock"), + nid_lock("NewStore::nid_lock"), + nid_max(0), wal_lock("NewStore::wal_lock"), wal_seq(0), finisher(cct), @@ -921,7 +923,7 @@ int NewStore::mount() if (r < 0) goto out_db; - r = _recover_next_omap_id(); + r = _recover_next_nid(); if (r < 0) goto out_db; @@ -1509,7 +1511,7 @@ int NewStore::OmapIteratorImpl::next() { RWLock::RLocker l(c->lock); it->next(); - if (!it->valid() || it->key() == tail) { + if (!it->valid() || it->key() >= tail) { it = KeyValueDB::Iterator(); } return 0; @@ -1559,7 +1561,7 @@ int NewStore::omap_get( if (it->key() == head) { dout(30) << __func__ << " got header" << dendl; *header = it->value(); - } else if (it->key() == tail) { + } else if (it->key() >= tail) { dout(30) << __func__ << " reached tail" << dendl; break; } else { @@ -1642,7 +1644,7 @@ int 
NewStore::omap_get_keys( it->next(); continue; } - if (it->key() == tail) { + if (it->key() >= tail) { dout(30) << __func__ << " reached tail" << dendl; break; } @@ -1741,38 +1743,34 @@ ObjectMap::ObjectMapIterator NewStore::get_omap_iterator( // ----------------- // write helpers -int NewStore::_recover_next_omap_id() +int NewStore::_recover_next_nid() { - KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); - it->lower_bound("GGGGGGGG"); - if (!it->valid()) { - omap_id.set(1); - dout(10) << __func__ << " no omap keys, starting at 1" << dendl; - return 0; - } - dout(20) << __func__ << " last key is " << it->key() << dendl; - uint64_t id; - int r = sscanf(it->key().c_str(), "%llx", (unsigned long long*)&id); - if (r < 0) { - derr << "unable to parse " << it->key() << dendl; - return -EIO; + nid_max = 0; + bufferlist bl; + db->get(PREFIX_SUPER, "nid_max", &bl); + try { + ::decode(nid_max, bl); + } catch (buffer::error& e) { } - omap_id.set(id); + dout(1) << __func__ << " old nid_max " << nid_max << dendl; + nid_last = nid_max; return 0; } -void NewStore::_get_omap_id(TransContext *txc, OnodeRef o) +void NewStore::_assign_nid(TransContext *txc, OnodeRef o) { - if (o->onode.omap_head) + if (o->onode.nid) return; - - o->onode.omap_head = omap_id.inc(); - dout(10) << __func__ << " assigned " << o->oid - << " id " << o->onode.omap_head << dendl; - string tail; - get_omap_tail(o->onode.omap_head, &tail); - bufferlist empty; - txc->t->set(PREFIX_OMAP, tail, empty); + Mutex::Locker l(nid_lock); + o->onode.nid = ++nid_last; + dout(20) << __func__ << " " << o->onode.nid << dendl; + if (nid_last > nid_max) { + nid_max += g_conf->newstore_nid_prealloc; + bufferlist bl; + ::encode(nid_max, bl); + txc->t->set(PREFIX_SUPER, "nid_max", bl); + dout(10) << __func__ << " nid_max now " << nid_max << dendl; + } } int NewStore::_recover_next_fid() @@ -2723,6 +2721,7 @@ int NewStore::_touch(TransContext *txc, OnodeRef o = c->get_onode(oid, true); assert(o); o->exists = true; + 
_assign_nid(txc, o); txc->write_onode(o); dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl; return r; @@ -2889,6 +2888,7 @@ int NewStore::_write(TransContext *txc, << dendl; RWLock::WLocker l(c->lock); OnodeRef o = c->get_onode(oid, true); + _assign_nid(txc, o); int r = _do_write(txc, o, offset, length, bl, fadvise_flags); txc->write_onode(o); @@ -2910,6 +2910,7 @@ int NewStore::_zero(TransContext *txc, RWLock::WLocker l(c->lock); OnodeRef o = c->get_onode(oid, true); + _assign_nid(txc, o); if (o->onode.data_map.empty()) { // we're already a big hole @@ -3027,7 +3028,7 @@ int NewStore::_truncate(TransContext *txc, int r = 0; RWLock::WLocker l(c->lock); - OnodeRef o = c->get_onode(oid, true); + OnodeRef o = c->get_onode(oid, false); if (!o->exists) { r = -ENOENT; goto out; @@ -3059,7 +3060,7 @@ int NewStore::_do_remove(TransContext *txc, o->onode.data_map.clear(); o->onode.size = 0; if (o->onode.omap_head) { - _do_omap_clear(txc, o->onode.omap_head, true); + _do_omap_clear(txc, o->onode.omap_head); } get_object_key(o->oid, &key); @@ -3074,7 +3075,7 @@ int NewStore::_remove(TransContext *txc, dout(15) << __func__ << " " << c->cid << " " << oid << dendl; int r; RWLock::WLocker l(c->lock); - OnodeRef o = c->get_onode(oid, true); + OnodeRef o = c->get_onode(oid, false); if (!o || !o->exists) { r = -ENOENT; goto out; @@ -3191,8 +3192,7 @@ int NewStore::_rmattrs(TransContext *txc, return r; } -void NewStore::_do_omap_clear(TransContext *txc, uint64_t id, - bool remove_tail) +void NewStore::_do_omap_clear(TransContext *txc, uint64_t id) { KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); string prefix, tail; @@ -3200,10 +3200,7 @@ void NewStore::_do_omap_clear(TransContext *txc, uint64_t id, get_omap_tail(id, &tail); it->lower_bound(prefix); while (it->valid()) { - if (it->key() == tail) { - if (remove_tail) { - txc->t->rmkey(PREFIX_OMAP, it->key()); - } + if (it->key() >= tail) { dout(30) << __func__ << " stop at " << tail << dendl; break; 
} @@ -3227,7 +3224,7 @@ int NewStore::_omap_clear(TransContext *txc, goto out; } if (o->onode.omap_head != 0) { - _do_omap_clear(txc, o->onode.omap_head, false); + _do_omap_clear(txc, o->onode.omap_head); } r = 0; @@ -3250,7 +3247,10 @@ int NewStore::_omap_setkeys(TransContext *txc, r = -ENOENT; goto out; } - _get_omap_id(txc, o); + if (!o->onode.omap_head) { + o->onode.omap_head = o->onode.nid; + txc->write_onode(o); + } for (map::const_iterator p = m.begin(); p != m.end(); ++p) { string key; get_omap_key(o->onode.omap_head, p->first, &key); @@ -3279,7 +3279,10 @@ int NewStore::_omap_setheader(TransContext *txc, r = -ENOENT; goto out; } - _get_omap_id(txc, o); + if (!o->onode.omap_head) { + o->onode.omap_head = o->onode.nid; + txc->write_onode(o); + } get_omap_header(o->onode.omap_head, &key); txc->t->set(PREFIX_OMAP, key, bl); r = 0; @@ -3307,7 +3310,10 @@ int NewStore::_omap_rmkeys(TransContext *txc, r = 0; goto out; } - _get_omap_id(txc, o); + if (!o->onode.omap_head) { + o->onode.omap_head = o->onode.nid; + txc->write_onode(o); + } for (set::const_iterator p = m.begin(); p != m.end(); ++p) { string key; get_omap_key(o->onode.omap_head, *p, &key); @@ -3411,6 +3417,7 @@ int NewStore::_clone(TransContext *txc, newo = c->get_onode(new_oid, true); assert(newo); newo->exists = true; + _assign_nid(txc, newo); r = _do_read(oldo, 0, oldo->onode.size, bl, 0); if (r < 0) @@ -3431,11 +3438,13 @@ int NewStore::_clone(TransContext *txc, // clone omap if (newo->onode.omap_head) { dout(20) << __func__ << " clearing old omap data" << dendl; - _do_omap_clear(txc, newo->onode.omap_head, true); + _do_omap_clear(txc, newo->onode.omap_head); } if (oldo->onode.omap_head) { dout(20) << __func__ << " copying omap data" << dendl; - _get_omap_id(txc, newo); + if (!newo->onode.omap_head) { + newo->onode.omap_head = newo->onode.nid; + } KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); string head, tail; get_omap_header(oldo->onode.omap_head, &head); @@ -3443,7 +3452,7 @@ int 
NewStore::_clone(TransContext *txc, it->lower_bound(head); while (it->valid()) { string key; - if (it->key() == tail) { + if (it->key() >= tail) { dout(30) << __func__ << " reached tail" << dendl; break; } else { diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index a8d28e18896be..137d1b6403693 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -378,7 +378,9 @@ class NewStore : public ObjectStore { fid_t fid_last; ///< last allocated fid fid_t fid_max; ///< max fid we can allocate before reserving more - atomic64_t omap_id; + Mutex nid_lock; + uint64_t nid_last; + uint64_t nid_max; Mutex wal_lock; atomic64_t wal_seq; @@ -432,8 +434,8 @@ class NewStore : public ObjectStore { int _open_fid(fid_t fid); int _remove_fid(fid_t fid); - int _recover_next_omap_id(); - void _get_omap_id(TransContext *txc, OnodeRef o); + int _recover_next_nid(); + void _assign_nid(TransContext *txc, OnodeRef o); int _clean_fid_tail(TransContext *txc, const fragment_t& f); @@ -657,7 +659,7 @@ class NewStore : public ObjectStore { int _rmattrs(TransContext *txc, CollectionRef& c, const ghobject_t& oid); - void _do_omap_clear(TransContext *txc, uint64_t id, bool remove_tail); + void _do_omap_clear(TransContext *txc, uint64_t id); int _omap_clear(TransContext *txc, CollectionRef& c, const ghobject_t& oid); diff --git a/src/os/newstore/newstore_types.cc b/src/os/newstore/newstore_types.cc index 3e5d0aca66429..53e0887b7615b 100644 --- a/src/os/newstore/newstore_types.cc +++ b/src/os/newstore/newstore_types.cc @@ -97,6 +97,7 @@ void fragment_t::generate_test_instances(list& o) void onode_t::encode(bufferlist& bl) const { ENCODE_START(1, 1, bl); + ::encode(nid, bl); ::encode(size, bl); ::encode(attrs, bl); ::encode(data_map, bl); @@ -109,6 +110,7 @@ void onode_t::encode(bufferlist& bl) const void onode_t::decode(bufferlist::iterator& p) { DECODE_START(1, p); + ::decode(nid, p); ::decode(size, p); ::decode(attrs, p); ::decode(data_map, p); @@ -120,6 +122,7 
@@ void onode_t::decode(bufferlist::iterator& p) void onode_t::dump(Formatter *f) const { + f->dump_unsigned("nid", nid); f->dump_unsigned("size", size); f->open_object_section("attrs"); for (map::const_iterator p = attrs.begin(); diff --git a/src/os/newstore/newstore_types.h b/src/os/newstore/newstore_types.h index 6d9ddfc778d1b..8411d6bb44e6f 100644 --- a/src/os/newstore/newstore_types.h +++ b/src/os/newstore/newstore_types.h @@ -90,6 +90,7 @@ WRITE_CLASS_ENCODER(fragment_t) /// onode: per-object metadata struct onode_t { + uint64_t nid; ///< numeric id (locally unique) uint64_t size; ///< object size map attrs; ///< attrs map data_map; ///< data (offset to fragment mapping) @@ -99,7 +100,9 @@ struct onode_t { uint32_t expected_write_size; onode_t() - : size(0), omap_head(0), + : nid(0), + size(0), + omap_head(0), expected_object_size(0), expected_write_size(0) {} From 59cd761bca2f7475fc8d121d4ab3ee8f982ebfac Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 7 Apr 2015 11:25:00 -0700 Subject: [PATCH 344/654] os/newstore: keep smallish overlay extents in kv db If we have a small overwrite, keep the extent in the key/value database. Only write it back to the file/fragment later, and when we do, write them all at once. 
Signed-off-by: Sage Weil --- src/common/config_opts.h | 2 + src/os/newstore/NewStore.cc | 469 ++++++++++++++++++++++++------ src/os/newstore/NewStore.h | 13 + src/os/newstore/newstore_types.cc | 68 +++++ src/os/newstore/newstore_types.h | 25 ++ 5 files changed, 484 insertions(+), 93 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 96a9f78726b6d..828d534a48c3b 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -800,6 +800,8 @@ OPTION(newstore_fsync_thread_timeout, OPT_INT, 30) // thread timeout value OPTION(newstore_fsync_thread_suicide_timeout, OPT_INT, 120) // suicide timeout value OPTION(newstore_fid_prealloc, OPT_INT, 1024) OPTION(newstore_nid_prealloc, OPT_INT, 1024) +OPTION(newstore_overlay_max_length, OPT_INT, 65536) +OPTION(newstore_overlay_max, OPT_INT, 32) OPTION(filestore_omap_backend, OPT_STR, "leveldb") diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 87a8fbab01ad9..d23a391608afa 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -54,6 +54,7 @@ const string PREFIX_SUPER = "S"; // field -> value const string PREFIX_COLL = "C"; // collection name -> (nothing) const string PREFIX_OBJ = "O"; // object name -> onode +const string PREFIX_OVERLAY = "V"; // u64 + offset -> value const string PREFIX_OMAP = "M"; // u64 + keyname -> value const string PREFIX_WAL = "L"; // write ahead log @@ -287,6 +288,16 @@ static int get_key_object(const string& key, ghobject_t *oid) return 0; } + +void get_overlay_key(uint64_t nid, uint64_t offset, string *out) +{ + char buf[64]; + // note: these don't have to sort by nid; no need to pad 0's + snprintf(buf, sizeof(buf), "%llx %016llx", (unsigned long long)nid, + (unsigned long long)offset); + *out = buf; +} + // '-' < '.' 
< '~' void get_omap_header(uint64_t id, string *out) { @@ -1157,7 +1168,8 @@ int NewStore::_do_read( bufferlist& bl, uint32_t op_flags) { - map::iterator p; + map::iterator fp, fend; + map::iterator op, oend; int r; int fd = -1; fid_t cur_fid; @@ -1178,79 +1190,113 @@ int NewStore::_do_read( r = 0; - p = o->onode.data_map.begin(); // fixme - if (p->first > offset && p != o->onode.data_map.begin()) { - --p; + // loop over overlays and data fragments. overlays take precedence. + fend = o->onode.data_map.end(); + fp = o->onode.data_map.begin(); // fixme + if (fp != o->onode.data_map.begin()) { + --fp; } - for ( ; length > 0 && p != o->onode.data_map.end(); ++p) { - assert(p->first == 0); - assert(p->second.offset == 0); - assert(p->second.length == o->onode.size); - dout(30) << __func__ << " x " << p->first << "~" << p->second.length - << " in " << p->second.fid << dendl; - if (p->first + p->second.length <= offset) { - dout(30) << __func__ << " skipping " << p->first << "~" << p->second.length + oend = o->onode.overlay_map.end(); + op = o->onode.overlay_map.begin(); // fixme + if (op != o->onode.overlay_map.begin()) { + --op; + } + while (length > 0) { + if (op != oend && op->first + op->second.length < offset) { + dout(20) << __func__ << " skip overlay " << op->first << " " << op->second << dendl; + ++op; continue; } - if (p->first > offset) { - unsigned l = p->first - offset; - dout(30) << __func__ << " zero " << offset << "~" << l << dendl; - bufferptr bp(l); - bp.zero(); - bl.append(bp); - length = length - l; - } - if (p->second.fid != cur_fid) { - cur_fid = p->second.fid; - if (fd >= 0) { - VOID_TEMP_FAILURE_RETRY(::close(fd)); - } - fd = _open_fid(cur_fid); - if (fd < 0) { - r = fd; - goto out; - } - } - unsigned x_off; - if (p->first < offset) { - x_off = offset - p->first; - } else { - x_off = 0; + if (fp != fend && fp->first + fp->second.length <= offset) { + dout(30) << __func__ << " skip frag " << fp->first << "~" << fp->second + << dendl; + ++fp; + 
continue; } - unsigned x_len = MIN(length, p->second.length - x_off); - dout(30) << __func__ << " data " << offset << "~" << x_len - << " fid " << cur_fid << " offset " << x_off + p->second.offset - << dendl; - r = ::lseek64(fd, p->second.offset + x_off, SEEK_SET); - if (r < 0) { - r = -errno; - goto out; + + // overlay? + if (op != oend && op->first <= offset) { + uint64_t x_off = offset - op->first + op->second.value_offset; + uint64_t x_len = MIN(op->first + op->second.length - offset, length); + dout(20) << __func__ << " overlay " << op->first << " " << op->second + << " use " << x_off << "~" << x_len << dendl; + bufferlist v; + string key; + get_overlay_key(o->onode.nid, op->second.key, &key); + db->get(PREFIX_OVERLAY, key, &v); + bufferlist frag; + frag.substr_of(v, x_off, x_len); + bl.claim_append(frag); + ++op; + length -= x_len; + offset += x_len; + continue; } - bufferlist t; - r = t.read_fd(fd, x_len); - if (r < 0) { - goto out; + + unsigned x_len = length; + if (op != oend && + op->first > offset && + op->first - offset < x_len) { + x_len = op->first - offset; } - bl.claim_append(t); - if ((unsigned)r < x_len) { - dout(10) << __func__ << " short read " << r << " < " << x_len - << " from " << cur_fid << " offset " << p->second.offset + x_off + + // frag? 
+ if (fp != fend && fp->first <= offset) { + if (fp->second.fid != cur_fid) { + cur_fid = fp->second.fid; + if (fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + } + fd = _open_fid(cur_fid); + if (fd < 0) { + r = fd; + goto out; + } + } + uint64_t x_off = offset - fp->first - fp->second.offset; + x_len = MIN(x_len, fp->second.length - x_off); + dout(30) << __func__ << " data " << fp->first << " " << fp->second + << " use " << x_off << "~" << x_len + << " fid " << cur_fid << " offset " << x_off + fp->second.offset << dendl; - bufferptr z(x_len - r); - z.zero(); - bl.append(z); + r = ::lseek64(fd, x_off, SEEK_SET); + if (r < 0) { + r = -errno; + goto out; + } + bufferlist t; + r = t.read_fd(fd, x_len); + if (r < 0) { + goto out; + } + bl.claim_append(t); + if ((unsigned)r < x_len) { + dout(10) << __func__ << " short read " << r << " < " << x_len + << " from " << cur_fid << dendl; + bufferptr z(x_len - r); + z.zero(); + bl.append(z); + } + offset += x_len; + length -= x_len; + if (x_off + x_len == fp->second.length) { + ++fp; + } + continue; } - offset += x_len; - length -= x_len; - } - if (length > 0 && p == o->onode.data_map.end()) { - dout(30) << __func__ << " trailing zero " << offset << "~" << length << dendl; - bufferptr bp(length); + + // zero. 
+ dout(30) << __func__ << " zero " << offset << "~" << x_len << dendl; + bufferptr bp(x_len); bp.zero(); bl.push_back(bp); + offset += x_len; + length -= x_len; + continue; } r = bl.length(); + out: if (fd >= 0) { VOID_TEMP_FAILURE_RETRY(::close(fd)); @@ -2727,6 +2773,173 @@ int NewStore::_touch(TransContext *txc, return r; } +int NewStore::_do_overlay_clear(TransContext *txc, + OnodeRef o) +{ + dout(10) << __func__ << " " << o->oid << dendl; + + map::iterator p = o->onode.overlay_map.begin(); + while (p != o->onode.overlay_map.end()) { + dout(20) << __func__ << " rm " << p->first << " " << p->second << dendl; + string key; + get_overlay_key(o->onode.nid, p->first, &key); + txc->t->rmkey(PREFIX_OVERLAY, key); + o->onode.overlay_map.erase(p++); + } + o->onode.shared_overlays.clear(); + return 0; +} + +int NewStore::_do_overlay_trim(TransContext *txc, + OnodeRef o, + uint64_t offset, + uint64_t length) +{ + dout(10) << __func__ << " " << o->oid << " " + << offset << "~" << length << dendl; + + map::iterator p = o->onode.overlay_map.begin(); // fixme + if (p != o->onode.overlay_map.begin()) { + --p; + } + while (p != o->onode.overlay_map.end()) { + if (p->first >= offset + length) { + dout(20) << __func__ << " stop at " << p->first << " " << p->second + << dendl; + break; + } + if (p->first + p->second.length < offset) { + dout(20) << __func__ << " skip " << p->first << " " << p->second + << dendl; + ++p; + continue; + } + if (p->first >= offset && + p->first + p->second.length <= offset + length) { + dout(20) << __func__ << " rm " << p->first << " " << p->second + << dendl; + if (o->onode.shared_overlays.count(p->second.key) == 0) { + string key; + get_overlay_key(o->onode.nid, p->first, &key); + txc->t->rmkey(PREFIX_OVERLAY, key); + } + o->onode.overlay_map.erase(p++); + continue; + } + if (p->first >= offset) { + dout(20) << __func__ << " trim_front " << p->first << " " << p->second + << dendl; + overlay_t& ov = o->onode.overlay_map[offset + length] = p->second; + 
uint64_t by = offset + length - p->first; + ov.value_offset += by; + ov.length -= by; + o->onode.overlay_map.erase(p++); + continue; + } + if (p->first < offset && + p->first + p->second.length <= offset + length) { + dout(20) << __func__ << " trim_tail " << p->first << " " << p->second + << dendl; + p->second.length = offset - p->first; + ++p; + continue; + } + dout(20) << __func__ << " split " << p->first << " " << p->second + << dendl; + assert(p->first < offset); + assert(p->first + p->second.length > offset + length); + overlay_t& nov = o->onode.overlay_map[offset + length] = p->second; + p->second.length = offset - p->first; + uint64_t by = offset + length - p->first; + nov.value_offset += by; + nov.length -= by; + o->onode.shared_overlays.insert(p->second.key); + ++p; + } + return 0; +} + +int NewStore::_do_overlay_write(TransContext *txc, + OnodeRef o, + uint64_t offset, + uint64_t length, + const bufferlist& bl) +{ + _do_overlay_trim(txc, o, offset, length); + + dout(10) << __func__ << " " << o->oid << " " + << offset << "~" << length << dendl; + overlay_t& ov = o->onode.overlay_map[offset] = + overlay_t(++o->onode.last_overlay_key, 0, length); + dout(20) << __func__ << " added " << offset << " " << ov << dendl; + string key; + get_overlay_key(o->onode.nid, o->onode.last_overlay_key, &key); + txc->t->set(PREFIX_OVERLAY, key, bl); + return 0; +} + +int NewStore::_do_write_all_overlays(TransContext *txc, + OnodeRef o) +{ + if (o->onode.overlay_map.empty()) + return 0; + + // overwrite to new fid + if (o->onode.data_map.empty()) { + // create + fragment_t &f = o->onode.data_map[0]; + f.offset = 0; + f.length = o->onode.size; + int fd = _create_fid(txc, &f.fid); + if (fd < 0) { + return fd; + } + VOID_TEMP_FAILURE_RETRY(::close(fd)); + dout(20) << __func__ << " create " << f.fid << dendl; + } + + assert(o->onode.data_map.size() == 1); + fragment_t& f = o->onode.data_map.begin()->second; + assert(f.offset == 0); + assert(f.length == o->onode.size); + + for 
(map::iterator p = o->onode.overlay_map.begin(); + p != o->onode.overlay_map.end(); + ++p) { + dout(10) << __func__ << " overlay " << p->first + << "~" << p->second << dendl; + string key; + get_overlay_key(o->onode.nid, p->second.key, &key); + bufferlist bl; + db->get(PREFIX_OVERLAY, key, &bl); + + wal_op_t *op = _get_wal_op(txc); + op->op = wal_op_t::OP_WRITE; + op->offset = p->first; + op->length = p->second.length; + op->fid = f.fid; + op->data.substr_of(bl, p->second.value_offset, p->second.length); + + txc->t->rmkey(PREFIX_OVERLAY, key); + } + + // this may double delete something we did above, but that's less + // work than doing careful ref counting of the overlay key/value + // pairs. + for (set::iterator p = o->onode.shared_overlays.begin(); + p != o->onode.shared_overlays.end(); + ++p) { + dout(10) << __func__ << " shared overlay " << *p << dendl; + string key; + get_overlay_key(o->onode.nid, *p, &key); + txc->t->rmkey(PREFIX_OVERLAY, key); + } + + o->onode.overlay_map.clear(); + txc->write_onode(o); + return 0; +} + int NewStore::_do_write(TransContext *txc, OnodeRef o, uint64_t offset, uint64_t length, @@ -2749,6 +2962,7 @@ int NewStore::_do_write(TransContext *txc, if (o->onode.size == offset || o->onode.size == 0 || o->onode.data_map.empty()) { + _do_overlay_clear(txc, o); if (o->onode.data_map.empty()) { // create fragment_t &f = o->onode.data_map[0]; @@ -2794,6 +3008,8 @@ int NewStore::_do_write(TransContext *txc, assert(f.offset == 0); assert(f.length == o->onode.size); + _do_overlay_clear(txc, o); + wal_op_t *op = _get_wal_op(txc); op->op = wal_op_t::OP_REMOVE; op->fid = f.fid; @@ -2814,12 +3030,37 @@ int NewStore::_do_write(TransContext *txc, goto out; } txc->sync_fd(fd); + } else if ((int)o->onode.overlay_map.size() < g_conf->newstore_overlay_max && + (int)length < g_conf->newstore_overlay_max_length) { + // write an overlay + r = _do_overlay_write(txc, o, offset, length, bl); + if (r < 0) + goto out; + if (offset + length > o->onode.size) { + 
// make sure the data fragment matches + if (!o->onode.data_map.empty()) { + assert(o->onode.data_map.size() == 1); + fragment_t& f = o->onode.data_map.begin()->second; + assert(f.offset == 0); + assert(f.length == o->onode.size); + r = _clean_fid_tail(txc, f); + if (r < 0) + goto out; + f.length = offset + length; + } + dout(20) << __func__ << " extending size to " << offset + length << dendl; + o->onode.size = offset + length; + } + txc->write_onode(o); } else { // WAL assert(o->onode.data_map.size() == 1); fragment_t& f = o->onode.data_map.begin()->second; assert(f.offset == 0); assert(f.length == o->onode.size); + r = _do_write_all_overlays(txc, o); + if (r < 0) + goto out; r = _clean_fid_tail(txc, f); if (r < 0) goto out; @@ -2965,53 +3206,95 @@ int NewStore::_zero(TransContext *txc, int NewStore::_do_truncate(TransContext *txc, OnodeRef o, uint64_t offset) { - if (o->onode.data_map.empty()) { - o->onode.size = offset; - } else if (offset == 0) { - while (!o->onode.data_map.empty()) { + // trim down fragments + map::iterator fp = o->onode.data_map.end(); + if (fp != o->onode.data_map.begin()) + --fp; + while (fp != o->onode.data_map.end()) { + if (fp->first + fp->second.length <= offset) { + break; + } + if (fp->first >= offset) { + dout(20) << __func__ << " wal rm fragment " << fp->first << " " + << fp->second << dendl; wal_op_t *op = _get_wal_op(txc); op->op = wal_op_t::OP_REMOVE; - op->fid = o->onode.data_map.rbegin()->second.fid; - o->onode.data_map.erase(o->onode.data_map.rbegin()->first); + op->fid = fp->second.fid; + if (fp != o->onode.data_map.begin()) { + o->onode.data_map.erase(fp--); + continue; + } else { + o->onode.data_map.erase(fp); + break; + } + } else { + assert(fp->first + fp->second.length > offset); + assert(fp->first < offset); + uint64_t newlen = offset - fp->first; + dout(20) << __func__ << " wal truncate fragment " << fp->first << " " + << fp->second << " to " << newlen << dendl; + fragment_t& f = fp->second; + f.length = newlen; + 
wal_op_t *op = _get_wal_op(txc); + op->op = wal_op_t::OP_TRUNCATE; + op->offset = offset; + op->fid = f.fid; + break; } - } else if (offset < o->onode.size) { - assert(o->onode.data_map.size() == 1); - fragment_t& f = o->onode.data_map.begin()->second; - assert(f.offset == 0); - assert(f.length == o->onode.size); - f.length = offset; - wal_op_t *op = _get_wal_op(txc); - op->op = wal_op_t::OP_TRUNCATE; - op->offset = offset; - op->fid = f.fid; - assert(f.offset == 0); - } else if (offset > o->onode.size) { + } + + // truncate up trailing fragment? + if (!o->onode.data_map.empty() && offset > o->onode.size) { // resize file up. make sure we don't have trailing bytes assert(o->onode.data_map.size() == 1); fragment_t& f = o->onode.data_map.begin()->second; assert(f.offset == 0); assert(f.length == o->onode.size); + dout(20) << __func__ << " truncate up " << f << " to " << offset << dendl; int r = _clean_fid_tail(txc, f); if (r < 0) return r; - if (false) { // hmm don't bother!! - // truncate up. don't bother to fsync since it's all zeros. 
- int fd = _open_fid(f.fid); - if (fd < 0) { - return fd; + f.length = offset; + } + + // trim down overlays + map::iterator op = o->onode.overlay_map.end(); + if (op != o->onode.overlay_map.begin()) + --op; + while (op != o->onode.overlay_map.end()) { + if (op->first + op->second.length <= offset) { + break; + } + if (op->first >= offset) { + if (!o->onode.shared_overlays.count(op->second.key)) { + dout(20) << __func__ << " rm overlay " << op->first << " " + << op->second << dendl; + string key; + get_overlay_key(o->onode.nid, op->second.key, &key); + txc->t->rmkey(PREFIX_OVERLAY, key); + } else { + dout(20) << __func__ << " rm overlay " << op->first << " " + << op->second << " (shared)" << dendl; } - r = ::ftruncate(fd, offset); - if (r < 0) { - r = -errno; - derr << "error from ftruncate on " << f.fid << " to " << offset << ": " - << cpp_strerror(r) << dendl; - VOID_TEMP_FAILURE_RETRY(::close(fd)); - return r; + if (op != o->onode.overlay_map.begin()) { + o->onode.overlay_map.erase(op--); + continue; + } else { + o->onode.overlay_map.erase(op); + break; } - VOID_TEMP_FAILURE_RETRY(::close(fd)); + } else { + assert(op->first + op->second.length > offset); + assert(op->first < offset); + uint64_t newlen = offset - op->first; + dout(20) << __func__ << " truncate overlay " << op->first << " " + << op->second << " to " << newlen << dendl; + overlay_t& ov = op->second; + ov.length = newlen; + break; } - f.length = offset; } + o->onode.size = offset; txc->write_onode(o); return 0; diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index 137d1b6403693..eb548ecb75e22 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -619,6 +619,19 @@ class NewStore : public ObjectStore { uint64_t offset, size_t len, const bufferlist& bl, uint32_t fadvise_flags); + int _do_overlay_clear(TransContext *txc, + OnodeRef o); + int _do_overlay_trim(TransContext *txc, + OnodeRef o, + uint64_t offset, + uint64_t length); + int 
_do_overlay_write(TransContext *txc, + OnodeRef o, + uint64_t offset, + uint64_t length, + const bufferlist& bl); + int _do_write_all_overlays(TransContext *txc, + OnodeRef o); int _do_write(TransContext *txc, OnodeRef o, uint64_t offset, uint64_t length, diff --git a/src/os/newstore/newstore_types.cc b/src/os/newstore/newstore_types.cc index 53e0887b7615b..aa1a710532110 100644 --- a/src/os/newstore/newstore_types.cc +++ b/src/os/newstore/newstore_types.cc @@ -92,6 +92,52 @@ void fragment_t::generate_test_instances(list& o) o.push_back(new fragment_t(789, 1024, fid_t(3, 400))); } +ostream& operator<<(ostream& out, const fragment_t& f) +{ + out << "fragment(" << f.offset << "~" << f.length << " " << f.fid << ")"; + return out; +} + +// overlay_t + +void overlay_t::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + ::encode(key, bl); + ::encode(value_offset, bl); + ::encode(length, bl); + ENCODE_FINISH(bl); +} + +void overlay_t::decode(bufferlist::iterator& p) +{ + DECODE_START(1, p); + ::decode(key, p); + ::decode(value_offset, p); + ::decode(length, p); + DECODE_FINISH(p); +} + +void overlay_t::dump(Formatter *f) const +{ + f->dump_unsigned("key", key); + f->dump_unsigned("value_offset", value_offset); + f->dump_unsigned("length", length); +} + +void overlay_t::generate_test_instances(list& o) +{ + o.push_back(new overlay_t()); + o.push_back(new overlay_t(789, 1024, 1232232)); +} + +ostream& operator<<(ostream& out, const overlay_t& o) +{ + out << "overlay(" << o.value_offset << "~" << o.length + << " key " << o.key << ")"; + return out; +} + // onode_t void onode_t::encode(bufferlist& bl) const @@ -101,6 +147,9 @@ void onode_t::encode(bufferlist& bl) const ::encode(size, bl); ::encode(attrs, bl); ::encode(data_map, bl); + ::encode(overlay_map, bl); + ::encode(shared_overlays, bl); + ::encode(last_overlay_key, bl); ::encode(omap_head, bl); ::encode(expected_object_size, bl); ::encode(expected_write_size, bl); @@ -114,6 +163,9 @@ void 
onode_t::decode(bufferlist::iterator& p) ::decode(size, p); ::decode(attrs, p); ::decode(data_map, p); + ::decode(overlay_map, p); + ::decode(shared_overlays, p); + ::decode(last_overlay_key, p); ::decode(omap_head, p); ::decode(expected_object_size, p); ::decode(expected_write_size, p); @@ -141,6 +193,22 @@ void onode_t::dump(Formatter *f) const f->close_section(); } f->close_section(); + f->open_object_section("overlays"); + for (map::const_iterator p = overlay_map.begin(); + p != overlay_map.end(); ++p) { + f->open_object_section("overlay"); + f->dump_unsigned("offset", p->first); + p->second.dump(f); + f->close_section(); + } + f->close_section(); + f->open_array_section("shared_overlays"); + for (set::const_iterator p = shared_overlays.begin(); + p != shared_overlays.end(); ++p) { + f->dump_unsigned("offset", *p); + } + f->close_section(); + f->dump_unsigned("last_overlay_key", last_overlay_key); f->dump_unsigned("omap_head", omap_head); f->dump_unsigned("expected_object_size", expected_object_size); f->dump_unsigned("expected_write_size", expected_write_size); diff --git a/src/os/newstore/newstore_types.h b/src/os/newstore/newstore_types.h index 8411d6bb44e6f..636ef65a96c8d 100644 --- a/src/os/newstore/newstore_types.h +++ b/src/os/newstore/newstore_types.h @@ -88,12 +88,36 @@ struct fragment_t { }; WRITE_CLASS_ENCODER(fragment_t) +ostream& operator<<(ostream& out, const fragment_t& o); + +struct overlay_t { + uint64_t key; ///< key (offset of start of original k/v pair) + uint32_t value_offset; ///< offset in associated value for this extent + uint32_t length; + + overlay_t() : key(0), value_offset(0), length(0) {} + overlay_t(uint64_t k, uint32_t vo, uint32_t l) + : key(k), value_offset(vo), length(l) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list& o); + +}; +WRITE_CLASS_ENCODER(overlay_t) + +ostream& operator<<(ostream& out, const overlay_t& o); + 
/// onode: per-object metadata struct onode_t { uint64_t nid; ///< numeric id (locally unique) uint64_t size; ///< object size map attrs; ///< attrs map data_map; ///< data (offset to fragment mapping) + map overlay_map; ///< overlay data (stored in db) + set shared_overlays; ///< overlay keys that are shared + uint32_t last_overlay_key; ///< key for next overlay uint64_t omap_head; ///< id for omap root node uint32_t expected_object_size; @@ -102,6 +126,7 @@ struct onode_t { onode_t() : nid(0), size(0), + last_overlay_key(0), omap_head(0), expected_object_size(0), expected_write_size(0) {} From ef420baf1c89e711a3185bac235865605c7c3a27 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Tue, 7 Apr 2015 16:41:27 +0800 Subject: [PATCH 345/654] os/newstore: cap fid_max below newstore_max_dir_size Prevent fid_max over the max_dir_size when preallocation. Signed-off-by: Xiaoxi Chen --- src/os/newstore/NewStore.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index d23a391608afa..c226f81a87d3d 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -1870,8 +1870,8 @@ int NewStore::_create_fid(TransContext *txc, fid_t *fid) fid_last.fno < g_conf->newstore_max_dir_size) { ++fid_last.fno; if (fid_last.fno >= fid_max.fno) { - // raise fid_max, same fset - fid_max.fno += g_conf->newstore_fid_prealloc; + // raise fid_max, same fset, capping to max_dir_size + fid_max.fno = min(fid_max.fno + g_conf->newstore_fid_prealloc, g_conf->newstore_max_dir_size); assert(fid_max.fno >= fid_last.fno); bufferlist bl; ::encode(fid_max, bl); From 8f2c2bff30fbf17c8bc654aa2574ed29b07cdd98 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 7 Apr 2015 15:22:09 -0700 Subject: [PATCH 346/654] os/newstore: use fs abstaction layer Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 6 ++++++ src/os/newstore/NewStore.h | 2 ++ 2 files changed, 8 insertions(+) diff --git 
a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index c226f81a87d3d..d32cb987bba77 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -569,6 +569,7 @@ NewStore::NewStore(CephContext *cct, const string& path) : ObjectStore(path), cct(cct), db(NULL), + fs(NULL), path_fd(-1), fsid_fd(-1), frag_fd(-1), @@ -633,6 +634,9 @@ int NewStore::_open_path() << dendl; return r; } + assert(fs == NULL); + fs = FS::create(path_fd); + dout(1) << __func__ << " using fs driver '" << fs->get_name() << "'" << dendl; return 0; } @@ -640,6 +644,8 @@ void NewStore::_close_path() { VOID_TEMP_FAILURE_RETRY(::close(path_fd)); path_fd = -1; + delete fs; + fs = NULL; } int NewStore::_open_frag() diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index eb548ecb75e22..98eef602d567a 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -24,6 +24,7 @@ #include "common/RWLock.h" #include "common/WorkQueue.h" #include "os/ObjectStore.h" +#include "os/fs/FS.h" #include "os/KeyValueDB.h" #include "newstore_types.h" @@ -364,6 +365,7 @@ class NewStore : public ObjectStore { private: CephContext *cct; KeyValueDB *db; + FS *fs; uuid_d fsid; int path_fd; ///< open handle to $path int fsid_fd; ///< open handle (locked) to $path/fsid From 97bda73ebf0aa559803dd8d44dcd1fc84b550ef3 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 7 Apr 2015 15:24:16 -0700 Subject: [PATCH 347/654] os/newstore: open by handle Signed-off-by: Sage Weil --- src/common/config_opts.h | 1 + src/os/newstore/NewStore.cc | 32 +++++++++++++++++++++++--------- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 828d534a48c3b..abb7a7a000fd0 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -802,6 +802,7 @@ OPTION(newstore_fid_prealloc, OPT_INT, 1024) OPTION(newstore_nid_prealloc, OPT_INT, 1024) OPTION(newstore_overlay_max_length, OPT_INT, 65536) 
OPTION(newstore_overlay_max, OPT_INT, 32) +OPTION(newstore_open_by_handle, OPT_BOOL, true) OPTION(filestore_omap_backend, OPT_STR, "leveldb") diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index d32cb987bba77..9d8a2d6859afa 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -41,7 +41,6 @@ - DBObjectMap::clone lock ordering - HashIndex::get_path_contents_by_hash - HashIndex::list_by_hash - * open-by-handle * use work queue for wal fsyncs and kv record removals * avoid mtime updates when doing open-by-handle * abstract out fs specifics @@ -1854,6 +1853,18 @@ int NewStore::_recover_next_fid() int NewStore::_open_fid(fid_t fid) { + if (fid.handle.length() && g_conf->newstore_open_by_handle) { + int fd = fs->open_handle(path_fd, fid.handle, O_RDWR); + if (fd >= 0) { + dout(30) << __func__ << " " << fid << " = " << fd + << " (open by handle)" << dendl; + return fd; + } + int err = -errno; + dout(30) << __func__ << " " << fid << " = " << cpp_strerror(err) + << " (with open by handle, falling back to file name)" << dendl; + } + char fn[32]; snprintf(fn, sizeof(fn), "%u/%u", fid.fset, fid.fno); int fd = ::openat(frag_fd, fn, O_RDWR); @@ -1928,15 +1939,18 @@ int NewStore::_create_fid(TransContext *txc, fid_t *fid) return r; } -#if 0 - // store a handle, too - void *hp; - size_t hlen; - int r = fd_to_handle(fd, &hp, &hlen); - if (r >= 0) { - fid->handle = string((char *)hp, hlen); + if (g_conf->newstore_open_by_handle) { + int r = fs->get_handle(fd, &fid->handle); + if (r < 0) { + dout(30) << __func__ << " get_handle got " << cpp_strerror(r) << dendl; + } else { + dout(30) << __func__ << " got handle: "; + bufferlist bl; + bl.append(fid->handle); + bl.hexdump(*_dout); + *_dout << dendl; + } } -#endif dout(30) << __func__ << " " << *fid << " = " << fd << dendl; return fd; From 0981428123e4e5064f5618fe988d0b38dddb0af7 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Thu, 9 Apr 2015 00:13:10 +0800 Subject: [PATCH 348/654] 
os/Newstore:Change assert in get_onode db->get will return negtive when key is not found. Signed-off-by: Xiaoxi Chen --- src/os/newstore/NewStore.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 9d8a2d6859afa..a84e38a5caf00 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -537,8 +537,8 @@ NewStore::OnodeRef NewStore::Collection::get_onode( int r = store->db->get(PREFIX_OBJ, key, &v); dout(20) << " r " << r << " v.len " << v.length() << dendl; Onode *on; - assert(r >= 0); if (v.length() == 0) { + assert(r == -ENOENT); if (!create) return OnodeRef(); @@ -547,6 +547,7 @@ NewStore::OnodeRef NewStore::Collection::get_onode( on->dirty = true; } else { // loaded + assert(r >=0); on = new Onode(oid, key); bufferlist::iterator p = v.begin(); ::decode(on->onode, p); From 66aae982772fa2998a2435435cf74e6ecabfbd39 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 9 Apr 2015 09:16:03 -0700 Subject: [PATCH 349/654] os/newstore: use overlay even if it is a new object or append This avoids the fsync for small writes. 
Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 63 ++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index a84e38a5caf00..689415c571b27 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2980,6 +2980,33 @@ int NewStore::_do_write(TransContext *txc, dout(20) << __func__ << " zero-length write" << dendl; goto out; } + + if ((int)o->onode.overlay_map.size() < g_conf->newstore_overlay_max && + (int)length < g_conf->newstore_overlay_max_length) { + // write an overlay + r = _do_overlay_write(txc, o, offset, length, bl); + if (r < 0) + goto out; + if (offset + length > o->onode.size) { + // make sure the data fragment matches + if (!o->onode.data_map.empty()) { + assert(o->onode.data_map.size() == 1); + fragment_t& f = o->onode.data_map.begin()->second; + assert(f.offset == 0); + assert(f.length == o->onode.size); + r = _clean_fid_tail(txc, f); + if (r < 0) + goto out; + f.length = offset + length; + } + dout(20) << __func__ << " extending size to " << offset + length << dendl; + o->onode.size = offset + length; + } + txc->write_onode(o); + r = 0; + goto out; + } + if (o->onode.size == offset || o->onode.size == 0 || o->onode.data_map.empty()) { @@ -3021,8 +3048,12 @@ int NewStore::_do_write(TransContext *txc, goto out; } txc->sync_fd(fd); - } else if (offset == 0 && - length >= o->onode.size) { + r = 0; + goto out; + } + + if (offset == 0 && + length >= o->onode.size) { // overwrite to new fid assert(o->onode.data_map.size() == 1); fragment_t& f = o->onode.data_map.begin()->second; @@ -3051,29 +3082,11 @@ int NewStore::_do_write(TransContext *txc, goto out; } txc->sync_fd(fd); - } else if ((int)o->onode.overlay_map.size() < g_conf->newstore_overlay_max && - (int)length < g_conf->newstore_overlay_max_length) { - // write an overlay - r = _do_overlay_write(txc, o, offset, length, bl); - if (r < 0) - goto out; - if (offset + 
length > o->onode.size) { - // make sure the data fragment matches - if (!o->onode.data_map.empty()) { - assert(o->onode.data_map.size() == 1); - fragment_t& f = o->onode.data_map.begin()->second; - assert(f.offset == 0); - assert(f.length == o->onode.size); - r = _clean_fid_tail(txc, f); - if (r < 0) - goto out; - f.length = offset + length; - } - dout(20) << __func__ << " extending size to " << offset + length << dendl; - o->onode.size = offset + length; - } - txc->write_onode(o); - } else { + r = 0; + goto out; + } + + if (true) { // WAL assert(o->onode.data_map.size() == 1); fragment_t& f = o->onode.data_map.begin()->second; From f9a7fd4e4c503472866da5597824471bec766f20 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 9 Apr 2015 10:52:06 -0700 Subject: [PATCH 350/654] os/newstore: use lower_bound for finding overlay extents in map Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 689415c571b27..0fead936b2c25 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -1198,12 +1198,12 @@ int NewStore::_do_read( // loop over overlays and data fragments. overlays take precedence. 
fend = o->onode.data_map.end(); - fp = o->onode.data_map.begin(); // fixme + fp = o->onode.data_map.lower_bound(offset); if (fp != o->onode.data_map.begin()) { --fp; } oend = o->onode.overlay_map.end(); - op = o->onode.overlay_map.begin(); // fixme + op = o->onode.overlay_map.lower_bound(offset); if (op != o->onode.overlay_map.begin()) { --op; } @@ -2819,7 +2819,8 @@ int NewStore::_do_overlay_trim(TransContext *txc, dout(10) << __func__ << " " << o->oid << " " << offset << "~" << length << dendl; - map::iterator p = o->onode.overlay_map.begin(); // fixme + map::iterator p = + o->onode.overlay_map.lower_bound(offset); if (p != o->onode.overlay_map.begin()) { --p; } From ec21f578a7f2b3d3fe209eea52e1c04f33488bdf Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 10 Apr 2015 11:54:01 -0700 Subject: [PATCH 351/654] os/newstore: fix off-by-one on overlay_max_length Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 0fead936b2c25..8989e584bc71d 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2983,7 +2983,7 @@ int NewStore::_do_write(TransContext *txc, } if ((int)o->onode.overlay_map.size() < g_conf->newstore_overlay_max && - (int)length < g_conf->newstore_overlay_max_length) { + (int)length <= g_conf->newstore_overlay_max_length) { // write an overlay r = _do_overlay_write(txc, o, offset, length, bl); if (r < 0) From 86a3f7dd514f3798fc9c3e4b2839f1209b08bb58 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 10 Apr 2015 14:28:13 -0700 Subject: [PATCH 352/654] os/newstore: let wal cleanup kv txn get batched No need to trigger another sync kv commit here; just let the next KV commit catch it. We could possibly do a bit better here by not waking up the kv thread at all... 
Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 22 ++++++++++++++++++---- src/os/newstore/NewStore.h | 2 ++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 8989e584bc71d..daaee2183ac05 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2209,7 +2209,19 @@ void NewStore::_kv_sync_thread() dout(20) << __func__ << " committed " << kv_committing.size() << " in " << dur << dendl; while (!kv_committing.empty()) { - _txc_finish_kv(kv_committing.front()); + TransContext *txc = kv_committing.front(); + if (txc->state == TransContext::STATE_WAL_CLEANUP) { + txc->osr->qlock.Lock(); + txc->state = TransContext::STATE_FINISHING; + txc->osr->qlock.Unlock(); + _txc_finish_apply(txc); + } else if (txc->state == TransContext::STATE_KV_QUEUED) { + _txc_finish_kv(txc); + } else { + derr << __func__ << " unexpected txc state " << txc->get_state_name() + << dendl; + assert(0); + } kv_committing.pop_front(); } @@ -2246,13 +2258,15 @@ int NewStore::_apply_wal_transaction(TransContext *txc) get_wal_key(wt.seq, &key); KeyValueDB::Transaction cleanup = db->get_transaction(); cleanup->rmkey(PREFIX_WAL, key); - db->submit_transaction_sync(cleanup); txc->osr->qlock.Lock(); - txc->state = TransContext::STATE_FINISHING; + txc->state = TransContext::STATE_WAL_CLEANUP; txc->osr->qlock.Unlock(); - _txc_finish_apply(txc); + Mutex::Locker l(kv_lock); + db->submit_transaction(cleanup); + kv_queue.push_back(txc); + kv_cond.SignalOne(); return 0; } diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index 98eef602d567a..f2ba13745053a 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -168,6 +168,7 @@ class NewStore : public ObjectStore { STATE_KV_DONE, STATE_WAL_QUEUED, STATE_WAL_APPLYING, + STATE_WAL_CLEANUP, // remove wal kv record STATE_WAL_DONE, STATE_FINISHING, STATE_DONE, @@ -186,6 +187,7 @@ class NewStore : public ObjectStore { case 
STATE_KV_DONE: return "kv_done"; case STATE_WAL_QUEUED: return "wal_queued"; case STATE_WAL_APPLYING: return "wal_applying"; + case STATE_WAL_CLEANUP: return "wal_cleanup"; case STATE_WAL_DONE: return "wal_done"; case STATE_FINISHING: return "finishing"; case STATE_DONE: return "done"; From 93fa4f1e30bdeed54a7cef7557f4702e89b7c570 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 10 Apr 2015 15:29:16 -0700 Subject: [PATCH 353/654] os/newstore: do not call completions from kv thread Reads may call wait_wal() holding user locks, and so we cannot block progress on WAL completion/flushing by calling callbacks that may take user locks. Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 38 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index daaee2183ac05..925a6ec72e821 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2096,27 +2096,23 @@ void NewStore::_txc_finish_kv(TransContext *txc) txc->osr->qlock.Lock(); txc->state = TransContext::STATE_KV_DONE; - // loop in case we race with OpSequencer::flush_commit() - do { - txc->osr->qlock.Unlock(); - if (txc->onreadable_sync) { - txc->onreadable_sync->complete(0); - txc->onreadable_sync = NULL; - } - if (txc->onreadable) { - finisher.queue(txc->onreadable); - txc->onreadable = NULL; - } - if (txc->oncommit) { - txc->oncommit->complete(0); - txc->oncommit = NULL; - } - while (!txc->oncommits.empty()) { - txc->oncommits.front()->complete(0); - txc->oncommits.pop_front(); - } - txc->osr->qlock.Lock(); - } while (txc->oncommit || !txc->oncommits.empty()); + // warning: we're calling onreadable_sync inside the sequencer lock + if (txc->onreadable_sync) { + txc->onreadable_sync->complete(0); + txc->onreadable_sync = NULL; + } + if (txc->onreadable) { + finisher.queue(txc->onreadable); + txc->onreadable = NULL; + } + if (txc->oncommit) { + finisher.queue(txc->oncommit); + 
txc->oncommit = NULL; + } + while (!txc->oncommits.empty()) { + finisher.queue(txc->oncommits.front()); + txc->oncommits.pop_front(); + } if (txc->wal_txn) { dout(20) << __func__ << " starting wal apply" << dendl; From 48f639beec93f38dfd99565eb9a07f429e7b8e9a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 10 Apr 2015 16:33:00 -0700 Subject: [PATCH 354/654] os/newstore: drop unused FragmentHandle Signed-off-by: Sage Weil --- src/os/newstore/NewStore.h | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index f2ba13745053a..b8f83f4357f4f 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -36,23 +36,6 @@ class NewStore : public ObjectStore { // types public: - struct FragmentHandle { - int fd; - FragmentHandle() : fd(-1) {} - FragmentHandle(int f) : fd(f) {} - ~FragmentHandle() { - if (fd >= 0) - ::close(fd); - } - int fsync() { - return ::fsync(fd); - } - int fdatasync() { - return ::fdatasync(fd); - } - }; - typedef ceph::shared_ptr FragmentHandleRef; - class TransContext; /// an in-memory object From 5539a75efbb0ae686382c7d335072a835719b59e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 10 Apr 2015 16:35:40 -0700 Subject: [PATCH 355/654] os/newstore: pass flags to _{open,create}_fid Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 30 +++++++++++++++--------------- src/os/newstore/NewStore.h | 4 ++-- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 925a6ec72e821..81dd130b2920f 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -1254,7 +1254,7 @@ int NewStore::_do_read( if (fd >= 0) { VOID_TEMP_FAILURE_RETRY(::close(fd)); } - fd = _open_fid(cur_fid); + fd = _open_fid(cur_fid, O_RDONLY); if (fd < 0) { r = fd; goto out; @@ -1852,10 +1852,10 @@ int NewStore::_recover_next_fid() return 0; } -int NewStore::_open_fid(fid_t fid) +int 
NewStore::_open_fid(fid_t fid, unsigned flags) { if (fid.handle.length() && g_conf->newstore_open_by_handle) { - int fd = fs->open_handle(path_fd, fid.handle, O_RDWR); + int fd = fs->open_handle(path_fd, fid.handle, flags); if (fd >= 0) { dout(30) << __func__ << " " << fid << " = " << fd << " (open by handle)" << dendl; @@ -1868,7 +1868,7 @@ int NewStore::_open_fid(fid_t fid) char fn[32]; snprintf(fn, sizeof(fn), "%u/%u", fid.fset, fid.fno); - int fd = ::openat(frag_fd, fn, O_RDWR); + int fd = ::openat(frag_fd, fn, flags); if (fd < 0) { int r = -errno; derr << __func__ << " on " << fid << ": " << cpp_strerror(r) << dendl; @@ -1878,7 +1878,7 @@ int NewStore::_open_fid(fid_t fid) return fd; } -int NewStore::_create_fid(TransContext *txc, fid_t *fid) +int NewStore::_create_fid(TransContext *txc, fid_t *fid, unsigned flags) { { Mutex::Locker l(fid_lock); @@ -1932,7 +1932,7 @@ int NewStore::_create_fid(TransContext *txc, fid_t *fid) dout(10) << __func__ << " " << fid_last << dendl; char s[32]; snprintf(s, sizeof(s), "%u", fid->fno); - int fd = ::openat(fset_fd, s, O_RDWR|O_CREAT, 0644); + int fd = ::openat(fset_fd, s, flags | O_CREAT, 0644); if (fd < 0) { int r = -errno; derr << __func__ << " cannot create " << path << "/fragments/" @@ -2277,7 +2277,7 @@ int NewStore::_do_wal_transaction(wal_transaction_t& wt) { dout(20) << __func__ << " write " << p->fid << " " << p->offset << "~" << p->length << dendl; - int fd = _open_fid(p->fid); + int fd = _open_fid(p->fid, O_RDWR); if (fd < 0) return fd; int r = ::lseek64(fd, p->offset, SEEK_SET); @@ -2295,7 +2295,7 @@ int NewStore::_do_wal_transaction(wal_transaction_t& wt) { dout(20) << __func__ << " zero " << p->fid << " " << p->offset << "~" << p->length << dendl; - int fd = _open_fid(p->fid); + int fd = _open_fid(p->fid, O_RDWR); if (fd < 0) return fd; int r = ::lseek64(fd, p->offset, SEEK_SET); @@ -2318,7 +2318,7 @@ int NewStore::_do_wal_transaction(wal_transaction_t& wt) { dout(20) << __func__ << " truncate " << p->fid << " 
" << p->offset << dendl; - int fd = _open_fid(p->fid); + int fd = _open_fid(p->fid, O_RDWR); if (fd < 0) return fd; int r = ::ftruncate(fd, p->offset); @@ -2922,7 +2922,7 @@ int NewStore::_do_write_all_overlays(TransContext *txc, fragment_t &f = o->onode.data_map[0]; f.offset = 0; f.length = o->onode.size; - int fd = _create_fid(txc, &f.fid); + int fd = _create_fid(txc, &f.fid, O_RDWR); if (fd < 0) { return fd; } @@ -3027,7 +3027,7 @@ int NewStore::_do_write(TransContext *txc, fragment_t &f = o->onode.data_map[0]; f.offset = 0; f.length = MAX(offset + length, o->onode.size); - fd = _create_fid(txc, &f.fid); + fd = _create_fid(txc, &f.fid, O_RDWR); if (fd < 0) { r = fd; goto out; @@ -3039,7 +3039,7 @@ int NewStore::_do_write(TransContext *txc, // append (possibly with gap) assert(o->onode.data_map.size() == 1); fragment_t &f = o->onode.data_map.rbegin()->second; - fd = _open_fid(f.fid); + fd = _open_fid(f.fid, O_RDWR); if (fd < 0) { r = fd; goto out; @@ -3079,7 +3079,7 @@ int NewStore::_do_write(TransContext *txc, f.length = length; o->onode.size = length; - fd = _create_fid(txc, &f.fid); + fd = _create_fid(txc, &f.fid, O_RDWR); if (fd < 0) { r = fd; goto out; @@ -3132,7 +3132,7 @@ int NewStore::_do_write(TransContext *txc, int NewStore::_clean_fid_tail(TransContext *txc, const fragment_t& f) { - int fd = _open_fid(f.fid); + int fd = _open_fid(f.fid, O_RDWR); if (fd < 0) { return fd; } @@ -3216,7 +3216,7 @@ int NewStore::_zero(TransContext *txc, if (offset >= o->onode.size) { // after tail - int fd = _open_fid(f.fid); + int fd = _open_fid(f.fid, O_RDWR); if (fd < 0) { r = fd; goto out; diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index b8f83f4357f4f..381044da1f46a 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -417,8 +417,8 @@ class NewStore : public ObjectStore { void _reap_collections(); int _recover_next_fid(); - int _create_fid(TransContext *txc, fid_t *fid); - int _open_fid(fid_t fid); + int 
_create_fid(TransContext *txc, fid_t *fid, unsigned flags); + int _open_fid(fid_t fid, unsigned flags); int _remove_fid(fid_t fid); int _recover_next_nid(); From c67c9a2bee177183bdff1cc9ef56d13f438a2efe Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 10 Apr 2015 16:49:07 -0700 Subject: [PATCH 356/654] os/newstore: use O_DIRECT is write is page-aligned Signed-off-by: Sage Weil --- src/common/config_opts.h | 1 + src/os/newstore/NewStore.cc | 20 +++++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index abb7a7a000fd0..2c3d8ee52a6e0 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -803,6 +803,7 @@ OPTION(newstore_nid_prealloc, OPT_INT, 1024) OPTION(newstore_overlay_max_length, OPT_INT, 65536) OPTION(newstore_overlay_max, OPT_INT, 32) OPTION(newstore_open_by_handle, OPT_BOOL, true) +OPTION(newstore_o_direct, OPT_BOOL, true) OPTION(filestore_omap_backend, OPT_STR, "leveldb") diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 81dd130b2920f..8515eb5d0111d 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2277,7 +2277,14 @@ int NewStore::_do_wal_transaction(wal_transaction_t& wt) { dout(20) << __func__ << " write " << p->fid << " " << p->offset << "~" << p->length << dendl; - int fd = _open_fid(p->fid, O_RDWR); + unsigned flags = O_RDWR; + if (g_conf->newstore_o_direct && + (p->offset & ~CEPH_PAGE_MASK) == 0 && + (p->length & ~CEPH_PAGE_MASK) == 0) { + dout(20) << __func__ << " page-aligned, using O_DIRECT" << dendl; + flags |= O_DIRECT; + } + int fd = _open_fid(p->fid, flags); if (fd < 0) return fd; int r = ::lseek64(fd, p->offset, SEEK_SET); @@ -3022,12 +3029,19 @@ int NewStore::_do_write(TransContext *txc, o->onode.size == 0 || o->onode.data_map.empty()) { _do_overlay_clear(txc, o); + unsigned flags = O_RDWR; + if (g_conf->newstore_o_direct && + (offset & ~CEPH_PAGE_MASK) == 0 && + (length & ~CEPH_PAGE_MASK) 
== 0) { + dout(20) << __func__ << " page-aligned, using O_DIRECT" << dendl; + flags |= O_DIRECT; + } if (o->onode.data_map.empty()) { // create fragment_t &f = o->onode.data_map[0]; f.offset = 0; f.length = MAX(offset + length, o->onode.size); - fd = _create_fid(txc, &f.fid, O_RDWR); + fd = _create_fid(txc, &f.fid, flags); if (fd < 0) { r = fd; goto out; @@ -3039,7 +3053,7 @@ int NewStore::_do_write(TransContext *txc, // append (possibly with gap) assert(o->onode.data_map.size() == 1); fragment_t &f = o->onode.data_map.rbegin()->second; - fd = _open_fid(f.fid, O_RDWR); + fd = _open_fid(f.fid, flags); if (fd < 0) { r = fd; goto out; From 28bc4ee76eb31683565fe951cd7a78c4047e9339 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 10 Apr 2015 16:55:18 -0700 Subject: [PATCH 357/654] os/newstore: use FS::zero() Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 8515eb5d0111d..3b6b8e6bdb624 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2305,19 +2305,12 @@ int NewStore::_do_wal_transaction(wal_transaction_t& wt) int fd = _open_fid(p->fid, O_RDWR); if (fd < 0) return fd; - int r = ::lseek64(fd, p->offset, SEEK_SET); + int r = fs->zero(fd, p->offset, p->length); if (r < 0) { - r = -errno; - derr << __func__ << " lseek64 on " << fd << " got: " + derr << __func__ << " zero on " << fd << " got: " << cpp_strerror(r) << dendl; return r; } -#warning use hole punch ioctl to zero when available - bufferlist bl; - bufferptr bp(p->length); - bp.zero(); - bl.append(bp); - bl.write_fd(fd); sync_fds.push_back(fd); } break; From f93856f71a644dff03478c77d0513efdb69207d6 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 17 Aug 2015 15:22:26 -0400 Subject: [PATCH 358/654] os/newstore: drop sync_and_flush Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 9 ++------- src/os/newstore/NewStore.h | 1 - 2 
files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 3b6b8e6bdb624..c4b09f74d5403 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -975,7 +975,7 @@ int NewStore::umount() assert(mounted); dout(1) << __func__ << dendl; - sync_and_flush(); + sync(); _reap_collections(); dout(20) << __func__ << " stopping fsync_wq" << dendl; @@ -1003,17 +1003,12 @@ void NewStore::sync(Context *onsync) #warning write sync } -void NewStore::sync() -{ -#warning write sync -} - void NewStore::flush() { #warning write flush } -void NewStore::sync_and_flush() +void NewStore::sync() { dout(10) << __func__ << dendl; diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index 381044da1f46a..c93f189507b0c 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -473,7 +473,6 @@ class NewStore : public ObjectStore { void sync(Context *onsync); void sync(); void flush(); - void sync_and_flush(); unsigned get_max_object_name_length() { return 4096; From 205344d32d20299c2da2aa11401c714c00a0cd1c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 17 Aug 2015 15:22:42 -0400 Subject: [PATCH 359/654] os/newstore: drop flush Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 5 ----- src/os/newstore/NewStore.h | 1 - 2 files changed, 6 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index c4b09f74d5403..aec8e0d40d514 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -1003,11 +1003,6 @@ void NewStore::sync(Context *onsync) #warning write sync } -void NewStore::flush() -{ -#warning write flush -} - void NewStore::sync() { dout(10) << __func__ << dendl; diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index c93f189507b0c..a906e3d7fef28 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -472,7 +472,6 @@ class NewStore : public ObjectStore { void sync(Context 
*onsync); void sync(); - void flush(); unsigned get_max_object_name_length() { return 4096; From d57547f1035b9b41c07c6ec49b7c119a117e6bea Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 17 Aug 2015 15:23:03 -0400 Subject: [PATCH 360/654] os/newstore: drop sync() Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 4 ++-- src/os/newstore/NewStore.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index aec8e0d40d514..d73447ad0cdba 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -975,7 +975,7 @@ int NewStore::umount() assert(mounted); dout(1) << __func__ << dendl; - sync(); + _sync(); _reap_collections(); dout(20) << __func__ << " stopping fsync_wq" << dendl; @@ -1003,7 +1003,7 @@ void NewStore::sync(Context *onsync) #warning write sync } -void NewStore::sync() +void NewStore::_sync() { dout(10) << __func__ << dendl; diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index a906e3d7fef28..eb302b9aac692 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -469,9 +469,9 @@ class NewStore : public ObjectStore { int mount(); int umount(); + void _sync(); void sync(Context *onsync); - void sync(); unsigned get_max_object_name_length() { return 4096; From ca9bc6327d2cd122da722188bce99368c381bb8c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 17 Aug 2015 15:23:19 -0400 Subject: [PATCH 361/654] os/newstore: drop sync() Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 5 ----- src/os/newstore/NewStore.h | 2 -- 2 files changed, 7 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index d73447ad0cdba..0b253bfe918e2 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -998,11 +998,6 @@ int NewStore::umount() return 0; } -void NewStore::sync(Context *onsync) -{ -#warning write sync -} - void NewStore::_sync() { dout(10) << __func__ << dendl; diff --git 
a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index eb302b9aac692..231b5d5624aff 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -471,8 +471,6 @@ class NewStore : public ObjectStore { int umount(); void _sync(); - void sync(Context *onsync); - unsigned get_max_object_name_length() { return 4096; } From b595aac4e1245f4478332a3aa30300e7495e65d4 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Fri, 10 Apr 2015 09:39:44 +0800 Subject: [PATCH 362/654] test/store_test Add get_omap_iterator test cases omap iterator test cases include: iter aganist omap lower_bound upper_bound Signed-off-by: Xiaoxi Chen --- src/test/objectstore/store_test.cc | 75 ++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index 8ce487823446c..aa20a87be3870 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -2030,6 +2030,81 @@ TEST_P(StoreTest, OMapTest) { ASSERT_EQ(r, 0); } +TEST_P(StoreTest, OMapIterator) { + coll_t cid; + ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, 0, "")); + int count = 0; + int r; + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = store->apply_transaction(t); + ASSERT_EQ(r, 0); + } + + map attrs; + { + ObjectStore::Transaction t; + t.touch(cid, hoid); + t.omap_clear(cid, hoid); + map start_set; + t.omap_setkeys(cid, hoid, start_set); + store->apply_transaction(t); + } + ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(cid, hoid); + bool correct; + //basic iteration + for (int i = 0; i < 100; i++) { + if (!(i%5)) { + std::cout << "On iteration " << i << std::endl; + } + ObjectStore::Transaction t; + bufferlist bl; + iter = store->get_omap_iterator(cid, hoid); + + for (iter->seek_to_first(), count=0; iter->valid(); iter->next(), count++) { + string key = iter->key(); + bufferlist value = iter->value(); + correct = attrs.count(key) && (string(value.c_str()) == 
string(attrs[key].c_str())); + if (!correct) { + if (attrs.count(key) > 0) { + std::cout << "key " << key << "in omap , " << value.c_str() << " : " << attrs[key].c_str() << std::endl; + } + else + std::cout << "key " << key << "should not exists in omap" << std::endl; + } + ASSERT_EQ(correct, true); + } + ASSERT_EQ(attrs.size(), count); + + char buf[100]; + snprintf(buf, sizeof(buf), "%d", i); + bl.clear(); + bufferptr bp(buf, strlen(buf) + 1); + bl.append(bp); + map to_add; + to_add.insert(pair("key-" + string(buf), bl)); + attrs.insert(pair("key-" + string(buf), bl)); + t.omap_setkeys(cid, hoid, to_add); + store->apply_transaction(t); + } + //lower bound + string bound_key = "key-5"; + iter->lower_bound(bound_key); + correct = bound_key <= iter->key(); + if (!correct) { + std::cout << "lower bound, bound key is " << bound_key << " < iter key is " << iter->key() << std::endl; + } + ASSERT_EQ(correct, true); + //upper bound + iter->upper_bound(bound_key); + correct = iter->key() > bound_key; + if (!correct) { + std::cout << "upper bound, bound key is " << bound_key << " >= iter key is " << iter->key() << std::endl; + } + ASSERT_EQ(correct, true); +} + TEST_P(StoreTest, XattrTest) { coll_t cid; ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, 0, "")); From c86410239b54f430cf65166c877b7a49adebb548 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Sun, 12 Apr 2015 11:28:13 +0800 Subject: [PATCH 363/654] os/KeyValueDB: Add raw_key() interface for IteratorImpl raw_key() is useful to split out the prefix. 
Signed-off-by: Xiaoxi Chen --- src/os/KeyValueDB.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/os/KeyValueDB.h b/src/os/KeyValueDB.h index 028243667dbb1..e82151d53e7aa 100644 --- a/src/os/KeyValueDB.h +++ b/src/os/KeyValueDB.h @@ -160,6 +160,9 @@ class KeyValueDB { string key() { return generic_iter->key(); } + pair raw_key() { + return generic_iter->raw_key(); + } bufferlist value() { return generic_iter->value(); } From 5e9c64b4dd6b5fb9cab966a0e4ac7434804c3acc Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Wed, 8 Apr 2015 23:35:50 +0800 Subject: [PATCH 364/654] Implement get_omap_iterator implemented get_omap_iterator Signed-off-by: Xiaoxi Chen --- src/os/newstore/NewStore.cc | 43 ++++++++++++++++++++++++++----------- src/os/newstore/NewStore.h | 2 +- 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 0b253bfe918e2..078c33da6d2b7 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -1490,14 +1490,14 @@ int NewStore::collection_list( // omap reads -NewStore::OmapIteratorImpl::OmapIteratorImpl(CollectionRef c, OnodeRef o) - : c(c), o(o) +NewStore::OmapIteratorImpl::OmapIteratorImpl(CollectionRef c, OnodeRef o, KeyValueDB::Iterator it) + : c(c), o(o), it(it) { RWLock::RLocker l(c->lock); if (o->onode.omap_head) { get_omap_header(o->onode.omap_head, &head); get_omap_tail(o->onode.omap_head, &tail); - it->upper_bound(head); + it->lower_bound(head); } } @@ -1505,7 +1505,7 @@ int NewStore::OmapIteratorImpl::seek_to_first() { RWLock::RLocker l(c->lock); if (o->onode.omap_head) { - it->upper_bound(head); + it->lower_bound(head); } else { it = KeyValueDB::Iterator(); } @@ -1518,7 +1518,7 @@ int NewStore::OmapIteratorImpl::upper_bound(const string& after) if (o->onode.omap_head) { string key; get_omap_key(o->onode.omap_head, after, &key); - it->upper_bound(head); + it->upper_bound(key); } else { it = KeyValueDB::Iterator(); } @@ -1531,7 +1531,7 @@ int 
NewStore::OmapIteratorImpl::lower_bound(const string& to) if (o->onode.omap_head) { string key; get_omap_key(o->onode.omap_head, to, &key); - it->lower_bound(head); + it->lower_bound(key); } else { it = KeyValueDB::Iterator(); } @@ -1541,16 +1541,17 @@ int NewStore::OmapIteratorImpl::lower_bound(const string& to) bool NewStore::OmapIteratorImpl::valid() { RWLock::RLocker l(c->lock); - return it->valid(); + if (it->valid() && it->raw_key().second <= tail) { + return true; + } else { + return false; + } } int NewStore::OmapIteratorImpl::next() { RWLock::RLocker l(c->lock); it->next(); - if (!it->valid() || it->key() >= tail) { - it = KeyValueDB::Iterator(); - } return 0; } @@ -1558,7 +1559,10 @@ string NewStore::OmapIteratorImpl::key() { RWLock::RLocker l(c->lock); assert(it->valid()); - return it->key(); + string db_key = it->raw_key().second; + string user_key; + decode_omap_key(db_key, &user_key); + return user_key; } bufferlist NewStore::OmapIteratorImpl::value() @@ -1773,7 +1777,22 @@ ObjectMap::ObjectMapIterator NewStore::get_omap_iterator( const ghobject_t &oid ///< [in] object ) { - assert(0); + + dout(10) << __func__ << " " << cid << " " << oid << dendl; + CollectionRef c = _get_collection(cid); + if (!c) { + dout(10) << __func__ << " " << cid << "doesn't exist" <lock); + OnodeRef o = c->get_onode(oid, false); + if (!o) { + dout(10) << __func__ << " " << oid << "doesn't exist" < Date: Thu, 16 Apr 2015 00:10:08 +0800 Subject: [PATCH 367/654] os/Newstore: Check onode.omap_head in valid() and next() The db iter will be set to KeyValueDB::Iterator() if onode.omap_head not present. In that case if we touch the db iter we will get a segmentation fault. Prevent to touch the db iter when onode.omap_head is invalid(equals to 0). 
Signed-off-by: Xiaoxi Chen --- src/os/newstore/NewStore.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index ba8c72b1525be..4582774677326 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -1541,7 +1541,7 @@ int NewStore::OmapIteratorImpl::lower_bound(const string& to) bool NewStore::OmapIteratorImpl::valid() { RWLock::RLocker l(c->lock); - if (it->valid() && it->raw_key().second <= tail) { + if (o->onode.omap_head && it->valid() && it->raw_key().second <= tail) { return true; } else { return false; @@ -1551,8 +1551,12 @@ bool NewStore::OmapIteratorImpl::valid() int NewStore::OmapIteratorImpl::next() { RWLock::RLocker l(c->lock); - it->next(); - return 0; + if (o->onode.omap_head) { + it->next(); + return 0; + } else { + return -1; + } } string NewStore::OmapIteratorImpl::key() From 1321b880ccf95bed9f1f9eca9476a127cc73c95f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 15 Apr 2015 10:35:26 -0700 Subject: [PATCH 368/654] os/newstore: update todo Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 4582774677326..de5ff20d78721 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -43,7 +43,6 @@ - HashIndex::list_by_hash * use work queue for wal fsyncs and kv record removals * avoid mtime updates when doing open-by-handle - * abstract out fs specifics * fid xattr backpointer * kill collection_list_range * inline first fsync_item in TransContext to void allocation? From 04f55d8d18b081d94650f6561efdb4e143d8e1d0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 15 Apr 2015 15:35:16 -0700 Subject: [PATCH 369/654] os/newstore: use fdatasync instead of fsync On XFS at least, fdatasync is sufficient to make data readable. 
Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index de5ff20d78721..9959a6795b5d0 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2352,7 +2352,7 @@ int NewStore::_do_wal_transaction(wal_transaction_t& wt) for (vector::iterator p = sync_fds.begin(); p != sync_fds.end(); ++p) { - int r = ::fsync(*p); + int r = ::fdatasync(*p); assert(r == 0); VOID_TEMP_FAILURE_RETRY(::close(*p)); } @@ -2446,7 +2446,7 @@ int NewStore::queue_transactions( for (list::iterator p = txc->fds.begin(); p != txc->fds.end(); ++p) { dout(30) << __func__ << " fsync " << p->fd << dendl; - int r = ::fsync(p->fd); + int r = ::fdatasync(p->fd); if (r < 0) { r = -errno; derr << __func__ << " fsync: " << cpp_strerror(r) << dendl; From 552d95213b13a35374b27685358ac48336c49091 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 15 Apr 2015 17:10:48 -0700 Subject: [PATCH 370/654] ceph_test_objectstore: fix omap test cleanup Signed-off-by: Sage Weil --- src/test/objectstore/store_test.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index aa20a87be3870..5f056dcfc2b2e 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -2103,6 +2103,14 @@ TEST_P(StoreTest, OMapIterator) { std::cout << "upper bound, bound key is " << bound_key << " >= iter key is " << iter->key() << std::endl; } ASSERT_EQ(correct, true); + + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + r = store->apply_transaction(t); + ASSERT_EQ(r, 0); + } } TEST_P(StoreTest, XattrTest) { From dfd389e66abb572c8504651cc539bbbf48a6128a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 16 Apr 2015 14:08:55 -0700 Subject: [PATCH 371/654] os/newstore: rebuild buffers to be page-aligned for O_DIRECT Signed-off-by: Sage Weil --- 
src/os/newstore/NewStore.cc | 29 ++++++++++++++++++++--------- src/os/newstore/NewStore.h | 4 ++-- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 9959a6795b5d0..750f6dd6782d6 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2290,6 +2290,11 @@ int NewStore::_do_wal_transaction(wal_transaction_t& wt) (p->length & ~CEPH_PAGE_MASK) == 0) { dout(20) << __func__ << " page-aligned, using O_DIRECT" << dendl; flags |= O_DIRECT; + if (!p->data.is_page_aligned()) { + dout(20) << __func__ << " rebuilding buffer to be page-aligned" + << dendl; + p->data.rebuild(); + } } int fd = _open_fid(p->fid, flags); if (fd < 0) @@ -2982,11 +2987,12 @@ int NewStore::_do_write_all_overlays(TransContext *txc, int NewStore::_do_write(TransContext *txc, OnodeRef o, uint64_t offset, uint64_t length, - const bufferlist& bl, + bufferlist& bl, uint32_t fadvise_flags) { int fd = -1; int r = 0; + unsigned flags; dout(20) << __func__ << " have " << o->onode.size << " bytes in " << o->onode.data_map.size() @@ -3025,17 +3031,22 @@ int NewStore::_do_write(TransContext *txc, goto out; } + flags = O_RDWR; + if (g_conf->newstore_o_direct && + (offset & ~CEPH_PAGE_MASK) == 0 && + (length & ~CEPH_PAGE_MASK) == 0) { + dout(20) << __func__ << " page-aligned, can use O_DIRECT" << dendl; + flags |= O_DIRECT; + if (!bl.is_page_aligned()) { + dout(20) << __func__ << " rebuilding buffer to be page-aligned" << dendl; + bl.rebuild(); + } + } + if (o->onode.size <= offset || o->onode.size == 0 || o->onode.data_map.empty()) { _do_overlay_clear(txc, o); - unsigned flags = O_RDWR; - if (g_conf->newstore_o_direct && - (offset & ~CEPH_PAGE_MASK) == 0 && - (length & ~CEPH_PAGE_MASK) == 0) { - dout(20) << __func__ << " page-aligned, using O_DIRECT" << dendl; - flags |= O_DIRECT; - } if (o->onode.data_map.empty()) { // create fragment_t &f = o->onode.data_map[0]; @@ -3180,7 +3191,7 @@ int NewStore::_write(TransContext 
*txc, CollectionRef& c, const ghobject_t& oid, uint64_t offset, size_t length, - const bufferlist& bl, + bufferlist& bl, uint32_t fadvise_flags) { dout(15) << __func__ << " " << c->cid << " " << oid diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index 0fcd66e188ef4..ba12cc6e3ecdc 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -600,7 +600,7 @@ class NewStore : public ObjectStore { CollectionRef& c, const ghobject_t& oid, uint64_t offset, size_t len, - const bufferlist& bl, + bufferlist& bl, uint32_t fadvise_flags); int _do_overlay_clear(TransContext *txc, OnodeRef o); @@ -618,7 +618,7 @@ class NewStore : public ObjectStore { int _do_write(TransContext *txc, OnodeRef o, uint64_t offset, uint64_t length, - const bufferlist& bl, + bufferlist& bl, uint32_t fadvise_flags); int _touch(TransContext *txc, CollectionRef& c, From 7e1af1e616a4decdbb9bb634cd18b610cd0b5d4e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 16 Apr 2015 15:01:20 -0700 Subject: [PATCH 372/654] os/newstore: use a threadpool for applying wal events Signed-off-by: Sage Weil --- src/common/config_opts.h | 3 ++ src/os/newstore/NewStore.cc | 24 +++++----- src/os/newstore/NewStore.h | 88 ++++++++++++++++++++++++++++++++++++- 3 files changed, 103 insertions(+), 12 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 2c3d8ee52a6e0..d9198b92a7170 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -798,6 +798,9 @@ OPTION(newstore_sync_queue_transaction, OPT_BOOL, false) // perform write synch OPTION(newstore_fsync_threads, OPT_INT, 16) // num threads calling fsync OPTION(newstore_fsync_thread_timeout, OPT_INT, 30) // thread timeout value OPTION(newstore_fsync_thread_suicide_timeout, OPT_INT, 120) // suicide timeout value +OPTION(newstore_wal_threads, OPT_INT, 2) +OPTION(newstore_wal_thread_timeout, OPT_INT, 30) +OPTION(newstore_wal_thread_suicide_timeout, OPT_INT, 120) OPTION(newstore_fid_prealloc, OPT_INT, 
1024) OPTION(newstore_nid_prealloc, OPT_INT, 1024) OPTION(newstore_overlay_max_length, OPT_INT, 65536) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 750f6dd6782d6..53086f1f9359b 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -580,6 +580,14 @@ NewStore::NewStore(CephContext *cct, const string& path) nid_max(0), wal_lock("NewStore::wal_lock"), wal_seq(0), + wal_tp(cct, + "NewStore::wal_tp", + cct->_conf->newstore_wal_threads, + "newstore_wal_threads"), + wal_wq(this, + cct->_conf->newstore_wal_thread_timeout, + cct->_conf->newstore_wal_thread_suicide_timeout, + &wal_tp), finisher(cct), fsync_tp(cct, "NewStore::fsync_tp", @@ -953,6 +961,7 @@ int NewStore::mount() finisher.start(); fsync_tp.start(); + wal_tp.start(); kv_sync_thread.create(); mounted = true; @@ -981,6 +990,10 @@ int NewStore::umount() fsync_tp.stop(); dout(20) << __func__ << " stopping kv thread" << dendl; _kv_stop(); + dout(20) << __func__ << " draining wal_wq" << dendl; + wal_wq.drain(); + dout(20) << __func__ << " stopping wal_tp" << dendl; + wal_tp.stop(); dout(20) << __func__ << " draining finisher" << dendl; finisher.wait_for_empty(); dout(20) << __func__ << " stopping finisher" << dendl; @@ -2088,15 +2101,6 @@ void NewStore::_txc_submit_kv(TransContext *txc) kv_cond.SignalOne(); } -struct C_ApplyWAL : public Context { - NewStore *store; - NewStore::TransContext *txc; - C_ApplyWAL(NewStore *s, NewStore::TransContext *t) : store(s), txc(t) {} - void finish(int r) { - store->_apply_wal_transaction(txc); - } -}; - void NewStore::_txc_finish_kv(TransContext *txc) { dout(20) << __func__ << " txc " << txc << dendl; @@ -2125,7 +2129,7 @@ void NewStore::_txc_finish_kv(TransContext *txc) dout(20) << __func__ << " starting wal apply" << dendl; txc->state = TransContext::STATE_WAL_QUEUED; txc->osr->qlock.Unlock(); - finisher.queue(new C_ApplyWAL(this, txc)); + wal_wq.queue(txc); } else { txc->state = TransContext::STATE_FINISHING; 
txc->osr->qlock.Unlock(); diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index ba12cc6e3ecdc..d95452f37670c 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -190,6 +190,7 @@ class NewStore : public ObjectStore { list oncommits; ///< more commit completions list removed_collections; ///< colls we removed + boost::intrusive::list_member_hook<> wal_queue_item; wal_transaction_t *wal_txn; ///< wal transaction (if any) unsigned num_fsyncs_completed; @@ -237,7 +238,6 @@ class NewStore : public ObjectStore { } }; - class OpSequencer : public Sequencer_impl { public: Mutex qlock; @@ -250,11 +250,24 @@ class NewStore : public ObjectStore { &TransContext::sequencer_item> > q_list_t; q_list_t q; ///< transactions + typedef boost::intrusive::list< + TransContext, + boost::intrusive::member_hook< + TransContext, + boost::intrusive::list_member_hook<>, + &TransContext::wal_queue_item> > wal_queue_t; + wal_queue_t wal_q; ///< transactions + + boost::intrusive::list_member_hook<> wal_osr_queue_item; + Sequencer *parent; + Mutex wal_apply_lock; + OpSequencer() : qlock("NewStore::OpSequencer::qlock", false, false), - parent(NULL) { + parent(NULL), + wal_apply_lock("NewStore::OpSequencer::wal_apply_lock") { } ~OpSequencer() { assert(q.empty()); @@ -336,6 +349,75 @@ class NewStore : public ObjectStore { } }; + class WALWQ : public ThreadPool::WorkQueue { + // We need to order WAL items within each Sequencer. To do that, + // queue each txc under osr, and queue the osr's here. When we + // dequeue an txc, requeue the osr if there are more pending, and + // do it at the end of the list so that the next thread does not + // get a conflicted txc. Hold an osr mutex while doing the wal to + // preserve the ordering. 
+ public: + typedef boost::intrusive::list< + OpSequencer, + boost::intrusive::member_hook< + OpSequencer, + boost::intrusive::list_member_hook<>, + &OpSequencer::wal_osr_queue_item> > wal_osr_queue_t; + + private: + NewStore *store; + wal_osr_queue_t wal_queue; + + public: + WALWQ(NewStore *s, time_t ti, time_t sti, ThreadPool *tp) + : ThreadPool::WorkQueue("NewStore::WALWQ", ti, sti, tp), + store(s) { + } + bool _empty() { + return wal_queue.empty(); + } + bool _enqueue(TransContext *i) { + if (i->osr->wal_q.empty()) { + wal_queue.push_back(*i->osr); + } + i->osr->wal_q.push_back(*i); + return true; + } + void _dequeue(TransContext *p) { + assert(0 == "not needed, not implemented"); + } + TransContext *_dequeue() { + if (wal_queue.empty()) + return NULL; + OpSequencer *osr = &wal_queue.front(); + TransContext *i = &osr->wal_q.front(); + osr->wal_q.pop_front(); + wal_queue.pop_front(); + if (!osr->wal_q.empty()) { + // requeue at the end to minimize contention + wal_queue.push_back(*i->osr); + } + return i; + } + void _process(TransContext *i, ThreadPool::TPHandle &handle) { + // preserve wal ordering for this sequencer + Mutex::Locker l(i->osr->wal_apply_lock); + store->_apply_wal_transaction(i); + } + void _clear() { + assert(wal_queue.empty()); + } + + void flush() { + lock(); + while (!wal_queue.empty()) { + _wait(); + } + unlock(); + drain(); + } + }; + struct KVSyncThread : public Thread { NewStore *store; KVSyncThread(NewStore *s) : store(s) {} @@ -371,6 +453,8 @@ class NewStore : public ObjectStore { Mutex wal_lock; atomic64_t wal_seq; + ThreadPool wal_tp; + WALWQ wal_wq; Finisher finisher; ThreadPool fsync_tp; From efe218b4aaceca88ffe9404a248c6994f16e3f82 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 16 Apr 2015 16:01:12 -0700 Subject: [PATCH 373/654] os/newstore: show # o_direct buffers in debug output Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 53086f1f9359b..a3a873d8624e5 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2292,7 +2292,8 @@ int NewStore::_do_wal_transaction(wal_transaction_t& wt) if (g_conf->newstore_o_direct && (p->offset & ~CEPH_PAGE_MASK) == 0 && (p->length & ~CEPH_PAGE_MASK) == 0) { - dout(20) << __func__ << " page-aligned, using O_DIRECT" << dendl; + dout(20) << __func__ << " page-aligned io, using O_DIRECT, " + << p->data.buffers().size() << " buffers" << dendl; flags |= O_DIRECT; if (!p->data.is_page_aligned()) { dout(20) << __func__ << " rebuilding buffer to be page-aligned" @@ -3039,7 +3040,8 @@ int NewStore::_do_write(TransContext *txc, if (g_conf->newstore_o_direct && (offset & ~CEPH_PAGE_MASK) == 0 && (length & ~CEPH_PAGE_MASK) == 0) { - dout(20) << __func__ << " page-aligned, can use O_DIRECT" << dendl; + dout(20) << __func__ << " page-aligned, can use O_DIRECT, " + << bl.buffers().size() << " buffers" << dendl; flags |= O_DIRECT; if (!bl.is_page_aligned()) { dout(20) << __func__ << " rebuilding buffer to be page-aligned" << dendl; From 143d48570fc000e69c6afb2ff9857e819d2cd9b8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 16 Apr 2015 16:30:31 -0700 Subject: [PATCH 374/654] os/newstore: throttle wal work Signed-off-by: Sage Weil --- src/common/WorkQueue.h | 4 ++++ src/common/config_opts.h | 4 +++- src/os/newstore/NewStore.cc | 4 ++++ src/os/newstore/NewStore.h | 22 ++++++++++++++++++++-- src/os/newstore/newstore_types.h | 14 ++++++++++++++ 5 files changed, 45 insertions(+), 3 deletions(-) diff --git a/src/common/WorkQueue.h b/src/common/WorkQueue.h index cf78b2dc30294..f0754de8e1992 100644 --- a/src/common/WorkQueue.h +++ b/src/common/WorkQueue.h @@ -322,6 +322,10 @@ class ThreadPool : public md_config_obs_t { pool->_lock.Unlock(); } + Mutex &get_lock() { + return pool->_lock; + } + void lock() { pool->lock(); } diff --git a/src/common/config_opts.h b/src/common/config_opts.h 
index d9198b92a7170..bfe7d0df6650c 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -798,9 +798,11 @@ OPTION(newstore_sync_queue_transaction, OPT_BOOL, false) // perform write synch OPTION(newstore_fsync_threads, OPT_INT, 16) // num threads calling fsync OPTION(newstore_fsync_thread_timeout, OPT_INT, 30) // thread timeout value OPTION(newstore_fsync_thread_suicide_timeout, OPT_INT, 120) // suicide timeout value -OPTION(newstore_wal_threads, OPT_INT, 2) +OPTION(newstore_wal_threads, OPT_INT, 4) OPTION(newstore_wal_thread_timeout, OPT_INT, 30) OPTION(newstore_wal_thread_suicide_timeout, OPT_INT, 120) +OPTION(newstore_wal_max_ops, OPT_U64, 64) +OPTION(newstore_wal_max_bytes, OPT_U64, 64*1024*1024) OPTION(newstore_fid_prealloc, OPT_INT, 1024) OPTION(newstore_nid_prealloc, OPT_INT, 1024) OPTION(newstore_overlay_max_length, OPT_INT, 65536) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index a3a873d8624e5..d4691f8b7f705 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2419,6 +2419,10 @@ int NewStore::queue_transactions( tls, &onreadable, &ondisk, &onreadable_sync); int r; + // throttle wal work + wal_wq.throttle(g_conf->newstore_wal_max_ops, + g_conf->newstore_wal_max_bytes); + // set up the sequencer OpSequencer *osr; if (!posr) diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index d95452f37670c..2563ba3d5bf4a 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -367,6 +367,8 @@ class NewStore : public ObjectStore { private: NewStore *store; wal_osr_queue_t wal_queue; + uint64_t ops, bytes; + Cond throttle_cond; public: WALWQ(NewStore *s, time_t ti, time_t sti, ThreadPool *tp) @@ -381,6 +383,8 @@ class NewStore : public ObjectStore { wal_queue.push_back(*i->osr); } i->osr->wal_q.push_back(*i); + ++ops; + bytes += i->wal_txn->get_bytes(); return true; } void _dequeue(TransContext *p) { @@ -397,12 +401,18 @@ class NewStore : public ObjectStore { // 
requeue at the end to minimize contention wal_queue.push_back(*i->osr); } + --ops; + bytes -= i->wal_txn->get_bytes(); + throttle_cond.Signal(); + + // preserve wal ordering for this sequencer by taking the lock + // while still holding the queue lock + i->osr->wal_apply_lock.Lock(); return i; } void _process(TransContext *i, ThreadPool::TPHandle &handle) { - // preserve wal ordering for this sequencer - Mutex::Locker l(i->osr->wal_apply_lock); store->_apply_wal_transaction(i); + i->osr->wal_apply_lock.Unlock(); } void _clear() { assert(wal_queue.empty()); @@ -416,6 +426,14 @@ class NewStore : public ObjectStore { unlock(); drain(); } + + void throttle(uint64_t max_ops, uint64_t max_bytes) { + Mutex& lock = get_lock(); + Mutex::Locker l(lock); + while (ops > max_ops || bytes > max_bytes) { + throttle_cond.Wait(lock); + } + } }; struct KVSyncThread : public Thread { diff --git a/src/os/newstore/newstore_types.h b/src/os/newstore/newstore_types.h index 636ef65a96c8d..286fc773e6791 100644 --- a/src/os/newstore/newstore_types.h +++ b/src/os/newstore/newstore_types.h @@ -165,6 +165,20 @@ struct wal_transaction_t { uint64_t seq; list ops; + int64_t _bytes; ///< cached byte count + + wal_transaction_t() : _bytes(-1) {} + + uint64_t get_bytes() { + if (_bytes < 0) { + _bytes = 0; + for (list::iterator p = ops.begin(); p != ops.end(); ++p) { + _bytes += p->length; + } + } + return _bytes; + } + void encode(bufferlist& bl) const; void decode(bufferlist::iterator& p); void dump(Formatter *f) const; From ba0d8d7fddd46ec7ef18336e9f7ce4c9b6c60579 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 20 Apr 2015 10:33:14 -0700 Subject: [PATCH 375/654] os/Newstore: add newstore_db_path option The load of Keyvalue DB is heavy, allow user to put DB to a seperate(fast) device. 
Signed-off-by: Xiaoxi Chen --- src/common/config_opts.h | 1 + src/os/newstore/NewStore.cc | 6 ++++++ src/os/newstore/NewStore.h | 1 + 3 files changed, 8 insertions(+) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index bfe7d0df6650c..ac913c2b0a68c 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -809,6 +809,7 @@ OPTION(newstore_overlay_max_length, OPT_INT, 65536) OPTION(newstore_overlay_max, OPT_INT, 32) OPTION(newstore_open_by_handle, OPT_BOOL, true) OPTION(newstore_o_direct, OPT_BOOL, true) +OPTION(newstore_db_path, OPT_STR, "") OPTION(filestore_omap_backend, OPT_STR, "leveldb") diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index d4691f8b7f705..9db13d368c8fb 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -569,6 +569,7 @@ NewStore::NewStore(CephContext *cct, const string& path) cct(cct), db(NULL), fs(NULL), + db_path(cct->_conf->newstore_db_path), path_fd(-1), fsid_fd(-1), frag_fd(-1), @@ -895,6 +896,11 @@ int NewStore::mkfs() if (r < 0) goto out_close_fsid; + if (db_path != "") { + r = symlinkat(db_path.c_str(), path_fd, "db"); + if (r < 0) + goto out_close_frag; + } r = _open_db(); if (r < 0) goto out_close_frag; diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index 2563ba3d5bf4a..35b73770d63fc 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -452,6 +452,7 @@ class NewStore : public ObjectStore { KeyValueDB *db; FS *fs; uuid_d fsid; + string db_path; int path_fd; ///< open handle to $path int fsid_fd; ///< open handle (locked) to $path/fsid int frag_fd; ///< open handle to $path/fragments From b7a53b58741deff5d10b92f719fc42a351d4d0f4 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 20 Apr 2015 10:34:04 -0700 Subject: [PATCH 376/654] os/newstore: basic aio support Signed-off-by: Sage Weil --- src/common/config_opts.h | 3 + src/os/fs/FS.h | 83 +++++++++++++++++++++ src/os/newstore/NewStore.cc | 145 
++++++++++++++++++++++++++++++++---- src/os/newstore/NewStore.h | 32 +++++++- 4 files changed, 248 insertions(+), 15 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index ac913c2b0a68c..8d26707c79b36 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -810,6 +810,9 @@ OPTION(newstore_overlay_max, OPT_INT, 32) OPTION(newstore_open_by_handle, OPT_BOOL, true) OPTION(newstore_o_direct, OPT_BOOL, true) OPTION(newstore_db_path, OPT_STR, "") +OPTION(newstore_aio, OPT_BOOL, true) +OPTION(newstore_aio_poll_ms, OPT_INT, 250) // milliseconds +OPTION(newstore_aio_max_queue_depth, OPT_INT, 64) OPTION(filestore_omap_backend, OPT_STR, "leveldb") diff --git a/src/os/fs/FS.h b/src/os/fs/FS.h index a9d8100fafcc8..51c6363f13fcd 100644 --- a/src/os/fs/FS.h +++ b/src/os/fs/FS.h @@ -15,9 +15,19 @@ #ifndef CEPH_OS_FS_H #define CEPH_OS_FS_H +#include +#include + +#include "acconfig.h" +#ifdef HAVE_LIBAIO +# include +#endif + #include #include "include/types.h" +#include "common/Mutex.h" +#include "common/Cond.h" class FS { public: @@ -39,6 +49,79 @@ class FS { int from_fd, uint64_t from_offset, uint64_t from_len); virtual int zero(int fd, uint64_t offset, uint64_t length); + + // -- aio -- + + struct aio_t { + struct iocb iocb; // must be first element; see shenanigans in aio_queue_t + void *priv; + int fd; + vector iov; + + aio_t(void *p, int f) : priv(p), fd(f) { + memset(&iocb, 0, sizeof(iocb)); + } + + void pwritev(uint64_t offset) { + io_prep_pwritev(&iocb, fd, &iov[0], iov.size(), offset); + } + }; + + struct aio_queue_t { + int max_iodepth; + io_context_t ctx; + + aio_queue_t(unsigned max_iodepth = 8) + : max_iodepth(max_iodepth), + ctx(0) { + } + ~aio_queue_t() { + assert(ctx == 0); + } + + int init() { + assert(ctx == 0); + return io_setup(max_iodepth, &ctx); + } + void shutdown() { + if (ctx) { + int r = io_destroy(ctx); + assert(r == 0); + ctx = 0; + } + } + + int submit(aio_t &aio) { + int attempts = 10; + iocb *piocb = 
&aio.iocb; + do { + int r = io_submit(ctx, 1, &piocb); + if (r < 0) { + if (r == -EAGAIN && attempts-- > 0) { + usleep(500); + continue; + } + return r; + } + } while (false); + return 0; + } + + int get_next_completed(int timeout_ms, aio_t **paio) { + io_event event[1]; + struct timespec t = { + timeout_ms / 1000, + (timeout_ms % 1000) * 1000 * 1000 + }; + int r = io_getevents(ctx, 1, 1, event, &t); + if (r <= 0) { + return r; + } + *paio = (aio_t *)event[0].obj; + return 1; + } + }; + }; #endif diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 9db13d368c8fb..7de2092308d91 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -598,6 +598,9 @@ NewStore::NewStore(CephContext *cct, const string& path) cct->_conf->newstore_fsync_thread_timeout, cct->_conf->newstore_fsync_thread_suicide_timeout, &fsync_tp), + aio_thread(this), + aio_stop(false), + aio_queue(cct->_conf->newstore_aio_max_queue_depth), kv_sync_thread(this), kv_lock("NewStore::kv_lock"), kv_stop(false), @@ -830,6 +833,29 @@ void NewStore::_close_db() db = NULL; } +int NewStore::_aio_start() +{ + if (g_conf->newstore_aio) { + dout(10) << __func__ << dendl; + int r = aio_queue.init(); + if (r < 0) + return r; + aio_thread.create(); + } + return 0; +} + +void NewStore::_aio_stop() +{ + if (g_conf->newstore_aio) { + dout(10) << __func__ << dendl; + aio_stop = true; + aio_thread.join(); + aio_stop = false; + aio_queue.shutdown(); + } +} + int NewStore::_open_collections() { KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL); @@ -961,10 +987,14 @@ int NewStore::mount() if (r < 0) goto out_db; - r = _replay_wal(); + r = _aio_start(); if (r < 0) goto out_db; + r = _replay_wal(); + if (r < 0) + goto out_aio; + finisher.start(); fsync_tp.start(); wal_tp.start(); @@ -973,6 +1003,8 @@ int NewStore::mount() mounted = true; return 0; + out_aio: + _aio_stop(); out_db: _close_db(); out_frag: @@ -994,6 +1026,8 @@ int NewStore::umount() dout(20) << __func__ << " 
stopping fsync_wq" << dendl; fsync_tp.stop(); + dout(20) << __func__ << " stopping aio" << dendl; + _aio_stop(); dout(20) << __func__ << " stopping kv thread" << dendl; _kv_stop(); dout(20) << __func__ << " draining wal_wq" << dendl; @@ -2198,6 +2232,34 @@ void NewStore::_osr_reap_done(OpSequencer *osr) } } +void NewStore::_aio_thread() +{ + dout(10) << __func__ << " start" << dendl; + while (!aio_stop) { + dout(40) << __func__ << " polling" << dendl; + FS::aio_t *aio; + int r = aio_queue.get_next_completed(g_conf->newstore_aio_poll_ms, &aio); + if (r < 0) { + derr << __func__ << " got " << cpp_strerror(r) << dendl; + } + if (r == 1) { + TransContext *txc = static_cast(aio->priv); + int left = txc->num_aio.dec(); + dout(10) << __func__ << " finished aio on " << txc << ", " + << left << " left" << dendl; + if (left == 0) { + txc->state = TransContext::STATE_AIO_DONE; + if (!txc->fds.empty()) { + _txc_queue_fsync(txc); + } else { + _txc_finish_fsync(txc); + } + } + } + } + dout(10) << __func__ << " end" << dendl; +} + void NewStore::_kv_sync_thread() { dout(10) << __func__ << " start" << dendl; @@ -2317,7 +2379,12 @@ int NewStore::_do_wal_transaction(wal_transaction_t& wt) << cpp_strerror(r) << dendl; return r; } - p->data.write_fd(fd); + r = p->data.write_fd(fd); + if (r < 0) { + derr << __func__ << " write_fd on " << fd << " got: " + << cpp_strerror(r) << dendl; + return r; + } sync_fds.push_back(fd); } break; @@ -2481,7 +2548,27 @@ int NewStore::queue_transactions( _txc_finish_kv(txc); } else { // async path - if (!txc->fds.empty()) { + if (!txc->aios.empty()) { + txc->state = TransContext::STATE_AIO_QUEUED; + dout(20) << __func__ << " submitting " << txc->num_aio.read() << " aios" + << dendl; + for (list::iterator p = txc->aios.begin(); + p != txc->aios.end(); + ++p) { + FS::aio_t& aio = *p; + dout(20) << __func__ << " submitting aio " << &aio << dendl; + for (vector::iterator q = aio.iov.begin(); q != aio.iov.end(); ++q) + dout(30) << __func__ << " iov " << 
(void*)q->iov_base + << " len " << q->iov_len << dendl; + dout(30) << " fd " << aio.fd << " offset " << lseek64(aio.fd, 0, SEEK_CUR) + << dendl; + int r = aio_queue.submit(*p); + if (r) { + derr << " aio submit got " << cpp_strerror(r) << dendl; + assert(r == 0); + } + } + } else if (!txc->fds.empty()) { _txc_queue_fsync(txc); } else { _txc_finish_fsync(txc); @@ -3063,6 +3150,7 @@ int NewStore::_do_write(TransContext *txc, o->onode.size == 0 || o->onode.data_map.empty()) { _do_overlay_clear(txc, o); + uint64_t x_offset; if (o->onode.data_map.empty()) { // create fragment_t &f = o->onode.data_map[0]; @@ -3073,7 +3161,7 @@ int NewStore::_do_write(TransContext *txc, r = fd; goto out; } - ::lseek64(fd, offset, SEEK_SET); + x_offset = offset; dout(20) << __func__ << " create " << f.fid << " writing " << offset << "~" << length << dendl; } else { @@ -3087,17 +3175,32 @@ int NewStore::_do_write(TransContext *txc, } ::ftruncate(fd, f.length); // in case there is trailing crap f.length = (offset + length) - f.offset; - ::lseek64(fd, offset - f.offset, SEEK_SET); + x_offset = offset - f.offset; dout(20) << __func__ << " append " << f.fid << " writing " << (offset - f.offset) << "~" << length << dendl; } if (offset + length > o->onode.size) { o->onode.size = offset + length; } - r = bl.write_fd(fd); - if (r < 0) { - derr << __func__ << " bl.write_fd error: " << cpp_strerror(r) << dendl; - goto out; +#ifdef HAVE_LIBAIO + if (g_conf->newstore_aio && (flags & O_DIRECT)) { + txc->aios.push_back(FS::aio_t(txc, fd)); + txc->num_aio.inc(); + FS::aio_t& aio = txc->aios.back(); + bl.prepare_iov(&aio.iov); + txc->aio_bl.append(bl); + aio.pwritev(x_offset); + + dout(2) << __func__ << " prepared aio " << &aio << dendl; + } else +#endif + { + ::lseek64(fd, x_offset, SEEK_SET); + r = bl.write_fd(fd); + if (r < 0) { + derr << __func__ << " bl.write_fd error: " << cpp_strerror(r) << dendl; + goto out; + } } txc->sync_fd(fd); r = 0; @@ -3128,12 +3231,26 @@ int NewStore::_do_write(TransContext 
*txc, dout(20) << __func__ << " replace old fid " << op->fid << " with new fid " << f.fid << ", writing " << offset << "~" << length << dendl; - r = bl.write_fd(fd); - if (r < 0) { - derr << __func__ << " bl.write_fd error: " << cpp_strerror(r) << dendl; - goto out; + +#ifdef HAVE_LIBAIO + if (g_conf->newstore_aio && (flags & O_DIRECT)) { + txc->aios.push_back(FS::aio_t(txc, fd)); + txc->num_aio.inc(); + FS::aio_t& aio = txc->aios.back(); + bl.prepare_iov(&aio.iov); + txc->aio_bl.append(bl); + aio.pwritev(0); + dout(2) << __func__ << " prepared aio " << &aio << dendl; + } else +#endif + { + r = bl.write_fd(fd); + if (r < 0) { + derr << __func__ << " bl.write_fd error: " << cpp_strerror(r) << dendl; + goto out; + } + txc->sync_fd(fd); } - txc->sync_fd(fd); r = 0; goto out; } diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index 35b73770d63fc..f96f85270c594 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -15,6 +15,8 @@ #ifndef CEPH_OSD_NEWSTORE_H #define CEPH_OSD_NEWSTORE_H +#include "acconfig.h" + #include #include "include/assert.h" @@ -143,6 +145,8 @@ class NewStore : public ObjectStore { struct TransContext { typedef enum { STATE_PREPARE, + STATE_AIO_QUEUED, + STATE_AIO_DONE, STATE_FSYNC_QUEUED, STATE_FSYNC_FSYNCING, STATE_FSYNC_DONE, @@ -165,6 +169,8 @@ class NewStore : public ObjectStore { case STATE_FSYNC_QUEUED: return "fsync_queued"; case STATE_FSYNC_FSYNCING: return "fsync_fsyncing"; case STATE_FSYNC_DONE: return "fsync_done"; + case STATE_AIO_QUEUED: return "aio_queued"; + case STATE_AIO_DONE: return "aio_done"; case STATE_KV_QUEUED: return "kv_queued"; case STATE_KV_COMMITTING: return "kv_committing"; case STATE_KV_DONE: return "kv_done"; @@ -194,6 +200,10 @@ class NewStore : public ObjectStore { wal_transaction_t *wal_txn; ///< wal transaction (if any) unsigned num_fsyncs_completed; + list aios; + bufferlist aio_bl; // just a pile of refs + atomic_t num_aio; + Mutex lock; Cond cond; @@ -207,6 +217,7 @@ 
class NewStore : public ObjectStore { onreadable_sync(NULL), wal_txn(NULL), num_fsyncs_completed(0), + num_aio(0), lock("NewStore::TransContext::lock") { //cout << "txc new " << this << std::endl; } @@ -373,7 +384,9 @@ class NewStore : public ObjectStore { public: WALWQ(NewStore *s, time_t ti, time_t sti, ThreadPool *tp) : ThreadPool::WorkQueue("NewStore::WALWQ", ti, sti, tp), - store(s) { + store(s), + ops(0), + bytes(0) { } bool _empty() { return wal_queue.empty(); @@ -445,6 +458,15 @@ class NewStore : public ObjectStore { } }; + struct AioCompletionThread : public Thread { + NewStore *store; + AioCompletionThread(NewStore *s) : store(s) {} + void *entry() { + store->_aio_thread(); + return NULL; + } + }; + // -------------------------------------------------------- // members private: @@ -479,6 +501,10 @@ class NewStore : public ObjectStore { ThreadPool fsync_tp; FsyncWQ fsync_wq; + AioCompletionThread aio_thread; + bool aio_stop; + FS::aio_queue_t aio_queue; + KVSyncThread kv_sync_thread; Mutex kv_lock; Cond kv_cond, kv_sync_cond; @@ -540,6 +566,10 @@ class NewStore : public ObjectStore { void _osr_reap_done(OpSequencer *osr); + void _aio_thread(); + int _aio_start(); + void _aio_stop(); + void _kv_sync_thread(); void _kv_stop() { { From 5d8e14653dcfc140da3d37af3b185dc204d47def Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 20 Apr 2015 12:49:46 -0700 Subject: [PATCH 377/654] os/newstore: combined O_DSYNC with O_DIRECT This avoids the need for an explicit fdatasync when doing O_DIRECT. 
Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 7de2092308d91..6cbbef8cbdd65 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2362,7 +2362,7 @@ int NewStore::_do_wal_transaction(wal_transaction_t& wt) (p->length & ~CEPH_PAGE_MASK) == 0) { dout(20) << __func__ << " page-aligned io, using O_DIRECT, " << p->data.buffers().size() << " buffers" << dendl; - flags |= O_DIRECT; + flags |= O_DIRECT | O_DSYNC; if (!p->data.is_page_aligned()) { dout(20) << __func__ << " rebuilding buffer to be page-aligned" << dendl; @@ -2385,7 +2385,9 @@ int NewStore::_do_wal_transaction(wal_transaction_t& wt) << cpp_strerror(r) << dendl; return r; } - sync_fds.push_back(fd); + if (!(flags & O_DSYNC)) { + sync_fds.push_back(fd); + } } break; case wal_op_t::OP_ZERO: @@ -3139,7 +3141,7 @@ int NewStore::_do_write(TransContext *txc, (length & ~CEPH_PAGE_MASK) == 0) { dout(20) << __func__ << " page-aligned, can use O_DIRECT, " << bl.buffers().size() << " buffers" << dendl; - flags |= O_DIRECT; + flags |= O_DIRECT | O_DSYNC; if (!bl.is_page_aligned()) { dout(20) << __func__ << " rebuilding buffer to be page-aligned" << dendl; bl.rebuild(); @@ -3202,7 +3204,9 @@ int NewStore::_do_write(TransContext *txc, goto out; } } - txc->sync_fd(fd); + if (!(flags & O_DSYNC)) { + txc->sync_fd(fd); + } r = 0; goto out; } From e580a827290eb05b086b04591523345ee004ccae Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 20 Apr 2015 12:48:38 -0700 Subject: [PATCH 378/654] os/newstore: a few comments about wal Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 6cbbef8cbdd65..8dd10d6c5a9a5 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2420,13 +2420,19 @@ int 
NewStore::_do_wal_transaction(wal_transaction_t& wt) << cpp_strerror(r) << dendl; return r; } - //sync_fds.push_back(fd); // do we care? + // note: we are not syncing this truncate. instead, we are + // careful about only reading as much of the fragment as we + // know is valid, and truncating to expected size before + // extending the file. } break; case wal_op_t::OP_REMOVE: dout(20) << __func__ << " remove " << p->fid << dendl; _remove_fid(p->fid); + // note: we do not fsync the directory. instead, we tolerate + // leaked fragments in a crash. in practice, this will be + // exceedingly rare. break; default: From 2317e446c5e86f5ec6c74e6b59d8c4d6981ba62d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 20 Apr 2015 15:33:11 -0700 Subject: [PATCH 379/654] os/newstore: use aio for wal writes, too Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 136 +++++++++++++++++++++++------------- src/os/newstore/NewStore.h | 10 ++- 2 files changed, 95 insertions(+), 51 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 8dd10d6c5a9a5..246ff18699269 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2245,14 +2245,27 @@ void NewStore::_aio_thread() if (r == 1) { TransContext *txc = static_cast(aio->priv); int left = txc->num_aio.dec(); - dout(10) << __func__ << " finished aio on " << txc << ", " - << left << " left" << dendl; + dout(10) << __func__ << " finished aio on " << txc << " state " + << txc->get_state_name() << ", " + << left << " aios left" << dendl; + VOID_TEMP_FAILURE_RETRY(::close(aio->fd)); if (left == 0) { - txc->state = TransContext::STATE_AIO_DONE; - if (!txc->fds.empty()) { - _txc_queue_fsync(txc); - } else { - _txc_finish_fsync(txc); + switch (txc->state) { + case TransContext::STATE_AIO_QUEUED: + txc->state = TransContext::STATE_AIO_DONE; + if (!txc->fds.empty()) { + _txc_queue_fsync(txc); + } else { + _txc_finish_fsync(txc); + } + break; + + case TransContext::STATE_WAL_AIO_WAIT: + 
_wal_finish(txc); + break; + + default: + assert(0 == "unexpected txc state on aio completion"); } } } @@ -2319,16 +2332,31 @@ wal_op_t *NewStore::_get_wal_op(TransContext *txc) return &txc->wal_txn->ops.back(); } -int NewStore::_apply_wal_transaction(TransContext *txc) +int NewStore::_wal_apply(TransContext *txc) { wal_transaction_t& wt = *txc->wal_txn; dout(20) << __func__ << " txc " << txc << " seq " << wt.seq << dendl; txc->state = TransContext::STATE_WAL_APPLYING; - int r = _do_wal_transaction(wt); + txc->aios.clear(); + int r = _do_wal_transaction(wt, txc); if (r < 0) return r; + if (!txc->aios.empty()) { + _txc_aio_submit(txc); + txc->state = TransContext::STATE_WAL_AIO_WAIT; + return 0; + } else { + return _wal_finish(txc); + } +} + +int NewStore::_wal_finish(TransContext *txc) +{ + wal_transaction_t& wt = *txc->wal_txn; + dout(20) << __func__ << " txc " << " seq " << wt.seq << txc << dendl; + string key; get_wal_key(wt.seq, &key); KeyValueDB::Transaction cleanup = db->get_transaction(); @@ -2345,7 +2373,8 @@ int NewStore::_apply_wal_transaction(TransContext *txc) return 0; } -int NewStore::_do_wal_transaction(wal_transaction_t& wt) +int NewStore::_do_wal_transaction(wal_transaction_t& wt, + TransContext *txc) { vector sync_fds; sync_fds.reserve(wt.ops.size()); @@ -2372,20 +2401,29 @@ int NewStore::_do_wal_transaction(wal_transaction_t& wt) int fd = _open_fid(p->fid, flags); if (fd < 0) return fd; - int r = ::lseek64(fd, p->offset, SEEK_SET); - if (r < 0) { - r = -errno; - derr << __func__ << " lseek64 on " << fd << " got: " - << cpp_strerror(r) << dendl; - return r; - } - r = p->data.write_fd(fd); - if (r < 0) { - derr << __func__ << " write_fd on " << fd << " got: " - << cpp_strerror(r) << dendl; - return r; - } - if (!(flags & O_DSYNC)) { +#ifdef HAVE_LIBAIO + if (g_conf->newstore_aio && txc && (flags & O_DIRECT)) { + txc->aios.push_back(FS::aio_t(txc, fd)); + FS::aio_t& aio = txc->aios.back(); + p->data.prepare_iov(&aio.iov); + aio.pwritev(p->offset); + 
dout(2) << __func__ << " prepared aio " << &aio << dendl; + } else +#endif + { + int r = ::lseek64(fd, p->offset, SEEK_SET); + if (r < 0) { + r = -errno; + derr << __func__ << " lseek64 on " << fd << " got: " + << cpp_strerror(r) << dendl; + return r; + } + r = p->data.write_fd(fd); + if (r < 0) { + derr << __func__ << " write_fd on " << fd << " got: " + << cpp_strerror(r) << dendl; + return r; + } sync_fds.push_back(fd); } } @@ -2403,6 +2441,7 @@ int NewStore::_do_wal_transaction(wal_transaction_t& wt) << cpp_strerror(r) << dendl; return r; } + // FIXME: do aio fdatasync? sync_fds.push_back(fd); } break; @@ -2469,7 +2508,7 @@ int NewStore::_replay_wal() return -EIO; } dout(20) << __func__ << " replay " << it->key() << dendl; - int r = _do_wal_transaction(wt); + int r = _do_wal_transaction(wt, NULL); // don't bother with aio here if (r < 0) return r; cleanup->rmkey(PREFIX_WAL, it->key()); @@ -2557,25 +2596,8 @@ int NewStore::queue_transactions( } else { // async path if (!txc->aios.empty()) { + _txc_aio_submit(txc); txc->state = TransContext::STATE_AIO_QUEUED; - dout(20) << __func__ << " submitting " << txc->num_aio.read() << " aios" - << dendl; - for (list::iterator p = txc->aios.begin(); - p != txc->aios.end(); - ++p) { - FS::aio_t& aio = *p; - dout(20) << __func__ << " submitting aio " << &aio << dendl; - for (vector::iterator q = aio.iov.begin(); q != aio.iov.end(); ++q) - dout(30) << __func__ << " iov " << (void*)q->iov_base - << " len " << q->iov_len << dendl; - dout(30) << " fd " << aio.fd << " offset " << lseek64(aio.fd, 0, SEEK_CUR) - << dendl; - int r = aio_queue.submit(*p); - if (r) { - derr << " aio submit got " << cpp_strerror(r) << dendl; - assert(r == 0); - } - } } else if (!txc->fds.empty()) { _txc_queue_fsync(txc); } else { @@ -2586,6 +2608,29 @@ int NewStore::queue_transactions( return 0; } +void NewStore::_txc_aio_submit(TransContext *txc) +{ + int num = txc->aios.size(); + dout(10) << __func__ << " submitting " << num << " aios" << dendl; + 
txc->num_aio.set(num); + for (list::iterator p = txc->aios.begin(); + p != txc->aios.end(); + ++p) { + FS::aio_t& aio = *p; + dout(20) << __func__ << " submitting aio " << &aio << dendl; + for (vector::iterator q = aio.iov.begin(); q != aio.iov.end(); ++q) + dout(30) << __func__ << " iov " << (void*)q->iov_base + << " len " << q->iov_len << dendl; + dout(30) << " fd " << aio.fd << " offset " << lseek64(aio.fd, 0, SEEK_CUR) + << dendl; + int r = aio_queue.submit(*p); + if (r) { + derr << " aio submit got " << cpp_strerror(r) << dendl; + assert(r == 0); + } + } +} + int NewStore::_do_transaction(Transaction *t, TransContext *txc, ThreadPool::TPHandle *handle) @@ -3193,12 +3238,10 @@ int NewStore::_do_write(TransContext *txc, #ifdef HAVE_LIBAIO if (g_conf->newstore_aio && (flags & O_DIRECT)) { txc->aios.push_back(FS::aio_t(txc, fd)); - txc->num_aio.inc(); FS::aio_t& aio = txc->aios.back(); bl.prepare_iov(&aio.iov); txc->aio_bl.append(bl); aio.pwritev(x_offset); - dout(2) << __func__ << " prepared aio " << &aio << dendl; } else #endif @@ -3209,8 +3252,6 @@ int NewStore::_do_write(TransContext *txc, derr << __func__ << " bl.write_fd error: " << cpp_strerror(r) << dendl; goto out; } - } - if (!(flags & O_DSYNC)) { txc->sync_fd(fd); } r = 0; @@ -3245,7 +3286,6 @@ int NewStore::_do_write(TransContext *txc, #ifdef HAVE_LIBAIO if (g_conf->newstore_aio && (flags & O_DIRECT)) { txc->aios.push_back(FS::aio_t(txc, fd)); - txc->num_aio.inc(); FS::aio_t& aio = txc->aios.back(); bl.prepare_iov(&aio.iov); txc->aio_bl.append(bl); diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index f96f85270c594..93e547566b3e1 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -155,6 +155,7 @@ class NewStore : public ObjectStore { STATE_KV_DONE, STATE_WAL_QUEUED, STATE_WAL_APPLYING, + STATE_WAL_AIO_WAIT, STATE_WAL_CLEANUP, // remove wal kv record STATE_WAL_DONE, STATE_FINISHING, @@ -176,6 +177,7 @@ class NewStore : public ObjectStore { case 
STATE_KV_DONE: return "kv_done"; case STATE_WAL_QUEUED: return "wal_queued"; case STATE_WAL_APPLYING: return "wal_applying"; + case STATE_WAL_AIO_WAIT: return "wal_aio_wait"; case STATE_WAL_CLEANUP: return "wal_cleanup"; case STATE_WAL_DONE: return "wal_done"; case STATE_FINISHING: return "finishing"; @@ -424,7 +426,7 @@ class NewStore : public ObjectStore { return i; } void _process(TransContext *i, ThreadPool::TPHandle &handle) { - store->_apply_wal_transaction(i); + store->_wal_apply(i); i->osr->wal_apply_lock.Unlock(); } void _clear() { @@ -557,6 +559,7 @@ class NewStore : public ObjectStore { TransContext *_txc_create(OpSequencer *osr); int _txc_finalize(OpSequencer *osr, TransContext *txc); + void _txc_aio_submit(TransContext *txc); void _txc_queue_fsync(TransContext *txc); void _txc_process_fsync(fsync_item *i); void _txc_finish_fsync(TransContext *txc); @@ -582,8 +585,9 @@ class NewStore : public ObjectStore { } wal_op_t *_get_wal_op(TransContext *txc); - int _apply_wal_transaction(TransContext *txc); - int _do_wal_transaction(wal_transaction_t& wt); + int _wal_apply(TransContext *txc); + int _wal_finish(TransContext *txc); + int _do_wal_transaction(wal_transaction_t& wt, TransContext *txc); void _wait_object_wal(OnodeRef onode); int _replay_wal(); friend class C_ApplyWAL; From 3b667125982290a9ab3e296bc87fe9e9a10cfcd9 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 20 Apr 2015 17:10:19 -0700 Subject: [PATCH 380/654] os/newstore: move toward state-machine Signed-off-by: Sage Weil --- src/common/config_opts.h | 3 +- src/os/newstore/NewStore.cc | 234 ++++++++++++++++++------------------ src/os/newstore/NewStore.h | 28 ++--- 3 files changed, 133 insertions(+), 132 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 8d26707c79b36..e9020757aba8f 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -794,7 +794,8 @@ OPTION(newstore_max_dir_size, OPT_U32, 1000000) OPTION(newstore_onode_map_size, OPT_U32, 1024) 
// onodes per collection OPTION(newstore_backend, OPT_STR, "rocksdb") OPTION(newstore_fail_eio, OPT_BOOL, true) -OPTION(newstore_sync_queue_transaction, OPT_BOOL, false) // perform write synchronously from queue_transaction +OPTION(newstore_sync_io, OPT_BOOL, false) // perform initial io synchronously +OPTION(newstore_sync_transaction, OPT_BOOL, false) // perform kv txn synchronously OPTION(newstore_fsync_threads, OPT_INT, 16) // num threads calling fsync OPTION(newstore_fsync_thread_timeout, OPT_INT, 30) // thread timeout value OPTION(newstore_fsync_thread_suicide_timeout, OPT_INT, 120) // suicide timeout value diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 246ff18699269..25eb6412134e9 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -991,7 +991,7 @@ int NewStore::mount() if (r < 0) goto out_db; - r = _replay_wal(); + r = _wal_replay(); if (r < 0) goto out_aio; @@ -2036,6 +2036,88 @@ NewStore::TransContext *NewStore::_txc_create(OpSequencer *osr) return txc; } +void NewStore::_txc_state_proc(TransContext *txc) +{ + while (true) { + dout(10) << __func__ << " txc " << txc + << " " << txc->get_state_name() << dendl; + switch (txc->state) { + case TransContext::STATE_PREPARE: + if (!txc->aios.empty()) { + txc->state = TransContext::STATE_AIO_WAIT; + _txc_aio_submit(txc); + return; + } + // ** fall-thru ** + + case TransContext::STATE_AIO_WAIT: + if (!txc->fds.empty()) { + txc->state = TransContext::STATE_FSYNC_WAIT; + if (!g_conf->newstore_sync_io) { + _txc_queue_fsync(txc); + return; + } + _txc_do_sync_fsync(txc); + } + _txc_finish_io(txc); // may trigger blocked txc's too + return; + + case TransContext::STATE_IO_DONE: + assert(txc->osr->qlock.is_locked()); // see _txc_finish_io + txc->state = TransContext::STATE_KV_QUEUED; + if (!g_conf->newstore_sync_transaction) { + Mutex::Locker l(kv_lock); + db->submit_transaction(txc->t); + kv_queue.push_back(txc); + kv_cond.SignalOne(); + return; + } + 
db->submit_transaction_sync(txc->t); + break; + + case TransContext::STATE_KV_QUEUED: + txc->state = TransContext::STATE_KV_DONE; + _txc_finish_kv(txc); + // ** fall-thru ** + + case TransContext::STATE_KV_DONE: + if (txc->wal_txn) { + txc->state = TransContext::STATE_WAL_QUEUED; + wal_wq.queue(txc); + return; + } + txc->state = TransContext::STATE_FINISHING; + break; + + case TransContext::STATE_WAL_APPLYING: + if (!txc->aios.empty()) { + txc->state = TransContext::STATE_WAL_AIO_WAIT; + _txc_aio_submit(txc); + return; + } + // ** fall-thru ** + + case TransContext::STATE_WAL_AIO_WAIT: + _wal_finish(txc); + return; + + case TransContext::STATE_WAL_CLEANUP: + txc->state = TransContext::STATE_FINISHING; + // ** fall-thru ** + + case TransContext::TransContext::STATE_FINISHING: + _txc_finish(txc); + return; + + default: + derr << __func__ << " unexpected txc " << txc + << " state " << txc->get_state_name() << dendl; + assert(0 == "unexpected txc state"); + return; + } + } +} + void NewStore::_txc_process_fsync(fsync_item *i) { dout(20) << __func__ << " txc " << i->txc << dendl; @@ -2049,12 +2131,12 @@ void NewStore::_txc_process_fsync(fsync_item *i) } VOID_TEMP_FAILURE_RETRY(::close(i->fd)); if (i->txc->finish_fsync()) { - _txc_finish_fsync(i->txc); + _txc_finish_io(i->txc); } dout(20) << __func__ << " txc " << i->txc << " done" << dendl; } -void NewStore::_txc_finish_fsync(TransContext *txc) +void NewStore::_txc_finish_io(TransContext *txc) { dout(20) << __func__ << " " << txc << dendl; @@ -2065,25 +2147,25 @@ void NewStore::_txc_finish_fsync(TransContext *txc) OpSequencer *osr = txc->osr.get(); Mutex::Locker l(osr->qlock); - txc->state = TransContext::STATE_FSYNC_DONE; + txc->state = TransContext::STATE_IO_DONE; OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc); while (p != osr->q.begin()) { --p; - if (p->state < TransContext::STATE_FSYNC_DONE) { + if (p->state < TransContext::STATE_IO_DONE) { dout(20) << __func__ << " " << txc << " blocked by " << &*p << 
" " << p->get_state_name() << dendl; return; } - if (p->state > TransContext::STATE_FSYNC_DONE) { + if (p->state > TransContext::STATE_IO_DONE) { ++p; break; } } do { - _txc_submit_kv(&*p++); + _txc_state_proc(&*p++); } while (p != osr->q.end() && - p->state == TransContext::STATE_FSYNC_DONE); + p->state == TransContext::STATE_IO_DONE); } int NewStore::_txc_finalize(OpSequencer *osr, TransContext *txc) @@ -2119,7 +2201,6 @@ int NewStore::_txc_finalize(OpSequencer *osr, TransContext *txc) void NewStore::_txc_queue_fsync(TransContext *txc) { dout(20) << __func__ << " txc " << txc << dendl; - txc->state = TransContext::STATE_FSYNC_QUEUED; fsync_wq.lock(); for (list::iterator p = txc->fds.begin(); p != txc->fds.end(); @@ -2130,22 +2211,25 @@ void NewStore::_txc_queue_fsync(TransContext *txc) fsync_wq.unlock(); } -void NewStore::_txc_submit_kv(TransContext *txc) +void NewStore::_txc_do_sync_fsync(TransContext *txc) { dout(20) << __func__ << " txc " << txc << dendl; - txc->state = TransContext::STATE_KV_QUEUED; - - Mutex::Locker l(kv_lock); - db->submit_transaction(txc->t); - kv_queue.push_back(txc); - kv_cond.SignalOne(); + for (list::iterator p = txc->fds.begin(); + p != txc->fds.end(); ++p) { + dout(30) << __func__ << " fsync " << p->fd << dendl; + int r = ::fdatasync(p->fd); + if (r < 0) { + r = -errno; + derr << __func__ << " fsync: " << cpp_strerror(r) << dendl; + assert(0 == "fsync error"); + } + VOID_TEMP_FAILURE_RETRY(::close(p->fd)); + } } void NewStore::_txc_finish_kv(TransContext *txc) { dout(20) << __func__ << " txc " << txc << dendl; - txc->osr->qlock.Lock(); - txc->state = TransContext::STATE_KV_DONE; // warning: we're calling onreadable_sync inside the sequencer lock if (txc->onreadable_sync) { @@ -2164,20 +2248,9 @@ void NewStore::_txc_finish_kv(TransContext *txc) finisher.queue(txc->oncommits.front()); txc->oncommits.pop_front(); } - - if (txc->wal_txn) { - dout(20) << __func__ << " starting wal apply" << dendl; - txc->state = 
TransContext::STATE_WAL_QUEUED; - txc->osr->qlock.Unlock(); - wal_wq.queue(txc); - } else { - txc->state = TransContext::STATE_FINISHING; - txc->osr->qlock.Unlock(); - _txc_finish_apply(txc); - } } -void NewStore::_txc_finish_apply(TransContext *txc) +void NewStore::_txc_finish(TransContext *txc) { dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl; assert(txc->state == TransContext::STATE_FINISHING); @@ -2250,23 +2323,7 @@ void NewStore::_aio_thread() << left << " aios left" << dendl; VOID_TEMP_FAILURE_RETRY(::close(aio->fd)); if (left == 0) { - switch (txc->state) { - case TransContext::STATE_AIO_QUEUED: - txc->state = TransContext::STATE_AIO_DONE; - if (!txc->fds.empty()) { - _txc_queue_fsync(txc); - } else { - _txc_finish_fsync(txc); - } - break; - - case TransContext::STATE_WAL_AIO_WAIT: - _wal_finish(txc); - break; - - default: - assert(0 == "unexpected txc state on aio completion"); - } + _txc_state_proc(txc); } } } @@ -2298,18 +2355,7 @@ void NewStore::_kv_sync_thread() << " in " << dur << dendl; while (!kv_committing.empty()) { TransContext *txc = kv_committing.front(); - if (txc->state == TransContext::STATE_WAL_CLEANUP) { - txc->osr->qlock.Lock(); - txc->state = TransContext::STATE_FINISHING; - txc->osr->qlock.Unlock(); - _txc_finish_apply(txc); - } else if (txc->state == TransContext::STATE_KV_QUEUED) { - _txc_finish_kv(txc); - } else { - derr << __func__ << " unexpected txc state " << txc->get_state_name() - << dendl; - assert(0); - } + _txc_state_proc(txc); kv_committing.pop_front(); } @@ -2340,16 +2386,10 @@ int NewStore::_wal_apply(TransContext *txc) txc->aios.clear(); int r = _do_wal_transaction(wt, txc); - if (r < 0) - return r; + assert(r == 0); - if (!txc->aios.empty()) { - _txc_aio_submit(txc); - txc->state = TransContext::STATE_WAL_AIO_WAIT; - return 0; - } else { - return _wal_finish(txc); - } + _txc_state_proc(txc); + return 0; } int NewStore::_wal_finish(TransContext *txc) @@ -2362,9 +2402,7 @@ int 
NewStore::_wal_finish(TransContext *txc) KeyValueDB::Transaction cleanup = db->get_transaction(); cleanup->rmkey(PREFIX_WAL, key); - txc->osr->qlock.Lock(); txc->state = TransContext::STATE_WAL_CLEANUP; - txc->osr->qlock.Unlock(); Mutex::Locker l(kv_lock); db->submit_transaction(cleanup); @@ -2490,7 +2528,7 @@ int NewStore::_do_wal_transaction(wal_transaction_t& wt, return 0; } -int NewStore::_replay_wal() +int NewStore::_wal_replay() { dout(10) << __func__ << " start" << dendl; KeyValueDB::Iterator it = db->get_iterator(PREFIX_WAL); @@ -2539,7 +2577,7 @@ int NewStore::queue_transactions( tls, &onreadable, &ondisk, &onreadable_sync); int r; - // throttle wal work + // throttle on wal work wal_wq.throttle(g_conf->newstore_wal_max_ops, g_conf->newstore_wal_max_bytes); @@ -2557,54 +2595,22 @@ int NewStore::queue_transactions( dout(5) << __func__ << " new " << *osr << "/" << osr->parent << dendl; } + // prepare TransContext *txc = _txc_create(osr); + txc->onreadable = onreadable; + txc->onreadable_sync = onreadable_sync; + txc->oncommit = ondisk; - // XXX do it sync for now; this is not crash safe for (list::iterator p = tls.begin(); p != tls.end(); ++p) { (*p)->set_osr(osr); - _do_transaction(*p, txc, handle); + _txc_add_transaction(txc, *p); } - txc->onreadable = onreadable; - txc->onreadable_sync = onreadable_sync; - txc->oncommit = ondisk; - r = _txc_finalize(osr, txc); assert(r == 0); - if (g_conf->newstore_sync_queue_transaction) { - // do it syncrhonously. for example, if we have a *very* fast backend. 
- - // sync - txc->state = TransContext::STATE_FSYNC_FSYNCING; - for (list::iterator p = txc->fds.begin(); - p != txc->fds.end(); ++p) { - dout(30) << __func__ << " fsync " << p->fd << dendl; - int r = ::fdatasync(p->fd); - if (r < 0) { - r = -errno; - derr << __func__ << " fsync: " << cpp_strerror(r) << dendl; - return r; - } - VOID_TEMP_FAILURE_RETRY(::close(p->fd)); - } - - txc->state = TransContext::STATE_KV_COMMITTING; - db->submit_transaction_sync(txc->t); - - _txc_finish_kv(txc); - } else { - // async path - if (!txc->aios.empty()) { - _txc_aio_submit(txc); - txc->state = TransContext::STATE_AIO_QUEUED; - } else if (!txc->fds.empty()) { - _txc_queue_fsync(txc); - } else { - _txc_finish_fsync(txc); - } - } - + // execute (start) + _txc_state_proc(txc); return 0; } @@ -2617,7 +2623,7 @@ void NewStore::_txc_aio_submit(TransContext *txc) p != txc->aios.end(); ++p) { FS::aio_t& aio = *p; - dout(20) << __func__ << " submitting aio " << &aio << dendl; + dout(20) << __func__ << " aio " << &aio << " fd " << aio.fd << dendl; for (vector::iterator q = aio.iov.begin(); q != aio.iov.end(); ++q) dout(30) << __func__ << " iov " << (void*)q->iov_base << " len " << q->iov_len << dendl; @@ -2631,9 +2637,7 @@ void NewStore::_txc_aio_submit(TransContext *txc) } } -int NewStore::_do_transaction(Transaction *t, - TransContext *txc, - ThreadPool::TPHandle *handle) +int NewStore::_txc_add_transaction(TransContext *txc, Transaction *t) { Transaction::iterator i = t->begin(); int pos = 0; diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index 93e547566b3e1..961b55cce1e33 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -145,11 +145,9 @@ class NewStore : public ObjectStore { struct TransContext { typedef enum { STATE_PREPARE, - STATE_AIO_QUEUED, - STATE_AIO_DONE, - STATE_FSYNC_QUEUED, - STATE_FSYNC_FSYNCING, - STATE_FSYNC_DONE, + STATE_FSYNC_WAIT, + STATE_AIO_WAIT, + STATE_IO_DONE, STATE_KV_QUEUED, STATE_KV_COMMITTING, STATE_KV_DONE, @@ 
-167,11 +165,9 @@ class NewStore : public ObjectStore { const char *get_state_name() { switch (state) { case STATE_PREPARE: return "prepare"; - case STATE_FSYNC_QUEUED: return "fsync_queued"; - case STATE_FSYNC_FSYNCING: return "fsync_fsyncing"; - case STATE_FSYNC_DONE: return "fsync_done"; - case STATE_AIO_QUEUED: return "aio_queued"; - case STATE_AIO_DONE: return "aio_done"; + case STATE_FSYNC_WAIT: return "fsync_wait"; + case STATE_AIO_WAIT: return "aio_wait"; + case STATE_IO_DONE: return "io_done"; case STATE_KV_QUEUED: return "kv_queued"; case STATE_KV_COMMITTING: return "kv_committing"; case STATE_KV_DONE: return "kv_done"; @@ -558,14 +554,16 @@ class NewStore : public ObjectStore { int _clean_fid_tail(TransContext *txc, const fragment_t& f); TransContext *_txc_create(OpSequencer *osr); + int _txc_add_transaction(TransContext *txc, Transaction *t); int _txc_finalize(OpSequencer *osr, TransContext *txc); + void _txc_state_proc(TransContext *txc); void _txc_aio_submit(TransContext *txc); + void _txc_do_sync_fsync(TransContext *txc); void _txc_queue_fsync(TransContext *txc); void _txc_process_fsync(fsync_item *i); - void _txc_finish_fsync(TransContext *txc); - void _txc_submit_kv(TransContext *txc); + void _txc_finish_io(TransContext *txc); void _txc_finish_kv(TransContext *txc); - void _txc_finish_apply(TransContext *txc); + void _txc_finish(TransContext *txc); void _osr_reap_done(OpSequencer *osr); @@ -588,9 +586,7 @@ class NewStore : public ObjectStore { int _wal_apply(TransContext *txc); int _wal_finish(TransContext *txc); int _do_wal_transaction(wal_transaction_t& wt, TransContext *txc); - void _wait_object_wal(OnodeRef onode); - int _replay_wal(); - friend class C_ApplyWAL; + int _wal_replay(); public: NewStore(CephContext *cct, const string& path); From 715fd3b7a293f01ea5fda1dc4296658ec7f5835c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 28 Jul 2015 13:22:50 -0400 Subject: [PATCH 381/654] os/newstore: todo Signed-off-by: Sage Weil --- 
src/os/newstore/NewStore.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 25eb6412134e9..b1222d02c0d9e 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -31,6 +31,11 @@ TODO: + * multiple fragments per object (with configurable size.. maybe 1 or 2 mb default?) + * read path should be totally generic (handle any fragment pattern) + * write path should ideally tolerate any fragment pattern, but only generate a fixed layout (since the tunable may be changed over time). + * rocksdb: use db_paths (db/ and db.bulk/ ?) + * rocksdb: auto-detect use_fsync option when not xfs or btrfs * hobject sorting - backfill - scrub @@ -41,11 +46,11 @@ - DBObjectMap::clone lock ordering - HashIndex::get_path_contents_by_hash - HashIndex::list_by_hash - * use work queue for wal fsyncs and kv record removals * avoid mtime updates when doing open-by-handle * fid xattr backpointer * kill collection_list_range * inline first fsync_item in TransContext to void allocation? + * refcounted fragments (for efficient clone) */ From dd79b4d8328801ce23594ca05f1a71dc85ca4ef2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 22 Apr 2015 17:22:32 -0700 Subject: [PATCH 382/654] os/newstore: release wal throttle when wal completes, not when queued If we take the aio path, the io is queued immediately and the resources are released back to the pool. Instead release them when wal completes. 
Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 2 ++ src/os/newstore/NewStore.h | 11 ++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index b1222d02c0d9e..1367e6e5bc1a5 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2402,6 +2402,8 @@ int NewStore::_wal_finish(TransContext *txc) wal_transaction_t& wt = *txc->wal_txn; dout(20) << __func__ << " txc " << " seq " << wt.seq << txc << dendl; + wal_wq.release_throttle(txc); + string key; get_wal_key(wt.seq, &key); KeyValueDB::Transaction cleanup = db->get_transaction(); diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index 961b55cce1e33..fbe1cf0423e7b 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -412,9 +412,6 @@ class NewStore : public ObjectStore { // requeue at the end to minimize contention wal_queue.push_back(*i->osr); } - --ops; - bytes -= i->wal_txn->get_bytes(); - throttle_cond.Signal(); // preserve wal ordering for this sequencer by taking the lock // while still holding the queue lock @@ -445,6 +442,14 @@ class NewStore : public ObjectStore { throttle_cond.Wait(lock); } } + + void release_throttle(TransContext *txc) { + lock(); + --ops; + bytes -= txc->wal_txn->get_bytes(); + throttle_cond.Signal(); + unlock(); + } }; struct KVSyncThread : public Thread { From f9f9e1b105167e6991f80b8d6e6b4973ea56ca31 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 28 Jul 2015 13:24:00 -0400 Subject: [PATCH 383/654] os/newstore: debug io_submit EAGAIN Signed-off-by: Sage Weil --- src/os/fs/FS.h | 6 ++++-- src/os/newstore/NewStore.cc | 8 ++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/os/fs/FS.h b/src/os/fs/FS.h index 51c6363f13fcd..4b52732422677 100644 --- a/src/os/fs/FS.h +++ b/src/os/fs/FS.h @@ -71,7 +71,7 @@ class FS { int max_iodepth; io_context_t ctx; - aio_queue_t(unsigned max_iodepth = 8) + aio_queue_t(unsigned 
max_iodepth) : max_iodepth(max_iodepth), ctx(0) { } @@ -91,7 +91,7 @@ class FS { } } - int submit(aio_t &aio) { + int submit(aio_t &aio, int *retries) { int attempts = 10; iocb *piocb = &aio.iocb; do { @@ -99,10 +99,12 @@ class FS { if (r < 0) { if (r == -EAGAIN && attempts-- > 0) { usleep(500); + (*retries)++; continue; } return r; } + assert(r == 1); } while (false); return 0; } diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 1367e6e5bc1a5..6d75a3459b4cb 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2624,7 +2624,8 @@ int NewStore::queue_transactions( void NewStore::_txc_aio_submit(TransContext *txc) { int num = txc->aios.size(); - dout(10) << __func__ << " submitting " << num << " aios" << dendl; + dout(10) << __func__ << " txc " << txc << " submitting " << num << dendl; + assert(num > 0); txc->num_aio.set(num); for (list::iterator p = txc->aios.begin(); p != txc->aios.end(); @@ -2636,7 +2637,10 @@ void NewStore::_txc_aio_submit(TransContext *txc) << " len " << q->iov_len << dendl; dout(30) << " fd " << aio.fd << " offset " << lseek64(aio.fd, 0, SEEK_CUR) << dendl; - int r = aio_queue.submit(*p); + int retries = 0; + int r = aio_queue.submit(*p, &retries); + if (retries) + derr << __func__ << " retries " << retries << dendl; if (r) { derr << " aio submit got " << cpp_strerror(r) << dendl; assert(r == 0); From dffa43051ae29775361231c50aaa4e4fb4909487 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Thu, 23 Apr 2015 09:41:35 +0800 Subject: [PATCH 384/654] os/NewStore: don't clear overlay in the create/append case of write Shouldn't clear the overlay in the create/append case of write. Otherwise, this removes the overlay data and leads to data loss. 
Signed-off-by: Zhiqiang Wang --- src/os/newstore/NewStore.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 6d75a3459b4cb..bcd826346776f 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -3217,7 +3217,6 @@ int NewStore::_do_write(TransContext *txc, if (o->onode.size <= offset || o->onode.size == 0 || o->onode.data_map.empty()) { - _do_overlay_clear(txc, o); uint64_t x_offset; if (o->onode.data_map.empty()) { // create From a165fe81c5da132ec96f75456a3d43de1cd69d31 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Thu, 23 Apr 2015 10:25:15 +0800 Subject: [PATCH 385/654] os/NewStore: clear the shared_overlays after writing all the overlays Signed-off-by: Zhiqiang Wang --- src/os/newstore/NewStore.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index bcd826346776f..e39fbbfbe9be4 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -3150,6 +3150,7 @@ int NewStore::_do_write_all_overlays(TransContext *txc, } o->onode.overlay_map.clear(); + o->onode.shared_overlays.clear(); txc->write_onode(o); return 0; } From b1136fbd33936e355f332ae9535aa19a85c37211 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Thu, 23 Apr 2015 10:34:37 +0800 Subject: [PATCH 386/654] os/NewStore: data_map shouldn't be empty when writing all overlays This should be an assert instead of creating new data_map. 
Signed-off-by: Zhiqiang Wang --- src/os/newstore/NewStore.cc | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index e39fbbfbe9be4..d501943b99310 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -3098,20 +3098,6 @@ int NewStore::_do_write_all_overlays(TransContext *txc, if (o->onode.overlay_map.empty()) return 0; - // overwrite to new fid - if (o->onode.data_map.empty()) { - // create - fragment_t &f = o->onode.data_map[0]; - f.offset = 0; - f.length = o->onode.size; - int fd = _create_fid(txc, &f.fid, O_RDWR); - if (fd < 0) { - return fd; - } - VOID_TEMP_FAILURE_RETRY(::close(fd)); - dout(20) << __func__ << " create " << f.fid << dendl; - } - assert(o->onode.data_map.size() == 1); fragment_t& f = o->onode.data_map.begin()->second; assert(f.offset == 0); From 41886c5420934dea85121c497bef370cfd290fc2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 23 Apr 2015 14:51:51 -0700 Subject: [PATCH 387/654] os/newstore: throttle over entire write lifecycle Take a global throttle when we submit ops and release when they complete. The first throttles cover the period from submit to commit, while the wal ones also cover the async post-commit wal work. The configs are additive since the wal ones cover both periods; this should make them reasonably idiot-proof. 
Signed-off-by: Sage Weil --- src/common/config_opts.h | 7 +++++-- src/os/newstore/NewStore.cc | 33 ++++++++++++++++++++++++++------- src/os/newstore/NewStore.h | 31 ++++++++----------------------- 3 files changed, 39 insertions(+), 32 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index e9020757aba8f..f39d6435fdb65 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -796,14 +796,17 @@ OPTION(newstore_backend, OPT_STR, "rocksdb") OPTION(newstore_fail_eio, OPT_BOOL, true) OPTION(newstore_sync_io, OPT_BOOL, false) // perform initial io synchronously OPTION(newstore_sync_transaction, OPT_BOOL, false) // perform kv txn synchronously +OPTION(newstore_sync_wal_apply, OPT_BOOL, true) // perform initial wal work synchronously (possibly in combination with aio so we only *queue* ios) OPTION(newstore_fsync_threads, OPT_INT, 16) // num threads calling fsync OPTION(newstore_fsync_thread_timeout, OPT_INT, 30) // thread timeout value OPTION(newstore_fsync_thread_suicide_timeout, OPT_INT, 120) // suicide timeout value OPTION(newstore_wal_threads, OPT_INT, 4) OPTION(newstore_wal_thread_timeout, OPT_INT, 30) OPTION(newstore_wal_thread_suicide_timeout, OPT_INT, 120) -OPTION(newstore_wal_max_ops, OPT_U64, 64) -OPTION(newstore_wal_max_bytes, OPT_U64, 64*1024*1024) +OPTION(newstore_max_ops, OPT_U64, 512) +OPTION(newstore_max_bytes, OPT_U64, 64*1024*1024) +OPTION(newstore_wal_max_ops, OPT_U64, 512) +OPTION(newstore_wal_max_bytes, OPT_U64, 128*1024*1024) OPTION(newstore_fid_prealloc, OPT_INT, 1024) OPTION(newstore_nid_prealloc, OPT_INT, 1024) OPTION(newstore_overlay_max_length, OPT_INT, 65536) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index d501943b99310..a9ece6da05a50 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -584,6 +584,14 @@ NewStore::NewStore(CephContext *cct, const string& path) fid_lock("NewStore::fid_lock"), nid_lock("NewStore::nid_lock"), nid_max(0), + 
throttle_ops(cct, "newstore_max_ops", cct->_conf->newstore_max_ops), + throttle_bytes(cct, "newstore_max_bytes", cct->_conf->newstore_max_bytes), + throttle_wal_ops(cct, "newstore_wal_max_ops", + cct->_conf->newstore_max_ops + + cct->_conf->newstore_wal_max_ops), + throttle_wal_bytes(cct, "newstore_wal_max_bytes", + cct->_conf->newstore_max_bytes + + cct->_conf->newstore_wal_max_bytes), wal_lock("NewStore::wal_lock"), wal_seq(0), wal_tp(cct, @@ -2088,7 +2096,11 @@ void NewStore::_txc_state_proc(TransContext *txc) case TransContext::STATE_KV_DONE: if (txc->wal_txn) { txc->state = TransContext::STATE_WAL_QUEUED; - wal_wq.queue(txc); + if (g_conf->newstore_sync_wal_apply) { + _wal_apply(txc); + } else { + wal_wq.queue(txc); + } return; } txc->state = TransContext::STATE_FINISHING; @@ -2253,6 +2265,9 @@ void NewStore::_txc_finish_kv(TransContext *txc) finisher.queue(txc->oncommits.front()); txc->oncommits.pop_front(); } + + throttle_ops.put(txc->ops); + throttle_bytes.put(txc->bytes); } void NewStore::_txc_finish(TransContext *txc) @@ -2280,6 +2295,9 @@ void NewStore::_txc_finish(TransContext *txc) txc->removed_collections.pop_front(); } + throttle_wal_ops.put(txc->ops); + throttle_wal_bytes.put(txc->bytes); + OpSequencerRef osr = txc->osr; osr->qlock.Lock(); txc->state = TransContext::STATE_DONE; @@ -2402,8 +2420,6 @@ int NewStore::_wal_finish(TransContext *txc) wal_transaction_t& wt = *txc->wal_txn; dout(20) << __func__ << " txc " << " seq " << wt.seq << txc << dendl; - wal_wq.release_throttle(txc); - string key; get_wal_key(wt.seq, &key); KeyValueDB::Transaction cleanup = db->get_transaction(); @@ -2584,10 +2600,6 @@ int NewStore::queue_transactions( tls, &onreadable, &ondisk, &onreadable_sync); int r; - // throttle on wal work - wal_wq.throttle(g_conf->newstore_wal_max_ops, - g_conf->newstore_wal_max_bytes); - // set up the sequencer OpSequencer *osr; if (!posr) @@ -2610,12 +2622,19 @@ int NewStore::queue_transactions( for (list::iterator p = tls.begin(); p != 
tls.end(); ++p) { (*p)->set_osr(osr); + txc->ops += (*p)->get_num_ops(); + txc->bytes += (*p)->get_num_bytes(); _txc_add_transaction(txc, *p); } r = _txc_finalize(osr, txc); assert(r == 0); + throttle_ops.get(txc->ops); + throttle_bytes.get(txc->bytes); + throttle_wal_ops.get(txc->ops); + throttle_wal_bytes.get(txc->bytes); + // execute (start) _txc_state_proc(txc); return 0; diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index fbe1cf0423e7b..9f97122045fac 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -185,6 +185,8 @@ class NewStore : public ObjectStore { OpSequencerRef osr; boost::intrusive::list_member_hook<> sequencer_item; + uint64_t ops, bytes; + list fds; ///< these fds need to be synced set onodes; ///< these onodes need to be updated/written KeyValueDB::Transaction t; ///< then we will commit this @@ -210,6 +212,8 @@ class NewStore : public ObjectStore { TransContext(OpSequencer *o) : state(STATE_PREPARE), osr(o), + ops(0), + bytes(0), oncommit(NULL), onreadable(NULL), onreadable_sync(NULL), @@ -376,15 +380,11 @@ class NewStore : public ObjectStore { private: NewStore *store; wal_osr_queue_t wal_queue; - uint64_t ops, bytes; - Cond throttle_cond; public: WALWQ(NewStore *s, time_t ti, time_t sti, ThreadPool *tp) : ThreadPool::WorkQueue("NewStore::WALWQ", ti, sti, tp), - store(s), - ops(0), - bytes(0) { + store(s) { } bool _empty() { return wal_queue.empty(); @@ -394,8 +394,6 @@ class NewStore : public ObjectStore { wal_queue.push_back(*i->osr); } i->osr->wal_q.push_back(*i); - ++ops; - bytes += i->wal_txn->get_bytes(); return true; } void _dequeue(TransContext *p) { @@ -434,22 +432,6 @@ class NewStore : public ObjectStore { unlock(); drain(); } - - void throttle(uint64_t max_ops, uint64_t max_bytes) { - Mutex& lock = get_lock(); - Mutex::Locker l(lock); - while (ops > max_ops || bytes > max_bytes) { - throttle_cond.Wait(lock); - } - } - - void release_throttle(TransContext *txc) { - lock(); - --ops; - bytes 
-= txc->wal_txn->get_bytes(); - throttle_cond.Signal(); - unlock(); - } }; struct KVSyncThread : public Thread { @@ -495,6 +477,9 @@ class NewStore : public ObjectStore { uint64_t nid_last; uint64_t nid_max; + Throttle throttle_ops, throttle_bytes; ///< submit to commit + Throttle throttle_wal_ops, throttle_wal_bytes; ///< submit to wal complete + Mutex wal_lock; atomic64_t wal_seq; ThreadPool wal_tp; From 4eca15a950794bbff24e293d708b36e994c17280 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 24 Apr 2015 13:41:35 -0700 Subject: [PATCH 388/654] os/newstore: fix _txc_aio_submit The aios may complete before _txc_aio_submit completes. In fact, the aio may complete, commit to the kv store, and then queue more wal aio's before we finish the loop. Move aios to a separate list to ensure we only submit them once and do not right another CPU adjusting the list. Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 32 +++++++++++++++++++------------- src/os/newstore/NewStore.h | 3 ++- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index a9ece6da05a50..17109ba9645f8 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2056,7 +2056,7 @@ void NewStore::_txc_state_proc(TransContext *txc) << " " << txc->get_state_name() << dendl; switch (txc->state) { case TransContext::STATE_PREPARE: - if (!txc->aios.empty()) { + if (!txc->pending_aios.empty()) { txc->state = TransContext::STATE_AIO_WAIT; _txc_aio_submit(txc); return; @@ -2107,7 +2107,7 @@ void NewStore::_txc_state_proc(TransContext *txc) break; case TransContext::STATE_WAL_APPLYING: - if (!txc->aios.empty()) { + if (!txc->pending_aios.empty()) { txc->state = TransContext::STATE_WAL_AIO_WAIT; _txc_aio_submit(txc); return; @@ -2407,7 +2407,7 @@ int NewStore::_wal_apply(TransContext *txc) dout(20) << __func__ << " txc " << txc << " seq " << wt.seq << dendl; txc->state = TransContext::STATE_WAL_APPLYING; - 
txc->aios.clear(); + assert(txc->pending_aios.empty()); int r = _do_wal_transaction(wt, txc); assert(r == 0); @@ -2464,8 +2464,8 @@ int NewStore::_do_wal_transaction(wal_transaction_t& wt, return fd; #ifdef HAVE_LIBAIO if (g_conf->newstore_aio && txc && (flags & O_DIRECT)) { - txc->aios.push_back(FS::aio_t(txc, fd)); - FS::aio_t& aio = txc->aios.back(); + txc->pending_aios.push_back(FS::aio_t(txc, fd)); + FS::aio_t& aio = txc->pending_aios.back(); p->data.prepare_iov(&aio.iov); aio.pwritev(p->offset); dout(2) << __func__ << " prepared aio " << &aio << dendl; @@ -2642,13 +2642,19 @@ int NewStore::queue_transactions( void NewStore::_txc_aio_submit(TransContext *txc) { - int num = txc->aios.size(); + int num = txc->pending_aios.size(); dout(10) << __func__ << " txc " << txc << " submitting " << num << dendl; assert(num > 0); txc->num_aio.set(num); - for (list::iterator p = txc->aios.begin(); - p != txc->aios.end(); - ++p) { + + // move these aside, and get our end iterator position now, as the + // aios might complete as soon as they are submitted and queue more + // wal aio's. 
+ list::iterator e = txc->submitted_aios.begin(); + txc->submitted_aios.splice(e, txc->pending_aios); + list::iterator p = txc->submitted_aios.begin(); + assert(p != e); + for (; p != e; ++p) { FS::aio_t& aio = *p; dout(20) << __func__ << " aio " << &aio << " fd " << aio.fd << dendl; for (vector::iterator q = aio.iov.begin(); q != aio.iov.end(); ++q) @@ -3257,8 +3263,8 @@ int NewStore::_do_write(TransContext *txc, } #ifdef HAVE_LIBAIO if (g_conf->newstore_aio && (flags & O_DIRECT)) { - txc->aios.push_back(FS::aio_t(txc, fd)); - FS::aio_t& aio = txc->aios.back(); + txc->pending_aios.push_back(FS::aio_t(txc, fd)); + FS::aio_t& aio = txc->pending_aios.back(); bl.prepare_iov(&aio.iov); txc->aio_bl.append(bl); aio.pwritev(x_offset); @@ -3305,8 +3311,8 @@ int NewStore::_do_write(TransContext *txc, #ifdef HAVE_LIBAIO if (g_conf->newstore_aio && (flags & O_DIRECT)) { - txc->aios.push_back(FS::aio_t(txc, fd)); - FS::aio_t& aio = txc->aios.back(); + txc->pending_aios.push_back(FS::aio_t(txc, fd)); + FS::aio_t& aio = txc->pending_aios.back(); bl.prepare_iov(&aio.iov); txc->aio_bl.append(bl); aio.pwritev(0); diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index 9f97122045fac..55b73f7fd0b9e 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -200,7 +200,8 @@ class NewStore : public ObjectStore { wal_transaction_t *wal_txn; ///< wal transaction (if any) unsigned num_fsyncs_completed; - list aios; + list pending_aios; ///< not yet submitted + list submitted_aios; ///< submitting or submitted bufferlist aio_bl; // just a pile of refs atomic_t num_aio; From 29ba720885a3ee69577ddd3c3ce4cd273505726b Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Mon, 27 Apr 2015 15:49:28 +0800 Subject: [PATCH 389/654] os/Nestore: batch cleanup batch cleanup wal. 
Signed-off-by: Xiaoxi Chen --- src/os/newstore/NewStore.cc | 37 ++++++++++++++++++++++++------------- src/os/newstore/NewStore.h | 1 + 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 17109ba9645f8..44bdbe0882aec 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2359,7 +2359,8 @@ void NewStore::_kv_sync_thread() kv_lock.Lock(); while (true) { assert(kv_committing.empty()); - if (kv_queue.empty()) { + assert(wal_cleaning.empty()); + if (kv_queue.empty() && wal_cleanup_queue.empty()) { if (kv_stop) break; dout(20) << __func__ << " sleep" << dendl; @@ -2367,20 +2368,37 @@ void NewStore::_kv_sync_thread() kv_cond.Wait(kv_lock); dout(20) << __func__ << " wake" << dendl; } else { - dout(20) << __func__ << " committing " << kv_queue.size() << dendl; + dout(20) << __func__ << " committing " << kv_queue.size() << " cleaning " << wal_cleanup_queue.size() << dendl; kv_committing.swap(kv_queue); + wal_cleaning.swap(wal_cleanup_queue); utime_t start = ceph_clock_now(NULL); kv_lock.Unlock(); - db->submit_transaction_sync(db->get_transaction()); + KeyValueDB::Transaction txc_cleanup_sync = db->get_transaction(); + //adding wal cleanup op + for (std::deque::iterator it = wal_cleaning.begin(); + it != wal_cleaning.end(); + it++) { + wal_transaction_t& wt =*(*it)->wal_txn; + string key; + get_wal_key(wt.seq, &key); + txc_cleanup_sync->rmkey(PREFIX_WAL, key); + } + + db->submit_transaction_sync(txc_cleanup_sync); utime_t finish = ceph_clock_now(NULL); utime_t dur = finish - start; - dout(20) << __func__ << " committed " << kv_committing.size() + dout(20) << __func__ << " committed " << kv_committing.size() << "cleaned " << wal_cleaning.size() << " in " << dur << dendl; while (!kv_committing.empty()) { TransContext *txc = kv_committing.front(); _txc_state_proc(txc); kv_committing.pop_front(); } + while (!wal_cleaning.empty()) { + TransContext *txc = wal_cleaning.front(); + 
_txc_state_proc(txc); + wal_cleaning.pop_front(); + } // this is as good a place as any ... _reap_collections(); @@ -2420,16 +2438,9 @@ int NewStore::_wal_finish(TransContext *txc) wal_transaction_t& wt = *txc->wal_txn; dout(20) << __func__ << " txc " << " seq " << wt.seq << txc << dendl; - string key; - get_wal_key(wt.seq, &key); - KeyValueDB::Transaction cleanup = db->get_transaction(); - cleanup->rmkey(PREFIX_WAL, key); - - txc->state = TransContext::STATE_WAL_CLEANUP; - Mutex::Locker l(kv_lock); - db->submit_transaction(cleanup); - kv_queue.push_back(txc); + txc->state = TransContext::STATE_WAL_CLEANUP; + wal_cleanup_queue.push_back(txc); kv_cond.SignalOne(); return 0; } diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index 55b73f7fd0b9e..ce6dfac17d04b 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -499,6 +499,7 @@ class NewStore : public ObjectStore { Cond kv_cond, kv_sync_cond; bool kv_stop; deque kv_queue, kv_committing; + deque wal_cleanup_queue, wal_cleaning; Logger *logger; From 793dcc396c055bbe4ce396cdef911c7a5583a3b1 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Mon, 27 Apr 2015 16:15:26 +0800 Subject: [PATCH 390/654] os/NewStore: combine contiguous overlays when writing all the overlays Combine contiguous overlay writes to reduce the numbers of WAL writes and fs writes. 
Signed-off-by: Zhiqiang Wang --- src/os/newstore/NewStore.cc | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 44bdbe0882aec..21407aa20d7d7 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -3140,8 +3140,7 @@ int NewStore::_do_write_all_overlays(TransContext *txc, assert(f.length == o->onode.size); for (map::iterator p = o->onode.overlay_map.begin(); - p != o->onode.overlay_map.end(); - ++p) { + p != o->onode.overlay_map.end(); ) { dout(10) << __func__ << " overlay " << p->first << "~" << p->second << dendl; string key; @@ -3157,6 +3156,31 @@ int NewStore::_do_write_all_overlays(TransContext *txc, op->data.substr_of(bl, p->second.value_offset, p->second.length); txc->t->rmkey(PREFIX_OVERLAY, key); + + // Combine with later overlays if contiguous + map::iterator prev = p, next = p; + ++next; + while (next != o->onode.overlay_map.end()) { + if (prev->first + prev->second.length == next->first) { + dout(10) << __func__ << " combining overlay " << next->first + << "~" << next->second << dendl; + string key_next; + get_overlay_key(o->onode.nid, next->second.key, &key_next); + bufferlist bl_next, bl_next_data; + db->get(PREFIX_OVERLAY, key_next, &bl_next); + + bl_next_data.substr_of(bl_next, next->second.value_offset, + next->second.length); + bl.claim_append(bl_next_data); + txc->t->rmkey(PREFIX_OVERLAY, key_next); + + ++prev; + ++next; + } else { + break; + } + } + p = next; } // this may double delete something we did above, but that's less From c552cd20ab84997e9fc2d6795b69d70116e82708 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Mon, 27 Apr 2015 16:27:21 +0800 Subject: [PATCH 391/654] osd/NewStore: fix for skipping the overlay in _do_overlay_trim When the offset of the write starts at the end of the overlay, that is, p->first + p->second.length == offset, the overlay could be skipped as well. 
Signed-off-by: Zhiqiang Wang --- src/os/newstore/NewStore.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 21407aa20d7d7..b2ffad79bf43b 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -3058,7 +3058,7 @@ int NewStore::_do_overlay_trim(TransContext *txc, << dendl; break; } - if (p->first + p->second.length < offset) { + if (p->first + p->second.length <= offset) { dout(20) << __func__ << " skip " << p->first << " " << p->second << dendl; ++p; From 117330045f722a84f4e7c775dc656b8dd41377b2 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Mon, 27 Apr 2015 16:28:33 +0800 Subject: [PATCH 392/654] os/newstore : Do not need to call fdatasync if using direct. skip ::fdatasync if in direct mode. Signed-off-by: Xiaoxi Chen --- src/os/newstore/NewStore.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index b2ffad79bf43b..440f4715a77b4 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2496,7 +2496,8 @@ int NewStore::_do_wal_transaction(wal_transaction_t& wt, << cpp_strerror(r) << dendl; return r; } - sync_fds.push_back(fd); + if (!(flags & O_DIRECT)) + sync_fds.push_back(fd); } } break; From 4c9e37de8af2a1f9c60840a0e6a78814a6b49f1c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 27 Apr 2015 14:42:55 -0700 Subject: [PATCH 393/654] os/newstore: fix race in _txc_aio_submit We cannot rely on the iterator pointers being valid after we submit the aio because we are racing with the completion. Make our loop decision before submitting and avoid dereferencing txc after that point. 
Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 440f4715a77b4..3ce09f3fff856 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2666,7 +2666,8 @@ void NewStore::_txc_aio_submit(TransContext *txc) txc->submitted_aios.splice(e, txc->pending_aios); list::iterator p = txc->submitted_aios.begin(); assert(p != e); - for (; p != e; ++p) { + bool done = false; + while (!done) { FS::aio_t& aio = *p; dout(20) << __func__ << " aio " << &aio << " fd " << aio.fd << dendl; for (vector::iterator q = aio.iov.begin(); q != aio.iov.end(); ++q) @@ -2674,6 +2675,16 @@ void NewStore::_txc_aio_submit(TransContext *txc) << " len " << q->iov_len << dendl; dout(30) << " fd " << aio.fd << " offset " << lseek64(aio.fd, 0, SEEK_CUR) << dendl; + + // be careful: as soon as we submit aio we race with completion. + // since we are holding a ref take care not to dereference txc at + // all after that point. 
+ list::iterator next = p; + ++next; + done = (next == e); + + // do not dereference txc (or it's contents) after we submit (if + // done == true and we don't loop) int retries = 0; int r = aio_queue.submit(*p, &retries); if (retries) From df239f0f62fa324af7972594a333eeee66cc4a04 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Fri, 17 Apr 2015 16:14:41 +0800 Subject: [PATCH 394/654] os/Newstore:Fix collection_list_range We need to rule out hobject_t::max before calling get_object_key (in which will call get_filestore_key_u32 and get an assert failure) Signed-off-by: Xiaoxi Chen --- src/os/newstore/NewStore.cc | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 3ce09f3fff856..e137e5f3d022d 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -1503,14 +1503,18 @@ int NewStore::collection_list( } it->upper_bound(k); } - get_object_key(end, &end_str); - if (end.hobj.is_temp()) { - if (temp) - pend = end_str.c_str(); - else - goto out; + if (end.hobj.is_max()) { + pend = temp ? temp_end_key.c_str() : end_key.c_str(); } else { - pend = temp ? temp_end_key.c_str() : end_str.c_str(); + get_object_key(end, &end_str); + if (end.hobj.is_temp()) { + if (temp) + pend = end_str.c_str(); + else + goto out; + } else { + pend = temp ? temp_end_key.c_str() : end_str.c_str(); + } } while (true) { if (!it->valid() || strcmp(it->key().c_str(), pend) > 0) { From 65055a02077a66cba85d81c756697cac775a2012 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Tue, 28 Apr 2015 16:41:39 +0800 Subject: [PATCH 395/654] os/NewStore: need to increase the wal op length when combining overlays Need to add the length of the combining overlays to the length of the wal op. 
Signed-off-by: Zhiqiang Wang --- src/os/newstore/NewStore.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index e137e5f3d022d..ca976d6714d1f 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -3188,6 +3188,7 @@ int NewStore::_do_write_all_overlays(TransContext *txc, bl_next_data.substr_of(bl_next, next->second.value_offset, next->second.length); bl.claim_append(bl_next_data); + op->length += next->second.length; txc->t->rmkey(PREFIX_OVERLAY, key_next); ++prev; From 37da4292b38b6df41e29706587f2b6a9ddddbd0b Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Tue, 28 Apr 2015 20:56:13 +0800 Subject: [PATCH 396/654] os/newstore:close fd after writting with O_DIRECT fix bug in 2b4c60e0a521ad10b94bbc82865b49f2d28c2ac9 Signed-off-by: Xiaoxi Chen --- src/os/newstore/NewStore.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index ca976d6714d1f..3088f43027de5 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2502,6 +2502,8 @@ int NewStore::_do_wal_transaction(wal_transaction_t& wt, } if (!(flags & O_DIRECT)) sync_fds.push_back(fd); + else + VOID_TEMP_FAILURE_RETRY(::close(fd)); } } break; From 2a7393a4468ce1506861eaa836d58f06014b448e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 28 Apr 2015 09:28:13 -0700 Subject: [PATCH 397/654] os/newstore: more conservative default for aio queue depth There appears to be a kernel aio bug when the queue depth is small. 
Signed-off-by: Sage Weil --- src/common/config_opts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index f39d6435fdb65..b83fef4f83023 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -816,7 +816,7 @@ OPTION(newstore_o_direct, OPT_BOOL, true) OPTION(newstore_db_path, OPT_STR, "") OPTION(newstore_aio, OPT_BOOL, true) OPTION(newstore_aio_poll_ms, OPT_INT, 250) // milliseconds -OPTION(newstore_aio_max_queue_depth, OPT_INT, 64) +OPTION(newstore_aio_max_queue_depth, OPT_INT, 4096) OPTION(filestore_omap_backend, OPT_STR, "leveldb") From 6399f1d0608f3266b27df9a6f180bfaf473b8e7b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 28 Apr 2015 09:47:09 -0700 Subject: [PATCH 398/654] os/newstore: fix multiple aio case Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 3088f43027de5..39af9c45f05ce 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2345,8 +2345,8 @@ void NewStore::_aio_thread() if (r == 1) { TransContext *txc = static_cast(aio->priv); int left = txc->num_aio.dec(); - dout(10) << __func__ << " finished aio on " << txc << " state " - << txc->get_state_name() << ", " + dout(10) << __func__ << " finished aio " << aio << " txc " << txc + << " state " << txc->get_state_name() << ", " << left << " aios left" << dendl; VOID_TEMP_FAILURE_RETRY(::close(aio->fd)); if (left == 0) { @@ -2685,14 +2685,14 @@ void NewStore::_txc_aio_submit(TransContext *txc) // be careful: as soon as we submit aio we race with completion. // since we are holding a ref take care not to dereference txc at // all after that point. 
- list::iterator next = p; - ++next; - done = (next == e); + list::iterator cur = p; + ++p; + done = (p == e); // do not dereference txc (or it's contents) after we submit (if // done == true and we don't loop) int retries = 0; - int r = aio_queue.submit(*p, &retries); + int r = aio_queue.submit(*cur, &retries); if (retries) derr << __func__ << " retries " << retries << dendl; if (r) { From e02e7438578dc614d4cf8858164989970d36c4dc Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Tue, 28 Apr 2015 16:24:16 +0800 Subject: [PATCH 399/654] os/NewStore: avoid dup the data of the overlays in the WAL When writing all the overlays, there is no need to dup the data in WAL. Instead, we can reference the overlays in the WAL, and remove these overlays after commiting them to the fs. When replaying, we can get these data from the referenced overlays. Doing this way, we can save a write and a deletion for each of the overlay data in the db. Signed-off-by: Zhiqiang Wang --- src/os/newstore/NewStore.cc | 44 ++++++++++++++++++++++++++----- src/os/newstore/newstore_types.cc | 29 ++++++++++++++++++-- src/os/newstore/newstore_types.h | 3 +++ 3 files changed, 67 insertions(+), 9 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 39af9c45f05ce..bb946f288f476 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2383,6 +2383,23 @@ void NewStore::_kv_sync_thread() it != wal_cleaning.end(); it++) { wal_transaction_t& wt =*(*it)->wal_txn; + // cleanup the data in overlays + for (list::iterator p = wt.ops.begin(); p != wt.ops.end(); ++p) { + for (vector::iterator q = p->overlays.begin(); + q != p->overlays.end(); ++q) { + string key; + get_overlay_key(p->nid, q->key, &key); + txc_cleanup_sync->rmkey(PREFIX_OVERLAY, key); + } + } + // cleanup the shared overlays. this may double delete something we + // did above, but that's less work than doing careful ref counting + // of the overlay key/value pairs. 
+ for (vector::iterator p = wt.shared_overlay_keys.begin(); + p != wt.shared_overlay_keys.end(); ++p) { + txc_cleanup_sync->rmkey(PREFIX_OVERLAY, *p); + } + // cleanup the wal string key; get_wal_key(wt.seq, &key); txc_cleanup_sync->rmkey(PREFIX_WAL, key); @@ -2586,6 +2603,19 @@ int NewStore::_wal_replay() derr << __func__ << " failed to decode wal txn " << it->key() << dendl; return -EIO; } + + // Get the overlay data of the WAL for replay + for (list::iterator q = wt.ops.begin(); q != wt.ops.end(); ++q) { + for (vector::iterator oit = q->overlays.begin(); + oit != q->overlays.end(); ++oit) { + string key; + get_overlay_key(q->nid, oit->key, &key); + bufferlist bl, bl_data; + db->get(PREFIX_OVERLAY, key, &bl); + bl_data.substr_of(bl, oit->value_offset, oit->length); + q->data.claim_append(bl_data); + } + } dout(20) << __func__ << " replay " << it->key() << dendl; int r = _do_wal_transaction(wt, NULL); // don't bother with aio here if (r < 0) @@ -3171,10 +3201,11 @@ int NewStore::_do_write_all_overlays(TransContext *txc, op->offset = p->first; op->length = p->second.length; op->fid = f.fid; + // The overlays will be removed from the db after applying the WAL + op->nid = o->onode.nid; + op->overlays.push_back(p->second); op->data.substr_of(bl, p->second.value_offset, p->second.length); - txc->t->rmkey(PREFIX_OVERLAY, key); - // Combine with later overlays if contiguous map::iterator prev = p, next = p; ++next; @@ -3191,7 +3222,7 @@ int NewStore::_do_write_all_overlays(TransContext *txc, next->second.length); bl.claim_append(bl_next_data); op->length += next->second.length; - txc->t->rmkey(PREFIX_OVERLAY, key_next); + op->overlays.push_back(next->second); ++prev; ++next; @@ -3202,16 +3233,15 @@ int NewStore::_do_write_all_overlays(TransContext *txc, p = next; } - // this may double delete something we did above, but that's less - // work than doing careful ref counting of the overlay key/value - // pairs. 
+ // put the shared overlay keys into the WAL transaction, so that we + // can cleanup them later after applying the WAL for (set::iterator p = o->onode.shared_overlays.begin(); p != o->onode.shared_overlays.end(); ++p) { dout(10) << __func__ << " shared overlay " << *p << dendl; string key; get_overlay_key(o->onode.nid, *p, &key); - txc->t->rmkey(PREFIX_OVERLAY, key); + txc->wal_txn->shared_overlay_keys.push_back(key); } o->onode.overlay_map.clear(); diff --git a/src/os/newstore/newstore_types.cc b/src/os/newstore/newstore_types.cc index aa1a710532110..5489faf143f62 100644 --- a/src/os/newstore/newstore_types.cc +++ b/src/os/newstore/newstore_types.cc @@ -229,7 +229,11 @@ void wal_op_t::encode(bufferlist& bl) const ::encode(fid, bl); ::encode(offset, bl); ::encode(length, bl); - ::encode(data, bl); + ::encode(nid, bl); + ::encode(overlays, bl); + if (!overlays.size()) { + ::encode(data, bl); + } ENCODE_FINISH(bl); } @@ -240,7 +244,11 @@ void wal_op_t::decode(bufferlist::iterator& p) ::decode(fid, p); ::decode(offset, p); ::decode(length, p); - ::decode(data, p); + ::decode(nid, p); + ::decode(overlays, p); + if (!overlays.size()) { + ::decode(data, p); + } DECODE_FINISH(p); } @@ -250,6 +258,15 @@ void wal_op_t::dump(Formatter *f) const f->dump_object("fid", fid); f->dump_unsigned("offset", offset); f->dump_unsigned("length", length); + if (overlays.size()) { + f->dump_unsigned("nid", nid); + f->open_array_section("overlays"); + for (vector::const_iterator p = overlays.begin(); + p != overlays.end(); ++p) { + f->dump_object("overlay", *p); + } + f->close_section(); + } } void wal_transaction_t::encode(bufferlist& bl) const @@ -257,6 +274,7 @@ void wal_transaction_t::encode(bufferlist& bl) const ENCODE_START(1, 1, bl); ::encode(seq, bl); ::encode(ops, bl); + ::encode(shared_overlay_keys, bl); ENCODE_FINISH(bl); } @@ -265,6 +283,7 @@ void wal_transaction_t::decode(bufferlist::iterator& p) DECODE_START(1, p); ::decode(seq, p); ::decode(ops, p); + 
::decode(shared_overlay_keys, p); DECODE_FINISH(p); } @@ -276,4 +295,10 @@ void wal_transaction_t::dump(Formatter *f) const f->dump_object("op", *p); } f->close_section(); + f->open_array_section("shared_overlay_keys"); + for (vector::const_iterator p = shared_overlay_keys.begin(); + p != shared_overlay_keys.end(); ++p) { + f->dump_string("shared_overlay_key", *p); + } + f->close_section(); } diff --git a/src/os/newstore/newstore_types.h b/src/os/newstore/newstore_types.h index 286fc773e6791..ca616adb94c42 100644 --- a/src/os/newstore/newstore_types.h +++ b/src/os/newstore/newstore_types.h @@ -151,6 +151,8 @@ struct wal_op_t { fid_t fid; uint64_t offset, length; bufferlist data; + uint64_t nid; + vector overlays; void encode(bufferlist& bl) const; void decode(bufferlist::iterator& p); @@ -164,6 +166,7 @@ WRITE_CLASS_ENCODER(wal_op_t) struct wal_transaction_t { uint64_t seq; list ops; + vector shared_overlay_keys; int64_t _bytes; ///< cached byte count From 36ed3dd20ad2e7922a2ec300036601ba03841d3f Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Wed, 29 Apr 2015 13:45:52 +0800 Subject: [PATCH 400/654] os/Newstore: flush_commit return true on STATE_KV_DONE There is a racing condition here, if the flush_commit() call happened after _txc_finish_kv and before next state, the context was pushed to on_commits but no one will handle the context since we already pass _txc_finish_kv. This bug can be easily reproduce by putting a sleep(5) after _txc_finish_kv, and trigger the bug by ceph-osd -i 0 --mkfs. Fix this bug by return true directly when state >= STATE_KV_DONE(instead of > in previous code). We already persist the data in STATE_KV_DONE so it's safe for us to do this. 
Signed-off-by: Xiaoxi Chen --- src/os/newstore/NewStore.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index ce6dfac17d04b..95122a1c4662b 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -304,10 +304,10 @@ class NewStore : public ObjectStore { return true; } TransContext *txc = &q.back(); - if (txc->state > TransContext::STATE_KV_DONE) { + if (txc->state >= TransContext::STATE_KV_DONE) { return true; } - assert(txc->state <= TransContext::STATE_KV_DONE); + assert(txc->state < TransContext::STATE_KV_DONE); txc->oncommits.push_back(c); return false; } From cdc652ebbefb927ac338afbfee16f13044bffa18 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Wed, 29 Apr 2015 14:10:51 +0800 Subject: [PATCH 401/654] os/NewStore: fix the append of the later overlays when doing combination The data of the later contiguous overlays should be claim_append to 'op->data', instead of 'bl'. Signed-off-by: Zhiqiang Wang --- src/os/newstore/NewStore.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index bb946f288f476..8c7289a98111c 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -3220,12 +3220,12 @@ int NewStore::_do_write_all_overlays(TransContext *txc, bl_next_data.substr_of(bl_next, next->second.value_offset, next->second.length); - bl.claim_append(bl_next_data); + op->data.claim_append(bl_next_data); op->length += next->second.length; - op->overlays.push_back(next->second); + op->overlays.push_back(next->second); - ++prev; - ++next; + ++prev; + ++next; } else { break; } From e3abf245ba4a555b695a437fa488ca41b36c2c77 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Wed, 29 Apr 2015 13:59:16 +0800 Subject: [PATCH 402/654] os/newstore: fix deadlock when newstore_sync_transaction=true There is a deadlock issue in Newstore when newstore_sync_transaction = true. 
With sync_transaction to true, the txc state machine will go all the way down from STATE_IO_DONE to STATE_FINISHING in the same thread, while holding the osr->qlock(). The deadlock is caused in _txc_finish and _osr_reap_done, when trying to lock osr->qlock again. Since the _txc_finish can be called with(in sync transaction mode) or without (in async transaction mode) holding the qlock, so fix this by setting the qlock to PTHREAD_MUTEX_RECURSIVE, thus we can recursive acquire the qlock. Signed-off-by: Xiaoxi Chen --- src/os/newstore/NewStore.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index 95122a1c4662b..c126873e970e1 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -279,7 +279,8 @@ class NewStore : public ObjectStore { Mutex wal_apply_lock; OpSequencer() - : qlock("NewStore::OpSequencer::qlock", false, false), + //set the qlock to to PTHREAD_MUTEX_RECURSIVE mode + : qlock("NewStore::OpSequencer::qlock", true, false), parent(NULL), wal_apply_lock("NewStore::OpSequencer::wal_apply_lock") { } From 02d0ef8fe00c66d638a844220188f1f438989242 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Wed, 29 Apr 2015 14:32:25 +0800 Subject: [PATCH 403/654] os/NewStore: delay the read of all the overlays until wal applying The read of all the overlays can be delayed until applying the wal. If we are doing async wal apply, this can reduce write op latency by eliminating unnecessary reads in the write code path. 
Signed-off-by: Zhiqiang Wang --- src/os/newstore/NewStore.cc | 44 +++++++++++++++++-------------------- src/os/newstore/NewStore.h | 1 + 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 8c7289a98111c..75ba0bd93e138 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2472,6 +2472,9 @@ int NewStore::_do_wal_transaction(wal_transaction_t& wt, vector sync_fds; sync_fds.reserve(wt.ops.size()); + // read all the overlay data first for apply + _do_read_all_overlays(wt); + for (list::iterator p = wt.ops.begin(); p != wt.ops.end(); ++p) { switch (p->op) { case wal_op_t::OP_WRITE: @@ -2605,17 +2608,7 @@ int NewStore::_wal_replay() } // Get the overlay data of the WAL for replay - for (list::iterator q = wt.ops.begin(); q != wt.ops.end(); ++q) { - for (vector::iterator oit = q->overlays.begin(); - oit != q->overlays.end(); ++oit) { - string key; - get_overlay_key(q->nid, oit->key, &key); - bufferlist bl, bl_data; - db->get(PREFIX_OVERLAY, key, &bl); - bl_data.substr_of(bl, oit->value_offset, oit->length); - q->data.claim_append(bl_data); - } - } + _do_read_all_overlays(wt); dout(20) << __func__ << " replay " << it->key() << dendl; int r = _do_wal_transaction(wt, NULL); // don't bother with aio here if (r < 0) @@ -3191,10 +3184,6 @@ int NewStore::_do_write_all_overlays(TransContext *txc, p != o->onode.overlay_map.end(); ) { dout(10) << __func__ << " overlay " << p->first << "~" << p->second << dendl; - string key; - get_overlay_key(o->onode.nid, p->second.key, &key); - bufferlist bl; - db->get(PREFIX_OVERLAY, key, &bl); wal_op_t *op = _get_wal_op(txc); op->op = wal_op_t::OP_WRITE; @@ -3204,7 +3193,6 @@ int NewStore::_do_write_all_overlays(TransContext *txc, // The overlays will be removed from the db after applying the WAL op->nid = o->onode.nid; op->overlays.push_back(p->second); - op->data.substr_of(bl, p->second.value_offset, p->second.length); // Combine with later 
overlays if contiguous map::iterator prev = p, next = p; @@ -3213,14 +3201,6 @@ int NewStore::_do_write_all_overlays(TransContext *txc, if (prev->first + prev->second.length == next->first) { dout(10) << __func__ << " combining overlay " << next->first << "~" << next->second << dendl; - string key_next; - get_overlay_key(o->onode.nid, next->second.key, &key_next); - bufferlist bl_next, bl_next_data; - db->get(PREFIX_OVERLAY, key_next, &bl_next); - - bl_next_data.substr_of(bl_next, next->second.value_offset, - next->second.length); - op->data.claim_append(bl_next_data); op->length += next->second.length; op->overlays.push_back(next->second); @@ -3250,6 +3230,22 @@ int NewStore::_do_write_all_overlays(TransContext *txc, return 0; } +void NewStore::_do_read_all_overlays(wal_transaction_t& wt) +{ + for (list::iterator p = wt.ops.begin(); p != wt.ops.end(); ++p) { + for (vector::iterator q = p->overlays.begin(); + q != p->overlays.end(); ++q) { + string key; + get_overlay_key(p->nid, q->key, &key); + bufferlist bl, bl_data; + db->get(PREFIX_OVERLAY, key, &bl); + bl_data.substr_of(bl, q->value_offset, q->length); + p->data.claim_append(bl_data); + } + } + return; +} + int NewStore::_do_write(TransContext *txc, OnodeRef o, uint64_t offset, uint64_t length, diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index c126873e970e1..c4cdc2bd3f2e8 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -741,6 +741,7 @@ class NewStore : public ObjectStore { const bufferlist& bl); int _do_write_all_overlays(TransContext *txc, OnodeRef o); + void _do_read_all_overlays(wal_transaction_t& wt); int _do_write(TransContext *txc, OnodeRef o, uint64_t offset, uint64_t length, From 08f3efb47492baa0dc765080a208cde7d85f972c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 23 Apr 2015 16:10:32 -0700 Subject: [PATCH 404/654] Revert "os/NewStore: data_map shouldn't be empty when writing all overlays" This reverts commit 
0d9cce462fec61f754ddcd17cf9a3cf69581d7c5. We may want to write an overlay if the object is new and the write is small to defer the cost of the fsync. --- src/os/newstore/NewStore.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 75ba0bd93e138..33c322a2c2360 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -3175,6 +3175,20 @@ int NewStore::_do_write_all_overlays(TransContext *txc, if (o->onode.overlay_map.empty()) return 0; + // overwrite to new fid + if (o->onode.data_map.empty()) { + // create + fragment_t &f = o->onode.data_map[0]; + f.offset = 0; + f.length = o->onode.size; + int fd = _create_fid(txc, &f.fid, O_RDWR); + if (fd < 0) { + return fd; + } + VOID_TEMP_FAILURE_RETRY(::close(fd)); + dout(20) << __func__ << " create " << f.fid << dendl; + } + assert(o->onode.data_map.size() == 1); fragment_t& f = o->onode.data_map.begin()->second; assert(f.offset == 0); From 668c2777158bf08888c67921210a796a80686e56 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 28 Apr 2015 16:11:05 -0700 Subject: [PATCH 405/654] rocksdb: fallocate_with_keep_size = false This improves my 4k random writes on hdd by about 25%. Signed-off-by: Sage Weil --- src/rocksdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rocksdb b/src/rocksdb index 6ca7befb55767..e0ab03a46bed9 160000 --- a/src/rocksdb +++ b/src/rocksdb @@ -1 +1 @@ -Subproject commit 6ca7befb55767784a447a5daddd09e387ec92bd3 +Subproject commit e0ab03a46bed911ec7b8d8506b2c62322d128b49 From e89b2474b7cda97ee83819f9ed834215ba3ca7b0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 29 Apr 2015 11:52:55 -0700 Subject: [PATCH 406/654] os/newstore: avoid sync append for small ios An append is expensive in terms of latency (write, fdatasync, kv commit), while a wal write is just the kv commit and the write and fdatasync are async. For small IOs doing the wal may improve performance.
Signed-off-by: Sage Weil --- src/common/config_opts.h | 1 + src/os/newstore/NewStore.cc | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index b83fef4f83023..6463595e9b7e0 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -803,6 +803,7 @@ OPTION(newstore_fsync_thread_suicide_timeout, OPT_INT, 120) // suicide timeout v OPTION(newstore_wal_threads, OPT_INT, 4) OPTION(newstore_wal_thread_timeout, OPT_INT, 30) OPTION(newstore_wal_thread_suicide_timeout, OPT_INT, 120) +OPTION(newstore_sync_append_min, OPT_INT, 65536) // only do sync append for large ios OPTION(newstore_max_ops, OPT_U64, 512) OPTION(newstore_max_bytes, OPT_U64, 64*1024*1024) OPTION(newstore_wal_max_ops, OPT_U64, 512) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 33c322a2c2360..aac07485497c0 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -3320,8 +3320,9 @@ int NewStore::_do_write(TransContext *txc, } } - if (o->onode.size <= offset || - o->onode.size == 0 || + if (((o->onode.size <= offset || o->onode.size == 0) && + length >= g_conf->newstore_sync_append_min) || + o->onode.data_map.empty()) { uint64_t x_offset; if (o->onode.data_map.empty()) { From 4c1552001a2e12bfdcf8630292cb07da494dcb31 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 29 Apr 2015 13:57:40 -0700 Subject: [PATCH 407/654] Revert "os/newstore: avoid sync append for small ios" This reverts commit 69baab2f7eaca7688ce1d45802a82fc3539cd906. This is slower. 
:( --- src/common/config_opts.h | 1 - src/os/newstore/NewStore.cc | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 6463595e9b7e0..b83fef4f83023 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -803,7 +803,6 @@ OPTION(newstore_fsync_thread_suicide_timeout, OPT_INT, 120) // suicide timeout v OPTION(newstore_wal_threads, OPT_INT, 4) OPTION(newstore_wal_thread_timeout, OPT_INT, 30) OPTION(newstore_wal_thread_suicide_timeout, OPT_INT, 120) -OPTION(newstore_sync_append_min, OPT_INT, 65536) // only do sync append for large ios OPTION(newstore_max_ops, OPT_U64, 512) OPTION(newstore_max_bytes, OPT_U64, 64*1024*1024) OPTION(newstore_wal_max_ops, OPT_U64, 512) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index aac07485497c0..33c322a2c2360 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -3320,9 +3320,8 @@ int NewStore::_do_write(TransContext *txc, } } - if (((o->onode.size <= offset || o->onode.size == 0) && - length >= g_conf->newstore_sync_append_min) || - + if (o->onode.size <= offset || + o->onode.size == 0 || o->onode.data_map.empty()) { uint64_t x_offset; if (o->onode.data_map.empty()) { From 90e7f5e6484d13f55ddc0a393705b9454a5325ab Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 29 Apr 2015 14:51:00 -0700 Subject: [PATCH 408/654] os/newstore: only ftruncate if i_size is incorrect Even a no-op ftruncate can block in the kernel. Prior to this change I could frequently see ftruncate wait for an aio completion on the same file. 
Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 27 +++++++++++++++++++++------ src/os/newstore/NewStore.h | 1 + 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 33c322a2c2360..de2f893a628cd 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -3346,7 +3346,10 @@ int NewStore::_do_write(TransContext *txc, r = fd; goto out; } - ::ftruncate(fd, f.length); // in case there is trailing crap + r = _clean_fid_tail_fd(f, fd); // in case there is trailing crap + if (r < 0) { + goto out; + } f.length = (offset + length) - f.offset; x_offset = offset - f.offset; dout(20) << __func__ << " append " << f.fid << " writing " @@ -3458,12 +3461,8 @@ int NewStore::_do_write(TransContext *txc, return r; } -int NewStore::_clean_fid_tail(TransContext *txc, const fragment_t& f) +int NewStore::_clean_fid_tail_fd(const fragment_t& f, int fd) { - int fd = _open_fid(f.fid, O_RDWR); - if (fd < 0) { - return fd; - } struct stat st; int r = ::fstat(fd, &st); if (r < 0) { @@ -3481,6 +3480,22 @@ int NewStore::_clean_fid_tail(TransContext *txc, const fragment_t& f) << cpp_strerror(r) << dendl; return r; } + return 1; + } + return 0; +} + +int NewStore::_clean_fid_tail(TransContext *txc, const fragment_t& f) +{ + int fd = _open_fid(f.fid, O_RDWR); + if (fd < 0) { + return fd; + } + int r = _clean_fid_tail_fd(f, fd); + if (r < 0) { + return r; + } + if (r > 0) { txc->sync_fd(fd); } else { // all good! 
diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index c4cdc2bd3f2e8..6d6a28dabfe80 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -544,6 +544,7 @@ class NewStore : public ObjectStore { int _recover_next_nid(); void _assign_nid(TransContext *txc, OnodeRef o); + int _clean_fid_tail_fd(const fragment_t& f, int fd); int _clean_fid_tail(TransContext *txc, const fragment_t& f); TransContext *_txc_create(OpSequencer *osr); From 9c2eb2858950a2189f73717e31a978a914faca2a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 29 Apr 2015 15:00:46 -0700 Subject: [PATCH 409/654] os/newstore: clean up kv commit debug output Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index de2f893a628cd..566e778a8614b 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2372,13 +2372,16 @@ void NewStore::_kv_sync_thread() kv_cond.Wait(kv_lock); dout(20) << __func__ << " wake" << dendl; } else { - dout(20) << __func__ << " committing " << kv_queue.size() << " cleaning " << wal_cleanup_queue.size() << dendl; + dout(20) << __func__ << " committing " << kv_queue.size() + << " cleaning " << wal_cleanup_queue.size() << dendl; kv_committing.swap(kv_queue); wal_cleaning.swap(wal_cleanup_queue); utime_t start = ceph_clock_now(NULL); kv_lock.Unlock(); + + // one transaction to force a sync. clean up wal keys while we + // are at it. 
KeyValueDB::Transaction txc_cleanup_sync = db->get_transaction(); - //adding wal cleanup op for (std::deque::iterator it = wal_cleaning.begin(); it != wal_cleaning.end(); it++) { @@ -2404,11 +2407,11 @@ void NewStore::_kv_sync_thread() get_wal_key(wt.seq, &key); txc_cleanup_sync->rmkey(PREFIX_WAL, key); } - db->submit_transaction_sync(txc_cleanup_sync); utime_t finish = ceph_clock_now(NULL); utime_t dur = finish - start; - dout(20) << __func__ << " committed " << kv_committing.size() << "cleaned " << wal_cleaning.size() + dout(20) << __func__ << " committed " << kv_committing.size() + << " cleaned " << wal_cleaning.size() << " in " << dur << dendl; while (!kv_committing.empty()) { TransContext *txc = kv_committing.front(); From 22a6a9f7681bbbafd46c36623aea8e467bb7d593 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 1 May 2015 17:21:23 -0700 Subject: [PATCH 410/654] os/newstore: process multiple aio completions at a time This isn't affecting things for a slow disk, but it will matter for faster backends. 
Signed-off-by: Sage Weil --- src/os/fs/FS.h | 10 ++++++---- src/os/newstore/NewStore.cc | 27 ++++++++++++++++----------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/os/fs/FS.h b/src/os/fs/FS.h index 4b52732422677..3ab5d67f39aaf 100644 --- a/src/os/fs/FS.h +++ b/src/os/fs/FS.h @@ -109,8 +109,8 @@ class FS { return 0; } - int get_next_completed(int timeout_ms, aio_t **paio) { - io_event event[1]; + int get_next_completed(int timeout_ms, aio_t **paio, int max) { + io_event event[max]; struct timespec t = { timeout_ms / 1000, (timeout_ms % 1000) * 1000 * 1000 @@ -119,8 +119,10 @@ class FS { if (r <= 0) { return r; } - *paio = (aio_t *)event[0].obj; - return 1; + for (int i=0; inewstore_aio_poll_ms, &aio); + int max = 16; + FS::aio_t *aio[max]; + int r = aio_queue.get_next_completed(g_conf->newstore_aio_poll_ms, + aio, max); if (r < 0) { derr << __func__ << " got " << cpp_strerror(r) << dendl; } - if (r == 1) { - TransContext *txc = static_cast(aio->priv); - int left = txc->num_aio.dec(); - dout(10) << __func__ << " finished aio " << aio << " txc " << txc - << " state " << txc->get_state_name() << ", " - << left << " aios left" << dendl; - VOID_TEMP_FAILURE_RETRY(::close(aio->fd)); - if (left == 0) { - _txc_state_proc(txc); + if (r > 0) { + dout(30) << __func__ << " got " << r << " completed aios" << dendl; + for (int i = 0; i < r; ++i) { + TransContext *txc = static_cast(aio[i]->priv); + int left = txc->num_aio.dec(); + dout(10) << __func__ << " finished aio " << aio[i] << " txc " << txc + << " state " << txc->get_state_name() << ", " + << left << " aios left" << dendl; + VOID_TEMP_FAILURE_RETRY(::close(aio[i]->fd)); + if (left == 0) { + _txc_state_proc(txc); + } } } } From 92979d750b6021726b9364a7a7c0925002009f76 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 1 May 2015 17:22:57 -0700 Subject: [PATCH 411/654] os/newstore: queue kv transactions in kv_sync_thread It appears that db->submit_transaction() will block if there is a sync commit 
that is in progress instead of simply queueing the new txn for later. To work around this, submit these to the backend in the kv_sync_thread prior to the synchronous submit_transaction_sync(). Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 044f1ebe39176..5a5f730ad8605 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2084,7 +2084,6 @@ void NewStore::_txc_state_proc(TransContext *txc) txc->state = TransContext::STATE_KV_QUEUED; if (!g_conf->newstore_sync_transaction) { Mutex::Locker l(kv_lock); - db->submit_transaction(txc->t); kv_queue.push_back(txc); kv_cond.SignalOne(); return; @@ -2384,6 +2383,12 @@ void NewStore::_kv_sync_thread() utime_t start = ceph_clock_now(NULL); kv_lock.Unlock(); + for (std::deque::iterator it = kv_committing.begin(); + it != kv_committing.end(); + it++) { + db->submit_transaction((*it)->t); + } + // one transaction to force a sync. clean up wal keys while we // are at it. 
KeyValueDB::Transaction txc_cleanup_sync = db->get_transaction(); From 35821d3aa9d53db72b7c90c821491866b60826ef Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 2 May 2015 16:29:24 -0700 Subject: [PATCH 412/654] os/newstore: renamed TransContext::fds -> sync_items Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 10 +++++----- src/os/newstore/NewStore.h | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 5a5f730ad8605..26a93722ba1ee 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2068,7 +2068,7 @@ void NewStore::_txc_state_proc(TransContext *txc) // ** fall-thru ** case TransContext::STATE_AIO_WAIT: - if (!txc->fds.empty()) { + if (!txc->sync_items.empty()) { txc->state = TransContext::STATE_FSYNC_WAIT; if (!g_conf->newstore_sync_io) { _txc_queue_fsync(txc); @@ -2222,8 +2222,8 @@ void NewStore::_txc_queue_fsync(TransContext *txc) { dout(20) << __func__ << " txc " << txc << dendl; fsync_wq.lock(); - for (list::iterator p = txc->fds.begin(); - p != txc->fds.end(); + for (list::iterator p = txc->sync_items.begin(); + p != txc->sync_items.end(); ++p) { fsync_wq._enqueue(&*p); fsync_wq._wake(); @@ -2234,8 +2234,8 @@ void NewStore::_txc_queue_fsync(TransContext *txc) void NewStore::_txc_do_sync_fsync(TransContext *txc) { dout(20) << __func__ << " txc " << txc << dendl; - for (list::iterator p = txc->fds.begin(); - p != txc->fds.end(); ++p) { + for (list::iterator p = txc->sync_items.begin(); + p != txc->sync_items.end(); ++p) { dout(30) << __func__ << " fsync " << p->fd << dendl; int r = ::fdatasync(p->fd); if (r < 0) { diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index 6d6a28dabfe80..a399bc0462375 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -187,7 +187,7 @@ class NewStore : public ObjectStore { uint64_t ops, bytes; - list fds; ///< these fds need to be synced + list sync_items; ///< 
these fds need to be synced set onodes; ///< these onodes need to be updated/written KeyValueDB::Transaction t; ///< then we will commit this Context *oncommit; ///< signal on commit @@ -230,7 +230,7 @@ class NewStore : public ObjectStore { } void sync_fd(int f) { - fds.push_back(fsync_item(f, this)); + sync_items.push_back(fsync_item(f, this)); } void write_onode(OnodeRef &o) { onodes.insert(o); @@ -239,7 +239,7 @@ class NewStore : public ObjectStore { bool finish_fsync() { Mutex::Locker l(lock); ++num_fsyncs_completed; - if (num_fsyncs_completed == fds.size()) { + if (num_fsyncs_completed == sync_items.size()) { cond.Signal(); return true; } @@ -247,7 +247,7 @@ class NewStore : public ObjectStore { } void wait_fsync() { Mutex::Locker l(lock); - while (num_fsyncs_completed < fds.size()) + while (num_fsyncs_completed < sync_items.size()) cond.Wait(lock); } }; From 8ad6b9dfc3d9b0c8d2003cf6b5752989faecae2e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 4 May 2015 11:05:27 -0700 Subject: [PATCH 413/654] os/newstore: make sync/async submit_transaction optional It seems doing this synchronously may be better for SSDs? 
Signed-off-by: Sage Weil --- src/common/config_opts.h | 1 + src/os/newstore/NewStore.cc | 13 +++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index b83fef4f83023..efb0d66207349 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -796,6 +796,7 @@ OPTION(newstore_backend, OPT_STR, "rocksdb") OPTION(newstore_fail_eio, OPT_BOOL, true) OPTION(newstore_sync_io, OPT_BOOL, false) // perform initial io synchronously OPTION(newstore_sync_transaction, OPT_BOOL, false) // perform kv txn synchronously +OPTION(newstore_sync_submit_transaction, OPT_BOOL, false) OPTION(newstore_sync_wal_apply, OPT_BOOL, true) // perform initial wal work synchronously (possibly in combination with aio so we only *queue* ios) OPTION(newstore_fsync_threads, OPT_INT, 16) // num threads calling fsync OPTION(newstore_fsync_thread_timeout, OPT_INT, 30) // thread timeout value diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 26a93722ba1ee..7667bd266faba 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2084,6 +2084,9 @@ void NewStore::_txc_state_proc(TransContext *txc) txc->state = TransContext::STATE_KV_QUEUED; if (!g_conf->newstore_sync_transaction) { Mutex::Locker l(kv_lock); + if (g_conf->newstore_sync_submit_transaction) { + db->submit_transaction(txc->t); + } kv_queue.push_back(txc); kv_cond.SignalOne(); return; @@ -2383,10 +2386,12 @@ void NewStore::_kv_sync_thread() utime_t start = ceph_clock_now(NULL); kv_lock.Unlock(); - for (std::deque::iterator it = kv_committing.begin(); - it != kv_committing.end(); - it++) { - db->submit_transaction((*it)->t); + if (!g_conf->newstore_sync_submit_transaction) { + for (std::deque::iterator it = kv_committing.begin(); + it != kv_committing.end(); + it++) { + db->submit_transaction((*it)->t); + } } // one transaction to force a sync. 
clean up wal keys while we From 404cdd286dd9738148eec7f94c9dfac36672ab8b Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Thu, 7 May 2015 15:41:20 +0800 Subject: [PATCH 414/654] os/newstore: Implement fiemap For simplicity we ignore holes inside a fragment now. Signed-off-by: Xiaoxi Chen --- src/os/newstore/NewStore.cc | 98 ++++++++++++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 2 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 7667bd266faba..1b3790f9ca200 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -1360,7 +1360,6 @@ int NewStore::_do_read( return r; } - int NewStore::fiemap( coll_t cid, const ghobject_t& oid, @@ -1368,7 +1367,102 @@ int NewStore::fiemap( size_t len, bufferlist& bl) { - assert(0); + map m; + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + return -ENOENT; + } + + if (offset == len && offset == 0) + len = o->onode.size; + + if (offset > o->onode.size) + return 0; + + if (offset + len > o->onode.size) { + len = o->onode.size - offset; + } + + dout(20) << __func__ << " " << offset << "~" << len << " size " + << o->onode.size << dendl; + + map::iterator fp, fend; + map::iterator op, oend; + + // loop over overlays and data fragments. overlays take precedence. + fend = o->onode.data_map.end(); + fp = o->onode.data_map.lower_bound(offset); + if (fp != o->onode.data_map.begin()) { + --fp; + } + oend = o->onode.overlay_map.end(); + op = o->onode.overlay_map.lower_bound(offset); + if (op != o->onode.overlay_map.begin()) { + --op; + } + uint64_t start = offset; + while (len > 0) { + if (op != oend && op->first + op->second.length < offset) { + ++op; + continue; + } + if (fp != fend && fp->first + fp->second.length <= offset) { + ++fp; + continue; + } + + // overlay?
+ if (op != oend && op->first <= offset) { + uint64_t x_len = MIN(op->first + op->second.length - offset, len); + //m[offset] = x_len; + dout(30) << __func__ << " get overlay, off = " << offset << " len=" << x_len << dendl; + len -= x_len; + offset += x_len; + ++op; + continue; + } + + unsigned x_len = len; + if (op != oend && + op->first > offset && + op->first - offset < x_len) { + x_len = op->first - offset; + } + + // frag? + if (fp != fend && fp->first <= offset) { + uint64_t x_off = offset - fp->first - fp->second.offset; + x_len = MIN(x_len, fp->second.length - x_off); + //m[offset] = x_len; + dout(30) << __func__ << " get frag, off = " << offset << " len=" << x_len << dendl; + len -= x_len; + offset += x_len; + if (x_off + x_len == fp->second.length) + ++fp; + continue; + } + // we are seeing a hole, time to add an entry to fiemap. + m[start] = offset - start; + dout(20) << __func__ << " get fiemap entry, off = " << start << " len=" << m[start] << dendl; + offset += x_len; + start = offset; + len -= x_len; + continue; + } + //add tailing + if (offset - start != 0) { + m[start] = offset - start; + dout(20) << __func__ << " get fiemap entry, off = " << start << " len=" << m[start] << dendl; + } + + ::encode(m, bl); + dout(20) << __func__ << " " << offset << "~" << len << " size = 0 (" << m << ")" << dendl; + return 0; } int NewStore::getattr( From a1f0bdb0bd2d9b862456cba0900822f1d3c8dc97 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Aug 2015 14:57:47 -0400 Subject: [PATCH 415/654] os/newstore: fix collection range for temp objects Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 1b3790f9ca200..a198d2ae103f2 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -154,7 +154,7 @@ static void get_coll_key_range(const coll_t& cid, int bits, (unsigned)hobject_t::_reverse_bits(pgid.ps())); 
start->append(buf); snprintf(buf, sizeof(buf), ".%016llx.%08x.", - (unsigned long long)((-1ll - pgid.pool()) + 0x8000000000000000ull), + (unsigned long long)((-2ll - pgid.pool()) + 0x8000000000000000ull), (unsigned)hobject_t::_reverse_bits(pgid.ps())); temp_start->append(buf); @@ -165,7 +165,7 @@ static void get_coll_key_range(const coll_t& cid, int bits, (unsigned long long)(pgid.pool() + 0x8000000000000000ull)); end->append(buf); snprintf(buf, sizeof(buf), ".%016llx.gggggggg.", - (unsigned long long)((-1ll - pgid.pool()) + 0x8000000000000000ull)); + (unsigned long long)((-2ll - pgid.pool()) + 0x8000000000000000ull)); temp_end->append(buf); } else { snprintf(buf, sizeof(buf), ".%016llx.%08x.", @@ -173,7 +173,7 @@ static void get_coll_key_range(const coll_t& cid, int bits, (unsigned)end_hash); end->append(buf); snprintf(buf, sizeof(buf), ".%016llx.%08x.", - (unsigned long long)((-1ll - pgid.pool()) + 0x8000000000000000ull), + (unsigned long long)((-2ll - pgid.pool()) + 0x8000000000000000ull), (unsigned)end_hash); temp_end->append(buf); } From 15382c50d87e5282a0133665bf7687ec2c8e2e0a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Aug 2015 15:08:55 -0400 Subject: [PATCH 416/654] os/newstore: tolerate null pnext to collection_list() Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index a198d2ae103f2..a71302767a9c8 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -1574,6 +1574,10 @@ int NewStore::collection_list( const char *pend; bool temp; + ghobject_t static_next; + if (!pnext) + pnext = &static_next; + if (start == ghobject_t::get_max()) goto out; get_coll_key_range(cid, c->cnode.bits, &temp_start_key, &temp_end_key, From 79799ca10930bbd8c19a904f8d41cd813d6098b5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Aug 2015 15:09:21 -0400 Subject: [PATCH 417/654] os/newstore: trim overlay when zeroing extent 
Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index a71302767a9c8..f0ac5f99bc81b 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -3203,6 +3203,7 @@ int NewStore::_do_overlay_trim(TransContext *txc, { dout(10) << __func__ << " " << o->oid << " " << offset << "~" << length << dendl; + int changed = 0; map::iterator p = o->onode.overlay_map.lower_bound(offset); @@ -3231,6 +3232,7 @@ int NewStore::_do_overlay_trim(TransContext *txc, txc->t->rmkey(PREFIX_OVERLAY, key); } o->onode.overlay_map.erase(p++); + ++changed; continue; } if (p->first >= offset) { @@ -3241,6 +3243,7 @@ int NewStore::_do_overlay_trim(TransContext *txc, ov.value_offset += by; ov.length -= by; o->onode.overlay_map.erase(p++); + ++changed; continue; } if (p->first < offset && @@ -3249,6 +3252,7 @@ int NewStore::_do_overlay_trim(TransContext *txc, << dendl; p->second.length = offset - p->first; ++p; + ++changed; continue; } dout(20) << __func__ << " split " << p->first << " " << p->second @@ -3262,8 +3266,9 @@ int NewStore::_do_overlay_trim(TransContext *txc, nov.length -= by; o->onode.shared_overlays.insert(p->second.key); ++p; + ++changed; } - return 0; + return changed; } int NewStore::_do_overlay_write(TransContext *txc, @@ -3657,6 +3662,10 @@ int NewStore::_zero(TransContext *txc, OnodeRef o = c->get_onode(oid, true); _assign_nid(txc, o); + // overlay + if (_do_overlay_trim(txc, o, offset, length) > 0) + txc->write_onode(o); + if (o->onode.data_map.empty()) { // we're already a big hole if (offset + length > o->onode.size) { From 094a190fd75b0bf9ec53b06da9b752dcd95c7e33 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Aug 2015 15:33:39 -0400 Subject: [PATCH 418/654] os/newstore: change escaping chars # is lowest besides space and !, except for " (which would be too confusing). 
Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index f0ac5f99bc81b..9f5adc244e106 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -91,10 +91,10 @@ static void append_escaped(const string &in, string *out) { char hexbyte[8]; for (string::const_iterator i = in.begin(); i != in.end(); ++i) { - if (*i <= '%') { - snprintf(hexbyte, sizeof(hexbyte), "%%%02x", (unsigned)*i); + if (*i <= '#') { + snprintf(hexbyte, sizeof(hexbyte), "#%02x", (unsigned)*i); out->append(hexbyte); - } else if (*i >= 126) { + } else if (*i >= '~') { snprintf(hexbyte, sizeof(hexbyte), "~%02x", (unsigned)*i); out->append(hexbyte); } else { @@ -107,7 +107,7 @@ static int decode_escaped(const char *p, string *out) { const char *orig_p = p; while (*p && *p != '!') { - if (*p == '%' || *p == '~') { + if (*p == '#' || *p == '~') { unsigned hex; int r = sscanf(++p, "%2x", &hex); if (r < 1) From faca5d0044f9b142cc362fd3978fe9afb982163f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Aug 2015 17:22:32 -0400 Subject: [PATCH 419/654] os/newstore: add 'newstore backend options' to pass options to e.g. 
rocksdb Signed-off-by: Sage Weil --- src/common/config_opts.h | 1 + src/os/newstore/NewStore.cc | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index efb0d66207349..f8bbdfbc0a565 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -793,6 +793,7 @@ OPTION(memstore_page_size, OPT_U64, 64 << 10) OPTION(newstore_max_dir_size, OPT_U32, 1000000) OPTION(newstore_onode_map_size, OPT_U32, 1024) // onodes per collection OPTION(newstore_backend, OPT_STR, "rocksdb") +OPTION(newstore_backend_options, OPT_STR, "") OPTION(newstore_fail_eio, OPT_BOOL, true) OPTION(newstore_sync_io, OPT_BOOL, false) // perform initial io synchronously OPTION(newstore_sync_transaction, OPT_BOOL, false) // perform kv txn synchronously diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 9f5adc244e106..002d4e73bd413 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -828,7 +828,7 @@ int NewStore::_open_db() db = NULL; return -EIO; } - db->init(); + db->init(g_conf->newstore_backend_options); stringstream err; if (db->create_and_open(err)) { derr << __func__ << " erroring opening db: " << err.str() << dendl; @@ -836,6 +836,9 @@ int NewStore::_open_db() db = NULL; return -EIO; } + dout(1) << __func__ << " opened " << g_conf->newstore_backend + << " path " << path + << " options " << g_conf->newstore_backend_options << dendl; return 0; } From c37b06d0fbd7f17b12f927a30bb026345e6158b8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 22 Aug 2015 10:33:40 -0400 Subject: [PATCH 420/654] os/newstore: flush object before doing omap reads Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 002d4e73bd413..78764cf1fab11 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -31,6 +31,7 @@ TODO: + * collection_list must 
flush pending db work * multiple fragments per object (with configurable size.. maybe 1 or 2 mb default?) * read path should be totally generic (handle any fragment pattern) * write path should ideally tolerate any fragment pattern, but only generate a fixed layout (since the tunable may be changed over time). @@ -1766,6 +1767,7 @@ int NewStore::omap_get( } if (!o->onode.omap_head) goto out; + o->flush(); { KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); string head, tail; @@ -1814,6 +1816,7 @@ int NewStore::omap_get_header( } if (!o->onode.omap_head) goto out; + o->flush(); { string head; get_omap_header(o->onode.omap_head, &head); @@ -1847,6 +1850,7 @@ int NewStore::omap_get_keys( } if (!o->onode.omap_head) goto out; + o->flush(); { KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); string head, tail; @@ -1896,6 +1900,7 @@ int NewStore::omap_get_values( } if (!o->onode.omap_head) goto out; + o->flush(); for (set::const_iterator p = keys.begin(); p != keys.end(); ++p) { string key; get_omap_key(o->onode.omap_head, *p, &key); @@ -1930,6 +1935,7 @@ int NewStore::omap_check_keys( } if (!o->onode.omap_head) goto out; + o->flush(); for (set::const_iterator p = keys.begin(); p != keys.end(); ++p) { string key; get_omap_key(o->onode.omap_head, *p, &key); @@ -1964,6 +1970,7 @@ ObjectMap::ObjectMapIterator NewStore::get_omap_iterator( dout(10) << __func__ << " " << oid << "doesn't exist" <flush(); dout(10) << __func__ << " header = " << o->onode.omap_head <get_iterator(PREFIX_OMAP); return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it)); From f3ddb75e3ea06226f094e55358cbe93c8ce4a090 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 24 Aug 2015 17:59:34 -0400 Subject: [PATCH 421/654] os/newstore: fix end bound on collection_list Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 
78764cf1fab11..8ca130424cce2 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -1574,7 +1574,6 @@ int NewStore::collection_list( string temp_start_key, temp_end_key; string start_key, end_key; bool set_next = false; - string end_str; const char *pend; bool temp; @@ -1608,16 +1607,17 @@ int NewStore::collection_list( if (end.hobj.is_max()) { pend = temp ? temp_end_key.c_str() : end_key.c_str(); } else { - get_object_key(end, &end_str); + get_object_key(end, &end_key); if (end.hobj.is_temp()) { if (temp) - pend = end_str.c_str(); + pend = end_key.c_str(); else goto out; } else { - pend = temp ? temp_end_key.c_str() : end_str.c_str(); + pend = temp ? temp_end_key.c_str() : end_key.c_str(); } } + dout(30) << __func__ << " pend " << pend << dendl; while (true) { if (!it->valid() || strcmp(it->key().c_str(), pend) > 0) { if (!it->valid()) @@ -1632,6 +1632,7 @@ int NewStore::collection_list( temp = false; it->upper_bound(start_key); pend = end_key.c_str(); + dout(30) << __func__ << " pend " << pend << dendl; continue; } break; From 522f8509ad17fcabd90e31cb1a9b670f2925ccb0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 26 Aug 2015 13:55:45 -0400 Subject: [PATCH 422/654] ceph_test_keyvaluedb: some simple KeyValueDB unit tests Signed-off-by: Sage Weil --- src/test/CMakeLists.txt | 17 ++++ src/test/Makefile-server.am | 5 ++ src/test/objectstore/test_kv.cc | 148 ++++++++++++++++++++++++++++++++ 3 files changed, 170 insertions(+) create mode 100644 src/test/objectstore/test_kv.cc diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 3ffbc29f21171..6a5f10cc0b879 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -1450,6 +1450,23 @@ target_link_libraries(test_objectstore ${CMAKE_DL_LIBS} ) +add_executable(test_keyvaluedb + objectstore/test_kv.cc + $ + ) +set_target_properties(test_keyvaluedb PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) +target_link_libraries(test_keyvaluedb + os + common + ${UNITTEST_LIBS} + 
global + ${EXTRALIBS} + ${BLKID_LIBRARIES} + ${TCMALLOC_LIBS} + ${CMAKE_DL_LIBS} + ) + add_executable(test_objectstore_workloadgen objectstore/workload_generator.cc objectstore/TestObjectStoreState.cc diff --git a/src/test/Makefile-server.am b/src/test/Makefile-server.am index bb3ce82fe7ee5..8bf8cfcb19e70 100644 --- a/src/test/Makefile-server.am +++ b/src/test/Makefile-server.am @@ -55,6 +55,11 @@ ceph_test_objectstore_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL) ceph_test_objectstore_CXXFLAGS = $(UNITTEST_CXXFLAGS) bin_DEBUGPROGRAMS += ceph_test_objectstore +ceph_test_keyvaluedb_SOURCES = test/objectstore/test_kv.cc +ceph_test_keyvaluedb_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL) +ceph_test_keyvaluedb_CXXFLAGS = $(UNITTEST_CXXFLAGS) +bin_DEBUGPROGRAMS += ceph_test_keyvaluedb + ceph_test_filestore_SOURCES = test/filestore/TestFileStore.cc ceph_test_filestore_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL) ceph_test_filestore_CXXFLAGS = $(UNITTEST_CXXFLAGS) diff --git a/src/test/objectstore/test_kv.cc b/src/test/objectstore/test_kv.cc new file mode 100644 index 0000000000000..61007e3bf33d6 --- /dev/null +++ b/src/test/objectstore/test_kv.cc @@ -0,0 +1,148 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include +#include +#include +#include +#include +#include "os/KeyValueDB.h" +#include "include/Context.h" +#include "common/ceph_argparse.h" +#include "global/global_init.h" +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/errno.h" +#include "include/stringify.h" +#include + +#if GTEST_HAS_PARAM_TEST + +class KVTest : public ::testing::TestWithParam { +public: + boost::scoped_ptr db; + + KVTest() : db(0) {} + + void init() { + db.reset(KeyValueDB::create(g_ceph_context, string(GetParam()), + string("kv_test_temp_dir"))); + } + void fini() { + db.reset(NULL); + } + + virtual void SetUp() { + int r = ::mkdir("kv_test_temp_dir", 0777); + if (r < 0 && errno != EEXIST) { + r = -errno; + cerr << __func__ << ": unable to create kv_test_temp_dir" + << ": " << cpp_strerror(r) << std::endl; + return; + } + init(); + } + virtual void TearDown() { + fini(); + } +}; + +TEST_P(KVTest, OpenClose) { + ASSERT_EQ(0, db->create_and_open(cout)); + fini(); +} + +TEST_P(KVTest, OpenCloseReopenClose) { + ASSERT_EQ(0, db->create_and_open(cout)); + fini(); + init(); + ASSERT_EQ(0, db->open(cout)); + fini(); +} + +TEST_P(KVTest, PutReopen) { + ASSERT_EQ(0, db->create_and_open(cout)); + { + KeyValueDB::Transaction t = db->get_transaction(); + bufferlist value; + value.append("value"); + t->set("prefix", "key", value); + t->set("prefix", "key2", value); + t->set("prefix", "key3", value); + db->submit_transaction_sync(t); + } + fini(); + + init(); + ASSERT_EQ(0, db->open(cout)); + { + bufferlist v; + ASSERT_EQ(0, db->get("prefix", "key", &v)); + ASSERT_EQ(v.length(), 5u); + ASSERT_EQ(0, db->get("prefix", "key2", &v)); + ASSERT_EQ(v.length(), 5u); + } + { + KeyValueDB::Transaction t = db->get_transaction(); + t->rmkey("prefix", "key"); + t->rmkey("prefix", "key3"); + db->submit_transaction_sync(t); + } + fini(); + + init(); + ASSERT_EQ(0, db->open(cout)); + { + bufferlist v; + ASSERT_EQ(-ENOENT, db->get("prefix", "key", &v)); + ASSERT_EQ(0, db->get("prefix", 
"key2", &v)); + ASSERT_EQ(v.length(), 5u); + ASSERT_EQ(-ENOENT, db->get("prefix", "key3", &v)); + } + fini(); +} + + +INSTANTIATE_TEST_CASE_P( + KeyValueDB, + KVTest, + ::testing::Values("leveldb", "rocksdb")); + +#else + +// Google Test may not support value-parameterized tests with some +// compilers. If we use conditional compilation to compile out all +// code referring to the gtest_main library, MSVC linker will not link +// that library at all and consequently complain about missing entry +// point defined in that library (fatal error LNK1561: entry point +// must be defined). This dummy test keeps gtest_main linked in. +TEST(DummyTest, ValueParameterizedTestsAreNotSupportedOnThisPlatform) {} + +#endif + +int main(int argc, char **argv) { + vector args; + argv_to_vec(argc, (const char **)argv, args); + env_to_vec(args); + + global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0); + common_init_finish(g_ceph_context); + g_ceph_context->_conf->set_val( + "enable_experimental_unrecoverable_data_corrupting_features", + "rocksdb"); + g_ceph_context->_conf->apply_changes(NULL); + + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} From d6b0e53c54fc964a7a83baf272692cadda8d5b5a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 26 Aug 2015 14:52:56 -0400 Subject: [PATCH 423/654] os/RocksDBStore: fix rmkey() This took way too long to debug! 
Signed-off-by: Sage Weil --- src/os/RocksDBStore.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/os/RocksDBStore.cc b/src/os/RocksDBStore.cc index 621b0bfea105a..73789822b333d 100644 --- a/src/os/RocksDBStore.cc +++ b/src/os/RocksDBStore.cc @@ -250,8 +250,7 @@ void RocksDBStore::RocksDBTransactionImpl::set( void RocksDBStore::RocksDBTransactionImpl::rmkey(const string &prefix, const string &k) { - string key = combine_strings(prefix, k); - bat->Delete(rocksdb::Slice(k)); + bat->Delete(combine_strings(prefix, k)); } void RocksDBStore::RocksDBTransactionImpl::rmkeys_by_prefix(const string &prefix) From 0d463ffdec941bbcde7579071af060183a9cb168 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 26 Aug 2015 14:54:00 -0400 Subject: [PATCH 424/654] os/RocksDBStore: make other rmkey match No need for Slice() here; it can take a string. Signed-off-by: Sage Weil --- src/os/RocksDBStore.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/os/RocksDBStore.cc b/src/os/RocksDBStore.cc index 73789822b333d..cb3ac9143f46e 100644 --- a/src/os/RocksDBStore.cc +++ b/src/os/RocksDBStore.cc @@ -259,8 +259,7 @@ void RocksDBStore::RocksDBTransactionImpl::rmkeys_by_prefix(const string &prefix for (it->seek_to_first(); it->valid(); it->next()) { - string key = combine_strings(prefix, it->key()); - bat->Delete(key); + bat->Delete(combine_strings(prefix, it->key())); } } From 905048630645f825f899fccb6ff82ae340fd5c3f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 26 Aug 2015 14:57:28 -0400 Subject: [PATCH 425/654] rocksdb: update to 3.11.2 Signed-off-by: Sage Weil --- src/rocksdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rocksdb b/src/rocksdb index e0ab03a46bed9..f371049d895a0 160000 --- a/src/rocksdb +++ b/src/rocksdb @@ -1 +1 @@ -Subproject commit e0ab03a46bed911ec7b8d8506b2c62322d128b49 +Subproject commit f371049d895a083810746a00049688043d8cea7e From 1fa2ef23475b36535b76372ce88aea41da07ddc1 Mon Sep 17 
00:00:00 2001 From: Sage Weil Date: Tue, 1 Sep 2015 13:22:02 -0400 Subject: [PATCH 426/654] ceph_test_objectstore: enable newstore tests Signed-off-by: Sage Weil --- src/test/objectstore/store_test.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index 5f056dcfc2b2e..5055959f39174 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -2516,7 +2516,11 @@ TEST_P(StoreTest, SetAllocHint) { INSTANTIATE_TEST_CASE_P( ObjectStore, StoreTest, - ::testing::Values("memstore", "filestore", "keyvaluestore"/*, "newstore" */)); + ::testing::Values( + "memstore", + "filestore", + "keyvaluestore", + "newstore")); #else From 9d1582d71fa11de52180d4c9bd8c81b9b6a61a9b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 26 Aug 2015 15:41:50 -0400 Subject: [PATCH 427/654] ceph_test_objectstore: make OMapIterator test work with FileStore Signed-off-by: Sage Weil --- src/test/objectstore/store_test.cc | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index 5055959f39174..8e9ca11222603 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -2051,17 +2051,19 @@ TEST_P(StoreTest, OMapIterator) { t.omap_setkeys(cid, hoid, start_set); store->apply_transaction(t); } - ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(cid, hoid); + ObjectMap::ObjectMapIterator iter; bool correct; //basic iteration for (int i = 0; i < 100; i++) { if (!(i%5)) { std::cout << "On iteration " << i << std::endl; } - ObjectStore::Transaction t; bufferlist bl; - iter = store->get_omap_iterator(cid, hoid); + // FileStore may deadlock two active iterators over the same data + iter = ObjectMap::ObjectMapIterator(); + + iter = store->get_omap_iterator(cid, hoid); for (iter->seek_to_first(), count=0; iter->valid(); iter->next(), count++) { 
string key = iter->key(); bufferlist value = iter->value(); @@ -2077,6 +2079,9 @@ TEST_P(StoreTest, OMapIterator) { } ASSERT_EQ(attrs.size(), count); + // FileStore may deadlock an active iterator vs apply_transaction + iter = ObjectMap::ObjectMapIterator(); + char buf[100]; snprintf(buf, sizeof(buf), "%d", i); bl.clear(); @@ -2085,9 +2090,12 @@ TEST_P(StoreTest, OMapIterator) { map to_add; to_add.insert(pair("key-" + string(buf), bl)); attrs.insert(pair("key-" + string(buf), bl)); + ObjectStore::Transaction t; t.omap_setkeys(cid, hoid, to_add); store->apply_transaction(t); } + + iter = store->get_omap_iterator(cid, hoid); //lower bound string bound_key = "key-5"; iter->lower_bound(bound_key); @@ -2104,6 +2112,8 @@ TEST_P(StoreTest, OMapIterator) { } ASSERT_EQ(correct, true); + // FileStore may deadlock an active iterator vs apply_transaction + iter = ObjectMap::ObjectMapIterator(); { ObjectStore::Transaction t; t.remove(cid, hoid); From caf28fe9a5fd4f9097cfda681bf5c0b1708cf9e0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 27 Aug 2015 11:45:58 -0400 Subject: [PATCH 428/654] rocksdb: update alt dist rule Signed-off-by: Sage Weil --- src/Makefile-rocksdb.am | 130 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 123 insertions(+), 7 deletions(-) diff --git a/src/Makefile-rocksdb.am b/src/Makefile-rocksdb.am index fb642912a00ad..9d45f48774fa9 100644 --- a/src/Makefile-rocksdb.am +++ b/src/Makefile-rocksdb.am @@ -11,35 +11,57 @@ else rocksdb/PATENTS \ rocksdb/README.md \ rocksdb/ROCKSDB_LITE.md \ - rocksdb/configure.ac \ - rocksdb/Makefile.am \ + rocksdb/AUTHORS \ + rocksdb/configure.ac \ + rocksdb/CONTRIBUTING.md \ rocksdb/db/builder.cc \ rocksdb/db/builder.h \ rocksdb/db/c.cc \ rocksdb/db/column_family.cc \ rocksdb/db/column_family.h \ + rocksdb/db/column_family_test.cc \ + rocksdb/db/compact_files_test.cc \ rocksdb/db/compaction.cc \ rocksdb/db/compaction.h \ rocksdb/db/compaction_job.cc \ rocksdb/db/compaction_job.h \ + 
rocksdb/db/compaction_job_test.cc \ rocksdb/db/compaction_picker.cc \ rocksdb/db/compaction_picker.h \ + rocksdb/db/compaction_picker_test.cc \ + rocksdb/db/comparator_db_test.cc \ + rocksdb/db/corruption_test.cc \ + rocksdb/db/c_test.c \ + rocksdb/db/cuckoo_table_db_test.cc \ + rocksdb/db/db_bench.cc \ rocksdb/db/db_filesnapshot.cc \ rocksdb/db/dbformat.cc \ rocksdb/db/dbformat.h \ + rocksdb/db/dbformat_test.cc \ rocksdb/db/db_impl.cc \ rocksdb/db/db_impl_debug.cc \ + rocksdb/db/db_impl_experimental.cc \ rocksdb/db/db_impl.h \ rocksdb/db/db_impl_readonly.cc \ rocksdb/db/db_impl_readonly.h \ rocksdb/db/db_iter.cc \ rocksdb/db/db_iter.h \ + rocksdb/db/db_iter_test.cc \ + rocksdb/db/db_test.cc \ + rocksdb/db/deletefile_test.cc \ + rocksdb/db/event_logger_helpers.cc \ + rocksdb/db/event_logger_helpers.h \ + rocksdb/db/experimental.cc \ + rocksdb/db/fault_injection_test.cc \ rocksdb/db/file_indexer.cc \ rocksdb/db/file_indexer.h \ + rocksdb/db/file_indexer_test.cc \ rocksdb/db/filename.cc \ rocksdb/db/filename.h \ + rocksdb/db/filename_test.cc \ rocksdb/db/flush_job.cc \ rocksdb/db/flush_job.h \ + rocksdb/db/flush_job_test.cc \ rocksdb/db/flush_scheduler.cc \ rocksdb/db/flush_scheduler.h \ rocksdb/db/forward_iterator.cc \ @@ -47,9 +69,11 @@ else rocksdb/db/internal_stats.cc \ rocksdb/db/internal_stats.h \ rocksdb/db/job_context.h \ + rocksdb/db/listener_test.cc \ rocksdb/db/log_format.h \ rocksdb/db/log_reader.cc \ rocksdb/db/log_reader.h \ + rocksdb/db/log_test.cc \ rocksdb/db/log_writer.cc \ rocksdb/db/log_writer.h \ rocksdb/db/managed_iterator.cc \ @@ -60,45 +84,73 @@ else rocksdb/db/memtable.h \ rocksdb/db/memtable_list.cc \ rocksdb/db/memtable_list.h \ + rocksdb/db/memtable_list_test.cc \ + rocksdb/db/memtablerep_bench.cc \ rocksdb/db/merge_context.h \ rocksdb/db/merge_helper.cc \ rocksdb/db/merge_helper.h \ rocksdb/db/merge_operator.cc \ + rocksdb/db/merge_test.cc \ + rocksdb/db/perf_context_test.cc \ + rocksdb/db/plain_table_db_test.cc \ + 
rocksdb/db/prefix_test.cc \ rocksdb/db/repair.cc \ rocksdb/db/skiplist.h \ + rocksdb/db/skiplist_test.cc \ rocksdb/db/slice.cc \ rocksdb/db/snapshot.h \ rocksdb/db/table_cache.cc \ rocksdb/db/table_cache.h \ rocksdb/db/table_properties_collector.cc \ rocksdb/db/table_properties_collector.h \ + rocksdb/db/table_properties_collector_test.cc \ rocksdb/db/transaction_log_impl.cc \ rocksdb/db/transaction_log_impl.h \ rocksdb/db/version_builder.cc \ rocksdb/db/version_builder.h \ + rocksdb/db/version_builder_test.cc \ rocksdb/db/version_edit.cc \ rocksdb/db/version_edit.h \ + rocksdb/db/version_edit_test.cc \ rocksdb/db/version_set.cc \ rocksdb/db/version_set.h \ + rocksdb/db/version_set_test.cc \ rocksdb/db/wal_manager.cc \ rocksdb/db/wal_manager.h \ + rocksdb/db/wal_manager_test.cc \ rocksdb/db/write_batch_base.cc \ rocksdb/db/write_batch.cc \ rocksdb/db/write_batch_internal.h \ + rocksdb/db/write_batch_test.cc \ rocksdb/db/writebuffer.h \ rocksdb/db/write_controller.cc \ rocksdb/db/write_controller.h \ + rocksdb/db/write_controller_test.cc \ rocksdb/db/write_thread.cc \ rocksdb/db/write_thread.h \ - rocksdb/hdfs/README \ + rocksdb/doc/doc.css \ + rocksdb/doc/index.html \ + rocksdb/doc/log_format.txt \ + rocksdb/doc/rockslogo.jpg \ + rocksdb/doc/rockslogo.png \ + rocksdb/examples/column_families_example.cc \ + rocksdb/examples/compact_files_example.cc \ + rocksdb/examples/c_simple_example.c \ + rocksdb/examples/.gitignore \ + rocksdb/examples/Makefile \ + rocksdb/examples/README.md \ + rocksdb/examples/simple_example.cc \ rocksdb/hdfs/env_hdfs.h \ + rocksdb/hdfs/README \ rocksdb/hdfs/setup.sh \ + rocksdb/HISTORY.md \ rocksdb/include/rocksdb/cache.h \ rocksdb/include/rocksdb/c.h \ rocksdb/include/rocksdb/compaction_filter.h \ rocksdb/include/rocksdb/comparator.h \ rocksdb/include/rocksdb/db.h \ rocksdb/include/rocksdb/env.h \ + rocksdb/include/rocksdb/experimental.h \ rocksdb/include/rocksdb/filter_policy.h \ rocksdb/include/rocksdb/flush_block_policy.h \ 
rocksdb/include/rocksdb/immutable_options.h \ @@ -128,6 +180,7 @@ else rocksdb/include/rocksdb/utilities/convenience.h \ rocksdb/include/rocksdb/utilities/db_ttl.h \ rocksdb/include/rocksdb/utilities/document_db.h \ + rocksdb/include/rocksdb/utilities/flashcache.h \ rocksdb/include/rocksdb/utilities/geo_db.h \ rocksdb/include/rocksdb/utilities/json_document.h \ rocksdb/include/rocksdb/utilities/leveldb_options.h \ @@ -140,24 +193,36 @@ else rocksdb/include/rocksdb/write_batch.h \ rocksdb/include/utilities/backupable_db.h \ rocksdb/include/utilities/db_ttl.h \ + rocksdb/include/utilities/document_db.h \ rocksdb/include/utilities/geo_db.h \ + rocksdb/include/utilities/json_document.h \ rocksdb/include/utilities/stackable_db.h \ rocksdb/include/utilities/utility_db.h \ + rocksdb/INSTALL.md \ + rocksdb/LICENSE \ rocksdb/m4/libtool.m4 \ rocksdb/m4/lt~obsolete.m4 \ rocksdb/m4/ltoptions.m4 \ rocksdb/m4/ltsugar.m4 \ rocksdb/m4/ltversion.m4 \ + rocksdb/Makefile.am \ + rocksdb/PATENTS \ rocksdb/port/likely.h \ + rocksdb/port/port_example.h \ rocksdb/port/port.h \ rocksdb/port/port_posix.cc \ rocksdb/port/port_posix.h \ + rocksdb/port/README \ rocksdb/port/stack_trace.cc \ rocksdb/port/stack_trace.h \ + rocksdb/port/win/stdint.h \ + rocksdb/README.md \ + rocksdb/ROCKSDB_LITE.md \ rocksdb/table/adaptive_table_factory.cc \ rocksdb/table/adaptive_table_factory.h \ rocksdb/table/block_based_filter_block.cc \ rocksdb/table/block_based_filter_block.h \ + rocksdb/table/block_based_filter_block_test.cc \ rocksdb/table/block_based_table_builder.cc \ rocksdb/table/block_based_table_builder.h \ rocksdb/table/block_based_table_factory.cc \ @@ -170,22 +235,27 @@ else rocksdb/table/block.h \ rocksdb/table/block_hash_index.cc \ rocksdb/table/block_hash_index.h \ + rocksdb/table/block_hash_index_test.cc \ rocksdb/table/block_prefix_index.cc \ rocksdb/table/block_prefix_index.h \ + rocksdb/table/block_test.cc \ rocksdb/table/bloom_block.cc \ rocksdb/table/bloom_block.h \ 
rocksdb/table/cuckoo_table_builder.cc \ rocksdb/table/cuckoo_table_builder.h \ + rocksdb/table/cuckoo_table_builder_test.cc \ rocksdb/table/cuckoo_table_factory.cc \ rocksdb/table/cuckoo_table_factory.h \ rocksdb/table/cuckoo_table_reader.cc \ rocksdb/table/cuckoo_table_reader.h \ + rocksdb/table/cuckoo_table_reader_test.cc \ rocksdb/table/filter_block.h \ rocksdb/table/flush_block_policy.cc \ rocksdb/table/format.cc \ rocksdb/table/format.h \ rocksdb/table/full_filter_block.cc \ rocksdb/table/full_filter_block.h \ + rocksdb/table/full_filter_block_test.cc \ rocksdb/table/get_context.cc \ rocksdb/table/get_context.h \ rocksdb/table/iterator.cc \ @@ -193,8 +263,10 @@ else rocksdb/table/iter_heap.h \ rocksdb/table/merger.cc \ rocksdb/table/merger.h \ + rocksdb/table/merger_test.cc \ rocksdb/table/meta_blocks.cc \ rocksdb/table/meta_blocks.h \ + rocksdb/table/mock_table.cc \ rocksdb/table/mock_table.h \ rocksdb/table/plain_table_builder.cc \ rocksdb/table/plain_table_builder.h \ @@ -209,41 +281,57 @@ else rocksdb/table/table_builder.h \ rocksdb/table/table_properties.cc \ rocksdb/table/table_properties_internal.h \ + rocksdb/table/table_reader_bench.cc \ rocksdb/table/table_reader.h \ + rocksdb/table/table_test.cc \ rocksdb/table/two_level_iterator.cc \ rocksdb/table/two_level_iterator.h \ - rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest.h \ - rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \ + rocksdb/third-party/fbson/COMMIT.md \ rocksdb/third-party/fbson/FbsonDocument.h \ rocksdb/third-party/fbson/FbsonJsonParser.h \ rocksdb/third-party/fbson/FbsonStream.h \ rocksdb/third-party/fbson/FbsonUtil.h \ rocksdb/third-party/fbson/FbsonWriter.h \ + rocksdb/third-party/flashcache/flashcache_ioctl.h \ + rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \ + rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest.h \ + rocksdb/USERS.md \ rocksdb/util/allocator.h \ rocksdb/util/arena.cc \ rocksdb/util/arena.h \ + rocksdb/util/arena_test.cc \ 
rocksdb/util/auto_roll_logger.cc \ rocksdb/util/auto_roll_logger.h \ + rocksdb/util/auto_roll_logger_test.cc \ rocksdb/util/autovector.h \ - rocksdb/util/benchharness.h \ + rocksdb/util/autovector_test.cc \ rocksdb/util/bloom.cc \ + rocksdb/util/bloom_test.cc \ rocksdb/util/build_version.h \ + rocksdb/util/cache_bench.cc \ rocksdb/util/cache.cc \ + rocksdb/util/cache_test.cc \ rocksdb/util/coding.cc \ rocksdb/util/coding.h \ + rocksdb/util/coding_test.cc \ rocksdb/util/comparator.cc \ rocksdb/util/compression.h \ rocksdb/util/crc32c.cc \ rocksdb/util/crc32c.h \ + rocksdb/util/crc32c_test.cc \ rocksdb/util/db_info_dumper.cc \ rocksdb/util/db_info_dumper.h \ rocksdb/util/dynamic_bloom.cc \ rocksdb/util/dynamic_bloom.h \ + rocksdb/util/dynamic_bloom_test.cc \ rocksdb/util/env.cc \ rocksdb/util/env_hdfs.cc \ rocksdb/util/env_posix.cc \ + rocksdb/util/env_test.cc \ rocksdb/util/event_logger.cc \ rocksdb/util/event_logger.h \ + rocksdb/util/event_logger_test.cc \ + rocksdb/util/filelock_test.cc \ rocksdb/util/file_util.cc \ rocksdb/util/file_util.h \ rocksdb/util/filter_policy.cc \ @@ -257,37 +345,52 @@ else rocksdb/util/hash_skiplist_rep.h \ rocksdb/util/histogram.cc \ rocksdb/util/histogram.h \ + rocksdb/util/histogram_test.cc \ rocksdb/util/instrumented_mutex.cc \ rocksdb/util/instrumented_mutex.h \ rocksdb/util/iostats_context.cc \ rocksdb/util/iostats_context_imp.h \ rocksdb/utilities/backupable/backupable_db.cc \ + rocksdb/utilities/backupable/backupable_db_test.cc \ rocksdb/utilities/checkpoint/checkpoint.cc \ rocksdb/utilities/compacted_db/compacted_db_impl.cc \ rocksdb/utilities/compacted_db/compacted_db_impl.h \ rocksdb/utilities/convenience/convenience.cc \ rocksdb/utilities/document/document_db.cc \ + rocksdb/utilities/document/document_db_test.cc \ rocksdb/utilities/document/json_document_builder.cc \ rocksdb/utilities/document/json_document.cc \ + rocksdb/utilities/document/json_document_test.cc \ + rocksdb/utilities/flashcache/flashcache.cc \ + 
rocksdb/utilities/flashcache/flashcache.h \ rocksdb/utilities/geodb/geodb_impl.cc \ rocksdb/utilities/geodb/geodb_impl.h \ + rocksdb/utilities/geodb/geodb_test.cc \ rocksdb/utilities/leveldb_options/leveldb_options.cc \ + rocksdb/utilities/merge_operators.h \ rocksdb/utilities/merge_operators/put.cc \ rocksdb/utilities/merge_operators/string_append/stringappend2.cc \ rocksdb/utilities/merge_operators/string_append/stringappend2.h \ rocksdb/utilities/merge_operators/string_append/stringappend.cc \ rocksdb/utilities/merge_operators/string_append/stringappend.h \ + rocksdb/utilities/merge_operators/string_append/stringappend_test.cc \ rocksdb/utilities/merge_operators/uint64add.cc \ - rocksdb/utilities/merge_operators.h \ + rocksdb/utilities/redis/README \ rocksdb/utilities/redis/redis_list_exception.h \ rocksdb/utilities/redis/redis_list_iterator.h \ rocksdb/utilities/redis/redis_lists.cc \ rocksdb/utilities/redis/redis_lists.h \ + rocksdb/utilities/redis/redis_lists_test.cc \ rocksdb/utilities/spatialdb/spatial_db.cc \ + rocksdb/utilities/spatialdb/spatial_db_test.cc \ rocksdb/utilities/spatialdb/utils.h \ rocksdb/utilities/ttl/db_ttl_impl.cc \ rocksdb/utilities/ttl/db_ttl_impl.h \ + rocksdb/utilities/ttl/ttl_test.cc \ rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc \ + rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc \ + rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h \ + rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc \ rocksdb/util/ldb_cmd.cc \ rocksdb/util/ldb_cmd_execute_result.h \ rocksdb/util/ldb_cmd.h \ @@ -296,8 +399,13 @@ else rocksdb/util/log_buffer.h \ rocksdb/util/logging.cc \ rocksdb/util/logging.h \ + rocksdb/util/log_write_bench.cc \ + rocksdb/util/manual_compaction_test.cc \ rocksdb/util/memenv.cc \ + rocksdb/util/memenv_test.cc \ + rocksdb/util/mock_env.cc \ rocksdb/util/mock_env.h \ + rocksdb/util/mock_env_test.cc \ rocksdb/util/murmurhash.cc \ 
rocksdb/util/murmurhash.h \ rocksdb/util/mutable_cf_options.cc \ @@ -307,15 +415,19 @@ else rocksdb/util/options.cc \ rocksdb/util/options_helper.cc \ rocksdb/util/options_helper.h \ + rocksdb/util/options_test.cc \ rocksdb/util/perf_context.cc \ rocksdb/util/perf_context_imp.h \ rocksdb/util/posix_logger.h \ rocksdb/util/random.h \ rocksdb/util/rate_limiter.cc \ rocksdb/util/rate_limiter.h \ + rocksdb/util/rate_limiter_test.cc \ rocksdb/util/scoped_arena_iterator.h \ rocksdb/util/skiplistrep.cc \ rocksdb/util/slice.cc \ + rocksdb/util/slice_transform_test.cc \ + rocksdb/util/sst_dump_test.cc \ rocksdb/util/sst_dump_tool.cc \ rocksdb/util/sst_dump_tool_imp.h \ rocksdb/util/statistics.cc \ @@ -327,10 +439,14 @@ else rocksdb/util/string_util.h \ rocksdb/util/sync_point.cc \ rocksdb/util/sync_point.h \ + rocksdb/util/testharness.cc \ rocksdb/util/testharness.h \ + rocksdb/util/testutil.cc \ rocksdb/util/testutil.h \ + rocksdb/util/thread_list_test.cc \ rocksdb/util/thread_local.cc \ rocksdb/util/thread_local.h \ + rocksdb/util/thread_local_test.cc \ rocksdb/util/thread_operation.h \ rocksdb/util/thread_status_impl.cc \ rocksdb/util/thread_status_updater.cc \ From eab4d53b7447b5f5e6adaf64b21b6c681d1425bb Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 27 Aug 2015 14:21:23 -0400 Subject: [PATCH 429/654] do_autogen.sh: build static rocksdb by default Signed-off-by: Sage Weil --- do_autogen.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/do_autogen.sh b/do_autogen.sh index bfbe528bdcef9..febdfcb1705bb 100755 --- a/do_autogen.sh +++ b/do_autogen.sh @@ -18,7 +18,7 @@ do_autogen.sh: make a ceph build by running autogen, etc. 
-c use cryptopp -C add parameters to configure -j with java --r with rocksdb +-R without rocksdb -J --with-jemalloc -L --without-lttng @@ -33,6 +33,7 @@ die() { debug_level=0 verbose=0 profile=0 +rocksdb=1 CONFIGURE_FLAGS="--disable-static --with-lttng" while getopts "d:e:hHrTPJLjpcvO:C:" flag do @@ -55,7 +56,7 @@ do j) CONFIGURE_FLAGS="$CONFIGURE_FLAGS --enable-cephfs-java";; - r) CONFIGURE_FLAGS="$CONFIGURE_FLAGS --with-librocksdb-static";; + R) rocksdb=0;; v) verbose=1;; @@ -72,6 +73,10 @@ do esac done +if [ $rocksdb -eq 1 ]; then + CONFIGURE_FLAGS="$CONFIGURE_FLAGS --with-librocksdb-static" +fi + if [ $profile -eq 1 ]; then if [ $debug_level -ne 0 ]; then echo "Can't specify both -d and -P. Profiling builds are \ From 05d79b66cf67f65ced819e6d03dfe35b04c86ab7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 1 Sep 2015 13:13:47 -0400 Subject: [PATCH 430/654] os/newstore: update todo Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 8ca130424cce2..58d3aff9d85c2 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -37,19 +37,8 @@ * write path should ideally tolerate any fragment pattern, but only generate a fixed layout (since the tunable may be changed over time). * rocksdb: use db_paths (db/ and db.bulk/ ?) * rocksdb: auto-detect use_fsync option when not xfs or btrfs - * hobject sorting - - backfill - - scrub - - pgnls - - tiering agent position - - ObjectStore::collection_list_range - - ObjectStore::collection_list_partial - - DBObjectMap::clone lock ordering - - HashIndex::get_path_contents_by_hash - - HashIndex::list_by_hash * avoid mtime updates when doing open-by-handle * fid xattr backpointer - * kill collection_list_range * inline first fsync_item in TransContext to void allocation? 
* refcounted fragments (for efficient clone) From b7c5bd12b40ed75780d2527b3e80116fc15bac1e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 1 Sep 2015 13:14:03 -0400 Subject: [PATCH 431/654] ceph_test_keyvaluedb: add simple commit latency benchmark Signed-off-by: Sage Weil --- src/test/objectstore/test_kv.cc | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/test/objectstore/test_kv.cc b/src/test/objectstore/test_kv.cc index 61007e3bf33d6..df3805b8d1d68 100644 --- a/src/test/objectstore/test_kv.cc +++ b/src/test/objectstore/test_kv.cc @@ -113,6 +113,39 @@ TEST_P(KVTest, PutReopen) { fini(); } +TEST_P(KVTest, BenchCommit) { + int n = 1024; + ASSERT_EQ(0, db->create_and_open(cout)); + utime_t start = ceph_clock_now(NULL); + { + cout << "priming" << std::endl; + // prime + bufferlist big; + bufferptr bp(1048576); + bp.zero(); + big.append(bp); + for (int i=0; i<30; ++i) { + KeyValueDB::Transaction t = db->get_transaction(); + t->set("prefix", "big" + stringify(i), big); + db->submit_transaction_sync(t); + } + } + cout << "now doing small writes" << std::endl; + bufferlist data; + bufferptr bp(1024); + bp.zero(); + data.append(bp); + for (int i=0; i<n; ++i) { + KeyValueDB::Transaction t = db->get_transaction(); + t->set("prefix", "key" + stringify(i), data); + db->submit_transaction_sync(t); + } + utime_t end = ceph_clock_now(NULL); + utime_t dur = end - start; + cout << n << " commits in " << dur << ", avg latency " << (dur / (double)n) + << std::endl; +} + INSTANTIATE_TEST_CASE_P( KeyValueDB, From c13bb7aa48ec936c200727cc88f2892036326b77 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 1 Sep 2015 13:59:40 -0400 Subject: [PATCH 432/654] os/newstore: fix warning os/newstore/NewStore.cc: In member function 'int NewStore::_zero(NewStore::TransContext*, NewStore::CollectionRef&, const ghobject_t&, uint64_t, size_t)': os/newstore/NewStore.cc:3693:32: warning: ignoring return value of 'int ftruncate(int, __off64_t)', declared with attribute warn_unused_result
[-Wunused-result] ::ftruncate(fd, f.length); ^ Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 58d3aff9d85c2..953dcd22d5864 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -3690,7 +3690,8 @@ int NewStore::_zero(TransContext *txc, goto out; } f.length = (offset + length) - f.offset; - ::ftruncate(fd, f.length); + r = ::ftruncate(fd, f.length); + assert(r == 0); // this shouldn't fail dout(20) << __func__ << " tail " << f.fid << " truncating up to " << f.length << dendl; o->onode.size = offset + length; From 7c00bf05db08c4fbe77be5286b330c2da7dab779 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Tue, 1 Sep 2015 11:35:42 -0400 Subject: [PATCH 433/654] cmake: update FUSE_INCLUDE_DIRS to match autoconf client/fuse_ll.cc is now including <fuse_common.h> and <fuse_lowlevel.h> instead of <fuse/fuse_common.h> and <fuse/fuse_lowlevel.h>, so we need to add the fuse directory to the FUSE_INCLUDE_DIRS variable using find_path() with just fuse.h was finding a /usr/include/fuse.h instead of the one in /usr/include/fuse/.
looking for fuse_common.h and fuse_lowlevel.h first causes it to generate the correct FUSE_INCLUDE_DIRS=/usr/include/fuse Fixes: #12909 Signed-off-by: Casey Bodley --- cmake/modules/Findfuse.cmake | 6 +++--- src/CMakeLists.txt | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cmake/modules/Findfuse.cmake b/cmake/modules/Findfuse.cmake index 7c1a8789b28e8..dd8a6c17faef2 100644 --- a/cmake/modules/Findfuse.cmake +++ b/cmake/modules/Findfuse.cmake @@ -73,7 +73,7 @@ endif(PKG_CONFIG_FOUND) find_path( FUSE_INCLUDE_DIRS - NAMES fuse.h + NAMES fuse_common.h fuse_lowlevel.h fuse.h PATHS "${PC_FUSE_INCLUDE_DIRS}" DOC "Include directories for FUSE" ) @@ -94,8 +94,8 @@ if(NOT FUSE_LIBRARIES) endif(NOT FUSE_LIBRARIES) if(FUSE_FOUND) - if(EXISTS "${FUSE_INCLUDE_DIRS}/fuse/fuse_common.h") - file(READ "${FUSE_INCLUDE_DIRS}/fuse/fuse_common.h" _contents) + if(EXISTS "${FUSE_INCLUDE_DIRS}/fuse_common.h") + file(READ "${FUSE_INCLUDE_DIRS}/fuse_common.h" _contents) string(REGEX REPLACE ".*# *define *FUSE_MAJOR_VERSION *([0-9]+).*" "\\1" FUSE_MAJOR_VERSION "${_contents}") string(REGEX REPLACE ".*# *define *FUSE_MINOR_VERSION *([0-9]+).*" "\\1" FUSE_MINOR_VERSION "${_contents}") set(FUSE_VERSION "${FUSE_MAJOR_VERSION}.${FUSE_MINOR_VERSION}") diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5b8298e130830..eed321004c53b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -806,6 +806,7 @@ if(WITH_LIBCEPHFS) client/fuse_ll.cc) add_executable(ceph-fuse ${ceph_fuse_srcs}) target_link_libraries(ceph-fuse fuse client global) + set_target_properties(ceph-fuse PROPERTIES COMPILE_FLAGS "-I${FUSE_INCLUDE_DIRS}") install(TARGETS ceph-fuse DESTINATION bin) endif(WITH_FUSE) endif(WITH_LIBCEPHFS) From d015d23d9103b4edad9213a0bf6780209289dc48 Mon Sep 17 00:00:00 2001 From: Jason Dillaman Date: Tue, 1 Sep 2015 14:15:45 -0400 Subject: [PATCH 434/654] osd: sparse reads returning invalid extent map The extent map retrieved from the fiemap might have been truncated while 
reading the extents. Therefore, the map needs to be re-encoded in the response instead of directly copied. Fixes: #12904 Signed-off-by: Jason Dillaman --- src/osd/ReplicatedPG.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index c29a2afa706d5..d85f2603dc46b 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -3960,7 +3960,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) op.extent.length = total_read; - osd_op.outdata.claim_append(bl); + ::encode(m, osd_op.outdata); // re-encode since it might be modified ::encode_destructively(data_bl, osd_op.outdata); ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10); From cbe85ec12635bd9a7ec39ec1ab08b018ee6b678f Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Wed, 2 Sep 2015 01:41:55 +0800 Subject: [PATCH 435/654] doc: fix the code-block in ruby.rst * and add the link to library homepage in the section titles Signed-off-by: Kefu Chai --- doc/radosgw/s3/ruby.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/radosgw/s3/ruby.rst b/doc/radosgw/s3/ruby.rst index 0a62e3f425e02..435b3c63083da 100644 --- a/doc/radosgw/s3/ruby.rst +++ b/doc/radosgw/s3/ruby.rst @@ -1,7 +1,7 @@ .. _ruby: -Ruby S3 Examples (aws-sdk gem ~>2) -================================== +Ruby `AWS::SDK`_ Examples (aws-sdk gem ~>2) +=========================================== Settings --------------------- @@ -141,6 +141,7 @@ This downloads the object ``poetry.pdf`` and saves it in ``/home/larry/documents/`` .. 
code-block:: ruby + s3_client.get_object(bucket: 'my-new-bucket', key: 'poetry.pdf', response_target: '/home/larry/documents/poetry.pdf') @@ -183,12 +184,12 @@ The output of this will look something like:: http://objects.dreamhost.com/my-bucket-name/hello.txt http://objects.dreamhost.com/my-bucket-name/secret_plans.txt?Signature=XXXXXXXXXXXXXXXXXXXXXXXXXXX&Expires=1316027075&AWSAccessKeyId=XXXXXXXXXXXXXXXXXXX -.. _`Aws::S3`: http://docs.aws.amazon.com/sdkforruby/api/Aws/S3/Client.html +.. _`AWS::SDK`: http://docs.aws.amazon.com/sdkforruby/api/Aws/S3/Client.html -Ruby S3 Examples (aws-s3 gem) -============================= +Ruby `AWS::S3`_ Examples (aws-s3 gem) +===================================== Creating a Connection --------------------- From 2ca2c1b97869fecd0ef511eecb94a91204515287 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Tue, 1 Sep 2015 12:49:06 -0700 Subject: [PATCH 436/654] osd, test: Minor clean-up from fast-read and error handling ec changes Signed-off-by: David Zafman --- src/osd/ECBackend.cc | 9 +++++---- src/osd/ReplicatedPG.cc | 2 ++ src/test/erasure-code/test-erasure-eio.sh | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index aef52a03476a7..305c17f266a44 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -1001,7 +1001,7 @@ void ECBackend::handle_sub_read_reply( map::iterator iter = tid_to_read_map.find(op.tid); if (iter == tid_to_read_map.end()) { //canceled - dout(10) << __func__ << ": abort " << op << dendl; + dout(20) << __func__ << ": dropped " << op << dendl; return; } ReadOp &rop = iter->second; @@ -1082,7 +1082,6 @@ void ECBackend::handle_sub_read_reply( set want_to_read, dummy_minimum; get_want_to_read_shards(&want_to_read); int err; - // XXX: Could just do if (have.size < ec_impl->get_data_chunk_count()) if ((err = ec_impl->minimum_to_decode(want_to_read, have, &dummy_minimum)) < 0) { dout(20) << __func__ << " minimum_to_decode failed" << dendl; if 
(rop.in_progress.empty()) { @@ -1610,6 +1609,7 @@ void ECBackend::start_read_op( dout(10) << __func__ << ": started " << op << dendl; } +// This is based on start_read_op(), maybe this should be refactored void ECBackend::start_remaining_read_op( ReadOp &op, map &to_read) @@ -1621,7 +1621,8 @@ void ECBackend::start_remaining_read_op( dout(10) << __func__ << ": starting additional " << op << dendl; map messages; - for (map::iterator i = op.to_read.begin(); + for (map::iterator i = op.to_read.begin(); i != op.to_read.end(); ++i) { bool need_attrs = i->second.want_attrs; @@ -1956,7 +1957,7 @@ int ECBackend::objects_remaining_read_async( ReadOp &rop) { set already_read; - set ots = rop.obj_to_source[hoid]; + const set& ots = rop.obj_to_source[hoid]; for (set::iterator i = ots.begin(); i != ots.end(); ++i) already_read.insert(i->shard); dout(10) << __func__ << " have/error shards=" << already_read << dendl; diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 9485f0eae1571..4c704ee9f1395 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -3667,6 +3667,8 @@ struct FillInVerifyExtent : public Context { void finish(int len) { *rval = len; *r = len; + if (len < 0) + return; // whole object? can we verify the checksum? if (maybe_crc && *r == size) { uint32_t crc = outdatap->crc32c(-1); diff --git a/src/test/erasure-code/test-erasure-eio.sh b/src/test/erasure-code/test-erasure-eio.sh index fe465c93bda3d..129d09b7ec81f 100755 --- a/src/test/erasure-code/test-erasure-eio.sh +++ b/src/test/erasure-code/test-erasure-eio.sh @@ -102,7 +102,7 @@ function rados_get() { if [ $expect = "1" ]; then ! ./rados --pool $poolname get $objname $dir/COPY - return $? 
+ return fi # # get an object, compare with $dir/ORIGINAL From 3a41ef4e32ee641238465d8fea85810715ba4a01 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 1 Sep 2015 21:15:07 -0400 Subject: [PATCH 437/654] ms/async: log message tx/rx at level 1 Signed-off-by: Sage Weil --- src/msg/async/AsyncConnection.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc index 97bb17ab20deb..b0433e81f8850 100644 --- a/src/msg/async/AsyncConnection.cc +++ b/src/msg/async/AsyncConnection.cc @@ -841,6 +841,8 @@ void AsyncConnection::process() in_seq.set(message->get_seq()); ldout(async_msgr->cct, 10) << __func__ << " got message " << message->get_seq() << " " << message << " " << *message << dendl; + ldout(async_msgr->cct, 1) << " == rx == " << message << " " << *message + << dendl; // if send_message always successfully send, it may have no // opportunity to send seq ack. 10 is a experience value. @@ -1921,6 +1923,8 @@ void AsyncConnection::accept(int incoming) int AsyncConnection::send_message(Message *m) { ldout(async_msgr->cct, 10) << __func__ << " m=" << m << dendl; + ldout(async_msgr->cct, 1) << " == tx == " << m << " " << *m + << dendl; // optimistic think it's ok to encode(actually may broken now) if (!m->get_priority()) From 023c517509937e8037a6f25b55b66b9ff8befea9 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 1 Sep 2015 21:15:20 -0400 Subject: [PATCH 438/654] vstart.sh: enable all experimental features for vstart Signed-off-by: Sage Weil --- src/vstart.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vstart.sh b/src/vstart.sh index 70ee0f14f1bfe..7b42a4c40da9d 100755 --- a/src/vstart.sh +++ b/src/vstart.sh @@ -398,7 +398,7 @@ if [ "$start_mon" -eq 1 ]; then rgw dns name = localhost filestore fd cache size = 32 run dir = $CEPH_OUT_DIR - enable experimental unrecoverable data corrupting features = newstore rocksdb + enable experimental unrecoverable data corrupting features = 
* EOF if [ "$cephx" -eq 1 ] ; then cat <> $conf_fn From b199c492b072e5a43f6c134c760c3f178bd176d0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 1 Sep 2015 21:43:04 -0400 Subject: [PATCH 439/654] ceph-osd-prestart.sh: fix osd data dir ownership check Signed-off-by: Sage Weil --- src/ceph-osd-prestart.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ceph-osd-prestart.sh b/src/ceph-osd-prestart.sh index a76747b232d5b..77974e8a58278 100644 --- a/src/ceph-osd-prestart.sh +++ b/src/ceph-osd-prestart.sh @@ -54,7 +54,7 @@ fi # ensure ownership is correct owner=`stat -c %U $data/.` -if [ $owner -ne 'ceph' -a $owner -ne 'root' ]; then +if [ $owner != 'ceph' -a $owner != 'root' ]; then echo "ceph-osd data dir $data is not owned by 'ceph' or 'root'" echo "you must 'ceph-disk chown ...' or similar to fix ownership" exit 1 From b02cc060ebf40939796d2adcbbc661bae304f1b7 Mon Sep 17 00:00:00 2001 From: Haomai Wang Date: Wed, 2 Sep 2015 11:45:52 +0800 Subject: [PATCH 440/654] AsyncConnection: Don't use unsafe feature as message encode feature Fix #12908 Signed-off-by: Haomai Wang --- src/msg/async/AsyncConnection.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc index b0433e81f8850..3c607037c1e2b 100644 --- a/src/msg/async/AsyncConnection.cc +++ b/src/msg/async/AsyncConnection.cc @@ -1965,7 +1965,7 @@ int AsyncConnection::send_message(Message *m) } if (!is_queued() && can_write == CANWRITE) { if (!can_fast_prepare) - prepare_send_message(f, m, bl); + prepare_send_message(get_features(), m, bl); if (write_message(m, bl) < 0) { ldout(async_msgr->cct, 1) << __func__ << " send msg failed" << dendl; // we want to handle fault within internal thread From 89aacaf699e7a126aa768b096331329e709a5108 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Wed, 2 Sep 2015 14:00:40 +0800 Subject: [PATCH 441/654] doc: add the doc for min_write_recency_for_promote Signed-off-by: Zhiqiang 
Wang --- doc/dev/cache-pool.rst | 13 +++++++++---- doc/rados/operations/cache-tiering.rst | 15 ++++++++++----- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/doc/dev/cache-pool.rst b/doc/dev/cache-pool.rst index 2d22f1f3b7df0..7dc71c828e9ff 100644 --- a/doc/dev/cache-pool.rst +++ b/doc/dev/cache-pool.rst @@ -56,6 +56,7 @@ Set the target size and enable the tiering agent for foo-hot:: ceph osd pool set foo-hot hit_set_period 3600 # 1 hour ceph osd pool set foo-hot target_max_bytes 1000000000000 # 1 TB ceph osd pool set foo-hot min_read_recency_for_promote 1 + ceph osd pool set foo-hot min_write_recency_for_promote 1 Drain the cache in preparation for turning it off:: @@ -114,14 +115,18 @@ between 0 and ``hit_set_count``. If it's set to 0, the object is always promoted If it's set to 1, the current HitSet is checked. And if this object is in the current HitSet, it's promoted. Otherwise not. For the other values, the exact number of archive HitSets are checked. The object is promoted if the object is -found in any of the most recent ``min_read_recency_for_promote`` HitSets. :: +found in any of the most recent ``min_read_recency_for_promote`` HitSets. + +A similar parameter can be set for the write operation, which is +``min_write_recency_for_promote``. :: ceph osd pool set {cachepool} min_read_recency_for_promote 1 + ceph osd pool set {cachepool} min_write_recency_for_promote 1 Note that the longer the ``hit_set_period`` and the higher the -``min_read_recency_for_promote`` the more RAM will be consumed by the ceph-osd -process. In particular, when the agent is active to flush or evict cache objects, -all hit_set_count HitSets are loaded into RAM. +``min_read_recency_for_promote``/``min_write_recency_for_promote`` the more RAM +will be consumed by the ceph-osd process. In particular, when the agent is active +to flush or evict cache objects, all hit_set_count HitSets are loaded into RAM. 
Cache mode ~~~~~~~~~~ diff --git a/doc/rados/operations/cache-tiering.rst b/doc/rados/operations/cache-tiering.rst index f53526600d308..8275fe4381d39 100644 --- a/doc/rados/operations/cache-tiering.rst +++ b/doc/rados/operations/cache-tiering.rst @@ -193,14 +193,19 @@ between 0 and ``hit_set_count``. If it's set to 0, the object is always promoted If it's set to 1, the current HitSet is checked. And if this object is in the current HitSet, it's promoted. Otherwise not. For the other values, the exact number of archive HitSets are checked. The object is promoted if the object is -found in any of the most recent ``min_read_recency_for_promote`` HitSets. :: +found in any of the most recent ``min_read_recency_for_promote`` HitSets. + +A similar parameter can be set for the write operation, which is +``min_write_recency_for_promote``. :: ceph osd pool set {cachepool} min_read_recency_for_promote 1 + ceph osd pool set {cachepool} min_write_recency_for_promote 1 -.. note:: The longer the period and the higher the min_read_recency_for_promote, - the more RAM the ``ceph-osd`` daemon consumes. In particular, when the agent - is active to flush or evict cache objects, all ``hit_set_count`` HitSets are - loaded into RAM. +.. note:: The longer the period and the higher the + ``min_read_recency_for_promote``/``min_write_recency_for_promote``, the more + RAM the ``ceph-osd`` daemon consumes. In particular, when the agent is active + to flush or evict cache objects, all ``hit_set_count`` HitSets are loaded + into RAM. Cache Sizing From 67f5f5217795474b610c20bdf35922e076b8169b Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Wed, 2 Sep 2015 17:20:36 +0800 Subject: [PATCH 442/654] memstore: fix the build on i386 on i386, uint64 is `unsigned long long`, while size_t is `unsigned int`. std::min(uint64, size_t) can not be resolved. 
Signed-off-by: Kefu Chai --- src/os/MemStore.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/os/MemStore.cc b/src/os/MemStore.cc index a098f315994f7..9b0d994dd8c6e 100644 --- a/src/os/MemStore.cc +++ b/src/os/MemStore.cc @@ -1511,7 +1511,7 @@ int MemStore::PageSetObject::clone(Object *src, uint64_t srcoff, const int64_t delta = dstoff - srcoff; auto &src_data = static_cast(src)->data; - const auto src_page_size = src_data.get_page_size(); + const uint64_t src_page_size = src_data.get_page_size(); auto &dst_data = data; const auto dst_page_size = dst_data.get_page_size(); From 9cc1055eb6e18d55084d5af2b2f93eb8da5ab7da Mon Sep 17 00:00:00 2001 From: Haomai Wang Date: Wed, 2 Sep 2015 18:59:08 +0800 Subject: [PATCH 443/654] AsyncConnection: Close connection when unregistered connection met WAIT Fix #12912 Signed-off-by: Haomai Wang --- src/msg/async/AsyncConnection.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc index b0433e81f8850..cf58a6ea8aa19 100644 --- a/src/msg/async/AsyncConnection.cc +++ b/src/msg/async/AsyncConnection.cc @@ -1506,6 +1506,10 @@ int AsyncConnection::handle_connect_reply(ceph_msg_connect &connect, ceph_msg_co } if (reply.tag == CEPH_MSGR_TAG_WAIT) { ldout(async_msgr->cct, 3) << __func__ << " connect got WAIT (connection race)" << dendl; + if (!once_ready) { + ldout(async_msgr->cct, 1) << __func__ << " got WAIT while connection isn't registered, just closed." 
<< dendl; + goto fail; + } state = STATE_WAIT; } From c938d1f95e79ab5541c453b070fd97e44623781c Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Wed, 2 Sep 2015 21:01:57 +0800 Subject: [PATCH 444/654] rocksdb: fix 32-bit build Signed-off-by: Kefu Chai --- src/rocksdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rocksdb b/src/rocksdb index f371049d895a0..15ed8c97fdbdf 160000 --- a/src/rocksdb +++ b/src/rocksdb @@ -1 +1 @@ -Subproject commit f371049d895a083810746a00049688043d8cea7e +Subproject commit 15ed8c97fdbdf022ec9c8da02b06f4cd9189bb7e From 6d80ff135458f168ad86f21d48d6b295c69826cf Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Wed, 2 Sep 2015 16:00:10 +0200 Subject: [PATCH 445/654] tools: fix do_autogen.sh -R The R letter was missing from the getopts flags. Also sort the flags alphabetically to make it easier to spot that kind of lossage. Signed-off-by: Loic Dachary --- do_autogen.sh | 55 +++++++++++++++++++-------------------------------- 1 file changed, 20 insertions(+), 35 deletions(-) diff --git a/do_autogen.sh b/do_autogen.sh index febdfcb1705bb..9a4505b98372b 100755 --- a/do_autogen.sh +++ b/do_autogen.sh @@ -4,23 +4,24 @@ usage() { cat < add parameters to configure +-c use cryptopp -d debug build level 0: no debug level 1: -g level 3: -Wextra level 4: even more... 
--T --without-tcmalloc -e dump encoded objects to --P profiling build --p google profiler --O optimize --c use cryptopp --C add parameters to configure +-h this help message -j with java --R without rocksdb -J --with-jemalloc -L --without-lttng +-O optimize +-p google profiler +-P profiling build +-R without rocksdb +-T --without-tcmalloc +-v verbose output EOF } @@ -35,41 +36,25 @@ verbose=0 profile=0 rocksdb=1 CONFIGURE_FLAGS="--disable-static --with-lttng" -while getopts "d:e:hHrTPJLjpcvO:C:" flag +while getopts "C:cd:e:hjJLO:pPRTv" flag do case $flag in + C) CONFIGURE_FLAGS="$CONFIGURE_FLAGS $OPTARG";; + c) CONFIGURE_FLAGS="$CONFIGURE_FLAGS --with-cryptopp --without-nss";; d) debug_level=$OPTARG;; - + e) encode_dump=$OPTARG;; + h) usage ; exit 0;; + j) CONFIGURE_FLAGS="$CONFIGURE_FLAGS --enable-cephfs-java";; + J) CONFIGURE_FLAGS="$CONFIGURE_FLAGS --with-jemalloc";; + L) CONFIGURE_FLAGS="$CONFIGURE_FLAGS --without-lttng";; O) CFLAGS="${CFLAGS} -O$OPTARG";; - - c) CONFIGURE_FLAGS="$CONFIGURE_FLAGS --with-cryptopp --without-nss";; - - C) CONFIGURE_FLAGS="$CONFIGURE_FLAGS $OPTARG";; - - P) profile=1;; p) with_profiler="--with-profiler" ;; - - h) usage - exit 0;; - - T) CONFIGURE_FLAGS="$CONFIGURE_FLAGS --without-tcmalloc";; - - j) CONFIGURE_FLAGS="$CONFIGURE_FLAGS --enable-cephfs-java";; - + P) profile=1;; R) rocksdb=0;; - + T) CONFIGURE_FLAGS="$CONFIGURE_FLAGS --without-tcmalloc";; v) verbose=1;; - e) encode_dump=$OPTARG;; - - J) CONFIGURE_FLAGS="$CONFIGURE_FLAGS --with-jemalloc";; - - L) CONFIGURE_FLAGS="$CONFIGURE_FLAGS --without-lttng";; - - *) - echo - usage - exit 1;; + *) echo ; usage ; exit 1;; esac done From ddca3210184b57727a6def3ae434daf33aa611b9 Mon Sep 17 00:00:00 2001 From: Jason Dillaman Date: Tue, 1 Sep 2015 15:24:32 -0400 Subject: [PATCH 446/654] rbd: add verbose error reporting to merge-diff tool Signed-off-by: Jason Dillaman --- src/rbd.cc | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git 
a/src/rbd.cc b/src/rbd.cc index f8330ff45f0c4..062034ccf09c0 100755 --- a/src/rbd.cc +++ b/src/rbd.cc @@ -2077,12 +2077,16 @@ static int do_merge_diff(const char *first, const char *second, const char *path // and the (offset,length) in wztag must be ascending order. r = parse_diff_header(fd, &f_tag, &f_from, &f_to, &f_size); - if (r < 0) + if (r < 0) { + cerr << "rbd: failed to parse first diff header" << std::endl; goto done; + } r = parse_diff_header(sd, &s_tag, &s_from, &s_to, &s_size); - if (r < 0) + if (r < 0) { + cerr << "rbd: failed to parse second diff header" << std::endl; goto done; + } if (f_to != s_from) { r = -EINVAL; @@ -2113,8 +2117,10 @@ static int do_merge_diff(const char *first, const char *second, const char *path ::encode(s_size, bl); r = bl.write_fd(pd); - if (r < 0) + if (r < 0) { + cerr << "rbd: failed to write merged diff header" << std::endl; goto done; + } } if (f_size > s_size) @@ -2131,8 +2137,13 @@ static int do_merge_diff(const char *first, const char *second, const char *path uint64_t last_off = f_off; r = parse_diff_body(fd, &f_tag, &f_off, &f_len); - if (r < 0) + dout(2) << "first diff data chunk: tag=" << f_tag << ", " + << "off=" << f_off << ", " + << "len=" << f_len << dendl; + if (r < 0) { + cerr << "rbd: failed to read first diff data chunk header" << std::endl; goto done; + } if (f_tag == 'e') { f_end = true; @@ -2146,6 +2157,8 @@ static int do_merge_diff(const char *first, const char *second, const char *path if (last_off > f_off) { r = -ENOTSUP; + cerr << "rbd: out-of-order offset from first diff (" + << last_off << " > " << f_off << ")" << std::endl; goto done; } } @@ -2154,8 +2167,14 @@ static int do_merge_diff(const char *first, const char *second, const char *path uint64_t last_off = s_off; r = parse_diff_body(sd, &s_tag, &s_off, &s_len); - if (r < 0) + dout(2) << "second diff data chunk: tag=" << f_tag << ", " + << "off=" << f_off << ", " + << "len=" << f_len << dendl; + if (r < 0) { + cerr << "rbd: failed to read 
second diff data chunk header" + << std::endl; goto done; + } if (s_tag == 'e') { s_end = true; @@ -2168,6 +2187,8 @@ static int do_merge_diff(const char *first, const char *second, const char *path if (last_off > s_off) { r = -ENOTSUP; + cerr << "rbd: out-of-order offset from second diff (" + << last_off << " > " << s_off << ")" << std::endl; goto done; } } @@ -2195,12 +2216,12 @@ static int do_merge_diff(const char *first, const char *second, const char *path if (first_stdin) { bufferptr bp = buffer::create(delta); r = safe_read_exact(fd, bp.c_str(), delta); - if (r < 0) - goto done; } else { r = lseek(fd, delta, SEEK_CUR); - if(r < 0) - goto done; + } + if (r < 0) { + cerr << "rbd: failed to skip first diff data" << std::endl; + goto done; } } f_off += delta; From cabfe137817bebda13bd31b9e09267d0778e5968 Mon Sep 17 00:00:00 2001 From: Guang G Yang Date: Tue, 28 Jul 2015 16:25:53 +0000 Subject: [PATCH 447/654] osd: check the length of the map before accessing the first element Signed-off-by: Guang Yang yguang@yahoo-inc.com --- src/osd/ECUtil.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/osd/ECUtil.cc b/src/osd/ECUtil.cc index 9d2c2fb261e9d..46a16c379fb43 100644 --- a/src/osd/ECUtil.cc +++ b/src/osd/ECUtil.cc @@ -9,11 +9,11 @@ int ECUtil::decode( ErasureCodeInterfaceRef &ec_impl, map &to_decode, bufferlist *out) { + assert(to_decode.size()); uint64_t total_data_size = to_decode.begin()->second.length(); - - assert(to_decode.size()); assert(total_data_size % sinfo.get_chunk_size() == 0); + assert(out); assert(out->length() == 0); @@ -47,10 +47,9 @@ int ECUtil::decode( ErasureCodeInterfaceRef &ec_impl, map &to_decode, map &out) { + assert(to_decode.size()); uint64_t total_data_size = to_decode.begin()->second.length(); - - assert(to_decode.size()); assert(total_data_size % sinfo.get_chunk_size() == 0); for (map::iterator i = to_decode.begin(); From 65dcc2da76750d0b6dd2cf0031c44f32749f33e5 Mon Sep 17 00:00:00 2001 From: David Zafman 
Date: Mon, 20 Jul 2015 17:48:15 -0700 Subject: [PATCH 448/654] osd: When generating past intervals due to an import end at pg epoch Add assert() to make sure same_interval_since isn't too far forward Fixes: #12387 Signed-off-by: David Zafman --- src/osd/PG.cc | 2 +- src/osd/osd_types.cc | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index b53370f8c38d9..edf91721be80b 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -647,7 +647,7 @@ bool PG::_calc_past_interval_range(epoch_t *start, epoch_t *end, epoch_t oldest_ *end = info.history.same_interval_since; } else { // PG must be imported, so let's calculate the whole range. - *end = osd->get_superblock().newest_map; + *end = osdmap_ref->get_epoch(); } // Do we already have the intervals we want? diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 4bc95e4dd1852..bec9f0ded6b3a 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -2652,6 +2652,7 @@ bool pg_interval_t::check_new_interval( pg_interval_t& i = (*past_intervals)[same_interval_since]; i.first = same_interval_since; i.last = osdmap->get_epoch() - 1; + assert(i.first <= i.last); i.acting = old_acting; i.up = old_up; i.primary = old_acting_primary; From fa347d8f69b8eff2e246d35a127c4bfa5a50b5e0 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Thu, 27 Aug 2015 16:02:44 -0700 Subject: [PATCH 449/654] rgw: delete-at and delete-after also on obj put / copy And potentially later we could use also the S3 api, so it could work with multipart upload, and POST obj. 
Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_op.cc | 32 +++++++++++------ src/rgw/rgw_op.h | 12 +++++-- src/rgw/rgw_rados.cc | 48 +++++++++++++++++-------- src/rgw/rgw_rados.h | 12 ++++--- src/rgw/rgw_rest_swift.cc | 75 ++++++++++++++++++++++++++------------- 5 files changed, 123 insertions(+), 56 deletions(-) diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 3078e4197904d..74c77f761ebcb 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -1581,7 +1581,7 @@ class RGWPutObjProcessor_Multipart : public RGWPutObjProcessor_Atomic protected: int prepare(RGWRados *store, string *oid_rand); int do_complete(string& etag, time_t *mtime, time_t set_mtime, - map& attrs, + map& attrs, time_t delete_at, const char *if_match = NULL, const char *if_nomatch = NULL); public: @@ -1655,7 +1655,7 @@ static bool is_v2_upload_id(const string& upload_id) } int RGWPutObjProcessor_Multipart::do_complete(string& etag, time_t *mtime, time_t set_mtime, - map& attrs, + map& attrs, time_t delete_at, const char *if_match, const char *if_nomatch) { complete_writing_data(); @@ -1666,6 +1666,7 @@ int RGWPutObjProcessor_Multipart::do_complete(string& etag, time_t *mtime, time_ head_obj_op.meta.set_mtime = set_mtime; head_obj_op.meta.mtime = mtime; head_obj_op.meta.owner = s->owner.get_id(); + head_obj_op.meta.delete_at = delete_at; int r = head_obj_op.write_meta(s->obj_size, attrs); if (r < 0) @@ -1788,6 +1789,17 @@ static int get_system_versioning_params(req_state *s, uint64_t *olh_epoch, strin return 0; } +static void encode_delete_at_attr(time_t delete_at, map& attrs) +{ + if (delete_at == 0) { + return; + } + + bufferlist delatbl; + ::encode(utime_t(delete_at, 0), delatbl); + attrs[RGW_ATTR_DELETE_AT] = delatbl; +} + void RGWPutObj::execute() { RGWPutObjProcessor *processor = NULL; @@ -1985,8 +1997,10 @@ void RGWPutObj::execute() } rgw_get_request_metadata(s->cct, s->info, attrs); + encode_delete_at_attr(delete_at, attrs); + + ret = processor->complete(etag, &mtime, 0, attrs, 
delete_at, if_match, if_nomatch); - ret = processor->complete(etag, &mtime, 0, attrs, if_match, if_nomatch); done: dispose_processor(processor); perfcounter->tinc(l_rgw_put_lat, @@ -2105,7 +2119,7 @@ void RGWPostObj::execute() attrs[RGW_ATTR_CONTENT_TYPE] = ct_bl; } - ret = processor->complete(etag, NULL, 0, attrs); + ret = processor->complete(etag, NULL, 0, attrs, delete_at); done: dispose_processor(processor); @@ -2372,12 +2386,7 @@ void RGWPutMetadataObject::execute() /* Filter currently existing attributes. */ prepare_add_del_attrs(orig_attrs, attrs, rmattrs); populate_with_generic_attrs(s, attrs); - - if (!delete_at.is_zero()) { - bufferlist delatbl; - ::encode(delete_at, delatbl); - attrs[RGW_ATTR_DELETE_AT] = delatbl; - } + encode_delete_at_attr(delete_at, attrs); ret = store->set_attrs(s->obj_ctx, obj, attrs, &rmattrs, NULL); } @@ -2612,6 +2621,8 @@ void RGWCopyObj::execute() obj_ctx.set_atomic(src_obj); obj_ctx.set_atomic(dst_obj); + encode_delete_at_attr(delete_at, attrs); + ret = store->copy_obj(obj_ctx, s->user.user_id, client_id, @@ -2631,6 +2642,7 @@ void RGWCopyObj::execute() attrs_mod, attrs, RGW_OBJ_CATEGORY_MAIN, olh_epoch, + delete_at, (version_id.empty() ? 
NULL : &version_id), &s->req_id, /* use req_id as tag */ &etag, diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h index 8781faeff4613..7a196a3d6ce22 100644 --- a/src/rgw/rgw_op.h +++ b/src/rgw/rgw_op.h @@ -436,6 +436,8 @@ class RGWPutObj : public RGWOp { uint64_t olh_epoch; string version_id; + time_t delete_at; + public: RGWPutObj() { ret = 0; @@ -449,6 +451,7 @@ class RGWPutObj : public RGWOp { mtime = 0; user_manifest_parts_hash = NULL; olh_epoch = 0; + delete_at = 0; } virtual void init(RGWRados *store, struct req_state *s, RGWHandler *h) { @@ -489,11 +492,12 @@ class RGWPostObj : public RGWOp { string content_type; RGWAccessControlPolicy policy; map attrs; + time_t delete_at; public: RGWPostObj() : min_len(0), max_len(LLONG_MAX), ret(0), len(0), ofs(0), supplied_md5_b64(NULL), supplied_etag(NULL), - data_pending(false) {} + data_pending(false), delete_at(0) {} virtual void init(RGWRados *store, struct req_state *s, RGWHandler *h) { RGWOp::init(store, s, h); @@ -579,11 +583,11 @@ class RGWPutMetadataObject : public RGWOp { int ret; RGWAccessControlPolicy policy; string placement_rule; - utime_t delete_at; + time_t delete_at; public: RGWPutMetadataObject() - : ret(0) + : ret(0), delete_at(0) {} virtual void init(RGWRados *store, struct req_state *s, RGWHandler *h) { @@ -658,6 +662,7 @@ class RGWCopyObj : public RGWOp { string version_id; uint64_t olh_epoch; + time_t delete_at; int init_common(); @@ -680,6 +685,7 @@ class RGWCopyObj : public RGWOp { attrs_mod = RGWRados::ATTRSMOD_NONE; last_ofs = 0; olh_epoch = 0; + delete_at = 0; } static bool parse_copy_location(const string& src, string& bucket_name, rgw_obj_key& object); diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index bde1cd7249f6f..a682cd422d142 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -892,10 +892,10 @@ void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct) } int RGWPutObjProcessor::complete(string& etag, time_t *mtime, time_t set_mtime, - map& attrs, + map& 
attrs, time_t delete_at, const char *if_match, const char * if_nomatch) { - int r = do_complete(etag, mtime, set_mtime, attrs, if_match, if_nomatch); + int r = do_complete(etag, mtime, set_mtime, attrs, delete_at, if_match, if_nomatch); if (r < 0) return r; @@ -1209,7 +1209,7 @@ int RGWPutObjProcessor_Atomic::complete_writing_data() } int RGWPutObjProcessor_Atomic::do_complete(string& etag, time_t *mtime, time_t set_mtime, - map& attrs, + map& attrs, time_t delete_at, const char *if_match, const char *if_nomatch) { int r = complete_writing_data(); @@ -1235,6 +1235,7 @@ int RGWPutObjProcessor_Atomic::do_complete(string& etag, time_t *mtime, time_t s obj_op.meta.owner = bucket_info.owner; obj_op.meta.flags = PUT_OBJ_CREATE; obj_op.meta.olh_epoch = olh_epoch; + obj_op.meta.delete_at = delete_at; r = obj_op.write_meta(obj_len, attrs); if (r < 0) { @@ -3624,6 +3625,17 @@ int RGWRados::Object::Write::write_meta(uint64_t size, } } + if (meta.delete_at > 0) { + rgw_obj_key obj_key; + obj.get_index_key(&obj_key); + + r = store->objexp_hint_add(utime_t(meta.delete_at, 0), bucket.name, bucket.bucket_id, obj_key); + if (r < 0) { + ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl; + /* ignoring error, nothing we can do at this point */ + } + } + /* update quota cache */ store->quota_handler->update_stats(meta.owner, bucket, (orig_exists ? 
0 : 1), size, orig_size); @@ -3861,8 +3873,8 @@ class RGWRadosPutObj : public RGWGetDataCB processor->set_extra_data_len(len); } - int complete(string& etag, time_t *mtime, time_t set_mtime, map& attrs) { - return processor->complete(etag, mtime, set_mtime, attrs); + int complete(string& etag, time_t *mtime, time_t set_mtime, map& attrs, time_t delete_at) { + return processor->complete(etag, mtime, set_mtime, attrs, delete_at); } }; @@ -3925,7 +3937,7 @@ int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, rgw_obj& obj) return ret; } - return copy_obj_data(rctx, dest_bucket_info, read_op, end, obj, obj, max_chunk_size, NULL, mtime, attrset, RGW_OBJ_CATEGORY_MAIN, 0, NULL, NULL, NULL, NULL); + return copy_obj_data(rctx, dest_bucket_info, read_op, end, obj, obj, max_chunk_size, NULL, mtime, attrset, RGW_OBJ_CATEGORY_MAIN, 0, 0, NULL, NULL, NULL, NULL); } int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, @@ -3948,6 +3960,7 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, map& attrs, RGWObjCategory category, uint64_t olh_epoch, + time_t delete_at, string *version_id, string *ptag, string *petag, @@ -4047,7 +4060,7 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, set_copy_attrs(src_attrs, attrs, attrs_mod); } - ret = cb.complete(etag, mtime, set_mtime, attrs); + ret = cb.complete(etag, mtime, set_mtime, attrs, delete_at); if (ret < 0) { goto set_err_state; } @@ -4128,6 +4141,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx, map& attrs, RGWObjCategory category, uint64_t olh_epoch, + time_t delete_at, string *version_id, string *ptag, string *petag, @@ -4160,7 +4174,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx, return fetch_remote_obj(obj_ctx, user_id, client_id, op_id, info, source_zone, dest_obj, src_obj, dest_bucket_info, src_bucket_info, src_mtime, mtime, mod_ptr, unmod_ptr, if_match, if_nomatch, attrs_mod, attrs, category, - olh_epoch, version_id, ptag, petag, err, progress_cb, progress_data); + olh_epoch, delete_at, version_id, ptag, 
petag, err, progress_cb, progress_data); } map src_attrs; @@ -4238,7 +4252,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx, if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */ return copy_obj_data(obj_ctx, dest_bucket_info, read_op, end, dest_obj, src_obj, - max_chunk_size, mtime, 0, attrs, category, olh_epoch, + max_chunk_size, mtime, 0, attrs, category, olh_epoch, delete_at, version_id, ptag, petag, err); } @@ -4330,6 +4344,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx, write_op.meta.flags = PUT_OBJ_CREATE; write_op.meta.category = category; write_op.meta.olh_epoch = olh_epoch; + write_op.meta.delete_at = delete_at; ret = write_op.write_meta(end + 1, attrs); if (ret < 0) { @@ -4373,6 +4388,7 @@ int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx, map& attrs, RGWObjCategory category, uint64_t olh_epoch, + time_t delete_at, string *version_id, string *ptag, string *petag, @@ -4429,7 +4445,7 @@ int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx, } } - ret = processor.complete(etag, mtime, set_mtime, attrs); + ret = processor.complete(etag, mtime, set_mtime, attrs, delete_at); return ret; } @@ -5503,12 +5519,16 @@ int RGWRados::set_attrs(void *ctx, rgw_obj& obj, if (name.compare(RGW_ATTR_DELETE_AT) == 0) { utime_t ts; - ::decode(ts, bl); + try { + ::decode(ts, bl); - rgw_obj_key obj_key; - obj.get_index_key(&obj_key); + rgw_obj_key obj_key; + obj.get_index_key(&obj_key); - objexp_hint_add(ts, bucket.name, bucket.bucket_id, obj_key); + objexp_hint_add(ts, bucket.name, bucket.bucket_id, obj_key); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl; + } } } diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index 121bfa58c22c8..c533b51045eb1 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -1606,10 +1606,11 @@ class RGWRados const char *if_match; const char *if_nomatch; uint64_t olh_epoch; + time_t delete_at; MetaParams() : mtime(NULL), rmattrs(NULL), 
data(NULL), manifest(NULL), ptag(NULL), remove_objs(NULL), set_mtime(0), category(RGW_OBJ_CATEGORY_MAIN), flags(0), - if_match(NULL), if_nomatch(NULL), olh_epoch(0) {} + if_match(NULL), if_nomatch(NULL), olh_epoch(0), delete_at(0) {} } meta; Write(RGWRados::Object *_target) : target(_target) {} @@ -1808,6 +1809,7 @@ class RGWRados map& attrs, RGWObjCategory category, uint64_t olh_epoch, + time_t delete_at, string *version_id, string *ptag, string *petag, @@ -1855,6 +1857,7 @@ class RGWRados map& attrs, RGWObjCategory category, uint64_t olh_epoch, + time_t delete_at, string *version_id, string *ptag, string *petag, @@ -1873,6 +1876,7 @@ class RGWRados map& attrs, RGWObjCategory category, uint64_t olh_epoch, + time_t delete_at, string *version_id, string *ptag, string *petag, @@ -2377,7 +2381,7 @@ class RGWPutObjProcessor RGWBucketInfo bucket_info; virtual int do_complete(string& etag, time_t *mtime, time_t set_mtime, - map& attrs, + map& attrs, time_t delete_at, const char *if_match = NULL, const char *if_nomatch = NULL) = 0; public: @@ -2393,7 +2397,7 @@ class RGWPutObjProcessor assert(0); } virtual int complete(string& etag, time_t *mtime, time_t set_mtime, - map& attrs, + map& attrs, time_t delete_at, const char *if_match = NULL, const char *if_nomatch = NULL); CephContext *ctx(); @@ -2464,7 +2468,7 @@ class RGWPutObjProcessor_Atomic : public RGWPutObjProcessor_Aio int write_data(bufferlist& bl, off_t ofs, void **phandle, bool exclusive); virtual int do_complete(string& etag, time_t *mtime, time_t set_mtime, - map& attrs, + map& attrs, time_t delete_at, const char *if_match = NULL, const char *if_nomatch = NULL); int prepare_next_part(off_t ofs); diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc index 7dea41012f7db..02a0cd17f0cd4 100644 --- a/src/rgw/rgw_rest_swift.cc +++ b/src/rgw/rgw_rest_swift.cc @@ -481,6 +481,40 @@ void RGWDeleteBucket_ObjStore_SWIFT::send_response() rgw_flush_formatter_and_reset(s, s->formatter); } +static int 
get_delete_at_param(req_state *s, time_t *delete_at) +{ + /* Handle Swift object expiration. */ + utime_t delat_proposal; + string x_delete = s->info.env->get("HTTP_X_DELETE_AFTER", ""); + + if (x_delete.empty()) { + x_delete = s->info.env->get("HTTP_X_DELETE_AT", ""); + } else { + /* X-Delete-After HTTP is present. It means we need add its value + * to the current time. */ + delat_proposal = ceph_clock_now(g_ceph_context); + } + + if (x_delete.empty()) { + return 0; + } + string err; + long ts = strict_strtoll(x_delete.c_str(), 10, &err); + + if (!err.empty()) { + return -EINVAL; + } + + delat_proposal += utime_t(ts, 0); + if (delat_proposal < ceph_clock_now(g_ceph_context)) { + return -EINVAL; + } + + *delete_at = delat_proposal.sec(); + + return 0; +} + int RGWPutObj_ObjStore_SWIFT::get_params() { if (s->has_bad_meta) @@ -515,6 +549,12 @@ int RGWPutObj_ObjStore_SWIFT::get_params() obj_manifest = s->info.env->get("HTTP_X_OBJECT_MANIFEST"); + int r = get_delete_at_param(s, &delete_at); + if (r < 0) { + ldout(s->cct, 5) << "ERROR: failed to get Delete-At param" << dendl; + return r; + } + return RGWPutObj_ObjStore::get_params(); } @@ -621,31 +661,10 @@ int RGWPutMetadataObject_ObjStore_SWIFT::get_params() } /* Handle Swift object expiration. */ - utime_t delat_proposal; - string x_delete = s->info.env->get("HTTP_X_DELETE_AFTER", ""); - - if (x_delete.empty()) { - x_delete = s->info.env->get("HTTP_X_DELETE_AT", ""); - } else { - /* X-Delete-After HTTP is present. It means we need add its value - * to the current time. 
*/ - delat_proposal = ceph_clock_now(g_ceph_context); - } - - if (!x_delete.empty()) { - string err; - long ts = strict_strtoll(x_delete.c_str(), 10, &err); - - if (!err.empty()) { - return -EINVAL; - } - - delat_proposal += utime_t(ts, 0); - if (delat_proposal < ceph_clock_now(g_ceph_context)) { - return -EINVAL; - } - - delete_at = delat_proposal; + int r = get_delete_at_param(s, &delete_at); + if (r < 0) { + ldout(s->cct, 5) << "ERROR: failed to get Delete-At param" << dendl; + return r; } placement_rule = s->info.env->get("HTTP_X_STORAGE_POLICY", ""); @@ -750,6 +769,12 @@ int RGWCopyObj_ObjStore_SWIFT::get_params() attrs_mod = RGWRados::ATTRSMOD_MERGE; } + int r = get_delete_at_param(s, &delete_at); + if (r < 0) { + ldout(s->cct, 5) << "ERROR: failed to get Delete-At param" << dendl; + return r; + } + return 0; } From a69a989feb68ae3722a12aa07d07b805a4c69bb2 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Thu, 27 Aug 2015 16:38:04 -0700 Subject: [PATCH 450/654] rgw: objexp shards index by key Not by time. This should provide better concurrency. 
Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_object_expirer_core.cc | 15 +++++---- src/rgw/rgw_object_expirer_core.h | 2 +- src/rgw/rgw_rados.cc | 53 ++++++++++-------------------- src/rgw/rgw_rados.h | 10 +++--- 4 files changed, 31 insertions(+), 49 deletions(-) diff --git a/src/rgw/rgw_object_expirer_core.cc b/src/rgw/rgw_object_expirer_core.cc index 2dee9bb011a1f..b092b40bc3945 100644 --- a/src/rgw/rgw_object_expirer_core.cc +++ b/src/rgw/rgw_object_expirer_core.cc @@ -125,7 +125,7 @@ void RGWObjectExpirer::trim_chunk(const string& shard, return; } -void RGWObjectExpirer::proceed_single_shard(const string& shard, +void RGWObjectExpirer::process_single_shard(const string& shard, const utime_t& last_run, const utime_t& round_start) { @@ -179,20 +179,21 @@ void RGWObjectExpirer::proceed_single_shard(const string& shard, return; } -void RGWObjectExpirer::inspect_all_shards(const utime_t& last_run, - const utime_t& round_start) +void RGWObjectExpirer::inspect_all_shards(const utime_t& last_run, const utime_t& round_start) { bool is_next_available; utime_t shard_marker; - do { + CephContext *cct = store->ctx(); + int num_shards = cct->_conf->rgw_objexp_hints_num_shards; + + for (int i = 0; i < num_shards; i++) { string shard; - store->objexp_get_shard(last_run, round_start, shard_marker, shard, - is_next_available); + store->objexp_get_shard(i, shard); ldout(store->ctx(), 20) << "proceeding shard = " << shard << dendl; - proceed_single_shard(shard, last_run, round_start); + process_single_shard(shard, last_run, round_start); } while (is_next_available); return; diff --git a/src/rgw/rgw_object_expirer_core.h b/src/rgw/rgw_object_expirer_core.h index 12bcc8e6b9ae7..bd137fa6bd296 100644 --- a/src/rgw/rgw_object_expirer_core.h +++ b/src/rgw/rgw_object_expirer_core.h @@ -74,7 +74,7 @@ class RGWObjectExpirer { const utime_t& from, const utime_t& to); - void proceed_single_shard(const string& shard, + void process_single_shard(const string& shard, const utime_t& last_run, 
const utime_t& round_start); diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index a682cd422d142..3ff74e4598ce6 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -2348,18 +2348,27 @@ int RGWRados::time_log_trim(const string& oid, const utime_t& start_time, const return cls_log_trim(io_ctx, oid, start_time, end_time, from_marker, to_marker); } -string RGWRados::objexp_hint_get_shardname(const utime_t &ts) +string RGWRados::objexp_hint_get_shardname(int shard_num) { - const time_t roundedts = ts.sec() / cct->_conf->rgw_objexp_time_step; - const unsigned int shnum = roundedts % cct->_conf->rgw_objexp_hints_num_shards; - char buf[32]; - snprintf(buf, sizeof(buf), "%010u", shnum); + snprintf(buf, sizeof(buf), "%010u", shard_num); string objname("obj_delete_at_hint."); return objname + buf; } +#define MAX_PBJEXP_SHARDS_PRIME 7877 + +int RGWRados::objexp_key_shard(const rgw_obj_key& key) +{ + string obj_key = key.name + key.instance; + int num_shards = cct->_conf->rgw_objexp_hints_num_shards; + uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size()); + uint32_t sid2 = sid ^ ((sid & 0xFF) << 24); + sid = sid2 % MAX_BUCKET_INDEX_SHARDS_PRIME % num_shards; + return sid % num_shards; +} + static string objexp_hint_get_keyext(const string& bucket_name, const string& bucket_id, const rgw_obj_key& obj_key) @@ -2384,40 +2393,14 @@ int RGWRados::objexp_hint_add(const utime_t& delete_at, ObjectWriteOperation op; cls_timeindex_add(op, delete_at, keyext, hebl); - string shard_name = objexp_hint_get_shardname(delete_at); + string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key)); return objexp_pool_ctx.operate(shard_name, &op); } -void RGWRados::objexp_get_shard(const utime_t& start_time, - const utime_t& end_time, - utime_t &marker, /* in/out */ - string& shard, /* out */ - bool& truncated) /* out */ +void RGWRados::objexp_get_shard(int shard_num, + string& shard) /* out */ { - if (marker.is_zero()) { - marker = start_time; - } - - 
const uint32_t num_shards = cct->_conf->rgw_objexp_hints_num_shards; - const time_t time_step = cct->_conf->rgw_objexp_time_step; - - const time_t sts = start_time.sec() / time_step; - const time_t ets = end_time.sec() / time_step; - const time_t mts = marker.sec() / time_step; - - const uint32_t periods = ets - sts; - const uint32_t iters = min(periods, num_shards - 1); - - shard = objexp_hint_get_shardname(marker); - - if (mts - sts < iters) { - truncated = true; - marker += utime_t(time_step, 0); - } else { - truncated = false; - } - - return; + shard = objexp_hint_get_shardname(shard_num); } int RGWRados::objexp_hint_list(const string& oid, diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index c533b51045eb1..27787ec9859f6 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -2113,12 +2113,10 @@ class RGWRados int time_log_trim(const string& oid, const utime_t& start_time, const utime_t& end_time, const string& from_marker, const string& to_marker); - string objexp_hint_get_shardname(const utime_t &ts); - void objexp_get_shard(const utime_t& start_time, - const utime_t& end_time, - utime_t &marker, /* out */ - string& shard, /* out */ - bool& truncated); /* out */ + string objexp_hint_get_shardname(int shard_num); + int objexp_key_shard(const rgw_obj_key& key); + void objexp_get_shard(int shard_num, + string& shard); /* out */ int objexp_hint_add(const utime_t& delete_at, const string& bucket_name, const string& bucket_id, From 3626db4f2fb5ff1136826c91f5ded57a7c7303d3 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Wed, 2 Sep 2015 17:56:07 -0700 Subject: [PATCH 451/654] rgw: don't copy delete_at attr, unless it's intra region copy We don't want to keep the expiration value of a copied object, unless we're doing a copy within the same zone group. 
Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_rados.cc | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 3ff74e4598ce6..ae9d64fc6c9b9 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -4024,6 +4024,20 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, JSONDecoder::decode_json("attrs", src_attrs, &jp); src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout + if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */ + src_attrs.erase(RGW_ATTR_DELETE_AT); + } else { + map::iterator iter = src_attrs.find(RGW_ATTR_DELETE_AT); + if (iter != src_attrs.end()) { + try { + utime_t da; + ::decode(da, iter->second); + delete_at = (time_t)da.sec(); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl; + } + } + } } } @@ -4182,6 +4196,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx, } src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL]; + src_attrs.erase(RGW_ATTR_DELETE_AT); set_copy_attrs(src_attrs, attrs, attrs_mod); attrs.erase(RGW_ATTR_ID_TAG); From 9bf103c420b886e103c34dbecb98cacfedc05c17 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 2 Sep 2015 21:32:30 -0400 Subject: [PATCH 452/654] osd/ReplicatedPG: snaptimmer: adjust stats through ctx->delta_stats We should not directly modifying the stats here; use the existing ctx->delta_stats field. 
Signed-off-by: Sage Weil --- src/osd/ReplicatedPG.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index c29a2afa706d5..2c12e72bb2b0d 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -3093,8 +3093,7 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid) if (*p == last) break; assert(p != snapset.clones.end()); - object_stat_sum_t delta; - delta.num_bytes -= snapset.get_clone_bytes(last); + ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last); if (p != snapset.clones.begin()) { // not the oldest... merge overlap into next older clone @@ -3104,25 +3103,24 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid) bool adjust_prev_bytes = is_present_clone(prev_coid); if (adjust_prev_bytes) - delta.num_bytes -= snapset.get_clone_bytes(*n); + ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n); snapset.clone_overlap[*n].intersection_of( snapset.clone_overlap[*p]); if (adjust_prev_bytes) - delta.num_bytes += snapset.get_clone_bytes(*n); + ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n); } - delta.num_objects--; + ctx->delta_stats.num_objects--; if (coi.is_dirty()) - delta.num_objects_dirty--; + ctx->delta_stats.num_objects_dirty--; if (coi.is_omap()) - delta.num_objects_omap--; + ctx->delta_stats.num_objects_omap--; if (coi.is_whiteout()) { dout(20) << __func__ << " trimming whiteout on " << coid << dendl; - delta.num_whiteouts--; + ctx->delta_stats.num_whiteouts--; } - delta.num_object_clones--; - info.stats.stats.add(delta); + ctx->delta_stats.num_object_clones--; obc->obs.exists = false; snapset.clones.erase(p); @@ -12163,6 +12161,8 @@ boost::statechart::result ReplicatedPG::TrimmingObjects::react(const SnapTrim&) assert(repop); repop->queue_snap_trimmer = true; + info.stats.stats.add(ctx->delta_stats); + repops.insert(repop->get()); pg->simple_repop_submit(repop); } From 
eb2993a760e8076d7d6adfea125e06504d8d3263 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 2 Sep 2015 21:33:25 -0400 Subject: [PATCH 453/654] osd/ReplicatedPG: create apply_ctx_stats() helper finish_ctx does a dance to update the pg stats correctly despite racing scrub or backfill; move this into a helper. Signed-off-by: Sage Weil --- src/osd/ReplicatedPG.cc | 6 ++++++ src/osd/ReplicatedPG.h | 2 ++ 2 files changed, 8 insertions(+) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 2c12e72bb2b0d..c254ad5881a49 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -6244,8 +6244,14 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc ctx->obc->ssc->snapset = ctx->new_snapset; } + apply_ctx_stats(ctx, scrub_ok); +} + +void ReplicatedPG::apply_ctx_stats(OpContext *ctx, bool scrub_ok) +{ info.stats.stats.add(ctx->delta_stats); + const hobject_t& soid = ctx->obs->oi.soid; for (set::iterator i = backfill_targets.begin(); i != backfill_targets.end(); ++i) { diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 4517e68730ebb..a2802b2ecdef2 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -1157,6 +1157,8 @@ class ReplicatedPG : public PG, public PGBackend::Listener { void reply_ctx(OpContext *ctx, int err, eversion_t v, version_t uv); void make_writeable(OpContext *ctx); void log_op_stats(OpContext *ctx); + void apply_ctx_stats(OpContext *ctx, + bool scrub_ok=false); ///< true if we should skip scrub stat update void write_update_size_and_usage(object_stat_sum_t& stats, object_info_t& oi, interval_set& modified, uint64_t offset, From 75d9f584e224cb0e36445b6d87cd245793fa675d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 2 Sep 2015 21:39:55 -0400 Subject: [PATCH 454/654] osd/ReplicatedPG: use apply_ctx_stats() everywhere We were open-coding the stats update in several places, and getting it wrong in most of them. Use a single helper to do it. 
Note that any repop must *either* call apply_ctx_stats() or finish_ctx() (which calls it for you). Signed-off-by: Sage Weil --- src/osd/ReplicatedPG.cc | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index c254ad5881a49..09a5bf4068b59 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -8154,6 +8154,8 @@ void ReplicatedPG::handle_watch_timeout(WatchRef watch) ctx->log.back().mod_desc.mark_unrollbackable(); } + // no ctx->delta_stats + // obc ref swallowed by repop! simple_repop_submit(repop); @@ -10702,7 +10704,7 @@ void ReplicatedPG::hit_set_remove_all() utime_t now = ceph_clock_now(cct); ctx->mtime = now; hit_set_trim(repop, 0); - info.stats.stats.add(ctx->delta_stats); + apply_ctx_stats(ctx); simple_repop_submit(repop); } @@ -10923,12 +10925,7 @@ void ReplicatedPG::hit_set_persist() hit_set_trim(repop, max); - info.stats.stats.add(ctx->delta_stats); - if (scrubber.active) { - if (cmp(oid, scrubber.start, get_sort_bitwise()) < 0) - scrub_cstat.add(ctx->delta_stats); - } - + apply_ctx_stats(ctx); simple_repop_submit(repop); } @@ -12167,13 +12164,14 @@ boost::statechart::result ReplicatedPG::TrimmingObjects::react(const SnapTrim&) assert(repop); repop->queue_snap_trimmer = true; - info.stats.stats.add(ctx->delta_stats); + pg->apply_ctx_stats(repop->ctx); repops.insert(repop->get()); pg->simple_repop_submit(repop); } return discard_event(); } + /* WaitingOnReplicasObjects */ ReplicatedPG::WaitingOnReplicas::WaitingOnReplicas(my_context ctx) : my_base(ctx), From 64962aafed362a2a798eefe54158f65af767f0bc Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 2 Sep 2015 21:58:37 -0400 Subject: [PATCH 455/654] qa/workunits/rados/test_alloc_hint.sh: sudo to list files The osd data dir is owned by ceph and not readable by other non-root users. 
Fixes: #12861 Signed-off-by: Sage Weil --- qa/workunits/rados/test_alloc_hint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/workunits/rados/test_alloc_hint.sh b/qa/workunits/rados/test_alloc_hint.sh index 86d3986659eb9..c43fc3c00bde5 100755 --- a/qa/workunits/rados/test_alloc_hint.sh +++ b/qa/workunits/rados/test_alloc_hint.sh @@ -61,7 +61,7 @@ function expect_alloc_hint_eq() { # e.g., .../25.6_head/foo__head_7FC1F406__19 # .../26.bs1_head/bar__head_EFE6384B__1a_ffffffffffffffff_1 - local fns=(${OSD_DATA[i]}/current/${PGID}*_head/${OBJ}_*) + local fns=$(sudo find ${OSD_DATA[i]}/current/${PGID}*_head -type f | grep head/${OBJ}_) local count="${#fns[@]}" if [ "${count}" -ne 1 ]; then echo "bad fns count: ${count}" >&2 From f65267c96cbd4cd25036b6bf399692e77bbb9436 Mon Sep 17 00:00:00 2001 From: Vikhyat Umrao Date: Thu, 3 Sep 2015 12:02:05 +0530 Subject: [PATCH 456/654] rgw : setting max number of buckets for users via ceph.conf option This patch adds a new option "rgw_user_max_buckets" for setting max number of buckets for users via ceph.conf. 
Fixes #12714 Signed-off-by: Vikhyat Umrao --- src/common/config_opts.h | 1 + src/rgw/rgw_admin.cc | 1 + src/rgw/rgw_rest_user.cc | 5 +++-- src/rgw/rgw_user.cc | 15 ++++++++------- src/test/cli/radosgw-admin/help.t | 1 + 5 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index f8bbdfbc0a565..a1c88b196289d 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -1123,6 +1123,7 @@ OPTION(rgw_multipart_min_part_size, OPT_INT, 5 * 1024 * 1024) // min size for ea OPTION(rgw_multipart_part_upload_limit, OPT_INT, 10000) // parts limit in multipart upload OPTION(rgw_olh_pending_timeout_sec, OPT_INT, 3600) // time until we retire a pending olh change +OPTION(rgw_user_max_buckets, OPT_U32, 1000) // global option to set max buckets count for all user OPTION(mutex_perf_counter, OPT_BOOL, false) // enable/disable mutex perf counter OPTION(throttler_perf_counter, OPT_BOOL, true) // enable/disable throttler perf counter diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc index 1140cbdbbc5f8..610275700cb83 100644 --- a/src/rgw/rgw_admin.cc +++ b/src/rgw/rgw_admin.cc @@ -125,6 +125,7 @@ void _usage() cout << " --access= Set access permissions for sub-user, should be one\n"; cout << " of read, write, readwrite, full\n"; cout << " --display-name=\n"; + cout << " --max_buckets max number of buckets for a user\n"; cout << " --system set the system flag on the user\n"; cout << " --bucket=\n"; cout << " --pool=\n"; diff --git a/src/rgw/rgw_rest_user.cc b/src/rgw/rgw_rest_user.cc index 6b5d1eb286965..6cd2591bfb5de 100644 --- a/src/rgw/rgw_rest_user.cc +++ b/src/rgw/rgw_rest_user.cc @@ -71,6 +71,7 @@ void RGWOp_User_Create::execute() bool exclusive; uint32_t max_buckets; + uint32_t default_max_buckets = s->cct->_conf->rgw_user_max_buckets; RGWUserAdminOpState op_state; @@ -83,7 +84,7 @@ void RGWOp_User_Create::execute() RESTArgs::get_string(s, "user-caps", caps, &caps); RESTArgs::get_bool(s, "generate-key", 
true, &gen_key); RESTArgs::get_bool(s, "suspended", false, &suspended); - RESTArgs::get_uint32(s, "max-buckets", RGW_DEFAULT_MAX_BUCKETS, &max_buckets); + RESTArgs::get_uint32(s, "max-buckets", default_max_buckets, &max_buckets); RESTArgs::get_bool(s, "system", false, &system); RESTArgs::get_bool(s, "exclusive", false, &exclusive); @@ -122,7 +123,7 @@ void RGWOp_User_Create::execute() op_state.set_key_type(key_type); } - if (max_buckets != RGW_DEFAULT_MAX_BUCKETS) + if (max_buckets != default_max_buckets) op_state.set_max_buckets(max_buckets); if (s->info.args.exists("suspended")) diff --git a/src/rgw/rgw_user.cc b/src/rgw/rgw_user.cc index 5a5328d8c1b4d..47475c3d8b5ea 100644 --- a/src/rgw/rgw_user.cc +++ b/src/rgw/rgw_user.cc @@ -1810,7 +1810,13 @@ int RGWUser::execute_add(RGWUserAdminOpState& op_state, std::string *err_msg) if (!user_email.empty()) user_info.user_email = user_email; - user_info.max_buckets = op_state.get_max_buckets(); + CephContext *cct = store->ctx(); + if (op_state.max_buckets_specified) { + user_info.max_buckets = op_state.get_max_buckets(); + } else { + user_info.max_buckets = cct->_conf->rgw_user_max_buckets; + } + user_info.suspended = op_state.get_suspension_status(); user_info.system = op_state.system; @@ -2016,13 +2022,8 @@ int RGWUser::execute_modify(RGWUserAdminOpState& op_state, std::string *err_msg) if (!display_name.empty()) user_info.display_name = display_name; - // will be set to RGW_DEFAULT_MAX_BUCKETS by default - uint32_t max_buckets = op_state.get_max_buckets(); - - ldout(store->ctx(), 0) << "max_buckets=" << max_buckets << " specified=" << op_state.max_buckets_specified << dendl; - if (op_state.max_buckets_specified) - user_info.max_buckets = max_buckets; + user_info.max_buckets = op_state.get_max_buckets(); if (op_state.system_specified) user_info.system = op_state.system; diff --git a/src/test/cli/radosgw-admin/help.t b/src/test/cli/radosgw-admin/help.t index fec8737541e4b..c9eab32c2db11 100644 --- 
a/src/test/cli/radosgw-admin/help.t +++ b/src/test/cli/radosgw-admin/help.t @@ -82,6 +82,7 @@ --access= Set access permissions for sub-user, should be one of read, write, readwrite, full --display-name= + --max_buckets max number of buckets for a user --system set the system flag on the user --bucket= --pool= From 557e581a4efbd47427846613c1f4d78598e9fbc0 Mon Sep 17 00:00:00 2001 From: huangjun Date: Thu, 3 Sep 2015 19:10:03 +0800 Subject: [PATCH 457/654] mon/MonClient: fix error in 'ceph ping mon.id' Fixes: #12442 Signed-off-by: huangjun --- src/mon/MonClient.cc | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/mon/MonClient.cc b/src/mon/MonClient.cc index 6624e6be30335..35c5e726c3ca3 100644 --- a/src/mon/MonClient.cc +++ b/src/mon/MonClient.cc @@ -207,11 +207,18 @@ int MonClient::ping_monitor(const string &mon_id, string *result_reply) { ldout(cct, 10) << __func__ << dendl; - if (mon_id.empty()) { + string new_mon_id; + if (monmap.contains("noname-"+mon_id)) { + new_mon_id = "noname-"+mon_id; + } else { + new_mon_id = mon_id; + } + + if (new_mon_id.empty()) { ldout(cct, 10) << __func__ << " specified mon id is empty!" << dendl; return -EINVAL; - } else if (!monmap.contains(mon_id)) { - ldout(cct, 10) << __func__ << " no such monitor 'mon." << mon_id << "'" + } else if (!monmap.contains(new_mon_id)) { + ldout(cct, 10) << __func__ << " no such monitor 'mon." << new_mon_id << "'" << dendl; return -ENOENT; } @@ -224,8 +231,8 @@ int MonClient::ping_monitor(const string &mon_id, string *result_reply) smsgr->add_dispatcher_head(pinger); smsgr->start(); - ConnectionRef con = smsgr->get_connection(monmap.get_inst(mon_id)); - ldout(cct, 10) << __func__ << " ping mon." << mon_id + ConnectionRef con = smsgr->get_connection(monmap.get_inst(new_mon_id)); + ldout(cct, 10) << __func__ << " ping mon." 
<< new_mon_id << " " << con->get_peer_addr() << dendl; con->send_message(new MPing); From 929ca5b6adc4a37567d6ce9ebbb0803582fed4b1 Mon Sep 17 00:00:00 2001 From: Nathan Cutler Date: Thu, 9 Jul 2015 13:39:03 +0200 Subject: [PATCH 458/654] ceph.spec.in: drop lsb-release dependency from ceph-common It was there as an equivalent of redhat-lsb-core, but the redhat-lsb-core bits that ceph-common relies on are included in insserv-compat on SUSE, and insserv-compat is in base. Signed-off-by: Nathan Cutler --- ceph.spec.in | 3 --- 1 file changed, 3 deletions(-) diff --git a/ceph.spec.in b/ceph.spec.in index a8374dc4f35eb..2e4fd54ffdea5 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -180,9 +180,6 @@ Requires: python-requests %if 0%{?rhel} || 0%{?fedora} Requires: redhat-lsb-core %endif -%if 0%{defined suse_version} -Requires: lsb-release -%endif # python-argparse is only needed in distros with Python 2.6 or lower %if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110) Requires: python-argparse From 75f2a983211668bd127336ff07d447a0f5525734 Mon Sep 17 00:00:00 2001 From: Nathan Cutler Date: Thu, 9 Jul 2015 13:42:50 +0200 Subject: [PATCH 459/654] ceph.spec.in: clean up suse_version conditionals Use 0%{?suse_version} throughout for consistency. 
Signed-off-by: Nathan Cutler --- ceph.spec.in | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ceph.spec.in b/ceph.spec.in index 2e4fd54ffdea5..322e21bee01db 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -114,7 +114,7 @@ BuildRequires: python-requests %if ( 0%{?rhel} > 0 && 0%{?rhel} < 7 ) || ( 0%{?centos} > 0 && 0%{?centos} < 7 ) BuildRequires: python-sphinx10 %endif -%if 0%{?fedora} || 0%{defined suse_version} || 0%{?rhel} >= 7 || 0%{?centos} >= 7 +%if 0%{?fedora} || 0%{?suse_version} || 0%{?rhel} >= 7 || 0%{?centos} >= 7 BuildRequires: python-sphinx %endif @@ -129,7 +129,7 @@ BuildRequires: yasm ################################################################################# # distro-conditional dependencies ################################################################################# -%if 0%{defined suse_version} +%if 0%{?suse_version} Requires: python-Flask BuildRequires: net-tools BuildRequires: libbz2-devel @@ -211,7 +211,7 @@ Summary: Rados REST gateway Group: Development/Libraries Requires: ceph-common = %{epoch}:%{version}-%{release} Requires: librados2 = %{epoch}:%{version}-%{release} -%if 0%{defined suse_version} +%if 0%{?suse_version} BuildRequires: libexpat-devel BuildRequires: FastCGI-devel %endif @@ -552,7 +552,7 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'` CFLAGS="$RPM_OPT_FLAGS" CXXFLAGS="$RPM_OPT_FLAGS" # fix bug in specific version of libedit-devel -%if 0%{defined suse_version} +%if 0%{?suse_version} sed -i -e "s/-lcurses/-lncurses/g" Makefile sed -i -e "s/-lcurses/-lncurses/g" src/Makefile sed -i -e "s/-lcurses/-lncurses/g" man/Makefile @@ -640,7 +640,7 @@ mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-rgw mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw -%if %{defined suse_version} +%if 0%{?suse_version} # Fedora seems to have some problems with this macro, use it only on SUSE %fdupes -s 
$RPM_BUILD_ROOT/%{python_sitelib} %fdupes %buildroot From e54f89677dd509ed2f79bb45be0260325055d97b Mon Sep 17 00:00:00 2001 From: Nathan Cutler Date: Thu, 3 Sep 2015 16:28:19 +0200 Subject: [PATCH 460/654] ceph.spec.in: drop redundant centos from conditionals Since the %{rhel} macro is guaranteed to be defined on all CentOS installations, the %{centos} macro is superfluous. Signed-off-by: Nathan Cutler --- ceph.spec.in | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ceph.spec.in b/ceph.spec.in index 322e21bee01db..6f8565423f09f 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -111,10 +111,10 @@ BuildRequires: pkgconfig BuildRequires: python BuildRequires: python-nose BuildRequires: python-requests -%if ( 0%{?rhel} > 0 && 0%{?rhel} < 7 ) || ( 0%{?centos} > 0 && 0%{?centos} < 7 ) +%if 0%{?rhel} > 0 && 0%{?rhel} < 7 BuildRequires: python-sphinx10 %endif -%if 0%{?fedora} || 0%{?suse_version} || 0%{?rhel} >= 7 || 0%{?centos} >= 7 +%if 0%{?fedora} || 0%{?suse_version} || 0%{?rhel} >= 7 BuildRequires: python-sphinx %endif @@ -853,7 +853,7 @@ mkdir -p %{_localstatedir}/run/ceph/ CEPH_GROUP_ID="" CEPH_USER_ID="" # disabled for now until we have the numbers -%if 0%{?rhel} || 0%{?centos} || 0%{?fedora} +%if 0%{?rhel} || 0%{?fedora} CEPH_GROUP_ID="-g 167" CEPH_USER_ID="-u 167" %endif From d506bf14756826e3c7ebf102b57265c6da01a96a Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Wed, 2 Sep 2015 10:54:44 -0400 Subject: [PATCH 461/654] vstart: add -c argument to radosgw-admin commands Signed-off-by: Casey Bodley --- src/vstart.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vstart.sh b/src/vstart.sh index 8edd8895a1efc..679d33a16140b 100755 --- a/src/vstart.sh +++ b/src/vstart.sh @@ -698,7 +698,7 @@ do_rgw() RGWSUDO= [ $CEPH_RGW_PORT -lt 1024 ] && RGWSUDO=sudo - $RGWSUDO $CEPH_BIN/radosgw --log-file=${CEPH_OUT_DIR}/rgw.log ${RGWDEBUG} --debug-ms=1 + $RGWSUDO $CEPH_BIN/radosgw -c $conf_fn --log-file=${CEPH_OUT_DIR}/rgw.log 
${RGWDEBUG} --debug-ms=1 # Create S3 user local akey='0555b35654ad1656d804' @@ -724,7 +724,7 @@ do_rgw() # Create Swift user echo "setting up user tester" - $CEPH_BIN/radosgw-admin user create --subuser=test:tester --display-name=Tester-Subuser --key-type=swift --secret=testing > /dev/null + $CEPH_BIN/radosgw-admin user create -c $conf_fn --subuser=test:tester --display-name=Tester-Subuser --key-type=swift --secret=testing > /dev/null echo "" echo "S3 User Info:" From 6e7fafc9d2d28e4bd75645de19410b7ec5b7c769 Mon Sep 17 00:00:00 2001 From: Ali Maredia Date: Tue, 4 Aug 2015 12:45:52 -0400 Subject: [PATCH 462/654] README.md: Add basic CMake instructions README.md: Fixed spacing, trimmed cmake section Signed-off-by: Ali Maredia --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index db891ea91b3d2..a3684e40d1e9c 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,9 @@ Debian Squeeze. Backports for Ceph can be found at ceph.com/debian-leveldb. Building Ceph ============= +Autotools +--------- + Developers, please refer to the [Developer Guide](doc/dev/quick_guide.rst) for more information, otherwise, you can build the server daemons, and FUSE client, by executing the @@ -47,6 +50,21 @@ following: (Note that the FUSE client will only be built if libfuse is present.) +CMake +----- + +Prerequisite: + CMake 2.8.11 + +Build instructions: + + mkdir build + cd build + cmake [options] /path/to/ceph/src/dir + make + +(Note that /path/to/ceph/src/dir can be in the tree and out of the tree) + Dependencies ------------ From bf82c65e330e36971ce182c7eefaa2a5e14f9d9e Mon Sep 17 00:00:00 2001 From: Ali Maredia Date: Mon, 17 Aug 2015 16:26:47 -0400 Subject: [PATCH 463/654] cmake: check_TESTPROGRAMS tests running Make check working, accept rocksdb tests. Clean up coming. 
Signed-off-by: Ali Maredia --- CMakeLists.txt | 5 + src/CMakeLists.txt | 23 + src/erasure-code/CMakeLists.txt | 1 + src/erasure-code/isa/CMakeLists.txt | 70 +-- src/erasure-code/shec/CMakeLists.txt | 36 ++ src/test/CMakeLists.txt | 700 ++++++++++++++++++++++++--- src/test/erasure-code/CMakeLists.txt | 242 +++++++++ 7 files changed, 969 insertions(+), 108 deletions(-) create mode 100644 src/erasure-code/shec/CMakeLists.txt create mode 100644 src/test/erasure-code/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 59a7328572cfb..af4776c399993 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -241,6 +241,11 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") set(OperatingSystem "Mac OS X") endif(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") +# enables testing and creates Make check command +enable_testing() +set(CMAKE_CTEST_COMMAND ctest -V) +add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND}) + add_subdirectory(src) # man pages must be preprocessed, not supported yet diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5b8298e130830..7ded1c47dc6b2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -179,6 +179,8 @@ set(crush_srcs crush/CrushCompiler.cc crush/CrushTester.cc) +add_library(crush STATIC ${crush_srcs}) + add_subdirectory(json_spirit) set(xio_common_srcs) @@ -852,6 +854,27 @@ if(${WITH_RBD}) ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS}) install(TARGETS rbd DESTINATION bin) install(PROGRAMS ${CMAKE_SOURCE_DIR}/src/ceph-rbdnamer DESTINATION bin) + + set(librbd_replay_srcs + rbd_replay/Deser.cc + rbd_replay/ImageNameMap.cc + rbd_replay/PendingIO.cc + rbd_replay/rbd_loc.cc + rbd_replay/Replayer.cc + rbd_replay/Ser.cc) + add_library(librbd_replay STATIC ${librbd_replay_srcs}) + target_link_libraries(librbd_replay PRIVATE librbd librados global udev) + add_executable(rbd_replay + rbd_replay/rbd-replay.cc) + target_link_libraries(rbd_replay librbd librados global librbd_replay) + + set(librbd_replay_ios_srcs + rbd_replay/ios.cc) + 
add_library(librbd_replay_ios STATIC ${librbd_replay_ios_srcs}) + target_link_libraries(librbd_replay_ios librbd librados global) + + install(TARGETS librbd_replay librbd_replay_ios DESTINATION lib) + endif(${WITH_RBD}) # RadosGW diff --git a/src/erasure-code/CMakeLists.txt b/src/erasure-code/CMakeLists.txt index 484d8958423f4..8bcba45b2767f 100644 --- a/src/erasure-code/CMakeLists.txt +++ b/src/erasure-code/CMakeLists.txt @@ -4,6 +4,7 @@ set(erasure_codelibdir ${LIBRARY_OUTPUT_PATH}/erasure-code) add_subdirectory(jerasure) add_subdirectory(lrc) +add_subdirectory(shec) if (HAVE_BETTER_YASM_ELF64) add_subdirectory(isa) diff --git a/src/erasure-code/isa/CMakeLists.txt b/src/erasure-code/isa/CMakeLists.txt index 099cc727322a0..446e44782fdab 100644 --- a/src/erasure-code/isa/CMakeLists.txt +++ b/src/erasure-code/isa/CMakeLists.txt @@ -3,46 +3,46 @@ include_directories(isa-l/include) set(isa_srcs - isa-l/erasure_code/ec_base.c - isa-l/erasure_code/ec_highlevel_func.c - isa-l/erasure_code/ec_multibinary.asm.s - isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm.s - isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s - isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s - isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm.s - isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s - isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s - isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s - isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s - isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s - isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s - isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s - isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s - isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm.s - isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s - isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s - isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s - isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s + isa-l/erasure_code/ec_base.c + isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s + isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s + 
isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s + isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s + isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s - isa-l/erasure_code/gf_2vect_mad_avx2.asm.s - isa-l/erasure_code/gf_2vect_mad_avx.asm.s - isa-l/erasure_code/gf_2vect_mad_sse.asm.s - isa-l/erasure_code/gf_3vect_mad_avx2.asm.s - isa-l/erasure_code/gf_3vect_mad_avx.asm.s - isa-l/erasure_code/gf_3vect_mad_sse.asm.s - isa-l/erasure_code/gf_4vect_mad_avx2.asm.s - isa-l/erasure_code/gf_4vect_mad_avx.asm.s - isa-l/erasure_code/gf_4vect_mad_sse.asm.s - isa-l/erasure_code/gf_5vect_mad_avx2.asm.s - isa-l/erasure_code/gf_5vect_mad_avx.asm.s - isa-l/erasure_code/gf_5vect_mad_sse.asm.s - isa-l/erasure_code/gf_6vect_mad_avx2.asm.s - isa-l/erasure_code/gf_6vect_mad_avx.asm.s - isa-l/erasure_code/gf_6vect_mad_sse.asm.s + isa-l/erasure_code/gf_2vect_mad_avx2.asm.s + isa-l/erasure_code/gf_3vect_mad_avx2.asm.s + isa-l/erasure_code/gf_4vect_mad_avx2.asm.s + isa-l/erasure_code/gf_5vect_mad_avx2.asm.s + isa-l/erasure_code/gf_6vect_mad_avx2.asm.s isa-l/erasure_code/gf_vect_mad_avx2.asm.s + isa-l/erasure_code/ec_highlevel_func.c + isa-l/erasure_code/gf_2vect_mad_avx.asm.s + isa-l/erasure_code/gf_3vect_mad_avx.asm.s + isa-l/erasure_code/gf_4vect_mad_avx.asm.s + isa-l/erasure_code/gf_5vect_mad_avx.asm.s + isa-l/erasure_code/gf_6vect_mad_avx.asm.s isa-l/erasure_code/gf_vect_mad_avx.asm.s + isa-l/erasure_code/ec_multibinary.asm.s + isa-l/erasure_code/gf_2vect_mad_sse.asm.s + isa-l/erasure_code/gf_3vect_mad_sse.asm.s + isa-l/erasure_code/gf_4vect_mad_sse.asm.s + isa-l/erasure_code/gf_5vect_mad_sse.asm.s + isa-l/erasure_code/gf_6vect_mad_sse.asm.s isa-l/erasure_code/gf_vect_mad_sse.asm.s + isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm.s + isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm.s + isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s + isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s + isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm.s + 
isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s isa-l/erasure_code/gf_vect_mul_avx.asm.s + isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s + isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s + isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s + isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s + isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s + isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s isa-l/erasure_code/gf_vect_mul_sse.asm.s ErasureCodeIsa.cc ErasureCodeIsaTableCache.cc diff --git a/src/erasure-code/shec/CMakeLists.txt b/src/erasure-code/shec/CMakeLists.txt new file mode 100644 index 0000000000000..260f4d21838bb --- /dev/null +++ b/src/erasure-code/shec/CMakeLists.txt @@ -0,0 +1,36 @@ +#shec plugin + +set(shec_srcs + ${CMAKE_SOURCE_DIR}/src/erasure-code/ErasureCode.cc + ErasureCodePluginShec.cc + ErasureCodeShec.cc + ErasureCodeShecTableCache.cc + determinant.c + ${CMAKE_SOURCE_DIR}/src/erasure-code/jerasure/jerasure/src/cauchy.c + ${CMAKE_SOURCE_DIR}/src/erasure-code/jerasure/jerasure/src/galois.c + ${CMAKE_SOURCE_DIR}/src/erasure-code/jerasure/jerasure/src/jerasure.c + ${CMAKE_SOURCE_DIR}/src/erasure-code/jerasure/jerasure/src/liberation.c + ${CMAKE_SOURCE_DIR}/src/erasure-code/jerasure/jerasure/src/reed_sol.c + ${CMAKE_SOURCE_DIR}/src/erasure-code/jerasure/gf-complete/src/gf_wgen.c + ${CMAKE_SOURCE_DIR}/src/erasure-code/jerasure/gf-complete/src/gf_method.c + ${CMAKE_SOURCE_DIR}/src/erasure-code/jerasure/gf-complete/src/gf_w16.c + ${CMAKE_SOURCE_DIR}/src/erasure-code/jerasure/gf-complete/src/gf.c + ${CMAKE_SOURCE_DIR}/src/erasure-code/jerasure/gf-complete/src/gf_w32.c + ${CMAKE_SOURCE_DIR}/src/erasure-code/jerasure/gf-complete/src/gf_w64.c + ${CMAKE_SOURCE_DIR}/src/erasure-code/jerasure/gf-complete/src/gf_w128.c + ${CMAKE_SOURCE_DIR}/src/erasure-code/jerasure/gf-complete/src/gf_general.c + ${CMAKE_SOURCE_DIR}/src/erasure-code/jerasure/gf-complete/src/gf_w4.c + ${CMAKE_SOURCE_DIR}/src/erasure-code/jerasure/gf-complete/src/gf_rand.c + 
${CMAKE_SOURCE_DIR}/src/erasure-code/jerasure/gf-complete/src/gf_w8.c) + +include_directories(${CMAKE_SOURCE_DIR}/src/erasure-code/jerasure/jerasure/include) +include_directories(${CMAKE_SOURCE_DIR}/src/erasure-code/jerasure/gf-complete/include) +include_directories(${CMAKE_SOURCE_DIR}/src/erasure-code/jerasure) +include_directories(${CMAKE_SOURCE_DIR}/src/erasure-code/shec) +add_library(ec_shec_generic SHARED ${shec_srcs}) +target_link_libraries(ec_shec_generic crush pthread) +add_dependencies(ec_shec_generic ${CMAKE_SOURCE_DIR}/src/ceph_ver.h) +set_target_properties(ec_shec_generic PROPERTIES VERSION 1.0.0 SOVERSION 1) +install(TARGETS ec_shec_generic DESTINATION lib/erasure-code) + +#TODO:build libec_shec_neon, libec_shec+sse3, libec_shec_sse4 libraries diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 6a5f10cc0b879..8baa194d76824 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -300,6 +300,49 @@ add_executable(bench_log target_link_libraries(bench_log global pthread rt ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS}) ## Unit tests +#make check starts here + +add_custom_target(symlinks COMMAND + ln -sf ${CMAKE_SOURCE_DIR}/src/test/ ${CMAKE_BINARY_DIR}/src/test/ && + ln -sf ${CMAKE_BINARY_DIR}/src/ceph-mon ${CMAKE_BINARY_DIR}/src/test/ && + ln -sf ${CMAKE_BINARY_DIR}/ceph ${CMAKE_BINARY_DIR}/src/test/ && + ln -sf ${CMAKE_BINARY_DIR}/src/ceph-authtool ${CMAKE_BINARY_DIR}/src/test/ && + ln -sf ${CMAKE_BINARY_DIR}/src/ceph-conf ${CMAKE_BINARY_DIR}/src/test/ && + ln -sf ${CMAKE_BINARY_DIR}/src/ceph-osd ${CMAKE_BINARY_DIR}/src/test/ && + ln -sf ${CMAKE_SOURCE_DIR}/src/ceph-disk ${CMAKE_BINARY_DIR}/src/test/ && + ln -sf ${CMAKE_SOURCE_DIR}/qa/ ${CMAKE_BINARY_DIR}/src/ + COMMENT "Symlinks for test directory have been created") +add_dependencies(check symlinks) + +add_test(NAME ceph_objectstore_tool COMMAND python ${CMAKE_SOURCE_DIR}/src/test/ceph_objectstore_tool.py) +add_dependencies(check ceph_objectstore_tool) + 
+add_test(NAME ceph_argparse_py COMMAND python ${CMAKE_SOURCE_DIR}/src/test/pybind/test_ceph_argparse.py) +add_dependencies(check ceph_argparse_py) + +add_test(NAME unittest_bufferlist_shell COMMAND bash ${CMAKE_SOURCE_DIR}/src/unittest_bufferlist.sh) +add_dependencies(check unittest_bufferlist_shell) + +add_test(NAME check_generated COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/encoding/check-generated.sh) +add_dependencies(check check_generated) + +add_test(NAME misc COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/mon/misc.sh) +add_dependencies(check misc) + +add_test(NAME mkfs COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/mon/mkfs.sh) +add_dependencies(check mkfs) + +add_test(NAME ceph_disk COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/ceph-disk.sh) +add_dependencies(check ceph_disk) + +add_test(NAME mon_handle_forward COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/mon/mon-handle-forward.sh) +add_dependencies(check mon_handle_forward) + +add_test(NAME vstart_wrapped_tests COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/vstart_wrapped_tests.sh) +add_dependencies(check vstart_wrapped_tests) + +set(UNITTEST_LIBS gtest_main ${PTHREAD_LIBS}) +set(UNITTEST_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CMAKE_SOURCE_DIR}/src/gtest/include -I${CMAKE_BINARY_DIR}/src/gtest/include -fno-strict-aliasing") set(UNITTEST_LIBS gmock_main gmock gtest ${PTHREAD_LIBS}) set(UNITTEST_CXX_FLAGS "-I${CMAKE_SOURCE_DIR}/src/gmock/include -I${CMAKE_BINARY_DIR}/src/gmock/include -I${CMAKE_SOURCE_DIR}/src/gmock/gtest/include -I${CMAKE_BINARY_DIR}/src/gmock/gtest/include -fno-strict-aliasing") @@ -308,12 +351,14 @@ set(UNITTEST_CXX_FLAGS "-I${CMAKE_SOURCE_DIR}/src/gmock/include -I${CMAKE_BINARY set(unittest_encoding_srcs encoding.cc ) -add_executable(unittest_encoding +add_executable(unittest_encoding EXCLUDE_FROM_ALL ${unittest_encoding_srcs} $ ) -target_link_libraries(unittest_encoding cephfs librados pthread rt m - ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) +add_test(unittest_encoding unittest_encoding) 
+add_dependencies(check unittest_encoding) +#target_link_libraries(unittest_encoding librados global boost_filesystem +target_link_libraries(unittest_encoding cephfs librados pthread rt m ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_encoding PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) @@ -321,46 +366,83 @@ set_target_properties(unittest_encoding set(unittest_addrs_srcs test_addrs.cc ) -add_executable(unittest_addrs +add_executable(unittest_addrs EXCLUDE_FROM_ALL ${unittest_addrs_srcs} $ ) +add_test(unittest_addrs unittest_addrs) +add_dependencies(check unittest_addrs) target_link_libraries(unittest_addrs cephfs librados pthread rt m ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_addrs PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) +# unittest_blkdev +set(unittest_blkdev_srcs common/test_blkdev.cc) +add_executable(unittest_blkdev EXCLUDE_FROM_ALL + ${unittest_blkdev_srcs} + $ + ) +add_test(unittest_blkdev unittest_blkdev) +add_dependencies(check unittest_blkdev) +target_link_libraries(unittest_blkdev + global + ${BLKID_LIBRARIES} + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${UNITTEST_LIBS}) +set_target_properties(unittest_blkdev PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + # unittest_bloom_filter set(unittest_bloom_filter_srcs common/test_bloom_filter.cc ) -add_executable(unittest_bloom_filter +add_executable(unittest_bloom_filter EXCLUDE_FROM_ALL ${unittest_bloom_filter_srcs} $ ) +add_test(unittest_bloom_filter unittest_bloom_filter) +add_dependencies(check unittest_bloom_filter) target_link_libraries(unittest_bloom_filter global ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_bloom_filter PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_histogram -add_executable(unittest_histogram +add_executable(unittest_histogram EXCLUDE_FROM_ALL common/histogram.cc $ ) +add_test(unittest_histogram 
unittest_histogram) +add_dependencies(check unittest_histogram) target_link_libraries(unittest_histogram global ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_histogram PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) +# unittest_prioritized_queue +add_executable(unittest_prioritized_queue EXCLUDE_FROM_ALL + common/test_prioritized_queue.cc + $ + ) +add_test(unittest_prioritized_queue unittest_prioritized_queue) +add_dependencies(check unittest_prioritized_queue) +target_link_libraries(unittest_prioritized_queue global + ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) +set_target_properties(unittest_prioritized_queue + PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) + # unittest_str_map set(unittest_str_map_srcs common/test_str_map.cc ) -add_executable(unittest_str_map +add_executable(unittest_str_map EXCLUDE_FROM_ALL ${unittest_str_map_srcs} $ ) +add_test(unittest_str_map unittest_str_map) +add_dependencies(check unittest_str_map) target_link_libraries(unittest_str_map common global ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS} common) set_target_properties(unittest_str_map @@ -370,23 +452,42 @@ set_target_properties(unittest_str_map set(unittest_sharedptr_registry_srcs common/test_sharedptr_registry.cc ) -add_executable(unittest_sharedptr_registry +add_executable(unittest_sharedptr_registry EXCLUDE_FROM_ALL ${unittest_sharedptr_registry_srcs} $ ) +add_test(unittest_sharedptr_registry unittest_sharedptr_registry) +add_dependencies(check unittest_sharedptr_registry) target_link_libraries(unittest_sharedptr_registry global ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_sharedptr_registry PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) +# unittest_shared_cache +set(unittest_shared_cache_srcs + common/test_shared_cache.cc + ) +add_executable(unittest_shared_cache EXCLUDE_FROM_ALL + ${unittest_shared_cache_srcs} + 
$ + ) +add_test(unittest_shared_cache unittest_shared_cache) +add_dependencies(check unittest_shared_cache) +target_link_libraries(unittest_shared_cache global + ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) +set_target_properties(unittest_shared_cache + PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) + # unittest_sloppy_crc_map set(unittest_sloppy_crc_map_srcs common/test_sloppy_crc_map.cc ) -add_executable(unittest_sloppy_crc_map +add_executable(unittest_sloppy_crc_map EXCLUDE_FROM_ALL ${unittest_sloppy_crc_map_srcs} $ ) +add_test(unittest_sloppy_crc_map unittest_sloppy_crc_map) +add_dependencies(check unittest_sloppy_crc_map) target_link_libraries(unittest_sloppy_crc_map global ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_sloppy_crc_map @@ -397,10 +498,12 @@ set(unittest_util_srcs common/test_util.cc ${CMAKE_SOURCE_DIR}/src/common/util.cc ) -add_executable(unittest_util +add_executable(unittest_util EXCLUDE_FROM_ALL ${unittest_util_srcs} $ ) +add_test(unittest_util unittest_util) +add_dependencies(check unittest_util) target_link_libraries(unittest_util global ${BLKID_LIBRARIES} @@ -411,12 +514,40 @@ target_link_libraries(unittest_util set_target_properties(unittest_util PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) +# unittest_crush_wrapper +set(unittest_crush_wrapper_srcs crush/CrushWrapper.cc) +add_executable(unittest_crush_wrapper EXCLUDE_FROM_ALL + ${unittest_crush_wrapper_srcs} + $ + ) +add_test(unittest_crush_wrapper unittest_crush_wrapper) +add_dependencies(check unittest_crush_wrapper) +target_link_libraries(unittest_crush_wrapper global crush ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} + ${UNITTEST_LIBS}) +set_target_properties(unittest_crush_wrapper PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +# unittest_crush +set(unittest_crush_srcs crush/crush.cc) +add_executable(unittest_crush EXCLUDE_FROM_ALL + ${unittest_crush_srcs} + $ + ) 
+add_test(unittest_crush unittest_crush) +add_dependencies(check unittest_crush) +target_link_libraries(unittest_crush global m ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} + ${UNITTEST_LIBS} ${EXTRALIBS}) +set_target_properties(unittest_crush PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + # unittest_osdmap set(unittest_osdmap_srcs osd/TestOSDMap.cc) -add_executable(unittest_osdmap +add_executable(unittest_osdmap EXCLUDE_FROM_ALL ${unittest_osdmap_srcs} $ ) +add_test(unittest_osdmap unittest_osdmap) +add_dependencies(check unittest_osdmap) target_link_libraries(unittest_osdmap global ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_osdmap PROPERTIES COMPILE_FLAGS @@ -424,21 +555,26 @@ set_target_properties(unittest_osdmap PROPERTIES COMPILE_FLAGS # unittest_workqueue set(unittest_workqueue_srcs test_workqueue.cc) -add_executable(unittest_workqueue +add_executable(unittest_workqueue EXCLUDE_FROM_ALL ${unittest_workqueue_srcs} $ ) +add_test(unittest_workqueue unittest_workqueue) +add_dependencies(check unittest_workqueue) target_link_libraries(unittest_workqueue global ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_workqueue PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) + # unittest_striper set(unittest_striper_srcs test_striper.cc) -add_executable(unittest_striper +add_executable(unittest_striper EXCLUDE_FROM_ALL ${unittest_striper_srcs} $ ) +add_test(unittest_striper unittest_striper) +add_dependencies(check unittest_striper) target_link_libraries(unittest_striper global ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_striper PROPERTIES COMPILE_FLAGS @@ -446,10 +582,12 @@ set_target_properties(unittest_striper PROPERTIES COMPILE_FLAGS # unittest_prebufferedstreambuf set(unittest_prebufferedstreambuf_srcs test_prebufferedstreambuf.cc) -add_executable(unittest_prebufferedstreambuf 
+add_executable(unittest_prebufferedstreambuf EXCLUDE_FROM_ALL ${unittest_prebufferedstreambuf_srcs} $ ) +add_test(unittest_prebufferedstreambuf unittest_prebufferedstreambuf) +add_dependencies(check unittest_prebufferedstreambuf) target_link_libraries(unittest_prebufferedstreambuf global ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_prebufferedstreambuf PROPERTIES COMPILE_FLAGS @@ -457,10 +595,12 @@ set_target_properties(unittest_prebufferedstreambuf PROPERTIES COMPILE_FLAGS # unittest_str_list set(unittest_str_list_srcs test_str_list.cc) -add_executable(unittest_str_list +add_executable(unittest_str_list EXCLUDE_FROM_ALL ${unittest_str_list_srcs} $ ) +add_test(unittest_str_list unittest_str_list) +add_dependencies(check unittest_str_list) target_link_libraries(unittest_str_list global ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_str_list PROPERTIES COMPILE_FLAGS @@ -468,10 +608,12 @@ set_target_properties(unittest_str_list PROPERTIES COMPILE_FLAGS # unittest_log set(unittest_log_srcs ${CMAKE_SOURCE_DIR}/src/log/test.cc) -add_executable(unittest_log +add_executable(unittest_log EXCLUDE_FROM_ALL ${unittest_log_srcs} $ ) +add_test(unittest_log unittest_log) +add_dependencies(check unittest_log) target_link_libraries(unittest_log global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_log PROPERTIES COMPILE_FLAGS @@ -479,10 +621,12 @@ set_target_properties(unittest_log PROPERTIES COMPILE_FLAGS # unittest_throttle set(unittest_throttle_srcs common/Throttle.cc) -add_executable(unittest_throttle +add_executable(unittest_throttle EXCLUDE_FROM_ALL ${unittest_throttle_srcs} $ ) +add_test(unittest_throttle unittest_throttle) +add_dependencies(check unittest_throttle) target_link_libraries(unittest_throttle global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_throttle PROPERTIES COMPILE_FLAGS 
@@ -490,19 +634,23 @@ set_target_properties(unittest_throttle PROPERTIES COMPILE_FLAGS # unittest_base64 set(unittest_base64_srcs base64.cc) -add_executable(unittest_base64 +add_executable(unittest_base64 EXCLUDE_FROM_ALL ${unittest_base64_srcs} $ ) +add_test(unittest_base64 unittest_base64) +add_dependencies(check unittest_base64) target_link_libraries(unittest_base64 global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_base64 PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_ceph_argparse set(unittest_ceph_argparse_srcs ceph_argparse.cc) -add_executable(unittest_ceph_argparse +add_executable(unittest_ceph_argparse EXCLUDE_FROM_ALL ${unittest_ceph_argparse_srcs} $ ) +add_test(unittest_ceph_argparse unittest_ceph_argparse) +add_dependencies(check unittest_ceph_argparse) target_link_libraries(unittest_ceph_argparse global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_ceph_argparse PROPERTIES COMPILE_FLAGS @@ -510,32 +658,73 @@ set_target_properties(unittest_ceph_argparse PROPERTIES COMPILE_FLAGS # unittest_ceph_compatset set(unittest_ceph_compatset_srcs ceph_compatset.cc) -add_executable(unittest_ceph_compatset +add_executable(unittest_ceph_compatset EXCLUDE_FROM_ALL ${unittest_ceph_compatset_srcs} $ ) +add_test(unittest_ceph_compatset unittest_ceph_compatset) +add_dependencies(check unittest_ceph_compatset) target_link_libraries(unittest_ceph_compatset global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_ceph_compatset PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) +# unittest_mds_types +add_executable(unittest_mds_types EXCLUDE_FROM_ALL + fs/mds_types.cc + $ + ) +add_test(unittest_mds_types unittest_mds_types) +add_dependencies(check unittest_mds_types) +target_link_libraries(unittest_mds_types global ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) +set_target_properties(unittest_mds_types PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) + 
# unittest_osd_types set(unittest_osd_types_srcs osd/types.cc) -add_executable(unittest_osd_types +add_executable(unittest_osd_types EXCLUDE_FROM_ALL ${unittest_osd_types_srcs} $ ) +add_test(unittest_osd_types unittest_osd_types) +add_dependencies(check unittest_osd_types) target_link_libraries(unittest_osd_types global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_osd_types PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) +# unittest_lru +add_executable(unittest_lru EXCLUDE_FROM_ALL + common/test_lru.cc + $ + ) +add_test(unittest_lru unittest_lru) +add_dependencies(check unittest_lru) +target_link_libraries(unittest_lru global ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) +set_target_properties(unittest_lru PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +# unittest_io_priority +add_executable(unittest_io_priority EXCLUDE_FROM_ALL + common/test_io_priority.cc + $ + ) +add_test(unittest_io_priority unittest_io_priority) +add_dependencies(check unittest_io_priority) +target_link_libraries(unittest_io_priority global ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) +set_target_properties(unittest_io_priority PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + # unittest_gather set(unittest_gather_srcs gather.cc) -add_executable(unittest_gather +add_executable(unittest_gather EXCLUDE_FROM_ALL ${unittest_gather_srcs} $ ) +add_test(unittest_gather unittest_gather) +add_dependencies(check unittest_gather) target_link_libraries(unittest_gather global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_gather PROPERTIES COMPILE_FLAGS @@ -543,10 +732,12 @@ set_target_properties(unittest_gather PROPERTIES COMPILE_FLAGS # run_cmd set(unittest_run_cmd_srcs run_cmd.cc) -add_executable(unittest_run_cmd +add_executable(unittest_run_cmd EXCLUDE_FROM_ALL ${unittest_run_cmd_srcs} $ ) +add_test(unittest_run_cmd unittest_run_cmd) +add_dependencies(check unittest_run_cmd) 
target_link_libraries(unittest_run_cmd global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_run_cmd PROPERTIES COMPILE_FLAGS @@ -554,10 +745,12 @@ set_target_properties(unittest_run_cmd PROPERTIES COMPILE_FLAGS # signals set(unittest_signals_srcs signals.cc) -add_executable(unittest_signals +add_executable(unittest_signals EXCLUDE_FROM_ALL ${unittest_signals_srcs} $ ) +add_test(unittest_signals unittest_signals) +add_dependencies(check unittest_signals) target_link_libraries(unittest_signals global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_signals PROPERTIES COMPILE_FLAGS @@ -565,21 +758,50 @@ set_target_properties(unittest_signals PROPERTIES COMPILE_FLAGS # unittest_simple_spin set(unittest_simple_spin_srcs simple_spin.cc) -add_executable(unittest_simple_spin +add_executable(unittest_simple_spin EXCLUDE_FROM_ALL ${unittest_simple_spin_srcs} $ ) +add_test(unittest_simple_spin unittest_simple_spin) +add_dependencies(check unittest_simple_spin) target_link_libraries(unittest_simple_spin global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_simple_spin PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) +# unittest_bufferlist +set(unittest_bufferlist_srcs bufferlist.cc) +add_executable(unittest_bufferlist EXCLUDE_FROM_ALL + ${unittest_bufferlist_srcs} + $ + ) +add_test(unittest_bufferlist unittest_bufferlist) +add_dependencies(check unittest_bufferlist) +target_link_libraries(unittest_bufferlist global ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) +set_target_properties(unittest_bufferlist PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +# unittest_xlist +add_executable(unittest_xlist EXCLUDE_FROM_ALL + test_xlist.cc + $ + ) +add_test(unittest_xlist unittest_xlist) +add_dependencies(check unittest_xlist) +target_link_libraries(unittest_xlist common ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) +set_target_properties(unittest_xlist 
PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + # unittest_librados set(unittest_librados_srcs librados/librados.cc) -add_executable(unittest_librados +add_executable(unittest_librados EXCLUDE_FROM_ALL ${unittest_librados_srcs} $ ) +add_test(unittest_librados unittest_librados) +add_dependencies(check unittest_librados) target_link_libraries(unittest_librados librados global @@ -591,23 +813,14 @@ target_link_libraries(unittest_librados set_target_properties(unittest_librados PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) -# unittest_bufferlist -set(unittest_bufferlist_srcs bufferlist.cc) -add_executable(unittest_bufferlist - ${unittest_bufferlist_srcs} - $ - ) -target_link_libraries(unittest_bufferlist global ${CMAKE_DL_LIBS} - ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) -set_target_properties(unittest_bufferlist PROPERTIES COMPILE_FLAGS - ${UNITTEST_CXX_FLAGS}) - # unittest_crc32 set(unittest_crc32_srcs common/test_crc32c.cc) -add_executable(unittest_crc32 +add_executable(unittest_crc32 EXCLUDE_FROM_ALL ${unittest_crc32_srcs} $ ) +add_test(unittest_crc32 unittest_crc32) +add_dependencies(check unittest_crc32) target_link_libraries(unittest_crc32 global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_crc32 PROPERTIES COMPILE_FLAGS @@ -615,21 +828,37 @@ set_target_properties(unittest_crc32 PROPERTIES COMPILE_FLAGS # unittest_arch set(unittest_arch_srcs test_arch.cc) -add_executable(unittest_arch +add_executable(unittest_arch EXCLUDE_FROM_ALL ${unittest_arch_srcs} $ ) +add_test(unittest_arch unittest_arch) +add_dependencies(check unittest_arch) target_link_libraries(unittest_arch global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_arch PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) +# unittest_crypto +add_executable(unittest_crypto + crypto.cc + $ + ) +add_test(unittest_crypto unittest_crypto) +add_dependencies(check unittest_crypto) +target_link_libraries(unittest_crypto global ${CMAKE_DL_LIBS} + 
${TCMALLOC_LIBS} ${UNITTEST_LIBS}) +set_target_properties(unittest_crypto PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + # unittest_crypto_init set(unittest_crypto_init_srcs crypto_init.cc) -add_executable(unittest_crypto_init +add_executable(unittest_crypto_init EXCLUDE_FROM_ALL ${unittest_crypto_init_srcs} $ ) +add_test(unittest_crypto_init unittest_crypto_init) +add_dependencies(check unittest_crypto_init) target_link_libraries(unittest_crypto_init global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_crypto_init PROPERTIES COMPILE_FLAGS @@ -637,10 +866,12 @@ set_target_properties(unittest_crypto_init PROPERTIES COMPILE_FLAGS # unittest_perf_counters set(unittest_perf_counters_srcs perf_counters.cc) -add_executable(unittest_perf_counters +add_executable(unittest_perf_counters EXCLUDE_FROM_ALL ${unittest_perf_counters_srcs} $ ) +add_test(unittest_perf_counters unittest_perf_counters) +add_dependencies(check unittest_perf_counters) target_link_libraries(unittest_perf_counters global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_perf_counters PROPERTIES COMPILE_FLAGS @@ -648,10 +879,12 @@ set_target_properties(unittest_perf_counters PROPERTIES COMPILE_FLAGS # unittest_admin_socket set(unittest_admin_socket_srcs admin_socket.cc) -add_executable(unittest_admin_socket +add_executable(unittest_admin_socket EXCLUDE_FROM_ALL ${unittest_admin_socket_srcs} $ ) +add_test(unittest_admin_socket unittest_admin_socket) +add_dependencies(check unittest_admin_socket) target_link_libraries(unittest_admin_socket global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_admin_socket PROPERTIES COMPILE_FLAGS @@ -659,10 +892,12 @@ set_target_properties(unittest_admin_socket PROPERTIES COMPILE_FLAGS # unittest_ceph_crypto set(unittest_ceph_crypto_srcs ceph_crypto.cc) -add_executable(unittest_ceph_crypto +add_executable(unittest_ceph_crypto EXCLUDE_FROM_ALL 
${unittest_ceph_crypto_srcs} $ ) +add_test(unittest_ceph_crypto unittest_ceph_crypto) +add_dependencies(check unittest_ceph_crypto) target_link_libraries(unittest_ceph_crypto global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_ceph_crypto PROPERTIES COMPILE_FLAGS @@ -670,10 +905,12 @@ set_target_properties(unittest_ceph_crypto PROPERTIES COMPILE_FLAGS # unittest_utf8 set(unittest_utf8_srcs utf8.cc) -add_executable(unittest_utf8 +add_executable(unittest_utf8 EXCLUDE_FROM_ALL ${unittest_utf8_srcs} $ ) +add_test(unittest_utf8 unittest_utf8) +add_dependencies(check unittest_utf8) target_link_libraries(unittest_utf8 global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_utf8 PROPERTIES COMPILE_FLAGS @@ -681,10 +918,12 @@ set_target_properties(unittest_utf8 PROPERTIES COMPILE_FLAGS # unittest_mime set(unittest_mime_srcs mime.cc) -add_executable(unittest_mime +add_executable(unittest_mime EXCLUDE_FROM_ALL ${unittest_mime_srcs} $ ) +add_test(unittest_mime unittest_mime) +add_dependencies(check unittest_mime) target_link_libraries(unittest_mime global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_mime PROPERTIES COMPILE_FLAGS @@ -692,10 +931,12 @@ set_target_properties(unittest_mime PROPERTIES COMPILE_FLAGS # unittest_escape set(unittest_escape_srcs escape.cc) -add_executable(unittest_escape +add_executable(unittest_escape EXCLUDE_FROM_ALL ${unittest_escape_srcs} $ ) +add_test(unittest_escape unittest_escape) +add_dependencies(check unittest_escape) target_link_libraries(unittest_escape global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_escape PROPERTIES COMPILE_FLAGS @@ -721,10 +962,12 @@ target_link_libraries(unittest_chain_xattr # unittest_strtol set(unittest_strtol_srcs strtol.cc) -add_executable(unittest_strtol +add_executable(unittest_strtol EXCLUDE_FROM_ALL ${unittest_strtol_srcs} $ ) +add_test(unittest_strtol 
unittest_strtol) +add_dependencies(check unittest_strtol) target_link_libraries(unittest_strtol global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_strtol PROPERTIES COMPILE_FLAGS @@ -732,10 +975,12 @@ set_target_properties(unittest_strtol PROPERTIES COMPILE_FLAGS # unittest_confutils set(unittest_confutils_srcs confutils.cc) -add_executable(unittest_confutils +add_executable(unittest_confutils EXCLUDE_FROM_ALL ${unittest_confutils_srcs} $ ) +add_test(unittest_confutils unittest_confutils) +add_dependencies(check unittest_confutils) target_link_libraries(unittest_confutils global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_confutils PROPERTIES COMPILE_FLAGS @@ -743,10 +988,12 @@ set_target_properties(unittest_confutils PROPERTIES COMPILE_FLAGS # unittest_config set(unittest_config_srcs common/test_config.cc) -add_executable(unittest_config +add_executable(unittest_config EXCLUDE_FROM_ALL ${unittest_config_srcs} $ ) +add_test(unittest_config unittest_config) +add_dependencies(check unittest_config) target_link_libraries(unittest_config global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_config PROPERTIES COMPILE_FLAGS @@ -754,21 +1001,78 @@ set_target_properties(unittest_config PROPERTIES COMPILE_FLAGS # unittest_context set(unittest_context_srcs common/test_context.cc) -add_executable(unittest_context +add_executable(unittest_context EXCLUDE_FROM_ALL ${unittest_context_srcs} $ ) +add_test(unittest_context unittest_context) +add_dependencies(check unittest_context) target_link_libraries(unittest_context global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_context PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) +# unittest_chain_xattr +set(unittest_chain_xattr_srcs + objectstore/chain_xattr.cc + ) +add_executable(unittest_chain_xattr EXCLUDE_FROM_ALL + ${unittest_chain_xattr_srcs} + $ + ) 
+set_target_properties(unittest_chain_xattr PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) +add_test(unittest_chain_xattr unittest_chain_xattr) +add_dependencies(check unittest_chain_xattr) +target_link_libraries(unittest_chain_xattr + os + global + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${UNITTEST_LIBS} + ) + +# unittest_flatindex +set(unittest_flatindex_srcs + os/TestFlatIndex.cc + ) +add_executable(unittest_flatindex EXCLUDE_FROM_ALL + ${unittest_flatindex_srcs} + $ + ) +add_test(unittest_flatindex unittest_flatindex) +add_dependencies(check unittest_flatindex) +target_link_libraries(unittest_flatindex + os + global + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${UNITTEST_LIBS} + ) +set_target_properties(unittest_flatindex PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +# unittest_safe_io +set(unittest_safe_io_srcs test_safe_io.cc) +add_executable(unittest_safe_io EXCLUDE_FROM_ALL + ${unittest_safe_op_srcs} + $ + ) +add_test(unittest_safe_io unittest_safe_io) +add_dependencies(check unittest_safe_io) +target_link_libraries(unittest_safe_io global ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) +set_target_properties(unittest_safe_io PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + # unittest_heartbeatmap set(unittest_heartbeatmap_srcs heartbeat_map.cc) -add_executable(unittest_heartbeatmap +add_executable(unittest_heartbeatmap EXCLUDE_FROM_ALL ${unittest_heartbeatmap_srcs} $ ) +add_test(unittest_heartbeatmap unittest_heartbeatmap) +add_dependencies(check unittest_heartbeatmap) target_link_libraries(unittest_heartbeatmap global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_heartbeatmap PROPERTIES COMPILE_FLAGS @@ -776,24 +1080,49 @@ set_target_properties(unittest_heartbeatmap PROPERTIES COMPILE_FLAGS if(${WITH_RADOSGW}) # unittest_formatter + # why does this include rgw/rgw_formats.cc...? 
set(unittest_formatter_srcs formatter.cc ${CMAKE_SOURCE_DIR}/src/rgw/rgw_formats.cc) - add_executable(unittest_formatter + add_executable(unittest_formatter EXCLUDE_FROM_ALL ${unittest_formatter_srcs} $ ) + add_test(unittest_formatter unittest_formatter) + add_dependencies(check unittest_formatter) target_link_libraries(unittest_formatter global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_formatter PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) endif(${WITH_RADOSGW}) +# unittest_daemon_config +set(unittest_daemon_config_srcs daemon_config.cc) +add_executable(unittest_daemon_config EXCLUDE_FROM_ALL + ${unittest_daemon_config_srcs} + $ + ) +add_test(unittest_daemon_config unittest_daemon_config) +add_dependencies(check unittest_daemon_config) +target_link_libraries(unittest_daemon_config + common + global + ${UNITTEST_LIBS} + ${BLKID_LIBRARIES} + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${EXTRALIBS} + ) +set_target_properties(unittest_daemon_config PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + # unittest_libcephfs_config set(unittest_libcephfs_config_srcs libcephfs_config.cc) -add_executable(unittest_libcephfs_config +add_executable(unittest_libcephfs_config EXCLUDE_FROM_ALL ${unittest_libcephfs_config_srcs} $ ) +add_test(unittest_libcephfs_config unittest_libcephfs_config) +add_dependencies(check unittest_libcephfs_config) target_link_libraries(unittest_libcephfs_config cephfs ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_libcephfs_config PROPERTIES COMPILE_FLAGS @@ -801,10 +1130,12 @@ set_target_properties(unittest_libcephfs_config PROPERTIES COMPILE_FLAGS # unittest_lfnindex set(unittest_lfnindex_srcs os/TestLFNIndex.cc) -add_executable(unittest_lfnindex +add_executable(unittest_lfnindex EXCLUDE_FROM_ALL ${unittest_lfnindex_srcs} $ ) +add_test(unittest_lfnindex unittest_lfnindex) +add_dependencies(check unittest_lfnindex) target_link_libraries(unittest_lfnindex os global 
${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_lfnindex PROPERTIES COMPILE_FLAGS @@ -812,10 +1143,12 @@ set_target_properties(unittest_lfnindex PROPERTIES COMPILE_FLAGS # unittest_librados_config set(unittest_librados_config_srcs librados/librados_config.cc) -add_executable(unittest_librados_config +add_executable(unittest_librados_config EXCLUDE_FROM_ALL ${unittest_librados_config_srcs} $ ) +add_test(unittest_librados_config unittest_librados_config) +add_dependencies(check unittest_librados_config) target_link_libraries(unittest_librados_config librados global @@ -827,37 +1160,176 @@ target_link_libraries(unittest_librados_config set_target_properties(unittest_librados_config PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) -# unittest_daemon_config -set(unittest_daemon_config_srcs daemon_config.cc) -add_executable(unittest_daemon_config - ${unittest_daemon_config_srcs} +# unittest_rbd_replay +set(unittest_rbd_replay_srcs test_rbd_replay.cc) +add_executable(unittest_rbd_replay EXCLUDE_FROM_ALL + ${unittest_librados_config_srcs} $ - ) -target_link_libraries(unittest_daemon_config - common + $ +) +add_test(unittest_rbd_replay unittest_rbd_replay) +add_dependencies(check unittest_rbd_replay) +target_link_libraries(unittest_rbd_replay + librbd + librados global - ${UNITTEST_LIBS} + librbd_replay + librbd_replay_ios + keyutils ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} - ${EXTRALIBS} - ) -set_target_properties(unittest_daemon_config PROPERTIES COMPILE_FLAGS - ${UNITTEST_CXX_FLAGS}) + ${UNITTEST_LIBS} +) +set_target_properties(unittest_rbd_replay PROPERTIES COMPILE_FLAGS +${UNITTEST_CXX_FLAGS}) # unittest_mon_moncap set(unittest_mon_moncap_srcs mon/moncap.cc) -add_executable(unittest_mon_moncap +add_executable(unittest_mon_moncap EXCLUDE_FROM_ALL ${unittest_mon_moncap_srcs} $ ) +add_test(unittest_mon_moncap unittest_mon_moncap) +add_dependencies(check unittest_mon_moncap) target_link_libraries(unittest_mon_moncap mon global 
${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_mon_moncap PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) +# unittest_mon_pgmap +set(unittest_mon_pgmap_srcs mon/PGMap.cc) +add_executable(unittest_mon_pgmap EXCLUDE_FROM_ALL + ${unittest_mon_moncap_srcs} + $ + ) +add_test(unittest_mon_pgmap unittest_mon_pgmap) +add_dependencies(check unittest_mon_pgmap) +target_link_libraries(unittest_mon_pgmap mon global ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) +set_target_properties(unittest_mon_pgmap PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +# unittest_ecbackend +set(unittest_ecbackend_srcs osd/TestECBackend.cc) +add_executable(unittest_ecbackend EXCLUDE_FROM_ALL + ${unittest_ecbackend_srcs} + $ + ) +add_test(unittest_ecbackend unittest_ecbackend) +add_dependencies(check unittest_ecbackend) +target_link_libraries(unittest_ecbackend osd global ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) +set_target_properties(unittest_ecbackend PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +# unittest_osdscrub +add_executable(unittest_osdscrub EXCLUDE_FROM_ALL + osd/TestOSDScrub.cc + $ + ) +add_test(unittest_osdscrub unittest_osdscrub) +add_dependencies(check unittest_osdscrub) +target_link_libraries(unittest_osdscrub osd global dl os mon ${CMAKE_DL_LIBS} + ${BLKID_LIBRARIES} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) +set_target_properties(unittest_osdscrub PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +# unittest_pglog +add_executable(unittest_pglog EXCLUDE_FROM_ALL + osd/TestPGLog.cc + $ + ) +add_test(unittest_pglog unittest_pglog) +add_dependencies(check unittest_pglog) +target_link_libraries(unittest_pglog osd global dl ${CMAKE_DL_LIBS} + ${BLKID_LIBRARIES} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) +set_target_properties(unittest_pglog PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +# unittest_hitset +add_executable(unittest_hitset EXCLUDE_FROM_ALL + osd/hitset.cc + $ + ) +add_test(unittest_hitset unittest_hitset) 
+add_dependencies(check unittest_hitset) +target_link_libraries(unittest_hitset osd global ${CMAKE_DL_LIBS} + ${BLKID_LIBRARIES} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) +set_target_properties(unittest_hitset PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +# unittest_osd_osdcap +add_executable(unittest_osd_osdcap EXCLUDE_FROM_ALL + osd/osdcap.cc + $ +) +add_test(unittest_osd_osdcap unittest_osd_osdcap) +add_dependencies(check unittest_osd_osdcap) +target_link_libraries(unittest_osd_osdcap osd global ${CMAKE_DL_LIBS} +${BLKID_LIBRARIES} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) +set_target_properties(unittest_osd_osdcap PROPERTIES COMPILE_FLAGS +${UNITTEST_CXX_FLAGS}) + +# unittest_snap_mapper +add_executable(unittest_snap_mapper EXCLUDE_FROM_ALL + test_snap_mapper.cc + $ + ) +add_test(unittest_snap_mapper unittest_snap_mapper) +add_dependencies(check unittest_snap_mapper) +target_link_libraries(unittest_snap_mapper osd global ${CMAKE_DL_LIBS} + ${BLKID_LIBRARIES} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) +set_target_properties(unittest_snap_mapper PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +if(WITH_SLIBROCKSDB) +# unittest_rocksdb_option_static +add_executable(unittest_rocksdb_option_static EXCLUDE_FROM_ALL + objectstore/TestRocksdbOptionParse.cc + $ + ) +add_test(unittest_rocksdb_option_static unittest_rocksdb_option_static) +add_dependencies(check unittest_rocksdb_option_static) +target_link_libraries(unittest_rocksdb_option_static os librocksdb global ${CMAKE_DL_LIBS} + ${BLKID_LIBRARIES} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) +set(UNITTEST_ROCKSDB_STATIC_FLAGS "-std=gnu++11 -I rocksdb/include") +set_target_properties(unittest_rocksdb_option_static PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS} ${LIBROCKSDB_CFLAGS} ${AM_CXXFLAGS} ${UNITTEST_ROCKSDB_STATIC_FLAGS}) +endif(WITH_SLIBROCKSDB) + +if(WITH_DLIBROCKSDB) +# unittest_rocksdb_option +add_executable(unittest_rocksdb_option EXCLUDE_FROM_ALL + objectstore/TestRocksdbOptionParse.cc + $ + ) 
+add_test(unittest_rocksdb_option_static unittest_rocksdb_option) +add_dependencies(check unittest_rocksdb_option) +target_link_libraries(unittest_rocksdb_option os rocksdb global ${CMAKE_DL_LIBS} + ${BLKID_LIBRARIES} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) +set(UNITTEST_ROCKSDB_FLAGS "-std=gnu++11") +set_target_properties(unittest_rocksdb_option_static PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS} ${LIBROCKSDB_CFLAGS} ${AM_CXXFLAGS} ${UNITTEST_ROCKSDB_STATIC_FLAGS}) +endif(WITH_DLIBROCKSDB) + +# unittest_mds_authcap +add_executable(unittest_mds_authcap EXCLUDE_FROM_ALL + mds/TestMDSAuthCaps.cc + $ + ) +add_test(unittest_mds_authcap unittest_mds_authcap) +add_dependencies(check unittest_mds_authcap) +target_link_libraries(unittest_mds_authcap mds global ${CMAKE_DL_LIBS} + ${BLKID_LIBRARIES} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) +set_target_properties(unittest_mds_authcap PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + # unittest_ipaddr -add_executable(unittest_ipaddr test_ipaddr.cc) +add_executable(unittest_ipaddr EXCLUDE_FROM_ALL + test_ipaddr.cc) +add_test(unittest_ipaddr unittest_ipaddr) +add_dependencies(check unittest_ipaddr) target_link_libraries(unittest_ipaddr mon global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_ipaddr PROPERTIES COMPILE_FLAGS @@ -868,10 +1340,12 @@ set(unittest_texttable_srcs test_texttable.cc ${CMAKE_SOURCE_DIR}/src/common/TextTable.cc ) -add_executable(unittest_texttable +add_executable(unittest_texttable EXCLUDE_FROM_ALL ${unittest_texttable_srcs} $ ) +add_test(unittest_texttable unittest_texttable) +add_dependencies(check unittest_texttable) target_link_libraries(unittest_texttable mon global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) set_target_properties(unittest_texttable PROPERTIES COMPILE_FLAGS @@ -879,24 +1353,84 @@ set_target_properties(unittest_texttable PROPERTIES COMPILE_FLAGS # unittest_on_exit set(unittest_on_exit_srcs on_exit.cc) -add_executable(unittest_on_exit 
+add_executable(unittest_on_exit EXCLUDE_FROM_ALL ${unittest_on_exit_srcs} $ ) +add_test(unittest_on_exit unittest_on_exit) +add_dependencies(check unittest_on_exit) target_link_libraries(unittest_on_exit global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) +add_test(unittest_on_exit unittest_on_exit) +add_dependencies(check unittest_on_exit) set_target_properties(unittest_on_exit PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) +# unittest_readahead +set(unittest_readahead_srcs common/Readahead.cc) +add_executable(unittest_readahead EXCLUDE_FROM_ALL + ${unittest_readahead_srcs} + $ + ) +add_test(unittest_readahead unittest_readahead) +add_dependencies(check unittest_readahead) +target_link_libraries(unittest_readahead + global + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${UNITTEST_LIBS}) +add_test(unittest_readahead unittest_readahead) +add_dependencies(check unittest_readahead) +set_target_properties(unittest_readahead PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +# unittest_tableformatter +set(unittest_tableformatter_srcs common/test_tableformatter.cc) +add_executable(unittest_tableformatter EXCLUDE_FROM_ALL + ${unittest_tableformatter_srcs} + $ + ) +add_test(unittest_tableformatter unittest_tableformatter) +add_dependencies(check unittest_tableformatter) +target_link_libraries(unittest_tableformatter + global + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${UNITTEST_LIBS}) +add_test(unittest_tableformatter unittest_tableformatter) +add_dependencies(check unittest_tableformatter) +set_target_properties(unittest_tableformatter PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +# unittest_bit_vector +set(unittest_bit_vector_srcs common/test_bit_vector.cc) +add_executable(unittest_bit_vector EXCLUDE_FROM_ALL + ${unittest_bit_vector_srcs} + $ + ) +add_test(unittest_bit_vector unittest_bit_vector) +add_dependencies(check unittest_bit_vector) +target_link_libraries(unittest_bit_vector + global + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${UNITTEST_LIBS}) 
+add_test(unittest_bit_vector unittest_bit_vector) +add_dependencies(check unittest_bit_vector) +set_target_properties(unittest_bit_vector PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + # unittest_subprocess set(unittest_subprocess_srcs test_subprocess.cc) -add_executable(unittest_subprocess +add_executable(unittest_subprocess EXCLUDE_FROM_ALL ${unittest_subprocess_srcs} $ ) +add_test(unittest_subprocess unittest_subprocess) +add_dependencies(check unittest_subprocess) target_link_libraries(unittest_subprocess global ${CMAKE_DL_LIBS} @@ -906,11 +1440,31 @@ set_target_properties(unittest_subprocess PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_pageset -add_executable(unittest_pageset test_pageset.cc) +add_executable(unittest_pageset EXCLUDE_FROM_ALL test_pageset.cc) +add_test(unittest_subprocess unittest_subprocess) +add_dependencies(check unittest_subprocess) target_link_libraries(unittest_pageset ${UNITTEST_LIBS}) set_target_properties(unittest_pageset PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) +## unittest_async_compressor +#add_executable(unittest_async_compressor EXCLUDE_FROM_ALL +# common/test_async_compressor.cc +# $ +#) +#add_dependencies(check unittest_async_compressor) +#target_link_libraries(unittest_async_compressor +# global +# compressor +# ${CMAKE_DL_LIBS} +# ${TCMALLOC_LIBS} +# ${UNITTEST_LIBS}) +#set_target_properties(unittest_async_compressor PROPERTIES COMPILE_FLAGS +# ${UNITTEST_CXX_FLAGS}) + +add_subdirectory(erasure-code EXCLUDE_FROM_ALL) +#make check ends here + if(${WITH_RADOSGW}) # test_cors set(test_cors_srcs test_cors.cc) diff --git a/src/test/erasure-code/CMakeLists.txt b/src/test/erasure-code/CMakeLists.txt new file mode 100644 index 0000000000000..613e8f009b51c --- /dev/null +++ b/src/test/erasure-code/CMakeLists.txt @@ -0,0 +1,242 @@ +# make check tests for erasure-code directory +# unittest_erasure_code_plugin +add_executable(unittest_erasure_code_plugin EXCLUDE_FROM_ALL + 
${CMAKE_SOURCE_DIR}/src/erasure-code/ErasureCode.cc + TestErasureCodePlugin.cc + ) +add_test(unittest_erasure_code_plugin unittest_erasure_code_plugin) +add_dependencies(check unittest_erasure_code_plugin) +target_link_libraries(unittest_erasure_code_plugin + global + osd + common + dl + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${UNITTEST_LIBS}) +set_target_properties(unittest_erasure_code_plugin PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +# unittest_erasure_code +add_executable(unittest_erasure_code EXCLUDE_FROM_ALL + ${CMAKE_SOURCE_DIR}/src/erasure-code/ErasureCode.cc + TestErasureCode.cc + ) +add_test(unittest_erasure_code unittest_erasure_code) +add_dependencies(check unittest_erasure_code) +target_link_libraries(unittest_erasure_code + global + osd + common + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${UNITTEST_LIBS}) +set_target_properties(unittest_erasure_code PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +# unittest_erasure_code_jerasure +add_executable(unittest_erasure_code_jerasure EXCLUDE_FROM_ALL + TestErasureCodeJerasure.cc +) +add_test(unittest_erasure_code_jerasure unittest_erasure_code_jerasure) +add_dependencies(check unittest_erasure_code_jerasure) +target_link_libraries(unittest_erasure_code_jerasure + global + osd + common + ec_jerasure_generic + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${UNITTEST_LIBS}) +set(UNITTEST_ERASURE_CODE_JERASURE_CXXFLAGS "-Ierasure-code/jerasure/gf-complete/include -Ierasure-code/jerasure/jerasure/include") +set_target_properties(unittest_erasure_code_jerasure PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +# unittest_erasure_code_jerasure_plugin +add_executable(unittest_erasure_code_jerasure_plugin EXCLUDE_FROM_ALL + TestErasureCodePluginJerasure.cc + ) +add_test(unittest_erasure_code_jerasure_plugin unittest_erasure_code_jerasure_plugin) +add_dependencies(check unittest_erasure_code_jerasure_plugin) +target_link_libraries(unittest_erasure_code_jerasure_plugin + global + osd + common + ${CMAKE_DL_LIBS} + 
${TCMALLOC_LIBS} + ${UNITTEST_LIBS}) +set_target_properties(unittest_erasure_code_jerasure_plugin PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +#not sure how to check for YASM_EFL64 right now, setting it true +set(WITH_BETTER_YASM_ELF64 1) +if(WITH_BETTER_YASM_ELF64) + +#unittest_erasure_code_isa +add_executable(unittest_erasure_code_isa EXCLUDE_FROM_ALL + ${CMAKE_SOURCE_DIR}/src/erasure-code/ErasureCode.cc + TestErasureCodeIsa.cc +) +add_test(unittest_erasure_code_isa unittest_erasure_code_isa) +add_dependencies(check unittest_erasure_code_isa) +target_link_libraries(unittest_erasure_code_isa + global + osd + common + ec_isa + erasure_code + dl + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${UNITTEST_LIBS}) +set_target_properties(unittest_erasure_code_isa PROPERTIES COMPILE_FLAGS +${UNITTEST_CXX_FLAGS}) + +#unittest_erasure_code_plugin_isa +add_executable(unittest_erasure_code_plugin_isa EXCLUDE_FROM_ALL + ${CMAKE_SOURCE_DIR}/src/erasure-code/ErasureCode.cc + TestErasureCodePluginIsa.cc +) +add_test(unittest_erasure_code_plugin_isa unittest_erasure_code_plugin_isa) +add_dependencies(check unittest_erasure_code_plugin_isa) +target_link_libraries(unittest_erasure_code_plugin_isa + global + osd + common + ec_isa + crush + dl + erasure_code + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${UNITTEST_LIBS}) +set_target_properties(unittest_erasure_code_plugin_isa PROPERTIES COMPILE_FLAGS +${UNITTEST_CXX_FLAGS}) +endif(WITH_BETTER_YASM_ELF64) + +# unittest_erasure_code_lrc +add_executable(unittest_erasure_code_lrc EXCLUDE_FROM_ALL + TestErasureCodeLrc.cc + ${lrc_srcs} + ) +add_test(unittest_erasure_code_lrc unittest_erasure_code_lrc) +add_dependencies(check unittest_erasure_code_lrc) +target_link_libraries(unittest_erasure_code_lrc + global + osd + dl + ec_lrc + common + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${UNITTEST_LIBS}) +set_target_properties(unittest_erasure_code_lrc PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +# unittest_erasure_code_plugin_lrc 
+add_executable(unittest_erasure_code_plugin_lrc EXCLUDE_FROM_ALL + TestErasureCodePluginLrc.cc + ) +add_test(unittest_erasure_code_plugin_lrc unittest_erasure_code_plugin_lrc) +add_dependencies(check unittest_erasure_code_plugin_lrc) +target_link_libraries(unittest_erasure_code_plugin_lrc + global + osd + dl + common + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${UNITTEST_LIBS}) +set_target_properties(unittest_erasure_code_plugin_lrc PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +# unittest_erasure_code_shec +add_executable(unittest_erasure_code_shec EXCLUDE_FROM_ALL + TestErasureCodePluginShec.cc + ) +add_test(unittest_erasure_code_shec unittest_erasure_code_shec) +add_dependencies(check unittest_erasure_code_shec) +target_link_libraries(unittest_erasure_code_shec + global + osd + dl + common + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${UNITTEST_LIBS}) +set_target_properties(unittest_erasure_code_shec PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +# unittest_erasure_code_shec_thread +add_executable(unittest_erasure_code_shec_thread EXCLUDE_FROM_ALL + TestErasureCodeShec_thread.cc + ) +add_test(unittest_erasure_code_shec_thread unittest_erasure_code_shec_thread) +add_dependencies(check unittest_erasure_code_shec_thread) +target_link_libraries(unittest_erasure_code_shec_thread + global + osd + dl + common + ec_shec_generic + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${UNITTEST_LIBS}) +set_target_properties(unittest_erasure_code_shec_thread PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +# unittest_erasure_code_shec_arguments +add_executable(unittest_erasure_code_shec_arguments EXCLUDE_FROM_ALL + TestErasureCodeShec_arguments.cc + ) +add_test(unittest_erasure_code_shec_arguments unittest_erasure_code_shec_arguments) +add_dependencies(check unittest_erasure_code_shec_arguments) +target_link_libraries(unittest_erasure_code_shec_arguments + global + osd + dl + common + ec_shec_generic + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${UNITTEST_LIBS}) 
+set_target_properties(unittest_erasure_code_shec_arguments PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +# unittest_erasure_code_plugin_shec +add_executable(unittest_erasure_code_plugin_shec EXCLUDE_FROM_ALL + TestErasureCodePluginShec.cc + ) +add_test(unittest_erasure_code_plugin_shec unittest_erasure_code_plugin_shec) +add_dependencies(check unittest_erasure_code_plugin_shec) +target_link_libraries(unittest_erasure_code_plugin_shec + global + osd + dl + common + ec_shec_generic + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${UNITTEST_LIBS}) +set_target_properties(unittest_erasure_code_plugin_shec PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +# unittest_erasure_code_example +add_executable(unittest_erasure_code_example EXCLUDE_FROM_ALL +${CMAKE_SOURCE_DIR}/src/erasure-code/ErasureCode.cc + TestErasureCodeExample.cc +) +add_test(unittest_erasure_code_example unittest_erasure_code_example) +add_dependencies(check unittest_erasure_code_example) +target_link_libraries(unittest_erasure_code_example + global + osd + dl + common + erasure_code + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${UNITTEST_LIBS}) +set_target_properties(unittest_erasure_code_example PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + From 8ddca17fbf77a69f1202d8bc7af747e24ec9d90a Mon Sep 17 00:00:00 2001 From: Ali Maredia Date: Tue, 18 Aug 2015 14:34:54 -0400 Subject: [PATCH 464/654] cmake: Cleaned up syntax for make check targets Signed-off-by: Ali Maredia --- src/test/CMakeLists.txt | 201 ++++++++------------------- src/test/erasure-code/CMakeLists.txt | 134 ++++++++++-------- 2 files changed, 140 insertions(+), 195 deletions(-) diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 8baa194d76824..267fcf0b93beb 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -348,11 +348,8 @@ set(UNITTEST_LIBS gmock_main gmock gtest ${PTHREAD_LIBS}) set(UNITTEST_CXX_FLAGS "-I${CMAKE_SOURCE_DIR}/src/gmock/include -I${CMAKE_BINARY_DIR}/src/gmock/include 
-I${CMAKE_SOURCE_DIR}/src/gmock/gtest/include -I${CMAKE_BINARY_DIR}/src/gmock/gtest/include -fno-strict-aliasing") # unittest_encoding -set(unittest_encoding_srcs - encoding.cc - ) add_executable(unittest_encoding EXCLUDE_FROM_ALL - ${unittest_encoding_srcs} + encoding.cc $ ) add_test(unittest_encoding unittest_encoding) @@ -363,11 +360,8 @@ set_target_properties(unittest_encoding PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_addrs -set(unittest_addrs_srcs - test_addrs.cc - ) add_executable(unittest_addrs EXCLUDE_FROM_ALL - ${unittest_addrs_srcs} + test_addrs.cc $ ) add_test(unittest_addrs unittest_addrs) @@ -378,9 +372,8 @@ set_target_properties(unittest_addrs PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_blkdev -set(unittest_blkdev_srcs common/test_blkdev.cc) add_executable(unittest_blkdev EXCLUDE_FROM_ALL - ${unittest_blkdev_srcs} + common/test_blkdev.cc $ ) add_test(unittest_blkdev unittest_blkdev) @@ -395,11 +388,8 @@ set_target_properties(unittest_blkdev PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_bloom_filter -set(unittest_bloom_filter_srcs - common/test_bloom_filter.cc - ) add_executable(unittest_bloom_filter EXCLUDE_FROM_ALL - ${unittest_bloom_filter_srcs} + common/test_bloom_filter.cc $ ) add_test(unittest_bloom_filter unittest_bloom_filter) @@ -434,11 +424,8 @@ set_target_properties(unittest_prioritized_queue PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_str_map -set(unittest_str_map_srcs - common/test_str_map.cc - ) add_executable(unittest_str_map EXCLUDE_FROM_ALL - ${unittest_str_map_srcs} + common/test_str_map.cc $ ) add_test(unittest_str_map unittest_str_map) @@ -449,11 +436,8 @@ set_target_properties(unittest_str_map PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_sharedptr_registry -set(unittest_sharedptr_registry_srcs - common/test_sharedptr_registry.cc - ) add_executable(unittest_sharedptr_registry EXCLUDE_FROM_ALL - ${unittest_sharedptr_registry_srcs} + 
common/test_sharedptr_registry.cc $ ) add_test(unittest_sharedptr_registry unittest_sharedptr_registry) @@ -464,11 +448,8 @@ set_target_properties(unittest_sharedptr_registry PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_shared_cache -set(unittest_shared_cache_srcs - common/test_shared_cache.cc - ) add_executable(unittest_shared_cache EXCLUDE_FROM_ALL - ${unittest_shared_cache_srscs} + common/test_shared_cache.cc $ ) add_test(unittest_shared_cache unittest_shared_cache) @@ -479,11 +460,8 @@ set_target_properties(unittest_shared_cache PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_sloppy_crc_map -set(unittest_sloppy_crc_map_srcs - common/test_sloppy_crc_map.cc - ) add_executable(unittest_sloppy_crc_map EXCLUDE_FROM_ALL - ${unittest_sloppy_crc_map_srcs} + common/test_sloppy_crc_map.cc $ ) add_test(unittest_sloppy_crc_map unittest_sloppy_crc_map) @@ -494,12 +472,9 @@ set_target_properties(unittest_sloppy_crc_map PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_util -set(unittest_util_srcs +add_executable(unittest_util EXCLUDE_FROM_ALL common/test_util.cc ${CMAKE_SOURCE_DIR}/src/common/util.cc - ) -add_executable(unittest_util EXCLUDE_FROM_ALL - ${unittest_util_srcs} $ ) add_test(unittest_util unittest_util) @@ -515,9 +490,8 @@ set_target_properties(unittest_util PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_crush_wrapper -set(unittest_crush_wrapper_srcs crush/CrushWrapper.cc) add_executable(unittest_crush_wrapper EXCLUDE_FROM_ALL - ${unittest_crush_wrapper_srcs} + crush/CrushWrapper.cc $ ) add_test(unittest_crush_wrapper unittest_crush_wrapper) @@ -528,9 +502,8 @@ set_target_properties(unittest_crush_wrapper PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_crush -set(unittest_crush_srcs crush/crush.cc) add_executable(unittest_crush EXCLUDE_FROM_ALL - ${unittest_crush_srcs} + crush/crush.cc $ ) add_test(unittest_crush unittest_crush) @@ -541,9 +514,8 @@ set_target_properties(unittest_crush PROPERTIES 
COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_osdmap -set(unittest_osdmap_srcs osd/TestOSDMap.cc) add_executable(unittest_osdmap EXCLUDE_FROM_ALL - ${unittest_osdmap_srcs} + osd/TestOSDMap.cc $ ) add_test(unittest_osdmap unittest_osdmap) @@ -554,9 +526,8 @@ set_target_properties(unittest_osdmap PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_workqueue -set(unittest_workqueue_srcs test_workqueue.cc) add_executable(unittest_workqueue EXCLUDE_FROM_ALL - ${unittest_workqueue_srcs} + test_workqueue.cc $ ) add_test(unittest_workqueue unittest_workqueue) @@ -568,9 +539,8 @@ set_target_properties(unittest_workqueue PROPERTIES COMPILE_FLAGS # unittest_striper -set(unittest_striper_srcs test_striper.cc) add_executable(unittest_striper EXCLUDE_FROM_ALL - ${unittest_striper_srcs} + test_striper.cc $ ) add_test(unittest_striper unittest_striper) @@ -581,9 +551,8 @@ set_target_properties(unittest_striper PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_prebufferedstreambuf -set(unittest_prebufferedstreambuf_srcs test_prebufferedstreambuf.cc) add_executable(unittest_prebufferedstreambuf EXCLUDE_FROM_ALL - ${unittest_prebufferedstreambuf_srcs} + test_prebufferedstreambuf.cc $ ) add_test(unittest_prebufferedstreambuf unittest_prebufferedstreambuf) @@ -594,9 +563,8 @@ set_target_properties(unittest_prebufferedstreambuf PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_str_list -set(unittest_str_list_srcs test_str_list.cc) add_executable(unittest_str_list EXCLUDE_FROM_ALL - ${unittest_str_list_srcs} + test_str_list.cc $ ) add_test(unittest_str_list unittest_str_list) @@ -607,9 +575,8 @@ set_target_properties(unittest_str_list PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_log -set(unittest_log_srcs ${CMAKE_SOURCE_DIR}/src/log/test.cc) add_executable(unittest_log EXCLUDE_FROM_ALL - ${unittest_log_srcs} + ${CMAKE_SOURCE_DIR}/src/log/test.cc $ ) add_test(unittest_log unittest_log) @@ -620,9 +587,8 @@ set_target_properties(unittest_log 
PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_throttle -set(unittest_throttle_srcs common/Throttle.cc) add_executable(unittest_throttle EXCLUDE_FROM_ALL - ${unittest_throttle_srcs} + common/Throttle.cc $ ) add_test(unittest_throttle unittest_throttle) @@ -633,9 +599,8 @@ set_target_properties(unittest_throttle PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_base64 -set(unittest_base64_srcs base64.cc) add_executable(unittest_base64 EXCLUDE_FROM_ALL - ${unittest_base64_srcs} + base64.cc $ ) add_test(unittest_base64 unittest_base64) @@ -644,9 +609,8 @@ target_link_libraries(unittest_base64 global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} $ set_target_properties(unittest_base64 PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_ceph_argparse -set(unittest_ceph_argparse_srcs ceph_argparse.cc) add_executable(unittest_ceph_argparse EXCLUDE_FROM_ALL - ${unittest_ceph_argparse_srcs} + ceph_argparse.cc $ ) add_test(unittest_ceph_argparse unittest_ceph_argparse) @@ -657,9 +621,8 @@ set_target_properties(unittest_ceph_argparse PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_ceph_compatset -set(unittest_ceph_compatset_srcs ceph_compatset.cc) add_executable(unittest_ceph_compatset EXCLUDE_FROM_ALL - ${unittest_ceph_compatset_srcs} + ceph_compatset.cc $ ) add_test(unittest_ceph_compatset unittest_ceph_compatset) @@ -681,9 +644,8 @@ target_link_libraries(unittest_mds_types global ${CMAKE_DL_LIBS} set_target_properties(unittest_mds_types PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_osd_types -set(unittest_osd_types_srcs osd/types.cc) add_executable(unittest_osd_types EXCLUDE_FROM_ALL - ${unittest_osd_types_srcs} + osd/types.cc $ ) add_test(unittest_osd_types unittest_osd_types) @@ -718,9 +680,8 @@ set_target_properties(unittest_io_priority PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_gather -set(unittest_gather_srcs gather.cc) add_executable(unittest_gather EXCLUDE_FROM_ALL - ${unittest_gather_srcs} + gather.cc $ ) 
add_test(unittest_gather unittest_gather) @@ -731,9 +692,8 @@ set_target_properties(unittest_gather PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # run_cmd -set(unittest_run_cmd_srcs run_cmd.cc) add_executable(unittest_run_cmd EXCLUDE_FROM_ALL - ${unittest_run_cmd_srcs} + run_cmd.cc $ ) add_test(unittest_run_cmd unittest_run_cmd) @@ -744,9 +704,8 @@ set_target_properties(unittest_run_cmd PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # signals -set(unittest_signals_srcs signals.cc) add_executable(unittest_signals EXCLUDE_FROM_ALL - ${unittest_signals_srcs} + signals.cc $ ) add_test(unittest_signals unittest_signals) @@ -757,9 +716,8 @@ set_target_properties(unittest_signals PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_simple_spin -set(unittest_simple_spin_srcs simple_spin.cc) add_executable(unittest_simple_spin EXCLUDE_FROM_ALL - ${unittest_simple_spin_srcs} + simple_spin.cc $ ) add_test(unittest_simple_spin unittest_simple_spin) @@ -770,9 +728,8 @@ set_target_properties(unittest_simple_spin PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_bufferlist -set(unittest_bufferlist_srcs bufferlist.cc) add_executable(unittest_bufferlist EXCLUDE_FROM_ALL - ${unittest_bufferlist_srcs} + bufferlist.cc $ ) add_test(unittest_bufferlist unittest_bufferlist) @@ -795,9 +752,8 @@ set_target_properties(unittest_xlist PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_librados -set(unittest_librados_srcs librados/librados.cc) add_executable(unittest_librados EXCLUDE_FROM_ALL - ${unittest_librados_srcs} + librados/librados.cc $ ) add_test(unittest_librados unittest_librados) @@ -814,9 +770,8 @@ set_target_properties(unittest_librados PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_crc32 -set(unittest_crc32_srcs common/test_crc32c.cc) add_executable(unittest_crc32 EXCLUDE_FROM_ALL - ${unittest_crc32_srcs} + common/test_crc32c.cc $ ) add_test(unittest_crc32 unittest_crc32) @@ -827,9 +782,8 @@ set_target_properties(unittest_crc32 PROPERTIES 
COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_arch -set(unittest_arch_srcs test_arch.cc) add_executable(unittest_arch EXCLUDE_FROM_ALL - ${unittest_arch_srcs} + test_arch.cc $ ) add_test(unittest_arch unittest_arch) @@ -852,9 +806,8 @@ set_target_properties(unittest_crypto PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_crypto_init -set(unittest_crypto_init_srcs crypto_init.cc) add_executable(unittest_crypto_init EXCLUDE_FROM_ALL - ${unittest_crypto_init_srcs} + crypto_init.cc $ ) add_test(unittest_crypto_init unittest_crypto_init) @@ -865,9 +818,8 @@ set_target_properties(unittest_crypto_init PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_perf_counters -set(unittest_perf_counters_srcs perf_counters.cc) add_executable(unittest_perf_counters EXCLUDE_FROM_ALL - ${unittest_perf_counters_srcs} + perf_counters.cc $ ) add_test(unittest_perf_counters unittest_perf_counters) @@ -878,9 +830,8 @@ set_target_properties(unittest_perf_counters PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_admin_socket -set(unittest_admin_socket_srcs admin_socket.cc) add_executable(unittest_admin_socket EXCLUDE_FROM_ALL - ${unittest_admin_socket_srcs} + admin_socket.cc $ ) add_test(unittest_admin_socket unittest_admin_socket) @@ -891,9 +842,8 @@ set_target_properties(unittest_admin_socket PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_ceph_crypto -set(unittest_ceph_crypto_srcs ceph_crypto.cc) add_executable(unittest_ceph_crypto EXCLUDE_FROM_ALL - ${unittest_ceph_crypto_srcs} + ceph_crypto.cc $ ) add_test(unittest_ceph_crypto unittest_ceph_crypto) @@ -904,9 +854,8 @@ set_target_properties(unittest_ceph_crypto PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_utf8 -set(unittest_utf8_srcs utf8.cc) add_executable(unittest_utf8 EXCLUDE_FROM_ALL - ${unittest_utf8_srcs} + utf8.cc $ ) add_test(unittest_utf8 unittest_utf8) @@ -917,9 +866,8 @@ set_target_properties(unittest_utf8 PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_mime 
-set(unittest_mime_srcs mime.cc) add_executable(unittest_mime EXCLUDE_FROM_ALL - ${unittest_mime_srcs} + mime.cc $ ) add_test(unittest_mime unittest_mime) @@ -930,9 +878,8 @@ set_target_properties(unittest_mime PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_escape -set(unittest_escape_srcs escape.cc) add_executable(unittest_escape EXCLUDE_FROM_ALL - ${unittest_escape_srcs} + escape.cc $ ) add_test(unittest_escape unittest_escape) @@ -961,9 +908,8 @@ target_link_libraries(unittest_chain_xattr ) # unittest_strtol -set(unittest_strtol_srcs strtol.cc) add_executable(unittest_strtol EXCLUDE_FROM_ALL - ${unittest_strtol_srcs} + strtol.cc $ ) add_test(unittest_strtol unittest_strtol) @@ -974,9 +920,8 @@ set_target_properties(unittest_strtol PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_confutils -set(unittest_confutils_srcs confutils.cc) add_executable(unittest_confutils EXCLUDE_FROM_ALL - ${unittest_confutils_srcs} + confutils.cc $ ) add_test(unittest_confutils unittest_confutils) @@ -987,9 +932,8 @@ set_target_properties(unittest_confutils PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_config -set(unittest_config_srcs common/test_config.cc) add_executable(unittest_config EXCLUDE_FROM_ALL - ${unittest_config_srcs} + common/test_config.cc $ ) add_test(unittest_config unittest_config) @@ -1000,9 +944,8 @@ set_target_properties(unittest_config PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_context -set(unittest_context_srcs common/test_context.cc) add_executable(unittest_context EXCLUDE_FROM_ALL - ${unittest_context_srcs} + common/test_context.cc $ ) add_test(unittest_context unittest_context) @@ -1013,11 +956,8 @@ set_target_properties(unittest_context PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_chain_xattr -set(unittest_chain_xattr_srcs - objectstore/chain_xattr.cc - ) add_executable(unittest_chain_xattr EXCLUDE_FROM_ALL - ${unittest_chain_xattr_srcs} + objectstore/chain_xattr.cc $ ) 
set_target_properties(unittest_chain_xattr PROPERTIES COMPILE_FLAGS @@ -1033,11 +973,8 @@ target_link_libraries(unittest_chain_xattr ) # unittest_flatindex -set(unittest_flatindex_srcs - os/TestFlatIndex.cc - ) add_executable(unittest_flatindex EXCLUDE_FROM_ALL - ${unittest_flatindex_srcs} + os/TestFlatIndex.cc $ ) add_test(unittest_flatindex unittest_flatindex) @@ -1053,9 +990,8 @@ set_target_properties(unittest_flatindex PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_safe_io -set(unittest_safe_io_srcs test_safe_io.cc) add_executable(unittest_safe_io EXCLUDE_FROM_ALL - ${unittest_safe_op_srcs} + common/test_safe_io.cc $ ) add_test(unittest_safe_io unittest_safe_io) @@ -1066,9 +1002,8 @@ set_target_properties(unittest_safe_io PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_heartbeatmap -set(unittest_heartbeatmap_srcs heartbeat_map.cc) add_executable(unittest_heartbeatmap EXCLUDE_FROM_ALL - ${unittest_heartbeatmap_srcs} + heartbeat_map.cc $ ) add_test(unittest_heartbeatmap unittest_heartbeatmap) @@ -1081,10 +1016,8 @@ set_target_properties(unittest_heartbeatmap PROPERTIES COMPILE_FLAGS if(${WITH_RADOSGW}) # unittest_formatter # why does this include rgw/rgw_formats.cc...? 
- set(unittest_formatter_srcs formatter.cc - ${CMAKE_SOURCE_DIR}/src/rgw/rgw_formats.cc) add_executable(unittest_formatter EXCLUDE_FROM_ALL - ${unittest_formatter_srcs} + ${CMAKE_SOURCE_DIR}/src/rgw/rgw_formats.cc $ ) add_test(unittest_formatter unittest_formatter) @@ -1096,9 +1029,8 @@ if(${WITH_RADOSGW}) endif(${WITH_RADOSGW}) # unittest_daemon_config -set(unittest_daemon_config_srcs daemon_config.cc) add_executable(unittest_daemon_config EXCLUDE_FROM_ALL - ${unittest_daemon_config_srcs} + daemon_config.cc $ ) add_test(unittest_daemon_config unittest_daemon_config) @@ -1116,9 +1048,8 @@ set_target_properties(unittest_daemon_config PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_libcephfs_config -set(unittest_libcephfs_config_srcs libcephfs_config.cc) add_executable(unittest_libcephfs_config EXCLUDE_FROM_ALL - ${unittest_libcephfs_config_srcs} + libcephfs_config.cc $ ) add_test(unittest_libcephfs_config unittest_libcephfs_config) @@ -1129,9 +1060,8 @@ set_target_properties(unittest_libcephfs_config PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_lfnindex -set(unittest_lfnindex_srcs os/TestLFNIndex.cc) add_executable(unittest_lfnindex EXCLUDE_FROM_ALL - ${unittest_lfnindex_srcs} + os/TestLFNIndex.cc $ ) add_test(unittest_lfnindex unittest_lfnindex) @@ -1161,9 +1091,8 @@ set_target_properties(unittest_librados_config PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_rbd_replay -set(unittest_rbd_replay_srcs test_rbd_replay.cc) add_executable(unittest_rbd_replay EXCLUDE_FROM_ALL - ${unittest_librados_config_srcs} + test_rbd_replay.cc $ $ ) @@ -1185,9 +1114,9 @@ set_target_properties(unittest_rbd_replay PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_mon_moncap -set(unittest_mon_moncap_srcs mon/moncap.cc) add_executable(unittest_mon_moncap EXCLUDE_FROM_ALL ${unittest_mon_moncap_srcs} + mon/moncap.cc $ ) add_test(unittest_mon_moncap unittest_mon_moncap) @@ -1198,8 +1127,8 @@ set_target_properties(unittest_mon_moncap PROPERTIES 
COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_mon_pgmap -set(unittest_mon_pgmap_srcs mon/PGMap.cc) add_executable(unittest_mon_pgmap EXCLUDE_FROM_ALL + mon/PGMap.cc ${unittest_mon_moncap_srcs} $ ) @@ -1211,9 +1140,8 @@ set_target_properties(unittest_mon_pgmap PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_ecbackend -set(unittest_ecbackend_srcs osd/TestECBackend.cc) add_executable(unittest_ecbackend EXCLUDE_FROM_ALL - ${unittest_ecbackend_srcs} + osd/TestECBackend.cc $ ) add_test(unittest_ecbackend unittest_ecbackend) @@ -1283,6 +1211,7 @@ target_link_libraries(unittest_snap_mapper osd global ${CMAKE_DL_LIBS} set_target_properties(unittest_snap_mapper PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) +#TODO: rocksdb unittests if(WITH_SLIBROCKSDB) # unittest_rocksdb_option_static add_executable(unittest_rocksdb_option_static EXCLUDE_FROM_ALL @@ -1293,7 +1222,7 @@ add_test(unittest_rocksdb_option_static unittest_rocksdb_option_static) add_dependencies(check unittest_rocksdb_option_static) target_link_libraries(unittest_rocksdb_option_static os librocksdb global ${CMAKE_DL_LIBS} ${BLKID_LIBRARIES} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) -set(UNITTEST_ROCKSDB_STATIC_FLAGS "-std=gnu++11 -I rocksdb/include") +set(UNITTEST_ROCKSDB_STATIC_FLAGS "-std=gnu++11 -Irocksdb/include") set_target_properties(unittest_rocksdb_option_static PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS} ${LIBROCKSDB_CFLAGS} ${AM_CXXFLAGS} ${UNITTEST_ROCKSDB_STATIC_FLAGS}) endif(WITH_SLIBROCKSDB) @@ -1336,12 +1265,9 @@ set_target_properties(unittest_ipaddr PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_texttable -set(unittest_texttable_srcs +add_executable(unittest_texttable EXCLUDE_FROM_ALL test_texttable.cc ${CMAKE_SOURCE_DIR}/src/common/TextTable.cc - ) -add_executable(unittest_texttable EXCLUDE_FROM_ALL - ${unittest_texttable_srcs} $ ) add_test(unittest_texttable unittest_texttable) @@ -1352,9 +1278,8 @@ set_target_properties(unittest_texttable PROPERTIES COMPILE_FLAGS 
${UNITTEST_CXX_FLAGS}) # unittest_on_exit -set(unittest_on_exit_srcs on_exit.cc) add_executable(unittest_on_exit EXCLUDE_FROM_ALL - ${unittest_on_exit_srcs} + on_exit.cc $ ) add_test(unittest_on_exit unittest_on_exit) @@ -1370,9 +1295,8 @@ set_target_properties(unittest_on_exit PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_readahead -set(unittest_readahead_srcs common/Readahead.cc) add_executable(unittest_readahead EXCLUDE_FROM_ALL - ${unittest_readahead_srcs} + common/Readahead.cc $ ) add_test(unittest_readahead unittest_readahead) @@ -1388,9 +1312,8 @@ set_target_properties(unittest_readahead PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_tableformatter -set(unittest_tableformatter_srcs common/test_tableformatter.cc) add_executable(unittest_tableformatter EXCLUDE_FROM_ALL - ${unittest_tableformatter_srcs} + common/test_tableformatter.cc $ ) add_test(unittest_tableformatter unittest_tableformatter) @@ -1406,9 +1329,8 @@ set_target_properties(unittest_tableformatter PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_bit_vector -set(unittest_bit_vector_srcs common/test_bit_vector.cc) add_executable(unittest_bit_vector EXCLUDE_FROM_ALL - ${unittest_bit_vector_srcs} + common/test_bit_vector.cc $ ) add_test(unittest_bit_vector unittest_bit_vector) @@ -1424,9 +1346,8 @@ set_target_properties(unittest_bit_vector PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_subprocess -set(unittest_subprocess_srcs test_subprocess.cc) add_executable(unittest_subprocess EXCLUDE_FROM_ALL - ${unittest_subprocess_srcs} + test_subprocess.cc $ ) add_test(unittest_subprocess unittest_subprocess) diff --git a/src/test/erasure-code/CMakeLists.txt b/src/test/erasure-code/CMakeLists.txt index 613e8f009b51c..4ef8430dade6a 100644 --- a/src/test/erasure-code/CMakeLists.txt +++ b/src/test/erasure-code/CMakeLists.txt @@ -34,24 +34,6 @@ target_link_libraries(unittest_erasure_code set_target_properties(unittest_erasure_code PROPERTIES COMPILE_FLAGS 
${UNITTEST_CXX_FLAGS}) -# unittest_erasure_code_jerasure -add_executable(unittest_erasure_code_jerasure EXCLUDE_FROM_ALL - TestErasureCodeJerasure.cc -) -add_test(unittest_erasure_code_jerasure unittest_erasure_code_jerasure) -add_dependencies(check unittest_erasure_code_jerasure) -target_link_libraries(unittest_erasure_code_jerasure - global - osd - common - ec_jerasure_generic - ${CMAKE_DL_LIBS} - ${TCMALLOC_LIBS} - ${UNITTEST_LIBS}) -set(UNITTEST_ERASURE_CODE_JERASURE_CXXFLAGS "-Ierasure-code/jerasure/gf-complete/include -Ierasure-code/jerasure/jerasure/include") -set_target_properties(unittest_erasure_code_jerasure PROPERTIES COMPILE_FLAGS - ${UNITTEST_CXX_FLAGS}) - # unittest_erasure_code_jerasure_plugin add_executable(unittest_erasure_code_jerasure_plugin EXCLUDE_FROM_ALL TestErasureCodePluginJerasure.cc @@ -150,30 +132,73 @@ target_link_libraries(unittest_erasure_code_plugin_lrc set_target_properties(unittest_erasure_code_plugin_lrc PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) -# unittest_erasure_code_shec -add_executable(unittest_erasure_code_shec EXCLUDE_FROM_ALL +# unittest_erasure_code_plugin_shec +add_executable(unittest_erasure_code_plugin_shec EXCLUDE_FROM_ALL TestErasureCodePluginShec.cc ) -add_test(unittest_erasure_code_shec unittest_erasure_code_shec) -add_dependencies(check unittest_erasure_code_shec) -target_link_libraries(unittest_erasure_code_shec +add_test(unittest_erasure_code_plugin_shec unittest_erasure_code_plugin_shec) +add_dependencies(check unittest_erasure_code_plugin_shec) +target_link_libraries(unittest_erasure_code_plugin_shec global osd dl common + ec_shec_generic ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) -set_target_properties(unittest_erasure_code_shec PROPERTIES COMPILE_FLAGS +set_target_properties(unittest_erasure_code_plugin_shec PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) -# unittest_erasure_code_shec_thread -add_executable(unittest_erasure_code_shec_thread EXCLUDE_FROM_ALL - TestErasureCodeShec_thread.cc +# 
unittest_erasure_code_example +add_executable(unittest_erasure_code_example EXCLUDE_FROM_ALL + ${CMAKE_SOURCE_DIR}/src/erasure-code/ErasureCode.cc + TestErasureCodeExample.cc +) +add_test(unittest_erasure_code_example unittest_erasure_code_example) +add_dependencies(check unittest_erasure_code_example) +target_link_libraries(unittest_erasure_code_example + global + osd + dl + common + erasure_code + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${UNITTEST_LIBS}) +set_target_properties(unittest_erasure_code_example PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +include_directories(${CMAKE_SOURCE_DIR}/src/erasure-code/jerasure/jerasure/include) +include_directories(${CMAKE_SOURCE_DIR}/src/erasure-code//jerasure/gf-complete/include) + +# unittest_erasure_code_jerasure +add_executable(unittest_erasure_code_jerasure EXCLUDE_FROM_ALL + TestErasureCodeJerasure.cc ) -add_test(unittest_erasure_code_shec_thread unittest_erasure_code_shec_thread) -add_dependencies(check unittest_erasure_code_shec_thread) -target_link_libraries(unittest_erasure_code_shec_thread +add_test(unittest_erasure_code_jerasure unittest_erasure_code_jerasure) +add_dependencies(check unittest_erasure_code_jerasure) +target_link_libraries(unittest_erasure_code_jerasure + global + osd + common + ec_jerasure_generic + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${UNITTEST_LIBS}) +set_target_properties(unittest_erasure_code_jerasure PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + +include_directories(${CMAKE_SOURCE_DIR}/src/erasure-code/jerasure) +include_directories(${CMAKE_SOURCE_DIR}/src/erasure-code/shec) + +# unittest_erasure_code_shec +add_executable(unittest_erasure_code_shec + TestErasureCodeShec.cc + ) +add_test(unittest_erasure_code_shec unittest_erasure_code_shec) +add_dependencies(check unittest_erasure_code_shec) +target_link_libraries(unittest_erasure_code_shec global osd dl @@ -182,16 +207,16 @@ target_link_libraries(unittest_erasure_code_shec_thread ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} 
${UNITTEST_LIBS}) -set_target_properties(unittest_erasure_code_shec_thread PROPERTIES COMPILE_FLAGS +set_target_properties(unittest_erasure_code_shec PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) -# unittest_erasure_code_shec_arguments -add_executable(unittest_erasure_code_shec_arguments EXCLUDE_FROM_ALL - TestErasureCodeShec_arguments.cc +# unittest_erasure_code_shec_all +add_executable(unittest_erasure_code_shec_all EXCLUDE_FROM_ALL + TestErasureCodeShec_all.cc ) -add_test(unittest_erasure_code_shec_arguments unittest_erasure_code_shec_arguments) -add_dependencies(check unittest_erasure_code_shec_arguments) -target_link_libraries(unittest_erasure_code_shec_arguments +add_test(unittest_erasure_code_shec_all unittest_erasure_code_shec_all) +add_dependencies(check unittest_erasure_code_shec_all) +target_link_libraries(unittest_erasure_code_shec_all global osd dl @@ -200,16 +225,16 @@ target_link_libraries(unittest_erasure_code_shec_arguments ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) -set_target_properties(unittest_erasure_code_shec_arguments PROPERTIES COMPILE_FLAGS +set_target_properties(unittest_erasure_code_shec_all PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) -# unittest_erasure_code_plugin_shec -add_executable(unittest_erasure_code_plugin_shec EXCLUDE_FROM_ALL - TestErasureCodePluginShec.cc +# unittest_erasure_code_shec_thread +add_executable(unittest_erasure_code_shec_thread EXCLUDE_FROM_ALL + TestErasureCodeShec_thread.cc ) -add_test(unittest_erasure_code_plugin_shec unittest_erasure_code_plugin_shec) -add_dependencies(check unittest_erasure_code_plugin_shec) -target_link_libraries(unittest_erasure_code_plugin_shec +add_test(unittest_erasure_code_shec_thread unittest_erasure_code_shec_thread) +add_dependencies(check unittest_erasure_code_shec_thread) +target_link_libraries(unittest_erasure_code_shec_thread global osd dl @@ -218,25 +243,24 @@ target_link_libraries(unittest_erasure_code_plugin_shec ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} 
${UNITTEST_LIBS}) -set_target_properties(unittest_erasure_code_plugin_shec PROPERTIES COMPILE_FLAGS +set_target_properties(unittest_erasure_code_shec_thread PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) -# unittest_erasure_code_example -add_executable(unittest_erasure_code_example EXCLUDE_FROM_ALL -${CMAKE_SOURCE_DIR}/src/erasure-code/ErasureCode.cc - TestErasureCodeExample.cc -) -add_test(unittest_erasure_code_example unittest_erasure_code_example) -add_dependencies(check unittest_erasure_code_example) -target_link_libraries(unittest_erasure_code_example +# unittest_erasure_code_shec_arguments +add_executable(unittest_erasure_code_shec_arguments EXCLUDE_FROM_ALL + TestErasureCodeShec_arguments.cc + ) +add_test(unittest_erasure_code_shec_arguments unittest_erasure_code_shec_arguments) +add_dependencies(check unittest_erasure_code_shec_arguments) +target_link_libraries(unittest_erasure_code_shec_arguments global osd dl common - erasure_code + ec_shec_generic ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) -set_target_properties(unittest_erasure_code_example PROPERTIES COMPILE_FLAGS +set_target_properties(unittest_erasure_code_shec_arguments PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) From 5268b3d79a4e5441127a79b3058ad50b233801d9 Mon Sep 17 00:00:00 2001 From: Ali Maredia Date: Tue, 18 Aug 2015 15:44:36 -0400 Subject: [PATCH 465/654] cmake: Removed scripts, check_PROGRAMS included Removed the unittest scripts for the time being. Built unittests included in check_PROGRAMS target. 
Signed-off-by: Ali Maredia --- src/CMakeLists.txt | 6 ++++ src/test/CMakeLists.txt | 76 +++++++++++------------------------------ 2 files changed, 25 insertions(+), 57 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7ded1c47dc6b2..114de3264bf84 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -697,6 +697,12 @@ target_link_libraries(cephfstool common ${EXTRALIBS}) set_target_properties(cephfstool PROPERTIES OUTPUT_NAME cephfs) install(TARGETS cephfstool DESTINATION bin) +set(compressor_srcs + compressor/Compressor.cc + compressor/AsyncCompressor.cc) +add_library(compressor STATIC ${compressor_srcs}) +target_link_libraries(compressor common snappy) + #set(ceph_srcs tools/ceph.cc tools/common.cc) #add_executable(ceph ${ceph_srcs}) #target_link_libraries(ceph global ${LIBEDIT_LIBS}) diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 267fcf0b93beb..0e81894825475 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -302,45 +302,6 @@ target_link_libraries(bench_log global pthread rt ${BLKID_LIBRARIES} ${CMAKE_DL_ ## Unit tests #make check starts here -add_custom_target(symlinks COMMAND - ln -sf ${CMAKE_SOURCE_DIR}/src/test/ ${CMAKE_BINARY_DIR}/src/test/ && - ln -sf ${CMAKE_BINARY_DIR}/src/ceph-mon ${CMAKE_BINARY_DIR}/src/test/ && - ln -sf ${CMAKE_BINARY_DIR}/ceph ${CMAKE_BINARY_DIR}/src/test/ && - ln -sf ${CMAKE_BINARY_DIR}/src/ceph-authtool ${CMAKE_BINARY_DIR}/src/test/ && - ln -sf ${CMAKE_BINARY_DIR}/src/ceph-conf ${CMAKE_BINARY_DIR}/src/test/ && - ln -sf ${CMAKE_BINARY_DIR}/src/ceph-osd ${CMAKE_BINARY_DIR}/src/test/ && - ln -sf ${CMAKE_SOURCE_DIR}/src/ceph-disk ${CMAKE_BINARY_DIR}/src/test/ && - ln -sf ${CMAKE_SOURCE_DIR}/qa/ ${CMAKE_BINARY_DIR}/src/ - COMMENT "Symlinks for test directory have been created") -add_dependencies(check symlinks) - -add_test(NAME ceph_objectstore_tool COMMAND python ${CMAKE_SOURCE_DIR}/src/test/ceph_objectstore_tool.py) -add_dependencies(check ceph_objectstore_tool) - 
-add_test(NAME ceph_argparse_py COMMAND python ${CMAKE_SOURCE_DIR}/src/test/pybind/test_ceph_argparse.py) -add_dependencies(check ceph_argparse_py) - -add_test(NAME unittest_bufferlist_shell COMMAND bash ${CMAKE_SOURCE_DIR}/src/unittest_bufferlist.sh) -add_dependencies(check unittest_bufferlist_shell) - -add_test(NAME check_generated COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/encoding/check-generated.sh) -add_dependencies(check check_generated) - -add_test(NAME misc COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/mon/misc.sh) -add_dependencies(check misc) - -add_test(NAME mkfs COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/mon/mkfs.sh) -add_dependencies(check mkfs) - -add_test(NAME ceph_disk COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/ceph-disk.sh) -add_dependencies(check ceph_disk) - -add_test(NAME mon_handle_forward COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/mon/mon-handle-forward.sh) -add_dependencies(check mon_handle_forward) - -add_test(NAME vstart_wrapped_tests COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/vstart_wrapped_tests.sh) -add_dependencies(check mon_handle_forward) - set(UNITTEST_LIBS gtest_main ${PTHREAD_LIBS}) set(UNITTEST_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CMAKE_SOURCE_DIR}/src/gtest/include -I${CMAKE_BINARY_DIR}/src/gtest/include -fno-strict-aliasing") @@ -1350,7 +1311,6 @@ add_executable(unittest_subprocess EXCLUDE_FROM_ALL test_subprocess.cc $ ) -add_test(unittest_subprocess unittest_subprocess) add_dependencies(check unittest_subprocess) target_link_libraries(unittest_subprocess global @@ -1368,20 +1328,20 @@ target_link_libraries(unittest_pageset ${UNITTEST_LIBS}) set_target_properties(unittest_pageset PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) -## unittest_async_compressor -#add_executable(unittest_async_compressor EXCLUDE_FROM_ALL -# common/test_async_compressor.cc -# $ -#) -#add_dependencies(check unittest_async_compressor) -#target_link_libraries(unittest_async_compressor -# global -# compressor -# ${CMAKE_DL_LIBS} -# ${TCMALLOC_LIBS} -# 
${UNITTEST_LIBS}) -#set_target_properties(unittest_async_compressor PROPERTIES COMPILE_FLAGS -# ${UNITTEST_CXX_FLAGS}) +# unittest_async_compressor +add_executable(unittest_async_compressor EXCLUDE_FROM_ALL + common/test_async_compressor.cc + $ +) +add_dependencies(check unittest_async_compressor) +target_link_libraries(unittest_async_compressor + global + compressor + ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS} + ${UNITTEST_LIBS}) +set_target_properties(unittest_async_compressor PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) add_subdirectory(erasure-code EXCLUDE_FROM_ALL) #make check ends here @@ -1534,7 +1494,8 @@ add_executable(multi_stress_watch target_link_libraries(multi_stress_watch librados global radostest ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS}) -add_executable(test_librbd +# unittest_librbd +add_executable(unittest_librbd EXCLUDE_FROM_ALL librbd/test_librbd.cc librbd/test_fixture.cc librbd/test_ImageWatcher.cc @@ -1545,9 +1506,10 @@ add_executable(test_librbd ${CMAKE_SOURCE_DIR}/src/common/TextTable.cc ${CMAKE_SOURCE_DIR}/src/common/secret.c ) -set_target_properties(test_librbd PROPERTIES COMPILE_FLAGS +add_dependencies(check unittest_librbd) +set_target_properties(unittest_librbd PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) -target_link_libraries(test_librbd +target_link_libraries(unittest_librbd librbd librados ${UNITTEST_LIBS} From 66ea78a2da44d6094fa837edcaa6083f49c3c5b4 Mon Sep 17 00:00:00 2001 From: Ali Maredia Date: Tue, 18 Aug 2015 17:03:58 -0400 Subject: [PATCH 466/654] cmake: Fixed rbd_replay build issue Signed-off-by: Ali Maredia --- src/CMakeLists.txt | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 114de3264bf84..dd7852246346e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -862,6 +862,7 @@ if(${WITH_RBD}) install(PROGRAMS ${CMAKE_SOURCE_DIR}/src/ceph-rbdnamer DESTINATION bin) set(librbd_replay_srcs + rbd_replay/actions.cc rbd_replay/Deser.cc 
rbd_replay/ImageNameMap.cc rbd_replay/PendingIO.cc @@ -870,9 +871,13 @@ if(${WITH_RBD}) rbd_replay/Ser.cc) add_library(librbd_replay STATIC ${librbd_replay_srcs}) target_link_libraries(librbd_replay PRIVATE librbd librados global udev) + add_executable(rbd_replay - rbd_replay/rbd-replay.cc) - target_link_libraries(rbd_replay librbd librados global librbd_replay) + rbd_replay/rbd-replay.cc + $ + ) + target_link_libraries(rbd_replay librbd librados global librbd_replay keyutils) + install(TARGETS rbd_replay DESTINATION bin) set(librbd_replay_ios_srcs rbd_replay/ios.cc) From 283e81c215d270e641ca66497d78cfd1f3adc3da Mon Sep 17 00:00:00 2001 From: Ali Maredia Date: Wed, 19 Aug 2015 16:15:46 -0400 Subject: [PATCH 467/654] cmake: Removed traces of CDS, minor cmake fixes Signed-off-by: Ali Maredia --- cmake/modules/Findcds.cmake | 37 --------------------------------- src/CMakeLists.txt | 4 ---- src/erasure-code/CMakeLists.txt | 7 ++++--- src/include/config-h.in.cmake | 3 --- src/test/CMakeLists.txt | 35 +------------------------------ 5 files changed, 5 insertions(+), 81 deletions(-) delete mode 100644 cmake/modules/Findcds.cmake diff --git a/cmake/modules/Findcds.cmake b/cmake/modules/Findcds.cmake deleted file mode 100644 index b22dc025b9de7..0000000000000 --- a/cmake/modules/Findcds.cmake +++ /dev/null @@ -1,37 +0,0 @@ -# - Find cds -# -# CDS_INCLUDE_DIR - where to find cds/init.h -# FIO_FOUND - True if found. 
- -find_path(CDS_INC_DIR cds/init.h NO_DEFAULT_PATH PATHS - /usr/include - /opt/local/include - /usr/local/include - /opt/cds -) - -if (CDS_INC_DIR) - set(CDS_FOUND TRUE) -else () - set(CDS_FOUND FALSE) -endif () - -if (CDS_FOUND) - message(STATUS "Found cds: ${CDS_INC_DIR}") -else () - message(STATUS "Failed to find cds/init.h") - if (CDS_FIND_REQUIRED) - message(FATAL_ERROR "Missing required cds/init.h") - endif () -endif () - -find_library(CDS_LIBS - NAMES cds - PATHS /usr/lib /usr/lib/x86_64-linux-gnu /opt/cds/bin/gcc-amd64-linux-64 -) - -mark_as_advanced( - CDS_INC_DIR - CDS_LIBS -) - diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index dd7852246346e..6cb44c613b693 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -75,10 +75,6 @@ if(${WITH_PROFILER}) list(APPEND EXTRALIBS profiler) endif(${WITH_PROFILER}) -if(WITH_CDS) - list(APPEND EXTRALIBS ${CDS_LIBS}) -endif(WITH_CDS) - if(USE_NSS) if(NSS_FOUND) if(NSPR_FOUND) diff --git a/src/erasure-code/CMakeLists.txt b/src/erasure-code/CMakeLists.txt index 8bcba45b2767f..60112930fa359 100644 --- a/src/erasure-code/CMakeLists.txt +++ b/src/erasure-code/CMakeLists.txt @@ -6,9 +6,10 @@ add_subdirectory(jerasure) add_subdirectory(lrc) add_subdirectory(shec) -if (HAVE_BETTER_YASM_ELF64) - add_subdirectory(isa) -endif (HAVE_BETTER_YASM_ELF64) +#TODO: Understand and fix this conditional +#if (HAVE_BETTER_YASM_ELF64) +# add_subdirectory(isa) +#endif (HAVE_BETTER_YASM_ELF64) add_library(erasure_code ErasureCodePlugin.cc) target_link_libraries(erasure_code dl) diff --git a/src/include/config-h.in.cmake b/src/include/config-h.in.cmake index fbd2e634178c3..84514915e510e 100644 --- a/src/include/config-h.in.cmake +++ b/src/include/config-h.in.cmake @@ -232,7 +232,4 @@ /* Defined if XIO */ #cmakedefine HAVE_XIO -/* Defined if CDS */ -#cmakedefine HAVE_CDS - #endif /* CONFIG_H */ diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 0e81894825475..6dfe56f055c21 100644 --- a/src/test/CMakeLists.txt +++ 
b/src/test/CMakeLists.txt @@ -850,24 +850,6 @@ target_link_libraries(unittest_escape global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} set_target_properties(unittest_escape PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) -# unittest_chain_xattr -set(unittest_chain_xattr_srcs - objectstore/chain_xattr.cc - ) -add_executable(unittest_chain_xattr - ${unittest_chain_xattr_srcs} - $ - ) -set_target_properties(unittest_chain_xattr PROPERTIES COMPILE_FLAGS - ${UNITTEST_CXX_FLAGS}) -target_link_libraries(unittest_chain_xattr - os - global - ${CMAKE_DL_LIBS} - ${TCMALLOC_LIBS} - ${UNITTEST_LIBS} - ) - # unittest_strtol add_executable(unittest_strtol EXCLUDE_FROM_ALL strtol.cc @@ -932,22 +914,7 @@ target_link_libraries(unittest_chain_xattr ${TCMALLOC_LIBS} ${UNITTEST_LIBS} ) - -# unittest_flatindex -add_executable(unittest_flatindex EXCLUDE_FROM_ALL - os/TestFlatIndex.cc - $ - ) -add_test(unittest_flatindex unittest_flatindex) -add_dependencies(check unittest_flatindex) -target_link_libraries(unittest_flatindex - os - global - ${CMAKE_DL_LIBS} - ${TCMALLOC_LIBS} - ${UNITTEST_LIBS} - ) -set_target_properties(unittest_flatindex PROPERTIES COMPILE_FLAGS +set_target_properties(unittest_chain_xattr PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_safe_io From 1397f6602b4ea87a71285c080fa26c308161d32e Mon Sep 17 00:00:00 2001 From: Ali Maredia Date: Fri, 21 Aug 2015 13:33:05 -0400 Subject: [PATCH 468/654] cmake: Uncommented erasure-code/shec conditional Signed-off-by: Ali Maredia --- src/erasure-code/CMakeLists.txt | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/erasure-code/CMakeLists.txt b/src/erasure-code/CMakeLists.txt index 60112930fa359..8bcba45b2767f 100644 --- a/src/erasure-code/CMakeLists.txt +++ b/src/erasure-code/CMakeLists.txt @@ -6,10 +6,9 @@ add_subdirectory(jerasure) add_subdirectory(lrc) add_subdirectory(shec) -#TODO: Understand and fix this conditional -#if (HAVE_BETTER_YASM_ELF64) -# add_subdirectory(isa) -#endif 
(HAVE_BETTER_YASM_ELF64) +if (HAVE_BETTER_YASM_ELF64) + add_subdirectory(isa) +endif (HAVE_BETTER_YASM_ELF64) add_library(erasure_code ErasureCodePlugin.cc) target_link_libraries(erasure_code dl) From 74a9a0366baa37126612f1d21d63d6a3d1318ba8 Mon Sep 17 00:00:00 2001 From: Ali Maredia Date: Mon, 24 Aug 2015 14:11:01 -0400 Subject: [PATCH 469/654] cmake: Removed trailing spaces from isa .s files Signed-off-by: Ali Maredia --- src/erasure-code/isa/CMakeLists.txt | 70 ++++++++++++++--------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/src/erasure-code/isa/CMakeLists.txt b/src/erasure-code/isa/CMakeLists.txt index 446e44782fdab..923427441b7a7 100644 --- a/src/erasure-code/isa/CMakeLists.txt +++ b/src/erasure-code/isa/CMakeLists.txt @@ -3,46 +3,46 @@ include_directories(isa-l/include) set(isa_srcs - isa-l/erasure_code/ec_base.c - isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s - isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s - isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s - isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s - isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s + isa-l/erasure_code/ec_base.c + isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s + isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s + isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s + isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s + isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s - isa-l/erasure_code/gf_2vect_mad_avx2.asm.s - isa-l/erasure_code/gf_3vect_mad_avx2.asm.s - isa-l/erasure_code/gf_4vect_mad_avx2.asm.s - isa-l/erasure_code/gf_5vect_mad_avx2.asm.s - isa-l/erasure_code/gf_6vect_mad_avx2.asm.s + isa-l/erasure_code/gf_2vect_mad_avx2.asm.s + isa-l/erasure_code/gf_3vect_mad_avx2.asm.s + isa-l/erasure_code/gf_4vect_mad_avx2.asm.s + isa-l/erasure_code/gf_5vect_mad_avx2.asm.s + isa-l/erasure_code/gf_6vect_mad_avx2.asm.s isa-l/erasure_code/gf_vect_mad_avx2.asm.s - isa-l/erasure_code/ec_highlevel_func.c - 
isa-l/erasure_code/gf_2vect_mad_avx.asm.s - isa-l/erasure_code/gf_3vect_mad_avx.asm.s - isa-l/erasure_code/gf_4vect_mad_avx.asm.s - isa-l/erasure_code/gf_5vect_mad_avx.asm.s - isa-l/erasure_code/gf_6vect_mad_avx.asm.s + isa-l/erasure_code/ec_highlevel_func.c + isa-l/erasure_code/gf_2vect_mad_avx.asm.s + isa-l/erasure_code/gf_3vect_mad_avx.asm.s + isa-l/erasure_code/gf_4vect_mad_avx.asm.s + isa-l/erasure_code/gf_5vect_mad_avx.asm.s + isa-l/erasure_code/gf_6vect_mad_avx.asm.s isa-l/erasure_code/gf_vect_mad_avx.asm.s - isa-l/erasure_code/ec_multibinary.asm.s - isa-l/erasure_code/gf_2vect_mad_sse.asm.s - isa-l/erasure_code/gf_3vect_mad_sse.asm.s - isa-l/erasure_code/gf_4vect_mad_sse.asm.s - isa-l/erasure_code/gf_5vect_mad_sse.asm.s - isa-l/erasure_code/gf_6vect_mad_sse.asm.s + isa-l/erasure_code/ec_multibinary.asm.s + isa-l/erasure_code/gf_2vect_mad_sse.asm.s + isa-l/erasure_code/gf_3vect_mad_sse.asm.s + isa-l/erasure_code/gf_4vect_mad_sse.asm.s + isa-l/erasure_code/gf_5vect_mad_sse.asm.s + isa-l/erasure_code/gf_6vect_mad_sse.asm.s isa-l/erasure_code/gf_vect_mad_sse.asm.s - isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm.s - isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm.s - isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s - isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s - isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm.s - isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s + isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm.s + isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm.s + isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s + isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s + isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm.s + isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s isa-l/erasure_code/gf_vect_mul_avx.asm.s - isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s - isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s - isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s - isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s - isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s - 
isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s + isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s + isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s + isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s + isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s + isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s + isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s isa-l/erasure_code/gf_vect_mul_sse.asm.s ErasureCodeIsa.cc ErasureCodeIsaTableCache.cc From 55b7c86c480246f390ea95aead21435cab64eaab Mon Sep 17 00:00:00 2001 From: Ali Maredia Date: Mon, 24 Aug 2015 15:32:53 -0400 Subject: [PATCH 470/654] cmake: Fixed HAVE_BETTER_YASM_ELF64 variable Signed-off-by: Ali Maredia --- src/test/erasure-code/CMakeLists.txt | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/test/erasure-code/CMakeLists.txt b/src/test/erasure-code/CMakeLists.txt index 4ef8430dade6a..3ce5641420a5e 100644 --- a/src/test/erasure-code/CMakeLists.txt +++ b/src/test/erasure-code/CMakeLists.txt @@ -50,9 +50,7 @@ target_link_libraries(unittest_erasure_code_jerasure_plugin set_target_properties(unittest_erasure_code_jerasure_plugin PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) -#not sure how to check for YASM_EFL64 right now, setting it true -set(WITH_BETTER_YASM_ELF64 1) -if(WITH_BETTER_YASM_ELF64) +if(HAVE_BETTER_YASM_ELF64) #unittest_erasure_code_isa add_executable(unittest_erasure_code_isa EXCLUDE_FROM_ALL @@ -94,7 +92,7 @@ target_link_libraries(unittest_erasure_code_plugin_isa ${UNITTEST_LIBS}) set_target_properties(unittest_erasure_code_plugin_isa PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) -endif(WITH_BETTER_YASM_ELF64) +endif(HAVE_BETTER_YASM_ELF64) # unittest_erasure_code_lrc add_executable(unittest_erasure_code_lrc EXCLUDE_FROM_ALL From 18d619646f0a952a2525ea6c1494016585d5779e Mon Sep 17 00:00:00 2001 From: Ali Maredia Date: Mon, 24 Aug 2015 18:01:09 -0400 Subject: [PATCH 471/654] cmake: Added shell script tests Signed-off-by: Ali Maredia --- src/test/CMakeLists.txt | 97 
++++++++++++++++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-) diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 6dfe56f055c21..5885ceb351c76 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -301,6 +301,101 @@ target_link_libraries(bench_log global pthread rt ${BLKID_LIBRARIES} ${CMAKE_DL_ ## Unit tests #make check starts here +add_custom_target(symlinks COMMAND + ln -sf ${CMAKE_SOURCE_DIR}/src/test/ ${CMAKE_BINARY_DIR}/src/test/ && + ln -sf ${CMAKE_BINARY_DIR}/src/ceph-mon ${CMAKE_BINARY_DIR}/src/test/ && + ln -sf ${CMAKE_BINARY_DIR}/ceph ${CMAKE_BINARY_DIR}/src/test/ && + ln -sf ${CMAKE_BINARY_DIR}/src/ceph-authtool ${CMAKE_BINARY_DIR}/src/test/ && + ln -sf ${CMAKE_BINARY_DIR}/src/ceph-conf ${CMAKE_BINARY_DIR}/src/test/ && + ln -sf ${CMAKE_BINARY_DIR}/src/ceph-osd ${CMAKE_BINARY_DIR}/src/test/ && + ln -sf ${CMAKE_SOURCE_DIR}/src/ceph-disk ${CMAKE_BINARY_DIR}/src/test/ && + ln -sf ${CMAKE_SOURCE_DIR}/qa/ ${CMAKE_BINARY_DIR}/src/ + COMMENT "Symlinks for test scripts have been created") +add_dependencies(check symlinks) + +add_test(NAME unittest_bufferlist_shell COMMAND bash ${CMAKE_SOURCE_DIR}/src/unittest_bufferlist.sh) +add_dependencies(check unittest_bufferlist_shell) + +add_test(NAME run_tox COMMAND bash ${CMAKE_SOURCE_DIR}/src/ceph-detect-init/run-tox.sh) +add_dependencies(check run_tox) + +add_test(NAME cephtool_test_osd COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/cephtool-test-osd.sh) +add_dependencies(check cephtool_test_osd) + +add_test(NAME check_generated COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/encoding/check-generated.sh) +add_dependencies(check check_generated) + +add_test(NAME cephtool_test_rados COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/cephtool-test-rados.sh) +add_dependencies(check cephtool_test_rados) + +add_test(NAME run_rbd_unit_tests COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/run-rbd-unit-tests.sh) +add_dependencies(check run_rbd_unit_tests) + +add_test(NAME mkfs COMMAND bash 
${CMAKE_SOURCE_DIR}/src/test/mon/mkfs.sh) +add_dependencies(check mkfs) + +add_test(NAME misc COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/mon/misc.sh) +add_dependencies(check misc) + +add_test(NAME osd_config COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/osd/osd-config.sh) +add_dependencies(check osd_config) + +add_test(NAME osd_pool_create COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/mon/osd-pool-create.sh) +add_dependencies(check osd_pool_create) + +add_test(NAME osd_copy_from COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/osd/osd-copy-from.sh) +add_dependencies(check osd_copy_from) + +add_test(NAME mon_handle_forward COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/mon/mon-handle-forward.sh) +add_dependencies(check mon_handle_forward) + +add_test(NAME osd_erasure_code_profile COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/mon/osd-erasure-code-profile.sh) +add_dependencies(check osd_erasure_code_profile) + +add_test(NAME osd_crush COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/mon/osd-crush.sh) +add_dependencies(check osd_crush) + +add_test(NAME test_ceph_daemon COMMAND python ${CMAKE_SOURCE_DIR}/src/test/pybind/test_ceph_daemon.py) +add_dependencies(check test_ceph_daemon) + +add_test(NAME rados_striper COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/libradosstriper/rados-striper.sh) +add_dependencies(check rados_striper) + +add_test(NAME osd_bench COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/osd/osd-bench.sh) +add_dependencies(check osd_bench) + +add_test(NAME test_erasure_code COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/erasure-code/test-erasure-code.sh) +add_dependencies(check test_erasure_code) + +add_test(NAME cephtool_test_mds COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/cephtool-test-mds.sh) +add_dependencies(check cephtool_test_mds) + +add_test(NAME test_ceph_argparse_py COMMAND python ${CMAKE_SOURCE_DIR}/src/test/pybind/test_ceph_argparse.py) +add_dependencies(check test_ceph_argparse_py) + +add_test(NAME test_erasure_eio COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/erasure-code/test-erasure-eio.sh) 
+add_dependencies(check test_erasure_eio) + +add_test(NAME ceph_disk COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/ceph-disk.sh) +add_dependencies(check ceph_disk) + +add_test(NAME readable COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/encoding/readable.sh) +add_dependencies(check readable) + +add_test(NAME ceph_objectstore_tool COMMAND python ${CMAKE_SOURCE_DIR}/src/test/ceph_objectstore_tool.py) +add_dependencies(check ceph_objectstore_tool) + +add_test(NAME test_ceph_helpers COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/test-ceph-helpers.sh) +add_dependencies(check test_ceph_helpers) + +add_test(NAME cephtool_test_mon COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/cephtool-test-mon.sh) +add_dependencies(check cephtool_test_mon) + +add_test(NAME encode_decode_non_regression COMMAND bash ${CMAKE_SOURCE_DIR}/qa/workunits/erasure-code/encode-decode-non-regression.sh) +add_dependencies(check encode_decode_non_regression) + +add_test(NAME osd_scrub_repair COMMAND bash ${CMAKE_SOURCE_DIR}/src/osd/osd-scrub-repair.sh) +add_dependencies(check osd_scrub_repair) set(UNITTEST_LIBS gtest_main ${PTHREAD_LIBS}) set(UNITTEST_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CMAKE_SOURCE_DIR}/src/gtest/include -I${CMAKE_BINARY_DIR}/src/gtest/include -fno-strict-aliasing") @@ -730,7 +825,7 @@ target_link_libraries(unittest_librados set_target_properties(unittest_librados PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) -# unittest_crc32 +# unittest_crc32c add_executable(unittest_crc32 EXCLUDE_FROM_ALL common/test_crc32c.cc $ From 57b47b49a57ef36f28c167e3db95c845d53be478 Mon Sep 17 00:00:00 2001 From: Ali Maredia Date: Tue, 25 Aug 2015 13:49:23 -0400 Subject: [PATCH 472/654] cmake: Changed name of crc32 target to crc32c Signed-off-by: Ali Maredia --- src/test/CMakeLists.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 5885ceb351c76..76c364819df80 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -826,15 
+826,15 @@ set_target_properties(unittest_librados PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_crc32c -add_executable(unittest_crc32 EXCLUDE_FROM_ALL +add_executable(unittest_crc32c EXCLUDE_FROM_ALL common/test_crc32c.cc $ ) -add_test(unittest_crc32 unittest_crc32) -add_dependencies(check unittest_crc32) -target_link_libraries(unittest_crc32 global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} +add_test(unittest_crc32c unittest_crc32c) +add_dependencies(check unittest_crc32c) +target_link_libraries(unittest_crc32c global ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} ${UNITTEST_LIBS}) -set_target_properties(unittest_crc32 PROPERTIES COMPILE_FLAGS +set_target_properties(unittest_crc32c PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS}) # unittest_arch From 4cea74a7ce8f3b8c0973e9b6fdb163062655d7a2 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Tue, 1 Sep 2015 15:33:31 -0400 Subject: [PATCH 473/654] cmake: add blkid as dependency to libcommon Signed-off-by: Casey Bodley --- src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6cb44c613b693..798684f718cc9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -341,7 +341,7 @@ endif(${WITH_PROFILER}) add_library(common_utf8 STATIC common/utf8.c) -target_link_libraries( common json_spirit common_utf8 erasure_code rt uuid ${CRYPTO_LIBS} ${Boost_LIBRARIES}) +target_link_libraries( common json_spirit common_utf8 erasure_code rt uuid ${CRYPTO_LIBS} ${Boost_LIBRARIES} ${BLKID_LIBRARIES}) set(libglobal_srcs global/global_init.cc From fab0a3b8084c46fafd5aa4af0da40a52d6f26b60 Mon Sep 17 00:00:00 2001 From: Matt Benjamin Date: Tue, 25 Aug 2015 13:49:25 -0400 Subject: [PATCH 474/654] cmake: install crushtool to destdir/bin Signed-off-by: Matt Benjamin --- src/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 798684f718cc9..7206e73bcd84e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ 
-683,6 +683,7 @@ set(crushtool_srcs tools/crushtool.cc) add_executable(crushtool ${crushtool_srcs}) target_link_libraries(crushtool global) +install(TARGETS crushtool DESTINATION bin) # Support/Tools add_subdirectory(gmock) From ef7418421b3748c712019c8aedd02b8005c1e1ea Mon Sep 17 00:00:00 2001 From: Nathan Cutler Date: Thu, 3 Sep 2015 20:30:50 +0200 Subject: [PATCH 475/654] ceph.spec.in: fix License line This is closer to my reading of https://github.com/ceph/ceph/blob/master/COPYING than the previous version. http://tracker.ceph.com/issues/12935 Fixes: #12935 Signed-off-by: Nathan Cutler --- ceph.spec.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ceph.spec.in b/ceph.spec.in index a8374dc4f35eb..bc2f58577e353 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -46,7 +46,7 @@ Version: @VERSION@ Release: @RPM_RELEASE@%{?dist} Epoch: 1 Summary: User space components of the Ceph file system -License: GPL-2.0 +License: LGPL-2.1 and CC-BY-SA-1.0 and GPL-2.0 and BSL-1.0 and GPL-2.0-with-autoconf-exception and BSD-3-Clause and MIT Group: System Environment/Base URL: http://ceph.com/ Source0: http://ceph.com/download/%{name}-%{version}.tar.bz2 From 50bc48b141dfe30c1f172cf5171cd668ac141b1c Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Thu, 3 Sep 2015 17:27:18 +0200 Subject: [PATCH 476/654] tests: ceph-erasure-code-corpus must test SIMD variants ceph_erasure_code.cc and ceph_erasure_code_benchmark.cc failed to load the plugins. 
It went unnoticed when 660ae5bcbb250b06cf88ec7f9a3f37b05c6c8118 was reviewed because * ceph_erasure_code_benchmark is not used in make check * qa/workunits/erasure-code/encode-decode-non-regression.sh silently interpreted the failure as the absence of SIMD variants http://tracker.ceph.com/issues/12933 Fixes: #12933 Signed-off-by: Loic Dachary --- ceph-erasure-code-corpus | 2 +- src/test/erasure-code/ceph_erasure_code.cc | 1 + src/test/erasure-code/ceph_erasure_code_benchmark.cc | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ceph-erasure-code-corpus b/ceph-erasure-code-corpus index dc409e0b2095e..b0d1137d31e4b 160000 --- a/ceph-erasure-code-corpus +++ b/ceph-erasure-code-corpus @@ -1 +1 @@ -Subproject commit dc409e0b2095eeb960518ab9c8ee47a34264f4c1 +Subproject commit b0d1137d31e4b36b72ccae9c0a9a13de2ec82faa diff --git a/src/test/erasure-code/ceph_erasure_code.cc b/src/test/erasure-code/ceph_erasure_code.cc index fd688d542e080..00d44964760a4 100644 --- a/src/test/erasure-code/ceph_erasure_code.cc +++ b/src/test/erasure-code/ceph_erasure_code.cc @@ -88,6 +88,7 @@ int ErasureCodeCommand::setup(int argc, char** argv) { CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); common_init_finish(g_ceph_context); g_ceph_context->_conf->apply_changes(NULL); + g_conf->set_val("erasure_code_dir", ".libs", false, false); if (vm.count("help")) { cout << desc << std::endl; diff --git a/src/test/erasure-code/ceph_erasure_code_benchmark.cc b/src/test/erasure-code/ceph_erasure_code_benchmark.cc index 31a73d45a9fd3..052d8fef22146 100644 --- a/src/test/erasure-code/ceph_erasure_code_benchmark.cc +++ b/src/test/erasure-code/ceph_erasure_code_benchmark.cc @@ -87,6 +87,7 @@ int ErasureCodeBench::setup(int argc, char** argv) { CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); common_init_finish(g_ceph_context); g_ceph_context->_conf->apply_changes(NULL); + g_conf->set_val("erasure_code_dir", ".libs", false, false); if (vm.count("help")) { cout << desc << std::endl; From 
f69498f3203c3ef7ac376ca454c42e1017aac4ba Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Sep 2015 14:59:37 -0400 Subject: [PATCH 477/654] debian/control: build-requires libboost-regex-dev Signed-off-by: Sage Weil --- debian/control | 1 + 1 file changed, 1 insertion(+) diff --git a/debian/control b/debian/control index d4d6eb6412f6e..be24a77a5db36 100644 --- a/debian/control +++ b/debian/control @@ -27,6 +27,7 @@ Build-Depends: autoconf, libboost-program-options-dev (>= 1.42), libboost-system-dev (>= 1.42), libboost-thread-dev (>= 1.42), + libboost-regex-dev, libboost-random-dev, libcurl4-gnutls-dev, libedit-dev, From 322ad808cd3d48536e679b0dfb8b243af9d58c8a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Sep 2015 15:01:34 -0400 Subject: [PATCH 478/654] debian/control: build requires cmake Signed-off-by: Sage Weil --- debian/control | 1 + 1 file changed, 1 insertion(+) diff --git a/debian/control b/debian/control index be24a77a5db36..363bb08d4a71a 100644 --- a/debian/control +++ b/debian/control @@ -11,6 +11,7 @@ Build-Depends: autoconf, automake, autotools-dev, libbz2-dev, + cmake, cryptsetup-bin | cryptsetup, debhelper (>= 6.0.7~), default-jdk, From df44a579495c68bfbf10cf15212998a8c75095c1 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Sep 2015 15:01:53 -0400 Subject: [PATCH 479/654] ceph.spec: build requires cmake Signed-off-by: Sage Weil --- ceph.spec.in | 1 + 1 file changed, 1 insertion(+) diff --git a/ceph.spec.in b/ceph.spec.in index a8374dc4f35eb..97a0063fb32fc 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -93,6 +93,7 @@ BuildRequires: /usr/share/selinux/devel/policyhelp BuildRequires: gcc-c++ BuildRequires: boost-devel BuildRequires: boost-random +BuildRequires: cmake BuildRequires: cryptsetup BuildRequires: gdbm BuildRequires: hdparm From 64a0f0c0141570699b413224b6f21f63ac6cec32 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Sep 2015 15:13:40 -0400 Subject: [PATCH 480/654] rgw/Makefile.am: ship rgw_object_expirer_core.h 
Signed-off-by: Sage Weil --- src/rgw/Makefile.am | 1 + 1 file changed, 1 insertion(+) diff --git a/src/rgw/Makefile.am b/src/rgw/Makefile.am index c1baa6a3490da..9e330a1cdb045 100644 --- a/src/rgw/Makefile.am +++ b/src/rgw/Makefile.am @@ -146,6 +146,7 @@ noinst_HEADERS += \ rgw/rgw_gc.h \ rgw/rgw_metadata.h \ rgw/rgw_multi_del.h \ + rgw/rgw_object_expirer_core.h \ rgw/rgw_op.h \ rgw/rgw_orphan.h \ rgw/rgw_http_client.h \ From 86da3735231cb46fe16ca4bb9969ac94d6bdff76 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Sep 2015 15:27:34 -0400 Subject: [PATCH 481/654] .gitignore: ignore build (usually used by cmake) Signed-off-by: Sage Weil --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 01cccf5507345..557861a19335e 100644 --- a/.gitignore +++ b/.gitignore @@ -85,3 +85,6 @@ GTAGS /examples/librados/.libs/ /examples/librados/librados_hello_world /examples/librados/librados_hello_world_c + +# common cmake build dir +/build From 6d459c02443a6e3d4efc41a4615c5d5a63823b0d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Sep 2015 15:27:45 -0400 Subject: [PATCH 482/654] CMakeLists.txt: add newstore files Signed-off-by: Sage Weil --- src/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 71eefc369c93a..814228bf35c75 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -573,7 +573,11 @@ set(libos_srcs os/KeyValueDB.cc os/MemStore.cc os/GenericObjectMap.cc - os/HashIndex.cc) + os/HashIndex.cc + os/newstore/NewStore.cc + os/newstore/newstore_types.cc + os/fs/FS.cc + os/fs/XFS.cc) set(os_mon_files os/LevelDBStore.cc) add_library(os_mon_objs OBJECT ${os_mon_files}) From b092bd027358186831543436dc18585531f3cc63 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 1 Sep 2015 15:07:11 -0400 Subject: [PATCH 483/654] ceph-disk: use blkid for get_partition_{type,uuid} Signed-off-by: Sage Weil --- src/ceph-disk | 53 
++++++++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/src/ceph-disk b/src/ceph-disk index 61d2b651ed4ad..bb57726830762 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -2649,23 +2649,6 @@ def get_dev_fs(dev): else: return None -def get_dev_udev_properties(dev): - out, _ = command( - [ - '/sbin/blkid', - '-o', - 'udev', - '-p', - dev, - ] - ) - p = {} - for line in out.split('\n'): - if line: - (key, value) = line.split('=') - p[key] = value - return p - def split_dev_base_partnum(dev): if is_mpath(dev): partnum = partnum_mpath(dev) @@ -2677,10 +2660,31 @@ def split_dev_base_partnum(dev): return (base, partnum) def get_partition_type(part): - return get_sgdisk_partition_info(part, 'Partition GUID code: (\S+)') + return get_blkid_partition_info(part, 'ID_PART_ENTRY_TYPE') + #return get_sgdisk_partition_info(part, 'Partition GUID code: (\S+)') def get_partition_uuid(part): - return get_sgdisk_partition_info(part, 'Partition unique GUID: (\S+)') + return get_blkid_partition_info(part, 'ID_PART_ENTRY_UUID') + #return get_sgdisk_partition_info(part, 'Partition unique GUID: (\S+)') + +def get_blkid_partition_info(dev, what=None): + out, _ = command( + [ + '/sbin/blkid', + '-o', + 'udev', + '-p', + dev, + ] + ) + p = {} + for line in out.splitlines(): + (key, value) = line.split('=') + p[key] = value + if what: + return p.get(what) + else: + return p def get_sgdisk_partition_info(dev, regexp): (base, partnum) = split_dev_base_partnum(dev) @@ -3018,15 +3022,8 @@ def main_trigger(args): ) return - p = get_dev_udev_properties(args.dev) - - if 'ID_PART_ENTRY_TYPE' not in p: - raise Error('no ID_PART_ENTRY_TYPE for %s' % args.dev) - parttype = p['ID_PART_ENTRY_TYPE'] - - if 'ID_PART_ENTRY_UUID' not in p: - raise Error('no ID_PART_ENTRY_UUID for %s' % args.dev) - partid = p['ID_PART_ENTRY_UUID'] + parttype = get_partition_type(args.dev) + partid = get_partition_uuid(args.dev) LOG.info('trigger {dev} parttype 
{parttype} uuid {partid}'.format( dev=args.dev, From bde6ab322242b0a5f054cc25abc197eb7f325228 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Sep 2015 16:27:31 -0400 Subject: [PATCH 484/654] ceph-disk: is_upstart() Signed-off-by: Sage Weil --- src/ceph-disk | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/ceph-disk b/src/ceph-disk index bb57726830762..2015a0c48652f 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -242,6 +242,15 @@ def is_systemd(): return True return False +def is_upstart(): + """ + Detect whether upstart is running + """ + (out, _) = command(['init', '--version']) + if 'upstart' in out: + return True + return False + def maybe_mkdir(*a, **kw): """ Creates a new directory if it doesn't exist, removes From 0f974a3e54168eae7b6ad0934c25cf703d43cbac Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Sep 2015 17:01:21 -0400 Subject: [PATCH 485/654] ceph-disk: use async upstart job for trigger Signed-off-by: Sage Weil --- src/Makefile.am | 1 + src/ceph-disk | 12 ++++++++++++ src/upstart/ceph-disk.conf | 9 +++++++++ 3 files changed, 22 insertions(+) create mode 100644 src/upstart/ceph-disk.conf diff --git a/src/Makefile.am b/src/Makefile.am index 502f83a014fc4..6ba07195a55ee 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -81,6 +81,7 @@ EXTRA_DIST += \ $(srcdir)/ceph-rbdnamer \ $(srcdir)/tools/ceph-monstore-update-crush.sh \ $(srcdir)/upstart/ceph-all.conf \ + $(srcdir)/upstart/ceph-disk.conf \ $(srcdir)/upstart/ceph-mon.conf \ $(srcdir)/upstart/ceph-mon-all.conf \ $(srcdir)/upstart/ceph-mon-all-starter.conf \ diff --git a/src/ceph-disk b/src/ceph-disk index 2015a0c48652f..b04c54bd5e8fa 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -3030,6 +3030,18 @@ def main_trigger(args): ] ) return + if is_upstart() and not args.sync: + LOG.info('upstart detected, triggering ceph-disk task') + command( + [ + 'initctl', + 'emit', + 'ceph-disk', + 'dev={dev}'.format(dev=args.dev), + 'pid={pid}'.format(pid=os.getpid()), + ] + ) + return 
parttype = get_partition_type(args.dev) partid = get_partition_uuid(args.dev) diff --git a/src/upstart/ceph-disk.conf b/src/upstart/ceph-disk.conf new file mode 100644 index 0000000000000..a388785f50849 --- /dev/null +++ b/src/upstart/ceph-disk.conf @@ -0,0 +1,9 @@ +description "ceph-disk async worker" + +start on ceph-disk + +instance $dev/$pid +export dev +export pid + +exec ceph-disk --verbose --log-stdout trigger --sync $dev From 3ce06e185caf96c4bf8d930ac8e77b03d59b3c4d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Sep 2015 17:10:01 -0400 Subject: [PATCH 486/654] ceph-disk: set ownership of newly mapped dm device This needs to be ceph:ceph or else things fail in confusin ways later. Signed-off-by: Sage Weil --- src/ceph-disk | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ceph-disk b/src/ceph-disk index b04c54bd5e8fa..006e84a76ca93 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -1049,6 +1049,8 @@ def dmcrypt_map( else: # Plain mode has no format function, nor any validation that the key is correct. 
command_check_call(create_args) + # set proper ownership of mapped device + command_check_call(['chown', 'ceph:ceph', dev]) return dev except subprocess.CalledProcessError as e: From fbf4e6bebf7550f5ff933b7667f4ee28b6eb38fd Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Sep 2015 18:41:26 -0400 Subject: [PATCH 487/654] ceph.spec: package new rgw files /usr/bin/radosgw-object-expirer /usr/lib64/rados-classes/libcls_timeindex.so Signed-off-by: Sage Weil --- ceph.spec.in | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ceph.spec.in b/ceph.spec.in index 97a0063fb32fc..c2824f7ee6905 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -772,6 +772,7 @@ mkdir -p %{_localstatedir}/run/ceph/ %{_libdir}/rados-classes/libcls_log.so* %{_libdir}/rados-classes/libcls_replica_log.so* %{_libdir}/rados-classes/libcls_statelog.so* +%{_libdir}/rados-classes/libcls_timeindex.so* %{_libdir}/rados-classes/libcls_user.so* %{_libdir}/rados-classes/libcls_version.so* %dir %{_libdir}/ceph/erasure-code @@ -898,6 +899,7 @@ fi %defattr(-,root,root,-) %{_bindir}/radosgw %{_bindir}/radosgw-admin +%{_bindir}/radosgw-object-expirer %{_mandir}/man8/radosgw.8* %{_mandir}/man8/radosgw-admin.8* %config(noreplace) %{_sysconfdir}/logrotate.d/radosgw From 2d1d2efb1b30bb87acf0e995965c44e43bffc35d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Sep 2015 18:41:52 -0400 Subject: [PATCH 488/654] debian: package radosgw-object-expirer in radosgw deb Signed-off-by: Sage Weil --- debian/radosgw.install | 1 + 1 file changed, 1 insertion(+) diff --git a/debian/radosgw.install b/debian/radosgw.install index 1ed620e0ad35d..25f0cddaa107d 100644 --- a/debian/radosgw.install +++ b/debian/radosgw.install @@ -1,5 +1,6 @@ etc/bash_completion.d/radosgw-admin usr/bin/radosgw usr/bin/radosgw-admin +usr/bin/radosgw-object-expirer usr/share/man/man8/radosgw-admin.8 usr/share/man/man8/radosgw.8 From 1b9fbffdc24160251b96cec820d62fb2a12b6eab Mon Sep 17 00:00:00 2001 From: "Robin H. 
Johnson" Date: Fri, 4 Sep 2015 01:07:48 +0000 Subject: [PATCH 489/654] Fix casing of Content-Type header It turns out, despite the HTTP spec declaring that header field names should be case-insensitive, some clients treat them wrongly, and consider "Content-type" to not match "Content-Type". CyberDuck was one of those clients, now fixed upstream in https://trac.cyberduck.io/ticket/8999 To reduce future occurances of this bug, fix the casing of the Content-Type header, to strictly comply with the HTTP specification (be strict about what you send, and generous about what you receive). Fixes: #12939 Backport: infernalis, hammer, firefly Signed-off-by: Robin H. Johnson --- src/rgw/rgw_rest.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc index c69d7ecefc20e..1b96ad9cb3dfd 100644 --- a/src/rgw/rgw_rest.cc +++ b/src/rgw/rgw_rest.cc @@ -557,7 +557,7 @@ void end_header(struct req_state *s, RGWOp *op, const char *content_type, const int r; if (content_type) { - r = s->cio->print("Content-type: %s\r\n", content_type); + r = s->cio->print("Content-Type: %s\r\n", content_type); if (r < 0) { ldout(s->cct, 0) << "ERROR: s->cio->print() returned err=" << r << dendl; } From c3d379429e9d63e573e6ef820895c44f778e6dd6 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Sep 2015 21:57:56 -0400 Subject: [PATCH 490/654] test_cls_numops: fix iterator use CID 1322828 (#1 of 1): Wrapper object use after free (WRAPPER_ESCAPE) 28. use_after_free: Using invalidated internal representation of local it. CID 1322827 (#1 of 1): Wrapper object use after free (WRAPPER_ESCAPE) 25. use_after_free: Using invalidated internal representation of local it. CID 1322826 (#1 of 1): Wrapper object use after free (WRAPPER_ESCAPE) 31. use_after_free: Using invalidated internal representation of local it. CID 1322825 (#1 of 1): Wrapper object use after free (WRAPPER_ESCAPE) 31. use_after_free: Using invalidated internal representation of local it. 
Signed-off-by: Sage Weil --- src/test/cls_numops/test_cls_numops.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/test/cls_numops/test_cls_numops.cc b/src/test/cls_numops/test_cls_numops.cc index 8abf110581e0f..844caf993cf1d 100644 --- a/src/test/cls_numops/test_cls_numops.cc +++ b/src/test/cls_numops/test_cls_numops.cc @@ -77,7 +77,7 @@ TEST(ClsNumOps, Add) { ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap)); - omap.find(key); + it = omap.find(key); ASSERT_NE(omap.end(), it); @@ -166,7 +166,7 @@ TEST(ClsNumOps, Sub) { ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap)); - omap.find(key); + it = omap.find(key); ASSERT_NE(omap.end(), it); @@ -269,7 +269,7 @@ TEST(ClsNumOps, Mul) { ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap)); - omap.find(key); + it = omap.find(key); ASSERT_NE(omap.end(), it); @@ -370,7 +370,7 @@ TEST(ClsNumOps, Div) { ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap)); - omap.find(key); + it = omap.find(key); ASSERT_NE(omap.end(), it); From 15fa1382510375b9170c7ca1b1f6ebb083e9a199 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Sep 2015 21:59:00 -0400 Subject: [PATCH 491/654] osd/osd_types: init coll_t::removal_seq in all ctors CID 1322784 (#1 of 1): Uninitialized scalar variable (UNINIT) 2. uninit_use_in_call: Using uninitialized value coll.removal_seq when calling coll_t. 
[show details] Signed-off-by: Sage Weil --- src/osd/osd_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 2842c84970e7b..c7503c11397d2 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -533,7 +533,7 @@ class coll_t { } explicit coll_t(spg_t pgid) - : type(TYPE_PG), pgid(pgid) + : type(TYPE_PG), pgid(pgid), removal_seq(0) { calc_str(); } From f6f442d4da11b39ac529430162d7b8533cc9a803 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Sep 2015 22:01:19 -0400 Subject: [PATCH 492/654] osd/ReplicatedPG: ProxyWriteOp::reqid should not be a ref CID 1322778 (#1 of 1): Pointer to local outside scope (RETURN_LOCAL) 1. escape_local_addr: Returning, through this->reqid, the address of stack variable _reqid. 2. return: Returning here. Signed-off-by: Sage Weil --- src/osd/ReplicatedPG.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 4517e68730ebb..31491801ffc0e 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -221,7 +221,7 @@ class ReplicatedPG : public PG, public PGBackend::Listener { bool sent_ack; utime_t mtime; bool canceled; - osd_reqid_t &reqid; + osd_reqid_t reqid; ProxyWriteOp(OpRequestRef _op, hobject_t oid, vector& _ops, osd_reqid_t _reqid) : ctx(NULL), op(_op), soid(oid), From eceadee257ea6134ab9573e70cee7cce2d56b086 Mon Sep 17 00:00:00 2001 From: Jason Dillaman Date: Wed, 2 Sep 2015 23:30:05 -0400 Subject: [PATCH 493/654] Throttle: added new OrderedThrottle class It is similar to the SimpleThrottle in usage but intercepts Context callbacks to ensure they are completed in-order. 
Signed-off-by: Jason Dillaman --- src/common/Throttle.cc | 86 ++++++++++++++++++++++++++++++++++++++++++ src/common/Throttle.h | 68 +++++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+) diff --git a/src/common/Throttle.cc b/src/common/Throttle.cc index 307c0ec9e4b85..d117794606be8 100644 --- a/src/common/Throttle.cc +++ b/src/common/Throttle.cc @@ -282,3 +282,89 @@ int SimpleThrottle::wait_for_ret() m_cond.Wait(m_lock); return m_ret; } + +void C_OrderedThrottle::finish(int r) { + m_ordered_throttle->finish_op(m_tid, r); +} + +OrderedThrottle::OrderedThrottle(uint64_t max, bool ignore_enoent) + : m_lock("OrderedThrottle::m_lock"), m_max(max), m_current(0), m_ret_val(0), + m_ignore_enoent(ignore_enoent), m_next_tid(0), m_complete_tid(0) { +} + +C_OrderedThrottle *OrderedThrottle::start_op(Context *on_finish) { + assert(on_finish != NULL); + + Mutex::Locker locker(m_lock); + uint64_t tid = m_next_tid++; + m_tid_result[tid] = Result(on_finish); + C_OrderedThrottle *ctx = new C_OrderedThrottle(this, tid); + + complete_pending_ops(); + while (m_max == m_current) { + m_cond.Wait(m_lock); + complete_pending_ops(); + } + ++m_current; + + return ctx; +} + +void OrderedThrottle::end_op(int r) { + Mutex::Locker locker(m_lock); + assert(m_current > 0); + + if (r < 0 && m_ret_val == 0 && (r != -ENOENT || !m_ignore_enoent)) { + m_ret_val = r; + } + --m_current; + m_cond.Signal(); +} + +void OrderedThrottle::finish_op(uint64_t tid, int r) { + Mutex::Locker locker(m_lock); + + TidResult::iterator it = m_tid_result.find(tid); + assert(it != m_tid_result.end()); + + it->second.finished = true; + it->second.ret_val = r; + m_cond.Signal(); +} + +bool OrderedThrottle::pending_error() const { + Mutex::Locker locker(m_lock); + return (m_ret_val < 0); +} + +int OrderedThrottle::wait_for_ret() { + Mutex::Locker locker(m_lock); + complete_pending_ops(); + + while (m_current > 0) { + m_cond.Wait(m_lock); + complete_pending_ops(); + } + return m_ret_val; +} + +void 
OrderedThrottle::complete_pending_ops() { + assert(m_lock.is_locked()); + + while (true) { + TidResult::iterator it = m_tid_result.begin(); + if (it == m_tid_result.end() || it->first != m_complete_tid || + !it->second.finished) { + break; + } + + Result result = it->second; + m_tid_result.erase(it); + + m_lock.Unlock(); + result.on_finish->complete(result.ret_val); + m_lock.Lock(); + + ++m_complete_tid; + } +} diff --git a/src/common/Throttle.h b/src/common/Throttle.h index 2faea594d96ed..c04a9319e5602 100644 --- a/src/common/Throttle.h +++ b/src/common/Throttle.h @@ -7,7 +7,9 @@ #include "Mutex.h" #include "Cond.h" #include +#include #include "include/atomic.h" +#include "include/Context.h" class CephContext; class PerfCounters; @@ -150,4 +152,70 @@ class C_SimpleThrottle : public Context { SimpleThrottle *m_throttle; }; +class OrderedThrottle; + +class C_OrderedThrottle : public Context { +public: + C_OrderedThrottle(OrderedThrottle *ordered_throttle, uint64_t tid) + : m_ordered_throttle(ordered_throttle), m_tid(tid) { + } + +protected: + virtual void finish(int r); + +private: + OrderedThrottle *m_ordered_throttle; + uint64_t m_tid; +}; + +/** + * @class OrderedThrottle + * Throttles the maximum number of active requests and completes them in order + * + * Operations can complete out-of-order but their associated Context callback + * will completed in-order during invokation of start_op() and wait_for_ret() + */ +class OrderedThrottle { +public: + OrderedThrottle(uint64_t max, bool ignore_enoent); + + C_OrderedThrottle *start_op(Context *on_finish); + void end_op(int r); + + bool pending_error() const; + int wait_for_ret(); + +protected: + friend class C_OrderedThrottle; + + void finish_op(uint64_t tid, int r); + +private: + struct Result { + bool finished; + int ret_val; + Context *on_finish; + + Result(Context *_on_finish = NULL) + : finished(false), ret_val(0), on_finish(_on_finish) { + } + }; + + typedef std::map TidResult; + + mutable Mutex m_lock; + Cond 
m_cond; + uint64_t m_max; + uint64_t m_current; + int m_ret_val; + bool m_ignore_enoent; + + uint64_t m_next_tid; + uint64_t m_complete_tid; + + TidResult m_tid_result; + + void complete_pending_ops(); +}; + #endif From 3fec9da435638ba497247550a613f028e0c3c65e Mon Sep 17 00:00:00 2001 From: Jason Dillaman Date: Thu, 3 Sep 2015 00:16:40 -0400 Subject: [PATCH 494/654] librbd: migrate diff iterate to new OrderedThrottle implementation Signed-off-by: Jason Dillaman --- src/librbd/DiffIterate.cc | 133 ++++++++++---------------------------- 1 file changed, 33 insertions(+), 100 deletions(-) diff --git a/src/librbd/DiffIterate.cc b/src/librbd/DiffIterate.cc index 00a9ebae2b729..80401323f71a8 100644 --- a/src/librbd/DiffIterate.cc +++ b/src/librbd/DiffIterate.cc @@ -29,99 +29,23 @@ enum ObjectDiffState { OBJECT_DIFF_STATE_HOLE = 2 }; -class DiffContext { -public: - typedef boost::tuple Diff; - typedef std::list Diffs; - +struct DiffContext { + DiffIterate::Callback callback; + void *callback_arg; bool whole_object; uint64_t from_snap_id; uint64_t end_snap_id; interval_set parent_diff; + OrderedThrottle throttle; DiffContext(ImageCtx &image_ctx, DiffIterate::Callback callback, void *callback_arg, bool _whole_object, uint64_t _from_snap_id, uint64_t _end_snap_id) - : whole_object(_whole_object), from_snap_id(_from_snap_id), - end_snap_id(_end_snap_id), m_lock("librbd::DiffContext::m_lock"), - m_image_ctx(image_ctx), m_callback(callback), - m_callback_arg(callback_arg), m_pending_ops(0), m_return_value(0), - m_next_request(0), m_waiting_request(0) - { + : callback(callback), callback_arg(callback_arg), + whole_object(_whole_object), from_snap_id(_from_snap_id), + end_snap_id(_end_snap_id), + throttle(image_ctx.concurrent_management_ops, true) { } - - int invoke_callback() { - Mutex::Locker locker(m_lock); - if (m_return_value < 0) { - return m_return_value; - } - - std::map::iterator it; - while ((it = m_request_diffs.begin()) != m_request_diffs.end() && - it->first == 
m_waiting_request) { - Diffs diffs = it->second; - m_request_diffs.erase(it); - - for (Diffs::const_iterator d = diffs.begin(); d != diffs.end(); ++d) { - m_lock.Unlock(); - int r = m_callback(d->get<0>(), d->get<1>(), d->get<2>(), - m_callback_arg); - m_lock.Lock(); - - if (m_return_value == 0 && r < 0) { - m_return_value = r; - return m_return_value; - } - } - ++m_waiting_request; - } - return 0; - } - - int wait_for_ret() { - Mutex::Locker locker(m_lock); - while (m_pending_ops > 0) { - m_cond.Wait(m_lock); - } - return m_return_value; - } - - uint64_t start_op() { - Mutex::Locker locker(m_lock); - while (m_pending_ops >= m_image_ctx.concurrent_management_ops) { - m_cond.Wait(m_lock); - } - ++m_pending_ops; - return m_next_request++; - } - - void finish_op(uint64_t request_num, int r, const Diffs &diffs) { - Mutex::Locker locker(m_lock); - m_request_diffs[request_num] = diffs; - - if (m_return_value == 0 && r < 0) { - m_return_value = r; - } - - --m_pending_ops; - m_cond.Signal(); - } - -private: - Mutex m_lock; - Cond m_cond; - - ImageCtx &m_image_ctx; - DiffIterate::Callback m_callback; - void *m_callback_arg; - - uint32_t m_pending_ops; - int m_return_value; - - uint64_t m_next_request; - uint64_t m_waiting_request; - - std::map m_request_diffs; }; class C_DiffObject : public Context { @@ -131,30 +55,33 @@ class C_DiffObject : public Context { uint64_t offset, const std::vector &object_extents) : m_image_ctx(image_ctx), m_head_ctx(head_ctx), m_diff_context(diff_context), m_oid(oid), m_offset(offset), - m_object_extents(object_extents), m_snap_ret(0) - { - m_request_num = m_diff_context.start_op(); + m_object_extents(object_extents), m_snap_ret(0) { } void send() { + C_OrderedThrottle *ctx = m_diff_context.throttle.start_op(this); + librados::AioCompletion *rados_completion = + librados::Rados::aio_create_completion(ctx, NULL, rados_ctx_cb); + librados::ObjectReadOperation op; op.list_snaps(&m_snap_set, &m_snap_ret); - librados::AioCompletion *rados_completion 
= - librados::Rados::aio_create_completion(this, NULL, rados_ctx_cb); int r = m_head_ctx.aio_operate(m_oid, rados_completion, &op, NULL); assert(r == 0); rados_completion->release(); } protected: + typedef boost::tuple Diff; + typedef std::list Diffs; + virtual void finish(int r) { CephContext *cct = m_image_ctx.cct; if (r == 0 && m_snap_ret < 0) { r = m_snap_ret; } - DiffContext::Diffs diffs; + Diffs diffs; if (r == 0) { ldout(cct, 20) << "object " << m_oid << ": list_snaps complete" << dendl; compute_diffs(&diffs); @@ -168,7 +95,16 @@ class C_DiffObject : public Context { << cpp_strerror(r) << dendl; } - m_diff_context.finish_op(m_request_num, r, diffs); + if (r == 0) { + for (Diffs::const_iterator d = diffs.begin(); d != diffs.end(); ++d) { + r = m_diff_context.callback(d->get<0>(), d->get<1>(), d->get<2>(), + m_diff_context.callback_arg); + if (r < 0) { + break; + } + } + } + m_diff_context.throttle.end_op(r); } private: @@ -183,7 +119,7 @@ class C_DiffObject : public Context { librados::snap_set_t m_snap_set; int m_snap_ret; - void compute_diffs(DiffContext::Diffs *diffs) { + void compute_diffs(Diffs *diffs) { CephContext *cct = m_image_ctx.cct; // calc diff from from_snap_id -> to_snap_id @@ -236,7 +172,7 @@ class C_DiffObject : public Context { } } - void compute_parent_overlap(DiffContext::Diffs *diffs) { + void compute_parent_overlap(Diffs *diffs) { if (m_diff_context.from_snap_id == 0 && !m_diff_context.parent_diff.empty()) { // report parent diff instead @@ -379,9 +315,8 @@ int DiffIterate::execute() { p->second); diff_object->send(); - r = diff_context.invoke_callback(); - if (r < 0) { - diff_context.wait_for_ret(); + if (diff_context.throttle.pending_error()) { + r = diff_context.throttle.wait_for_ret(); return r; } } @@ -391,13 +326,11 @@ int DiffIterate::execute() { off += read_len; } - r = diff_context.wait_for_ret(); + r = diff_context.throttle.wait_for_ret(); if (r < 0) { return r; } - - r = diff_context.invoke_callback(); - return r; + return 0; 
} int DiffIterate::diff_object_map(uint64_t from_snap_id, uint64_t to_snap_id, From cfe8fa221b6b0c7aaa457673b2ebf720720d2a1d Mon Sep 17 00:00:00 2001 From: Jason Dillaman Date: Thu, 3 Sep 2015 00:17:17 -0400 Subject: [PATCH 495/654] rbd: export diff needs should write chunks in-order Use new OrderedThrottle class to ensure multiple AIO reads are written to the export diff in-order. Fixes: #12911 Signed-off-by: Jason Dillaman --- src/rbd.cc | 243 +++++++++++++++++++++++++---------------------------- 1 file changed, 116 insertions(+), 127 deletions(-) diff --git a/src/rbd.cc b/src/rbd.cc index 062034ccf09c0..9257c1f442514 100755 --- a/src/rbd.cc +++ b/src/rbd.cc @@ -77,6 +77,19 @@ map map_options; // -o / --options map #define dout_subsys ceph_subsys_rbd +namespace { + +void aio_context_callback(librbd::completion_t completion, void *arg) +{ + librbd::RBD::AioCompletion *aio_completion = + reinterpret_cast(completion); + Context *context = reinterpret_cast(arg); + context->complete(aio_completion->get_return_value()); + aio_completion->release(); +} + +} // anonymous namespace + static std::map feature_mapping = boost::assign::map_list_of( RBD_FEATURE_LAYERING, "layering")( @@ -1104,53 +1117,33 @@ static int do_bench_write(librbd::Image& image, uint64_t io_size, return 0; } -struct ExportContext { - librbd::Image *image; - int fd; - uint64_t totalsize; - MyProgressContext pc; - - SimpleThrottle throttle; - Mutex lock; - - ExportContext(librbd::Image *i, int f, uint64_t t, int max_ops) : - image(i), - fd(f), - totalsize(t), - pc("Exporting image"), - throttle(max_ops, true), - lock("ExportContext::lock") - {} -}; - -class AioExportContext : public Context +class C_Export : public Context { public: - AioExportContext(SimpleThrottle &simple_throttle, librbd::Image &image, + C_Export(SimpleThrottle &simple_throttle, librbd::Image &image, uint64_t offset, uint64_t length, int fd) : m_aio_completion( - new librbd::RBD::AioCompletion(this, &AioExportContext::aio_callback)), 
- m_throttle(simple_throttle), - m_offset(offset), - m_fd(fd) + new librbd::RBD::AioCompletion(this, &aio_context_callback)), + m_throttle(simple_throttle), m_image(image), m_offset(offset), + m_length(length), m_fd(fd) + { + } + + void send() { m_throttle.start_op(); int op_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | LIBRADOS_OP_FLAG_FADVISE_NOCACHE; - int r = image.aio_read2(offset, length, m_bufferlist, m_aio_completion, - op_flags); + int r = m_image.aio_read2(m_offset, m_length, m_bufferlist, + m_aio_completion, op_flags); if (r < 0) { cerr << "rbd: error requesting read from source image" << std::endl; + m_aio_completion->release(); m_throttle.end_op(r); } } - virtual ~AioExportContext() - { - m_aio_completion->release(); - } - virtual void finish(int r) { BOOST_SCOPE_EXIT((&m_throttle) (&r)) @@ -1186,19 +1179,13 @@ class AioExportContext : public Context } } - static void aio_callback(librbd::completion_t completion, void *arg) - { - librbd::RBD::AioCompletion *aio_completion = - reinterpret_cast(completion); - AioExportContext *export_context = reinterpret_cast(arg); - export_context->complete(aio_completion->get_return_value()); - } - private: librbd::RBD::AioCompletion *m_aio_completion; SimpleThrottle &m_throttle; + librbd::Image &m_image; bufferlist m_bufferlist; uint64_t m_offset; + uint64_t m_length; int m_fd; }; @@ -1229,8 +1216,14 @@ static int do_export(librbd::Image& image, const char *path) SimpleThrottle throttle(max_concurrent_ops, false); uint64_t period = image.get_stripe_count() * (1ull << info.order); for (uint64_t offset = 0; offset < info.size; offset += period) { + if (throttle.pending_error()) { + break; + } + uint64_t length = min(period, info.size - offset); - new AioExportContext(throttle, image, offset, length, fd); + C_Export *ctx = new C_Export(throttle, image, offset, length, fd); + ctx->send(); + pc.update_progress(offset, info.size); } @@ -1250,92 +1243,90 @@ static int do_export(librbd::Image& image, const char *path) return 
r; } -class C_ExportDiff { +struct ExportDiffContext { + librbd::Image *image; + int fd; + uint64_t totalsize; + MyProgressContext pc; + OrderedThrottle throttle; + + ExportDiffContext(librbd::Image *i, int f, uint64_t t, int max_ops) : + image(i), fd(f), totalsize(t), pc("Exporting image"), + throttle(max_ops, true) { + } +}; + +class C_ExportDiff : public Context { public: - C_ExportDiff(ExportContext *ec, uint64_t offset, uint64_t length) - : m_export_context(ec), m_offset(offset), m_length(length) - { + C_ExportDiff(ExportDiffContext *edc, uint64_t offset, uint64_t length, + bool exists) + : m_export_diff_context(edc), m_offset(offset), m_length(length), + m_exists(exists) { } int send() { - if (m_export_context->throttle.pending_error()) { - return m_export_context->throttle.wait_for_ret(); + if (m_export_diff_context->throttle.pending_error()) { + return m_export_diff_context->throttle.wait_for_ret(); } - m_export_context->throttle.start_op(); - librbd::RBD::AioCompletion *aio_completion = - new librbd::RBD::AioCompletion(this, &C_ExportDiff::aio_callback); - int op_flags = LIBRADOS_OP_FLAG_FADVISE_NOCACHE; - int r = m_export_context->image->aio_read2(m_offset, m_length, m_read_data, - aio_completion, op_flags); - if (r < 0) { - aio_completion->release(); - complete(r); + C_OrderedThrottle *ctx = m_export_diff_context->throttle.start_op(this); + if (m_exists) { + librbd::RBD::AioCompletion *aio_completion = + new librbd::RBD::AioCompletion(ctx, &aio_context_callback); + + int op_flags = LIBRADOS_OP_FLAG_FADVISE_NOCACHE; + int r = m_export_diff_context->image->aio_read2( + m_offset, m_length, m_read_data, aio_completion, op_flags); + if (r < 0) { + aio_completion->release(); + ctx->complete(r); + } + } else { + ctx->complete(0); } - return r; + return 0; } static int export_diff_cb(uint64_t offset, size_t length, int exists, void *arg) { - ExportContext *ec = reinterpret_cast(arg); + ExportDiffContext *edc = reinterpret_cast(arg); - int r; - { - if (exists) { 
- C_ExportDiff *context = new C_ExportDiff(ec, offset, length); - r = context->send(); - } else { - Mutex::Locker lock(ec->lock); - r = write_extent(ec, offset, length, false); + C_ExportDiff *context = new C_ExportDiff(edc, offset, length, exists); + return context->send(); + } + +protected: + virtual void finish(int r) { + if (r >= 0) { + if (m_exists) { + m_exists = !m_read_data.is_zero(); + } + r = write_extent(m_export_diff_context, m_offset, m_length, m_exists); + if (r == 0 && m_exists) { + r = m_read_data.write_fd(m_export_diff_context->fd); } } - ec->pc.update_progress(offset, ec->totalsize); - return r; + m_export_diff_context->throttle.end_op(r); } private: - ExportContext *m_export_context; + ExportDiffContext *m_export_diff_context; uint64_t m_offset; uint64_t m_length; + bool m_exists; bufferlist m_read_data; - void complete(int r) { - { - Mutex::Locker locker(m_export_context->lock); - if (r >= 0) { - r = write_extent(m_export_context, m_offset, m_length, - !m_read_data.is_zero()); - if (r == 0) { - // block - r = m_read_data.write_fd(m_export_context->fd); - } - } - } - m_export_context->throttle.end_op(r); - delete this; - } - - static void aio_callback(librbd::completion_t completion, void *arg) - { - librbd::RBD::AioCompletion *aio_completion = - reinterpret_cast(completion); - C_ExportDiff *context = reinterpret_cast(arg); - - context->complete(aio_completion->get_return_value()); - aio_completion->release(); - } - - static int write_extent(ExportContext *ec, uint64_t offset, uint64_t length, - bool exists) { - assert(ec->lock.is_locked()); - + static int write_extent(ExportDiffContext *edc, uint64_t offset, + uint64_t length, bool exists) { // extent bufferlist bl; __u8 tag = exists ? 
'w' : 'z'; ::encode(tag, bl); ::encode(offset, bl); ::encode(length, bl); - int r = bl.write_fd(ec->fd); + int r = bl.write_fd(edc->fd); + + edc->pc.update_progress(offset, edc->totalsize); return r; } }; @@ -1397,15 +1388,15 @@ static int do_export_diff(librbd::Image& image, const char *fromsnapname, } } - ExportContext ec(&image, fd, info.size, - g_conf->rbd_concurrent_management_ops); + ExportDiffContext edc(&image, fd, info.size, + g_conf->rbd_concurrent_management_ops); r = image.diff_iterate2(fromsnapname, 0, info.size, true, whole_object, - &C_ExportDiff::export_diff_cb, (void *)&ec); + &C_ExportDiff::export_diff_cb, (void *)&edc); if (r < 0) { goto out; } - r = ec.throttle.wait_for_ret(); + r = edc.throttle.wait_for_ret(); if (r < 0) { goto out; } @@ -1419,9 +1410,9 @@ static int do_export_diff(librbd::Image& image, const char *fromsnapname, out: if (r < 0) - ec.pc.fail(); + edc.pc.fail(); else - ec.pc.finish(); + edc.pc.finish(); return r; } @@ -1539,33 +1530,33 @@ static void set_pool_image_name(const char *orig_img, char **new_pool, update_snap_name(*new_img, snap); } -class AioImportContext : public Context +class C_Import : public Context { public: - AioImportContext(SimpleThrottle &simple_throttle, librbd::Image &image, - bufferlist &bl, uint64_t offset) - : m_throttle(simple_throttle), + C_Import(SimpleThrottle &simple_throttle, librbd::Image &image, + bufferlist &bl, uint64_t offset) + : m_throttle(simple_throttle), m_image(image), m_aio_completion( - new librbd::RBD::AioCompletion(this, &AioImportContext::aio_callback)), + new librbd::RBD::AioCompletion(this, &aio_context_callback)), m_bufferlist(bl), m_offset(offset) + { + } + + void send() { m_throttle.start_op(); int op_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | LIBRADOS_OP_FLAG_FADVISE_NOCACHE; - int r = image.aio_write2(m_offset, m_bufferlist.length(), m_bufferlist, - m_aio_completion, op_flags); + int r = m_image.aio_write2(m_offset, m_bufferlist.length(), m_bufferlist, + m_aio_completion, 
op_flags); if (r < 0) { cerr << "rbd: error requesting write to destination image" << std::endl; + m_aio_completion->release(); m_throttle.end_op(r); } } - virtual ~AioImportContext() - { - m_aio_completion->release(); - } - virtual void finish(int r) { if (r < 0) { @@ -1575,16 +1566,9 @@ class AioImportContext : public Context m_throttle.end_op(r); } - static void aio_callback(librbd::completion_t completion, void *arg) - { - librbd::RBD::AioCompletion *aio_completion = - reinterpret_cast(completion); - AioImportContext *import_context = reinterpret_cast(arg); - import_context->complete(aio_completion->get_return_value()); - } - private: SimpleThrottle &m_throttle; + librbd::Image &m_image; librbd::RBD::AioCompletion *m_aio_completion; bufferlist m_bufferlist; uint64_t m_offset; @@ -1669,6 +1653,10 @@ static int do_import(librbd::RBD &rbd, librados::IoCtx& io_ctx, // loop body handles 0 return, as we may have a block to flush while ((readlen = ::read(fd, p + blklen, reqlen)) >= 0) { + if (throttle->pending_error()) { + break; + } + blklen += readlen; // if read was short, try again to fill the block before writing if (readlen && ((size_t)readlen < reqlen)) { @@ -1693,7 +1681,8 @@ static int do_import(librbd::RBD &rbd, librados::IoCtx& io_ctx, // write as much as we got; perhaps less than imgblklen // but skip writing zeros to create sparse images if (!bl.is_zero()) { - new AioImportContext(*throttle, image, bl, image_pos); + C_Import *ctx = new C_Import(*throttle, image, bl, image_pos); + ctx->send(); } // done with whole block, whether written or not From 575537046819bb0f5201f7772208c049779d2c07 Mon Sep 17 00:00:00 2001 From: Jason Dillaman Date: Tue, 18 Aug 2015 13:42:45 -0400 Subject: [PATCH 496/654] gtest: enable use of TR1 tuples Since the TR1 tuple is already in use within the Ceph project, this flag needs to be enabled to avoid gmock link errors within gtest. 
Signed-off-by: Jason Dillaman --- src/Makefile-env.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile-env.am b/src/Makefile-env.am index 1acb3563ef00a..2304193ce0bd0 100644 --- a/src/Makefile-env.am +++ b/src/Makefile-env.am @@ -85,7 +85,7 @@ AM_COMMON_CPPFLAGS = \ -D_GNU_SOURCE \ -DCEPH_LIBDIR=\"${libdir}\" \ -DCEPH_PKGLIBDIR=\"${pkglibdir}\" \ - -DGTEST_HAS_TR1_TUPLE=0 + -DGTEST_USE_OWN_TR1_TUPLE=0 AM_COMMON_CFLAGS = \ -Wall \ From c0a6218da1348354ddfc3a9579ff30794b4e70f6 Mon Sep 17 00:00:00 2001 From: Jason Dillaman Date: Mon, 29 Jun 2015 18:38:48 -0400 Subject: [PATCH 497/654] librados_test_stub: add reference counting to pools This will ensure that in-flight operations can't access freed pool memory. Signed-off-by: Jason Dillaman --- src/test/librados_test_stub/TestMemIoCtxImpl.cc | 15 +++++++++++---- src/test/librados_test_stub/TestMemIoCtxImpl.h | 7 ++++--- src/test/librados_test_stub/TestMemRadosClient.cc | 2 +- src/test/librados_test_stub/TestMemRadosClient.h | 4 +++- 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/src/test/librados_test_stub/TestMemIoCtxImpl.cc b/src/test/librados_test_stub/TestMemIoCtxImpl.cc index a5b27cd3d2227..bb39b49d62db1 100644 --- a/src/test/librados_test_stub/TestMemIoCtxImpl.cc +++ b/src/test/librados_test_stub/TestMemIoCtxImpl.cc @@ -23,14 +23,21 @@ TestMemIoCtxImpl::TestMemIoCtxImpl() { } TestMemIoCtxImpl::TestMemIoCtxImpl(const TestMemIoCtxImpl& rhs) - : TestIoCtxImpl(rhs), m_client(rhs.m_client), m_pool(rhs.m_pool) { - } + : TestIoCtxImpl(rhs), m_client(rhs.m_client), m_pool(rhs.m_pool) { + m_pool->get(); +} TestMemIoCtxImpl::TestMemIoCtxImpl(TestMemRadosClient &client, int64_t pool_id, const std::string& pool_name, TestMemRadosClient::Pool *pool) - : TestIoCtxImpl(client, pool_id, pool_name), m_client(&client), m_pool(pool) { - } + : TestIoCtxImpl(client, pool_id, pool_name), m_client(&client), + m_pool(pool) { + m_pool->get(); +} + +TestMemIoCtxImpl::~TestMemIoCtxImpl() { + 
m_pool->put(); +} TestIoCtxImpl *TestMemIoCtxImpl::clone() { return new TestMemIoCtxImpl(*this); diff --git a/src/test/librados_test_stub/TestMemIoCtxImpl.h b/src/test/librados_test_stub/TestMemIoCtxImpl.h index 4fc2eb47ac51e..e58b97a3fecae 100644 --- a/src/test/librados_test_stub/TestMemIoCtxImpl.h +++ b/src/test/librados_test_stub/TestMemIoCtxImpl.h @@ -12,9 +12,10 @@ namespace librados { class TestMemIoCtxImpl : public TestIoCtxImpl { public: TestMemIoCtxImpl(); - explicit TestMemIoCtxImpl(TestMemRadosClient &client, int64_t m_pool_id, - const std::string& pool_name, - TestMemRadosClient::Pool *pool); + TestMemIoCtxImpl(TestMemRadosClient &client, int64_t m_pool_id, + const std::string& pool_name, + TestMemRadosClient::Pool *pool); + virtual ~TestMemIoCtxImpl(); virtual TestIoCtxImpl *clone(); diff --git a/src/test/librados_test_stub/TestMemRadosClient.cc b/src/test/librados_test_stub/TestMemRadosClient.cc index b89f4eb6fed2a..b8bdf0777c31f 100644 --- a/src/test/librados_test_stub/TestMemRadosClient.cc +++ b/src/test/librados_test_stub/TestMemRadosClient.cc @@ -78,7 +78,7 @@ int TestMemRadosClient::pool_delete(const std::string &pool_name) { if (iter == m_pools.end()) { return -ENOENT; } - delete iter->second; + iter->second->put(); m_pools.erase(iter); return 0; } diff --git a/src/test/librados_test_stub/TestMemRadosClient.h b/src/test/librados_test_stub/TestMemRadosClient.h index a6fb3bdb1d5fe..e0afacb3a7054 100644 --- a/src/test/librados_test_stub/TestMemRadosClient.h +++ b/src/test/librados_test_stub/TestMemRadosClient.h @@ -6,8 +6,10 @@ #include "test/librados_test_stub/TestRadosClient.h" #include "include/atomic.h" +#include "include/assert.h" #include "include/buffer.h" #include "include/interval_set.h" +#include "common/RefCountedObj.h" #include "common/RWLock.h" #include #include @@ -48,7 +50,7 @@ class TestMemRadosClient : public TestRadosClient { typedef std::map Files; typedef std::set SnapSeqs; - struct Pool { + struct Pool : public RefCountedObject 
{ Pool(); int64_t pool_id; From 3dc29de56cb73eaa00556428cbeac88ff2a5d8cb Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Fri, 4 Sep 2015 10:37:26 +0800 Subject: [PATCH 498/654] mon: fix the build with boost 1.59 cmd_vartype is a boost::variant>, so we can not get a int8_t from it. Fixes: #12922 Signed-off-by: Kefu Chai --- src/mon/OSDMonitor.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 4021054734fee..06dd21808c1eb 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -6599,8 +6599,8 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, goto reply; } - int8_t fast_read_param; - cmd_getval(g_ceph_context, cmdmap, "fast_read", fast_read_param, int8_t(-1)); + int64_t fast_read_param; + cmd_getval(g_ceph_context, cmdmap, "fast_read", fast_read_param, int64_t(-1)); FastReadType fast_read = FAST_READ_DEFAULT; if (fast_read_param == 0) fast_read = FAST_READ_OFF; From 58ea8bebf73a376908e6bd079a974a8596cfd889 Mon Sep 17 00:00:00 2001 From: Boris Ranto Date: Fri, 4 Sep 2015 10:08:35 +0200 Subject: [PATCH 499/654] logrotate.conf: Simplify log files reopening after log rotation SIGHUP (1) signal makes ceph services reopen its log files so we do not need to call any init scripts magic, here. Additionally, the old approach caused problems with rhel-based systems since it found the service command and tried to reload ceph.service instead of ceph.target. This also caused problems with SELinux as processes with context logrotate_t are not allowed to issue systemctl reload on a process. This patch is also in sync with other logrotate postrotate scripts which do exactly the same thing (send SIGHUP to the daemon). 
Signed-off-by: Boris Ranto --- src/logrotate.conf | 23 +---------------------- src/rgw/logrotate.conf | 20 +------------------- 2 files changed, 2 insertions(+), 41 deletions(-) diff --git a/src/logrotate.conf b/src/logrotate.conf index 5888f20db987d..0c5df242b78d4 100644 --- a/src/logrotate.conf +++ b/src/logrotate.conf @@ -4,28 +4,7 @@ compress sharedscripts postrotate - if which service > /dev/null 2>&1 && [ -x `which service` ]; then - service ceph reload >/dev/null - elif which invoke-rc.d > /dev/null 2>&1 && [ -x `which invoke-rc.d` ]; then - invoke-rc.d ceph reload >/dev/null - elif which systemctl > /dev/null 2>&1 && [ -x `which systemctl` ]; then - # systemd does not provide an easy way to list (active) units - killall -q -1 ceph-mon ceph-mds ceph-osd - fi - # Possibly reload twice, but depending on ceph.conf the reload above may be a no-op - if which initctl > /dev/null 2>&1 && [ -x `which initctl` ]; then - for daemon in osd mon mds ; do - find -L /var/lib/ceph/$daemon/ -mindepth 1 -maxdepth 1 -regextype posix-egrep -regex '.*/[A-Za-z0-9]+-[A-Za-z0-9._-]+' -printf '%P\n' \ - | while read f; do - if [ -e "/var/lib/ceph/$daemon/$f/done" -o -e "/var/lib/ceph/$daemon/$f/ready" ] && [ -e "/var/lib/ceph/$daemon/$f/upstart" ] && [ ! 
-e "/var/lib/ceph/$daemon/$f/sysvinit" ]; then - cluster="${f%%-*}" - id="${f#*-}" - - initctl reload ceph-$daemon cluster="$cluster" id="$id" 2>/dev/null || : - fi - done - done - fi + killall -q -1 ceph-mon ceph-mds ceph-osd endscript missingok notifempty diff --git a/src/rgw/logrotate.conf b/src/rgw/logrotate.conf index f1dc58af33233..b80e81edd4c9d 100644 --- a/src/rgw/logrotate.conf +++ b/src/rgw/logrotate.conf @@ -4,25 +4,7 @@ compress sharedscripts postrotate - if which service > /dev/null 2>&1 && [ -x `which service` ]; then - service ceph-radosgw reload >/dev/null - elif which invoke-rc.d > /dev/null 2>&1 && [ -x `which invoke-rc.d` ]; then - invoke-rc.d radosgw reload >/dev/null - elif which systemctl > /dev/null 2>&1 && [ -x `which systemctl` ]; then - # systemd does not provide an easy way to list (active) units - killall -q -1 radosgw - fi - # Possibly reload twice, but depending on ceph.conf the reload above may be a no-op - if which initctl > /dev/null 2>&1 && [ -x `which initctl` ]; then - find -L /var/lib/ceph/radosgw/ -mindepth 1 -maxdepth 1 -regextype posix-egrep -regex '.*/[A-Za-z0-9]+-[A-Za-z0-9._-]+' -printf '%P\n' \ - | while read f; do - if [ -e "/var/lib/ceph/radosgw/$f/done" ]; then - cluster="${f%%-*}" - id="${f#*-}" - initctl reload radosgw cluster="$cluster" id="$id" 2>/dev/null || : - fi - done - fi + killall -q -1 radosgw endscript missingok notifempty From ab430f14da924bc309a690ac901ee6f8fe5b4a29 Mon Sep 17 00:00:00 2001 From: Takanori Nakao Date: Fri, 10 Jul 2015 13:41:45 +0900 Subject: [PATCH 500/654] erasure code: shec performance optimization with decoding cache continued from https://github.com/ceph/ceph/pull/4132 Signed-off-by: Takanori Nakao --- src/erasure-code/shec/ErasureCodeShec.cc | 11 + .../shec/ErasureCodeShecTableCache.cc | 256 ++++++++++++++++-- .../shec/ErasureCodeShecTableCache.h | 66 ++++- 3 files changed, 311 insertions(+), 22 deletions(-) diff --git a/src/erasure-code/shec/ErasureCodeShec.cc 
b/src/erasure-code/shec/ErasureCodeShec.cc index f02a972268222..5dcaf44da1259 100644 --- a/src/erasure-code/shec/ErasureCodeShec.cc +++ b/src/erasure-code/shec/ErasureCodeShec.cc @@ -562,6 +562,14 @@ int ErasureCodeShec::shec_make_decoding_matrix(bool prepare, int *want_, int *av } } + if (tcache.getDecodingTableFromCache(decoding_matrix, + dm_row, dm_column, minimum, + technique, + k, m, c, w, + want, avails)) { + return 0; + } + for (unsigned long long pp = 0; pp < (1ull << m); ++pp) { // select parity chunks @@ -755,6 +763,9 @@ int ErasureCodeShec::shec_make_decoding_matrix(bool prepare, int *want_, int *av int ret = jerasure_invert_matrix(tmpmat, decoding_matrix, mindup, w); + tcache.putDecodingTableToCache(decoding_matrix, dm_row, dm_column, minimum, technique, + k, m, c, w, want, avails); + return ret; } diff --git a/src/erasure-code/shec/ErasureCodeShecTableCache.cc b/src/erasure-code/shec/ErasureCodeShecTableCache.cc index 8fb64b2519cb8..a037892f67c09 100644 --- a/src/erasure-code/shec/ErasureCodeShecTableCache.cc +++ b/src/erasure-code/shec/ErasureCodeShecTableCache.cc @@ -23,34 +23,95 @@ #include "common/debug.h" // ----------------------------------------------------------------------------- +// ----------------------------------------------------------------------------- +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _tc_prefix(_dout) +// ----------------------------------------------------------------------------- + +// ----------------------------------------------------------------------------- + +static ostream& +_tc_prefix(std::ostream* _dout) { + return *_dout << "ErasureCodeShecTableCache: "; +} + +// ----------------------------------------------------------------------------- + ErasureCodeShecTableCache::~ErasureCodeShecTableCache() { Mutex::Locker lock(codec_tables_guard); - codec_technique_tables_t::const_iterator ttables_it; - codec_tables_t::const_iterator tables_it; - codec_tables_t_::const_iterator 
tables_it_; - codec_tables_t__::const_iterator tables_it__; - codec_table_t::const_iterator table_it; - // clean-up all allocated tables + { + codec_technique_tables_t::const_iterator ttables_it; + codec_tables_t::const_iterator tables_it; + codec_tables_t_::const_iterator tables_it_; + codec_tables_t__::const_iterator tables_it__; + codec_table_t::const_iterator table_it; - for (ttables_it = encoding_table.begin(); ttables_it != encoding_table.end(); ++ttables_it) { - for (tables_it = ttables_it->second.begin(); tables_it != ttables_it->second.end(); ++tables_it) { - for (tables_it_ = tables_it->second.begin(); tables_it_ != tables_it->second.end(); ++tables_it_) { - for (tables_it__ = tables_it_->second.begin(); tables_it__ != tables_it_->second.end(); ++tables_it__) { - for (table_it = tables_it__->second.begin(); table_it != tables_it__->second.end(); ++table_it) { - if (table_it->second) { - if (*(table_it->second)) { - delete *(table_it->second); - } - delete table_it->second; - } - } + for (ttables_it = encoding_table.begin(); ttables_it != encoding_table.end(); ++ttables_it) { + for (tables_it = ttables_it->second.begin(); tables_it != ttables_it->second.end(); ++tables_it) { + for (tables_it_ = tables_it->second.begin(); tables_it_ != tables_it->second.end(); ++tables_it_) { + for (tables_it__ = tables_it_->second.begin(); tables_it__ != tables_it_->second.end(); ++tables_it__) { + for (table_it = tables_it__->second.begin(); table_it != tables_it__->second.end(); ++table_it) { + if (table_it->second) { + if (*(table_it->second)) { + delete *(table_it->second); + } + delete table_it->second; + } + } + } } } } } + + { + std::map::const_iterator lru_map_it; + std::map::const_iterator lru_list_it; + + for (lru_map_it = decoding_tables.begin(); + lru_map_it != decoding_tables.end(); + ++lru_map_it) { + if (lru_map_it->second) { + delete lru_map_it->second; + } + } + + for (lru_list_it = decoding_tables_lru.begin(); + lru_list_it != decoding_tables_lru.end(); + 
++lru_list_it) { + if (lru_list_it->second) { + delete lru_list_it->second; + } + } + } +} + +ErasureCodeShecTableCache::lru_map_t* +ErasureCodeShecTableCache::getDecodingTables(int technique) { + // the caller must hold the guard mutex: + // => Mutex::Locker lock(codec_tables_guard); + + // create an lru_map if not yet allocated + if (!decoding_tables[technique]) { + decoding_tables[technique] = new lru_map_t; + } + return decoding_tables[technique]; +} + +ErasureCodeShecTableCache::lru_list_t* +ErasureCodeShecTableCache::getDecodingTablesLru(int technique) { + // the caller must hold the guard mutex: + // => Mutex::Locker lock(codec_tables_guard); + + // create an lru_list if not yet allocated + if (!decoding_tables_lru[technique]) { + decoding_tables_lru[technique] = new lru_list_t; + } + return decoding_tables_lru[technique]; } int** @@ -95,3 +156,162 @@ ErasureCodeShecTableCache::getLock() { return &codec_tables_guard; } + +uint64_t +ErasureCodeShecTableCache::getDecodingCacheSignature(int k, int m, int c, int w, + int *erased, int *avails) { + uint64_t signature = 0; + signature = (uint64_t)k; + signature |= ((uint64_t)m << 6); + signature |= ((uint64_t)c << 12); + signature |= ((uint64_t)w << 18); + + for (int i=0; i < k+m; i++) { + signature |= ((uint64_t)(avails[i] ? 1 : 0) << (24+i)); + } + for (int i=0; i < k+m; i++) { + signature |= ((uint64_t)(erased[i] ? 
1 : 0) << (44+i)); + } + return signature; +} + +bool +ErasureCodeShecTableCache::getDecodingTableFromCache(int* decoding_matrix, + int* dm_row, + int* dm_column, + int* minimum, + int technique, + int k, + int m, + int c, + int w, + int* erased, + int* avails) { + // -------------------------------------------------------------------------- + // LRU decoding matrix cache + // -------------------------------------------------------------------------- + + uint64_t signature = getDecodingCacheSignature(k, m, c, w, erased, avails); + Mutex::Locker lock(codec_tables_guard); + + dout(20) << "[ get table ] = " << signature << dendl; + + // we try to fetch a decoding table from an LRU cache + lru_map_t* decode_tbls_map = + getDecodingTables(technique); + + lru_list_t* decode_tbls_lru = + getDecodingTablesLru(technique); + + lru_map_t::iterator decode_tbls_map_it = decode_tbls_map->find(signature); + if (decode_tbls_map_it == decode_tbls_map->end()) { + return false; + } + + dout(20) << "[ cached table ] = " << signature << dendl; + // copy parameters out of the cache + + memcpy(decoding_matrix, + decode_tbls_map_it->second.second.decoding_matrix, + k * k * sizeof(int)); + memcpy(dm_row, + decode_tbls_map_it->second.second.dm_row, + k * sizeof(int)); + memcpy(dm_column, + decode_tbls_map_it->second.second.dm_column, + k * sizeof(int)); + memcpy(minimum, + decode_tbls_map_it->second.second.minimum, + (k+m) * sizeof(int)); + + // find item in LRU queue and push back + decode_tbls_lru->splice(decode_tbls_lru->end(), + *decode_tbls_lru, + decode_tbls_map_it->second.first); + return true; +} + +void +ErasureCodeShecTableCache::putDecodingTableToCache(int* decoding_matrix, + int* dm_row, + int* dm_column, + int* minimum, + int technique, + int k, + int m, + int c, + int w, + int* erased, + int* avails) { + // -------------------------------------------------------------------------- + // LRU decoding matrix cache + // 
-------------------------------------------------------------------------- + + Mutex::Locker lock(codec_tables_guard); + + uint64_t signature = getDecodingCacheSignature(k, m, c, w, erased, avails); + dout(20) << "[ put table ] = " << signature << dendl; + + // we store a new table to the cache + + // bufferptr cachetable; + + lru_map_t* decode_tbls_map = + getDecodingTables(technique); + + lru_list_t* decode_tbls_lru = + getDecodingTablesLru(technique); + + if (decode_tbls_map->count(signature)) { + dout(20) << "[ already on table ] = " << signature << dendl; + + // find item in LRU queue and push back + decode_tbls_lru->splice(decode_tbls_lru->end(), + *decode_tbls_lru, + (*decode_tbls_map)[signature].first); + return; + } + + // evt. shrink the LRU queue/map + if ((int)decode_tbls_lru->size() >= + ErasureCodeShecTableCache::decoding_tables_lru_length) { + dout(20) << "[ shrink lru ] = " << signature << dendl; + // remove from map + decode_tbls_map->erase(decode_tbls_lru->front()); + // remove from lru + decode_tbls_lru->pop_front(); + } + + { + dout(20) << "[ store table ] = " << signature << dendl; + + decode_tbls_lru->push_back(signature); + + // allocate a new buffer + lru_list_t::iterator it_end = decode_tbls_lru->end(); + it_end--; + + lru_entry_t &map_value = + (*decode_tbls_map)[signature] = + std::make_pair(it_end, DecodingCacheParameter()); + map_value.second.decoding_matrix = new int[k*k]; + map_value.second.dm_row = new int[k]; + map_value.second.dm_column = new int[k]; + map_value.second.minimum = new int[k+m]; + + memcpy(map_value.second.decoding_matrix, + decoding_matrix, + k * k * sizeof(int)); + memcpy(map_value.second.dm_row, + dm_row, + k * sizeof(int)); + memcpy(map_value.second.dm_column, + dm_column, + k * sizeof(int)); + memcpy(map_value.second.minimum, + minimum, + (k+m) * sizeof(int)); + + dout(20) << "[ cache size ] = " << decode_tbls_lru->size() << dendl; + } +} diff --git a/src/erasure-code/shec/ErasureCodeShecTableCache.h 
b/src/erasure-code/shec/ErasureCodeShecTableCache.h index 21f65bdfa6ff0..e4eaf0f0eaa33 100644 --- a/src/erasure-code/shec/ErasureCodeShecTableCache.h +++ b/src/erasure-code/shec/ErasureCodeShecTableCache.h @@ -31,11 +31,43 @@ class ErasureCodeShecTableCache { // --------------------------------------------------------------------------- // This class implements a table cache for encoding and decoding matrices. // Encoding matrices are shared for the same (k,m,c,w) combination. + // It supplies a decoding matrix lru cache which is shared for identical + // matrix types e.g. there is one cache (lru-list + lru-map) // --------------------------------------------------------------------------- + class DecodingCacheParameter { + public: + int* decoding_matrix; // size: k*k + int* dm_row; // size: k + int* dm_column; // size: k + int* minimum; // size: k+m + DecodingCacheParameter() { + decoding_matrix = 0; + dm_row = 0; + dm_column = 0; + minimum = 0; + } + ~DecodingCacheParameter() { + if (decoding_matrix) { + delete[] decoding_matrix; + } + if (dm_row) { + delete[] dm_row; + } + if (dm_column) { + delete[] dm_column; + } + if (minimum) { + delete[] minimum; + } + } + }; + public: - typedef std::pair::iterator, bufferptr> lru_entry_t; + static const int decoding_tables_lru_length = 10000; + typedef std::pair::iterator, + DecodingCacheParameter> lru_entry_t; typedef std::map< int, int** > codec_table_t; typedef std::map< int, codec_table_t > codec_tables_t__; typedef std::map< int, codec_tables_t__ > codec_tables_t_; @@ -43,6 +75,9 @@ class ErasureCodeShecTableCache { typedef std::map< int, codec_tables_t > codec_technique_tables_t; // int** matrix = codec_technique_tables_t[technique][k][m][c][w] + typedef std::map< uint64_t, lru_entry_t > lru_map_t; + typedef std::list< uint64_t > lru_list_t; + ErasureCodeShecTableCache() : codec_tables_guard("shec-lru-cache") { @@ -52,15 +87,38 @@ class ErasureCodeShecTableCache { Mutex codec_tables_guard; // mutex used to protect 
modifications in encoding/decoding table maps + bool getDecodingTableFromCache(int* matrix, + int* dm_row, int* dm_column, + int* minimum, + int technique, + int k, int m, int c, int w, + int* want, int* avails); + + void putDecodingTableToCache(int* matrix, + int* dm_row, int* dm_column, + int* minimum, + int technique, + int k, int m, int c, int w, + int* want, int* avails); + int** getEncodingTable(int technique, int k, int m, int c, int w); int** getEncodingTableNoLock(int technique, int k, int m, int c, int w); int* setEncodingTable(int technique, int k, int m, int c, int w, int*); private: - codec_technique_tables_t encoding_table; // encoding coefficients accessed via table[technique][k][m] - + // encoding table accessed via table[matrix][k][m][c][w] + // decoding table cache accessed via map[matrixtype] + // decoding table lru list accessed via list[matrixtype] + codec_technique_tables_t encoding_table; + std::map decoding_tables; + std::map decoding_tables_lru; + + lru_map_t* getDecodingTables(int technique); + lru_list_t* getDecodingTablesLru(int technique); + uint64_t getDecodingCacheSignature(int k, int m, int c, int w, + int *want, int *avails); + Mutex* getLock(); - }; #endif From f850d058be6c147c2af8acafc5fe8465d2fda33d Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Fri, 4 Sep 2015 12:20:08 +0200 Subject: [PATCH 501/654] rgw: improve debugs in RGWPutObj and RGWPutObj_ObjStore_SWIFT. 
Signed-off-by: Radoslaw Zarzynski --- src/rgw/rgw_op.cc | 12 ++++++++++-- src/rgw/rgw_rest_swift.cc | 9 ++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index dca6b75e66d3a..722f39d2b7ec8 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -1829,11 +1829,15 @@ void RGWPutObj::execute() } ret = get_params(); - if (ret < 0) + if (ret < 0) { + ldout(s->cct, 20) << "get_params() returned ret=" << ret << dendl; goto done; + } ret = get_system_versioning_params(s, &olh_epoch, &version_id); if (ret < 0) { + ldout(s->cct, 20) << "get_system_versioning_params() returned ret=" \ + << ret << dendl; goto done; } @@ -1858,6 +1862,7 @@ void RGWPutObj::execute() ret = store->check_quota(s->bucket_owner.get_id(), s->bucket, user_quota, bucket_quota, s->content_length); if (ret < 0) { + ldout(s->cct, 20) << "check_quota() returned ret=" << ret << dendl; goto done; } } @@ -1870,8 +1875,10 @@ void RGWPutObj::execute() processor = select_processor(*static_cast(s->obj_ctx), &multipart); ret = processor->prepare(store, NULL); - if (ret < 0) + if (ret < 0) { + ldout(s->cct, 20) << "processor->prepare() returned ret=" << ret << dendl; goto done; + } do { bufferlist data; @@ -1941,6 +1948,7 @@ void RGWPutObj::execute() ret = store->check_quota(s->bucket_owner.get_id(), s->bucket, user_quota, bucket_quota, s->obj_size); if (ret < 0) { + ldout(s->cct, 20) << "second check_quota() returned ret=" << ret << dendl; goto done; } diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc index 1e5fc08d0d808..adb034c9fef41 100644 --- a/src/rgw/rgw_rest_swift.cc +++ b/src/rgw/rgw_rest_swift.cc @@ -517,13 +517,16 @@ static int get_delete_at_param(req_state *s, time_t *delete_at) int RGWPutObj_ObjStore_SWIFT::get_params() { - if (s->has_bad_meta) + if (s->has_bad_meta) { return -EINVAL; + } if (!s->length) { const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING"); - if (!encoding || strcmp(encoding, "chunked") != 0) 
+ if (!encoding || strcmp(encoding, "chunked") != 0) { + ldout(s->cct, 20) << "neither length nor chunked encoding" << dendl; return -ERR_LENGTH_REQUIRED; + } chunked_upload = true; } @@ -531,7 +534,7 @@ int RGWPutObj_ObjStore_SWIFT::get_params() supplied_etag = s->info.env->get("HTTP_ETAG"); if (!s->generic_attrs.count(RGW_ATTR_CONTENT_TYPE)) { - dout(5) << "content type wasn't provided, trying to guess" << dendl; + ldout(s->cct, 5) << "content type wasn't provided, trying to guess" << dendl; const char *suffix = strrchr(s->object.name.c_str(), '.'); if (suffix) { suffix++; From 406b1d0347dcc27f698c7ec361a1e5d6a66277e8 Mon Sep 17 00:00:00 2001 From: Vikhyat Umrao Date: Fri, 4 Sep 2015 16:55:13 +0530 Subject: [PATCH 502/654] doc: Add pgcalc tool link in placement-groups document Fixes #12944 Signed-off-by: Vikhyat Umrao --- doc/rados/operations/placement-groups.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/rados/operations/placement-groups.rst b/doc/rados/operations/placement-groups.rst index b408f9c8bd974..05f07514bbb17 100644 --- a/doc/rados/operations/placement-groups.rst +++ b/doc/rados/operations/placement-groups.rst @@ -23,6 +23,8 @@ calculated automatically. Here are a few values commonly used: - If you have more than 50 OSDs, you need to understand the tradeoffs and how to calculate the ``pg_num`` value by yourself +- For calculating ``pg_num`` value by yourself please take help of `pgcalc`_ tool + As the number of OSDs increases, chosing the right value for pg_num becomes more important because it has a significant influence on the behavior of the cluster as well as the durability of the data when @@ -431,3 +433,4 @@ entirely. To mark the "unfound" objects as "lost", execute the following:: .. _Create a Pool: ../pools#createpool .. _Mapping PGs to OSDs: ../../../architecture#mapping-pgs-to-osds +.. 
_pgcalc: http://ceph.com/pgcalc/ From b0714c3d2d5562a59666a376e1dac04aae3e0672 Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Fri, 4 Sep 2015 15:11:35 +0200 Subject: [PATCH 503/654] rgw: don't append empty ETag HTTP header. Fixes: #12950 Signed-off-by: Radoslaw Zarzynski --- src/rgw/rgw_rest.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc index c69d7ecefc20e..cb84b27c4812d 100644 --- a/src/rgw/rgw_rest.cc +++ b/src/rgw/rgw_rest.cc @@ -339,13 +339,19 @@ void dump_content_length(struct req_state *s, uint64_t len) } } -void dump_etag(struct req_state *s, const char *etag) +void dump_etag(struct req_state * const s, const char * const etag) { + if ('\0' == *etag) { + return; + } + int r; - if (s->prot_flags & RGW_REST_SWIFT) + if (s->prot_flags & RGW_REST_SWIFT) { r = s->cio->print("etag: %s\r\n", etag); - else + } else { r = s->cio->print("ETag: \"%s\"\r\n", etag); + } + if (r < 0) { ldout(s->cct, 0) << "ERROR: s->cio->print() returned err=" << r << dendl; } From e267128ce22d8b8cd83f6d6d82f24f496600e678 Mon Sep 17 00:00:00 2001 From: Nathan Cutler Date: Wed, 15 Jul 2015 12:54:20 +0200 Subject: [PATCH 504/654] ceph.spec.in: fix lttng/babeltrace conditionals lttng and babeltrace are build dependencies for rbd-replay-prep. Make sure the right package names are used. Enable for SLE12, as well as for openSUSE 13.1 and higher. Move the BuildRequires out of the ceph-test subpackage and into the distro-conditional dependencies section. Make ordering of BuildRequires a little more alphabetical. 
http://tracker.ceph.com/issues/12360 Fixes: #12360 Signed-off-by: Nathan Cutler --- ceph.spec.in | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/ceph.spec.in b/ceph.spec.in index 8e63d6e4e6af5..4b5a3ccc6e586 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -97,6 +97,7 @@ BuildRequires: cmake BuildRequires: cryptsetup BuildRequires: gdbm BuildRequires: hdparm +BuildRequires: leveldb-devel > 1.2 BuildRequires: libaio-devel BuildRequires: libcurl-devel BuildRequires: libedit-devel @@ -104,10 +105,9 @@ BuildRequires: libxml2-devel BuildRequires: libblkid-devel >= 2.17 BuildRequires: libudev-devel BuildRequires: libtool -BuildRequires: leveldb-devel > 1.2 BuildRequires: make -BuildRequires: perl BuildRequires: parted +BuildRequires: perl BuildRequires: pkgconfig BuildRequires: python BuildRequires: python-nose @@ -159,6 +159,15 @@ Requires(preun): initscripts BuildRequires: gperftools-devel Requires: python-flask %endif +# lttng and babeltrace for rbd-replay-prep +%if 0%{?fedora} || 0%{?rhel} == 6 +BuildRequires: lttng-ust-devel +BuildRequires: libbabeltrace-devel +%endif +%if 0%{?suse_version} >= 1310 +BuildRequires: lttng-ust-devel +BuildRequires: babeltrace-devel +%endif %description Ceph is a massively scalable, open-source, distributed @@ -371,10 +380,6 @@ Group: System Environment/Libraries License: LGPL-2.0 Requires: ceph-common Requires: xmlstarlet -%if (0%{?fedora} || 0%{?rhel} == 6) -BuildRequires: lttng-ust-devel -BuildRequires: libbabeltrace-devel -%endif %description -n ceph-test This package contains Ceph benchmarks and test tools. 
@@ -1093,7 +1098,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1 %{_mandir}/man8/rbd-replay-prep.8* %{_bindir}/rbd-replay %{_bindir}/rbd-replay-many -%if (0%{?fedora} || 0%{?rhel} == 6) +%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} >= 1310 %{_bindir}/rbd-replay-prep %endif %dir %{_libdir}/ceph From 0d18f9b83a361e61c78906897d04c3bb88eba053 Mon Sep 17 00:00:00 2001 From: Jason Dillaman Date: Mon, 17 Aug 2015 20:51:50 -0400 Subject: [PATCH 505/654] librados_test_stub: add mock class for IoCtx operations Unit tests can now use gmock to simulate responses from the OSDs via a mocked librados library. Signed-off-by: Jason Dillaman --- .../librados_test_stub/LibradosTestStub.cc | 72 +++++++------ .../librados_test_stub/LibradosTestStub.h | 23 ++++ .../librados_test_stub/MockTestMemIoCtxImpl.h | 101 ++++++++++++++++++ .../MockTestMemRadosClient.h | 36 +++++++ .../librados_test_stub/TestClassHandler.cc | 10 +- .../librados_test_stub/TestClassHandler.h | 2 + src/test/librados_test_stub/TestIoCtxImpl.cc | 10 +- src/test/librados_test_stub/TestIoCtxImpl.h | 11 +- .../librados_test_stub/TestMemIoCtxImpl.cc | 4 +- .../librados_test_stub/TestMemIoCtxImpl.h | 7 +- .../librados_test_stub/TestMemRadosClient.cc | 12 ++- .../librados_test_stub/TestMemRadosClient.h | 2 + src/test/librados_test_stub/TestRadosClient.h | 5 + 13 files changed, 249 insertions(+), 46 deletions(-) create mode 100644 src/test/librados_test_stub/LibradosTestStub.h create mode 100644 src/test/librados_test_stub/MockTestMemIoCtxImpl.h create mode 100644 src/test/librados_test_stub/MockTestMemRadosClient.h diff --git a/src/test/librados_test_stub/LibradosTestStub.cc b/src/test/librados_test_stub/LibradosTestStub.cc index 7b70cfaf462f0..1aa256ff18a2d 100644 --- a/src/test/librados_test_stub/LibradosTestStub.cc +++ b/src/test/librados_test_stub/LibradosTestStub.cc @@ -1,6 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab +#include 
"test/librados_test_stub/LibradosTestStub.h" #include "include/rados/librados.hpp" #include "common/ceph_argparse.h" #include "common/common_init.h" @@ -25,15 +26,7 @@ namespace { -static void DeallocateRadosClient(librados::TestRadosClient* client) -{ - client->put(); -} - -} // anonymous namespace - - -static librados::TestClassHandler *get_class_handler() { +librados::TestClassHandler *get_class_handler() { static boost::shared_ptr s_class_handler; if (!s_class_handler) { s_class_handler.reset(new librados::TestClassHandler()); @@ -42,23 +35,7 @@ static librados::TestClassHandler *get_class_handler() { return s_class_handler.get(); } -static librados::TestRadosClient *get_rados_client() { - // TODO: use factory to allow tests to swap out impl - static boost::shared_ptr s_rados_client; - if (!s_rados_client) { - CephInitParameters iparams(CEPH_ENTITY_TYPE_CLIENT); - CephContext *cct = common_preinit(iparams, CODE_ENVIRONMENT_LIBRARY, 0); - cct->_conf->parse_env(); - cct->_conf->apply_changes(NULL); - s_rados_client.reset(new librados::TestMemRadosClient(cct), - &DeallocateRadosClient); - cct->put(); - } - s_rados_client->get(); - return s_rados_client.get(); -} - -static void do_out_buffer(bufferlist& outbl, char **outbuf, size_t *outbuflen) { +void do_out_buffer(bufferlist& outbl, char **outbuf, size_t *outbuflen) { if (outbuf) { if (outbl.length() > 0) { *outbuf = (char *)malloc(outbl.length()); @@ -72,7 +49,7 @@ static void do_out_buffer(bufferlist& outbl, char **outbuf, size_t *outbuflen) { } } -static void do_out_buffer(string& outbl, char **outbuf, size_t *outbuflen) { +void do_out_buffer(string& outbl, char **outbuf, size_t *outbuflen) { if (outbuf) { if (outbl.length() > 0) { *outbuf = (char *)malloc(outbl.length()); @@ -86,6 +63,40 @@ static void do_out_buffer(string& outbl, char **outbuf, size_t *outbuflen) { } } +} // anonymous namespace + +namespace librados_test_stub { + +TestRadosClientPtr *rados_client() { + // force proper destruction order by 
delaying construction + static TestRadosClientPtr s_rados_client; + return &s_rados_client; +} + +void set_rados_client( + const boost::shared_ptr &new_client) { + assert(new_client.get() != nullptr); + *rados_client() = new_client; +} + +TestRadosClientPtr get_rados_client() { + // TODO: use factory to allow tests to swap out impl + TestRadosClientPtr *client = rados_client(); + if (client->get() == nullptr) { + CephInitParameters iparams(CEPH_ENTITY_TYPE_CLIENT); + CephContext *cct = common_preinit(iparams, CODE_ENVIRONMENT_LIBRARY, 0); + cct->_conf->parse_env(); + cct->_conf->apply_changes(NULL); + client->reset(new librados::TestMemRadosClient(cct), + &librados::TestRadosClient::Deallocate); + cct->put(); + } + (*client)->get(); + return *client; +} + +} // namespace librados_test_stub + extern "C" int rados_aio_create_completion(void *cb_arg, rados_callback_t cb_complete, rados_callback_t cb_safe, @@ -158,7 +169,7 @@ extern "C" int rados_connect(rados_t cluster) { } extern "C" int rados_create(rados_t *cluster, const char * const id) { - *cluster = get_rados_client(); + *cluster = librados_test_stub::get_rados_client().get(); return 0; } @@ -409,7 +420,7 @@ void IoCtx::dup(const IoCtx& rhs) { int IoCtx::exec(const std::string& oid, const char *cls, const char *method, bufferlist& inbl, bufferlist& outbl) { TestIoCtxImpl *ctx = reinterpret_cast(io_ctx_impl); - return ctx->exec(oid, *get_class_handler(), cls, method, inbl, &outbl, + return ctx->exec(oid, get_class_handler(), cls, method, inbl, &outbl, ctx->get_snap_context()); } @@ -601,8 +612,7 @@ void ObjectOperation::exec(const char *cls, const char *method, bufferlist& inbl) { TestObjectOperationImpl *o = reinterpret_cast(impl); o->ops.push_back(boost::bind(&TestIoCtxImpl::exec, _1, _2, - boost::ref(*get_class_handler()), - cls, method, inbl, _3, _4)); + get_class_handler(), cls, method, inbl, _3, _4)); } void ObjectOperation::set_op_flags2(int flags) { diff --git 
a/src/test/librados_test_stub/LibradosTestStub.h b/src/test/librados_test_stub/LibradosTestStub.h new file mode 100644 index 0000000000000..9fed68df24b28 --- /dev/null +++ b/src/test/librados_test_stub/LibradosTestStub.h @@ -0,0 +1,23 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef LIBRADOS_TEST_STUB_H +#define LIBRADOS_TEST_STUB_H + +#include + +namespace librados { +class TestRadosClient; +} + +namespace librados_test_stub { + +typedef boost::shared_ptr TestRadosClientPtr; + +void set_rados_client(const TestRadosClientPtr &rados_client); + +TestRadosClientPtr get_rados_client(); + +} // namespace librados_test_stub + +#endif // LIBRADOS_TEST_STUB_H diff --git a/src/test/librados_test_stub/MockTestMemIoCtxImpl.h b/src/test/librados_test_stub/MockTestMemIoCtxImpl.h new file mode 100644 index 0000000000000..198db6ccb6861 --- /dev/null +++ b/src/test/librados_test_stub/MockTestMemIoCtxImpl.h @@ -0,0 +1,101 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef LIBRADOS_TEST_STUB_MOCK_TEST_MEM_IO_CTX_IMPL_H +#define LIBRADOS_TEST_STUB_MOCK_TEST_MEM_IO_CTX_IMPL_H + +#include "test/librados_test_stub/TestMemIoCtxImpl.h" +#include "gmock/gmock.h" + +namespace librados { + +class MockTestMemRadosClient; + +class MockTestMemIoCtxImpl : public TestMemIoCtxImpl { +public: + MockTestMemIoCtxImpl(MockTestMemRadosClient *mock_client, + TestMemRadosClient *client, int64_t pool_id, + const std::string& pool_name, + TestMemRadosClient::Pool *pool) + : TestMemIoCtxImpl(client, pool_id, pool_name, pool), + m_mock_client(mock_client), m_client(client) { + default_to_parent(); + } + + MockTestMemRadosClient *get_mock_rados_client() { + return m_mock_client; + } + + virtual TestIoCtxImpl *clone() { + TestIoCtxImpl *io_ctx_impl = new ::testing::NiceMock( + m_mock_client, m_client, get_pool_id(), get_pool_name(), get_pool()); + 
io_ctx_impl->set_snap_read(get_snap_read()); + io_ctx_impl->set_snap_context(get_snap_context()); + return io_ctx_impl; + } + + MOCK_METHOD7(exec, int(const std::string& oid, + TestClassHandler *handler, + const char *cls, + const char *method, + bufferlist& inbl, + bufferlist* outbl, + const SnapContext &snapc)); + int do_exec(const std::string& oid, TestClassHandler *handler, + const char *cls, const char *method, bufferlist& inbl, + bufferlist* outbl, const SnapContext &snapc) { + return TestMemIoCtxImpl::exec(oid, handler, cls, method, inbl, outbl, + snapc); + } + + MOCK_METHOD4(read, int(const std::string& oid, + size_t len, + uint64_t off, + bufferlist *bl)); + int do_read(const std::string& oid, size_t len, uint64_t off, + bufferlist *bl) { + return TestMemIoCtxImpl::read(oid, len, off, bl); + } + + MOCK_METHOD1(remove, int(const std::string& oid)); + int do_remove(const std::string& oid) { + return TestMemIoCtxImpl::remove(oid); + } + + MOCK_METHOD1(selfmanaged_snap_create, int(uint64_t *snap_id)); + int do_selfmanaged_snap_create(uint64_t *snap_id) { + return TestMemIoCtxImpl::selfmanaged_snap_create(snap_id); + } + + MOCK_METHOD1(selfmanaged_snap_remove, int(uint64_t snap_id)); + int do_selfmanaged_snap_remove(uint64_t snap_id) { + return TestMemIoCtxImpl::selfmanaged_snap_remove(snap_id); + } + + MOCK_METHOD3(write_full, int(const std::string& oid, + bufferlist& bl, + const SnapContext &snapc)); + int do_write_full(const std::string& oid, bufferlist& bl, + const SnapContext &snapc) { + return TestMemIoCtxImpl::write_full(oid, bl, snapc); + } + + void default_to_parent() { + using namespace ::testing; + + ON_CALL(*this, exec(_, _, _, _, _, _, _)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_exec)); + ON_CALL(*this, read(_, _, _, _)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_read)); + ON_CALL(*this, remove(_)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_remove)); + ON_CALL(*this, 
selfmanaged_snap_create(_)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_selfmanaged_snap_create)); + ON_CALL(*this, selfmanaged_snap_remove(_)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_selfmanaged_snap_remove)); + ON_CALL(*this, write_full(_, _, _)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_write_full)); + } + +private: + MockTestMemRadosClient *m_mock_client; + TestMemRadosClient *m_client; +}; + +} // namespace librados + +#endif // LIBRADOS_TEST_STUB_MOCK_TEST_MEM_IO_CTX_IMPL_H diff --git a/src/test/librados_test_stub/MockTestMemRadosClient.h b/src/test/librados_test_stub/MockTestMemRadosClient.h new file mode 100644 index 0000000000000..1d0b994afa629 --- /dev/null +++ b/src/test/librados_test_stub/MockTestMemRadosClient.h @@ -0,0 +1,36 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef LIBRADOS_TEST_STUB_MOCK_TEST_MEM_RADOS_CLIENT_H +#define LIBRADOS_TEST_STUB_MOCK_TEST_MEM_RADOS_CLIENT_H + +#include "test/librados_test_stub/TestMemRadosClient.h" +#include "test/librados_test_stub/MockTestMemIoCtxImpl.h" +#include "gmock/gmock.h" + +namespace librados { + +class MockTestMemRadosClient : public TestMemRadosClient { +public: + MockTestMemRadosClient(CephContext *cct) : TestMemRadosClient(cct) { + default_to_dispatch(); + } + + MOCK_METHOD2(create_ioctx, TestIoCtxImpl *(int64_t pool_id, + const std::string &pool_name)); + TestIoCtxImpl *do_create_ioctx(int64_t pool_id, + const std::string &pool_name) { + return new ::testing::NiceMock( + this, this, pool_id, pool_name, get_pool(pool_name)); + } + + void default_to_dispatch() { + using namespace ::testing; + + ON_CALL(*this, create_ioctx(_, _)).WillByDefault(Invoke(this, &MockTestMemRadosClient::do_create_ioctx)); + } +}; + +} // namespace librados + +#endif // LIBRADOS_TEST_STUB_MOCK_TEST_MEM_RADOS_CLIENT_H diff --git a/src/test/librados_test_stub/TestClassHandler.cc b/src/test/librados_test_stub/TestClassHandler.cc 
index 1ac29cd35694e..4f66e1effa8b6 100644 --- a/src/test/librados_test_stub/TestClassHandler.cc +++ b/src/test/librados_test_stub/TestClassHandler.cc @@ -2,10 +2,12 @@ // vim: ts=8 sw=2 smarttab #include "test/librados_test_stub/TestClassHandler.h" +#include "test/librados_test_stub/TestIoCtxImpl.h" #include #include #include #include "common/debug.h" +#include "include/assert.h" #define dout_subsys ceph_subsys_rados @@ -106,10 +108,16 @@ TestClassHandler::SharedMethodContext TestClassHandler::get_method_context( TestIoCtxImpl *io_ctx_impl, const std::string &oid, const SnapContext &snapc) { SharedMethodContext ctx(new MethodContext()); - ctx->io_ctx_impl = io_ctx_impl; + + // clone to ioctx to provide a firewall for gmock expectations + ctx->io_ctx_impl = io_ctx_impl->clone(); ctx->oid = oid; ctx->snapc = snapc; return ctx; } +TestClassHandler::MethodContext::~MethodContext() { + io_ctx_impl->put(); +} + } // namespace librados diff --git a/src/test/librados_test_stub/TestClassHandler.h b/src/test/librados_test_stub/TestClassHandler.h index 97062cec6917b..e25db273ae23c 100644 --- a/src/test/librados_test_stub/TestClassHandler.h +++ b/src/test/librados_test_stub/TestClassHandler.h @@ -23,6 +23,8 @@ class TestClassHandler { ~TestClassHandler(); struct MethodContext { + ~MethodContext(); + TestIoCtxImpl *io_ctx_impl; std::string oid; SnapContext snapc; diff --git a/src/test/librados_test_stub/TestIoCtxImpl.cc b/src/test/librados_test_stub/TestIoCtxImpl.cc index 1611a60fa21f4..e7582b46a4ccc 100644 --- a/src/test/librados_test_stub/TestIoCtxImpl.cc +++ b/src/test/librados_test_stub/TestIoCtxImpl.cc @@ -18,9 +18,9 @@ TestIoCtxImpl::TestIoCtxImpl() : m_client(NULL) { get(); } -TestIoCtxImpl::TestIoCtxImpl(TestRadosClient &client, int64_t pool_id, +TestIoCtxImpl::TestIoCtxImpl(TestRadosClient *client, int64_t pool_id, const std::string& pool_name) - : m_client(&client), m_pool_id(pool_id), m_pool_name(pool_name), + : m_client(client), m_pool_id(pool_id), 
m_pool_name(pool_name), m_snap_seq(CEPH_NOSNAP) { m_client->get(); @@ -113,17 +113,17 @@ int TestIoCtxImpl::aio_operate_read(const std::string& oid, return 0; } -int TestIoCtxImpl::exec(const std::string& oid, TestClassHandler &handler, +int TestIoCtxImpl::exec(const std::string& oid, TestClassHandler *handler, const char *cls, const char *method, bufferlist& inbl, bufferlist* outbl, const SnapContext &snapc) { - cls_method_cxx_call_t call = handler.get_method(cls, method); + cls_method_cxx_call_t call = handler->get_method(cls, method); if (call == NULL) { return -ENOSYS; } return (*call)(reinterpret_cast( - handler.get_method_context(this, oid, snapc).get()), &inbl, outbl); + handler->get_method_context(this, oid, snapc).get()), &inbl, outbl); } int TestIoCtxImpl::list_watchers(const std::string& o, diff --git a/src/test/librados_test_stub/TestIoCtxImpl.h b/src/test/librados_test_stub/TestIoCtxImpl.h index 3a71582388883..1cb6213ff9d8c 100644 --- a/src/test/librados_test_stub/TestIoCtxImpl.h +++ b/src/test/librados_test_stub/TestIoCtxImpl.h @@ -36,7 +36,7 @@ class TestIoCtxImpl { public: TestIoCtxImpl(); - explicit TestIoCtxImpl(TestRadosClient &client, int64_t m_pool_id, + explicit TestIoCtxImpl(TestRadosClient *client, int64_t m_pool_id, const std::string& pool_name); TestRadosClient *get_rados_client() { @@ -46,6 +46,10 @@ class TestIoCtxImpl { void get(); void put(); + inline int64_t get_pool_id() const { + return m_pool_id; + } + virtual TestIoCtxImpl *clone() = 0; virtual uint64_t get_instance_id() const; @@ -56,6 +60,9 @@ class TestIoCtxImpl { return m_snap_seq; } + inline void set_snap_context(const SnapContext& snapc) { + m_snapc = snapc; + } const SnapContext &get_snap_context() const { return m_snapc; } @@ -73,7 +80,7 @@ class TestIoCtxImpl { virtual int assert_exists(const std::string &oid) = 0; virtual int create(const std::string& oid, bool exclusive) = 0; - virtual int exec(const std::string& oid, TestClassHandler &handler, + virtual int exec(const 
std::string& oid, TestClassHandler *handler, const char *cls, const char *method, bufferlist& inbl, bufferlist* outbl, const SnapContext &snapc); diff --git a/src/test/librados_test_stub/TestMemIoCtxImpl.cc b/src/test/librados_test_stub/TestMemIoCtxImpl.cc index bb39b49d62db1..37fa6124e16d1 100644 --- a/src/test/librados_test_stub/TestMemIoCtxImpl.cc +++ b/src/test/librados_test_stub/TestMemIoCtxImpl.cc @@ -27,10 +27,10 @@ TestMemIoCtxImpl::TestMemIoCtxImpl(const TestMemIoCtxImpl& rhs) m_pool->get(); } -TestMemIoCtxImpl::TestMemIoCtxImpl(TestMemRadosClient &client, int64_t pool_id, +TestMemIoCtxImpl::TestMemIoCtxImpl(TestMemRadosClient *client, int64_t pool_id, const std::string& pool_name, TestMemRadosClient::Pool *pool) - : TestIoCtxImpl(client, pool_id, pool_name), m_client(&client), + : TestIoCtxImpl(client, pool_id, pool_name), m_client(client), m_pool(pool) { m_pool->get(); } diff --git a/src/test/librados_test_stub/TestMemIoCtxImpl.h b/src/test/librados_test_stub/TestMemIoCtxImpl.h index e58b97a3fecae..aa6541530b5de 100644 --- a/src/test/librados_test_stub/TestMemIoCtxImpl.h +++ b/src/test/librados_test_stub/TestMemIoCtxImpl.h @@ -12,7 +12,7 @@ namespace librados { class TestMemIoCtxImpl : public TestIoCtxImpl { public: TestMemIoCtxImpl(); - TestMemIoCtxImpl(TestMemRadosClient &client, int64_t m_pool_id, + TestMemIoCtxImpl(TestMemRadosClient *client, int64_t m_pool_id, const std::string& pool_name, TestMemRadosClient::Pool *pool); virtual ~TestMemIoCtxImpl(); @@ -56,6 +56,11 @@ class TestMemIoCtxImpl : public TestIoCtxImpl { bufferlist& bl); virtual int zero(const std::string& oid, uint64_t off, uint64_t len); +protected: + TestMemRadosClient::Pool *get_pool() { + return m_pool; + } + private: TestMemIoCtxImpl(const TestMemIoCtxImpl&); diff --git a/src/test/librados_test_stub/TestMemRadosClient.cc b/src/test/librados_test_stub/TestMemRadosClient.cc index b8bdf0777c31f..6492d25b76b3d 100644 --- a/src/test/librados_test_stub/TestMemRadosClient.cc +++ 
b/src/test/librados_test_stub/TestMemRadosClient.cc @@ -38,10 +38,7 @@ TestMemRadosClient::Pool::Pool() TestIoCtxImpl *TestMemRadosClient::create_ioctx(int64_t pool_id, const std::string &pool_name) { - Pools::iterator iter = m_pools.find(pool_name); - assert(iter != m_pools.end()); - - return new TestMemIoCtxImpl(*this, pool_id, pool_name, iter->second); + return new TestMemIoCtxImpl(this, pool_id, pool_name, get_pool(pool_name)); } void TestMemRadosClient::object_list(int64_t pool_id, @@ -125,4 +122,11 @@ int TestMemRadosClient::blacklist_add(const std::string& client_address, return 0; } +TestMemRadosClient::Pool *TestMemRadosClient::get_pool( + const std::string &pool_name) { + Pools::iterator iter = m_pools.find(pool_name); + assert(iter != m_pools.end()); + return iter->second; +} + } // namespace librados diff --git a/src/test/librados_test_stub/TestMemRadosClient.h b/src/test/librados_test_stub/TestMemRadosClient.h index e0afacb3a7054..dada74ec62a1a 100644 --- a/src/test/librados_test_stub/TestMemRadosClient.h +++ b/src/test/librados_test_stub/TestMemRadosClient.h @@ -87,6 +87,8 @@ class TestMemRadosClient : public TestRadosClient { protected: ~TestMemRadosClient(); + Pool *get_pool(const std::string &pool_name); + private: typedef std::map Pools; diff --git a/src/test/librados_test_stub/TestRadosClient.h b/src/test/librados_test_stub/TestRadosClient.h index a0611051cac52..e811bafaa16ad 100644 --- a/src/test/librados_test_stub/TestRadosClient.h +++ b/src/test/librados_test_stub/TestRadosClient.h @@ -25,6 +25,11 @@ class TestIoCtxImpl; class TestRadosClient { public: + static void Deallocate(librados::TestRadosClient* client) + { + client->put(); + } + typedef boost::function AioFunction; struct Object { From 1c522be10993282842b403260478bcc4adbc445f Mon Sep 17 00:00:00 2001 From: Nathan Cutler Date: Sat, 1 Aug 2015 09:30:59 +0200 Subject: [PATCH 506/654] ceph.spec.in: put distro conditional around Group: Neither RHEL and Fedora require a Group: line, so 
retain only for SLE/openSUSE (and set the right value for these distros) Signed-off-by: Nathan Cutler --- ceph.spec.in | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ceph.spec.in b/ceph.spec.in index b976e2441c00b..18f2481c1d565 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -47,7 +47,9 @@ Release: @RPM_RELEASE@%{?dist} Epoch: 1 Summary: User space components of the Ceph file system License: LGPL-2.1 and CC-BY-SA-1.0 and GPL-2.0 and BSL-1.0 and GPL-2.0-with-autoconf-exception and BSD-3-Clause and MIT -Group: System Environment/Base +%if 0%{?suse_version} +Group: System/Filesystems +%endif URL: http://ceph.com/ Source0: http://ceph.com/download/%{name}-%{version}.tar.bz2 %if 0%{?fedora} || 0%{?rhel} From e8749b2a7f5a60dbfa03518315a1f1caa4039f59 Mon Sep 17 00:00:00 2001 From: Jason Dillaman Date: Fri, 4 Sep 2015 14:59:09 -0400 Subject: [PATCH 507/654] librbd: support templating of ImageCtx for async state machines This will facilitate the creation of unit test mocks to verify non-librados actions. The templates (by default) will be flagged as extern to avoid the translation unit bloat. 
Signed-off-by: Jason Dillaman --- src/librbd/AsyncFlattenRequest.cc | 12 +++++----- src/librbd/AsyncFlattenRequest.h | 2 +- src/librbd/AsyncObjectThrottle.cc | 32 +++++++++++++-------------- src/librbd/AsyncObjectThrottle.h | 31 +++++++++++++++++--------- src/librbd/AsyncRequest.cc | 26 +++++++++++++++------- src/librbd/AsyncRequest.h | 29 +++++------------------- src/librbd/AsyncResizeRequest.h | 2 +- src/librbd/AsyncTrimRequest.cc | 16 +++++++------- src/librbd/AsyncTrimRequest.h | 2 +- src/librbd/CopyupRequest.cc | 8 +++---- src/librbd/ImageCtx.cc | 2 +- src/librbd/ImageCtx.h | 4 ++-- src/librbd/ObjectMap.h | 2 +- src/librbd/RebuildObjectMapRequest.cc | 8 +++---- src/librbd/RebuildObjectMapRequest.h | 2 +- 15 files changed, 90 insertions(+), 88 deletions(-) diff --git a/src/librbd/AsyncFlattenRequest.cc b/src/librbd/AsyncFlattenRequest.cc index 7555b179d5113..9136220e57e76 100644 --- a/src/librbd/AsyncFlattenRequest.cc +++ b/src/librbd/AsyncFlattenRequest.cc @@ -18,11 +18,11 @@ namespace librbd { -class AsyncFlattenObjectContext : public C_AsyncObjectThrottle { +class AsyncFlattenObjectContext : public C_AsyncObjectThrottle<> { public: - AsyncFlattenObjectContext(AsyncObjectThrottle &throttle, ImageCtx *image_ctx, - uint64_t object_size, ::SnapContext snapc, - uint64_t object_no) + AsyncFlattenObjectContext(AsyncObjectThrottle<> &throttle, + ImageCtx *image_ctx, uint64_t object_size, + ::SnapContext snapc, uint64_t object_no) : C_AsyncObjectThrottle(throttle, *image_ctx), m_object_size(object_size), m_snapc(snapc), m_object_no(object_no) { @@ -94,11 +94,11 @@ void AsyncFlattenRequest::send() { ldout(cct, 5) << this << " send" << dendl; m_state = STATE_FLATTEN_OBJECTS; - AsyncObjectThrottle::ContextFactory context_factory( + AsyncObjectThrottle<>::ContextFactory context_factory( boost::lambda::bind(boost::lambda::new_ptr(), boost::lambda::_1, &m_image_ctx, m_object_size, m_snapc, boost::lambda::_2)); - AsyncObjectThrottle *throttle = new AsyncObjectThrottle( + 
AsyncObjectThrottle<> *throttle = new AsyncObjectThrottle<>( this, m_image_ctx, context_factory, create_callback_context(), &m_prog_ctx, 0, m_overlap_objects); throttle->start_ops(m_image_ctx.concurrent_management_ops); diff --git a/src/librbd/AsyncFlattenRequest.h b/src/librbd/AsyncFlattenRequest.h index c2af903ef4da4..01f16671c5269 100644 --- a/src/librbd/AsyncFlattenRequest.h +++ b/src/librbd/AsyncFlattenRequest.h @@ -12,7 +12,7 @@ namespace librbd { class ImageCtx; class ProgressContext; -class AsyncFlattenRequest : public AsyncRequest +class AsyncFlattenRequest : public AsyncRequest<> { public: AsyncFlattenRequest(ImageCtx &image_ctx, Context *on_finish, diff --git a/src/librbd/AsyncObjectThrottle.cc b/src/librbd/AsyncObjectThrottle.cc index e6eb4aba96259..59b3a1f6021cd 100644 --- a/src/librbd/AsyncObjectThrottle.cc +++ b/src/librbd/AsyncObjectThrottle.cc @@ -3,6 +3,7 @@ #include "librbd/AsyncObjectThrottle.h" #include "include/rbd/librbd.hpp" #include "common/RWLock.h" +#include "common/WorkQueue.h" #include "librbd/AsyncRequest.h" #include "librbd/ImageCtx.h" #include "librbd/internal.h" @@ -10,17 +11,11 @@ namespace librbd { -void C_AsyncObjectThrottle::finish(int r) { - RWLock::RLocker l(m_image_ctx.owner_lock); - m_finisher.finish_op(r); -} - -AsyncObjectThrottle::AsyncObjectThrottle(const AsyncRequest* async_request, - ImageCtx &image_ctx, - const ContextFactory& context_factory, - Context *ctx, ProgressContext *prog_ctx, - uint64_t object_no, - uint64_t end_object_no) +template +AsyncObjectThrottle::AsyncObjectThrottle( + const AsyncRequest* async_request, T &image_ctx, + const ContextFactory& context_factory, Context *ctx, + ProgressContext *prog_ctx, uint64_t object_no, uint64_t end_object_no) : m_lock(unique_lock_name("librbd::AsyncThrottle::m_lock", this)), m_async_request(async_request), m_image_ctx(image_ctx), m_context_factory(context_factory), m_ctx(ctx), m_prog_ctx(prog_ctx), @@ -29,7 +24,8 @@ AsyncObjectThrottle::AsyncObjectThrottle(const 
AsyncRequest* async_request, { } -void AsyncObjectThrottle::start_ops(uint64_t max_concurrent) { +template +void AsyncObjectThrottle::start_ops(uint64_t max_concurrent) { assert(m_image_ctx.owner_lock.is_locked()); bool complete; { @@ -48,11 +44,12 @@ void AsyncObjectThrottle::start_ops(uint64_t max_concurrent) { } } -void AsyncObjectThrottle::finish_op(int r) { +template +void AsyncObjectThrottle::finish_op(int r) { assert(m_image_ctx.owner_lock.is_locked()); bool complete; { - Mutex::Locker l(m_lock); + Mutex::Locker locker(m_lock); --m_current_ops; if (r < 0 && r != -ENOENT && m_ret == 0) { m_ret = r; @@ -67,7 +64,8 @@ void AsyncObjectThrottle::finish_op(int r) { } } -void AsyncObjectThrottle::start_next_op() { +template +void AsyncObjectThrottle::start_next_op() { bool done = false; while (!done) { if (m_async_request != NULL && m_async_request->is_canceled() && @@ -80,7 +78,7 @@ void AsyncObjectThrottle::start_next_op() { } uint64_t ono = m_object_no++; - C_AsyncObjectThrottle *ctx = m_context_factory(*this, ono); + C_AsyncObjectThrottle *ctx = m_context_factory(*this, ono); int r = ctx->send(); if (r < 0) { @@ -101,3 +99,5 @@ void AsyncObjectThrottle::start_next_op() { } } // namespace librbd + +template class librbd::AsyncObjectThrottle; diff --git a/src/librbd/AsyncObjectThrottle.h b/src/librbd/AsyncObjectThrottle.h index 222baf00d1bdd..a83105145a49e 100644 --- a/src/librbd/AsyncObjectThrottle.h +++ b/src/librbd/AsyncObjectThrottle.h @@ -5,13 +5,14 @@ #include "include/int_types.h" #include "include/Context.h" +#include "common/RWLock.h" #include #include "include/assert.h" namespace librbd { -class AsyncRequest; +template class AsyncRequest; class ProgressContext; struct ImageCtx; @@ -21,31 +22,37 @@ class AsyncObjectThrottleFinisher { virtual void finish_op(int r) = 0; }; +template class C_AsyncObjectThrottle : public Context { public: C_AsyncObjectThrottle(AsyncObjectThrottleFinisher &finisher, - ImageCtx &image_ctx) - : m_image_ctx(image_ctx), 
m_finisher(finisher) - { + ImageCtxT &image_ctx) + : m_image_ctx(image_ctx), m_finisher(finisher) { } virtual int send() = 0; protected: - ImageCtx &m_image_ctx; + ImageCtxT &m_image_ctx; - virtual void finish(int r); + virtual void finish(int r) { + RWLock::RLocker locker(m_image_ctx.owner_lock); + m_finisher.finish_op(r); + } private: AsyncObjectThrottleFinisher &m_finisher; }; +template class AsyncObjectThrottle : public AsyncObjectThrottleFinisher { public: - typedef boost::function ContextFactory; + typedef boost::function< + C_AsyncObjectThrottle* (AsyncObjectThrottle&, + uint64_t)> ContextFactory; - AsyncObjectThrottle(const AsyncRequest *async_request, ImageCtx &image_ctx, + AsyncObjectThrottle(const AsyncRequest *async_request, + ImageCtxT &image_ctx, const ContextFactory& context_factory, Context *ctx, ProgressContext *prog_ctx, uint64_t object_no, uint64_t end_object_no); @@ -55,8 +62,8 @@ class AsyncObjectThrottle : public AsyncObjectThrottleFinisher { private: Mutex m_lock; - const AsyncRequest *m_async_request; - ImageCtx &m_image_ctx; + const AsyncRequest *m_async_request; + ImageCtxT &m_image_ctx; ContextFactory m_context_factory; Context *m_ctx; ProgressContext *m_prog_ctx; @@ -70,4 +77,6 @@ class AsyncObjectThrottle : public AsyncObjectThrottleFinisher { } // namespace librbd +extern template class librbd::AsyncObjectThrottle; + #endif // CEPH_LIBRBD_ASYNC_OBJECT_THROTTLE_H diff --git a/src/librbd/AsyncRequest.cc b/src/librbd/AsyncRequest.cc index 4fcb7afb09476..b6e41eb175ad1 100644 --- a/src/librbd/AsyncRequest.cc +++ b/src/librbd/AsyncRequest.cc @@ -3,40 +3,50 @@ #include "librbd/AsyncRequest.h" #include "librbd/ImageCtx.h" #include "librbd/internal.h" +#include "common/WorkQueue.h" #include namespace librbd { -AsyncRequest::AsyncRequest(ImageCtx &image_ctx, Context *on_finish) +template +AsyncRequest::AsyncRequest(T &image_ctx, Context *on_finish) : m_image_ctx(image_ctx), m_on_finish(on_finish), m_canceled(false), m_xlist_item(this) { + 
assert(m_on_finish != NULL); Mutex::Locker l(m_image_ctx.async_ops_lock); m_image_ctx.async_requests.push_back(&m_xlist_item); } -AsyncRequest::~AsyncRequest() { +template +AsyncRequest::~AsyncRequest() { Mutex::Locker l(m_image_ctx.async_ops_lock); assert(m_xlist_item.remove_myself()); m_image_ctx.async_requests_cond.Signal(); } -void AsyncRequest::async_complete(int r) { +template +void AsyncRequest::async_complete(int r) { m_image_ctx.op_work_queue->queue(create_callback_context(), r); } -librados::AioCompletion *AsyncRequest::create_callback_completion() { +template +librados::AioCompletion *AsyncRequest::create_callback_completion() { return librados::Rados::aio_create_completion(create_callback_context(), NULL, rados_ctx_cb); } -Context *AsyncRequest::create_callback_context() { - return new FunctionContext(boost::bind(&AsyncRequest::complete, this, _1)); +template +Context *AsyncRequest::create_callback_context() { + return new FunctionContext(boost::bind(&AsyncRequest::complete, this, _1)); } -Context *AsyncRequest::create_async_callback_context() { - return new FunctionContext(boost::bind(&AsyncRequest::async_complete, this, +template +Context *AsyncRequest::create_async_callback_context() { + return new FunctionContext(boost::bind(&AsyncRequest::async_complete, this, _1));; } } // namespace librbd + +template class librbd::AsyncRequest; diff --git a/src/librbd/AsyncRequest.h b/src/librbd/AsyncRequest.h index c0a60131106bc..4830349580193 100644 --- a/src/librbd/AsyncRequest.h +++ b/src/librbd/AsyncRequest.h @@ -8,17 +8,15 @@ #include "include/rados/librados.hpp" #include "include/xlist.h" -/* DARWIN Missing ERESTART */ -#include "include/compat.h" - namespace librbd { class ImageCtx; +template class AsyncRequest { public: - AsyncRequest(ImageCtx &image_ctx, Context *on_finish); + AsyncRequest(ImageCtxT &image_ctx, Context *on_finish); virtual ~AsyncRequest(); void complete(int r) { @@ -41,7 +39,7 @@ class AsyncRequest } protected: - ImageCtx &m_image_ctx; 
+ ImageCtxT &m_image_ctx; Context *m_on_finish; librados::AioCompletion *create_callback_completion(); @@ -59,26 +57,11 @@ class AsyncRequest } private: bool m_canceled; - xlist::item m_xlist_item; -}; - -class C_AsyncRequest : public Context -{ -public: - C_AsyncRequest(AsyncRequest *req) - : m_req(req) - { - } - -protected: - virtual void finish(int r) { - m_req->complete(r); - } - -private: - AsyncRequest *m_req; + typename xlist *>::item m_xlist_item; }; } // namespace librbd +extern template class librbd::AsyncRequest; + #endif //CEPH_LIBRBD_ASYNC_REQUEST_H diff --git a/src/librbd/AsyncResizeRequest.h b/src/librbd/AsyncResizeRequest.h index c13677c51ceb3..0acad6fb9504b 100644 --- a/src/librbd/AsyncResizeRequest.h +++ b/src/librbd/AsyncResizeRequest.h @@ -12,7 +12,7 @@ namespace librbd class ImageCtx; class ProgressContext; -class AsyncResizeRequest : public AsyncRequest +class AsyncResizeRequest : public AsyncRequest<> { public: AsyncResizeRequest(ImageCtx &image_ctx, Context *on_finish, uint64_t new_size, diff --git a/src/librbd/AsyncTrimRequest.cc b/src/librbd/AsyncTrimRequest.cc index 801cf8959820d..90668ce43123a 100644 --- a/src/librbd/AsyncTrimRequest.cc +++ b/src/librbd/AsyncTrimRequest.cc @@ -24,9 +24,9 @@ namespace librbd { -class C_CopyupObject : public C_AsyncObjectThrottle { +class C_CopyupObject : public C_AsyncObjectThrottle<> { public: - C_CopyupObject(AsyncObjectThrottle &throttle, ImageCtx *image_ctx, + C_CopyupObject(AsyncObjectThrottle<> &throttle, ImageCtx *image_ctx, ::SnapContext snapc, uint64_t object_no) : C_AsyncObjectThrottle(throttle, *image_ctx), m_snapc(snapc), m_object_no(object_no) @@ -51,9 +51,9 @@ class C_CopyupObject : public C_AsyncObjectThrottle { uint64_t m_object_no; }; -class C_RemoveObject : public C_AsyncObjectThrottle { +class C_RemoveObject : public C_AsyncObjectThrottle<> { public: - C_RemoveObject(AsyncObjectThrottle &throttle, ImageCtx *image_ctx, + C_RemoveObject(AsyncObjectThrottle<> &throttle, ImageCtx 
*image_ctx, uint64_t object_no) : C_AsyncObjectThrottle(throttle, *image_ctx), m_object_no(object_no) { @@ -203,10 +203,10 @@ void AsyncTrimRequest::send_copyup_objects() { m_state = STATE_COPYUP_OBJECTS; Context *ctx = create_callback_context(); - AsyncObjectThrottle::ContextFactory context_factory( + AsyncObjectThrottle<>::ContextFactory context_factory( boost::lambda::bind(boost::lambda::new_ptr(), boost::lambda::_1, &m_image_ctx, snapc, boost::lambda::_2)); - AsyncObjectThrottle *throttle = new AsyncObjectThrottle( + AsyncObjectThrottle<> *throttle = new AsyncObjectThrottle<>( this, m_image_ctx, context_factory, ctx, &m_prog_ctx, copyup_start, copyup_end); throttle->start_ops(m_image_ctx.concurrent_management_ops); @@ -221,10 +221,10 @@ void AsyncTrimRequest::send_remove_objects() { m_state = STATE_REMOVE_OBJECTS; Context *ctx = create_callback_context(); - AsyncObjectThrottle::ContextFactory context_factory( + AsyncObjectThrottle<>::ContextFactory context_factory( boost::lambda::bind(boost::lambda::new_ptr(), boost::lambda::_1, &m_image_ctx, boost::lambda::_2)); - AsyncObjectThrottle *throttle = new AsyncObjectThrottle( + AsyncObjectThrottle<> *throttle = new AsyncObjectThrottle<>( this, m_image_ctx, context_factory, ctx, &m_prog_ctx, m_delete_start, m_num_objects); throttle->start_ops(m_image_ctx.concurrent_management_ops); diff --git a/src/librbd/AsyncTrimRequest.h b/src/librbd/AsyncTrimRequest.h index cf69831993f0e..2160c405a4ffa 100644 --- a/src/librbd/AsyncTrimRequest.h +++ b/src/librbd/AsyncTrimRequest.h @@ -11,7 +11,7 @@ namespace librbd class ImageCtx; class ProgressContext; -class AsyncTrimRequest : public AsyncRequest +class AsyncTrimRequest : public AsyncRequest<> { public: AsyncTrimRequest(ImageCtx &image_ctx, Context *on_finish, diff --git a/src/librbd/CopyupRequest.cc b/src/librbd/CopyupRequest.cc index 2580af1f42a5a..667d19d89c4ca 100644 --- a/src/librbd/CopyupRequest.cc +++ b/src/librbd/CopyupRequest.cc @@ -26,9 +26,9 @@ namespace librbd { 
namespace { -class UpdateObjectMap : public C_AsyncObjectThrottle { +class UpdateObjectMap : public C_AsyncObjectThrottle<> { public: - UpdateObjectMap(AsyncObjectThrottle &throttle, ImageCtx *image_ctx, + UpdateObjectMap(AsyncObjectThrottle<> &throttle, ImageCtx *image_ctx, uint64_t object_no, const std::vector *snap_ids, size_t snap_id_idx) : C_AsyncObjectThrottle(throttle, *image_ctx), @@ -301,11 +301,11 @@ class UpdateObjectMap : public C_AsyncObjectThrottle { m_state = STATE_OBJECT_MAP; RWLock::RLocker owner_locker(m_ictx->owner_lock); - AsyncObjectThrottle::ContextFactory context_factory( + AsyncObjectThrottle<>::ContextFactory context_factory( boost::lambda::bind(boost::lambda::new_ptr(), boost::lambda::_1, m_ictx, m_object_no, &m_snap_ids, boost::lambda::_2)); - AsyncObjectThrottle *throttle = new AsyncObjectThrottle( + AsyncObjectThrottle<> *throttle = new AsyncObjectThrottle<>( NULL, *m_ictx, context_factory, create_callback_context(), NULL, 0, m_snap_ids.size()); throttle->start_ops(m_ictx->concurrent_management_ops); diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc index 3e860322c991a..bf161498f4d92 100644 --- a/src/librbd/ImageCtx.cc +++ b/src/librbd/ImageCtx.cc @@ -798,7 +798,7 @@ class ThreadPoolSingleton : public ThreadPool { ldout(cct, 10) << "canceling async requests: count=" << async_requests.size() << dendl; - for (xlist::iterator it = async_requests.begin(); + for (xlist*>::iterator it = async_requests.begin(); !it.end(); ++it) { ldout(cct, 10) << "canceling async request: " << *it << dendl; (*it)->cancel(); diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h index 16634f6cea61e..3c7f170b58439 100644 --- a/src/librbd/ImageCtx.h +++ b/src/librbd/ImageCtx.h @@ -38,7 +38,7 @@ class PerfCounters; namespace librbd { class AsyncOperation; - class AsyncRequest; + template class AsyncRequest; class AsyncResizeRequest; class CopyupRequest; class ImageWatcher; @@ -122,7 +122,7 @@ namespace librbd { std::map copyup_list; xlist 
async_ops; - xlist async_requests; + xlist*> async_requests; Cond async_requests_cond; ObjectMap object_map; diff --git a/src/librbd/ObjectMap.h b/src/librbd/ObjectMap.h index abcf7372d3262..115023306944f 100644 --- a/src/librbd/ObjectMap.h +++ b/src/librbd/ObjectMap.h @@ -60,7 +60,7 @@ class ObjectMap { private: - class Request : public AsyncRequest { + class Request : public AsyncRequest<> { public: Request(ImageCtx &image_ctx, uint64_t snap_id, Context *on_finish) : AsyncRequest(image_ctx, on_finish), m_snap_id(snap_id), diff --git a/src/librbd/RebuildObjectMapRequest.cc b/src/librbd/RebuildObjectMapRequest.cc index f726e6647fbd2..0a7950a8fb1d9 100644 --- a/src/librbd/RebuildObjectMapRequest.cc +++ b/src/librbd/RebuildObjectMapRequest.cc @@ -22,9 +22,9 @@ namespace librbd { namespace { -class C_VerifyObject : public C_AsyncObjectThrottle { +class C_VerifyObject : public C_AsyncObjectThrottle<> { public: - C_VerifyObject(AsyncObjectThrottle &throttle, ImageCtx *image_ctx, + C_VerifyObject(AsyncObjectThrottle<> &throttle, ImageCtx *image_ctx, uint64_t snap_id, uint64_t object_no) : C_AsyncObjectThrottle(throttle, *image_ctx), m_snap_id(snap_id), m_object_no(object_no), m_oid(m_image_ctx.get_object_name(m_object_no)) @@ -296,10 +296,10 @@ void RebuildObjectMapRequest::send_verify_objects() { m_state = STATE_VERIFY_OBJECTS; ldout(cct, 5) << this << " send_verify_objects" << dendl; - AsyncObjectThrottle::ContextFactory context_factory( + AsyncObjectThrottle<>::ContextFactory context_factory( boost::lambda::bind(boost::lambda::new_ptr(), boost::lambda::_1, &m_image_ctx, snap_id, boost::lambda::_2)); - AsyncObjectThrottle *throttle = new AsyncObjectThrottle( + AsyncObjectThrottle<> *throttle = new AsyncObjectThrottle<>( this, m_image_ctx, context_factory, create_callback_context(), &m_prog_ctx, 0, num_objects); throttle->start_ops(cct->_conf->rbd_concurrent_management_ops); diff --git a/src/librbd/RebuildObjectMapRequest.h b/src/librbd/RebuildObjectMapRequest.h index 
a7708ad71cafd..02a41ef568ca0 100644 --- a/src/librbd/RebuildObjectMapRequest.h +++ b/src/librbd/RebuildObjectMapRequest.h @@ -11,7 +11,7 @@ namespace librbd { class ImageCtx; class ProgressContext; -class RebuildObjectMapRequest : public AsyncRequest { +class RebuildObjectMapRequest : public AsyncRequest<> { public: RebuildObjectMapRequest(ImageCtx &image_ctx, Context *on_finish, From 404dd16d4b1930ebb3e918221d0bb751f5ffdce7 Mon Sep 17 00:00:00 2001 From: Jason Dillaman Date: Fri, 4 Sep 2015 15:01:38 -0400 Subject: [PATCH 508/654] tests: base gmock class support for librbd Created mock classes to represent a few central librbd classes and a basic gmock test fixture for future gmock-based unit tests. Signed-off-by: Jason Dillaman --- src/test/Makefile-client.am | 12 ++- src/test/Makefile.am | 2 - src/test/librbd/mock/MockContextWQ.h | 17 ++++ src/test/librbd/mock/MockImageCtx.h | 112 ++++++++++++++++++++++++ src/test/librbd/mock/MockImageWatcher.h | 19 ++++ src/test/librbd/mock/MockObjectMap.h | 20 +++++ src/test/librbd/test_mock_fixture.cc | 68 ++++++++++++++ src/test/librbd/test_mock_fixture.h | 64 ++++++++++++++ 8 files changed, 311 insertions(+), 3 deletions(-) create mode 100644 src/test/librbd/mock/MockContextWQ.h create mode 100644 src/test/librbd/mock/MockImageCtx.h create mode 100644 src/test/librbd/mock/MockImageWatcher.h create mode 100644 src/test/librbd/mock/MockObjectMap.h create mode 100644 src/test/librbd/test_mock_fixture.cc create mode 100644 src/test/librbd/test_mock_fixture.h diff --git a/src/test/Makefile-client.am b/src/test/Makefile-client.am index 0ea7b80ad3a3f..49e17f6e61d6f 100644 --- a/src/test/Makefile-client.am +++ b/src/test/Makefile-client.am @@ -321,7 +321,8 @@ librbd_test_la_CXXFLAGS = $(UNITTEST_CXXFLAGS) noinst_LTLIBRARIES += librbd_test.la unittest_librbd_SOURCES = \ - test/librbd/test_main.cc + test/librbd/test_main.cc \ + test/librbd/test_mock_fixture.cc unittest_librbd_CXXFLAGS = $(UNITTEST_CXXFLAGS) -DTEST_LIBRBD_INTERNALS 
unittest_librbd_LDADD = \ librbd_test.la librbd_api.la librbd_internal.la $(LIBRBD_TYPES) \ @@ -351,6 +352,15 @@ ceph_test_librbd_api_LDADD = \ $(LIBRBD) $(LIBRADOS) $(LIBCOMMON) $(UNITTEST_LDADD) $(RADOS_TEST_LDADD) bin_DEBUGPROGRAMS += ceph_test_librbd_api +noinst_HEADERS += \ + test/librbd/test_fixture.h \ + test/librbd/test_mock_fixture.h \ + test/librbd/test_support.h \ + test/librbd/mock/MockContextWQ.h \ + test/librbd/mock/MockImageCtx.h \ + test/librbd/mock/MockImageWatcher.h \ + test/librbd/mock/MockObjectMap.h + if WITH_LTTNG unittest_librbd_LDADD += $(LIBRBD_TP) ceph_test_librbd_LDADD += $(LIBRBD_TP) diff --git a/src/test/Makefile.am b/src/test/Makefile.am index f6373c76d6d56..73c2da1a15f4d 100644 --- a/src/test/Makefile.am +++ b/src/test/Makefile.am @@ -449,8 +449,6 @@ noinst_HEADERS += \ test/librados/test.h \ test/librados/TestCase.h \ test/libradosstriper/TestCase.h \ - test/librbd/test_fixture.h \ - test/librbd/test_support.h \ test/ObjectMap/KeyValueDBMemory.h \ test/omap_bench.h \ test/osdc/FakeWriteback.h \ diff --git a/src/test/librbd/mock/MockContextWQ.h b/src/test/librbd/mock/MockContextWQ.h new file mode 100644 index 0000000000000..a690d4eeb82c6 --- /dev/null +++ b/src/test/librbd/mock/MockContextWQ.h @@ -0,0 +1,17 @@ +// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_TEST_LIBRBD_MOCK_CONTEXT_WQ_H +#define CEPH_TEST_LIBRBD_MOCK_CONTEXT_WQ_H + +#include "gmock/gmock.h" + +namespace librbd { + +struct MockContextWQ { + MOCK_METHOD2(queue, void(Context *, int r)); +}; + +} // namespace librbd + +#endif // CEPH_TEST_LIBRBD_MOCK_CONTEXT_WQ_H diff --git a/src/test/librbd/mock/MockImageCtx.h b/src/test/librbd/mock/MockImageCtx.h new file mode 100644 index 0000000000000..53d6fd0d01e1d --- /dev/null +++ b/src/test/librbd/mock/MockImageCtx.h @@ -0,0 +1,112 @@ +// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef 
CEPH_TEST_LIBRBD_MOCK_IMAGE_CTX_H +#define CEPH_TEST_LIBRBD_MOCK_IMAGE_CTX_H + +#include "test/librbd/mock/MockContextWQ.h" +#include "test/librbd/mock/MockImageWatcher.h" +#include "test/librbd/mock/MockObjectMap.h" +#include "common/RWLock.h" +#include "librbd/ImageCtx.h" +#include "gmock/gmock.h" + +namespace librbd { + +struct MockImageCtx { + MockImageCtx(librbd::ImageCtx &image_ctx) + : image_ctx(&image_ctx), + cct(image_ctx.cct), + snapc(image_ctx.snapc), + snaps(image_ctx.snaps), + snap_info(image_ctx.snap_info), + old_format(image_ctx.old_format), + read_only(image_ctx.read_only), + owner_lock("owner_lock"), + md_lock("md_lock"), + snap_lock("snap_lock"), + parent_lock("parent_lock"), + object_map_lock("object_map_lock"), + async_ops_lock("async_ops_lock"), + size(image_ctx.size), + features(image_ctx.features), + header_oid(image_ctx.header_oid), + id(image_ctx.id), + parent_md(image_ctx.parent_md), + aio_work_queue(new MockContextWQ()), + op_work_queue(new MockContextWQ()), + image_watcher(NULL), + concurrent_management_ops(image_ctx.concurrent_management_ops) + { + md_ctx.dup(image_ctx.md_ctx); + data_ctx.dup(image_ctx.data_ctx); + + if (image_ctx.image_watcher != NULL) { + image_watcher = new MockImageWatcher(); + } + } + + ~MockImageCtx() { + delete image_watcher; + delete op_work_queue; + delete aio_work_queue; + } + + MOCK_CONST_METHOD1(get_snap_id, librados::snap_t(std::string in_snap_name)); + MOCK_CONST_METHOD1(get_snap_info, const SnapInfo*(librados::snap_t)); + MOCK_CONST_METHOD2(get_parent_spec, int(librados::snap_t in_snap_id, + parent_spec *pspec)); + + MOCK_CONST_METHOD2(is_snap_protected, int(librados::snap_t in_snap_id, + bool *is_protected)); + MOCK_CONST_METHOD2(is_snap_unprotected, int(librados::snap_t in_snap_id, + bool *is_unprotected)); + + MOCK_METHOD6(add_snap, void(std::string in_snap_name, librados::snap_t id, + uint64_t in_size, parent_info parent, + uint8_t protection_status, uint64_t flags)); + MOCK_METHOD2(rm_snap, 
void(std::string in_snap_name, librados::snap_t id)); + MOCK_METHOD1(flush, void(Context *)); + + ImageCtx *image_ctx; + CephContext *cct; + + ::SnapContext snapc; + std::vector snaps; + std::map snap_info; + + + bool old_format; + bool read_only; + + librados::IoCtx md_ctx; + librados::IoCtx data_ctx; + + RWLock owner_lock; + RWLock md_lock; + RWLock snap_lock; + RWLock parent_lock; + RWLock object_map_lock; + Mutex async_ops_lock; + + uint64_t size; + uint64_t features; + std::string header_oid; + std::string id; + parent_info parent_md; + + xlist*> async_requests; + Cond async_requests_cond; + + MockContextWQ *aio_work_queue; + MockContextWQ *op_work_queue; + + MockImageWatcher *image_watcher; + MockObjectMap object_map; + + int concurrent_management_ops; +}; + +} // namespace librbd + +#endif // CEPH_TEST_LIBRBD_MOCK_IMAGE_CTX_H diff --git a/src/test/librbd/mock/MockImageWatcher.h b/src/test/librbd/mock/MockImageWatcher.h new file mode 100644 index 0000000000000..1c339bceb61dc --- /dev/null +++ b/src/test/librbd/mock/MockImageWatcher.h @@ -0,0 +1,19 @@ +// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_TEST_LIBRBD_MOCK_IMAGE_WATCHER_H +#define CEPH_TEST_LIBRBD_MOCK_IMAGE_WATCHER_H + +#include "gmock/gmock.h" + +namespace librbd { + +struct MockImageWatcher { + MOCK_CONST_METHOD0(is_lock_owner, bool()); + MOCK_CONST_METHOD1(is_lock_supported, bool(const RWLock &)); + MOCK_METHOD1(assert_header_locked, void (librados::ObjectWriteOperation *)); +}; + +} // namespace librbd + +#endif // CEPH_TEST_LIBRBD_MOCK_IMAGE_WATCHER_H diff --git a/src/test/librbd/mock/MockObjectMap.h b/src/test/librbd/mock/MockObjectMap.h new file mode 100644 index 0000000000000..7f2f84bbde7e8 --- /dev/null +++ b/src/test/librbd/mock/MockObjectMap.h @@ -0,0 +1,20 @@ +// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_TEST_LIBRBD_MOCK_OBJECT_MAP_H +#define 
CEPH_TEST_LIBRBD_MOCK_OBJECT_MAP_H + +#include "gmock/gmock.h" + +namespace librbd { + +struct MockObjectMap { + MOCK_CONST_METHOD1(enabled, bool(const RWLock &object_map_lock)); + + MOCK_METHOD2(snapshot_add, void(uint64_t snap_id, Context *on_finish)); + MOCK_METHOD2(snapshot_remove, void(uint64_t snap_id, Context *on_finish)); +}; + +} // namespace librbd + +#endif // CEPH_TEST_LIBRBD_MOCK_OBJECT_MAP_H diff --git a/src/test/librbd/test_mock_fixture.cc b/src/test/librbd/test_mock_fixture.cc new file mode 100644 index 0000000000000..1839b9150eca6 --- /dev/null +++ b/src/test/librbd/test_mock_fixture.cc @@ -0,0 +1,68 @@ +// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "test/librbd/test_mock_fixture.h" +#include "test/librbd/mock/MockImageCtx.h" +#include "test/librados_test_stub/LibradosTestStub.h" +#include "test/librados_test_stub/MockTestMemRadosClient.h" + +// template definitions +#include "librbd/AsyncRequest.cc" +#include "librbd/AsyncObjectThrottle.cc" + +template class librbd::AsyncRequest; +template class librbd::AsyncObjectThrottle; + +using ::testing::_; +using ::testing::DoDefault; + +TestMockFixture::TestRadosClientPtr TestMockFixture::s_test_rados_client; +::testing::NiceMock * + TestMockFixture::s_mock_rados_client = NULL; + +void TestMockFixture::SetUpTestCase() { + s_test_rados_client = librados_test_stub::get_rados_client(); + + // use a mock version of the in-memory rados client + s_mock_rados_client = new ::testing::NiceMock( + s_test_rados_client->cct()); + librados_test_stub::set_rados_client(TestRadosClientPtr(s_mock_rados_client)); + TestFixture::SetUpTestCase(); +} + +void TestMockFixture::TearDownTestCase() { + TestFixture::TearDownTestCase(); + librados_test_stub::set_rados_client(s_test_rados_client); +} + +void TestMockFixture::SetUp() { + TestFixture::SetUp(); +} + +void TestMockFixture::TearDown() { + TestFixture::TearDown(); + + // Mock rados client lives across tests -- 
reset it to initial state + ::testing::Mock::VerifyAndClear(s_mock_rados_client); + s_mock_rados_client->default_to_dispatch(); +} + +void TestMockFixture::expect_unlock_exclusive_lock(librbd::ImageCtx &ictx) { + EXPECT_CALL(get_mock_io_ctx(ictx.md_ctx), + exec(_, _, "lock", "unlock", _, _, _)) + .WillRepeatedly(DoDefault()); +} + +void TestMockFixture::expect_op_work_queue(librbd::MockImageCtx &mock_image_ctx) { + EXPECT_CALL(*mock_image_ctx.op_work_queue, queue(_, _)) + .WillRepeatedly(DispatchContext( + mock_image_ctx.image_ctx->op_work_queue)); +} + +librados::MockTestMemIoCtxImpl &TestMockFixture::get_mock_io_ctx( + librados::IoCtx &ioctx) { + // TODO become friend of IoCtx so that we can cleanly extract io_ctx_impl + librados::MockTestMemIoCtxImpl **mock = + reinterpret_cast(&ioctx); + return **mock; +} diff --git a/src/test/librbd/test_mock_fixture.h b/src/test/librbd/test_mock_fixture.h new file mode 100644 index 0000000000000..150e312f259b3 --- /dev/null +++ b/src/test/librbd/test_mock_fixture.h @@ -0,0 +1,64 @@ +// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_TEST_LIBRBD_TEST_MOCK_FIXTURE_H +#define CEPH_TEST_LIBRBD_TEST_MOCK_FIXTURE_H + +#include "test/librbd/test_fixture.h" +#include "common/WorkQueue.h" +#include +#include + +namespace librados { +class TestRadosClient; +class MockTestMemIoCtxImpl; +class MockTestMemRadosClient; +} +namespace librbd { +class MockImageCtx; +} + +ACTION_P2(CompleteContext, r, wq) { + ContextWQ *context_wq = reinterpret_cast(wq); + if (context_wq != NULL) { + context_wq->queue(arg0, r); + } else { + arg0->complete(r); + } +} + +ACTION_P(DispatchContext, wq) { + wq->queue(arg0, arg1); +} + +ACTION_P(GetReference, ref_object) { + ref_object->get(); +} + +MATCHER_P(ContentsEqual, bl, "") { + // TODO fix const-correctness of bufferlist + return const_cast(arg).contents_equal( + const_cast(bl)); +} + +class TestMockFixture : public TestFixture { +public: + 
typedef boost::shared_ptr TestRadosClientPtr; + + static void SetUpTestCase(); + static void TearDownTestCase(); + + virtual void SetUp(); + virtual void TearDown(); + + librados::MockTestMemIoCtxImpl &get_mock_io_ctx(librados::IoCtx &ioctx); + + void expect_op_work_queue(librbd::MockImageCtx &mock_image_ctx); + void expect_unlock_exclusive_lock(librbd::ImageCtx &ictx); + +private: + static TestRadosClientPtr s_test_rados_client; + static ::testing::NiceMock *s_mock_rados_client; +}; + +#endif // CEPH_TEST_LIBRBD_TEST_MOCK_FIXTURE_H From 7d781f7a09479c5f049cb665968f683c77e5d890 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 4 Sep 2015 15:59:34 -0400 Subject: [PATCH 509/654] doc: 'ceph --admin-daemon ...' -> 'ceph daemon ...' Signed-off-by: Sage Weil --- doc/dev/mon-bootstrap.rst | 4 ++-- doc/dev/perf_counters.rst | 4 ++-- doc/man/8/ceph.rst | 4 ++-- doc/rados/operations/monitoring.rst | 10 ++++++++-- doc/rados/troubleshooting/log-and-debug.rst | 11 +++++++---- doc/rados/troubleshooting/troubleshooting-mon.rst | 4 ++-- doc/rados/troubleshooting/troubleshooting-osd.rst | 11 ++++++++--- doc/radosgw/troubleshooting.rst | 6 +++--- 8 files changed, 34 insertions(+), 20 deletions(-) diff --git a/doc/dev/mon-bootstrap.rst b/doc/dev/mon-bootstrap.rst index 3c0a3ddc96c17..86f06dcaada37 100644 --- a/doc/dev/mon-bootstrap.rst +++ b/doc/dev/mon-bootstrap.rst @@ -146,7 +146,7 @@ their own address). For example:: When these daemons are started, they will know their own address, but not their peers. They can learn those addresses via the admin socket:: - ceph --admin-daemon /var/run/ceph/mon..asok add_bootstrap_peer_hint + ceph mon. add_bootstrap_peer_hint Once they learn enough of their peers from the initial member set, they will be able to create the cluster. @@ -174,7 +174,7 @@ example:: Once the daemon starts, you can give it one or more peer addresses to join with:: - ceph --admin-daemon /var/run/ceph/mon..asok add_bootstrap_peer_hint + ceph daemon mon. 
add_bootstrap_peer_hint This monitor will never participate in cluster creation; it can only join an existing cluster. diff --git a/doc/dev/perf_counters.rst b/doc/dev/perf_counters.rst index 6e004391457e8..91589f47a9dee 100644 --- a/doc/dev/perf_counters.rst +++ b/doc/dev/perf_counters.rst @@ -11,8 +11,8 @@ Access The perf counter data is accessed via the admin socket. For example:: - ceph --admin-daemon /var/run/ceph/ceph-osd.0.asok perf schema - ceph --admin-daemon /var/run/ceph/ceph-osd.0.asok perf dump + ceph daemon osd.0 perf schema + ceph daemon osd.0 perf dump Collections diff --git a/doc/man/8/ceph.rst b/doc/man/8/ceph.rst index 38d782d9f6cf3..37bb897262631 100644 --- a/doc/man/8/ceph.rst +++ b/doc/man/8/ceph.rst @@ -1365,9 +1365,9 @@ Options Name of the Ceph cluster. -.. option:: --admin-daemon ADMIN_SOCKET +.. option:: daemon ADMIN_SOCKET, daemon DAEMON_NAME, --admin-socket ADMIN_SOCKET, --admin-socket DAEMON_NAME - Submit admin-socket commands. + Submit admin-socket commands via admin sockets in /var/run/ceph. .. option:: --admin-socket ADMIN_SOCKET_NOPE diff --git a/doc/rados/operations/monitoring.rst b/doc/rados/operations/monitoring.rst index c269e99bf89db..1ea3231ba544a 100644 --- a/doc/rados/operations/monitoring.rst +++ b/doc/rados/operations/monitoring.rst @@ -267,11 +267,17 @@ By default, Ceph sockets reside under ``/var/run/ceph``. To access a daemon via the admin socket, login to the host running the daemon and use the following command:: - ceph --admin-daemon /var/run/ceph/{socket-name} + ceph daemon {daemon-name} + ceph daemon {path-to-socket-file} + +For example, the following are equivalent:: + + ceph daemon osd.0 foo + ceph daemon /var/run/ceph/ceph-osd.0.asok foo To view the available admin socket commands, execute the following command:: - ceph --admin-daemon /var/run/ceph/{socket-name} help + ceph daemon {daemon-name} help The admin socket command enables you to show and set your configuration at runtime. 
See `Viewing a Configuration at Runtime`_ for details. diff --git a/doc/rados/troubleshooting/log-and-debug.rst b/doc/rados/troubleshooting/log-and-debug.rst index 1b9317a3c7b77..d618a880d95f3 100644 --- a/doc/rados/troubleshooting/log-and-debug.rst +++ b/doc/rados/troubleshooting/log-and-debug.rst @@ -34,8 +34,11 @@ Runtime If you would like to see the configuration settings at runtime, you must log in to a host with a running daemon and execute the following:: - ceph --admin-daemon {/path/to/admin/socket} config show | less - ceph --admin-daemon /var/run/ceph/ceph-osd.0.asok config show | less + ceph daemon {daemon-name} config show | less + +For example,:: + + ceph daemon osd.0 config show | less To activate Ceph's debugging output (*i.e.*, ``dout()``) at runtime, use the ``ceph tell`` command to inject arguments into the runtime configuration:: @@ -51,10 +54,10 @@ debug logging for a ``ceph-osd`` daemon named ``osd.0``, execute the following:: The ``ceph tell`` command goes through the monitors. If you cannot bind to the monitor, you can still make the change by logging into the host of the daemon -whose configuration you'd like to change using ``ceph --admin-daemon``. +whose configuration you'd like to change using ``ceph daemon``. For example:: - sudo ceph --admin-daemon /var/run/ceph/ceph-osd.0.asok config set debug_osd 0/5 + sudo ceph daemon osd.0 config set debug_osd 0/5 See `Subsystem, Log and Debug Settings`_ for details on available settings. diff --git a/doc/rados/troubleshooting/troubleshooting-mon.rst b/doc/rados/troubleshooting/troubleshooting-mon.rst index dff673b39bddd..4dcd5a429b734 100644 --- a/doc/rados/troubleshooting/troubleshooting-mon.rst +++ b/doc/rados/troubleshooting/troubleshooting-mon.rst @@ -82,12 +82,12 @@ admin socket, with ``ceph`` likely returning ``Error 111: Connection Refused``. Accessing the admin socket is as simple as telling the ``ceph`` tool to use the ``asok`` file. 
In pre-Dumpling Ceph, this can be achieved by:: - ceph --admin-daemon /var/run/ceph/ceph-mon.ID.asok + ceph --admin-daemon /var/run/ceph/ceph-mon..asok while in Dumpling and beyond you can use the alternate (and recommended) format:: - ceph daemon mon.ID + ceph daemon mon. Using ``help`` as the command to the ``ceph`` tool will show you the supported commands available through the admin socket. Please take a look diff --git a/doc/rados/troubleshooting/troubleshooting-osd.rst b/doc/rados/troubleshooting/troubleshooting-osd.rst index 1cbdaeec8e29a..e1fb30e8320e3 100644 --- a/doc/rados/troubleshooting/troubleshooting-osd.rst +++ b/doc/rados/troubleshooting/troubleshooting-osd.rst @@ -42,10 +42,15 @@ the sockets for your Ceph processes:: ls /var/run/ceph -Then, execute the following, replacing ``{socket-name}`` with an actual -socket name to show the list of available options:: +Then, execute the following, replacing ``{daemon-name}`` with an actual +daemon (e.g., ``osd.0``):: + + ceph daemon osd.0 help + +Alternatively, you can specify a ``{socket-file}`` (e.g., something in ``/var/run/ceph``):: + + ceph daemon {socket-file} help - ceph --admin-daemon /var/run/ceph/{socket-name} help The admin socket, among other things, allows you to: diff --git a/doc/radosgw/troubleshooting.rst b/doc/radosgw/troubleshooting.rst index 84f17607b75b9..3e4a057776889 100644 --- a/doc/radosgw/troubleshooting.rst +++ b/doc/radosgw/troubleshooting.rst @@ -56,7 +56,7 @@ some insight into the internal state of the ``radosgw`` daemon via its admin socket. 
By default, there will be a socket configured to reside in ``/var/run/ceph``, and the daemon can be queried with:: - ceph --admin-daemon /var/run/ceph/client.rgw help + ceph daemon /var/run/ceph/client.rgw help help list available commands objecter_requests show in-progress osd requests @@ -66,7 +66,7 @@ reside in ``/var/run/ceph``, and the daemon can be queried with:: Of particular interest:: - ceph --admin-daemon /var/run/ceph/client.rgw objecter_requests + ceph daemon /var/run/ceph/client.rgw objecter_requests ... will dump information about current in-progress requests with the @@ -114,7 +114,7 @@ check the OSD status with:: This tells us to look at ``osd.1``, the primary copy for this PG:: - ceph --admin-daemon /var/run/ceph/osd.1.asok + ceph daemon osd.1 ops { "num_ops": 651, "ops": [ { "description": "osd_op(client.4124.0:1858 fatty_25647_object1857 [write 0~4096] 2.d2041a48)", From 351d957992d16286900f6c6ca47bb954f7d0411b Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Sun, 6 Sep 2015 19:28:31 +0800 Subject: [PATCH 510/654] doc: fix the typo in command example always indent using tab, the rendered html looks good, but it helps with editor to highlight the codeblock properly. Signed-off-by: Kefu Chai --- doc/dev/mon-bootstrap.rst | 2 +- doc/rados/operations/monitoring.rst | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/dev/mon-bootstrap.rst b/doc/dev/mon-bootstrap.rst index 86f06dcaada37..75c8a5e764970 100644 --- a/doc/dev/mon-bootstrap.rst +++ b/doc/dev/mon-bootstrap.rst @@ -146,7 +146,7 @@ their own address). For example:: When these daemons are started, they will know their own address, but not their peers. They can learn those addresses via the admin socket:: - ceph mon. add_bootstrap_peer_hint + ceph daemon mon. add_bootstrap_peer_hint Once they learn enough of their peers from the initial member set, they will be able to create the cluster. 
diff --git a/doc/rados/operations/monitoring.rst b/doc/rados/operations/monitoring.rst index 1ea3231ba544a..cc3424bc61ca6 100644 --- a/doc/rados/operations/monitoring.rst +++ b/doc/rados/operations/monitoring.rst @@ -272,8 +272,8 @@ following command:: For example, the following are equivalent:: - ceph daemon osd.0 foo - ceph daemon /var/run/ceph/ceph-osd.0.asok foo + ceph daemon osd.0 foo + ceph daemon /var/run/ceph/ceph-osd.0.asok foo To view the available admin socket commands, execute the following command:: From bfe359af0b80f44ca04847f74d5a2d81097ce4e6 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 6 Sep 2015 13:56:38 -0400 Subject: [PATCH 511/654] osd: dump full map bl at 20 when crc doesn't match This will help us debug cases where the encoding doesn't match due to a bug. Signed-off-by: Sage Weil --- src/osd/OSD.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index a98db3401d046..a577fd802c82a 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -6248,6 +6248,9 @@ void OSD::handle_osd_map(MOSDMap *m) << " but failed to encode full with correct crc; requesting" << dendl; clog->warn() << "failed to encode map e" << e << " with expected crc\n"; + dout(20) << "my encoded map was:\n"; + fbl.hexdump(*_dout); + *_dout << dendl; delete o; MMonGetOSDMap *req = new MMonGetOSDMap; req->request_full(e, last); From 82b0243ec318457e376288a6b32487fbea059705 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 5 Sep 2015 09:28:17 -0400 Subject: [PATCH 512/654] qa/workunits/post-file.sh: sudo Only root can read the key in /usr/share/... 
Signed-off-by: Sage Weil --- qa/workunits/post-file.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/workunits/post-file.sh b/qa/workunits/post-file.sh index 02a4ca292beaa..133e66867c0e8 100755 --- a/qa/workunits/post-file.sh +++ b/qa/workunits/post-file.sh @@ -2,6 +2,6 @@ what="$1" [ -z "$what" ] && what=/etc/udev/rules.d -ceph-post-file -d ceph-test-workunit $what +sudo ceph-post-file -d ceph-test-workunit $what echo OK From 3aefd91a5521d201e0cc63fa1f45902f9e47a845 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Mon, 7 Sep 2015 00:40:29 +0200 Subject: [PATCH 513/654] erasure-code: fix gf-complete warnings Update to the latest gf-complete v2 branch which has the compilation warning fixed. In addition the default compilation flag upstream has been set to display such warnings. git log --oneline --graph d384952c68a64d93ac7af6341d5519ea5d2958b9..9caeefbf2860e56a75502f4d3342deed5b5ba265 * 9caeefb add -Wsign-compare and address the warnings * 5979f98 gf_w64.c: fix integer overflow * 93a9845 gf_w64.c: fix integer overflow * 513c87b gf_w64.c: fix integer overflow * 4d6fa89 gf_w64.c: fix integer overflow http://tracker.ceph.com/issues/12731 Fixes: #12731 Signed-off-by: Loic Dachary --- src/erasure-code/jerasure/gf-complete | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/erasure-code/jerasure/gf-complete b/src/erasure-code/jerasure/gf-complete index d384952c68a64..9caeefbf2860e 160000 --- a/src/erasure-code/jerasure/gf-complete +++ b/src/erasure-code/jerasure/gf-complete @@ -1 +1 @@ -Subproject commit d384952c68a64d93ac7af6341d5519ea5d2958b9 +Subproject commit 9caeefbf2860e56a75502f4d3342deed5b5ba265 From 438b4e43cbbe8fa63764fef4563ed1b44ed36c4c Mon Sep 17 00:00:00 2001 From: Ruifeng Yang Date: Mon, 7 Sep 2015 09:05:38 +0800 Subject: [PATCH 514/654] msg: we should set the socket options before connect or listen in order to have it take effect. 
Signed-off-by: Ruifeng Yang <149233652@qq.com> --- src/msg/async/AsyncConnection.cc | 1 - src/msg/async/AsyncMessenger.cc | 3 +++ src/msg/async/net_handler.cc | 5 +++-- src/msg/simple/Accepter.cc | 10 ++++++++++ src/msg/simple/Pipe.cc | 5 +++-- 5 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc index 36a19f80cccce..37258b337a8ca 100644 --- a/src/msg/async/AsyncConnection.cc +++ b/src/msg/async/AsyncConnection.cc @@ -975,7 +975,6 @@ int AsyncConnection::_process_connection() if (r < 0) { goto fail; } - net.set_socket_options(sd); center->create_file_event(sd, EVENT_READABLE, read_handler); state = STATE_CONNECTING_WAIT_BANNER; diff --git a/src/msg/async/AsyncMessenger.cc b/src/msg/async/AsyncMessenger.cc index c4be6a7746bf3..e5e393a4fb472 100644 --- a/src/msg/async/AsyncMessenger.cc +++ b/src/msg/async/AsyncMessenger.cc @@ -97,6 +97,9 @@ int Processor::bind(const entity_addr_t &bind_addr, const set& avoid_ports) listen_sd = -1; return -errno; } + + net.set_socket_options(listen_sd); + // use whatever user specified (if anything) entity_addr_t listen_addr = bind_addr; listen_addr.set_family(family); diff --git a/src/msg/async/net_handler.cc b/src/msg/async/net_handler.cc index 8e6468cfde362..2639fdc3b2b9f 100644 --- a/src/msg/async/net_handler.cc +++ b/src/msg/async/net_handler.cc @@ -116,6 +116,9 @@ int NetHandler::generic_connect(const entity_addr_t& addr, bool nonblock) return ret; } } + + set_socket_options(s); + ret = ::connect(s, (sockaddr*)&addr.addr, addr.addr_size()); if (ret < 0) { if (errno == EINPROGRESS && nonblock) @@ -126,8 +129,6 @@ int NetHandler::generic_connect(const entity_addr_t& addr, bool nonblock) return -errno; } - set_socket_options(s); - return s; } diff --git a/src/msg/simple/Accepter.cc b/src/msg/simple/Accepter.cc index 7d989a93691e1..3333693b706dc 100644 --- a/src/msg/simple/Accepter.cc +++ b/src/msg/simple/Accepter.cc @@ -140,6 +140,16 @@ int 
Accepter::bind(const entity_addr_t &bind_addr, const set& avoid_ports) return rc; } + if (msgr->cct->_conf->ms_tcp_rcvbuf) { + int size = msgr->cct->_conf->ms_tcp_rcvbuf; + rc = ::setsockopt(listen_sd, SOL_SOCKET, SO_RCVBUF, (void*)&size, sizeof(size)); + if (rc < 0) { + rc = -errno; + lderr(msgr->cct) << "accepter.bind failed to set SO_RCVBUF to " << size << ": " << cpp_strerror(r) << dendl; + return rc; + } + } + ldout(msgr->cct,10) << "accepter.bind bound to " << listen_addr << dendl; // listen! diff --git a/src/msg/simple/Pipe.cc b/src/msg/simple/Pipe.cc index a9b3b54e870e1..7d6419c3ee27d 100644 --- a/src/msg/simple/Pipe.cc +++ b/src/msg/simple/Pipe.cc @@ -913,6 +913,9 @@ int Pipe::connect() } recv_reset(); + + set_socket_options(); + // connect! ldout(msgr->cct,10) << "connecting to " << peer_addr << dendl; rc = ::connect(sd, (sockaddr*)&peer_addr.addr, peer_addr.addr_size()); @@ -922,8 +925,6 @@ int Pipe::connect() goto fail; } - set_socket_options(); - // verify banner // FIXME: this should be non-blocking, or in some other way verify the banner as we get it. 
if (tcp_read((char*)&banner, strlen(CEPH_BANNER)) < 0) { From 68d47f25655273d30151e7c9679b98f32414bfce Mon Sep 17 00:00:00 2001 From: xinxin shu Date: Mon, 17 Aug 2015 13:14:08 +0800 Subject: [PATCH 515/654] fix metadata loading error if we open an image Signed-off-by: xinxin shu --- src/librbd/ImageCtx.cc | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc index 3e860322c991a..91dddca96febb 100644 --- a/src/librbd/ImageCtx.cc +++ b/src/librbd/ImageCtx.cc @@ -819,13 +819,10 @@ class ThreadPoolSingleton : public ThreadPool { return false; string key = it->first.substr(conf_prefix_len, it->first.size() - conf_prefix_len); - for (map::iterator cit = configs.begin(); - cit != configs.end(); ++cit) { - if (!key.compare(cit->first)) { - cit->second = true; - res->insert(make_pair(key, it->second)); - break; - } + map::iterator cit = configs.find(key); + if ( cit != configs.end()) { + cit->second = true; + res->insert(make_pair(key, it->second)); } } return true; @@ -882,20 +879,22 @@ class ThreadPoolSingleton : public ThreadPool { pairs, &res); for (map::iterator it = res.begin(); it != res.end(); ++it) { - j = local_config_t.set_val(it->first.c_str(), it->second.c_str()); + string val(it->second.c_str(), it->second.length()); + j = local_config_t.set_val(it->first.c_str(), val); if (j < 0) { lderr(cct) << __func__ << " failed to set config " << it->first << " with value " << it->second.c_str() << ": " << j << dendl; } - break; } start = pairs.rbegin()->first; } #define ASSIGN_OPTION(config) \ do { \ - if (configs[#config]) \ + string key = "rbd_"; \ + key = key + #config; \ + if (configs[key]) \ config = local_config_t.rbd_##config; \ else \ config = cct->_conf->rbd_##config; \ From e6fbe539b00b5b59ddbc75b4bf4e903b342e097e Mon Sep 17 00:00:00 2001 From: xinxin shu Date: Mon, 17 Aug 2015 18:15:04 +0800 Subject: [PATCH 516/654] improve error handle of rbd metadata operation & format output 
Signed-off-by: xinxin shu --- src/rbd.cc | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/src/rbd.cc b/src/rbd.cc index f8330ff45f0c4..39acdbe197974 100755 --- a/src/rbd.cc +++ b/src/rbd.cc @@ -2255,8 +2255,10 @@ static int do_metadata_list(librbd::Image& image, Formatter *f) TextTable tbl; r = image.metadata_list("", 0, &pairs); - if (r < 0) + if (r < 0) { + cerr << "failed to list metadata of image : " << cpp_strerror(r) << std::endl; return r; + } if (f) { f->open_object_section("metadatas"); @@ -2275,10 +2277,11 @@ static int do_metadata_list(librbd::Image& image, Formatter *f) for (map::iterator it = pairs.begin(); it != pairs.end(); ++it) { + string val(it->second.c_str(), it->second.length()); if (f) { - f->dump_string(it->first.c_str(), it->second.c_str()); + f->dump_string(it->first.c_str(), val.c_str()); } else { - tbl << it->first << it->second.c_str() << TextTable::endrow; + tbl << it->first << val.c_str() << TextTable::endrow; } } if (!f) @@ -2295,22 +2298,31 @@ static int do_metadata_list(librbd::Image& image, Formatter *f) static int do_metadata_set(librbd::Image& image, const char *key, const char *value) { - return image.metadata_set(key, value); + int r = image.metadata_set(key, value); + if (r < 0) { + cerr << "failed to set metadata " << key << " of image : " << cpp_strerror(r) << std::endl; + } + return r; } static int do_metadata_remove(librbd::Image& image, const char *key) { - return image.metadata_remove(key); + int r = image.metadata_remove(key); + if (r < 0) { + cerr << "failed to remove metadata " << key << " of image : " << cpp_strerror(r) << std::endl; + } } static int do_metadata_get(librbd::Image& image, const char *key) { string s; int r = image.metadata_get(key, &s); - if (r < 0) + if (r < 0) { + cerr << "failed to get metadata " << key << " of image : " << cpp_strerror(r) << std::endl; return r; - cout << s; - return 0; + } + cout << s << std::endl; + return r; } static int 
do_copy(librbd::Image &src, librados::IoCtx& dest_pp, From d85383941f25321322eb96b6c46aa6c0cbb94b30 Mon Sep 17 00:00:00 2001 From: xinxin shu Date: Mon, 17 Aug 2015 18:55:35 +0800 Subject: [PATCH 517/654] refine tests for metadata ops Signed-off-by: xinxin shu --- src/test/librbd/test_internal.cc | 55 ++++++++++++++++---------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/src/test/librbd/test_internal.cc b/src/test/librbd/test_internal.cc index 0d909497953f5..36a58f5f35513 100644 --- a/src/test/librbd/test_internal.cc +++ b/src/test/librbd/test_internal.cc @@ -377,39 +377,38 @@ TEST_F(TestInternal, MetadatConfig) { "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", false)( "cccccccccccccc", false); map::iterator it = test_confs.begin(); - const string prefix = "test_config_"; - bool is_continue; + int r; librbd::ImageCtx *ictx; ASSERT_EQ(0, open_image(m_image_name, &ictx)); - librbd::Image image1; - map pairs, res; - pairs[prefix+it->first].append("value1"); + r = librbd::metadata_set(ictx, it->first, "value1"); + ASSERT_EQ(0, r); ++it; - pairs[prefix+it->first].append("value2"); + r = librbd::metadata_set(ictx, it->first, "value2"); + ASSERT_EQ(0, r); ++it; - pairs[prefix+it->first].append("value3"); - pairs[prefix+"asdfsdaf"].append("value6"); - pairs[prefix+"zxvzxcv123"].append("value5"); - - is_continue = ictx->_filter_metadata_confs(prefix, test_confs, pairs, &res); - ASSERT_TRUE(is_continue); - ASSERT_TRUE(res.size() == 3U); - it = test_confs.begin(); - ASSERT_TRUE(res.count(it->first)); - ASSERT_TRUE(it->second); - ++it; - ASSERT_TRUE(res.count(it->first)); - ASSERT_TRUE(it->second); - ++it; - ASSERT_TRUE(res.count(it->first)); - ASSERT_TRUE(it->second); - res.clear(); - - pairs["zzzzzzzz"].append("value7"); - is_continue = ictx->_filter_metadata_confs(prefix, test_confs, pairs, &res); - ASSERT_FALSE(is_continue); - ASSERT_TRUE(res.size() == 3U); + r = librbd::metadata_set(ictx, it->first, "value3"); + ASSERT_EQ(0, r); + r = 
librbd::metadata_set(ictx, "abcd", "value4"); + ASSERT_EQ(0, r); + r = librbd::metadata_set(ictx, "xyz", "value5"); + ASSERT_EQ(0, r); + map pairs; + r = librbd::metadata_list(ictx, "", 0, &pairs); + ASSERT_EQ(0, r); + ASSERT_EQ(5, pairs.size()); + r = librbd::metadata_remove(ictx, "abcd"); + ASSERT_EQ(0, r); + r = librbd::metadata_remove(ictx, "xyz"); + ASSERT_EQ(0, r); + pairs.clear(); + r = librbd::metadata_list(ictx, "", 0, &pairs); + ASSERT_EQ(0, r); + ASSERT_EQ(3, pairs.size()); + string val; + r = librbd::metadata_get(ictx, it->first, &val); + ASSERT_EQ(0, r); + ASSERT_STREQ(val.c_str(), "value3"); } TEST_F(TestInternal, SnapshotCopyup) From 27cf257248ea55f8f4bc7851c3956611828bcae2 Mon Sep 17 00:00:00 2001 From: Sylvain Baubeau Date: Fri, 4 Sep 2015 22:51:44 +0200 Subject: [PATCH 518/654] rgw: add delimiter to prefix only when path is specified http://tracker.ceph.com/issues/12960 Fixes: #12960 Signed-off-by: Sylvain Baubeau --- src/rgw/rgw_rest_swift.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc index bdd5b9b1f2bf6..de60d6c98fea2 100644 --- a/src/rgw/rgw_rest_swift.cc +++ b/src/rgw/rgw_rest_swift.cc @@ -200,13 +200,14 @@ int RGWListBucket_ObjStore_SWIFT::get_params() path = prefix; if (path.size() && path[path.size() - 1] != '/') path.append("/"); - } - int len = prefix.size(); - int delim_size = delimiter.size(); - if (len >= delim_size) { - if (prefix.substr(len - delim_size).compare(delimiter) != 0) - prefix.append(delimiter); + int len = prefix.size(); + int delim_size = delimiter.size(); + + if (len >= delim_size) { + if (prefix.substr(len - delim_size).compare(delimiter) != 0) + prefix.append(delimiter); + } } return 0; From bfde30db98c5d41be19a093f67d00f5c995cb831 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Thu, 3 Sep 2015 01:23:45 +0200 Subject: [PATCH 519/654] tests: CentOS 7 needs systemd-container It is now needed by the Ceph dependencies and won't 
work unless the fake version of it is replaced by the actual one. Signed-off-by: Loic Dachary --- src/test/centos-7/Dockerfile.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/centos-7/Dockerfile.in b/src/test/centos-7/Dockerfile.in index dfde99ae51419..19fe1aa099f00 100644 --- a/src/test/centos-7/Dockerfile.in +++ b/src/test/centos-7/Dockerfile.in @@ -23,7 +23,7 @@ FROM centos:%%os_version%% COPY install-deps.sh /root/ COPY ceph.spec.in /root/ # http://jperrin.github.io/centos/2014/09/25/centos-docker-and-systemd/ -RUN yum -y swap -- remove fakesystemd -- install systemd systemd-libs && (cd /lib/systemd/system/sysinit.target.wants/; for i in *; do [ $i == systemd-tmpfiles-setup.service ] || rm -f $i; done) && rm -f /lib/systemd/system/multi-user.target.wants/* && rm -f /etc/systemd/system/*.wants/* && rm -f /lib/systemd/system/local-fs.target.wants/* && rm -f /lib/systemd/system/sockets.target.wants/*udev* && rm -f /lib/systemd/system/sockets.target.wants/*initctl* && rm -f /lib/systemd/system/basic.target.wants/* && rm -f /lib/systemd/system/anaconda.target.wants/* && yum install -y redhat-lsb-core +RUN yum -y swap -- remove fakesystemd systemd-libs systemd-container -- install systemd systemd-libs && (cd /lib/systemd/system/sysinit.target.wants/; for i in *; do [ $i == systemd-tmpfiles-setup.service ] || rm -f $i; done) && rm -f /lib/systemd/system/multi-user.target.wants/* && rm -f /etc/systemd/system/*.wants/* && rm -f /lib/systemd/system/local-fs.target.wants/* && rm -f /lib/systemd/system/sockets.target.wants/*udev* && rm -f /lib/systemd/system/sockets.target.wants/*initctl* && rm -f /lib/systemd/system/basic.target.wants/* && rm -f /lib/systemd/system/anaconda.target.wants/* && yum install -y redhat-lsb-core RUN yum install -y yum-utils && yum-config-manager --add-repo https://dl.fedoraproject.org/pub/epel/7/x86_64/ && yum install --nogpgcheck -y epel-release && rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7 && rm 
/etc/yum.repos.d/dl.fedoraproject.org* # build dependencies RUN cd /root ; ./install-deps.sh From e8089049c3ca861017b959b0bdb3ef389dbf9575 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Thu, 3 Sep 2015 01:37:03 +0200 Subject: [PATCH 520/654] tests: drop docker-tests.sh root and /dev support Now that ceph-disk.sh activation tests switched to the ceph-disk suite as found in https://github.com/ceph/ceph-qa-suite/tree/master/suites/ceph-disk, it does not need for root and /dev support from docker-tests.sh. There currently is no other use case and since it's generally not a good idea to run anything as root in a container anyway. Signed-off-by: Loic Dachary --- src/test/docker-test-helper.sh | 40 +++++----------------------------- 1 file changed, 6 insertions(+), 34 deletions(-) diff --git a/src/test/docker-test-helper.sh b/src/test/docker-test-helper.sh index 797e37c8df02b..f66911d70d1d1 100755 --- a/src/test/docker-test-helper.sh +++ b/src/test/docker-test-helper.sh @@ -107,10 +107,6 @@ function run_in_docker() { shift local ref=$1 shift - local dev=$1 - shift - local user=$1 - shift local opts="$1" shift local script=$1 @@ -123,24 +119,15 @@ function run_in_docker() { local ccache mkdir -p $HOME/.ccache ccache="--volume $HOME/.ccache:$HOME/.ccache" - if $dev ; then - dev="--volume /dev:/dev" - else - dev= - fi - if test $user != root ; then - user="--user $user" - else - user= - fi + user="--user $USER" local cmd="docker run $opts --rm --name $image --privileged $ccache" cmd+=" --volume $downstream:$downstream" cmd+=" --volume $upstream:$upstream" local status=0 if test "$script" = "SHELL" ; then - $cmd --tty --interactive --workdir $downstream $user $dev $image bash + $cmd --tty --interactive --workdir $downstream $user $image bash else - if ! $cmd --workdir $downstream $user $dev $image "$@" ; then + if ! $cmd --workdir $downstream $user $image "$@" ; then status=1 fi fi @@ -175,8 +162,6 @@ $0 [options] command args ... 
[--shell] run an interactive shell in the container [--remove-all] remove the container and the image for the specified types+versions - [--dev] run the container with --volume /dev:/dev - [--user name] execute the command as user 'name' (defaults to $USER) [--opts options] run the contain with 'options' docker-test.sh must be run from a Ceph clone and it will run the @@ -244,9 +229,6 @@ docker-test.sh --os-type centos --os-version 7 -- make check Run make check on a giant docker-test.sh --ref giant -- make check -Run a test as root with access to the host /dev for losetup to work -docker-test.sh --user root --dev -- make TESTS=test/ceph-disk-root.sh check - Run an interactive shell and set resolv.conf to use 172.17.42.1 docker-test.sh --opts --dns=172.17.42.1 --shell @@ -262,7 +244,7 @@ function main_docker() { fi local temp - temp=$(getopt -o scdht:v:u:o:a:r: --long remove-all,verbose,shell,dev,help,os-type:,os-version:,user:,opts:,all:,ref: -n $0 -- "$@") || return 1 + temp=$(getopt -o scht:v:o:a:r: --long remove-all,verbose,shell,help,os-type:,os-version:,opts:,all:,ref: -n $0 -- "$@") || return 1 eval set -- "$temp" @@ -271,8 +253,6 @@ function main_docker() { local all local remove=false local shell=false - local dev=false - local user=$USER local opts local ref=$(git rev-parse HEAD) @@ -291,10 +271,6 @@ function main_docker() { shell=true shift ;; - -d|--dev) - dev=true - shift - ;; -h|--help) usage return 0 @@ -307,10 +283,6 @@ function main_docker() { os_version=$2 shift 2 ;; - -u|--user) - user="$2" - shift 2 - ;; -o|--opts) opts="$2" shift 2 @@ -346,9 +318,9 @@ function main_docker() { if $remove ; then remove_all $os_type $os_version || return 1 elif $shell ; then - run_in_docker $os_type $os_version $ref $dev $user "$opts" SHELL || return 1 + run_in_docker $os_type $os_version $ref "$opts" SHELL || return 1 else - run_in_docker $os_type $os_version $ref $dev $user "$opts" "$@" || return 1 + run_in_docker $os_type $os_version $ref "$opts" "$@" || return 
1 fi done done From 70d31082fd3dc8c7857994104577f1a3631c678c Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Mon, 7 Sep 2015 14:12:19 +0100 Subject: [PATCH 521/654] mon: MonitorDBStore: make get_next_key() work properly We introduced a significant bug with 2cc7aee, when we fixed issue #11786. Although that patch would fix the problem described in #11786, we managed to not increment the iterator upon returning the current key. This would have the iterator iterating over the same key, forever and ever. Signed-off-by: Joao Eduardo Luis --- src/mon/MonitorDBStore.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h index b9aab3094ef5b..1a6f419e515a3 100644 --- a/src/mon/MonitorDBStore.h +++ b/src/mon/MonitorDBStore.h @@ -461,8 +461,10 @@ class MonitorDBStore for (; iter->valid(); iter->next()) { pair r = iter->raw_key(); - if (sync_prefixes.count(r.first) > 0) + if (sync_prefixes.count(r.first) > 0) { + iter->next(); return r; + } } return pair(); } From 8c2dfadbb95dc1a6c66871d49bdcbba0bb6d4a00 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Thu, 3 Sep 2015 13:36:06 +0800 Subject: [PATCH 522/654] osd: force promote for ops which ec base pool can't handle For ops which the ec base pool can't handle, if they are proxied to the base ec pool, ENOTSUPP is returned. Need to force promote the objects into the cache pool. 
Fixes: #12903 Signed-off-by: Zhiqiang Wang --- src/osd/OSD.cc | 51 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index a98db3401d046..e22960bc37649 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -8647,6 +8647,40 @@ int OSD::init_op_flags(OpRequestRef& op) if (ceph_osd_op_mode_cache(iter->op.op)) op->set_cache(); + // check for ec base pool + int64_t poolid = m->get_pg().pool(); + const pg_pool_t *pool = osdmap->get_pg_pool(poolid); + if (pool && pool->is_tier()) { + const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of); + if (base_pool && base_pool->require_rollback()) { + if ((iter->op.op != CEPH_OSD_OP_READ) && + (iter->op.op != CEPH_OSD_OP_STAT) && + (iter->op.op != CEPH_OSD_OP_ISDIRTY) && + (iter->op.op != CEPH_OSD_OP_UNDIRTY) && + (iter->op.op != CEPH_OSD_OP_GETXATTR) && + (iter->op.op != CEPH_OSD_OP_GETXATTRS) && + (iter->op.op != CEPH_OSD_OP_CMPXATTR) && + (iter->op.op != CEPH_OSD_OP_SRC_CMPXATTR) && + (iter->op.op != CEPH_OSD_OP_ASSERT_VER) && + (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) && + (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) && + (iter->op.op != CEPH_OSD_OP_ASSERT_SRC_VERSION) && + (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) && + (iter->op.op != CEPH_OSD_OP_WRITEFULL) && + (iter->op.op != CEPH_OSD_OP_ROLLBACK) && + (iter->op.op != CEPH_OSD_OP_CREATE) && + (iter->op.op != CEPH_OSD_OP_DELETE) && + (iter->op.op != CEPH_OSD_OP_SETXATTR) && + (iter->op.op != CEPH_OSD_OP_RMXATTR) && + (iter->op.op != CEPH_OSD_OP_STARTSYNC) && + (iter->op.op != CEPH_OSD_OP_COPY_GET_CLASSIC) && + (iter->op.op != CEPH_OSD_OP_COPY_GET) && + (iter->op.op != CEPH_OSD_OP_COPY_FROM)) { + op->set_promote(); + } + } + } + switch (iter->op.op) { case CEPH_OSD_OP_CALL: { @@ -8722,23 +8756,6 @@ int OSD::init_op_flags(OpRequestRef& op) } break; - case CEPH_OSD_OP_WRITE: - case CEPH_OSD_OP_ZERO: - case CEPH_OSD_OP_TRUNCATE: - // always force promotion for object overwrites on 
a ec base pool - { - int64_t poolid = m->get_pg().pool(); - const pg_pool_t *pool = osdmap->get_pg_pool(poolid); - if (pool->is_tier()) { - const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of); - assert(base_pool); - if (base_pool->is_erasure()) { - op->set_promote(); - } - } - } - break; - case CEPH_OSD_OP_READ: case CEPH_OSD_OP_SYNC_READ: case CEPH_OSD_OP_SPARSE_READ: From 02f4461cada9f0f23c958aa7fb79081340d5bb79 Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Tue, 8 Sep 2015 09:48:41 +0100 Subject: [PATCH 523/654] test: mon: mon-scrub.sh: test 'mon scrub' In its current state is used to catch regressions on 'ceph mon scrub' hanging due to an infinite loop. Signed-off-by: Joao Eduardo Luis --- src/test/mon/mon-scrub.sh | 49 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100755 src/test/mon/mon-scrub.sh diff --git a/src/test/mon/mon-scrub.sh b/src/test/mon/mon-scrub.sh new file mode 100755 index 0000000000000..289e03b72d545 --- /dev/null +++ b/src/test/mon/mon-scrub.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# +# Copyright (C) 2014 Cloudwatt +# Copyright (C) 2014, 2015 Red Hat +# +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. 
+# +source ../qa/workunits/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7104" + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + +function TEST_mon_scrub() { + local dir=$1 + + run_mon $dir a || return 1 + + ./ceph mon scrub || return 1 +} + +main mon-scrub "$@" + +# Local Variables: +# compile-command: "cd ../.. ; make -j4 && test/mon/mon-scrub.sh" +# End: From e48cec3dc93b3988dcd8924933deb1b3a43e1d0f Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Tue, 8 Sep 2015 16:52:32 +0800 Subject: [PATCH 524/654] mon: disable gmt_hitset if not supported the gmt_hitset is enabled by default in the ctor of pg_pool_t, this is intentional. because we want to remove this setting and make gmt_hitset=true as a default in future. but this forces us to disable it explicitly when preparing a new pool if any OSD does not support gmt hitset. 
Fixes: #12968 Signed-off-by: Kefu Chai --- src/mon/OSDMonitor.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 06dd21808c1eb..49fbe79353e6a 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -4536,6 +4536,8 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, if (g_conf->osd_pool_use_gmt_hitset && (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)) pi->use_gmt_hitset = true; + else + pi->use_gmt_hitset = false; if (pool_type == pg_pool_t::TYPE_ERASURE) { switch (fast_read) { From ab4232baa7bcc86e90746e13312ac9bda1772872 Mon Sep 17 00:00:00 2001 From: Xiaowei Chen Date: Tue, 8 Sep 2015 06:58:57 -0400 Subject: [PATCH 525/654] rgw: init_rados failed leads to repeated delete Fixes: #12978 Signed-off-by: Xiaowei Chen --- src/rgw/rgw_rados.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index c23a2db3ee7f0..70e806b08aec4 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -1508,10 +1508,13 @@ int RGWRados::init_rados() for (uint32_t i=0; i < num_rados_handles; i++) { if (rados[i]) { delete rados[i]; + rados[i] = NULL; } } + num_rados_handles = 0; if (rados) { delete[] rados; + rados = NULL; } return ret; From d32a3be1a6b64b032aad1067e08a496a85fe05ef Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Tue, 8 Sep 2015 18:20:04 +0800 Subject: [PATCH 526/654] qa/workunits/rados/test_alloc_hint.sh: sudo to ls files The osd data dir is owned by ceph and not readable by other non-root users. 
Fixes: #12861 Signed-off-by: Kefu Chai --- qa/workunits/rados/test_alloc_hint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/workunits/rados/test_alloc_hint.sh b/qa/workunits/rados/test_alloc_hint.sh index c43fc3c00bde5..b3e185adefd6e 100755 --- a/qa/workunits/rados/test_alloc_hint.sh +++ b/qa/workunits/rados/test_alloc_hint.sh @@ -61,7 +61,7 @@ function expect_alloc_hint_eq() { # e.g., .../25.6_head/foo__head_7FC1F406__19 # .../26.bs1_head/bar__head_EFE6384B__1a_ffffffffffffffff_1 - local fns=$(sudo find ${OSD_DATA[i]}/current/${PGID}*_head -type f | grep head/${OBJ}_) + local fns=$(sudo sh -c "ls ${OSD_DATA[i]}/current/${PGID}*_head/${OBJ}_*") local count="${#fns[@]}" if [ "${count}" -ne 1 ]; then echo "bad fns count: ${count}" >&2 From d5650c9cf85188efa73b279c8f4e4723fa475308 Mon Sep 17 00:00:00 2001 From: Jason Dillaman Date: Tue, 8 Sep 2015 15:45:45 -0400 Subject: [PATCH 527/654] tests: new test case for librbd diff_iterate over discard extents Signed-off-by: Jason Dillaman --- src/test/librbd/test_librbd.cc | 48 ++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/src/test/librbd/test_librbd.cc b/src/test/librbd/test_librbd.cc index fb015f65338c1..99d272a0d3d34 100644 --- a/src/test/librbd/test_librbd.cc +++ b/src/test/librbd/test_librbd.cc @@ -2438,6 +2438,54 @@ TYPED_TEST(DiffIterateTest, DiffIterateCallbackError) } ioctx.close(); } + +TYPED_TEST(DiffIterateTest, DiffIterateParentDiscard) +{ + REQUIRE_FEATURE(RBD_FEATURE_LAYERING); + + librados::IoCtx ioctx; + ASSERT_EQ(0, this->_rados.ioctx_create(this->m_pool_name.c_str(), ioctx)); + + librbd::RBD rbd; + librbd::Image image; + std::string name = this->get_temp_image_name(); + uint64_t size = 20 << 20; + int order = 0; + + ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order)); + ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL)); + + uint64_t object_size = 0; + if (this->whole_object) { + object_size = 1 << order; + } + + 
interval_set exists; + interval_set one; + scribble(image, 10, 102400, &exists, &one); + ASSERT_EQ(0, image.snap_create("one")); + + ASSERT_EQ(1 << order, image.discard(0, 1 << order)); + ASSERT_EQ(0, image.snap_create("two")); + ASSERT_EQ(0, image.snap_protect("two")); + exists.clear(); + one.clear(); + + std::string clone_name = this->get_temp_image_name(); + ASSERT_EQ(0, rbd.clone(ioctx, name.c_str(), "two", ioctx, + clone_name.c_str(), RBD_FEATURE_LAYERING, &order)); + ASSERT_EQ(0, rbd.open(ioctx, image, clone_name.c_str(), NULL)); + + interval_set two; + scribble(image, 10, 102400, &exists, &two); + two = round_diff_interval(two, object_size); + + interval_set diff; + ASSERT_EQ(0, image.diff_iterate2(NULL, 0, size, true, this->whole_object, + iterate_cb, (void *)&diff)); + ASSERT_TRUE(two.subset_of(diff)); +} + TEST_F(TestLibRBD, ZeroLengthWrite) { rados_ioctx_t ioctx; From 3ccc3bb4bd35e57209852d460633e371b4d004e2 Mon Sep 17 00:00:00 2001 From: Jason Dillaman Date: Tue, 8 Sep 2015 15:47:37 -0400 Subject: [PATCH 528/654] librbd: diff_iterate needs to handle holes in parent images If a clone's parent image snapshot includes a discarded extent, this was previously causing an assert failure. Instead, ignore any discard holes in the parent image. Fixes: #12885 Backport: hammer Signed-off-by: Jason Dillaman --- src/librbd/DiffIterate.cc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/librbd/DiffIterate.cc b/src/librbd/DiffIterate.cc index 00a9ebae2b729..311758785a1bc 100644 --- a/src/librbd/DiffIterate.cc +++ b/src/librbd/DiffIterate.cc @@ -513,12 +513,11 @@ int DiffIterate::diff_object_map(uint64_t from_snap_id, uint64_t to_snap_id, int DiffIterate::simple_diff_cb(uint64_t off, size_t len, int exists, void *arg) { - // This reads the existing extents in a parent from the beginning - // of time. Since images are thin-provisioned, the extents will - // always represent data, not holes. 
- assert(exists); - interval_set *diff = static_cast *>(arg); - diff->insert(off, len); + // it's possible for a discard to create a hole in the parent image -- ignore + if (exists) { + interval_set *diff = static_cast *>(arg); + diff->insert(off, len); + } return 0; } From 95685c19d6f1eab50b903e61273b5351bedc2980 Mon Sep 17 00:00:00 2001 From: dwj192 Date: Wed, 9 Sep 2015 09:48:16 +0800 Subject: [PATCH 529/654] rgw:add --reset-regions for regionmap update Fixes: #12964 Signed-off-by: Weijun Duan --- src/rgw/rgw_admin.cc | 8 ++++++++ src/test/cli/radosgw-admin/help.t | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc index 1140cbdbbc5f8..15aedaa9bba29 100644 --- a/src/rgw/rgw_admin.cc +++ b/src/rgw/rgw_admin.cc @@ -165,6 +165,7 @@ void _usage() cout << " --categories= comma separated list of categories, used in usage show\n"; cout << " --caps= list of caps (e.g., \"usage=read, write; user=read\"\n"; cout << " --yes-i-really-mean-it required for certain operations\n"; + cout << " --reset-regions reset regionmap when regionmap update"; cout << "\n"; cout << " := \"YYYY-MM-DD[ hh:mm:ss]\"\n"; cout << "\nQuota options:\n"; @@ -1142,6 +1143,7 @@ int main(int argc, char **argv) int include_all = false; int sync_stats = false; + int reset_regions = false; uint64_t min_rewrite_size = 4 * 1024 * 1024; uint64_t max_rewrite_size = ULLONG_MAX; @@ -1314,6 +1316,8 @@ int main(int argc, char **argv) // do nothing } else if (ceph_argparse_binary_flag(args, i, &include_all, NULL, "--include-all", (char*)NULL)) { // do nothing + } else if (ceph_argparse_binary_flag(args, i, &reset_regions, NULL, "--reset-regions", (char*)NULL)) { + // do nothing } else if (ceph_argparse_witharg(args, i, &val, "--caps", (char*)NULL)) { caps = val; } else if (ceph_argparse_witharg(args, i, &val, "-i", "--infile", (char*)NULL)) { @@ -1552,6 +1556,10 @@ int main(int argc, char **argv) return -ret; } + if (reset_regions) { + 
regionmap.regions.clear(); + } + for (list::iterator iter = regions.begin(); iter != regions.end(); ++iter) { ret = region.read_info(*iter); if (ret < 0) { diff --git a/src/test/cli/radosgw-admin/help.t b/src/test/cli/radosgw-admin/help.t index 33aee1d5eb332..e2a8447e26213 100644 --- a/src/test/cli/radosgw-admin/help.t +++ b/src/test/cli/radosgw-admin/help.t @@ -122,7 +122,8 @@ --categories= comma separated list of categories, used in usage show --caps= list of caps (e.g., "usage=read, write; user=read" --yes-i-really-mean-it required for certain operations - + --reset-regions reset regionmap when regionmap update + := "YYYY-MM-DD[ hh:mm:ss]" Quota options: From d74135231c9b6199b3eb710cad851c937b167fbc Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Wed, 9 Sep 2015 11:40:48 +0800 Subject: [PATCH 530/654] AsyncMessenger: add instance name in debug log when processing msg To better debug. Signed-off-by: Zhiqiang Wang --- src/msg/async/AsyncConnection.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc index b0433e81f8850..7a6de148521cd 100644 --- a/src/msg/async/AsyncConnection.cc +++ b/src/msg/async/AsyncConnection.cc @@ -841,7 +841,7 @@ void AsyncConnection::process() in_seq.set(message->get_seq()); ldout(async_msgr->cct, 10) << __func__ << " got message " << message->get_seq() << " " << message << " " << *message << dendl; - ldout(async_msgr->cct, 1) << " == rx == " << message << " " << *message + ldout(async_msgr->cct, 1) << " == rx == " << message->get_source() << " " << message << " " << *message << dendl; // if send_message always successfully send, it may have no From b968fb3bcefb0cdafb1e84d7c679b61a201a940f Mon Sep 17 00:00:00 2001 From: David Zafman Date: Tue, 8 Sep 2015 13:41:31 -0700 Subject: [PATCH 531/654] rados: Fix usage for "notify" command Signed-off-by: David Zafman --- src/tools/rados/rados.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/tools/rados/rados.cc b/src/tools/rados/rados.cc index c845d30027f74..a1a31a647e62e 100644 --- a/src/tools/rados/rados.cc +++ b/src/tools/rados/rados.cc @@ -114,7 +114,7 @@ void usage(ostream& out) " setomapheader \n" " tmap-to-omap convert tmap keys/values to omap\n" " watch add watcher on this object\n" -" notify notify wather of this object with message\n" +" notify notify watcher of this object with message\n" " listwatchers list the watchers of this object\n" " set-alloc-hint \n" " set allocation hint for an object\n" From 95bd3c2c54312417b132ddb91c89fdbe63b01fd0 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Tue, 8 Sep 2015 12:33:44 -0700 Subject: [PATCH 532/654] test: Fix failure test to find message anywhere in stderr Consolidate test_failure() and test_failure_tty() Signed-off-by: David Zafman --- src/test/ceph_objectstore_tool.py | 55 ++++++++++++------------------- 1 file changed, 21 insertions(+), 34 deletions(-) diff --git a/src/test/ceph_objectstore_tool.py b/src/test/ceph_objectstore_tool.py index 40b3568cc2081..7f616b7d94c6e 100755 --- a/src/test/ceph_objectstore_tool.py +++ b/src/test/ceph_objectstore_tool.py @@ -142,51 +142,38 @@ def vstart(new, opt=""): print "DONE" -def test_failure_tty(cmd, errmsg): - try: - ttyfd = open("/dev/tty", "rw") - except Exception, e: - logging.info(str(e)) - logging.info("SKIP " + cmd) - return 0 +def test_failure(cmd, errmsg, tty=False): + if tty: + try: + ttyfd = open("/dev/tty", "rw") + except Exception, e: + logging.info(str(e)) + logging.info("SKIP " + cmd) + return 0 TMPFILE = r"/tmp/tmp.{pid}".format(pid=os.getpid()) tmpfd = open(TMPFILE, "w") logging.debug(cmd) - ret = call(cmd, shell=True, stdin=ttyfd, stdout=ttyfd, stderr=tmpfd) - ttyfd.close() + if tty: + ret = call(cmd, shell=True, stdin=ttyfd, stdout=ttyfd, stderr=tmpfd) + ttyfd.close() + else: + ret = call(cmd, shell=True, stderr=tmpfd) tmpfd.close() if ret == 0: logging.error(cmd) logging.error("Should have failed, but got exit 0") return 1 
lines = get_lines(TMPFILE) - line = lines[0] - if line == errmsg: - logging.info("Correctly failed with message \"" + line + "\"") + matched = [ l for l in lines if errmsg in l ] + if any(matched): + logging.info("Correctly failed with message \"" + matched[0] + "\"") return 0 else: - logging.error("Bad message to stderr \"" + line + "\"") + logging.error("Bad messages to stderr \"" + str(lines) + "\"") return 1 -def test_failure(cmd, errmsg): - logging.debug(cmd) - try: - check_output(cmd, stderr=subprocess.STDOUT, shell=True) - logging.error(cmd) - logging.error("Should have failed, but got exit 0") - return 1 - except subprocess.CalledProcessError, e: - if errmsg in e.output: - logging.info("Correctly failed with message \"" + errmsg + "\"") - return 0 - else: - errmsg = e.output.split('\n')[0] - logging.error("Bad message to stderr \"" + errmsg + "\"") - return 1 - - def get_nspace(num): if num == 0: return "" @@ -730,11 +717,11 @@ def main(argv): print "Test invalid parameters" # On export can't use stdout to a terminal cmd = (CFSD_PREFIX + "--op export --pgid {pg}").format(osd=ONEOSD, pg=ONEPG) - ERRORS += test_failure_tty(cmd, "stdout is a tty and no --file filename specified") + ERRORS += test_failure(cmd, "stdout is a tty and no --file filename specified", tty=True) # On export can't use stdout to a terminal cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file -").format(osd=ONEOSD, pg=ONEPG) - ERRORS += test_failure_tty(cmd, "stdout is a tty and no --file filename specified") + ERRORS += test_failure(cmd, "stdout is a tty and no --file filename specified", tty=True) # Prep a valid ec export file for import failure tests ONEECPG = ALLECPGS[0] @@ -777,11 +764,11 @@ def main(argv): # On import can't use stdin from a terminal cmd = (CFSD_PREFIX + "--op import --pgid {pg}").format(osd=ONEOSD, pg=ONEPG) - ERRORS += test_failure_tty(cmd, "stdin is a tty and no --file filename specified") + ERRORS += test_failure(cmd, "stdin is a tty and no --file filename 
specified", tty=True) # On import can't use stdin from a terminal cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file -").format(osd=ONEOSD, pg=ONEPG) - ERRORS += test_failure_tty(cmd, "stdin is a tty and no --file filename specified") + ERRORS += test_failure(cmd, "stdin is a tty and no --file filename specified", tty=True) # Specify a bad --type cmd = (CFSD_PREFIX + "--type foobar --op list --pgid {pg}").format(osd=ONEOSD, pg=ONEPG) From 19a210a0e75b5bfdd9d8f0d4d21d9403b22dac95 Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Fri, 24 Jul 2015 10:12:51 +0800 Subject: [PATCH 533/654] osd: make read(off=0,len=0) ec-object work. When len=0, it mean read the whole object. It can work when reading no-ec object off=0 & len=0. But for ec-object, it can't work. Signed-off-by: Jianpeng Ma --- src/osd/ReplicatedPG.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index eb93b30bd0af8..46392773f3b2a 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -3845,6 +3845,9 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) (op.extent.offset + op.extent.length > op.extent.truncate_size) ) size = op.extent.truncate_size; + if (op.extent.length == 0) //length is zero mean read the whole object + op.extent.length = size; + if (op.extent.offset >= size) { op.extent.length = 0; trimmed_read = true; From 797caae1f465c1a1bd97816f4f577e730a19273a Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Fri, 24 Jul 2015 10:25:41 +0800 Subject: [PATCH 534/654] test/librados: add test case for read object (off=0,len=0). 
Signed-off-by: Jianpeng Ma --- src/test/librados/io.cc | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/test/librados/io.cc b/src/test/librados/io.cc index 2634119a4d29e..cb37c45428cc9 100644 --- a/src/test/librados/io.cc +++ b/src/test/librados/io.cc @@ -117,6 +117,15 @@ TEST_F(LibRadosIoPP, ReadOpPP) { ASSERT_EQ(0, memcmp(op_bl.c_str(), buf, sizeof(buf))); } + { + bufferlist op_bl; + ObjectReadOperation op; + op.read(0, 0, NULL, NULL); //len=0 mean read the whole object data. + ASSERT_EQ(0, ioctx.operate("foo", &op, &op_bl)); + ASSERT_EQ(sizeof(buf), op_bl.length()); + ASSERT_EQ(0, memcmp(op_bl.c_str(), buf, sizeof(buf))); + } + { bufferlist read_bl, op_bl; ObjectReadOperation op; @@ -634,6 +643,15 @@ TEST_F(LibRadosIoECPP, ReadOpPP) { ASSERT_EQ(0, memcmp(op_bl.c_str(), buf, sizeof(buf))); } + { + bufferlist op_bl; + ObjectReadOperation op; + op.read(0, 0, NULL, NULL); //len=0 mean read the whole object data + ASSERT_EQ(0, ioctx.operate("foo", &op, &op_bl)); + ASSERT_EQ(sizeof(buf), op_bl.length()); + ASSERT_EQ(0, memcmp(op_bl.c_str(), buf, sizeof(buf))); + } + { bufferlist read_bl, op_bl; ObjectReadOperation op; From 6f9ee7961eee9ee3a61a07cbe0d8d289ee98fa9a Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 30 Jul 2015 16:59:32 -0700 Subject: [PATCH 535/654] ReplicatedPG,Objecter: copy_get should include truncate_seq and size Otherwise, we break CephFS over cache tiers. 
Fixes: #12551 Signed-off-by: Samuel Just --- src/osd/ReplicatedPG.cc | 11 +++++++++++ src/osd/ReplicatedPG.h | 5 ++++- src/osd/osd_types.cc | 10 ++++++++-- src/osd/osd_types.h | 10 ++++++++-- src/osdc/Objecter.h | 18 ++++++++++++++++-- 5 files changed, 47 insertions(+), 7 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index dcd11f5553f29..902aba78f9715 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -6044,6 +6044,8 @@ int ReplicatedPG::fill_in_copy_get( reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST; reply_obj.omap_digest = oi.omap_digest; } + reply_obj.truncate_seq = oi.truncate_seq; + reply_obj.truncate_size = oi.truncate_size; // attrs map& out_attrs = reply_obj.attrs; @@ -6242,6 +6244,8 @@ void ReplicatedPG::_copy_some(ObjectContextRef obc, CopyOpRef cop) &cop->results.source_data_digest, &cop->results.source_omap_digest, &cop->results.reqids, + &cop->results.truncate_seq, + &cop->results.truncate_size, &cop->rval); op.set_last_op_flags(cop->src_obj_fadvise_flags); @@ -6341,6 +6345,8 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r) cop->results.final_tx = pgbackend->get_transaction(); _build_finish_copy_transaction(cop, cop->results.final_tx); + derr << __func__ << " got truncate_seq " << cop->results.truncate_seq + << " " << cop->results.truncate_size << dendl; // verify digests? if (cop->results.is_data_digest() || cop->results.is_omap_digest()) { dout(20) << __func__ << std::hex @@ -6545,6 +6551,9 @@ void ReplicatedPG::finish_copyfrom(OpContext *ctx) obs.oi.set_data_digest(cb->results->data_digest); obs.oi.set_omap_digest(cb->results->omap_digest); + obs.oi.truncate_seq = cb->results->truncate_seq; + obs.oi.truncate_size = cb->results->truncate_size; + ctx->extra_reqids = cb->results->reqids; // cache: clear whiteout? 
@@ -6726,6 +6735,8 @@ void ReplicatedPG::finish_promote(int r, CopyResults *results, tctx->new_obs.oi.set_data_digest(results->data_digest); if (results->has_omap) tctx->new_obs.oi.set_omap_digest(results->omap_digest); + tctx->new_obs.oi.truncate_seq = results->truncate_seq; + tctx->new_obs.oi.truncate_size = results->truncate_size; if (soid.snap != CEPH_NOSNAP) { tctx->new_obs.oi.snaps = results->snaps; diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 9c280365da27f..89794c24f2ba3 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -127,6 +127,8 @@ class ReplicatedPG : public PG, public PGBackend::Listener { uint32_t source_data_digest, source_omap_digest; uint32_t data_digest, omap_digest; vector > reqids; // [(reqid, user_version)] + uint64_t truncate_seq; + uint64_t truncate_size; bool is_data_digest() { return flags & object_copy_data_t::FLAG_DATA_DIGEST; } @@ -140,7 +142,8 @@ class ReplicatedPG : public PG, public PGBackend::Listener { has_omap(false), flags(0), source_data_digest(-1), source_omap_digest(-1), - data_digest(-1), omap_digest(-1) + data_digest(-1), omap_digest(-1), + truncate_seq(0), truncate_size(0) {} }; diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index dbcbd3dab357a..f0a1e4730f161 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -3632,7 +3632,7 @@ void object_copy_data_t::encode(bufferlist& bl, uint64_t features) const return; } - ENCODE_START(6, 5, bl); + ENCODE_START(7, 5, bl); ::encode(size, bl); ::encode(mtime, bl); ::encode(attrs, bl); @@ -3646,12 +3646,14 @@ void object_copy_data_t::encode(bufferlist& bl, uint64_t features) const ::encode(data_digest, bl); ::encode(omap_digest, bl); ::encode(reqids, bl); + ::encode(truncate_seq, bl); + ::encode(truncate_size, bl); ENCODE_FINISH(bl); } void object_copy_data_t::decode(bufferlist::iterator& bl) { - DECODE_START(6, bl); + DECODE_START(7, bl); if (struct_v < 5) { // old ::decode(size, bl); @@ -3703,6 +3705,10 @@ void 
object_copy_data_t::decode(bufferlist::iterator& bl) if (struct_v >= 6) { ::decode(reqids, bl); } + if (struct_v >= 7) { + ::decode(truncate_seq, bl); + ::decode(truncate_size, bl); + } } DECODE_FINISH(bl); } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 0c45f413ebc05..0bedb490481de 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -2677,9 +2677,15 @@ struct object_copy_data_t { ///< recent reqids on this object vector > reqids; + uint64_t truncate_seq; + uint64_t truncate_size; + public: - object_copy_data_t() : size((uint64_t)-1), data_digest(-1), - omap_digest(-1), flags(0) {} + object_copy_data_t() : + size((uint64_t)-1), data_digest(-1), + omap_digest(-1), flags(0), + truncate_seq(0), + truncate_size(0) {} static void generate_test_instances(list& o); void encode_classic(bufferlist& bl) const; diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index 7c49efb95c14d..038e5f3b6e6c6 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -633,6 +633,8 @@ struct ObjectOperation { uint32_t *out_data_digest; uint32_t *out_omap_digest; vector > *out_reqids; + uint64_t *out_truncate_seq; + uint64_t *out_truncate_size; int *prval; C_ObjectOperation_copyget(object_copy_cursor_t *c, uint64_t *s, @@ -646,13 +648,18 @@ struct ObjectOperation { uint32_t *dd, uint32_t *od, vector > *oreqids, + uint64_t *otseq, + uint64_t *otsize, int *r) : cursor(c), out_size(s), out_mtime(m), out_attrs(a), out_data(d), out_omap_header(oh), out_omap_data(o), out_snaps(osnaps), out_snap_seq(osnap_seq), out_flags(flags), out_data_digest(dd), out_omap_digest(od), - out_reqids(oreqids), prval(r) {} + out_reqids(oreqids), + out_truncate_seq(otseq), + out_truncate_size(otsize), + prval(r) {} void finish(int r) { if (r < 0) return; @@ -684,6 +691,10 @@ struct ObjectOperation { *out_omap_digest = copy_reply.omap_digest; if (out_reqids) *out_reqids = copy_reply.reqids; + if (out_truncate_seq) + *out_truncate_seq = copy_reply.truncate_seq; + if (out_truncate_size) + 
*out_truncate_size = copy_reply.truncate_size; *cursor = copy_reply.cursor; } catch (buffer::error& e) { if (prval) @@ -707,6 +718,8 @@ struct ObjectOperation { uint32_t *out_data_digest, uint32_t *out_omap_digest, vector > *out_reqids, + uint64_t *truncate_seq, + uint64_t *truncate_size, int *prval) { OSDOp& osd_op = add_op(CEPH_OSD_OP_COPY_GET); osd_op.op.copy_get.max = max; @@ -720,7 +733,8 @@ struct ObjectOperation { out_attrs, out_data, out_omap_header, out_omap_data, out_snaps, out_snap_seq, out_flags, out_data_digest, out_omap_digest, - out_reqids, prval); + out_reqids, truncate_seq, truncate_size, + prval); out_bl[p] = &h->bl; out_handler[p] = h; } From 4f98dab99c35663de89a06e2dfdbd874f56aed41 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 12 Aug 2015 18:38:38 +0200 Subject: [PATCH 536/654] client/Client.cc: fix realloc memory leak Fix handling of realloc. If realloc() fails it returns NULL, assigning the return value of realloc() directly to the pointer without checking for the result will lead to a memory leak. 
Signed-off-by: Danny Al-Gaaf --- src/client/Client.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index 638ceecb58558..20e946d8e87c5 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -4597,11 +4597,13 @@ int Client::check_permissions(Inode *in, int flags, int uid, int gid) if (getgrouplist(pw->pw_name, gid, sgids, &sgid_count) == -1) { #endif // we need to resize the group list and try again - sgids = (gid_t*)realloc(sgids, sgid_count * sizeof(gid_t)); - if (sgids == NULL) { + void *_realloc = NULL; + if ((_realloc = realloc(sgids, sgid_count * sizeof(gid_t))) == NULL) { ldout(cct, 3) << "allocating group memory failed" << dendl; + free(sgids); return -EACCES; } + sgids = (gid_t*)_realloc; continue; } // list was successfully retrieved From 8810f8f2b8452d99caf6eba28e5de763d482d180 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 12 Aug 2015 18:48:40 +0200 Subject: [PATCH 537/654] SnappyCompressor.h: prefer ++operator for non-primitive iterators Signed-off-by: Danny Al-Gaaf --- src/compressor/SnappyCompressor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compressor/SnappyCompressor.h b/src/compressor/SnappyCompressor.h index f5d86d5a2420d..ba58b4624ee79 100644 --- a/src/compressor/SnappyCompressor.h +++ b/src/compressor/SnappyCompressor.h @@ -40,7 +40,7 @@ class BufferlistSource : public snappy::Source { } virtual void Skip(size_t n) { if (n + pb_off == pb->length()) { - pb++; + ++pb; pb_off = 0; } else { pb_off += n; From 9d9b305981ccb6eae242c8d3d881f39184993d57 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 12 Aug 2015 18:50:11 +0200 Subject: [PATCH 538/654] os/KeyValueStore.cc: prefer ++operator for non-primitive iterators Signed-off-by: Danny Al-Gaaf --- src/os/KeyValueStore.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/os/KeyValueStore.cc b/src/os/KeyValueStore.cc index a147c0b27ab42..f2531f7508109 100644 --- 
a/src/os/KeyValueStore.cc +++ b/src/os/KeyValueStore.cc @@ -2026,7 +2026,7 @@ int KeyValueStore::_zero(coll_t cid, const ghobject_t& oid, uint64_t offset, r = check_get_rc(header->cid, header->oid, r, lookup_keys.size() == values.size()); if (r < 0) return r; - for(set::iterator it = lookup_keys.begin(); it != lookup_keys.end(); it++) + for(set::iterator it = lookup_keys.begin(); it != lookup_keys.end(); ++it) { pair p = off_len[*it]; values[*it].zero(p.first, p.second); From 897f074969405dd69375c1c3e24b107558d84194 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 12 Aug 2015 18:51:17 +0200 Subject: [PATCH 539/654] test_async_compressor.cc: prefer ++operator for non-primitive iterators Signed-off-by: Danny Al-Gaaf --- src/test/common/test_async_compressor.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/common/test_async_compressor.cc b/src/test/common/test_async_compressor.cc index aca448c0a00cd..1655596d1a038 100644 --- a/src/test/common/test_async_compressor.cc +++ b/src/test/common/test_async_compressor.cc @@ -142,7 +142,7 @@ class SyntheticWorkload { for (set >::iterator it = compress_jobs.begin(); it != compress_jobs.end();) { prev = it; - it++; + ++it; ASSERT_EQ(0, async_compressor->get_compress_data(prev->first, data, blocking, &finished)); if (finished) { c_reap++; @@ -157,7 +157,7 @@ class SyntheticWorkload { for (set >::iterator it = decompress_jobs.begin(); it != decompress_jobs.end();) { prev = it; - it++; + ++it; ASSERT_EQ(0, async_compressor->get_decompress_data(prev->first, data, blocking, &finished)); if (finished) { d_reap++; From 6ceb37d4bacf92182046854e38a71b9bc61bdc83 Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Wed, 9 Sep 2015 18:15:34 +0100 Subject: [PATCH 540/654] test: mon/mon-scrub.sh: port clashed with other tests This will allow the test to be run during make check. 
Signed-off-by: Joao Eduardo Luis --- src/test/mon/mon-scrub.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/mon/mon-scrub.sh b/src/test/mon/mon-scrub.sh index 289e03b72d545..eb33bbcc7f305 100755 --- a/src/test/mon/mon-scrub.sh +++ b/src/test/mon/mon-scrub.sh @@ -21,7 +21,7 @@ function run() { local dir=$1 shift - export CEPH_MON="127.0.0.1:7104" + export CEPH_MON="127.0.0.1:7120" export CEPH_ARGS CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " CEPH_ARGS+="--mon-host=$CEPH_MON " From 293d12a2bc26e769935a270478b21b9e9e5a9406 Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Wed, 9 Sep 2015 15:39:27 +0100 Subject: [PATCH 541/654] test/Makefile.am: run mon/mon-scrub.sh as part of checks Signed-off-by: Joao Eduardo Luis --- src/test/CMakeLists.txt | 3 +++ src/test/Makefile.am | 1 + 2 files changed, 4 insertions(+) diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 76be468f88a9d..71f304c5c1941 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -355,6 +355,9 @@ add_dependencies(check osd_erasure_code_profile) add_test(NAME osd_crush COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/mon/osd-crush.sh) add_dependencies(check osd_crush) +add_test(NAME mon_scrub COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/mon/mon-scrub.sh) +add_dependencies(check mon_scrub) + add_test(NAME test_ceph_daemon COMMAND python ${CMAKE_SOURCE_DIR}/src/test/pybind/test_ceph_daemon.py) add_dependencies(check test_ceph_daemon) diff --git a/src/test/Makefile.am b/src/test/Makefile.am index f6373c76d6d56..29e98866546e4 100644 --- a/src/test/Makefile.am +++ b/src/test/Makefile.am @@ -76,6 +76,7 @@ check_SCRIPTS += \ test/mon/osd-crush.sh \ test/mon/osd-erasure-code-profile.sh \ test/mon/mkfs.sh \ + test/mon/mon-scrub.sh \ test/osd/osd-scrub-repair.sh \ test/osd/osd-config.sh \ test/osd/osd-bench.sh \ From 06147dda2c50c56f01b44664cdf6e0dc58bdfa22 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Wed, 9 Sep 2015 15:41:02 -0700 Subject: 
[PATCH 542/654] rgw: preserve all attrs if intra-zone copy Fixes: #13015 Intra zone copy requires that all objects' attributes are preserved. This was broken at commit: e41d97c8e38bb60d7e09e9801c0179efe7af1734 Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_rados.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index c23a2db3ee7f0..8c6418201c14a 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -3885,6 +3885,8 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, if (source_zone.empty()) { set_copy_attrs(src_attrs, attrs, attrs_mod); + } else { + attrs = src_attrs; } ret = cb.complete(etag, mtime, set_mtime, attrs); From 6907778d767ba08bb80c495785056ed122b023fe Mon Sep 17 00:00:00 2001 From: Mykola Golub Date: Tue, 16 Jun 2015 11:57:08 +0300 Subject: [PATCH 543/654] ceph-objectstore-tool: add mark-complete operation It is supposed to be used as a last resort to fix a cluster that has PGs in 'incomplete' state, using the following procedure: 1) stop the osd that is primary for the incomplete PG; 2) run: ceph-objectstore-tool --data-path ... --journal-path ... --pgid $PGID --op mark-complete 3) start the osd. 
Fixes: #10098 Signed-off-by: Mykola Golub --- src/test/ceph_objectstore_tool.py | 2 +- src/tools/ceph_objectstore_tool.cc | 37 ++++++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/src/test/ceph_objectstore_tool.py b/src/test/ceph_objectstore_tool.py index 7f616b7d94c6e..79230d22834f6 100755 --- a/src/test/ceph_objectstore_tool.py +++ b/src/test/ceph_objectstore_tool.py @@ -791,7 +791,7 @@ def main(argv): # Specify a bad --op command cmd = (CFSD_PREFIX + "--op oops").format(osd=ONEOSD) - ERRORS += test_failure(cmd, "Must provide --op (info, log, remove, export, import, list, fix-lost, list-pgs, rm-past-intervals, set-allow-sharded-objects, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap)") + ERRORS += test_failure(cmd, "Must provide --op (info, log, remove, export, import, list, fix-lost, list-pgs, rm-past-intervals, set-allow-sharded-objects, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete)") # Provide just the object param not a command cmd = (CFSD_PREFIX + "object").format(osd=ONEOSD) diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc index a2dbccbc5fc20..4f74501e8021c 100644 --- a/src/tools/ceph_objectstore_tool.cc +++ b/src/tools/ceph_objectstore_tool.cc @@ -1909,10 +1909,10 @@ int main(int argc, char **argv) ("journal-path", po::value(&jpath), "path to journal, mandatory for filestore type") ("pgid", po::value(&pgidstr), - "PG id, mandatory for info, log, remove, export, rm-past-intervals") + "PG id, mandatory for info, log, remove, export, rm-past-intervals, mark-complete") ("op", po::value(&op), "Arg is one of [info, log, remove, export, import, list, fix-lost, list-pgs, rm-past-intervals, set-allow-sharded-objects, dump-journal, dump-super, meta-list, " - "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap]") + "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, 
mark-complete]") ("epoch", po::value(&epoch), "epoch# for get-osdmap and get-inc-osdmap, the current epoch in use if not specified") ("file", po::value(&file), @@ -2285,7 +2285,8 @@ int main(int argc, char **argv) // The ops which require --pgid option are checked here and // mentioned in the usage for --pgid. if ((op == "info" || op == "log" || op == "remove" || op == "export" - || op == "rm-past-intervals") && pgidstr.length() == 0) { + || op == "rm-past-intervals" || op == "mark-complete") && + pgidstr.length() == 0) { cerr << "Must provide pgid" << std::endl; usage(desc); ret = 1; @@ -2563,9 +2564,9 @@ int main(int argc, char **argv) // If not an object command nor any of the ops handled below, then output this usage // before complaining about a bad pgid - if (!vm.count("objcmd") && op != "export" && op != "info" && op != "log" && op != "rm-past-intervals") { + if (!vm.count("objcmd") && op != "export" && op != "info" && op != "log" && op != "rm-past-intervals" && op != "mark-complete") { cerr << "Must provide --op (info, log, remove, export, import, list, fix-lost, list-pgs, rm-past-intervals, set-allow-sharded-objects, dump-journal, dump-super, meta-list, " - "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap)" + "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete)" << std::endl; usage(desc); ret = 1; @@ -2822,6 +2823,32 @@ int main(int argc, char **argv) fs->apply_transaction(*t); cout << "Removal succeeded" << std::endl; } + } else if (op == "mark-complete") { + ObjectStore::Transaction tran; + ObjectStore::Transaction *t = &tran; + + if (struct_ver != PG::cur_struct_v) { + cerr << "Can't mark-complete, version mismatch " << (int)struct_ver + << " (pg) != " << (int)PG::cur_struct_v << " (tool)" + << std::endl; + ret = 1; + goto out; + } + + cout << "Marking complete " << std::endl; + + info.last_update = eversion_t(superblock.current_epoch, info.last_update.version + 1); + info.last_backfill = hobject_t::get_max(); + 
info.last_epoch_started = superblock.current_epoch; + info.history.last_epoch_started = superblock.current_epoch; + info.history.last_epoch_clean = superblock.current_epoch; + past_intervals.clear(); + + ret = write_info(*t, map_epoch, info, past_intervals); + if (ret == 0) { + fs->apply_transaction(*t); + cout << "Marking complete succeeded" << std::endl; + } } else { assert(!"Should have already checked for valid --op"); } From 94d84ccd958aa4ece47ea1b96deb763b7d70b32b Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Wed, 9 Sep 2015 20:08:38 +0100 Subject: [PATCH 544/654] test: mon/mon-ping.sh: make sure 'ceph mon ping' works as expected Signed-off-by: Joao Eduardo Luis --- src/test/CMakeLists.txt | 3 +++ src/test/Makefile.am | 1 + src/test/mon/mon-ping.sh | 46 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+) create mode 100755 src/test/mon/mon-ping.sh diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 71f304c5c1941..18ea8815758e2 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -358,6 +358,9 @@ add_dependencies(check osd_crush) add_test(NAME mon_scrub COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/mon/mon-scrub.sh) add_dependencies(check mon_scrub) +add_test(NAME mon_ping COMMAND bash ${CMAKE_SOURCE_DIR}/src/test/mon/mon-ping.sh) +add_dependencies(check mon_ping) + add_test(NAME test_ceph_daemon COMMAND python ${CMAKE_SOURCE_DIR}/src/test/pybind/test_ceph_daemon.py) add_dependencies(check test_ceph_daemon) diff --git a/src/test/Makefile.am b/src/test/Makefile.am index 65f1f77fdc35a..89fc7dfa7603a 100644 --- a/src/test/Makefile.am +++ b/src/test/Makefile.am @@ -74,6 +74,7 @@ check_SCRIPTS += \ test/mon/osd-pool-create.sh \ test/mon/misc.sh \ test/mon/osd-crush.sh \ + test/mon/mon-ping.sh \ test/mon/osd-erasure-code-profile.sh \ test/mon/mkfs.sh \ test/mon/mon-scrub.sh \ diff --git a/src/test/mon/mon-ping.sh b/src/test/mon/mon-ping.sh new file mode 100755 index 0000000000000..e3f7395658aac --- 
/dev/null +++ b/src/test/mon/mon-ping.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# +# Copyright (C) 2015 SUSE LINUX GmbH +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# +source ../qa/workunits/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7119" + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + +function TEST_mon_ping() { + local dir=$1 + + run_mon $dir a || return 1 + + ./ceph ping mon.a || return 1 +} + +main mon-ping "$@" + +# Local Variables: +# compile-command: "cd ../.. ; make -j4 && test/mon/mon-ping.sh" +# End: From c503e97b7f7e0349147629f78b1a17c77bcf9b0c Mon Sep 17 00:00:00 2001 From: "Javier M. Mellid" Date: Wed, 9 Sep 2015 15:56:04 +0200 Subject: [PATCH 545/654] rgw: include RequestId as part of the Error response The RGW error responses must contain a RequestId to be compliant with the Amazon S3 errors. This RequestId is the ID of the request associated with the error. Fixes: #13020 Signed-off-by: Javier M. 
Mellid --- src/rgw/rgw_rest.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc index b7f51970d09cf..e2b1568247c31 100644 --- a/src/rgw/rgw_rest.cc +++ b/src/rgw/rgw_rest.cc @@ -553,6 +553,8 @@ void end_header(struct req_state *s, RGWOp *op, const char *content_type, const s->formatter->dump_string("Code", s->err.s3_code); if (!s->err.message.empty()) s->formatter->dump_string("Message", s->err.message); + if (!s->trans_id.empty()) + s->formatter->dump_string("RequestId", s->trans_id); s->formatter->close_section(); dump_content_length(s, s->formatter->get_len()); } else { From 71909b64b8caaacfcd2ee0b1ab3d47e42b6808fe Mon Sep 17 00:00:00 2001 From: "Javier M. Mellid" Date: Wed, 9 Sep 2015 22:07:23 +0200 Subject: [PATCH 546/654] doc: rgw: update x-amz-request-id status Signed-off-by: Javier M. Mellid --- doc/dev/radosgw/s3_compliance.rst | 2 +- doc/radosgw/s3.rst | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/doc/dev/radosgw/s3_compliance.rst b/doc/dev/radosgw/s3_compliance.rst index ce88e7f7d01fd..f22b3039f93e6 100644 --- a/doc/dev/radosgw/s3_compliance.rst +++ b/doc/dev/radosgw/s3_compliance.rst @@ -90,7 +90,7 @@ S3 Documentation reference : http://docs.aws.amazon.com/AmazonS3/latest/API/REST +---------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+ | x-amz-id-2 | No | | | +---------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+ -| x-amz-request-id | No | | | +| x-amz-request-id | Yes | https://github.com/ceph/ceph/commit/b711e3124f8f73c17ebd19b38807a1b77f201e44 | | +---------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+ | x-amz-version-id | No | | | 
+---------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+ diff --git a/doc/radosgw/s3.rst b/doc/radosgw/s3.rst index 8b21e42ef4fd1..67f497ca1491c 100644 --- a/doc/radosgw/s3.rst +++ b/doc/radosgw/s3.rst @@ -89,8 +89,6 @@ The following common request header fields are not supported: +----------------------------+------------+ | **x-amz-id-2** | Response | +----------------------------+------------+ -| **x-amz-request-id** | Response | -+----------------------------+------------+ | **x-amz-version-id** | Response | +----------------------------+------------+ From c4401ad60e27d67f0b38529b5471dc7e00f92651 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 10 Sep 2015 07:51:52 -0400 Subject: [PATCH 547/654] test/Makefile-client: ship LibradosTestStub.h in tarball Signed-off-by: Sage Weil --- src/test/Makefile-client.am | 1 + 1 file changed, 1 insertion(+) diff --git a/src/test/Makefile-client.am b/src/test/Makefile-client.am index 49e17f6e61d6f..ec40ccaab925d 100644 --- a/src/test/Makefile-client.am +++ b/src/test/Makefile-client.am @@ -282,6 +282,7 @@ librados_test_stub_la_SOURCES = \ test/librados_test_stub/TestRadosClient.cc \ test/librados_test_stub/TestWatchNotify.cc noinst_HEADERS += \ + test/librados_test_stub/LibradosTestStub.h \ test/librados_test_stub/TestClassHandler.h \ test/librados_test_stub/TestRadosClient.h \ test/librados_test_stub/TestMemRadosClient.h \ From 82ba048d2c485259e586271ec1519ed359962eaa Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 10 Sep 2015 10:35:11 -0400 Subject: [PATCH 548/654] common: add debug option to deliberately leak some memory This will be used to verify leak detection tests are working. 
Signed-off-by: Sage Weil --- src/common/config_opts.h | 2 ++ src/global/global_init.cc | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index a09873666e319..de05abf6be647 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -1136,3 +1136,5 @@ OPTION(throttler_perf_counter, OPT_BOOL, true) // enable/disable throttler perf // This will be set to true when it is safe to start threads. // Once it is true, it will never change. OPTION(internal_safe_to_start_threads, OPT_BOOL, false) + +OPTION(debug_deliberately_leak_memory, OPT_BOOL, false) diff --git a/src/global/global_init.cc b/src/global/global_init.cc index ce044078fbe89..4e6a6a6e9aa70 100644 --- a/src/global/global_init.cc +++ b/src/global/global_init.cc @@ -240,6 +240,13 @@ void global_init(std::vector < const char * > *alt_def_args, // and opening the log file immediately. g_conf->call_all_observers(); + // test leak checking + if (g_conf->debug_deliberately_leak_memory) { + derr << "deliberately leaking some memory" << dendl; + char *s = new char[1234567]; + (void)s; + } + if (code_env == CODE_ENVIRONMENT_DAEMON && !(flags & CINIT_FLAG_NO_DAEMON_ACTIONS)) output_ceph_version(); } From 8e07c8c8af4e115851f9a31be430f3540eea1de2 Mon Sep 17 00:00:00 2001 From: Jason Dillaman Date: Thu, 10 Sep 2015 12:21:57 -0400 Subject: [PATCH 549/654] rbd: missing return statement within do_metadata_remove Signed-off-by: Jason Dillaman --- src/rbd.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/rbd.cc b/src/rbd.cc index ee26fae59ae12..64094e3c39a47 100755 --- a/src/rbd.cc +++ b/src/rbd.cc @@ -2321,6 +2321,7 @@ static int do_metadata_remove(librbd::Image& image, const char *key) if (r < 0) { cerr << "failed to remove metadata " << key << " of image : " << cpp_strerror(r) << std::endl; } + return r; } static int do_metadata_get(librbd::Image& image, const char *key) From 742feec7bcd9e84d771c490b8e3ea9da2e1a4c27 Mon Sep 17 00:00:00 2001 From: 
Danny Al-Gaaf Date: Wed, 12 Aug 2015 18:46:19 +0200 Subject: [PATCH 550/654] mds/MDCache.cc: fix uninitialized variable Fix for: [src/mds/MDCache.cc:8172]: (error) Uninitialized variable: checked_rank Signed-off-by: Danny Al-Gaaf --- src/mds/MDCache.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 494fc015a95b6..3ec852a97d136 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -8167,7 +8167,7 @@ void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret) _open_ino_traverse_dir(ino, info, 0); } else { if (ret >= 0) { - mds_rank_t checked_rank; + mds_rank_t checked_rank = mds_rank_t(ret); info.check_peers = true; info.auth_hint = checked_rank; info.checked.erase(checked_rank); From 7a7248d952899338e885ca884796a1ee5691b463 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Tue, 18 Aug 2015 12:26:44 +0200 Subject: [PATCH 551/654] ConfigKeyService.cc: move assert before first deref Fix for: [src/mon/ConfigKeyService.cc:94] -> [src/mon/ConfigKeyService.cc:100]: (warning) Possible null pointer dereference: m - otherwise it is redundant to check it against null.
Signed-off-by: Danny Al-Gaaf --- src/mon/ConfigKeyService.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mon/ConfigKeyService.cc b/src/mon/ConfigKeyService.cc index ba6408bd93893..64ed42ffc883f 100644 --- a/src/mon/ConfigKeyService.cc +++ b/src/mon/ConfigKeyService.cc @@ -91,13 +91,14 @@ void ConfigKeyService::store_list(stringstream &ss) bool ConfigKeyService::service_dispatch(MonOpRequestRef op) { Message *m = op->get_req(); + assert(m != NULL); dout(10) << __func__ << " " << *m << dendl; + if (!in_quorum()) { dout(1) << __func__ << " not in quorum -- ignore message" << dendl; return false; } - assert(m != NULL); assert(m->get_type() == MSG_MON_COMMAND); MMonCommand *cmd = static_cast(m); From a29dd45dd89f59ff15018f541601ac5ede162174 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Tue, 18 Aug 2015 12:34:01 +0200 Subject: [PATCH 552/654] client/Client.cc: remove only once used variable Fix for: [src/client/Client.cc:4555]: (style) The scope of the variable 'initial_group_count' can be reduced. Signed-off-by: Danny Al-Gaaf --- src/client/Client.cc | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index 20e946d8e87c5..369d2b2d46169 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -4560,12 +4560,6 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient int Client::check_permissions(Inode *in, int flags, int uid, int gid) { - // initial number of group entries, defaults to posix standard of 16 - // PAM implementations may provide more than 16 groups.... 
-#if HAVE_GETGROUPLIST - int initial_group_count = 16; -#endif - gid_t *sgids = NULL; int sgid_count = 0; if (getgroups_cb) { @@ -4578,7 +4572,9 @@ int Client::check_permissions(Inode *in, int flags, int uid, int gid) #if HAVE_GETGROUPLIST else { //use PAM to get the group list - sgid_count = initial_group_count; + // initial number of group entries, defaults to posix standard of 16 + // PAM implementations may provide more than 16 groups.... + sgid_count = 16; sgids = (gid_t*)malloc(sgid_count * sizeof(gid_t)); if (sgids == NULL) { ldout(cct, 3) << "allocating group memory failed" << dendl; From e243aa84c0bc3e7c15178fc5a5a7499db3b88c37 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Fri, 21 Aug 2015 16:11:16 +0200 Subject: [PATCH 553/654] mds/MDSDaemon.cc: fix resource leak in MDSDaemon Delete mdsmap in destructor. Remove unneeded checks for mds_rank and objecter before calling delete since the C++ standard allows the deletion of pointer with NULL-value. The check is redundant. Fix for: CID 1316224 (#1 of 1): Resource leak in object (CTOR_DTOR_LEAK) 1. alloc_new: Allocating memory by calling new MDSMap. 2. var_assign: Assigning: this->mdsmap = new MDSMap. 3. ctor_dtor_leak: The constructor allocates field mdsmap of MDSDaemon but the destructor and whatever functions it calls do not free it.
Signed-off-by: Danny Al-Gaaf --- src/mds/MDSDaemon.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc index 7d23722792c5a..793dab40de319 100644 --- a/src/mds/MDSDaemon.cc +++ b/src/mds/MDSDaemon.cc @@ -139,8 +139,12 @@ MDSDaemon::MDSDaemon(const std::string &n, Messenger *m, MonClient *mc) : MDSDaemon::~MDSDaemon() { Mutex::Locker lock(mds_lock); - if (mds_rank) {delete mds_rank ; mds_rank = NULL; } - if (objecter) {delete objecter ; objecter = NULL; } + delete mds_rank; + mds_rank = NULL; + delete objecter; + objecter = NULL; + delete mdsmap; + mdsmap = NULL; delete authorize_handler_service_registry; delete authorize_handler_cluster_registry; From b229162e368ac0b10c8da1112587128f029a29c2 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Fri, 21 Aug 2015 16:21:17 +0200 Subject: [PATCH 554/654] client/Client.cc: fix memory leak, free sgids in error case Signed-off-by: Danny Al-Gaaf --- src/client/Client.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/client/Client.cc b/src/client/Client.cc index 369d2b2d46169..fdb872ca269fe 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -4584,6 +4584,7 @@ int Client::check_permissions(Inode *in, int flags, int uid, int gid) pw = getpwuid(uid); if (pw == NULL) { ldout(cct, 3) << "getting user entry failed" << dendl; + free(sgids); return -EACCES; } while (1) { From 4a2377c40bc34906c212f8a324df668f5bcc0a8c Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 9 Sep 2015 16:45:55 +0200 Subject: [PATCH 555/654] rgw_object_expirer_core.cc: remove left over unused variable Remove unused variable is_next_available and while loop. These are leftovers from commit #a69a989f. 
Signed-off-by: Danny Al-Gaaf --- src/rgw/rgw_object_expirer_core.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/rgw/rgw_object_expirer_core.cc b/src/rgw/rgw_object_expirer_core.cc index b092b40bc3945..48577fe1967e7 100644 --- a/src/rgw/rgw_object_expirer_core.cc +++ b/src/rgw/rgw_object_expirer_core.cc @@ -181,7 +181,6 @@ void RGWObjectExpirer::process_single_shard(const string& shard, void RGWObjectExpirer::inspect_all_shards(const utime_t& last_run, const utime_t& round_start) { - bool is_next_available; utime_t shard_marker; CephContext *cct = store->ctx(); @@ -194,7 +193,7 @@ void RGWObjectExpirer::inspect_all_shards(const utime_t& last_run, const utime_t ldout(store->ctx(), 20) << "proceeding shard = " << shard << dendl; process_single_shard(shard, last_run, round_start); - } while (is_next_available); + } return; } From 534fdd5c8f97e5e6dfcb40d87df51154b849ea68 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 9 Sep 2015 16:48:56 +0200 Subject: [PATCH 556/654] ErasureCodeShecTableCache.cc: prefer --operator for non-primitive iterators Signed-off-by: Danny Al-Gaaf --- src/erasure-code/shec/ErasureCodeShecTableCache.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/erasure-code/shec/ErasureCodeShecTableCache.cc b/src/erasure-code/shec/ErasureCodeShecTableCache.cc index a037892f67c09..ebd32db0ae5bd 100644 --- a/src/erasure-code/shec/ErasureCodeShecTableCache.cc +++ b/src/erasure-code/shec/ErasureCodeShecTableCache.cc @@ -289,7 +289,7 @@ ErasureCodeShecTableCache::putDecodingTableToCache(int* decoding_matrix, // allocate a new buffer lru_list_t::iterator it_end = decode_tbls_lru->end(); - it_end--; + --it_end; lru_entry_t &map_value = (*decode_tbls_map)[signature] = From a1be9ef4fed1dbf04a256e8196ddc3cca3bc0274 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 9 Sep 2015 16:50:38 +0200 Subject: [PATCH 557/654] NewStore.cc: prefer --/++operator for non-primitive iterators Signed-off-by: Danny Al-Gaaf 
--- src/os/newstore/NewStore.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 953dcd22d5864..2288894042a50 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -463,7 +463,7 @@ int NewStore::OnodeHashLRU::trim(int max) int num = onode_map.size() - max; lru_list_t::iterator p = lru.end(); if (num) - p--; + --p; while (num > 0) { Onode *o = &*p; int refs = o->nref.read(); @@ -2487,7 +2487,7 @@ void NewStore::_kv_sync_thread() if (!g_conf->newstore_sync_submit_transaction) { for (std::deque::iterator it = kv_committing.begin(); it != kv_committing.end(); - it++) { + ++it) { db->submit_transaction((*it)->t); } } @@ -2497,7 +2497,7 @@ void NewStore::_kv_sync_thread() KeyValueDB::Transaction txc_cleanup_sync = db->get_transaction(); for (std::deque::iterator it = wal_cleaning.begin(); it != wal_cleaning.end(); - it++) { + ++it) { wal_transaction_t& wt =*(*it)->wal_txn; // cleanup the data in overlays for (list::iterator p = wt.ops.begin(); p != wt.ops.end(); ++p) { From 120071bcc65ad874af7c0ce01df66e5c84ef327f Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 9 Sep 2015 16:51:51 +0200 Subject: [PATCH 558/654] osd/OSD.cc: prefer ++operator for non-primitive iterators Signed-off-by: Danny Al-Gaaf --- src/osd/OSD.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index a577fd802c82a..bba4128247904 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -8797,7 +8797,7 @@ void OSD::set_pool_last_map_marked_full(OSDMap *o, epoch_t &e) { map &pool_last_map_marked_full = superblock.pool_last_map_marked_full; for (map::const_iterator it = o->get_pools().begin(); - it != o->get_pools().end(); it++) { + it != o->get_pools().end(); ++it) { bool exist = pool_last_map_marked_full.count(it->first); if (it->second.has_flag(pg_pool_t::FLAG_FULL) && !exist) pool_last_map_marked_full[it->first] = e; From 
0297ce191a1bc5ea092781d6117eec6b732270f8 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 9 Sep 2015 16:52:42 +0200 Subject: [PATCH 559/654] osd/ReplicatedPG.cc: prefer ++operator for non-primitive iterators Signed-off-by: Danny Al-Gaaf --- src/osd/ReplicatedPG.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 5c6d587d61825..ec1bc0fb34d0f 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -12352,7 +12352,7 @@ void ReplicatedPG::setattrs_maybe_cache( { if (pool.info.require_rollback()) { for (map::iterator it = attrs.begin(); - it != attrs.end(); it++ ) { + it != attrs.end(); ++it) { op->pending_attrs[obc][it->first] = it->second; } } From 2767736a91608310d6b9be2749c116f090cbdd22 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 9 Sep 2015 16:54:07 +0200 Subject: [PATCH 560/654] osdc/Objecter.cc: prefer ++operator for non-primitive iterators Signed-off-by: Danny Al-Gaaf --- src/osdc/Objecter.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index 53b3cdadfa3a7..a0d2980c73dc0 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -1019,7 +1019,7 @@ void Objecter::handle_osd_map(MOSDMap *m) bool was_pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || cluster_full || _osdmap_has_pool_full(); map pool_full_map; for (map::const_iterator it = osdmap->get_pools().begin(); - it != osdmap->get_pools().end(); it++) + it != osdmap->get_pools().end(); ++it) pool_full_map[it->first] = it->second.has_flag(pg_pool_t::FLAG_FULL); @@ -2413,7 +2413,7 @@ bool Objecter::_osdmap_pool_full(const int64_t pool_id) const bool Objecter::_osdmap_has_pool_full() const { for (map::const_iterator it = osdmap->get_pools().begin(); - it != osdmap->get_pools().end(); it++) { + it != osdmap->get_pools().end(); ++it) { if (it->second.has_flag(pg_pool_t::FLAG_FULL)) return true; } @@ -2432,7 +2432,7 @@ bool 
Objecter::_osdmap_full_flag() const void Objecter::update_pool_full_map(map& pool_full_map) { for (map::const_iterator it = osdmap->get_pools().begin(); - it != osdmap->get_pools().end(); it++) { + it != osdmap->get_pools().end(); ++it) { if (pool_full_map.find(it->first) == pool_full_map.end()) { pool_full_map[it->first] = it->second.has_flag(pg_pool_t::FLAG_FULL); } else { From 6e97b0fa850fe64007a5a3383b4c2124f9729655 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 9 Sep 2015 18:55:04 +0200 Subject: [PATCH 561/654] rbd.cc: fix -Wreturn-type, return result from do_metadata_remove() Signed-off-by: Danny Al-Gaaf --- src/rbd.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/rbd.cc b/src/rbd.cc index 39acdbe197974..0b3785606a2c3 100755 --- a/src/rbd.cc +++ b/src/rbd.cc @@ -2311,6 +2311,7 @@ static int do_metadata_remove(librbd::Image& image, const char *key) if (r < 0) { cerr << "failed to remove metadata " << key << " of image : " << cpp_strerror(r) << std::endl; } + return r; } static int do_metadata_get(librbd::Image& image, const char *key) From d9d4989046836e70350a2ac946e06e94a32f62b6 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 9 Sep 2015 19:00:04 +0200 Subject: [PATCH 562/654] Transaction.c: replace deprecated function call Replace deprecated use of collection_move() with collection_move_rename(). 
Signed-off-by: Danny Al-Gaaf --- src/os/Transaction.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/os/Transaction.cc b/src/os/Transaction.cc index f1deb2595e129..6b0d77d9308a9 100644 --- a/src/os/Transaction.cc +++ b/src/os/Transaction.cc @@ -284,7 +284,7 @@ void ObjectStore::Transaction::_build_actions_from_tbl() assert(ocid2 == ocid); assert(oid2 == oid); - collection_move(ncid, ocid, oid); + collection_move_rename(ocid, oid, ncid, oid); } break; From 99ea120ba987c9836813295931ec13e66e576327 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 9 Sep 2015 20:00:22 +0200 Subject: [PATCH 563/654] objectstore/store_test.cc: fix -Wsign-compare Signed-off-by: Danny Al-Gaaf --- src/test/objectstore/store_test.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index 8e9ca11222603..3a47d120f1437 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -408,7 +408,7 @@ TEST_P(StoreTest, SimpleObjectTest) { bufferlist in; r = store->read(cid, hoid, 0, bl.length(), in); - ASSERT_EQ(bl.length(), r); + ASSERT_EQ((int)bl.length(), r); in.hexdump(cout); ASSERT_TRUE(in.contents_equal(bl)); } @@ -2000,14 +2000,14 @@ TEST_P(StoreTest, OMapTest) { bufferlist hdr; map m; store->omap_get(cid, hoid, &hdr, &m); - ASSERT_EQ(6, hdr.length()); + ASSERT_EQ(6u, hdr.length()); ASSERT_TRUE(m.count("2")); ASSERT_TRUE(!m.count("3")); ASSERT_TRUE(!m.count("6")); ASSERT_TRUE(m.count("7")); ASSERT_TRUE(m.count("8")); //cout << m << std::endl; - ASSERT_EQ(6, m.size()); + ASSERT_EQ(6u, m.size()); } { ObjectStore::Transaction t; @@ -2018,8 +2018,8 @@ TEST_P(StoreTest, OMapTest) { bufferlist hdr; map m; store->omap_get(cid, hoid, &hdr, &m); - ASSERT_EQ(0, hdr.length()); - ASSERT_EQ(0, m.size()); + ASSERT_EQ(0u, hdr.length()); + ASSERT_EQ(0u, m.size()); } } @@ -2077,7 +2077,7 @@ TEST_P(StoreTest, OMapIterator) { } ASSERT_EQ(correct, true); 
} - ASSERT_EQ(attrs.size(), count); + ASSERT_EQ((int)attrs.size(), count); // FileStore may deadlock an active iterator vs apply_transaction iter = ObjectMap::ObjectMapIterator(); From 1aaccd79f310d764096aead1678ac3f70d1a6173 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 9 Sep 2015 20:01:04 +0200 Subject: [PATCH 564/654] librbd/test_internal.cc: fix -Wsign-compare Signed-off-by: Danny Al-Gaaf --- src/test/librbd/test_internal.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/librbd/test_internal.cc b/src/test/librbd/test_internal.cc index 36a58f5f35513..75a6db2791962 100644 --- a/src/test/librbd/test_internal.cc +++ b/src/test/librbd/test_internal.cc @@ -396,7 +396,7 @@ TEST_F(TestInternal, MetadatConfig) { map pairs; r = librbd::metadata_list(ictx, "", 0, &pairs); ASSERT_EQ(0, r); - ASSERT_EQ(5, pairs.size()); + ASSERT_EQ(5u, pairs.size()); r = librbd::metadata_remove(ictx, "abcd"); ASSERT_EQ(0, r); r = librbd::metadata_remove(ictx, "xyz"); @@ -404,7 +404,7 @@ TEST_F(TestInternal, MetadatConfig) { pairs.clear(); r = librbd::metadata_list(ictx, "", 0, &pairs); ASSERT_EQ(0, r); - ASSERT_EQ(3, pairs.size()); + ASSERT_EQ(3u, pairs.size()); string val; r = librbd::metadata_get(ictx, it->first, &val); ASSERT_EQ(0, r); From 5a2b688cd836ce653e33786fef09375911d11af8 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Thu, 10 Sep 2015 11:56:29 +0200 Subject: [PATCH 565/654] src/.gitignore: add some more files to be ignored Signed-off-by: Danny Al-Gaaf --- src/.gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/.gitignore b/src/.gitignore index c7b9dd5a8e0c9..5fcfc62440c80 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -25,6 +25,7 @@ Makefile /ceph-mon /ceph-osd /ceph-syn +/ceph.tmpe /ceph.conf /ceph_bench_log /ceph-objectstore-tool @@ -71,6 +72,7 @@ Makefile /rados /radosgw /radosgw-admin +/radosgw-object-expirer /rbd /rbd-fuse /rbd-replay From 6504e37334f5d5a9f2613ce158bc98d4fbf76874 Mon Sep 17 00:00:00 2001 
From: Kefu Chai Date: Sat, 27 Jun 2015 12:00:02 +0800 Subject: [PATCH 566/654] common/buffer: add bufferlist::const_iterator * implement bufferlist::iterator using bufferlist::iterator_impl * unlike its cousin, `bufferlist::const_iterator` is not exported using CEPH_BUFFER_API. will do it once we think it will have external users. Signed-off-by: Kefu Chai --- src/common/buffer.cc | 68 +++++++++++++++++++++++++++--------------- src/include/buffer.h | 66 +++++++++++++++++++++++++++++----------- src/test/bufferlist.cc | 17 +++++++++++ 3 files changed, 109 insertions(+), 42 deletions(-) diff --git a/src/common/buffer.cc b/src/common/buffer.cc index 1ab62241f9c06..8d681e3b8e7eb 100644 --- a/src/common/buffer.cc +++ b/src/common/buffer.cc @@ -893,7 +893,8 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; return *this; }*/ - void buffer::list::iterator::advance(int o) + template + void buffer::list::iterator_impl::advance(int o) { //cout << this << " advance " << o << " from " << off << " (p_off " << p_off << " in " << p->length() << ")" << std::endl; if (o > 0) { @@ -931,22 +932,31 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; } } - void buffer::list::iterator::seek(unsigned o) + template + void buffer::list::iterator_impl::seek(unsigned o) { - //cout << this << " seek " << o << std::endl; p = ls->begin(); off = p_off = 0; advance(o); } - char buffer::list::iterator::operator*() + template + bool buffer::list::iterator_impl::operator!=(const buffer::list::iterator_impl& rhs) const + { + return bl == rhs.bl && off == rhs.off; + } + + template + char buffer::list::iterator_impl::operator*() const { if (p == ls->end()) throw end_of_buffer(); return (*p)[p_off]; } - - buffer::list::iterator& buffer::list::iterator::operator++() + + template + buffer::list::iterator_impl& + buffer::list::iterator_impl::operator++() { if (p == ls->end()) throw end_of_buffer(); @@ -954,24 +964,25 @@ static simple_spinlock_t 
buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; return *this; } - buffer::ptr buffer::list::iterator::get_current_ptr() + template + buffer::ptr buffer::list::iterator_impl::get_current_ptr() const { if (p == ls->end()) throw end_of_buffer(); return ptr(*p, p_off, p->length() - p_off); } - + // copy data out. // note that these all _append_ to dest! - - void buffer::list::iterator::copy(unsigned len, char *dest) + template + void buffer::list::iterator_impl::copy(unsigned len, char *dest) { if (p == ls->end()) seek(off); while (len > 0) { if (p == ls->end()) throw end_of_buffer(); - assert(p->length() > 0); - + assert(p->length() > 0); + unsigned howmuch = p->length() - p_off; if (len < howmuch) howmuch = len; p->copy_out(p_off, howmuch, dest); @@ -981,39 +992,42 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; advance(howmuch); } } - - void buffer::list::iterator::copy(unsigned len, ptr &dest) + + template + void buffer::list::iterator_impl::copy(unsigned len, ptr &dest) { dest = create(len); copy(len, dest.c_str()); } - void buffer::list::iterator::copy(unsigned len, list &dest) + template + void buffer::list::iterator_impl::copy(unsigned len, list &dest) { if (p == ls->end()) seek(off); while (len > 0) { if (p == ls->end()) throw end_of_buffer(); - + unsigned howmuch = p->length() - p_off; if (len < howmuch) howmuch = len; dest.append(*p, p_off, howmuch); - + len -= howmuch; advance(howmuch); } } - void buffer::list::iterator::copy(unsigned len, std::string &dest) + template + void buffer::list::iterator_impl::copy(unsigned len, std::string &dest) { if (p == ls->end()) seek(off); while (len > 0) { if (p == ls->end()) throw end_of_buffer(); - + unsigned howmuch = p->length() - p_off; const char *c_str = p->c_str(); if (len < howmuch) @@ -1025,7 +1039,8 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; } } - void buffer::list::iterator::copy_all(list &dest) + template + void 
buffer::list::iterator_impl::copy_all(list &dest) { if (p == ls->end()) seek(off); @@ -1033,17 +1048,22 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; if (p == ls->end()) return; assert(p->length() > 0); - + unsigned howmuch = p->length() - p_off; const char *c_str = p->c_str(); dest.append(c_str + p_off, howmuch); - + advance(howmuch); } } - - // copy data in + // explicitly instantiate only the iterator types we need, so we can hide the + // details in this compilation unit without introducing unnecessary link time + // dependencies. + template class buffer::list::iterator_impl; + template class buffer::list::iterator_impl; + + // copy data in void buffer::list::iterator::copy_in(unsigned len, const char *src, bool crc_reset) { // copy diff --git a/src/include/buffer.h b/src/include/buffer.h index bb6ea9356c537..5348eaf430718 100644 --- a/src/include/buffer.h +++ b/src/include/buffer.h @@ -43,6 +43,7 @@ #include #include #include +#include #include "page.h" #include "crc32c.h" @@ -256,23 +257,34 @@ class CEPH_BUFFER_API buffer { unsigned _memcopy_count; //the total of memcopy using rebuild(). ptr append_buffer; // where i put small appends. - public: - class CEPH_BUFFER_API iterator { - list *bl; - std::list *ls; // meh.. just here to avoid an extra pointer dereference.. - unsigned off; // in bl - std::list::iterator p; - unsigned p_off; // in *p + template + class iterator_impl: public std::iterator { + protected: + typedef typename std::conditional::type bl_t; + typedef typename std::conditional, + std::list >::type list_t; + typedef typename std::conditional::const_iterator, + typename std::list::iterator>::type list_iter_t; + bl_t* bl; + list_t* ls; // meh.. just here to avoid an extra pointer dereference.. + unsigned off; // in bl + list_iter_t p; + unsigned p_off; // in *p + public: // constructor. position. 
- iterator() : - bl(0), ls(0), off(0), p_off(0) {} - iterator(list *l, unsigned o=0) : - bl(l), ls(&bl->_buffers), off(0), p(ls->begin()), p_off(0) { + iterator_impl() + : bl(0), ls(0), off(0), p_off(0) {} + iterator_impl(bl_t *l, unsigned o=0) + : bl(l), ls(&bl->_buffers), off(0), p(ls->begin()), p_off(0) { advance(o); } - iterator(list *l, unsigned o, std::list::iterator ip, unsigned po) : - bl(l), ls(&bl->_buffers), off(o), p(ip), p_off(po) { } + iterator_impl(bl_t *l, unsigned o, list_iter_t ip, unsigned po) + : bl(l), ls(&bl->_buffers), off(o), p(ip), p_off(po) {} /// get current iterator offset in buffer::list unsigned get_off() const { return off; } @@ -288,11 +300,12 @@ class CEPH_BUFFER_API buffer { void advance(int o); void seek(unsigned o); - char operator*(); - iterator& operator++(); - ptr get_current_ptr(); + bool operator!=(const iterator_impl& rhs) const; + char operator*() const; + iterator_impl& operator++(); + ptr get_current_ptr() const; - list& get_bl() { return *bl; } + bl_t& get_bl() { return *bl; } // copy data out. // note that these all _append_ to dest! 
@@ -301,11 +314,21 @@ class CEPH_BUFFER_API buffer { void copy(unsigned len, list &dest); void copy(unsigned len, std::string &dest); void copy_all(list &dest); + }; + + public: + typedef iterator_impl const_iterator; + class CEPH_BUFFER_API iterator : public iterator_impl { + public: + iterator(): iterator_impl() {} + iterator(bl_t *l, unsigned o=0) : + iterator_impl(l, o) {} + iterator(bl_t *l, unsigned o, list_iter_t ip, unsigned po) : + iterator_impl(l, o, ip, po) {} // copy data in void copy_in(unsigned len, const char *src, bool crc_reset = true); void copy_in(unsigned len, const list& otherl); - }; private: @@ -433,6 +456,13 @@ class CEPH_BUFFER_API buffer { return iterator(this, _len, _buffers.end(), 0); } + const_iterator begin() const { + return const_iterator(this, 0); + } + const_iterator end() const { + return const_iterator(this, _len, _buffers.end(), 0); + } + // crope lookalikes. // **** WARNING: this are horribly inefficient for large bufferlists. **** void copy(unsigned off, unsigned len, char *dest) const; diff --git a/src/test/bufferlist.cc b/src/test/bufferlist.cc index 838f23af2c34a..6b60a97aa2898 100644 --- a/src/test/bufferlist.cc +++ b/src/test/bufferlist.cc @@ -1084,6 +1084,23 @@ TEST(BufferListIterator, copy_in) { } } +// iterator& buffer::list::const_iterator::operator++() +TEST(BufferListConstIterator, operator_plus_plus) { + bufferlist bl; + { + bufferlist::const_iterator i(&bl); + EXPECT_THROW(++i, buffer::end_of_buffer); + } + bl.append("ABC", 3); + { + const bufferlist const_bl(bl); + bufferlist::const_iterator i(const_bl.begin()); + ++i; + EXPECT_EQ('B', *i); + } + +} + TEST(BufferList, constructors) { // // list() From 8ed724222651812c2ee8cc3804dc1f54c973897d Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Fri, 4 Sep 2015 01:23:31 +0800 Subject: [PATCH 567/654] test/bufferlist: do not expect !is_page_aligned() after unaligned rebuild if the size of a bufferlist is page aligned we allocate page aligned memory chunk for it when 
rebuild() is called. otherwise we just call the plain new() to allocate new memory chunk for holding the continuous buffer. but we should not expect that `new` allocator always returns unaligned memory chunks. instead, it *could* return page aligned memory chunk as long as the allocator feels appropriate. so, the `EXPECT_FALSE(bl.is_page_aligned())` after the `rebuild()` call is removed. Signed-off-by: Kefu Chai --- src/test/bufferlist.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/test/bufferlist.cc b/src/test/bufferlist.cc index 6b60a97aa2898..43cb6543f493f 100644 --- a/src/test/bufferlist.cc +++ b/src/test/bufferlist.cc @@ -1506,12 +1506,15 @@ TEST(BufferList, rebuild) { { bufferlist bl; bufferptr ptr(buffer::create_page_aligned(2)); + ptr[0] = 'X'; + ptr[1] = 'Y'; ptr.set_offset(1); ptr.set_length(1); bl.append(ptr); EXPECT_FALSE(bl.is_page_aligned()); bl.rebuild(); - EXPECT_FALSE(bl.is_page_aligned()); + EXPECT_EQ(1U, bl.length()); + EXPECT_EQ('Y', *bl.begin()); } { bufferlist bl; From 1b43d811dece0578b9f8d5a43437ebb697126f25 Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Thu, 10 Sep 2015 12:16:35 -0700 Subject: [PATCH 568/654] common/buffer: fix the const-ness of bufferlist::contents_equal() Signed-off-by: Kefu Chai --- src/common/buffer.cc | 6 +++--- src/include/buffer.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/common/buffer.cc b/src/common/buffer.cc index 8d681e3b8e7eb..d02013b816202 100644 --- a/src/common/buffer.cc +++ b/src/common/buffer.cc @@ -1116,7 +1116,7 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; other.last_p = other.begin(); } - bool buffer::list::contents_equal(ceph::buffer::list& other) + bool buffer::list::contents_equal(const ceph::buffer::list& other) const { if (length() != other.length()) return false; @@ -1149,8 +1149,8 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; // byte-wise comparison if (false) { - 
bufferlist::iterator me = begin(); - bufferlist::iterator him = other.begin(); + bufferlist::const_iterator me = begin(); + bufferlist::const_iterator him = other.begin(); while (!me.end()) { if (*me != *him) return false; diff --git a/src/include/buffer.h b/src/include/buffer.h index 5348eaf430718..007d8d6ca10d4 100644 --- a/src/include/buffer.h +++ b/src/include/buffer.h @@ -372,7 +372,7 @@ class CEPH_BUFFER_API buffer { #endif return _len; } - bool contents_equal(buffer::list& other); + bool contents_equal(const buffer::list& other) const; bool can_zero_copy() const; bool is_aligned(unsigned align) const; From a795c885e9f5e1292577d6e95f98308cbd04a8fb Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Thu, 10 Sep 2015 12:51:36 -0700 Subject: [PATCH 569/654] common/buffer: add the move constructor for bufferlist Signed-off-by: Kefu Chai --- src/common/buffer.cc | 9 +++++++++ src/include/buffer.h | 1 + src/test/bufferlist.cc | 11 +++++++++++ 3 files changed, 21 insertions(+) diff --git a/src/common/buffer.cc b/src/common/buffer.cc index d02013b816202..a7ed8883f0e58 100644 --- a/src/common/buffer.cc +++ b/src/common/buffer.cc @@ -1105,6 +1105,15 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; // -- buffer::list -- + buffer::list::list(list&& other) + : _buffers(std::move(other._buffers)), + _len(other._len), + _memcopy_count(other._memcopy_count), + last_p(this) { + append_buffer.swap(other.append_buffer); + other.clear(); + } + void buffer::list::swap(list& other) { std::swap(_len, other._len); diff --git a/src/include/buffer.h b/src/include/buffer.h index 007d8d6ca10d4..854a286358128 100644 --- a/src/include/buffer.h +++ b/src/include/buffer.h @@ -347,6 +347,7 @@ class CEPH_BUFFER_API buffer { _memcopy_count(other._memcopy_count), last_p(this) { make_shareable(); } + list(list&& other); list& operator= (const list& other) { if (this != &other) { _buffers = other._buffers; diff --git a/src/test/bufferlist.cc b/src/test/bufferlist.cc index 
43cb6543f493f..b05a15a4eca00 100644 --- a/src/test/bufferlist.cc +++ b/src/test/bufferlist.cc @@ -1128,6 +1128,17 @@ TEST(BufferList, constructors) { bufferlist copy(bl); ASSERT_EQ('A', copy[0]); } + // + // list(list&& other) + // + { + bufferlist bl(1); + bl.append('A'); + bufferlist copy = std::move(bl); + ASSERT_EQ(0U, bl.length()); + ASSERT_EQ(1U, copy.length()); + ASSERT_EQ('A', copy[0]); + } } TEST(BufferList, operator_equal) { From d4eeb9be0f84a57f2c106da2f2e7db9895f03b32 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 10 Sep 2015 17:03:03 -0400 Subject: [PATCH 570/654] rocksdb: recognized --without-tcmalloc Signed-off-by: Sage Weil --- src/rocksdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rocksdb b/src/rocksdb index 15ed8c97fdbdf..a5e1ec9bcddc4 160000 --- a/src/rocksdb +++ b/src/rocksdb @@ -1 +1 @@ -Subproject commit 15ed8c97fdbdf022ec9c8da02b06f4cd9189bb7e +Subproject commit a5e1ec9bcddc44038a735e269fd5d6586f01c2cb From f4498f527a1a7ddd53e38327a24c12095b1a0337 Mon Sep 17 00:00:00 2001 From: Lu Shi Date: Fri, 11 Sep 2015 10:35:03 +0800 Subject: [PATCH 571/654] osd:the fuction osd::shutdown Lock failed. 
Fixed:#13004 Signed-off-by: Lu Shi --- src/osd/OSD.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index a577fd802c82a..273f2d3534f33 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -6469,8 +6469,10 @@ void OSD::handle_osd_map(MOSDMap *m) else if (do_restart) start_boot(); + osd_lock.Unlock(); if (do_shutdown) shutdown(); + osd_lock.Lock(); m->put(); } From aa4a5b0d15cb720561d7ba9ff2b1ca472f433820 Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Fri, 11 Sep 2015 09:30:04 +0200 Subject: [PATCH 572/654] logrotate: ignore exit status 1 from killall If any of ceph-osd, ceph-mon, ceph-mds were not running then killall -q will exit status 1, leading to anacron sending a mail error: error running shared postrotate script for '/var/log/ceph/*.log ' Fix by overriding the exit status with || true. Fixes: #13033 Signed-off-by: Dan van der Ster --- src/logrotate.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/logrotate.conf b/src/logrotate.conf index 0c5df242b78d4..50e7ee8867048 100644 --- a/src/logrotate.conf +++ b/src/logrotate.conf @@ -4,7 +4,7 @@ compress sharedscripts postrotate - killall -q -1 ceph-mon ceph-mds ceph-osd + killall -q -1 ceph-mon ceph-mds ceph-osd || true endscript missingok notifempty From 217e424810726fd631fe4089c47a1fea1b13280a Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Fri, 11 Sep 2015 09:42:50 +0200 Subject: [PATCH 573/654] logrotate: ignore postrotate error for radosgw Fix the annoying logrotate error for radosgw. 
Fixes: #13033 Signed-off-by: Dan van der Ster --- src/rgw/logrotate.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rgw/logrotate.conf b/src/rgw/logrotate.conf index b80e81edd4c9d..65ae1132abd66 100644 --- a/src/rgw/logrotate.conf +++ b/src/rgw/logrotate.conf @@ -4,7 +4,7 @@ compress sharedscripts postrotate - killall -q -1 radosgw + killall -q -1 radosgw || true endscript missingok notifempty From 338bd3d177138abeba7ad42107dd0a82a4195959 Mon Sep 17 00:00:00 2001 From: Boris Ranto Date: Mon, 10 Aug 2015 17:07:32 +0200 Subject: [PATCH 574/654] selinux: Update policy for radosgw The current SELinux policy does not cover radosgw daemon. This patch introduces the SELinux support for radosgw daemon (civetweb only). Signed-off-by: Boris Ranto --- ceph.spec.in | 5 +++- selinux/ceph.fc | 5 +++- selinux/ceph.te | 13 ++++------ selinux/ceph_selinux.8 | 56 +++++++++++++++++++++++++++++++++++++++--- 4 files changed, 65 insertions(+), 14 deletions(-) diff --git a/ceph.spec.in b/ceph.spec.in index 09635f11dc313..625cbb399d8c3 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -20,10 +20,13 @@ restorecon -R /usr/bin/ceph-mon > /dev/null 2>&1; \ restorecon -R /usr/bin/ceph-osd > /dev/null 2>&1; \ restorecon -R /usr/bin/ceph-mds > /dev/null 2>&1; \ +restorecon -R /usr/bin/radosgw > /dev/null 2>&1; \ restorecon -R /etc/rc\.d/init\.d/ceph > /dev/null 2>&1; \ +restorecon -R /etc/rc\.d/init\.d/radosgw > /dev/null 2>&1; \ restorecon -R /var/run/ceph > /dev/null 2>&1; \ restorecon -R /var/lib/ceph > /dev/null 2>&1; \ -restorecon -R /var/log/ceph > /dev/null 2>&1; +restorecon -R /var/log/ceph > /dev/null 2>&1; \ +restorecon -R /var/log/radosgw > /dev/null 2>&1; %endif %{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d} diff --git a/selinux/ceph.fc b/selinux/ceph.fc index 2eeee223056ba..31926895c465b 100644 --- a/selinux/ceph.fc +++ b/selinux/ceph.fc @@ -1,11 +1,14 @@ -/etc/rc\.d/init\.d/ceph -- gen_context(system_u:object_r:ceph_initrc_exec_t,s0) 
+/etc/rc\.d/init\.d/ceph -- gen_context(system_u:object_r:ceph_initrc_exec_t,s0) +/etc/rc\.d/init\.d/radosgw -- gen_context(system_u:object_r:ceph_initrc_exec_t,s0) /usr/bin/ceph-mon -- gen_context(system_u:object_r:ceph_exec_t,s0) /usr/bin/ceph-mds -- gen_context(system_u:object_r:ceph_exec_t,s0) /usr/bin/ceph-osd -- gen_context(system_u:object_r:ceph_exec_t,s0) +/usr/bin/radosgw -- gen_context(system_u:object_r:ceph_exec_t,s0) /var/lib/ceph(/.*)? gen_context(system_u:object_r:ceph_var_lib_t,s0) /var/log/ceph(/.*)? gen_context(system_u:object_r:ceph_log_t,s0) +/var/log/radosgw(/.*)? gen_context(system_u:object_r:ceph_log_t,s0) /var/run/ceph(/.*)? gen_context(system_u:object_r:ceph_var_run_t,s0) diff --git a/selinux/ceph.te b/selinux/ceph.te index e25ec846ee323..a215df8efa821 100644 --- a/selinux/ceph.te +++ b/selinux/ceph.te @@ -1,9 +1,10 @@ -policy_module(ceph, 1.0.0) +policy_module(ceph, 1.1.0) require { type sysfs_t; type var_run_t; type random_device_t; + type urandom_device_t; type setfiles_t; class sock_file unlink; class lnk_file read; @@ -42,7 +43,6 @@ files_pid_file(ceph_var_run_t) allow ceph_t self:process { signal_perms }; allow ceph_t self:fifo_file rw_fifo_file_perms; allow ceph_t self:unix_stream_socket create_stream_socket_perms; -# not needed at the moment, for future releases, not needed at all if we switch to systemd init scripts allow ceph_t self:capability { setuid setgid }; manage_dirs_pattern(ceph_t, ceph_log_t, ceph_log_t) @@ -83,9 +83,8 @@ logging_send_syslog_msg(ceph_t) sysnet_dns_name_resolve(ceph_t) -# added 2015-06-17, need review - -allow ceph_t ceph_var_run_t:sock_file create; +# basis for future security review +allow ceph_t ceph_var_run_t:sock_file { create unlink write }; allow ceph_t self:capability sys_rawio; allow ceph_t self:tcp_socket { accept listen }; @@ -96,14 +95,12 @@ fstools_exec(ceph_t) nis_use_ypbind_uncond(ceph_t) storage_raw_rw_fixed_disk(ceph_t) -# added 2015-07-28, needs review just as well -allow ceph_t 
ceph_var_run_t:sock_file unlink; allow ceph_t sysfs_t:dir read; allow ceph_t sysfs_t:file { read getattr open }; allow ceph_t sysfs_t:lnk_file read; - allow ceph_t random_device_t:chr_file getattr; +allow ceph_t urandom_device_t:chr_file getattr; allow ceph_t self:process setpgid; allow ceph_t var_run_t:dir { write create add_name }; allow ceph_t var_run_t:file { write create open getattr }; diff --git a/selinux/ceph_selinux.8 b/selinux/ceph_selinux.8 index de74807c8ed87..6e91a212725e0 100644 --- a/selinux/ceph_selinux.8 +++ b/selinux/ceph_selinux.8 @@ -1,4 +1,4 @@ -.TH "ceph_selinux" "8" "15-06-17" "ceph" "SELinux Policy ceph" +.TH "ceph_selinux" "8" "15-08-10" "ceph" "SELinux Policy ceph" .SH "NAME" ceph_selinux \- Security Enhanced Linux Policy for the ceph processes .SH "DESCRIPTION" @@ -18,7 +18,7 @@ The ceph_t SELinux type can be entered via the \fBceph_exec_t\fP file type. The default entrypoint paths for the ceph_t domain are the following: -/usr/bin/ceph-mon, /usr/bin/ceph-mds, /usr/bin/ceph-osd +/usr/bin/radosgw, /usr/bin/ceph-mon, /usr/bin/ceph-mds, /usr/bin/ceph-osd .SH PROCESS TYPES SELinux defines process types (domains) for each process running on the system .PP @@ -145,6 +145,22 @@ If you want to allow confined applications to use nscd shared memory, you must t .EE +.SH NSSWITCH DOMAIN + +.PP +If you want to allow users to resolve user passwd entries directly from ldap rather then using a sssd server for the ceph_t, you must turn on the authlogin_nsswitch_use_ldap boolean. + +.EX +.B setsebool -P authlogin_nsswitch_use_ldap 1 +.EE + +.PP +If you want to allow confined applications to run with kerberos for the ceph_t, you must turn on the kerberos_enabled boolean. + +.EX +.B setsebool -P kerberos_enabled 1 +.EE + .SH "MANAGED FILES" The SELinux process type ceph_t can manage files labeled with the following file types. The paths listed are the default paths for these file types. Note the processes UID still need to have DAC permissions. 
@@ -154,6 +170,8 @@ The SELinux process type ceph_t can manage files labeled with the following file /var/log/ceph(/.*)? .br + /var/log/radosgw(/.*)? +.br .br .B ceph_var_lib_t @@ -215,6 +233,12 @@ The SELinux process type ceph_t can manage files labeled with the following file /var/run/cluster/rgmanager\.sk .br +.br +.B fsadm_var_run_t + + /var/run/blkid(/.*)? +.br + .br .B root_t @@ -223,6 +247,22 @@ The SELinux process type ceph_t can manage files labeled with the following file /initrd .br +.br +.B var_run_t + + /run/.* +.br + /var/run/.* +.br + /run +.br + /var/run +.br + /var/run +.br + /var/spool/postfix/pid +.br + .SH FILE CONTEXTS SELinux requires files to have an extended attribute to define the file type. .PP @@ -238,7 +278,7 @@ SELinux ceph policy is very flexible allowing users to setup their ceph processe SELinux defines the file context types for the ceph, if you wanted to store files with these types in a diffent paths, you need to execute the semanage command to sepecify alternate labeling and then use restorecon to put the labels on disk. -.B semanage fcontext -a -t ceph_var_run_t '/srv/myceph_content(/.*)?' +.B semanage fcontext -a -t ceph_exec_t '/srv/ceph/content(/.*)?' .br .B restorecon -R -v /srv/myceph_content @@ -257,7 +297,7 @@ Note: SELinux often uses regular expressions to specify labels that match multip .br .TP 5 Paths: -/usr/bin/ceph-mon, /usr/bin/ceph-mds, /usr/bin/ceph-osd +/usr/bin/radosgw, /usr/bin/ceph-mon, /usr/bin/ceph-mds, /usr/bin/ceph-osd .EX .PP @@ -266,6 +306,10 @@ Paths: - Set files with the ceph_initrc_exec_t type, if you want to transition an executable to the ceph_initrc_t domain. +.br +.TP 5 +Paths: +/etc/rc\.d/init\.d/ceph, /etc/rc\.d/init\.d/radosgw .EX .PP @@ -274,6 +318,10 @@ Paths: - Set files with the ceph_log_t type, if you want to treat the data as ceph log data, usually stored under the /var/log directory. +.br +.TP 5 +Paths: +/var/log/ceph(/.*)?, /var/log/radosgw(/.*)? 
.EX .PP From e5a8022ddb0d9ef36d9b52200e5e3fd8df250b26 Mon Sep 17 00:00:00 2001 From: Boris Ranto Date: Fri, 4 Sep 2015 09:30:29 +0200 Subject: [PATCH 575/654] ceph.spec.in: ceph-radosgw should require ceph-selinux Signed-off-by: Boris Ranto --- ceph.spec.in | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ceph.spec.in b/ceph.spec.in index 625cbb399d8c3..9de3f7e7cbd2e 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -212,6 +212,9 @@ FUSE based client to map Ceph rbd images to files Summary: Rados REST gateway Group: Development/Libraries Requires: ceph-common = %{epoch}:%{version}-%{release} +%if 0%{with selinux} +Requires: ceph-selinux = %{epoch}:%{version}-%{release} +%endif Requires: librados2 = %{epoch}:%{version}-%{release} %if 0%{defined suse_version} BuildRequires: libexpat-devel From f167e8d3dcb87d3b90ccc1727217a9d73001da87 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 11 Sep 2015 09:30:15 -0400 Subject: [PATCH 576/654] .gitignore: radosgw-object-expirer Signed-off-by: Sage Weil --- src/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/src/.gitignore b/src/.gitignore index c7b9dd5a8e0c9..23378701d22aa 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -71,6 +71,7 @@ Makefile /rados /radosgw /radosgw-admin +/radosgw-object-expirer /rbd /rbd-fuse /rbd-replay From e7837d1d36d92cfa156d1bf13a008505a7c2f20e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 11 Sep 2015 11:24:18 -0400 Subject: [PATCH 577/654] ceph.spec: make /var/{lib,log,run} owned by ceph Move creation of /var/lib/ceph and /var/run/ceph to ceph-common, too. 
Signed-off-by: Sage Weil --- ceph.spec.in | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/ceph.spec.in b/ceph.spec.in index 3420461c09653..4e864f3fa2fcd 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -809,15 +809,14 @@ mkdir -p %{_localstatedir}/run/ceph/ %{_mandir}/man8/ceph-clsinfo.8* %{_mandir}/man8/librados-config.8* #set up placeholder directories -%attr(770,ceph,ceph) %dir %{_localstatedir}/lib/ceph/ -%attr(770,ceph,ceph) %dir %{_localstatedir}/lib/ceph/tmp -%attr(770,ceph,ceph) %dir %{_localstatedir}/lib/ceph/mon -%attr(770,ceph,ceph) %dir %{_localstatedir}/lib/ceph/osd -%attr(770,ceph,ceph) %dir %{_localstatedir}/lib/ceph/mds -%attr(770,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-osd -%attr(770,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-mds -%attr(770,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-rgw -%attr(770,ceph,ceph) %dir %{_localstatedir}/run/ceph/ +%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/tmp +%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/mon +%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/osd +%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/mds +%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-osd +%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-mds +%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-rgw +%attr(750,ceph,ceph) %dir %{_localstatedir}/run/ceph ################################################################################# %files -n ceph-common @@ -846,7 +845,6 @@ mkdir -p %{_localstatedir}/run/ceph/ %{_datadir}/ceph/id_dsa_drop.ceph.com %{_datadir}/ceph/id_dsa_drop.ceph.com.pub %dir %{_sysconfdir}/ceph/ -%dir %{_localstatedir}/log/ceph/ %dir %{_datarootdir}/ceph/ %dir %{_libexecdir}/ceph/ %config %{_sysconfdir}/bash_completion.d/rados @@ -856,7 +854,8 @@ mkdir -p %{_localstatedir}/run/ceph/ %{python_sitelib}/ceph_argparse.py* %{python_sitelib}/ceph_daemon.py* %{_udevrulesdir}/50-rbd.rules 
-%attr(2750,ceph,ceph) %dir %{_localstatedir}/log/ceph/ +%attr(3770,ceph,ceph) %dir %{_localstatedir}/log/ceph/ +%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/ %pre -n ceph-common CEPH_GROUP_ID="" From 3aa38bc07f84f452d70757585ef2620ee60ee227 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 11 Sep 2015 11:26:59 -0400 Subject: [PATCH 578/654] make /var/run/ceph 770 ceph:ceph This allows members of the ceph group to make librados clients (like the ceph cli and qemu) create sockets in the default /var/run/ceph/* location. Signed-off-by: Sage Weil --- ceph.spec.in | 2 +- src/init-ceph.in | 2 +- src/upstart/ceph-mds.conf | 2 +- src/upstart/ceph-mon.conf | 2 +- src/upstart/ceph-osd.conf | 2 +- src/upstart/radosgw.conf | 2 +- systemd/ceph.tmpfiles.d | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ceph.spec.in b/ceph.spec.in index 4e864f3fa2fcd..149ed406f4d1a 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -816,7 +816,7 @@ mkdir -p %{_localstatedir}/run/ceph/ %attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-osd %attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-mds %attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-rgw -%attr(750,ceph,ceph) %dir %{_localstatedir}/run/ceph +%attr(770,ceph,ceph) %dir %{_localstatedir}/run/ceph ################################################################################# %files -n ceph-common diff --git a/src/init-ceph.in b/src/init-ceph.in index 4255c550a434d..faeb7bd76d6ca 100755 --- a/src/init-ceph.in +++ b/src/init-ceph.in @@ -381,7 +381,7 @@ for name in $what; do echo Starting Ceph $name on $host... if [ ! 
-d $run_dir ]; then # assume /var/run exists - install -d -m0755 -o ceph -g ceph /var/run/ceph + install -d -m0770 -o ceph -g ceph /var/run/ceph fi get_conf pre_start_eval "" "pre start eval" [ -n "$pre_start_eval" ] && $pre_start_eval diff --git a/src/upstart/ceph-mds.conf b/src/upstart/ceph-mds.conf index 5c74fc16b2e0f..7c691173927c0 100644 --- a/src/upstart/ceph-mds.conf +++ b/src/upstart/ceph-mds.conf @@ -13,7 +13,7 @@ pre-start script test -x /usr/bin/ceph-mds || { stop; exit 0; } test -d "/var/lib/ceph/mds/${cluster:-ceph}-$id" || { stop; exit 0; } - install -d -m0755 -o ceph -g ceph /var/run/ceph + install -d -m0770 -o ceph -g ceph /var/run/ceph end script instance ${cluster:-ceph}/$id diff --git a/src/upstart/ceph-mon.conf b/src/upstart/ceph-mon.conf index be4e0efad60f7..89119457b9c28 100644 --- a/src/upstart/ceph-mon.conf +++ b/src/upstart/ceph-mon.conf @@ -13,7 +13,7 @@ pre-start script test -x /usr/bin/ceph-mon || { stop; exit 0; } test -d "/var/lib/ceph/mon/${cluster:-ceph}-$id" || { stop; exit 0; } - install -d -m0755 -o ceph -g ceph /var/run/ceph + install -d -m0770 -o ceph -g ceph /var/run/ceph end script instance ${cluster:-ceph}/$id diff --git a/src/upstart/ceph-osd.conf b/src/upstart/ceph-osd.conf index a508b4126274c..02ca2389a17b5 100644 --- a/src/upstart/ceph-osd.conf +++ b/src/upstart/ceph-osd.conf @@ -13,7 +13,7 @@ pre-start script test -x /usr/bin/ceph-osd || { stop; exit 0; } test -d "/var/lib/ceph/osd/${cluster:-ceph}-$id" || { stop; exit 0; } - install -d -m0755 -o ceph -g ceph /var/run/ceph + install -d -m0770 -o ceph -g ceph /var/run/ceph /usr/libexec/ceph/ceph-osd-prestart.sh --cluster="${cluster:-ceph}" -i "$id" end script diff --git a/src/upstart/radosgw.conf b/src/upstart/radosgw.conf index 4cb6b5d08db25..828c314d2386d 100644 --- a/src/upstart/radosgw.conf +++ b/src/upstart/radosgw.conf @@ -13,7 +13,7 @@ pre-start script test -x /usr/bin/radosgw || { stop; exit 0; } test -d "/var/lib/ceph/radosgw/${cluster:-ceph}-$id" || { stop; exit 
0; } - install -d -m0755 -o ceph -g ceph /var/run/ceph + install -d -m0770 -o ceph -g ceph /var/run/ceph end script instance ${cluster:-ceph}/$id diff --git a/systemd/ceph.tmpfiles.d b/systemd/ceph.tmpfiles.d index d2a7aa1b5136b..f068e268a133a 100644 --- a/systemd/ceph.tmpfiles.d +++ b/systemd/ceph.tmpfiles.d @@ -1 +1 @@ -d /var/run/ceph 0755 ceph ceph - +d /var/run/ceph 0770 ceph ceph - From c8bfc354e108a159f6c9a79086c6dbd2fde0c9aa Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Sep 2015 22:21:38 -0400 Subject: [PATCH 579/654] ceph.spec: install /etc/sysconfig/ceph Make it easy to enable jemalloc, too. Signed-off-by: Sage Weil --- ceph.spec.in | 1 + etc/sysconfig/ceph | 12 ++++++++++++ 2 files changed, 13 insertions(+) create mode 100644 etc/sysconfig/ceph diff --git a/ceph.spec.in b/ceph.spec.in index 0eb8a4801819c..62e9e8ac6531d 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -593,6 +593,7 @@ find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';' install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap install -D etc/ceph.limits.d $RPM_BUILD_ROOT%{_sysconfdir}/security/limits.d/ceph.conf +install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_sysconfdir}/sysconfig/ceph %if 0%{?_with_systemd} install -m 0644 -D systemd/ceph.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/%{name}.conf install -m 0644 -D systemd/ceph-rgw.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/%{name}-rgw.conf diff --git a/etc/sysconfig/ceph b/etc/sysconfig/ceph new file mode 100644 index 0000000000000..19584dbbbee3a --- /dev/null +++ b/etc/sysconfig/ceph @@ -0,0 +1,12 @@ +# /etc/sysconfig/ceph +# +# Environment file for ceph daemon systemd unit files. +# + +## use jemalloc instead of tcmalloc +# +# jemalloc is generally faster for small IO workloads and when +# ceph-osd is backed by SSDs. However, memory usage is usually +# higher by 200-300mb. 
+# +#LD_PRELOAD=/usr/lib64/libjemalloc.so.1 From cd4bd80a34af54e0f070d80f9270cdd4dce74f19 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 4 Sep 2015 13:49:09 -0400 Subject: [PATCH 580/654] Makefile.am: include etc in tarball Signed-off-by: Sage Weil --- Makefile.am | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.am b/Makefile.am index 1fa6e8faa068b..2508278d31d86 100644 --- a/Makefile.am +++ b/Makefile.am @@ -11,6 +11,10 @@ EXTRA_DIST += \ src/test/downloads \ systemd/ceph.tmpfiles.d \ etc/ceph.limits.d \ + etc/default/ceph \ + etc/sysconfig/ceph \ + etc/sysconfig/SuSEfirewall2.d/services/ceph-mon \ + etc/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds \ udev/50-rbd.rules \ udev/60-ceph-partuuid-workaround.rules \ udev/95-ceph-osd.rules \ From 7384a14f243519547a61534d22e21e6069aae016 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 5 Sep 2015 09:14:15 -0400 Subject: [PATCH 581/654] debian/rules: install systemd files and /etc/default/ceph Fix the systemd units to use /etc/default instead of /etc/sysconfig. There is probably a better way... 
Signed-off-by: Sage Weil --- debian/rules | 28 ++++++++++++++++++++++++++++ etc/default/ceph | 12 ++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 etc/default/ceph diff --git a/debian/rules b/debian/rules index 5521d05bb2fd4..b2bb77cf64c0e 100755 --- a/debian/rules +++ b/debian/rules @@ -129,6 +129,34 @@ binary-arch: build install mv debian/ceph/etc/init/ceph-mds* debian/ceph-mds/etc/init install -d -m0755 debian/radosgw/etc/init install -m0644 src/upstart/radosgw*.conf debian/radosgw/etc/init + # install the systemd stuff manually since we have funny service names + install -d -m0755 debian/ceph-common/lib/systemd/system + install -m0644 systemd/ceph.target debian/ceph-common/lib/systemd/system + install -d -m0755 debian/ceph-common/etc/default/ceph + install -m0644 etc/default/ceph debian/ceph-common/etc/default/ceph + install -d -m0755 debian/ceph-common/lib/tmpfiles.d + install -m 0644 -D systemd/ceph.tmpfiles.d debian/ceph-common/lib/tmpfiles.d/ceph.conf + + install -d -m0755 debian/ceph/lib/systemd/system + install -m0644 systemd/ceph-mon@.service debian/ceph/lib/systemd/system + install -m0644 systemd/ceph-create-keys@.service debian/ceph/lib/systemd/system + install -m0644 systemd/ceph-osd@.service debian/ceph/lib/systemd/system + install -m0644 systemd/ceph-disk@.service debian/ceph/lib/systemd/system + sed -i s./etc/sysconfig/./etc/default/.g debian/ceph/lib/systemd/system/ceph-mon@.service + sed -i s./etc/sysconfig/./etc/default/.g debian/ceph/lib/systemd/system/ceph-create-keys@.service + sed -i s./etc/sysconfig/./etc/default/.g debian/ceph/lib/systemd/system/ceph-osd@.service + sed -i s./etc/sysconfig/./etc/default/.g debian/ceph/lib/systemd/system/ceph-disk@.service + + install -d -m0755 debian/ceph-mds/lib/systemd/system + install -m0644 systemd/ceph-mds@.service debian/ceph-mds/lib/systemd/system + sed -i s./etc/sysconfig/./etc/default/.g debian/ceph-mds/lib/systemd/system/ceph-mds@.service + + install -d -m0755 
debian/radosgw/lib/systemd/system + install -m0644 systemd/ceph-radosgw@.service debian/radosgw/lib/systemd/system + sed -i s./etc/sysconfig/./etc/default/.g debian/radosgw/lib/systemd/system/ceph-radosgw@.service + install -d -m0755 debian/radosgw/lib/tmpfiles.d + install -m 0644 -D systemd/ceph-rgw.tmpfiles.d debian/radosgw/lib/tmpfiles.d/ceph-rgw.conf + dh_installman -a dh_lintian -a dh_link -a diff --git a/etc/default/ceph b/etc/default/ceph new file mode 100644 index 0000000000000..6d6f40ec571e8 --- /dev/null +++ b/etc/default/ceph @@ -0,0 +1,12 @@ +# /etc/default/ceph +# +# Environment file for ceph daemon systemd unit files. +# + +## use jemalloc instead of tcmalloc +# +# jemalloc is generally faster for small IO workloads and when +# ceph-osd is backed by SSDs. However, memory usage is usually +# higher by 200-300mb. +# +#LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.1 From c92d13b0411cc2a4f709aec099623e1fd064c45b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 6 Sep 2015 14:18:16 -0400 Subject: [PATCH 582/654] debian: fix location of tmpfiles.d /usr/lib/tmpfiles.d, not /lib/tmpfiles.d Signed-off-by: Sage Weil --- debian/rules | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/debian/rules b/debian/rules index b2bb77cf64c0e..251eb42d37b61 100755 --- a/debian/rules +++ b/debian/rules @@ -134,8 +134,8 @@ binary-arch: build install install -m0644 systemd/ceph.target debian/ceph-common/lib/systemd/system install -d -m0755 debian/ceph-common/etc/default/ceph install -m0644 etc/default/ceph debian/ceph-common/etc/default/ceph - install -d -m0755 debian/ceph-common/lib/tmpfiles.d - install -m 0644 -D systemd/ceph.tmpfiles.d debian/ceph-common/lib/tmpfiles.d/ceph.conf + install -d -m0755 debian/ceph-common/usr/lib/tmpfiles.d + install -m 0644 -D systemd/ceph.tmpfiles.d debian/ceph-common/usr/lib/tmpfiles.d/ceph.conf install -d -m0755 debian/ceph/lib/systemd/system install -m0644 systemd/ceph-mon@.service 
debian/ceph/lib/systemd/system @@ -154,8 +154,8 @@ binary-arch: build install install -d -m0755 debian/radosgw/lib/systemd/system install -m0644 systemd/ceph-radosgw@.service debian/radosgw/lib/systemd/system sed -i s./etc/sysconfig/./etc/default/.g debian/radosgw/lib/systemd/system/ceph-radosgw@.service - install -d -m0755 debian/radosgw/lib/tmpfiles.d - install -m 0644 -D systemd/ceph-rgw.tmpfiles.d debian/radosgw/lib/tmpfiles.d/ceph-rgw.conf + install -d -m0755 debian/radosgw/usr/lib/tmpfiles.d + install -m 0644 -D systemd/ceph-rgw.tmpfiles.d debian/radosgw/usr/lib/tmpfiles.d/ceph-rgw.conf dh_installman -a dh_lintian -a From ea91c4ef8558a0ad914d75ed7e3679b6c6767c89 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 8 Sep 2015 13:50:31 -0400 Subject: [PATCH 583/654] systemd: tmpfiles.d in /run, not /var/run Signed-off-by: Sage Weil --- systemd/ceph-rgw.tmpfiles.d.in | 2 +- systemd/ceph.tmpfiles.d | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/systemd/ceph-rgw.tmpfiles.d.in b/systemd/ceph-rgw.tmpfiles.d.in index 7cffe65a2b68d..9986b00949170 100644 --- a/systemd/ceph-rgw.tmpfiles.d.in +++ b/systemd/ceph-rgw.tmpfiles.d.in @@ -1,2 +1,2 @@ # create rgw socket directory -d /var/run/ceph-rgw 0755 @user_rgw@ @group_rgw@ - - +d /run/ceph-rgw 0755 @user_rgw@ @group_rgw@ - - diff --git a/systemd/ceph.tmpfiles.d b/systemd/ceph.tmpfiles.d index f068e268a133a..2ded82f5f659e 100644 --- a/systemd/ceph.tmpfiles.d +++ b/systemd/ceph.tmpfiles.d @@ -1 +1 @@ -d /var/run/ceph 0770 ceph ceph - +d /run/ceph 0770 ceph ceph - From c3d23caddf0f75ee01f2dc8730b59f4b62e97d49 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 8 Sep 2015 13:50:52 -0400 Subject: [PATCH 584/654] ceph-common: explicitly trigger /run/ceph creation This doesn't happen magically on debian. 
Signed-off-by: Sage Weil --- debian/ceph-common.postinst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/debian/ceph-common.postinst b/debian/ceph-common.postinst index 36410a3b630b6..719dc7908b95f 100644 --- a/debian/ceph-common.postinst +++ b/debian/ceph-common.postinst @@ -85,6 +85,9 @@ case "$1" in echo "..done" fi + # create /run/ceph. fail softly if systemd isn't present or + # something. + [ -x /bin/systemd-tmpfiles ] && systemd-tmpfiles --create || true ;; abort-upgrade|abort-remove|abort-deconfigure) : From 865708120faab14c5445577754ea9d71cc2b8dfe Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 11 Sep 2015 16:46:52 +0100 Subject: [PATCH 585/654] doc: remove references to default data/metadata pools These haven't existed since 0.84 -- the cephfs documentation was updated at the time, but there were also references in the rados documentation. Signed-off-by: John Spray --- doc/rados/operations/crush-map.rst | 6 +----- doc/rados/operations/monitoring-osd-pg.rst | 6 +++--- doc/rados/operations/pools.rst | 6 +----- doc/rados/operations/user-management.rst | 3 +-- 4 files changed, 6 insertions(+), 15 deletions(-) diff --git a/doc/rados/operations/crush-map.rst b/doc/rados/operations/crush-map.rst index 01db2b30799bb..d838ea96c8387 100644 --- a/doc/rados/operations/crush-map.rst +++ b/doc/rados/operations/crush-map.rst @@ -479,11 +479,7 @@ CRUSH maps support the notion of 'CRUSH rules', which are the rules that determine data placement for a pool. For large clusters, you will likely create many pools where each pool may have its own CRUSH ruleset and rules. The default CRUSH map has a rule for each pool, and one ruleset assigned to each of the -default pools, which include: - -- ``data`` -- ``metadata`` -- ``rbd`` +default pools. .. note:: In most cases, you will not need to modify the default rules. When you create a new pool, its default ruleset is ``0``. 
diff --git a/doc/rados/operations/monitoring-osd-pg.rst b/doc/rados/operations/monitoring-osd-pg.rst index 686b27bd6e80a..9a5bfe3ce47d9 100644 --- a/doc/rados/operations/monitoring-osd-pg.rst +++ b/doc/rados/operations/monitoring-osd-pg.rst @@ -230,9 +230,9 @@ few cases: Placement group IDs consist of the pool number (not pool name) followed by a period (.) and the placement group ID--a hexadecimal number. You can view pool numbers and their names from the output of ``ceph osd - lspools``. The default pool names ``data``, ``metadata`` and ``rbd`` - correspond to pool numbers ``0``, ``1`` and ``2`` respectively. A fully - qualified placement group ID has the following form:: + lspools``. For example, the default pool ``rbd`` corresponds to + pool number ``0``. A fully qualified placement group ID has the + following form:: {pool-num}.{pg-id} diff --git a/doc/rados/operations/pools.rst b/doc/rados/operations/pools.rst index 6f021b7f99e4a..1872b498a5c19 100644 --- a/doc/rados/operations/pools.rst +++ b/doc/rados/operations/pools.rst @@ -38,11 +38,7 @@ To list your cluster's pools, execute:: ceph osd lspools -The default pools include: - -- ``data`` -- ``metadata`` -- ``rbd`` +On a freshly installed cluster, only the ``rbd`` pool exists. .. _createpool: diff --git a/doc/rados/operations/user-management.rst b/doc/rados/operations/user-management.rst index a991c23f975ef..ccdb09787c488 100644 --- a/doc/rados/operations/user-management.rst +++ b/doc/rados/operations/user-management.rst @@ -201,8 +201,7 @@ The following entries describe each capability. Pool ---- -A pool is a logical partition where users store data. By default, a Ceph Storage -Cluster has `pools`_ for ``data``, ``rbd`` and ``metadata`` (metadata server). +A pool is a logical partition where users store data. In Ceph deployments, it is common to create a pool as a logical partition for similar types of data. 
For example, when deploying Ceph as a backend for OpenStack, a typical deployment would have pools for volumes, images, backups From 4dea76e0281aa3b1e4a157b5b0473db66e42adf4 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 11 Sep 2015 12:19:59 -0400 Subject: [PATCH 586/654] ceph.spec: include /etc/sysconfig/ceph Signed-off-by: Sage Weil --- ceph.spec.in | 1 + 1 file changed, 1 insertion(+) diff --git a/ceph.spec.in b/ceph.spec.in index 62e9e8ac6531d..e9161ad3ffaf1 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -793,6 +793,7 @@ mkdir -p %{_localstatedir}/run/ceph/ %config %{_sysconfdir}/security/limits.d/ceph.conf %config %{_sysconfdir}/bash_completion.d/ceph %config(noreplace) %{_sysconfdir}/logrotate.d/ceph +%config(noreplace) %{_sysconfdir}/sysconfig/ceph %if 0%{?suse_version} %config %{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-mon %config %{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds From 6e854338329786e1bc74b9e535c6776eef99cda5 Mon Sep 17 00:00:00 2001 From: Haomai Wang Date: Sat, 12 Sep 2015 17:31:10 +0800 Subject: [PATCH 587/654] AsyncMessenger: Kepp file_lock hold when accessing its event field When process_event get a file event, other thread may delete this event later but before we do event Fix #13001 Signed-off-by: Haomai Wang --- src/msg/async/Event.cc | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/msg/async/Event.cc b/src/msg/async/Event.cc index 79bff33673d0a..03119dec5c97c 100644 --- a/src/msg/async/Event.cc +++ b/src/msg/async/Event.cc @@ -367,29 +367,38 @@ int EventCenter::process_events(int timeout_microseconds) vector fired_events; next_time = shortest; numevents = driver->event_wait(fired_events, &tv); + file_lock.Lock(); for (int j = 0; j < numevents; j++) { int rfired = 0; FileEvent *event; - { - Mutex::Locker l(file_lock); - event = _get_file_event(fired_events[j].fd); - } + EventCallbackRef cb; + event = _get_file_event(fired_events[j].fd); + // FIXME: Actually 
we need to pick up some ways to reduce potential + // file_lock contention here. /* note the event->mask & mask & ... code: maybe an already processed * event removed an element that fired and we still didn't * processed, so we check if the event is still valid. */ if (event->mask & fired_events[j].mask & EVENT_READABLE) { rfired = 1; - event->read_cb->do_request(fired_events[j].fd); + cb = event->read_cb; + file_lock.Unlock(); + cb->do_request(fired_events[j].fd); + file_lock.Lock(); } if (event->mask & fired_events[j].mask & EVENT_WRITABLE) { - if (!rfired || event->read_cb != event->write_cb) - event->write_cb->do_request(fired_events[j].fd); + if (!rfired || event->read_cb != event->write_cb) { + cb = event->write_cb; + file_lock.Unlock(); + cb->do_request(fired_events[j].fd); + file_lock.Lock(); + } } ldout(cct, 20) << __func__ << " event_wq process is " << fired_events[j].fd << " mask is " << fired_events[j].mask << dendl; } + file_lock.Unlock(); if (trigger_time) numevents += process_time_events(); From ff9600a6cef613d40e875597b6392778df1bb04c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 12 Sep 2015 08:33:44 -0400 Subject: [PATCH 588/654] osd/ReplicatedPG: remove stray debug line This snuck in Signed-off-by: Sage Weil --- src/osd/ReplicatedPG.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 3d3ffe4e54dbb..b495c95b149d5 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -6806,8 +6806,6 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r) cop->results.final_tx = pgbackend->get_transaction(); _build_finish_copy_transaction(cop, cop->results.final_tx); - derr << __func__ << " got truncate_seq " << cop->results.truncate_seq - << " " << cop->results.truncate_size << dendl; // verify digests? 
if (cop->results.is_data_digest() || cop->results.is_omap_digest()) { dout(20) << __func__ << std::hex From f15d9585edc5a12ac2d076951076247253b897c2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 11 Sep 2015 10:15:07 -0400 Subject: [PATCH 589/654] osd: allow peek_map_epoch to return an error Allow PG::peek_map_epoch to return an error indicating the PG should be skipped. Signed-off-by: Sage Weil --- src/osd/OSD.cc | 8 +++++++- src/osd/PG.cc | 10 ++++++---- src/osd/PG.h | 3 ++- src/tools/ceph_objectstore_tool.cc | 12 +++++++++--- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index a577fd802c82a..f204135fe7f3e 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -2858,7 +2858,13 @@ void OSD::load_pgs() dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl; bufferlist bl; - epoch_t map_epoch = PG::peek_map_epoch(store, pgid, &bl); + epoch_t map_epoch = 0; + int r = PG::peek_map_epoch(store, pgid, &map_epoch, &bl); + if (r < 0) { + derr << __func__ << " unable to peek at " << pgid << " metadata, skipping" + << dendl; + continue; + } PG *pg = NULL; if (map_epoch > 0) { diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 48443ce634490..4e08927470410 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2728,9 +2728,10 @@ bool PG::_has_removal_flag(ObjectStore *store, return false; } -epoch_t PG::peek_map_epoch(ObjectStore *store, - spg_t pgid, - bufferlist *bl) +int PG::peek_map_epoch(ObjectStore *store, + spg_t pgid, + epoch_t *pepoch, + bufferlist *bl) { coll_t coll(pgid); ghobject_t legacy_infos_oid(OSD::make_infos_oid()); @@ -2764,7 +2765,8 @@ epoch_t PG::peek_map_epoch(ObjectStore *store, bp = values[epoch_key].begin(); ::decode(cur_epoch, bp); - return cur_epoch; + *pepoch = cur_epoch; + return 0; } void PG::write_if_dirty(ObjectStore::Transaction& t) diff --git a/src/osd/PG.h b/src/osd/PG.h index f1dde4cb6d54c..7859f1a5a3c0d 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -2187,7 +2187,8 @@ class PG { __u8 
&); void read_state(ObjectStore *store, bufferlist &bl); static bool _has_removal_flag(ObjectStore *store, spg_t pgid); - static epoch_t peek_map_epoch(ObjectStore *store, spg_t pgid, bufferlist *bl); + static int peek_map_epoch(ObjectStore *store, spg_t pgid, + epoch_t *pepoch, bufferlist *bl); void update_snap_map( const vector &log_entries, ObjectStore::Transaction& t); diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc index a2dbccbc5fc20..96dddc627df9a 100644 --- a/src/tools/ceph_objectstore_tool.cc +++ b/src/tools/ceph_objectstore_tool.cc @@ -475,10 +475,13 @@ int mark_pg_for_removal(ObjectStore *fs, spg_t pgid, ObjectStore::Transaction *t ghobject_t pgmeta_oid(info.pgid.make_pgmeta_oid()); bufferlist bl; - PG::peek_map_epoch(fs, pgid, &bl); + epoch_t map_epoch = 0; + int r = PG::peek_map_epoch(fs, pgid, &map_epoch, &bl); + if (r < 0) + cerr << __func__ << " warning: peek_map_epoch reported error" << std::endl; map past_intervals; __u8 struct_v; - int r = PG::read_info(fs, pgid, coll, bl, info, past_intervals, struct_v); + r = PG::read_info(fs, pgid, coll, bl, info, past_intervals, struct_v); if (r < 0) { cerr << __func__ << " error on read_info " << cpp_strerror(r) << std::endl; return r; @@ -2760,7 +2763,10 @@ int main(int argc, char **argv) } bufferlist bl; - map_epoch = PG::peek_map_epoch(fs, pgid, &bl); + map_epoch = 0; + ret = PG::peek_map_epoch(fs, pgid, &map_epoch, &bl); + if (ret < 0) + cerr << "peek_map_epoch reports error" << std::endl; if (debug) cerr << "map_epoch " << map_epoch << std::endl; From d33fea5d386f4960c967178c1b8b492ff3af5830 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 11 Sep 2015 10:16:23 -0400 Subject: [PATCH 590/654] sd/PG: tolerate missing pgmeta object Bug 10617 left stray PG dirs around in firefly. Hammer correctly ignores these, assuming they are leftover cruft. We broke this when we dropped compat support in cd4e676e6d45c8166290ef834d73c2a0bda98fa2 See also #13060. 
Signed-off-by: Sage Weil --- src/osd/PG.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 4e08927470410..39c26ea63ec7b 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2751,7 +2751,8 @@ int PG::peek_map_epoch(ObjectStore *store, map values; int r = store->omap_get_values(coll, pgmeta_oid, keys, &values); if (r != 0) { - assert(0 == "unable to open pg metadata"); + // probably bug 10617; see OSD::load_pgs() + return -1; } assert(values.size() == 2); From 4a5a5b3705b636131c6942c255787f4e950859f0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 12 Sep 2015 13:51:13 -0400 Subject: [PATCH 591/654] qa/workunits/cephtool/test.sh: make mds epoch check more tolerant This can race with an actual mdsmap epoch update for some other reason. We just need to make sure the epoch *increased*, not that it is exactly old + 1. Fixes: #12991 Signed-off-by: Sage Weil --- qa/workunits/cephtool/test.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh index 1ba1efee73583..9ca1727d4e793 100755 --- a/qa/workunits/cephtool/test.sh +++ b/qa/workunits/cephtool/test.sh @@ -748,8 +748,15 @@ function test_mon_mds() mdsmapfile=$TMPDIR/mdsmap.$$ current_epoch=$(ceph mds getmap -o $mdsmapfile --no-log-to-stderr 2>&1 | grep epoch | sed 's/.*epoch //') [ -s $mdsmapfile ] + # make several attempts in case we race with another mdsmap update ((epoch = current_epoch + 1)) - ceph mds setmap -i $mdsmapfile $epoch + ((epoch2 = current_epoch + 2)) + ((epoch3 = current_epoch + 3)) + ((epoch4 = current_epoch + 4)) + ceph mds setmap -i $mdsmapfile $epoch || \ + ceph mds setmap -i $mdsmapfile $epoch2 || \ + ceph mds setmap -i $mdsmapfile $epoch3 || \ + ceph mds setmap -i $mdsmapfile $epoch4 rm $mdsmapfile ceph osd pool create data2 10 From 51abff11688f0201b8f4076ac515e4515929d4cb Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 12 Sep 2015 14:12:20 -0400 Subject: 
[PATCH 592/654] ceph.spec: respect CEPH_EXTRA_CONFIGURE_ARGS Signed-off-by: Sage Weil --- ceph.spec.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ceph.spec.in b/ceph.spec.in index e9161ad3ffaf1..a2fda78f2f165 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -524,7 +524,7 @@ done %endif ./autogen.sh -MY_CONF_OPT="" +MY_CONF_OPT="$CEPH_EXTRA_CONFIGURE_ARGS" MY_CONF_OPT="$MY_CONF_OPT --with-radosgw" From c57e8688feb56da0d9411ae7c5ce4a03ffccd170 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 12 Sep 2015 14:17:01 -0400 Subject: [PATCH 593/654] rocksdb: ignore m4 Signed-off-by: Sage Weil --- src/rocksdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rocksdb b/src/rocksdb index a5e1ec9bcddc4..dcdb0dd29232e 160000 --- a/src/rocksdb +++ b/src/rocksdb @@ -1 +1 @@ -Subproject commit a5e1ec9bcddc44038a735e269fd5d6586f01c2cb +Subproject commit dcdb0dd29232ece43f093c99220b0eea7ead51ff From 98302adf99e4eb02755359c5db89aad1b187a4bc Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 12 Sep 2015 14:21:32 -0400 Subject: [PATCH 594/654] gmock: ignore *.pyc Signed-off-by: Sage Weil --- src/gmock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gmock b/src/gmock index d701f9d547a03..49beb3bdf05a7 160000 --- a/src/gmock +++ b/src/gmock @@ -1 +1 @@ -Subproject commit d701f9d547a03677d612d7af3f212871a427f058 +Subproject commit 49beb3bdf05a728afb48dbfbeb1a693ce4c38027 From af39f98b5342b1cb4db2a69f89a8b34f621b9a12 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 12 Sep 2015 17:43:52 -0400 Subject: [PATCH 595/654] .gitignore: ignore src/ceph.tmpe It makes the gitbuilders unhappy. Not sure where it comes from... 
Signed-off-by: Sage Weil --- src/.gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/.gitignore b/src/.gitignore index 23378701d22aa..524cb77f21ebe 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -94,3 +94,6 @@ Makefile /mkcephfs /.ceph_port /store_test_temp_dir + +# not sure what generates this... +/ceph.tmpe From 7d112c64839504177d556aa1b35721dad41b5362 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 13 Sep 2015 21:36:29 -0400 Subject: [PATCH 596/654] mon/MDSMonitor: drop incorrect m->put() Signed-off-by: Sage Weil --- src/mon/MDSMonitor.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 70b1110bb0951..2530628d80221 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -300,7 +300,6 @@ bool MDSMonitor::preprocess_beacon(MonOpRequestRef op) dout(7) << "mds_beacon " << *m << " is not in mdsmap (state " << ceph_mds_state_name(state) << ")" << dendl; mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &mdsmap)); - m->put(); return true; } else { return false; // not booted yet. 
From 3a7d91d250e9f185d7b001bce6d62a0a239a1574 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 13 Sep 2015 21:42:27 -0400 Subject: [PATCH 597/654] msg/simple: debug refs on sent messages Signed-off-by: Sage Weil --- src/msg/simple/SimpleMessenger.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/msg/simple/SimpleMessenger.cc b/src/msg/simple/SimpleMessenger.cc index be868b1e9323a..fc4b48b213c95 100644 --- a/src/msg/simple/SimpleMessenger.cc +++ b/src/msg/simple/SimpleMessenger.cc @@ -96,6 +96,7 @@ int SimpleMessenger::_send_message(Message *m, const entity_inst_t& dest) { // set envelope m->get_header().src = get_myname(); + m->set_cct(cct); if (!m->get_priority()) m->set_priority(get_default_send_priority()); From 68ecc55d791739feb9fa81284e6147942578d139 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 13 Sep 2015 21:44:51 -0400 Subject: [PATCH 598/654] mon: do not leak ref creating MonOpRequest Signed-off-by: Sage Weil --- src/mon/MonOpRequest.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mon/MonOpRequest.h b/src/mon/MonOpRequest.h index 65b031c0879ab..db08ba26eb8b1 100644 --- a/src/mon/MonOpRequest.h +++ b/src/mon/MonOpRequest.h @@ -84,7 +84,7 @@ struct MonOpRequest : public TrackedOp { MonOpRequest(Message *req, OpTracker *tracker) : TrackedOp(tracker, req->get_recv_stamp()), - request(req->get()), + request(req), session(NULL), con(NULL), forwarded_to_leader(false), From 640ebbfa3ad6f8dade0187588645f36f154625bf Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 13 Sep 2015 21:48:47 -0400 Subject: [PATCH 599/654] mon/PGMonitor: fix use-after-free in stats ack The MonOpRequestRef owns one ref; we need to get() another so we can pass it to send_reply. 
Signed-off-by: Sage Weil --- src/mon/PGMonitor.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index bc3ea49629ddb..9c6eefcf35cb1 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -847,6 +847,7 @@ void PGMonitor::_updated_stats(MonOpRequestRef op, MonOpRequestRef ack_op) op->mark_pgmon_event(__func__); ack_op->mark_pgmon_event(__func__); MPGStats *ack = static_cast(ack_op->get_req()); + ack->get(); // MonOpRequestRef owns one ref; give the other to send_reply. dout(7) << "_updated_stats for " << op->get_req()->get_orig_source_inst() << dendl; mon->send_reply(op, ack); From 40c3c85a39f7874b4288665ccb6a192a44bbc30f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 13 Sep 2015 22:04:29 -0400 Subject: [PATCH 600/654] mon: debug MonSession refs Signed-off-by: Sage Weil --- src/mon/Session.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mon/Session.h b/src/mon/Session.h index 98b4a8f2da8fd..a3260210fe786 100644 --- a/src/mon/Session.h +++ b/src/mon/Session.h @@ -59,6 +59,7 @@ struct MonSession : public RefCountedObject { uint64_t proxy_tid; MonSession(const entity_inst_t& i, Connection *c) : + RefCountedObject(g_ceph_context), con(c), inst(i), closed(false), item(this), auid(0), global_id(0), From d99e6899caa8836cef338034e34ac6e4da928ed1 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Sep 2015 09:56:57 -0400 Subject: [PATCH 601/654] mon: fix MonSession leak when waitlisting op Signed-off-by: Sage Weil --- src/mon/Monitor.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 08420265d3650..61fed6faec392 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -3507,10 +3507,9 @@ void Monitor::dispatch(MonOpRequestRef op) if (is_synchronizing() && !src_is_mon) { waitlist_or_zap_client(op); - return; + } else { + dispatch_op(op); } - - dispatch_op(op); s->put(); return; } From c2a9764fddb9f5adb970150b4ce723e3580c8e4f Mon Sep 17 
00:00:00 2001 From: Sage Weil Date: Mon, 14 Sep 2015 10:24:22 -0400 Subject: [PATCH 602/654] mon: do not return ref to MonOpRequest::get_session() caller Most callers don't put the ref, and since we hold the op ref it's not necessary. This will also make a somewhat cleaner transition to MonSessionRef later. Signed-off-by: Sage Weil --- src/mon/AuthMonitor.cc | 3 --- src/mon/MonOpRequest.h | 2 +- src/mon/Monitor.cc | 15 +-------------- 3 files changed, 2 insertions(+), 18 deletions(-) diff --git a/src/mon/AuthMonitor.cc b/src/mon/AuthMonitor.cc index 24cacb31a4da0..c948680840bbc 100644 --- a/src/mon/AuthMonitor.cc +++ b/src/mon/AuthMonitor.cc @@ -452,8 +452,6 @@ bool AuthMonitor::prep_auth(MonOpRequestRef op, bool paxos_writable) goto done; } - s->put(); - if (!mon->is_leader()) { dout(10) << "not the leader, requesting more ids from leader" << dendl; int leader = mon->get_leader(); @@ -511,7 +509,6 @@ bool AuthMonitor::prep_auth(MonOpRequestRef op, bool paxos_writable) reply = new MAuthReply(proto, &response_bl, ret, s->global_id); mon->send_reply(op, reply); done: - s->put(); return true; } diff --git a/src/mon/MonOpRequest.h b/src/mon/MonOpRequest.h index db08ba26eb8b1..e8339f6c646c1 100644 --- a/src/mon/MonOpRequest.h +++ b/src/mon/MonOpRequest.h @@ -140,7 +140,7 @@ struct MonOpRequest : public TrackedOp { MonSession *get_session() const { if (!session) return NULL; - return (MonSession*)session->get(); + return session; } template diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 61fed6faec392..d886272534a69 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -3083,8 +3083,6 @@ void Monitor::forward_request_leader(MonOpRequestRef op) } else { dout(10) << "forward_request no session for request " << *req << dendl; } - if (session) - session->put(); } // fake connection attached to forwarded messages @@ -3166,7 +3164,6 @@ void Monitor::handle_forward(MonOpRequestRef op) _ms_dispatch(req); s->put(); } - session->put(); } void 
Monitor::try_send_message(Message *m, const entity_inst_t& to) @@ -3228,7 +3225,6 @@ void Monitor::send_reply(MonOpRequestRef op, Message *reply) session->con->send_message(reply); op->mark_event("reply: send"); } - session->put(); } void Monitor::no_reply(MonOpRequestRef op) @@ -3261,7 +3257,6 @@ void Monitor::no_reply(MonOpRequestRef op) << " " << *req << dendl; op->mark_event("no_reply"); } - session->put(); } void Monitor::handle_route(MonOpRequestRef op) @@ -3272,7 +3267,6 @@ void Monitor::handle_route(MonOpRequestRef op) if (session && !session->is_capable("mon", MON_CAP_X)) { dout(0) << "MRoute received from entity without appropriate perms! " << dendl; - session->put(); return; } if (m->msg) @@ -3304,8 +3298,6 @@ void Monitor::handle_route(MonOpRequestRef op) m->msg = NULL; } } - if (session) - session->put(); } void Monitor::resend_routed_requests() @@ -3444,7 +3436,6 @@ void Monitor::dispatch(MonOpRequestRef op) if (s && s->closed) { caps = s->caps; reuse_caps = true; - s->put(); s = NULL; } Message *m = op->get_req(); @@ -3510,7 +3501,6 @@ void Monitor::dispatch(MonOpRequestRef op) } else { dispatch_op(op); } - s->put(); return; } @@ -4207,7 +4197,6 @@ void Monitor::handle_subscribe(MonOpRequestRef op) if (reply) m->get_connection()->send_message(new MMonSubscribeAck(monmap->get_fsid(), (int)g_conf->mon_subscribe_interval)); - s->put(); } void Monitor::handle_get_version(MonOpRequestRef op) @@ -4252,10 +4241,8 @@ void Monitor::handle_get_version(MonOpRequestRef op) m->get_connection()->send_message(reply); } - - out: - s->put(); + return; } bool Monitor::ms_handle_reset(Connection *con) From 0b03b32d8ba76fe9f6f1158e68eb440e3670393a Mon Sep 17 00:00:00 2001 From: Bo Cai Date: Mon, 14 Sep 2015 19:19:05 +0800 Subject: [PATCH 603/654] tools:remove the local file when get map failed. 
Signed-off-by: Bo Cai --- src/tools/ceph_monstore_tool.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/tools/ceph_monstore_tool.cc b/src/tools/ceph_monstore_tool.cc index eec3924ccc832..c979bcdabe38e 100644 --- a/src/tools/ceph_monstore_tool.cc +++ b/src/tools/ceph_monstore_tool.cc @@ -12,6 +12,7 @@ */ #include #include +#include #include #include @@ -600,6 +601,13 @@ int main(int argc, char **argv) { } } + BOOST_SCOPE_EXIT((&r) (&fd) (&outpath)) { + ::close(fd); + if (r < 0 && fd != STDOUT_FILENO) { + ::remove(outpath.c_str()); + } + } BOOST_SCOPE_EXIT_END + bufferlist bl; r = 0; if (map_type == "osdmap") { @@ -610,7 +618,6 @@ int main(int argc, char **argv) { if (r < 0) { std::cerr << "Error getting map: " << cpp_strerror(r) << std::endl; err = EINVAL; - ::close(fd); goto done; } bl.write_fd(fd); From adb8478b619138cbfbcc580c9955975328d2e755 Mon Sep 17 00:00:00 2001 From: Jason Dillaman Date: Mon, 14 Sep 2015 11:23:31 -0400 Subject: [PATCH 604/654] include/inline_memory: out-of-bounds read on unaligned memory When checking if a >=16 byte unaligned buffer is zeroed, the 32bit check will read outside the buffer memory range. 
Fixes: #13082 Signed-off-by: Jason Dillaman --- src/include/inline_memory.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/include/inline_memory.h b/src/include/inline_memory.h index 33c6bc0ea0dce..f2166826b76d2 100644 --- a/src/include/inline_memory.h +++ b/src/include/inline_memory.h @@ -77,13 +77,9 @@ static inline bool mem_is_zero(const char *data, size_t len) bool mem_is_zero(const char *data, size_t len) { - const char *max = data + len; - const char* max32 = data + (len / sizeof(uint32_t))*sizeof(uint32_t); -#if defined(__GNUC__) && defined(__x86_64__) // we do have XMM registers in x86-64, so if we need to check at least - // 16 bytes, make use of them - int left = len; - if (left / sizeof(uint128_t) > 0) { + // 16 bytes, make use of them + if (len / sizeof(uint128_t) > 0) { // align data pointer to 16 bytes, otherwise it'll segfault due to bug // in (at least some) GCC versions (using MOVAPS instead of MOVUPS). // check up to 15 first bytes while at it. @@ -92,10 +88,11 @@ bool mem_is_zero(const char *data, size_t len) return false; } data += sizeof(uint8_t); - left--; + --len; } - const char* max128 = data + (left / sizeof(uint128_t))*sizeof(uint128_t); + const char* data_start = data; + const char* max128 = data + (len / sizeof(uint128_t))*sizeof(uint128_t); while (data < max128) { if (*(uint128_t*)data != 0) { @@ -103,8 +100,11 @@ bool mem_is_zero(const char *data, size_t len) } data += sizeof(uint128_t); } + len -= (data - data_start); } -#endif + + const char* max = data + len; + const char* max32 = data + (len / sizeof(uint32_t))*sizeof(uint32_t); while (data < max32) { if (*(uint32_t*)data != 0) { return false; From 8678a8acec1c0ee123f97021358ce26cdfe06c90 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Sep 2015 10:51:33 -0400 Subject: [PATCH 605/654] kill /etc/security/limits.d/ceph This only affects login shells; not useful. 
Signed-off-by: Sage Weil --- Makefile.am | 1 - ceph.spec.in | 2 -- debian/ceph-common.install | 1 - debian/rules | 1 - etc/ceph.limits.d | 9 --------- 5 files changed, 14 deletions(-) delete mode 100644 etc/ceph.limits.d diff --git a/Makefile.am b/Makefile.am index 2508278d31d86..7ff3cf7f1221f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -10,7 +10,6 @@ EXTRA_DIST += \ src/test/cli \ src/test/downloads \ systemd/ceph.tmpfiles.d \ - etc/ceph.limits.d \ etc/default/ceph \ etc/sysconfig/ceph \ etc/sysconfig/SuSEfirewall2.d/services/ceph-mon \ diff --git a/ceph.spec.in b/ceph.spec.in index a2fda78f2f165..45fc06eadc204 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -592,7 +592,6 @@ find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';' find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';' install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap -install -D etc/ceph.limits.d $RPM_BUILD_ROOT%{_sysconfdir}/security/limits.d/ceph.conf install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_sysconfdir}/sysconfig/ceph %if 0%{?_with_systemd} install -m 0644 -D systemd/ceph.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/%{name}.conf @@ -790,7 +789,6 @@ mkdir -p %{_localstatedir}/run/ceph/ %{_libdir}/ceph/erasure-code/libec_*.so* %{_udevrulesdir}/60-ceph-partuuid-workaround.rules %{_udevrulesdir}/95-ceph-osd.rules -%config %{_sysconfdir}/security/limits.d/ceph.conf %config %{_sysconfdir}/bash_completion.d/ceph %config(noreplace) %{_sysconfdir}/logrotate.d/ceph %config(noreplace) %{_sysconfdir}/sysconfig/ceph diff --git a/debian/ceph-common.install b/debian/ceph-common.install index 1fa4c1309f086..4e21adff9c474 100644 --- a/debian/ceph-common.install +++ b/debian/ceph-common.install @@ -25,6 +25,5 @@ usr/share/ceph/id_dsa_drop.ceph.com usr/share/ceph/id_dsa_drop.ceph.com.pub etc/ceph/rbdmap etc/init.d/rbdmap -etc/security/limits.d/ceph lib/udev/rules.d/50-rbd.rules 
usr/lib/python*/dist-packages/ceph_argparse.py* diff --git a/debian/rules b/debian/rules index 251eb42d37b61..c81036a471951 100755 --- a/debian/rules +++ b/debian/rules @@ -85,7 +85,6 @@ install: build install -D -m 644 udev/95-ceph-osd.rules $(DESTDIR)/lib/udev/rules.d/95-ceph-osd.rules install -D -m 644 src/rbdmap $(DESTDIR)/etc/ceph/rbdmap install -D -m 755 src/init-rbdmap $(DESTDIR)/etc/init.d/rbdmap - install -D -m 644 etc/ceph.limits.d $(DESTDIR)/etc/security/limits.d/ceph # Add here commands to install the package into debian/testpack. # Build architecture-independent files here. diff --git a/etc/ceph.limits.d b/etc/ceph.limits.d deleted file mode 100644 index 702aa0332ecb7..0000000000000 --- a/etc/ceph.limits.d +++ /dev/null @@ -1,9 +0,0 @@ -# /etc/security/limits.d/ceph -# -# -# - -# We want a very large value for nofile for the ceph user as the ceph -# clients and daemons consume lots and lots of file descriptors. - -ceph - nofile 4194304 From 1ff51a299d20dd73a95ba55fdec57498b8c71c13 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Sep 2015 12:31:42 -0400 Subject: [PATCH 606/654] osd: drop default map cache size from 500 -> 200 OSDs don't need this from each other anymore since they send a pg_interval_map_t with pg notify. This cache is only used during startup when loading things up (and 200 is still > the max advance of 150), and when clients appear with super old maps (pretty unusual unless they are idle for long periods). It should mean a pretty significant cut in memory... 
Signed-off-by: Sage Weil --- src/common/config_opts.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index de05abf6be647..22139ba264900 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -591,8 +591,8 @@ OPTION(osd_tier_default_cache_min_read_recency_for_promote, OPT_INT, 1) // numbe OPTION(osd_tier_default_cache_min_write_recency_for_promote, OPT_INT, 1) // number of recent HitSets the object must appear in to be promoted (on write) OPTION(osd_map_dedup, OPT_BOOL, true) -OPTION(osd_map_max_advance, OPT_INT, 200) // make this < cache_size! -OPTION(osd_map_cache_size, OPT_INT, 500) +OPTION(osd_map_max_advance, OPT_INT, 150) // make this < cache_size! +OPTION(osd_map_cache_size, OPT_INT, 200) OPTION(osd_map_message_max, OPT_INT, 100) // max maps per MOSDMap message OPTION(osd_map_share_max_epochs, OPT_INT, 100) // cap on # of inc maps we send to peers, clients OPTION(osd_inject_bad_map_crc_probability, OPT_FLOAT, 0) From 8453a89cb237f21e35e7a10daddec592ac3af258 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Sep 2015 10:54:53 -0400 Subject: [PATCH 607/654] systemd: set nofile limit in unit files Make it big so hopefully nobody has to change it. 
Signed-off-by: Sage Weil --- systemd/ceph-mds@.service | 1 + systemd/ceph-mon@.service | 1 + systemd/ceph-osd@.service.in | 1 + systemd/ceph-radosgw@.service | 1 + 4 files changed, 4 insertions(+) diff --git a/systemd/ceph-mds@.service b/systemd/ceph-mds@.service index 7e5a95e8c4ebe..c66e641746876 100644 --- a/systemd/ceph-mds@.service +++ b/systemd/ceph-mds@.service @@ -5,6 +5,7 @@ Wants=network-online.target local-fs.target PartOf=ceph.target [Service] +LimitNOFILE=1048576 EnvironmentFile=-/etc/sysconfig/ceph Environment=CLUSTER=ceph ExecStart=/usr/bin/ceph-mds -f --cluster ${CLUSTER} --id %i --setuser ceph --setgroup ceph diff --git a/systemd/ceph-mon@.service b/systemd/ceph-mon@.service index 7ac9b8f2ec769..ba6087daf2b9e 100644 --- a/systemd/ceph-mon@.service +++ b/systemd/ceph-mon@.service @@ -11,6 +11,7 @@ Wants=network-online.target local-fs.target ceph-create-keys@%i.service PartOf=ceph.target [Service] +LimitNOFILE=1048576 EnvironmentFile=-/etc/sysconfig/ceph Environment=CLUSTER=ceph ExecStart=/usr/bin/ceph-mon -f --cluster ${CLUSTER} --id %i --setuser ceph --setgroup ceph diff --git a/systemd/ceph-osd@.service.in b/systemd/ceph-osd@.service.in index fac1932f58f22..1f7ccc4c10f1c 100644 --- a/systemd/ceph-osd@.service.in +++ b/systemd/ceph-osd@.service.in @@ -5,6 +5,7 @@ Wants=network-online.target local-fs.target PartOf=ceph.target [Service] +LimitNOFILE=1048576 EnvironmentFile=-/etc/sysconfig/ceph Environment=CLUSTER=ceph ExecStart=/usr/bin/ceph-osd -f --cluster ${CLUSTER} --id %i --setuser ceph --setgroup ceph diff --git a/systemd/ceph-radosgw@.service b/systemd/ceph-radosgw@.service index 486cef889cb44..6558890794abf 100644 --- a/systemd/ceph-radosgw@.service +++ b/systemd/ceph-radosgw@.service @@ -5,6 +5,7 @@ Wants=network-online.target local-fs.target PartOf=ceph.target [Service] +LimitNOFILE=1048576 EnvironmentFile=-/etc/sysconfig/ceph Environment=CLUSTER=ceph ExecStart=/usr/bin/radosgw -f --cluster ${CLUSTER} --name client.%i --setuser ceph 
--setgroup ceph From 367c794cb1ab701d6a28bdbaa5f7cd84e16a5e32 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Sep 2015 10:58:15 -0400 Subject: [PATCH 608/654] systemd: no need to preprocess ceph-osd@service This used to be necessary but now is not. Signed-off-by: Sage Weil --- .gitignore | 1 - configure.ac | 1 - systemd/{ceph-osd@.service.in => ceph-osd@.service} | 0 3 files changed, 2 deletions(-) rename systemd/{ceph-osd@.service.in => ceph-osd@.service} (100%) diff --git a/.gitignore b/.gitignore index 557861a19335e..f6233f3e3e050 100644 --- a/.gitignore +++ b/.gitignore @@ -52,7 +52,6 @@ missing py-compile release stamp-h1 -systemd/ceph-osd@.service systemd/ceph-rgw.tmpfiles.d systemd/Makefile vgcore.* diff --git a/configure.ac b/configure.ac index 3ed736e7a7498..58c0b6bc592a3 100644 --- a/configure.ac +++ b/configure.ac @@ -1342,7 +1342,6 @@ AC_CONFIG_FILES([Makefile man/Makefile doc/Makefile selinux/Makefile - systemd/ceph-osd@.service systemd/ceph-rgw.tmpfiles.d ceph.spec]) AC_OUTPUT diff --git a/systemd/ceph-osd@.service.in b/systemd/ceph-osd@.service similarity index 100% rename from systemd/ceph-osd@.service.in rename to systemd/ceph-osd@.service From 8e13d89f0f136f5be8ab869190062abfcca0e23d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Sep 2015 13:52:11 -0400 Subject: [PATCH 609/654] systemd: eliminate ceph-rgw tmpfiles.d file This is for storing the rgw socket files for fastcgi, which we do not want to enable by default. 
Signed-off-by: Sage Weil --- .gitignore | 1 - ceph.spec.in | 4 ---- configure.ac | 1 - debian/rules | 2 -- systemd/Makefile.am | 1 - systemd/ceph-rgw.tmpfiles.d.in | 2 -- 6 files changed, 11 deletions(-) delete mode 100644 systemd/ceph-rgw.tmpfiles.d.in diff --git a/.gitignore b/.gitignore index f6233f3e3e050..3f9056e7e23e2 100644 --- a/.gitignore +++ b/.gitignore @@ -52,7 +52,6 @@ missing py-compile release stamp-h1 -systemd/ceph-rgw.tmpfiles.d systemd/Makefile vgcore.* diff --git a/ceph.spec.in b/ceph.spec.in index 45fc06eadc204..c068af6a4268c 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -595,7 +595,6 @@ install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_sysconfdir}/sysconfig/ceph %if 0%{?_with_systemd} install -m 0644 -D systemd/ceph.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/%{name}.conf - install -m 0644 -D systemd/ceph-rgw.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/%{name}-rgw.conf install -m 0644 -D systemd/ceph-osd@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-osd@.service install -m 0644 -D systemd/ceph-mon@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-mon@.service install -m 0644 -D systemd/ceph-create-keys@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-create-keys@.service @@ -916,7 +915,6 @@ fi %dir %{_localstatedir}/log/radosgw/ %dir %{_localstatedir}/lib/ceph/radosgw %if 0%{?_with_systemd} -%{_tmpfilesdir}/%{name}-rgw.conf %else %{_initrddir}/ceph-radosgw %{_sbindir}/rcceph-radosgw @@ -925,8 +923,6 @@ fi %post radosgw /sbin/ldconfig %if 0%{?suse_version} - # TODO: find out what exactly this systemd-tmpfiles inovcation is for - systemd-tmpfiles --create /%{_tmpfilesdir}/ceph-rgw.conf # explicit systemctl daemon-reload (that's the only relevant bit of # service_add_post; the rest is all sysvinit --> systemd migration which # isn't applicable in this context (see above comment). 
diff --git a/configure.ac b/configure.ac index 58c0b6bc592a3..3e20affe674da 100644 --- a/configure.ac +++ b/configure.ac @@ -1342,6 +1342,5 @@ AC_CONFIG_FILES([Makefile man/Makefile doc/Makefile selinux/Makefile - systemd/ceph-rgw.tmpfiles.d ceph.spec]) AC_OUTPUT diff --git a/debian/rules b/debian/rules index c81036a471951..95fdfbd454524 100755 --- a/debian/rules +++ b/debian/rules @@ -153,8 +153,6 @@ binary-arch: build install install -d -m0755 debian/radosgw/lib/systemd/system install -m0644 systemd/ceph-radosgw@.service debian/radosgw/lib/systemd/system sed -i s./etc/sysconfig/./etc/default/.g debian/radosgw/lib/systemd/system/ceph-radosgw@.service - install -d -m0755 debian/radosgw/usr/lib/tmpfiles.d - install -m 0644 -D systemd/ceph-rgw.tmpfiles.d debian/radosgw/usr/lib/tmpfiles.d/ceph-rgw.conf dh_installman -a dh_lintian -a diff --git a/systemd/Makefile.am b/systemd/Makefile.am index 3db6c85f5d8af..b7fde384c83d1 100644 --- a/systemd/Makefile.am +++ b/systemd/Makefile.am @@ -15,5 +15,4 @@ EXTRA_DIST = \ $(unitfiles) \ ceph \ ceph.tmpfiles.d \ - ceph-rgw.tmpfiles.d.in \ ceph-radosgw-prestart.sh diff --git a/systemd/ceph-rgw.tmpfiles.d.in b/systemd/ceph-rgw.tmpfiles.d.in deleted file mode 100644 index 9986b00949170..0000000000000 --- a/systemd/ceph-rgw.tmpfiles.d.in +++ /dev/null @@ -1,2 +0,0 @@ -# create rgw socket directory -d /run/ceph-rgw 0755 @user_rgw@ @group_rgw@ - - From fedf360a3eddeeda641149e4f21ba5f95b6a9725 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Sep 2015 14:37:37 -0400 Subject: [PATCH 610/654] osd: queue all transactions under an osr This was missed in e7bbafa3bfbd5e936a8be026a30b83a89f6121c3. 
Signed-off-by: Sage Weil --- src/osd/OSD.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 46514f354c03c..76a7cf65b7b6c 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -2515,7 +2515,7 @@ void OSD::clear_temp_objects() dout(20) << " removing " << *p << " object " << *q << dendl; t.remove(*p, *q); } - store->apply_transaction(t); + store->apply_transaction(service.meta_osr.get(), t); } } } @@ -6446,7 +6446,7 @@ void OSD::handle_osd_map(MOSDMap *m) // superblock and commit write_superblock(t); store->queue_transaction( - 0, + service.meta_osr.get(), _t, new C_OnMapApply(&service, _t, pinned_maps, osdmap->get_epoch()), 0, 0); @@ -6537,7 +6537,7 @@ void OSD::check_osdmap_features(ObjectStore *fs) superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS); ObjectStore::Transaction *t = new ObjectStore::Transaction; write_superblock(*t); - int err = store->queue_transaction_and_cleanup(NULL, t); + int err = store->queue_transaction_and_cleanup(service.meta_osr.get(), t); assert(err == 0); fs->set_allow_sharded_objects(); } From 10b00f09d6f9c5434c917a122e81c19cee620cad Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Sep 2015 14:39:35 -0400 Subject: [PATCH 611/654] os/FileStore: assert Sequencer is passed in; kill default_osr Signed-off-by: Sage Weil --- src/os/FileStore.cc | 4 +--- src/os/FileStore.h | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index 5f17544461937..44cd9fd9e1c7d 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -526,7 +526,6 @@ FileStore::FileStore(const std::string &base, const std::string &jdev, osflagbit stop(false), sync_thread(this), fdcache(g_ceph_context), wbthrottle(g_ceph_context), - default_osr("default"), throttle_ops(g_ceph_context, "filestore_ops",g_conf->filestore_queue_max_ops), throttle_bytes(g_ceph_context, "filestore_bytes",g_conf->filestore_queue_max_bytes), 
op_finisher(g_ceph_context), @@ -1933,8 +1932,7 @@ int FileStore::queue_transactions(Sequencer *posr, list &tls, // set up the sequencer OpSequencer *osr; - if (!posr) - posr = &default_osr; + assert(posr); if (posr->p) { osr = static_cast(posr->p.get()); dout(5) << "queue_transactions existing " << *osr << "/" << osr->parent << dendl; //<< " w/ q " << osr->q << dendl; diff --git a/src/os/FileStore.h b/src/os/FileStore.h index b9480b044cd9d..3e59827166915 100644 --- a/src/os/FileStore.h +++ b/src/os/FileStore.h @@ -329,7 +329,6 @@ class FileStore : public JournalingObjectStore, FDCache fdcache; WBThrottle wbthrottle; - Sequencer default_osr; deque op_queue; Throttle throttle_ops, throttle_bytes; Finisher op_finisher; From b46cdc02faaf814c33660833493c1d75967d2c4a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Sep 2015 14:48:11 -0400 Subject: [PATCH 612/654] os/ObjectStore: fix leak ref on Sequencer_impl This should start at 0 refs, not 1. Signed-off-by: Sage Weil --- src/os/ObjectStore.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h index 57c99d540d81f..c8a54268c9702 100644 --- a/src/os/ObjectStore.h +++ b/src/os/ObjectStore.h @@ -155,7 +155,7 @@ class ObjectStore { Context *c ///< [in] context to call upon flush/commit ) = 0; ///< @return true if idle, false otherwise - Sequencer_impl() : RefCountedObject(0) {} + Sequencer_impl() : RefCountedObject(NULL, 0) {} virtual ~Sequencer_impl() {} }; typedef boost::intrusive_ptr Sequencer_implRef; From 47e7953c2bd38657f0ce881ee512fa833e176447 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Sep 2015 15:19:05 -0400 Subject: [PATCH 613/654] os/FileStore: improve debug output for sequencers Signed-off-by: Sage Weil --- src/os/FileStore.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index 44cd9fd9e1c7d..43961deee3d01 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ 
-1935,12 +1935,13 @@ int FileStore::queue_transactions(Sequencer *posr, list &tls, assert(posr); if (posr->p) { osr = static_cast(posr->p.get()); - dout(5) << "queue_transactions existing " << *osr << "/" << osr->parent << dendl; //<< " w/ q " << osr->q << dendl; + dout(5) << "queue_transactions existing " << osr << " " << *osr << dendl; } else { osr = new OpSequencer; + osr->set_cct(g_ceph_context); osr->parent = posr; posr->p = osr; - dout(5) << "queue_transactions new " << *osr << "/" << osr->parent << dendl; + dout(5) << "queue_transactions new " << osr << " " << *osr << dendl; } // used to include osr information in tracepoints during transaction apply From 16c672c7772888e7738729fb19ddcbca9c2e4bf9 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Sep 2015 15:20:45 -0400 Subject: [PATCH 614/654] os/newstore: kill default_osr Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 4 +--- src/os/newstore/NewStore.h | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 953dcd22d5864..0bd70200acc3e 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -608,7 +608,6 @@ NewStore::NewStore(CephContext *cct, const string& path) kv_lock("NewStore::kv_lock"), kv_stop(false), logger(NULL), - default_osr("default"), reap_lock("NewStore::reap_lock") { _init_logger(); @@ -2759,8 +2758,7 @@ int NewStore::queue_transactions( // set up the sequencer OpSequencer *osr; - if (!posr) - posr = &default_osr; + assert(posr); if (posr->p) { osr = static_cast(posr->p.get()); dout(5) << __func__ << " existing " << *osr << "/" << osr->parent << dendl; //<< " w/ q " << osr->q << dendl; diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index a399bc0462375..97c5d6a8a979e 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -504,8 +504,6 @@ class NewStore : public ObjectStore { Logger *logger; - Sequencer default_osr; - Mutex reap_lock; Cond 
reap_cond; list removed_collections; From e7b57cd0dacb933fb11de3864e8b20ac4ea17081 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Sep 2015 15:20:52 -0400 Subject: [PATCH 615/654] os/newstore: better sequencer debug Signed-off-by: Sage Weil --- src/os/newstore/NewStore.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 0bd70200acc3e..4ece39b1e3514 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -2761,12 +2761,12 @@ int NewStore::queue_transactions( assert(posr); if (posr->p) { osr = static_cast(posr->p.get()); - dout(5) << __func__ << " existing " << *osr << "/" << osr->parent << dendl; //<< " w/ q " << osr->q << dendl; + dout(5) << __func__ << " existing " << osr << " " << *osr << dendl; } else { osr = new OpSequencer; osr->parent = posr; posr->p = osr; - dout(5) << __func__ << " new " << *osr << "/" << osr->parent << dendl; + dout(5) << __func__ << " new " << osr << " " << *osr << dendl; } // prepare From 83c83e44403d332ffddf3b11d11124ee86f1cbdb Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Sep 2015 15:21:41 -0400 Subject: [PATCH 616/654] os/KeyValueStore: kill default_osr Signed-off-by: Sage Weil --- src/os/KeyValueStore.cc | 4 +--- src/os/KeyValueStore.h | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/os/KeyValueStore.cc b/src/os/KeyValueStore.cc index a147c0b27ab42..3328db5e56f0c 100644 --- a/src/os/KeyValueStore.cc +++ b/src/os/KeyValueStore.cc @@ -534,7 +534,6 @@ KeyValueStore::KeyValueStore(const std::string &base, ondisk_finisher(g_ceph_context), collections_lock("KeyValueStore::collections_lock"), lock("KeyValueStore::lock"), - default_osr("default"), throttle_ops(g_ceph_context, "keyvaluestore_ops", g_conf->keyvaluestore_queue_max_ops), throttle_bytes(g_ceph_context, "keyvaluestore_bytes", g_conf->keyvaluestore_queue_max_bytes), op_finisher(g_ceph_context), @@ -1049,8 +1048,7 @@ int 
KeyValueStore::queue_transactions(Sequencer *posr, list &tls, // set up the sequencer OpSequencer *osr; - if (!posr) - posr = &default_osr; + assert(posr); if (posr->p) { osr = static_cast(posr->p.get()); dout(5) << "queue_transactions existing " << *osr << "/" << osr->parent diff --git a/src/os/KeyValueStore.h b/src/os/KeyValueStore.h index 148083762af97..90e41ee85a171 100644 --- a/src/os/KeyValueStore.h +++ b/src/os/KeyValueStore.h @@ -429,7 +429,6 @@ class KeyValueStore : public ObjectStore, friend ostream& operator<<(ostream& out, const OpSequencer& s); - Sequencer default_osr; deque op_queue; Throttle throttle_ops, throttle_bytes; Finisher op_finisher; From 1002201a097d10b00b3ee3cedd5f954188a2f1b1 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Sep 2015 15:21:51 -0400 Subject: [PATCH 617/654] os/KeyValueStore: better osr debug Signed-off-by: Sage Weil --- src/os/KeyValueStore.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/os/KeyValueStore.cc b/src/os/KeyValueStore.cc index 3328db5e56f0c..681bd148bbe50 100644 --- a/src/os/KeyValueStore.cc +++ b/src/os/KeyValueStore.cc @@ -1051,13 +1051,13 @@ int KeyValueStore::queue_transactions(Sequencer *posr, list &tls, assert(posr); if (posr->p) { osr = static_cast(posr->p.get()); - dout(5) << "queue_transactions existing " << *osr << "/" << osr->parent + dout(5) << "queue_transactions existing " << osr << " " << *osr << "/" << osr->parent << dendl; //<< " w/ q " << osr->q << dendl; } else { osr = new OpSequencer; osr->parent = posr; posr->p = osr; - dout(5) << "queue_transactions new " << *osr << "/" << osr->parent << dendl; + dout(5) << "queue_transactions new " << osr << " " << *osr << "/" << osr->parent << dendl; } Op *o = build_op(tls, ondisk, onreadable, onreadable_sync, osd_op); From df921125b5d436bd9540b7b816710bc575494792 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Sep 2015 15:24:08 -0400 Subject: [PATCH 618/654] ceph_objectstore_test: fix warnings MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In file included from test/objectstore/store_test.cc:34:0: ../src/gmock/gtest/include/gtest/gtest.h: In instantiation of ‘testing::AssertionResult testing::internal::CmpHelperEQ(const char*, const char*, const T1&, const T2&) [with T1 = unsigned int; T2 = int]’: ../src/gmock/gtest/include/gtest/gtest.h:1484:23: required from ‘static testing::AssertionResult testing::internal::EqHelper::Compare(const char*, const char*, const T1&, const T2&) [with T1 = unsigned int; T2 = int; bool lhs_is_null_literal = false]’ test/objectstore/store_test.cc:411:5: required from here ../src/gmock/gtest/include/gtest/gtest.h:1448:16: warning: comparison between signed and unsigned integer expressions [-Wsign-compare] if (expected == actual) { ^ ../src/gmock/gtest/include/gtest/gtest.h: In instantiation of ‘testing::AssertionResult testing::internal::CmpHelperEQ(const char*, const char*, const T1&, const T2&) [with T1 = int; T2 = unsigned int]’: ../src/gmock/gtest/include/gtest/gtest.h:1484:23: required from ‘static testing::AssertionResult testing::internal::EqHelper::Compare(const char*, const char*, const T1&, const T2&) [with T1 = int; T2 = unsigned int; bool lhs_is_null_literal = false]’ test/objectstore/store_test.cc:2003:7: required from here ../src/gmock/gtest/include/gtest/gtest.h:1448:16: warning: comparison between signed and unsigned integer expressions [-Wsign-compare] ../src/gmock/gtest/include/gtest/gtest.h: In instantiation of ‘testing::AssertionResult testing::internal::CmpHelperEQ(const char*, const char*, const T1&, const T2&) [with T1 = int; T2 = long unsigned int]’: ../src/gmock/gtest/include/gtest/gtest.h:1484:23: required from ‘static testing::AssertionResult testing::internal::EqHelper::Compare(const char*, const char*, const T1&, const T2&) [with T1 = int; T2 = long unsigned int; bool lhs_is_null_literal = false]’ test/objectstore/store_test.cc:2010:7: required from here 
../src/gmock/gtest/include/gtest/gtest.h:1448:16: warning: comparison between signed and unsigned integer expressions [-Wsign-compare] ../src/gmock/gtest/include/gtest/gtest.h: In instantiation of ‘testing::AssertionResult testing::internal::CmpHelperEQ(const char*, const char*, const T1&, const T2&) [with T1 = long unsigned int; T2 = int]’: ../src/gmock/gtest/include/gtest/gtest.h:1484:23: required from ‘static testing::AssertionResult testing::internal::EqHelper::Compare(const char*, const char*, const T1&, const T2&) [with T1 = long unsigned int; T2 = int; bool lhs_is_null_literal = false]’ test/objectstore/store_test.cc:2080:5: required from here ../src/gmock/gtest/include/gtest/gtest.h:1448:16: warning: comparison between signed and unsigned integer expressions [-Wsign-compare] Signed-off-by: Sage Weil --- src/test/objectstore/store_test.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index 8e9ca11222603..3a47d120f1437 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -408,7 +408,7 @@ TEST_P(StoreTest, SimpleObjectTest) { bufferlist in; r = store->read(cid, hoid, 0, bl.length(), in); - ASSERT_EQ(bl.length(), r); + ASSERT_EQ((int)bl.length(), r); in.hexdump(cout); ASSERT_TRUE(in.contents_equal(bl)); } @@ -2000,14 +2000,14 @@ TEST_P(StoreTest, OMapTest) { bufferlist hdr; map m; store->omap_get(cid, hoid, &hdr, &m); - ASSERT_EQ(6, hdr.length()); + ASSERT_EQ(6u, hdr.length()); ASSERT_TRUE(m.count("2")); ASSERT_TRUE(!m.count("3")); ASSERT_TRUE(!m.count("6")); ASSERT_TRUE(m.count("7")); ASSERT_TRUE(m.count("8")); //cout << m << std::endl; - ASSERT_EQ(6, m.size()); + ASSERT_EQ(6u, m.size()); } { ObjectStore::Transaction t; @@ -2018,8 +2018,8 @@ TEST_P(StoreTest, OMapTest) { bufferlist hdr; map m; store->omap_get(cid, hoid, &hdr, &m); - ASSERT_EQ(0, hdr.length()); - ASSERT_EQ(0, m.size()); + ASSERT_EQ(0u, hdr.length()); + 
ASSERT_EQ(0u, m.size()); } } @@ -2077,7 +2077,7 @@ TEST_P(StoreTest, OMapIterator) { } ASSERT_EQ(correct, true); } - ASSERT_EQ(attrs.size(), count); + ASSERT_EQ((int)attrs.size(), count); // FileStore may deadlock an active iterator vs apply_transaction iter = ObjectMap::ObjectMapIterator(); From 139b5d6235f055a1fb921b8c5087af712f8c86d9 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Sep 2015 20:09:50 -0400 Subject: [PATCH 619/654] os: require Sequencer arg for apply_transaction() Signed-off-by: Sage Weil --- src/os/ObjectStore.h | 8 - src/test/bench/small_io_bench_fs.cc | 5 +- .../objectstore/DeterministicOpSequence.cc | 18 +- src/test/objectstore/TestObjectStoreState.cc | 3 +- src/test/objectstore/store_test.cc | 242 ++++++++++-------- src/test/objectstore/test_idempotent.cc | 3 +- src/test/objectstore_bench.cc | 12 +- src/test/streamtest.cc | 3 +- src/test/test_trans.cc | 3 +- src/test/xattr_bench.cc | 3 +- src/tools/ceph_objectstore_tool.cc | 55 ++-- 11 files changed, 200 insertions(+), 155 deletions(-) diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h index c8a54268c9702..ec14dcd766c4d 100644 --- a/src/os/ObjectStore.h +++ b/src/os/ObjectStore.h @@ -1600,19 +1600,11 @@ class ObjectStore { }; // synchronous wrappers - unsigned apply_transaction(Transaction& t, Context *ondisk=0) { - list tls; - tls.push_back(&t); - return apply_transactions(NULL, tls, ondisk); - } unsigned apply_transaction(Sequencer *osr, Transaction& t, Context *ondisk=0) { list tls; tls.push_back(&t); return apply_transactions(osr, tls, ondisk); } - unsigned apply_transactions(list& tls, Context *ondisk=0) { - return apply_transactions(NULL, tls, ondisk); - } unsigned apply_transactions(Sequencer *osr, list& tls, Context *ondisk=0); int queue_transaction_and_cleanup(Sequencer *osr, Transaction* t, diff --git a/src/test/bench/small_io_bench_fs.cc b/src/test/bench/small_io_bench_fs.cc index 83b085ca53bb0..4b273e410a895 100644 --- a/src/test/bench/small_io_bench_fs.cc +++ 
b/src/test/bench/small_io_bench_fs.cc @@ -134,6 +134,7 @@ int main(int argc, char **argv) FileStore fs(vm["filestore-path"].as(), vm["journal-path"].as()); + ObjectStore::Sequencer osr(__func__); if (fs.mkfs() < 0) { cout << "mkfs failed" << std::endl; @@ -172,12 +173,12 @@ int main(int argc, char **argv) std::cout << "collection " << pgid << std::endl; ObjectStore::Transaction t; t.create_collection(coll_t(pgid), 0); - fs.apply_transaction(t); + fs.apply_transaction(&osr, t); } { ObjectStore::Transaction t; t.create_collection(coll_t(), 0); - fs.apply_transaction(t); + fs.apply_transaction(&osr, t); } vector > benchers( diff --git a/src/test/objectstore/DeterministicOpSequence.cc b/src/test/objectstore/DeterministicOpSequence.cc index 03209a7900b8c..c26173ffa3977 100644 --- a/src/test/objectstore/DeterministicOpSequence.cc +++ b/src/test/objectstore/DeterministicOpSequence.cc @@ -452,7 +452,7 @@ void DeterministicOpSequence::_do_coll_create(coll_t cid, uint32_t pg_num, uint6 dout(0) << "Give collection: " << cid << " a hint, pg_num is: " << pg_num << ", num_objs is: " << num_objs << dendl; - m_store->apply_transaction(t); + m_store->apply_transaction(&m_osr, t); } void DeterministicOpSequence::_do_touch(coll_t coll, hobject_t& obj) @@ -460,7 +460,7 @@ void DeterministicOpSequence::_do_touch(coll_t coll, hobject_t& obj) ObjectStore::Transaction t; note_txn(&t); t.touch(coll, ghobject_t(obj)); - m_store->apply_transaction(t); + m_store->apply_transaction(&m_osr, t); } void DeterministicOpSequence::_do_remove(coll_t coll, hobject_t& obj) @@ -468,7 +468,7 @@ void DeterministicOpSequence::_do_remove(coll_t coll, hobject_t& obj) ObjectStore::Transaction t; note_txn(&t); t.remove(coll, ghobject_t(obj)); - m_store->apply_transaction(t); + m_store->apply_transaction(&m_osr, t); } void DeterministicOpSequence::_do_set_attrs(coll_t coll, @@ -478,7 +478,7 @@ void DeterministicOpSequence::_do_set_attrs(coll_t coll, ObjectStore::Transaction t; note_txn(&t); t.omap_setkeys(coll, 
ghobject_t(obj), attrs); - m_store->apply_transaction(t); + m_store->apply_transaction(&m_osr, t); } void DeterministicOpSequence::_do_write(coll_t coll, hobject_t& obj, @@ -487,7 +487,7 @@ void DeterministicOpSequence::_do_write(coll_t coll, hobject_t& obj, ObjectStore::Transaction t; note_txn(&t); t.write(coll, ghobject_t(obj), off, len, data); - m_store->apply_transaction(t); + m_store->apply_transaction(&m_osr, t); } void DeterministicOpSequence::_do_clone(coll_t coll, hobject_t& orig_obj, @@ -496,7 +496,7 @@ void DeterministicOpSequence::_do_clone(coll_t coll, hobject_t& orig_obj, ObjectStore::Transaction t; note_txn(&t); t.clone(coll, ghobject_t(orig_obj), ghobject_t(new_obj)); - m_store->apply_transaction(t); + m_store->apply_transaction(&m_osr, t); } void DeterministicOpSequence::_do_clone_range(coll_t coll, @@ -507,7 +507,7 @@ void DeterministicOpSequence::_do_clone_range(coll_t coll, note_txn(&t); t.clone_range(coll, ghobject_t(orig_obj), ghobject_t(new_obj), srcoff, srclen, dstoff); - m_store->apply_transaction(t); + m_store->apply_transaction(&m_osr, t); } void DeterministicOpSequence::_do_write_and_clone_range(coll_t coll, @@ -523,7 +523,7 @@ void DeterministicOpSequence::_do_write_and_clone_range(coll_t coll, t.write(coll, ghobject_t(orig_obj), srcoff, bl.length(), bl); t.clone_range(coll, ghobject_t(orig_obj), ghobject_t(new_obj), srcoff, srclen, dstoff); - m_store->apply_transaction(t); + m_store->apply_transaction(&m_osr, t); } void DeterministicOpSequence::_do_coll_move(coll_t orig_coll, coll_t new_coll, @@ -533,6 +533,6 @@ void DeterministicOpSequence::_do_coll_move(coll_t orig_coll, coll_t new_coll, note_txn(&t); t.remove(new_coll, ghobject_t(obj)); t.collection_move_rename(orig_coll, ghobject_t(obj), new_coll, ghobject_t(obj)); - m_store->apply_transaction(t); + m_store->apply_transaction(&m_osr, t); } diff --git a/src/test/objectstore/TestObjectStoreState.cc b/src/test/objectstore/TestObjectStoreState.cc index 7693fc7530d17..e4252ce7b24f5 
100644 --- a/src/test/objectstore/TestObjectStoreState.cc +++ b/src/test/objectstore/TestObjectStoreState.cc @@ -33,11 +33,12 @@ void TestObjectStoreState::init(int colls, int objs) { dout(5) << "init " << colls << " colls " << objs << " objs" << dendl; + ObjectStore::Sequencer osr(__func__); ObjectStore::Transaction *t; t = new ObjectStore::Transaction; t->create_collection(coll_t::meta(), 0); - m_store->apply_transaction(*t); + m_store->apply_transaction(&osr, *t); wait_for_ready(); diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index 3a47d120f1437..7f95a4f369fd7 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -104,6 +104,7 @@ TEST_P(StoreTest, TrivialRemount) { } TEST_P(StoreTest, SimpleRemount) { + ObjectStore::Sequencer osr("test"); coll_t cid; ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP))); @@ -115,7 +116,7 @@ TEST_P(StoreTest, SimpleRemount) { ObjectStore::Transaction t; t.create_collection(cid, 0); t.write(cid, hoid, 0, bl.length(), bl); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } store->umount(); @@ -124,7 +125,7 @@ TEST_P(StoreTest, SimpleRemount) { { ObjectStore::Transaction t; t.write(cid, hoid2, 0, bl.length(), bl); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } { @@ -133,12 +134,13 @@ TEST_P(StoreTest, SimpleRemount) { t.remove(cid, hoid2); t.remove_collection(cid); cerr << "remove collection" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } } TEST_P(StoreTest, IORemount) { + ObjectStore::Sequencer osr("test"); coll_t cid; bufferlist bl; bl.append("1234512345"); @@ -151,7 +153,7 @@ TEST_P(StoreTest, IORemount) { ghobject_t hoid(hobject_t(sobject_t("Object " + stringify(n), CEPH_NOSNAP))); t.write(cid, hoid, 0, bl.length(), bl); } - r = 
store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } // overwrites @@ -161,7 +163,7 @@ TEST_P(StoreTest, IORemount) { ObjectStore::Transaction t; ghobject_t hoid(hobject_t(sobject_t("Object " + stringify(n), CEPH_NOSNAP))); t.write(cid, hoid, 1, bl.length(), bl); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } } @@ -175,78 +177,81 @@ TEST_P(StoreTest, IORemount) { t.remove(cid, hoid); } t.remove_collection(cid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } } TEST_P(StoreTest, SimpleMetaColTest) { + ObjectStore::Sequencer osr("test"); coll_t cid; int r = 0; { ObjectStore::Transaction t; t.create_collection(cid, 0); cerr << "create collection" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } { ObjectStore::Transaction t; t.remove_collection(cid); cerr << "remove collection" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } { ObjectStore::Transaction t; t.create_collection(cid, 0); cerr << "add collection" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } { ObjectStore::Transaction t; t.remove_collection(cid); cerr << "remove collection" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } } TEST_P(StoreTest, SimplePGColTest) { + ObjectStore::Sequencer osr("test"); coll_t cid(spg_t(pg_t(1,2), shard_id_t::NO_SHARD)); int r = 0; { ObjectStore::Transaction t; t.create_collection(cid, 4); cerr << "create collection" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } { ObjectStore::Transaction t; t.remove_collection(cid); cerr << "remove collection" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } 
{ ObjectStore::Transaction t; t.create_collection(cid, 4); cerr << "add collection" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } { ObjectStore::Transaction t; t.remove_collection(cid); cerr << "remove collection" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } } TEST_P(StoreTest, SimpleColPreHashTest) { + ObjectStore::Sequencer osr("test"); // Firstly we will need to revert the value making sure // collection hint actually works int merge_threshold = g_ceph_context->_conf->filestore_merge_threshold; @@ -278,7 +283,7 @@ TEST_P(StoreTest, SimpleColPreHashTest) { ::encode(expected_num_objs, hint); t.collection_hint(cid, ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS, hint); cerr << "collection hint" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } { @@ -286,7 +291,7 @@ TEST_P(StoreTest, SimpleColPreHashTest) { ObjectStore::Transaction t; t.remove_collection(cid); cerr << "remove collection" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } // Revert the config change so that it does not affect the split/merge tests @@ -298,6 +303,7 @@ TEST_P(StoreTest, SimpleColPreHashTest) { } TEST_P(StoreTest, SimpleObjectTest) { + ObjectStore::Sequencer osr("test"); int r; coll_t cid; ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); @@ -310,7 +316,7 @@ TEST_P(StoreTest, SimpleObjectTest) { ObjectStore::Transaction t; t.create_collection(cid, 0); cerr << "Creating collection " << cid << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } { @@ -320,7 +326,7 @@ TEST_P(StoreTest, SimpleObjectTest) { ObjectStore::Transaction t; t.touch(cid, hoid); cerr << "Creating object " << hoid << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); 
ASSERT_EQ(r, 0); exists = store->exists(cid, hoid); @@ -331,7 +337,7 @@ TEST_P(StoreTest, SimpleObjectTest) { t.remove(cid, hoid); t.touch(cid, hoid); cerr << "Remove then create" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } { @@ -342,7 +348,7 @@ TEST_P(StoreTest, SimpleObjectTest) { t.remove(cid, hoid); t.write(cid, hoid, 0, 5, bl); cerr << "Remove then create" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); bufferlist in; @@ -358,7 +364,7 @@ TEST_P(StoreTest, SimpleObjectTest) { exp.append(bl); t.write(cid, hoid, 5, 5, bl); cerr << "Append" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); bufferlist in; @@ -373,7 +379,7 @@ TEST_P(StoreTest, SimpleObjectTest) { exp = bl; t.write(cid, hoid, 0, 10, bl); cerr << "Full overwrite" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); bufferlist in; @@ -387,7 +393,7 @@ TEST_P(StoreTest, SimpleObjectTest) { bl.append("abcde"); t.write(cid, hoid, 3, 5, bl); cerr << "Partial overwrite" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); bufferlist in, exp; @@ -403,7 +409,7 @@ TEST_P(StoreTest, SimpleObjectTest) { bl.append("abcde01234012340123401234abcde01234012340123401234abcde01234012340123401234abcde01234012340123401234"); t.write(cid, hoid, 0, bl.length(), bl); cerr << "larger overwrite" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); bufferlist in; @@ -417,12 +423,13 @@ TEST_P(StoreTest, SimpleObjectTest) { t.remove(cid, hoid); t.remove_collection(cid); cerr << "Cleaning" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } } TEST_P(StoreTest, ManySmallWrite) { + ObjectStore::Sequencer osr("test"); int r; coll_t cid; 
ghobject_t a(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); @@ -431,7 +438,7 @@ TEST_P(StoreTest, ManySmallWrite) { ObjectStore::Transaction t; t.create_collection(cid, 0); cerr << "Creating collection " << cid << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } bufferlist bl; @@ -441,13 +448,13 @@ TEST_P(StoreTest, ManySmallWrite) { for (int i=0; i<100; ++i) { ObjectStore::Transaction t; t.write(cid, a, i*4096, 4096, bl, 0); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } for (int i=0; i<100; ++i) { ObjectStore::Transaction t; t.write(cid, b, (rand() % 1024)*4096, 4096, bl, 0); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } { @@ -456,12 +463,13 @@ TEST_P(StoreTest, ManySmallWrite) { t.remove(cid, b); t.remove_collection(cid); cerr << "Cleaning" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } } TEST_P(StoreTest, SimpleAttrTest) { + ObjectStore::Sequencer osr("test"); int r; coll_t cid; ghobject_t hoid(hobject_t(sobject_t("attr object 1", CEPH_NOSNAP))); @@ -479,7 +487,7 @@ TEST_P(StoreTest, SimpleAttrTest) { { ObjectStore::Transaction t; t.create_collection(cid, 0); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } { @@ -496,7 +504,7 @@ TEST_P(StoreTest, SimpleAttrTest) { t.touch(cid, hoid); t.setattr(cid, hoid, "foo", val); t.setattr(cid, hoid, "bar", val2); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } { @@ -523,19 +531,20 @@ TEST_P(StoreTest, SimpleAttrTest) { ObjectStore::Transaction t; t.remove(cid, hoid); t.remove_collection(cid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } } TEST_P(StoreTest, SimpleListTest) { + ObjectStore::Sequencer osr("test"); int r; coll_t cid(spg_t(pg_t(0, 1), shard_id_t(1))); { 
ObjectStore::Transaction t; t.create_collection(cid, 0); cerr << "Creating collection " << cid << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } set all; @@ -551,7 +560,7 @@ TEST_P(StoreTest, SimpleListTest) { t.touch(cid, hoid); cerr << "Creating object " << hoid << std::endl; } - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } for (int bitwise=0; bitwise<2; ++bitwise) { @@ -590,7 +599,7 @@ TEST_P(StoreTest, SimpleListTest) { t.remove(cid, *p); t.remove_collection(cid); cerr << "Cleaning" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } } @@ -624,6 +633,7 @@ TEST_P(StoreTest, Sort) { } TEST_P(StoreTest, MultipoolListTest) { + ObjectStore::Sequencer osr("test"); int r; int poolid = 4373; coll_t cid = coll_t(spg_t(pg_t(0, poolid), shard_id_t::NO_SHARD)); @@ -631,7 +641,7 @@ TEST_P(StoreTest, MultipoolListTest) { ObjectStore::Transaction t; t.create_collection(cid, 0); cerr << "Creating collection " << cid << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } set all, saw; @@ -649,7 +659,7 @@ TEST_P(StoreTest, MultipoolListTest) { t.touch(cid, hoid); cerr << "Creating object " << hoid << std::endl; } - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } { @@ -676,19 +686,20 @@ TEST_P(StoreTest, MultipoolListTest) { t.remove(cid, *p); t.remove_collection(cid); cerr << "Cleaning" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } } TEST_P(StoreTest, SimpleCloneTest) { + ObjectStore::Sequencer osr("test"); int r; coll_t cid; { ObjectStore::Transaction t; t.create_collection(cid, 0); cerr << "Creating collection " << cid << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } ghobject_t 
hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); @@ -704,7 +715,7 @@ TEST_P(StoreTest, SimpleCloneTest) { t.setattr(cid, hoid, "attr3", xlarge); t.write(cid, hoid, 10, small.length(), small); cerr << "Creating object and set attr " << hoid << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP))); @@ -717,7 +728,7 @@ TEST_P(StoreTest, SimpleCloneTest) { t.setattr(cid, hoid, "attr1", large); t.setattr(cid, hoid, "attr2", small); cerr << "Clone object and rm attr" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); r = store->read(cid, hoid, 10, 5, newdata); @@ -749,19 +760,20 @@ TEST_P(StoreTest, SimpleCloneTest) { t.remove(cid, hoid2); t.remove_collection(cid); cerr << "Cleaning" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } } TEST_P(StoreTest, OmapCloneTest) { + ObjectStore::Sequencer osr("test"); int r; coll_t cid; { ObjectStore::Transaction t; t.create_collection(cid, 0); cerr << "Creating collection " << cid << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); @@ -778,7 +790,7 @@ TEST_P(StoreTest, OmapCloneTest) { t.omap_setkeys(cid, hoid, km); t.omap_setheader(cid, hoid, header); cerr << "Creating object and set omap " << hoid << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP))); @@ -786,7 +798,7 @@ TEST_P(StoreTest, OmapCloneTest) { ObjectStore::Transaction t; t.clone(cid, hoid, hoid2); cerr << "Clone object" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } { @@ -802,19 +814,20 @@ TEST_P(StoreTest, OmapCloneTest) { t.remove(cid, 
hoid2); t.remove_collection(cid); cerr << "Cleaning" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } } TEST_P(StoreTest, SimpleCloneRangeTest) { + ObjectStore::Sequencer osr("test"); int r; coll_t cid; { ObjectStore::Transaction t; t.create_collection(cid, 0); cerr << "Creating collection " << cid << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); @@ -825,7 +838,7 @@ TEST_P(StoreTest, SimpleCloneRangeTest) { ObjectStore::Transaction t; t.write(cid, hoid, 10, 5, small); cerr << "Creating object and write bl " << hoid << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP))); @@ -834,7 +847,7 @@ TEST_P(StoreTest, SimpleCloneRangeTest) { ObjectStore::Transaction t; t.clone_range(cid, hoid, hoid2, 10, 5, 0); cerr << "Clone range object" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); r = store->read(cid, hoid2, 0, 5, newdata); ASSERT_EQ(r, 5); @@ -845,7 +858,7 @@ TEST_P(StoreTest, SimpleCloneRangeTest) { t.truncate(cid, hoid, 1024*1024); t.clone_range(cid, hoid, hoid2, 0, 1024*1024, 0); cerr << "Clone range object" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); struct stat stat, stat2; r = store->stat(cid, hoid, &stat); @@ -859,20 +872,21 @@ TEST_P(StoreTest, SimpleCloneRangeTest) { t.remove(cid, hoid2); t.remove_collection(cid); cerr << "Cleaning" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } } TEST_P(StoreTest, SimpleObjectLongnameTest) { + ObjectStore::Sequencer osr("test"); int r; coll_t cid; { ObjectStore::Transaction t; t.create_collection(cid, 0); cerr << "Creating collection " << cid << 
std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } ghobject_t hoid(hobject_t(sobject_t("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaObjectaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa 1", CEPH_NOSNAP))); @@ -880,7 +894,7 @@ TEST_P(StoreTest, SimpleObjectLongnameTest) { ObjectStore::Transaction t; t.touch(cid, hoid); cerr << "Creating object " << hoid << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } { @@ -888,12 +902,13 @@ TEST_P(StoreTest, SimpleObjectLongnameTest) { t.remove(cid, hoid); t.remove_collection(cid); cerr << "Cleaning" << std::endl; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } } TEST_P(StoreTest, ManyObjectTest) { + ObjectStore::Sequencer osr("test"); int NUM_OBJS = 2000; int r = 0; coll_t cid; @@ -903,7 +918,7 @@ TEST_P(StoreTest, ManyObjectTest) { { ObjectStore::Transaction t; t.create_collection(cid, 0); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } for (int i = 0; i < NUM_OBJS; ++i) { @@ -916,7 +931,7 @@ TEST_P(StoreTest, ManyObjectTest) { ghobject_t hoid(hobject_t(sobject_t(string(buf) + base, CEPH_NOSNAP))); t.touch(cid, hoid); created.insert(hoid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } @@ -1006,14 +1021,14 @@ TEST_P(StoreTest, ManyObjectTest) { ++i) { ObjectStore::Transaction t; t.remove(cid, *i); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } cerr << "cleaning up" << std::endl; { ObjectStore::Transaction t; t.remove_collection(cid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } 
} @@ -1101,6 +1116,7 @@ class SyntheticWorkloadState { SyntheticWorkloadState *state; ObjectStore::Transaction *t; ghobject_t oid, noid; + C_SyntheticOnClone(SyntheticWorkloadState *state, ObjectStore::Transaction *t, ghobject_t oid, ghobject_t noid) : state(state), t(t), oid(oid), noid(noid) {} @@ -1139,7 +1155,7 @@ class SyntheticWorkloadState { bl.append(bp); } - + SyntheticWorkloadState(ObjectStore *store, ObjectGenerator *gen, gen_type *rng, @@ -1151,7 +1167,7 @@ class SyntheticWorkloadState { int init() { ObjectStore::Transaction t; t.create_collection(cid, 0); - return store->apply_transaction(t); + return store->apply_transaction(osr, t); } ghobject_t get_uniform_random_object() { @@ -1679,13 +1695,14 @@ TEST_P(StoreTest, AttrSynthetic) { } TEST_P(StoreTest, HashCollisionTest) { + ObjectStore::Sequencer osr("test"); int64_t poolid = 11; coll_t cid(spg_t(pg_t(0,poolid),shard_id_t::NO_SHARD)); int r; { ObjectStore::Transaction t; t.create_collection(cid, 0); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } string base = ""; @@ -1704,7 +1721,7 @@ TEST_P(StoreTest, HashCollisionTest) { { ObjectStore::Transaction t; t.touch(cid, hoid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } created.insert(hoid); @@ -1751,23 +1768,24 @@ TEST_P(StoreTest, HashCollisionTest) { ++i) { ObjectStore::Transaction t; t.remove(cid, *i); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } ObjectStore::Transaction t; t.remove_collection(cid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } TEST_P(StoreTest, ScrubTest) { + ObjectStore::Sequencer osr("test"); int64_t poolid = 111; coll_t cid(spg_t(pg_t(0, poolid),shard_id_t(1))); int r; { ObjectStore::Transaction t; t.create_collection(cid, 0); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } string base = 
"aaaaa"; @@ -1784,7 +1802,7 @@ TEST_P(StoreTest, ScrubTest) { { ObjectStore::Transaction t; t.touch(cid, hoid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } created.insert(hoid); @@ -1800,7 +1818,7 @@ TEST_P(StoreTest, ScrubTest) { t.touch(cid, hoid1); t.touch(cid, hoid2); t.touch(cid, hoid3); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); created.insert(hoid1); created.insert(hoid2); created.insert(hoid3); @@ -1848,24 +1866,25 @@ TEST_P(StoreTest, ScrubTest) { ++i) { ObjectStore::Transaction t; t.remove(cid, *i); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } ObjectStore::Transaction t; t.remove_collection(cid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } TEST_P(StoreTest, OMapTest) { + ObjectStore::Sequencer osr("test"); coll_t cid; ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, 0, "")); int r; { ObjectStore::Transaction t; t.create_collection(cid, 0); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } @@ -1876,7 +1895,7 @@ TEST_P(StoreTest, OMapTest) { t.omap_clear(cid, hoid); map start_set; t.omap_setkeys(cid, hoid, start_set); - store->apply_transaction(t); + store->apply_transaction(&osr, t); } for (int i = 0; i < 100; i++) { @@ -1911,7 +1930,7 @@ TEST_P(StoreTest, OMapTest) { to_add.insert(pair("key-" + string(buf), bl)); attrs.insert(pair("key-" + string(buf), bl)); t.omap_setkeys(cid, hoid, to_add); - store->apply_transaction(t); + store->apply_transaction(&osr, t); } int i = 0; @@ -1941,7 +1960,7 @@ TEST_P(StoreTest, OMapTest) { set keys_to_remove; keys_to_remove.insert(to_remove); t.omap_rmkeys(cid, hoid, keys_to_remove); - store->apply_transaction(t); + store->apply_transaction(&osr, t); attrs.erase(to_remove); @@ -1953,14 +1972,14 @@ TEST_P(StoreTest, OMapTest) { bl1.append("omap_header"); ObjectStore::Transaction t; 
t.omap_setheader(cid, hoid, bl1); - store->apply_transaction(t); + store->apply_transaction(&osr, t); bufferlist bl2; bl2.append("value"); map to_add; to_add.insert(pair("key", bl2)); t.omap_setkeys(cid, hoid, to_add); - store->apply_transaction(t); + store->apply_transaction(&osr, t); bufferlist bl3; map cur_attrs; @@ -1989,12 +2008,12 @@ TEST_P(StoreTest, OMapTest) { t.touch(cid, hoid); t.omap_setheader(cid, hoid, h); t.omap_setkeys(cid, hoid, to_set); - store->apply_transaction(t); + store->apply_transaction(&osr, t); } { ObjectStore::Transaction t; t.omap_rmkeyrange(cid, hoid, "3", "7"); - store->apply_transaction(t); + store->apply_transaction(&osr, t); } { bufferlist hdr; @@ -2012,7 +2031,7 @@ TEST_P(StoreTest, OMapTest) { { ObjectStore::Transaction t; t.omap_clear(cid, hoid); - store->apply_transaction(t); + store->apply_transaction(&osr, t); } { bufferlist hdr; @@ -2026,11 +2045,12 @@ TEST_P(StoreTest, OMapTest) { ObjectStore::Transaction t; t.remove(cid, hoid); t.remove_collection(cid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } TEST_P(StoreTest, OMapIterator) { + ObjectStore::Sequencer osr("test"); coll_t cid; ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, 0, "")); int count = 0; @@ -2038,7 +2058,7 @@ TEST_P(StoreTest, OMapIterator) { { ObjectStore::Transaction t; t.create_collection(cid, 0); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } @@ -2049,7 +2069,7 @@ TEST_P(StoreTest, OMapIterator) { t.omap_clear(cid, hoid); map start_set; t.omap_setkeys(cid, hoid, start_set); - store->apply_transaction(t); + store->apply_transaction(&osr, t); } ObjectMap::ObjectMapIterator iter; bool correct; @@ -2092,7 +2112,7 @@ TEST_P(StoreTest, OMapIterator) { attrs.insert(pair("key-" + string(buf), bl)); ObjectStore::Transaction t; t.omap_setkeys(cid, hoid, to_add); - store->apply_transaction(t); + store->apply_transaction(&osr, t); } iter = 
store->get_omap_iterator(cid, hoid); @@ -2118,12 +2138,13 @@ TEST_P(StoreTest, OMapIterator) { ObjectStore::Transaction t; t.remove(cid, hoid); t.remove_collection(cid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } } TEST_P(StoreTest, XattrTest) { + ObjectStore::Sequencer osr("test"); coll_t cid; ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, 0, "")); bufferlist big; @@ -2139,7 +2160,7 @@ TEST_P(StoreTest, XattrTest) { ObjectStore::Transaction t; t.create_collection(cid, 0); t.touch(cid, hoid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } @@ -2158,7 +2179,7 @@ TEST_P(StoreTest, XattrTest) { attrs["attr4"] = big; t.setattr(cid, hoid, "attr3", big); attrs["attr3"] = big; - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } @@ -2177,7 +2198,7 @@ TEST_P(StoreTest, XattrTest) { ObjectStore::Transaction t; t.rmattr(cid, hoid, "attr2"); attrs.erase("attr2"); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } @@ -2205,7 +2226,7 @@ TEST_P(StoreTest, XattrTest) { ObjectStore::Transaction t; t.remove(cid, hoid); t.remove_collection(cid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } @@ -2214,13 +2235,14 @@ void colsplittest( unsigned num_objects, unsigned common_suffix_size ) { + ObjectStore::Sequencer osr("test"); coll_t cid(spg_t(pg_t(0,52),shard_id_t::NO_SHARD)); coll_t tid(spg_t(pg_t(1<apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } { @@ -2235,14 +2257,14 @@ void colsplittest( i<apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } { ObjectStore::Transaction t; t.create_collection(tid, common_suffix_size + 1); t.split_collection(cid, common_suffix_size+1, 1<apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } @@ -2273,7 +2295,7 @@ void 
colsplittest( t.remove_collection(cid); t.remove_collection(tid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } @@ -2298,12 +2320,13 @@ TEST_P(StoreTest, ColSplitTest3) { * stops at the common prefix subdir. See bug * #5273 */ TEST_P(StoreTest, TwoHash) { + ObjectStore::Sequencer osr("test"); coll_t cid; int r; { ObjectStore::Transaction t; t.create_collection(cid, 0); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } std::cout << "Making objects" << std::endl; @@ -2317,7 +2340,7 @@ TEST_P(StoreTest, TwoHash) { } o.hobj.set_hash((i << 16) | 0xB1); t.touch(cid, o); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } std::cout << "Removing half" << std::endl; @@ -2327,7 +2350,7 @@ TEST_P(StoreTest, TwoHash) { o.hobj.pool = -1; o.hobj.set_hash((i << 16) | 0xA1); t.remove(cid, o); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } std::cout << "Checking" << std::endl; @@ -2355,16 +2378,17 @@ TEST_P(StoreTest, TwoHash) { t.remove(cid, o); o.hobj.set_hash((i << 16) | 0xB1); t.remove(cid, o); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } ObjectStore::Transaction t; t.remove_collection(cid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } TEST_P(StoreTest, MoveRename) { + ObjectStore::Sequencer osr("test"); coll_t cid(spg_t(pg_t(0, 212),shard_id_t::NO_SHARD)); ghobject_t temp_oid(hobject_t("tmp_oid", "", CEPH_NOSNAP, 0, 0, "")); ghobject_t oid(hobject_t("dest_oid", "", CEPH_NOSNAP, 0, 0, "")); @@ -2373,7 +2397,7 @@ TEST_P(StoreTest, MoveRename) { ObjectStore::Transaction t; t.create_collection(cid, 0); t.touch(cid, oid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } ASSERT_TRUE(store->exists(cid, oid)); @@ -2388,7 +2412,7 @@ TEST_P(StoreTest, 
MoveRename) { t.write(cid, temp_oid, 0, data.length(), data); t.setattr(cid, temp_oid, "attr", attr); t.omap_setkeys(cid, temp_oid, omap); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } ASSERT_TRUE(store->exists(cid, temp_oid)); @@ -2396,7 +2420,7 @@ TEST_P(StoreTest, MoveRename) { ObjectStore::Transaction t; t.remove(cid, oid); t.collection_move_rename(cid, temp_oid, cid, oid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } ASSERT_TRUE(store->exists(cid, oid)); @@ -2423,12 +2447,13 @@ TEST_P(StoreTest, MoveRename) { ObjectStore::Transaction t; t.remove(cid, oid); t.remove_collection(cid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } } TEST_P(StoreTest, BigRGWObjectName) { + ObjectStore::Sequencer osr("test"); store->set_allow_sharded_objects(); coll_t cid(spg_t(pg_t(0,12),shard_id_t::NO_SHARD)); ghobject_t oid( @@ -2454,14 +2479,14 @@ TEST_P(StoreTest, BigRGWObjectName) { t.collection_move_rename(cid, oidhead, cid, oid); t.touch(cid, oidhead); t.collection_move_rename(cid, oidhead, cid, oid2); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } { ObjectStore::Transaction t; t.remove(cid, oid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } @@ -2480,13 +2505,14 @@ TEST_P(StoreTest, BigRGWObjectName) { ObjectStore::Transaction t; t.remove(cid, oid2); t.remove_collection(cid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } } TEST_P(StoreTest, SetAllocHint) { + ObjectStore::Sequencer osr("test"); coll_t cid; ghobject_t hoid(hobject_t("test_hint", "", CEPH_NOSNAP, 0, 0, "")); int r; @@ -2494,31 +2520,31 @@ TEST_P(StoreTest, SetAllocHint) { ObjectStore::Transaction t; t.create_collection(cid, 0); t.touch(cid, hoid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); 
ASSERT_EQ(r, 0); } { ObjectStore::Transaction t; t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } { ObjectStore::Transaction t; t.remove(cid, hoid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } { ObjectStore::Transaction t; t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } { ObjectStore::Transaction t; t.remove_collection(cid); - r = store->apply_transaction(t); + r = store->apply_transaction(&osr, t); ASSERT_EQ(r, 0); } } diff --git a/src/test/objectstore/test_idempotent.cc b/src/test/objectstore/test_idempotent.cc index dc685e09c9664..098bc81c85f48 100644 --- a/src/test/objectstore/test_idempotent.cc +++ b/src/test/objectstore/test_idempotent.cc @@ -68,6 +68,7 @@ int main(int argc, char **argv) { boost::scoped_ptr db(_db); boost::scoped_ptr store(new FileStore(store_path, store_dev)); + ObjectStore::Sequencer osr(__func__); coll_t coll(spg_t(pg_t(0,12),shard_id_t::NO_SHARD)); if (start_new) { @@ -76,7 +77,7 @@ int main(int argc, char **argv) { ObjectStore::Transaction t; assert(!store->mount()); t.create_collection(coll, 0); - store->apply_transaction(t); + store->apply_transaction(&osr, t); } else { assert(!store->mount()); } diff --git a/src/test/objectstore_bench.cc b/src/test/objectstore_bench.cc index ae1f79fc4a611..d5e9f98d71461 100644 --- a/src/test/objectstore_bench.cc +++ b/src/test/objectstore_bench.cc @@ -223,9 +223,10 @@ int main(int argc, const char *argv[]) spg_t pg; const coll_t cid(pg); { + ObjectStore::Sequencer osr(__func__); ObjectStore::Transaction t; t.create_collection(cid, 0); - os->apply_transaction(t); + os->apply_transaction(&osr, t); } // create the objects @@ -237,17 +238,19 @@ int main(int argc, const char *argv[]) oss << "osbench-thread-" << i; oids.emplace_back(pg.make_temp_object(oss.str())); + 
ObjectStore::Sequencer osr(__func__); ObjectStore::Transaction t; t.touch(cid, oids[i]); - int r = os->apply_transaction(t); + int r = os->apply_transaction(&osr, t); assert(r == 0); } } else { oids.emplace_back(pg.make_temp_object("osbench")); + ObjectStore::Sequencer osr(__func__); ObjectStore::Transaction t; t.touch(cid, oids.back()); - int r = os->apply_transaction(t); + int r = os->apply_transaction(&osr, t); assert(r == 0); } @@ -276,10 +279,11 @@ int main(int argc, const char *argv[]) << iops << " iops" << dendl; // remove the objects + ObjectStore::Sequencer osr(__func__); ObjectStore::Transaction t; for (const auto &oid : oids) t.remove(cid, oid); - os->apply_transaction(t); + os->apply_transaction(&osr,t); os->umount(); return 0; diff --git a/src/test/streamtest.cc b/src/test/streamtest.cc index 775b087e8ecff..e19dbcea7be0a 100644 --- a/src/test/streamtest.cc +++ b/src/test/streamtest.cc @@ -143,9 +143,10 @@ int main(int argc, const char **argv) return -1; } + ObjectStore::Sequencer osr(__func__); ObjectStore::Transaction ft; ft.create_collection(coll_t(), 0); - fs->apply_transaction(ft); + fs->apply_transaction(&osr, ft); utime_t now = ceph_clock_now(g_ceph_context); utime_t start = now; diff --git a/src/test/test_trans.cc b/src/test/test_trans.cc index d482a9a9604e7..c374ed440a765 100644 --- a/src/test/test_trans.cc +++ b/src/test/test_trans.cc @@ -55,6 +55,7 @@ int main(int argc, const char **argv) return -1; } + ObjectStore::Sequencer osr(__func__); ObjectStore::Transaction t; char buf[1 << 20]; bufferlist bl; @@ -71,7 +72,7 @@ int main(int argc, const char **argv) dout(0) << "starting thread" << dendl; foo.create(); dout(0) << "starting op" << dendl; - fs->apply_transaction(t); + fs->apply_transaction(&osr, t); } diff --git a/src/test/xattr_bench.cc b/src/test/xattr_bench.cc index f2e89958415ad..e26e32f9ee709 100644 --- a/src/test/xattr_bench.cc +++ b/src/test/xattr_bench.cc @@ -90,6 +90,7 @@ uint64_t do_run(ObjectStore *store, int attrsize, int 
numattrs, Mutex lock("lock"); Cond cond; int in_flight = 0; + ObjectStore::Sequencer osr(__func__); ObjectStore::Transaction t; map, ObjectStore::Sequencer*> > collections; for (int i = 0; i < 3*THREADS; ++i) { @@ -105,7 +106,7 @@ uint64_t do_run(ObjectStore *store, int attrsize, int numattrs, } collections[coll] = make_pair(objects, new ObjectStore::Sequencer(coll.to_str())); } - store->apply_transaction(t); + store->apply_transaction(&osr, t); bufferlist bl; for (int i = 0; i < attrsize; ++i) { diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc index f00ca3da06814..78d218b89f217 100644 --- a/src/tools/ceph_objectstore_tool.cc +++ b/src/tools/ceph_objectstore_tool.cc @@ -396,6 +396,7 @@ void remove_coll(ObjectStore *store, const coll_t &coll) { spg_t pg; coll.is_pg_prefix(&pg); + ObjectStore::Sequencer osr(__func__); OSDriver driver( store, coll_t(), @@ -426,7 +427,7 @@ void remove_coll(ObjectStore *store, const coll_t &coll) t->remove(coll, *i); if (num >= 30) { - store->apply_transaction(*t); + store->apply_transaction(&osr, *t); delete t; t = new ObjectStore::Transaction; num = 0; @@ -434,7 +435,7 @@ void remove_coll(ObjectStore *store, const coll_t &coll) } } t->remove_collection(coll); - store->apply_transaction(*t); + store->apply_transaction(&osr, *t); out: delete t; } @@ -505,12 +506,13 @@ int initiate_new_remove_pg(ObjectStore *store, spg_t r_pgid) if (dry_run) return 0; ObjectStore::Transaction *rmt = new ObjectStore::Transaction; + ObjectStore::Sequencer osr(__func__); int r = mark_pg_for_removal(store, r_pgid, rmt); if (r < 0) { delete rmt; return r; } - store->apply_transaction(*rmt); + store->apply_transaction(&osr, *rmt); finish_remove_pgs(store); return r; } @@ -724,9 +726,10 @@ int set_inc_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl, bool force) { cout << "Creating a new epoch." 
<< std::endl; } ObjectStore::Transaction t; + ObjectStore::Sequencer osr(__func__); t.write(coll_t::meta(), inc_oid, 0, bl.length(), bl); t.truncate(coll_t::meta(), inc_oid, bl.length()); - int ret = store->apply_transaction(t); + int ret = store->apply_transaction(&osr, t); if (ret) { cerr << "Failed to set inc-osdmap (" << inc_oid << "): " << ret << std::endl; } else { @@ -767,10 +770,11 @@ int set_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl, bool force) { } cout << "Creating a new epoch." << std::endl; } + ObjectStore::Sequencer osr(__func__); ObjectStore::Transaction t; t.write(coll_t::meta(), full_oid, 0, bl.length(), bl); t.truncate(coll_t::meta(), full_oid, bl.length()); - int ret = store->apply_transaction(t); + int ret = store->apply_transaction(&osr, t); if (ret) { cerr << "Failed to set osdmap (" << full_oid << "): " << ret << std::endl; } else { @@ -928,6 +932,7 @@ int get_omap(ObjectStore *store, coll_t coll, ghobject_t hoid, int ObjectStoreTool::get_object(ObjectStore *store, coll_t coll, bufferlist &bl, OSDMap &curmap, bool *skipped_objects) { + ObjectStore::Sequencer osr(__func__); ObjectStore::Transaction tran; ObjectStore::Transaction *t = &tran; bufferlist::iterator ebliter = bl.begin(); @@ -1020,7 +1025,7 @@ int ObjectStoreTool::get_object(ObjectStore *store, coll_t coll, } } if (!dry_run) - store->apply_transaction(*t); + store->apply_transaction(&osr, *t); return 0; } @@ -1323,6 +1328,7 @@ int ObjectStoreTool::do_import(ObjectStore *store, OSDSuperblock& sb, } if (!dry_run) { + ObjectStore::Sequencer osr(__func__); ObjectStore::Transaction *t = new ObjectStore::Transaction; PG::_create(*t, pgid, pgid.get_split_bits(curmap.get_pg_pool(pgid.pool())->get_pg_num())); @@ -1333,7 +1339,7 @@ int ObjectStoreTool::do_import(ObjectStore *store, OSDSuperblock& sb, ::encode((char)1, values["_remove"]); t->omap_setkeys(coll, pgid.make_pgmeta_oid(), values); - store->apply_transaction(*t); + store->apply_transaction(&osr, *t); delete t; } @@ 
-1380,6 +1386,7 @@ int ObjectStoreTool::do_import(ObjectStore *store, OSDSuperblock& sb, return -EFAULT; } + ObjectStore::Sequencer osr(__func__); ObjectStore::Transaction t; if (!dry_run) { pg_log_t newlog, reject; @@ -1430,7 +1437,7 @@ int ObjectStoreTool::do_import(ObjectStore *store, OSDSuperblock& sb, set remove; remove.insert("_remove"); t.omap_rmkeys(coll, pgid.make_pgmeta_oid(), remove); - store->apply_transaction(t); + store->apply_transaction(&osr, t); } return 0; @@ -1468,6 +1475,7 @@ int do_remove_object(ObjectStore *store, coll_t coll, ghobject_t &ghobj) { spg_t pg; coll.is_pg_prefix(&pg); + ObjectStore::Sequencer osr(__func__); OSDriver driver( store, coll_t(), @@ -1494,7 +1502,7 @@ int do_remove_object(ObjectStore *store, coll_t coll, ghobject_t &ghobj) t->remove(coll, ghobj); - store->apply_transaction(*t); + store->apply_transaction(&osr, *t); delete t; return 0; } @@ -1586,6 +1594,7 @@ int do_get_bytes(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd) int do_set_bytes(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd) { + ObjectStore::Sequencer osr(__func__); ObjectStore::Transaction tran; ObjectStore::Transaction *t = &tran; @@ -1620,7 +1629,7 @@ int do_set_bytes(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd) } while(true); if (!dry_run) - store->apply_transaction(*t); + store->apply_transaction(&osr, *t); return 0; } @@ -1646,6 +1655,7 @@ int do_get_attr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key) int do_set_attr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key, int fd) { + ObjectStore::Sequencer osr(__func__); ObjectStore::Transaction tran; ObjectStore::Transaction *t = &tran; bufferlist bl; @@ -1661,12 +1671,13 @@ int do_set_attr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key, t->setattr(coll, ghobj, key, bl); - store->apply_transaction(*t); + store->apply_transaction(&osr, *t); return 0; } int do_rm_attr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string 
key) { + ObjectStore::Sequencer osr(__func__); ObjectStore::Transaction tran; ObjectStore::Transaction *t = &tran; @@ -1675,7 +1686,7 @@ int do_rm_attr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key) t->rmattr(coll, ghobj, key); - store->apply_transaction(*t); + store->apply_transaction(&osr, *t); return 0; } @@ -1712,6 +1723,7 @@ int do_get_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key) int do_set_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key, int fd) { + ObjectStore::Sequencer osr(__func__); ObjectStore::Transaction tran; ObjectStore::Transaction *t = &tran; map attrset; @@ -1730,12 +1742,13 @@ int do_set_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key, t->omap_setkeys(coll, ghobj, attrset); - store->apply_transaction(*t); + store->apply_transaction(&osr, *t); return 0; } int do_rm_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key) { + ObjectStore::Sequencer osr(__func__); ObjectStore::Transaction tran; ObjectStore::Transaction *t = &tran; set keys; @@ -1747,7 +1760,7 @@ int do_rm_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key) t->omap_rmkeys(coll, ghobj, keys); - store->apply_transaction(*t); + store->apply_transaction(&osr, *t); return 0; } @@ -1773,6 +1786,7 @@ int do_get_omaphdr(ObjectStore *store, coll_t coll, ghobject_t &ghobj) int do_set_omaphdr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd) { + ObjectStore::Sequencer osr(__func__); ObjectStore::Transaction tran; ObjectStore::Transaction *t = &tran; bufferlist hdrbl; @@ -1788,7 +1802,7 @@ int do_set_omaphdr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd) t->omap_setheader(coll, ghobj, hdrbl); - store->apply_transaction(*t); + store->apply_transaction(&osr, *t); return 0; } @@ -1804,9 +1818,10 @@ struct do_fix_lost : public action_on_object_t { oi.clear_flag(object_info_t::FLAG_LOST); bufferlist bl; ::encode(oi, bl); + ObjectStore::Sequencer osr("do_fix_lost"); 
ObjectStore::Transaction t; t.setattr(coll, ghobj, OI_ATTR, bl); - int r = store->apply_transaction(t); + int r = store->apply_transaction(&osr, t); if (r < 0) { cerr << "Error getting fixing attr on : " << make_pair(coll, ghobj) << ", " @@ -2149,6 +2164,8 @@ int main(int argc, char **argv) bool fs_sharded_objects = fs->get_allow_sharded_objects(); + ObjectStore::Sequencer osr(__func__); + vector ls; vector::iterator it; CompatSet supported; @@ -2356,7 +2373,7 @@ int main(int argc, char **argv) bl.clear(); ::encode(superblock, bl); t.write(coll_t::meta(), OSD_SUPERBLOCK_POBJECT, 0, bl.length(), bl); - ret = fs->apply_transaction(t); + ret = fs->apply_transaction(&osr, t); if (ret < 0) { cerr << "Error writing OSD superblock: " << cpp_strerror(ret) << std::endl; goto out; @@ -2826,7 +2843,7 @@ int main(int argc, char **argv) ret = write_info(*t, map_epoch, info, past_intervals); if (ret == 0) { - fs->apply_transaction(*t); + fs->apply_transaction(&osr, *t); cout << "Removal succeeded" << std::endl; } } else if (op == "mark-complete") { @@ -2852,7 +2869,7 @@ int main(int argc, char **argv) ret = write_info(*t, map_epoch, info, past_intervals); if (ret == 0) { - fs->apply_transaction(*t); + fs->apply_transaction(&osr, *t); cout << "Marking complete succeeded" << std::endl; } } else { From 7250fb18fbe8c1c7757f490c63246e3d75a98421 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Mon, 14 Sep 2015 16:53:01 +0800 Subject: [PATCH 620/654] os/OSD.cc cast osd_max_write_size to int64_t Otherwise overflow may occur and negative values generated. Signed-off-by: Xiaoxi Chen --- src/osd/OSD.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 46514f354c03c..f5c0683d64db7 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -8105,7 +8105,7 @@ void OSD::handle_op(OpRequestRef& op, OSDMapRef& osdmap) // too big? 
if (cct->_conf->osd_max_write_size && - m->get_data_len() > cct->_conf->osd_max_write_size << 20) { + m->get_data_len() > ((int64_t)g_conf->osd_max_write_size) << 20) { // journal can't hold commit! derr << "handle_op msg data len " << m->get_data_len() << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20) From 73d7bed94c5c14cd9d226ffa0ae170c79b8caca8 Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Tue, 15 Sep 2015 09:10:59 +0200 Subject: [PATCH 621/654] logrotate: logs are now owned by ceph:ceph Add the su directive to avoid this error during logrotate: error: skipping "/var/log/ceph/ceph-osd.0.log" because parent directory has insecure permissions (It's world writable or writable by group which is not "root") Set "su" directive in config file to tell logrotate which user/group should be used for rotation. Signed-off-by: Dan van der Ster --- src/logrotate.conf | 1 + src/rgw/logrotate.conf | 1 + 2 files changed, 2 insertions(+) diff --git a/src/logrotate.conf b/src/logrotate.conf index 50e7ee8867048..014ce4532b088 100644 --- a/src/logrotate.conf +++ b/src/logrotate.conf @@ -8,4 +8,5 @@ endscript missingok notifempty + su ceph ceph } diff --git a/src/rgw/logrotate.conf b/src/rgw/logrotate.conf index 65ae1132abd66..036b85bc3b8e0 100644 --- a/src/rgw/logrotate.conf +++ b/src/rgw/logrotate.conf @@ -8,4 +8,5 @@ endscript missingok notifempty + su ceph ceph } From 807a34cc0dbbb374e0b5e598497d73a8c0b81e34 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Tue, 15 Sep 2015 08:47:19 +0200 Subject: [PATCH 622/654] common: osd_pg_epoch_persisted_max_stale < map_cache_size 1ff51a299d20dd73a95ba55fdec57498b8c71c13 reduced map_cache_size to 200, osd_pg_epoch_persisted_max_stale must be lowered because it does not make sense for it to be the equal or greater than map_cache_size. 
Signed-off-by: Loic Dachary --- src/common/config_opts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 22139ba264900..fd16972c30914 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -679,7 +679,7 @@ OPTION(osd_default_notify_timeout, OPT_U32, 30) // default notify timeout in sec OPTION(osd_kill_backfill_at, OPT_INT, 0) // Bounds how infrequently a new map epoch will be persisted for a pg -OPTION(osd_pg_epoch_persisted_max_stale, OPT_U32, 200) +OPTION(osd_pg_epoch_persisted_max_stale, OPT_U32, 150) // make this < map_cache_size! OPTION(osd_min_pg_log_entries, OPT_U32, 3000) // number of entries to keep in the pg log when trimming it OPTION(osd_max_pg_log_entries, OPT_U32, 10000) // max entries, say when degraded, before we trim From 4e8242a0494d1e60705a03ed9c7b130accdcf335 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 15 Sep 2015 16:31:23 +0800 Subject: [PATCH 623/654] mds: reset MDSRank heartbeat timeout even when MDS is laggy Fixes: #13067 Signed-off-by: Yan, Zheng --- src/mds/MDSDaemon.cc | 8 -------- src/mds/MDSRank.cc | 8 ++++++++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc index 7d23722792c5a..d15040ea6df58 100644 --- a/src/mds/MDSDaemon.cc +++ b/src/mds/MDSDaemon.cc @@ -517,18 +517,10 @@ void MDSDaemon::tick() // reschedule reset_tick(); - if (beacon.is_laggy()) { - dout(5) << "tick bailing out since we seem laggy" << dendl; - return; - } - // Call through to subsystems' tick functions if (mds_rank) { mds_rank->tick(); } - - // Expose ourselves to Beacon to update health indicators - beacon.notify_health(mds_rank); } /* This function DOES put the passed message before returning*/ diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 715bf832ec030..8c04b2f3fde15 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -150,6 +150,11 @@ void MDSRankDispatcher::tick() { 
heartbeat_reset(); + if (beacon.is_laggy()) { + dout(5) << "tick bailing out since we seem laggy" << dendl; + return; + } + check_ops_in_flight(); // Wake up thread in case we use to be laggy and have waiting_for_nolaggy @@ -194,6 +199,9 @@ void MDSRankDispatcher::tick() if (snapserver) snapserver->check_osd_map(false); } + + // Expose ourselves to Beacon to update health indicators + beacon.notify_health(this); } void MDSRankDispatcher::shutdown() From 26bcb362f46daaef6f564e6d10c0f2439098ed91 Mon Sep 17 00:00:00 2001 From: Brad Hubbard Date: Mon, 14 Sep 2015 16:00:43 +1000 Subject: [PATCH 624/654] Examples: hello_world.cc, content displayed after read is not null terminated. Copy contents of bufferlist to a string before sending to stdout since a bufferlist is not null terminated. Fixes: #7822 Signed-off-by: Brad Hubbard --- examples/librados/hello_world.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/librados/hello_world.cc b/examples/librados/hello_world.cc index cb5476ffc8427..9d3713495a3bd 100644 --- a/examples/librados/hello_world.cc +++ b/examples/librados/hello_world.cc @@ -173,7 +173,9 @@ int main(int argc, const char **argv) } else { std::cout << "we read our object " << object_name << ", and got back " << ret << " bytes with contents\n" - << read_buf.c_str() << std::endl; + std::string read_string; + read_buf.copy(0, ret, read_string); + std::cout << read_string << std::endl; } } From bbe27dc6d7e53055db68c51b97c0e55e468b9d5c Mon Sep 17 00:00:00 2001 From: Li Peng Date: Tue, 15 Sep 2015 20:06:39 +0800 Subject: [PATCH 625/654] doc: Fixes a wrong directory name. 
Signed-off-by: Li Peng --- doc/start/documenting-ceph.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/start/documenting-ceph.rst b/doc/start/documenting-ceph.rst index d57192e8d1c2c..4fac6f0433127 100644 --- a/doc/start/documenting-ceph.rst +++ b/doc/start/documenting-ceph.rst @@ -8,7 +8,7 @@ increasing number of people are updating the documentation and adding new information. Even small contributions like fixing spelling errors or clarifying instructions will help the Ceph project immensely. -The Ceph documentation source resides in the ``ceph/docs`` directory of the Ceph +The Ceph documentation source resides in the ``ceph/doc`` directory of the Ceph repository, and Python Sphinx renders the source into HTML and manpages. The http://ceph.com/docs link currenly displays the ``master`` branch by default, but you may view documentation for older branches (e.g., ``argonaut``) or future @@ -611,4 +611,4 @@ improves the readability of the document in a command line interface. .. _Showing code examples: http://sphinx-doc.org/markup/code.html .. _paragraph level markup: http://sphinx-doc.org/markup/para.html .. _topic directive: http://docutils.sourceforge.net/docs/ref/rst/directives.html#topic -.. _John Wilkins: mailto:jowilkin@redhat.com \ No newline at end of file +.. _John Wilkins: mailto:jowilkin@redhat.com From 71f6529573fd413da946449b44b6da29327c6231 Mon Sep 17 00:00:00 2001 From: Li Peng Date: Tue, 15 Sep 2015 20:55:08 +0800 Subject: [PATCH 626/654] doc: fix a broken hyperlink Signed-off-by: Li Peng --- doc/rados/deployment/preflight-checklist.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/rados/deployment/preflight-checklist.rst b/doc/rados/deployment/preflight-checklist.rst index 80b8f0318f9e8..97b5dc086cad8 100644 --- a/doc/rados/deployment/preflight-checklist.rst +++ b/doc/rados/deployment/preflight-checklist.rst @@ -106,4 +106,4 @@ connections, traffic forwarding, etc. to allow what you need). 
Once you have completed this pre-flight checklist, you are ready to begin using ``ceph-deploy``. -.. _OS Recommendations: ../../../install/os-recommendations \ No newline at end of file +.. _OS Recommendations: ../../../start/os-recommendations From 95462525840b90d3ec23ed96f8b710969bb81b80 Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Tue, 15 Sep 2015 22:08:10 +0800 Subject: [PATCH 627/654] mds: cast numbers for mds health to string when print MDSHealthMetric.metadata is a map, so the num in `m.metadata["foo"] = num` is casted to char when being printed. they should be coverted into std::string instead. Fixes: #13090 Signed-off-by: Kefu Chai --- src/mds/Beacon.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc index 1312758435f0b..93c54c734f9ea 100644 --- a/src/mds/Beacon.cc +++ b/src/mds/Beacon.cc @@ -321,8 +321,8 @@ void Beacon::notify_health(MDSRank const *mds) << "/" << g_conf->mds_log_max_segments << ")"; MDSHealthMetric m(MDS_HEALTH_TRIM, HEALTH_WARN, oss.str()); - m.metadata["num_segments"] = mds->mdlog->get_num_segments(); - m.metadata["max_segments"] = g_conf->mds_log_max_segments; + m.metadata["num_segments"] = stringify(mds->mdlog->get_num_segments()); + m.metadata["max_segments"] = stringify(g_conf->mds_log_max_segments); health.metrics.push_back(m); } } @@ -361,7 +361,7 @@ void Beacon::notify_health(MDSRank const *mds) oss << "Many clients (" << late_cap_metrics.size() << ") failing to respond to capability release"; MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE_MANY, HEALTH_WARN, oss.str()); - m.metadata["client_count"] = late_cap_metrics.size(); + m.metadata["client_count"] = stringify(late_cap_metrics.size()); health.metrics.push_back(m); late_cap_metrics.clear(); } @@ -390,7 +390,7 @@ void Beacon::notify_health(MDSRank const *mds) std::ostringstream oss; oss << "Client " << session->get_human_name() << " failing to respond to cache pressure"; MDSHealthMetric 
m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str()); - m.metadata["client_id"] = session->info.inst.name.num(); + m.metadata["client_id"] = stringify(session->info.inst.name.num()); late_recall_metrics.push_back(m); } else { dout(20) << " within timeout " << session->recalled_at << " vs. " << cutoff << dendl; @@ -401,7 +401,7 @@ void Beacon::notify_health(MDSRank const *mds) std::ostringstream oss; oss << "Client " << session->get_human_name() << " failing to advance its oldest_client_tid"; MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID, HEALTH_WARN, oss.str()); - m.metadata["client_id"] = session->info.inst.name.num(); + m.metadata["client_id"] = stringify(session->info.inst.name.num()); large_completed_requests_metrics.push_back(m); } } @@ -413,7 +413,7 @@ void Beacon::notify_health(MDSRank const *mds) oss << "Many clients (" << late_recall_metrics.size() << ") failing to respond to cache pressure"; MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL_MANY, HEALTH_WARN, oss.str()); - m.metadata["client_count"] = late_recall_metrics.size(); + m.metadata["client_count"] = stringify(late_recall_metrics.size()); health.metrics.push_back(m); late_recall_metrics.clear(); } @@ -425,7 +425,7 @@ void Beacon::notify_health(MDSRank const *mds) oss << "Many clients (" << large_completed_requests_metrics.size() << ") failing to advance their oldest_client_tid"; MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID_MANY, HEALTH_WARN, oss.str()); - m.metadata["client_count"] = large_completed_requests_metrics.size(); + m.metadata["client_count"] = stringify(large_completed_requests_metrics.size()); health.metrics.push_back(m); large_completed_requests_metrics.clear(); } From 89cc479e93f5887ca7537f60a5953f332580bb88 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 15 Sep 2015 12:28:35 -0400 Subject: [PATCH 628/654] mon: do not leak messages on shutdown Signed-off-by: Sage Weil --- src/mon/Monitor.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc 
index d886272534a69..19f6580a7c3f2 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -3410,6 +3410,7 @@ void Monitor::waitlist_or_zap_client(MonOpRequestRef op) void Monitor::_ms_dispatch(Message *m) { if (is_shutdown()) { + m->put(); return; } From 0b309e97c92b658ec7cd5febaabdcd44cc79d742 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 15 Sep 2015 12:32:02 -0400 Subject: [PATCH 629/654] mon: fix MonSession operator<< Print the object, not its pointer. Make output unambiguously terminated. Signed-off-by: Sage Weil --- src/mon/Monitor.cc | 2 +- src/mon/Session.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 19f6580a7c3f2..87042f62561c0 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -3466,7 +3466,7 @@ void Monitor::dispatch(MonOpRequestRef op) s = session_map.new_session(m->get_source_inst(), m->get_connection().get()); assert(s); m->get_connection()->set_priv(s->get()); - dout(10) << "ms_dispatch new session " << s << " for " << s->inst << dendl; + dout(10) << "ms_dispatch new session " << s << " " << *s << dendl; op->set_session(s); logger->set(l_mon_num_sessions, session_map.get_size()); diff --git a/src/mon/Session.h b/src/mon/Session.h index a3260210fe786..ff8073027908b 100644 --- a/src/mon/Session.h +++ b/src/mon/Session.h @@ -204,11 +204,11 @@ struct MonSessionMap { } }; -inline ostream& operator<<(ostream& out, const MonSession *s) +inline ostream& operator<<(ostream& out, const MonSession& s) { - out << "MonSession: " << s->inst << " is " - << (s->closed ? "closed" : "open"); - out << s->caps; + out << "MonSession(" << s.inst << " is " + << (s.closed ? 
"closed" : "open"); + out << s.caps << ")"; return out; } From 137eb7e3759c9fa1700582b86ab3c0bc0158a957 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 15 Sep 2015 12:35:46 -0400 Subject: [PATCH 630/654] mon/Monitor: fix MonSession Leak dispatch() does not need to keep a ref; we hold one indirectly via the MonOpRequest. Signed-off-by: Sage Weil --- src/mon/Monitor.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 87042f62561c0..de5d60e07ccd8 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -3487,6 +3487,7 @@ void Monitor::dispatch(MonOpRequestRef op) } if (reuse_caps) s->caps = caps; + s->put(); } else { dout(20) << "ms_dispatch existing session " << s << " for " << s->inst << dendl; } From 03b8ed3b6c503d71049e42544fa7e5c71e68c98d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 15 Sep 2015 12:46:25 -0400 Subject: [PATCH 631/654] msg/simple: discard local queued messages on shutdown Otherwise these generate leak detection noise. 
Signed-off-by: Sage Weil --- src/msg/simple/DispatchQueue.cc | 11 +++++++++++ src/msg/simple/DispatchQueue.h | 1 + src/msg/simple/SimpleMessenger.cc | 3 ++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/msg/simple/DispatchQueue.cc b/src/msg/simple/DispatchQueue.cc index c47ee72b81848..500239f29ab1b 100644 --- a/src/msg/simple/DispatchQueue.cc +++ b/src/msg/simple/DispatchQueue.cc @@ -227,6 +227,17 @@ void DispatchQueue::wait() dispatch_thread.join(); } +void DispatchQueue::discard_local() +{ + for (list >::iterator p = local_messages.begin(); + p != local_messages.end(); + ++p) { + ldout(cct,20) << __func__ << " " << p->first << dendl; + p->first->put(); + } + local_messages.clear(); +} + void DispatchQueue::shutdown() { // stop my local delivery thread diff --git a/src/msg/simple/DispatchQueue.h b/src/msg/simple/DispatchQueue.h index 606b850240792..d379f55881cf7 100644 --- a/src/msg/simple/DispatchQueue.h +++ b/src/msg/simple/DispatchQueue.h @@ -180,6 +180,7 @@ class DispatchQueue { void fast_preprocess(Message *m); void enqueue(Message *m, int priority, uint64_t id); void discard_queue(uint64_t id); + void discard_local(); uint64_t get_id() { Mutex::Locker l(lock); return next_pipe_id++; diff --git a/src/msg/simple/SimpleMessenger.cc b/src/msg/simple/SimpleMessenger.cc index fc4b48b213c95..fdb7278292d48 100644 --- a/src/msg/simple/SimpleMessenger.cc +++ b/src/msg/simple/SimpleMessenger.cc @@ -534,9 +534,10 @@ void SimpleMessenger::wait() } lock.Unlock(); - if(dispatch_queue.is_started()) { + if (dispatch_queue.is_started()) { ldout(cct,10) << "wait: waiting for dispatch queue" << dendl; dispatch_queue.wait(); + dispatch_queue.discard_local(); ldout(cct,10) << "wait: dispatch queue is stopped" << dendl; } From c842555ceeae3a18b1b9d2f4190d0b7b21514c75 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 15 Sep 2015 12:49:36 -0400 Subject: [PATCH 632/654] mon: debug refs on output replies Signed-off-by: Sage Weil --- src/mon/Monitor.cc | 3 ++- 1 
file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index de5d60e07ccd8..3dc10ef9f7eaf 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -3189,7 +3189,8 @@ void Monitor::send_reply(MonOpRequestRef op, Message *reply) Message *req = op->get_req(); ConnectionRef con = op->get_connection(); - dout(2) << __func__ << " " << op << " " << *reply << dendl; + reply->set_cct(g_ceph_context); + dout(2) << __func__ << " " << op << " " << reply << " " << *reply << dendl; if (!con) { dout(2) << "send_reply no connection, dropping reply " << *reply From 96486fd6f284ca3ab67f4f94631896d41c529e85 Mon Sep 17 00:00:00 2001 From: Ken Dreyer Date: Tue, 8 Sep 2015 10:41:02 -0600 Subject: [PATCH 633/654] packaging: move rbd-replay* to ceph-common The rbd-replay* utilities are useful for Ceph users with RBD clients. Currently the rbd-replay* utilities ship in the "ceph-test" package, and we intend this ceph-test package for Ceph developers and contributors, not normal users. Move the rbd-replay* utilities to "ceph-common". 
http://tracker.ceph.com/issues/12994 Fixes: #12994 Signed-off-by: Ken Dreyer --- ceph.spec.in | 16 ++++++++-------- debian/ceph-common.install | 2 ++ debian/ceph-test.install | 2 -- debian/control | 2 ++ 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/ceph.spec.in b/ceph.spec.in index c068af6a4268c..3303fc611fd9b 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -835,6 +835,11 @@ mkdir -p %{_localstatedir}/run/ceph/ %{_bindir}/ceph-crush-location %{_bindir}/rados %{_bindir}/rbd +%{_bindir}/rbd-replay +%{_bindir}/rbd-replay-many +%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} >= 1310 +%{_bindir}/rbd-replay-prep +%endif %{_bindir}/ceph-post-file %{_bindir}/ceph-brag %{_mandir}/man8/ceph-authtool.8* @@ -846,6 +851,9 @@ mkdir -p %{_localstatedir}/run/ceph/ %{_mandir}/man8/ceph.8* %{_mandir}/man8/rados.8* %{_mandir}/man8/rbd.8* +%{_mandir}/man8/rbd-replay.8* +%{_mandir}/man8/rbd-replay-many.8* +%{_mandir}/man8/rbd-replay-prep.8* %{_datadir}/ceph/known_hosts_drop.ceph.com %{_datadir}/ceph/id_dsa_drop.ceph.com %{_datadir}/ceph/id_dsa_drop.ceph.com.pub @@ -1096,14 +1104,6 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1 %{_bindir}/ceph-monstore-tool %{_bindir}/ceph-osdomap-tool %{_bindir}/ceph-kvstore-tool -%{_mandir}/man8/rbd-replay.8* -%{_mandir}/man8/rbd-replay-many.8* -%{_mandir}/man8/rbd-replay-prep.8* -%{_bindir}/rbd-replay -%{_bindir}/rbd-replay-many -%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} >= 1310 -%{_bindir}/rbd-replay-prep -%endif %dir %{_libdir}/ceph %{_libdir}/ceph/ceph-monstore-update-crush.sh diff --git a/debian/ceph-common.install b/debian/ceph-common.install index 4e21adff9c474..25bb8e2e571bd 100644 --- a/debian/ceph-common.install +++ b/debian/ceph-common.install @@ -9,6 +9,7 @@ usr/bin/ceph-syn usr/bin/ceph-crush-location usr/bin/rados usr/bin/rbd +usr/bin/rbd-replay* usr/bin/ceph-post-file usr/bin/ceph-brag usr/share/man/man8/ceph-authtool.8 @@ -20,6 +21,7 @@ usr/share/man/man8/ceph-post-file.8 
usr/share/man/man8/ceph.8 usr/share/man/man8/rados.8 usr/share/man/man8/rbd.8 +usr/share/man/man8/rbd-replay*.8 usr/share/ceph/known_hosts_drop.ceph.com usr/share/ceph/id_dsa_drop.ceph.com usr/share/ceph/id_dsa_drop.ceph.com.pub diff --git a/debian/ceph-test.install b/debian/ceph-test.install index 367cf4777ac37..bcd0b017477db 100644 --- a/debian/ceph-test.install +++ b/debian/ceph-test.install @@ -27,6 +27,4 @@ usr/bin/ceph-monstore-tool usr/bin/ceph-osdomap-tool usr/bin/ceph-kvstore-tool usr/share/java/libcephfs-test.jar -usr/bin/rbd-replay* -usr/share/man/man8/rbd-replay*.8 usr/lib/ceph/ceph-monstore-update-crush.sh diff --git a/debian/control b/debian/control index 363bb08d4a71a..64cb0d0440341 100644 --- a/debian/control +++ b/debian/control @@ -199,9 +199,11 @@ Depends: librbd1 (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends}, Conflicts: ceph-client-tools Replaces: ceph-client-tools, ceph (<< 9.0.0-943), + ceph-test (<< 9.0.3-1646), python-ceph (<< 0.92-1223), librbd1 (<< 0.92-1238) Breaks: ceph (<< 9.0.0-943), + ceph-test (<< 9.0.3-1646), python-ceph (<< 0.92-1223), librbd1 (<< 0.92-1238) Suggests: ceph, ceph-mds From c1b28591a2ba55abd644186938d440fc90743f15 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 15 Sep 2015 18:05:59 -0400 Subject: [PATCH 634/654] radosgw: log to /var/log/ceph instead of /var/log/radosgw This is simpler. 
Signed-off-by: Sage Weil --- ceph.spec.in | 11 +---------- debian/radosgw.dirs | 1 - debian/rules | 1 - doc/man/8/radosgw.rst | 4 ++-- selinux/ceph.fc | 1 - selinux/ceph_selinux.8 | 6 ++---- src/logrotate.conf | 2 +- src/rgw/Makefile.am | 1 - src/rgw/logrotate.conf | 12 ------------ src/rgw/rgw_main.cc | 1 - 10 files changed, 6 insertions(+), 34 deletions(-) delete mode 100644 src/rgw/logrotate.conf diff --git a/ceph.spec.in b/ceph.spec.in index c068af6a4268c..c3c06e7192d96 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -25,8 +25,7 @@ restorecon -R /etc/rc\.d/init\.d/ceph > /dev/null 2>&1; \ restorecon -R /etc/rc\.d/init\.d/radosgw > /dev/null 2>&1; \ restorecon -R /var/run/ceph > /dev/null 2>&1; \ restorecon -R /var/lib/ceph > /dev/null 2>&1; \ -restorecon -R /var/log/ceph > /dev/null 2>&1; \ -restorecon -R /var/log/radosgw > /dev/null 2>&1; +restorecon -R /var/log/ceph > /dev/null 2>&1; %endif %{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d} @@ -611,7 +610,6 @@ install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_sysconfdir}/sysconfig/ce %endif mkdir -p $RPM_BUILD_ROOT%{_sbindir} install -m 0644 -D src/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/ceph -install -m 0644 -D src/rgw/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/radosgw chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.ceph.conf chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.fetch_config @@ -651,7 +649,6 @@ mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/radosgw mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-osd mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-rgw -mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw %if 0%{?suse_version} # Fedora seems to have some problems with this macro, use it only on SUSE @@ -910,9 +907,7 @@ fi %{_bindir}/radosgw-object-expirer %{_mandir}/man8/radosgw.8* %{_mandir}/man8/radosgw-admin.8* -%config(noreplace) 
%{_sysconfdir}/logrotate.d/radosgw %config %{_sysconfdir}/bash_completion.d/radosgw-admin -%dir %{_localstatedir}/log/radosgw/ %dir %{_localstatedir}/lib/ceph/radosgw %if 0%{?_with_systemd} %else @@ -951,10 +946,6 @@ fi done fi %endif -# Package removal cleanup -if [ "$1" -eq "0" ] ; then - rm -rf /var/log/radosgw -fi ################################################################################# %if %{with ocf} diff --git a/debian/radosgw.dirs b/debian/radosgw.dirs index d202a8c300cf2..a2f1849122bc8 100644 --- a/debian/radosgw.dirs +++ b/debian/radosgw.dirs @@ -1,2 +1 @@ -var/log/radosgw var/lib/ceph/radosgw diff --git a/debian/rules b/debian/rules index 95fdfbd454524..69ad8c88ba1fc 100755 --- a/debian/rules +++ b/debian/rules @@ -54,7 +54,6 @@ build-stamp: configure-stamp cp src/init-ceph debian/ceph.init cp src/init-radosgw debian/radosgw.init cp src/logrotate.conf debian/ceph.logrotate - cp src/rgw/logrotate.conf debian/radosgw.logrotate touch $@ diff --git a/doc/man/8/radosgw.rst b/doc/man/8/radosgw.rst index 1f74dec6d0b7b..f57b34679d5c6 100644 --- a/doc/man/8/radosgw.rst +++ b/doc/man/8/radosgw.rst @@ -101,7 +101,7 @@ tcp and through unix domain socket: host = {hostname} keyring = /etc/ceph/ceph.client.radosgw.keyring rgw socket path = "" - log file = /var/log/radosgw/client.radosgw.gateway.log + log file = /var/log/ceph/client.radosgw.gateway.log rgw frontends = fastcgi socket_port=9000 socket_host=0.0.0.0 rgw print continue = false @@ -156,7 +156,7 @@ tcp and through unix domain socket: host = {hostname} keyring = /etc/ceph/ceph.client.radosgw.keyring rgw socket path = /var/run/ceph/ceph.radosgw.gateway.fastcgi.sock - log file = /var/log/radosgw/client.radosgw.gateway.log + log file = /var/log/ceph/client.radosgw.gateway.log rgw print continue = false #. 
Add the following content in the gateway configuration file: diff --git a/selinux/ceph.fc b/selinux/ceph.fc index 31926895c465b..6b8d06254e8f0 100644 --- a/selinux/ceph.fc +++ b/selinux/ceph.fc @@ -9,6 +9,5 @@ /var/lib/ceph(/.*)? gen_context(system_u:object_r:ceph_var_lib_t,s0) /var/log/ceph(/.*)? gen_context(system_u:object_r:ceph_log_t,s0) -/var/log/radosgw(/.*)? gen_context(system_u:object_r:ceph_log_t,s0) /var/run/ceph(/.*)? gen_context(system_u:object_r:ceph_var_run_t,s0) diff --git a/selinux/ceph_selinux.8 b/selinux/ceph_selinux.8 index 6e91a212725e0..a646374bd5506 100644 --- a/selinux/ceph_selinux.8 +++ b/selinux/ceph_selinux.8 @@ -170,8 +170,6 @@ The SELinux process type ceph_t can manage files labeled with the following file /var/log/ceph(/.*)? .br - /var/log/radosgw(/.*)? -.br .br .B ceph_var_lib_t @@ -321,7 +319,7 @@ Paths: .br .TP 5 Paths: -/var/log/ceph(/.*)?, /var/log/radosgw(/.*)? +/var/log/ceph(/.*)? .EX .PP @@ -369,4 +367,4 @@ This manual page was auto-generated using .SH "SEE ALSO" selinux(8), ceph(8), semanage(8), restorecon(8), chcon(1), sepolicy(8) -, setsebool(8) \ No newline at end of file +, setsebool(8) diff --git a/src/logrotate.conf b/src/logrotate.conf index 014ce4532b088..08ad4b4c10978 100644 --- a/src/logrotate.conf +++ b/src/logrotate.conf @@ -4,7 +4,7 @@ compress sharedscripts postrotate - killall -q -1 ceph-mon ceph-mds ceph-osd || true + killall -q -1 ceph-mon ceph-mds ceph-osd radosgw || true endscript missingok notifempty diff --git a/src/rgw/Makefile.am b/src/rgw/Makefile.am index 9e330a1cdb045..3a30156c29803 100644 --- a/src/rgw/Makefile.am +++ b/src/rgw/Makefile.am @@ -124,7 +124,6 @@ bin_DEBUGPROGRAMS += ceph_rgw_jsonparser noinst_HEADERS += \ - rgw/logrotate.conf \ rgw/rgw_acl.h \ rgw/rgw_acl_s3.h \ rgw/rgw_acl_swift.h \ diff --git a/src/rgw/logrotate.conf b/src/rgw/logrotate.conf deleted file mode 100644 index 036b85bc3b8e0..0000000000000 --- a/src/rgw/logrotate.conf +++ /dev/null @@ -1,12 +0,0 @@ -/var/log/radosgw/*.log { 
- rotate 7 - daily - compress - sharedscripts - postrotate - killall -q -1 radosgw || true - endscript - missingok - notifempty - su ceph ceph -} diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc index 7a9b32793b190..c6ba1c210fb9d 100644 --- a/src/rgw/rgw_main.cc +++ b/src/rgw/rgw_main.cc @@ -1028,7 +1028,6 @@ int main(int argc, const char **argv) vector def_args; def_args.push_back("--debug-rgw=1/5"); def_args.push_back("--keyring=$rgw_data/keyring"); - def_args.push_back("--log-file=/var/log/radosgw/$cluster-$name.log"); vector args; argv_to_vec(argc, argv, args); From 1f7a2dc5069e2154f0b5d74d5bdac133ff0a85ba Mon Sep 17 00:00:00 2001 From: weiqian Date: Wed, 16 Sep 2015 11:04:52 +0800 Subject: [PATCH 635/654] doc:Replaces 'osd host' with 'host' Signed-off-by: weiqian --- doc/rados/configuration/osd-config-ref.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/rados/configuration/osd-config-ref.rst b/doc/rados/configuration/osd-config-ref.rst index 0ac7d96b34921..5e6f61ffe8982 100644 --- a/doc/rados/configuration/osd-config-ref.rst +++ b/doc/rados/configuration/osd-config-ref.rst @@ -6,7 +6,7 @@ You can configure Ceph OSD Daemons in the Ceph configuration file, but Ceph OSD Daemons can use the default values and a very minimal configuration. A minimal -Ceph OSD Daemon configuration sets ``osd journal size`` and ``osd host``, and +Ceph OSD Daemon configuration sets ``osd journal size`` and ``host``, and uses default values for nearly everything else. Ceph OSD Daemons are numerically identified in incremental fashion, beginning @@ -19,7 +19,7 @@ with ``0`` using the following convention. :: In a configuration file, you may specify settings for all Ceph OSD Daemons in the cluster by adding configuration settings to the ``[osd]`` section of your configuration file. 
To add settings directly to a specific Ceph OSD Daemon -(e.g., ``osd host``), enter it in an OSD-specific section of your configuration +(e.g., ``host``), enter it in an OSD-specific section of your configuration file. For example: .. code-block:: ini @@ -28,10 +28,10 @@ file. For example: osd journal size = 1024 [osd.0] - osd host = osd-host-a + host = osd-host-a [osd.1] - osd host = osd-host-b + host = osd-host-b .. index:: OSD; config settings From d1505b5408c29073d72cecadd63f3badd31d7738 Mon Sep 17 00:00:00 2001 From: Li Peng Date: Wed, 16 Sep 2015 14:05:05 +0800 Subject: [PATCH 636/654] doc: delete wrong description of installing RPMs In CentOS/RHEL 6/7, gitk and git-gui are available in default repository actually. Signed-off-by: Li Peng --- doc/start/documenting-ceph.rst | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/doc/start/documenting-ceph.rst b/doc/start/documenting-ceph.rst index 4fac6f0433127..2d09db3ae3a0e 100644 --- a/doc/start/documenting-ceph.rst +++ b/doc/start/documenting-ceph.rst @@ -420,19 +420,10 @@ For Debian/Ubuntu, execute:: sudo apt-get install gitk git-gui -For Fedora, execute:: +For Fedora/CentOS/RHEL, execute:: sudo yum install gitk git-gui -In CentOS/RHEL7, ``gitk`` and ``git-gui`` are not available in default or -``epel`` repository. So, use http://rpmfind.net/ to find them. Then, download -them from a mirror and install them. 
For example:: - - wget ftp://rpmfind.net/linux/centos/7.0.1406/os/x86_64/Packages/gitk-1.8.3.1-4.el7.noarch.rpm - sudo yum install gitk-1.8.3.1-4.el7.noarch.rpm - wget ftp://rpmfind.net/linux/centos/7.0.1406/os/x86_64/Packages/git-gui-1.8.3.1-4.el7.noarch.rpm - sudo yum install git-gui-1.8.3.1-4.el7.noarch.rpm - Then, execute:: cd {git-ceph-repo-path} From d0ac68bf1785b330f3202d924c2203ace9393fe6 Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Wed, 16 Sep 2015 15:08:17 +0800 Subject: [PATCH 637/654] mon/PGMap: calc min_last_epoch_clean when decode Fixes: #13112 Signed-off-by: Kefu Chai --- src/mon/PGMap.cc | 2 +- src/test/mon/PGMap.cc | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 6fa3aaaea8fa4..28561b7168fde 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -374,7 +374,7 @@ void PGMap::calc_stats() redo_full_sets(); - calc_min_last_epoch_clean(); + min_last_epoch_clean = calc_min_last_epoch_clean(); } void PGMap::update_pg(pg_t pgid, bufferlist& bl) diff --git a/src/test/mon/PGMap.cc b/src/test/mon/PGMap.cc index 9f7a6b2d0b1fe..f13fa8936669c 100644 --- a/src/test/mon/PGMap.cc +++ b/src/test/mon/PGMap.cc @@ -81,7 +81,40 @@ TEST(pgmap, min_last_epoch_clean) } - +TEST(pgmap, calc_stats) +{ + bufferlist bl; + { + PGMap pg_map; + PGMap::Incremental inc; + osd_stat_t os; + pg_stat_t ps; + + ps.last_epoch_clean = 999; + inc.pg_stat_updates[pg_t(9,9)] = ps; + inc.version = 1; + inc.update_stat(0, 123, os); + pg_map.apply_incremental(g_ceph_context, inc); + ASSERT_EQ(123u, pg_map.get_min_last_epoch_clean()); + pg_map.encode(bl); + } + { + PGMap pg_map; + PGMap::Incremental inc; + osd_stat_t os; + pg_stat_t ps; + + ps.last_epoch_clean = 999; + inc.pg_stat_updates[pg_t(9,9)] = ps; + inc.version = 1; + inc.update_stat(0, 321, os); + pg_map.apply_incremental(g_ceph_context, inc); + ASSERT_EQ(321u, pg_map.get_min_last_epoch_clean()); + bufferlist::iterator p = bl.begin(); + 
::decode(pg_map, p); + ASSERT_EQ(123u, pg_map.get_min_last_epoch_clean()); + } +} int main(int argc, char **argv) { vector args; From 7182499ca33fa1fa8f16b7831a5fdb349dfa773e Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Thu, 17 Sep 2015 09:35:11 +0200 Subject: [PATCH 638/654] install-deps.sh: disable python3 Disable python3 support until https://bugs.launchpad.net/ubuntu/+source/python-coverage/+bug/1496715 is fixed. Nothing in the build process depends on python3 right now, there is no harm disabling it. http://tracker.ceph.com/issues/13136 Fixes: #13136 Signed-off-by: Loic Dachary --- install-deps.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/install-deps.sh b/install-deps.sh index 5ad41c69cd989..886678feb9d61 100755 --- a/install-deps.sh +++ b/install-deps.sh @@ -131,7 +131,7 @@ find . -name tox.ini | while read ini ; do cd $(dirname $ini) require=$(ls *requirements.txt 2>/dev/null | sed -e 's/^/-r /') if test "$require" && ! test -d wheelhouse ; then - for interpreter in python2.7 python3 ; do + for interpreter in python2.7 ; do # python3 type $interpreter > /dev/null 2>&1 || continue activate_virtualenv $top_srcdir $interpreter || exit 1 populate_wheelhouse "wheel -w $wip_wheelhouse" $require || exit 1 @@ -141,7 +141,7 @@ find . -name tox.ini | while read ini ; do ) done -for interpreter in python2.7 python3 ; do +for interpreter in python2.7 ; do # python3 rm -rf $top_srcdir/install-deps-$interpreter done rm -rf $XDG_CACHE_HOME From e017aab23c8f8d619fbeecfe55a7ab113f7f0fa6 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 17 Sep 2015 11:26:32 +0100 Subject: [PATCH 639/654] CMake: fix libcephfs shared lib generation Previously weren't generating versioned symlinks etc, so python bindings didn't find it. 
Signed-off-by: John Spray --- src/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 814228bf35c75..5a790fa7023ab 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -788,6 +788,10 @@ if(WITH_LIBCEPHFS) target_link_libraries(client osdc mds ${LIBEDIT_LIBS}) set(libcephfs_srcs libcephfs.cc) add_library(cephfs SHARED ${libcephfs_srcs}) +if(${ENABLE_SHARED}) + set_target_properties(cephfs PROPERTIES OUTPUT_NAME cephfs VERSION 1.0.0 + SOVERSION 1) +endif(${ENABLE_SHARED}) target_link_libraries(cephfs client global) install(TARGETS cephfs DESTINATION lib) install(DIRECTORY From 6a24d3198a5df10fd223996d53d19e063576f9b4 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 17 Sep 2015 13:46:30 +0100 Subject: [PATCH 640/654] libcephfs: fix calling init() then mount() Previously only ever called these separately, but it should be allowed for callers to use one after the other. Fixes: #13138 Signed-off-by: John Spray --- src/libcephfs.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/libcephfs.cc b/src/libcephfs.cc index 13d85c7e2814d..403d0dc11f127 100644 --- a/src/libcephfs.cc +++ b/src/libcephfs.cc @@ -108,9 +108,11 @@ struct ceph_mount_info if (mounted) return -EISCONN; - ret = init(); - if (ret != 0) { - return ret; + if (!inited) { + ret = init(); + if (ret != 0) { + return ret; + } } ret = client->mount(mount_root); From 85bece775cbf88859ca946085b302d8f7a3eb015 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 17 Sep 2015 10:47:26 -0400 Subject: [PATCH 641/654] new release key The previous release key, pub 4096R/460F3994 2015-09-15 uid Ceph.com (release key) may have been compromised. 
The new release key is pub 4096R/460F3994 2015-09-15 uid Ceph.com (release key) Signed-off-by: Sage Weil --- keys/{old_release.asc => old_release.1.asc} | 0 keys/old_release.2.possibly.compromised.asc | 31 ++++++++++++ keys/release.asc | 54 ++++++++++----------- 3 files changed, 57 insertions(+), 28 deletions(-) rename keys/{old_release.asc => old_release.1.asc} (100%) create mode 100644 keys/old_release.2.possibly.compromised.asc diff --git a/keys/old_release.asc b/keys/old_release.1.asc similarity index 100% rename from keys/old_release.asc rename to keys/old_release.1.asc diff --git a/keys/old_release.2.possibly.compromised.asc b/keys/old_release.2.possibly.compromised.asc new file mode 100644 index 0000000000000..8b2ab627c4b12 --- /dev/null +++ b/keys/old_release.2.possibly.compromised.asc @@ -0,0 +1,31 @@ +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v1.4.11 (GNU/Linux) + +mQINBE+5bugBEADP31ZaQNvhOOQxjDwL/VYDLhtaGq4Q74FCY23uSQAMboKwo4JB +Te2JTSwBwU/RAPuWTrlKaQBPS30VF5SJN9t16llmoBWqhtBVf/lhQonC/28dTB6D +KR7Ahiz4Nv2g9m1sLau86JblQuODo8vWHXxahYSLQSyyxIXnlE4K3c1k0S4feLqu +ZxFtc2cFrQ/bUX9zXg6PXjDVAfY2R+x1JKGkVO/iwP+cjS1tCbvzdKcnQJEXpBwd +yHvDBuF3IjuR9JgrBhb1ALqexhFKHzG1kHFfOZ3DLVohig68lfyjCepGgo0BPOyy +S3Yk0QMumEaj9zRJurg49zWemX05XiBGt8SeCFxNUjXGYDIzSQ30K8fXmyjB74CW +EUDUuTpTt7oZF9jKCjfKmQwvW4GgJ4J0FSwiorXPK27didjLJCnkTt43v0ZETMRW +aADtiKFHl7lICuRmeXbd+6VkVqmoOz7ialMHnZ2KrHlqTcTPMd4llC4ayi2qS6Qb +dIi1g9fa5YMS6I7yGxmW4AWwNy7SE8DsTja0aGFR9k432r+Vxtr52jrmP2vVexva +CVaQkdk2/KEY3MjCPngiZwoTcOONYvNMvQaPrUtRuatcWJOgWsQVedY/UBxk968n +JzfnNDngbcYDRnOD8wLWyBGyYbOdg1ucckLXFEtPVXoRER5JHMcYhyh+/QARAQAB +tCRDZXBoIFJlbGVhc2UgS2V5IDxzYWdlQG5ld2RyZWFtLm5ldD6JAjgEEwECACIF +Ak+5bugCGwMGCwkIBwMCBhUIAgkKCwQWAgMBAh4BAheAAAoJEH6/3V0X7TFtSjEP +/A2pazEPwXrlQAHAjcXaFcPguKnXFrXRfbLpM9aZPR5gxH8mWl9RhEW/nL5pBf6A +Tx7lQ4F/h9bDlf4/bejuxUflkrJEPVWkyPf3hvImjSBs+LBTk4OkpUJwYd9AynsG +551Q0+6qxFfRVLCR6rLPHbMquXsKHROsSumEGUNrsMVC87gvtXEe/AOLUuRLEbjU 
+QqGKP2+mvliizU844a11B/bXViXhkNZw66ESAuqOw0dVPTo6aPLhuSDDrGEHQNTz +BsUseiUq795DqTE/5sL3lbTPrT1hKoIJFixYvaYBdygDgovsAi33nPn8UPitS5aD +zGJ/ByDdnI4QW15NN1diMp+BuvOCWLpMaxVQNflARlxxtfIfnvaKjgccr1YOyT91 +5tlbdr0y05r1uYZjYU5/4llilypUgzzQB1jeetr06fOpVvswAAWQJiS5JJU+V84W +r4sIBhZzGw1uvqNxIBWtk85W1ya7CmisRO7PZYW5lsLxZ48BxZhr45ar6/iDYreT +OOeP1f9GoJW0X+FAocNc/pobY02MhB/BXV1LRM3lY+yOK3sskspnMihMqP7tSfop +iJRtfXMLNdRRJFVZ5VSr1MCDK5RPQaqVsuvdtVqOJr1RwAQPjjzisOh+NYmvabkd +cVxjSV5DX0fMODr2l7cAXxJjZsAs6AlnQOGPg/NXKdkZiEYEEBECAAYFAk+5cEAA +CgkQ2kQg7SiJlcjJIACgsGpIw9ShLBciO3Y349ja7ILjC8cAnRrqoIpFxUrSIJF/ +8+w98auNwA18 +=uX7x +-----END PGP PUBLIC KEY BLOCK----- diff --git a/keys/release.asc b/keys/release.asc index 8b2ab627c4b12..d2961c52e7e79 100644 --- a/keys/release.asc +++ b/keys/release.asc @@ -1,31 +1,29 @@ -----BEGIN PGP PUBLIC KEY BLOCK----- -Version: GnuPG v1.4.11 (GNU/Linux) +Version: GnuPG v1 -mQINBE+5bugBEADP31ZaQNvhOOQxjDwL/VYDLhtaGq4Q74FCY23uSQAMboKwo4JB -Te2JTSwBwU/RAPuWTrlKaQBPS30VF5SJN9t16llmoBWqhtBVf/lhQonC/28dTB6D -KR7Ahiz4Nv2g9m1sLau86JblQuODo8vWHXxahYSLQSyyxIXnlE4K3c1k0S4feLqu -ZxFtc2cFrQ/bUX9zXg6PXjDVAfY2R+x1JKGkVO/iwP+cjS1tCbvzdKcnQJEXpBwd -yHvDBuF3IjuR9JgrBhb1ALqexhFKHzG1kHFfOZ3DLVohig68lfyjCepGgo0BPOyy -S3Yk0QMumEaj9zRJurg49zWemX05XiBGt8SeCFxNUjXGYDIzSQ30K8fXmyjB74CW -EUDUuTpTt7oZF9jKCjfKmQwvW4GgJ4J0FSwiorXPK27didjLJCnkTt43v0ZETMRW -aADtiKFHl7lICuRmeXbd+6VkVqmoOz7ialMHnZ2KrHlqTcTPMd4llC4ayi2qS6Qb -dIi1g9fa5YMS6I7yGxmW4AWwNy7SE8DsTja0aGFR9k432r+Vxtr52jrmP2vVexva -CVaQkdk2/KEY3MjCPngiZwoTcOONYvNMvQaPrUtRuatcWJOgWsQVedY/UBxk968n -JzfnNDngbcYDRnOD8wLWyBGyYbOdg1ucckLXFEtPVXoRER5JHMcYhyh+/QARAQAB -tCRDZXBoIFJlbGVhc2UgS2V5IDxzYWdlQG5ld2RyZWFtLm5ldD6JAjgEEwECACIF -Ak+5bugCGwMGCwkIBwMCBhUIAgkKCwQWAgMBAh4BAheAAAoJEH6/3V0X7TFtSjEP -/A2pazEPwXrlQAHAjcXaFcPguKnXFrXRfbLpM9aZPR5gxH8mWl9RhEW/nL5pBf6A -Tx7lQ4F/h9bDlf4/bejuxUflkrJEPVWkyPf3hvImjSBs+LBTk4OkpUJwYd9AynsG -551Q0+6qxFfRVLCR6rLPHbMquXsKHROsSumEGUNrsMVC87gvtXEe/AOLUuRLEbjU 
-QqGKP2+mvliizU844a11B/bXViXhkNZw66ESAuqOw0dVPTo6aPLhuSDDrGEHQNTz -BsUseiUq795DqTE/5sL3lbTPrT1hKoIJFixYvaYBdygDgovsAi33nPn8UPitS5aD -zGJ/ByDdnI4QW15NN1diMp+BuvOCWLpMaxVQNflARlxxtfIfnvaKjgccr1YOyT91 -5tlbdr0y05r1uYZjYU5/4llilypUgzzQB1jeetr06fOpVvswAAWQJiS5JJU+V84W -r4sIBhZzGw1uvqNxIBWtk85W1ya7CmisRO7PZYW5lsLxZ48BxZhr45ar6/iDYreT -OOeP1f9GoJW0X+FAocNc/pobY02MhB/BXV1LRM3lY+yOK3sskspnMihMqP7tSfop -iJRtfXMLNdRRJFVZ5VSr1MCDK5RPQaqVsuvdtVqOJr1RwAQPjjzisOh+NYmvabkd -cVxjSV5DX0fMODr2l7cAXxJjZsAs6AlnQOGPg/NXKdkZiEYEEBECAAYFAk+5cEAA -CgkQ2kQg7SiJlcjJIACgsGpIw9ShLBciO3Y349ja7ILjC8cAnRrqoIpFxUrSIJF/ -8+w98auNwA18 -=uX7x +mQINBFX4hgkBEADLqn6O+UFp+ZuwccNldwvh5PzEwKUPlXKPLjQfXlQRig1flpCH +E0HJ5wgGlCtYd3Ol9f9+qU24kDNzfbs5bud58BeE7zFaZ4s0JMOMuVm7p8JhsvkU +C/Lo/7NFh25e4kgJpjvnwua7c2YrA44ggRb1QT19ueOZLK5wCQ1mR+0GdrcHRCLr +7Sdw1d7aLxMT+5nvqfzsmbDullsWOD6RnMdcqhOxZZvpay8OeuK+yb8FVQ4sOIzB +FiNi5cNOFFHg+8dZQoDrK3BpwNxYdGHsYIwU9u6DWWqXybBnB9jd2pve9PlzQUbO +eHEa4Z+jPqxY829f4ldaql7ig8e6BaInTfs2wPnHJ+606g2UH86QUmrVAjVzlLCm +nqoGymoAPGA4ObHu9X3kO8viMBId9FzooVqR8a9En7ZE0Dm9O7puzXR7A1f5sHoz +JdYHnr32I+B8iOixhDUtxIY4GA8biGATNaPd8XR2Ca1hPuZRVuIiGG9HDqUEtXhV +fY5qjTjaThIVKtYgEkWMT+Wet3DPPiWT3ftNOE907e6EWEBCHgsEuuZnAbku1GgD +LBH4/a/yo9bNvGZKRaTUM/1TXhM5XgVKjd07B4cChgKypAVHvef3HKfCG2U/DkyA +LjteHt/V807MtSlQyYaXUTGtDCrQPSlMK5TjmqUnDwy6Qdq8dtWN3DtBWQARAQAB +tCpDZXBoLmNvbSAocmVsZWFzZSBrZXkpIDxzZWN1cml0eUBjZXBoLmNvbT6JAjgE +EwECACIFAlX4hgkCGwMGCwkIBwMCBhUIAgkKCwQWAgMBAh4BAheAAAoJEOhKwsBG +DzmUXdIQAI8YPcZMBWdv489q8CzxlfRIRZ3Gv/G/8CH+EOExcmkVZ89mVHngCdAP +DOYCl8twWXC1lwJuLDBtkUOHXNuR5+Jcl5zFOUyldq1Hv8u03vjnGT7lLJkJoqpG +l9QD8nBqRvBU7EM+CU7kP8+09b+088pULil+8x46PwgXkvOQwfVKSOr740Q4J4nm +/nUOyTNtToYntmt2fAVWDTIuyPpAqA6jcqSOC7Xoz9cYxkVWnYMLBUySXmSS0uxl +3p+wK0lMG0my/gb+alke5PAQjcE5dtXYzCn+8Lj0uSfCk8Gy0ZOK2oiUjaCGYN6D +u72qDRFBnR3jaoFqi03bGBIMnglGuAPyBZiI7LJgzuT9xumjKTJW3kN4YJxMNYu1 +FzmIyFZpyvZ7930vB2UpCOiIaRdZiX4Z6ZN2frD3a/vBxBNqiNh/BO+Dex+PDfI4 
+TqwF8zlcjt4XZ2teQ8nNMR/D8oiYTUW8hwR4laEmDy7ASxe0p5aijmUApWq5UTsF ++s/QbwugccU0iR5orksM5u9MZH4J/mFGKzOltfGXNLYI6D5Mtwrnyi0BsF5eY0u6 +vkdivtdqrq2DXY+ftuqLOQ7b+t1RctbcMHGPptlxFuN9ufP5TiTWSpfqDwmHCLsT +k2vFiMwcHdLpQ1IH8ORVRgPPsiBnBOJ/kIiXG2SxPUTjjEGOVgeA +=/Tod -----END PGP PUBLIC KEY BLOCK----- From 3971274b342b00d76c2e19211becdc3ee81cf1c1 Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Fri, 18 Sep 2015 17:05:09 +0800 Subject: [PATCH 642/654] mon: return size_t from MonitorDBStore::Transaction::size() Signed-off-by: Kefu Chai --- src/mon/MonitorDBStore.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h index 1a6f419e515a3..08f3233c79f5c 100644 --- a/src/mon/MonitorDBStore.h +++ b/src/mon/MonitorDBStore.h @@ -191,7 +191,7 @@ class MonitorDBStore return (size() == 0); } - bool size() { + size_t size() const { return ops.size(); } uint64_t get_keys() const { From aa238e5ed50f44a94caf84567267e4f6be8732a2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 18 Sep 2015 09:40:13 -0400 Subject: [PATCH 643/654] crush/CrushTester: allow testing by ruleset Signed-off-by: Sage Weil --- src/crush/CrushTester.cc | 14 +++++++++++++- src/crush/CrushTester.h | 9 ++++++++- src/tools/crushtool.cc | 8 +++++++- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/src/crush/CrushTester.cc b/src/crush/CrushTester.cc index 209d03a9f1019..086383385b120 100644 --- a/src/crush/CrushTester.cc +++ b/src/crush/CrushTester.cc @@ -355,7 +355,9 @@ void CrushTester::write_integer_indexed_scalar_data_string(vector &dst, dst.push_back( data_buffer.str() ); } -int CrushTester::test_with_crushtool(const char *crushtool_cmd, int max_id, int timeout) +int CrushTester::test_with_crushtool(const char *crushtool_cmd, + int max_id, int timeout, + int ruleset) { SubProcessTimed crushtool(crushtool_cmd, true, false, true, timeout); string opt_max_id = boost::lexical_cast(max_id); @@ -365,6 +367,12 @@ int 
CrushTester::test_with_crushtool(const char *crushtool_cmd, int max_id, int "--min-x", "1", "--max-x", "50", NULL); + if (ruleset >= 0) { + crushtool.add_cmd_args( + "--ruleset", + stringify(ruleset).c_str(), + NULL); + } int ret = crushtool.spawn(); if (ret != 0) { err << "failed run crushtool: " << crushtool.err(); @@ -491,6 +499,10 @@ int CrushTester::test() err << "rule " << r << " dne" << std::endl; continue; } + if (ruleset >= 0 && + crush.get_rule_mask_ruleset(r) != ruleset) { + continue; + } int minr = min_rep, maxr = max_rep; if (min_rep < 0 || max_rep < 0) { minr = crush.get_rule_mask_min_size(r); diff --git a/src/crush/CrushTester.h b/src/crush/CrushTester.h index ed14761462ac9..2f1a2c60e7537 100644 --- a/src/crush/CrushTester.h +++ b/src/crush/CrushTester.h @@ -15,6 +15,7 @@ class CrushTester { map device_weight; int min_rule, max_rule; + int ruleset; int min_x, max_x; int min_rep, max_rep; @@ -168,6 +169,7 @@ class CrushTester { CrushTester(CrushWrapper& c, ostream& eo) : crush(c), err(eo), min_rule(-1), max_rule(-1), + ruleset(-1), min_x(-1), max_x(-1), min_rep(-1), max_rep(-1), num_batches(1), @@ -333,6 +335,10 @@ class CrushTester { min_rule = max_rule = rule; } + void set_ruleset(int rs) { + ruleset = rs; + } + /** * check if any bucket/nodes is referencing an unknown name or type * @param max_id rejects any non-bucket items with id less than this number, @@ -344,7 +350,8 @@ class CrushTester { int test(); int test_with_crushtool(const char *crushtool_cmd = "crushtool", int max_id = -1, - int timeout = 0); + int timeout = 0, + int ruleset = -1); }; #endif diff --git a/src/tools/crushtool.cc b/src/tools/crushtool.cc index c7ed691062aee..532ffe1aa24b6 100644 --- a/src/tools/crushtool.cc +++ b/src/tools/crushtool.cc @@ -170,7 +170,7 @@ void usage() cout << " show location for given device id\n"; cout << " -i mapfn --test test a range of inputs on the map\n"; cout << " [--min-x x] [--max-x x] [--x x]\n"; - cout << " [--min-rule r] [--max-rule r] 
[--rule r]\n"; + cout << " [--min-rule r] [--max-rule r] [--rule r] [--ruleset rs]\n"; cout << " [--num-rep n]\n"; cout << " [--batches b] split the CRUSH mapping into b > 1 rounds\n"; cout << " [--weight|-w devno weight]\n"; @@ -465,6 +465,12 @@ int main(int argc, const char **argv) exit(EXIT_FAILURE); } tester.set_rule(x); + } else if (ceph_argparse_witharg(args, i, &x, err, "--ruleset", (char*)NULL)) { + if (!err.str().empty()) { + cerr << err.str() << std::endl; + exit(EXIT_FAILURE); + } + tester.set_ruleset(x); } else if (ceph_argparse_witharg(args, i, &x, err, "--batches", (char*)NULL)) { if (!err.str().empty()) { cerr << err.str() << std::endl; From 524b0bdcc45c2f4b95f2239c988e93250f337f3d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 18 Sep 2015 09:41:25 -0400 Subject: [PATCH 644/654] mon/OSDMonitor: only test crush ruleset for the newly created pool Otherwise, we test *all* crush rules.. which might be a lot, and which is a big waste of time and effort. Signed-off-by: Sage Weil --- src/mon/OSDMonitor.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 3b44b2d4b0c66..a7d18f8a4f3c1 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -4493,7 +4493,8 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, CrushTester tester(newcrush, *ss); r = tester.test_with_crushtool(g_conf->crushtool.c_str(), osdmap.get_max_osd(), - g_conf->mon_lease); + g_conf->mon_lease, + crush_ruleset); if (r) { dout(10) << " tester.test_with_crushtool returns " << r << dendl; return r; From 1b3090d50e5bd5ca3e6e396b23d2d9826896c718 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 18 Sep 2015 09:42:47 -0400 Subject: [PATCH 645/654] mon/OSDMonitor: fix crush injection error message Signed-off-by: Sage Weil --- src/mon/OSDMonitor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index a7d18f8a4f3c1..5e23d3de61726 100644 --- 
a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -5125,7 +5125,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, g_conf->mon_lease); if (r < 0) { derr << "error on crush map: " << ess.str() << dendl; - ss << "Failed to parse crushmap: " << ess.str(); + ss << "Failed crushmap test: " << ess.str(); err = r; goto reply; } From e44d1e07a71b24a1903fb6191559aa91eb3de612 Mon Sep 17 00:00:00 2001 From: Boris Ranto Date: Fri, 18 Sep 2015 17:00:30 +0200 Subject: [PATCH 646/654] ceph.spec.in: Fix up (/var)/run/ceph creation Fixes: #13059 Signed-off-by: Boris Ranto --- ceph.spec.in | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ceph.spec.in b/ceph.spec.in index 667ee11b84239..45f40b881de8c 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -637,7 +637,7 @@ mv $RPM_BUILD_ROOT/sbin/mount.fuse.ceph $RPM_BUILD_ROOT/usr/sbin/mount.fuse.ceph #set up placeholder directories mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/ceph -%if (! 0%{?suse_version}) || ( 0%{?suse_version} && (! 0%{?_with_systemd}) ) +%if ! 0%{?_with_systemd} mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/run/ceph %endif mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/ceph @@ -673,13 +673,13 @@ rm -rf $RPM_BUILD_ROOT %post /sbin/ldconfig %if 0%{?_with_systemd} + systemd-tmpfiles --create %if 0%{?suse_version} %service_add_post ceph.target %endif %else /sbin/chkconfig --add ceph %endif -mkdir -p %{_localstatedir}/run/ceph/ %preun %if 0%{?_with_systemd} @@ -818,7 +818,9 @@ mkdir -p %{_localstatedir}/run/ceph/ %attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-osd %attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-mds %attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-rgw +%if ! 
0%{?_with_systemd} %attr(770,ceph,ceph) %dir %{_localstatedir}/run/ceph +%endif ################################################################################# %files -n ceph-common From bf9c00580f7716fcaeff0631e16171180b2a792a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 18 Sep 2015 11:40:26 -0400 Subject: [PATCH 647/654] ceph-osd-prestart.sh: no ceph-disk chown chown -R ceph:ceph $1 is easy enough. Signed-off-by: Sage Weil --- PendingReleaseNotes | 2 +- src/ceph-osd-prestart.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PendingReleaseNotes b/PendingReleaseNotes index e91ded961dbac..92a220b94ab78 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -37,7 +37,7 @@ Upgrading #. Fix the ownership. E.g.,:: chown -R ceph:ceph /var/lib/ceph/mon/ceph-foo - ceph-disk chown /dev/sdb1 + chown -R ceph:ceph /var/lib/ceph/osd/ceph-123 #. Restart the daemon(s) diff --git a/src/ceph-osd-prestart.sh b/src/ceph-osd-prestart.sh index 77974e8a58278..cefca854e61ff 100644 --- a/src/ceph-osd-prestart.sh +++ b/src/ceph-osd-prestart.sh @@ -56,7 +56,7 @@ fi owner=`stat -c %U $data/.` if [ $owner != 'ceph' -a $owner != 'root' ]; then echo "ceph-osd data dir $data is not owned by 'ceph' or 'root'" - echo "you must 'ceph-disk chown ...' or similar to fix ownership" + echo "you must 'chown -R ceph:ceph ...' or similar to fix ownership" exit 1 fi From c1172cadab67112d5da7e07eb5d4cf17aec5b854 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 17 Sep 2015 15:51:20 +0100 Subject: [PATCH 648/654] mon: fix auth get-or-create output Previously the caps were omitted from the output: they should be present for this to fulfil the 'get' part of get-or-create. 
Signed-off-by: John Spray --- src/mon/AuthMonitor.cc | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/mon/AuthMonitor.cc b/src/mon/AuthMonitor.cc index c948680840bbc..730410e8c5082 100644 --- a/src/mon/AuthMonitor.cc +++ b/src/mon/AuthMonitor.cc @@ -864,17 +864,25 @@ bool AuthMonitor::prepare_command(MonOpRequestRef op) goto done; } + // Parse the list of caps into a map + std::map wanted_caps; + for (vector::const_iterator it = caps_vec.begin(); + it != caps_vec.end() && (it + 1) != caps_vec.end(); + it += 2) { + const std::string &sys = *it; + bufferlist cap; + ::encode(*(it+1), cap); + wanted_caps[sys] = cap; + } + // do we have it? EntityAuth entity_auth; if (mon->key_server.get_auth(entity, entity_auth)) { - for (vector::iterator it = caps_vec.begin(); - it != caps_vec.end(); it += 2) { - string sys = *it; - bufferlist cap; - ::encode(*(it+1), cap); - if (entity_auth.caps.count(sys) == 0 || - !entity_auth.caps[sys].contents_equal(cap)) { - ss << "key for " << entity << " exists but cap " << sys << " does not match"; + for (const auto &sys_cap : wanted_caps) { + if (entity_auth.caps.count(sys_cap.first) == 0 || + !entity_auth.caps[sys_cap.first].contents_equal(sys_cap.second)) { + ss << "key for " << entity << " exists but cap " << sys_cap.first + << " does not match"; err = -EINVAL; goto done; } @@ -890,6 +898,7 @@ bool AuthMonitor::prepare_command(MonOpRequestRef op) KeyRing kr; kr.add(entity, entity_auth.key); if (f) { + kr.set_caps(entity, entity_auth.caps); kr.encode_formatted("auth", f.get(), rdata); } else { kr.encode_plaintext(rdata); @@ -921,9 +930,7 @@ bool AuthMonitor::prepare_command(MonOpRequestRef op) auth_inc.op = KeyServerData::AUTH_INC_ADD; auth_inc.name = entity; auth_inc.auth.key.create(g_ceph_context, CEPH_CRYPTO_AES); - for (vector::iterator it = caps_vec.begin(); - it != caps_vec.end(); it += 2) - ::encode(*(it+1), auth_inc.auth.caps[*it]); + auth_inc.auth.caps = wanted_caps; 
push_cephx_inc(auth_inc); @@ -937,6 +944,7 @@ bool AuthMonitor::prepare_command(MonOpRequestRef op) KeyRing kr; kr.add(entity, auth_inc.auth.key); if (f) { + kr.set_caps(entity, wanted_caps); kr.encode_formatted("auth", f.get(), rdata); } else { kr.encode_plaintext(rdata); From 387d7800359154950431d0984c756f43f21dd9b4 Mon Sep 17 00:00:00 2001 From: Alfredo Deza Date: Fri, 18 Sep 2015 14:13:02 -0400 Subject: [PATCH 649/654] doc: correct links to download.ceph.com Signed-off-by: Alfredo Deza --- doc/install/get-packages.rst | 239 +++++++++++++---------------------- 1 file changed, 90 insertions(+), 149 deletions(-) diff --git a/doc/install/get-packages.rst b/doc/install/get-packages.rst index 64ba5f9b1bebd..8491b40a01e11 100644 --- a/doc/install/get-packages.rst +++ b/doc/install/get-packages.rst @@ -3,20 +3,20 @@ ============== To install Ceph and other enabling software, you need to retrieve packages from -the Ceph repository. Follow this guide to get packages; then, proceed to the +the Ceph repository. Follow this guide to get packages; then, proceed to the `Install Ceph Object Storage`_. Getting Packages ================ -There are two ways to get packages: +There are two ways to get packages: -- **Add Repositories:** Adding repositories is the easiest way to get packages, +- **Add Repositories:** Adding repositories is the easiest way to get packages, because package management tools will retrieve the packages and all enabling - software for you in most cases. However, to use this approach, each + software for you in most cases. However, to use this approach, each :term:`Ceph Node` in your cluster must have internet access. - + - **Download Packages Manually:** Downloading packages manually is a convenient way to install Ceph if your environment does not allow a :term:`Ceph Node` to access the internet. @@ -25,39 +25,39 @@ There are two ways to get packages: Requirements ============ -All Ceph deployments require Ceph packages (except for development). 
You should -also add keys and recommended packages. +All Ceph deployments require Ceph packages (except for development). You should +also add keys and recommended packages. -- **Keys: (Recommended)** Whether you add repositories or download packages +- **Keys: (Recommended)** Whether you add repositories or download packages manually, you should download keys to verify the packages. If you do not get - the keys, you may encounter security warnings. There are two keys: one for - releases (common) and one for development (programmers and QA only). Choose + the keys, you may encounter security warnings. There are two keys: one for + releases (common) and one for development (programmers and QA only). Choose the key that suits your needs. See `Add Keys`_ for details. -- **Ceph Extras: (Required)** The Ceph Extras repository provides newer - Ceph-enabled versions of packages which are already provided in your Linux - distribution, but where newer versions are required to support Ceph. Examples - of newer versions of available packages include QEMU for CentOS/RHEL - distribution and iSCSI among others. If you intend to use any of the - foregoing packages, you must add the Ceph Extras repository or download the +- **Ceph Extras: (Required)** The Ceph Extras repository provides newer + Ceph-enabled versions of packages which are already provided in your Linux + distribution, but where newer versions are required to support Ceph. Examples + of newer versions of available packages include QEMU for CentOS/RHEL + distribution and iSCSI among others. If you intend to use any of the + foregoing packages, you must add the Ceph Extras repository or download the packages manually. This repository also contains Ceph dependencies for those who intend to install Ceph manually. See `Add Ceph Extras`_ for details. 
-- **Ceph: (Required)** All Ceph deployments require Ceph release packages, - except for deployments that use development packages (development, QA, and +- **Ceph: (Required)** All Ceph deployments require Ceph release packages, + except for deployments that use development packages (development, QA, and bleeding edge deployments only). See `Add Ceph`_ for details. -- **Ceph Development: (Optional)** If you are developing for Ceph, testing Ceph - development builds, or if you want features from the bleeding edge of Ceph - development, you may get Ceph development packages. See +- **Ceph Development: (Optional)** If you are developing for Ceph, testing Ceph + development builds, or if you want features from the bleeding edge of Ceph + development, you may get Ceph development packages. See `Add Ceph Development`_ for details. -- **Apache/FastCGI: (Optional)** If you are deploying a - :term:`Ceph Object Storage` service, you must install Apache and FastCGI. - Ceph provides Apache and FastCGI builds that are identical to those available - from Apache, but with 100-continue support. If you want to enable - :term:`Ceph Object Gateway` daemons with 100-continue support, you must - retrieve Apache/FastCGI packages from the Ceph repository. +- **Apache/FastCGI: (Optional)** If you are deploying a + :term:`Ceph Object Storage` service, you must install Apache and FastCGI. + Ceph provides Apache and FastCGI builds that are identical to those available + from Apache, but with 100-continue support. If you want to enable + :term:`Ceph Object Gateway` daemons with 100-continue support, you must + retrieve Apache/FastCGI packages from the Ceph repository. See `Add Apache/FastCGI`_ for details. 
@@ -79,13 +79,13 @@ APT To install the ``release.asc`` key, execute the following:: - wget -q -O- 'https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc' | sudo apt-key add - + wget -q -O- 'https://git.ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc' | sudo apt-key add - -To install the ``autobuild.asc`` key, execute the following -(QA and developers only):: +To install the ``autobuild.asc`` key, execute the following +(QA and developers only):: - wget -q -O- 'https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc' | sudo apt-key add - + wget -q -O- 'https://git.ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc' | sudo apt-key add - RPM @@ -93,71 +93,12 @@ RPM To install the ``release.asc`` key, execute the following:: - sudo rpm --import 'https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc' + sudo rpm --import 'https://git.ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc' To install the ``autobuild.asc`` key, execute the following -(QA and developers only):: - - sudo rpm --import 'https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc' - - - -Add Ceph Extras -=============== - -Some Ceph deployments require newer Ceph-enabled versions of packages that are -already available in your Linux distribution. For example, Ceph Extras contains -newer Ceph-enabled packages for the SCSI target framework and QEMU packages for -RPMs. The repository also contains ``curl``, ``leveldb`` and other Ceph -dependencies. Add the Ceph Extras repository to ensure you obtain these -additional packages from the Ceph repository. - - -Debian Packages ---------------- - -Add our Ceph Extras package repository to your system's list of APT sources. :: +(QA and developers only):: - echo deb http://ceph.com/packages/ceph-extras/debian $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph-extras.list - - -RPM Packages ------------- - -.. 
note:: ceph-extras on RPM-based systems is only needed on EL6-based - distributions (RHEL 6, CentOS 6, Scientific Linux 6). It is not needed - for Fedora or RHEL 7+. - -For RPM packages, add our package repository to your ``/etc/yum.repos.d`` repos (e.g., -``ceph-extras.repo``). Some Ceph packages (e.g., QEMU) must take priority over standard -packages, so you must ensure that you set ``priority=2``. :: - - [ceph-extras] - name=Ceph Extras Packages - baseurl=http://ceph.com/packages/ceph-extras/rpm/{distro}/$basearch - enabled=1 - priority=2 - gpgcheck=1 - type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc - - [ceph-extras-noarch] - name=Ceph Extras noarch - baseurl=http://ceph.com/packages/ceph-extras/rpm/{distro}/noarch - enabled=1 - priority=2 - gpgcheck=1 - type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc - - [ceph-extras-source] - name=Ceph Extras Sources - baseurl=http://ceph.com/packages/ceph-extras/rpm/{distro}/SRPMS - enabled=1 - priority=2 - gpgcheck=1 - type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc + sudo rpm --import 'https://git.ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc' Add Ceph @@ -167,18 +108,18 @@ Release repositories use the ``release.asc`` key to verify packages. To install Ceph packages with the Advanced Package Tool (APT) or Yellowdog Updater, Modified (YUM), you must add Ceph repositories. 
-You may find releases for Debian/Ubuntu (installed with APT) at:: +You may find releases for Debian/Ubuntu (installed with APT) at:: - http://ceph.com/debian-{release-name} + http://download.ceph.com/debian-{release-name} -You may find releases for CentOS/RHEL and others (installed with YUM) at:: +You may find releases for CentOS/RHEL and others (installed with YUM) at:: - http://ceph.com/rpm-{release-name} + http://download.ceph.com/rpm-{release-name} The major releases of Ceph include: - **Hammer:** Hammer is the most recent, and is also the eighth major release - of Ceph. These packages are recommended for anyone deploying Ceph in a + of Ceph. These packages are recommended for anyone deploying Ceph in a production environment. Critical bug fixes are backported and point releases are made as necessary. @@ -187,14 +128,14 @@ The major releases of Ceph include: bug fixes are backported and point releases are made as necessary. - **Firefly:** Firefly is the sixth major release of Ceph. These packages - are recommended for anyone deploying Ceph in a production environment. + are recommended for anyone deploying Ceph in a production environment. Firefly is a long-term stable release, so critical bug fixes are backported and point releases are made as necessary. - **Emperor:** Emperor is the fifth major release of Ceph. These packages are are old and no longer supported, so we recommend that users upgrade to Firefly immediately. - + - **Dumpling:** Dumpling is the fourth major release of Ceph. These packages are older and not recommended for new users, but critical bug fixes are still backported as necessary. We encourage all Dumpling users to update to @@ -205,46 +146,46 @@ The major releases of Ceph include: users upgrade to a supported version. .. tip:: For European users, there is also a mirror in the Netherlands at: - http://eu.ceph.com/ + http://eu.ceph.com/ Debian Packages --------------- Add a Ceph package repository to your system's list of APT sources. 
For newer -versions of Debian/Ubuntu, call ``lsb_release -sc`` on the command line to -get the short codename, and replace ``{codename}`` in the following command. :: +versions of Debian/Ubuntu, call ``lsb_release -sc`` on the command line to +get the short codename, and replace ``{codename}`` in the following command. :: - sudo apt-add-repository 'deb http://ceph.com/debian-firefly/ {codename} main' + sudo apt-add-repository 'deb http://download.ceph.com/debian-firefly/ {codename} main' -For early Linux distributions, you may execute the following command:: +For early Linux distributions, you may execute the following command:: - echo deb http://ceph.com/debian-firefly/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list + echo deb http://download.ceph.com/debian-firefly/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list For earlier Ceph releases, replace ``{release-name}`` with the name with the name of the Ceph release. You may call ``lsb_release -sc`` on the command line to get the short codename, and replace ``{codename}`` in the following command. :: - sudo apt-add-repository 'deb http://ceph.com/debian-{release-name}/ {codename} main' + sudo apt-add-repository 'deb http://download.ceph.com/debian-{release-name}/ {codename} main' For older Linux distributions, replace ``{release-name}`` with the name of the -release:: +release:: - echo deb http://ceph.com/debian-{release-name}/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list + echo deb http://download.ceph.com/debian-{release-name}/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list Ceph on ARM processors requires Google's memory profiling tools (``google-perftools``). The Ceph repository should have a copy at -http://ceph.com/packages/google-perftools/debian. :: +http://download.ceph.com/packages/google-perftools/debian. 
:: - echo deb http://ceph.com/packages/google-perftools/debian $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/google-perftools.list + echo deb http://download.ceph.com/packages/google-perftools/debian $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/google-perftools.list For development release packages, add our package repository to your system's list of APT sources. See `the testing Debian repository`_ for a complete list of Debian and Ubuntu releases supported. :: - echo deb http://ceph.com/debian-testing/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list + echo deb http://download.ceph.com/debian-testing/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list RPM Packages @@ -254,37 +195,37 @@ For major releases, you may add a Ceph entry to the ``/etc/yum.repos.d`` directory. Create a ``ceph.repo`` file. In the example below, replace ``{ceph-release}`` with a major release of Ceph (e.g., ``dumpling``, ``emperor``, etc.) and ``{distro}`` with your Linux distribution (e.g., ``el6``, -``rhel6``, etc.). You may view http://ceph.com/rpm-{ceph-release}/ directory to +``rhel6``, etc.). You may view http://download.ceph.com/rpm-{ceph-release}/ directory to see which distributions Ceph supports. Some Ceph packages (e.g., EPEL) must take priority over standard packages, so you must ensure that you set ``priority=2``. 
:: [ceph] name=Ceph packages for $basearch - baseurl=http://ceph.com/rpm-{ceph-release}/{distro}/$basearch + baseurl=http://download.ceph.com/rpm-{ceph-release}/{distro}/$basearch enabled=1 priority=2 gpgcheck=1 type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc + gpgkey=https://git.ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc [ceph-noarch] name=Ceph noarch packages - baseurl=http://ceph.com/rpm-{ceph-release}/{distro}/noarch + baseurl=http://download.ceph.com/rpm-{ceph-release}/{distro}/noarch enabled=1 priority=2 gpgcheck=1 type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc + gpgkey=https://git.ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc [ceph-source] name=Ceph source packages - baseurl=http://ceph.com/rpm-{ceph-release}/{distro}/SRPMS + baseurl=http://download.ceph.com/rpm-{ceph-release}/{distro}/SRPMS enabled=0 priority=2 gpgcheck=1 type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc + gpgkey=https://git.ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc For development release packages, you may specify the repository @@ -292,30 +233,30 @@ for development releases instead. 
:: [ceph] name=Ceph packages for $basearch/$releasever - baseurl=http://ceph.com/rpm-testing/{distro}/$basearch + baseurl=http://download.ceph.com/rpm-testing/{distro}/$basearch enabled=1 priority=2 gpgcheck=1 type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc + gpgkey=https://git.ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc [ceph-noarch] name=Ceph noarch packages - baseurl=http://ceph.com/rpm-testing/{distro}/noarch + baseurl=http://download.ceph.com/rpm-testing/{distro}/noarch enabled=1 priority=2 gpgcheck=1 type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc + gpgkey=https://git.ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc [ceph-source] name=Ceph source packages - baseurl=http://ceph.com/rpm-testing/{distro}/SRPMS + baseurl=http://download.ceph.com/rpm-testing/{distro}/SRPMS enabled=0 priority=2 gpgcheck=1 type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc + gpgkey=https://git.ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc For specific packages, you may retrieve them by specifically downloading the @@ -325,14 +266,14 @@ Development packages have new features integrated quickly, while still undergoing several weeks of QA prior to release. The repository package installs the repository details on your local system for -use with ``yum`` or ``up2date``. Replace ``{distro}`` with your Linux distribution, +use with ``yum`` or ``up2date``. 
Replace ``{distro}`` with your Linux distribution, and ``{release}`` with the specific release of Ceph:: - su -c 'rpm -Uvh http://ceph.com/rpms/{distro}/x86_64/ceph-{release}.el6.noarch.rpm' + su -c 'rpm -Uvh http://download.ceph.com/rpms/{distro}/x86_64/ceph-{release}.el6.noarch.rpm' You can download the RPMs directly from:: - http://ceph.com/rpm-testing + http://download.ceph.com/rpm-testing Add Ceph Development @@ -344,7 +285,7 @@ ensure that you remove repository entries for major releases first. Debian Packages ---------------- +--------------- We automatically build Debian and Ubuntu packages for current development branches in the Ceph source code repository. These @@ -374,10 +315,10 @@ install. :: enabled=0 gpgcheck=1 type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc + gpgkey=https://git.ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc -You may view http://gitbuilder.ceph.com directory to see which distributions +You may view http://gitbuilder.ceph.com directory to see which distributions Ceph supports. @@ -393,7 +334,7 @@ Debian Packages --------------- Add our Apache and FastCGI packages to your system's list of APT sources if you intend to -use 100-continue. :: +use 100-continue. :: echo deb http://gitbuilder.ceph.com/apache2-deb-$(lsb_release -sc)-x86_64-basic/ref/master $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph-apache.list echo deb http://gitbuilder.ceph.com/libapache-mod-fastcgi-deb-$(lsb_release -sc)-x86_64-basic/ref/master $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph-fastcgi.list @@ -416,7 +357,7 @@ http://gitbuilder.ceph.com directory to see which distributions Ceph supports. 
priority=2 gpgcheck=1 type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc + gpgkey=https://git.ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc [apache2-ceph-source] name=Apache source packages for Ceph @@ -425,7 +366,7 @@ http://gitbuilder.ceph.com directory to see which distributions Ceph supports. priority=2 gpgcheck=1 type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc + gpgkey=https://git.ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc Repeat the forgoing process by creating a ``ceph-fastcgi.repo`` file. :: @@ -437,7 +378,7 @@ Repeat the forgoing process by creating a ``ceph-fastcgi.repo`` file. :: priority=2 gpgcheck=1 type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc + gpgkey=https://git.ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc [fastcgi-ceph-noarch] name=FastCGI noarch packages for Ceph @@ -446,7 +387,7 @@ Repeat the forgoing process by creating a ``ceph-fastcgi.repo`` file. :: priority=2 gpgcheck=1 type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc + gpgkey=https://git.ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc [fastcgi-ceph-source] name=FastCGI source packages for Ceph @@ -455,20 +396,20 @@ Repeat the forgoing process by creating a ``ceph-fastcgi.repo`` file. :: priority=2 gpgcheck=1 type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc + gpgkey=https://git.ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc Download Packages ================= -If you are attempting to install behind a firewall in an environment without internet -access, you must retrieve the packages (mirrored with all the necessary dependencies) +If you are attempting to install behind a firewall in an environment without internet +access, you must retrieve the packages (mirrored with all the necessary dependencies) before attempting an install. 
Debian Packages --------------- -Ceph requires additional additional third party libraries. +Ceph requires additional additional third party libraries. - libaio1 - libsnappy1 @@ -486,14 +427,14 @@ your Linux distribution codename. Replace ``{arch}`` with the CPU architecture. :: - wget -q http://ceph.com/debian-{release}/pool/main/c/ceph/ceph_{version}{distro}_{arch}.deb + wget -q http://download.ceph.com/debian-{release}/pool/main/c/ceph/ceph_{version}{distro}_{arch}.deb RPM Packages ------------ -Ceph requires additional additional third party libraries. -To add the EPEL repository, execute the following:: +Ceph requires additional additional third party libraries. +To add the EPEL repository, execute the following:: su -c 'rpm -Uvh http://dl.fedoraproject.org/pub/epel/6/x86_64/epel-release-6-8.noarch.rpm' @@ -512,22 +453,22 @@ platforms. The repository package installs the repository details on your local system for use with ``yum`` or ``up2date``. Replace ``{distro}`` with your distribution. :: - su -c 'rpm -Uvh http://ceph.com/rpm-firefly/{distro}/noarch/ceph-{version}.{distro}.noarch.rpm' + su -c 'rpm -Uvh http://download.ceph.com/rpm-firefly/{distro}/noarch/ceph-{version}.{distro}.noarch.rpm' For example, for CentOS 6 (``el6``):: - su -c 'rpm -Uvh http://ceph.com/rpm-firefly/el6/noarch/ceph-release-1-0.el6.noarch.rpm' + su -c 'rpm -Uvh http://download.ceph.com/rpm-firefly/el6/noarch/ceph-release-1-0.el6.noarch.rpm' You can download the RPMs directly from:: - http://ceph.com/rpm-firefly + http://download.ceph.com/rpm-firefly -For earlier Ceph releases, replace ``{release-name}`` with the name -with the name of the Ceph release. You may call ``lsb_release -sc`` on the command +For earlier Ceph releases, replace ``{release-name}`` with the name +with the name of the Ceph release. You may call ``lsb_release -sc`` on the command line to get the short codename. 
:: - su -c 'rpm -Uvh http://ceph.com/rpm-{release-name}/{distro}/noarch/ceph-{version}.{distro}.noarch.rpm' + su -c 'rpm -Uvh http://download.ceph.com/rpm-{release-name}/{distro}/noarch/ceph-{version}.{distro}.noarch.rpm' From c0ef84fd22aac58b6ec8e58eb4f8dffed306d39e Mon Sep 17 00:00:00 2001 From: Alfredo Deza Date: Fri, 18 Sep 2015 14:13:21 -0400 Subject: [PATCH 650/654] doc: remove ceph-extras Signed-off-by: Alfredo Deza --- doc/install/install-vm-cloud.rst | 67 +++++++------------------------- 1 file changed, 13 insertions(+), 54 deletions(-) diff --git a/doc/install/install-vm-cloud.rst b/doc/install/install-vm-cloud.rst index 8bdb1e8b85111..a28a76a28f74f 100644 --- a/doc/install/install-vm-cloud.rst +++ b/doc/install/install-vm-cloud.rst @@ -29,77 +29,36 @@ Install QEMU QEMU KVM can interact with Ceph Block Devices via ``librbd``, which is an important feature for using Ceph with cloud platforms. Once you install QEMU, -see `QEMU and Block Devices`_ for usage. +see `QEMU and Block Devices`_ for usage. Debian Packages --------------- QEMU packages are incorporated into Ubuntu 12.04 Precise Pangolin and later -versions. To install QEMU, execute the following:: +versions. To install QEMU, execute the following:: sudo apt-get install qemu - + RPM Packages ------------ To install QEMU, execute the following: -#. Install ``yum-plugin-priorities``. :: - - sudo yum install yum-plugin-priorities - -#. Ensure ``/etc/yum/pluginconf.d/priorities.conf`` exists. - -#. Ensure ``priorities.conf`` enables the plugin. :: - - [main] - enabled = 1 - -.. note:: ceph-extras on RPM-based systems is only needed on EL6-based - distributions (RHEL 6, CentOS 6, Scientific Linux 6). It is not needed - for Fedora or RHEL 7+. -#. Create a ``/etc/yum.repos.d/ceph-extras.repo`` file with the following - contents, and replace ``{distro}`` with your Linux distribution. 
Follow - the ``baseurl`` path below to see which distributions Ceph supports:: - - [ceph-extras] - name=Ceph Extras - baseurl=http://ceph.com/packages/ceph-extras/rpm/{distro}/$basearch - enabled=1 - priority=2 - gpgcheck=1 - type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc - - [ceph-qemu-source] - name=Ceph Extras Sources - baseurl=http://ceph.com/packages/ceph-extras/rpm/{distro}/SRPMS - enabled=1 - priority=2 - gpgcheck=1 - type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc - -#. Update your repositories. :: +#. Update your repositories. :: sudo yum update -#. Ensure that non-priority versions are removed. :: - - sudo yum remove qemu-kvm qemu-kvm-tools qemu-img - sudo yum clean all - -#. Install QEMU for Ceph. :: +#. Install QEMU for Ceph. :: sudo yum install qemu-kvm qemu-kvm-tools qemu-img - -#. Install additional QEMU packages (optional):: + +#. Install additional QEMU packages (optional):: sudo yum install qemu-guest-agent qemu-guest-agent-win32 - + Building QEMU ------------- @@ -127,7 +86,7 @@ Debian Packages ``libvirt`` packages are incorporated into Ubuntu 12.04 Precise Pangolin and later versions of Ubuntu. To install ``libvirt`` on these distributions, -execute the following:: +execute the following:: sudo apt-get update && sudo apt-get install libvirt-bin @@ -140,8 +99,8 @@ Storage Cluster and you must also install a version of QEMU with ``rbd`` format support. See `Install QEMU`_ for details. -``libvirt`` packages are incorporated into the recent CentOS/RHEL distributions. -To install ``libvirt``, execute the following:: +``libvirt`` packages are incorporated into the recent CentOS/RHEL distributions. +To install ``libvirt``, execute the following:: sudo yum install libvirt @@ -157,7 +116,7 @@ complete the installation. For example:: cd libvirt ./autogen.sh make - sudo make install + sudo make install See `libvirt Installation`_ for details. 
@@ -166,4 +125,4 @@ See `libvirt Installation`_ for details. .. _libvirt Installation: http://www.libvirt.org/compiling.html .. _AutoGen: http://www.gnu.org/software/autogen/ .. _QEMU and Block Devices: ../../rbd/qemu-rbd -.. _Using libvirt with Ceph Block Device: ../../rbd/libvirt \ No newline at end of file +.. _Using libvirt with Ceph Block Device: ../../rbd/libvirt From a6f07e9fa3bd5e50a2d8cfa387b44a39d383fa65 Mon Sep 17 00:00:00 2001 From: Alfredo Deza Date: Fri, 18 Sep 2015 14:20:12 -0400 Subject: [PATCH 651/654] doc: remove mention of ceph-extra as a requirement Signed-off-by: Alfredo Deza --- doc/install/get-packages.rst | 9 --------- 1 file changed, 9 deletions(-) diff --git a/doc/install/get-packages.rst b/doc/install/get-packages.rst index 8491b40a01e11..6d20ffd891b4d 100644 --- a/doc/install/get-packages.rst +++ b/doc/install/get-packages.rst @@ -34,15 +34,6 @@ also add keys and recommended packages. releases (common) and one for development (programmers and QA only). Choose the key that suits your needs. See `Add Keys`_ for details. -- **Ceph Extras: (Required)** The Ceph Extras repository provides newer - Ceph-enabled versions of packages which are already provided in your Linux - distribution, but where newer versions are required to support Ceph. Examples - of newer versions of available packages include QEMU for CentOS/RHEL - distribution and iSCSI among others. If you intend to use any of the - foregoing packages, you must add the Ceph Extras repository or download the - packages manually. This repository also contains Ceph dependencies for those - who intend to install Ceph manually. See `Add Ceph Extras`_ for details. - - **Ceph: (Required)** All Ceph deployments require Ceph release packages, except for deployments that use development packages (development, QA, and bleeding edge deployments only). See `Add Ceph`_ for details. 
From 170f9add76d2e9cf38cc69646cb798ea2fdaa7b0 Mon Sep 17 00:00:00 2001 From: Nathan Cutler Date: Sat, 19 Sep 2015 22:25:31 +0200 Subject: [PATCH 652/654] doc: do not promise backports to Dumpling Dumpling was retired in May 2015. http://tracker.ceph.com/issues/13175 Fixes: #13175 Signed-off-by: Nathan Cutler --- doc/install/get-packages.rst | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/doc/install/get-packages.rst b/doc/install/get-packages.rst index 6d20ffd891b4d..2d67c611b61f7 100644 --- a/doc/install/get-packages.rst +++ b/doc/install/get-packages.rst @@ -124,17 +124,13 @@ The major releases of Ceph include: and point releases are made as necessary. - **Emperor:** Emperor is the fifth major release of Ceph. These packages - are are old and no longer supported, so we recommend that users upgrade to + are are old and no longer maintained, so we recommend that users upgrade to Firefly immediately. -- **Dumpling:** Dumpling is the fourth major release of Ceph. These packages - are older and not recommended for new users, but critical bug fixes are - still backported as necessary. We encourage all Dumpling users to update to - Firefly as soon as they are able to do so. - -- **Argonaut, Bobtail, Cuttlefish:** These are the first three releases of - Ceph. These packages are old and no longer supported, so we recommend that - users upgrade to a supported version. +- **Argonaut, Bobtail, Cuttlefish, Dumpling:** These are the first four + releases of Ceph. These packages are old and no longer maintained (Dumpling + was retired in May 2015), so we recommend that users upgrade to a more + recent version. .. 
tip:: For European users, there is also a mirror in the Netherlands at: http://eu.ceph.com/ From 4da6793d8dd9c1d09036e9eda4f7854818a51902 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Fri, 18 Sep 2015 12:59:09 +0200 Subject: [PATCH 653/654] install-deps: enable python3 The upstream regression has been fixed, we can re-enable python3 support. http://tracker.ceph.com/issues/13136 Fixes: #13136 Signed-off-by: Loic Dachary --- install-deps.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/install-deps.sh b/install-deps.sh index 886678feb9d61..5ad41c69cd989 100755 --- a/install-deps.sh +++ b/install-deps.sh @@ -131,7 +131,7 @@ find . -name tox.ini | while read ini ; do cd $(dirname $ini) require=$(ls *requirements.txt 2>/dev/null | sed -e 's/^/-r /') if test "$require" && ! test -d wheelhouse ; then - for interpreter in python2.7 ; do # python3 + for interpreter in python2.7 python3 ; do type $interpreter > /dev/null 2>&1 || continue activate_virtualenv $top_srcdir $interpreter || exit 1 populate_wheelhouse "wheel -w $wip_wheelhouse" $require || exit 1 @@ -141,7 +141,7 @@ find . 
-name tox.ini | while read ini ; do ) done -for interpreter in python2.7 ; do # python3 +for interpreter in python2.7 python3 ; do rm -rf $top_srcdir/install-deps-$interpreter done rm -rf $XDG_CACHE_HOME From 21a1e75d8a7bad89a48cd9d36902c5d609be5015 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Sun, 20 Sep 2015 23:42:45 +0200 Subject: [PATCH 654/654] tests: update to match crushmap validation message http://tracker.ceph.com/issues/13182 Fixes: #13182 Signed-off-by: Loic Dachary --- qa/workunits/cephtool/test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh index 9ca1727d4e793..fc81f86f3ef08 100755 --- a/qa/workunits/cephtool/test.sh +++ b/qa/workunits/cephtool/test.sh @@ -1631,7 +1631,7 @@ function test_mon_crushmap_validation() exit 1" > "${crushtool_path}" expect_false ceph osd setcrushmap -i $map 2> $TMPFILE - check_response "Error EINVAL: Failed to parse crushmap: TEST FAIL" + check_response "Error EINVAL: Failed crushmap test: TEST FAIL" local mon_lease=`ceph-conf --show-config-value mon_lease` @@ -1650,7 +1650,7 @@ function test_mon_crushmap_validation() sleep $((mon_lease + 1))" > "${crushtool_path}" expect_false ceph osd setcrushmap -i $map 2> $TMPFILE - check_response "Error EINVAL: Failed to parse crushmap: ${crushtool_path}: timed out (${mon_lease} sec)" + check_response "Error EINVAL: Failed crushmap test: ${crushtool_path}: timed out (${mon_lease} sec)" ceph tell mon.\* injectargs --crushtool "${crushtool_path_old}"