Skip to content

Commit

Permalink
Merge pull request #5825 from tchaikov/wip-12848-hammer
Browse files Browse the repository at this point in the history
ReplicatedPG::hit_set_trim osd/ReplicatedPG.cc: 11006: FAILED assert(obc)

Reviewed-by: Loic Dachary <ldachary@redhat.com>
  • Loading branch information
Loic Dachary committed Nov 17, 2015
1 parent 2d07e3b commit bb67852
Show file tree
Hide file tree
Showing 9 changed files with 150 additions and 28 deletions.
1 change: 1 addition & 0 deletions src/common/config_opts.h
Expand Up @@ -497,6 +497,7 @@ OPTION(osd_client_message_cap, OPT_U64, 100) // num client messages
OPTION(osd_pg_bits, OPT_INT, 6) // bits per osd
OPTION(osd_pgp_bits, OPT_INT, 6) // bits per osd
OPTION(osd_crush_chooseleaf_type, OPT_INT, 1) // 1 = host
OPTION(osd_pool_use_gmt_hitset, OPT_BOOL, true) // try to use gmt for hitset archive names if all osds in cluster support it.
OPTION(osd_pool_default_crush_rule, OPT_INT, -1) // deprecated for osd_pool_default_crush_replicated_ruleset
OPTION(osd_pool_default_crush_replicated_ruleset, OPT_INT, CEPH_DEFAULT_CRUSH_REPLICATED_RULESET)
OPTION(osd_pool_erasure_code_stripe_width, OPT_U32, OSD_POOL_ERASURE_CODE_STRIPE_WIDTH) // in bytes
Expand Down
2 changes: 2 additions & 0 deletions src/include/ceph_features.h
Expand Up @@ -64,6 +64,7 @@
// duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY
#define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */
#define CEPH_FEATURE_MON_METADATA (1ULL<<50)
#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54)
/* ... */
#define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55)

Expand Down Expand Up @@ -151,6 +152,7 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) {
CEPH_FEATURE_MDS_QUOTA | \
CEPH_FEATURE_CRUSH_V4 | \
CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY | \
CEPH_FEATURE_OSD_HITSET_GMT | \
CEPH_FEATURE_HAMMER_0_94_4 | \
0ULL)

Expand Down
2 changes: 1 addition & 1 deletion src/mon/MonCommands.h
Expand Up @@ -634,7 +634,7 @@ COMMAND("osd pool get " \
"get pool parameter <var>", "osd", "r", "cli,rest")
COMMAND("osd pool set " \
"name=pool,type=CephPoolname " \
"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|write_fadvise_dontneed " \
"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|write_fadvise_dontneed " \
"name=val,type=CephString " \
"name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
"set pool parameter <var> to <val>", "osd", "rw", "cli,rest")
Expand Down
38 changes: 38 additions & 0 deletions src/mon/OSDMonitor.cc
Expand Up @@ -16,6 +16,7 @@
*
*/

#include <algorithm>
#include <sstream>

#include "OSDMonitor.h"
Expand Down Expand Up @@ -1572,6 +1573,9 @@ void OSDMonitor::take_all_failures(list<MOSDFailure*>& ls)
failure_info.clear();
}

// Predicate over one entry of the OSDMap pool table: returns true when the
// pool in that (pool id, pool) entry has its use_gmt_hitset flag set.
//
// The key is spelled `const int64_t` so the parameter type is exactly
// std::map<int64_t, pg_pool_t>::value_type.  With a non-const key the
// reference cannot bind to a map element directly; the compiler instead
// materializes a temporary std::pair and copies the whole pg_pool_t on every
// call made by std::find_if.
// NOTE(review): assumes osdmap.get_pools() is a std::map keyed by int64_t --
// confirm against OSDMap.h.
static bool uses_gmt_hitset(const std::pair<const int64_t, pg_pool_t>& pool) {
  return pool.second.use_gmt_hitset;
}

// boot --

Expand Down Expand Up @@ -1641,6 +1645,19 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m)
}
}

if (std::find_if(osdmap.get_pools().begin(),
osdmap.get_pools().end(),
uses_gmt_hitset) != osdmap.get_pools().end()) {
assert(osdmap.get_num_up_osds() == 0 ||
osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT);
if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) {
dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at "
<< m->get_orig_source_inst()
<< " doesn't announce support -- ignore" << dendl;
goto ignore;
}
}

// already booted?
if (osdmap.is_up(from) &&
osdmap.get_inst(from) == m->get_orig_source_inst()) {
Expand Down Expand Up @@ -3084,6 +3101,7 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
if (!p->is_tier() &&
(var == "hit_set_type" || var == "hit_set_period" ||
var == "hit_set_count" || var == "hit_set_fpp" ||
var == "use_gmt_hitset" ||
var == "target_max_objects" || var == "target_max_bytes" ||
var == "cache_target_full_ratio" ||
var == "cache_target_dirty_ratio" ||
Expand Down Expand Up @@ -3136,6 +3154,8 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
f->dump_float("hit_set_fpp", bloomp->get_fpp());
}
} else if (var == "use_gmt_hitset") {
f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
} else if (var == "target_max_objects") {
f->dump_unsigned("target_max_objects", p->target_max_objects);
} else if (var == "target_max_bytes") {
Expand Down Expand Up @@ -3193,6 +3213,8 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
}
BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
ss << "hit_set_fpp: " << bloomp->get_fpp();
} else if (var == "use_gmt_hitset") {
ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
} else if (var == "target_max_objects") {
ss << "target_max_objects: " << p->target_max_objects;
} else if (var == "target_max_bytes") {
Expand Down Expand Up @@ -4075,6 +4097,11 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
if (g_conf->osd_pool_default_flag_nosizechange)
pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
if (g_conf->osd_pool_use_gmt_hitset &&
(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT))
pi->use_gmt_hitset = true;
else
pi->use_gmt_hitset = false;

pi->size = size;
pi->min_size = min_size;
Expand Down Expand Up @@ -4418,6 +4445,17 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
}
BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
bloomp->set_fpp(f);
} else if (var == "use_gmt_hitset") {
if (val == "true" || (interr.empty() && n == 1)) {
if (!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)) {
ss << "not all OSDs support GMT hit set.";
return -EINVAL;
}
p.use_gmt_hitset = true;
} else {
ss << "expecting value 'true' or '1'";
return -EINVAL;
}
} else if (var == "debug_fake_ec_pool") {
if (val == "true" || (interr.empty() && n == 1)) {
p.flags |= pg_pool_t::FLAG_DEBUG_FAKE_EC_POOL;
Expand Down
3 changes: 0 additions & 3 deletions src/osd/HitSet.cc
Expand Up @@ -36,9 +36,6 @@ HitSet::HitSet(const HitSet::Params& params)
impl.reset(new ExplicitObjectHitSet(static_cast<ExplicitObjectHitSet::Params*>(params.impl.get())));
break;

case TYPE_NONE:
break;

default:
assert (0 == "unknown HitSet type");
}
Expand Down
84 changes: 69 additions & 15 deletions src/osd/ReplicatedPG.cc
Expand Up @@ -1135,7 +1135,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
p != info.hit_set.history.end();
++p) {
if (stamp >= p->begin && stamp <= p->end) {
oid = get_hit_set_archive_object(p->begin, p->end);
oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
break;
}
}
Expand Down Expand Up @@ -10130,10 +10130,19 @@ hobject_t ReplicatedPG::get_hit_set_current_object(utime_t stamp)
return hoid;
}

hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start, utime_t end)
hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start,
utime_t end,
bool using_gmt)
{
ostringstream ss;
ss << "hit_set_" << info.pgid.pgid << "_archive_" << start << "_" << end;
ss << "hit_set_" << info.pgid.pgid << "_archive_";
if (using_gmt) {
start.gmtime(ss) << "_";
end.gmtime(ss);
} else {
start.localtime(ss) << "_";
end.localtime(ss);
}
hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
info.pgid.ps(), info.pgid.pool(),
cct->_conf->osd_hit_set_namespace);
Expand All @@ -10152,12 +10161,19 @@ void ReplicatedPG::hit_set_clear()
void ReplicatedPG::hit_set_setup()
{
if (!is_active() ||
!is_primary() ||
!pool.info.hit_set_count ||
!pool.info.hit_set_period ||
pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
!is_primary()) {
hit_set_clear();
//hit_set_remove_all(); // FIXME: implement me soon
return;
}

if (is_active() && is_primary() &&
(!pool.info.hit_set_count ||
!pool.info.hit_set_period ||
pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
hit_set_clear();

// only primary is allowed to remove all the hit set objects
hit_set_remove_all();
return;
}

Expand All @@ -10169,6 +10185,46 @@ void ReplicatedPG::hit_set_setup()
hit_set_apply_log();
}

void ReplicatedPG::hit_set_remove_all()
{
// If any archives are degraded we skip this
for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
p != info.hit_set.history.end();
++p) {
hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);

// Once we hit a degraded object just skip
if (is_degraded_or_backfilling_object(aoid))
return;
if (scrubber.write_blocked_by_scrub(aoid))
return;
}

if (!info.hit_set.history.empty()) {
list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin();
assert(p != info.hit_set.history.rend());
hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
assert(!is_degraded_or_backfilling_object(oid));
ObjectContextRef obc = get_object_context(oid, false);
assert(obc);

RepGather *repop = simple_repop_create(obc);
OpContext *ctx = repop->ctx;
ctx->at_version = get_next_version();
ctx->updated_hset_history = info.hit_set;
utime_t now = ceph_clock_now(cct);
ctx->mtime = now;
hit_set_trim(repop, 0);
info.stats.stats.add(ctx->delta_stats);
simple_repop_submit(repop);
}

info.hit_set = pg_hit_set_history_t();
if (agent_state) {
agent_state->discard_hit_sets();
}
}

void ReplicatedPG::hit_set_create()
{
utime_t now = ceph_clock_now(NULL);
Expand Down Expand Up @@ -10270,7 +10326,7 @@ void ReplicatedPG::hit_set_persist()
for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
p != info.hit_set.history.end();
++p) {
hobject_t aoid = get_hit_set_archive_object(p->begin, p->end);
hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);

// Once we hit a degraded object just skip further trim
if (is_degraded_or_backfilling_object(aoid))
Expand All @@ -10279,10 +10335,8 @@ void ReplicatedPG::hit_set_persist()
return;
}

oid = get_hit_set_archive_object(start, now);
oid = get_hit_set_archive_object(start, now, pool.info.use_gmt_hitset);
// If the current object is degraded we skip this persist request
if (is_degraded_or_backfilling_object(oid))
return;
if (scrubber.write_blocked_by_scrub(oid))
return;

Expand Down Expand Up @@ -10373,7 +10427,7 @@ void ReplicatedPG::hit_set_persist()

updated_hit_set_hist.history.push_back(updated_hit_set_hist.current_info);
hit_set_create();
updated_hit_set_hist.current_info = pg_hit_set_info_t();
updated_hit_set_hist.current_info = pg_hit_set_info_t(pool.info.use_gmt_hitset);
updated_hit_set_hist.current_last_stamp = utime_t();

// fabricate an object_info_t and SnapSet
Expand Down Expand Up @@ -10436,7 +10490,7 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max)
for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
assert(p != updated_hit_set_hist.history.end());
hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);

assert(!is_degraded_or_backfilling_object(oid));

Expand Down Expand Up @@ -10721,7 +10775,7 @@ void ReplicatedPG::agent_load_hit_sets()
continue;
}

hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
if (is_unreadable_object(oid)) {
dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
break;
Expand Down
5 changes: 4 additions & 1 deletion src/osd/ReplicatedPG.h
Expand Up @@ -901,9 +901,12 @@ class ReplicatedPG : public PG, public PGBackend::Listener {
bool hit_set_apply_log(); ///< apply log entries to update in-memory HitSet
void hit_set_trim(RepGather *repop, unsigned max); ///< discard old HitSets
void hit_set_in_memory_trim(); ///< discard old in memory HitSets
void hit_set_remove_all();

hobject_t get_hit_set_current_object(utime_t stamp);
hobject_t get_hit_set_archive_object(utime_t start, utime_t end);
hobject_t get_hit_set_archive_object(utime_t start,
utime_t end,
bool using_gmt);

// agent
boost::scoped_ptr<TierAgentState> agent_state;
Expand Down
32 changes: 28 additions & 4 deletions src/osd/osd_types.cc
Expand Up @@ -926,6 +926,7 @@ void pg_pool_t::dump(Formatter *f) const
f->close_section(); // hit_set_params
f->dump_unsigned("hit_set_period", hit_set_period);
f->dump_unsigned("hit_set_count", hit_set_count);
f->dump_bool("use_gmt_hitset", use_gmt_hitset);
f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
f->dump_unsigned("stripe_width", get_stripe_width());
f->dump_unsigned("expected_num_objects", expected_num_objects);
Expand Down Expand Up @@ -1238,7 +1239,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
return;
}

ENCODE_START(17, 5, bl);
ENCODE_START(21, 5, bl);
::encode(type, bl);
::encode(size, bl);
::encode(crush_ruleset, bl);
Expand Down Expand Up @@ -1280,12 +1281,15 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
::encode(last_force_op_resend, bl);
::encode(min_read_recency_for_promote, bl);
::encode(expected_num_objects, bl);
::encode(uint32_t(.6 * 1e6), bl);
::encode(uint32_t(1), bl);
::encode(use_gmt_hitset, bl);
ENCODE_FINISH(bl);
}

void pg_pool_t::decode(bufferlist::iterator& bl)
{
DECODE_START_LEGACY_COMPAT_LEN(17, 5, 5, bl);
DECODE_START_LEGACY_COMPAT_LEN(21, 5, 5, bl);
::decode(type, bl);
::decode(size, bl);
::decode(crush_ruleset, bl);
Expand Down Expand Up @@ -1397,6 +1401,19 @@ void pg_pool_t::decode(bufferlist::iterator& bl)
} else {
expected_num_objects = 0;
}
if (struct_v >= 19) {
uint32_t dummy;
::decode(dummy, bl);
}
if (struct_v >= 20) {
uint32_t dummy;
::decode(dummy, bl);
}
if (struct_v >= 21) {
::decode(use_gmt_hitset, bl);
} else {
use_gmt_hitset = false;
}
DECODE_FINISH(bl);
calc_pg_masks();
}
Expand Down Expand Up @@ -3789,19 +3806,25 @@ void pg_create_t::generate_test_instances(list<pg_create_t*>& o)

void pg_hit_set_info_t::encode(bufferlist& bl) const
{
ENCODE_START(1, 1, bl);
ENCODE_START(2, 1, bl);
::encode(begin, bl);
::encode(end, bl);
::encode(version, bl);
::encode(using_gmt, bl);
ENCODE_FINISH(bl);
}

void pg_hit_set_info_t::decode(bufferlist::iterator& p)
{
DECODE_START(1, p);
DECODE_START(2, p);
::decode(begin, p);
::decode(end, p);
::decode(version, p);
if (struct_v >= 2) {
::decode(using_gmt, p);
} else {
using_gmt = false;
}
DECODE_FINISH(p);
}

Expand All @@ -3810,6 +3833,7 @@ void pg_hit_set_info_t::dump(Formatter *f) const
f->dump_stream("begin") << begin;
f->dump_stream("end") << end;
f->dump_stream("version") << version;
f->dump_stream("using_gmt") << using_gmt;
}

void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
Expand Down

0 comments on commit bb67852

Please sign in to comment.