Skip to content

Commit

Permalink
osd: add _fastinfo PG attr for common pg_info_t updates
Browse files Browse the repository at this point in the history
For most IO operations we only update a handful of fields
in the pg_info_t structure.  However, the full struct,
when encoded, is on the order of 800 bytes.

This adds a new attribute, _fastinfo, which contains only
the most commonly updated fields.  When present, the
fastinfo fields should be overlaid on top of the full
info struct contained in the existing info attr.  If
a field outside of the "fast" set is updated, we clear
the fastinfo attribute and update the full info attr.

Signed-off-by: Sage Weil <sage@redhat.com>
  • Loading branch information
liewegas committed Sep 23, 2016
1 parent 5b6d975 commit 3717e7d
Show file tree
Hide file tree
Showing 4 changed files with 197 additions and 16 deletions.
1 change: 1 addition & 0 deletions src/osd/OSD.cc
Expand Up @@ -194,6 +194,7 @@ CompatSet OSD::get_osd_initial_compat_set() {
ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
ceph_osd_feature_incompat);
}
Expand Down
74 changes: 59 additions & 15 deletions src/osd/PG.cc
Expand Up @@ -74,6 +74,7 @@ const string infover_key("_infover");
const string info_key("_info");
const string biginfo_key("_biginfo");
const string epoch_key("_epoch");
const string fastinfo_key("_fastinfo");


template <class T>
Expand Down Expand Up @@ -2741,23 +2742,30 @@ void PG::init(

void PG::upgrade(ObjectStore *store)
{
assert(info_struct_v <= 8);
assert(info_struct_v <= 9);
ObjectStore::Transaction t;

assert(info_struct_v == 7);
assert(info_struct_v >= 7);

// 8 -> 9
if (info_struct_v <= 8) {
// no special action needed.
}

// 7 -> 8
pg_log.mark_log_for_rewrite();
ghobject_t log_oid(OSD::make_pg_log_oid(pg_id));
ghobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id));
t.remove(coll_t::meta(), log_oid);
t.remove(coll_t::meta(), biginfo_oid);
if (info_struct_v <= 7) {
pg_log.mark_log_for_rewrite();
ghobject_t log_oid(OSD::make_pg_log_oid(pg_id));
ghobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id));
t.remove(coll_t::meta(), log_oid);
t.remove(coll_t::meta(), biginfo_oid);

t.touch(coll, pgmeta_oid);
map<string,bufferlist> v;
__u8 ver = cur_struct_v;
::encode(ver, v[infover_key]);
t.omap_setkeys(coll, pgmeta_oid, v);
t.touch(coll, pgmeta_oid);
map<string,bufferlist> v;
__u8 ver = cur_struct_v;
::encode(ver, v[infover_key]);
t.omap_setkeys(coll, pgmeta_oid, v);
}

dirty_info = true;
dirty_big_info = true;
Expand Down Expand Up @@ -2789,10 +2797,38 @@ int PG::_prepare_write_info(map<string,bufferlist> *km,
bool dirty_big_info,
bool dirty_epoch)
{
if (dirty_epoch) {
::encode(epoch, (*km)[epoch_key]);
}

// try to do info efficiently?
if (!dirty_big_info) {
pg_fast_info_t fast;
fast.populate_from(info);
fast.apply_to(&last_written_info);
if (info == last_written_info) {
::encode(fast, (*km)[fastinfo_key]);
return 0;
}
generic_derr << __func__ << " fastinfo failed, info:\n";
{
JSONFormatter jf(true);
jf.dump_object("info", info);
jf.flush(*_dout);
}
{
*_dout << "\nlast_written_info:\n";
JSONFormatter jf(true);
jf.dump_object("last_written_info", last_written_info);
jf.flush(*_dout);
}
*_dout << dendl;
}
(*km)[fastinfo_key]; // erase any previous fastinfo
last_written_info = info;

// info. store purged_snaps separately.
interval_set<snapid_t> purged_snaps;
if (dirty_epoch)
::encode(epoch, (*km)[epoch_key]);
purged_snaps.swap(info.purged_snaps);
::encode(info, (*km)[info_key]);
purged_snaps.swap(info.purged_snaps);
Expand Down Expand Up @@ -3067,11 +3103,12 @@ int PG::read_info(
keys.insert(infover_key);
keys.insert(info_key);
keys.insert(biginfo_key);
keys.insert(fastinfo_key);
ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
map<string,bufferlist> values;
int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
if (r == 0) {
assert(values.size() == 3);
// fastinfo_key is optional (it was only added in struct v9), so the
// store may return either the 3 mandatory keys or all 4 requested keys.
assert(values.size() == 3 || values.size() == 4);

bufferlist::iterator p = values[infover_key].begin();
::decode(struct_v, p);
Expand All @@ -3083,6 +3120,13 @@ int PG::read_info(
p = values[biginfo_key].begin();
::decode(past_intervals, p);
::decode(info.purged_snaps, p);

p = values[fastinfo_key].begin();
if (!p.end()) {
pg_fast_info_t fast;
::decode(fast, p);
fast.apply_to(&info);
}
return 0;
}

Expand Down
4 changes: 3 additions & 1 deletion src/osd/PG.h
Expand Up @@ -295,7 +295,9 @@ class PG : DoutPrefixProvider {
pg_info_t info; ///< current pg info
pg_info_t last_written_info; ///< last written info
__u8 info_struct_v;
static const __u8 cur_struct_v = 8;
static const __u8 cur_struct_v = 9;
// v9 was fastinfo_key addition
// v8 was the move to a per-pg pgmeta object
// v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
// (first appeared in cuttlefish).
static const __u8 compat_struct_v = 7;
Expand Down
134 changes: 134 additions & 0 deletions src/osd/osd_types.h
Expand Up @@ -61,6 +61,7 @@
#define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints")
#define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
#define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
// Feature ids must be unique; 14 is already used by INCOMPAT_MISSING.
#define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")


/// max recovery priority for MBackfillReserve
Expand Down Expand Up @@ -2232,6 +2233,139 @@ inline ostream& operator<<(ostream& out, const pg_info_t& pgi)
return out;
}

/**
 * pg_fast_info_t - common pg_info_t fields
 *
 * These are the fields of pg_info_t (and children) that are updated for
 * most IO operations.  The nested anonymous structs give every member the
 * same access path it has in pg_info_t (e.g. stats.stats.sum.num_bytes).
 *
 * The int64_t counters have no default initializer, so call
 * populate_from() or decode() before reading or encoding an instance.
 *
 * encode() and decode() must list the fields in exactly the same order;
 * that order is the serialized layout.
 */
struct pg_fast_info_t {
  eversion_t last_update;
  eversion_t last_complete;
  version_t last_user_version;
  struct { // pg_stat_t stats
    eversion_t version;
    version_t reported_seq;
    utime_t last_fresh;
    utime_t last_active;
    utime_t last_peered;
    utime_t last_clean;
    utime_t last_unstale;
    utime_t last_undegraded;
    utime_t last_fullsized;
    int64_t log_size;  // (also ondisk_log_size, which has the same value)
    struct { // object_stat_collection_t stats;
      struct { // object_stat_sum_t sum
        int64_t num_bytes;    // in bytes
        int64_t num_objects;
        int64_t num_object_copies;
        int64_t num_rd;
        int64_t num_rd_kb;
        int64_t num_wr;
        int64_t num_wr_kb;
      } sum;
    } stats;
  } stats;

  /// Copy the "fast" subset of fields out of a full pg_info_t.
  void populate_from(const pg_info_t& info) {
    last_update = info.last_update;
    last_complete = info.last_complete;
    last_user_version = info.last_user_version;
    stats.version = info.stats.version;
    stats.reported_seq = info.stats.reported_seq;
    stats.last_fresh = info.stats.last_fresh;
    stats.last_active = info.stats.last_active;
    stats.last_peered = info.stats.last_peered;
    stats.last_clean = info.stats.last_clean;
    stats.last_unstale = info.stats.last_unstale;
    stats.last_undegraded = info.stats.last_undegraded;
    stats.last_fullsized = info.stats.last_fullsized;
    stats.log_size = info.stats.log_size;
    stats.stats.sum.num_bytes = info.stats.stats.sum.num_bytes;
    stats.stats.sum.num_objects = info.stats.stats.sum.num_objects;
    stats.stats.sum.num_object_copies = info.stats.stats.sum.num_object_copies;
    stats.stats.sum.num_rd = info.stats.stats.sum.num_rd;
    stats.stats.sum.num_rd_kb = info.stats.stats.sum.num_rd_kb;
    stats.stats.sum.num_wr = info.stats.stats.sum.num_wr;
    stats.stats.sum.num_wr_kb = info.stats.stats.sum.num_wr_kb;
  }

  /// Overlay the "fast" fields onto *info; all other fields of *info are
  /// left untouched.  Does not modify this object.
  void apply_to(pg_info_t* info) const {
    info->last_update = last_update;
    info->last_complete = last_complete;
    info->last_user_version = last_user_version;
    info->stats.version = stats.version;
    info->stats.reported_seq = stats.reported_seq;
    info->stats.last_fresh = stats.last_fresh;
    info->stats.last_active = stats.last_active;
    info->stats.last_peered = stats.last_peered;
    info->stats.last_clean = stats.last_clean;
    info->stats.last_unstale = stats.last_unstale;
    info->stats.last_undegraded = stats.last_undegraded;
    info->stats.last_fullsized = stats.last_fullsized;
    info->stats.log_size = stats.log_size;
    // ondisk_log_size is not stored separately; it is restored from log_size.
    info->stats.ondisk_log_size = stats.log_size;
    info->stats.stats.sum.num_bytes = stats.stats.sum.num_bytes;
    info->stats.stats.sum.num_objects = stats.stats.sum.num_objects;
    info->stats.stats.sum.num_object_copies = stats.stats.sum.num_object_copies;
    info->stats.stats.sum.num_rd = stats.stats.sum.num_rd;
    info->stats.stats.sum.num_rd_kb = stats.stats.sum.num_rd_kb;
    info->stats.stats.sum.num_wr = stats.stats.sum.num_wr;
    info->stats.stats.sum.num_wr_kb = stats.stats.sum.num_wr_kb;
  }

  /// Serialize all fields into bl.  Field order must match decode().
  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    ::encode(last_update, bl);
    ::encode(last_complete, bl);
    ::encode(last_user_version, bl);
    ::encode(stats.version, bl);
    ::encode(stats.reported_seq, bl);
    ::encode(stats.last_fresh, bl);
    ::encode(stats.last_active, bl);
    ::encode(stats.last_peered, bl);
    ::encode(stats.last_clean, bl);
    ::encode(stats.last_unstale, bl);
    ::encode(stats.last_undegraded, bl);
    ::encode(stats.last_fullsized, bl);
    ::encode(stats.log_size, bl);
    ::encode(stats.stats.sum.num_bytes, bl);
    ::encode(stats.stats.sum.num_objects, bl);
    ::encode(stats.stats.sum.num_object_copies, bl);
    ::encode(stats.stats.sum.num_rd, bl);
    ::encode(stats.stats.sum.num_rd_kb, bl);
    ::encode(stats.stats.sum.num_wr, bl);
    ::encode(stats.stats.sum.num_wr_kb, bl);
    ENCODE_FINISH(bl);
  }

  /// Deserialize all fields from p.  Field order must match encode().
  void decode(bufferlist::iterator& p) {
    DECODE_START(1, p);
    ::decode(last_update, p);
    ::decode(last_complete, p);
    ::decode(last_user_version, p);
    ::decode(stats.version, p);
    ::decode(stats.reported_seq, p);
    ::decode(stats.last_fresh, p);
    ::decode(stats.last_active, p);
    ::decode(stats.last_peered, p);
    ::decode(stats.last_clean, p);
    ::decode(stats.last_unstale, p);
    ::decode(stats.last_undegraded, p);
    ::decode(stats.last_fullsized, p);
    ::decode(stats.log_size, p);
    ::decode(stats.stats.sum.num_bytes, p);
    ::decode(stats.stats.sum.num_objects, p);
    ::decode(stats.stats.sum.num_object_copies, p);
    ::decode(stats.stats.sum.num_rd, p);
    ::decode(stats.stats.sum.num_rd_kb, p);
    ::decode(stats.stats.sum.num_wr, p);
    ::decode(stats.stats.sum.num_wr_kb, p);
    DECODE_FINISH(p);
  }
};
WRITE_CLASS_ENCODER(pg_fast_info_t)


struct pg_notify_t {
epoch_t query_epoch;
epoch_t epoch_sent;
Expand Down

0 comments on commit 3717e7d

Please sign in to comment.