Skip to content

Commit

Permalink
Merge pull request #10703 from dillaman/wip-16855
Browse files Browse the repository at this point in the history
rbd-mirror: improve split-brain detection logic

Reviewed-by: Mykola Golub <mgolub@mirantis.com>
  • Loading branch information
Mykola Golub committed Aug 17, 2016
2 parents 8f535ba + 3545d9e commit f7ae584
Show file tree
Hide file tree
Showing 16 changed files with 945 additions and 207 deletions.
13 changes: 13 additions & 0 deletions qa/workunits/rbd/rbd_mirror.sh
Expand Up @@ -99,6 +99,19 @@ admin_daemon ${CLUSTER1} rbd mirror status
testlog "TEST: failover and failback"
start_mirror ${CLUSTER2}

# demote and promote same cluster
# Demote the image on CLUSTER2 and wait for CLUSTER1 to stop replaying it;
# both clusters are then expected to report 'up+stopped'.
demote_image ${CLUSTER2} ${POOL} ${image}
wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${image}
test_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped'
test_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
# Promote the image again on the same cluster (CLUSTER2) and wait for
# CLUSTER1 to start replaying it again.
promote_image ${CLUSTER2} ${POOL} ${image}
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
# Write to the image on CLUSTER2, wait for replay to complete, then check the
# expected statuses ('up+stopped' on CLUSTER2, 'up+replaying' with
# 'master_position' on CLUSTER1) and compare the images.
write_image ${CLUSTER2} ${POOL} ${image} 100
wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
test_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
test_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
compare_images ${POOL} ${image}

# failover
demote_image ${CLUSTER2} ${POOL} ${image}
wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${image}
Expand Down
80 changes: 48 additions & 32 deletions src/librbd/Journal.cc
Expand Up @@ -161,7 +161,7 @@ template <typename J>
int open_journaler(CephContext *cct, J *journaler,
cls::journal::Client *client,
journal::ImageClientMeta *client_meta,
journal::TagData *tag_data) {
uint64_t *tag_tid, journal::TagData *tag_data) {
C_SaferCond init_ctx;
journaler->init(&init_ctx);
int r = init_ctx.wait();
Expand Down Expand Up @@ -191,9 +191,8 @@ int open_journaler(CephContext *cct, J *journaler,

C_SaferCond get_tags_ctx;
Mutex lock("lock");
uint64_t tag_tid;
C_DecodeTags *tags_ctx = new C_DecodeTags(
cct, &lock, &tag_tid, tag_data, &get_tags_ctx);
cct, &lock, tag_tid, tag_data, &get_tags_ctx);
journaler->get_tags(client_meta->tag_class, &tags_ctx->tags, tags_ctx);

r = get_tags_ctx.wait();
Expand All @@ -207,18 +206,12 @@ template <typename J>
int allocate_journaler_tag(CephContext *cct, J *journaler,
const cls::journal::Client &client,
uint64_t tag_class,
const journal::TagData &prev_tag_data,
const journal::TagPredecessor &predecessor,
const std::string &mirror_uuid,
cls::journal::Tag *new_tag) {
journal::TagData tag_data;
if (!client.commit_position.object_positions.empty()) {
auto position = client.commit_position.object_positions.front();
tag_data.predecessor_commit_valid = true;
tag_data.predecessor_tag_tid = position.tag_tid;
tag_data.predecessor_entry_tid = position.entry_tid;
}
tag_data.predecessor_mirror_uuid = prev_tag_data.mirror_uuid;
tag_data.mirror_uuid = mirror_uuid;
tag_data.predecessor = predecessor;

bufferlist tag_bl;
::encode(tag_data, tag_bl);
Expand Down Expand Up @@ -465,8 +458,10 @@ int Journal<I>::get_tag_owner(IoCtx& io_ctx, std::string& image_id,

cls::journal::Client client;
journal::ImageClientMeta client_meta;
uint64_t tag_tid;
journal::TagData tag_data;
int r = open_journaler(cct, &journaler, &client, &client_meta, &tag_data);
int r = open_journaler(cct, &journaler, &client, &client_meta, &tag_tid,
&tag_data);
if (r >= 0) {
*mirror_uuid = tag_data.mirror_uuid;
}
Expand All @@ -484,9 +479,10 @@ int Journal<I>::request_resync(I *image_ctx) {

cls::journal::Client client;
journal::ImageClientMeta client_meta;
uint64_t tag_tid;
journal::TagData tag_data;
int r = open_journaler(image_ctx->cct, &journaler, &client, &client_meta,
&tag_data);
&tag_tid, &tag_data);
BOOST_SCOPE_EXIT_ALL(&journaler) {
journaler.shut_down();
};
Expand Down Expand Up @@ -522,9 +518,10 @@ int Journal<I>::promote(I *image_ctx) {

cls::journal::Client client;
journal::ImageClientMeta client_meta;
uint64_t tag_tid;
journal::TagData tag_data;
int r = open_journaler(image_ctx->cct, &journaler, &client, &client_meta,
&tag_data);
&tag_tid, &tag_data);
BOOST_SCOPE_EXIT_ALL(&journaler) {
journaler.shut_down();
};
Expand All @@ -533,9 +530,21 @@ int Journal<I>::promote(I *image_ctx) {
return r;
}

journal::TagPredecessor predecessor;
if (tag_data.mirror_uuid == ORPHAN_MIRROR_UUID) {
// orderly promotion -- demotion epoch will have a single entry
// so link to our predecessor (demotion) epoch
predecessor = journal::TagPredecessor{
ORPHAN_MIRROR_UUID, true, tag_tid, 1};
} else {
// forced promotion -- create an epoch no peers can link against
predecessor = journal::TagPredecessor{
LOCAL_MIRROR_UUID, true, tag_tid, 0};
}

cls::journal::Tag new_tag;
r = allocate_journaler_tag(cct, &journaler, client, client_meta.tag_class,
tag_data, LOCAL_MIRROR_UUID, &new_tag);
predecessor, LOCAL_MIRROR_UUID, &new_tag);
if (r < 0) {
return r;
}
Expand Down Expand Up @@ -624,6 +633,11 @@ bool Journal<I>::is_tag_owner() const {
return (m_tag_data.mirror_uuid == LOCAL_MIRROR_UUID);
}

// Returns the current value of m_tag_tid.
// No lock is taken inside this accessor.
// NOTE(review): confirm callers synchronize access to m_tag_tid as required.
template <typename I>
uint64_t Journal<I>::get_tag_tid() const {
return m_tag_tid;
}

template <typename I>
journal::TagData Journal<I>::get_tag_data() const {
return m_tag_data;
Expand All @@ -645,9 +659,19 @@ int Journal<I>::demote() {
return r;
}

assert(m_tag_data.mirror_uuid == LOCAL_MIRROR_UUID);
journal::TagPredecessor predecessor;
predecessor.mirror_uuid = LOCAL_MIRROR_UUID;
if (!client.commit_position.object_positions.empty()) {
auto position = client.commit_position.object_positions.front();
predecessor.commit_valid = true;
predecessor.tag_tid = position.tag_tid;
predecessor.entry_tid = position.entry_tid;
}

cls::journal::Tag new_tag;
r = allocate_journaler_tag(cct, m_journaler, client, m_tag_class,
m_tag_data, ORPHAN_MIRROR_UUID, &new_tag);
predecessor, ORPHAN_MIRROR_UUID, &new_tag);
if (r < 0) {
return r;
}
Expand Down Expand Up @@ -697,9 +721,8 @@ void Journal<I>::allocate_local_tag(Context *on_finish) {
CephContext *cct = m_image_ctx.cct;
ldout(cct, 20) << this << " " << __func__ << dendl;

bool predecessor_commit_valid = false;
uint64_t predecessor_tag_tid = 0;
uint64_t predecessor_entry_tid = 0;
journal::TagPredecessor predecessor;
predecessor.mirror_uuid = LOCAL_MIRROR_UUID;
{
Mutex::Locker locker(m_lock);
assert(m_journaler != nullptr && is_tag_owner());
Expand All @@ -718,22 +741,18 @@ void Journal<I>::allocate_local_tag(Context *on_finish) {
assert(m_tag_data.mirror_uuid == LOCAL_MIRROR_UUID);
if (!client.commit_position.object_positions.empty()) {
auto position = client.commit_position.object_positions.front();
predecessor_commit_valid = true;
predecessor_tag_tid = position.tag_tid;
predecessor_entry_tid = position.entry_tid;
predecessor.commit_valid = true;
predecessor.tag_tid = position.tag_tid;
predecessor.entry_tid = position.entry_tid;
}
}

allocate_tag(LOCAL_MIRROR_UUID, LOCAL_MIRROR_UUID, predecessor_commit_valid,
predecessor_tag_tid, predecessor_entry_tid, on_finish);
allocate_tag(LOCAL_MIRROR_UUID, predecessor, on_finish);
}

template <typename I>
void Journal<I>::allocate_tag(const std::string &mirror_uuid,
const std::string &predecessor_mirror_uuid,
bool predecessor_commit_valid,
uint64_t predecessor_tag_tid,
uint64_t predecessor_entry_tid,
const journal::TagPredecessor &predecessor,
Context *on_finish) {
CephContext *cct = m_image_ctx.cct;
ldout(cct, 20) << this << " " << __func__ << ": mirror_uuid=" << mirror_uuid
Expand All @@ -744,10 +763,7 @@ void Journal<I>::allocate_tag(const std::string &mirror_uuid,

journal::TagData tag_data;
tag_data.mirror_uuid = mirror_uuid;
tag_data.predecessor_mirror_uuid = predecessor_mirror_uuid;
tag_data.predecessor_commit_valid = predecessor_commit_valid;
tag_data.predecessor_tag_tid = predecessor_tag_tid;
tag_data.predecessor_entry_tid = predecessor_entry_tid;
tag_data.predecessor = predecessor;

bufferlist tag_bl;
::encode(tag_data, tag_bl);
Expand Down
6 changes: 3 additions & 3 deletions src/librbd/Journal.h
Expand Up @@ -118,14 +118,14 @@ class Journal {
void close(Context *on_finish);

bool is_tag_owner() const;
uint64_t get_tag_tid() const;
journal::TagData get_tag_data() const;
int demote();

void allocate_local_tag(Context *on_finish);
void allocate_tag(const std::string &mirror_uuid,
const std::string &predecessor_mirror_uuid,
bool predecessor_commit_valid, uint64_t predecessor_tag_tid,
uint64_t predecessor_entry_tid, Context *on_finish);
const journal::TagPredecessor &predecessor,
Context *on_finish);

void flush_commit_position(Context *on_finish);

Expand Down
9 changes: 9 additions & 0 deletions src/librbd/internal.cc
Expand Up @@ -3106,6 +3106,15 @@ int mirror_image_disable_internal(ImageCtx *ictx, bool force,
return -EINVAL;
}

// avoid accepting new requests from peers while we demote
// the image
ictx->exclusive_lock->block_requests(0);
BOOST_SCOPE_EXIT_ALL( (ictx) ) {
if (ictx->exclusive_lock != nullptr) {
ictx->exclusive_lock->unblock_requests();
}
};

C_SaferCond lock_ctx;
ictx->exclusive_lock->request_lock(&lock_ctx);

Expand Down
58 changes: 39 additions & 19 deletions src/librbd/journal/Types.cc
Expand Up @@ -513,29 +513,42 @@ void ClientData::generate_test_instances(std::list<ClientData *> &o) {

// Journal Tag

// Appends this predecessor's fields to bl in a fixed order:
// mirror_uuid, commit_valid, tag_tid, entry_tid.
// TagPredecessor::decode() reads the fields back in the same order, so any
// change to the order here must be mirrored there.
void TagPredecessor::encode(bufferlist& bl) const {
::encode(mirror_uuid, bl);
::encode(commit_valid, bl);
::encode(tag_tid, bl);
::encode(entry_tid, bl);
}

// Populates this predecessor from the iterator, reading the fields in the
// same order TagPredecessor::encode() writes them:
// mirror_uuid, commit_valid, tag_tid, entry_tid.
void TagPredecessor::decode(bufferlist::iterator& it) {
::decode(mirror_uuid, it);
::decode(commit_valid, it);
::decode(tag_tid, it);
::decode(entry_tid, it);
}

// Writes the predecessor's four fields to the supplied formatter.
// commit_valid is emitted as the string "true" or "false" rather than as a
// native boolean value.
void TagPredecessor::dump(Formatter *f) const {
  const char *commit_valid_text = "false";
  if (commit_valid) {
    commit_valid_text = "true";
  }

  f->dump_string("mirror_uuid", mirror_uuid);
  f->dump_string("commit_valid", commit_valid_text);
  f->dump_unsigned("tag_tid", tag_tid);
  f->dump_unsigned("entry_tid", entry_tid);
}

void TagData::encode(bufferlist& bl) const {
::encode(mirror_uuid, bl);
::encode(predecessor_mirror_uuid, bl);
::encode(predecessor_commit_valid, bl);
::encode(predecessor_tag_tid, bl);
::encode(predecessor_entry_tid, bl);
predecessor.encode(bl);
}

void TagData::decode(bufferlist::iterator& it) {
::decode(mirror_uuid, it);
::decode(predecessor_mirror_uuid, it);
::decode(predecessor_commit_valid, it);
::decode(predecessor_tag_tid, it);
::decode(predecessor_entry_tid, it);
predecessor.decode(it);
}

void TagData::dump(Formatter *f) const {
f->dump_string("mirror_uuid", mirror_uuid);
f->dump_string("predecessor_mirror_uuid", predecessor_mirror_uuid);
f->dump_string("predecessor_commit_valid",
predecessor_commit_valid ? "true" : "false");
f->dump_unsigned("predecessor_tag_tid", predecessor_tag_tid);
f->dump_unsigned("predecessor_entry_tid", predecessor_entry_tid);
f->open_object_section("predecessor");
predecessor.dump(f);
f->close_section();
}

void TagData::generate_test_instances(std::list<TagData *> &o) {
Expand Down Expand Up @@ -669,19 +682,26 @@ std::ostream &operator<<(std::ostream &out, const MirrorPeerClientMeta &meta) {
return out;
}

std::ostream &operator<<(std::ostream &out, const TagData &tag_data) {
std::ostream &operator<<(std::ostream &out, const TagPredecessor &predecessor) {
out << "["
<< "mirror_uuid=" << tag_data.mirror_uuid << ", "
<< "predecessor_mirror_uuid=" << tag_data.predecessor_mirror_uuid;
if (tag_data.predecessor_commit_valid) {
<< "mirror_uuid=" << predecessor.mirror_uuid;
if (predecessor.commit_valid) {
out << ", "
<< "predecessor_tag_tid=" << tag_data.predecessor_tag_tid << ", "
<< "predecessor_entry_tid=" << tag_data.predecessor_entry_tid;
<< "tag_tid=" << predecessor.tag_tid << ", "
<< "entry_tid=" << predecessor.entry_tid;
}
out << "]";
return out;
}

// Streams a TagData as "[mirror_uuid=<uuid>, predecessor=<predecessor>]",
// delegating the predecessor part to the TagPredecessor stream operator.
std::ostream &operator<<(std::ostream &out, const TagData &tag_data) {
  out << "[";
  out << "mirror_uuid=" << tag_data.mirror_uuid;
  out << ", ";
  out << "predecessor=" << tag_data.predecessor;
  out << "]";
  return out;
}

} // namespace journal
} // namespace librbd

38 changes: 30 additions & 8 deletions src/librbd/journal/Types.h
Expand Up @@ -463,15 +463,38 @@ struct ClientData {

// Journal Tag data structures

// Describes the predecessor of a journal tag: the mirror uuid it belongs to
// and, when commit_valid is set, the tag/entry position it maps to.
struct TagPredecessor {
  std::string mirror_uuid; // empty if local
  bool commit_valid = false;
  uint64_t tag_tid = 0;
  uint64_t entry_tid = 0;

  // Leaves every field at its in-class default.
  TagPredecessor() {
  }

  // Initializes all four fields from the supplied values.
  TagPredecessor(const std::string &mirror_uuid, bool commit_valid,
                 uint64_t tag_tid, uint64_t entry_tid)
    : mirror_uuid(mirror_uuid),
      commit_valid(commit_valid),
      tag_tid(tag_tid),
      entry_tid(entry_tid) {
  }

  // Two predecessors are equal only when all four fields match.
  inline bool operator==(const TagPredecessor &rhs) const {
    if (mirror_uuid != rhs.mirror_uuid) {
      return false;
    }
    if (commit_valid != rhs.commit_valid) {
      return false;
    }
    if (tag_tid != rhs.tag_tid) {
      return false;
    }
    return entry_tid == rhs.entry_tid;
  }

  void encode(bufferlist& bl) const;
  void decode(bufferlist::iterator& it);
  void dump(Formatter *f) const;
};

struct TagData {
// owner of the tag (exclusive lock epoch)
std::string mirror_uuid; // empty if local

// mapping to last committed record of previous tag
std::string predecessor_mirror_uuid; // empty if local
bool predecessor_commit_valid = false;
uint64_t predecessor_tag_tid = 0;
uint64_t predecessor_entry_tid = 0;
TagPredecessor predecessor;

TagData() {
}
Expand All @@ -482,10 +505,8 @@ struct TagData {
bool predecessor_commit_valid,
uint64_t predecessor_tag_tid, uint64_t predecessor_entry_tid)
: mirror_uuid(mirror_uuid),
predecessor_mirror_uuid(predecessor_mirror_uuid),
predecessor_commit_valid(predecessor_commit_valid),
predecessor_tag_tid(predecessor_tag_tid),
predecessor_entry_tid(predecessor_entry_tid) {
predecessor(predecessor_mirror_uuid, predecessor_commit_valid,
predecessor_tag_tid, predecessor_entry_tid) {
}

void encode(bufferlist& bl) const;
Expand All @@ -501,6 +522,7 @@ std::ostream &operator<<(std::ostream &out, const ImageClientMeta &meta);
std::ostream &operator<<(std::ostream &out, const MirrorPeerSyncPoint &sync);
std::ostream &operator<<(std::ostream &out, const MirrorPeerState &meta);
std::ostream &operator<<(std::ostream &out, const MirrorPeerClientMeta &meta);
std::ostream &operator<<(std::ostream &out, const TagPredecessor &predecessor);
std::ostream &operator<<(std::ostream &out, const TagData &tag_data);

enum class ListenerType : int8_t {
Expand Down

0 comments on commit f7ae584

Please sign in to comment.