Skip to content

Commit

Permalink
Merge pull request #11413: jewel: MDS goes damaged on blacklist (failed to read JournalPointer: -108 ((108) Cannot send after transport endpoint shutdown)
Browse files Browse the repository at this point in the history

Reviewed-by: Loic Dachary <ldachary@redhat.com>
  • Loading branch information
Loic Dachary committed Oct 13, 2016
2 parents 4aef89d + 2ee3e54 commit 292f39a
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 3 deletions.
5 changes: 4 additions & 1 deletion src/ceph_mds.cc
Expand Up @@ -136,9 +136,12 @@ int main(int argc, const char **argv)
"MDS names may not start with a numeric digit." << dendl;
}

uint64_t nonce = 0;
get_random_bytes((char*)&nonce, sizeof(nonce));

Messenger *msgr = Messenger::create(g_ceph_context, g_conf->ms_type,
entity_name_t::MDS(-1), "mds",
getpid());
nonce);
if (!msgr)
exit(1);
msgr->set_cluster_protocol(CEPH_MDS_PROTOCOL);
Expand Down
16 changes: 14 additions & 2 deletions src/mds/MDLog.cc
Expand Up @@ -910,6 +910,10 @@ void MDLog::_recovery_thread(MDSInternalContextBase *completion)
int write_result = jp.save(mds->objecter);
// Nothing graceful we can do for this
assert(write_result >= 0);
} else if (read_result == -EBLACKLISTED) {
derr << "Blacklisted during JournalPointer read! Respawning..." << dendl;
mds->respawn();
assert(0); // Should be unreachable because respawn calls execv
} else if (read_result != 0) {
mds->clog->error() << "failed to read JournalPointer: " << read_result
<< " (" << cpp_strerror(read_result) << ")";
Expand All @@ -936,7 +940,11 @@ void MDLog::_recovery_thread(MDSInternalContextBase *completion)
C_SaferCond recover_wait;
back.recover(&recover_wait);
int recovery_result = recover_wait.wait();
if (recovery_result != 0) {
if (recovery_result == -EBLACKLISTED) {
derr << "Blacklisted during journal recovery! Respawning..." << dendl;
mds->respawn();
assert(0); // Should be unreachable because respawn calls execv
} else if (recovery_result != 0) {
// Journaler.recover succeeds if no journal objects are present: an error
// means something worse like a corrupt header, which we can't handle here.
mds->clog->error() << "Error recovering journal " << jp.front << ": "
Expand Down Expand Up @@ -979,7 +987,11 @@ void MDLog::_recovery_thread(MDSInternalContextBase *completion)
int recovery_result = recover_wait.wait();
dout(4) << "Journal " << jp.front << " recovered." << dendl;

if (recovery_result != 0) {
if (recovery_result == -EBLACKLISTED) {
derr << "Blacklisted during journal recovery! Respawning..." << dendl;
mds->respawn();
assert(0); // Should be unreachable because respawn calls execv
} else if (recovery_result != 0) {
mds->clog->error() << "Error recovering journal " << jp.front << ": "
<< cpp_strerror(recovery_result);
mds->damaged_unlocked();
Expand Down

0 comments on commit 292f39a

Please sign in to comment.