Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

hammer: clock skew report is incorrect by ceph health detail command #8051

Merged
merged 2 commits into from Apr 6, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions src/common/config_opts.h
Expand Up @@ -204,6 +204,7 @@ OPTION(mon_clock_drift_allowed, OPT_FLOAT, .050) // allowed clock drift between
OPTION(mon_clock_drift_warn_backoff, OPT_FLOAT, 5) // exponential backoff for clock drift warnings
OPTION(mon_timecheck_interval, OPT_FLOAT, 300.0) // on leader, timecheck (clock drift check) interval (seconds)
OPTION(mon_accept_timeout, OPT_FLOAT, 10.0) // on leader, if paxos update isn't accepted
OPTION(mon_timecheck_skew_interval, OPT_FLOAT, 30.0) // on leader, timecheck (clock drift check) interval when in presence of a skew (seconds)
OPTION(mon_pg_create_interval, OPT_FLOAT, 30.0) // no more than every 30s
OPTION(mon_pg_stuck_threshold, OPT_INT, 300) // number of seconds after which pgs can be considered inactive, unclean, or stale (see doc/control.rst under dump_stuck for more info)
OPTION(mon_pg_warn_min_per_osd, OPT_INT, 30) // min # pgs per (in) osd before we warn the admin
Expand Down
81 changes: 71 additions & 10 deletions src/mon/Monitor.cc
Expand Up @@ -178,6 +178,7 @@ Monitor::Monitor(CephContext* cct_, string nm, MonitorDBStore *s,

timecheck_round(0),
timecheck_acks(0),
timecheck_rounds_since_clean(0),
timecheck_event(NULL),

probe_timeout_event(NULL),
Expand Down Expand Up @@ -3685,8 +3686,7 @@ void Monitor::timecheck_start_round()
timecheck();
out:
dout(10) << __func__ << " setting up next event" << dendl;
timecheck_event = new C_TimeCheck(this);
timer.add_event_after(g_conf->mon_timecheck_interval, timecheck_event);
timecheck_reset_event();
}

void Monitor::timecheck_finish_round(bool success)
Expand All @@ -3700,6 +3700,7 @@ void Monitor::timecheck_finish_round(bool success)
assert(timecheck_waiting.empty());
assert(timecheck_acks == quorum.size());
timecheck_report();
timecheck_check_skews();
return;
}

Expand Down Expand Up @@ -3733,6 +3734,69 @@ void Monitor::timecheck_cleanup()
timecheck_waiting.clear();
timecheck_skews.clear();
timecheck_latencies.clear();

timecheck_rounds_since_clean = 0;
}

/**
 * (Re)schedule the next timecheck event.
 *
 * Any pending timecheck event is cancelled first.  The new delay is
 * 'mon_timecheck_skew_interval' multiplied by the number of rounds with
 * skews since the last clean round; whenever that product is not positive
 * or is larger than 'mon_timecheck_interval', the regular
 * 'mon_timecheck_interval' is used instead.
 */
void Monitor::timecheck_reset_event()
{
  // Drop whatever event is still queued so only one is ever pending.
  if (timecheck_event != NULL) {
    timer.cancel_event(timecheck_event);
    timecheck_event = NULL;
  }

  // Back off linearly with the number of consecutive skewed rounds.
  double next_delay =
    cct->_conf->mon_timecheck_skew_interval * timecheck_rounds_since_clean;

  // A zero round count yields a zero delay; a large one overshoots the
  // regular interval.  Both cases fall back to the regular interval.
  const bool out_of_range =
    (next_delay <= 0) || (next_delay > cct->_conf->mon_timecheck_interval);
  if (out_of_range) {
    next_delay = cct->_conf->mon_timecheck_interval;
  }

  dout(10) << __func__ << " delay " << next_delay
           << " rounds_since_clean " << timecheck_rounds_since_clean
           << dendl;

  timecheck_event = new C_TimeCheck(this);
  timer.add_event_after(next_delay, timecheck_event);
}

void Monitor::timecheck_check_skews()
{
dout(10) << __func__ << dendl;
assert(is_leader());
assert((timecheck_round % 2) == 0);
if (monmap->size() == 1) {
assert(0 == "We are alone; we shouldn't have gotten here!");
return;
}
assert(timecheck_latencies.size() == timecheck_skews.size());

bool found_skew = false;
for (map<entity_inst_t, double>::iterator p = timecheck_skews.begin();
p != timecheck_skews.end(); ++p) {

double abs_skew;
if (timecheck_has_skew(p->second, &abs_skew)) {
dout(10) << __func__
<< " " << p->first << " skew " << abs_skew << dendl;
found_skew = true;
}
}

if (found_skew) {
++timecheck_rounds_since_clean;
timecheck_reset_event();
} else if (timecheck_rounds_since_clean > 0) {
dout(1) << __func__
<< " no clock skews found after " << timecheck_rounds_since_clean
<< " rounds" << dendl;
// make sure the skews are really gone and not just a transient success
// this will run just once if not in the presence of skews again.
timecheck_rounds_since_clean = 1;
timecheck_reset_event();
timecheck_rounds_since_clean = 0;
}

}

void Monitor::timecheck_report()
Expand All @@ -3755,7 +3819,8 @@ void Monitor::timecheck_report()
m->epoch = get_epoch();
m->round = timecheck_round;

for (map<entity_inst_t, double>::iterator it = timecheck_skews.begin(); it != timecheck_skews.end(); ++it) {
for (map<entity_inst_t, double>::iterator it = timecheck_skews.begin();
it != timecheck_skews.end(); ++it) {
double skew = it->second;
double latency = timecheck_latencies[it->first];

Expand Down Expand Up @@ -3814,10 +3879,10 @@ health_status_t Monitor::timecheck_status(ostringstream &ss,
const double latency)
{
health_status_t status = HEALTH_OK;
double abs_skew = (skew_bound > 0 ? skew_bound : -skew_bound);
assert(latency >= 0);

if (abs_skew > g_conf->mon_clock_drift_allowed) {
double abs_skew;
if (timecheck_has_skew(skew_bound, &abs_skew)) {
status = HEALTH_WARN;
ss << "clock skew " << abs_skew << "s"
<< " > max " << g_conf->mon_clock_drift_allowed << "s";
Expand Down Expand Up @@ -3931,11 +3996,7 @@ void Monitor::handle_timecheck_leader(MTimeCheck *m)
<< " delta " << delta << " skew_bound " << skew_bound
<< " latency " << latency << dendl;

if (timecheck_skews.count(other) == 0) {
timecheck_skews[other] = skew_bound;
} else {
timecheck_skews[other] = (timecheck_skews[other]*0.8)+(skew_bound*0.2);
}
timecheck_skews[other] = skew_bound;

timecheck_acks++;
if (timecheck_acks == quorum.size()) {
Expand Down
22 changes: 22 additions & 0 deletions src/mon/Monitor.h
Expand Up @@ -54,6 +54,7 @@
#include "include/memory.h"
#include "include/str_map.h"
#include <errno.h>
#include <cmath>


#define CEPH_MON_PROTOCOL 13 /* cluster internal */
Expand Down Expand Up @@ -463,6 +464,15 @@ class Monitor : public Dispatcher,
version_t timecheck_round;
unsigned int timecheck_acks;
utime_t timecheck_round_start;
/* When we hit a skew we will start a new round based off of
* 'mon_timecheck_skew_interval'. Each new round will be backed off
* until we hit 'mon_timecheck_interval' -- which is the typical
* interval when not in the presence of a skew.
*
* This variable tracks the number of rounds with skews since last clean
* so that we can report to the user and properly adjust the backoff.
*/
uint64_t timecheck_rounds_since_clean;
/**
* Time Check event.
*/
Expand All @@ -482,6 +492,8 @@ class Monitor : public Dispatcher,
void timecheck_finish_round(bool success = true);
void timecheck_cancel_round();
void timecheck_cleanup();
void timecheck_reset_event();
void timecheck_check_skews();
void timecheck_report();
void timecheck();
health_status_t timecheck_status(ostringstream &ss,
Expand All @@ -490,6 +502,16 @@ class Monitor : public Dispatcher,
void handle_timecheck_leader(MTimeCheck *m);
void handle_timecheck_peon(MTimeCheck *m);
void handle_timecheck(MTimeCheck *m);

/**
 * Tell whether a skew bound is large enough to be considered a clock skew.
 *
 * @param skew_bound  signed skew bound to evaluate
 * @param abs         optional output; when non-null it receives the
 *                    absolute value of @p skew_bound
 * @return 'true' if the absolute skew is strictly greater than
 *         'mon_clock_drift_allowed'; 'false' otherwise.
 */
bool timecheck_has_skew(const double skew_bound, double *abs) const {
  const double magnitude = std::fabs(skew_bound);
  const bool exceeds_allowed = magnitude > g_conf->mon_clock_drift_allowed;
  if (abs) {
    *abs = magnitude;
  }
  return exceeds_allowed;
}
/**
* @}
*/
Expand Down