Skip to content

Commit

Permalink
avoid trace SevError for TesterRecruitmentTimeout unless it keeps fai…
Browse files Browse the repository at this point in the history
…lure for over 1 day
  • Loading branch information
kakaiu committed Apr 29, 2024
1 parent bb8ba15 commit 0f7d72d
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 2 deletions.
17 changes: 17 additions & 0 deletions fdbserver/CommitProxyServer.actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,14 @@ ACTOR static Future<ResolveTransactionBatchReply> trackResolutionMetrics(Referen
return reply;
}

// Renders the key ranges of `input` as a single string, intended for use as a
// trace-event detail value.
//
// Each range is formatted with its toString() and followed by "; ", so a
// non-empty result ends with a trailing "; ". An empty input yields "".
// The second element of each pair is not included in the output.
std::string getHotShardString(std::vector<std::pair<KeyRange, double>> input) {
    std::string res;
    for (const auto& [range, expireTime] : input) {
        // Append in place: `res = res + ...` would build a fresh temporary and
        // copy the whole accumulated string on every iteration.
        res += range.toString();
        res += "; ";
    }
    return res;
}

namespace CommitBatch {

struct CommitBatchContext {
Expand Down Expand Up @@ -754,13 +762,22 @@ void CommitBatchContext::checkHotShards() {
if (isSingleKeyMutation((MutationRef::Type)m.type)) {
for (const auto& shard : pProxyCommitData->hotShards) {
if (shard.first.contains(KeyRef(m.param1))) {
TraceEvent("Zhe")
.detail("MutationType", "SingleKeyMutation")
.detail("Param1", m.param1)
.detail("HotShards", getHotShardString(pProxyCommitData->hotShards));
abortTransaction = true;
break;
}
}
} else if (m.type == MutationRef::ClearRange) {
for (const auto& shard : pProxyCommitData->hotShards) {
if (shard.first.intersects(KeyRangeRef(m.param1, m.param2))) {
TraceEvent("Zhe")
.detail("MutationType", "ClearRange")
.detail("Param1", m.param1)
.detail("Param2", m.param2)
.detail("HotShards", getHotShardString(pProxyCommitData->hotShards));
abortTransaction = true;
break;
}
Expand Down
17 changes: 15 additions & 2 deletions fdbserver/tester.actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1594,7 +1594,7 @@ ACTOR Future<std::vector<TesterInterface>> getTesters(Reference<AsyncVar<Optiona
}
when(wait(cc->onChange())) {}
when(wait(testerTimeout)) {
TraceEvent(SevError, "TesterRecruitmentTimeout").log();
TraceEvent(SevWarn, "TesterRecruitmentTimeout").log();
throw timed_out();
}
}
Expand Down Expand Up @@ -1792,6 +1792,7 @@ ACTOR Future<Void> runConsistencyCheckerUrgentCore(Reference<AsyncVar<Optional<C
state std::vector<TesterInterface> ts; // used to store testers interface
state std::vector<KeyRange> rangesToCheck; // get from globalProgressMap
state std::vector<KeyRange> shardsToCheck; // get from keyServer metadata
state Optional<double> whenFailedToGetTesterStart;

// Initialize globalProgressMap
Optional<std::vector<KeyRange>> rangesToCheck_ = loadRangesToCheckFromKnob();
Expand Down Expand Up @@ -1838,7 +1839,19 @@ ACTOR Future<Void> runConsistencyCheckerUrgentCore(Reference<AsyncVar<Optional<C
// Step 2: Get testers
ts.clear();
if (!testers.present()) { // In real clusters
wait(store(ts, getTesters(cc, minTestersExpected)));
try {
wait(store(ts, getTesters(cc, minTestersExpected)));
whenFailedToGetTesterStart.reset();
} catch (Error& e) {
if (e.code() == error_code_timed_out) {
if (!whenFailedToGetTesterStart.present()) {
whenFailedToGetTesterStart = now();
} else if (now() - whenFailedToGetTesterStart.get() > 3600 * 24) { // 1 day
TraceEvent(SevError, "TesterRecruitmentTimeout").log();
}
}
throw e;
}
if (g_network->isSimulated() && deterministicRandom()->random01() < 0.05) {
throw operation_failed(); // Introduce random failure
}
Expand Down

0 comments on commit 0f7d72d

Please sign in to comment.