@@ -1594,7 +1594,7 @@ ACTOR Future<std::vector<TesterInterface>> getTesters(Reference<AsyncVar<Optiona
1594
1594
}
1595
1595
when (wait (cc->onChange ())) {}
1596
1596
when (wait (testerTimeout)) {
1597
- TraceEvent (SevError , " TesterRecruitmentTimeout" ).log ();
1597
+ TraceEvent (SevWarn , " TesterRecruitmentTimeout" ).log ();
1598
1598
throw timed_out ();
1599
1599
}
1600
1600
}
@@ -1792,6 +1792,7 @@ ACTOR Future<Void> runConsistencyCheckerUrgentCore(Reference<AsyncVar<Optional<C
1792
1792
state std::vector<TesterInterface> ts; // used to store testers interface
1793
1793
state std::vector<KeyRange> rangesToCheck; // get from globalProgressMap
1794
1794
state std::vector<KeyRange> shardsToCheck; // get from keyServer metadata
1795
+ state Optional<double > whenFailedToGetTesterStart;
1795
1796
1796
1797
// Initialize globalProgressMap
1797
1798
Optional<std::vector<KeyRange>> rangesToCheck_ = loadRangesToCheckFromKnob ();
@@ -1838,7 +1839,19 @@ ACTOR Future<Void> runConsistencyCheckerUrgentCore(Reference<AsyncVar<Optional<C
1838
1839
// Step 2: Get testers
1839
1840
ts.clear ();
1840
1841
if (!testers.present ()) { // In real clusters
1841
- wait (store (ts, getTesters (cc, minTestersExpected)));
1842
+ try {
1843
+ wait (store (ts, getTesters (cc, minTestersExpected)));
1844
+ whenFailedToGetTesterStart.reset ();
1845
+ } catch (Error& e) {
1846
+ if (e.code () == error_code_timed_out) {
1847
+ if (!whenFailedToGetTesterStart.present ()) {
1848
+ whenFailedToGetTesterStart = now ();
1849
+ } else if (now () - whenFailedToGetTesterStart.get () > 3600 * 24 ) { // 1 day
1850
+ TraceEvent (SevError, " TesterRecruitmentTimeout" ).log ();
1851
+ }
1852
+ }
1853
+ throw e;
1854
+ }
1842
1855
if (g_network->isSimulated () && deterministicRandom ()->random01 () < 0.05 ) {
1843
1856
throw operation_failed (); // Introduce random failure
1844
1857
}
0 commit comments