Fix node health-check-related test failures
In elastic#52680 we introduced a new health check mechanism. This commit fixes
some related sporadic test failures, and slightly improves the behaviour of
the `FollowersChecker` in the case that no retries are configured.

Closes elastic#59252
Closes elastic#59172
DaveCTurner committed Jul 9, 2020
1 parent a251bf3 commit 8fa17a5
Showing 4 changed files with 26 additions and 49 deletions.
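
In outline, the FollowersChecker change reorders the failure classification in FollowerChecker.handleException so that the specific "disconnected" and "health check failed" reasons are checked before the generic retry-count check, and are therefore still reported when the retry count is already exhausted (for example when no retries are configured). A condensed sketch of the new ordering, reconstructed from the first diff below with the debug logging omitted (the trailing return in the retry branch is assumed from code outside the shown hunk):

    final String reason;
    if (exp instanceof ConnectTransportException
            || exp.getCause() instanceof ConnectTransportException) {
        reason = "disconnected";                         // specific reason, checked first
    } else if (exp.getCause() instanceof NodeHealthCheckFailureException) {
        reason = "health check failed";                  // specific reason, checked before the retry count
    } else if (failureCountSinceLastSuccess >= followerCheckRetryCount) {
        reason = "followers check retry count exceeded"; // generic reason, now only a fallback
    } else {
        scheduleNextWakeUp();                            // not failed yet: try another check
        return;                                          // assumed; outside the shown hunk
    }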
FollowersChecker.java
@@ -327,16 +327,16 @@ public void handleException(TransportException exp) {
failureCountSinceLastSuccess++;

final String reason;
- if (failureCountSinceLastSuccess >= followerCheckRetryCount) {
- logger.debug(() -> new ParameterizedMessage("{} failed too many times", FollowerChecker.this), exp);
- reason = "followers check retry count exceeded";
- } else if (exp instanceof ConnectTransportException
+ if (exp instanceof ConnectTransportException
|| exp.getCause() instanceof ConnectTransportException) {
logger.debug(() -> new ParameterizedMessage("{} disconnected", FollowerChecker.this), exp);
reason = "disconnected";
} else if (exp.getCause() instanceof NodeHealthCheckFailureException) {
logger.debug(() -> new ParameterizedMessage("{} health check failed", FollowerChecker.this), exp);
reason = "health check failed";
+ } else if (failureCountSinceLastSuccess >= followerCheckRetryCount) {
+ logger.debug(() -> new ParameterizedMessage("{} failed too many times", FollowerChecker.this), exp);
+ reason = "followers check retry count exceeded";
} else {
logger.debug(() -> new ParameterizedMessage("{} failed, retrying", FollowerChecker.this), exp);
scheduleNextWakeUp();
FsHealthService.java
@@ -129,7 +129,7 @@ public StatusInfo getHealth() {

class FsHealthMonitor implements Runnable {

- private static final String TEMP_FILE_NAME = ".es_temp_file";
+ static final String TEMP_FILE_NAME = ".es_temp_file";
private byte[] byteToWrite;

FsHealthMonitor(){
FollowersCheckerTests.java
@@ -169,18 +169,7 @@ protected void onSendRequest(long requestId, String action, TransportRequest req
}

public void testFailsNodeThatDoesNotRespond() {
- final Builder settingsBuilder = Settings.builder();
- if (randomBoolean()) {
- settingsBuilder.put(FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), randomIntBetween(1, 10));
- }
- if (randomBoolean()) {
- settingsBuilder.put(FOLLOWER_CHECK_INTERVAL_SETTING.getKey(), randomIntBetween(100, 100000) + "ms");
- }
- if (randomBoolean()) {
- settingsBuilder.put(FOLLOWER_CHECK_TIMEOUT_SETTING.getKey(), randomIntBetween(1, 100000) + "ms");
- }
- final Settings settings = settingsBuilder.build();
-
+ final Settings settings = randomSettings();
testBehaviourOfFailingNode(settings, () -> null,
"followers check retry count exceeded",
(FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings) - 1) * FOLLOWER_CHECK_INTERVAL_SETTING.get(settings).millis()
@@ -189,15 +178,7 @@ public void testFailsNodeThatDoesNotRespond() {
}

public void testFailsNodeThatRejectsCheck() {
- final Builder settingsBuilder = Settings.builder();
- if (randomBoolean()) {
- settingsBuilder.put(FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), randomIntBetween(1, 10));
- }
- if (randomBoolean()) {
- settingsBuilder.put(FOLLOWER_CHECK_INTERVAL_SETTING.getKey(), randomIntBetween(100, 100000) + "ms");
- }
- final Settings settings = settingsBuilder.build();
-
+ final Settings settings = randomSettings();
testBehaviourOfFailingNode(settings, () -> {
throw new ElasticsearchException("simulated exception");
},
@@ -207,15 +188,7 @@ public void testFailsNodeThatRejectsCheck() {
}

public void testFailureCounterResetsOnSuccess() {
- final Builder settingsBuilder = Settings.builder();
- if (randomBoolean()) {
- settingsBuilder.put(FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), randomIntBetween(2, 10));
- }
- if (randomBoolean()) {
- settingsBuilder.put(FOLLOWER_CHECK_INTERVAL_SETTING.getKey(), randomIntBetween(100, 100000) + "ms");
- }
- final Settings settings = settingsBuilder.build();
-
+ final Settings settings = randomSettings();
final int retryCount = FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings);
final int maxRecoveries = randomIntBetween(3, 10);

@@ -298,16 +271,7 @@ public String toString() {
}

public void testFailsNodeThatIsUnhealthy() {
- final Builder settingsBuilder = Settings.builder();
- if (randomBoolean()) {
- settingsBuilder.put(FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), randomIntBetween(1, 10));
- }
- if (randomBoolean()) {
- settingsBuilder.put(FOLLOWER_CHECK_INTERVAL_SETTING.getKey(), randomIntBetween(100, 100000) + "ms");
- }
- final Settings settings = settingsBuilder.build();
-
- testBehaviourOfFailingNode(settings, () -> {
+ testBehaviourOfFailingNode(randomSettings(), () -> {
throw new NodeHealthCheckFailureException("non writable exception");
}, "health check failed", 0, () -> new StatusInfo(HEALTHY, "healthy-info"));
}
@@ -322,7 +286,7 @@ private void testBehaviourOfFailingNode(Settings testSettings, Supplier<Transpor
final MockTransport mockTransport = new MockTransport() {
@Override
protected void onSendRequest(long requestId, String action, TransportRequest request, DiscoveryNode node) {
- assertFalse(node.equals(localNode));
+ assertNotEquals(node, localNode);
deterministicTaskQueue.scheduleNow(new Runnable() {
@Override
public void run() {
@@ -675,6 +639,20 @@ private static DiscoveryNode newNode(int nodeId, Map<String, String> attributes,
Version.CURRENT);
}

+ private static Settings randomSettings() {
+ final Builder settingsBuilder = Settings.builder();
+ if (randomBoolean()) {
+ settingsBuilder.put(FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), randomIntBetween(1, 10));
+ }
+ if (randomBoolean()) {
+ settingsBuilder.put(FOLLOWER_CHECK_INTERVAL_SETTING.getKey(), randomIntBetween(100, 100000) + "ms");
+ }
+ if (randomBoolean()) {
+ settingsBuilder.put(FOLLOWER_CHECK_TIMEOUT_SETTING.getKey(), randomIntBetween(1, 100000) + "ms");
+ }
+ return settingsBuilder.build();
+ }

private static class ExpectsSuccess implements TransportResponseHandler<Empty> {
private final AtomicBoolean responseReceived = new AtomicBoolean();

FsHealthServiceTests.java
@@ -305,8 +305,7 @@ private static class FileSystemFsyncHungProvider extends FilterFileSystemProvide
AtomicBoolean injectIOException = new AtomicBoolean();
AtomicInteger injectedPaths = new AtomicInteger();

- private String pathPrefix = "/";
- private long delay;
+ private final long delay;
private final ThreadPool threadPool;

FileSystemFsyncHungProvider(FileSystem inner, long delay, ThreadPool threadPool) {
@@ -325,7 +324,7 @@ public FileChannel newFileChannel(Path path, Set<? extends OpenOption> options,
@Override
public void force(boolean metaData) throws IOException {
if (injectIOException.get()) {
- if (path.toString().startsWith(pathPrefix) && path.toString().endsWith(".es_temp_file")) {
+ if (path.getFileName().toString().equals(FsHealthService.FsHealthMonitor.TEMP_FILE_NAME)) {
injectedPaths.incrementAndGet();
final long startTimeMillis = threadPool.relativeTimeInMillis();
do {
