Skip to content

Commit

Permalink
Log details of non-green indicators in HealthPeriodicLogger
Browse files Browse the repository at this point in the history
This commit adds the details of each indicator that is not green to the fields
logged by `HealthPeriodicLogger`.

An example of a regular (green) log message:

```
[2024-05-03T13:42:34,346][INFO ][o.e.h.HealthPeriodicLogger] [runTask-0] elasticsearch.health.data_stream_lifecycle.status="green" elasticsearch.health.disk.status="green" elasticsearch.health.ilm.status="green" elasticsearch.health.master_is_stable.status="green" elasticsearch.health.overall.status="green" elasticsearch.health.repository_integrity.status="green" elasticsearch.health.shards_availability.status="green" elasticsearch.health.shards_capacity.status="green" elasticsearch.health.slm.status="green" message="health=green"
```

And a message with details while the cluster is non-green:

```
[2024-05-03T13:43:34,339][INFO ][o.e.h.HealthPeriodicLogger] [runTask-0] elasticsearch.health.data_stream_lifecycle.status="green" elasticsearch.health.disk.status="green" elasticsearch.health.ilm.status="green" elasticsearch.health.master_is_stable.status="green" elasticsearch.health.overall.status="yellow" elasticsearch.health.repository_integrity.status="green" elasticsearch.health.shards_availability.details="{"initializing_primaries":0,"creating_replicas":0,"started_replicas":0,"unassigned_primaries":0,"restarting_replicas":0,"creating_primaries":0,"initializing_replicas":0,"unassigned_replicas":1,"started_primaries":2,"restarting_primaries":0}" elasticsearch.health.shards_availability.status="yellow" elasticsearch.health.shards_capacity.status="green" elasticsearch.health.slm.status="green" message="health=yellow [shards_availability]"
```
  • Loading branch information
dakrone committed May 3, 2024
1 parent fd3aa53 commit 1aa9fbb
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 8 deletions.
Expand Up @@ -17,6 +17,7 @@
import org.elasticsearch.cluster.ClusterStateListener;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.component.AbstractLifecycleComponent;
import org.elasticsearch.common.component.Lifecycle;
import org.elasticsearch.common.component.LifecycleListener;
Expand Down Expand Up @@ -311,7 +312,7 @@ boolean tryToLogHealth() {
RunOnce release = new RunOnce(currentlyRunning::release);
try {
ActionListener<List<HealthIndicatorResult>> listenerWithRelease = ActionListener.runAfter(resultsListener, release);
this.healthService.getHealth(this.client, null, false, 0, listenerWithRelease);
this.healthService.getHealth(this.client, null, true, 0, listenerWithRelease);
} catch (Exception e) {
// In case of an exception before the listener was wired, we can release the flag here, and we feel safe
// that it will not release it again because this can only be run once.
Expand Down Expand Up @@ -359,6 +360,12 @@ static Map<String, Object> convertToLoggedFields(List<HealthIndicatorResult> ind
String.format(Locale.ROOT, "%s.%s.status", HEALTH_FIELD_PREFIX, indicatorResult.name()),
indicatorResult.status().xContentValue()
);
if (GREEN.equals(indicatorResult.status()) == false && indicatorResult.details() != null) {
result.put(
String.format(Locale.ROOT, "%s.%s.details", HEALTH_FIELD_PREFIX, indicatorResult.name()),
Strings.toString(indicatorResult.details())
);
}
});

// message field. Show the non-green indicators if they exist.
Expand Down
Expand Up @@ -2168,7 +2168,7 @@ private static ClusterState createClusterStateWith(
.build();
}

private static Map<String, Object> addDefaults(Map<String, Object> override) {
public static Map<String, Object> addDefaults(Map<String, Object> override) {
return Map.of(
"unassigned_primaries",
override.getOrDefault("unassigned_primaries", 0),
Expand Down
Expand Up @@ -18,7 +18,9 @@
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
import org.elasticsearch.cluster.node.DiscoveryNodeUtils;
import org.elasticsearch.cluster.routing.allocation.shards.ShardsAvailabilityHealthIndicatorServiceTests;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.component.Lifecycle;
import org.elasticsearch.common.logging.ESLogMessage;
import org.elasticsearch.common.scheduler.SchedulerEngine;
Expand Down Expand Up @@ -122,13 +124,36 @@ public void testConvertToLoggedFields() {

Map<String, Object> loggerResults = HealthPeriodicLogger.convertToLoggedFields(results);

// verify that the number of fields is the number of indicators + 2 (for overall and for message)
assertThat(loggerResults.size(), equalTo(results.size() + 2));
// verify that the number of fields is the number of indicators + 4
// (for overall and for message, plus details for the two yellow indicators)
assertThat(loggerResults.size(), equalTo(results.size() + 4));

// test indicator status
assertThat(loggerResults.get(makeHealthStatusString("master_is_stable")), equalTo("green"));
assertThat(loggerResults.get(makeHealthStatusString("disk")), equalTo("yellow"));
assertThat(
loggerResults.get(makeHealthDetailsString("disk")),
equalTo(
getTestIndicatorResults().stream()
.filter(i -> i.name().equals("disk"))
.findFirst()
.map(HealthIndicatorResult::details)
.map(Strings::toString)
.orElseThrow()
)
);
assertThat(loggerResults.get(makeHealthStatusString("shards_availability")), equalTo("yellow"));
assertThat(
loggerResults.get(makeHealthDetailsString("shards_availability")),
equalTo(
getTestIndicatorResults().stream()
.filter(i -> i.name().equals("shards_availability"))
.findFirst()
.map(HealthIndicatorResult::details)
.map(Strings::toString)
.orElseThrow()
)
);

// test calculated overall status
assertThat(loggerResults.get(makeHealthStatusString("overall")), equalTo(overallStatus.xContentValue()));
Expand Down Expand Up @@ -751,24 +776,65 @@ private void verifyLoggerIsReadyToRun(HealthPeriodicLogger healthPeriodicLogger)

/**
 * Test fixture: returns three indicator results, in order {@code master_is_stable} (GREEN),
 * {@code disk} (YELLOW) and {@code shards_availability} (YELLOW).
 *
 * NOTE(review): this span is a rendered diff hunk without +/- markers. The one-line
 * {@code slowTasks} / {@code shardsAvailable} declarations appear to be the removed lines and
 * the multi-line declarations after them their replacements -- confirm against the real file.
 */
private List<HealthIndicatorResult> getTestIndicatorResults() {
// NOTE(review): the local names (networkLatency, slowTasks) do not match the indicator names they hold.
var networkLatency = new HealthIndicatorResult("master_is_stable", GREEN, null, null, null, null);
var slowTasks = new HealthIndicatorResult("disk", YELLOW, null, null, null, null);
var shardsAvailable = new HealthIndicatorResult("shards_availability", YELLOW, null, null, null, null);
// YELLOW "disk" result; the fourth argument is presumably the indicator details -- TODO confirm
// against the HealthIndicatorResult constructor.
var slowTasks = new HealthIndicatorResult(
"disk",
YELLOW,
null,
new SimpleHealthIndicatorDetails(
Map.of(
"indices_with_readonly_block",
0,
"nodes_with_enough_disk_space",
1,
"nodes_with_unknown_disk_status",
0,
"nodes_over_high_watermark",
0,
"nodes_over_flood_stage_watermark",
1
)
),
null,
null
);
// YELLOW "shards_availability" result built from addDefaults with an empty override map.
var shardsAvailable = new HealthIndicatorResult(
"shards_availability",
YELLOW,
null,
new SimpleHealthIndicatorDetails(ShardsAvailabilityHealthIndicatorServiceTests.addDefaults(Map.of())),
null,
null
);

return List.of(networkLatency, slowTasks, shardsAvailable);
}

/**
 * Test fixture: returns three indicator results ({@code master_is_stable}, {@code disk},
 * {@code shards_availability}), all with GREEN status.
 *
 * NOTE(review): this span is a rendered diff hunk without +/- markers. The one-line
 * {@code shardsAvailable} declaration appears to be the removed line and the multi-line
 * declaration after it its replacement -- confirm against the real file.
 */
private List<HealthIndicatorResult> getTestIndicatorResultsAllGreen() {
var networkLatency = new HealthIndicatorResult("master_is_stable", GREEN, null, null, null, null);
var slowTasks = new HealthIndicatorResult("disk", GREEN, null, null, null, null);
var shardsAvailable = new HealthIndicatorResult("shards_availability", GREEN, null, null, null, null);
// GREEN result that still carries a SimpleHealthIndicatorDetails payload. Presumably this lets
// tests check that details of green indicators are not logged -- verify against the callers.
var shardsAvailable = new HealthIndicatorResult(
"shards_availability",
GREEN,
null,
new SimpleHealthIndicatorDetails(ShardsAvailabilityHealthIndicatorServiceTests.addDefaults(Map.of())),
null,
null
);

return List.of(networkLatency, slowTasks, shardsAvailable);
}

/**
 * Test fixture: returns three indicator results where {@code master_is_stable} and {@code disk}
 * are GREEN and {@code shards_availability} is RED.
 *
 * NOTE(review): this span is a rendered diff hunk without +/- markers. The one-line
 * {@code shardsAvailable} declaration appears to be the removed line and the multi-line
 * declaration after it its replacement -- confirm against the real file.
 */
private List<HealthIndicatorResult> getTestIndicatorResultsWithRed() {
var networkLatency = new HealthIndicatorResult("master_is_stable", GREEN, null, null, null, null);
var slowTasks = new HealthIndicatorResult("disk", GREEN, null, null, null, null);
var shardsAvailable = new HealthIndicatorResult("shards_availability", RED, null, null, null, null);
// RED result whose details come from addDefaults with "unassigned_primaries" overridden to 1.
var shardsAvailable = new HealthIndicatorResult(
"shards_availability",
RED,
null,
new SimpleHealthIndicatorDetails(ShardsAvailabilityHealthIndicatorServiceTests.addDefaults(Map.of("unassigned_primaries", 1))),
null,
null
);

return List.of(networkLatency, slowTasks, shardsAvailable);
}
Expand All @@ -777,6 +843,10 @@ private String makeHealthStatusString(String key) {
return String.format(Locale.ROOT, "%s.%s.status", HealthPeriodicLogger.HEALTH_FIELD_PREFIX, key);
}

/**
 * Builds the logged-field name that holds the details of a health indicator.
 *
 * @param key the indicator name to embed in the field name
 * @return {@link HealthPeriodicLogger#HEALTH_FIELD_PREFIX}, {@code key} and the literal
 *         {@code details}, joined with dots
 */
private String makeHealthDetailsString(String key) {
    // Locale.ROOT keeps the formatting independent of the JVM's default locale.
    final String fieldNameTemplate = "%s.%s.details";
    return String.format(Locale.ROOT, fieldNameTemplate, HealthPeriodicLogger.HEALTH_FIELD_PREFIX, key);
}

private HealthPeriodicLogger createAndInitHealthPeriodicLogger(
ClusterService clusterService,
HealthService testHealthService,
Expand Down

0 comments on commit 1aa9fbb

Please sign in to comment.