Skip to content

Commit

Permalink
Add liveness to health report (#3504)
Browse files Browse the repository at this point in the history
* Add liveness to health report
  • Loading branch information
PavelZaytsev committed Jan 31, 2023
1 parent b8e5d20 commit a772add
Show file tree
Hide file tree
Showing 7 changed files with 219 additions and 4 deletions.
2 changes: 1 addition & 1 deletion infrastructure/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ ARG CORFU_TOOLS_JAR

WORKDIR /app

RUN apt update && apt -y install iptables bash jq python3 sudo iproute2 python3-pip
RUN apt update && apt -y install iptables bash jq python3 sudo iproute2 python3-pip curl
RUN python3 -m pip install pyyaml==5.4.1

COPY target/${CORFU_JAR} /usr/share/corfu/lib/${CORFU_JAR}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,22 @@

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableMap;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import lombok.extern.slf4j.Slf4j;
import org.corfudb.util.LambdaUtils;

import java.lang.management.ManagementFactory;
import java.lang.management.ThreadInfo;
import java.lang.management.ThreadMXBean;
import java.time.Duration;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;

/**
* HealthMonitor keeps track of the HealthStatus of each Component.
Expand All @@ -19,10 +29,28 @@ public final class HealthMonitor {

private final ConcurrentMap<Component, HealthStatus> componentHealthStatus;

private final ScheduledExecutorService scheduledExecutorService;

private final AtomicReference<LivenessStatus> livenessStatus;

private static final Duration LIVENESS_INTERVAL = Duration.ofMinutes(1);

private static Optional<HealthMonitor> instance = Optional.empty();

private HealthMonitor() {
this.componentHealthStatus = new ConcurrentHashMap<>();
this.livenessStatus = new AtomicReference<>(new LivenessStatus(true, ""));
this.scheduledExecutorService = Executors.newSingleThreadScheduledExecutor(
new ThreadFactoryBuilder()
.setNameFormat("HealthMonitorLivenessThread")
.build());
this.scheduledExecutorService.scheduleWithFixedDelay(
() -> LambdaUtils.runSansThrow(this::checkLiveness),
0,
LIVENESS_INTERVAL.toMillis(),
TimeUnit.MILLISECONDS
);

}

public static void init() {
Expand Down Expand Up @@ -55,6 +83,29 @@ private void addIssue(Issue issue) {
});
}

/**
 * Runs the liveness check on the current monitor instance, if one is present.
 * Does nothing when no instance is set.
 */
@VisibleForTesting
static void liveness() {
    instance.ifPresent(monitor -> monitor.checkLiveness());
}

/**
 * Looks for deadlocked threads and records the outcome in {@code livenessStatus}.
 * When no deadlock is found the status is healthy with an empty reason; otherwise
 * the status is unhealthy and the reason is the concatenated {@link ThreadInfo}
 * dump (one entry per line) of the deadlocked threads.
 */
private void checkLiveness() {
    ThreadMXBean tmx = ManagementFactory.getThreadMXBean();
    long[] ids = tmx.findDeadlockedThreads();
    if (ids == null) {
        // findDeadlockedThreads returns null when there is no deadlock.
        livenessStatus.set(new LivenessStatus(true, ""));
        return;
    }

    log.warn("Detected deadlock");
    StringBuilder sb = new StringBuilder();
    for (ThreadInfo ti : tmx.getThreadInfo(ids, true, true)) {
        // getThreadInfo yields a null element for any thread that is no longer
        // alive by the time it is queried; skip those instead of throwing an NPE,
        // which would leave the liveness status un-updated.
        if (ti != null) {
            sb.append(ti.toString());
            sb.append("\n");
        }
    }
    livenessStatus.set(new LivenessStatus(false, sb.toString()));
}

/**
* Resolve the issue. If there was no issue to begin with, it's a NOOP.
* @param issue An issue
Expand All @@ -77,14 +128,15 @@ private void removeIssue(Issue issue) {
/**
 * Releases this monitor: clears all tracked component statuses, resets the
 * shared instance to empty, and shuts down the liveness scheduler.
 */
private void close() {
    this.componentHealthStatus.clear();
    HealthMonitor.instance = Optional.empty();
    this.scheduledExecutorService.shutdown();
}

/**
 * Closes the current monitor instance, if one is present. Does nothing otherwise.
 */
public static void shutdown() {
    instance.ifPresent(monitor -> monitor.close());
}

/**
 * Builds a health report from the tracked component statuses together with
 * the most recently recorded liveness status.
 *
 * @return A health report
 */
private HealthReport healthReport() {
    // Only the two-argument form is kept; the earlier single-argument return
    // made this statement unreachable.
    return HealthReport.fromComponentHealthStatus(componentHealthStatus, livenessStatus.get());
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,21 +76,29 @@ public class HealthReport {
@Getter
private final Set<ComponentReportedHealthStatus> runtime;

@NonNull
@Getter
private final ReportedLivenessStatus liveness;


/**
* Create a HealthReport from the HealthMonitor's componentHealthStatus. Overall status is healthy if all the
* components are init and runtime healthy. If init report is empty - the overall status is unknown. If at least
* one init component is unhealthy or at least one runtime component is unhealthy, it's reflected in the overall status.
* Otherwise, the status is healthy.
*
* @param componentHealthStatus HealthMonitor's componentHealthStatus
* @param livenessStatus HealthMonitor's livenessStatus
* @return A health report
*/
public static HealthReport fromComponentHealthStatus(Map<Component, HealthStatus> componentHealthStatus) {
public static HealthReport fromComponentHealthStatus(Map<Component, HealthStatus> componentHealthStatus,
LivenessStatus livenessStatus) {
Map<Component, HealthStatus> componentHealthStatusSnapshot = ImmutableMap.copyOf(componentHealthStatus);
Set<ComponentReportedHealthStatus> initReportedHealthStatus =
createInitReportedHealthStatus(componentHealthStatusSnapshot);
Set<ComponentReportedHealthStatus> runtimeReportedHealthStatus =
createRuntimeReportedHealthStatus(componentHealthStatusSnapshot);
ReportedLivenessStatus reportedLivenessStatus = createReportedLivenessStatus(livenessStatus);
String overallReason;
ComponentStatus overallStatus;
if (initReportedHealthStatus.isEmpty()) {
Expand All @@ -99,7 +107,7 @@ public static HealthReport fromComponentHealthStatus(Map<Component, HealthStatus
} else if (!isHealthy(initReportedHealthStatus)) {
overallStatus = DOWN;
overallReason = OVERALL_STATUS_DOWN;
} else if (!isHealthy(runtimeReportedHealthStatus)) {
} else if (!isHealthy(runtimeReportedHealthStatus) || reportedLivenessStatus.getStatus() != UP) {
overallStatus = FAILURE;
overallReason = OVERALL_STATUS_FAILURE;
} else {
Expand All @@ -111,6 +119,7 @@ public static HealthReport fromComponentHealthStatus(Map<Component, HealthStatus
.reason(overallReason)
.init(initReportedHealthStatus)
.runtime(runtimeReportedHealthStatus)
.liveness(reportedLivenessStatus)
.build();
}

Expand Down Expand Up @@ -165,6 +174,15 @@ private static Set<ComponentReportedHealthStatus> createRuntimeReportedHealthSta
}).collect(ImmutableSet.toImmutableSet());
}

/**
 * Converts the monitor's liveness status into its reported form: a healthy
 * status maps to UP with the overall-up reason, anything else maps to DOWN
 * carrying the reason stored in the liveness status.
 *
 * @param livenessStatus HealthMonitor's livenessStatus
 * @return The reported liveness status
 */
private static ReportedLivenessStatus createReportedLivenessStatus(LivenessStatus livenessStatus) {
    final boolean alive = livenessStatus.isHealthy();
    return alive
            ? new ReportedLivenessStatus(UP, OVERALL_STATUS_UP)
            : new ReportedLivenessStatus(DOWN, livenessStatus.getReason());
}

@AllArgsConstructor
public static enum ComponentStatus {

Expand Down Expand Up @@ -203,4 +221,14 @@ public static class ComponentReportedHealthStatus {
@Getter
private final String reason;
}

/**
 * Liveness section of a health report: a component status plus a
 * human-readable reason.
 */
@Getter
@ToString
@EqualsAndHashCode
@AllArgsConstructor
public static class ReportedLivenessStatus {
    private final ComponentStatus status;
    private final String reason;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package org.corfudb.infrastructure.health;

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;

/**
 * Immutable outcome of a liveness check: whether the check passed and, if
 * not, a textual reason. Fields are private and final so an instance cannot
 * change after it has been published to other threads.
 */
@ToString
@AllArgsConstructor
public class LivenessStatus {
    @Getter
    private final boolean isHealthy;
    @Getter
    private final String reason;
}

0 comments on commit a772add

Please sign in to comment.