Skip to content

Commit

Permalink
Status error presentation with details
Browse files Browse the repository at this point in the history
Remote Execution Status messages embedded in ExecuteResponses are
extremely capable vehicles for conveying the nature of an error, and
informing a user of further steps to take to remediate it. This change
expands the presentation of these response Statuses, and brings all of
the error details to light, by default instead of requiring
--verbose_failures to investigate any details of a remote execution
problem.

The interpretation of precondition failures to highlight retriable
responses has been expanded to ignore benign details that might be
included in a response.

SpawnResult error message composition has been simplified substantially,
without any special behavior for 'Remote' errors, and a removal of a
duplicate message printout incurred in the wake of succcessive @janakr
and @olaola changes. Failure messages are now implied to be present in
all spawn result failure reporting exactly once, and the failureMessage
of a SpawnResult is implied to be the parameter to getDetailMessage.

An example error presentation is as follows (including the modifications
to SpawnResult's output formatting):

```
ERROR: /home/werkt/dev/test/BUILD:22:10: Linking test failed: (Exit 34): Remote Execution Failure:
Failed Precondition: Action 4223ab2cc114385110714243a0b4a88cc743f2169b5be7d4d438a6bbba4f529f/142 is invalid
  Resource Info: type.googleapis.com/google.longrunning.Operation: name='shard/operations/9335fef2-184b-4d26-9a6f-2f27cebe7527', owner='tool_invocation_id:4b4bf7b1-fadd-44fd-99be-a234e7c26fc4,correlated_invocation_id:dc88325a-9317-48c0-9013-b3bb8b7a458f'
  Precondition Failure:
    (MISSING) bazel-out/k8-fastbuild/bin/test: 7872: An output could not be uploaded because it exceeded the maximum size of an entry
Target //:test failed to build
```

Closes bazelbuild#12564.

PiperOrigin-RevId: 3449738
  • Loading branch information
werkt authored and Copybara-Service committed Dec 1, 2020
1 parent 12b655b commit 5122617
Show file tree
Hide file tree
Showing 13 changed files with 382 additions and 95 deletions.
Expand Up @@ -385,11 +385,6 @@ public String getDetailMessage(
String reason = "(" + status.toShortString() + ")"; // e.g "(Exit 1)"
String explanation = Strings.isNullOrEmpty(message) ? "" : ": " + message;

if (!status().isConsideredUserError()) {
String errorDetail = status().name().toLowerCase(Locale.US)
.replace('_', ' ');
explanation += ". Note: Remote connection/protocol failed with: " + errorDetail;
}
if (status() == Status.TIMEOUT) {
if (getWallTime().isPresent()) {
explanation +=
Expand All @@ -407,9 +402,6 @@ public String getDetailMessage(
explanation += " Action tagged as local was forcibly run remotely and failed - it's "
+ "possible that the action simply doesn't work remotely";
}
if (!Strings.isNullOrEmpty(failureMessage)) {
explanation += " " + failureMessage;
}
return reason + explanation;
}

Expand Down
133 changes: 133 additions & 0 deletions src/main/java/com/google/devtools/build/lib/remote/ExecuteRetrier.java
@@ -0,0 +1,133 @@
// Copyright 2020 The Bazel Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.google.devtools.build.lib.remote;

import com.google.common.util.concurrent.ListeningScheduledExecutorService;
import com.google.protobuf.Any;
import com.google.protobuf.InvalidProtocolBufferException;
import com.google.protobuf.util.Durations;
import com.google.rpc.DebugInfo;
import com.google.rpc.Help;
import com.google.rpc.LocalizedMessage;
import com.google.rpc.PreconditionFailure;
import com.google.rpc.PreconditionFailure.Violation;
import com.google.rpc.RequestInfo;
import com.google.rpc.ResourceInfo;
import com.google.rpc.RetryInfo;
import com.google.rpc.Status;
import io.grpc.Status.Code;
import io.grpc.protobuf.StatusProto;

/** Specific retry logic for execute request with gapi Status. */
class ExecuteRetrier extends RemoteRetrier {

private static final String VIOLATION_TYPE_MISSING = "MISSING";

private static class RetryInfoBackoff implements Backoff {
private final int maxRetryAttempts;
int retries = 0;

RetryInfoBackoff(int maxRetryAttempts) {
this.maxRetryAttempts = maxRetryAttempts;
}

@Override
public long nextDelayMillis(Exception e) {
if (retries >= maxRetryAttempts) {
return -1;
}
RetryInfo retryInfo = getRetryInfo(e);
retries++;
return Durations.toMillis(retryInfo.getRetryDelay());
}

RetryInfo getRetryInfo(Exception e) {
RetryInfo retryInfo = RetryInfo.getDefaultInstance();
Status status = StatusProto.fromThrowable(e);
if (status != null) {
for (Any detail : status.getDetailsList()) {
if (detail.is(RetryInfo.class)) {
try {
retryInfo = detail.unpack(RetryInfo.class);
} catch (InvalidProtocolBufferException protoEx) {
// really shouldn't happen, ignore
}
}
}
}
return retryInfo;
}

@Override
public int getRetryAttempts() {
return retries;
}
}

ExecuteRetrier(
int maxRetryAttempts,
ListeningScheduledExecutorService retryService,
CircuitBreaker circuitBreaker) {
super(
() -> maxRetryAttempts > 0 ? new RetryInfoBackoff(maxRetryAttempts) : RETRIES_DISABLED,
ExecuteRetrier::shouldRetry,
retryService,
circuitBreaker);
}

private static boolean shouldRetry(Exception e) {
if (BulkTransferException.isOnlyCausedByCacheNotFoundException(e)) {
return true;
}
Status status = StatusProto.fromThrowable(e);
if (status == null || status.getDetailsCount() == 0) {
return false;
}
boolean failedPrecondition = status.getCode() == Code.FAILED_PRECONDITION.value();
for (Any detail : status.getDetailsList()) {
if (detail.is(RetryInfo.class)) {
// server says we can retry, regardless of other details
return true;
} else if (failedPrecondition) {
if (detail.is(PreconditionFailure.class)) {
try {
PreconditionFailure f = detail.unpack(PreconditionFailure.class);
if (f.getViolationsCount() == 0) {
failedPrecondition = false;
}
for (Violation v : f.getViolationsList()) {
if (!v.getType().equals(VIOLATION_TYPE_MISSING)) {
failedPrecondition = false;
}
}
// if *all* > 0 precondition failure violations have type MISSING, failedPrecondition
// remains true
} catch (InvalidProtocolBufferException protoEx) {
// really shouldn't happen
return false;
}
} else if (!(detail.is(DebugInfo.class)
|| detail.is(Help.class)
|| detail.is(LocalizedMessage.class)
|| detail.is(RequestInfo.class)
|| detail.is(ResourceInfo.class))) { // ignore benign details
// consider all other details as failures
failedPrecondition = false;
}
}
}
return failedPrecondition;
}
}
Expand Up @@ -63,4 +63,8 @@ public boolean isExecutionTimeout() {
public ExecuteResponse getResponse() {
return response;
}

public Status getOriginalStatus() {
return status;
}
}
Expand Up @@ -194,7 +194,7 @@ ExecuteResponse waitExecution() throws IOException {
//
// However, we only retry Execute() if executeBackoff should retry. Also increase the retry
// counter at the same time (done by nextDelayMillis()).
if (e.getStatus().getCode() == Code.NOT_FOUND && executeBackoff.nextDelayMillis() >= 0) {
if (e.getStatus().getCode() == Code.NOT_FOUND && executeBackoff.nextDelayMillis(e) >= 0) {
lastOperation = null;
return null;
}
Expand Down
Expand Up @@ -174,7 +174,7 @@ public ExponentialBackoff(RemoteOptions options) {
}

@Override
public long nextDelayMillis() {
public long nextDelayMillis(Exception e) {
if (attempts == maxAttempts) {
return -1;
}
Expand Down Expand Up @@ -221,13 +221,13 @@ public void reset() {
}

@Override
public long nextDelayMillis() {
public long nextDelayMillis(Exception e) {
if (currentBackoff == null) {
currentBackoff = backoffSupplier.get();
retries++;
return 0;
}
return currentBackoff.nextDelayMillis();
return currentBackoff.nextDelayMillis(e);
}

@Override
Expand Down
Expand Up @@ -85,17 +85,12 @@
import com.google.devtools.build.lib.vfs.Path;
import com.google.devtools.build.lib.vfs.PathFragment;
import com.google.longrunning.Operation;
import com.google.protobuf.Any;
import com.google.protobuf.InvalidProtocolBufferException;
import com.google.protobuf.Message;
import com.google.protobuf.Timestamp;
import com.google.protobuf.util.Durations;
import com.google.protobuf.util.Timestamps;
import com.google.rpc.PreconditionFailure;
import com.google.rpc.PreconditionFailure.Violation;
import io.grpc.Context;
import io.grpc.Status.Code;
import io.grpc.protobuf.StatusProto;
import java.io.IOException;
import java.time.Duration;
import java.util.ArrayList;
Expand All @@ -114,38 +109,6 @@
@ThreadSafe
public class RemoteSpawnRunner implements SpawnRunner {

private static final String VIOLATION_TYPE_MISSING = "MISSING";

private static boolean retriableExecErrors(Exception e) {
if (BulkTransferException.isOnlyCausedByCacheNotFoundException(e)) {
return true;
}
if (!RemoteRetrierUtils.causedByStatus(e, Code.FAILED_PRECONDITION)) {
return false;
}
com.google.rpc.Status status = StatusProto.fromThrowable(e);
if (status == null || status.getDetailsCount() == 0) {
return false;
}
for (Any details : status.getDetailsList()) {
PreconditionFailure f;
try {
f = details.unpack(PreconditionFailure.class);
} catch (InvalidProtocolBufferException protoEx) {
return false;
}
if (f.getViolationsCount() == 0) {
return false; // Generally shouldn't happen
}
for (Violation v : f.getViolationsList()) {
if (!v.getType().equals(VIOLATION_TYPE_MISSING)) {
return false;
}
}
}
return true; // if *all* > 0 violations have type MISSING
}

private final Path execRoot;
private final RemoteOptions remoteOptions;
private final ExecutionOptions executionOptions;
Expand Down Expand Up @@ -656,12 +619,10 @@ private SpawnResult handleError(
catastrophe = false;
}

final String errorMessage;
if (!verboseFailures) {
errorMessage = Utils.grpcAwareErrorMessage(exception);
} else {
String errorMessage = Utils.grpcAwareErrorMessage(exception);
if (verboseFailures) {
// On --verbose_failures print the whole stack trace
errorMessage = Throwables.getStackTraceAsString(exception);
errorMessage += "\n" + Throwables.getStackTraceAsString(exception);
}

return new SpawnResult.Builder()
Expand Down Expand Up @@ -817,12 +778,7 @@ static Collection<Path> resolveActionInputs(

private static RemoteRetrier createExecuteRetrier(
RemoteOptions options, ListeningScheduledExecutorService retryService) {
return new RemoteRetrier(
options.remoteMaxRetryAttempts > 0
? () -> new Retrier.ZeroBackoff(options.remoteMaxRetryAttempts)
: () -> Retrier.RETRIES_DISABLED,
RemoteSpawnRunner::retriableExecErrors,
retryService,
Retrier.ALLOW_ALL_CALLS);
return new ExecuteRetrier(
options.remoteMaxRetryAttempts, retryService, Retrier.ALLOW_ALL_CALLS);
}
}
14 changes: 7 additions & 7 deletions src/main/java/com/google/devtools/build/lib/remote/Retrier.java
Expand Up @@ -47,11 +47,11 @@ public interface Backoff {
* Returns the next delay in milliseconds, or a value less than {@code 0} if we should stop
* retrying.
*/
long nextDelayMillis();
long nextDelayMillis(Exception e);

/**
* Returns the number of calls to {@link #nextDelayMillis()} thus far, not counting any calls
* that returned less than {@code 0}.
* Returns the number of calls to {@link #nextDelayMillis(Exception)} thus far, not counting any
* calls that returned less than {@code 0}.
*/
int getRetryAttempts();
}
Expand Down Expand Up @@ -140,7 +140,7 @@ public void recordSuccess() {}
public static final Backoff RETRIES_DISABLED =
new Backoff() {
@Override
public long nextDelayMillis() {
public long nextDelayMillis(Exception e) {
return -1;
}

Expand All @@ -161,7 +161,7 @@ public ZeroBackoff(int maxRetries) {
}

@Override
public long nextDelayMillis() {
public long nextDelayMillis(Exception e) {
if (retries >= maxRetries) {
return -1;
}
Expand Down Expand Up @@ -253,7 +253,7 @@ public <T> T execute(Callable<T> call, Backoff backoff) throws Exception {
if (!shouldRetry.test(e)) {
throw e;
}
final long delayMillis = backoff.nextDelayMillis();
final long delayMillis = backoff.nextDelayMillis(e);
if (delayMillis < 0) {
throw e;
}
Expand Down Expand Up @@ -286,7 +286,7 @@ public <T> ListenableFuture<T> executeAsync(AsyncCallable<T> call, Backoff backo
private <T> ListenableFuture<T> onExecuteAsyncFailure(
Exception t, AsyncCallable<T> call, Backoff backoff) {
if (isRetriable(t)) {
long waitMillis = backoff.nextDelayMillis();
long waitMillis = backoff.nextDelayMillis(t);
if (waitMillis >= 0) {
try {
return Futures.scheduleAsync(
Expand Down
Expand Up @@ -20,16 +20,20 @@ java_library(
"//src/main/java/com/google/devtools/build/lib/analysis:blaze_version_info",
"//src/main/java/com/google/devtools/build/lib/authandtls",
"//src/main/java/com/google/devtools/build/lib/concurrent",
"//src/main/java/com/google/devtools/build/lib/remote:ExecutionStatusException",
"//src/main/java/com/google/devtools/build/lib/remote/common",
"//src/main/java/com/google/devtools/build/lib/remote/options",
"//src/main/java/com/google/devtools/build/lib/vfs",
"//src/main/java/com/google/devtools/build/lib/vfs:pathfragment",
"//src/main/protobuf:failure_details_java_proto",
"//third_party:flogger",
"//third_party:guava",
"//third_party:jsr305",
"//third_party/grpc:grpc-jar",
"//third_party/protobuf:protobuf_java",
"//third_party/protobuf:protobuf_java_util",
"@googleapis//:google_rpc_code_java_proto",
"@googleapis//:google_rpc_error_details_java_proto",
"@googleapis//:google_rpc_status_java_proto",
"@remoteapis//:build_bazel_remote_execution_v2_remote_execution_java_grpc",
"@remoteapis//:build_bazel_remote_execution_v2_remote_execution_java_proto",
],
Expand Down

0 comments on commit 5122617

Please sign in to comment.