Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MaxRetryCountTest+[BUG]404/1002CyclicRegionRetry+[BUG]449RetryTooAggressively +[BUG]MissingCrossRegionRetryForWritesWithServer410 #37050) #37040

Merged
merged 33 commits into from
Oct 20, 2023
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
86eb7f1
Update platform-matrix.json
FabianMeiswinkel Oct 2, 2023
0e9dcda
Update FaultInjectionWithAvailabilityStrategyTests.java
FabianMeiswinkel Oct 2, 2023
0aca208
Update FaultInjectionWithAvailabilityStrategyTests.java
FabianMeiswinkel Oct 2, 2023
74e6748
Revert "Update platform-matrix.json"
FabianMeiswinkel Oct 2, 2023
786c64e
Update FaultInjectionWithAvailabilityStrategyTests.java
FabianMeiswinkel Oct 3, 2023
0da26b2
Update FaultInjectionWithAvailabilityStrategyTests.java
FabianMeiswinkel Oct 3, 2023
d51b12f
Merge branch 'main' of https://github.com/Azure/azure-sdk-for-java in…
FabianMeiswinkel Oct 4, 2023
fe95e0a
MaxRetryCountTest skeleton
FabianMeiswinkel Oct 4, 2023
24e5c2b
Fixing ClientRetryPolicy for MM
FabianMeiswinkel Oct 4, 2023
686daa5
Adding MaxMicroBatchSize config and siwtching BulKWriter to use buffe…
FabianMeiswinkel Oct 6, 2023
ee0af3a
Revert "Adding MaxMicroBatchSize config and siwtching BulKWriter to u…
FabianMeiswinkel Oct 6, 2023
8012dc9
add 449 test case and fix 449 aggressive retry issue
Oct 6, 2023
8dd57db
add server 410 test cases
Oct 6, 2023
9e8f51d
add transit timeout tests
Oct 7, 2023
ef7f43d
correct retry count for bounded staleness
Oct 9, 2023
a790188
add tests for 503, 500, 429
Oct 9, 2023
4639f5d
fix compilation issue
Oct 10, 2023
62c9f1c
retry 503/0 for writes
Oct 10, 2023
21839d1
Merge branch 'main' into users/fabianm/maxRetryCount
Oct 10, 2023
ee0857d
resolve comments
Oct 11, 2023
9138810
resolve comments
Oct 13, 2023
b943abf
Merge branch 'main' into users/fabianm/maxRetryCount
Oct 13, 2023
509dc62
update changelog
Oct 13, 2023
1d81a11
fix tests
Oct 14, 2023
32f6df5
fix failed tests
Oct 16, 2023
ccccc96
resolve conflicts
Oct 16, 2023
a38a420
fix tests
Oct 17, 2023
d8d5d13
fix SessionNotAvailableRetryTests
Oct 17, 2023
ac45abb
fix FaultInjectionWithAvailabilityStrategyTests
Oct 17, 2023
9993d1d
merge from main and resolve conflicts
Oct 19, 2023
9335307
refactor
Oct 19, 2023
1e918d9
fix tests
Oct 19, 2023
b19447e
fix tests
Oct 20, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ public CosmosException getInjectedServerError(RxDocumentServiceRequest request)
responseHeaders.put(
HttpConstants.HttpHeaders.RETRY_AFTER_IN_MILLISECONDS,
String.valueOf(500));
responseHeaders.put(WFConstants.BackendHeaders.SUB_STATUS,
Integer.toString(HttpConstants.SubStatusCodes.USER_REQUEST_RATE_TOO_LARGE));
cosmosException = new RequestRateTooLargeException(null, lsn, partitionKeyRangeId, responseHeaders);

break;
Expand Down Expand Up @@ -137,7 +139,10 @@ public CosmosException getInjectedServerError(RxDocumentServiceRequest request)
case SERVICE_UNAVAILABLE:
responseHeaders.put(WFConstants.BackendHeaders.SUB_STATUS,
Integer.toString(HttpConstants.SubStatusCodes.SERVER_GENERATED_503));
cosmosException = new ServiceUnavailableException(null, lsn, null, responseHeaders, HttpConstants.SubStatusCodes.SERVER_GENERATED_503);
ServiceUnavailableException serviceUnavailableException =
new ServiceUnavailableException(null, lsn, null, responseHeaders, HttpConstants.SubStatusCodes.SERVER_GENERATED_503);
serviceUnavailableException.setIsBasedOn503ResponseFromService();
cosmosException = serviceUnavailableException;
break;

case STALED_ADDRESSES_SERVER_GONE:
Expand Down

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,8 @@ public Mono<ShouldRetryResult> shouldRetry(Exception e) {
return this.shouldRetryOnBackendServiceUnavailableAsync(
this.isReadRequest,
isWebExceptionRetriable,
this.request.getNonIdempotentWriteRetriesEnabled());
this.request.getNonIdempotentWriteRetriesEnabled(),
clientException);
}

return this.throttlingRetry.shouldRetry(e);
Expand Down Expand Up @@ -204,7 +205,7 @@ private ShouldRetryResult shouldRetryOnSessionNotAvailable(RxDocumentServiceRequ
this.isReadRequest ?
this.globalEndpointManager.getApplicableReadEndpoints(request) : this.globalEndpointManager.getApplicableWriteEndpoints(request);

if (this.sessionTokenRetryCount > endpoints.size()) {
if (this.sessionTokenRetryCount >= endpoints.size()) {
// When use multiple write locations is true and the request has been tried
// on all locations, then don't retry the request
return ShouldRetryResult.noRetry();
Expand Down Expand Up @@ -299,11 +300,27 @@ private Mono<Void> refreshLocation(boolean isReadRequest, boolean forceRefresh,
private Mono<ShouldRetryResult> shouldRetryOnBackendServiceUnavailableAsync(
boolean isReadRequest,
boolean isWebExceptionRetriable,
boolean nonIdempotentWriteRetriesEnabled) {

if (!isReadRequest && !nonIdempotentWriteRetriesEnabled && !isWebExceptionRetriable) {
boolean nonIdempotentWriteRetriesEnabled,
CosmosException cosmosException) {

// The request has failed with 503, SDK need to decide whether it is safe to retry for write operations
// For server generated retries, it is safe to retry
// For SDK generated 503, it will be more tricky as we have to decide the cause of it. For any causes that SDK not sure whether the request
// has reached/processed from server side, unless customer has specifically opted in for nonIdempotentWriteRetries, SDK should not retry.
// When SDK would generate 503:
// - When server return 410, SDK may internally retry multiple times, when all the retries exhausted, SDK will bubble up 503 with corresponding subStatusCode
// (Note: currently, subStatus code for read may get lost during the conversion, but for writes, the subStatus code will be reserved)
// - when SDK generated 410 due to different reason (like connectionTimeout, transient timeout etc), SDK will internally retry multiple times
// when all the retries exhausted, SDK will bubble up 503
//
// Fow now, without nonIdempotentWriteRetries being enabled, SDK will only retry for the following situation:
// 1. For any connection related errors, it will be covered under isWebExceptionRetriable -> which SDK will retry
// 2. For any server returned 503s, SDK will retry
// 3. For SDK generated 503, SDK will only retry if the subStatusCode is SERVER_GENERATED_410
if (!isReadRequest && !shouldRetryWriteOnServiceUnavailable(nonIdempotentWriteRetriesEnabled, isWebExceptionRetriable, cosmosException)) {
logger.warn(
"shouldRetryOnBackendServiceUnavailableAsync() Not retrying on write with non retriable exception. Retry count = {}",
"shouldRetryOnBackendServiceUnavailableAsync() Not retrying" +
" on write with non retriable exception and non server returned service unavailable. Retry count = {}",
this.serviceUnavailableRetryCount);
return Mono.just(ShouldRetryResult.noRetry());
}
Expand Down Expand Up @@ -374,6 +391,24 @@ CosmosDiagnostics getCosmosDiagnostics() {
return cosmosDiagnostics;
}

private boolean shouldRetryWriteOnServiceUnavailable(
boolean nonIdempotentWriteRetriesEnabled,
boolean isWebExceptionRetriable,
CosmosException cosmosException) {

if (nonIdempotentWriteRetriesEnabled || isWebExceptionRetriable) {
return true;
}

if (cosmosException instanceof ServiceUnavailableException) {
ServiceUnavailableException serviceUnavailableException = (ServiceUnavailableException) cosmosException;
return serviceUnavailableException.isBasedOn503ResponseFromService()
|| serviceUnavailableException.getStatusCode() == HttpConstants.SubStatusCodes.SERVER_GENERATED_410;
xinlian12 marked this conversation as resolved.
Show resolved Hide resolved
}

return false;
}

private static class RetryContext {

public int retryCount;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,15 +100,15 @@ public class Configs {
private static final int DEFAULT_ADDRESS_REFRESH_RESPONSE_TIMEOUT_IN_SECONDS = 5;

// SessionTokenMismatchRetryPolicy Constants
private static final String DEFAULT_SESSION_TOKEN_MISMATCH_WAIT_TIME_IN_MILLISECONDS_NAME =
public static final String DEFAULT_SESSION_TOKEN_MISMATCH_WAIT_TIME_IN_MILLISECONDS_NAME =
xinlian12 marked this conversation as resolved.
Show resolved Hide resolved
"COSMOS.DEFAULT_SESSION_TOKEN_MISMATCH_WAIT_TIME_IN_MILLISECONDS";
private static final int DEFAULT_SESSION_TOKEN_MISMATCH_WAIT_TIME_IN_MILLISECONDS = 5000;

private static final String DEFAULT_SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_TIME_IN_MILLISECONDS_NAME =
public static final String DEFAULT_SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_TIME_IN_MILLISECONDS_NAME =
"COSMOS.DEFAULT_SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_TIME_IN_MILLISECONDS";
private static final int DEFAULT_SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_TIME_IN_MILLISECONDS = 5;

private static final String DEFAULT_SESSION_TOKEN_MISMATCH_MAXIMUM_BACKOFF_TIME_IN_MILLISECONDS_NAME =
public static final String DEFAULT_SESSION_TOKEN_MISMATCH_MAXIMUM_BACKOFF_TIME_IN_MILLISECONDS_NAME =
"COSMOS.DEFAULT_SESSION_TOKEN_MISMATCH_MAXIMUM_BACKOFF_TIME_IN_MILLISECONDS";
private static final int DEFAULT_SESSION_TOKEN_MISMATCH_MAXIMUM_BACKOFF_TIME_IN_MILLISECONDS = 500;

Expand Down Expand Up @@ -141,7 +141,7 @@ public class Configs {
private static final String OPEN_CONNECTIONS_CONCURRENCY = "COSMOS.OPEN_CONNECTIONS_CONCURRENCY";
private static final int DEFAULT_OPEN_CONNECTIONS_CONCURRENCY = 1;

private static final String MAX_RETRIES_IN_LOCAL_REGION_WHEN_REMOTE_REGION_PREFERRED = "COSMOS.MAX_RETRIES_IN_LOCAL_REGION_WHEN_REMOTE_REGION_PREFERRED";
public static final String MAX_RETRIES_IN_LOCAL_REGION_WHEN_REMOTE_REGION_PREFERRED = "COSMOS.MAX_RETRIES_IN_LOCAL_REGION_WHEN_REMOTE_REGION_PREFERRED";
private static final int DEFAULT_MAX_RETRIES_IN_LOCAL_REGION_WHEN_REMOTE_REGION_PREFERRED = 1;

public Configs() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
* The type Service unavailable exception.
*/
public class ServiceUnavailableException extends CosmosException {
private boolean basedOn503ResponseFromService = false;

ServiceUnavailableException() {
this(RMResources.ServiceUnavailable, HttpConstants.SubStatusCodes.UNKNOWN);
}
Expand Down Expand Up @@ -89,6 +91,14 @@ public ServiceUnavailableException(String message,
setSubStatus(subStatusCode);
}

public void setIsBasedOn503ResponseFromService() {
this.basedOn503ResponseFromService = true;
}

public boolean isBasedOn503ResponseFromService() {
return this.basedOn503ResponseFromService;
}

private void setSubStatus(int subStatusCode) {
this.getResponseHeaders().put(
HttpConstants.HttpHeaders.SUB_STATUS,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,6 @@ private boolean shouldRetryLocally(SessionRetryOptions sessionRetryOptions, int
// hence to curb the retry attempts on a region,
// compare sessionTokenMismatchRetryAttempts with max retry attempts allowed on the region - 1
return !(regionSwitchHint == CosmosRegionSwitchHint.REMOTE_REGION_PREFERRED
&& sessionTokenMismatchRetryAttempts == (this.maxRetryAttemptsInCurrentRegion.get() - 1));
&& sessionTokenMismatchRetryAttempts >= (this.maxRetryAttemptsInCurrentRegion.get() - 1));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,15 @@ public Mono<ShouldRetryResult> shouldRetry(Exception exception) {
Math.min(
Math.min(this.currentBackoffMilliseconds.get() + random.nextInt(RANDOM_SALT_IN_MS), remainingMilliseconds),
RetryWithRetryPolicy.MAXIMUM_BACKOFF_TIME_IN_MS));
this.currentBackoffMilliseconds.accumulateAndGet(RetryWithRetryPolicy.BACK_OFF_MULTIPLIER, (left, right) -> left * right);

this.currentBackoffMilliseconds.set(
Math.max(
RetryWithRetryPolicy.INITIAL_BACKOFF_TIME_MS,
Math.min(
RetryWithRetryPolicy.MAXIMUM_BACKOFF_TIME_IN_MS,
this.currentBackoffMilliseconds.get() * RetryWithRetryPolicy.BACK_OFF_MULTIPLIER))
);

logger.debug("BackoffTime: {} ms.", backoffTime.toMillis());

// Calculate the remaining time based after accounting for the backoff that we
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -953,9 +953,11 @@ private Mono<StoreResponse> createErrorResponseFromHttpResponse(String resourceA

case HttpConstants.StatusCodes.SERVICE_UNAVAILABLE:
int subStatusCode = getSubStatusCodeFromHeader(response);
exception = new ServiceUnavailableException(errorMessage, response.headers(), request.uri(),
ServiceUnavailableException serviceUnavailableException = new ServiceUnavailableException(errorMessage, response.headers(), request.uri(),
(subStatusCode == 0) ? HttpConstants.SubStatusCodes.SERVER_GENERATED_503
: HttpConstants.SubStatusCodes.UNKNOWN);
serviceUnavailableException.setIsBasedOn503ResponseFromService();
exception = serviceUnavailableException;
break;

case HttpConstants.StatusCodes.REQUEST_TIMEOUT:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1113,8 +1113,15 @@ private void messageReceived(final ChannelHandlerContext context, final RntbdRes
break;

case StatusCodes.SERVICE_UNAVAILABLE:
cause = new ServiceUnavailableException(error, lsn, partitionKeyRangeId, responseHeaders,
SubStatusCodes.SERVER_GENERATED_503);
ServiceUnavailableException serviceUnavailableException =
new ServiceUnavailableException(
error,
lsn,
partitionKeyRangeId,
responseHeaders,
SubStatusCodes.SERVER_GENERATED_503);
serviceUnavailableException.setIsBasedOn503ResponseFromService();
cause = serviceUnavailableException;
break;

case StatusCodes.TOO_MANY_REQUESTS:
Expand Down
Loading