diff --git a/sdk/cosmos/azure-cosmos-test/src/main/java/com/azure/cosmos/test/implementation/faultinjection/FaultInjectionServerErrorResultInternal.java b/sdk/cosmos/azure-cosmos-test/src/main/java/com/azure/cosmos/test/implementation/faultinjection/FaultInjectionServerErrorResultInternal.java index f846cd08e7e71..aefaad0ea3992 100644 --- a/sdk/cosmos/azure-cosmos-test/src/main/java/com/azure/cosmos/test/implementation/faultinjection/FaultInjectionServerErrorResultInternal.java +++ b/sdk/cosmos/azure-cosmos-test/src/main/java/com/azure/cosmos/test/implementation/faultinjection/FaultInjectionServerErrorResultInternal.java @@ -90,6 +90,8 @@ public CosmosException getInjectedServerError(RxDocumentServiceRequest request) responseHeaders.put( HttpConstants.HttpHeaders.RETRY_AFTER_IN_MILLISECONDS, String.valueOf(500)); + responseHeaders.put(WFConstants.BackendHeaders.SUB_STATUS, + Integer.toString(HttpConstants.SubStatusCodes.USER_REQUEST_RATE_TOO_LARGE)); cosmosException = new RequestRateTooLargeException(null, lsn, partitionKeyRangeId, responseHeaders); break; @@ -137,7 +139,8 @@ public CosmosException getInjectedServerError(RxDocumentServiceRequest request) case SERVICE_UNAVAILABLE: responseHeaders.put(WFConstants.BackendHeaders.SUB_STATUS, Integer.toString(HttpConstants.SubStatusCodes.SERVER_GENERATED_503)); - cosmosException = new ServiceUnavailableException(null, lsn, null, responseHeaders, HttpConstants.SubStatusCodes.SERVER_GENERATED_503); + cosmosException = + new ServiceUnavailableException(null, lsn, null, responseHeaders, HttpConstants.SubStatusCodes.SERVER_GENERATED_503); break; case STALED_ADDRESSES_SERVER_GONE: diff --git a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/FaultInjectionWithAvailabilityStrategyTests.java b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/FaultInjectionWithAvailabilityStrategyTests.java index 6a13f54f3a34a..bc3c030e4c286 100644 --- 
a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/FaultInjectionWithAvailabilityStrategyTests.java +++ b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/FaultInjectionWithAvailabilityStrategyTests.java @@ -146,16 +146,8 @@ public class FaultInjectionWithAvailabilityStrategyTests extends TestSuiteBase { } }; - private final static Consumer validateDiagnosticsContextHasDiagnosticsForOneOrTwoRegionsButTwoContactedRegions = - (ctx) -> { - assertThat(ctx).isNotNull(); - if (ctx != null) { - assertThat(ctx.getDiagnostics()).isNotNull(); - assertThat(ctx.getDiagnostics().size()).isGreaterThanOrEqualTo(1); - assertThat(ctx.getDiagnostics().size()).isLessThanOrEqualTo(2); - assertThat(ctx.getContactedRegionNames().size()).isEqualTo(2); - } - }; + private Consumer validateDiagnosticsContextHasDiagnosticsForOneOrTwoRegionsButAlwaysContactedSecondRegion = null; + private final static BiConsumer noFailureInjection = (container, operationType) -> {}; @@ -178,10 +170,17 @@ public class FaultInjectionWithAvailabilityStrategyTests extends TestSuiteBase { private BiConsumer injectInternalServerErrorIntoAllRegions = null; + private BiConsumer injectQueryPlanTransitTimeoutIntoFirstRegionOnly = null; + + private BiConsumer injectGatewayTransitTimeoutIntoFirstRegionOnly = null; + private Consumer validateDiagnosticsContextHasDiagnosticsForAllRegions = null; private Consumer validateDiagnosticsContextHasDiagnosticsForOnlyFirstRegion = null; + private String FIRST_REGION_NAME = null; + private String SECOND_REGION_NAME = null; + private List writeableRegions; private String testDatabaseId; @@ -223,6 +222,9 @@ public void beforeClass() { assertThat(this.writeableRegions).isNotNull(); assertThat(this.writeableRegions.size()).isGreaterThanOrEqualTo(2); + FIRST_REGION_NAME = this.writeableRegions.get(0).toLowerCase(Locale.ROOT); + SECOND_REGION_NAME = this.writeableRegions.get(1).toLowerCase(Locale.ROOT); + this.validateDiagnosticsContextHasDiagnosticsForAllRegions = 
(ctx) -> { assertThat(ctx).isNotNull(); @@ -244,6 +246,20 @@ public void beforeClass() { } }; + this.validateDiagnosticsContextHasDiagnosticsForOneOrTwoRegionsButAlwaysContactedSecondRegion = (ctx) -> { + assertThat(ctx).isNotNull(); + if (ctx != null) { + assertThat(ctx.getDiagnostics()).isNotNull(); + assertThat(ctx.getDiagnostics().size()).isGreaterThanOrEqualTo(1); + assertThat(ctx.getDiagnostics().size()).isLessThanOrEqualTo(2); + assertThat(ctx.getContactedRegionNames().size()).isGreaterThanOrEqualTo(1); + + if (ctx.getContactedRegionNames().size() == 1) { + assertThat(ctx.getContactedRegionNames().iterator().next()).isEqualTo(SECOND_REGION_NAME); + } + } + }; + FeedRange ALL_PARTITIONS = null; this.injectReadSessionNotAvailableIntoAllRegions = @@ -273,6 +289,14 @@ public void beforeClass() { this.injectInternalServerErrorIntoAllRegions = (c, operationType) -> injectInternalServerError(c, this.writeableRegions, operationType); + this.injectQueryPlanTransitTimeoutIntoFirstRegionOnly = + (c, operationType) -> injectGatewayTransitTimeout( + c, this.getFirstRegion(), FaultInjectionOperationType.METADATA_REQUEST_QUERY_PLAN); + + this.injectGatewayTransitTimeoutIntoFirstRegionOnly = + (c, operationType) -> injectGatewayTransitTimeout( + c, this.getFirstRegion(), operationType); + CosmosAsyncContainer container = this.createTestContainer(dummyClient); this.testDatabaseId = container.getDatabase().getId(); this.testContainerId = container.getId(); @@ -353,6 +377,7 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(1), eagerThresholdAvailabilityStrategy, CosmosRegionSwitchHint.REMOTE_REGION_PREFERRED, + ConnectionMode.DIRECT, sameDocumentIdJustCreated, injectReadSessionNotAvailableIntoAllRegions, validateStatusCodeIsReadSessionNotAvailableError, @@ -369,6 +394,7 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(1), reluctantThresholdAvailabilityStrategy, CosmosRegionSwitchHint.REMOTE_REGION_PREFERRED, + 
ConnectionMode.DIRECT, sameDocumentIdJustCreated, injectReadSessionNotAvailableIntoFirstRegionOnly, validateStatusCodeIs200Ok, @@ -385,6 +411,7 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(1), eagerThresholdAvailabilityStrategy, CosmosRegionSwitchHint.REMOTE_REGION_PREFERRED, + ConnectionMode.DIRECT, sameDocumentIdJustCreated, injectReadSessionNotAvailableIntoFirstRegionOnly, validateStatusCodeIs200Ok, @@ -398,6 +425,7 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(1), defaultAvailabilityStrategy, CosmosRegionSwitchHint.REMOTE_REGION_PREFERRED, + ConnectionMode.DIRECT, sameDocumentIdJustCreated, injectReadSessionNotAvailableIntoAllExceptFirstRegion, validateStatusCodeIs200Ok, @@ -415,6 +443,7 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(1), eagerThresholdAvailabilityStrategy, CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + ConnectionMode.DIRECT, sameDocumentIdJustCreated, injectReadSessionNotAvailableIntoAllRegions, validateStatusCodeIsOperationCancelled, @@ -431,6 +460,7 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(1), eagerThresholdAvailabilityStrategy, CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + ConnectionMode.DIRECT, sameDocumentIdJustCreated, injectReadSessionNotAvailableIntoFirstRegionOnly, validateStatusCodeIs200Ok, @@ -444,6 +474,7 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(1), defaultAvailabilityStrategy, CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + ConnectionMode.DIRECT, sameDocumentIdJustCreated, injectReadSessionNotAvailableIntoAllExceptFirstRegion, validateStatusCodeIs200Ok, @@ -460,6 +491,7 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(1), null, CosmosRegionSwitchHint.REMOTE_REGION_PREFERRED, + ConnectionMode.DIRECT, sameDocumentIdJustCreated, injectReadSessionNotAvailableIntoFirstRegionOnly, validateStatusCodeIs200Ok, // First operation will failover from region 1 to 
region 2 quickly enough @@ -477,6 +509,7 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(1), null, CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + ConnectionMode.DIRECT, sameDocumentIdJustCreated, injectReadSessionNotAvailableIntoFirstRegionOnly, validateStatusCodeIsOperationCancelled, // Too many local retries to allow cross regional failover within e2e timeout @@ -495,6 +528,7 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(1), defaultAvailabilityStrategy, CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + ConnectionMode.DIRECT, "SomeNonExistingId", injectReadSessionNotAvailableIntoFirstRegionOnly, // Too many local retries to allow cross regional failover within e2e timeout, but after @@ -513,6 +547,7 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(1), null, CosmosRegionSwitchHint.REMOTE_REGION_PREFERRED, + ConnectionMode.DIRECT, "SomeNonExistingId", injectReadSessionNotAvailableIntoFirstRegionOnly, validateStatusCodeIsLegitNotFound, // Too many local retries to allow cross regional failover within e2e timeout @@ -529,6 +564,7 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(5), reluctantThresholdAvailabilityStrategy, CosmosRegionSwitchHint.REMOTE_REGION_PREFERRED, + ConnectionMode.DIRECT, "SomeNonExistingId", injectReadSessionNotAvailableIntoFirstRegionOnly, validateStatusCodeIsLegitNotFound, // Too many local retries to allow cross regional failover within e2e timeout @@ -544,6 +580,7 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(1), eagerThresholdAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, sameDocumentIdJustCreated, injectTransitTimeoutIntoAllRegions, validateStatusCodeIsOperationCancelled, @@ -558,6 +595,7 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(1), eagerThresholdAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, sameDocumentIdJustCreated, 
injectTransitTimeoutIntoFirstRegionOnly, validateStatusCodeIs200Ok, @@ -572,6 +610,7 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, sameDocumentIdJustCreated, injectTransitTimeoutIntoAllRegions, validateStatusCodeIsOperationCancelled, @@ -590,6 +629,7 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(90), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, sameDocumentIdJustCreated, injectTransitTimeoutIntoFirstRegionOnly, validateStatusCodeIs200Ok, @@ -604,6 +644,7 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, sameDocumentIdJustCreated, injectTransitTimeoutIntoFirstRegionOnly, validateStatusCodeIsOperationCancelled, @@ -618,6 +659,7 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(90), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, sameDocumentIdJustCreated, injectServiceUnavailableIntoFirstRegionOnly, validateStatusCodeIs200Ok, @@ -633,10 +675,11 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(1), eagerThresholdAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, sameDocumentIdJustCreated, injectServiceUnavailableIntoFirstRegionOnly, validateStatusCodeIs200Ok, - validateDiagnosticsContextHasDiagnosticsForOneOrTwoRegionsButTwoContactedRegions + validateDiagnosticsContextHasDiagnosticsForOneOrTwoRegionsButAlwaysContactedSecondRegion }, // This test injects 503 (Service Unavailable) into all regions. 
@@ -647,6 +690,7 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(1), eagerThresholdAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, sameDocumentIdJustCreated, injectServiceUnavailableIntoAllRegions, validateStatusCodeIsServiceUnavailable, @@ -663,6 +707,7 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(90), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, sameDocumentIdJustCreated, injectInternalServerErrorIntoFirstRegionOnly, validateStatusCodeIsInternalServerError, @@ -679,6 +724,7 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(1), defaultAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, sameDocumentIdJustCreated, injectInternalServerErrorIntoFirstRegionOnly, validateStatusCodeIsInternalServerError, @@ -695,11 +741,29 @@ public Object[][] testConfigs_readAfterCreation() { Duration.ofSeconds(1), defaultAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, sameDocumentIdJustCreated, injectInternalServerErrorIntoAllRegions, validateStatusCodeIsInternalServerError, validateDiagnosticsContextHasDiagnosticsForOnlyFirstRegion }, + + // GATEWAY + // ------- + + // This test injects Gateway transit timeout into the local region only. 
+ // Expected outcome is a successful retry by the availability strategy + new Object[] { + "GW_408_FirstRegionOnly", + Duration.ofSeconds(1), + eagerThresholdAvailabilityStrategy, + noRegionSwitchHint, + ConnectionMode.GATEWAY, + sameDocumentIdJustCreated, + injectGatewayTransitTimeoutIntoFirstRegionOnly, + validateStatusCodeIs200Ok, + validateDiagnosticsContextHasDiagnosticsForOneOrTwoRegionsButAlwaysContactedSecondRegion + }, }; } @@ -709,6 +773,7 @@ public void readAfterCreation( Duration endToEndTimeout, ThresholdBasedAvailabilityStrategy availabilityStrategy, CosmosRegionSwitchHint regionSwitchHint, + ConnectionMode connectionMode, String readItemDocumentIdOverride, BiConsumer faultInjectionCallback, BiConsumer validateStatusCode, @@ -742,7 +807,8 @@ public void readAfterCreation( null, 0, 0, - false); + false, + connectionMode); } @DataProvider(name = "testConfigs_writeAfterCreation") @@ -855,10 +921,11 @@ public Object[][] testConfigs_writeAfterCreation() { // No availability strategy exists - expected outcome is a successful response from the cross-regional // retry issued in the client retry policy new Object[] { - "Create_503_FirstRegionOnly_NoAvailabilityStrategy_WithWriteRetries", + "Create_503_FirstRegionOnly_NoAvailabilityStrategy_WriteRetriesEnabled_WithWriteRetries", Duration.ofSeconds(3), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.CREATE_ITEM, @@ -869,20 +936,21 @@ public Object[][] testConfigs_writeAfterCreation() { }, // This test injects 503 (Service Unavailable) into the local region only. 
- // No availability strategy exists - expected outcome is a 503 because non-idempotent write retries - // are disabled - and no cross regional retry is happening + // No availability strategy exists - expected outcome is a successful response from the cross-regional + // retry issued in the client retry policy new Object[] { - "Create_503_FirstRegionOnly_NoAvailabilityStrategy_NoWriteRetries", - Duration.ofSeconds(1), + "Create_503_FirstRegionOnly_NoAvailabilityStrategy_WriteRetriesDisabled_withWriteRetries", + Duration.ofSeconds(2), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesDisabled, FaultInjectionOperationType.CREATE_ITEM, createAnotherItemCallback, injectServiceUnavailableIntoFirstRegionOnly, - validateStatusCodeIsServiceUnavailable, - validateDiagnosticsContextHasDiagnosticsForOnlyFirstRegion + validateStatusCodeIs201Created, + validateDiagnosticsContextHasDiagnosticsForOnlyFirstRegionButWithRegionalFailover }, // This test injects 503 (Service Unavailable) into the local region only. 
@@ -890,44 +958,47 @@ public Object[][] testConfigs_writeAfterCreation() { // regional retry in client retry policy of operations against first region - or the hedging // against the second region new Object[] { - "Create_503_FirstRegionOnly_WithWriteRetries", - Duration.ofSeconds(1), + "Create_503_FirstRegionOnly_WriteRetriesEnabled_WithWriteRetries", + Duration.ofSeconds(2), defaultAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.CREATE_ITEM, createAnotherItemCallback, injectServiceUnavailableIntoFirstRegionOnly, validateStatusCodeIs201Created, - validateDiagnosticsContextHasDiagnosticsForOneOrTwoRegionsButTwoContactedRegions + validateDiagnosticsContextHasDiagnosticsForOneOrTwoRegionsButAlwaysContactedSecondRegion }, // This test injects 503 (Service Unavailable) into the local region only. - // Default availability strategy exists - expected outcome is a 503 because non-idempotent write retries - // are disabled - which means no hedging for write operations nor cross regional retry + // Default availability strategy exists - expected outcome is a successful response from the cross + // regional retry in client retry policy, but not from the hedging against the second region new Object[] { - "Create_503_FirstRegionOnly_NoWriteRetries", - Duration.ofSeconds(1), + "Create_503_FirstRegionOnly_WriteRetriesDisabled_WithWriteRetries", + Duration.ofSeconds(2), defaultAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesDisabled, FaultInjectionOperationType.CREATE_ITEM, createAnotherItemCallback, injectServiceUnavailableIntoFirstRegionOnly, - validateStatusCodeIsServiceUnavailable, - validateDiagnosticsContextHasDiagnosticsForOnlyFirstRegion + validateStatusCodeIs201Created, + validateDiagnosticsContextHasDiagnosticsForOnlyFirstRegionButWithRegionalFailover }, // 
This test injects 503 (Service Unavailable) into all regions. // Eager availability strategy exists - expected outcome is a 503 - diagnostics should reflect the // hedging against second region new Object[] { - "Create_503_AllRegions_WithWriteRetries", - Duration.ofSeconds(1), + "Create_503_AllRegions_WriteRetriesEnabled_WithWriteRetries", + Duration.ofSeconds(2), eagerThresholdAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.CREATE_ITEM, @@ -939,98 +1010,105 @@ public Object[][] testConfigs_writeAfterCreation() { // This test injects 503 (Service Unavailable) into all regions. // Default availability strategy exists - expected outcome is a 503 because non-idempotent write retries - // are disabled - which means no hedging for write operations nor cross regional retry + // are disabled - which means no hedging for write operations, but there will be cross regional retry from clientRetryPolicy // Same expectation for all write operation types new Object[] { - "Create_503_AllRegions_NoWriteRetries", - Duration.ofSeconds(1), + "Create_503_AllRegions_WriteRetriesDisabled_WithWriteRetries", + Duration.ofSeconds(2), defaultAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesDisabled, FaultInjectionOperationType.CREATE_ITEM, createAnotherItemCallback, injectServiceUnavailableIntoAllRegions, validateStatusCodeIsServiceUnavailable, - validateDiagnosticsContextHasDiagnosticsForOnlyFirstRegion + validateDiagnosticsContextHasDiagnosticsForOnlyFirstRegionButWithRegionalFailover }, new Object[] { - "Replace_503_AllRegions_NoWriteRetries", - Duration.ofSeconds(1), + "Replace_503_AllRegions_WriteRetriesDisabled_WithWriteRetries", + Duration.ofSeconds(2), defaultAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, 
NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesDisabled, FaultInjectionOperationType.REPLACE_ITEM, replaceItemCallback, injectServiceUnavailableIntoAllRegions, validateStatusCodeIsServiceUnavailable, - validateDiagnosticsContextHasDiagnosticsForOnlyFirstRegion + validateDiagnosticsContextHasDiagnosticsForOnlyFirstRegionButWithRegionalFailover }, new Object[] { - "Patch_503_AllRegions_NoWriteRetries", - Duration.ofSeconds(1), + "Patch_503_AllRegions_WriteRetriesDisabled_WithWriteRetries", + Duration.ofSeconds(2), defaultAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesDisabled, FaultInjectionOperationType.PATCH_ITEM, patchItemCallback, injectServiceUnavailableIntoAllRegions, validateStatusCodeIsServiceUnavailable, - validateDiagnosticsContextHasDiagnosticsForOnlyFirstRegion + validateDiagnosticsContextHasDiagnosticsForOnlyFirstRegionButWithRegionalFailover }, new Object[] { - "Delete_503_AllRegions_NoWriteRetries", - Duration.ofSeconds(1), + "Delete_503_AllRegions_WriteRetriesDisabled_WithWriteRetries", + Duration.ofSeconds(2), defaultAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesDisabled, FaultInjectionOperationType.DELETE_ITEM, deleteItemCallback, injectServiceUnavailableIntoAllRegions, validateStatusCodeIsServiceUnavailable, - validateDiagnosticsContextHasDiagnosticsForOnlyFirstRegion + validateDiagnosticsContextHasDiagnosticsForOnlyFirstRegionButWithRegionalFailover }, new Object[] { - "UpsertExisting_503_AllRegions_NoWriteRetries", - Duration.ofSeconds(1), + "UpsertExisting_503_AllRegions_WriteRetriesDisabled_WithWriteRetries", + Duration.ofSeconds(2), defaultAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesDisabled, FaultInjectionOperationType.UPSERT_ITEM, 
upsertExistingItemCallback, injectServiceUnavailableIntoAllRegions, validateStatusCodeIsServiceUnavailable, - validateDiagnosticsContextHasDiagnosticsForOnlyFirstRegion + validateDiagnosticsContextHasDiagnosticsForOnlyFirstRegionButWithRegionalFailover }, new Object[] { - "UpsertNew_503_AllRegions_NoWriteRetries", - Duration.ofSeconds(1), + "UpsertNew_503_AllRegions_WriteRetriesDisabled_WithWriteRetries", + Duration.ofSeconds(2), defaultAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesDisabled, FaultInjectionOperationType.UPSERT_ITEM, upsertAnotherItemCallback, injectServiceUnavailableIntoAllRegions, validateStatusCodeIsServiceUnavailable, - validateDiagnosticsContextHasDiagnosticsForOnlyFirstRegion + validateDiagnosticsContextHasDiagnosticsForOnlyFirstRegionButWithRegionalFailover }, new Object[] { - "Patch_503_FirstRegionOnly_WithWriteRetries", - Duration.ofSeconds(1), + "Patch_503_FirstRegionOnly_WriteRetriesEnabled_WithWriteRetries", + Duration.ofSeconds(2), defaultAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.PATCH_ITEM, patchItemCallback, injectServiceUnavailableIntoFirstRegionOnly, validateStatusCodeIs200Ok, - validateDiagnosticsContextHasDiagnosticsForOneOrTwoRegionsButTwoContactedRegions + validateDiagnosticsContextHasDiagnosticsForOneOrTwoRegionsButAlwaysContactedSecondRegion }, // This test injects 503 (Service Unavailable) into the first region only. 
@@ -1038,62 +1116,67 @@ public Object[][] testConfigs_writeAfterCreation() { // write retries are enabled which would allow hedging (or cross regional fail-over) to succeed // Same expectation for all write operation types new Object[] { - "Delete_503_FirstRegionOnly_WithWriteRetries", - Duration.ofSeconds(1), + "Delete_503_FirstRegionOnly_WriteRetriesEnabled_WithWriteRetries", + Duration.ofSeconds(2), defaultAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.DELETE_ITEM, deleteItemCallback, injectServiceUnavailableIntoFirstRegionOnly, validateStatusCodeIs204NoContent, - validateDiagnosticsContextHasDiagnosticsForOneOrTwoRegionsButTwoContactedRegions + validateDiagnosticsContextHasDiagnosticsForOneOrTwoRegionsButAlwaysContactedSecondRegion }, new Object[] { - "Replace_503_FirstRegionOnly_WithWriteRetries", - Duration.ofSeconds(1), + "Replace_503_FirstRegionOnly_WriteRetriesEnabled_WithWriteRetries", + Duration.ofSeconds(2), defaultAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.REPLACE_ITEM, replaceItemCallback, injectServiceUnavailableIntoFirstRegionOnly, validateStatusCodeIs200Ok, - validateDiagnosticsContextHasDiagnosticsForOneOrTwoRegionsButTwoContactedRegions + validateDiagnosticsContextHasDiagnosticsForOneOrTwoRegionsButAlwaysContactedSecondRegion }, new Object[] { - "UpsertNew_503_FirstRegionOnly_WithWriteRetries", - Duration.ofSeconds(1), + "UpsertNew_503_FirstRegionOnly_WriteRetriesEnabled_WithWriteRetries", + Duration.ofSeconds(2), defaultAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.UPSERT_ITEM, upsertAnotherItemCallback, injectServiceUnavailableIntoFirstRegionOnly, 
validateStatusCodeIs201Created, - validateDiagnosticsContextHasDiagnosticsForOneOrTwoRegionsButTwoContactedRegions + validateDiagnosticsContextHasDiagnosticsForOneOrTwoRegionsButAlwaysContactedSecondRegion }, new Object[] { - "UpsertExisting_503_FirstRegionOnly_WithWriteRetries", - Duration.ofSeconds(1), + "UpsertExisting_503_FirstRegionOnly_WriteRetriesEnabled_WithWriteRetries", + Duration.ofSeconds(2), defaultAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.UPSERT_ITEM, upsertExistingItemCallback, injectServiceUnavailableIntoFirstRegionOnly, validateStatusCodeIs200Ok, - validateDiagnosticsContextHasDiagnosticsForOneOrTwoRegionsButTwoContactedRegions + validateDiagnosticsContextHasDiagnosticsForOneOrTwoRegionsButAlwaysContactedSecondRegion }, new Object[] { "Create_500_FirstRegionOnly_NoAvailabilityStrategy_WithRetries", Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.CREATE_ITEM, @@ -1111,6 +1194,7 @@ public Object[][] testConfigs_writeAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesDisabled, FaultInjectionOperationType.CREATE_ITEM, @@ -1130,6 +1214,7 @@ public Object[][] testConfigs_writeAfterCreation() { Duration.ofSeconds(1), reluctantThresholdAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.DELETE_ITEM, @@ -1147,6 +1232,7 @@ public Object[][] testConfigs_writeAfterCreation() { Duration.ofSeconds(1), defaultAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, 
nonIdempotentWriteRetriesDisabled, FaultInjectionOperationType.DELETE_ITEM, @@ -1164,6 +1250,7 @@ public Object[][] testConfigs_writeAfterCreation() { Duration.ofSeconds(1), reluctantThresholdAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.PATCH_ITEM, @@ -1181,6 +1268,7 @@ public Object[][] testConfigs_writeAfterCreation() { Duration.ofSeconds(1), defaultAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesDisabled, FaultInjectionOperationType.PATCH_ITEM, @@ -1197,9 +1285,10 @@ public Object[][] testConfigs_writeAfterCreation() { // data for initial region new Object[] { "Replace_408_AllRegions_DefaultAvailabilityStrategy_NoRetries", - Duration.ofSeconds(1), + Duration.ofSeconds(2), defaultAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesDisabled, FaultInjectionOperationType.REPLACE_ITEM, @@ -1215,9 +1304,10 @@ public Object[][] testConfigs_writeAfterCreation() { // Diagnostics should contain data for original and hedging operation new Object[] { "Replace_408_AllRegions_DefaultAvailabilityStrategy_WithRetries", - Duration.ofSeconds(1), + Duration.ofSeconds(2), eagerThresholdAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.REPLACE_ITEM, @@ -1231,9 +1321,10 @@ public Object[][] testConfigs_writeAfterCreation() { // Diagnostics only in first region due to no hedging new Object[] { "Replace_408_AllRegions_NoAvailabilityStrategy_NoRetries", - Duration.ofSeconds(1), + Duration.ofSeconds(2), defaultAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesDisabled, 
FaultInjectionOperationType.REPLACE_ITEM, @@ -1248,9 +1339,10 @@ public Object[][] testConfigs_writeAfterCreation() { // Diagnostics only in first region due to no hedging or cross regional fail-over being started yet new Object[] { "Replace_408_AllRegions_NoAvailabilityStrategy_WithRetries", - Duration.ofSeconds(1), + Duration.ofSeconds(2), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.REPLACE_ITEM, @@ -1265,9 +1357,10 @@ public Object[][] testConfigs_writeAfterCreation() { // Diagnostics only in first region due to no hedging or cross regional fail-over being started yet new Object[] { "UpsertExisting_408_FirstRegionOnly_DefaultAvailabilityStrategy_NoRetries", - Duration.ofSeconds(1), + Duration.ofSeconds(2), defaultAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesDisabled, FaultInjectionOperationType.UPSERT_ITEM, @@ -1283,9 +1376,10 @@ public Object[][] testConfigs_writeAfterCreation() { // Diagnostics should have data for both operations. 
new Object[] { "UpsertExisting_408_FirstRegionOnly_DefaultAvailabilityStrategy_WithRetries", - Duration.ofSeconds(1), + Duration.ofSeconds(2), defaultAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.UPSERT_ITEM, @@ -1300,9 +1394,10 @@ public Object[][] testConfigs_writeAfterCreation() { // Diagnostics only in first region new Object[] { "UpsertNew_408_FirstRegionOnly_NoAvailabilityStrategy_NoRetries", - Duration.ofSeconds(1), + Duration.ofSeconds(2), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesDisabled, FaultInjectionOperationType.UPSERT_ITEM, @@ -1318,6 +1413,7 @@ public Object[][] testConfigs_writeAfterCreation() { Duration.ofSeconds(90), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.UPSERT_ITEM, @@ -1335,9 +1431,10 @@ public Object[][] testConfigs_writeAfterCreation() { // execution via availability strategy was happening (but also failed) new Object[] { "Create_404-1002_AllRegions_LocalPreferred_DefaultAvailabilityStrategy_WithRetries", - Duration.ofSeconds(1), + Duration.ofSeconds(2), defaultAvailabilityStrategy, CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.CREATE_ITEM, @@ -1355,9 +1452,10 @@ public Object[][] testConfigs_writeAfterCreation() { // execution via availability strategy was happening (but also failed) new Object[] { "Replace_404-1002_AllRegions_LocalPreferred_DefaultAvailabilityStrategy_WithRetries", - Duration.ofSeconds(1), + Duration.ofSeconds(2), defaultAvailabilityStrategy, CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + ConnectionMode.DIRECT, 
NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.REPLACE_ITEM, @@ -1375,9 +1473,10 @@ public Object[][] testConfigs_writeAfterCreation() { // execution via availability strategy was happening (but also failed) new Object[] { "Replace_404-1002_AllRegions_RemotePreferred_EagerAvailabilityStrategy_WithRetries", - Duration.ofSeconds(1), + Duration.ofSeconds(2), eagerThresholdAvailabilityStrategy, CosmosRegionSwitchHint.REMOTE_REGION_PREFERRED, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.REPLACE_ITEM, @@ -1393,9 +1492,10 @@ public Object[][] testConfigs_writeAfterCreation() { // cross regional retry) new Object[] { "Replace_404-1002_AllRegions_LocalPreferred_EagerAvailabilityStrategy_NoRetries", - Duration.ofSeconds(1), + Duration.ofSeconds(2), eagerThresholdAvailabilityStrategy, CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesDisabled, FaultInjectionOperationType.REPLACE_ITEM, @@ -1410,9 +1510,10 @@ public Object[][] testConfigs_writeAfterCreation() { // Regional fail-over seen, because preferred region switch is remote new Object[] { "Replace_404-1002_AllRegions_RemotePreferred_EagerAvailabilityStrategy_NoRetries", - Duration.ofSeconds(1), + Duration.ofSeconds(2), eagerThresholdAvailabilityStrategy, CosmosRegionSwitchHint.REMOTE_REGION_PREFERRED, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesDisabled, FaultInjectionOperationType.REPLACE_ITEM, @@ -1429,9 +1530,10 @@ public Object[][] testConfigs_writeAfterCreation() { // cross regional retry) new Object[] { "Replace_404-1002_FirstRegionOnly_RemotePreferred_EagerAvailabilityStrategy_NoRetries", - Duration.ofSeconds(1), + Duration.ofSeconds(2), eagerThresholdAvailabilityStrategy, CosmosRegionSwitchHint.REMOTE_REGION_PREFERRED, + 
ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesDisabled, FaultInjectionOperationType.REPLACE_ITEM, @@ -1448,9 +1550,10 @@ public Object[][] testConfigs_writeAfterCreation() { // client retry policy within the e2e timeout. new Object[] { "Replace_404-1002_FirstRegionOnly_LocalPreferred_EagerAvailabilityStrategy_NoRetries", - Duration.ofSeconds(1), + Duration.ofSeconds(2), eagerThresholdAvailabilityStrategy, CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesDisabled, FaultInjectionOperationType.REPLACE_ITEM, @@ -1466,9 +1569,10 @@ public Object[][] testConfigs_writeAfterCreation() { // result in successful response. new Object[] { "Replace_404-1002_FirstRegionOnly_RemotePreferred_EagerAvailabilityStrategy_WithRetries", - Duration.ofSeconds(1), + Duration.ofSeconds(2), eagerThresholdAvailabilityStrategy, CosmosRegionSwitchHint.REMOTE_REGION_PREFERRED, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.REPLACE_ITEM, @@ -1476,7 +1580,7 @@ public Object[][] testConfigs_writeAfterCreation() { injectReadSessionNotAvailableIntoFirstRegionOnly, validateStatusCodeIs200Ok, // no hedging even with availability strategy because nonIdempotentWrites are disabled - validateDiagnosticsContextHasDiagnosticsForOneOrTwoRegionsButTwoContactedRegions + validateDiagnosticsContextHasDiagnosticsForOneOrTwoRegionsButAlwaysContactedSecondRegion }, // 404/1022 into local region only @@ -1484,9 +1588,10 @@ public Object[][] testConfigs_writeAfterCreation() { // preference results in too many local retries). Should result in successful response form hedging. 
new Object[] { "Replace_404-1002_FirstRegionOnly_LocalPreferred_EagerAvailabilityStrategy_WithRetries", - Duration.ofSeconds(1), + Duration.ofSeconds(2), eagerThresholdAvailabilityStrategy, CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.REPLACE_ITEM, @@ -1503,9 +1608,10 @@ public Object[][] testConfigs_writeAfterCreation() { // cross regional retry to finish within e2e timeout. new Object[] { "Create_404-1002_FirstRegionOnly_RemotePreferred_ReluctantAvailabilityStrategy_WithRetries", - Duration.ofSeconds(1), + Duration.ofSeconds(2), reluctantThresholdAvailabilityStrategy, CosmosRegionSwitchHint.REMOTE_REGION_PREFERRED, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.CREATE_ITEM, @@ -1525,6 +1631,7 @@ public Object[][] testConfigs_writeAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, CosmosRegionSwitchHint.REMOTE_REGION_PREFERRED, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.CREATE_ITEM, @@ -1587,6 +1694,7 @@ public Object[][] testConfigs_writeAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, CosmosRegionSwitchHint.REMOTE_REGION_PREFERRED, + ConnectionMode.DIRECT, Duration.ofMillis(600), nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.CREATE_ITEM, @@ -1647,6 +1755,7 @@ public Object[][] testConfigs_writeAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, CosmosRegionSwitchHint.REMOTE_REGION_PREFERRED, + ConnectionMode.DIRECT, Duration.ofMillis(1100), nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.CREATE_ITEM, @@ -1683,9 +1792,10 @@ public Object[][] testConfigs_writeAfterCreation() { // Successful response expected from hedging. Diagnostics should have data for both operations. 
new Object[] { "Create_404-1002_FirstRegionOnly_LocalPreferred_EagerAvailabilityStrategy_WithRetries", - Duration.ofSeconds(1), + Duration.ofSeconds(2), eagerThresholdAvailabilityStrategy, CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.CREATE_ITEM, @@ -1703,9 +1813,10 @@ public Object[][] testConfigs_writeAfterCreation() { // terminating the composite Mono. Diagnostics should have data for both operations. new Object[] { "DeleteNonExistingItem_404-1002_FirstRegionOnly_LocalPreferred_EagerAvailabilityStrategy_WithRetries", - Duration.ofSeconds(1), + Duration.ofSeconds(2), eagerThresholdAvailabilityStrategy, CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + ConnectionMode.DIRECT, NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType.DELETE_ITEM, @@ -1715,6 +1826,28 @@ public Object[][] testConfigs_writeAfterCreation() { // no hedging even with availability strategy because nonIdempotentWrites are disabled validateDiagnosticsContextHasDiagnosticsForAllRegions }, + + // GATEWAY + // ------- + + // This test injects a gateway transit timeout (GW408) into the first region only.
+ // Default availability strategy exists - expected outcome is successful response from either the cross + // regional retry in client retry policy of operations against first region - or the hedging + // against the second region + new Object[] { + "GW_Create_GW408_FirstRegionOnly_WithWriteRetries", + Duration.ofSeconds(3), + eagerThresholdAvailabilityStrategy, + noRegionSwitchHint, + ConnectionMode.GATEWAY, + NO_CUSTOM_MIN_RETRY_TIME_IN_REGION_FOR_WRITES, + nonIdempotentWriteRetriesEnabled, + FaultInjectionOperationType.CREATE_ITEM, + createAnotherItemCallback, + injectGatewayTransitTimeoutIntoFirstRegionOnly, + validateStatusCodeIs201Created, + validateDiagnosticsContextHasDiagnosticsForOneOrTwoRegionsButAlwaysContactedSecondRegion + }, }; } @@ -1724,6 +1857,7 @@ public void writeAfterCreation( Duration endToEndTimeout, ThresholdBasedAvailabilityStrategy availabilityStrategy, CosmosRegionSwitchHint regionSwitchHint, + ConnectionMode connectionMode, Duration customMinRetryTimeInLocalRegion, Boolean nonIdempotentWriteRetriesEnabled, FaultInjectionOperationType faultInjectionOperationType, @@ -1749,7 +1883,8 @@ public void writeAfterCreation( null, 0, 0, - false); + false, + connectionMode); } private CosmosResponseWrapper queryReturnsTotalRecordCountCore( @@ -1829,8 +1964,6 @@ public Object[][] testConfigs_queryAfterCreation() { final int ENOUGH_DOCS_OTHER_PK_TO_HIT_EVERY_PARTITION = PHYSICAL_PARTITION_COUNT * 10; final int SINGLE_REGION = 1; final int TWO_REGIONS = 2; - final String FIRST_REGION_NAME = writeableRegions.get(0).toLowerCase(Locale.ROOT); - final String SECOND_REGION_NAME = writeableRegions.get(1).toLowerCase(Locale.ROOT); BiConsumer injectReadSessionNotAvailableIntoFirstRegionOnlyForSinglePartition = (c, operationType) -> injectReadSessionNotAvailableError(c, this.getFirstRegion(), operationType, c.getFeedRanges().block().get(0)); @@ -1995,10 +2128,6 @@ public Object[][] testConfigs_queryAfterCreation() { 
assertThat(secondRegionDiagnostics.getFeedResponseDiagnostics().getClientSideRequestStatistics().size()).isEqualTo(1); }; - BiConsumer injectQueryPlanTransitTimeout = - (c, operationType) -> injectGatewayTransitTimeout( - c, this.getFirstRegion(), FaultInjectionOperationType.METADATA_REQUEST_QUERY_PLAN); - return new Object[][] { // CONFIG description // new Object[] { @@ -2028,6 +2157,7 @@ public Object[][] testConfigs_queryAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, singlePartitionQueryGenerator, queryReturnsTotalRecordCountWithDefaultPageSize, noFailureInjection, @@ -2051,6 +2181,7 @@ public Object[][] testConfigs_queryAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, crossPartitionQueryGenerator, queryReturnsTotalRecordCountWithDefaultPageSize, noFailureInjection, @@ -2078,6 +2209,7 @@ public Object[][] testConfigs_queryAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, singlePartitionQueryGenerator, queryReturnsTotalRecordCountWithPageSizeOne, noFailureInjection, @@ -2105,6 +2237,7 @@ public Object[][] testConfigs_queryAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, crossPartitionQueryGenerator, queryReturnsTotalRecordCountWithPageSizeOne, noFailureInjection, @@ -2131,6 +2264,7 @@ public Object[][] testConfigs_queryAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, singlePartitionEmptyResultQueryGenerator, queryReturnsTotalRecordCountWithPageSizeOne, noFailureInjection, @@ -2155,6 +2289,7 @@ public Object[][] testConfigs_queryAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, crossPartitionEmptyResultQueryGenerator, queryReturnsTotalRecordCountWithPageSizeOne, noFailureInjection, @@ -2187,6 +2322,7 @@ 
public Object[][] testConfigs_queryAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, crossPartitionEmptyResultQueryGenerator, queryReturnsTotalRecordCountWithPageSizeOneAndEmptyPagesEnabled, noFailureInjection, @@ -2229,6 +2365,7 @@ public Object[][] testConfigs_queryAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, crossPartitionQueryGenerator, queryReturnsTotalRecordCountWithDefaultPageSize, noFailureInjection, @@ -2259,6 +2396,7 @@ public Object[][] testConfigs_queryAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, singlePartitionWithAggregatesAndOrderByQueryGenerator, queryReturnsTotalRecordCountWithPageSizeOne, noFailureInjection, @@ -2288,6 +2426,7 @@ public Object[][] testConfigs_queryAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, crossPartitionWithAggregatesAndOrderByQueryGenerator, queryReturnsTotalRecordCountWithPageSizeOne, noFailureInjection, @@ -2320,6 +2459,7 @@ public Object[][] testConfigs_queryAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, crossPartitionWithAggregatesAndOrderByQueryGenerator, queryReturnsTotalRecordCountWithPageSizeOne, noFailureInjection, @@ -2348,6 +2488,7 @@ public Object[][] testConfigs_queryAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, crossPartitionWithAggregatesAndOrderByQueryGenerator, queryReturnsTotalRecordCountWithDefaultPageSize, noFailureInjection, @@ -2389,6 +2530,7 @@ public Object[][] testConfigs_queryAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, crossPartitionWithAggregatesAndOrderByQueryGenerator, queryReturnsTotalRecordCountWithDefaultPageSize, noFailureInjection, @@ -2422,6 +2564,7 @@ 
public Object[][] testConfigs_queryAfterCreation() { Duration.ofSeconds(10), eagerThresholdAvailabilityStrategy, CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + ConnectionMode.DIRECT, singlePartitionQueryGenerator, queryReturnsTotalRecordCountWithDefaultPageSize, injectReadSessionNotAvailableIntoFirstRegionOnly, @@ -2462,6 +2605,7 @@ public Object[][] testConfigs_queryAfterCreation() { Duration.ofSeconds(3), reluctantThresholdAvailabilityStrategy, CosmosRegionSwitchHint.REMOTE_REGION_PREFERRED, + ConnectionMode.DIRECT, crossPartitionQueryGenerator, queryReturnsTotalRecordCountWithDefaultPageSize, injectReadSessionNotAvailableIntoFirstRegionOnly, @@ -2513,6 +2657,7 @@ public Object[][] testConfigs_queryAfterCreation() { Duration.ofSeconds(1), reluctantThresholdAvailabilityStrategy, CosmosRegionSwitchHint.REMOTE_REGION_PREFERRED, + ConnectionMode.DIRECT, crossPartitionQueryGenerator, queryReturnsTotalRecordCountWithDefaultPageSize, injectReadSessionNotAvailableIntoFirstRegionOnlyForSinglePartition, @@ -2562,6 +2707,7 @@ public Object[][] testConfigs_queryAfterCreation() { Duration.ofSeconds(10), eagerThresholdAvailabilityStrategy, CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + ConnectionMode.DIRECT, singlePartitionQueryGenerator, queryReturnsTotalRecordCountWithDefaultPageSize, injectServiceUnavailableIntoAllRegions, @@ -2603,9 +2749,10 @@ public Object[][] testConfigs_queryAfterCreation() { Duration.ofSeconds(3), reluctantThresholdAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, singlePartitionQueryGenerator, queryReturnsTotalRecordCountWithDefaultPageSize, - injectQueryPlanTransitTimeout, + injectQueryPlanTransitTimeoutIntoFirstRegionOnly, validateStatusCodeIs200Ok, 1, ArrayUtils.toArray( @@ -2662,6 +2809,94 @@ public Object[][] testConfigs_queryAfterCreation() { ENOUGH_DOCS_OTHER_PK_TO_HIT_EVERY_PARTITION, NO_OTHER_DOCS_WITH_SAME_PK }, + + // GATEWAY MODE + // ------------ + + // Simple cross partition query - gateway transit timeout (GW408) injected into each
partition of the first region + // No RegionSwitchHint - with eager availability strategy - so, the expectation is that the + // query plan is attempted in the first region and the queries succeed against the second region. + // There should be one CosmosDiagnosticsContext (and page) per partition - the first one should have + // diagnostics for the query plan and the query, the others only for the query in the second region. + new Object[] { + "GW_DefaultPageSize_CrossPartition_GW408_EagerAvailabilityStrategy", + Duration.ofSeconds(3), + eagerThresholdAvailabilityStrategy, + noRegionSwitchHint, + ConnectionMode.GATEWAY, + crossPartitionQueryGenerator, + queryReturnsTotalRecordCountWithDefaultPageSize, + injectGatewayTransitTimeoutIntoFirstRegionOnly, + validateStatusCodeIs200Ok, + PHYSICAL_PARTITION_COUNT, + ArrayUtils.toArray( + validateCtxTwoRegions, // query plan 1st region, all queries 2nd region + validateCtxQueryPlan, + (ctx) -> { + assertThat(ctx.getDiagnostics()).isNotNull(); + CosmosDiagnostics[] diagnostics = ctx.getDiagnostics().toArray(new CosmosDiagnostics[0]); + + // Diagnostics of query attempt in first region not even available yet + assertThat(diagnostics.length).isEqualTo(2); + + // query plan on first region + assertThat(diagnostics[0].getContactedRegionNames().size()).isEqualTo(1); + assertThat(diagnostics[0].getContactedRegionNames().iterator().next()).isEqualTo(FIRST_REGION_NAME); + }, + (ctx) -> { + assertThat(ctx.getDiagnostics()).isNotNull(); + CosmosDiagnostics[] diagnostics = ctx.getDiagnostics().toArray(new CosmosDiagnostics[0]); + assertThat(diagnostics[1].getContactedRegionNames().size()).isEqualTo(1); + assertThat(diagnostics[1].getContactedRegionNames().iterator().next()).isEqualTo(SECOND_REGION_NAME); + assertThat(diagnostics[1].getFeedResponseDiagnostics()).isNotNull(); + assertThat(diagnostics[1].getFeedResponseDiagnostics().getQueryMetricsMap()).isNotNull(); + assertThat(diagnostics[1].getFeedResponseDiagnostics().getClientSideRequestStatistics()).isNotNull(); +
ClientSideRequestStatistics[] clientStats = + diagnostics[1] + .getFeedResponseDiagnostics() + .getClientSideRequestStatistics() + .toArray(new ClientSideRequestStatistics[0]); + assertThat(clientStats.length).isEqualTo(1); + for (int i = 0; i < clientStats.length; i++) { + assertThat(clientStats[i].getContactedRegionNames()).isNotNull(); + assertThat(clientStats[i].getContactedRegionNames().size()).isEqualTo(1); + assertThat(clientStats[i].getContactedRegionNames().iterator().next()).isEqualTo(SECOND_REGION_NAME); + assertThat(clientStats[i].getGatewayStatisticsList()).isNotNull(); + assertThat(clientStats[i].getResponseStatisticsList()).isNotNull(); + assertThat(clientStats[i].getResponseStatisticsList().size()).isEqualTo(0); + } + } + ), + ArrayUtils.toArray( + validateCtxSingleRegion, + (ctx) -> { + assertThat(ctx.getDiagnostics()).isNotNull(); + CosmosDiagnostics[] diagnostics = ctx.getDiagnostics().toArray(new CosmosDiagnostics[0]); + assertThat(diagnostics[0].getContactedRegionNames().size()).isEqualTo(1); + assertThat(diagnostics[0].getContactedRegionNames().iterator().next()).isEqualTo(SECOND_REGION_NAME); + assertThat(diagnostics[0].getFeedResponseDiagnostics()).isNotNull(); + assertThat(diagnostics[0].getFeedResponseDiagnostics().getQueryMetricsMap()).isNotNull(); + assertThat(diagnostics[0].getFeedResponseDiagnostics().getClientSideRequestStatistics()).isNotNull(); + ClientSideRequestStatistics[] clientStats = + diagnostics[0] + .getFeedResponseDiagnostics() + .getClientSideRequestStatistics() + .toArray(new ClientSideRequestStatistics[0]); + assertThat(clientStats.length).isEqualTo(1); + for (int i = 0; i < clientStats.length; i++) { + assertThat(clientStats[i].getContactedRegionNames()).isNotNull(); + assertThat(clientStats[i].getContactedRegionNames().size()).isEqualTo(1); + assertThat(clientStats[i].getContactedRegionNames().iterator().next()).isEqualTo(SECOND_REGION_NAME); + assertThat(clientStats[i].getGatewayStatisticsList()).isNotNull(); + 
assertThat(clientStats[i].getResponseStatisticsList()).isNotNull(); + assertThat(clientStats[i].getResponseStatisticsList().size()).isEqualTo(0); + } + } + ), + validateAllRecordsSameIdReturned, + ENOUGH_DOCS_OTHER_PK_TO_HIT_EVERY_PARTITION, + NO_OTHER_DOCS_WITH_SAME_PK + }, }; } @@ -2671,6 +2906,7 @@ public void queryAfterCreation( Duration endToEndTimeout, ThresholdBasedAvailabilityStrategy availabilityStrategy, CosmosRegionSwitchHint regionSwitchHint, + ConnectionMode connectionMode, Function queryGenerator, BiFunction queryExecution, BiConsumer faultInjectionCallback, @@ -2699,7 +2935,8 @@ public void queryAfterCreation( responseValidator, numberOfOtherDocumentsWithSameId, numberOfOtherDocumentsWithSamePk, - false); + false, + connectionMode); } private CosmosResponseWrapper readManyCore( @@ -3177,7 +3414,8 @@ public void readManyAfterCreation( responseValidator, numberOfOtherDocumentsWithSameId, numberOfOtherDocumentsWithSamePk, - false); + false, + ConnectionMode.DIRECT); } private CosmosResponseWrapper readAllReturnsTotalRecordCountCore( @@ -3258,8 +3496,6 @@ public Object[][] testConfigs_readAllAfterCreation() { final int ENOUGH_DOCS_OTHER_PK_TO_HIT_EVERY_PARTITION = PHYSICAL_PARTITION_COUNT * 10; final int SINGLE_REGION = 1; final int TWO_REGIONS = 2; - final String FIRST_REGION_NAME = writeableRegions.get(0).toLowerCase(Locale.ROOT); - final String SECOND_REGION_NAME = writeableRegions.get(1).toLowerCase(Locale.ROOT); BiConsumer validateExpectedRecordCount = (response, expectedRecordCount) -> { if (expectedRecordCount != null) { @@ -3464,6 +3700,7 @@ public Object[][] testConfigs_readAllAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, readAllDefaultPageSizeEntireContainer, noFailureInjection, validateStatusCodeIs200Ok, @@ -3490,6 +3727,7 @@ public Object[][] testConfigs_readAllAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, 
readAllDefaultPageSizeEntireContainer, noFailureInjection, validateStatusCodeIs200Ok, @@ -3517,6 +3755,7 @@ public Object[][] testConfigs_readAllAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, readAllDefaultPageSizeEntireContainerEnforceEmptyPages, noFailureInjection, validateStatusCodeIs200Ok, @@ -3550,6 +3789,7 @@ public Object[][] testConfigs_readAllAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, readAllPageSizeOneEntireContainer, noFailureInjection, validateStatusCodeIs200Ok, @@ -3584,6 +3824,7 @@ public Object[][] testConfigs_readAllAfterCreation() { Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, readAllDefaultPageSizeSinglePartition, noFailureInjection, validateStatusCodeIs200Ok, @@ -3602,13 +3843,14 @@ public Object[][] testConfigs_readAllAfterCreation() { ENOUGH_DOCS_SAME_PK_TO_EXCEED_PAGE_SIZE }, - // ReadAll (entire container) with single doc inserted into single partition only. + // ReadAll (entire container) with multiple docs with same "id" inserted across partitions. 
// No failure injection and all records will fit into a single page new Object[] { "DefaultPageSize_Container_DocsAcrossAllPartitions_AllGood_NoAvailabilityStrategy", Duration.ofSeconds(1), noAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, readAllDefaultPageSizeEntireContainer, noFailureInjection, validateStatusCodeIs200Ok, @@ -3642,6 +3884,7 @@ public Object[][] testConfigs_readAllAfterCreation() { Duration.ofSeconds(1), eagerThresholdAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, readAllDefaultPageSizeEntireContainer, injectTransitTimeoutIntoFirstRegionOnly, validateStatusCodeIs200Ok, @@ -3680,6 +3923,7 @@ public Object[][] testConfigs_readAllAfterCreation() { Duration.ofSeconds(3), eagerThresholdAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, readAllPageSizeOneSinglePartition, injectTransitTimeoutIntoFirstRegionOnly, validateStatusCodeIs200Ok, @@ -3724,6 +3968,7 @@ public Object[][] testConfigs_readAllAfterCreation() { Duration.ofSeconds(1), eagerThresholdAvailabilityStrategy, noRegionSwitchHint, + ConnectionMode.DIRECT, readAllDefaultPageSizeEntireContainer, injectReadSessionNotAvailableIntoFirstRegionOnly, validateStatusCodeIs200Ok, @@ -3754,6 +3999,85 @@ public Object[][] testConfigs_readAllAfterCreation() { ENOUGH_DOCS_OTHER_PK_TO_HIT_EVERY_PARTITION, NO_OTHER_DOCS_WITH_SAME_PK }, + + // GATEWAY MODE + //------------- + + // ReadAll (entire container) with single doc inserted into single partition only. 
+ // No failure injection and all records will fit into a single page + new Object[] { + "GW_DefaultPageSize_Container_SingleDocument_AllGood_NoAvailabilityStrategy", + Duration.ofSeconds(3), + noAvailabilityStrategy, + noRegionSwitchHint, + ConnectionMode.GATEWAY, + readAllDefaultPageSizeEntireContainer, + noFailureInjection, + validateStatusCodeIs200Ok, + 1, + ArrayUtils.toArray( + validateCtxSingleRegion, + validateCtxQueryPlan, + validateCtxOnlyFeedResponsesExceptQueryPlan, + (ctx) -> { + assertThat(ctx.getDiagnostics()).isNotNull(); + assertThat(ctx.getDiagnostics().size()).isEqualTo(2); + } + ), + null, + validateExactlyOneRecordReturned, + NO_OTHER_DOCS_WITH_SAME_ID, + NO_OTHER_DOCS_WITH_SAME_PK + }, + + new Object[] { + "GW_DefaultPageSize_Container_SingleDocument_GW408_EagerAvailabilityStrategy", + Duration.ofSeconds(3), + eagerThresholdAvailabilityStrategy, + noRegionSwitchHint, + ConnectionMode.GATEWAY, + readAllDefaultPageSizeEntireContainer, + injectGatewayTransitTimeoutIntoFirstRegionOnly, + validateStatusCodeIs200Ok, + 1, + ArrayUtils.toArray( + validateCtxQueryPlan, + validateCtxOnlyFeedResponsesExceptQueryPlan, + (ctx) -> { + assertThat(ctx.getContactedRegionNames()).isNotNull(); + assertThat(ctx.getContactedRegionNames().size()).isGreaterThanOrEqualTo(1); + assertThat(ctx.getContactedRegionNames().contains(SECOND_REGION_NAME)).isEqualTo(true); + assertThat(ctx.getDiagnostics()).isNotNull(); + CosmosDiagnostics[] diagnostics = ctx.getDiagnostics().toArray(new CosmosDiagnostics[0]); + + // Diagnostics of query attempt in first region not even available yet + assertThat(diagnostics.length).isEqualTo(2); + assertThat(diagnostics[1].getContactedRegionNames().size()).isEqualTo(1); + assertThat(diagnostics[1].getContactedRegionNames().iterator().next()).isEqualTo(SECOND_REGION_NAME); + assertThat(diagnostics[1].getFeedResponseDiagnostics()).isNotNull(); + assertThat(diagnostics[1].getFeedResponseDiagnostics().getQueryMetricsMap()).isNotNull(); + 
assertThat(diagnostics[1].getFeedResponseDiagnostics().getClientSideRequestStatistics()).isNotNull(); + ClientSideRequestStatistics[] clientStats = + diagnostics[1] + .getFeedResponseDiagnostics() + .getClientSideRequestStatistics() + .toArray(new ClientSideRequestStatistics[0]); + assertThat(clientStats.length).isEqualTo(PHYSICAL_PARTITION_COUNT); + for (int i = 0; i < clientStats.length; i++) { + assertThat(clientStats[i].getContactedRegionNames()).isNotNull(); + assertThat(clientStats[i].getContactedRegionNames().size()).isEqualTo(1); + assertThat(clientStats[i].getContactedRegionNames().iterator().next()).isEqualTo(SECOND_REGION_NAME); + assertThat(clientStats[i].getGatewayStatisticsList()).isNotNull(); + assertThat(clientStats[i].getResponseStatisticsList()).isNotNull(); + assertThat(clientStats[i].getResponseStatisticsList().size()).isEqualTo(0); + } + } + ), + null, + validateExactlyOneRecordReturned, + NO_OTHER_DOCS_WITH_SAME_ID, + NO_OTHER_DOCS_WITH_SAME_PK + }, }; } @@ -3763,6 +4087,7 @@ public void readAllAfterCreation( Duration endToEndTimeout, ThresholdBasedAvailabilityStrategy availabilityStrategy, CosmosRegionSwitchHint regionSwitchHint, + ConnectionMode connectionMode, Function readAllOperation, BiConsumer faultInjectionCallback, BiConsumer validateStatusCode, @@ -3790,7 +4115,8 @@ public void readAllAfterCreation( responseValidator, numberOfOtherDocumentsWithSameId, numberOfOtherDocumentsWithSamePk, - true); + true, + connectionMode); } private static ObjectNode createTestItemAsJson(String id, String pkValue) { @@ -4019,7 +4345,8 @@ private void execute( Consumer validateResponse, int numberOfOtherDocumentsWithSameId, int numberOfOtherDocumentsWithSamePk, - boolean clearContainerBeforeExecution) { + boolean clearContainerBeforeExecution, + ConnectionMode connectionMode) { logger.info("START {}", testCaseId); @@ -4027,7 +4354,8 @@ private void execute( this.writeableRegions, regionSwitchHint, customMinRetryTimeInLocalRegionForWrites, - 
nonIdempotentWriteRetriesEnabled); + nonIdempotentWriteRetriesEnabled, + connectionMode); try { if (clearContainerBeforeExecution) { @@ -4169,7 +4497,8 @@ private static CosmosAsyncClient buildCosmosClient( List preferredRegions, CosmosRegionSwitchHint regionSwitchHint, Duration customMinRetryTimeInLocalRegionForWrites, - Boolean nonIdempotentWriteRetriesEnabled) { + Boolean nonIdempotentWriteRetriesEnabled, + ConnectionMode connectionMode) { CosmosClientTelemetryConfig telemetryConfig = new CosmosClientTelemetryConfig() .diagnosticsHandler(new CosmosDiagnosticsLogger()); @@ -4190,10 +4519,15 @@ private static CosmosAsyncClient buildCosmosClient( .consistencyLevel(ConsistencyLevel.SESSION) .preferredRegions(preferredRegions) .sessionRetryOptions(retryOptionsBuilder.build()) - .directMode() .multipleWriteRegionsEnabled(true) .clientTelemetryConfig(telemetryConfig); + if (connectionMode == ConnectionMode.GATEWAY) { + builder.gatewayMode(); + } else { + builder.directMode(); + } + if (nonIdempotentWriteRetriesEnabled != null) { builder.setNonIdempotentWriteRetryPolicy( nonIdempotentWriteRetriesEnabled, true); diff --git a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/MaxRetryCountTests.java b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/MaxRetryCountTests.java new file mode 100644 index 0000000000000..50a11c3a1a0a0 --- /dev/null +++ b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/MaxRetryCountTests.java @@ -0,0 +1,2661 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+package com.azure.cosmos; + +import com.azure.cosmos.implementation.AsyncDocumentClient; +import com.azure.cosmos.implementation.ClientSideRequestStatistics; +import com.azure.cosmos.implementation.Configs; +import com.azure.cosmos.implementation.DatabaseAccount; +import com.azure.cosmos.implementation.DatabaseAccountLocation; +import com.azure.cosmos.implementation.GlobalEndpointManager; +import com.azure.cosmos.implementation.HttpConstants; +import com.azure.cosmos.implementation.OperationType; +import com.azure.cosmos.implementation.RxDocumentClientImpl; +import com.azure.cosmos.implementation.TestConfigurations; +import com.azure.cosmos.implementation.Utils; +import com.azure.cosmos.implementation.apachecommons.lang.tuple.ImmutablePair; +import com.azure.cosmos.implementation.apachecommons.lang.tuple.Pair; +import com.azure.cosmos.implementation.directconnectivity.ReflectionUtils; +import com.azure.cosmos.models.CosmosClientTelemetryConfig; +import com.azure.cosmos.models.CosmosContainerProperties; +import com.azure.cosmos.models.CosmosItemResponse; +import com.azure.cosmos.models.CosmosPatchItemRequestOptions; +import com.azure.cosmos.models.FeedRange; +import com.azure.cosmos.models.PartitionKey; +import com.azure.cosmos.models.PartitionKeyDefinition; +import com.azure.cosmos.models.ThroughputProperties; +import com.azure.cosmos.rx.TestSuiteBase; +import com.azure.cosmos.test.faultinjection.CosmosFaultInjectionHelper; +import com.azure.cosmos.test.faultinjection.FaultInjectionCondition; +import com.azure.cosmos.test.faultinjection.FaultInjectionConditionBuilder; +import com.azure.cosmos.test.faultinjection.FaultInjectionConnectionType; +import com.azure.cosmos.test.faultinjection.FaultInjectionEndpointBuilder; +import com.azure.cosmos.test.faultinjection.FaultInjectionOperationType; +import com.azure.cosmos.test.faultinjection.FaultInjectionResultBuilders; +import com.azure.cosmos.test.faultinjection.FaultInjectionRule; +import 
com.azure.cosmos.test.faultinjection.FaultInjectionRuleBuilder; +import com.azure.cosmos.test.faultinjection.FaultInjectionServerErrorResult; +import com.azure.cosmos.test.faultinjection.FaultInjectionServerErrorType; +import com.fasterxml.jackson.databind.node.ObjectNode; +import org.apache.commons.lang3.ArrayUtils; +import org.apache.logging.log4j.util.TriConsumer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.UUID; +import java.util.concurrent.ConcurrentHashMap; +import java.util.function.BiConsumer; +import java.util.function.Consumer; +import java.util.function.Function; + +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; +import static org.assertj.core.api.AssertionsForClassTypes.fail; + +@SuppressWarnings("SameParameterValue") +public class MaxRetryCountTests extends TestSuiteBase { + private static final int PHYSICAL_PARTITION_COUNT = 3; + private final static Logger logger = LoggerFactory.getLogger(MaxRetryCountTests.class); + + private final static String sameDocumentIdJustCreated = null; + + private final static Boolean notSpecifiedWhetherIdempotentWriteRetriesAreEnabled = null; + private final static ThresholdBasedAvailabilityStrategy noAvailabilityStrategy = null; + private final static Duration defaultNetworkRequestTimeoutDuration = Duration.ofSeconds(5); + private final static Duration minNetworkRequestTimeoutDuration = Duration.ofSeconds(1); + private final static ThrottlingRetryOptions defaultThrottlingRetryOptions = new ThrottlingRetryOptions(); + + private final static BiConsumer
validateStatusCodeIsReadSessionNotAvailableError = + (statusCode, subStatusCode) -> { + assertThat(statusCode).isEqualTo(HttpConstants.StatusCodes.NOTFOUND); + assertThat(subStatusCode).isEqualTo(HttpConstants.SubStatusCodes.READ_SESSION_NOT_AVAILABLE); + }; + + private final static BiConsumer validateStatusCodeIsInternalServerError = + (statusCode, subStatusCode) -> { + assertThat(statusCode).isEqualTo(HttpConstants.StatusCodes.INTERNAL_SERVER_ERROR); + assertThat(subStatusCode).isEqualTo(HttpConstants.SubStatusCodes.UNKNOWN); + }; + + private final static BiConsumer validateStatusCodeIsServiceUnavailable = + (statusCode, subStatusCode) -> { + assertThat(statusCode).isEqualTo(HttpConstants.StatusCodes.SERVICE_UNAVAILABLE); + assertThat(subStatusCode).isEqualTo(HttpConstants.SubStatusCodes.SERVER_GENERATED_503); + }; + + private final static BiConsumer validateStatusCodeIsRetryWith = + (statusCode, subStatusCode) -> { + assertThat(statusCode).isEqualTo(HttpConstants.StatusCodes.RETRY_WITH); + assertThat(subStatusCode).isEqualTo(HttpConstants.SubStatusCodes.UNKNOWN); + }; + + // TODO: currently there is an issue where the subStatus code is missed in read especially when all replicas are failed with 410 + // StoreReader line #486 + private final static BiConsumer validateStatusCodeIsServerGoneGenerated503ForRead = + (statusCode, subStatusCode) -> { + assertThat(statusCode).isEqualTo(HttpConstants.StatusCodes.SERVICE_UNAVAILABLE); + assertThat(subStatusCode).isEqualTo(HttpConstants.SubStatusCodes.UNKNOWN); + }; + + private final static BiConsumer validateStatusCodeIsServerTimeoutGenerated503ForRead = + (statusCode, subStatusCode) -> { + assertThat(statusCode).isEqualTo(HttpConstants.StatusCodes.SERVICE_UNAVAILABLE); + assertThat(subStatusCode).isEqualTo(HttpConstants.SubStatusCodes.UNKNOWN); + }; + + private final static BiConsumer validateStatusCodeIsServerGoneGenerated503ForWrite = + (statusCode, subStatusCode) -> { + 
assertThat(statusCode).isEqualTo(HttpConstants.StatusCodes.SERVICE_UNAVAILABLE); + assertThat(subStatusCode).isEqualTo(HttpConstants.SubStatusCodes.SERVER_GENERATED_410); + }; + + private final static BiConsumer validateStatusCodeIsServerTimeoutGenerated410ForWrite = + (statusCode, subStatusCode) -> { + assertThat(statusCode).isEqualTo(HttpConstants.StatusCodes.GONE); + assertThat(subStatusCode).isEqualTo(HttpConstants.SubStatusCodes.SERVER_GENERATED_408); + }; + + private final static BiConsumer validateStatusCodeIsServerTimeoutGenerated503ForWrite = + (statusCode, subStatusCode) -> { + assertThat(statusCode).isEqualTo(HttpConstants.StatusCodes.SERVICE_UNAVAILABLE); + assertThat(subStatusCode).isEqualTo(HttpConstants.SubStatusCodes.SERVER_GENERATED_408); + }; + + private final static BiConsumer validateStatusCodeIsTimeout = + (statusCode, subStatusCode) -> { + assertThat(statusCode).isEqualTo(HttpConstants.StatusCodes.REQUEST_TIMEOUT); + assertThat(subStatusCode).isEqualTo(HttpConstants.SubStatusCodes.UNKNOWN); + }; + + private final static BiConsumer validateStatusCodeIsTransitTimeoutGenerated503ForWrite = + (statusCode, subStatusCode) -> { + assertThat(statusCode).isEqualTo(HttpConstants.StatusCodes.SERVICE_UNAVAILABLE); + assertThat(subStatusCode).isEqualTo(HttpConstants.SubStatusCodes.TRANSPORT_GENERATED_410); + }; + + private final static BiConsumer validateStatusCodeIsRequestRateTooLarge = + (statusCode, subStatusCode) -> { + assertThat(statusCode).isEqualTo(HttpConstants.StatusCodes.TOO_MANY_REQUESTS); + assertThat(subStatusCode).isEqualTo(HttpConstants.SubStatusCodes.USER_REQUEST_RATE_TOO_LARGE); + }; + + private final static BiConsumer noFailureInjection = + (container, operationType) -> {}; + + private BiConsumer injectReadSessionNotAvailableIntoAllRegions = null; + + private BiConsumer injectServiceUnavailableIntoAllRegions = null; + + private BiConsumer injectInternalServerErrorIntoAllRegions = null; + + private BiConsumer 
injectRetryWithErrorIntoAllRegions = null; + private BiConsumer injectServerGoneErrorIntoAllRegions = null; + private Function> injectTransitTimeoutIntoAllRegions = null; + private BiConsumer injectServerTimeoutErrorIntoAllRegions = null; + private BiConsumer injectRequestRateTooLargeIntoAllRegions = null; + + private String FIRST_REGION_NAME = null; + private String SECOND_REGION_NAME = null; + + private List writeableRegions; + + private String testDatabaseId; + private String testContainerId; + + @Override + public String resolveTestNameSuffix(Object[] row) { + if (row == null || row.length == 0) { + return ""; + } + + return (String)row[0]; + } + + @BeforeClass(groups = { "multi-master" }) + public void beforeClass() { + CosmosClientBuilder clientBuilder = new CosmosClientBuilder() + .endpoint(TestConfigurations.HOST) + .key(TestConfigurations.MASTER_KEY) + .contentResponseOnWriteEnabled(true) + .directMode(); + + CosmosAsyncClient dummyClient = null; + + try { + + dummyClient = clientBuilder.buildAsyncClient(); + + AsyncDocumentClient asyncDocumentClient = ReflectionUtils.getAsyncDocumentClient(dummyClient); + RxDocumentClientImpl rxDocumentClient = (RxDocumentClientImpl) asyncDocumentClient; + GlobalEndpointManager globalEndpointManager = + ReflectionUtils.getGlobalEndpointManager(rxDocumentClient); + + DatabaseAccount databaseAccount = globalEndpointManager.getLatestDatabaseAccount(); + + Map writeRegionMap = this.getRegionMap(databaseAccount, true); + + this.writeableRegions = new ArrayList<>(writeRegionMap.keySet()); + assertThat(this.writeableRegions).isNotNull(); + assertThat(this.writeableRegions.size()).isGreaterThanOrEqualTo(2); + + FIRST_REGION_NAME = this.writeableRegions.get(0).toLowerCase(Locale.ROOT); + SECOND_REGION_NAME = this.writeableRegions.get(1).toLowerCase(Locale.ROOT); + + FeedRange ALL_PARTITIONS = null; + + this.injectReadSessionNotAvailableIntoAllRegions = + (c, operationType) -> injectReadSessionNotAvailableError(c, 
this.writeableRegions, operationType, ALL_PARTITIONS); + + this.injectServiceUnavailableIntoAllRegions = + (c, operationType) -> injectServiceUnavailable(c, this.writeableRegions, operationType); + + this.injectInternalServerErrorIntoAllRegions = + (c, operationType) -> injectInternalServerError(c, this.writeableRegions, operationType); + + this.injectRetryWithErrorIntoAllRegions = + (c, operationType) -> injectRetryWithServerError(c, this.writeableRegions, operationType); + + this.injectServerGoneErrorIntoAllRegions = + (c, operationType) -> injectServerGoneError(c, this.writeableRegions, operationType); + + this.injectTransitTimeoutIntoAllRegions = + (networkRequestTimeoutDuration) -> + (c, operationType) -> injectTransitTimeoutError(c, this.writeableRegions, operationType, networkRequestTimeoutDuration); + + this.injectServerTimeoutErrorIntoAllRegions = + (c, operationType) -> injectServerTimeoutError(c, this.writeableRegions, operationType); + + this.injectRequestRateTooLargeIntoAllRegions = + (c, operationType) -> injectServerRequestRateTooLargeError(c, this.writeableRegions, operationType); + + CosmosAsyncContainer container = this.createTestContainer(dummyClient); + this.testDatabaseId = container.getDatabase().getId(); + this.testContainerId = container.getId(); + + // Creating a container is an async task - especially with multiple regions it can + // take some time until the container is available in the remote regions as well + // When the container does not exist yet, you would see 401 for example for point reads etc. 
+ // So, adding this delay after container creation to minimize risk of hitting these errors + try { + Thread.sleep(3000); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + + } finally { + safeClose(dummyClient); + } + } + @AfterClass(groups = { "multi-master" }) + public void afterClass() { + CosmosClientBuilder clientBuilder = new CosmosClientBuilder() + .endpoint(TestConfigurations.HOST) + .key(TestConfigurations.MASTER_KEY) + .contentResponseOnWriteEnabled(true) + .directMode(); + + CosmosAsyncClient dummyClient = null; + if (testDatabaseId != null) { + try { + + dummyClient = clientBuilder.buildAsyncClient(); + CosmosAsyncDatabase testDatabase = dummyClient + .getDatabase(this.testDatabaseId); + + safeDeleteDatabase(testDatabase); + } finally { + safeClose(dummyClient); + } + } + } + + private static int expectedMaxNumberOfRetriesForLocalRegionPreferred( + int sessionTokenMismatchWaitTime, + int sessionTokenMismatchInitialBackoff, + int sessionTokenMismatchMaxBackoff + ) { + int totalRetryPolicyDuration = sessionTokenMismatchWaitTime; + int currentBackoff = sessionTokenMismatchInitialBackoff; + int waitTime = 0; + int count = 1; + while (waitTime <= totalRetryPolicyDuration) { + waitTime += currentBackoff; + currentBackoff = Math.min(currentBackoff * 2, sessionTokenMismatchMaxBackoff); + count += 1; + } + + logger.info( + "expectedMaxNumberOfRetriesForLocalRegionPreferred {}, {}, {} == {}", + sessionTokenMismatchWaitTime, + sessionTokenMismatchInitialBackoff, + sessionTokenMismatchMaxBackoff, + count); + + return count + 1; + } + + private static int expectedMaxNumberOfRetriesForRetryWith( + int maxWaitTimeInMS, + int maxBackoffTimeInMs, + int initialBackoffTimeInMs, + int backoffMultiplier) { + int currentBackoff = initialBackoffTimeInMs; + int waitTime = 0; + int count = 1; + + while (waitTime <= maxWaitTimeInMS) { + waitTime += currentBackoff; + currentBackoff = Math.min(currentBackoff * backoffMultiplier, maxBackoffTimeInMs); + 
count += 1; + } + + logger.info( + "expectedMaxNumberOfRetriesForRetryWith [maxWaitTimeInMS {}, maxBackoffTimeInMs {}, initialBackoffTimeInMs {}] == {}", + maxWaitTimeInMS, + maxBackoffTimeInMs, + initialBackoffTimeInMs, + count); + + return count + 1; + } + + private static int expectedMaxNumberOfRetriesForGone( + int maxWaitTimeInMs, + int maxBackoffTimeInMs, + int initialBackoffTimeInMs, + int backoffMultiplier, + ConsistencyLevel consistencyLevel, + OperationType operationType) { + + int requestsInFirstRetryCycle = 0; + int requestsInFollowingRetryCycle = 0; + if (operationType.isWriteOperation()) { + requestsInFirstRetryCycle = 1; + requestsInFollowingRetryCycle = 1; + } else { + switch (consistencyLevel) { + case EVENTUAL: + case CONSISTENT_PREFIX: + requestsInFirstRetryCycle = 1; + requestsInFollowingRetryCycle = 1; + break; + case SESSION: + requestsInFirstRetryCycle = 4; + requestsInFollowingRetryCycle = 4; + break; + case BOUNDED_STALENESS: + case STRONG: + requestsInFirstRetryCycle = 4 * 2; // QuorumNotSelected, SDK will retry 2 times before bubble up the exception + requestsInFollowingRetryCycle = 4; + break; + default: + throw new IllegalArgumentException("Consistency level is not supported " + consistencyLevel); + } + } + + int currentBackoff = 0; + long remainingTimeInMs = maxWaitTimeInMs; + int count = requestsInFirstRetryCycle; + boolean firstRetryAttempt = true; + int maxRetriesInLocalRegion = 1; // includes the first try + + while (remainingTimeInMs > 0 || firstRetryAttempt) { + maxRetriesInLocalRegion ++; + if (firstRetryAttempt && remainingTimeInMs < 0) { + // SDK will always retry on the local region one more time + remainingTimeInMs = maxBackoffTimeInMs; + } + + firstRetryAttempt = false; + remainingTimeInMs -= currentBackoff; + count += requestsInFollowingRetryCycle; + + if (currentBackoff == 0) { + currentBackoff = initialBackoffTimeInMs; + } else { + currentBackoff = Math.min(currentBackoff * backoffMultiplier, maxBackoffTimeInMs); + } + } 
+ + logger.info( + "expectedMaxNumberOfRetriesForGone [maxWaitTimeInMs {}, maxBackoffTimeInMs {}, initialBackoffTimeInMs {}, " + + "consistencyLevel {}, OperationType {}] == {}, maxRetriesInLocalRegion {}", + maxWaitTimeInMs, + maxBackoffTimeInMs, + initialBackoffTimeInMs, + consistencyLevel, + operationType, + count, + maxRetriesInLocalRegion); + + return count; + } + + private static int expectedMaxNumberOfRetriesForTransientTimeout( + int maxWaitTimeInMs, + int maxBackoffTimeInMs, + int initialBackoffTimeInMs, + int backoffMultiplier, + ConsistencyLevel consistencyLevel, + OperationType operationType, + Duration networkRequestTimeout, + Boolean idempotentWriteRetriesAreEnabled) { + + int count = 0; + int maxRetriesInLocalRegion = 1; // includes the first try + + if (operationType.isWriteOperation() + && (idempotentWriteRetriesAreEnabled == null || !idempotentWriteRetriesAreEnabled)) { + count = 1; + } else { + long latencyInFirstRetryCycleInMs = 0; + int requestsInFirstRetryCycle = 0; + + long latencyInFollowingRetryCycleInMs = 0; + int requestsInFollowingRetryCycle = 0; + + if (operationType.isWriteOperation()) { + latencyInFirstRetryCycleInMs = networkRequestTimeout.toMillis(); + requestsInFirstRetryCycle = 1; + requestsInFollowingRetryCycle = 1; + latencyInFollowingRetryCycleInMs = networkRequestTimeout.toMillis(); + } else { + switch (consistencyLevel) { + case EVENTUAL: + case CONSISTENT_PREFIX: + latencyInFirstRetryCycleInMs = networkRequestTimeout.toMillis(); + requestsInFirstRetryCycle = 1; + latencyInFollowingRetryCycleInMs = networkRequestTimeout.toMillis(); + requestsInFollowingRetryCycle = 1; + break; + case SESSION: + latencyInFirstRetryCycleInMs = networkRequestTimeout.toMillis() * 4; + requestsInFirstRetryCycle = 4; + latencyInFollowingRetryCycleInMs = networkRequestTimeout.toMillis() * 4; + requestsInFollowingRetryCycle = 4; + break; + case BOUNDED_STALENESS: + case STRONG: + latencyInFirstRetryCycleInMs = networkRequestTimeout.toMillis() * 3 * 
2; // for quorum reads, SDK will send two requests in parallel + requestsInFirstRetryCycle = 4 * 2;// QuorumNotSelected, SDK will retry 2 times before bubble up the exception + latencyInFollowingRetryCycleInMs = networkRequestTimeout.toMillis() * 3; // for quorum reads, SDK will send two requests in parallel + requestsInFollowingRetryCycle = 4; // after the first cycle, forceRefreshHeader on the requests has been set, so instead of 2, it will only retry 1 time + break; + default: + throw new IllegalArgumentException("Consistency level is not supported " + consistencyLevel); + } + } + + int currentBackoff = 0; + long remainingTimeInMs = maxWaitTimeInMs; + count = requestsInFirstRetryCycle; + remainingTimeInMs -= latencyInFirstRetryCycleInMs; + boolean firstRetryAttempt = true; + + while (remainingTimeInMs > 0 || firstRetryAttempt) { + maxRetriesInLocalRegion++; + if (firstRetryAttempt && remainingTimeInMs < 0) { + // SDK will always retry on the local region one more time + remainingTimeInMs = maxBackoffTimeInMs; + } + + firstRetryAttempt = false; + remainingTimeInMs -= currentBackoff; + count += requestsInFollowingRetryCycle; + remainingTimeInMs -= latencyInFollowingRetryCycleInMs; + + if (currentBackoff == 0) { + currentBackoff = initialBackoffTimeInMs; + } else { + currentBackoff = Math.min(currentBackoff * backoffMultiplier, maxBackoffTimeInMs); + } + } + } + + logger.info( + "expectedMaxNumberOfRetriesForTransitTimeout [maxWaitTimeInMs {}, maxBackoffTimeInMs {}, initialBackoffTimeInMs {}, " + + "consistencyLevel {}, OperationType {}, networkRequestTimeout {}, idempotentWriteRetriesAreEnabled {}] == {}, maxRetriesInLocalRegion {}", + maxWaitTimeInMs, + maxBackoffTimeInMs, + initialBackoffTimeInMs, + consistencyLevel, + operationType, + networkRequestTimeout, + idempotentWriteRetriesAreEnabled, + count, + maxRetriesInLocalRegion + ); + + return count; + } + + private static int expectedMaxNumberOfRetriesForServerInternalServerError( + ConsistencyLevel 
consistencyLevel, + OperationType operationType) { + + if (operationType.isWriteOperation()) { + return 1; + } + + switch (consistencyLevel) { + case EVENTUAL: + case CONSISTENT_PREFIX: + case SESSION: + return 1; + case BOUNDED_STALENESS: + case STRONG: + return 2; // SDK do quorum reads + default: + throw new IllegalArgumentException("Consitency level is not supported " + consistencyLevel); + } + } + + private static int expectedMaxNumberOfRetriesForServerServiceUnavailable( + ConsistencyLevel consistencyLevel, + OperationType operationType) { + + if (operationType.isWriteOperation()) { + return 1; + } + + switch (consistencyLevel) { + case EVENTUAL: + case CONSISTENT_PREFIX: + case SESSION: + return 1; + case BOUNDED_STALENESS: + case STRONG: + return 2; // SDK do quorum reads + default: + throw new IllegalArgumentException("Consitency level is not supported " + consistencyLevel); + } + } + + private static int expectedMaxNumberOfRetriesForServerRequestRateTooLarge( + ConsistencyLevel consistencyLevel, + OperationType operationType, + ThrottlingRetryOptions retryOptions) { + + if (operationType.isWriteOperation()) { + return retryOptions.getMaxRetryAttemptsOnThrottledRequests() + 1; + } + + switch (consistencyLevel) { + case EVENTUAL: + case CONSISTENT_PREFIX: + case SESSION: + return retryOptions.getMaxRetryAttemptsOnThrottledRequests() + 1; + case BOUNDED_STALENESS: + case STRONG: + return 2 * (retryOptions.getMaxRetryAttemptsOnThrottledRequests() + 1); // SDK do quorum reads + default: + throw new IllegalArgumentException("Consitency level is not supported " + consistencyLevel); + } + } + + @DataProvider(name = "readMaxRetryCount_readSessionNotAvailable") + public Object[][] testConfigs_readMaxRetryCount_readSessionNotAvailable() { + final Integer MAX_LOCAL_RETRY_COUNT_DEFAULT = null; + final Integer MAX_LOCAL_RETRY_COUNT_ONE = 1; + final Integer MAX_LOCAL_RETRY_COUNT_TWO = 2; + final Integer MAX_LOCAL_RETRY_COUNT_ZERO = 0; + final Integer 
MAX_LOCAL_RETRY_COUNT_THREE = 3; + final Integer MAX_LOCAL_RETRY_COUNT_FOUR = 4; + + final Integer SESSION_TOKEN_MISMATCH_WAIT_TIME_DEFAULT = null; + final Integer SESSION_TOKEN_MISMATCH_WAIT_TIME_FIVE_SECONDS = 5000; + final Integer SESSION_TOKEN_MISMATCH_WAIT_TIME_ONE_SECOND = 1000; + + final Integer SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_DEFAULT = null; + final Integer SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_FIVE_MS = 5; + final Integer SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_ONE_SECOND = 1000; + + final Integer SESSION_TOKEN_MISMATCH_MAX_BACKOFF_DEFAULT = null; + final Integer SESSION_TOKEN_MISMATCH_MAX_BACKOFF_FIVE_HUNDRED_MS = 500; + final Integer SESSION_TOKEN_MISMATCH_MAX_BACKOFF_FIVE_SECONDS = 5000; + + return new Object[][] { + // CONFIG description + // new Object[] { + // TestId - name identifying the test case + // End-to-end timeout + // Region switch hint (404/1002 prefer local or remote retries) + // optional documentId used for reads (instead of the just created doc id) - this can be used to trigger 404/0 + // Failure injection callback + // Status code/sub status code validation callback + // maxExpectedRetryCount + // }, + + // This test injects 404/1002 across all regions for the read operation after the initial creation + // The region switch hint for 404/1002 is remote - meaning no local retries are happening + // It is expected to fail with a 404/1002 - the validation will make sure that cross regional + // execution via availability strategy was happening (but also failed) + new Object[] { + "404-1002_AllRegions_RemotePreferred_DefaultLocalRetryCountOf1", + Duration.ofSeconds(60), + CosmosRegionSwitchHint.REMOTE_REGION_PREFERRED, + sameDocumentIdJustCreated, + injectReadSessionNotAvailableIntoAllRegions, + validateStatusCodeIsReadSessionNotAvailableError, + (Consumer)(requestCount) -> assertThat(requestCount).isLessThanOrEqualTo( + Math.max(1, MAX_LOCAL_RETRY_COUNT_TWO) * (1 + (4 * writeableRegions.size())) + ), + 
MAX_LOCAL_RETRY_COUNT_DEFAULT, // DEFAULT is 1 + SESSION_TOKEN_MISMATCH_WAIT_TIME_DEFAULT, // DEFAULT is 5 seconds, + SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_DEFAULT, // DEFAULT is 5 ms + SESSION_TOKEN_MISMATCH_MAX_BACKOFF_DEFAULT, // DEFAULT is 500 ms + }, + + new Object[] { + "404-1002_AllRegions_RemotePreferred_LocalRetryCount0", + Duration.ofSeconds(60), + CosmosRegionSwitchHint.REMOTE_REGION_PREFERRED, + sameDocumentIdJustCreated, + injectReadSessionNotAvailableIntoAllRegions, + validateStatusCodeIsReadSessionNotAvailableError, + (Consumer)(requestCount) -> assertThat(requestCount).isLessThanOrEqualTo( + // even though maxRetryCount is being set to 0, but internally MIN_MAX_RETRIES_IN_LOCAL_REGION_WHEN_REMOTE_REGION_PREFERRED will be used + Math.max(1, MAX_LOCAL_RETRY_COUNT_TWO) * (1 + (4 * writeableRegions.size())) + ), + MAX_LOCAL_RETRY_COUNT_ZERO, + SESSION_TOKEN_MISMATCH_WAIT_TIME_DEFAULT, // DEFAULT is 5 seconds + SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_DEFAULT, // DEFAULT is 5 ms + SESSION_TOKEN_MISMATCH_MAX_BACKOFF_DEFAULT, // DEFAULT is 500 ms + }, + + new Object[] { + "404-1002_AllRegions_RemotePreferred_LocalRetryCount3", + Duration.ofSeconds(60), + CosmosRegionSwitchHint.REMOTE_REGION_PREFERRED, + sameDocumentIdJustCreated, + injectReadSessionNotAvailableIntoAllRegions, + validateStatusCodeIsReadSessionNotAvailableError, + (Consumer)(requestCount) -> assertThat(requestCount).isLessThanOrEqualTo( + Math.max(1, MAX_LOCAL_RETRY_COUNT_FOUR) * (1 + (4 * writeableRegions.size())) + ), + MAX_LOCAL_RETRY_COUNT_THREE, + SESSION_TOKEN_MISMATCH_WAIT_TIME_DEFAULT, // DEFAULT is 5 seconds + SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_DEFAULT, // DEFAULT is 5 ms + SESSION_TOKEN_MISMATCH_MAX_BACKOFF_DEFAULT, // DEFAULT is 500 ms + }, + + new Object[] { + "404-1002_AllRegions_LocalPreferred_DefaultsSessionTokenMismatch", + Duration.ofSeconds(60), + CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + sameDocumentIdJustCreated, + injectReadSessionNotAvailableIntoAllRegions, + 
validateStatusCodeIsReadSessionNotAvailableError, + (Consumer)(requestCount) -> assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForLocalRegionPreferred( + SESSION_TOKEN_MISMATCH_WAIT_TIME_FIVE_SECONDS, + SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_FIVE_MS, + SESSION_TOKEN_MISMATCH_MAX_BACKOFF_FIVE_HUNDRED_MS + ) * Math.max(1, MAX_LOCAL_RETRY_COUNT_ONE) * (1 + (4 * writeableRegions.size())) + ), + MAX_LOCAL_RETRY_COUNT_DEFAULT, // DEFAULT is 1 + SESSION_TOKEN_MISMATCH_WAIT_TIME_DEFAULT, // DEFAULT is 5 seconds + SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_DEFAULT, // DEFAULT is 5 ms + SESSION_TOKEN_MISMATCH_MAX_BACKOFF_DEFAULT, // DEFAULT is 500 ms + }, + + new Object[] { + "404-1002_AllRegions_LocalPreferred_HighBackoff", + Duration.ofSeconds(60), + CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + sameDocumentIdJustCreated, + injectReadSessionNotAvailableIntoAllRegions, + validateStatusCodeIsReadSessionNotAvailableError, + (Consumer)(requestCount) -> assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForLocalRegionPreferred( + SESSION_TOKEN_MISMATCH_WAIT_TIME_ONE_SECOND, + SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_ONE_SECOND, + SESSION_TOKEN_MISMATCH_MAX_BACKOFF_FIVE_SECONDS + ) * (1 + (4 * writeableRegions.size())) + ), + MAX_LOCAL_RETRY_COUNT_DEFAULT, // DEFAULT is 1 + SESSION_TOKEN_MISMATCH_WAIT_TIME_ONE_SECOND, // DEFAULT is 5 seconds + SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_ONE_SECOND, // DEFAULT is 5 ms + SESSION_TOKEN_MISMATCH_MAX_BACKOFF_FIVE_SECONDS, // DEFAULT is 500 ms + }, + + new Object[] { + "404-1002_AllRegions_LocalPreferred_Defaults_LocalRetryCount3", + Duration.ofSeconds(60), + CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + sameDocumentIdJustCreated, + injectReadSessionNotAvailableIntoAllRegions, + validateStatusCodeIsReadSessionNotAvailableError, + (Consumer)(requestCount) -> assertThat(requestCount).isLessThanOrEqualTo( + (expectedMaxNumberOfRetriesForLocalRegionPreferred( + 
SESSION_TOKEN_MISMATCH_WAIT_TIME_FIVE_SECONDS, + SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_FIVE_MS, + SESSION_TOKEN_MISMATCH_MAX_BACKOFF_FIVE_HUNDRED_MS + ) + 1) * (1 + (4 * writeableRegions.size())) + ), + MAX_LOCAL_RETRY_COUNT_THREE, // DEFAULT is 1 + SESSION_TOKEN_MISMATCH_WAIT_TIME_DEFAULT, // DEFAULT is 5 seconds + SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_DEFAULT, // DEFAULT is 5 ms + SESSION_TOKEN_MISMATCH_MAX_BACKOFF_DEFAULT, // DEFAULT is 500 ms + }, + }; + } + + @DataProvider(name = "readMaxRetryCount_retryWith") + public Object[][] testConfigs_readMaxRetryCount_retryWith() { + final int DEFAULT_WAIT_TIME_IN_MS = 30 * 1000; + final int DEFAULT_MAXIMUM_BACKOFF_TIME_IN_MS = 1000; + final int DEFAULT_INITIAL_BACKOFF_TIME_MS = 10; + final int DEFAULT_BACK_OFF_MULTIPLIER = 2; + + return new Object[][] { + // CONFIG description + // new Object[] { + // TestId - name identifying the test case + // End-to-end timeout, + // OperationType, + // FaultInjectionOperationType, + // optional documentId used for reads (instead of the just created doc id) - this can be used to trigger 404/0 + // Failure injection callback + // Status code/sub status code validation callback + // maxExpectedRetryCount + // }, + + // This test injects 449/0 across all regions for the read operation after the initial creation + // It is expected to fail with a 449/0 + // There is no cross region retry for 449/0 + new Object[] { + "449-0_AllRegions_Read", + Duration.ofSeconds(60), + OperationType.Read, + FaultInjectionOperationType.READ_ITEM, + sameDocumentIdJustCreated, + injectRetryWithErrorIntoAllRegions, + validateStatusCodeIsRetryWith, + (Consumer)(requestCount) -> assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForRetryWith( + DEFAULT_WAIT_TIME_IN_MS, + DEFAULT_MAXIMUM_BACKOFF_TIME_IN_MS, + DEFAULT_INITIAL_BACKOFF_TIME_MS, + DEFAULT_BACK_OFF_MULTIPLIER + ) + ), + }, + new Object[] { + "449-0_AllRegions_Create", + Duration.ofSeconds(60), + OperationType.Create, + 
FaultInjectionOperationType.CREATE_ITEM, + sameDocumentIdJustCreated, + injectRetryWithErrorIntoAllRegions, + validateStatusCodeIsRetryWith, + (Consumer)(requestCount) -> assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForRetryWith( + DEFAULT_WAIT_TIME_IN_MS, + DEFAULT_MAXIMUM_BACKOFF_TIME_IN_MS, + DEFAULT_INITIAL_BACKOFF_TIME_MS, + DEFAULT_BACK_OFF_MULTIPLIER + ) + ), + } + }; + } + + @DataProvider(name = "readMaxRetryCount_serverGone") + public Object[][] testConfigs_readMaxRetryCount_serverGone() { + final int DEFAULT_WAIT_TIME_IN_MS = 30 * 1000; + final int DEFAULT_MAXIMUM_BACKOFF_TIME_IN_MS = 15 * 1000; + final int DEFAULT_INITIAL_BACKOFF_TIME_IN_MS = 1000; + final int DEFAULT_BACK_OFF_MULTIPLIER = 2; + + return new Object[][] { + // CONFIG description + // new Object[] { + // TestId - name identifying the test case + // End-to-end timeout, + // OperationType, + // FaultInjectionOperationType, + // Flag to indicate whether IdempotentWriteRetries are enabled + // optional documentId used for reads (instead of the just created doc id) - this can be used to trigger 404/0 + // Failure injection callback + // Status code/sub status code validation callback + // maxExpectedRetryCount + // }, + + // This test injects server generated 410/0 across all regions for the read operation after the initial creation + // It is expected to fail with a 503/21005 + new Object[] { + "410-0_AllRegions_Read", + Duration.ofSeconds(60), + OperationType.Read, + FaultInjectionOperationType.READ_ITEM, + notSpecifiedWhetherIdempotentWriteRetriesAreEnabled, + sameDocumentIdJustCreated, + injectServerGoneErrorIntoAllRegions, + validateStatusCodeIsServerGoneGenerated503ForRead, // SDK will wrap into 503 exceptions after exhausting all retries + (TriConsumer)(requestCount, consistencyLevel, operationType) -> + assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForGone( + DEFAULT_WAIT_TIME_IN_MS, + DEFAULT_MAXIMUM_BACKOFF_TIME_IN_MS, + 
DEFAULT_INITIAL_BACKOFF_TIME_IN_MS, + DEFAULT_BACK_OFF_MULTIPLIER, + consistencyLevel, + operationType + ) * (1 + this.writeableRegions.size()) + ) + }, + new Object[] { + "410-0_AllRegions_Create", + Duration.ofSeconds(60), + OperationType.Create, + FaultInjectionOperationType.CREATE_ITEM, + notSpecifiedWhetherIdempotentWriteRetriesAreEnabled, + sameDocumentIdJustCreated, + injectServerGoneErrorIntoAllRegions, + validateStatusCodeIsServerGoneGenerated503ForWrite, + (TriConsumer)(requestCount, consistencyLevel, operationType) -> + assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForGone( + DEFAULT_WAIT_TIME_IN_MS, + DEFAULT_MAXIMUM_BACKOFF_TIME_IN_MS, + DEFAULT_INITIAL_BACKOFF_TIME_IN_MS, + DEFAULT_BACK_OFF_MULTIPLIER, + consistencyLevel, + operationType + ) * (1 + this.writeableRegions.size()) + ) + }, + new Object[] { + "410-0_AllRegions_Create", + Duration.ofSeconds(60), + OperationType.Create, + FaultInjectionOperationType.CREATE_ITEM, + true, // IdempotentWriteRetries is enabled + sameDocumentIdJustCreated, + injectServerGoneErrorIntoAllRegions, + validateStatusCodeIsServerGoneGenerated503ForWrite, + (TriConsumer)(requestCount, consistencyLevel, operationType) -> + assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForGone( + DEFAULT_WAIT_TIME_IN_MS, + DEFAULT_MAXIMUM_BACKOFF_TIME_IN_MS, + DEFAULT_INITIAL_BACKOFF_TIME_IN_MS, + DEFAULT_BACK_OFF_MULTIPLIER, + consistencyLevel, + operationType + ) * (1 + this.writeableRegions.size()) + ) + } + }; + } + + @DataProvider(name = "readMaxRetryCount_transitTimeout") + public Object[][] testConfigs_readMaxRetryCount_transitTimeout() { + final int DEFAULT_WAIT_TIME_IN_MS = 30 * 1000; + final int DEFAULT_MAXIMUM_BACKOFF_TIME_IN_MS = 15 * 1000; + final int DEFAULT_INITIAL_BACKOFF_TIME_IN_MS = 1000; + final int DEFAULT_BACK_OFF_MULTIPLIER = 2; + + // TODO: add max networkRequestTimeout test + return new Object[][] { + // CONFIG description + // new Object[] { + // TestId - 
name identifying the test case + // End-to-end timeout, + // NetworkRequestTimeout, + // OperationType, + // FaultInjectionOperationType, + // Flag to indicate whether IdempotentWriteRetries are enabled + // optional documentId used for reads (instead of the just created doc id) - this can be used to trigger 404/0 + // Failure injection callback + // Status code/sub status code validation callback + // maxExpectedRetryCount + // }, + + // This test injects transient timeout across all regions for the read operation after the initial creation + // For read, it is expected to fail with 503/20001 + // For write with Idempotent retries being enabled, it is expected to fail with 503/20001 + // For write with idempotent retries being disabled, it is expected to fail with 408 + new Object[] { + "410-20001_AllRegions_Read", + Duration.ofSeconds(60), + minNetworkRequestTimeoutDuration, + OperationType.Read, + FaultInjectionOperationType.READ_ITEM, + notSpecifiedWhetherIdempotentWriteRetriesAreEnabled, + sameDocumentIdJustCreated, + injectTransitTimeoutIntoAllRegions.apply(minNetworkRequestTimeoutDuration), + validateStatusCodeIsServerGoneGenerated503ForRead, // SDK will wrap into 503 exceptions after exhausting all retries + (TriConsumer)(requestCount, consistencyLevel, operationType) -> + assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForTransientTimeout( + DEFAULT_WAIT_TIME_IN_MS, + DEFAULT_MAXIMUM_BACKOFF_TIME_IN_MS, + DEFAULT_INITIAL_BACKOFF_TIME_IN_MS, + DEFAULT_BACK_OFF_MULTIPLIER, + consistencyLevel, + operationType, + minNetworkRequestTimeoutDuration, + notSpecifiedWhetherIdempotentWriteRetriesAreEnabled + ) * (1 + this.writeableRegions.size()) + ) + }, + new Object[] { + "410-20001_AllRegions_Read", + Duration.ofSeconds(60), + defaultNetworkRequestTimeoutDuration, + OperationType.Read, + FaultInjectionOperationType.READ_ITEM, + notSpecifiedWhetherIdempotentWriteRetriesAreEnabled, + sameDocumentIdJustCreated, + 
injectTransitTimeoutIntoAllRegions.apply(defaultNetworkRequestTimeoutDuration), + validateStatusCodeIsServerGoneGenerated503ForRead, // SDK will wrap into 503 exceptions after exhausting all retries + (TriConsumer)(requestCount, consistencyLevel, operationType) -> + assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForTransientTimeout( + DEFAULT_WAIT_TIME_IN_MS, + DEFAULT_MAXIMUM_BACKOFF_TIME_IN_MS, + DEFAULT_INITIAL_BACKOFF_TIME_IN_MS, + DEFAULT_BACK_OFF_MULTIPLIER, + consistencyLevel, + operationType, + defaultNetworkRequestTimeoutDuration, + notSpecifiedWhetherIdempotentWriteRetriesAreEnabled + ) * (1 + this.writeableRegions.size()) + ) + }, + new Object[] { + "410-20001_AllRegions_Create", + Duration.ofSeconds(60), + minNetworkRequestTimeoutDuration, + OperationType.Create, + FaultInjectionOperationType.CREATE_ITEM, + notSpecifiedWhetherIdempotentWriteRetriesAreEnabled, + sameDocumentIdJustCreated, + injectTransitTimeoutIntoAllRegions.apply(minNetworkRequestTimeoutDuration), + validateStatusCodeIsTimeout, // when idempotent write is disabled, SDK will not retry for write operation, 408 will be bubbled up + (TriConsumer)(requestCount, consistencyLevel, operationType) -> + assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForTransientTimeout( + DEFAULT_WAIT_TIME_IN_MS, + DEFAULT_MAXIMUM_BACKOFF_TIME_IN_MS, + DEFAULT_INITIAL_BACKOFF_TIME_IN_MS, + DEFAULT_BACK_OFF_MULTIPLIER, + consistencyLevel, + operationType, + minNetworkRequestTimeoutDuration, + notSpecifiedWhetherIdempotentWriteRetriesAreEnabled + ) + ) + }, + new Object[] { + "410-20001_AllRegions_Create", + Duration.ofSeconds(60), + minNetworkRequestTimeoutDuration, + OperationType.Create, + FaultInjectionOperationType.CREATE_ITEM, + true, // IdempotentWriteRetries is enabled + sameDocumentIdJustCreated, + injectTransitTimeoutIntoAllRegions.apply(minNetworkRequestTimeoutDuration), + validateStatusCodeIsTransitTimeoutGenerated503ForWrite, // when idempotent 
write is enabled, write will retry in reach region and bubble as 503/20001 + (TriConsumer)(requestCount, consistencyLevel, operationType) -> + assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForTransientTimeout( + DEFAULT_WAIT_TIME_IN_MS, + DEFAULT_MAXIMUM_BACKOFF_TIME_IN_MS, + DEFAULT_INITIAL_BACKOFF_TIME_IN_MS, + DEFAULT_BACK_OFF_MULTIPLIER, + consistencyLevel, + operationType, + minNetworkRequestTimeoutDuration, + true + ) * (1 + this.writeableRegions.size()) + ) + } + }; + } + + @DataProvider(name = "readMaxRetryCount_serverTimeout") + public Object[][] testConfigs_readMaxRetryCount_serverTimeout() { + final int DEFAULT_WAIT_TIME_IN_MS = 30 * 1000; + final int DEFAULT_MAXIMUM_BACKOFF_TIME_IN_MS = 15 * 1000; + final int DEFAULT_INITIAL_BACKOFF_TIME_IN_MS = 1000; + final int DEFAULT_BACK_OFF_MULTIPLIER = 2; + + return new Object[][] { + // CONFIG description + // new Object[] { + // TestId - name identifying the test case + // End-to-end timeout, + // OperationType, + // FaultInjectionOperationType, + // Flag to indicate whether IdempotentWriteRetries are enabled + // optional documentId used for reads (instead of the just created doc id) - this can be used to trigger 404/0 + // Failure injection callback + // Status code/sub status code validation callback + // maxExpectedRetryCount + // }, + + // This test injects transient timeout across all regions for the read operation after the initial creation + // For read, it is expected to fail with 503/20001 + // For write with Idempotent retries being enabled, it is expected to fail with 503/20001 + // For write with idempotent retries being disabled, it is expected to fail with 408 + new Object[] { + "408-0_AllRegions_Read", + Duration.ofSeconds(60), + OperationType.Read, + FaultInjectionOperationType.READ_ITEM, + notSpecifiedWhetherIdempotentWriteRetriesAreEnabled, + sameDocumentIdJustCreated, + injectServerTimeoutErrorIntoAllRegions, + validateStatusCodeIsServerTimeoutGenerated503ForRead, 
// SDK will translate 408 into 410, and then follow 410 retry rules + (TriConsumer)(requestCount, consistencyLevel, operationType) -> + assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForTransientTimeout( + DEFAULT_WAIT_TIME_IN_MS, + DEFAULT_MAXIMUM_BACKOFF_TIME_IN_MS, + DEFAULT_INITIAL_BACKOFF_TIME_IN_MS, + DEFAULT_BACK_OFF_MULTIPLIER, + consistencyLevel, + operationType, + Duration.ofSeconds(5), + notSpecifiedWhetherIdempotentWriteRetriesAreEnabled + ) * (1 + this.writeableRegions.size()) + ) + }, + new Object[] { + "408-0_AllRegions_Create", + Duration.ofSeconds(60), + OperationType.Create, + FaultInjectionOperationType.CREATE_ITEM, + notSpecifiedWhetherIdempotentWriteRetriesAreEnabled, + sameDocumentIdJustCreated, + injectServerTimeoutErrorIntoAllRegions, + validateStatusCodeIsServerTimeoutGenerated410ForWrite, // when idempotent write is disabled, SDK will not retry for write operation, 410 will be bubbled up + (TriConsumer)(requestCount, consistencyLevel, operationType) -> + assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForTransientTimeout( + DEFAULT_WAIT_TIME_IN_MS, + DEFAULT_MAXIMUM_BACKOFF_TIME_IN_MS, + DEFAULT_INITIAL_BACKOFF_TIME_IN_MS, + DEFAULT_BACK_OFF_MULTIPLIER, + consistencyLevel, + operationType, + Duration.ofSeconds(5), + notSpecifiedWhetherIdempotentWriteRetriesAreEnabled + ) + ) + }, + new Object[] { + "408-0_AllRegions_Create", + Duration.ofSeconds(60), + OperationType.Create, + FaultInjectionOperationType.CREATE_ITEM, + true, // IdempotentWriteRetries is enabled + sameDocumentIdJustCreated, + injectServerTimeoutErrorIntoAllRegions, + validateStatusCodeIsServerTimeoutGenerated503ForWrite, // when idempotent write is enabled, write will retry in each region and bubble as 503/21010 + (TriConsumer)(requestCount, consistencyLevel, operationType) -> + assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForTransientTimeout( + DEFAULT_WAIT_TIME_IN_MS, +
DEFAULT_MAXIMUM_BACKOFF_TIME_IN_MS, + DEFAULT_INITIAL_BACKOFF_TIME_IN_MS, + DEFAULT_BACK_OFF_MULTIPLIER, + consistencyLevel, + operationType, + Duration.ofSeconds(5), + true + ) * (1 + this.writeableRegions.size()) + ) + } + }; + } + + @DataProvider(name = "readMaxRetryCount_serverServiceUnavailable") + public Object[][] testConfigs_readMaxRetryCount_serverServiceUnavailable() { + return new Object[][] { + // CONFIG description + // new Object[] { + // TestId - name identifying the test case + // End-to-end timeout, + // OperationType, + // FaultInjectionOperationType, + // Flag to indicate whether IdempotentWriteRetries are enabled + // optional documentId used for reads (instead of the just created doc id) - this can be used to trigger 404/0 + // Failure injection callback + // Status code/sub status code validation callback + // maxExpectedRetryCount + // }, + + // This test injects server generated 503/0 across all regions for the read operation after the initial creation + // It is expected to fail with a 503/0 + new Object[] { + "503-0_AllRegions_Read", + Duration.ofSeconds(60), + OperationType.Read, + FaultInjectionOperationType.READ_ITEM, + notSpecifiedWhetherIdempotentWriteRetriesAreEnabled, + sameDocumentIdJustCreated, + injectServiceUnavailableIntoAllRegions, + validateStatusCodeIsServiceUnavailable, + (TriConsumer)(requestCount, consistencyLevel, operationType) -> + assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForServerServiceUnavailable( + consistencyLevel, + operationType) * (1 + this.writeableRegions.size())) + }, + new Object[] { + "503-0_AllRegions_Create", + Duration.ofSeconds(60), + OperationType.Create, + FaultInjectionOperationType.CREATE_ITEM, + notSpecifiedWhetherIdempotentWriteRetriesAreEnabled, + sameDocumentIdJustCreated, + injectServiceUnavailableIntoAllRegions, + validateStatusCodeIsServiceUnavailable, + (TriConsumer)(requestCount, consistencyLevel, operationType) -> + 
assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForServerServiceUnavailable( + consistencyLevel, + operationType) * (1 + this.writeableRegions.size())) + }, + new Object[] { + "503-0_AllRegions_Create", + Duration.ofSeconds(60), + OperationType.Create, + FaultInjectionOperationType.CREATE_ITEM, + true, // IdempotentWriteRetries is enabled + sameDocumentIdJustCreated, + injectServiceUnavailableIntoAllRegions, + validateStatusCodeIsServiceUnavailable, + (TriConsumer)(requestCount, consistencyLevel, operationType) -> + assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForServerServiceUnavailable( + consistencyLevel, + operationType) * (1 + this.writeableRegions.size())) + } + }; + } + + @DataProvider(name = "readMaxRetryCount_serverRequestRateTooLarge") + public Object[][] testConfigs_readMaxRetryCount_serverRequestRateTooLarge() { + final ThrottlingRetryOptions customizedThrottlingRetryOptions = + new ThrottlingRetryOptions().setMaxRetryAttemptsOnThrottledRequests(2).setMaxRetryWaitTime(Duration.ofSeconds(2)); + + return new Object[][] { + // CONFIG description + // new Object[] { + // TestId - name identifying the test case + // End-to-end timeout, + // OperationType, + // FaultInjectionOperationType, + // Flag to indicate whether IdempotentWriteRetries are enabled + // optional documentId used for reads (instead of the just created doc id) - this can be used to trigger 404/0 + // Failure injection callback + // Status code/sub status code validation callback + // maxExpectedRetryCount + // }, + + // This test injects server generated 429/3200 across all regions for the read operation after the initial creation + // It is expected to fail with a 429/3200 + new Object[] { + "429-3200_AllRegions_Read", + Duration.ofSeconds(60), + defaultThrottlingRetryOptions, + OperationType.Read, + FaultInjectionOperationType.READ_ITEM, + notSpecifiedWhetherIdempotentWriteRetriesAreEnabled, + sameDocumentIdJustCreated, +
injectRequestRateTooLargeIntoAllRegions, + validateStatusCodeIsRequestRateTooLarge, + (TriConsumer)(requestCount, consistencyLevel, operationType) -> + assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForServerRequestRateTooLarge( + consistencyLevel, + operationType, + defaultThrottlingRetryOptions)) + }, + new Object[] { + "429-3200_AllRegions_Read", + Duration.ofSeconds(60), + customizedThrottlingRetryOptions, + OperationType.Read, + FaultInjectionOperationType.READ_ITEM, + notSpecifiedWhetherIdempotentWriteRetriesAreEnabled, + sameDocumentIdJustCreated, + injectRequestRateTooLargeIntoAllRegions, + validateStatusCodeIsRequestRateTooLarge, + (TriConsumer)(requestCount, consistencyLevel, operationType) -> + assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForServerRequestRateTooLarge( + consistencyLevel, + operationType, + customizedThrottlingRetryOptions)) + }, + new Object[] { + "429-3200_AllRegions_Create", + Duration.ofSeconds(60), + defaultThrottlingRetryOptions, + OperationType.Create, + FaultInjectionOperationType.CREATE_ITEM, + notSpecifiedWhetherIdempotentWriteRetriesAreEnabled, + sameDocumentIdJustCreated, + injectRequestRateTooLargeIntoAllRegions, + validateStatusCodeIsRequestRateTooLarge, + (TriConsumer)(requestCount, consistencyLevel, operationType) -> + assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForServerRequestRateTooLarge( + consistencyLevel, + operationType, + defaultThrottlingRetryOptions)) + }, + new Object[] { + "429-3200_AllRegions_Create", + Duration.ofSeconds(60), + defaultThrottlingRetryOptions, + OperationType.Create, + FaultInjectionOperationType.CREATE_ITEM, + true, // IdempotentWriteRetries is enabled + sameDocumentIdJustCreated, + injectRequestRateTooLargeIntoAllRegions, + validateStatusCodeIsRequestRateTooLarge, + (TriConsumer)(requestCount, consistencyLevel, operationType) -> + assertThat(requestCount).isLessThanOrEqualTo( + 
expectedMaxNumberOfRetriesForServerRequestRateTooLarge( + consistencyLevel, + operationType, + defaultThrottlingRetryOptions)) + } + }; + } + + @DataProvider(name = "readMaxRetryCount_serverInternalServerError") + public Object[][] testConfigs_readMaxRetryCount_serverInternalServerError() { + return new Object[][] { + // CONFIG description + // new Object[] { + // TestId - name identifying the test case + // End-to-end timeout, + // OperationType, + // FaultInjectionOperationType, + // Flag to indicate whether IdempotentWriteRetries are enabled + // optional documentId used for reads (instead of the just created doc id) - this can be used to trigger 404/0 + // Failure injection callback + // Status code/sub status code validation callback + // maxExpectedRetryCount + // }, + + // This test injects server generated 500/0 across all regions for the read operation after the initial creation + // It is expected to fail with a 500/0 + new Object[] { + "500-0_AllRegions_Read", + Duration.ofSeconds(60), + OperationType.Read, + FaultInjectionOperationType.READ_ITEM, + notSpecifiedWhetherIdempotentWriteRetriesAreEnabled, + sameDocumentIdJustCreated, + injectInternalServerErrorIntoAllRegions, + validateStatusCodeIsInternalServerError, + (TriConsumer)(requestCount, consistencyLevel, operationType) -> + assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForServerInternalServerError(consistencyLevel, operationType)) + }, + new Object[] { + "500-0_AllRegions_Create", + Duration.ofSeconds(60), + OperationType.Create, + FaultInjectionOperationType.CREATE_ITEM, + notSpecifiedWhetherIdempotentWriteRetriesAreEnabled, + sameDocumentIdJustCreated, + injectInternalServerErrorIntoAllRegions, + validateStatusCodeIsInternalServerError, + (TriConsumer)(requestCount, consistencyLevel, operationType) -> + assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForServerInternalServerError(consistencyLevel, operationType)) + }, + new Object[] { + 
"500-0_AllRegions_Create", + Duration.ofSeconds(60), + OperationType.Create, + FaultInjectionOperationType.CREATE_ITEM, + true, // IdempotentWriteRetries is enabled + sameDocumentIdJustCreated, + injectInternalServerErrorIntoAllRegions, + validateStatusCodeIsInternalServerError, + (TriConsumer)(requestCount, consistencyLevel, operationType) -> + assertThat(requestCount).isLessThanOrEqualTo( + expectedMaxNumberOfRetriesForServerInternalServerError(consistencyLevel, operationType)) + } + }; + } + + @Test(groups = {"multi-master"}, dataProvider = "readMaxRetryCount_readSessionNotAvailable") + public void readMaxRetryCount_readSessionNotAvailable( + String testCaseId, + Duration endToEndTimeout, + CosmosRegionSwitchHint regionSwitchHint, + String readItemDocumentIdOverride, + BiConsumer faultInjectionCallback, + BiConsumer validateStatusCode, + Consumer maxExpectedRequestCountValidation, + Integer maxRetriesInLocalRegion, + Integer sessionTokenRetriesWaitTime, + Integer sessionTokenRetriesInitialBackoff, + Integer sessionTokenRetriesMaxBackoff) { + + final int TWO_REGIONS = 2; + + if (maxRetriesInLocalRegion != null) { + System.setProperty( + Configs.MAX_RETRIES_IN_LOCAL_REGION_WHEN_REMOTE_REGION_PREFERRED, + String.valueOf(maxRetriesInLocalRegion)); + } else { + System.clearProperty(Configs.MAX_RETRIES_IN_LOCAL_REGION_WHEN_REMOTE_REGION_PREFERRED); + } + + if (sessionTokenRetriesWaitTime != null) { + System.setProperty( + Configs.DEFAULT_SESSION_TOKEN_MISMATCH_WAIT_TIME_IN_MILLISECONDS_NAME, + String.valueOf(sessionTokenRetriesWaitTime)); + } else { + System.clearProperty(Configs.DEFAULT_SESSION_TOKEN_MISMATCH_WAIT_TIME_IN_MILLISECONDS_NAME); + } + + if (sessionTokenRetriesInitialBackoff != null) { + System.setProperty( + Configs.DEFAULT_SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_TIME_IN_MILLISECONDS_NAME, + String.valueOf(sessionTokenRetriesInitialBackoff)); + } else { + 
System.clearProperty(Configs.DEFAULT_SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_TIME_IN_MILLISECONDS_NAME); + } + + if (sessionTokenRetriesMaxBackoff != null) { + System.setProperty( + Configs.DEFAULT_SESSION_TOKEN_MISMATCH_MAXIMUM_BACKOFF_TIME_IN_MILLISECONDS_NAME, + String.valueOf(sessionTokenRetriesMaxBackoff)); + } else { + System.clearProperty(Configs.DEFAULT_SESSION_TOKEN_MISMATCH_MAXIMUM_BACKOFF_TIME_IN_MILLISECONDS_NAME); + } + + Function readItemCallback = + getRequestCallBack(OperationType.Read, readItemDocumentIdOverride); + + BiConsumer validateCtxRegions = + (ctx, expectedNumberOfRegionsContacted) -> { + assertThat(ctx).isNotNull(); + if (ctx != null) { + assertThat(ctx.getContactedRegionNames().size()).isGreaterThanOrEqualTo(expectedNumberOfRegionsContacted); + } + }; + + Consumer logCtx = + (ctx) -> { + assertThat(ctx).isNotNull(); + + logger.info( + "MAX_RETRIES_IN_LOCAL_REGION_WHEN_REMOTE_REGION_PREFERRED: {}", + Configs.getMaxRetriesInLocalRegionWhenRemoteRegionPreferred()); + logger.info( + "DIAGNOSTICS CONTEXT: {} Json: {}", + ctx.toString(), + ctx.toJson()); + }; + + Consumer validateCtxTwoRegions = + (ctx) -> validateCtxRegions.accept(ctx, TWO_REGIONS); + + Consumer ctxValidation = ctx -> { + assertThat(ctx.getDiagnostics()).isNotNull(); + assertThat(ctx.getDiagnostics().size()).isEqualTo(1); + CosmosDiagnostics diagnostics = ctx.getDiagnostics().iterator().next(); + assertThat(diagnostics.getClientSideRequestStatistics()).isNotNull(); + assertThat(diagnostics.getClientSideRequestStatistics().size()).isEqualTo(1); + + ClientSideRequestStatistics clientStats = diagnostics.getClientSideRequestStatistics().iterator().next(); + assertThat(clientStats.getResponseStatisticsList()).isNotNull(); + int actualRequestCount = clientStats.getResponseStatisticsList().size(); + + if (maxExpectedRequestCountValidation != null) { + logger.info( + "ACTUAL REQUEST COUNT: {}", + actualRequestCount); + + maxExpectedRequestCountValidation.accept(actualRequestCount); + 
} + }; + + try { + execute( + testCaseId, + endToEndTimeout, + noAvailabilityStrategy, + regionSwitchHint, + notSpecifiedWhetherIdempotentWriteRetriesAreEnabled, + defaultNetworkRequestTimeoutDuration, + ArrayUtils.toArray(FaultInjectionOperationType.READ_ITEM), + readItemCallback, + faultInjectionCallback, + validateStatusCode, + 1, + ArrayUtils.toArray(logCtx, validateCtxTwoRegions, ctxValidation), + null, + null, + 0, + 0, + false, + ConnectionMode.DIRECT, + defaultThrottlingRetryOptions); + } finally { + System.clearProperty(Configs.MAX_RETRIES_IN_LOCAL_REGION_WHEN_REMOTE_REGION_PREFERRED); + } + } + + @Test(groups = {"multi-master"}, dataProvider = "readMaxRetryCount_retryWith") + public void readMaxRetryCount_retryWith( + String testCaseId, + Duration endToEndTimeout, + OperationType operationType, + FaultInjectionOperationType faultInjectionOperationType, + String readItemDocumentIdOverride, + BiConsumer faultInjectionCallback, + BiConsumer validateStatusCode, + Consumer maxExpectedRequestCountValidation) { + + final int ONE_REGION = 1; // there is no cross region retry for 449 + Function readItemCallback = + this.getRequestCallBack(operationType, readItemDocumentIdOverride); + + BiConsumer validateCtxRegions = + (ctx, expectedNumberOfRegionsContacted) -> { + assertThat(ctx).isNotNull(); + if (ctx != null) { + assertThat(ctx.getContactedRegionNames().size()).isGreaterThanOrEqualTo(expectedNumberOfRegionsContacted); + } + }; + + Consumer logCtx = + (ctx) -> { + assertThat(ctx).isNotNull(); + logger.info( + "DIAGNOSTICS CONTEXT: {} Json: {}", + ctx.toString(), + ctx.toJson()); + }; + + Consumer validateCtxOneRegions = + (ctx) -> validateCtxRegions.accept(ctx, ONE_REGION); + + Consumer ctxValidation = ctx -> { + assertThat(ctx.getDiagnostics()).isNotNull(); + assertThat(ctx.getDiagnostics().size()).isEqualTo(1); + CosmosDiagnostics diagnostics = ctx.getDiagnostics().iterator().next(); + assertThat(diagnostics.getClientSideRequestStatistics()).isNotNull(); + 
assertThat(diagnostics.getClientSideRequestStatistics().size()).isEqualTo(1); + + ClientSideRequestStatistics clientStats = diagnostics.getClientSideRequestStatistics().iterator().next(); + assertThat(clientStats.getResponseStatisticsList()).isNotNull(); + int actualRequestCount = clientStats.getResponseStatisticsList().size(); + + if (maxExpectedRequestCountValidation != null) { + logger.info( + "ACTUAL REQUEST COUNT: {}", + actualRequestCount); + + maxExpectedRequestCountValidation.accept(actualRequestCount); + } + }; + + execute( + testCaseId, + endToEndTimeout, + noAvailabilityStrategy, + CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + notSpecifiedWhetherIdempotentWriteRetriesAreEnabled, + defaultNetworkRequestTimeoutDuration, + ArrayUtils.toArray(faultInjectionOperationType), + readItemCallback, + faultInjectionCallback, + validateStatusCode, + 1, + ArrayUtils.toArray(logCtx, validateCtxOneRegions, ctxValidation), + null, + null, + 0, + 0, + false, + ConnectionMode.DIRECT, + defaultThrottlingRetryOptions); + } + + @Test(groups = {"multi-master"}, dataProvider = "readMaxRetryCount_serverGone") + public void readMaxRetryCount_serverGone( + String testCaseId, + Duration endToEndTimeout, + OperationType operationType, + FaultInjectionOperationType faultInjectionOperationType, + Boolean isIdempotentWriteRetriesEnabled, + String readItemDocumentIdOverride, + BiConsumer faultInjectionCallback, + BiConsumer validateStatusCode, + TriConsumer maxExpectedRequestCountValidation) { + + final int ONE_REGION = 1; + final int TWO_REGIONS = 2; + Function readItemCallback = + this.getRequestCallBack(operationType, readItemDocumentIdOverride); + + BiConsumer validateCtxRegions = + (ctx, expectedNumberOfRegionsContacted) -> { + assertThat(ctx).isNotNull(); + if (ctx != null) { + assertThat(ctx.getContactedRegionNames().size()).isGreaterThanOrEqualTo(expectedNumberOfRegionsContacted); + } + }; + + Consumer logCtx = + (ctx) -> { + assertThat(ctx).isNotNull(); + logger.info( + 
"DIAGNOSTICS CONTEXT: {} Json: {}", + ctx.toString(), + ctx.toJson()); + }; + + Consumer validateCtxOneRegions = + (ctx) -> { + if (operationType.isReadOnlyOperation()) { + validateCtxRegions.accept(ctx, TWO_REGIONS); + } else if (isIdempotentWriteRetriesEnabled != null && !isIdempotentWriteRetriesEnabled) { + validateCtxRegions.accept(ctx, TWO_REGIONS); + } else { + validateCtxRegions.accept(ctx, ONE_REGION); + } + }; + + Consumer ctxValidation = ctx -> { + assertThat(ctx.getDiagnostics()).isNotNull(); + assertThat(ctx.getDiagnostics().size()).isEqualTo(1); + CosmosDiagnostics diagnostics = ctx.getDiagnostics().iterator().next(); + assertThat(diagnostics.getClientSideRequestStatistics()).isNotNull(); + assertThat(diagnostics.getClientSideRequestStatistics().size()).isEqualTo(1); + + ClientSideRequestStatistics clientStats = diagnostics.getClientSideRequestStatistics().iterator().next(); + assertThat(clientStats.getResponseStatisticsList()).isNotNull(); + int actualRequestCount = clientStats.getResponseStatisticsList().size(); + + if (maxExpectedRequestCountValidation != null) { + logger.info( + "ACTUAL REQUEST COUNT: {}", + actualRequestCount); + + // TODO: expand into other consistencies + maxExpectedRequestCountValidation.accept(actualRequestCount, ConsistencyLevel.SESSION, operationType); + } + }; + + execute( + testCaseId, + endToEndTimeout, + noAvailabilityStrategy, + CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + isIdempotentWriteRetriesEnabled, + defaultNetworkRequestTimeoutDuration, + ArrayUtils.toArray(faultInjectionOperationType), + readItemCallback, + faultInjectionCallback, + validateStatusCode, + 1, + ArrayUtils.toArray(logCtx, validateCtxOneRegions, ctxValidation), + null, + null, + 0, + 0, + false, + ConnectionMode.DIRECT, + defaultThrottlingRetryOptions); + } + + @Test(groups = {"multi-master"}, dataProvider = "readMaxRetryCount_transitTimeout") + public void readMaxRetryCount_transitTimeout( + String testCaseId, + Duration endToEndTimeout, + 
Duration networkRequestTimeout, + OperationType operationType, + FaultInjectionOperationType faultInjectionOperationType, + Boolean isIdempotentWriteRetriesEnabled, + String readItemDocumentIdOverride, + BiConsumer faultInjectionCallback, + BiConsumer validateStatusCode, + TriConsumer maxExpectedRequestCountValidation) { + + final int ONE_REGION = 1; + final int TWO_REGIONS = 2; + Function readItemCallback = + this.getRequestCallBack(operationType, readItemDocumentIdOverride); + + BiConsumer validateCtxRegions = + (ctx, expectedNumberOfRegionsContacted) -> { + assertThat(ctx).isNotNull(); + if (ctx != null) { + assertThat(ctx.getContactedRegionNames().size()).isGreaterThanOrEqualTo(expectedNumberOfRegionsContacted); + } + }; + + Consumer logCtx = + (ctx) -> { + assertThat(ctx).isNotNull(); + logger.info( + "DIAGNOSTICS CONTEXT: {} Json: {}", + ctx.toString(), + ctx.toJson()); + }; + + Consumer validateCtxOneRegions = + (ctx) -> { + if (operationType.isReadOnlyOperation()) { + validateCtxRegions.accept(ctx, TWO_REGIONS); + } else if (isIdempotentWriteRetriesEnabled != null && !isIdempotentWriteRetriesEnabled) { + validateCtxRegions.accept(ctx, TWO_REGIONS); + } else { + validateCtxRegions.accept(ctx, ONE_REGION); + } + }; + + Consumer ctxValidation = ctx -> { + assertThat(ctx.getDiagnostics()).isNotNull(); + assertThat(ctx.getDiagnostics().size()).isEqualTo(1); + CosmosDiagnostics diagnostics = ctx.getDiagnostics().iterator().next(); + assertThat(diagnostics.getClientSideRequestStatistics()).isNotNull(); + assertThat(diagnostics.getClientSideRequestStatistics().size()).isEqualTo(1); + + ClientSideRequestStatistics clientStats = diagnostics.getClientSideRequestStatistics().iterator().next(); + assertThat(clientStats.getResponseStatisticsList()).isNotNull(); + int actualRequestCount = clientStats.getResponseStatisticsList().size(); + + if (maxExpectedRequestCountValidation != null) { + logger.info( + "ACTUAL REQUEST COUNT: {}", + actualRequestCount); + + // TODO: 
expand into other consistencies + maxExpectedRequestCountValidation.accept(actualRequestCount, ConsistencyLevel.SESSION, operationType); + } + }; + + execute( + testCaseId, + endToEndTimeout, + noAvailabilityStrategy, + CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + isIdempotentWriteRetriesEnabled, + networkRequestTimeout, + ArrayUtils.toArray(faultInjectionOperationType), + readItemCallback, + faultInjectionCallback, + validateStatusCode, + 1, + ArrayUtils.toArray(logCtx, validateCtxOneRegions, ctxValidation), + null, + null, + 0, + 0, + false, + ConnectionMode.DIRECT, + defaultThrottlingRetryOptions); + } + + @Test(groups = {"multi-master"}, dataProvider = "readMaxRetryCount_serverTimeout") + public void readMaxRetryCount_serverTimeout( + String testCaseId, + Duration endToEndTimeout, + OperationType operationType, + FaultInjectionOperationType faultInjectionOperationType, + Boolean isIdempotentWriteRetriesEnabled, + String readItemDocumentIdOverride, + BiConsumer faultInjectionCallback, + BiConsumer validateStatusCode, + TriConsumer maxExpectedRequestCountValidation) { + + final int ONE_REGION = 1; + final int TWO_REGIONS = 2; + Function readItemCallback = + this.getRequestCallBack(operationType, readItemDocumentIdOverride); + + BiConsumer validateCtxRegions = + (ctx, expectedNumberOfRegionsContacted) -> { + assertThat(ctx).isNotNull(); + if (ctx != null) { + assertThat(ctx.getContactedRegionNames().size()).isGreaterThanOrEqualTo(expectedNumberOfRegionsContacted); + } + }; + + Consumer logCtx = + (ctx) -> { + assertThat(ctx).isNotNull(); + logger.info( + "DIAGNOSTICS CONTEXT: {} Json: {}", + ctx.toString(), + ctx.toJson()); + }; + + Consumer validateCtxOneRegions = + (ctx) -> { + if (operationType.isReadOnlyOperation()) { + validateCtxRegions.accept(ctx, TWO_REGIONS); + } else if (isIdempotentWriteRetriesEnabled != null && !isIdempotentWriteRetriesEnabled) { + validateCtxRegions.accept(ctx, TWO_REGIONS); + } else { + validateCtxRegions.accept(ctx, ONE_REGION); 
+ } + }; + + Consumer ctxValidation = ctx -> { + assertThat(ctx.getDiagnostics()).isNotNull(); + assertThat(ctx.getDiagnostics().size()).isEqualTo(1); + CosmosDiagnostics diagnostics = ctx.getDiagnostics().iterator().next(); + assertThat(diagnostics.getClientSideRequestStatistics()).isNotNull(); + assertThat(diagnostics.getClientSideRequestStatistics().size()).isEqualTo(1); + + ClientSideRequestStatistics clientStats = diagnostics.getClientSideRequestStatistics().iterator().next(); + assertThat(clientStats.getResponseStatisticsList()).isNotNull(); + int actualRequestCount = clientStats.getResponseStatisticsList().size(); + + if (maxExpectedRequestCountValidation != null) { + logger.info( + "ACTUAL REQUEST COUNT: {}", + actualRequestCount); + + // TODO: expand into other consistencies + // TODO: currently, fault injection does not support 408 + delay, so the error is being injected without delay + // Will add the support in fault injection, and then will uncomment the following check + // maxExpectedRequestCountValidation.accept(actualRequestCount, ConsistencyLevel.SESSION, operationType); + } + }; + + execute( + testCaseId, + endToEndTimeout, + noAvailabilityStrategy, + CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + isIdempotentWriteRetriesEnabled, + defaultNetworkRequestTimeoutDuration, + ArrayUtils.toArray(faultInjectionOperationType), + readItemCallback, + faultInjectionCallback, + validateStatusCode, + 1, + ArrayUtils.toArray(logCtx, validateCtxOneRegions, ctxValidation), + null, + null, + 0, + 0, + false, + ConnectionMode.DIRECT, + defaultThrottlingRetryOptions); + } + + @Test(groups = {"multi-master"}, dataProvider = "readMaxRetryCount_serverServiceUnavailable") + public void readMaxRetryCount_serverServiceUnavailable( + String testCaseId, + Duration endToEndTimeout, + OperationType operationType, + FaultInjectionOperationType faultInjectionOperationType, + Boolean isIdempotentWriteRetriesEnabled, + String readItemDocumentIdOverride, + BiConsumer 
faultInjectionCallback, + BiConsumer validateStatusCode, + TriConsumer maxExpectedRequestCountValidation) { + + final int ONE_REGION = 1; + final int TWO_REGIONS = 2; + Function readItemCallback = + this.getRequestCallBack(operationType, readItemDocumentIdOverride); + + BiConsumer validateCtxRegions = + (ctx, expectedNumberOfRegionsContacted) -> { + assertThat(ctx).isNotNull(); + if (ctx != null) { + assertThat(ctx.getContactedRegionNames().size()).isGreaterThanOrEqualTo(expectedNumberOfRegionsContacted); + } + }; + + Consumer logCtx = + (ctx) -> { + assertThat(ctx).isNotNull(); + logger.info( + "DIAGNOSTICS CONTEXT: {} Json: {}", + ctx.toString(), + ctx.toJson()); + }; + + Consumer validateCtxOneRegions = + (ctx) -> { + if (operationType.isReadOnlyOperation()) { + validateCtxRegions.accept(ctx, TWO_REGIONS); + } else if (isIdempotentWriteRetriesEnabled != null && !isIdempotentWriteRetriesEnabled) { + validateCtxRegions.accept(ctx, TWO_REGIONS); + } else { + validateCtxRegions.accept(ctx, ONE_REGION); + } + }; + + Consumer ctxValidation = ctx -> { + assertThat(ctx.getDiagnostics()).isNotNull(); + assertThat(ctx.getDiagnostics().size()).isEqualTo(1); + CosmosDiagnostics diagnostics = ctx.getDiagnostics().iterator().next(); + assertThat(diagnostics.getClientSideRequestStatistics()).isNotNull(); + assertThat(diagnostics.getClientSideRequestStatistics().size()).isEqualTo(1); + + ClientSideRequestStatistics clientStats = diagnostics.getClientSideRequestStatistics().iterator().next(); + assertThat(clientStats.getResponseStatisticsList()).isNotNull(); + int actualRequestCount = clientStats.getResponseStatisticsList().size(); + + if (maxExpectedRequestCountValidation != null) { + logger.info( + "ACTUAL REQUEST COUNT: {}", + actualRequestCount); + + // TODO: expand into other consistencies + maxExpectedRequestCountValidation.accept(actualRequestCount, ConsistencyLevel.SESSION, operationType); + } + }; + + execute( + testCaseId, + endToEndTimeout, + noAvailabilityStrategy, + 
CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + isIdempotentWriteRetriesEnabled, + defaultNetworkRequestTimeoutDuration, + ArrayUtils.toArray(faultInjectionOperationType), + readItemCallback, + faultInjectionCallback, + validateStatusCode, + 1, + ArrayUtils.toArray(logCtx, validateCtxOneRegions, ctxValidation), + null, + null, + 0, + 0, + false, + ConnectionMode.DIRECT, + defaultThrottlingRetryOptions); + } + + @Test(groups = {"multi-master"}, dataProvider = "readMaxRetryCount_serverInternalServerError") + public void readMaxRetryCount_serverInternalServerError( + String testCaseId, + Duration endToEndTimeout, + OperationType operationType, + FaultInjectionOperationType faultInjectionOperationType, + Boolean isIdempotentWriteRetriesEnabled, + String readItemDocumentIdOverride, + BiConsumer faultInjectionCallback, + BiConsumer validateStatusCode, + TriConsumer maxExpectedRequestCountValidation) { + + final int ONE_REGION = 1; + Function readItemCallback = + this.getRequestCallBack(operationType, readItemDocumentIdOverride); + + BiConsumer validateCtxRegions = + (ctx, expectedNumberOfRegionsContacted) -> { + assertThat(ctx).isNotNull(); + if (ctx != null) { + assertThat(ctx.getContactedRegionNames().size()).isGreaterThanOrEqualTo(expectedNumberOfRegionsContacted); + } + }; + + Consumer logCtx = + (ctx) -> { + assertThat(ctx).isNotNull(); + logger.info( + "DIAGNOSTICS CONTEXT: {} Json: {}", + ctx.toString(), + ctx.toJson()); + }; + + Consumer validateCtxOneRegions = + (ctx) -> validateCtxRegions.accept(ctx, ONE_REGION); + + Consumer ctxValidation = ctx -> { + assertThat(ctx.getDiagnostics()).isNotNull(); + assertThat(ctx.getDiagnostics().size()).isEqualTo(1); + CosmosDiagnostics diagnostics = ctx.getDiagnostics().iterator().next(); + assertThat(diagnostics.getClientSideRequestStatistics()).isNotNull(); + assertThat(diagnostics.getClientSideRequestStatistics().size()).isEqualTo(1); + + ClientSideRequestStatistics clientStats = 
diagnostics.getClientSideRequestStatistics().iterator().next(); + assertThat(clientStats.getResponseStatisticsList()).isNotNull(); + int actualRequestCount = clientStats.getResponseStatisticsList().size(); + + if (maxExpectedRequestCountValidation != null) { + logger.info( + "ACTUAL REQUEST COUNT: {}", + actualRequestCount); + + // TODO: expand into other consistencies + maxExpectedRequestCountValidation.accept(actualRequestCount, ConsistencyLevel.SESSION, operationType); + } + }; + + execute( + testCaseId, + endToEndTimeout, + noAvailabilityStrategy, + CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + isIdempotentWriteRetriesEnabled, + defaultNetworkRequestTimeoutDuration, + ArrayUtils.toArray(faultInjectionOperationType), + readItemCallback, + faultInjectionCallback, + validateStatusCode, + 1, + ArrayUtils.toArray(logCtx, validateCtxOneRegions, ctxValidation), + null, + null, + 0, + 0, + false, + ConnectionMode.DIRECT, + defaultThrottlingRetryOptions); + } + + @Test(groups = {"multi-master"}, dataProvider = "readMaxRetryCount_serverRequestRateTooLarge") + public void readMaxRetryCount_serverRequestRateTooLarge( + String testCaseId, + Duration endToEndTimeout, + ThrottlingRetryOptions throttlingRetryOptions, + OperationType operationType, + FaultInjectionOperationType faultInjectionOperationType, + Boolean isIdempotentWriteRetriesEnabled, + String readItemDocumentIdOverride, + BiConsumer faultInjectionCallback, + BiConsumer validateStatusCode, + TriConsumer maxExpectedRequestCountValidation) { + + final int ONE_REGION = 1; + Function readItemCallback = + this.getRequestCallBack(operationType, readItemDocumentIdOverride); + + BiConsumer validateCtxRegions = + (ctx, expectedNumberOfRegionsContacted) -> { + assertThat(ctx).isNotNull(); + if (ctx != null) { + assertThat(ctx.getContactedRegionNames().size()).isGreaterThanOrEqualTo(expectedNumberOfRegionsContacted); + } + }; + + Consumer logCtx = + (ctx) -> { + assertThat(ctx).isNotNull(); + logger.info( + "DIAGNOSTICS 
CONTEXT: {} Json: {}", + ctx.toString(), + ctx.toJson()); + }; + + Consumer validateCtxOneRegions = + (ctx) -> validateCtxRegions.accept(ctx, ONE_REGION); + + Consumer ctxValidation = ctx -> { + assertThat(ctx.getDiagnostics()).isNotNull(); + assertThat(ctx.getDiagnostics().size()).isEqualTo(1); + CosmosDiagnostics diagnostics = ctx.getDiagnostics().iterator().next(); + assertThat(diagnostics.getClientSideRequestStatistics()).isNotNull(); + assertThat(diagnostics.getClientSideRequestStatistics().size()).isEqualTo(1); + + ClientSideRequestStatistics clientStats = diagnostics.getClientSideRequestStatistics().iterator().next(); + assertThat(clientStats.getResponseStatisticsList()).isNotNull(); + int actualRequestCount = clientStats.getResponseStatisticsList().size(); + + if (maxExpectedRequestCountValidation != null) { + logger.info( + "ACTUAL REQUEST COUNT: {}", + actualRequestCount); + + // TODO: expand into other consistencies + maxExpectedRequestCountValidation.accept(actualRequestCount, ConsistencyLevel.SESSION, operationType); + } + }; + + execute( + testCaseId, + endToEndTimeout, + noAvailabilityStrategy, + CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED, + isIdempotentWriteRetriesEnabled, + defaultNetworkRequestTimeoutDuration, + ArrayUtils.toArray(faultInjectionOperationType), + readItemCallback, + faultInjectionCallback, + validateStatusCode, + 1, + ArrayUtils.toArray(logCtx, validateCtxOneRegions, ctxValidation), + null, + null, + 0, + 0, + false, + ConnectionMode.DIRECT, + throttlingRetryOptions); + } + + // Once validate the algorithm by using the e2e tests, this is a quick method to only log the max count + // can be removed + private void logMaxCount() { + // gone + final int DEFAULT_WAIT_TIME_IN_MS = 30 * 1000; + final int DEFAULT_MAXIMUM_BACKOFF_TIME_IN_MS = 15 * 1000; + final int DEFAULT_INITIAL_BACKOFF_TIME_IN_MS = 1000; + final int DEFAULT_BACK_OFF_MULTIPLIER = 2; + + for (OperationType operationType : Arrays.asList(OperationType.Read, 
OperationType.Create)) { + for (ConsistencyLevel consistencyLevel : ConsistencyLevel.values()) { + expectedMaxNumberOfRetriesForGone( + DEFAULT_WAIT_TIME_IN_MS, + DEFAULT_MAXIMUM_BACKOFF_TIME_IN_MS, + DEFAULT_INITIAL_BACKOFF_TIME_IN_MS, + DEFAULT_BACK_OFF_MULTIPLIER, + consistencyLevel, + operationType + ); + } + } + + + // transitTimeout + for (OperationType operationType : Arrays.asList(OperationType.Read, OperationType.Create)) { + for (ConsistencyLevel consistencyLevel : ConsistencyLevel.values()) { + expectedMaxNumberOfRetriesForTransientTimeout( + DEFAULT_WAIT_TIME_IN_MS, + DEFAULT_MAXIMUM_BACKOFF_TIME_IN_MS, + DEFAULT_INITIAL_BACKOFF_TIME_IN_MS, + DEFAULT_BACK_OFF_MULTIPLIER, + consistencyLevel, + operationType, + Duration.ofSeconds(1), + false + ); + + if (operationType.isWriteOperation()) { + expectedMaxNumberOfRetriesForTransientTimeout( + DEFAULT_WAIT_TIME_IN_MS, + DEFAULT_MAXIMUM_BACKOFF_TIME_IN_MS, + DEFAULT_INITIAL_BACKOFF_TIME_IN_MS, + DEFAULT_BACK_OFF_MULTIPLIER, + consistencyLevel, + operationType, + Duration.ofSeconds(1), + true + ); + } + } + } + } + + + private CosmosAsyncContainer createTestContainer(CosmosAsyncClient clientWithPreferredRegions) { + String dbId = UUID.randomUUID().toString(); + return createTestContainer(clientWithPreferredRegions, dbId); + } + + private CosmosAsyncContainer createTestContainer(CosmosAsyncClient clientWithPreferredRegions, String dbId) { + String containerId = UUID.randomUUID().toString(); + + clientWithPreferredRegions.createDatabaseIfNotExists(dbId).block(); + CosmosAsyncDatabase databaseWithSeveralWriteableRegions = clientWithPreferredRegions.getDatabase(dbId); + + // setup db and container and pass their ids accordingly + // ensure the container has a partition key definition of /mypk + + databaseWithSeveralWriteableRegions + .createContainerIfNotExists( + new CosmosContainerProperties( + containerId, + new PartitionKeyDefinition().setPaths(Arrays.asList("/mypk"))), + // for PHYSICAL_PARTITION_COUNT 
partitions + ThroughputProperties.createManualThroughput(6_000 * PHYSICAL_PARTITION_COUNT)) + .block(); + + return databaseWithSeveralWriteableRegions.getContainer(containerId); + } + + private static void inject( + String ruleName, + CosmosAsyncContainer containerWithSeveralWriteableRegions, + List applicableRegions, + FaultInjectionOperationType applicableOperationType, + FaultInjectionServerErrorResult toBeInjectedServerErrorResult, + FeedRange applicableFeedRange) { + + inject( + ruleName, + containerWithSeveralWriteableRegions, + applicableRegions, + applicableOperationType, + toBeInjectedServerErrorResult, + applicableFeedRange, + FaultInjectionConnectionType.DIRECT + ); + } + + private static void inject( + String ruleName, + CosmosAsyncContainer containerWithSeveralWriteableRegions, + List applicableRegions, + FaultInjectionOperationType applicableOperationType, + FaultInjectionServerErrorResult toBeInjectedServerErrorResult, + FeedRange applicableFeedRange, + FaultInjectionConnectionType connectionType) { + + FaultInjectionRuleBuilder ruleBuilder = new FaultInjectionRuleBuilder(ruleName); + + List faultInjectionRules = new ArrayList<>(); + + // inject 404/1002s in all regions + // configure in accordance with preferredRegions on the client + for (String region : applicableRegions) { + FaultInjectionConditionBuilder conditionBuilder = new FaultInjectionConditionBuilder() + .operationType(applicableOperationType) + .connectionType(connectionType) + .region(region); + + if (applicableFeedRange != null) { + conditionBuilder = conditionBuilder.endpoints( + new FaultInjectionEndpointBuilder(applicableFeedRange) + .replicaCount(4) + .includePrimary(true) + .build() + ); + } + + FaultInjectionCondition faultInjectionConditionForReads = conditionBuilder.build(); + + // sustained fault injection + FaultInjectionRule readSessionUnavailableRule = ruleBuilder + .condition(faultInjectionConditionForReads) + .result(toBeInjectedServerErrorResult) + 
.duration(Duration.ofSeconds(120)) + .build(); + + faultInjectionRules.add(readSessionUnavailableRule); + } + + CosmosFaultInjectionHelper + .configureFaultInjectionRules(containerWithSeveralWriteableRegions, faultInjectionRules) + .block(); + + logger.info( + "FAULT INJECTION - Applied rule '{}' for regions '{}'.", + ruleName, + String.join(", ", applicableRegions)); + } + + private static void injectReadSessionNotAvailableError( + CosmosAsyncContainer containerWithSeveralWriteableRegions, + List applicableRegions, + FaultInjectionOperationType operationType, + FeedRange applicableFeedRange) { + + String ruleName = "serverErrorRule-read-session-unavailable-" + UUID.randomUUID(); + FaultInjectionServerErrorResult badSessionTokenServerErrorResult = FaultInjectionResultBuilders + .getResultBuilder(FaultInjectionServerErrorType.READ_SESSION_NOT_AVAILABLE) + .build(); + + inject( + ruleName, + containerWithSeveralWriteableRegions, + applicableRegions, + operationType, + badSessionTokenServerErrorResult, + applicableFeedRange + ); + } + + private static void injectGatewayTransitTimeout( + CosmosAsyncContainer containerWithSeveralWriteableRegions, + List applicableRegions, + FaultInjectionOperationType faultInjectionOperationType) { + + String ruleName = "serverErrorRule-gatewayTransitTimeout-" + UUID.randomUUID(); + FaultInjectionServerErrorResult timeoutResult = FaultInjectionResultBuilders + .getResultBuilder(FaultInjectionServerErrorType.RESPONSE_DELAY) + .delay(Duration.ofSeconds(6)) + .build(); + + inject( + ruleName, + containerWithSeveralWriteableRegions, + applicableRegions, + faultInjectionOperationType, + timeoutResult, + null, + FaultInjectionConnectionType.GATEWAY + ); + } + + private static void injectServiceUnavailable( + CosmosAsyncContainer containerWithSeveralWriteableRegions, + List applicableRegions, + FaultInjectionOperationType faultInjectionOperationType) { + + String ruleName = "serverErrorRule-serviceUnavailable-" + UUID.randomUUID(); + 
FaultInjectionServerErrorResult serviceUnavailableResult = FaultInjectionResultBuilders + .getResultBuilder(FaultInjectionServerErrorType.SERVICE_UNAVAILABLE) + .delay(Duration.ofMillis(5)) + .build(); + + inject( + ruleName, + containerWithSeveralWriteableRegions, + applicableRegions, + faultInjectionOperationType, + serviceUnavailableResult, + null + ); + } + + private static void injectInternalServerError( + CosmosAsyncContainer containerWithSeveralWriteableRegions, + List applicableRegions, + FaultInjectionOperationType faultInjectionOperationType) { + + String ruleName = "serverErrorRule-internalServerError-" + UUID.randomUUID(); + FaultInjectionServerErrorResult serviceUnavailableResult = FaultInjectionResultBuilders + .getResultBuilder(FaultInjectionServerErrorType.INTERNAL_SERVER_ERROR) + .build(); + + inject( + ruleName, + containerWithSeveralWriteableRegions, + applicableRegions, + faultInjectionOperationType, + serviceUnavailableResult, + null + ); + } + + private static void injectRetryWithServerError( + CosmosAsyncContainer containerWithSeveralWriteableRegions, + List applicableRegions, + FaultInjectionOperationType faultInjectionOperationType) { + + String ruleName = "serverErrorRule-retryWithError-" + UUID.randomUUID(); + FaultInjectionServerErrorResult retryWithResult = FaultInjectionResultBuilders + .getResultBuilder(FaultInjectionServerErrorType.RETRY_WITH) + .build(); + + inject( + ruleName, + containerWithSeveralWriteableRegions, + applicableRegions, + faultInjectionOperationType, + retryWithResult, + null + ); + } + + private static void injectServerGoneError( + CosmosAsyncContainer containerWithSeveralWriteableRegions, + List applicableRegions, + FaultInjectionOperationType faultInjectionOperationType) { + + String ruleName = "serverErrorRule-serverGoneError-" + UUID.randomUUID(); + FaultInjectionServerErrorResult serverGoneResult = FaultInjectionResultBuilders + .getResultBuilder(FaultInjectionServerErrorType.GONE) + .build(); + + inject( + 
ruleName, + containerWithSeveralWriteableRegions, + applicableRegions, + faultInjectionOperationType, + serverGoneResult, + null + ); + } + + private static void injectTransitTimeoutError( + CosmosAsyncContainer containerWithSeveralWriteableRegions, + List applicableRegions, + FaultInjectionOperationType faultInjectionOperationType, + Duration networkRequestTimeout) { + + String ruleName = "serverErrorRule-transitTimeoutError-" + UUID.randomUUID(); + FaultInjectionServerErrorResult transitTimeoutResult = FaultInjectionResultBuilders + .getResultBuilder(FaultInjectionServerErrorType.RESPONSE_DELAY) + .delay(networkRequestTimeout.plus(Duration.ofMillis(100))) + .build(); + + inject( + ruleName, + containerWithSeveralWriteableRegions, + applicableRegions, + faultInjectionOperationType, + transitTimeoutResult, + null + ); + } + + private static void injectServerTimeoutError( + CosmosAsyncContainer containerWithSeveralWriteableRegions, + List applicableRegions, + FaultInjectionOperationType faultInjectionOperationType) { + + String ruleName = "serverErrorRule-serverTimeout-" + UUID.randomUUID(); + FaultInjectionServerErrorResult serverTimeoutResult = FaultInjectionResultBuilders + .getResultBuilder(FaultInjectionServerErrorType.TIMEOUT) + .build(); + + inject( + ruleName, + containerWithSeveralWriteableRegions, + applicableRegions, + faultInjectionOperationType, + serverTimeoutResult, + null + ); + } + + private static void injectServerRequestRateTooLargeError( + CosmosAsyncContainer containerWithSeveralWriteableRegions, + List applicableRegions, + FaultInjectionOperationType faultInjectionOperationType) { + + String ruleName = "serverErrorRule-server429-" + UUID.randomUUID(); + FaultInjectionServerErrorResult serverRequestRateTooLargeResult = FaultInjectionResultBuilders + .getResultBuilder(FaultInjectionServerErrorType.TOO_MANY_REQUEST) + .build(); + + inject( + ruleName, + containerWithSeveralWriteableRegions, + applicableRegions, + faultInjectionOperationType, + 
serverRequestRateTooLargeResult, + null + ); + } + + private void execute( + String testCaseId, + Duration endToEndTimeout, + ThresholdBasedAvailabilityStrategy availabilityStrategy, + CosmosRegionSwitchHint regionSwitchHint, + Boolean nonIdempotentWriteRetriesEnabled, + Duration networkRequestTimeout, + FaultInjectionOperationType[] faultInjectionOperationTypes, + Function actionAfterInitialCreation, + BiConsumer faultInjectionCallback, + BiConsumer validateStatusCode, + int expectedDiagnosticsContextCount, + Consumer[] firstDiagnosticsContextValidations, + Consumer[] otherDiagnosticsContextValidations, + Consumer validateResponse, + int numberOfOtherDocumentsWithSameId, + int numberOfOtherDocumentsWithSamePk, + boolean clearContainerBeforeExecution, + ConnectionMode connectionMode, + ThrottlingRetryOptions throttlingRetryOptions) { + + logger.info("START {}", testCaseId); + + CosmosAsyncClient clientWithPreferredRegions = buildCosmosClient( + this.writeableRegions, + regionSwitchHint, + nonIdempotentWriteRetriesEnabled, + connectionMode, + networkRequestTimeout, + throttlingRetryOptions); + + try { + + if (clearContainerBeforeExecution) { + CosmosAsyncContainer newTestContainer = + this.createTestContainer(clientWithPreferredRegions, this.testDatabaseId); + this.testContainerId = newTestContainer.getId(); + // Creating a container is an async task - especially with multiple regions it can + // take some time until the container is available in the remote regions as well + // When the container does not exist yet, you would see 401 for example for point reads etc. 
+ // So, adding this delay after container creation to minimize risk of hitting these errors + try { + Thread.sleep(3000); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + } + + String documentId = UUID.randomUUID().toString(); + Pair idAndPkValPair = new ImmutablePair<>(documentId, documentId); + + CosmosDiagnosticsTest.TestItem createdItem = new CosmosDiagnosticsTest.TestItem(documentId, documentId); + CosmosAsyncContainer testContainer = clientWithPreferredRegions + .getDatabase(this.testDatabaseId) + .getContainer(this.testContainerId); + + testContainer.createItem(createdItem).block(); + + List> otherIdAndPkValues = new ArrayList<>(); + for (int i = 0; i < numberOfOtherDocumentsWithSameId; i++) { + String additionalPK = UUID.randomUUID().toString(); + testContainer.createItem(new CosmosDiagnosticsTest.TestItem(documentId, additionalPK)).block(); + otherIdAndPkValues.add(Pair.of(documentId, additionalPK)); + } + + for (int i = 0; i < numberOfOtherDocumentsWithSamePk; i++) { + String sharedPK = documentId; + String additionalDocumentId = UUID.randomUUID().toString(); + testContainer.createItem(new CosmosDiagnosticsTest.TestItem(additionalDocumentId, sharedPK)).block(); + otherIdAndPkValues.add(Pair.of(additionalDocumentId, sharedPK)); + } + + if (faultInjectionCallback != null) { + for (FaultInjectionOperationType faultInjectionOperationType: faultInjectionOperationTypes) { + faultInjectionCallback.accept(testContainer, faultInjectionOperationType); + } + } + + CosmosEndToEndOperationLatencyPolicyConfigBuilder e2ePolicyBuilder = + new CosmosEndToEndOperationLatencyPolicyConfigBuilder(endToEndTimeout) + .enable(true); + CosmosEndToEndOperationLatencyPolicyConfig endToEndOperationLatencyPolicyConfig = + availabilityStrategy != null + ? 
e2ePolicyBuilder.availabilityStrategy(availabilityStrategy).build() + : e2ePolicyBuilder.build(); + + CosmosPatchItemRequestOptions itemRequestOptions = new CosmosPatchItemRequestOptions(); + + if (endToEndOperationLatencyPolicyConfig != null) { + itemRequestOptions.setCosmosEndToEndOperationLatencyPolicyConfig(endToEndOperationLatencyPolicyConfig); + } + + try { + + ItemOperationInvocationParameters params = new ItemOperationInvocationParameters(); + params.container = testContainer; + params.options = itemRequestOptions; + params.idAndPkValuePair = idAndPkValPair; + params.otherDocumentIdAndPkValuePairs = otherIdAndPkValues; + params.nonIdempotentWriteRetriesEnabled = nonIdempotentWriteRetriesEnabled; + + CosmosResponseWrapper response = actionAfterInitialCreation.apply(params); + + CosmosDiagnosticsContext[] diagnosticsContexts = response.getDiagnosticsContexts(); + assertThat(diagnosticsContexts).isNotNull(); + + logger.info( + "DIAGNOSTICS CONTEXT COUNT: {}", + diagnosticsContexts.length); + for (CosmosDiagnosticsContext diagnosticsContext: diagnosticsContexts) { + logger.info( + "DIAGNOSTICS CONTEXT: {} {}", + diagnosticsContext != null ? diagnosticsContext.toString() : "n/a", + diagnosticsContext != null ? 
diagnosticsContext.toJson() : "NULL"); + } + + assertThat(diagnosticsContexts.length).isEqualTo(expectedDiagnosticsContextCount); + + if (response == null) { + fail("Response is null"); + } else { + validateStatusCode.accept(response.getStatusCode(), null); + if (validateResponse != null) { + validateResponse.accept(response); + } + } + + for (Consumer ctxValidation : firstDiagnosticsContextValidations) { + ctxValidation.accept(diagnosticsContexts[0]); + } + + for (int i = 1; i < diagnosticsContexts.length; i++) { + CosmosDiagnosticsContext currentCtx = diagnosticsContexts[i]; + + for (Consumer ctxValidation : otherDiagnosticsContextValidations) { + ctxValidation.accept(currentCtx); + } + } + } catch (Exception e) { + if (e instanceof CosmosException) { + CosmosException cosmosException = Utils.as(e, CosmosException.class); + CosmosDiagnosticsContext diagnosticsContext = null; + if (cosmosException.getDiagnostics() != null) { + diagnosticsContext = cosmosException.getDiagnostics().getDiagnosticsContext(); + } + + logger.info("EXCEPTION: ", e); + logger.info( + "DIAGNOSTICS CONTEXT: {} {}", + diagnosticsContext != null ? diagnosticsContext.toString() : "n/a", + diagnosticsContext != null ? 
diagnosticsContext.toJson(): "NULL"); + + validateStatusCode.accept(cosmosException.getStatusCode(), cosmosException.getSubStatusCode()); + if (firstDiagnosticsContextValidations != null) { + assertThat(expectedDiagnosticsContextCount).isEqualTo(1); + for (Consumer ctxValidation : firstDiagnosticsContextValidations) { + ctxValidation.accept(diagnosticsContext); + } + } + } else { + fail("A CosmosException instance should have been thrown.", e); + } + } + } finally { + safeClose(clientWithPreferredRegions); + } + } + + private static CosmosAsyncClient buildCosmosClient( + List preferredRegions, + CosmosRegionSwitchHint regionSwitchHint, + Boolean nonIdempotentWriteRetriesEnabled, + ConnectionMode connectionMode, + Duration networkRequestTimeout, + ThrottlingRetryOptions throttlingRetryOptions) { + + CosmosClientTelemetryConfig telemetryConfig = new CosmosClientTelemetryConfig() + .diagnosticsHandler(new CosmosDiagnosticsLogger()); + + CosmosRegionSwitchHint effectiveRegionSwitchHint = regionSwitchHint != null + ? 
regionSwitchHint + : CosmosRegionSwitchHint.LOCAL_REGION_PREFERRED; + SessionRetryOptionsBuilder retryOptionsBuilder = new SessionRetryOptionsBuilder() + .regionSwitchHint(effectiveRegionSwitchHint); + + CosmosClientBuilder builder = new CosmosClientBuilder() + .endpoint(TestConfigurations.HOST) + .key(TestConfigurations.MASTER_KEY) + .consistencyLevel(ConsistencyLevel.SESSION) + .preferredRegions(preferredRegions) + .sessionRetryOptions(retryOptionsBuilder.build()) + .multipleWriteRegionsEnabled(true) + .clientTelemetryConfig(telemetryConfig); + + if (throttlingRetryOptions != null) { + builder.throttlingRetryOptions(throttlingRetryOptions); + } + + if (connectionMode == ConnectionMode.GATEWAY) { + builder.gatewayMode(); + } else { + DirectConnectionConfig directConnectionConfig = DirectConnectionConfig.getDefaultConfig(); + if (networkRequestTimeout != null) { + directConnectionConfig.setNetworkRequestTimeout(networkRequestTimeout); + } + builder.directMode(directConnectionConfig); + } + + if (nonIdempotentWriteRetriesEnabled != null) { + builder.setNonIdempotentWriteRetryPolicy( + nonIdempotentWriteRetriesEnabled, true); + } + + return builder.buildAsyncClient(); + } + + private Map getRegionMap(DatabaseAccount databaseAccount, boolean writeOnly) { + Iterator locationIterator = + writeOnly ? databaseAccount.getWritableLocations().iterator() : databaseAccount.getReadableLocations().iterator(); + Map regionMap = new ConcurrentHashMap<>(); + + while (locationIterator.hasNext()) { + DatabaseAccountLocation accountLocation = locationIterator.next(); + regionMap.put(accountLocation.getName(), accountLocation.getEndpoint()); + } + + return regionMap; + } + + private Function getRequestCallBack( + OperationType operationType, + String readItemDocumentIdOverride) { + + switch (operationType) { + case Read: + return (params) -> + new CosmosResponseWrapper(params.container + .readItem( + readItemDocumentIdOverride != null + ? 
readItemDocumentIdOverride + : params.idAndPkValuePair.getLeft(), + new PartitionKey(params.idAndPkValuePair.getRight()), + params.options, + ObjectNode.class) + .block()); + case Create: + return (params) -> + new CosmosResponseWrapper(params.container + .createItem(TestObject.create()) + .block()); + default: + throw new IllegalArgumentException("Request operation is not supported: " + operationType); + } + } + + private static class CosmosResponseWrapper { + private final CosmosDiagnosticsContext[] diagnosticsContexts; + private final Integer statusCode; + private final Integer subStatusCode; + + private final Long totalRecordCount; + + public CosmosResponseWrapper(CosmosItemResponse itemResponse) { + if (itemResponse.getDiagnostics() != null && + itemResponse.getDiagnostics().getDiagnosticsContext() != null) { + + this.diagnosticsContexts = ArrayUtils.toArray(itemResponse.getDiagnostics().getDiagnosticsContext()); + } else { + this.diagnosticsContexts = null; + } + + this.statusCode = itemResponse.getStatusCode(); + this.subStatusCode = null; + this.totalRecordCount = itemResponse.getItem() != null ? 
1L : 0L; + } + + public CosmosResponseWrapper(CosmosDiagnosticsContext[] ctxs, int statusCode, Integer subStatusCode, Long totalRecordCount) { + this.diagnosticsContexts = ctxs; + this.statusCode = statusCode; + this.subStatusCode = subStatusCode; + this.totalRecordCount = totalRecordCount; + } + + public CosmosDiagnosticsContext[] getDiagnosticsContexts() { + return this.diagnosticsContexts; + } + + public Integer getStatusCode() { + return this.statusCode; + } + + public Integer getSubStatusCode() { + return this.subStatusCode; + } + + public Long getTotalRecordCount() { + return this.totalRecordCount; + } + } + + private static class ItemOperationInvocationParameters { + public CosmosPatchItemRequestOptions options; + public CosmosAsyncContainer container; + public Pair idAndPkValuePair; + + public List> otherDocumentIdAndPkValuePairs; + public Boolean nonIdempotentWriteRetriesEnabled; + } +} diff --git a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/faultinjection/ExcludedRegionWithFaultInjectionTests.java b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/faultinjection/ExcludedRegionWithFaultInjectionTests.java index 0ce0f2843a643..3476112267f0c 100644 --- a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/faultinjection/ExcludedRegionWithFaultInjectionTests.java +++ b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/faultinjection/ExcludedRegionWithFaultInjectionTests.java @@ -1099,9 +1099,9 @@ public Object[][] regionExclusionWriteAfterCreateTestConfigs() { this.chooseFirstRegion.apply(this.preferredRegions) )) .withExpectedResultAfterMutation(new ExpectedResult( - HttpConstants.StatusCodes.SERVICE_UNAVAILABLE, - HttpConstants.SubStatusCodes.SERVER_GENERATED_503, - this.chooseFirstRegion.apply(this.preferredRegions) + HttpConstants.StatusCodes.CREATED, + HttpConstants.SubStatusCodes.UNKNOWN, + this.chooseAllRegions.apply(this.preferredRegions) )), }, { @@ -1159,9 +1159,9 @@ public Object[][] 
regionExclusionWriteAfterCreateTestConfigs() { this.chooseFirstRegion.apply(this.preferredRegions) )) .withExpectedResultAfterMutation(new ExpectedResult( - HttpConstants.StatusCodes.SERVICE_UNAVAILABLE, - HttpConstants.SubStatusCodes.SERVER_GENERATED_503, - this.chooseFirstRegion.apply(this.preferredRegions) + HttpConstants.StatusCodes.OK, + HttpConstants.SubStatusCodes.UNKNOWN, + this.chooseAllRegions.apply(this.preferredRegions) )), }, { @@ -1224,9 +1224,9 @@ public Object[][] regionExclusionWriteAfterCreateTestConfigs() { this.chooseFirstRegion.apply(this.preferredRegions) )) .withExpectedResultAfterMutation(new ExpectedResult( - HttpConstants.StatusCodes.SERVICE_UNAVAILABLE, - HttpConstants.SubStatusCodes.SERVER_GENERATED_503, - this.chooseFirstRegion.apply(this.preferredRegions) + HttpConstants.StatusCodes.OK, + HttpConstants.SubStatusCodes.UNKNOWN, + this.chooseAllRegions.apply(this.preferredRegions) )), }, { @@ -1266,9 +1266,9 @@ public Object[][] regionExclusionWriteAfterCreateTestConfigs() { this.chooseFirstRegion.apply(this.preferredRegions) )) .withExpectedResultAfterMutation(new ExpectedResult( - HttpConstants.StatusCodes.SERVICE_UNAVAILABLE, - HttpConstants.SubStatusCodes.SERVER_GENERATED_503, - this.chooseFirstRegion.apply(this.preferredRegions) + HttpConstants.StatusCodes.CREATED, + HttpConstants.SubStatusCodes.UNKNOWN, + this.chooseAllRegions.apply(this.preferredRegions) )), }, { @@ -1308,9 +1308,9 @@ public Object[][] regionExclusionWriteAfterCreateTestConfigs() { this.chooseFirstRegion.apply(this.preferredRegions) )) .withExpectedResultAfterMutation(new ExpectedResult( - HttpConstants.StatusCodes.SERVICE_UNAVAILABLE, - HttpConstants.SubStatusCodes.SERVER_GENERATED_503, - this.chooseFirstRegion.apply(this.preferredRegions) + HttpConstants.StatusCodes.NO_CONTENT, + HttpConstants.SubStatusCodes.UNKNOWN, + this.chooseAllRegions.apply(this.preferredRegions) )), }, { @@ -1350,9 +1350,9 @@ public Object[][] regionExclusionWriteAfterCreateTestConfigs() { 
this.chooseFirstRegion.apply(this.preferredRegions) )) .withExpectedResultAfterMutation(new ExpectedResult( - HttpConstants.StatusCodes.SERVICE_UNAVAILABLE, - HttpConstants.SubStatusCodes.SERVER_GENERATED_503, - this.chooseFirstRegion.apply(this.preferredRegions) + HttpConstants.StatusCodes.OK, + HttpConstants.SubStatusCodes.UNKNOWN, + this.chooseAllRegions.apply(this.preferredRegions) )), }, { @@ -1668,9 +1668,10 @@ public Object[][] regionExclusionBatchTestConfigs() { this.chooseLastRegion.apply(this.preferredRegions) )) .withExpectedResultAfterMutation(new ExpectedResult( - HttpConstants.StatusCodes.SERVICE_UNAVAILABLE, - HttpConstants.SubStatusCodes.SERVER_GENERATED_503, - this.chooseFirstRegion.apply(this.preferredRegions) + HttpConstants.StatusCodes.OK, + HttpConstants.SubStatusCodes.UNKNOWN, + 2, + this.chooseFirstTwoRegions.apply(this.preferredRegions) )) }, { @@ -1689,9 +1690,10 @@ public Object[][] regionExclusionBatchTestConfigs() { this.chooseLastRegion.apply(this.preferredRegions) )) .withExpectedResultAfterMutation(new ExpectedResult( - HttpConstants.StatusCodes.SERVICE_UNAVAILABLE, - HttpConstants.SubStatusCodes.SERVER_GENERATED_503, - this.chooseFirstRegion.apply(this.preferredRegions) + HttpConstants.StatusCodes.OK, + HttpConstants.SubStatusCodes.UNKNOWN, + 2, + this.chooseFirstTwoRegions.apply(this.preferredRegions) )) }, { @@ -2035,9 +2037,9 @@ public Object[][] regionExclusionBulkTestConfigs() { this.chooseLastRegion.apply(this.preferredRegions) )) .withExpectedResultAfterMutation(new ExpectedResult( - HttpConstants.StatusCodes.SERVICE_UNAVAILABLE, - HttpConstants.SubStatusCodes.SERVER_GENERATED_503, - this.chooseFirstRegion.apply(this.preferredRegions) + HttpConstants.StatusCodes.OK, + HttpConstants.SubStatusCodes.UNKNOWN, + this.chooseFirstTwoRegions.apply(this.preferredRegions) )) }, { @@ -2055,9 +2057,9 @@ public Object[][] regionExclusionBulkTestConfigs() { this.chooseLastRegion.apply(this.preferredRegions) )) 
.withExpectedResultAfterMutation(new ExpectedResult( - HttpConstants.StatusCodes.SERVICE_UNAVAILABLE, - HttpConstants.SubStatusCodes.SERVER_GENERATED_503, - this.chooseFirstRegion.apply(this.preferredRegions) + HttpConstants.StatusCodes.OK, + HttpConstants.SubStatusCodes.UNKNOWN, + this.chooseFirstTwoRegions.apply(this.preferredRegions) )) }, { diff --git a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/faultinjection/FaultInjectionServerErrorRuleOnDirectTests.java b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/faultinjection/FaultInjectionServerErrorRuleOnDirectTests.java index e0dfb233950bd..4c3f59196bc66 100644 --- a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/faultinjection/FaultInjectionServerErrorRuleOnDirectTests.java +++ b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/faultinjection/FaultInjectionServerErrorRuleOnDirectTests.java @@ -124,7 +124,7 @@ public static Object[][] faultInjectionServerErrorResponseProvider() { { OperationType.Read, FaultInjectionOperationType.READ_ITEM, FaultInjectionServerErrorType.GONE, true, 410, HttpConstants.SubStatusCodes.SERVER_GENERATED_410 }, { OperationType.Read, FaultInjectionOperationType.READ_ITEM, FaultInjectionServerErrorType.INTERNAL_SERVER_ERROR, false, 500, 0 }, { OperationType.Read, FaultInjectionOperationType.READ_ITEM, FaultInjectionServerErrorType.RETRY_WITH, true, 449, 0 }, - { OperationType.Read, FaultInjectionOperationType.READ_ITEM, FaultInjectionServerErrorType.TOO_MANY_REQUEST, true, 429, 0 }, + { OperationType.Read, FaultInjectionOperationType.READ_ITEM, FaultInjectionServerErrorType.TOO_MANY_REQUEST, true, 429, HttpConstants.SubStatusCodes.USER_REQUEST_RATE_TOO_LARGE }, { OperationType.Read, FaultInjectionOperationType.READ_ITEM, FaultInjectionServerErrorType.READ_SESSION_NOT_AVAILABLE, true, 404, 1002 }, { OperationType.Read, FaultInjectionOperationType.READ_ITEM, FaultInjectionServerErrorType.TIMEOUT, true, 410, 
HttpConstants.SubStatusCodes.SERVER_GENERATED_408 }, // for server return 408, SDK will wrap into 410/21010 { OperationType.Read, FaultInjectionOperationType.READ_ITEM, FaultInjectionServerErrorType.PARTITION_IS_MIGRATING, true, 410, 1008 }, @@ -133,7 +133,7 @@ public static Object[][] faultInjectionServerErrorResponseProvider() { { OperationType.ReadFeed, FaultInjectionOperationType.READ_FEED_ITEM, FaultInjectionServerErrorType.GONE, true, 410, HttpConstants.SubStatusCodes.SERVER_GENERATED_410 }, { OperationType.ReadFeed, FaultInjectionOperationType.READ_FEED_ITEM, FaultInjectionServerErrorType.INTERNAL_SERVER_ERROR, false, 500, 0 }, { OperationType.ReadFeed, FaultInjectionOperationType.READ_FEED_ITEM, FaultInjectionServerErrorType.RETRY_WITH, true, 449, 0 }, - { OperationType.ReadFeed, FaultInjectionOperationType.READ_FEED_ITEM, FaultInjectionServerErrorType.TOO_MANY_REQUEST, true, 429, 0 }, + { OperationType.ReadFeed, FaultInjectionOperationType.READ_FEED_ITEM, FaultInjectionServerErrorType.TOO_MANY_REQUEST, true, 429, HttpConstants.SubStatusCodes.USER_REQUEST_RATE_TOO_LARGE }, { OperationType.ReadFeed, FaultInjectionOperationType.READ_FEED_ITEM, FaultInjectionServerErrorType.READ_SESSION_NOT_AVAILABLE, true, 404, 1002 }, { OperationType.ReadFeed, FaultInjectionOperationType.READ_FEED_ITEM, FaultInjectionServerErrorType.TIMEOUT, true, 410, HttpConstants.SubStatusCodes.SERVER_GENERATED_408 }, // for server return 408, SDK will wrap into 410/21010 { OperationType.ReadFeed, FaultInjectionOperationType.READ_FEED_ITEM, FaultInjectionServerErrorType.PARTITION_IS_MIGRATING, true, 410, 1008 }, @@ -142,7 +142,7 @@ public static Object[][] faultInjectionServerErrorResponseProvider() { { OperationType.Create, FaultInjectionOperationType.CREATE_ITEM, FaultInjectionServerErrorType.GONE, true, 410, HttpConstants.SubStatusCodes.SERVER_GENERATED_410 }, { OperationType.Create, FaultInjectionOperationType.CREATE_ITEM, FaultInjectionServerErrorType.INTERNAL_SERVER_ERROR, false, 
500, 0 }, { OperationType.Create, FaultInjectionOperationType.CREATE_ITEM, FaultInjectionServerErrorType.RETRY_WITH, true, 449, 0 }, - { OperationType.Create, FaultInjectionOperationType.CREATE_ITEM, FaultInjectionServerErrorType.TOO_MANY_REQUEST, true, 429, 0 }, + { OperationType.Create, FaultInjectionOperationType.CREATE_ITEM, FaultInjectionServerErrorType.TOO_MANY_REQUEST, true, 429, HttpConstants.SubStatusCodes.USER_REQUEST_RATE_TOO_LARGE }, { OperationType.Create, FaultInjectionOperationType.CREATE_ITEM, FaultInjectionServerErrorType.TIMEOUT, false, 410, HttpConstants.SubStatusCodes.SERVER_GENERATED_408 }, // for server return 408, SDK will wrap into 410/21010 { OperationType.Create, FaultInjectionOperationType.CREATE_ITEM, FaultInjectionServerErrorType.PARTITION_IS_MIGRATING, true, 410, 1008 }, { OperationType.Create, FaultInjectionOperationType.CREATE_ITEM, FaultInjectionServerErrorType.PARTITION_IS_SPLITTING, true, 410, 1007 }, @@ -220,7 +220,7 @@ public void faultInjectionServerErrorRuleTests_OperationType(OperationType opera cosmosDiagnostics, operationType, HttpConstants.StatusCodes.TOO_MANY_REQUESTS, - HttpConstants.SubStatusCodes.UNKNOWN, + HttpConstants.SubStatusCodes.USER_REQUEST_RATE_TOO_LARGE, tooManyRequestsRuleId, true); } else { @@ -494,7 +494,7 @@ public void faultInjectionServerErrorRuleTests_Partition() throws JsonProcessing cosmosDiagnostics, OperationType.Read, HttpConstants.StatusCodes.TOO_MANY_REQUESTS, - HttpConstants.SubStatusCodes.UNKNOWN, + HttpConstants.SubStatusCodes.USER_REQUEST_RATE_TOO_LARGE, feedRangeRuleId, true ); diff --git a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/faultinjection/FaultInjectionServerErrorRuleOnGatewayTests.java b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/faultinjection/FaultInjectionServerErrorRuleOnGatewayTests.java index ff3e1d218cca6..655619316e966 100644 --- 
a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/faultinjection/FaultInjectionServerErrorRuleOnGatewayTests.java +++ b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/faultinjection/FaultInjectionServerErrorRuleOnGatewayTests.java @@ -122,7 +122,7 @@ public static Object[][] faultInjectionServerErrorResponseProvider() { // faultInjectionServerError, will SDK retry, errorStatusCode, errorSubStatusCode { FaultInjectionServerErrorType.INTERNAL_SERVER_ERROR, false, 500, 0 }, { FaultInjectionServerErrorType.RETRY_WITH, false, 449, 0 }, - { FaultInjectionServerErrorType.TOO_MANY_REQUEST, true, 429, 0 }, + { FaultInjectionServerErrorType.TOO_MANY_REQUEST, true, 429, HttpConstants.SubStatusCodes.USER_REQUEST_RATE_TOO_LARGE }, { FaultInjectionServerErrorType.READ_SESSION_NOT_AVAILABLE, true, 404, 1002 }, { FaultInjectionServerErrorType.SERVICE_UNAVAILABLE, false, 503, 21008 } }; @@ -283,7 +283,7 @@ public void faultInjectionServerErrorRuleTests_Partition() throws JsonProcessing cosmosDiagnostics, OperationType.Read, HttpConstants.StatusCodes.TOO_MANY_REQUESTS, - HttpConstants.SubStatusCodes.UNKNOWN, + HttpConstants.SubStatusCodes.USER_REQUEST_RATE_TOO_LARGE, feedRangeRuleId, true ); @@ -504,7 +504,7 @@ public void faultInjectionServerErrorRuleTests_HitLimit( cosmosDiagnostics, operationType, HttpConstants.StatusCodes.TOO_MANY_REQUESTS, - HttpConstants.SubStatusCodes.UNKNOWN, + HttpConstants.SubStatusCodes.USER_REQUEST_RATE_TOO_LARGE, hitLimitRuleId, true ); diff --git a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/SessionNotAvailableRetryTest.java b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/SessionNotAvailableRetryTest.java index 6bb163d4e95b8..7f19b29db84f3 100644 --- a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/SessionNotAvailableRetryTest.java +++ 
b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/SessionNotAvailableRetryTest.java @@ -9,6 +9,7 @@ import com.azure.cosmos.CosmosClientBuilder; import com.azure.cosmos.CosmosException; import com.azure.cosmos.implementation.apachecommons.lang.NotImplementedException; +import com.azure.cosmos.implementation.apachecommons.lang.StringUtils; import com.azure.cosmos.implementation.directconnectivity.ConsistencyReader; import com.azure.cosmos.implementation.directconnectivity.ConsistencyWriter; import com.azure.cosmos.implementation.directconnectivity.ReflectionUtils; @@ -26,6 +27,12 @@ import com.azure.cosmos.models.CosmosQueryRequestOptions; import com.azure.cosmos.models.PartitionKey; import com.azure.cosmos.rx.TestSuiteBase; +import com.azure.cosmos.test.faultinjection.CosmosFaultInjectionHelper; +import com.azure.cosmos.test.faultinjection.FaultInjectionConditionBuilder; +import com.azure.cosmos.test.faultinjection.FaultInjectionResultBuilders; +import com.azure.cosmos.test.faultinjection.FaultInjectionRule; +import com.azure.cosmos.test.faultinjection.FaultInjectionRuleBuilder; +import com.azure.cosmos.test.faultinjection.FaultInjectionServerErrorType; import org.apache.commons.lang3.reflect.FieldUtils; import org.mockito.Mockito; import org.mockito.stubbing.Answer; @@ -38,11 +45,16 @@ import java.lang.reflect.Field; import java.net.URI; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashSet; import java.util.Iterator; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; +import java.util.UUID; +import java.util.concurrent.ConcurrentHashMap; +import java.util.stream.Collectors; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.fail; @@ -79,29 +91,25 @@ public void afterClass() { @DataProvider(name = "preferredRegions") private Object[][] preferredRegions() { List preferredLocations1 = new ArrayList<>(); - List 
regionalSuffix1 = new ArrayList<>(); List preferredLocations2 = new ArrayList<>(); - List regionalSuffix2 = new ArrayList<>(); Iterator locationIterator = this.databaseAccount.getReadableLocations().iterator(); while (locationIterator.hasNext()) { DatabaseAccountLocation accountLocation = locationIterator.next(); preferredLocations1.add(accountLocation.getName()); - regionalSuffix1.add(getRegionalSuffix(accountLocation.getEndpoint(), TestConfigurations.HOST)); } //putting preferences in opposite direction than what came from database account api for (int i = preferredLocations1.size() - 1; i >= 0; i--) { preferredLocations2.add(preferredLocations1.get(i)); - regionalSuffix2.add(regionalSuffix1.get(i)); } return new Object[][]{ - new Object[]{preferredLocations1, regionalSuffix1, OperationType.Read}, - new Object[]{preferredLocations2, regionalSuffix2, OperationType.Read}, - new Object[]{preferredLocations1, regionalSuffix1, OperationType.Query}, - new Object[]{preferredLocations2, regionalSuffix2, OperationType.Query}, - new Object[]{preferredLocations1, regionalSuffix1, OperationType.Create}, - new Object[]{preferredLocations2, regionalSuffix2, OperationType.Create}, + new Object[]{preferredLocations1, OperationType.Read}, + new Object[]{preferredLocations2, OperationType.Read}, + new Object[]{preferredLocations1, OperationType.Query}, + new Object[]{preferredLocations2, OperationType.Query}, + new Object[]{preferredLocations1, OperationType.Create}, + new Object[]{preferredLocations2, OperationType.Create}, }; } @@ -115,9 +123,22 @@ private Object[][] operations() { } @Test(groups = {"multi-master"}, dataProvider = "preferredRegions", timeOut = TIMEOUT) - public void sessionNotAvailableRetryMultiMaster(List preferredLocations, List regionalSuffix, - OperationType operationType) throws Exception { + public void sessionNotAvailableRetryMultiMaster( + List preferredLocations, + OperationType operationType) { + + List preferredLocationsWithLowerCase = + 
preferredLocations.stream().map(location -> location.toLowerCase(Locale.ROOT)).collect(Collectors.toList()); CosmosAsyncClient preferredListClient = null; + // inject 404/1002 into all regions + FaultInjectionRule sessionNotAvailableRule = new FaultInjectionRuleBuilder("sessionNotAvailableRuleMultiMaster-" + UUID.randomUUID()) + .condition(new FaultInjectionConditionBuilder().build()) + .result( + FaultInjectionResultBuilders + .getResultBuilder(FaultInjectionServerErrorType.READ_SESSION_NOT_AVAILABLE) + .build()) + .build(); + try { preferredListClient = new CosmosClientBuilder() .endpoint(TestConfigurations.HOST) @@ -127,37 +148,9 @@ public void sessionNotAvailableRetryMultiMaster(List preferredLocations, .preferredRegions(preferredLocations) .buildAsyncClient(); - AsyncDocumentClient asyncDocumentClient = ReflectionUtils.getAsyncDocumentClient(preferredListClient); - RxDocumentClientImpl rxDocumentClient = (RxDocumentClientImpl) asyncDocumentClient; - StoreClient storeClient = ReflectionUtils.getStoreClient(rxDocumentClient); - ReplicatedResourceClient replicatedResourceClient = - ReflectionUtils.getReplicatedResourceClient(storeClient); - ConsistencyReader consistencyReader = ReflectionUtils.getConsistencyReader(replicatedResourceClient); - ConsistencyWriter consistencyWriter = ReflectionUtils.getConsistencyWriter(replicatedResourceClient); - StoreReader storeReader = ReflectionUtils.getStoreReader(consistencyReader); - - GlobalEndpointManager globalEndpointManager = ReflectionUtils.getGlobalEndpointManager(rxDocumentClient); - - RntbdTransportClientTest rntbdTransportClient = new RntbdTransportClientTest(globalEndpointManager); - RntbdTransportClientTest spyRntbdTransportClient = Mockito.spy(rntbdTransportClient); - ReflectionUtils.setTransportClient(storeReader, spyRntbdTransportClient); - ReflectionUtils.setTransportClient(consistencyWriter, spyRntbdTransportClient); - cosmosAsyncContainer = getSharedMultiPartitionCosmosContainer(preferredListClient); + 
CosmosFaultInjectionHelper.configureFaultInjectionRules(cosmosAsyncContainer, Arrays.asList(sessionNotAvailableRule)).block(); - List uris = new ArrayList<>(); - doAnswer((Answer>) invocationOnMock -> { - RxDocumentServiceRequest serviceRequest = invocationOnMock.getArgument(1, - RxDocumentServiceRequest.class); - uris.add(serviceRequest.requestContext.locationEndpointToRoute.toString()); - CosmosException cosmosException = BridgeInternal.createCosmosException(404); - @SuppressWarnings("unchecked") - Map responseHeaders = (Map) FieldUtils.readField(cosmosException, - "responseHeaders", true); - responseHeaders.put(HttpConstants.HttpHeaders.SUB_STATUS, "1002"); - return Mono.error(cosmosException); - }).when(spyRntbdTransportClient).invokeStoreAsync(Mockito.any(Uri.class), - Mockito.any(RxDocumentServiceRequest.class)); try { PartitionKey partitionKey = new PartitionKey("Test"); if (operationType.equals(OperationType.Read)) { @@ -177,51 +170,51 @@ public void sessionNotAvailableRetryMultiMaster(List preferredLocations, fail("Request should fail with 404/1002 error"); } catch (CosmosException ex) { assertThat(ex.getStatusCode()).isEqualTo(HttpConstants.StatusCodes.NOTFOUND); - Iterator regionContactedIterator = ex.getDiagnostics().getContactedRegionNames().iterator(); + assertThat(ex.getSubStatusCode()).isEqualTo(HttpConstants.SubStatusCodes.READ_SESSION_NOT_AVAILABLE); assertThat(ex.getDiagnostics().getContactedRegionNames().size()).isEqualTo(preferredLocations.size()); - for (String regionName : - getAvailableRegionNames(rxDocumentClient, true)) { - assertThat(regionName).isEqualTo(regionContactedIterator.next()); + assertThat(ex.getDiagnostics().getContactedRegionNames().containsAll(preferredLocationsWithLowerCase)).isTrue(); + + // validate the contacted regions follow the preferredRegion sequence + List contactedRegions = new ArrayList<>(); + String previousContactedRegion = StringUtils.EMPTY; + ClientSideRequestStatistics clientSideRequestStatistics = 
BridgeInternal.getClientSideRequestStatics(ex.getDiagnostics()); + for (ClientSideRequestStatistics.StoreResponseStatistics storeResponseStatistics : clientSideRequestStatistics.getResponseStatisticsList()) { + if (!storeResponseStatistics.getRegionName().equalsIgnoreCase(previousContactedRegion)) { + contactedRegions.add(storeResponseStatistics.getRegionName().toLowerCase(Locale.ROOT)); + previousContactedRegion = storeResponseStatistics.getRegionName().toLowerCase(Locale.ROOT); + } } + List expectedContactedRegions = new ArrayList<>(); + expectedContactedRegions.addAll(preferredLocationsWithLowerCase); + // SDK will do one more round retry in first preferred region due to RenameCollectionAwareClientRetryPolicy + expectedContactedRegions.add(preferredLocationsWithLowerCase.get(0)); + assertThat(contactedRegions.size()).isEqualTo(expectedContactedRegions.size()); + assertThat(contactedRegions.containsAll(expectedContactedRegions)).isTrue(); } - - HashSet uniqueHost = new HashSet<>(); - for (String uri : uris) { - uniqueHost.add(uri); - } - // First verify we are retrying in each region - assertThat(uniqueHost.size()).isEqualTo(preferredLocations.size()); - - - // First regional retries in originating region , then retrying per region in clientRetryPolicy and 1 - // retry in the - // last as per RenameCollectionAwareClientRetryPolicy after clearing session token - int numberOfRegionRetried = preferredLocations.size() + 2; - - // Calculating avg number of retries in each region - int averageRetryBySessionRetryPolicyInOneRegion = uris.size() / numberOfRegionRetried; - - int totalRetries = averageRetryBySessionRetryPolicyInOneRegion; - // First regional retries should be in the first preferred region - assertThat(uris.get(totalRetries / 2)).contains(regionalSuffix.get(0)); - - for (int i = 1; i <= preferredLocations.size(); i++) { - // Retrying in each region as per preferred region - assertThat(uris.get(totalRetries + (averageRetryBySessionRetryPolicyInOneRegion) / 
2)).contains(regionalSuffix.get(i % regionalSuffix.size())); - totalRetries = totalRetries + averageRetryBySessionRetryPolicyInOneRegion; - } - - // Last region retries should be in first preferred region - assertThat(uris.get(totalRetries + (averageRetryBySessionRetryPolicyInOneRegion) / 2)).contains(regionalSuffix.get(0)); } finally { + sessionNotAvailableRule.disable(); safeClose(preferredListClient); } } @Test(groups = {"multi-region"}, dataProvider = "preferredRegions", timeOut = TIMEOUT) - public void sessionNotAvailableRetrySingleMaster(List preferredLocations, List regionalSuffix, - OperationType operationType) throws Exception { + public void sessionNotAvailableRetrySingleMaster( + List preferredLocations, + OperationType operationType) { + CosmosAsyncClient preferredListClient = null; + + List preferredLocationsWithLowerCase = + preferredLocations.stream().map(location -> location.toLowerCase(Locale.ROOT)).collect(Collectors.toList()); + // inject 404/1002 into all regions + FaultInjectionRule sessionNotAvailableRule = new FaultInjectionRuleBuilder("sessionNotAvailableRuleSingleMaster-" + UUID.randomUUID()) + .condition(new FaultInjectionConditionBuilder().build()) + .result( + FaultInjectionResultBuilders + .getResultBuilder(FaultInjectionServerErrorType.READ_SESSION_NOT_AVAILABLE) + .build()) + .build(); + try { preferredListClient = new CosmosClientBuilder() .endpoint(TestConfigurations.HOST) @@ -231,42 +224,10 @@ public void sessionNotAvailableRetrySingleMaster(List preferredLocations .preferredRegions(preferredLocations) .buildAsyncClient(); - AsyncDocumentClient asyncDocumentClient = ReflectionUtils.getAsyncDocumentClient(preferredListClient); - RxDocumentClientImpl rxDocumentClient = (RxDocumentClientImpl) asyncDocumentClient; - StoreClient storeClient = ReflectionUtils.getStoreClient(rxDocumentClient); - ReplicatedResourceClient replicatedResourceClient = - ReflectionUtils.getReplicatedResourceClient(storeClient); - ConsistencyReader 
consistencyReader = ReflectionUtils.getConsistencyReader(replicatedResourceClient); - ConsistencyWriter consistencyWriter = ReflectionUtils.getConsistencyWriter(replicatedResourceClient); - StoreReader storeReader = ReflectionUtils.getStoreReader(consistencyReader); - - GlobalEndpointManager globalEndpointManager = ReflectionUtils.getGlobalEndpointManager(rxDocumentClient); - - RntbdTransportClientTest rntbdTransportClient = new RntbdTransportClientTest(globalEndpointManager); - RntbdTransportClientTest spyRntbdTransportClient = Mockito.spy(rntbdTransportClient); - ReflectionUtils.setTransportClient(storeReader, spyRntbdTransportClient); - ReflectionUtils.setTransportClient(consistencyWriter, spyRntbdTransportClient); - cosmosAsyncContainer = getSharedMultiPartitionCosmosContainer(preferredListClient); + CosmosFaultInjectionHelper.configureFaultInjectionRules(cosmosAsyncContainer, Arrays.asList(sessionNotAvailableRule)).block(); PartitionKey partitionKey = new PartitionKey("Test"); - List uris = new ArrayList<>(); - String masterOrHubRegionSuffix = - getRegionalSuffix(databaseAccount.getWritableLocations().iterator().next().getEndpoint(), - TestConfigurations.HOST); - doAnswer((Answer>) invocationOnMock -> { - RxDocumentServiceRequest serviceRequest = invocationOnMock.getArgument(1, - RxDocumentServiceRequest.class); - uris.add(serviceRequest.requestContext.locationEndpointToRoute.toString()); - CosmosException cosmosException = BridgeInternal.createCosmosException(404); - @SuppressWarnings("unchecked") - Map responseHeaders = (Map) FieldUtils.readField(cosmosException, - "responseHeaders", true); - responseHeaders.put(HttpConstants.HttpHeaders.SUB_STATUS, "1002"); - FieldUtils.writeField(cosmosException, "responseHeaders", responseHeaders, true); - return Mono.error(cosmosException); - }).when(spyRntbdTransportClient).invokeStoreAsync(Mockito.any(Uri.class), - Mockito.any(RxDocumentServiceRequest.class)); try { if (operationType.equals(OperationType.Read)) { 
cosmosAsyncContainer.readItem("TestId", partitionKey, TestItem.class).block(); @@ -285,69 +246,48 @@ public void sessionNotAvailableRetrySingleMaster(List preferredLocations fail("Request should fail with 404/1002 error"); } catch (CosmosException ex) { assertThat(ex.getStatusCode()).isEqualTo(HttpConstants.StatusCodes.NOTFOUND); - Iterator regionContactedIterator = ex.getDiagnostics().getContactedRegionNames().iterator(); - if (operationType.isWriteOperation() || regionalSuffix.get(0).equals(masterOrHubRegionSuffix)) { + assertThat(ex.getSubStatusCode()).isEqualTo(HttpConstants.SubStatusCodes.READ_SESSION_NOT_AVAILABLE); + + Map writeRegionMap = this.getRegionMap(databaseAccount, true); + assertThat(writeRegionMap.size()).isEqualTo(1); + + List writeRegionList = + writeRegionMap + .keySet() + .stream() + .map(regionName -> regionName.toLowerCase(Locale.ROOT)) + .collect(Collectors.toList()); + + // for single master, when retrying 404/1002, it will retry on the write region + // so for write operation or if the first preferred region is the same as write region, the contracted region count should 1 + if (operationType.isWriteOperation() + || preferredLocationsWithLowerCase.get(0).equalsIgnoreCase(writeRegionList.get(0))) { assertThat(ex.getDiagnostics().getContactedRegionNames().size()).isEqualTo(1); - for (String regionName : - getAvailableRegionNames(rxDocumentClient, true)) { - assertThat(regionName.toLowerCase()).isEqualTo(regionContactedIterator.next()); - } } else { - assertThat(ex.getDiagnostics().getContactedRegionNames().size()).isEqualTo(preferredLocations.size()); - for (String regionName : - getAvailableRegionNames(rxDocumentClient, false)) { - assertThat(regionName).isEqualTo(regionContactedIterator.next()); + assertThat(ex.getDiagnostics().getContactedRegionNames().size()).isEqualTo(2); + + // validate the contacted region sequence + List contactedRegions = new ArrayList<>(); + String previousContactedRegion = StringUtils.EMPTY; + 
ClientSideRequestStatistics clientSideRequestStatistics = BridgeInternal.getClientSideRequestStatics(ex.getDiagnostics()); + for (ClientSideRequestStatistics.StoreResponseStatistics storeResponseStatistics : clientSideRequestStatistics.getResponseStatisticsList()) { + if (!storeResponseStatistics.getRegionName().equalsIgnoreCase(previousContactedRegion)) { + contactedRegions.add(storeResponseStatistics.getRegionName().toLowerCase(Locale.ROOT)); + previousContactedRegion = storeResponseStatistics.getRegionName().toLowerCase(Locale.ROOT); + } } - } - } - - HashSet uniqueHost = new HashSet<>(); - for (String uri : uris) { - uniqueHost.add(uri); - } - - // First regional retries in originating region, then retrying in master/hub region and 1 retry at the - // last from - // RenameCollectionAwareClientRetryPolicy after clearing session token - int numberOfRegionRetried = 3; - - // Calculating approx avg number of retries in each region - int averageRetryBySessionRetryPolicyInOneRegion = uris.size() / numberOfRegionRetried; - - int totalRetries = averageRetryBySessionRetryPolicyInOneRegion; - - if (operationType.equals(OperationType.Create)) { - assertThat(uniqueHost.size()).isEqualTo(1); // always goes to master region - - //First region retries should be in masterOrHubRegionSuffix - assertThat(uris.get(totalRetries / 2)).contains(masterOrHubRegionSuffix); - - // Second region retries should be in masterOrHubRegionSuffix - assertThat(uris.get(totalRetries + (averageRetryBySessionRetryPolicyInOneRegion) / 2)).contains(masterOrHubRegionSuffix); - totalRetries = totalRetries + averageRetryBySessionRetryPolicyInOneRegion; - //Last region retries should be in masterOrHubRegionSuffix - assertThat(uris.get(totalRetries + (averageRetryBySessionRetryPolicyInOneRegion) / 2)).contains(masterOrHubRegionSuffix); - } else { - if (regionalSuffix.get(0).equals(masterOrHubRegionSuffix)) { - //Verify we are retrying only in master region - assertThat(uniqueHost.size()).isEqualTo(1); - } 
else { - //Verify we are retrying in first preferred region and master region - assertThat(uniqueHost.size()).isEqualTo(2); + List expectedContactedRegions = new ArrayList<>(); + expectedContactedRegions.add(preferredLocationsWithLowerCase.get(0)); + expectedContactedRegions.addAll(writeRegionList); + // SDK will do one more round retry in first preferred region due to RenameCollectionAwareClientRetryPolicy + expectedContactedRegions.add(preferredLocationsWithLowerCase.get(0)); + assertThat(contactedRegions.size()).isEqualTo(expectedContactedRegions.size()); + assertThat(contactedRegions.containsAll(expectedContactedRegions)).isTrue(); } - - //First region retries should be in first preferred region - assertThat(uris.get(totalRetries / 2)).contains(regionalSuffix.get(0)); - - // Second region retries should be in masterOrHubRegion - assertThat(uris.get(totalRetries + (averageRetryBySessionRetryPolicyInOneRegion) / 2)).contains(masterOrHubRegionSuffix); - totalRetries = totalRetries + averageRetryBySessionRetryPolicyInOneRegion; - - //Last region retries should be in first preferred region - assertThat(uris.get(totalRetries + (averageRetryBySessionRetryPolicyInOneRegion) / 2)).contains(regionalSuffix.get(0)); } } finally { + sessionNotAvailableRule.disable(); safeClose(preferredListClient); } } @@ -456,28 +396,17 @@ public void sessionNotAvailableRetryWithoutPreferredList(OperationType operation } } - private String getRegionalSuffix(String str1, String str2) { - int initialIndex = findInitialIndex(str1, str2); - int indexFromLast = findIndexFromLast(str1, str2); - return str1.substring(initialIndex + 1, str1.length() - indexFromLast); - } + private Map getRegionMap(DatabaseAccount databaseAccount, boolean writeOnly) { + Iterator locationIterator = + writeOnly ? 
databaseAccount.getWritableLocations().iterator() : databaseAccount.getReadableLocations().iterator(); + Map regionMap = new ConcurrentHashMap<>(); - private int findInitialIndex(String str1, String str2) { - int counter = 0; - while (str1.charAt(counter) == str2.charAt(counter)) { - counter++; + while (locationIterator.hasNext()) { + DatabaseAccountLocation accountLocation = locationIterator.next(); + regionMap.put(accountLocation.getName(), accountLocation.getEndpoint()); } - return counter; - } - private int findIndexFromLast(String str1, String str2) { - int length1 = str1.length(); - int length2 = str2.length(); - int counter = 0; - while (str1.charAt(length1 - 1 - counter) == str2.charAt(length2 - 1 - counter)) { - counter++; - } - return counter; + return regionMap; } private Set getAvailableRegionNames(RxDocumentClientImpl rxDocumentClient, boolean isWriteRegion) throws Exception { diff --git a/sdk/cosmos/azure-cosmos/CHANGELOG.md b/sdk/cosmos/azure-cosmos/CHANGELOG.md index cff27dcc4e34f..506c3e2477c03 100644 --- a/sdk/cosmos/azure-cosmos/CHANGELOG.md +++ b/sdk/cosmos/azure-cosmos/CHANGELOG.md @@ -9,6 +9,10 @@ #### Bugs Fixed * Fixed an issue where `emptyPageDiagnosticsEnabled` in `CosmosQueryRequestOptions` was being overridden. 
This caused empty page diagnostics to be logged (with INFO level) even when the flag was set to false - See [PR 37199](https://github.com/Azure/azure-sdk-for-java/pull/37199) +* Fixed an issue where the SDK marked a region as unavailable on HTTP timeout - See [PR 37163](https://github.com/Azure/azure-sdk-for-java/pull/37163) +* Fixed an issue where the SDK used an `A, B, C, A` retry pattern for `404/1002` - See [PR 37040](https://github.com/Azure/azure-sdk-for-java/pull/37040) +* Fixed an issue where the SDK retried too aggressively on `449` - See [PR 37040](https://github.com/Azure/azure-sdk-for-java/pull/37040) +* Fixed an issue where the SDK skipped cross-region retries for server-generated `410` on write operations - See [PR 37040](https://github.com/Azure/azure-sdk-for-java/pull/37040) #### Other Changes @@ -23,7 +27,6 @@ * Fixed an issue where `sampleDiagnostics` is not being honored for `query. See [PR 37015](https://github.com/Azure/azure-sdk-for-java/pull/37015) * Fixed the issue of `excludeRegions` not being honored for `CosmosBulkExecutionOptions`.
- See[PR 36616](https://github.com/Azure/azure-sdk-for-java/pull/36616) * Fixed an issue with missing diagnostics (metrics, logging) for `Cosmos(Async)Container.readMany` calls - See [PR 37009](https://github.com/Azure/azure-sdk-for-java/pull/37009) -* Fixed an issue where SDK mark region unavailable on http timeout - See [PR 37163](https://github.com/Azure/azure-sdk-for-java/pull/37163) ### 4.50.0 (2023-09-25) diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/ClientRetryPolicy.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/ClientRetryPolicy.java index b370bd1bb0995..cd540f2fe5804 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/ClientRetryPolicy.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/ClientRetryPolicy.java @@ -157,7 +157,8 @@ public Mono shouldRetry(Exception e) { return this.shouldRetryOnBackendServiceUnavailableAsync( this.isReadRequest, isWebExceptionRetriable, - this.request.getNonIdempotentWriteRetriesEnabled()); + this.request.getNonIdempotentWriteRetriesEnabled(), + clientException); } return this.throttlingRetry.shouldRetry(e); @@ -199,7 +200,7 @@ private ShouldRetryResult shouldRetryOnSessionNotAvailable(RxDocumentServiceRequ this.isReadRequest ? 
this.globalEndpointManager.getApplicableReadEndpoints(request) : this.globalEndpointManager.getApplicableWriteEndpoints(request); - if (this.sessionTokenRetryCount > endpoints.size()) { + if (this.sessionTokenRetryCount >= endpoints.size()) { // When use multiple write locations is true and the request has been tried // on all locations, then don't retry the request return ShouldRetryResult.noRetry(); @@ -313,11 +314,31 @@ private Mono refreshLocation(boolean isReadRequest, boolean forceRefresh, private Mono shouldRetryOnBackendServiceUnavailableAsync( boolean isReadRequest, boolean isWebExceptionRetriable, - boolean nonIdempotentWriteRetriesEnabled) { - - if (!isReadRequest && !nonIdempotentWriteRetriesEnabled && !isWebExceptionRetriable) { + boolean nonIdempotentWriteRetriesEnabled, + CosmosException cosmosException) { + + // The request has failed with 503, SDK need to decide whether it is safe to retry for write operations + // For server generated retries, it is safe to retry + // For SDK generated 503, it will be more tricky as we have to decide the cause of it. For any causes that SDK not sure whether the request + // has reached/processed from server side, unless customer has specifically opted in for nonIdempotentWriteRetries, SDK should not retry. + // When SDK would generate 503: + // - When server return 410, SDK may internally retry multiple times, when all the retries exhausted, SDK will bubble up 503 with corresponding subStatusCode + // (Note: currently, subStatus code for read may get lost during the conversion, but for writes, the subStatus code will be reserved) + // - when SDK generated 410 due to different reason (like connectionTimeout, transient timeout etc), SDK will internally retry multiple times + // when all the retries exhausted, SDK will bubble up 503 + // + // Fow now, without nonIdempotentWriteRetries being enabled, SDK will only retry for the following situation: + // 1. 
For any connection related errors, it will be covered under isWebExceptionRetriable -> which SDK will retry + // 2. For any server returned 503s, SDK will retry + // 3. For SDK generated 503, SDK will only retry if the subStatusCode is SERVER_GENERATED_410 + if (!isReadRequest + && !shouldRetryWriteOnServiceUnavailable( + nonIdempotentWriteRetriesEnabled, + isWebExceptionRetriable, + cosmosException)) { logger.warn( - "shouldRetryOnBackendServiceUnavailableAsync() Not retrying on write with non retriable exception. Retry count = {}", + "shouldRetryOnBackendServiceUnavailableAsync() Not retrying" + + " on write with non retriable exception and non server returned service unavailable. Retry count = {}", this.serviceUnavailableRetryCount); return Mono.just(ShouldRetryResult.noRetry()); } @@ -388,6 +409,24 @@ CosmosDiagnostics getCosmosDiagnostics() { return cosmosDiagnostics; } + private boolean shouldRetryWriteOnServiceUnavailable( + boolean nonIdempotentWriteRetriesEnabled, + boolean isWebExceptionRetriable, + CosmosException cosmosException) { + + if (nonIdempotentWriteRetriesEnabled || isWebExceptionRetriable) { + return true; + } + + if (cosmosException instanceof ServiceUnavailableException) { + ServiceUnavailableException serviceUnavailableException = (ServiceUnavailableException) cosmosException; + return serviceUnavailableException.getSubStatusCode() == HttpConstants.SubStatusCodes.SERVER_GENERATED_503 + || serviceUnavailableException.getSubStatusCode() == HttpConstants.SubStatusCodes.SERVER_GENERATED_410; + } + + return false; + } + private static class RetryContext { public int retryCount; diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/Configs.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/Configs.java index 9e1408314466e..4bf9e60342c24 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/Configs.java +++ 
b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/Configs.java @@ -100,15 +100,15 @@ public class Configs { private static final int DEFAULT_ADDRESS_REFRESH_RESPONSE_TIMEOUT_IN_SECONDS = 5; // SessionTokenMismatchRetryPolicy Constants - private static final String DEFAULT_SESSION_TOKEN_MISMATCH_WAIT_TIME_IN_MILLISECONDS_NAME = + public static final String DEFAULT_SESSION_TOKEN_MISMATCH_WAIT_TIME_IN_MILLISECONDS_NAME = "COSMOS.DEFAULT_SESSION_TOKEN_MISMATCH_WAIT_TIME_IN_MILLISECONDS"; private static final int DEFAULT_SESSION_TOKEN_MISMATCH_WAIT_TIME_IN_MILLISECONDS = 5000; - private static final String DEFAULT_SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_TIME_IN_MILLISECONDS_NAME = + public static final String DEFAULT_SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_TIME_IN_MILLISECONDS_NAME = "COSMOS.DEFAULT_SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_TIME_IN_MILLISECONDS"; private static final int DEFAULT_SESSION_TOKEN_MISMATCH_INITIAL_BACKOFF_TIME_IN_MILLISECONDS = 5; - private static final String DEFAULT_SESSION_TOKEN_MISMATCH_MAXIMUM_BACKOFF_TIME_IN_MILLISECONDS_NAME = + public static final String DEFAULT_SESSION_TOKEN_MISMATCH_MAXIMUM_BACKOFF_TIME_IN_MILLISECONDS_NAME = "COSMOS.DEFAULT_SESSION_TOKEN_MISMATCH_MAXIMUM_BACKOFF_TIME_IN_MILLISECONDS"; private static final int DEFAULT_SESSION_TOKEN_MISMATCH_MAXIMUM_BACKOFF_TIME_IN_MILLISECONDS = 500; @@ -147,7 +147,7 @@ public class Configs { private static final String OPEN_CONNECTIONS_CONCURRENCY = "COSMOS.OPEN_CONNECTIONS_CONCURRENCY"; private static final int DEFAULT_OPEN_CONNECTIONS_CONCURRENCY = 1; - private static final String MAX_RETRIES_IN_LOCAL_REGION_WHEN_REMOTE_REGION_PREFERRED = "COSMOS.MAX_RETRIES_IN_LOCAL_REGION_WHEN_REMOTE_REGION_PREFERRED"; + public static final String MAX_RETRIES_IN_LOCAL_REGION_WHEN_REMOTE_REGION_PREFERRED = "COSMOS.MAX_RETRIES_IN_LOCAL_REGION_WHEN_REMOTE_REGION_PREFERRED"; private static final int DEFAULT_MAX_RETRIES_IN_LOCAL_REGION_WHEN_REMOTE_REGION_PREFERRED = 1; public 
static final int MIN_MAX_RETRIES_IN_LOCAL_REGION_WHEN_REMOTE_REGION_PREFERRED = 1; diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/directconnectivity/GoneAndRetryWithRetryPolicy.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/directconnectivity/GoneAndRetryWithRetryPolicy.java index 63f33de0f698b..b6ba56956913f 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/directconnectivity/GoneAndRetryWithRetryPolicy.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/directconnectivity/GoneAndRetryWithRetryPolicy.java @@ -360,7 +360,15 @@ public Mono shouldRetry(Exception exception) { Math.min( Math.min(this.currentBackoffMilliseconds.get() + random.nextInt(RANDOM_SALT_IN_MS), remainingMilliseconds), RetryWithRetryPolicy.MAXIMUM_BACKOFF_TIME_IN_MS)); - this.currentBackoffMilliseconds.accumulateAndGet(RetryWithRetryPolicy.BACK_OFF_MULTIPLIER, (left, right) -> left * right); + + this.currentBackoffMilliseconds.set( + Math.max( + RetryWithRetryPolicy.INITIAL_BACKOFF_TIME_MS, + Math.min( + RetryWithRetryPolicy.MAXIMUM_BACKOFF_TIME_IN_MS, + this.currentBackoffMilliseconds.get() * RetryWithRetryPolicy.BACK_OFF_MULTIPLIER)) + ); + logger.debug("BackoffTime: {} ms.", backoffTime.toMillis()); // Calculate the remaining time based after accounting for the backoff that we