Skip to content

Commit

Permalink
Fix test not-retrying failures
Browse files Browse the repository at this point in the history
  • Loading branch information
drwill-ms committed Oct 20, 2022
1 parent 8657ba9 commit cce60e5
Show file tree
Hide file tree
Showing 8 changed files with 159 additions and 148 deletions.
57 changes: 57 additions & 0 deletions e2e/test/helpers/HubServiceTestRetryPolicy.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

using System;
using System.Collections.Generic;
using Microsoft.Azure.Devices.Client;

namespace Microsoft.Azure.Devices.E2ETests.Helpers
{
internal class HubServiceTestRetryPolicy : IRetryPolicy
{
private readonly HashSet<IotHubServiceErrorCode> _iotHubServiceErrorCodes;
private const int MaxRetries = 20;
private static readonly TimeSpan s_retryDelay = TimeSpan.FromSeconds(1);
private static readonly TimeSpan s_maxDelay = TimeSpan.FromSeconds(3);

public HubServiceTestRetryPolicy(HashSet<IotHubServiceErrorCode> iotHubServiceErrorCodes = default)
{
_iotHubServiceErrorCodes = iotHubServiceErrorCodes ?? new HashSet<IotHubServiceErrorCode>();
}

public bool ShouldRetry(uint currentRetryCount, Exception lastException, out TimeSpan retryInterval)
{
retryInterval = TimeSpan.Zero;

if (currentRetryCount < MaxRetries)
{
VerboseTestLogger.WriteLine($"{nameof(HubServiceTestRetryPolicy)}: Exhausted {currentRetryCount}/{MaxRetries} retries and failing due to {lastException}");
return false;
}

if (lastException is not IotHubServiceException)
{
VerboseTestLogger.WriteLine($"{nameof(HubServiceTestRetryPolicy)}: Unretriable exception encountered: {lastException}");
return false;
}

var hubEx = (IotHubServiceException)lastException;

if (hubEx.IsTransient
|| _iotHubServiceErrorCodes.Contains(hubEx.ErrorCode))
{
VerboseTestLogger.WriteLine($"{nameof(HubServiceTestRetryPolicy)}: retrying due to transient {hubEx.IsTransient} or error code {hubEx.ErrorCode}.");
double waitDurationMs = Math.Min(
currentRetryCount * s_retryDelay.TotalMilliseconds,
s_maxDelay.TotalMilliseconds);

retryInterval = TimeSpan.FromMilliseconds(waitDurationMs);

return true;
}

VerboseTestLogger.WriteLine($"{nameof(HubServiceTestRetryPolicy)}: not retrying due to transient {hubEx.IsTransient} or error code {hubEx.ErrorCode}.");
return false;
}
}
}
50 changes: 30 additions & 20 deletions e2e/test/helpers/ProvisioningServiceRetryPolicy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,39 +11,49 @@ namespace Microsoft.Azure.Devices.E2ETests.Helpers
public class ProvisioningServiceRetryPolicy : IRetryPolicy
{
private const string RetryAfterKey = "Retry-After";
private const uint MaxRetryCount = 5;
private const uint MaxRetryCount = 20;
private static readonly TimeSpan s_retryDelay = TimeSpan.FromSeconds(1);
private static readonly TimeSpan s_maxDelay = TimeSpan.FromSeconds(3);

private static readonly TimeSpan s_defaultRetryInterval = TimeSpan.FromSeconds(5);

private static readonly IRetryPolicy s_retryPolicy = new IncrementalDelayRetryPolicy(MaxRetryCount, TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(3));
private static readonly TimeSpan s_defaultRetryInterval = TimeSpan.FromSeconds(3);

public bool ShouldRetry(uint currentRetryCount, Exception lastException, out TimeSpan retryInterval)
{
retryInterval = TimeSpan.Zero;

var provisioningException = lastException as DeviceProvisioningServiceException;

if (provisioningException == null || currentRetryCount > MaxRetryCount)
if (currentRetryCount > MaxRetryCount
|| lastException is not DeviceProvisioningServiceException)
{
return false;
}
else if ((int)provisioningException.StatusCode == 429) // HttpStatusCode.TooManyRequests is not available in net472
{
IDictionary<string, string> httpHeaders = provisioningException.Fields;
bool retryAfterPresent = httpHeaders.TryGetValue(RetryAfterKey, out string retryAfter);

retryInterval = retryAfterPresent
? TimeSpan.FromSeconds(Convert.ToDouble(retryAfter))
: s_defaultRetryInterval;

return true;
}
else if ((int)provisioningException.StatusCode > 500 && (int)provisioningException.StatusCode < 600)
if (lastException is DeviceProvisioningServiceException provisioningException)
{
return s_retryPolicy.ShouldRetry(currentRetryCount, lastException, out retryInterval);
if (!provisioningException.IsTransient)
{
return false;
}

if ((int)provisioningException.StatusCode == 429) // HttpStatusCode.TooManyRequests is not available in net472
{
IDictionary<string, string> httpHeaders = provisioningException.Fields;
bool retryAfterPresent = httpHeaders.TryGetValue(RetryAfterKey, out string retryAfter);

retryInterval = retryAfterPresent
? TimeSpan.FromSeconds(Convert.ToDouble(retryAfter))
: s_defaultRetryInterval;

return true;
}
}

return false;
double waitDurationMs = Math.Min(
currentRetryCount * s_retryDelay.TotalMilliseconds,
s_maxDelay.TotalMilliseconds);

retryInterval = TimeSpan.FromMilliseconds(waitDurationMs);

return true;
}
}
}
76 changes: 11 additions & 65 deletions e2e/test/helpers/RetryOperationHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Azure.Devices.Client;
Expand All @@ -15,97 +13,45 @@ namespace Microsoft.Azure.Devices.E2ETests.Helpers
/// </summary>
public class RetryOperationHelper
{
// A conservative, basic retry policy that should be fine for most scenarios.
public static readonly IRetryPolicy DefaultRetryPolicy = new IncrementalDelayRetryPolicy(20, TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(3));

/// <summary>
/// Retry an async operation based on the retry strategy supplied.
/// </summary>
/// <remarks>
/// This is for E2E tests of provisioning service clients .
/// This is for E2E tests of provisioning service clients.
/// </remarks>
/// <param name="asyncOperation">The async operation to be retried.</param>
/// <param name="retryPolicy">The retry policy to be applied.</param>
/// <param name="retryableExceptions">The exceptions to be retried on.</param>
/// <param name="logger">The <see cref="MsTestLogger"/> instance to be used.</param>
/// <param name="cancellationToken">The cancellation token to cancel the operation.</param>
public static async Task RetryOperationsAsync(
///
public static async Task RunWithRetryAsync(
Func<Task> asyncOperation,
IRetryPolicy retryPolicy,
HashSet<Type> retryableExceptions,
CancellationToken cancellationToken = default)
{
uint counter = 0;
bool shouldRetry;
do

while (true)
{
TimeSpan retryInterval;
try
{
counter++;
await asyncOperation().ConfigureAwait(false);
break;
return;
}
catch (Exception ex) when (retryableExceptions.Any(e => e.IsInstanceOfType(ex)))
catch (Exception ex) when (!retryPolicy.ShouldRetry(counter, ex, out retryInterval))
{
shouldRetry = retryPolicy.ShouldRetry(++counter, ex, out retryInterval);
VerboseTestLogger.WriteLine($"Attempt {counter}: operation did not succeed: {ex}");

if (!shouldRetry)
{
VerboseTestLogger.WriteLine($"Encountered an exception that will not be retried - attempt: {counter}; exception: {ex}");
throw;
}
VerboseTestLogger.WriteLine($"Attempt {counter}: operation did not succeed due to: {ex}");
}

VerboseTestLogger.WriteLine($"Will retry operation in {retryInterval}.");
await Task.Delay(retryInterval, cancellationToken);
}
while (shouldRetry && !cancellationToken.IsCancellationRequested);
}

/// <summary>
/// Retry an async operation based on the retry strategy supplied.
/// </summary>
/// <remarks>
/// This is for E2E tests of hub service clients.
/// </remarks>
/// <param name="asyncOperation">The async operation to be retried.</param>
/// <param name="retryPolicy">The retry policy to be applied.</param>
/// <param name="retryableStatusCodes">The errors to be retried on.</param>
/// <param name="logger">The <see cref="MsTestLogger"/> instance to be used.</param>
/// <param name="cancellationToken">The cancellation token to cancel the operation.</param>
public static async Task RetryOperationsAsync(
Func<Task> asyncOperation,
IRetryPolicy retryPolicy,
HashSet<IotHubServiceErrorCode> retryableStatusCodes,
CancellationToken cancellationToken = default)
{
uint counter = 0;
bool shouldRetry;
do
{
TimeSpan retryInterval;
try
if (retryInterval <= TimeSpan.Zero)
{
await asyncOperation().ConfigureAwait(false);
break;
}
catch (Exception ex) when (ex is IotHubServiceException serviceEx && retryableStatusCodes.Contains(serviceEx.ErrorCode))
{
shouldRetry = retryPolicy.ShouldRetry(++counter, ex, out retryInterval);
VerboseTestLogger.WriteLine($"Attempt {counter}: operation did not succeed: {ex}");

if (!shouldRetry)
{
VerboseTestLogger.WriteLine($"Encountered an exception that will not be retried - attempt: {counter}; exception: {ex}");
throw;
}
retryInterval = TimeSpan.FromSeconds(1);
}

VerboseTestLogger.WriteLine($"Will retry operation in {retryInterval}.");
await Task.Delay(retryInterval, cancellationToken);
}
while (shouldRetry && !cancellationToken.IsCancellationRequested);
}
}
}
28 changes: 12 additions & 16 deletions e2e/test/helpers/TestDevice.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

using System;
using System.Collections.Generic;
using System.Net;
using System.Security.Cryptography.X509Certificates;
using System.Threading;
using System.Threading.Tasks;
Expand All @@ -26,16 +25,16 @@ public enum ConnectionStringAuthScope

public class TestDevice : IDisposable
{
private const int MaxRetryCount = 20;
private static readonly HashSet<IotHubServiceErrorCode> s_throttlingStatusCodes = new() { IotHubServiceErrorCode.ThrottlingException };
private static readonly HashSet<IotHubServiceErrorCode> s_retryableStatusCodes = new(s_throttlingStatusCodes)
private static readonly SemaphoreSlim s_semaphore = new(1, 1);

private static readonly IRetryPolicy s_createRetryPolicy = new HubServiceTestRetryPolicy();

private static readonly HashSet<IotHubServiceErrorCode> s_getRetryableStatusCodes = new()
{
IotHubServiceErrorCode.DeviceNotFound,
IotHubServiceErrorCode.ModuleNotFound,
};
private static readonly SemaphoreSlim s_semaphore = new(1, 1);

private static readonly IRetryPolicy s_retryPolicy = new IncrementalDelayRetryPolicy(MaxRetryCount, TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(3));
private static readonly IRetryPolicy s_getRetryPolicy = new HubServiceTestRetryPolicy(s_getRetryableStatusCodes);

private X509Certificate2 _authCertificate;

Expand Down Expand Up @@ -100,25 +99,23 @@ private static async Task<TestDevice> CreateDeviceAsync(TestDeviceType type, str
Device device = null;

await RetryOperationHelper
.RetryOperationsAsync(
.RunWithRetryAsync(
async () =>
{
device = await serviceClient.Devices.CreateAsync(requestDevice).ConfigureAwait(false);
},
s_retryPolicy,
s_throttlingStatusCodes,
s_createRetryPolicy,
CancellationToken.None)
.ConfigureAwait(false);

// Confirm the device exists in the registry before calling it good to avoid downstream test failures.
await RetryOperationHelper
.RetryOperationsAsync(
.RunWithRetryAsync(
async () =>
{
await serviceClient.Devices.GetAsync(requestDevice.Id).ConfigureAwait(false);
},
s_retryPolicy,
s_retryableStatusCodes,
s_getRetryPolicy,
CancellationToken.None)
.ConfigureAwait(false);

Expand Down Expand Up @@ -190,13 +187,12 @@ public async Task RemoveDeviceAsync()
using var serviceClient = new IotHubServiceClient(TestConfiguration.IotHub.ConnectionString);

await RetryOperationHelper
.RetryOperationsAsync(
.RunWithRetryAsync(
async () =>
{
await serviceClient.Devices.DeleteAsync(Id).ConfigureAwait(false);
},
s_retryPolicy,
s_throttlingStatusCodes,
s_getRetryPolicy,
CancellationToken.None)
.ConfigureAwait(false);
}
Expand Down
25 changes: 14 additions & 11 deletions e2e/test/iothub/service/RegistryE2ETests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
using System.Threading.Tasks;
using Azure;
using FluentAssertions;
using Microsoft.Azure.Devices.Client;
using Microsoft.Azure.Devices.E2ETests.helpers;
using Microsoft.Azure.Devices.E2ETests.Helpers;
using Microsoft.VisualStudio.TestTools.UnitTesting;

Expand All @@ -24,8 +26,12 @@ public class RegistryE2ETests : E2EMsTestBase
{
private readonly string _idPrefix = $"{nameof(RegistryE2ETests)}_";

// In particular, this should retry on "module not registered on this device" errors
private static readonly HashSet<Type> s_retryableExceptions = new() { typeof(IotHubServiceException) };
private static readonly HashSet<IotHubServiceErrorCode> s_getRetryableStatusCodes = new()
{
IotHubServiceErrorCode.DeviceNotFound,
IotHubServiceErrorCode.ModuleNotFound,
};
private static readonly IRetryPolicy s_retryPolicy = new HubServiceTestRetryPolicy(s_getRetryableStatusCodes);

[TestMethod]
[Timeout(TestTimeoutMilliseconds)]
Expand Down Expand Up @@ -432,13 +438,12 @@ public async Task ModulesClient_IdentityLifecycle()
Module retrievedModule = null;

await RetryOperationHelper
.RetryOperationsAsync(
.RunWithRetryAsync(
async () =>
{
retrievedModule = await serviceClient.Modules.GetAsync(testDeviceId, testModuleId).ConfigureAwait(false);
},
RetryOperationHelper.DefaultRetryPolicy,
s_retryableExceptions)
s_retryPolicy)
.ConfigureAwait(false);

retrievedModule.Should().NotBeNull($"When checking for ETag, got null back for GET on module '{testDeviceId}/{testModuleId}'.");
Expand Down Expand Up @@ -634,13 +639,12 @@ public async Task ModulesClient_SetModulesETag_Works()
module = await serviceClient.Modules.CreateAsync(module).ConfigureAwait(false);

await RetryOperationHelper
.RetryOperationsAsync(
.RunWithRetryAsync(
async () =>
{
module = await serviceClient.Modules.GetAsync(deviceId, moduleId).ConfigureAwait(false);
},
RetryOperationHelper.DefaultRetryPolicy,
s_retryableExceptions)
s_retryPolicy)
.ConfigureAwait(false);

try
Expand Down Expand Up @@ -703,13 +707,12 @@ public async Task ModulesClient_DeleteModulesETag_Works()
module = await serviceClient.Modules.CreateAsync(module).ConfigureAwait(false);

await RetryOperationHelper
.RetryOperationsAsync(
.RunWithRetryAsync(
async () =>
{
module = await serviceClient.Modules.GetAsync(deviceId, moduleId).ConfigureAwait(false);
},
RetryOperationHelper.DefaultRetryPolicy,
s_retryableExceptions)
s_retryPolicy)
.ConfigureAwait(false);

try
Expand Down

0 comments on commit cce60e5

Please sign in to comment.