Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 136 additions & 4 deletions e2e/config/azure.go
Original file line number Diff line number Diff line change
Expand Up @@ -564,8 +564,13 @@ func (a *AzureClient) ensureReplication(ctx context.Context, image *Image, versi
}

if replicatedToCurrentRegion(version, location) {
toolkit.Logf(ctx, "Image version %s is already in region %s", *version.ID, location)
return nil
// Intent-to-replicate is registered in PublishingProfile.TargetRegions, but that
// does NOT mean the regional replica is actually serving traffic. The regional
// replication state can still be "Replicating" while the parent ProvisioningState
// is "Succeeded" — this is the source of GalleryImageNotFound 404s on VMSS create.
// Wait for the regional state itself before declaring success.
toolkit.Logf(ctx, "Image version %s is already a target of region %s; verifying regional replication state", *version.ID, location)
return a.waitForRegionalReplicationCompleted(ctx, image, version, location)
}
regions := make([]string, 0, len(version.Properties.PublishingProfile.TargetRegions))
for _, targetRegion := range version.Properties.PublishingProfile.TargetRegions {
Expand All @@ -575,12 +580,114 @@ func (a *AzureClient) ensureReplication(ctx context.Context, image *Image, versi
toolkit.Logf(ctx, "##vso[task.logissue type=warning;]Replicating to region %s", location)

start := time.Now() // Record the start time
err := a.replicateImageVersionToCurrentRegion(ctx, image, version, location)
if err := a.replicateImageVersionToCurrentRegion(ctx, image, version, location); err != nil {
return err
}

// The replicate LRO above completes when the parent resource is Succeeded, but the
// regional replica may still be in "Replicating" state for several more minutes.
// Block until that regional state hits Completed; otherwise downstream VMSS create
// will see GalleryImageNotFound and the test will appear to fail for a non-test reason.
if err := a.waitForRegionalReplicationCompleted(ctx, image, version, location); err != nil {
return err
}
elapsed := time.Since(start) // Calculate the elapsed time

toolkit.LogDuration(ctx, elapsed, 3*time.Minute, fmt.Sprintf("Replication took: %s (%s)", elapsed, *version.ID))

return err
return nil
}

// waitForRegionalReplicationCompleted polls the gallery image version with the
// ReplicationStatus expand option until the named region's RegionalReplicationStatus.State
// is Completed. Returns an error if the regional replication enters a terminal Failed state
// or the wait times out. Treats Unknown as transient (Azure occasionally emits Unknown
// briefly during state transitions).
//
// This closes a documented gap in the SIG API: the parent provisioning LRO can succeed
// before per-region replicas are actually serving traffic. Without this wait, callers see
// GalleryImageNotFound 404s on VMSS creation that look like bugs but are infra eventual
// consistency.
func (a *AzureClient) waitForRegionalReplicationCompleted(ctx context.Context, image *Image, version *armcompute.GalleryImageVersion, location string) error {
imgVersionClient, err := armcompute.NewGalleryImageVersionsClient(image.Gallery.SubscriptionID, a.Credential, a.ArmOptions)
if err != nil {
return fmt.Errorf("create image version client for replication wait: %w", err)
}

getOpts := &armcompute.GalleryImageVersionsClientGetOptions{
Expand: to.Ptr(armcompute.ReplicationStatusTypesReplicationStatus),
}

var lastLoggedState armcompute.ReplicationState
const (
pollInterval = 15 * time.Second
pollTimeout = 20 * time.Minute
)

pollErr := wait.PollUntilContextTimeout(ctx, pollInterval, pollTimeout, true, func(ctx context.Context) (bool, error) {
resp, err := imgVersionClient.Get(ctx, image.Gallery.ResourceGroupName, image.Gallery.Name, image.Name, *version.Name, getOpts)
if err != nil {
// Transient API errors should not abort the wait; the SDK already retries throttling.
toolkit.Logf(ctx, "transient error getting image version replication status (will retry): %v", err)
return false, nil
}
Comment on lines +628 to +633
Copy link

Copilot AI Apr 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

waitForRegionalReplicationCompleted currently treats any Get error as transient and keeps polling. This will mask permanent failures (e.g., 401/403 auth/RBAC issues, 404 if the version was deleted, malformed request) and can turn them into a 20-minute timeout with a generic error. Suggest inspecting err as *azcore.ResponseError and failing fast for non-retryable status codes, only continuing on clearly transient cases (e.g., 429/5xx/timeouts).

Copilot uses AI. Check for mistakes.

regional := findRegionalReplicationStatus(resp.GalleryImageVersion.Properties.ReplicationStatus, location)
if regional == nil || regional.State == nil {
// Region not yet present in the status summary; keep waiting.
return false, nil
}

state := *regional.State
if state != lastLoggedState {
progress := int32(0)
if regional.Progress != nil {
progress = *regional.Progress
}
toolkit.Logf(ctx, "Image version %s regional replication state in %s: %s (progress: %d%%)", *version.ID, location, state, progress)
lastLoggedState = state
}

switch state {
case armcompute.ReplicationStateCompleted:
return true, nil
case armcompute.ReplicationStateFailed:
details := ""
if regional.Details != nil {
details = *regional.Details
}
return false, fmt.Errorf("regional replication of %s to %s failed: %s", *version.ID, location, details)
case armcompute.ReplicationStateReplicating, armcompute.ReplicationStateUnknown:
return false, nil
default:
// Forward-compat: unknown future states treated as in-progress.
return false, nil
}
})

if pollErr != nil {
return fmt.Errorf("waiting for regional replication of %s to %s to complete: %w", *version.ID, location, pollErr)
}
toolkit.Logf(ctx, "Image version %s regional replication to %s confirmed Completed", *version.ID, location)
return nil
}

// findRegionalReplicationStatus finds the RegionalReplicationStatus for the given location
// (case-insensitive, ignoring spaces) within the version-wide ReplicationStatus.
func findRegionalReplicationStatus(status *armcompute.ReplicationStatus, location string) *armcompute.RegionalReplicationStatus {
if status == nil {
return nil
}
normalized := strings.ToLower(strings.ReplaceAll(location, " ", ""))
for _, regional := range status.Summary {
if regional == nil || regional.Region == nil {
continue
}
if strings.ToLower(strings.ReplaceAll(*regional.Region, " ", "")) == normalized {
return regional
}
}
return nil
}

func (a *AzureClient) waitForVersionOperationCompletion(ctx context.Context, image *Image, version *armcompute.GalleryImageVersion) error {
Expand Down Expand Up @@ -688,6 +795,31 @@ func (a *AzureClient) EnsureSIGImageVersion(ctx context.Context, image *Image, l
return VHDResourceID(*resp.ID), nil
}

// WaitForImageVersionReplicatedToRegion is the public entrypoint for callers (e.g. the
// VMSS create retry loop) that need to confirm a previously-resolved image is actually
// serving traffic in a region before retrying a request that just failed with
// GalleryImageNotFound. It fetches the live image version with ReplicationStatus
// expanded and blocks until the regional state is Completed (or fails fast on terminal
// Failed). It does not initiate replication; callers should have already gone through
// EnsureSIGImageVersion / LatestSIGImageVersionByTag.
func (a *AzureClient) WaitForImageVersionReplicatedToRegion(ctx context.Context, image *Image, location string) error {
if image == nil {
return fmt.Errorf("nil image")
}
if image.Version == "" {
return fmt.Errorf("image %s has no resolved version; cannot check replication state", image.Name)
}
imgVersionClient, err := armcompute.NewGalleryImageVersionsClient(image.Gallery.SubscriptionID, a.Credential, a.ArmOptions)
if err != nil {
return fmt.Errorf("create image version client: %w", err)
}
resp, err := imgVersionClient.Get(ctx, image.Gallery.ResourceGroupName, image.Gallery.Name, image.Name, image.Version, nil)
Copy link

Copilot AI Apr 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The doc comment for WaitForImageVersionReplicatedToRegion says it "fetches the live image version with ReplicationStatus expanded", but the initial Get(...) uses nil options (no Expand). Either update the comment to reflect that the polling Get uses Expand=ReplicationStatus, or pass the expand option to this initial Get to match the comment.

Suggested change
resp, err := imgVersionClient.Get(ctx, image.Gallery.ResourceGroupName, image.Gallery.Name, image.Name, image.Version, nil)
resp, err := imgVersionClient.Get(ctx, image.Gallery.ResourceGroupName, image.Gallery.Name, image.Name, image.Version, &armcompute.GalleryImageVersionsClientGetOptions{
Expand: to.Ptr("ReplicationStatus"),
})

Copilot uses AI. Check for mistakes.
if err != nil {
return fmt.Errorf("get image version %s/%s for replication-wait: %w", image.Name, image.Version, err)
}
return a.waitForRegionalReplicationCompleted(ctx, image, &resp.GalleryImageVersion, location)
}

func DefaultRetryOpts() policy.RetryOptions {
return policy.RetryOptions{
// Use generous retry settings to survive Azure Compute Gallery throttling.
Expand Down
64 changes: 64 additions & 0 deletions e2e/config/azure_replication_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package config

import (
"testing"

"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7"
)

// TestFindRegionalReplicationStatus exercises the location-matching logic used to
// extract the per-region replication state from a SIG image version's ReplicationStatus
// summary. Region names from ARM may be in either the "WestUS 2" form or the "westus2"
// form, and may include arbitrary casing; the lookup must normalize both sides.
func TestFindRegionalReplicationStatus(t *testing.T) {
completed := armcompute.ReplicationStateCompleted
replicating := armcompute.ReplicationStateReplicating

status := &armcompute.ReplicationStatus{
Summary: []*armcompute.RegionalReplicationStatus{
{Region: to.Ptr("East US"), State: &completed, Progress: to.Ptr(int32(100))},
{Region: to.Ptr("West US 2"), State: &replicating, Progress: to.Ptr(int32(40))},
{Region: to.Ptr("uaenorth"), State: &completed, Progress: to.Ptr(int32(100))},
nil, // tolerate nil entries
{Region: nil, State: &completed},
},
}

tests := []struct {
name string
status *armcompute.ReplicationStatus
location string
wantFound bool
wantState armcompute.ReplicationState
wantRegion string
}{
{name: "nil status", status: nil, location: "westus2"},
{name: "exact lowercase normalized match (with space in summary)", status: status, location: "westus2", wantFound: true, wantState: replicating, wantRegion: "West US 2"},
{name: "uppercase input match (with space in summary)", status: status, location: "EASTUS", wantFound: true, wantState: completed, wantRegion: "East US"},
{name: "input with embedded spaces", status: status, location: "east us", wantFound: true, wantState: completed, wantRegion: "East US"},
{name: "summary already normalized", status: status, location: "uaenorth", wantFound: true, wantState: completed, wantRegion: "uaenorth"},
{name: "missing region returns nil", status: status, location: "centralus"},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := findRegionalReplicationStatus(tt.status, tt.location)
if !tt.wantFound {
if got != nil {
t.Fatalf("expected nil, got region %q state %v", *got.Region, *got.State)
}
return
}
if got == nil {
t.Fatalf("expected to find region %q, got nil", tt.wantRegion)
}
if got.Region == nil || *got.Region != tt.wantRegion {
t.Errorf("region mismatch: want %q got %v", tt.wantRegion, got.Region)
}
if got.State == nil || *got.State != tt.wantState {
t.Errorf("state mismatch: want %v got %v", tt.wantState, got.State)
}
})
}
}
43 changes: 31 additions & 12 deletions e2e/vmss.go
Original file line number Diff line number Diff line change
Expand Up @@ -424,24 +424,26 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine

func CreateVMSSWithRetry(ctx context.Context, s *Scenario) (*ScenarioVM, error) {
delay := 5 * time.Second
retryOn := func(err error) bool {
classify := func(err error) (retry bool, isGalleryImageMissing bool) {
var respErr *azcore.ResponseError
// only retry on Azure API errors with specific error codes
if !errors.As(err, &respErr) {
return false
return false, false
}
// AllocationFailed sometimes happens for exotic SKUs (new GPUs) with limited availability, sometimes retrying helps
// It's not a quota issue
// AllocationFailed sometimes happens for exotic SKUs (new GPUs) with limited availability, sometimes retrying helps.
// It's not a quota issue.
if respErr.StatusCode == 200 && respErr.ErrorCode == "AllocationFailed" {
return true
return true, false
}
// GalleryImageNotFound can happen transiently after image replication completes
// due to Azure eventual consistency - the gallery API reports success but the
// compute fabric in the target region hasn't fully propagated the image yet
// GalleryImageNotFound: gallery image version exists and the parent provisioningState is Succeeded,
// but the regional replica is not yet serving traffic. ensureReplication SHOULD have caught this before
// we got here; if we're seeing it on VMSS create, it's either fabric-level eventual consistency (very
// short window) or a regression in the wait. Either way, classify it specially so we can fail fast on
// terminal Failed states instead of burning 10 retries.
if respErr.StatusCode == 404 && respErr.ErrorCode == "GalleryImageNotFound" {
return true
return true, true
}
return false
return false, false
}

maxAttempts := 10
Expand All @@ -454,11 +456,28 @@ func CreateVMSSWithRetry(ctx context.Context, s *Scenario) (*ScenarioVM, error)
return vm, nil
}

// not a retryable error
if !retryOn(err) {
retry, isGalleryImageMissing := classify(err)
if !retry {
return vm, err
}

// For GalleryImageNotFound, defer to the canonical replication-state poller in
// the config package. This either:
// - returns quickly when the regional state is already Completed (fabric-level race;
// fall through to a normal retry), or
// - blocks until the region transitions to Completed (then retry once on success), or
// - fails fast with a clear error if the region is in a terminal Failed state.
if isGalleryImageMissing && s.VHD != nil && s.Runtime != nil && s.Runtime.Cluster != nil && s.Runtime.Cluster.Model != nil && s.Runtime.Cluster.Model.Location != nil {
location := *s.Runtime.Cluster.Model.Location
toolkit.Logf(ctx, "GalleryImageNotFound on VMSS create in %s; consulting regional replication status before retrying", location)
Comment on lines +470 to +472
Copy link

Copilot AI Apr 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The long nil-check chain before calling WaitForImageVersionReplicatedToRegion is effectively redundant here because this function already dereferences s.Runtime.Cluster.Model.Properties.NodeResourceGroup earlier in the loop. Either rely on the existing invariants and simplify this condition, or restructure the code to safely extract location/nodeResourceGroup once up-front (so the nil checks actually add safety).

Copilot uses AI. Check for mistakes.
if waitErr := config.Azure.WaitForImageVersionReplicatedToRegion(ctx, s.VHD, location); waitErr != nil {
// Surface the precise infra cause instead of a generic "failed after N retries".
return vm, fmt.Errorf("VMSS create returned GalleryImageNotFound and regional replication is not usable: %w (original error: %v)", waitErr, err)
}
toolkit.Logf(ctx, "Regional replication confirmed Completed; retrying VMSS create immediately")
Copy link

Copilot AI Apr 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the GalleryImageNotFound path, the continue bypasses the attempt >= maxAttempts guard and the normal delay backoff. If the compute fabric keeps returning 404 even after replication is Completed, this loop can spin indefinitely and/or hot-loop without sleeping, which is both noisy and can trigger throttling. Consider enforcing the max-attempts check before continue, and apply the normal delay (or explicitly limit this branch to a single immediate retry).

Suggested change
toolkit.Logf(ctx, "Regional replication confirmed Completed; retrying VMSS create immediately")
if attempt >= maxAttempts {
return vm, fmt.Errorf("failed to create VMSS after %d retries: %w", maxAttempts, err)
}
toolkit.Logf(ctx, "Regional replication confirmed Completed; retrying VMSS create after %v", delay)
select {
case <-ctx.Done():
return vm, err
case <-time.After(delay):
}

Copilot uses AI. Check for mistakes.
continue
}

if attempt >= maxAttempts {
return vm, fmt.Errorf("failed to create VMSS after %d retries: %w", maxAttempts, err)
}
Expand Down
Loading