-
Notifications
You must be signed in to change notification settings - Fork 251
fix(e2e): wait on RegionalReplicationStatus before declaring SIG image usable #8373
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -564,8 +564,13 @@ func (a *AzureClient) ensureReplication(ctx context.Context, image *Image, versi | |||||||||
| } | ||||||||||
|
|
||||||||||
| if replicatedToCurrentRegion(version, location) { | ||||||||||
| toolkit.Logf(ctx, "Image version %s is already in region %s", *version.ID, location) | ||||||||||
| return nil | ||||||||||
| // Intent-to-replicate is registered in PublishingProfile.TargetRegions, but that | ||||||||||
| // does NOT mean the regional replica is actually serving traffic. The regional | ||||||||||
| // replication state can still be "Replicating" while the parent ProvisioningState | ||||||||||
| // is "Succeeded" — this is the source of GalleryImageNotFound 404s on VMSS create. | ||||||||||
| // Wait for the regional state itself before declaring success. | ||||||||||
| toolkit.Logf(ctx, "Image version %s is already a target of region %s; verifying regional replication state", *version.ID, location) | ||||||||||
| return a.waitForRegionalReplicationCompleted(ctx, image, version, location) | ||||||||||
| } | ||||||||||
| regions := make([]string, 0, len(version.Properties.PublishingProfile.TargetRegions)) | ||||||||||
| for _, targetRegion := range version.Properties.PublishingProfile.TargetRegions { | ||||||||||
|
|
@@ -575,12 +580,114 @@ func (a *AzureClient) ensureReplication(ctx context.Context, image *Image, versi | |||||||||
| toolkit.Logf(ctx, "##vso[task.logissue type=warning;]Replicating to region %s", location) | ||||||||||
|
|
||||||||||
| start := time.Now() // Record the start time | ||||||||||
| err := a.replicateImageVersionToCurrentRegion(ctx, image, version, location) | ||||||||||
| if err := a.replicateImageVersionToCurrentRegion(ctx, image, version, location); err != nil { | ||||||||||
| return err | ||||||||||
| } | ||||||||||
|
|
||||||||||
| // The replicate LRO above completes when the parent resource is Succeeded, but the | ||||||||||
| // regional replica may still be in "Replicating" state for several more minutes. | ||||||||||
| // Block until that regional state hits Completed; otherwise downstream VMSS create | ||||||||||
| // will see GalleryImageNotFound and the test will appear to fail for a non-test reason. | ||||||||||
| if err := a.waitForRegionalReplicationCompleted(ctx, image, version, location); err != nil { | ||||||||||
| return err | ||||||||||
| } | ||||||||||
| elapsed := time.Since(start) // Calculate the elapsed time | ||||||||||
|
|
||||||||||
| toolkit.LogDuration(ctx, elapsed, 3*time.Minute, fmt.Sprintf("Replication took: %s (%s)", elapsed, *version.ID)) | ||||||||||
|
|
||||||||||
| return err | ||||||||||
| return nil | ||||||||||
| } | ||||||||||
|
|
||||||||||
| // waitForRegionalReplicationCompleted polls the gallery image version with the | ||||||||||
| // ReplicationStatus expand option until the named region's RegionalReplicationStatus.State | ||||||||||
| // is Completed. Returns an error if the regional replication enters a terminal Failed state | ||||||||||
| // or the wait times out. Treats Unknown as transient (Azure occasionally emits Unknown | ||||||||||
| // briefly during state transitions). | ||||||||||
| // | ||||||||||
| // This closes a documented gap in the SIG API: the parent provisioning LRO can succeed | ||||||||||
| // before per-region replicas are actually serving traffic. Without this wait, callers see | ||||||||||
| // GalleryImageNotFound 404s on VMSS creation that look like bugs but are infra eventual | ||||||||||
| // consistency. | ||||||||||
| func (a *AzureClient) waitForRegionalReplicationCompleted(ctx context.Context, image *Image, version *armcompute.GalleryImageVersion, location string) error { | ||||||||||
| imgVersionClient, err := armcompute.NewGalleryImageVersionsClient(image.Gallery.SubscriptionID, a.Credential, a.ArmOptions) | ||||||||||
| if err != nil { | ||||||||||
| return fmt.Errorf("create image version client for replication wait: %w", err) | ||||||||||
| } | ||||||||||
|
|
||||||||||
| getOpts := &armcompute.GalleryImageVersionsClientGetOptions{ | ||||||||||
| Expand: to.Ptr(armcompute.ReplicationStatusTypesReplicationStatus), | ||||||||||
| } | ||||||||||
|
|
||||||||||
| var lastLoggedState armcompute.ReplicationState | ||||||||||
| const ( | ||||||||||
| pollInterval = 15 * time.Second | ||||||||||
| pollTimeout = 20 * time.Minute | ||||||||||
| ) | ||||||||||
|
|
||||||||||
| pollErr := wait.PollUntilContextTimeout(ctx, pollInterval, pollTimeout, true, func(ctx context.Context) (bool, error) { | ||||||||||
| resp, err := imgVersionClient.Get(ctx, image.Gallery.ResourceGroupName, image.Gallery.Name, image.Name, *version.Name, getOpts) | ||||||||||
| if err != nil { | ||||||||||
| // Transient API errors should not abort the wait; the SDK already retries throttling. | ||||||||||
| toolkit.Logf(ctx, "transient error getting image version replication status (will retry): %v", err) | ||||||||||
| return false, nil | ||||||||||
| } | ||||||||||
|
|
||||||||||
| regional := findRegionalReplicationStatus(resp.GalleryImageVersion.Properties.ReplicationStatus, location) | ||||||||||
| if regional == nil || regional.State == nil { | ||||||||||
| // Region not yet present in the status summary; keep waiting. | ||||||||||
| return false, nil | ||||||||||
| } | ||||||||||
|
|
||||||||||
| state := *regional.State | ||||||||||
| if state != lastLoggedState { | ||||||||||
| progress := int32(0) | ||||||||||
| if regional.Progress != nil { | ||||||||||
| progress = *regional.Progress | ||||||||||
| } | ||||||||||
| toolkit.Logf(ctx, "Image version %s regional replication state in %s: %s (progress: %d%%)", *version.ID, location, state, progress) | ||||||||||
| lastLoggedState = state | ||||||||||
| } | ||||||||||
|
|
||||||||||
| switch state { | ||||||||||
| case armcompute.ReplicationStateCompleted: | ||||||||||
| return true, nil | ||||||||||
| case armcompute.ReplicationStateFailed: | ||||||||||
| details := "" | ||||||||||
| if regional.Details != nil { | ||||||||||
| details = *regional.Details | ||||||||||
| } | ||||||||||
| return false, fmt.Errorf("regional replication of %s to %s failed: %s", *version.ID, location, details) | ||||||||||
| case armcompute.ReplicationStateReplicating, armcompute.ReplicationStateUnknown: | ||||||||||
| return false, nil | ||||||||||
| default: | ||||||||||
| // Forward-compat: unknown future states treated as in-progress. | ||||||||||
| return false, nil | ||||||||||
| } | ||||||||||
| }) | ||||||||||
|
|
||||||||||
| if pollErr != nil { | ||||||||||
| return fmt.Errorf("waiting for regional replication of %s to %s to complete: %w", *version.ID, location, pollErr) | ||||||||||
| } | ||||||||||
| toolkit.Logf(ctx, "Image version %s regional replication to %s confirmed Completed", *version.ID, location) | ||||||||||
| return nil | ||||||||||
| } | ||||||||||
|
|
||||||||||
| // findRegionalReplicationStatus finds the RegionalReplicationStatus for the given location | ||||||||||
| // (case-insensitive, ignoring spaces) within the version-wide ReplicationStatus. | ||||||||||
| func findRegionalReplicationStatus(status *armcompute.ReplicationStatus, location string) *armcompute.RegionalReplicationStatus { | ||||||||||
| if status == nil { | ||||||||||
| return nil | ||||||||||
| } | ||||||||||
| normalized := strings.ToLower(strings.ReplaceAll(location, " ", "")) | ||||||||||
| for _, regional := range status.Summary { | ||||||||||
| if regional == nil || regional.Region == nil { | ||||||||||
| continue | ||||||||||
| } | ||||||||||
| if strings.ToLower(strings.ReplaceAll(*regional.Region, " ", "")) == normalized { | ||||||||||
| return regional | ||||||||||
| } | ||||||||||
| } | ||||||||||
| return nil | ||||||||||
| } | ||||||||||
|
|
||||||||||
| func (a *AzureClient) waitForVersionOperationCompletion(ctx context.Context, image *Image, version *armcompute.GalleryImageVersion) error { | ||||||||||
|
|
@@ -688,6 +795,31 @@ func (a *AzureClient) EnsureSIGImageVersion(ctx context.Context, image *Image, l | |||||||||
| return VHDResourceID(*resp.ID), nil | ||||||||||
| } | ||||||||||
|
|
||||||||||
| // WaitForImageVersionReplicatedToRegion is the public entrypoint for callers (e.g. the | ||||||||||
| // VMSS create retry loop) that need to confirm a previously-resolved image is actually | ||||||||||
| // serving traffic in a region before retrying a request that just failed with | ||||||||||
| // GalleryImageNotFound. It fetches the live image version with ReplicationStatus | ||||||||||
| // expanded and blocks until the regional state is Completed (or fails fast on terminal | ||||||||||
| // Failed). It does not initiate replication; callers should have already gone through | ||||||||||
| // EnsureSIGImageVersion / LatestSIGImageVersionByTag. | ||||||||||
| func (a *AzureClient) WaitForImageVersionReplicatedToRegion(ctx context.Context, image *Image, location string) error { | ||||||||||
| if image == nil { | ||||||||||
| return fmt.Errorf("nil image") | ||||||||||
| } | ||||||||||
| if image.Version == "" { | ||||||||||
| return fmt.Errorf("image %s has no resolved version; cannot check replication state", image.Name) | ||||||||||
| } | ||||||||||
| imgVersionClient, err := armcompute.NewGalleryImageVersionsClient(image.Gallery.SubscriptionID, a.Credential, a.ArmOptions) | ||||||||||
| if err != nil { | ||||||||||
| return fmt.Errorf("create image version client: %w", err) | ||||||||||
| } | ||||||||||
| resp, err := imgVersionClient.Get(ctx, image.Gallery.ResourceGroupName, image.Gallery.Name, image.Name, image.Version, nil) | ||||||||||
|
||||||||||
| resp, err := imgVersionClient.Get(ctx, image.Gallery.ResourceGroupName, image.Gallery.Name, image.Name, image.Version, nil) | |
| resp, err := imgVersionClient.Get(ctx, image.Gallery.ResourceGroupName, image.Gallery.Name, image.Name, image.Version, &armcompute.GalleryImageVersionsClientGetOptions{ | |
| Expand: to.Ptr("ReplicationStatus"), | |
| }) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,64 @@ | ||
| package config | ||
|
|
||
| import ( | ||
| "testing" | ||
|
|
||
| "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" | ||
| "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" | ||
| ) | ||
|
|
||
| // TestFindRegionalReplicationStatus exercises the location-matching logic used to | ||
| // extract the per-region replication state from a SIG image version's ReplicationStatus | ||
| // summary. Region names from ARM may be in either the "WestUS 2" form or the "westus2" | ||
| // form, and may include arbitrary casing; the lookup must normalize both sides. | ||
| func TestFindRegionalReplicationStatus(t *testing.T) { | ||
| completed := armcompute.ReplicationStateCompleted | ||
| replicating := armcompute.ReplicationStateReplicating | ||
|
|
||
| status := &armcompute.ReplicationStatus{ | ||
| Summary: []*armcompute.RegionalReplicationStatus{ | ||
| {Region: to.Ptr("East US"), State: &completed, Progress: to.Ptr(int32(100))}, | ||
| {Region: to.Ptr("West US 2"), State: &replicating, Progress: to.Ptr(int32(40))}, | ||
| {Region: to.Ptr("uaenorth"), State: &completed, Progress: to.Ptr(int32(100))}, | ||
| nil, // tolerate nil entries | ||
| {Region: nil, State: &completed}, | ||
| }, | ||
| } | ||
|
|
||
| tests := []struct { | ||
| name string | ||
| status *armcompute.ReplicationStatus | ||
| location string | ||
| wantFound bool | ||
| wantState armcompute.ReplicationState | ||
| wantRegion string | ||
| }{ | ||
| {name: "nil status", status: nil, location: "westus2"}, | ||
| {name: "exact lowercase normalized match (with space in summary)", status: status, location: "westus2", wantFound: true, wantState: replicating, wantRegion: "West US 2"}, | ||
| {name: "uppercase input match (with space in summary)", status: status, location: "EASTUS", wantFound: true, wantState: completed, wantRegion: "East US"}, | ||
| {name: "input with embedded spaces", status: status, location: "east us", wantFound: true, wantState: completed, wantRegion: "East US"}, | ||
| {name: "summary already normalized", status: status, location: "uaenorth", wantFound: true, wantState: completed, wantRegion: "uaenorth"}, | ||
| {name: "missing region returns nil", status: status, location: "centralus"}, | ||
| } | ||
|
|
||
| for _, tt := range tests { | ||
| t.Run(tt.name, func(t *testing.T) { | ||
| got := findRegionalReplicationStatus(tt.status, tt.location) | ||
| if !tt.wantFound { | ||
| if got != nil { | ||
| t.Fatalf("expected nil, got region %q state %v", *got.Region, *got.State) | ||
| } | ||
| return | ||
| } | ||
| if got == nil { | ||
| t.Fatalf("expected to find region %q, got nil", tt.wantRegion) | ||
| } | ||
| if got.Region == nil || *got.Region != tt.wantRegion { | ||
| t.Errorf("region mismatch: want %q got %v", tt.wantRegion, got.Region) | ||
| } | ||
| if got.State == nil || *got.State != tt.wantState { | ||
| t.Errorf("state mismatch: want %v got %v", tt.wantState, got.State) | ||
| } | ||
| }) | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -424,24 +424,26 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine | |||||||||||||||||||||
|
|
||||||||||||||||||||||
| func CreateVMSSWithRetry(ctx context.Context, s *Scenario) (*ScenarioVM, error) { | ||||||||||||||||||||||
| delay := 5 * time.Second | ||||||||||||||||||||||
| retryOn := func(err error) bool { | ||||||||||||||||||||||
| classify := func(err error) (retry bool, isGalleryImageMissing bool) { | ||||||||||||||||||||||
| var respErr *azcore.ResponseError | ||||||||||||||||||||||
| // only retry on Azure API errors with specific error codes | ||||||||||||||||||||||
| if !errors.As(err, &respErr) { | ||||||||||||||||||||||
| return false | ||||||||||||||||||||||
| return false, false | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
| // AllocationFailed sometimes happens for exotic SKUs (new GPUs) with limited availability, sometimes retrying helps | ||||||||||||||||||||||
| // It's not a quota issue | ||||||||||||||||||||||
| // AllocationFailed sometimes happens for exotic SKUs (new GPUs) with limited availability, sometimes retrying helps. | ||||||||||||||||||||||
| // It's not a quota issue. | ||||||||||||||||||||||
| if respErr.StatusCode == 200 && respErr.ErrorCode == "AllocationFailed" { | ||||||||||||||||||||||
| return true | ||||||||||||||||||||||
| return true, false | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
| // GalleryImageNotFound can happen transiently after image replication completes | ||||||||||||||||||||||
| // due to Azure eventual consistency - the gallery API reports success but the | ||||||||||||||||||||||
| // compute fabric in the target region hasn't fully propagated the image yet | ||||||||||||||||||||||
| // GalleryImageNotFound: gallery image version exists and the parent provisioningState is Succeeded, | ||||||||||||||||||||||
| // but the regional replica is not yet serving traffic. ensureReplication SHOULD have caught this before | ||||||||||||||||||||||
| // we got here; if we're seeing it on VMSS create, it's either fabric-level eventual consistency (very | ||||||||||||||||||||||
| // short window) or a regression in the wait. Either way, classify it specially so we can fail fast on | ||||||||||||||||||||||
| // terminal Failed states instead of burning 10 retries. | ||||||||||||||||||||||
| if respErr.StatusCode == 404 && respErr.ErrorCode == "GalleryImageNotFound" { | ||||||||||||||||||||||
| return true | ||||||||||||||||||||||
| return true, true | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
| return false | ||||||||||||||||||||||
| return false, false | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| maxAttempts := 10 | ||||||||||||||||||||||
|
|
@@ -454,11 +456,28 @@ func CreateVMSSWithRetry(ctx context.Context, s *Scenario) (*ScenarioVM, error) | |||||||||||||||||||||
| return vm, nil | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| // not a retryable error | ||||||||||||||||||||||
| if !retryOn(err) { | ||||||||||||||||||||||
| retry, isGalleryImageMissing := classify(err) | ||||||||||||||||||||||
| if !retry { | ||||||||||||||||||||||
| return vm, err | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| // For GalleryImageNotFound, defer to the canonical replication-state poller in | ||||||||||||||||||||||
| // the config package. This either: | ||||||||||||||||||||||
| // - returns quickly when the regional state is already Completed (fabric-level race; | ||||||||||||||||||||||
| // fall through to a normal retry), or | ||||||||||||||||||||||
| // - blocks until the region transitions to Completed (then retry once on success), or | ||||||||||||||||||||||
| // - fails fast with a clear error if the region is in a terminal Failed state. | ||||||||||||||||||||||
| if isGalleryImageMissing && s.VHD != nil && s.Runtime != nil && s.Runtime.Cluster != nil && s.Runtime.Cluster.Model != nil && s.Runtime.Cluster.Model.Location != nil { | ||||||||||||||||||||||
| location := *s.Runtime.Cluster.Model.Location | ||||||||||||||||||||||
| toolkit.Logf(ctx, "GalleryImageNotFound on VMSS create in %s; consulting regional replication status before retrying", location) | ||||||||||||||||||||||
|
Comment on lines
+470
to
+472
|
||||||||||||||||||||||
| if waitErr := config.Azure.WaitForImageVersionReplicatedToRegion(ctx, s.VHD, location); waitErr != nil { | ||||||||||||||||||||||
| // Surface the precise infra cause instead of a generic "failed after N retries". | ||||||||||||||||||||||
| return vm, fmt.Errorf("VMSS create returned GalleryImageNotFound and regional replication is not usable: %w (original error: %v)", waitErr, err) | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
| toolkit.Logf(ctx, "Regional replication confirmed Completed; retrying VMSS create immediately") | ||||||||||||||||||||||
|
||||||||||||||||||||||
| toolkit.Logf(ctx, "Regional replication confirmed Completed; retrying VMSS create immediately") | |
| if attempt >= maxAttempts { | |
| return vm, fmt.Errorf("failed to create VMSS after %d retries: %w", maxAttempts, err) | |
| } | |
| toolkit.Logf(ctx, "Regional replication confirmed Completed; retrying VMSS create after %v", delay) | |
| select { | |
| case <-ctx.Done(): | |
| return vm, err | |
| case <-time.After(delay): | |
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
waitForRegionalReplicationCompletedcurrently treats anyGeterror as transient and keeps polling. This will mask permanent failures (e.g., 401/403 auth/RBAC issues, 404 if the version was deleted, malformed request) and can turn them into a 20-minute timeout with a generic error. Suggest inspectingerras*azcore.ResponseErrorand failing fast for non-retryable status codes, only continuing on clearly transient cases (e.g., 429/5xx/timeouts).