Skip to content
This repository has been archived by the owner on Oct 24, 2023. It is now read-only.

Commit

Permalink
test: delete failed vmss extensions, wait longer (#4669)
Browse files: browse the repository at this point in the history
  • Loading branch information
jackfrancis committed Sep 23, 2021
1 parent b8450ca commit f8fefaf
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 15 deletions.
7 changes: 6 additions & 1 deletion test/e2e/kubernetes/kubernetes_test.go
Expand Up @@ -3085,8 +3085,13 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
numNodesExpected := numAgentNodes + newKaminoNodes + numControlPlaneNodes
numLargeContainerPodsExpected := numAgentNodes + newKaminoNodes
By(fmt.Sprintf("Waiting for the %d new nodes created from prototype(s) to become Ready; waiting for %d total nodes", newKaminoNodes, numNodesExpected))
timeToWaitForLargeCluster := time.Duration(newKaminoNodes/1000) * time.Hour
timeToWaitForNewNodes := timeToWaitForLargeCluster
if cfg.Timeout > timeToWaitForLargeCluster {
timeToWaitForNewNodes = cfg.Timeout
}
start := time.Now()
ready := node.WaitOnReadyMin(numNodesExpected, 30*time.Second, false, 2*time.Hour)
ready := node.WaitOnReadyMin(numNodesExpected, 30*time.Second, false, timeToWaitForNewNodes)
Expect(ready).To(BeTrue())
elapsed = time.Since(start)
log.Printf("Took %s to add %d nodes derived from peer node prototype(s)\n", elapsed, newKaminoNodes)
Expand Down
40 changes: 26 additions & 14 deletions test/e2e/kubernetes/scripts/vmss-health-check.sh
Expand Up @@ -20,44 +20,56 @@ while true; do
for VMSS in $(az vmss list -g $RESOURCE_GROUP | jq -r '.[] | .name'); do
((NUM_VMSS++))
NUM_DELETED_INSTANCES=0
VMSS_PROVISIONING_STATE=$(az vmss show -g $RESOURCE_GROUP -n $VMSS | jq -r '.provisioningState')
echo $(date) VMSS $VMSS has a ProvisioningState of $VMSS_PROVISIONING_STATE
VMSS_CAPACITY=$(az vmss list -g $RESOURCE_GROUP | jq -r --arg VMSS "$VMSS" '.[] | select(.name == $VMSS) | .sku.capacity')
echo $(date) VMSS $VMSS has a current capacity of $VMSS_CAPACITY
for TERMINAL_VMSS in $(az vmss show -g $RESOURCE_GROUP -n $VMSS | jq -r '. | select(.provisioningState == "Succeeded" or .provisioningState == "Failed") | .name'); do
if [ "$VMSS_PROVISIONING_STATE" == "Succeeded" ] || [ "$VMSS_PROVISIONING_STATE" == "Failed" ]; then
((NUM_TERMINAL_VMSS++))
echo $(date) VMSS $TERMINAL_VMSS is in a terminal state!
HAS_FAILED_STATE_INSTANCE="false"
for TARGET_VMSS_INSTANCE in $(az vmss list-instances -g $RESOURCE_GROUP -n $TERMINAL_VMSS | jq -r '.[] | select(.provisioningState == "Failed") | .name'); do
for TARGET_VMSS_INSTANCE in $(az vmss list-instances -g $RESOURCE_GROUP -n $VMSS | jq -r '.[] | select(.provisioningState == "Failed") | .name'); do
HAS_FAILED_STATE_INSTANCE="true"
echo $(date) Deleting VMSS $TERMINAL_VMSS instance $TARGET_VMSS_INSTANCE
if ! az vmss delete-instances -n $TERMINAL_VMSS -g $RESOURCE_GROUP --instance-id ${TARGET_VMSS_INSTANCE##*_} --no-wait; then
echo $(date) Deleting VMSS $VMSS instance $TARGET_VMSS_INSTANCE
if ! az vmss delete-instances -n $VMSS -g $RESOURCE_GROUP --instance-id ${TARGET_VMSS_INSTANCE##*_} --no-wait; then
sleep 30
else
sleep 1
((NUM_DELETED_INSTANCES++))
fi
done
for TARGET_VMSS_INSTANCE in $(az vmss list-instances -g $RESOURCE_GROUP -n $TERMINAL_VMSS | jq -r '.[].resources[] | select(.name == "vmssCSE" and .provisioningState == "Failed") | .id' | awk -F'/' '{print $9}'); do
echo $(date) Deleting VMSS $TERMINAL_VMSS instance $TARGET_VMSS_INSTANCE
if ! az vmss delete-instances -n $TERMINAL_VMSS -g $RESOURCE_GROUP --instance-id ${TARGET_VMSS_INSTANCE##*_}; then
for TARGET_VMSS_INSTANCE in $(az vmss list-instances -g $RESOURCE_GROUP -n $VMSS | jq -r '.[].resources[] | select(.name == "vmssCSE" and .provisioningState == "Failed") | .id' | awk -F'/' '{print $9}'); do
HAS_FAILED_STATE_INSTANCE="true"
echo $(date) Deleting VMSS $VMSS instance $TARGET_VMSS_INSTANCE
if ! az vmss delete-instances -n $VMSS -g $RESOURCE_GROUP --instance-id ${TARGET_VMSS_INSTANCE##*_}; then
sleep 30
else
sleep 1
((NUM_DELETED_INSTANCES++))
fi
done
for TARGET_VMSS_INSTANCE in $(az vmss list-instances -g $RESOURCE_GROUP -n $VMSS | jq -r '.[].resources[] | select(.publisher == "Microsoft.AKS" and .provisioningState != "Succeeded" and .provisioningState != "Creating" and .provisioningState != "Deleting") | .id' | awk -F'/' '{print $9}'); do
HAS_FAILED_STATE_INSTANCE="true"
echo $(date) Deleting VMSS $VMSS instance $TARGET_VMSS_INSTANCE
if ! az vmss delete-instances -n $VMSS -g $RESOURCE_GROUP --instance-id ${TARGET_VMSS_INSTANCE##*_}; then
sleep 30
else
sleep 1
((NUM_DELETED_INSTANCES++))
fi
done
if [ "$HAS_FAILED_STATE_INSTANCE" == "true" ]; then
echo $(date) Waiting for $TERMINAL_VMSS to reach a terminal ProvisioningState after failed instances were deleted...
echo $(date) Waiting for $VMSS to reach a terminal ProvisioningState after failed instances were deleted...
sleep 30
until [[ $(az vmss show -g $RESOURCE_GROUP -n $VMSS | jq -r '. | select(.provisioningState == "Succeeded" or .provisioningState == "Failed") | .name') ]]; do
for STILL_FAILED_VMSS_INSTANCE in $(az vmss list-instances -g $RESOURCE_GROUP -n $TERMINAL_VMSS | jq -r '.[] | select(.provisioningState == "Failed") | .name'); do
echo $(date) Instance $STILL_FAILED_VMSS_INSTANCE is still in a failed state, will attempt to delete again in the next loop
done
echo $(date) Waiting for $VMSS to reach a terminal ProvisioningState after failed instances were deleted...
sleep 30
done
echo $(date) VMSS $VMSS is in a terminal state after failed instances were deleted!
fi
done
fi
if [ "$NUM_DELETED_INSTANCES" -gt "0" ]; then
echo $(date) Instances were deleted from VMSS $VMSS, ensuring that capacity is set to $VMSS_CAPACITY
if ! az vmss scale --new-capacity $VMSS_CAPACITY -n $VMSS -g $RESOURCE_GROUP; then
if ! az vmss scale --new-capacity $VMSS_CAPACITY -n $VMSS -g $RESOURCE_GROUP --no-wait; then
exit 1
fi
fi
Expand Down

0 comments on commit f8fefaf

Please sign in to comment.