Skip to content
This repository has been archived by the owner on Oct 24, 2023. It is now read-only.

test: better VMSS recovery in vmss-health-check #4662

Merged
merged 1 commit into from
Sep 21, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
32 changes: 24 additions & 8 deletions test/e2e/kubernetes/scripts/vmss-health-check.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,33 +5,49 @@ if [ -z "$RESOURCE_GROUP" ]; then
exit 1;
fi

# TODO: track VMSS in a "Creating" state, enforce TTL, if "Creating TTL" expires:
# 1. Check if the "Creating" VMSS instance correlates with a running Kubernetes node in the cluster
# If so, (1) cordon/drain the node
# 2. Delete the instance in a stuck "Creating" state
# 3. Wait for the VMSS to achieve a "Succeeded" ProvisioningState
# 4. Scale out the VMSS by 1

# Continually look for non-Succeeded VMSS instances
while true; do
NUM_VMSS=0
NUM_TERMINAL_VMSS=0
echo "$(date) Starting VMSS Health Remediation loop"
for VMSS in $(az vmss list -g $RESOURCE_GROUP | jq -r '.[] | .name'); do
((NUM_VMSS++))
NUM_DELETED_INSTANCES=0
VMSS_CAPACITY=$(az vmss list -g $RESOURCE_GROUP | jq -r --arg VMSS "$VMSS" '.[] | select(.name == $VMSS) | .sku.capacity')
echo VMSS $VMSS has a current capacity of $VMSS_CAPACITY
echo $(date) VMSS $VMSS has a current capacity of $VMSS_CAPACITY
for TERMINAL_VMSS in $(az vmss show -g $RESOURCE_GROUP -n $VMSS | jq -r '. | select(.provisioningState == "Succeeded" or .provisioningState == "Failed") | .name'); do
((NUM_TERMINAL_VMSS++))
echo VMSS $TERMINAL_VMSS is in a terminal state!
echo $(date) VMSS $TERMINAL_VMSS is in a terminal state!
HAS_FAILED_STATE_INSTANCE="false"
for TARGET_VMSS_INSTANCE in $(az vmss list-instances -g $RESOURCE_GROUP -n $TERMINAL_VMSS | jq -r '.[] | select(.provisioningState == "Failed") | .name'); do
echo Deleting VMSS $TERMINAL_VMSS instance $TARGET_VMSS_INSTANCE
HAS_FAILED_STATE_INSTANCE="true"
echo $(date) Deleting VMSS $TERMINAL_VMSS instance $TARGET_VMSS_INSTANCE
if ! az vmss delete-instances -n $TERMINAL_VMSS -g $RESOURCE_GROUP --instance-id ${TARGET_VMSS_INSTANCE##*_} --no-wait; then
exit 1
sleep 30
else
sleep 3
sleep 1
((NUM_DELETED_INSTANCES++))
fi
done
until [[ $(az vmss show -g $RESOURCE_GROUP -n $VMSS | jq -r '. | select(.provisioningState == "Succeeded") | .name') ]]; do
if [ "$HAS_FAILED_STATE_INSTANCE" == "true" ]; then
echo $(date) Waiting for $TERMINAL_VMSS to reach a terminal ProvisioningState after failed instances were deleted...
sleep 30
done
until [[ $(az vmss show -g $RESOURCE_GROUP -n $VMSS | jq -r '. | select(.provisioningState == "Succeeded" or .provisioningState == "Failed") | .name') ]]; do
for STILL_FAILED_VMSS_INSTANCE in $(az vmss list-instances -g $RESOURCE_GROUP -n $TERMINAL_VMSS | jq -r '.[] | select(.provisioningState == "Failed") | .name'); do
echo $(date) Instance $STILL_FAILED_VMSS_INSTANCE is still in a failed state, will attempt to delete again in the next loop
done
done
fi
done
if [ "$NUM_DELETED_INSTANCES" -gt "0" ]; then
echo Instances were deleted from VMSS $VMSS, ensuring that capacity is set to $VMSS_CAPACITY
echo $(date) Instances were deleted from VMSS $VMSS, ensuring that capacity is set to $VMSS_CAPACITY
if ! az vmss scale --new-capacity $VMSS_CAPACITY -n $VMSS -g $RESOURCE_GROUP; then
exit 1
fi
Expand Down