This repository has been archived by the owner on Oct 24, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 527
/
vmss-health-check.sh
executable file
·84 lines (81 loc) · 4 KB
/
vmss-health-check.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/bin/bash
# Every az call below is scoped to RESOURCE_GROUP, so refuse to start without it.
[[ -n "${RESOURCE_GROUP}" ]] || {
  echo "must provide a RESOURCE_GROUP env var"
  exit 1
}
# TODO: track VMSS in a "Creating" state, enforce TTL, if "Creating TTL" expires:
# 1. Check if the "Creating" VMSS instance correlates with a running Kubernetes node in the cluster
# If so, (1) cordon/drain the node
# 2. Delete the instance in a stuck "Creating" state
# 3. Wait for the VMSS to achieve a "Succeeded" ProvisioningState
# 4. Scale out the VMSS by 1
# Continually look for non-Succeeded VMSS instances
# Prints its arguments to stdout, prefixed with the current date.
log() {
  echo "$(date) $*"
}

#######################################
# Requests deletion of every instance of a VMSS that a jq filter selects.
# Globals:
#   RESOURCE_GROUP            (read)
#   HAS_FAILED_STATE_INSTANCE (set to "true" when the filter selects anything)
#   NUM_DELETED_INSTANCES     (incremented for each successful delete call)
# Arguments:
#   $1 - VMSS name
#   $2 - jq filter run against `az vmss list-instances` output; must emit one
#        instance name per line
#   $@ - extra arguments appended to `az vmss delete-instances`
#######################################
delete_matching_instances() {
  local vmss=$1 name_filter=$2
  shift 2
  local instance
  # Word splitting is intentional here: each word is one instance name, and
  # instance names are not expected to contain whitespace or glob characters.
  for instance in $(az vmss list-instances -g "$RESOURCE_GROUP" -n "$vmss" | jq -r "$name_filter"); do
    HAS_FAILED_STATE_INSTANCE="true"
    log "Deleting VMSS $vmss instance $instance"
    # The instance ID is whatever follows the last "_" in the instance name.
    if az vmss delete-instances -n "$vmss" -g "$RESOURCE_GROUP" --instance-ids "${instance##*_}" "$@"; then
      sleep 1
      NUM_DELETED_INSTANCES=$((NUM_DELETED_INSTANCES + 1))
    else
      # Back off after a failed delete call before trying the next instance.
      sleep 30
    fi
  done
}

# jq fragment that maps an extension resource to the 9th "/"-separated field
# of its id, which the delete logic treats as the owning instance's name.
# NOTE(review): assumes that field is "<vmss>_<instance id>" - confirm against
# real `az vmss list-instances` output.
INSTANCE_NAME_FROM_EXTENSION_ID='(.id // "") | split("/")[8] // empty'

# Each pass inspects every VMSS in RESOURCE_GROUP, deletes unhealthy
# instances, and restores the capacity that was recorded before the deletes.
while true; do
  NUM_VMSS=0
  NUM_TERMINAL_VMSS=0
  log "Starting VMSS Health Remediation loop"
  for VMSS in $(az vmss list -g "$RESOURCE_GROUP" | jq -r '.[] | .name'); do
    NUM_VMSS=$((NUM_VMSS + 1))
    NUM_DELETED_INSTANCES=0
    # Query the VMSS once and read both the state and the capacity from the
    # same response.
    VMSS_JSON=$(az vmss show -g "$RESOURCE_GROUP" -n "$VMSS")
    VMSS_PROVISIONING_STATE=$(jq -r '.provisioningState' <<<"$VMSS_JSON")
    log "VMSS $VMSS has a ProvisioningState of $VMSS_PROVISIONING_STATE"
    VMSS_CAPACITY=$(jq -r '.sku.capacity' <<<"$VMSS_JSON")
    log "VMSS $VMSS has a current capacity of $VMSS_CAPACITY"

    # Only remediate a VMSS that is not in the middle of another operation.
    if [[ "$VMSS_PROVISIONING_STATE" == "Succeeded" || "$VMSS_PROVISIONING_STATE" == "Failed" ]]; then
      NUM_TERMINAL_VMSS=$((NUM_TERMINAL_VMSS + 1))
      HAS_FAILED_STATE_INSTANCE="false"

      # 1. Instances whose own ProvisioningState is Failed (deleted with --no-wait).
      delete_matching_instances "$VMSS" \
        '.[] | select(.provisioningState == "Failed") | .name' \
        --no-wait

      # 2. Instances whose vmssCSE extension is Failed.
      # (.resources // []) keeps jq from aborting on an instance with null resources.
      delete_matching_instances "$VMSS" \
        '.[] | (.resources // [])[] | select(.name == "vmssCSE" and .provisioningState == "Failed") | '"$INSTANCE_NAME_FROM_EXTENSION_ID"

      # 3. Instances with a Microsoft.AKS extension that is neither Succeeded
      #    nor still being created or deleted.
      delete_matching_instances "$VMSS" \
        '.[] | (.resources // [])[] | select(.publisher == "Microsoft.AKS" and .provisioningState != "Succeeded" and .provisioningState != "Creating" and .provisioningState != "Deleting") | '"$INSTANCE_NAME_FROM_EXTENSION_ID"

      if [[ "$HAS_FAILED_STATE_INSTANCE" == "true" ]]; then
        log "Waiting for $VMSS to reach a terminal ProvisioningState after failed instances were deleted..."
        sleep 30
        # Poll until the VMSS reports Succeeded or Failed; there is no timeout.
        until [[ -n "$(az vmss show -g "$RESOURCE_GROUP" -n "$VMSS" | jq -r 'select(.provisioningState == "Succeeded" or .provisioningState == "Failed") | .name')" ]]; do
          log "Waiting for $VMSS to reach a terminal ProvisioningState after failed instances were deleted..."
          sleep 30
        done
        log "VMSS $VMSS is in a terminal state after failed instances were deleted!"
      fi
    fi

    if ((NUM_DELETED_INSTANCES > 0)); then
      log "Instances were deleted from VMSS $VMSS, ensuring that capacity is set to $VMSS_CAPACITY"
      # A failed scale request aborts the whole script.
      if ! az vmss scale --new-capacity "$VMSS_CAPACITY" -n "$VMSS" -g "$RESOURCE_GROUP" --no-wait; then
        exit 1
      fi
    fi
  done

  if [[ "${LOOP_FOREVER:-}" == "true" ]]; then
    sleep 150
  elif ((NUM_VMSS == NUM_TERMINAL_VMSS)); then
    # One-shot mode: finish once every VMSS was seen in a terminal state;
    # otherwise start another pass immediately.
    exit 0
  fi
done