Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
f76e50a
test(kwok): try standard tier for cluster
huntergregory Sep 25, 2023
f7940ab
Revert "test(kwok): try standard tier for cluster"
huntergregory Sep 25, 2023
b2b5913
test: run kwok as pod
huntergregory Sep 25, 2023
e3fa8c0
fix: add execute permission to sh files
huntergregory Sep 25, 2023
50782d9
fix: allow scheduling on linux for kwok pod
huntergregory Sep 26, 2023
a7f6bd7
fix: wait timeouts and add retry logic
huntergregory Sep 26, 2023
35085db
fix: make sure to reapply kwok nodes if wait fails
huntergregory Sep 26, 2023
22cb607
test: print out cluster state if wait fails
huntergregory Sep 26, 2023
07a6bb4
test: prevent kwok from scheduling on windows node
huntergregory Sep 26, 2023
51b1f39
test: first wait for kwok pods (20 minutes)
huntergregory Sep 26, 2023
86a406b
style: rearrange wait check
huntergregory Sep 26, 2023
15a5452
fix: scale up kwok controller for reliability
huntergregory Sep 26, 2023
8df35ba
fix: typo in scaling kwok pods
huntergregory Sep 26, 2023
5a6de46
fix: check kwok pods running in test-connectivity instead of test-scale
huntergregory Sep 26, 2023
8cd565a
fix: wait for pods before adding NetPol
huntergregory Sep 26, 2023
2fb72c7
fix: 7 second timeout for windows agnhost connect
huntergregory Sep 27, 2023
24cfdd4
feat: get cluster state on failure
huntergregory Sep 27, 2023
24ec927
debug: fake a failure to verify log capture
huntergregory Sep 27, 2023
899b320
fix: bugs in getting cluster state
huntergregory Sep 28, 2023
80e7d0a
fix: remove newline instead of "n"
huntergregory Sep 28, 2023
c2292e1
Revert "debug: fake a failure to verify log capture"
huntergregory Sep 28, 2023
5809cff
feat(win-debug): get prom metrics
huntergregory Sep 28, 2023
24fb5f8
fix: leave timeout=5s for win
huntergregory Sep 29, 2023
2ee3dd2
style: remove new, unused --connect-timeout parameter
huntergregory Nov 14, 2023
7d4cdf0
Merge branch 'master' into hg/scale-infra
huntergregory Nov 14, 2023
d6a67cf
style: comment
huntergregory Nov 14, 2023
ea0d306
feat: top node/pod
huntergregory Nov 14, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 67 additions & 61 deletions .pipelines/npm/npm-scale-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ jobs:
displayName: "Verify Directory Exists"
failOnStderr: true
- task: AzureCLI@2
displayName: "Download Kubectl and Kwok"
displayName: "Download Kubectl"
inputs:
azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
scriptType: "bash"
Expand All @@ -107,11 +107,6 @@ jobs:
set -e
curl -LO https://dl.k8s.io/release/v1.23.0/bin/linux/amd64/kubectl
chmod +x kubectl

KWOK_REPO=kubernetes-sigs/kwok
KWOK_LATEST_RELEASE=$(curl "https://api.github.com/repos/${KWOK_REPO}/releases/latest" | jq -r '.tag_name')
wget -O kwok -c "https://github.com/kubernetes-sigs/kwok/releases/download/${KWOK_LATEST_RELEASE}/kwok-$(go env GOOS)-$(go env GOARCH)"
chmod +x kwok
- task: AzureCLI@2
displayName: "Create AKS Cluster"
inputs:
Expand Down Expand Up @@ -140,17 +135,22 @@ jobs:
--vm-set-type VirtualMachineScaleSets \
--node-vm-size Standard_D4s_v3 \
--node-count 1 \
--tier standard \
--max-pods 100

if [[ $(PROFILE) == *ws22 ]]; then
# don't schedule anything on the linux system pool
echo "Updating $CLUSTER_NAME to not schedule anything on linux pool..."
az aks nodepool update \
--cluster-name $CLUSTER_NAME \
-g $(RESOURCE_GROUP) \
-n nodepool1 \
--node-taints CriticalAddonsOnly=true:NoSchedule
echo "Getting credentials to $CLUSTER_NAME"
az aks get-credentials -g $(RESOURCE_GROUP) -n $CLUSTER_NAME --overwrite-existing --file ./kubeconfig
mkdir -p ~/.kube/
cp ./kubeconfig ~/.kube/config

# install kwok on linux node
cd $(Pipeline.Workspace)/s/test/scale/
chmod u+x run-kwok-as-pod.sh test-scale.sh connectivity/test-connectivity.sh
./run-kwok-as-pod.sh
# need reliability in case multiple controllers enter CrashLoopBackOff from "context cancelled"
kubectl scale deployment -n kube-system -l app=kwok-controller --replicas=5

if [[ $(PROFILE) == *ws22 ]]; then
echo "Adding Windows nodepool to $CLUSTER_NAME"
az aks nodepool add \
--resource-group $(RESOURCE_GROUP) \
Expand All @@ -163,11 +163,6 @@ jobs:
--max-pods 100
fi

echo "Getting credentials to $CLUSTER_NAME"
az aks get-credentials -g $(RESOURCE_GROUP) -n $CLUSTER_NAME --overwrite-existing --file ./kubeconfig
mkdir -p ~/.kube/
cp ./kubeconfig ~/.kube/config

- task: AzureCLI@2
displayName: "Deploy NPM to Test Cluster"
inputs:
Expand Down Expand Up @@ -230,15 +225,6 @@ jobs:
set -e
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
./kwok --kubeconfig ~/.kube/config \
--cidr=155.0.0.0/16 \
--node-ip=155.0.0.1 \
--manage-all-nodes=false \
--manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
--manage-nodes-with-label-selector= \
--disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-scale-up.log &
kwok_pid=$!

# 20 kwok nodes
# 1000 kwok Pods
Expand All @@ -262,7 +248,6 @@ jobs:
--num-unique-labels-per-deployment=2 \
--num-shared-labels-per-pod=10
rc=$?
kill $kwok_pid
exit $rc

- task: AzureCLI@2
Expand All @@ -277,20 +262,17 @@ jobs:
set -e
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
./kwok --kubeconfig ~/.kube/config \
--cidr=155.0.0.0/16 \
--node-ip=155.0.0.1 \
--manage-all-nodes=false \
--manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
--manage-nodes-with-label-selector= \
--disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-bootup-latency.log &
kwok_pid=$!

kubectl rollout restart -n kube-system ds azure-npm-win
echo "sleeping 3 minutes to allow NPM pods to restart after scale-up..."
sleep 3m

kubectl get pod -n kube-system -l app=kwok-controller -owide
kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || {
echo "##vso[task.logissue type=error]need at least one kwok pod running"
exit 1
}

cd $(Pipeline.Workspace)/s/test/scale/connectivity/
# notes for Windows:
# initial connectivity should be established within 15 minutes of NPM restart (12 minute timeout since we already waited 3 minutes above)
Expand All @@ -302,11 +284,29 @@ jobs:
--max-wait-after-adding-netpol=30
rc=$?
if [[ $rc != 0 ]]; then
echo "capturing cluster state due to failure"
if [[ $(PROFILE) == *ws22 ]]; then
cd $(Pipeline.Workspace)/s/debug/windows/npm/
chmod u+x win-debug.sh
./win-debug.sh
mv logs_* $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/
else
set -x
npmPod=`kubectl get pod -n kube-system | grep npm | grep -v npm-win | awk '{print $1}' | head -n 1 | tr -d '\n'`
kubectl exec -n kube-system $npmPod -- iptables-nft -vnL > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/iptables.out
kubectl exec -n kube-system $npmPod -- ipset -L > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/ipset.out
fi

kubectl get pod -n scale-test
kubectl get pod -n connectivity-test
exit $rc
fi
kill $kwok_pid
exit $rc

kubectl get pod -n kube-system -l app=kwok-controller -owide
kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || {
echo "##vso[task.logissue type=error]need at least one kwok pod running"
exit 1
}

- task: AzureCLI@2
displayName: "CRUD at Medium Scale"
Expand All @@ -320,15 +320,6 @@ jobs:
set -e
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
./kwok --kubeconfig ~/.kube/config \
--cidr=155.0.0.0/16 \
--node-ip=155.0.0.1 \
--manage-all-nodes=false \
--manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
--manage-nodes-with-label-selector= \
--disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-crud.log &
kwok_pid=$!

# will delete scale-test and connectivity-test namespaces from previous run
# 10 kwok Pods
Expand Down Expand Up @@ -359,7 +350,6 @@ jobs:
--delete-pods-interval=120 \
--delete-pods-times=2
rc=$?
kill $kwok_pid
exit $rc

- task: AzureCLI@2
Expand All @@ -374,15 +364,13 @@ jobs:
set -e
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
./kwok --kubeconfig ~/.kube/config \
--cidr=155.0.0.0/16 \
--node-ip=155.0.0.1 \
--manage-all-nodes=false \
--manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
--manage-nodes-with-label-selector= \
--disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-crud-connectivity.log &
kwok_pid=$!

kubectl get pod -n kube-system -l app=kwok-controller -owide
kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || {
echo "##vso[task.logissue type=error]need at least one kwok pod running"
exit 1
}


cd $(Pipeline.Workspace)/s/test/scale/connectivity/
# initial connectivity should be established within 10 minutes
Expand All @@ -394,11 +382,29 @@ jobs:
--max-wait-after-adding-netpol=20
rc=$?
if [[ $rc != 0 ]]; then
echo "capturing cluster state due to failure"
if [[ $(PROFILE) == *ws22 ]]; then
cd $(Pipeline.Workspace)/s/debug/windows/npm/
chmod u+x win-debug.sh
./win-debug.sh
mv logs_* $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/
else
set -x
npmPod=`kubectl get pod -n kube-system | grep npm | grep -v npm-win | awk '{print $1}' | head -n 1 | tr -d '\n'`
kubectl exec -n kube-system $npmPod -- iptables-nft -vnL > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/iptables.out
kubectl exec -n kube-system $npmPod -- ipset -L > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/ipset.out
fi

kubectl get pod -n scale-test
kubectl get pod -n connectivity-test
exit $rc
fi
kill $kwok_pid
exit $rc

kubectl get pod -n kube-system -l app=kwok-controller -owide
kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || {
echo "##vso[task.logissue type=error]need at least one kwok pod running"
exit 1
}

- bash: |
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
Expand Down
54 changes: 41 additions & 13 deletions debug/windows/npm/win-debug.sh
Original file line number Diff line number Diff line change
@@ -1,28 +1,30 @@
# Optional first argument: path to a kubeconfig file.
# When it is given, kubeconfigArg is set to the matching "--kubeconfig <path>" flag;
# when it is omitted, kubeconfigArg is not assigned here.
kubeconfig=$1
if [[ -z $1 ]]; then
echo "kubeconfig not provided. using default kubeconfig"
else
echo "using kubeconfig: $kubeconfig"
# kubeconfigArg is expanded unquoted in the kubectl calls of this script, so it splits into flag + value
kubeconfigArg="--kubeconfig $kubeconfig"
fi

# NOTE: you may not be able to unzip logs.zip in Linux since it was compressed in Windows
set -e
set -x
dateString=`date -I` # like 2022-09-24
filepath=logs_$dateString
mkdir $filepath

echo "gathering logs and writing to $filepath/"

kubectl get pod -A -o wide --show-labels > $filepath/allpods.out
kubectl get netpol -A -o yaml > $filepath/all-netpol-yamls.out
kubectl describe netpol -A > $filepath/all-netpol-descriptions.out

npmPods=()
nodes=()
for npmPodOrNode in `kubectl get pod -n kube-system -owide --output=custom-columns='Name:.metadata.name,Node:spec.nodeName' | grep "npm-win"`; do
for npmPodOrNode in `kubectl $kubeconfigArg get pod -n kube-system -owide --output=custom-columns='Name:.metadata.name,Node:spec.nodeName' | grep "npm-win"`; do
# for loop will go over each item (npm pod, then its node, then the next npm pod, then its node, ...)
set +e
echo $npmPodOrNode | grep -q azure-npm-win-
if [ $? -eq 0 ]; then
npmPods+=($npmPodOrNode)
else
nodes+=($npmPodOrNode)
fi
done
set -e

echo "npm pods: ${npmPods[@]}"
echo "nodes of npm pods: ${nodes[@]}"
Expand All @@ -33,22 +35,48 @@ for i in $(seq 1 ${#npmPods[*]}); do
node=${nodes[$j]}

echo "gathering logs. npm pod: $npmPod. node: $node"
kubectl logs -n kube-system $npmPod > $filepath/logs_$npmPod.out
kubectl $kubeconfigArg logs -n kube-system $npmPod > $filepath/logs_$npmPod.out

ips=()
for ip in `kubectl get pod -A -owide --output=custom-columns='IP:.status.podIP,Node:spec.nodeName' | grep $node | grep -oP "\d+\.\d+\.\d+\.\d+"`; do
for ip in `kubectl $kubeconfigArg get pod -A -owide --output=custom-columns='IP:.status.podIP,Node:spec.nodeName' | grep $node | grep -oP "\d+\.\d+\.\d+\.\d+"`; do
ips+=($ip)
done
echo "node $node has IPs: ${ips[@]}"

echo "copying ps1 file into $npmPod"
kubectl cp ./pod_exec.ps1 kube-system/"$npmPod":execw.ps1
kubectl $kubeconfigArg cp ./pod_exec.ps1 kube-system/"$npmPod":execw.ps1

echo "executing ps1 file on $npmPod"
kubectl exec -it -n kube-system $npmPod -- powershell.exe -Command .\\execw.ps1 "'${ips[@]}'"
kubectl $kubeconfigArg exec -n kube-system $npmPod -- powershell.exe -Command .\\execw.ps1 "'${ips[@]}'"

echo "copying logs.zip from $npmPod. NOTE: this will be a windows-based compressed archive (probably need windows to expand it)"
kubectl cp kube-system/"$npmPod":npm-exec-logs.zip $filepath/npm-exec-logs_$node.zip
kubectl $kubeconfigArg cp kube-system/"$npmPod":npm-exec-logs.zip $filepath/npm-exec-logs_$node.zip
done

echo "finished getting HNS info. getting prometheus metrics"

# Save the node-metrics output of every NPM pod to $filepath/prometheus/node-metrics/<pod name>.out
mkdir -p $filepath/prometheus/node-metrics
for i in $(seq 1 ${#npmPods[*]}); do
# seq counts from 1 while the npmPods array is indexed from 0
j=$((i-1))
npmPod=${npmPods[$j]}
# request http://localhost:10091/node-metrics from inside the pod via PowerShell and write the response content to the file
kubectl $kubeconfigArg exec -n kube-system $npmPod -- powershell.exe -Command "(Invoke-WebRequest -UseBasicParsing http://localhost:10091/node-metrics).Content" > $filepath/prometheus/node-metrics/$npmPod.out
done

echo "finished getting prometheus metrics. getting cluster state"

# Snapshot all pods (wide output with labels) and all NetworkPolicies (as YAML and as descriptions)
kubectl $kubeconfigArg get pod -A -o wide --show-labels > $filepath/allpods.out
kubectl $kubeconfigArg get netpol -A -o yaml > $filepath/all-netpol-yamls.out
kubectl $kubeconfigArg describe netpol -A > $filepath/all-netpol-descriptions.out

# Iterate over the namespaces that have at least one pod line without "Running" in `kubectl get pod -A`.
# `grep -v STATUS` drops the header row; `sort | uniq` de-duplicates the namespace column.
for ns in `kubectl $kubeconfigArg get pod -A | grep -v Running | grep -v STATUS | awk '{print $1}' | sort | uniq`; do
echo "describing failed pods in namespace $ns..."
# names of the non-Running pods in this namespace, joined onto one line by `xargs echo`
failingPods=`kubectl $kubeconfigArg get pod -n $ns | grep -v Running | grep -v STATUS | awk '{print $1}' | xargs echo`
if [[ -z $failingPods ]]; then
continue
fi
echo "failing Pods: $failingPods"
kubectl $kubeconfigArg describe pod -n $ns $failingPods > $filepath/describepod_$ns.out
# NOTE(review): this break leaves the loop after the first namespace that has non-Running pods,
# so any remaining namespaces are never described — confirm this is intended
break
done

echo "finished gathering all logs. written to $filepath/"
2 changes: 1 addition & 1 deletion test/scale/connectivity/test-connectivity.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ REQUIRED PARAMETERS:

OPTIONAL PARAMETERS:
--kubeconfig=<path> path to kubeconfig file
--kubectl-binary=<path> path to kubectl binary. Default is kubectl
--kubectl-binary=<path> path to kubectl binary. Default is kubectl

EXIT CODES:
0 - success
Expand Down
5 changes: 5 additions & 0 deletions test/scale/run-kwok-as-pod.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/usr/bin/env bash
# Applies the kwok.yaml and stage-fast.yaml manifests of the latest KWOK release
# to the cluster that kubectl currently points at.
# source: https://kwok.sigs.k8s.io/docs/user/kwok-in-cluster/
#
# Requires curl, jq and kubectl on PATH. Takes no arguments.
# Exits non-zero if the release tag cannot be resolved or if either apply fails.
set -euo pipefail

KWOK_REPO=kubernetes-sigs/kwok

# Look up the tag of the latest release.
# --fail turns HTTP errors (e.g. GitHub API rate limiting) into a non-zero exit instead of
# feeding an error document to jq; -sS hides the progress meter but still prints errors.
KWOK_LATEST_RELEASE=$(curl -fsSL "https://api.github.com/repos/${KWOK_REPO}/releases/latest" | jq -r '.tag_name')

# jq prints the literal string "null" when the field is missing; never build a download URL from that
if [[ -z "${KWOK_LATEST_RELEASE}" || "${KWOK_LATEST_RELEASE}" == "null" ]]; then
    echo "failed to determine the latest release of ${KWOK_REPO}" >&2
    exit 1
fi

kubectl apply -f "https://github.com/${KWOK_REPO}/releases/download/${KWOK_LATEST_RELEASE}/kwok.yaml"
kubectl apply -f "https://github.com/${KWOK_REPO}/releases/download/${KWOK_LATEST_RELEASE}/stage-fast.yaml"
Loading