From 735e57067a261c8ccbf11bb60476b3ae486f4217 Mon Sep 17 00:00:00 2001
From: Hunter Gregory
Date: Wed, 29 Mar 2023 19:14:37 -0700
Subject: [PATCH 01/14] k8s scale testing with kwok

---
 .gitignore                                |   3 +
 test/scale/README.md                      |  16 ++
 test/scale/run-kwok.sh                    |  28 +++
 test/scale/scale-test.sh                  | 245 ++++++++++++++++++++++
 test/scale/templates/kwok-deployment.yaml |  35 ++++
 test/scale/templates/kwok-node.yaml       |  42 ++++
 test/scale/templates/networkpolicy.yaml   |  22 ++
 test/scale/templates/real-deployment.yaml |  33 +++
 8 files changed, 424 insertions(+)
 create mode 100644 test/scale/README.md
 create mode 100755 test/scale/run-kwok.sh
 create mode 100755 test/scale/scale-test.sh
 create mode 100644 test/scale/templates/kwok-deployment.yaml
 create mode 100644 test/scale/templates/kwok-node.yaml
 create mode 100644 test/scale/templates/networkpolicy.yaml
 create mode 100644 test/scale/templates/real-deployment.yaml

diff --git a/.gitignore b/.gitignore
index 5197febb92..6ecb9304d9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,3 +32,6 @@ npm/debug/http
 *.srl
 go.work*
+
+# scale-test
+test/scale/generated/*
diff --git a/test/scale/README.md b/test/scale/README.md
new file mode 100644
index 0000000000..769e28c924
--- /dev/null
+++ b/test/scale/README.md
@@ -0,0 +1,16 @@
+## Overview
+Scripts for scale testing our components with both real resources and fake resources via [KWOK](https://github.com/kubernetes-sigs/kwok).
+
+You can specify the number of Deployments, Pod replicas, NetworkPolicies, and labels for Pods.
+
+### Why KWOK?
+KWOK saves time and resources, especially on Windows.
+
+## Usage
+1. Create an AKS cluster with `--uptime-sla` and create any nodepools.
+2. To schedule real Pods on a node: `kubectl label node <node-name> scale-test=true`
+3. Modify `./scale-test.sh`: set KUBECONFIG_ARG if desired or leave it empty.
+4. Modify `./scale-test.sh`: if not using NPM, set `USING_NPM=false`.
+5. Modify `./scale-test.sh`: update parameter values. Check your VMs' `--max-pods` capacity and set `maxRealPodsPerNode` accordingly (leave wiggle room for system Pods).
+6. If making KWOK Pods, run: `./run-kwok.sh`
+7. In another shell, run `./scale-test.sh`
diff --git a/test/scale/run-kwok.sh b/test/scale/run-kwok.sh
new file mode 100755
index 0000000000..48ca902401
--- /dev/null
+++ b/test/scale/run-kwok.sh
@@ -0,0 +1,28 @@
+#####################################################################################
+# This script is used to schedule kwok nodes/pods and maintain kwok node heartbeats.
+#####################################################################################
+
+INSTALL_KWOK=false
+# KWOK_LATEST_RELEASE=$(curl "https://api.github.com/repos/${KWOK_REPO}/releases/latest" | jq -r '.tag_name')
+KWOK_VERSION=${KWOK_LATEST_RELEASE:-"v0.1.1"}
+# The kubeconfig arg doesn't seem to work for kwok. It seems to just use the current context of the default kubeconfig.
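+# Because kwok follows the current kubeconfig context, it is worth confirming the
+# target cluster before starting; a quick sketch (the context name is environment-specific):
+#   kubectl config current-context
+#   kubectl config use-context <context-name>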
+KUBECONFIG=~/.kube/config
+
+if [[ $INSTALL_KWOK == true ]]; then
+    wget -O kwokctl -c "https://github.com/kubernetes-sigs/kwok/releases/download/${KWOK_VERSION}/kwokctl-$(go env GOOS)-$(go env GOARCH)"
+    chmod +x kwokctl
+    sudo mv kwokctl /usr/local/bin/kwokctl
+
+    wget -O kwok -c "https://github.com/kubernetes-sigs/kwok/releases/download/${KWOK_VERSION}/kwok-$(go env GOOS)-$(go env GOARCH)"
+    chmod +x kwok
+    sudo mv kwok /usr/local/bin/kwok
+fi
+
+kwok --kubeconfig $KUBECONFIG \
+    --cidr=155.0.0.0/16 \
+    --node-ip=155.0.0.1 \
+    --manage-all-nodes=false \
+    --manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
+    --manage-nodes-with-label-selector= \
+    --disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
+    --disregard-status-with-label-selector=
diff --git a/test/scale/scale-test.sh b/test/scale/scale-test.sh
new file mode 100755
index 0000000000..5cfd5eea01
--- /dev/null
+++ b/test/scale/scale-test.sh
@@ -0,0 +1,245 @@
+#################################################################################################################################################################
+# This script will scale the number of pods, pod labels, and network policies in a cluster.
+# It uses KWOK to create fake nodes and fake pods as needed. The KWOK script must be running in another shell.
+# It can also create real Pods on real VMs labeled with scale-test=true.
+# It will NOT scale real nodes.
+#
+# USAGE:
+# 1. Create an AKS cluster with --uptime-sla and create any nodepools.
+# 2. To schedule real Pods on a node: kubectl label node <node-name> scale-test=true
+# 3. Modify this script: set KUBECONFIG_ARG if desired or leave it empty.
+# 4. Modify this script: if not using NPM, set USING_NPM=false.
+# 5. Modify this script: update parameter values. Check your VMs' --max-pods capacity and set maxRealPodsPerNode accordingly (leave wiggle room for system Pods).
+# 6. If making KWOK Pods, run: ./run-kwok.sh
+# 7.
In another shell, run this script +################################################################################################################################################################# + +## CONSTANTS & PARAMETERS +# KUBECONFIG_ARG="--kubeconfig ./config-03-21" +USING_NPM=true +DEBUG_EXIT_AFTER_PRINTOUT=false +DEBUG_EXIT_AFTER_GENERATION=false + +maxKwokPodsPerNode=50 +numKwokDeployments=10 +numKwokReplicas=150 + +maxRealPodsPerNode=30 +numRealDeployments=10 +numRealReplicas=3 + +numSharedLabelsPerPod=3 # should be >= 3 for networkpolicy generation +numUniqueLabelsPerPod=1 # in Cilium, a value >= 1 results in every Pod having a unique identity (not recommended for scale) +numUniqueLabelsPerDeployment=2 + +# applied to every Pod +numNetworkPolicies=10 + +## CALCULATIONS +numKwokPods=$(( $numKwokDeployments * $numKwokReplicas )) +numKwokNodes=$(( ($numKwokPods + $maxKwokPodsPerNode - 1) / $maxKwokPodsPerNode)) +numRealPods=$(( $numRealDeployments * $numRealReplicas )) +numRealNodesRequired=$(( ($numRealPods + $maxRealPodsPerNode - 1) / $maxRealPodsPerNode)) +numTotalPods=$(( $numKwokPods + $numRealPods )) + +## NPM CALCULATIONS +# unique to templates/networkpolicy.yaml +numACLsAddedByNPM=$(( 6 * $numNetworkPolicies )) +# IPSet/member counts can be slight underestimates if there are more than one template-hash labels +# 4 basic IPSets are [ns-scale-test,kubernetes.io/metadata.name:scale-test,template-hash:xxxx,app:scale-test] +numIPSetsAddedByNPM=$(( 4 + 2*$numTotalPods*$numUniqueLabelsPerPod + 2*$numSharedLabelsPerPod + 2*($numKwokDeployments+$numRealDeployments)*$numUniqueLabelsPerDeployment )) +# 3 basic members are [all-ns,kubernetes.io/metadata.name,kubernetes.io/metadata.name:scale-test] +# 5*pods members go to [ns-scale-test,kubernetes.io/metadata.name:scale-test,template-hash:xxxx,app:scale-test] +numIPSetMembersAddedByNPM=$(( 3 + $numTotalPods*(5 + 2*$numUniqueLabelsPerPod + 2*$numSharedLabelsPerPod) + 2*($numKwokPods+$numRealPods)*$numUniqueLabelsPerDeployment )) + +## PRINTOUT +cat < $outFile + sed -i "s/TEMP_REPLICAS/$numReplicas/g" $outFile + + if [[ $numUniqueLabelsPerDeployment -gt 0 ]]; then + depLabels="" + for j in $(seq -f "%05g" 1 $numUniqueLabelsPerDeployment); do + depLabels="$depLabels\n $labelPrefix-$j: val" + done + perl -pi -e "s/OTHER_LABELS_6_SPACES/$depLabels/g" $outFile + + depLabels="" + for j in $(seq -f "%05g" 1 $numUniqueLabelsPerDeployment); do + depLabels="$depLabels\n $labelPrefix-$j: val" + done + perl -pi -e "s/OTHER_LABELS_8_SPACES/$depLabels/g" $outFile + else + sed -i "s/OTHER_LABELS_6_SPACES//g" $outFile + sed -i "s/OTHER_LABELS_8_SPACES//g" $outFile + fi + done +} + +generateDeployments $numKwokDeployments $numKwokReplicas kwok +generateDeployments $numRealDeployments $numRealReplicas real + +for j in $(seq 1 $numNetworkPolicies); do + valNum=$j + i=`printf "%05d" $j` + sed "s/TEMP_NAME/policy-$i/g" templates/networkpolicy.yaml > generated/networkpolicies/policy-$i.yaml + if [[ $valNum -ge $(( numSharedLabelsPerPod - 2 )) ]]; then + valNum=$(( $numSharedLabelsPerPod - 2 )) + fi + sed -i "s/TEMP_LABEL_NAME/shared-lab-$valNum/g" generated/networkpolicies/policy-$i.yaml + + ingressNum=$(( $valNum + 1 )) + sed -i "s/TEMP_INGRESS_NAME/shared-lab-$ingressNum/g" generated/networkpolicies/policy-$i.yaml + + egressNum=$(( $valNum + 2 )) + sed -i "s/TEMP_EGRESS_NAME/shared-lab-$egressNum/g" generated/networkpolicies/policy-$i.yaml +done + +for i in $(seq -f "%05g" 1 $numKwokNodes); do + cat templates/kwok-node.yaml | sed "s/INSERT_NUMBER/$i/g" > 
"generated/kwok-nodes/node-$i.yaml" +done + +if [[ $DEBUG_EXIT_AFTER_GENERATION == true ]]; then + echo "DEBUG: exiting after generation..." + exit 0 +fi + +## VALIDATE REAL NODES +echo "checking if there are enough real nodes..." +numRealNodes=$(kubectl $KUBECONFIG_ARG get nodes -l scale-test=true | grep -v NAME | wc -l) +if [[ $numRealNodes -lt $numRealNodesRequired ]]; then + kubectl $KUBECONFIG_ARG get nodes + echo "ERROR: need $numRealNodesRequired real nodes to achieve a scale of $numRealPods real Pods. Make sure to label nodes with: kubectl label node scale-test=true." + exit 1 +fi + +## DELETE PRIOR STATE +echo "cleaning up previous scale test state..." +kubectl $KUBECONFIG_ARG delete ns scale-test && shouldRestartNPM=true +kubectl $KUBECONFIG_ARG delete node -l type=kwok + +if [[ $USING_NPM == true ]]; then + if [[ $shouldRestartNPM == true ]]; then + echo "restarting NPM pods..." + kubectl $KUBECONFIG_ARG rollout restart -n kube-system ds azure-npm + kubectl $KUBECONFIG_ARG rollout restart -n kube-system ds azure-npm-win + echo "sleeping 3m to allow NPM pods to restart..." + sleep 1m + echo "2m remaining..." + sleep 1m + echo "1m remaining..." + sleep 1m + fi + + echo "making sure NPM pods are running..." + kubectl $KUBECONFIG_ARG get pod -n kube-system | grep Running | grep -v "azure-npm-win" | grep -oP "azure-npm-[a-z0-9]+" -m 1 + if [[ $? != 0 ]]; then + echo "No Linux NPM pod running. Exiting." + exit 1 + fi + + kubectl $KUBECONFIG_ARG get pod -n kube-system | grep Running | grep -oP "azure-npm-win-[a-z0-9]+" -m 1 + if [[ $? != 0 ]]; then + echo "No Windows NPM pod running. Exiting." + exit 1 + fi +fi + +## RUN +if [[ $numKwokPods -gt 0 ]]; then + echo "START KWOK COMMAND NOW..." + sleep 10s +fi + +startDate=`date -u` +echo "STARTING RUN at $startDate" +echo + +set -x +kubectl $KUBECONFIG_ARG create ns scale-test +kubectl $KUBECONFIG_ARG apply -f generated/kwok-nodes/ +kubectl $KUBECONFIG_ARG apply -f generated/deployments/real/ +kubectl $KUBECONFIG_ARG apply -f generated/deployments/kwok/ +set +x + +if [[ $numSharedLabelsPerPod -gt 0 ]]; then + sharedLabels="" + for i in $(seq -f "%05g" 1 $numSharedLabelsPerPod); do + sharedLabels="$sharedLabels shared-lab-$i=val" + done + + set -x + kubectl $KUBECONFIG_ARG label pods -n scale-test --all $sharedLabels + set +x +fi + +if [[ $numUniqueLabelsPerPod -gt 0 ]]; then + count=1 + for pod in $(kubectl $KUBECONFIG_ARG get pods -n scale-test -o jsonpath='{.items[*].metadata.name}'); do + uniqueLabels="" + for tmp in $(seq 1 $numUniqueLabelsPerPod); do + i=`printf "%05d" $count` + uniqueLabels="$uniqueLabels uni-lab-$i=val" + count=$(( $count + 1 )) + done + + set -x + kubectl $KUBECONFIG_ARG label pods -n scale-test $pod $uniqueLabels + set +x + done +fi + +set -x +kubectl $KUBECONFIG_ARG apply -f generated/networkpolicies/ +set +x + +echo +echo "FINISHED at $(date -u). Had started at $startDate." 
+echo diff --git a/test/scale/templates/kwok-deployment.yaml b/test/scale/templates/kwok-deployment.yaml new file mode 100644 index 0000000000..70ef60d5c2 --- /dev/null +++ b/test/scale/templates/kwok-deployment.yaml @@ -0,0 +1,35 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: fake-TEMP_NAME + namespace: scale-test + labels: + app: scale-test +spec: + replicas: TEMP_REPLICAS + selector: + matchLabels: + app: scale-testOTHER_LABELS_6_SPACES + template: + metadata: + labels: + app: scale-testOTHER_LABELS_8_SPACES + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: type + operator: In + values: + - kwok + # A taints was added to an automatically created Node. + # You can remove taints of Node or add this tolerations. + tolerations: + - key: "kwok.x-k8s.io/node" + operator: "Exists" + effect: "NoSchedule" + containers: + - name: fake-container + image: fake-image diff --git a/test/scale/templates/kwok-node.yaml b/test/scale/templates/kwok-node.yaml new file mode 100644 index 0000000000..249cc717af --- /dev/null +++ b/test/scale/templates/kwok-node.yaml @@ -0,0 +1,42 @@ +apiVersion: v1 +kind: Node +metadata: + annotations: + node.alpha.kubernetes.io/ttl: "0" + kwok.x-k8s.io/node: fake + labels: + beta.kubernetes.io/arch: amd64 + beta.kubernetes.io/os: linux + kubernetes.io/arch: amd64 + kubernetes.io/hostname: kwok-node-INSERT_NUMBER + kubernetes.io/os: linux + kubernetes.io/role: agent + node-role.kubernetes.io/agent: "" + type: kwok + name: kwok-node-INSERT_NUMBER +spec: + taints: # Avoid scheduling actual running pods to fake Node + - effect: NoSchedule + key: kwok.x-k8s.io/node + value: fake +status: + allocatable: + cpu: 32 + memory: 256Gi + pods: 110 + capacity: + cpu: 32 + memory: 256Gi + pods: 110 + nodeInfo: + architecture: amd64 + bootID: "" + containerRuntimeVersion: "" + kernelVersion: "" + kubeProxyVersion: fake + kubeletVersion: fake + machineID: "" + operatingSystem: linux + osImage: "" + systemUUID: "" + phase: Running diff --git a/test/scale/templates/networkpolicy.yaml b/test/scale/templates/networkpolicy.yaml new file mode 100644 index 0000000000..8c2ccdd2c2 --- /dev/null +++ b/test/scale/templates/networkpolicy.yaml @@ -0,0 +1,22 @@ +kind: NetworkPolicy +apiVersion: networking.k8s.io/v1 +metadata: + name: TEMP_NAME + namespace: scale-test +spec: + podSelector: + matchLabels: + TEMP_LABEL_NAME: "true" + ingress: + - from: + - podSelector: + matchLabels: + TEMP_INGRESS_NAME: "true" + egress: + - to: + - podSelector: + matchLabels: + TEMP_EGRESS_NAME: "true" + policyTypes: + - Ingress + - Egress diff --git a/test/scale/templates/real-deployment.yaml b/test/scale/templates/real-deployment.yaml new file mode 100644 index 0000000000..38385eda49 --- /dev/null +++ b/test/scale/templates/real-deployment.yaml @@ -0,0 +1,33 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: TEMP_NAME + namespace: scale-test + labels: + app: scale-test +spec: + replicas: TEMP_REPLICAS + selector: + matchLabels: + app: scale-testOTHER_LABELS_6_SPACES + template: + metadata: + labels: + app: scale-testOTHER_LABELS_8_SPACES + spec: + nodeSelector: + scale-test: "true" + containers: + - command: + - /agnhost + - serve-hostname + - --tcp + - --http=false + - --port + - "80" + image: k8s.gcr.io/e2e-test-images/agnhost:2.33 + imagePullPolicy: IfNotPresent + name: cont-80-tcp + ports: + - containerPort: 80 + protocol: TCP From 1952ea0a08e7dc48cc5639c9e81ea7386bc25918 Mon Sep 17 00:00:00 2001 From: 
Hunter Gregory <42728408+huntergregory@users.noreply.github.com>
Date: Wed, 29 Mar 2023 19:37:16 -0700
Subject: [PATCH 02/14] Update README.md

---
 test/scale/README.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/test/scale/README.md b/test/scale/README.md
index 769e28c924..b8801b7a17 100644
--- a/test/scale/README.md
+++ b/test/scale/README.md
@@ -9,8 +9,10 @@ KWOK saves time and resources, especially on Windows.
 ## Usage
 1. Create an AKS cluster with `--uptime-sla` and create any nodepools.
 2. To schedule real Pods on a node: `kubectl label node <node-name> scale-test=true`
-3. Modify `./scale-test.sh`: set KUBECONFIG_ARG if desired or leave it empty.
-4. Modify `./scale-test.sh`: if not using NPM, set `USING_NPM=false`.
-5. Modify `./scale-test.sh`: update parameter values. Check your VMs' `--max-pods` capacity and set `maxRealPodsPerNode` accordingly (leave wiggle room for system Pods).
+3. Modify `scale-test.sh`: set KUBECONFIG_ARG if desired or leave it empty.
+4. Modify `scale-test.sh`: if not using NPM, set `USING_NPM=false`.
+5. Modify `scale-test.sh`: update parameter values. Check your VMs' `--max-pods` capacity and set `maxRealPodsPerNode` accordingly (leave wiggle room for system Pods).
 6. If making KWOK Pods, run: `./run-kwok.sh`
 7. In another shell, run `./scale-test.sh`
+
+You can also set the `DEBUG_EXIT_AFTER_` variables in `scale-test.sh` to check the configuration before actually running the scale tests.

From 3751a629ccbce78e08b7b9050eba1e8db1fe2a42 Mon Sep 17 00:00:00 2001
From: Hunter Gregory
Date: Thu, 30 Mar 2023 10:46:54 -0700
Subject: [PATCH 03/14] fix netpol labels so that they apply to pods

---
 test/scale/scale-test.sh                | 9 ++++++---
 test/scale/templates/networkpolicy.yaml | 6 +++---
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/test/scale/scale-test.sh b/test/scale/scale-test.sh
index 5cfd5eea01..a296d6a230 100755
--- a/test/scale/scale-test.sh
+++ b/test/scale/scale-test.sh
@@ -133,13 +133,16 @@ for j in $(seq 1 $numNetworkPolicies); do
     if [[ $valNum -ge $(( numSharedLabelsPerPod - 2 )) ]]; then
         valNum=$(( $numSharedLabelsPerPod - 2 ))
     fi
-    sed -i "s/TEMP_LABEL_NAME/shared-lab-$valNum/g" generated/networkpolicies/policy-$i.yaml
+    k=`printf "%05d" $valNum`
+    sed -i "s/TEMP_LABEL_NAME/shared-lab-$k/g" generated/networkpolicies/policy-$i.yaml
 
     ingressNum=$(( $valNum + 1 ))
-    sed -i "s/TEMP_INGRESS_NAME/shared-lab-$ingressNum/g" generated/networkpolicies/policy-$i.yaml
+    k=`printf "%05d" $ingressNum`
+    sed -i "s/TEMP_INGRESS_NAME/shared-lab-$k/g" generated/networkpolicies/policy-$i.yaml
 
     egressNum=$(( $valNum + 2 ))
-    sed -i "s/TEMP_EGRESS_NAME/shared-lab-$egressNum/g" generated/networkpolicies/policy-$i.yaml
+    k=`printf "%05d" $egressNum`
+    sed -i "s/TEMP_EGRESS_NAME/shared-lab-$k/g" generated/networkpolicies/policy-$i.yaml
 done
 
 for i in $(seq -f "%05g" 1 $numKwokNodes); do
diff --git a/test/scale/templates/networkpolicy.yaml b/test/scale/templates/networkpolicy.yaml
index 8c2ccdd2c2..4bd07587f0 100644
--- a/test/scale/templates/networkpolicy.yaml
+++ b/test/scale/templates/networkpolicy.yaml
@@ -6,17 +6,17 @@ metadata:
 spec:
   podSelector:
     matchLabels:
-      TEMP_LABEL_NAME: "true"
+      TEMP_LABEL_NAME: val
   ingress:
   - from:
     - podSelector:
         matchLabels:
-          TEMP_INGRESS_NAME: "true"
+          TEMP_INGRESS_NAME: val
   egress:
   - to:
     - podSelector:
         matchLabels:
-          TEMP_EGRESS_NAME: "true"
+          TEMP_EGRESS_NAME: val
   policyTypes:
   - Ingress
   - Egress

From f8f92becb254e9e563b87707e110a12488e6a52d Mon Sep 17 00:00:00 2001
From: Hunter Gregory
Date: Thu, 30 Mar 2023 14:29:45 -0700
Subject:
[PATCH 04/14] test connectivity --- test/scale/README.md | 11 +- test/scale/connectivity/pinger.yaml | 33 ++++ test/scale/connectivity/test-connectivity.sh | 150 +++++++++++++++++++ 3 files changed, 186 insertions(+), 8 deletions(-) create mode 100644 test/scale/connectivity/pinger.yaml create mode 100755 test/scale/connectivity/test-connectivity.sh diff --git a/test/scale/README.md b/test/scale/README.md index 769e28c924..2ce0702454 100644 --- a/test/scale/README.md +++ b/test/scale/README.md @@ -1,16 +1,11 @@ ## Overview Scripts for scale testing our components with both real resources and fake resources via [KWOK](https://github.com/kubernetes-sigs/kwok). -Can specify number of Deployments, Pod replicas, NetworkPolicies, and labels for Pods. - ### Why KWOK? KWOK saves time/resources, especially in Windows. ## Usage 1. Create AKS cluster with `--uptime-sla` and create any nodepools. -2. To schedule real Pods on a node: `kubectl label node scale-test=true` -3. Modify `./scale-test.sh`: set KUBECONFIG_ARG if desired or leave empty. -4. Modify `./scale-test.sh`: if not using NPM, set `USING_NPM=false`. -5. Modify `./scale-test.sh`: update parameter values. Check your VMs' `--max-pod` capacity and set `maxRealPodsPerNode` accordingly (leave wiggle room for system Pods). -6. If making KWOK Pods, run: `./run-kwok.sh` -7. In another shell, run `./scale-test.sh` +2. If making KWOK Pods, run `run-kwok.sh` in the background. +3. Scale with `scale-test.sh`. Specify number of Deployments, Pod replicas, NetworkPolicies, and labels for Pods. +4. Test connectivity with `connectivity/test-connectivity.sh`. diff --git a/test/scale/connectivity/pinger.yaml b/test/scale/connectivity/pinger.yaml new file mode 100644 index 0000000000..17a7d9662a --- /dev/null +++ b/test/scale/connectivity/pinger.yaml @@ -0,0 +1,33 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: pinger + namespace: connectivity-test + labels: + app: pinger +spec: + replicas: 2 + selector: + matchLabels: + app: pinger + template: + metadata: + labels: + app: pinger + spec: + nodeSelector: + connectivity-test: "true" + containers: + - command: + - /agnhost + - serve-hostname + - --tcp + - --http=false + - --port + - "80" + image: k8s.gcr.io/e2e-test-images/agnhost:2.33 + imagePullPolicy: IfNotPresent + name: cont-80-tcp + ports: + - containerPort: 80 + protocol: TCP diff --git a/test/scale/connectivity/test-connectivity.sh b/test/scale/connectivity/test-connectivity.sh new file mode 100755 index 0000000000..c9a7b8ad87 --- /dev/null +++ b/test/scale/connectivity/test-connectivity.sh @@ -0,0 +1,150 @@ +## CONSTANTS & PARAMETERS +# KUBECONFIG_ARG="--kubeconfig ./config-03-21" +INTER_NS_TRAFFIC_SHOULD_BE_BLOCKED=true +# tests that N^2 connections are successful, and that 2N connections follow INTER_NS_TRAFFIC_SHOULD_BE_BLOCKED +NUM_SCALE_PODS_TO_VERIFY=10 + +## HELPER FUNCTIONS +connectFromPinger() { + local from=$1 + local dstIP=$2 + echo "checking connectivity from $from to $dstIP" + kubectl $KUBECONFIG_ARG exec -n connectivity-test $from -- /agnhost connect --timeout=3s $dstIP:80 +} + +connectFromScalePod() { + local from=$1 + local dstIP=$2 + echo "checking connectivity from $from to $dstIP" + kubectl $KUBECONFIG_ARG exec -n scale-test $from -- /agnhost connect --timeout=3s $dstIP:80 +} + +## VALIDATE FILE +test -f pinger.yaml || { + echo "ERROR: change into the connectivity/ directory when running this script" + exit 1 +} + +## RUN +set -e +startDate=`date -u` +echo "STARTING CONNECTIVITY TEST at $startDate" + +## GET SCALE PODS 
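+# The jsonpath below emits space-separated "name,IP" pairs that bash word-splits
+# into an array; an element looks like "real-dep-00001-<hash>,10.224.0.15"
+# (illustrative name and IP)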
+scalePodNameIPs=(`kubectl $KUBECONFIG_ARG get pods -n scale-test --field-selector=status.phase==Running -o jsonpath='{range .items[*]}{@.metadata.name}{","}{@.status.podIP}{" "}{end}'`) +scalePods=() +scalePodIPs=() +for nameIP in "${scalePodNameIPs[@]}"; do + nameIP=(`echo $nameIP | tr ',' ' '`) + name=${nameIP[0]} + ip=${nameIP[1]} + + echo $name | grep real-dep || continue + + echo "scale Pod: $name, IP: $ip" + + if [[ -z $name || -z $ip ]]; then + echo "ERROR: expected scale Pod name and IP to be non-empty" + exit 1 + fi + + scalePods+=($name) + scalePodIPs+=($ip) + + if [[ ${#scalePods[@]} -eq $NUM_SCALE_PODS_TO_VERIFY ]]; then + break + fi +done + +if [[ ${#scalePods[@]} == 0 ]]; then + echo "ERROR: expected namespace scale-test to exist with real (non-kwok) Pods. Run test/scale/scale-test.sh with real Pods first." + exit 1 +elif [[ ${#scalePods[@]} -lt $NUM_SCALE_PODS_TO_VERIFY ]]; then + echo "WARNING: seeing ${#scalePodNameIPs[@]} real scale Pods running which is less than NUM_SCALE_PODS_TO_VERIFY=$NUM_SCALE_PODS_TO_VERIFY" + NUM_SCALE_PODS_TO_VERIFY=${#scalePodNameIPs[@]} +else + echo "will verify connectivity to $NUM_SCALE_PODS_TO_VERIFY scale Pods" +fi + +## CREATE PINGERS +kubectl $KUBECONFIG_ARG create ns connectivity-test || true +kubectl $KUBECONFIG_ARG apply -f pinger.yaml +sleep 5s +echo "waiting for pingers to be ready on a node labeled with 'connectivity-test=true'" +kubectl $KUBECONFIG_ARG wait --for=condition=Ready pod -n connectivity-test -l app=pinger --timeout=60s || { + kubectl $KUBECONFIG_ARG get node + echo "ERROR: pingers never ran. Make sure to label nodes with: kubectl label node connectivity-test=true" + exit 1 +} + +pingerNameIPs=(`kubectl $KUBECONFIG_ARG get pod -n connectivity-test -l app=pinger --field-selector=status.phase==Running -o jsonpath='{range .items[*]}{@.metadata.name}{","}{@.status.podIP}{" "}{end}'`) +pinger1NameIP=(`echo "${pingerNameIPs[0]}" | tr ',' ' '`) +pinger1=${pinger1NameIP[0]} +pinger1IP=${pinger1NameIP[1]} +echo "pinger1: $pinger1, IP: $pinger1IP" +pinger2NameIP=(`echo "${pingerNameIPs[1]}" | tr ',' ' '`) +pinger2=${pinger2NameIP[0]} +pinger2IP=${pinger2NameIP[1]} +echo "pinger2: $pinger2, IP: $pinger2IP" +if [[ -z $pinger1 || -z $pinger1IP || -z $pinger2 || -z $pinger2IP ]]; then + echo "ERROR: expected two pingers to be running with IPs. Exiting." 
+ exit 1 +fi + +## VERIFY CONNECTIVITY +connectFromPinger $pinger1 $pinger2IP || { + echo "ERROR: expected pinger1 to be able to connect to pinger2" + exit 2 +} + +connectFromPinger $pinger2 $pinger2 || { + echo "ERROR: expected pinger2 to be able to connect to pinger1" + exit 2 +} + +for i in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do + scalePod=${scalePods[$i]} + for j in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do + if [[ $i == $j ]]; then + continue + fi + + dstPod=${scalePods[$j]} + dstIP=${scalePodIPs[$j]} + connectFromScalePod $scalePod $dstIP || { + echo "ERROR: expected scale Pod $scalePod to be able to connect to scale Pod $dstPod" + exit 2 + } + done +done + +for i in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do + scalePod=${scalePods[$i]} + scalePodIP=${scalePodIPs[$i]} + + if [[ $INTER_NS_TRAFFIC_SHOULD_BE_BLOCKED == true ]]; then + connectFromScalePod $scalePod $pinger1IP && { + echo "ERROR: expected scale Pod $scalePod to NOT be able to connect to pinger1" + exit 2 + } + + connectFromPinger $pinger1 $scalePodIP && { + echo "ERROR: expected pinger1 to NOT be able to connect to scale Pod $scalePod" + exit 2 + } + else + connectFromScalePod $scalePod $pinger1IP || { + echo "ERROR: expected scale Pod $scalePod to be able to connect to pinger1" + exit 2 + } + + connectFromPinger $pinger1 $scalePodIP || { + echo "ERROR: expected pinger1 to be able to connect to scale Pod $scalePod" + exit 2 + } + fi +done + +echo +echo "FINISHED at $(date -u). Had started at $startDate." +echo From 74def51b6dd31d07c205891175d0250bd234fcc7 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Thu, 30 Mar 2023 17:44:35 -0700 Subject: [PATCH 05/14] parameterize scripts --- test/scale/connectivity/test-connectivity.sh | 121 +++++++++--- test/scale/scale-test.sh | 191 +++++++++++++------ 2 files changed, 226 insertions(+), 86 deletions(-) diff --git a/test/scale/connectivity/test-connectivity.sh b/test/scale/connectivity/test-connectivity.sh index c9a7b8ad87..d1101cd231 100755 --- a/test/scale/connectivity/test-connectivity.sh +++ b/test/scale/connectivity/test-connectivity.sh @@ -1,8 +1,62 @@ -## CONSTANTS & PARAMETERS -# KUBECONFIG_ARG="--kubeconfig ./config-03-21" -INTER_NS_TRAFFIC_SHOULD_BE_BLOCKED=true -# tests that N^2 connections are successful, and that 2N connections follow INTER_NS_TRAFFIC_SHOULD_BE_BLOCKED -NUM_SCALE_PODS_TO_VERIFY=10 +# exit on error +set -e + +printHelp() { + cat < [--kubeconfig=] + +Verifies that scale test Pods can connect to each other, but cannot connect to a new "pinger" Pod. + +USAGE: +1. Follow steps for test-scale.sh +2. Label a node to schedule "pinger" Pods: kubectl label node connectivity-test=true +3. Run this script + +EXIT CODES: +0 - success +7 - non-retriable error +8 - potentially retriable error +9 - retriable connectivity error +other - script exited from an unhandled error + +REQUIRED PARAMETERS: + --num-scale-pods-to-verify= number of scale Pods to test. Will verify that each scale Pod can connect to each other [(N-1)^2 connections] and that each Scale Pod cannot connect to a "pinger" Pod [2N connection attempts with a 3-second timeout] + +OPTIONAL PARAMETERS: + --kubeconfig= path to kubeconfig file +EOF +} + +## PARAMETERS +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + printHelp + exit 0 + ;; + --num-scale-pods-to-verify=*) + numScalePodsToVerify="${1#*=}" + ;; + --kubeconfig=*) + file=${1#*=} + KUBECONFIG_ARG="--kubeconfig $file" + test -f $file || { + echo "ERROR: kubeconfig not found: [$file]" + exit 7 + } + ;; + *) + echo "ERROR: unknown parameter $1. 
Make sure you're using '--key=value' for parameters with values" + exit 7 + ;; + esac + shift +done + +if [[ -z $numScalePodsToVerify ]]; then + echo "ERROR: --num-scale-pods-to-verify= is required" + exit 7 +fi ## HELPER FUNCTIONS connectFromPinger() { @@ -19,18 +73,25 @@ connectFromScalePod() { kubectl $KUBECONFIG_ARG exec -n scale-test $from -- /agnhost connect --timeout=3s $dstIP:80 } -## VALIDATE FILE +## VALIDATE test -f pinger.yaml || { echo "ERROR: change into the connectivity/ directory when running this script" - exit 1 + exit 7 } +if [[ -z `kubectl $KUBECONFIG_ARG get nodes -l connectivity-test=true | grep -v NAME` ]]; then + kubectl $KUBECONFIG_ARG get node + echo "ERROR: label a node with: kubectl label node connectivity-test=true" + exit 7 +fi + ## RUN set -e startDate=`date -u` echo "STARTING CONNECTIVITY TEST at $startDate" ## GET SCALE PODS +echo "getting scale Pods..." scalePodNameIPs=(`kubectl $KUBECONFIG_ARG get pods -n scale-test --field-selector=status.phase==Running -o jsonpath='{range .items[*]}{@.metadata.name}{","}{@.status.podIP}{" "}{end}'`) scalePods=() scalePodIPs=() @@ -45,36 +106,36 @@ for nameIP in "${scalePodNameIPs[@]}"; do if [[ -z $name || -z $ip ]]; then echo "ERROR: expected scale Pod name and IP to be non-empty" - exit 1 + exit 8 fi scalePods+=($name) scalePodIPs+=($ip) - if [[ ${#scalePods[@]} -eq $NUM_SCALE_PODS_TO_VERIFY ]]; then + if [[ ${#scalePods[@]} -eq $numScalePodsToVerify ]]; then break fi done -if [[ ${#scalePods[@]} == 0 ]]; then - echo "ERROR: expected namespace scale-test to exist with real (non-kwok) Pods. Run test/scale/scale-test.sh with real Pods first." - exit 1 -elif [[ ${#scalePods[@]} -lt $NUM_SCALE_PODS_TO_VERIFY ]]; then - echo "WARNING: seeing ${#scalePodNameIPs[@]} real scale Pods running which is less than NUM_SCALE_PODS_TO_VERIFY=$NUM_SCALE_PODS_TO_VERIFY" - NUM_SCALE_PODS_TO_VERIFY=${#scalePodNameIPs[@]} +numScalePodsFound=${#scalePods[@]} +if [[ $numScalePodsFound == 0 ]]; then + echo "ERROR: expected namespace scale-test to exist with real (non-kwok) Pods. Run test/scale/test-scale.sh with real Pods first." + exit 8 +elif [[ $numScalePodsFound -lt $numScalePodsToVerify ]]; then + echo "WARNING: there are only $numScalePodsFound real scale Pods running which is less than numScalePodsToVerify. Will verify just these $numScalePodsFound Pods" + numScalePodsToVerify=$numScalePodsFound else - echo "will verify connectivity to $NUM_SCALE_PODS_TO_VERIFY scale Pods" + echo "will verify connectivity to $numScalePodsToVerify scale Pods" fi ## CREATE PINGERS kubectl $KUBECONFIG_ARG create ns connectivity-test || true kubectl $KUBECONFIG_ARG apply -f pinger.yaml sleep 5s -echo "waiting for pingers to be ready on a node labeled with 'connectivity-test=true'" +echo "waiting for pingers to be ready..." kubectl $KUBECONFIG_ARG wait --for=condition=Ready pod -n connectivity-test -l app=pinger --timeout=60s || { - kubectl $KUBECONFIG_ARG get node - echo "ERROR: pingers never ran. Make sure to label nodes with: kubectl label node connectivity-test=true" - exit 1 + echo "ERROR: pingers never ran" + exit 8 } pingerNameIPs=(`kubectl $KUBECONFIG_ARG get pod -n connectivity-test -l app=pinger --field-selector=status.phase==Running -o jsonpath='{range .items[*]}{@.metadata.name}{","}{@.status.podIP}{" "}{end}'`) @@ -88,18 +149,18 @@ pinger2IP=${pinger2NameIP[1]} echo "pinger2: $pinger2, IP: $pinger2IP" if [[ -z $pinger1 || -z $pinger1IP || -z $pinger2 || -z $pinger2IP ]]; then echo "ERROR: expected two pingers to be running with IPs. Exiting." 
- exit 1 + exit 8 fi ## VERIFY CONNECTIVITY connectFromPinger $pinger1 $pinger2IP || { - echo "ERROR: expected pinger1 to be able to connect to pinger2" - exit 2 + echo "ERROR: expected pinger1 to be able to connect to pinger2. Pods may need more time to bootup" + exit 9 } connectFromPinger $pinger2 $pinger2 || { - echo "ERROR: expected pinger2 to be able to connect to pinger1" - exit 2 + echo "ERROR: expected pinger2 to be able to connect to pinger1. Pods may need more time to bootup" + exit 9 } for i in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do @@ -113,7 +174,7 @@ for i in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do dstIP=${scalePodIPs[$j]} connectFromScalePod $scalePod $dstIP || { echo "ERROR: expected scale Pod $scalePod to be able to connect to scale Pod $dstPod" - exit 2 + exit 9 } done done @@ -125,22 +186,22 @@ for i in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do if [[ $INTER_NS_TRAFFIC_SHOULD_BE_BLOCKED == true ]]; then connectFromScalePod $scalePod $pinger1IP && { echo "ERROR: expected scale Pod $scalePod to NOT be able to connect to pinger1" - exit 2 + exit 9 } connectFromPinger $pinger1 $scalePodIP && { echo "ERROR: expected pinger1 to NOT be able to connect to scale Pod $scalePod" - exit 2 + exit 9 } else connectFromScalePod $scalePod $pinger1IP || { echo "ERROR: expected scale Pod $scalePod to be able to connect to pinger1" - exit 2 + exit 9 } connectFromPinger $pinger1 $scalePodIP || { echo "ERROR: expected pinger1 to be able to connect to scale Pod $scalePod" - exit 2 + exit 9 } fi done diff --git a/test/scale/scale-test.sh b/test/scale/scale-test.sh index a296d6a230..1c7bb5e84e 100755 --- a/test/scale/scale-test.sh +++ b/test/scale/scale-test.sh @@ -1,39 +1,116 @@ -################################################################################################################################################################# -# This script will scale the number of pods, pod labels, and network policies in a cluster. -# It uses KWOK to create fake nodes and fake pods as needed. KWOK script must be running in another shell. -# It can also create real Pods on real VMs labeled with scale-test=true. -# It will NOT scale real nodes. -# -# USAGE: -# 1. Create AKS cluster with --uptime-sla and create any nodepools. -# 2. To schedule real Pods on a node: kubectl label node scale-test=true -# 3. Modify this script: set KUBECONFIG_ARG if desired or leave empty. -# 4. Modify this script: if not using NPM, set USING_NPM=false. -# 5. Modify this script: update parameter values. Check your VMs' --max-pod capacity and set maxRealPodsPerNode accordingly (leave wiggle room for system Pods). -# 6. If making KWOK Pods, run: ./run-kwok.sh -# 7. 
In another shell, run this script -################################################################################################################################################################# - -## CONSTANTS & PARAMETERS -# KUBECONFIG_ARG="--kubeconfig ./config-03-21" -USING_NPM=true -DEBUG_EXIT_AFTER_PRINTOUT=false -DEBUG_EXIT_AFTER_GENERATION=false - -maxKwokPodsPerNode=50 -numKwokDeployments=10 -numKwokReplicas=150 - -maxRealPodsPerNode=30 -numRealDeployments=10 -numRealReplicas=3 - -numSharedLabelsPerPod=3 # should be >= 3 for networkpolicy generation -numUniqueLabelsPerPod=1 # in Cilium, a value >= 1 results in every Pod having a unique identity (not recommended for scale) -numUniqueLabelsPerDeployment=2 - -# applied to every Pod -numNetworkPolicies=10 +# exit on error +set -e + +printHelp() { + cat < --num-kwok-deployments= --num-kwok-replicas= --max-real-pods-per-node= --num-real-deployments= --num-real-replicas= --num-network-policies= --num-unique-labels-per-pod= --num-unique-labels-per-deployment= --num-shared-labels-per-pod= [--kubeconfig=] [--using-npm] [--debug-exit-after-print-counts] [--debug-exit-after-generation] + +Scales the number of Pods, Pod labels, and NetworkPolicies in a cluster. +Uses KWOK to create fake nodes and fake pods as needed. +Can also schedule real Pods. It will NOT scale real nodes. + +USAGE: +1. Create AKS cluster with --uptime-sla and create any nodepools +2. If making KWOK Pods, run `run-kwok.sh` in the background +3. Label node(s) to schedule real Pods: kubectl label node scale-test=true +4. Run this script with args like number of Deployments, replicas, and NetworkPolicies + +SPECIAL NOTES: +1. Check notes on --max-real-pods-per-node +2. For Cilium, check notes on --num-unique-labels-per-pod +3. Check restrictions on --num-shared-labels-per-pod + +REQUIRED PARAMETERS: + --max-kwok-pods-per-node limit for fake kwok nodes. 50 works. Not sure if there's a limit + --num-kwok-deployments number of fake deployments + --num-kwok-replicas per fake deployment + --max-real-pods-per-node check your VMs' --max-pod capacity and set maxRealPodsPerNode accordingly (leave wiggle room for system Pods) + --num-real-deployments deployments scheduled on nodes labeled with scale-test=true + --num-real-replicas per deployment + --num-network-policies NetPols applied to every Pod + --num-unique-labels-per-pod creates labels specific to each Pod. Creates numTotalPods*numUniqueLabelsPerPod distinct labels. In Cilium, a value >= 1 results in every Pod having a unique identity (not recommended for scale) + --num-unique-labels-per-deployment create labels shared between replicas of a deployment. Creates numTotalDeployments*numUniqueLabelsPerDeployment distinct labels + --num-shared-labels-per-pod create labels shared between all Pods. Creates numSharedLabelsPerPod distinct labels. Must be >= 3 if numNetworkPolicies > 0 because of the way we generate network policies + +OPTIONAL PARAMETERS: + --kubeconfig path to kubeconfig file + --restart-npm make sure NPM exists and restart it before running scale test + --debug-exit-after-print-counts skip scale test. Just print out counts of things to be created and counts of IPSets/ACLs that NPM would create + --debug-exit-after-generation skip scale test. 
Exit after generating templates +EOF +} + +## PARAMETERS +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + printHelp + exit 0 + ;; + --max-kwok-pods-per-node=*) + maxKwokPodsPerNode="${1#*=}" + ;; + --num-kwok-deployments=*) + numKwokDeployments="${1#*=}" + ;; + --num-kwok-replicas=*) + numKwokReplicas="${1#*=}" + ;; + --max-real-pods-per-node=*) + maxRealPodsPerNode="${1#*=}" + ;; + --num-real-deployments=*) + numRealDeployments="${1#*=}" + ;; + --num-real-replicas=*) + numRealReplicas="${1#*=}" + ;; + --num-network-policies=*) + numNetworkPolicies="${1#*=}" + ;; + --num-unique-labels-per-pod=*) + numUniqueLabelsPerPod="${1#*=}" + ;; + --num-unique-labels-per-deployment=*) + numUniqueLabelsPerDeployment="${1#*=}" + ;; + --num-shared-labels-per-pod=*) + numSharedLabelsPerPod="${1#*=}" + ;; + --kubeconfig=*) + file=${1#*=} + KUBECONFIG_ARG="--kubeconfig $file" + test -f $file || { + echo "ERROR: kubeconfig not found: [$file]" + exit 1 + } + ;; + --restart-npm) + USING_NPM=true + ;; + --debug-exit-after-print-counts) + DEBUG_EXIT_AFTER_PRINT_COUNTS=true + ;; + --debug-exit-after-generation) + DEBUG_EXIT_AFTER_GENERATION=true + ;; + *) + echo "ERROR: unknown parameter $1. Make sure you're using '--key=value' for parameters with values" + exit 1 + ;; + esac + shift +done + +if [[ -z $maxKwokPodsPerNode || -z $numKwokDeployments || -z $numKwokReplicas || -z $maxRealPodsPerNode || -z $numRealDeployments || -z $numRealReplicas || -z $numNetworkPolicies || -z $numUniqueLabelsPerPod || -z $numUniqueLabelsPerDeployment || -z $numSharedLabelsPerPod ]]; then + echo "ERROR: missing required parameter. Check --help for usage" + exit 1 +fi + +if [[ $numNetworkPolicies -gt 0 && $numSharedLabelsPerPod -lt 3 ]]; then + echo "ERROR: numSharedLabelsPerPod must be >= 3 if numNetworkPolicies > 0 because of the way we generate network policies" + exit 1 +fi ## CALCULATIONS numKwokPods=$(( $numKwokDeployments * $numKwokReplicas )) @@ -44,7 +121,7 @@ numTotalPods=$(( $numKwokPods + $numRealPods )) ## NPM CALCULATIONS # unique to templates/networkpolicy.yaml -numACLsAddedByNPM=$(( 6 * $numNetworkPolicies )) +numACLsAddedByNPM=$(( 4 * $numNetworkPolicies )) # IPSet/member counts can be slight underestimates if there are more than one template-hash labels # 4 basic IPSets are [ns-scale-test,kubernetes.io/metadata.name:scale-test,template-hash:xxxx,app:scale-test] numIPSetsAddedByNPM=$(( 4 + 2*$numTotalPods*$numUniqueLabelsPerPod + 2*$numSharedLabelsPerPod + 2*($numKwokDeployments+$numRealDeployments)*$numUniqueLabelsPerDeployment )) @@ -52,7 +129,7 @@ numIPSetsAddedByNPM=$(( 4 + 2*$numTotalPods*$numUniqueLabelsPerPod + 2*$numShare # 5*pods members go to [ns-scale-test,kubernetes.io/metadata.name:scale-test,template-hash:xxxx,app:scale-test] numIPSetMembersAddedByNPM=$(( 3 + $numTotalPods*(5 + 2*$numUniqueLabelsPerPod + 2*$numSharedLabelsPerPod) + 2*($numKwokPods+$numRealPods)*$numUniqueLabelsPerDeployment )) -## PRINTOUT +## PRINT OUT COUNTS cat < "generated/kwok-nodes/node-$i.yaml" done +echo "Done generating yamls." + if [[ $DEBUG_EXIT_AFTER_GENERATION == true ]]; then echo "DEBUG: exiting after generation..." exit 0 @@ -165,21 +246,19 @@ fi ## DELETE PRIOR STATE echo "cleaning up previous scale test state..." -kubectl $KUBECONFIG_ARG delete ns scale-test && shouldRestartNPM=true +kubectl $KUBECONFIG_ARG delete ns scale-test --ignore-not-found kubectl $KUBECONFIG_ARG delete node -l type=kwok if [[ $USING_NPM == true ]]; then - if [[ $shouldRestartNPM == true ]]; then - echo "restarting NPM pods..." 
- kubectl $KUBECONFIG_ARG rollout restart -n kube-system ds azure-npm - kubectl $KUBECONFIG_ARG rollout restart -n kube-system ds azure-npm-win - echo "sleeping 3m to allow NPM pods to restart..." - sleep 1m - echo "2m remaining..." - sleep 1m - echo "1m remaining..." - sleep 1m - fi + echo "restarting NPM pods..." + kubectl $KUBECONFIG_ARG rollout restart -n kube-system ds azure-npm + kubectl $KUBECONFIG_ARG rollout restart -n kube-system ds azure-npm-win + echo "sleeping 3m to allow NPM pods to restart..." + sleep 1m + echo "2m remaining..." + sleep 1m + echo "1m remaining..." + sleep 1m echo "making sure NPM pods are running..." kubectl $KUBECONFIG_ARG get pod -n kube-system | grep Running | grep -v "azure-npm-win" | grep -oP "azure-npm-[a-z0-9]+" -m 1 From 1513a0cdfcd4baf20886ac795bc5682a27a667fd Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Thu, 30 Mar 2023 17:45:10 -0700 Subject: [PATCH 06/14] rename scale script and update readme --- test/scale/README.md | 2 +- test/scale/{scale-test.sh => test-scale.sh} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename test/scale/{scale-test.sh => test-scale.sh} (100%) diff --git a/test/scale/README.md b/test/scale/README.md index 2ce0702454..144c0ec3b2 100644 --- a/test/scale/README.md +++ b/test/scale/README.md @@ -7,5 +7,5 @@ KWOK saves time/resources, especially in Windows. ## Usage 1. Create AKS cluster with `--uptime-sla` and create any nodepools. 2. If making KWOK Pods, run `run-kwok.sh` in the background. -3. Scale with `scale-test.sh`. Specify number of Deployments, Pod replicas, NetworkPolicies, and labels for Pods. +3. Scale with `test-scale.sh`. Specify number of Deployments, Pod replicas, NetworkPolicies, and labels for Pods. 4. Test connectivity with `connectivity/test-connectivity.sh`. diff --git a/test/scale/scale-test.sh b/test/scale/test-scale.sh similarity index 100% rename from test/scale/scale-test.sh rename to test/scale/test-scale.sh From 6a59fa38175152d097c2ea21eb3119f87d613bc7 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Thu, 30 Mar 2023 17:57:21 -0700 Subject: [PATCH 07/14] clean up readme --- test/scale/README.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/test/scale/README.md b/test/scale/README.md index 144c0ec3b2..33eb3636f6 100644 --- a/test/scale/README.md +++ b/test/scale/README.md @@ -1,8 +1,12 @@ ## Overview -Scripts for scale testing our components with both real resources and fake resources via [KWOK](https://github.com/kubernetes-sigs/kwok). +Scripts for scale testing our components in AKS with fake and/or real resources. -### Why KWOK? -KWOK saves time/resources, especially in Windows. +### Fake Resources +Scripts can use [KWOK](https://github.com/kubernetes-sigs/kwok) to simulate running Pods. KWOK can instantly run thousands of fake VMs and Pods. + +This saves us from: +1. Large resource costs. +2. Hours waiting for VMs and Pods to bootup. ## Usage 1. Create AKS cluster with `--uptime-sla` and create any nodepools. 
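+
+For example, a typical invocation after this patch series looks like (illustrative
+parameter values; run `./test-scale.sh --help` for the full list of flags):
+
+    ./test-scale.sh --max-kwok-pods-per-node=50 --num-kwok-deployments=10 \
+        --num-kwok-replicas=150 --max-real-pods-per-node=30 --num-real-deployments=10 \
+        --num-real-replicas=3 --num-network-policies=10 --num-shared-labels-per-pod=3 \
+        --num-unique-labels-per-pod=0 --num-unique-labels-per-deployment=2 --restart-npm
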
From 631ce672482dd2c2473d7fc44bd162a556fcc0ff Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Thu, 30 Mar 2023 19:34:43 -0700 Subject: [PATCH 08/14] add NetPol after connectivity check --- test/scale/connectivity/allow-pinger.yaml | 22 +++ test/scale/connectivity/test-connectivity.sh | 137 ++++++++++++++----- test/scale/test-scale.sh | 3 +- 3 files changed, 126 insertions(+), 36 deletions(-) create mode 100644 test/scale/connectivity/allow-pinger.yaml diff --git a/test/scale/connectivity/allow-pinger.yaml b/test/scale/connectivity/allow-pinger.yaml new file mode 100644 index 0000000000..f3c9c096ed --- /dev/null +++ b/test/scale/connectivity/allow-pinger.yaml @@ -0,0 +1,22 @@ +kind: NetworkPolicy +apiVersion: networking.k8s.io/v1 +metadata: + name: allow-pinger + namespace: scale-test +spec: + podSelector: + matchLabels: + shared-lab-00001: val + ingress: + - from: + - namespaceSelector: + matchLabels: + app: pinger + egress: + - to: + - podSelector: + matchLabels: + app: pinger + policyTypes: + - Ingress + - Egress diff --git a/test/scale/connectivity/test-connectivity.sh b/test/scale/connectivity/test-connectivity.sh index d1101cd231..5f6cc84e40 100755 --- a/test/scale/connectivity/test-connectivity.sh +++ b/test/scale/connectivity/test-connectivity.sh @@ -1,11 +1,18 @@ # exit on error set -e +## CONSTANTS +# agnhost timeout in seconds +TIMEOUT=5 +# seconds to wait between failed connectivity checks after adding allow-pinger NetworkPolicy +NETPOL_SLEEP=5 + printHelp() { cat < [--kubeconfig=] +./test-connectivity.sh --num-scale-pods-to-verify= --max-wait-after-adding-netpol= [--kubeconfig=] Verifies that scale test Pods can connect to each other, but cannot connect to a new "pinger" Pod. +Then, adds a NetworkPolicy to allow traffic between the scale test Pods and the "pinger" Pod, and verifies connectivity. USAGE: 1. Follow steps for test-scale.sh @@ -14,13 +21,15 @@ USAGE: EXIT CODES: 0 - success -7 - non-retriable error -8 - potentially retriable error -9 - retriable connectivity error +6 - non-retriable error +7 - potentially retriable error +8 - retriable connectivity error +9 - connectivity failed after adding allow-pinger NetworkPolicy other - script exited from an unhandled error REQUIRED PARAMETERS: - --num-scale-pods-to-verify= number of scale Pods to test. Will verify that each scale Pod can connect to each other [(N-1)^2 connections] and that each Scale Pod cannot connect to a "pinger" Pod [2N connection attempts with a 3-second timeout] + --num-scale-pods-to-verify= number of scale Pods to test. Will verify that each scale Pod can connect to each other [(N-1)^2 connections] and that each Scale Pod cannot connect to a "pinger" Pod [2N connection attempts with a 3-second timeout] + --max-wait-after-adding-netpol= maximum time in seconds to wait for allowed connections after adding the allow-pinger NetworkPolicy OPTIONAL PARAMETERS: --kubeconfig= path to kubeconfig file @@ -37,52 +46,67 @@ while [[ $# -gt 0 ]]; do --num-scale-pods-to-verify=*) numScalePodsToVerify="${1#*=}" ;; + --max-wait-after-adding-netpol=*) + maxWaitAfterAddingNetpol="${1#*=}" + ;; --kubeconfig=*) file=${1#*=} KUBECONFIG_ARG="--kubeconfig $file" test -f $file || { echo "ERROR: kubeconfig not found: [$file]" - exit 7 + exit 6 } + echo "using kubeconfig: $file" ;; *) echo "ERROR: unknown parameter $1. 
Make sure you're using '--key=value' for parameters with values" - exit 7 + exit 6 ;; esac shift done -if [[ -z $numScalePodsToVerify ]]; then - echo "ERROR: --num-scale-pods-to-verify= is required" - exit 7 +if [[ -z $numScalePodsToVerify || -z $maxWaitAfterAddingNetpol ]]; then + echo "ERROR: missing required parameter. Check --help for usage" + exit 6 fi +## PRINT OUT ARGS +cat < connectivity-test=true" - exit 7 + exit 6 fi ## RUN @@ -106,7 +130,7 @@ for nameIP in "${scalePodNameIPs[@]}"; do if [[ -z $name || -z $ip ]]; then echo "ERROR: expected scale Pod name and IP to be non-empty" - exit 8 + exit 7 fi scalePods+=($name) @@ -120,7 +144,7 @@ done numScalePodsFound=${#scalePods[@]} if [[ $numScalePodsFound == 0 ]]; then echo "ERROR: expected namespace scale-test to exist with real (non-kwok) Pods. Run test/scale/test-scale.sh with real Pods first." - exit 8 + exit 7 elif [[ $numScalePodsFound -lt $numScalePodsToVerify ]]; then echo "WARNING: there are only $numScalePodsFound real scale Pods running which is less than numScalePodsToVerify. Will verify just these $numScalePodsFound Pods" numScalePodsToVerify=$numScalePodsFound @@ -135,7 +159,7 @@ sleep 5s echo "waiting for pingers to be ready..." kubectl $KUBECONFIG_ARG wait --for=condition=Ready pod -n connectivity-test -l app=pinger --timeout=60s || { echo "ERROR: pingers never ran" - exit 8 + exit 7 } pingerNameIPs=(`kubectl $KUBECONFIG_ARG get pod -n connectivity-test -l app=pinger --field-selector=status.phase==Running -o jsonpath='{range .items[*]}{@.metadata.name}{","}{@.status.podIP}{" "}{end}'`) @@ -149,18 +173,19 @@ pinger2IP=${pinger2NameIP[1]} echo "pinger2: $pinger2, IP: $pinger2IP" if [[ -z $pinger1 || -z $pinger1IP || -z $pinger2 || -z $pinger2IP ]]; then echo "ERROR: expected two pingers to be running with IPs. Exiting." - exit 8 + exit 7 fi ## VERIFY CONNECTIVITY +echo "verifying connectivity at $(date)..." connectFromPinger $pinger1 $pinger2IP || { echo "ERROR: expected pinger1 to be able to connect to pinger2. Pods may need more time to bootup" - exit 9 + exit 8 } connectFromPinger $pinger2 $pinger2 || { echo "ERROR: expected pinger2 to be able to connect to pinger1. Pods may need more time to bootup" - exit 9 + exit 8 } for i in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do @@ -174,7 +199,7 @@ for i in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do dstIP=${scalePodIPs[$j]} connectFromScalePod $scalePod $dstIP || { echo "ERROR: expected scale Pod $scalePod to be able to connect to scale Pod $dstPod" - exit 9 + exit 8 } done done @@ -183,29 +208,71 @@ for i in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do scalePod=${scalePods[$i]} scalePodIP=${scalePodIPs[$i]} - if [[ $INTER_NS_TRAFFIC_SHOULD_BE_BLOCKED == true ]]; then - connectFromScalePod $scalePod $pinger1IP && { - echo "ERROR: expected scale Pod $scalePod to NOT be able to connect to pinger1" - exit 9 - } + connectFromScalePod $scalePod $pinger1IP && { + echo "ERROR: expected scale Pod $scalePod to NOT be able to connect to pinger1" + exit 8 + } + + connectFromPinger $pinger1 $scalePodIP && { + echo "ERROR: expected pinger1 to NOT be able to connect to scale Pod $scalePod" + exit 8 + } +done + +echo "SUCCESS: all connectivity tests passed" + +## ADD NETWORK POLICY AND VERIFY CONNECTIVITY +set -x +echo "adding new NetworkPolicy to allow pingers at $(date)..." 
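+# allow-pinger.yaml (added in this patch) permits ingress/egress between scale-test
+# Pods labeled shared-lab-00001=val and the pinger Pods, so the checks below should
+# start succeeding once the policy is enforced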
+kubectl $KUBECONFIG_ARG apply -f allow-pinger.yaml + +netpolStart=`date +%s` +lastTry=false +while : ; do + success=true + for i in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do + scalePod=${scalePods[$i]} + scalePodIP=${scalePodIPs[$i]} - connectFromPinger $pinger1 $scalePodIP && { - echo "ERROR: expected pinger1 to NOT be able to connect to scale Pod $scalePod" - exit 9 - } - else connectFromScalePod $scalePod $pinger1IP || { - echo "ERROR: expected scale Pod $scalePod to be able to connect to pinger1" - exit 9 + echo "WARNING: expected scale Pod $scalePod to be able to connect to pinger1 after adding NetworkPolicy" + success=false + break } connectFromPinger $pinger1 $scalePodIP || { - echo "ERROR: expected pinger1 to be able to connect to scale Pod $scalePod" - exit 9 + echo "WARNING: expected pinger1 to be able to connect to scale Pod $scalePod after adding NetworkPolicy" + success=false + break } + done + + if [[ $success == true ]]; then + break + else + echo "will retry in ${NETPOL_SLEEP} seconds..." + sleep $NETPOL_SLEEP + fi + + # if reached max wait time, try once more. If that try fails, then quit + if [[ `date +%s` -gt $(( $netpolStart + $maxWaitAfterAddingNetpol )) ]]; then + if [[ $lastTry == true ]]; then + break + fi + + echo "WARNING: reached max wait time of $maxWaitAfterAddingNetpol seconds after adding allow-pinger NetworkPolicy. Will try one more time" + lastTry=true fi done +if [[ $success == false ]]; then + echo "ERROR: timed out after waiting $maxWaitAfterAddingNetpol seconds for allow-pinger NetworkPolicy to take effect" + exit 9 +fi + +timeDiff=$(( `date +%s` - $netpolStart )) +echo "SUCCESS: all connectivity tests passed after adding allow-pinger NetworkPolicy. Took between $(( $timeDiff - $NETPOL_SLEEP )) to $(( $timeDiff + $TIMEOUT )) seconds to take effect" + echo echo "FINISHED at $(date -u). Had started at $startDate." echo diff --git a/test/scale/test-scale.sh b/test/scale/test-scale.sh index 1c7bb5e84e..11ea7df454 100755 --- a/test/scale/test-scale.sh +++ b/test/scale/test-scale.sh @@ -84,6 +84,7 @@ while [[ $# -gt 0 ]]; do echo "ERROR: kubeconfig not found: [$file]" exit 1 } + echo "using kubeconfig: $file" ;; --restart-npm) USING_NPM=true @@ -246,7 +247,7 @@ fi ## DELETE PRIOR STATE echo "cleaning up previous scale test state..." 
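+# delete leftover namespaces (including the connectivity test) and kwok nodes so reruns start clean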
-kubectl $KUBECONFIG_ARG delete ns scale-test --ignore-not-found +kubectl $KUBECONFIG_ARG delete ns scale-test connectivity-test --ignore-not-found kubectl $KUBECONFIG_ARG delete node -l type=kwok if [[ $USING_NPM == true ]]; then From a0e5b58ac2f2508b1e74e6b226ec9d9d0084dd89 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Fri, 31 Mar 2023 10:48:53 -0700 Subject: [PATCH 09/14] retry connectivity loops --- test/scale/connectivity/test-connectivity.sh | 162 +++++++++++-------- 1 file changed, 99 insertions(+), 63 deletions(-) diff --git a/test/scale/connectivity/test-connectivity.sh b/test/scale/connectivity/test-connectivity.sh index 5f6cc84e40..77c7b9d928 100755 --- a/test/scale/connectivity/test-connectivity.sh +++ b/test/scale/connectivity/test-connectivity.sh @@ -4,12 +4,13 @@ set -e ## CONSTANTS # agnhost timeout in seconds TIMEOUT=5 +CONNECTIVITY_SLEEP=60 # seconds to wait between failed connectivity checks after adding allow-pinger NetworkPolicy NETPOL_SLEEP=5 printHelp() { cat < --max-wait-after-adding-netpol= [--kubeconfig=] +./test-connectivity.sh --num-scale-pods-to-verify= --max-wait-for-initial-connectivity= --max-wait-after-adding-netpol= [--kubeconfig=] Verifies that scale test Pods can connect to each other, but cannot connect to a new "pinger" Pod. Then, adds a NetworkPolicy to allow traffic between the scale test Pods and the "pinger" Pod, and verifies connectivity. @@ -28,8 +29,9 @@ EXIT CODES: other - script exited from an unhandled error REQUIRED PARAMETERS: - --num-scale-pods-to-verify= number of scale Pods to test. Will verify that each scale Pod can connect to each other [(N-1)^2 connections] and that each Scale Pod cannot connect to a "pinger" Pod [2N connection attempts with a 3-second timeout] - --max-wait-after-adding-netpol= maximum time in seconds to wait for allowed connections after adding the allow-pinger NetworkPolicy + --num-scale-pods-to-verify= number of scale Pods to test. Will verify that each scale Pod can connect to each other [(N-1)^2 connections] and that each Scale Pod cannot connect to a "pinger" Pod [2N connection attempts with a 3-second timeout] + --max-wait-for-initial-connectivity= maximum time in seconds to wait for initial connectivity after Pinger Pods are running + --max-wait-after-adding-netpol= maximum time in seconds to wait for allowed connections after adding the allow-pinger NetworkPolicy OPTIONAL PARAMETERS: --kubeconfig= path to kubeconfig file @@ -46,6 +48,9 @@ while [[ $# -gt 0 ]]; do --num-scale-pods-to-verify=*) numScalePodsToVerify="${1#*=}" ;; + --max-wait-for-initial-connectivity=*) + maxWaitForInitialConnectivity="${1#*=}" + ;; --max-wait-after-adding-netpol=*) maxWaitAfterAddingNetpol="${1#*=}" ;; @@ -80,6 +85,7 @@ maxWaitAfterAddingNetpol: $maxWaitAfterAddingNetpol TIMEOUT: $TIMEOUT NETPOL_SLEEP: $NETPOL_SLEEP + EOF ## HELPER FUNCTIONS @@ -177,101 +183,131 @@ if [[ -z $pinger1 || -z $pinger1IP || -z $pinger2 || -z $pinger2IP ]]; then fi ## VERIFY CONNECTIVITY -echo "verifying connectivity at $(date)..." -connectFromPinger $pinger1 $pinger2IP || { - echo "ERROR: expected pinger1 to be able to connect to pinger2. Pods may need more time to bootup" - exit 8 -} +verifyInitialConnectivity() { + connectFromPinger $pinger1 $pinger2IP || { + echo "ERROR: expected pinger1 to be able to connect to pinger2. Pods may need more time to bootup" + return 8 + } -connectFromPinger $pinger2 $pinger2 || { - echo "ERROR: expected pinger2 to be able to connect to pinger1. 
Pods may need more time to bootup" - exit 8 -} + connectFromPinger $pinger2 $pinger2 || { + echo "ERROR: expected pinger2 to be able to connect to pinger1. Pods may need more time to bootup" + return 8 + } -for i in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do - scalePod=${scalePods[$i]} - for j in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do - if [[ $i == $j ]]; then - continue - fi + for i in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do + scalePod=${scalePods[$i]} + for j in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do + if [[ $i == $j ]]; then + continue + fi + + dstPod=${scalePods[$j]} + dstIP=${scalePodIPs[$j]} + connectFromScalePod $scalePod $dstIP || { + echo "ERROR: expected scale Pod $scalePod to be able to connect to scale Pod $dstPod" + return 8 + } + done + done - dstPod=${scalePods[$j]} - dstIP=${scalePodIPs[$j]} - connectFromScalePod $scalePod $dstIP || { - echo "ERROR: expected scale Pod $scalePod to be able to connect to scale Pod $dstPod" - exit 8 + for i in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do + scalePod=${scalePods[$i]} + scalePodIP=${scalePodIPs[$i]} + + connectFromScalePod $scalePod $pinger1IP && { + echo "ERROR: expected scale Pod $scalePod to NOT be able to connect to pinger1" + return 8 + } + + connectFromPinger $pinger1 $scalePodIP && { + echo "ERROR: expected pinger1 to NOT be able to connect to scale Pod $scalePod" + return 8 } done -done +} -for i in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do - scalePod=${scalePods[$i]} - scalePodIP=${scalePodIPs[$i]} +echo "verifying initial connectivity at $(date)..." +connectivityStartDate=`date +%s` +maxWaitDate=$(( $connectivityStartDate + $maxWaitForInitialConnectivity )) +prevTryDate=$connectivityStartDate +while : ; do + verifyInitialConnectivity && break - connectFromScalePod $scalePod $pinger1IP && { - echo "ERROR: expected scale Pod $scalePod to NOT be able to connect to pinger1" - exit 8 - } + echo "WARNING: initial connectivity test failed. Retrying in $CONNECTIVITY_SLEEP seconds..." + sleep $CONNECTIVITY_SLEEP - connectFromPinger $pinger1 $scalePodIP && { - echo "ERROR: expected pinger1 to NOT be able to connect to scale Pod $scalePod" - exit 8 - } + # if reached max wait time, try once more. If that try fails, then quit + currDate=`date +%s` + if [[ $currDate -gt $maxWaitDate ]]; then + if [[ $prevTryDate -gt $maxWaitDate ]]; then + echo "ERROR: initial connectivity test timed out. Last try was at least $(( $prevTryDate - $connectivityStartDate )) seconds after pinger Pods began running" + exit 8 + fi + + echo "WARNING: reached max wait time of $maxWaitForInitialConnectivity seconds after pinger Pods began running. Will try one more time" + fi + + prevTryDate=$currDate done -echo "SUCCESS: all connectivity tests passed" +low=$connectivityStartDate +if [[ $prevTryDate -gt $connectivityStartDate ]]; then + low=$(( $prevTryDate - $CONNECTIVITY_SLEEP )) +fi +high=$(( `date +%s` - $connectivityStartDate )) +echo "SUCCESS: all initial connectivity tests passed. Took between $low and $high seconds to succeed" ## ADD NETWORK POLICY AND VERIFY CONNECTIVITY -set -x -echo "adding new NetworkPolicy to allow pingers at $(date)..." +echo "adding allow-pinger NetworkPolicy at $(date)..." 
 kubectl $KUBECONFIG_ARG apply -f allow-pinger.yaml
-netpolStart=`date +%s`
-lastTry=false
-while : ; do
-    success=true
-    for i in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do
+verifyNetPol() {
+    for i in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do
         scalePod=${scalePods[$i]}
         scalePodIP=${scalePodIPs[$i]}
         connectFromScalePod $scalePod $pinger1IP || {
             echo "WARNING: expected scale Pod $scalePod to be able to connect to pinger1 after adding NetworkPolicy"
-            success=false
-            break
+            return 9
         }
 
         connectFromPinger $pinger1 $scalePodIP || {
             echo "WARNING: expected pinger1 to be able to connect to scale Pod $scalePod after adding NetworkPolicy"
-            success=false
-            break
+            return 9
         }
     done
+}
 
-    if [[ $success == true ]]; then
-        break
-    else
-        echo "will retry in ${NETPOL_SLEEP} seconds..."
-        sleep $NETPOL_SLEEP
-    fi
+echo "verifying allow-pinger NetworkPolicy at $(date)..."
+netpolStartDate=`date +%s`
+maxWaitDate=$(( $netpolStartDate + $maxWaitAfterAddingNetpol ))
+prevTryDate=$netpolStartDate
+while : ; do
+    verifyNetPol && break
+
+    echo "WARNING: verifying allow-pinger NetworkPolicy failed. Retrying in $NETPOL_SLEEP seconds..."
+    sleep $NETPOL_SLEEP
 
     # if reached max wait time, try once more. If that try fails, then quit
-    if [[ `date +%s` -gt $(( $netpolStart + $maxWaitAfterAddingNetpol )) ]]; then
-        if [[ $lastTry == true ]]; then
-            break
+    currDate=`date +%s`
+    if [[ $currDate -gt $maxWaitDate ]]; then
+        if [[ $prevTryDate -gt $maxWaitDate ]]; then
+            echo "ERROR: allow-pinger NetworkPolicy has not taken effect. Last try was at least $(( $prevTryDate - $netpolStartDate )) seconds after creating allow-pinger NetworkPolicy"
+            exit 9
         fi
 
         echo "WARNING: reached max wait time of $maxWaitAfterAddingNetpol seconds after adding allow-pinger NetworkPolicy. Will try one more time"
-        lastTry=true
     fi
+
+    prevTryDate=$currDate
 done
 
-if [[ $success == false ]]; then
-    echo "ERROR: timed out after waiting $maxWaitAfterAddingNetpol seconds for allow-pinger NetworkPolicy to take effect"
-    exit 9
+low=$netpolStartDate
+if [[ $prevTryDate -gt $netpolStartDate ]]; then
+    low=$(( $prevTryDate - $NETPOL_SLEEP ))
 fi
-
-timeDiff=$(( `date +%s` - $netpolStart ))
-echo "SUCCESS: all connectivity tests passed after adding allow-pinger NetworkPolicy. Took between $(( $timeDiff - $NETPOL_SLEEP )) to $(( $timeDiff + $TIMEOUT )) seconds to take effect"
+high=$(( `date +%s` - $netpolStartDate ))
+echo "SUCCESS: all connectivity tests passed after adding allow-pinger NetworkPolicy. Took between $low and $high seconds to take effect"
 
 echo
 echo "FINISHED at $(date -u). Had started at $startDate."
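
Note on the retry pattern: the initial-connectivity loop and the NetworkPolicy loop added in PATCH 09 share the same "retry until a deadline, then allow one last try" shape. Below is a minimal standalone sketch of that pattern, assuming only plain bash; the function name retryUntilDeadline and its arguments are illustrative and do not exist in the repo's scripts.

    # Retry $1 (a command or function name) every $3 seconds until it succeeds.
    # Once the $2-second deadline passes, permit exactly one more attempt,
    # so a try that merely finished late does not count as the final failure.
    retryUntilDeadline() {
        local check=$1 maxWaitSeconds=$2 sleepSeconds=$3
        local startDate=`date +%s`
        local maxWaitDate=$(( $startDate + $maxWaitSeconds ))
        local prevTryDate=$startDate
        while : ; do
            $check && return 0
            echo "WARNING: $check failed. Retrying in $sleepSeconds seconds..."
            sleep $sleepSeconds
            local currDate=`date +%s`
            if [[ $currDate -gt $maxWaitDate ]]; then
                # the previous try itself began after the deadline, so give up
                if [[ $prevTryDate -gt $maxWaitDate ]]; then
                    return 1
                fi
                echo "WARNING: reached max wait time of $maxWaitSeconds seconds. Will try one more time"
            fi
            prevTryDate=$currDate
        done
    }

    # usage mirroring the loops above, e.g.:
    #   retryUntilDeadline verifyNetPol "$maxWaitAfterAddingNetpol" "$NETPOL_SLEEP" || exit 9

The extra attempt matters because the sleep can overshoot the deadline: a check that started before the deadline expired is still honored, and only a check that begins after the deadline counts as the final failure.
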
From 5b08d2ca752b6299d617e43250a5903753756aee Mon Sep 17 00:00:00 2001
From: Hunter Gregory
Date: Fri, 31 Mar 2023 16:52:34 -0700
Subject: [PATCH 10/14] fix connectivity test script and netpol

---
 test/scale/connectivity/allow-pinger.yaml    |  8 ++++-
 test/scale/connectivity/test-connectivity.sh | 31 ++++++++++++--------
 test/scale/run-kwok.sh                       |  6 ++--
 test/scale/test-scale.sh                     |  2 +-
 4 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/test/scale/connectivity/allow-pinger.yaml b/test/scale/connectivity/allow-pinger.yaml
index f3c9c096ed..b2511ae78a 100644
--- a/test/scale/connectivity/allow-pinger.yaml
+++ b/test/scale/connectivity/allow-pinger.yaml
@@ -9,14 +9,20 @@ spec:
       shared-lab-00001: val
   ingress:
   - from:
-    - namespaceSelector:
+    - podSelector:
         matchLabels:
           app: pinger
+      namespaceSelector:
+        matchLabels:
+          kubernetes.io/metadata.name: connectivity-test
   egress:
   - to:
     - podSelector:
         matchLabels:
           app: pinger
+      namespaceSelector:
+        matchLabels:
+          kubernetes.io/metadata.name: connectivity-test
   policyTypes:
   - Ingress
  - Egress
diff --git a/test/scale/connectivity/test-connectivity.sh b/test/scale/connectivity/test-connectivity.sh
index 77c7b9d928..75ab9256ff 100755
--- a/test/scale/connectivity/test-connectivity.sh
+++ b/test/scale/connectivity/test-connectivity.sh
@@ -20,14 +20,6 @@ USAGE:
 2. Label a node to schedule "pinger" Pods: kubectl label node connectivity-test=true
 3. Run this script
 
-EXIT CODES:
-0 - success
-6 - non-retriable error
-7 - potentially retriable error
-8 - retriable connectivity error
-9 - connectivity failed after adding allow-pinger NetworkPolicy
-other - script exited from an unhandled error
-
 REQUIRED PARAMETERS:
     --num-scale-pods-to-verify=<int>               number of scale Pods to test. Will verify that each scale Pod can connect to each other [(N-1)^2 connections] and that each Scale Pod cannot connect to a "pinger" Pod [2N connection attempts with a 3-second timeout]
     --max-wait-for-initial-connectivity=<int>      maximum time in seconds to wait for initial connectivity after Pinger Pods are running
     --max-wait-after-adding-netpol=<int>           maximum time in seconds to wait for allowed connections after adding the allow-pinger NetworkPolicy
 
 OPTIONAL PARAMETERS:
     --kubeconfig=<filepath>                        path to kubeconfig file
+
+EXIT CODES:
+0 - success
+6 - non-retriable error
+7 - potentially retriable error while getting Pods/IPs
+8 - failed on initial connectivity test
+9 - failed after adding allow-pinger NetworkPolicy
+other - script exited from an unhandled error
+
 EOF
 }
 
@@ -152,7 +153,7 @@ if [[ $numScalePodsFound == 0 ]]; then
     echo "ERROR: expected namespace scale-test to exist with real (non-kwok) Pods. Run test/scale/test-scale.sh with real Pods first."
     exit 7
 elif [[ $numScalePodsFound -lt $numScalePodsToVerify ]]; then
-    echo "WARNING: there are only $numScalePodsFound real scale Pods running which is less than numScalePodsToVerify. Will verify just these $numScalePodsFound Pods"
+    echo "WARNING: there are only $numScalePodsFound real scale Pods running which is less than numScalePodsToVerify=$numScalePodsToVerify. Will verify just these $numScalePodsFound Pods"
     numScalePodsToVerify=$numScalePodsFound
 else
     echo "will verify connectivity to $numScalePodsToVerify scale Pods"
@@ -224,6 +225,8 @@ verifyInitialConnectivity() {
             return 8
         }
     done
+
+    return 0
 }
 
 echo "verifying initial connectivity at $(date)..."
@@ -250,9 +253,9 @@ while : ; do
     prevTryDate=$currDate
 done
 
-low=$connectivityStartDate
+low=0
 if [[ $prevTryDate -gt $connectivityStartDate ]]; then
-    low=$(( $prevTryDate - $CONNECTIVITY_SLEEP ))
+    low=$(( `date +%s` - $prevTryDate - $CONNECTIVITY_SLEEP ))
 fi
 high=$(( `date +%s` - $connectivityStartDate ))
 echo "SUCCESS: all initial connectivity tests passed. Took between $low and $high seconds to succeed"
@@ -276,6 +279,8 @@ verifyNetPol() {
             return 9
         }
     done
+
+    return 0
 }
 
 echo "verifying allow-pinger NetworkPolicy at $(date)..."
@@ -302,9 +307,9 @@ while : ; do
     prevTryDate=$currDate
 done
 
-low=$netpolStartDate
+low=0
 if [[ $prevTryDate -gt $netpolStartDate ]]; then
-    low=$(( $prevTryDate - $NETPOL_SLEEP ))
+    low=$(( `date +%s` - $prevTryDate - $NETPOL_SLEEP ))
 fi
 high=$(( `date +%s` - $netpolStartDate ))
 echo "SUCCESS: all connectivity tests passed after adding allow-pinger NetworkPolicy. Took between $low and $high seconds to take effect"
diff --git a/test/scale/run-kwok.sh b/test/scale/run-kwok.sh
index 48ca902401..c8edcd6e13 100755
--- a/test/scale/run-kwok.sh
+++ b/test/scale/run-kwok.sh
@@ -1,6 +1,6 @@
-#####################################################################################
-# This script is used to schedule kwok nodes/pods and maintain kwok node heartbeats.
-#####################################################################################
+######################################################################################
+# This script is used to schedule kwok nodes/pods and maintain kwok node heartbeats. #
+######################################################################################
 
 INSTALL_KWOK=false
 # KWOK_LATEST_RELEASE=$(curl "https://api.github.com/repos/${KWOK_REPO}/releases/latest" | jq -r '.tag_name')
diff --git a/test/scale/test-scale.sh b/test/scale/test-scale.sh
index 11ea7df454..3d3db7f266 100755
--- a/test/scale/test-scale.sh
+++ b/test/scale/test-scale.sh
@@ -241,7 +241,7 @@ echo "checking if there are enough real nodes..."
 numRealNodes=$(kubectl $KUBECONFIG_ARG get nodes -l scale-test=true | grep -v NAME | wc -l)
 if [[ $numRealNodes -lt $numRealNodesRequired ]]; then
     kubectl $KUBECONFIG_ARG get nodes
-    echo "ERROR: need $numRealNodesRequired real nodes to achieve a scale of $numRealPods real Pods. Make sure to label nodes with: kubectl label node scale-test=true."
+    echo "ERROR: need $numRealNodesRequired real nodes to achieve a scale of $numRealPods real Pods. Make sure to label nodes with: kubectl label node scale-test=true"
     exit 1
 fi

From a193e5f9f9448a06070ed6b49c7fdc2de137206e Mon Sep 17 00:00:00 2001
From: Hunter Gregory
Date: Fri, 31 Mar 2023 16:59:46 -0700
Subject: [PATCH 11/14] script to capture cpu/mem

---
 test/scale/utils/capture-cpu-and-mem.sh | 48 +++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100755 test/scale/utils/capture-cpu-and-mem.sh

diff --git a/test/scale/utils/capture-cpu-and-mem.sh b/test/scale/utils/capture-cpu-and-mem.sh
new file mode 100755
index 0000000000..4102482910
--- /dev/null
+++ b/test/scale/utils/capture-cpu-and-mem.sh
@@ -0,0 +1,48 @@
+#####################################################################################
+# Periodically captures CPU/Memory of Pods/nodes and writes to csvs. #
# +##################################################################################### +APPEND_TO_EXISTING_FILES=true + +FOLDER="captures" +RUNNING_PODS_FILE=$FOLDER/cpu-and-mem-running-pods.out +POD_MEM_CSV=$FOLDER/cpu-and-mem-pod-results.csv +NODE_MEM_CSV=$FOLDER/cpu-and-mem-node-results.csv + +# kubectl top seems to refresh every minute +SLEEP_BETWEEN_CAPTURES=65 + +## RUN +mkdir -p $FOLDER + +if [[ $APPEND_TO_EXISTING_FILES != true ]]; then + if [[ -f $RUNNING_PODS_FILE || -f $POD_MEM_CSV || -f $ NODE_MEM_CSV ]]; then + echo "ERROR: $RUNNING_PODS_FILE, $POD_MEM_CSV, or $NODE_MEM_CSV already exists. Either 1) set APPEND_TO_EXISTING_FILES=true or 2) move the old files" + exit 1 + fi + + echo "time,pod,cpu,mem" > $POD_MEM_CSV + echo "time,node,cpu,cpuPercent,mem,memPercent" > $NODE_MEM_CSV +fi + +while true; do + currentTime=`date -u` + echo "running k top pod" + lines=`kubectl top pod -A | grep -v NAME | grep -v kwok | awk '{$1=$1;print}' | tr ' ' ',' | tr -d 'm' | tr -d 'Mi'` + for line in $lines; do + echo "$currentTime,$line" >> $POD_MEM_CSV + done + + currentTime=`date -u` + echo "running k top node" + lines=`kubectl top node | grep -v NAME | grep -v kwok | awk '{$1=$1;print}' | tr ' ' ',' | tr -d 'm' | tr -d 'Mi' | tr -d '%'` + for line in $lines; do + echo "$currentTime,$line" >> $NODE_MEM_CSV + done + + echo `date -u` >> $RUNNING_PODS_FILE + kubectl get pod -A -owide | npm >> $RUNNING_PODS_FILE + echo " " >> $RUNNING_PODS_FILE + + echo "sleeping $SLEEP_BETWEEN_CAPTURES seconds" + sleep $SLEEP_BETWEEN_CAPTURES +done From 59303b77227830a38cfd17ed52ff4426d878ed1c Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Mon, 3 Apr 2023 17:43:08 -0700 Subject: [PATCH 12/14] fix typo in help --- test/scale/test-scale.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/scale/test-scale.sh b/test/scale/test-scale.sh index 3d3db7f266..ce87775e56 100755 --- a/test/scale/test-scale.sh +++ b/test/scale/test-scale.sh @@ -3,7 +3,7 @@ set -e printHelp() { cat < --num-kwok-deployments= --num-kwok-replicas= --max-real-pods-per-node= --num-real-deployments= --num-real-replicas= --num-network-policies= --num-unique-labels-per-pod= --num-unique-labels-per-deployment= --num-shared-labels-per-pod= [--kubeconfig=] [--using-npm] [--debug-exit-after-print-counts] [--debug-exit-after-generation] +./test-scale.sh --max-kwok-pods-per-node= --num-kwok-deployments= --num-kwok-replicas= --max-real-pods-per-node= --num-real-deployments= --num-real-replicas= --num-network-policies= --num-unique-labels-per-pod= --num-unique-labels-per-deployment= --num-shared-labels-per-pod= [--kubeconfig=] [--restart-npm] [--debug-exit-after-print-counts] [--debug-exit-after-generation] Scales the number of Pods, Pod labels, and NetworkPolicies in a cluster. Uses KWOK to create fake nodes and fake pods as needed. From 985ca547fa360828cd6a908330c6915c701b9051 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Wed, 5 Apr 2023 13:49:29 -0700 Subject: [PATCH 13/14] kwok kubeconfig --- test/scale/run-kwok.sh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/test/scale/run-kwok.sh b/test/scale/run-kwok.sh index c8edcd6e13..f1118ae320 100755 --- a/test/scale/run-kwok.sh +++ b/test/scale/run-kwok.sh @@ -1,12 +1,21 @@ ###################################################################################### # This script is used to schedule kwok nodes/pods and maintain kwok node heartbeats. 
 ######################################################################################
-
 INSTALL_KWOK=false
 # KWOK_LATEST_RELEASE=$(curl "https://api.github.com/repos/${KWOK_REPO}/releases/latest" | jq -r '.tag_name')
 KWOK_VERSION=${KWOK_LATEST_RELEASE:-"v0.1.1"}
 # kubeconfig arg doesn't seem to work for kwok. It seems to just use current context of the default kubeconfig.
-KUBECONFIG=~/.kube/config
+
+# specify kubeconfig file as first arg if you want
+if [[ $1 != "" ]]; then
+    file=$1
+    test -f $file || {
+        echo "ERROR: KUBECONFIG=$file does not exist"
+        exit 1
+    }
+
+    KUBECONFIG_ARG="--kubeconfig $file"
+fi
 
 if [[ INSTALL_KWOK == true ]]; then
     wget -O kwokctl -c "https://github.com/kubernetes-sigs/kwok/releases/download/${KWOK_VERSION}/kwokctl-$(go env GOOS)-$(go env GOARCH)"
@@ -18,7 +27,7 @@ if [[ INSTALL_KWOK == true ]]; then
     sudo mv kwok /usr/local/bin/kwok
 fi
 
-kwok --kubeconfig $KUBECONFIG \
+kwok $KUBECONFIG_ARG \
     --cidr=155.0.0.0/16 \
     --node-ip=155.0.0.1 \
     --manage-all-nodes=false \

From d92a58e7bd116346807df1a36a0a30df10b5a09c Mon Sep 17 00:00:00 2001
From: Hunter Gregory
Date: Wed, 5 Apr 2023 13:52:32 -0700
Subject: [PATCH 14/14] fix cpu and mem capture

---
 test/scale/utils/capture-cpu-and-mem.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/scale/utils/capture-cpu-and-mem.sh b/test/scale/utils/capture-cpu-and-mem.sh
index 4102482910..6c24bac6c7 100755
--- a/test/scale/utils/capture-cpu-and-mem.sh
+++ b/test/scale/utils/capture-cpu-and-mem.sh
@@ -15,7 +15,7 @@ SLEEP_BETWEEN_CAPTURES=65
 mkdir -p $FOLDER
 
 if [[ $APPEND_TO_EXISTING_FILES != true ]]; then
-    if [[ -f $RUNNING_PODS_FILE || -f $POD_MEM_CSV || -f $ NODE_MEM_CSV ]]; then
+    if [[ -f $RUNNING_PODS_FILE || -f $POD_MEM_CSV || -f $NODE_MEM_CSV ]]; then
         echo "ERROR: $RUNNING_PODS_FILE, $POD_MEM_CSV, or $NODE_MEM_CSV already exists. Either 1) set APPEND_TO_EXISTING_FILES=true or 2) move the old files"
         exit 1
     fi
@@ -27,20 +27,20 @@ fi
 while true; do
     currentTime=`date -u`
     echo "running k top pod"
-    lines=`kubectl top pod -A | grep -v NAME | grep -v kwok | awk '{$1=$1;print}' | tr ' ' ',' | tr -d 'm' | tr -d 'Mi'`
+    lines=`kubectl top pod -A | grep -v NAME | grep -v kwok | awk '{$1=$1;print}' | tr ' ' ','`
     for line in $lines; do
         echo "$currentTime,$line" >> $POD_MEM_CSV
     done
 
     currentTime=`date -u`
     echo "running k top node"
-    lines=`kubectl top node | grep -v NAME | grep -v kwok | awk '{$1=$1;print}' | tr ' ' ',' | tr -d 'm' | tr -d 'Mi' | tr -d '%'`
+    lines=`kubectl top node | grep -v NAME | grep -v kwok | awk '{$1=$1;print}' | tr ' ' ','`
     for line in $lines; do
         echo "$currentTime,$line" >> $NODE_MEM_CSV
     done
 
     echo `date -u` >> $RUNNING_PODS_FILE
-    kubectl get pod -A -owide | npm >> $RUNNING_PODS_FILE
+    kubectl get pod -A -owide | grep npm >> $RUNNING_PODS_FILE
     echo " " >> $RUNNING_PODS_FILE
 
     echo "sleeping $SLEEP_BETWEEN_CAPTURES seconds"
     sleep $SLEEP_BETWEEN_CAPTURES
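
Note on consuming the captures: after PATCH 14, the pod CSV keeps kubectl top's raw unit suffixes (e.g. "13m" CPU, "150Mi" memory). Below is a hedged sketch for pulling peak memory per pod out of that file, assuming GNU awk/sort and that `kubectl top pod -A` rows land in the CSV as time,namespace,pod,cpu,mem; the file path matches the defaults in capture-cpu-and-mem.sh.

    # peak memory per pod, highest first; $(NF-2) is the pod column, $NF the memory column
    awk -F, 'NR > 1 {
        mem = $NF; sub(/Mi$/, "", mem)                  # strip the Mi suffix for comparison
        if (mem + 0 > peak[$(NF-2)] + 0) peak[$(NF-2)] = mem
    } END {
        for (pod in peak) print pod, peak[pod] "Mi"
    }' captures/cpu-and-mem-pod-results.csv | sort -rn -k2 | head

sort -rn reads the numeric prefix of values like 150Mi, so the listing comes out ordered by peak memory; adjust the field indexes if the CSV columns are ever extended.
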