diff --git a/.gitignore b/.gitignore
index 5197febb92..6ecb9304d9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,3 +32,6 @@ npm/debug/http
 *.srl
 go.work*
+
+# scale-test
+test/scale/generated/*
diff --git a/test/scale/README.md b/test/scale/README.md
new file mode 100644
index 0000000000..33eb3636f6
--- /dev/null
+++ b/test/scale/README.md
@@ -0,0 +1,15 @@
+## Overview
+Scripts for scale testing our components in AKS with fake and/or real resources.
+
+### Fake Resources
+Scripts can use [KWOK](https://github.com/kubernetes-sigs/kwok) to simulate running Pods. KWOK can instantly run thousands of fake VMs and Pods.
+
+This saves us from:
+1. Large resource costs.
+2. Hours waiting for VMs and Pods to boot up.
+
+## Usage
+1. Create an AKS cluster with `--uptime-sla` and create any nodepools.
+2. If making KWOK Pods, run `run-kwok.sh` in the background.
+3. Scale with `test-scale.sh`. Specify the number of Deployments, Pod replicas, NetworkPolicies, and labels for Pods.
+4. Test connectivity with `connectivity/test-connectivity.sh`.
diff --git a/test/scale/connectivity/allow-pinger.yaml b/test/scale/connectivity/allow-pinger.yaml
new file mode 100644
index 0000000000..b2511ae78a
--- /dev/null
+++ b/test/scale/connectivity/allow-pinger.yaml
@@ -0,0 +1,28 @@
+kind: NetworkPolicy
+apiVersion: networking.k8s.io/v1
+metadata:
+  name: allow-pinger
+  namespace: scale-test
+spec:
+  podSelector:
+    matchLabels:
+      shared-lab-00001: val
+  ingress:
+  - from:
+    - podSelector:
+        matchLabels:
+          app: pinger
+      namespaceSelector:
+        matchLabels:
+          kubernetes.io/metadata.name: connectivity-test
+  egress:
+  - to:
+    - podSelector:
+        matchLabels:
+          app: pinger
+      namespaceSelector:
+        matchLabels:
+          kubernetes.io/metadata.name: connectivity-test
+  policyTypes:
+  - Ingress
+  - Egress
diff --git a/test/scale/connectivity/pinger.yaml b/test/scale/connectivity/pinger.yaml
new file mode 100644
index 0000000000..17a7d9662a
--- /dev/null
+++ b/test/scale/connectivity/pinger.yaml
@@ -0,0 +1,33 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: pinger
+  namespace: connectivity-test
+  labels:
+    app: pinger
+spec:
+  replicas: 2
+  selector:
+    matchLabels:
+      app: pinger
+  template:
+    metadata:
+      labels:
+        app: pinger
+    spec:
+      nodeSelector:
+        connectivity-test: "true"
+      containers:
+      - command:
+        - /agnhost
+        - serve-hostname
+        - --tcp
+        - --http=false
+        - --port
+        - "80"
+        image: k8s.gcr.io/e2e-test-images/agnhost:2.33
+        imagePullPolicy: IfNotPresent
+        name: cont-80-tcp
+        ports:
+        - containerPort: 80
+          protocol: TCP
diff --git a/test/scale/connectivity/test-connectivity.sh b/test/scale/connectivity/test-connectivity.sh
new file mode 100755
index 0000000000..75ab9256ff
--- /dev/null
+++ b/test/scale/connectivity/test-connectivity.sh
@@ -0,0 +1,319 @@
+# exit on error
+set -e
+
+## CONSTANTS
+# agnhost timeout in seconds
+TIMEOUT=5
+# seconds to wait between failed initial connectivity checks
+CONNECTIVITY_SLEEP=60
+# seconds to wait between failed connectivity checks after adding allow-pinger NetworkPolicy
+NETPOL_SLEEP=5
+
+printHelp() {
+    cat <<EOF
+./test-connectivity.sh --num-scale-pods-to-verify=<int> --max-wait-for-initial-connectivity=<int> --max-wait-after-adding-netpol=<int> [--kubeconfig=<path>]
+
+Verifies that scale test Pods can connect to each other, but cannot connect to a new "pinger" Pod.
+Then, adds a NetworkPolicy to allow traffic between the scale test Pods and the "pinger" Pod, and verifies connectivity.
+
+USAGE:
+1. Follow steps for test-scale.sh
+2. Label a node to schedule "pinger" Pods: kubectl label node <name> connectivity-test=true
+3. Run this script
+
+REQUIRED PARAMETERS:
+    --num-scale-pods-to-verify=<int>               number of scale Pods to test.
+                                                   Will verify that each scale Pod can connect to each other [N*(N-1) connections] and that each scale Pod cannot connect to a "pinger" Pod [2N connection attempts with a $TIMEOUT-second timeout]
+    --max-wait-for-initial-connectivity=<int>      maximum time in seconds to wait for initial connectivity after pinger Pods are running
+    --max-wait-after-adding-netpol=<int>           maximum time in seconds to wait for allowed connections after adding the allow-pinger NetworkPolicy
+
+OPTIONAL PARAMETERS:
+    --kubeconfig=<path>                            path to kubeconfig file
+
+EXIT CODES:
+0 - success
+6 - non-retriable error
+7 - potentially retriable error while getting Pods/IPs
+8 - failed on initial connectivity test
+9 - failed after adding allow-pinger NetworkPolicy
+other - script exited from an unhandled error
+
+EOF
+}
+
+## PARAMETERS
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -h|--help)
+            printHelp
+            exit 0
+            ;;
+        --num-scale-pods-to-verify=*)
+            numScalePodsToVerify="${1#*=}"
+            ;;
+        --max-wait-for-initial-connectivity=*)
+            maxWaitForInitialConnectivity="${1#*=}"
+            ;;
+        --max-wait-after-adding-netpol=*)
+            maxWaitAfterAddingNetpol="${1#*=}"
+            ;;
+        --kubeconfig=*)
+            file=${1#*=}
+            KUBECONFIG_ARG="--kubeconfig $file"
+            test -f $file || {
+                echo "ERROR: kubeconfig not found: [$file]"
+                exit 6
+            }
+            echo "using kubeconfig: $file"
+            ;;
+        *)
+            echo "ERROR: unknown parameter $1. Make sure you're using '--key=value' for parameters with values"
+            exit 6
+            ;;
+    esac
+    shift
+done
+
+if [[ -z $numScalePodsToVerify || -z $maxWaitForInitialConnectivity || -z $maxWaitAfterAddingNetpol ]]; then
+    echo "ERROR: missing required parameter. Check --help for usage"
+    exit 6
+fi
+
+## PRINT OUT ARGS
+cat <<EOF
+numScalePodsToVerify: $numScalePodsToVerify
+maxWaitForInitialConnectivity: $maxWaitForInitialConnectivity
+maxWaitAfterAddingNetpol: $maxWaitAfterAddingNetpol
+EOF
+
+## VALIDATE PINGER NODE
+numPingerNodes=`kubectl $KUBECONFIG_ARG get nodes -l connectivity-test=true | grep -v NAME | wc -l`
+if [[ $numPingerNodes == 0 ]]; then
+    echo "ERROR: no node is labeled for pinger Pods. Label a node with: kubectl label node <name> connectivity-test=true"
+    exit 6
+fi
+
+## RUN
+set -e
+startDate=`date -u`
+echo "STARTING CONNECTIVITY TEST at $startDate"
+
+## GET SCALE PODS
+echo "getting scale Pods..."
+scalePodNameIPs=(`kubectl $KUBECONFIG_ARG get pods -n scale-test --field-selector=status.phase==Running -o jsonpath='{range .items[*]}{@.metadata.name}{","}{@.status.podIP}{" "}{end}'`)
+scalePods=()
+scalePodIPs=()
+for nameIP in "${scalePodNameIPs[@]}"; do
+    nameIP=(`echo $nameIP | tr ',' ' '`)
+    name=${nameIP[0]}
+    ip=${nameIP[1]}
+
+    echo $name | grep real-dep || continue
+
+    echo "scale Pod: $name, IP: $ip"
+
+    if [[ -z $name || -z $ip ]]; then
+        echo "ERROR: expected scale Pod name and IP to be non-empty"
+        exit 7
+    fi
+
+    scalePods+=($name)
+    scalePodIPs+=($ip)
+
+    if [[ ${#scalePods[@]} -eq $numScalePodsToVerify ]]; then
+        break
+    fi
+done
+
+numScalePodsFound=${#scalePods[@]}
+if [[ $numScalePodsFound == 0 ]]; then
+    echo "ERROR: expected namespace scale-test to exist with real (non-kwok) Pods. Run test/scale/test-scale.sh with real Pods first."
+    exit 7
+elif [[ $numScalePodsFound -lt $numScalePodsToVerify ]]; then
+    echo "WARNING: there are only $numScalePodsFound real scale Pods running which is less than numScalePodsToVerify=$numScalePodsToVerify. Will verify just these $numScalePodsFound Pods"
+    numScalePodsToVerify=$numScalePodsFound
+else
+    echo "will verify connectivity to $numScalePodsToVerify scale Pods"
+fi
+
+## CREATE PINGERS
+kubectl $KUBECONFIG_ARG create ns connectivity-test || true
+kubectl $KUBECONFIG_ARG apply -f pinger.yaml
+sleep 5s
+echo "waiting for pingers to be ready..."
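+# NOTE: the checks below call connectFromScalePod and connectFromPinger. Minimal sketches are
+# defined here in case the real helpers are not defined elsewhere in this script; they assume
+# the agnhost image's "connect" subcommand and the TCP port 80 server run by the Pods above.
+connectFromScalePod() {
+    local fromPod=$1
+    local dstIP=$2
+    echo "checking connectivity from scale Pod $fromPod to $dstIP"
+    kubectl $KUBECONFIG_ARG exec -n scale-test $fromPod -- /agnhost connect --timeout=${TIMEOUT}s $dstIP:80
+}
+
+connectFromPinger() {
+    local fromPod=$1
+    local dstIP=$2
+    echo "checking connectivity from pinger Pod $fromPod to $dstIP"
+    kubectl $KUBECONFIG_ARG exec -n connectivity-test $fromPod -- /agnhost connect --timeout=${TIMEOUT}s $dstIP:80
+}
+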
+kubectl $KUBECONFIG_ARG wait --for=condition=Ready pod -n connectivity-test -l app=pinger --timeout=60s || {
+    echo "ERROR: pingers never ran"
+    exit 7
+}
+
+pingerNameIPs=(`kubectl $KUBECONFIG_ARG get pod -n connectivity-test -l app=pinger --field-selector=status.phase==Running -o jsonpath='{range .items[*]}{@.metadata.name}{","}{@.status.podIP}{" "}{end}'`)
+pinger1NameIP=(`echo "${pingerNameIPs[0]}" | tr ',' ' '`)
+pinger1=${pinger1NameIP[0]}
+pinger1IP=${pinger1NameIP[1]}
+echo "pinger1: $pinger1, IP: $pinger1IP"
+pinger2NameIP=(`echo "${pingerNameIPs[1]}" | tr ',' ' '`)
+pinger2=${pinger2NameIP[0]}
+pinger2IP=${pinger2NameIP[1]}
+echo "pinger2: $pinger2, IP: $pinger2IP"
+if [[ -z $pinger1 || -z $pinger1IP || -z $pinger2 || -z $pinger2IP ]]; then
+    echo "ERROR: expected two pingers to be running with IPs. Exiting."
+    exit 7
+fi
+
+## VERIFY CONNECTIVITY
+verifyInitialConnectivity() {
+    connectFromPinger $pinger1 $pinger2IP || {
+        echo "ERROR: expected pinger1 to be able to connect to pinger2. Pods may need more time to boot up"
+        return 8
+    }
+
+    connectFromPinger $pinger2 $pinger1IP || {
+        echo "ERROR: expected pinger2 to be able to connect to pinger1. Pods may need more time to boot up"
+        return 8
+    }
+
+    for i in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do
+        scalePod=${scalePods[$i]}
+        for j in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do
+            if [[ $i == $j ]]; then
+                continue
+            fi
+
+            dstPod=${scalePods[$j]}
+            dstIP=${scalePodIPs[$j]}
+            connectFromScalePod $scalePod $dstIP || {
+                echo "ERROR: expected scale Pod $scalePod to be able to connect to scale Pod $dstPod"
+                return 8
+            }
+        done
+    done
+
+    for i in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do
+        scalePod=${scalePods[$i]}
+        scalePodIP=${scalePodIPs[$i]}
+
+        connectFromScalePod $scalePod $pinger1IP && {
+            echo "ERROR: expected scale Pod $scalePod to NOT be able to connect to pinger1"
+            return 8
+        }
+
+        connectFromPinger $pinger1 $scalePodIP && {
+            echo "ERROR: expected pinger1 to NOT be able to connect to scale Pod $scalePod"
+            return 8
+        }
+    done
+
+    return 0
+}
+
+echo "verifying initial connectivity at $(date)..."
+connectivityStartDate=`date +%s`
+maxWaitDate=$(( $connectivityStartDate + $maxWaitForInitialConnectivity ))
+prevTryDate=$connectivityStartDate
+while : ; do
+    verifyInitialConnectivity && break
+
+    echo "WARNING: initial connectivity test failed. Retrying in $CONNECTIVITY_SLEEP seconds..."
+    sleep $CONNECTIVITY_SLEEP
+
+    # if reached max wait time, try once more. If that try fails, then quit
+    currDate=`date +%s`
+    if [[ $currDate -gt $maxWaitDate ]]; then
+        if [[ $prevTryDate -gt $maxWaitDate ]]; then
+            echo "ERROR: initial connectivity test timed out. Last try was at least $(( $prevTryDate - $connectivityStartDate )) seconds after pinger Pods began running"
+            exit 8
+        fi
+
+        echo "WARNING: reached max wait time of $maxWaitForInitialConnectivity seconds after pinger Pods began running. Will try one more time"
+    fi
+
+    prevTryDate=$currDate
+done
+
+low=0
+if [[ $prevTryDate -gt $connectivityStartDate ]]; then
+    low=$(( `date +%s` - $prevTryDate - $CONNECTIVITY_SLEEP ))
+fi
+high=$(( `date +%s` - $connectivityStartDate ))
+echo "SUCCESS: all initial connectivity tests passed. Took between $low and $high seconds to succeed"
+
+## ADD NETWORK POLICY AND VERIFY CONNECTIVITY
+echo "adding allow-pinger NetworkPolicy at $(date)..."
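+# allow-pinger.yaml matches scale Pods via the shared-lab-00001 label (applied by test-scale.sh
+# when --num-shared-labels-per-pod >= 1) and allows ingress/egress to Pods labeled app=pinger in
+# the connectivity-test namespace. To inspect it once applied:
+#   kubectl $KUBECONFIG_ARG get networkpolicy allow-pinger -n scale-test -o yaml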
+kubectl $KUBECONFIG_ARG apply -f allow-pinger.yaml
+
+verifyNetPol() {
+    for i in $(seq 0 $(( ${#scalePods[@]} - 1 ))); do
+        scalePod=${scalePods[$i]}
+        scalePodIP=${scalePodIPs[$i]}
+
+        connectFromScalePod $scalePod $pinger1IP || {
+            echo "WARNING: expected scale Pod $scalePod to be able to connect to pinger1 after adding NetworkPolicy"
+            return 9
+        }
+
+        connectFromPinger $pinger1 $scalePodIP || {
+            echo "WARNING: expected pinger1 to be able to connect to scale Pod $scalePod after adding NetworkPolicy"
+            return 9
+        }
+    done
+
+    return 0
+}
+
+echo "verifying allow-pinger NetworkPolicy at $(date)..."
+netpolStartDate=`date +%s`
+maxWaitDate=$(( $netpolStartDate + $maxWaitAfterAddingNetpol ))
+prevTryDate=$netpolStartDate
+while : ; do
+    verifyNetPol && break
+
+    echo "WARNING: verifying allow-pinger NetworkPolicy failed. Retrying in $NETPOL_SLEEP seconds..."
+    sleep $NETPOL_SLEEP
+
+    # if reached max wait time, try once more. If that try fails, then quit
+    currDate=`date +%s`
+    if [[ $currDate -gt $maxWaitDate ]]; then
+        if [[ $prevTryDate -gt $maxWaitDate ]]; then
+            echo "ERROR: allow-pinger NetworkPolicy has not taken effect. Last try was at least $(( $prevTryDate - $netpolStartDate )) seconds after creating allow-pinger NetworkPolicy"
+            exit 9
+        fi
+
+        echo "WARNING: reached max wait time of $maxWaitAfterAddingNetpol seconds after adding allow-pinger NetworkPolicy. Will try one more time"
+    fi
+
+    prevTryDate=$currDate
+done
+
+low=0
+if [[ $prevTryDate -gt $netpolStartDate ]]; then
+    low=$(( `date +%s` - $prevTryDate - $NETPOL_SLEEP ))
+fi
+high=$(( `date +%s` - $netpolStartDate ))
+echo "SUCCESS: all connectivity tests passed after adding allow-pinger NetworkPolicy. Took between $low and $high seconds to take effect"
+
+echo
+echo "FINISHED at $(date -u). Had started at $startDate."
+echo
diff --git a/test/scale/run-kwok.sh b/test/scale/run-kwok.sh
new file mode 100755
index 0000000000..f1118ae320
--- /dev/null
+++ b/test/scale/run-kwok.sh
@@ -0,0 +1,37 @@
+######################################################################################
+# This script is used to schedule kwok nodes/pods and maintain kwok node heartbeats. #
+######################################################################################
+INSTALL_KWOK=false
+# KWOK_REPO=kubernetes-sigs/kwok
+# KWOK_LATEST_RELEASE=$(curl "https://api.github.com/repos/${KWOK_REPO}/releases/latest" | jq -r '.tag_name')
+KWOK_VERSION=${KWOK_LATEST_RELEASE:-"v0.1.1"}
+# kubeconfig arg doesn't seem to work for kwok. It seems to just use the current context of the default kubeconfig.
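+# For example, point the default kubeconfig's current context at the intended cluster before
+# starting kwok (the context name below is just an example):
+#   kubectl config use-context <my-aks-cluster>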
+
+# specify kubeconfig file as first arg if you want
+if [[ $1 != "" ]]; then
+    file=$1
+    test -f $file || {
+        echo "ERROR: KUBECONFIG=$file does not exist"
+        exit 1
+    }
+
+    KUBECONFIG_ARG="--kubeconfig $file"
+fi
+
+if [[ $INSTALL_KWOK == true ]]; then
+    wget -O kwokctl -c "https://github.com/kubernetes-sigs/kwok/releases/download/${KWOK_VERSION}/kwokctl-$(go env GOOS)-$(go env GOARCH)"
+    chmod +x kwokctl
+    sudo mv kwokctl /usr/local/bin/kwokctl
+
+    wget -O kwok -c "https://github.com/kubernetes-sigs/kwok/releases/download/${KWOK_VERSION}/kwok-$(go env GOOS)-$(go env GOARCH)"
+    chmod +x kwok
+    sudo mv kwok /usr/local/bin/kwok
+fi
+
+kwok $KUBECONFIG_ARG \
+  --cidr=155.0.0.0/16 \
+  --node-ip=155.0.0.1 \
+  --manage-all-nodes=false \
+  --manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
+  --manage-nodes-with-label-selector= \
+  --disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
+  --disregard-status-with-label-selector=
diff --git a/test/scale/templates/kwok-deployment.yaml b/test/scale/templates/kwok-deployment.yaml
new file mode 100644
index 0000000000..70ef60d5c2
--- /dev/null
+++ b/test/scale/templates/kwok-deployment.yaml
@@ -0,0 +1,35 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: fake-TEMP_NAME
+  namespace: scale-test
+  labels:
+    app: scale-test
+spec:
+  replicas: TEMP_REPLICAS
+  selector:
+    matchLabels:
+      app: scale-testOTHER_LABELS_6_SPACES
+  template:
+    metadata:
+      labels:
+        app: scale-testOTHER_LABELS_8_SPACES
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: type
+                operator: In
+                values:
+                - kwok
+      # A taint is added to the automatically created fake Nodes.
+      # You can remove the taint from the Node or add this toleration.
+      tolerations:
+      - key: "kwok.x-k8s.io/node"
+        operator: "Exists"
+        effect: "NoSchedule"
+      containers:
+      - name: fake-container
+        image: fake-image
diff --git a/test/scale/templates/kwok-node.yaml b/test/scale/templates/kwok-node.yaml
new file mode 100644
index 0000000000..249cc717af
--- /dev/null
+++ b/test/scale/templates/kwok-node.yaml
@@ -0,0 +1,42 @@
+apiVersion: v1
+kind: Node
+metadata:
+  annotations:
+    node.alpha.kubernetes.io/ttl: "0"
+    kwok.x-k8s.io/node: fake
+  labels:
+    beta.kubernetes.io/arch: amd64
+    beta.kubernetes.io/os: linux
+    kubernetes.io/arch: amd64
+    kubernetes.io/hostname: kwok-node-INSERT_NUMBER
+    kubernetes.io/os: linux
+    kubernetes.io/role: agent
+    node-role.kubernetes.io/agent: ""
+    type: kwok
+  name: kwok-node-INSERT_NUMBER
+spec:
+  taints: # Avoid scheduling actual running pods to fake Node
+  - effect: NoSchedule
+    key: kwok.x-k8s.io/node
+    value: fake
+status:
+  allocatable:
+    cpu: 32
+    memory: 256Gi
+    pods: 110
+  capacity:
+    cpu: 32
+    memory: 256Gi
+    pods: 110
+  nodeInfo:
+    architecture: amd64
+    bootID: ""
+    containerRuntimeVersion: ""
+    kernelVersion: ""
+    kubeProxyVersion: fake
+    kubeletVersion: fake
+    machineID: ""
+    operatingSystem: linux
+    osImage: ""
+    systemUUID: ""
+  phase: Running
diff --git a/test/scale/templates/networkpolicy.yaml b/test/scale/templates/networkpolicy.yaml
new file mode 100644
index 0000000000..4bd07587f0
--- /dev/null
+++ b/test/scale/templates/networkpolicy.yaml
@@ -0,0 +1,22 @@
+kind: NetworkPolicy
+apiVersion: networking.k8s.io/v1
+metadata:
+  name: TEMP_NAME
+  namespace: scale-test
+spec:
+  podSelector:
+    matchLabels:
+      TEMP_LABEL_NAME: val
+  ingress:
+  - from:
+    - podSelector:
+        matchLabels:
+          TEMP_INGRESS_NAME: val
+  egress:
+  - to:
+    - podSelector:
+        matchLabels:
+          TEMP_EGRESS_NAME: val
+  policyTypes:
+  - Ingress
+  - Egress
diff --git a/test/scale/templates/real-deployment.yaml b/test/scale/templates/real-deployment.yaml
new file mode 100644
index 0000000000..38385eda49
--- /dev/null
+++ b/test/scale/templates/real-deployment.yaml
@@ -0,0 +1,33 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: TEMP_NAME
+  namespace: scale-test
+  labels:
+    app: scale-test
+spec:
+  replicas: TEMP_REPLICAS
+  selector:
+    matchLabels:
+      app: scale-testOTHER_LABELS_6_SPACES
+  template:
+    metadata:
+      labels:
+        app: scale-testOTHER_LABELS_8_SPACES
+    spec:
+      nodeSelector:
+        scale-test: "true"
+      containers:
+      - command:
+        - /agnhost
+        - serve-hostname
+        - --tcp
+        - --http=false
+        - --port
+        - "80"
+        image: k8s.gcr.io/e2e-test-images/agnhost:2.33
+        imagePullPolicy: IfNotPresent
+        name: cont-80-tcp
+        ports:
+        - containerPort: 80
+          protocol: TCP
diff --git a/test/scale/test-scale.sh b/test/scale/test-scale.sh
new file mode 100755
index 0000000000..ce87775e56
--- /dev/null
+++ b/test/scale/test-scale.sh
@@ -0,0 +1,328 @@
+# exit on error
+set -e
+
+printHelp() {
+    cat <<EOF
+./test-scale.sh --max-kwok-pods-per-node=<int> --num-kwok-deployments=<int> --num-kwok-replicas=<int> --max-real-pods-per-node=<int> --num-real-deployments=<int> --num-real-replicas=<int> --num-network-policies=<int> --num-unique-labels-per-pod=<int> --num-unique-labels-per-deployment=<int> --num-shared-labels-per-pod=<int> [--kubeconfig=<path>] [--restart-npm] [--debug-exit-after-print-counts] [--debug-exit-after-generation]
+
+Scales the number of Pods, Pod labels, and NetworkPolicies in a cluster.
+Uses KWOK to create fake nodes and fake pods as needed.
+Can also schedule real Pods. It will NOT scale real nodes.
+
+USAGE:
+1. Create an AKS cluster with --uptime-sla and create any nodepools
+2. If making KWOK Pods, run run-kwok.sh in the background
+3.
Label node(s) to schedule real Pods: kubectl label node scale-test=true +4. Run this script with args like number of Deployments, replicas, and NetworkPolicies + +SPECIAL NOTES: +1. Check notes on --max-real-pods-per-node +2. For Cilium, check notes on --num-unique-labels-per-pod +3. Check restrictions on --num-shared-labels-per-pod + +REQUIRED PARAMETERS: + --max-kwok-pods-per-node limit for fake kwok nodes. 50 works. Not sure if there's a limit + --num-kwok-deployments number of fake deployments + --num-kwok-replicas per fake deployment + --max-real-pods-per-node check your VMs' --max-pod capacity and set maxRealPodsPerNode accordingly (leave wiggle room for system Pods) + --num-real-deployments deployments scheduled on nodes labeled with scale-test=true + --num-real-replicas per deployment + --num-network-policies NetPols applied to every Pod + --num-unique-labels-per-pod creates labels specific to each Pod. Creates numTotalPods*numUniqueLabelsPerPod distinct labels. In Cilium, a value >= 1 results in every Pod having a unique identity (not recommended for scale) + --num-unique-labels-per-deployment create labels shared between replicas of a deployment. Creates numTotalDeployments*numUniqueLabelsPerDeployment distinct labels + --num-shared-labels-per-pod create labels shared between all Pods. Creates numSharedLabelsPerPod distinct labels. Must be >= 3 if numNetworkPolicies > 0 because of the way we generate network policies + +OPTIONAL PARAMETERS: + --kubeconfig path to kubeconfig file + --restart-npm make sure NPM exists and restart it before running scale test + --debug-exit-after-print-counts skip scale test. Just print out counts of things to be created and counts of IPSets/ACLs that NPM would create + --debug-exit-after-generation skip scale test. Exit after generating templates +EOF +} + +## PARAMETERS +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + printHelp + exit 0 + ;; + --max-kwok-pods-per-node=*) + maxKwokPodsPerNode="${1#*=}" + ;; + --num-kwok-deployments=*) + numKwokDeployments="${1#*=}" + ;; + --num-kwok-replicas=*) + numKwokReplicas="${1#*=}" + ;; + --max-real-pods-per-node=*) + maxRealPodsPerNode="${1#*=}" + ;; + --num-real-deployments=*) + numRealDeployments="${1#*=}" + ;; + --num-real-replicas=*) + numRealReplicas="${1#*=}" + ;; + --num-network-policies=*) + numNetworkPolicies="${1#*=}" + ;; + --num-unique-labels-per-pod=*) + numUniqueLabelsPerPod="${1#*=}" + ;; + --num-unique-labels-per-deployment=*) + numUniqueLabelsPerDeployment="${1#*=}" + ;; + --num-shared-labels-per-pod=*) + numSharedLabelsPerPod="${1#*=}" + ;; + --kubeconfig=*) + file=${1#*=} + KUBECONFIG_ARG="--kubeconfig $file" + test -f $file || { + echo "ERROR: kubeconfig not found: [$file]" + exit 1 + } + echo "using kubeconfig: $file" + ;; + --restart-npm) + USING_NPM=true + ;; + --debug-exit-after-print-counts) + DEBUG_EXIT_AFTER_PRINT_COUNTS=true + ;; + --debug-exit-after-generation) + DEBUG_EXIT_AFTER_GENERATION=true + ;; + *) + echo "ERROR: unknown parameter $1. Make sure you're using '--key=value' for parameters with values" + exit 1 + ;; + esac + shift +done + +if [[ -z $maxKwokPodsPerNode || -z $numKwokDeployments || -z $numKwokReplicas || -z $maxRealPodsPerNode || -z $numRealDeployments || -z $numRealReplicas || -z $numNetworkPolicies || -z $numUniqueLabelsPerPod || -z $numUniqueLabelsPerDeployment || -z $numSharedLabelsPerPod ]]; then + echo "ERROR: missing required parameter. 
Check --help for usage" + exit 1 +fi + +if [[ $numNetworkPolicies -gt 0 && $numSharedLabelsPerPod -lt 3 ]]; then + echo "ERROR: numSharedLabelsPerPod must be >= 3 if numNetworkPolicies > 0 because of the way we generate network policies" + exit 1 +fi + +## CALCULATIONS +numKwokPods=$(( $numKwokDeployments * $numKwokReplicas )) +numKwokNodes=$(( ($numKwokPods + $maxKwokPodsPerNode - 1) / $maxKwokPodsPerNode)) +numRealPods=$(( $numRealDeployments * $numRealReplicas )) +numRealNodesRequired=$(( ($numRealPods + $maxRealPodsPerNode - 1) / $maxRealPodsPerNode)) +numTotalPods=$(( $numKwokPods + $numRealPods )) + +## NPM CALCULATIONS +# unique to templates/networkpolicy.yaml +numACLsAddedByNPM=$(( 4 * $numNetworkPolicies )) +# IPSet/member counts can be slight underestimates if there are more than one template-hash labels +# 4 basic IPSets are [ns-scale-test,kubernetes.io/metadata.name:scale-test,template-hash:xxxx,app:scale-test] +numIPSetsAddedByNPM=$(( 4 + 2*$numTotalPods*$numUniqueLabelsPerPod + 2*$numSharedLabelsPerPod + 2*($numKwokDeployments+$numRealDeployments)*$numUniqueLabelsPerDeployment )) +# 3 basic members are [all-ns,kubernetes.io/metadata.name,kubernetes.io/metadata.name:scale-test] +# 5*pods members go to [ns-scale-test,kubernetes.io/metadata.name:scale-test,template-hash:xxxx,app:scale-test] +numIPSetMembersAddedByNPM=$(( 3 + $numTotalPods*(5 + 2*$numUniqueLabelsPerPod + 2*$numSharedLabelsPerPod) + 2*($numKwokPods+$numRealPods)*$numUniqueLabelsPerDeployment )) + +## PRINT OUT COUNTS +cat < $outFile + sed -i "s/TEMP_REPLICAS/$numReplicas/g" $outFile + + if [[ $numUniqueLabelsPerDeployment -gt 0 ]]; then + depLabels="" + for j in $(seq -f "%05g" 1 $numUniqueLabelsPerDeployment); do + depLabels="$depLabels\n $labelPrefix-$j: val" + done + perl -pi -e "s/OTHER_LABELS_6_SPACES/$depLabels/g" $outFile + + depLabels="" + for j in $(seq -f "%05g" 1 $numUniqueLabelsPerDeployment); do + depLabels="$depLabels\n $labelPrefix-$j: val" + done + perl -pi -e "s/OTHER_LABELS_8_SPACES/$depLabels/g" $outFile + else + sed -i "s/OTHER_LABELS_6_SPACES//g" $outFile + sed -i "s/OTHER_LABELS_8_SPACES//g" $outFile + fi + done +} + +echo "Generating yamls..." + +generateDeployments $numKwokDeployments $numKwokReplicas kwok +generateDeployments $numRealDeployments $numRealReplicas real + +for j in $(seq 1 $numNetworkPolicies); do + valNum=$j + i=`printf "%05d" $j` + sed "s/TEMP_NAME/policy-$i/g" templates/networkpolicy.yaml > generated/networkpolicies/policy-$i.yaml + if [[ $valNum -ge $(( numSharedLabelsPerPod - 2 )) ]]; then + valNum=$(( $numSharedLabelsPerPod - 2 )) + fi + k=`printf "%05d" $valNum` + sed -i "s/TEMP_LABEL_NAME/shared-lab-$k/g" generated/networkpolicies/policy-$i.yaml + + ingressNum=$(( $valNum + 1 )) + k=`printf "%05d" $ingressNum` + sed -i "s/TEMP_INGRESS_NAME/shared-lab-$k/g" generated/networkpolicies/policy-$i.yaml + + egressNum=$(( $valNum + 2 )) + k=`printf "%05d" $ingressNum` + sed -i "s/TEMP_EGRESS_NAME/shared-lab-$k/g" generated/networkpolicies/policy-$i.yaml +done + +for i in $(seq -f "%05g" 1 $numKwokNodes); do + cat templates/kwok-node.yaml | sed "s/INSERT_NUMBER/$i/g" > "generated/kwok-nodes/node-$i.yaml" +done + +echo "Done generating yamls." + +if [[ $DEBUG_EXIT_AFTER_GENERATION == true ]]; then + echo "DEBUG: exiting after generation..." + exit 0 +fi + +## VALIDATE REAL NODES +echo "checking if there are enough real nodes..." 
+numRealNodes=$(kubectl $KUBECONFIG_ARG get nodes -l scale-test=true | grep -v NAME | wc -l)
+if [[ $numRealNodes -lt $numRealNodesRequired ]]; then
+    kubectl $KUBECONFIG_ARG get nodes
+    echo "ERROR: need $numRealNodesRequired real nodes to achieve a scale of $numRealPods real Pods. Make sure to label nodes with: kubectl label node <name> scale-test=true"
+    exit 1
+fi
+
+## DELETE PRIOR STATE
+echo "cleaning up previous scale test state..."
+kubectl $KUBECONFIG_ARG delete ns scale-test connectivity-test --ignore-not-found
+kubectl $KUBECONFIG_ARG delete node -l type=kwok
+
+if [[ $USING_NPM == true ]]; then
+    echo "restarting NPM pods..."
+    kubectl $KUBECONFIG_ARG rollout restart -n kube-system ds azure-npm
+    kubectl $KUBECONFIG_ARG rollout restart -n kube-system ds azure-npm-win
+    echo "sleeping 3m to allow NPM pods to restart..."
+    sleep 1m
+    echo "2m remaining..."
+    sleep 1m
+    echo "1m remaining..."
+    sleep 1m
+
+    echo "making sure NPM pods are running..."
+    # use || blocks since, with set -e, a failed grep would exit the script before a $? check could run
+    kubectl $KUBECONFIG_ARG get pod -n kube-system | grep Running | grep -v "azure-npm-win" | grep -oP "azure-npm-[a-z0-9]+" -m 1 || {
+        echo "No Linux NPM pod running. Exiting."
+        exit 1
+    }
+
+    kubectl $KUBECONFIG_ARG get pod -n kube-system | grep Running | grep -oP "azure-npm-win-[a-z0-9]+" -m 1 || {
+        echo "No Windows NPM pod running. Exiting."
+        exit 1
+    }
+fi
+
+## RUN
+if [[ $numKwokPods -gt 0 ]]; then
+    echo "START KWOK COMMAND NOW..."
+    sleep 10s
+fi
+
+startDate=`date -u`
+echo "STARTING RUN at $startDate"
+echo
+
+set -x
+kubectl $KUBECONFIG_ARG create ns scale-test
+kubectl $KUBECONFIG_ARG apply -f generated/kwok-nodes/
+kubectl $KUBECONFIG_ARG apply -f generated/deployments/real/
+kubectl $KUBECONFIG_ARG apply -f generated/deployments/kwok/
+set +x
+
+if [[ $numSharedLabelsPerPod -gt 0 ]]; then
+    sharedLabels=""
+    for i in $(seq -f "%05g" 1 $numSharedLabelsPerPod); do
+        sharedLabels="$sharedLabels shared-lab-$i=val"
+    done
+
+    set -x
+    kubectl $KUBECONFIG_ARG label pods -n scale-test --all $sharedLabels
+    set +x
+fi
+
+if [[ $numUniqueLabelsPerPod -gt 0 ]]; then
+    count=1
+    for pod in $(kubectl $KUBECONFIG_ARG get pods -n scale-test -o jsonpath='{.items[*].metadata.name}'); do
+        uniqueLabels=""
+        for tmp in $(seq 1 $numUniqueLabelsPerPod); do
+            i=`printf "%05d" $count`
+            uniqueLabels="$uniqueLabels uni-lab-$i=val"
+            count=$(( $count + 1 ))
+        done
+
+        set -x
+        kubectl $KUBECONFIG_ARG label pods -n scale-test $pod $uniqueLabels
+        set +x
+    done
+fi
+
+set -x
+kubectl $KUBECONFIG_ARG apply -f generated/networkpolicies/
+set +x
+
+echo
+echo "FINISHED at $(date -u). Had started at $startDate."
+echo
diff --git a/test/scale/utils/capture-cpu-and-mem.sh b/test/scale/utils/capture-cpu-and-mem.sh
new file mode 100755
index 0000000000..6c24bac6c7
--- /dev/null
+++ b/test/scale/utils/capture-cpu-and-mem.sh
@@ -0,0 +1,48 @@
+#####################################################################################
+# Periodically captures CPU/Memory of Pods/nodes and writes to csvs.
# +##################################################################################### +APPEND_TO_EXISTING_FILES=true + +FOLDER="captures" +RUNNING_PODS_FILE=$FOLDER/cpu-and-mem-running-pods.out +POD_MEM_CSV=$FOLDER/cpu-and-mem-pod-results.csv +NODE_MEM_CSV=$FOLDER/cpu-and-mem-node-results.csv + +# kubectl top seems to refresh every minute +SLEEP_BETWEEN_CAPTURES=65 + +## RUN +mkdir -p $FOLDER + +if [[ $APPEND_TO_EXISTING_FILES != true ]]; then + if [[ -f $RUNNING_PODS_FILE || -f $POD_MEM_CSV || -f $NODE_MEM_CSV ]]; then + echo "ERROR: $RUNNING_PODS_FILE, $POD_MEM_CSV, or $NODE_MEM_CSV already exists. Either 1) set APPEND_TO_EXISTING_FILES=true or 2) move the old files" + exit 1 + fi + + echo "time,pod,cpu,mem" > $POD_MEM_CSV + echo "time,node,cpu,cpuPercent,mem,memPercent" > $NODE_MEM_CSV +fi + +while true; do + currentTime=`date -u` + echo "running k top pod" + lines=`kubectl top pod -A | grep -v NAME | grep -v kwok | awk '{$1=$1;print}' | tr ' ' ','` + for line in $lines; do + echo "$currentTime,$line" >> $POD_MEM_CSV + done + + currentTime=`date -u` + echo "running k top node" + lines=`kubectl top node | grep -v NAME | grep -v kwok | awk '{$1=$1;print}' | tr ' ' ','` + for line in $lines; do + echo "$currentTime,$line" >> $NODE_MEM_CSV + done + + echo `date -u` >> $RUNNING_PODS_FILE + kubectl get pod -A -owide | grep npm >> $RUNNING_PODS_FILE + echo " " >> $RUNNING_PODS_FILE + + echo "sleeping $SLEEP_BETWEEN_CAPTURES seconds" + sleep $SLEEP_BETWEEN_CAPTURES +done