From 0bc064a111059566825e0c246e658a9b1dc441c1 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Tue, 11 Apr 2023 17:50:49 -0700 Subject: [PATCH 01/21] wip --- .../windows-validation-on-capz/npm/npm-e2e.sh | 360 ++++++++++++++++++ 1 file changed, 360 insertions(+) create mode 100644 test/windows-validation-on-capz/npm/npm-e2e.sh diff --git a/test/windows-validation-on-capz/npm/npm-e2e.sh b/test/windows-validation-on-capz/npm/npm-e2e.sh new file mode 100644 index 0000000000..81504bde2e --- /dev/null +++ b/test/windows-validation-on-capz/npm/npm-e2e.sh @@ -0,0 +1,360 @@ +# assumes that the following is set: +# set -o errexit + +# Installs NPM + a long-running Pod and does the following tests: +# 1. Check VFP is in sync with HNS (filename: vfp-state-prior.ran) +# 2. Run Cyclonus (filename: cyclonus.ran) +# 3. Check VFP is in sync with HNS (filename: vfp-state-after-cyclonus.ran) +# 4. Run Conformance (filename: conformance.ran) +# 5. Check VFP is in sync with HNS (filename: vfp-state-after-conformance.ran) +# 6. Run scale + connectivity test (filename: scale-connectivity.ran) +# 7. Check VFP is in sync with HNS (filename: vfp-state-after-scale.ran) +# +# If any step fails, the script will exit and remaining tests won't be run (because of `set -o errexit`) +# +# NOTE: each step has both: +# - A .ran file that is created if the step is run +# - A .success file that is created if the step succeeds +# +# There is also a npm-e2e.ran file that indicates that the npm e2e was run at all +npm_e2e () { + local kubeconfigFile=$1 + if [ -z "$kubeconfigFile" ]; then + log "ERROR: kubeconfigFile not set. can't run NPM e2e" + return 1 + fi + + test -f $kubeconfigFile || { + log "ERROR: kubeconfigFile $kubeconfigFile not found. can't run NPM e2e" + return 1 + } + + log "setting up npm e2e test" + + # make sure there are no previous results + log "cleaning up previous npm e2e results..." + rm npm-e2e.log npm-cyclonus.log npm-scale.log npm-scale-connectivity.log kwok.log || true + rm *.ran || true + rm *.success || true + + echo "" > npm-e2e.ran + + ## disable Calico NetPol + log "running helm uninstall on calico (this will remove the tigera-operator and prevent reconciling of the calico-node ClusterRole)..." + helm uninstall calico -n tigera-operator + kubectl delete ns tigera-operator + log "disabling Calico NetworkPolicy functionality by removing NetPol permission from calico-node ClusterRole..." + kubectl get clusterrole calico-node -o yaml > original-clusterrole.yaml + cat original-clusterrole.yaml | perl -0777 -i.original -pe 's/- apiGroups:\n - networking.k8s.io\n resources:\n - networkpolicies\n verbs:\n - watch\n - list\n//' > new-clusterrole.yaml + originalLineCount=`cat original-clusterrole.yaml | wc -l` + newLineCount=`cat new-clusterrole.yaml | wc -l` + if [ $originalLineCount != $(($newLineCount + 7)) ]; then + # NOTE: this check will only work the first time this script is run, since the original-clusterrole.yaml will be modified + log "ERROR: unable to run NPM e2e. unexpected line count difference between original and new calico-node clusterrole. 
original: $originalLineCount, new: $newLineCount" + return 1 + fi + kubectl rollout restart ds -n calico-system calico-node-windows + + ## disable scheduling for all but one node for NPM tests, since intra-node connectivity is broken after disabling Calico NetPol + kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | awk '{print $1}' | tail -n +2 | xargs kubectl cordon + kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | grep -v SchedulingDisabled | awk '{print $1}' | xargs -I {} bash -c "kubectl label {} scale-test=true && kubectl label {} connectivity-test=true" + + # sleep for some time to let Calico CNI restart + sleep 3m + + ## install Azure NPM + log "installing Azure NPM..." + npmURL=https://raw.githubusercontent.com/Azure/azure-container-networking/0ea4e9ac3d287f7abb15a34a88beb87697fbbcdd/npm/examples/windows/azure-npm-capz.yaml #https://raw.githubusercontent.com/Azure/azure-container-networking/master/npm/examples/windows/azure-npm-capz.yaml + kubectl apply -f $npmURL + + ## install long-running pod + log "creating long-runner pod to ensure there's an endpoint for verifying VFP tags..." + kubectl create ns npm-e2e-longrunner + kubectl apply -f https://raw.githubusercontent.com/Azure/azure-container-networking/master/npm/examples/windows/long-running-pod-for-capz.yaml + + # verify VFP tags after NPM boots up + # seems like the initial NPM Pods are always deleted and new ones are created (within the first minute of being applied it seems) + # sleep for some time to avoid running kubectl wait on pods that get deleted + log "waiting for NPM and long-runner to start running..." + sleep 3m + kubectl wait --for=condition=Ready pod -l k8s-app=azure-npm -n kube-system --timeout=15m + kubectl wait --for=condition=Ready pod -l app=long-runner -n npm-e2e-longrunner --timeout=15m + log "sleeping 8m for NPM to bootup, then verifying VFP tags after bootup..." + sleep 8m + verify_vfp_tags_using_npm vfp-state-prior.ran + echo "" > vfp-state-prior.success + + ## NPM cyclonus + run_npm_cyclonus + echo "" > cyclonus.success + + log "sleeping 3m to allow VFP to update tags after cyclonus..." + sleep 3m + log "verifying VFP tags after cyclonus..." + verify_vfp_tags_using_npm vfp-state-after-cyclonus.ran + echo "" > vfp-state-after-cyclonus.success + + ## NPM conformance + run_npm_conformance + echo "" > conformance.success + + log "sleeping 3m to allow VFP to update tags after conformance..." + sleep 3m + log "verifying VFP tags after conformance..." + verify_vfp_tags_using_npm vfp-state-after-conformance.ran + echo "" > vfp-state-after-conformance.success + + ## NPM scale + run_npm_scale $kubeconfigFile + echo "" > scale.success + log "sleeping 3m to allow VFP to update tags after scale test..." + sleep 3m + log "verifying VFP tags after scale test..." + verify_vfp_tags_using_npm vfp-state-after-scale.ran +} + +verify_vfp_tags_using_npm () { + local ranFilename=$1 + if [[ $ranFilename != *.ran ]]; then + log "ERROR: need a filename that ends in .ran passed as an argument to verify_vfp_tags_using_npm" + return 1 + fi + + log "verifying VFP tags are equal to HNS SetPolicies..." 
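    # High-level flow of this check:
    #   1. pick the single uncordoned Windows node and the Running azure-npm-win Pod on it
    #   2. count HNS SetPolicies of type IPSET on the (Calico) HNS network from inside the NPM Pod
    #   3. for each HNS Endpoint whose IP belongs to a Pod on that node, count VFP tags via vfpctrl /list-tag
    #   4. every per-endpoint VFP tag count must equal the HNS IPSET count, otherwise VFP is out of sync with HNS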
+ npmNode=`kubectl get node -owide | grep "Windows Server 2022 Datacenter" | grep -v SchedulingDisabled | grep -v kwok-node | awk '{print $1}' | tail -n 1` || true + if [[ -z $npmNode ]]; then + log "ERROR: unable to find uncordoned node for NPM" + return 1 + fi + npmPod=`kubectl get pod -n kube-system -o wide | grep azure-npm-win | grep $npmNode | grep Running | awk '{print $1}'` || true + if [[ -z "$npmPod" ]]; then + log "ERROR: unable to find running azure-npm-win pod on node $npmNode" + kubectl get pod -n kube-system -o wide + kubectl logs -n kube-system -l k8s-app=azure-npm + return 1 + fi + + onNodeIPs=() ; for ip in `kubectl get pod -owide $kc -A | grep $npmNode | grep -oP "\d+\.\d+\.\d+\.\d+" | sort | uniq`; do onNodeIPs+=($ip); done + matchString="" ; for ip in ${onNodeIPs[@]}; do matchString+=" \"${ip}\""; done + matchString=`echo $matchString | tr ' ' ','` + log "using matchString: $matchString" + ipsetCount=`kubectl exec -n kube-system $npmPod $kc -- powershell.exe "(Get-HNSNetwork | ? Name -Like Calico).Policies | convertto-json > setpols.txt ; (type .\setpols.txt | select-string '\"PolicyType\": \"IPSET\"').count" | tr -d '\r'` + log "HNS IPSET count: $ipsetCount" + kubectl exec -n kube-system $npmPod $kc -- powershell.exe 'echo "attempting to delete previous results if they exist" ; Remove-Item -path vfptags -recurse ; mkdir vfptags' + kubectl exec -n kube-system $npmPod $kc -- powershell.exe '$endpoints = (Get-HnsEndpoint | ? IPAddress -In '"$matchString"').Id ; foreach ($port in $endpoints) { vfpctrl /port $port /list-tag > vfptags\$port.txt ; (type vfptags\$port.txt | select-string -context 2 "TAG :").count }' > vfp-tag-counts.txt + + hadEndpoints=false + hadFailure=false + for count in `cat vfp-tag-counts.txt | xargs -n 1 echo`; do + hadEndpoints=true + count=`echo $count | tr -d '\r'` + log "VFP tag count: $count" + if [[ $count != $ipsetCount ]]; then + log "WARNING: VFP tag count $count does not match HNS IPSET count $ipsetCount" + hadFailure=true + fi + done + + echo "" > $ranFilename + if [[ $hadEndpoints == false ]]; then + log "WARNING: VFP tags not validated for NPM since no endpoints found on node $npmNode" + fi + + if [[ $hadFailure == true ]]; then + log "ERROR: VFP tags are inconsistent with HNS SetPolicies" + capture_npm_hns_state + return 1 + fi +} + +# results in a file called npm-hns-state.zip +capture_npm_hns_state () { + log "capturing NPM HNS state..." + kubectl get pod -owide -A + test -d npm-hns-state/ && rm -rf npm-hns-state/ || true + mkdir npm-hns-state + cd npm-hns-state + curl -LO https://raw.githubusercontent.com/Azure/azure-container-networking/master/debug/windows/npm/win-debug.sh + chmod u+x ./win-debug.sh + curl -LO https://raw.githubusercontent.com/Azure/azure-container-networking/master/debug/windows/npm/pod_exec.ps1 + ./win-debug.sh + cd .. + zip -9qr npm-hns-state.zip npm-hns-state + # to unzip: + # unzip npm-hns-state.zip -d npm-hns-state +} + +# currently takes ~3 hours to run +# e.g. 19:37:05 to 22:32:44 and 19:16:18 to 22:29:13 +run_npm_conformance () { + ## install NPM e2e binary + log "ensuring NPM e2e binary is installed" + rc=0; test -f npm-e2e.test || rc=$? + if [[ $rc == 0 ]]; then + log "NPM e2e binary found, skipping install" + else + log "NPM e2e binary not found, installing..." 
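        # the block below clones a kubernetes fork (branch quit-on-failure, which per its name is intended to
        # bail out of the suite on the first failure), builds test/e2e/e2e.test, and saves it as ./npm-e2e.test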
+ test -d npm-kubernetes/ && rm -rf npm-kubernetes/ || true + mkdir npm-kubernetes + cd npm-kubernetes + # NOTE: if this is not downloaded every run, then probably need to sleep before the VFP tag verification + git clone https://github.com/huntergregory/kubernetes.git --depth=1 --branch=quit-on-failure + cd kubernetes + make WHAT=test/e2e/e2e.test + cd ../.. + mv npm-kubernetes/kubernetes/_output/local/bin/linux/amd64/e2e.test ./npm-e2e.test + rm -rf npm-kubernetes/ + fi + + log "beginning npm conformance test..." + + toRun="NetworkPolicy" + + nomatch1="should enforce policy based on PodSelector or NamespaceSelector" + nomatch2="should enforce policy based on NamespaceSelector with MatchExpressions using default ns label" + nomatch3="should enforce policy based on PodSelector and NamespaceSelector" + nomatch4="should enforce policy based on Multiple PodSelectors and NamespaceSelectors" + cidrExcept1="should ensure an IP overlapping both IPBlock.CIDR and IPBlock.Except is allowed" + cidrExcept2="should enforce except clause while egress access to server in CIDR block" + namedPorts="named port" + wrongK8sVersion="Netpol API" + toSkip="\[LinuxOnly\]|$nomatch1|$nomatch2|$nomatch3|$nomatch4|$cidrExcept1|$cidrExcept2|$namedPorts|$wrongK8sVersion|SCTP" + + echo "" > conformance.ran + KUBERNETES_SERVICE_PORT=443 ./npm-e2e.test \ + --provider=skeleton \ + --ginkgo.noColor \ + --ginkgo.focus="$toRun" \ + --ginkgo.skip="$toSkip" \ + --allowed-not-ready-nodes=1 \ + --node-os-distro="windows" \ + --disable-log-dump \ + --ginkgo.progress=true \ + --ginkgo.slowSpecThreshold=120.0 \ + --ginkgo.flakeAttempts=0 \ + --ginkgo.trace=true \ + --ginkgo.v=true \ + --dump-logs-on-failure=true \ + --report-dir="${ARTIFACTS}" \ + --prepull-images=true \ + --v=5 "${ADDITIONAL_E2E_ARGS[@]}" | tee npm-e2e.log || true + + # grep "FAIL: unable to initialize resources: after 10 tries, 2 HTTP servers are not ready + + log "finished npm conformance test" + ## report if there's a failure + rc=0; cat npm-e2e.log | grep '"failed":1' > /dev/null 2>&1 || rc=$? + if [ $rc -eq 0 ]; then + log "ERROR: found failure in npm e2e test log" + capture_npm_hns_state + return 1 + fi +} + +# currently takes ~3.5 hours to run +# e.g. 20:49:05 to 00:21:12 +run_npm_cyclonus () { + log "installing cyclonus binary..." + curl -fsSL github.com/mattfenwick/cyclonus/releases/latest/download/cyclonus_linux_amd64.tar.gz | tar -zxv + + log "beginning npm cyclonus test..." + echo "" > cyclonus.ran + ./cyclonus_linux_amd64/cyclonus generate \ + --fail-fast \ + --noisy=true \ + --retries=7 \ + --ignore-loopback=true \ + --perturbation-wait-seconds=20 \ + --pod-creation-timeout-seconds=480 \ + --job-timeout-seconds=15 \ + --server-protocol=TCP,UDP \ + --exclude sctp,named-port,ip-block-with-except,multi-peer,upstream-e2e,example,end-port,namespaces-by-default-label,update-policy | tee npm-cyclonus.log || true + + # for debugging with a smaller set of tests, use: + # --exclude sctp,named-port,ip-block-with-except,multi-peer,upstream-e2e,example,end-port,namespaces-by-default-label,update-policy,all-namespaces,all-pods,allow-all,any-peer,any-port,any-port-protocol,deny-all,ip-block-no-except,multi-port/protocol,namespaces-by-label,numbered-port,pathological,peer-ipblock,peer-pods,pods-by-label,policy-namespace,port,protocol,rule,tcp,udp --include conflict,direction,egress,ingress,miscellaneous + + rc=0; cat npm-cyclonus.log | grep "failed" > /dev/null 2>&1 || rc=$? 
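    # grep exits 0 when the word "failed" appears anywhere in the cyclonus output, so rc == 0 means at least one test case failed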
+ if [[ $rc == 0 ]]; then + echo "ERROR: failures encountered in npm cyclonus test" + capture_npm_hns_state + return 1 + fi + + rc=0; cat npm-cyclonus.log | grep "SummaryTable:" > /dev/null 2>&1 || rc=$? + if [[ $rc != 0 ]]; then + log "ERROR: npm cyclonus test did not finish for some reason" + capture_npm_hns_state + return 1 + fi +} + +run_npm_scale () { + local kubeconfigFile=$1 + + log "beginning npm scale test with kubeconfig [$kubeconfigFile]..." + + rm -rf azure-container-networking/ || true + git clone --depth=1 --branch=master https://github.com/Azure/azure-container-networking.git + + cd azure-container-networking/test/scale/ + + chmod u+x test-scale.sh + chmod u+x run-kwok.sh + cd connectivity/ + chmod u+x test-connectivity.sh + cd ../ + + ./run-kwok.sh $kubeconfigFile > ../../../kwok.log & + kwok_pid=$! + + # exact counts output from script + # Pod Counts: + # - 10 fake Pods + # - 10 real Pods + # HNS Counts: + # - number of ACLs per Pod Endpoint: 6 (6*numNetworkPolicies) + # - number of SetPolicies: ~40 (2*numUniqueLabelsPerPod*numFakePods) + # - max IPs per SetPolicy: number of total Pods + + # NOTE: if editing real pod counts, should update --num-scale-pods-to-verify in test-connectivity.sh to test all those Pods + ./test-scale.sh \ + --max-kwok-pods-per-node=50 \ + --num-kwok-deployments=10 \ + --num-kwok-replicas=1 \ + --max-real-pods-per-node=30 \ + --num-real-deployments=5 \ + --num-real-replicas=2 \ + --num-network-policies=1 \ + --num-unique-labels-per-pod=2 \ + --num-unique-labels-per-deployment=2 \ + --num-shared-labels-per-pod=10 | tee ../../../npm-scale.log || true + + rc=0; cat ../../../npm-scale.log | grep "FINISHED" > /dev/null 2>&1 || rc=$? + if [[ $rc != 0 ]]; then + log "ERROR: npm scale test did not properly scale" + kill $kwok_pid + cd ../../../ + return 1 + fi + + log "beginning npm scale connectivity test..." + + cd connectivity/ + + minutesToWaitForInitialConnectivity=30 + minutesToWaitAfterAddingNetPol=10 + echo "" > ../../../../scale-connectivity.ran + ./test-connectivity.sh --num-scale-pods-to-verify=10 --max-wait-for-initial-connectivity=$((60*minutesToWaitForInitialConnectivity)) --max-wait-after-adding-netpol=$((60*minutesToWaitAfterAddingNetPol)) | tee ../../../../npm-scale-connectivity.log || true + + cd ../../../../ + rc=0; cat npm-scale-connectivity.log | grep "FINISHED" > /dev/null 2>&1 || rc=$? + if [[ $rc != 0 ]]; then + log "ERROR: npm scale test connectivity failed" + kill $kwok_pid + capture_npm_hns_state + return 1 + fi + + echo "" > scale-connectivity.success + kill $kwok_pid +} From f9dcfb655f68232df119b45512d9aabcfcef084c Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Tue, 11 Apr 2023 18:21:48 -0700 Subject: [PATCH 02/21] wip2 --- .../windows-validation-on-capz/npm/npm-e2e.sh | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/test/windows-validation-on-capz/npm/npm-e2e.sh b/test/windows-validation-on-capz/npm/npm-e2e.sh index 81504bde2e..80f65f5912 100644 --- a/test/windows-validation-on-capz/npm/npm-e2e.sh +++ b/test/windows-validation-on-capz/npm/npm-e2e.sh @@ -33,9 +33,7 @@ npm_e2e () { # make sure there are no previous results log "cleaning up previous npm e2e results..." 
- rm npm-e2e.log npm-cyclonus.log npm-scale.log npm-scale-connectivity.log kwok.log || true - rm *.ran || true - rm *.success || true + rm *.log *.ran *.success || true echo "" > npm-e2e.ran @@ -299,12 +297,19 @@ run_npm_scale () { cd azure-container-networking/test/scale/ chmod u+x test-scale.sh - chmod u+x run-kwok.sh cd connectivity/ chmod u+x test-connectivity.sh cd ../ - ./run-kwok.sh $kubeconfigFile > ../../../kwok.log & + # run kwok + kwok --kubeconfig=$kubeconfigFile \ + --cidr=155.0.0.0/16 \ + --node-ip=155.0.0.1 \ + --manage-all-nodes=false \ + --manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \ + --manage-nodes-with-label-selector= \ + --disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \ + --disregard-status-with-label-selector= > ../../../kwok.log & kwok_pid=$! # exact counts output from script @@ -317,8 +322,7 @@ run_npm_scale () { # - max IPs per SetPolicy: number of total Pods # NOTE: if editing real pod counts, should update --num-scale-pods-to-verify in test-connectivity.sh to test all those Pods - ./test-scale.sh \ - --max-kwok-pods-per-node=50 \ + ./test-scale.sh --max-kwok-pods-per-node=50 \ --num-kwok-deployments=10 \ --num-kwok-replicas=1 \ --max-real-pods-per-node=30 \ @@ -337,6 +341,14 @@ run_npm_scale () { return 1 fi + log "waiting up to 10m for all Pods to be running..." + kubectl wait --for=condition=Ready -n scale-test --all pods --timeout=10m > ../../../waiting-for-pods.log || { + log "ERROR: not all scale Pods are running" + kill $kwok_pid + cd ../../../ + return 1 + } + log "beginning npm scale connectivity test..." cd connectivity/ From de062fb10cc6e99cd704441a52a366f51b069471 Mon Sep 17 00:00:00 2001 From: Hunter Gregory <42728408+huntergregory@users.noreply.github.com> Date: Wed, 12 Apr 2023 11:11:51 -0700 Subject: [PATCH 03/21] set -o errexit --- test/windows-validation-on-capz/npm/npm-e2e.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/windows-validation-on-capz/npm/npm-e2e.sh b/test/windows-validation-on-capz/npm/npm-e2e.sh index 80f65f5912..9984617231 100644 --- a/test/windows-validation-on-capz/npm/npm-e2e.sh +++ b/test/windows-validation-on-capz/npm/npm-e2e.sh @@ -1,5 +1,5 @@ -# assumes that the following is set: -# set -o errexit +# this is required so that if a step fails, following steps are not run +set -o errexit # Installs NPM + a long-running Pod and does the following tests: # 1. 
Check VFP is in sync with HNS (filename: vfp-state-prior.ran) From 744fd5b51467ae52fc5f201b8cf2cddef0f0f80c Mon Sep 17 00:00:00 2001 From: Hunter Gregory <42728408+huntergregory@users.noreply.github.com> Date: Wed, 12 Apr 2023 14:38:25 -0700 Subject: [PATCH 04/21] note about debugging --- test/windows-validation-on-capz/npm/npm-e2e.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/windows-validation-on-capz/npm/npm-e2e.sh b/test/windows-validation-on-capz/npm/npm-e2e.sh index 9984617231..19f1de4967 100644 --- a/test/windows-validation-on-capz/npm/npm-e2e.sh +++ b/test/windows-validation-on-capz/npm/npm-e2e.sh @@ -217,6 +217,9 @@ run_npm_conformance () { namedPorts="named port" wrongK8sVersion="Netpol API" toSkip="\[LinuxOnly\]|$nomatch1|$nomatch2|$nomatch3|$nomatch4|$cidrExcept1|$cidrExcept2|$namedPorts|$wrongK8sVersion|SCTP" + + # to debug with one test case, uncomment this + # toRun="NetworkPolicy API should support creating NetworkPolicy API operations" echo "" > conformance.ran KUBERNETES_SERVICE_PORT=443 ./npm-e2e.test \ @@ -268,8 +271,8 @@ run_npm_cyclonus () { --server-protocol=TCP,UDP \ --exclude sctp,named-port,ip-block-with-except,multi-peer,upstream-e2e,example,end-port,namespaces-by-default-label,update-policy | tee npm-cyclonus.log || true - # for debugging with a smaller set of tests, use: - # --exclude sctp,named-port,ip-block-with-except,multi-peer,upstream-e2e,example,end-port,namespaces-by-default-label,update-policy,all-namespaces,all-pods,allow-all,any-peer,any-port,any-port-protocol,deny-all,ip-block-no-except,multi-port/protocol,namespaces-by-label,numbered-port,pathological,peer-ipblock,peer-pods,pods-by-label,policy-namespace,port,protocol,rule,tcp,udp --include conflict,direction,egress,ingress,miscellaneous + # for debugging with a smaller set of tests, use this as the last line instead + # --exclude sctp,named-port,ip-block-with-except,multi-peer,upstream-e2e,example,end-port,namespaces-by-default-label,update-policy,all-namespaces,all-pods,allow-all,any-peer,any-port,any-port-protocol,deny-all,ip-block-no-except,multi-port/protocol,namespaces-by-label,numbered-port,pathological,peer-ipblock,peer-pods,pods-by-label,policy-namespace,port,protocol,rule,tcp,udp --include conflict,direction,egress,ingress,miscellaneous | tee npm-cyclonus.log || true rc=0; cat npm-cyclonus.log | grep "failed" > /dev/null 2>&1 || rc=$? 
if [[ $rc == 0 ]]; then From 9861d970fffb22f7d57d48aa1138c335186600d3 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Wed, 12 Apr 2023 16:55:32 -0700 Subject: [PATCH 05/21] rename folder --- test/{windows-validation-on-capz => capz}/npm/npm-e2e.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/{windows-validation-on-capz => capz}/npm/npm-e2e.sh (100%) diff --git a/test/windows-validation-on-capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh similarity index 100% rename from test/windows-validation-on-capz/npm/npm-e2e.sh rename to test/capz/npm/npm-e2e.sh From 504ce17ef5b84459c18d6cf3b46decf78cb87dad Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Wed, 12 Apr 2023 17:58:21 -0700 Subject: [PATCH 06/21] small fixes --- test/capz/npm/npm-e2e.sh | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index 19f1de4967..cd280b3f29 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -16,7 +16,7 @@ set -o errexit # - A .ran file that is created if the step is run # - A .success file that is created if the step succeeds # -# There is also a npm-e2e.ran file that indicates that the npm e2e was run at all +# There is also a npm-e2e.ran file that indicates that the npm e2e was run at all, and npm-e2e.success that indicates that all steps succeeded npm_e2e () { local kubeconfigFile=$1 if [ -z "$kubeconfigFile" ]; then @@ -55,7 +55,7 @@ npm_e2e () { ## disable scheduling for all but one node for NPM tests, since intra-node connectivity is broken after disabling Calico NetPol kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | awk '{print $1}' | tail -n +2 | xargs kubectl cordon - kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | grep -v SchedulingDisabled | awk '{print $1}' | xargs -I {} bash -c "kubectl label {} scale-test=true && kubectl label {} connectivity-test=true" + kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | grep -v SchedulingDisabled | awk '{print $1}' | xargs -I {} bash -c "kubectl label node {} scale-test=true && kubectl label node {} connectivity-test=true" # sleep for some time to let Calico CNI restart sleep 3m @@ -104,11 +104,14 @@ npm_e2e () { ## NPM scale run_npm_scale $kubeconfigFile - echo "" > scale.success + echo "" > scale-connectivity.success log "sleeping 3m to allow VFP to update tags after scale test..." sleep 3m log "verifying VFP tags after scale test..." verify_vfp_tags_using_npm vfp-state-after-scale.ran + echo "" > vfp-state-after-scale.success + + echo "" > npm-e2e.success } verify_vfp_tags_using_npm () { @@ -132,14 +135,14 @@ verify_vfp_tags_using_npm () { return 1 fi - onNodeIPs=() ; for ip in `kubectl get pod -owide $kc -A | grep $npmNode | grep -oP "\d+\.\d+\.\d+\.\d+" | sort | uniq`; do onNodeIPs+=($ip); done + onNodeIPs=() ; for ip in `kubectl get pod -owide -A | grep $npmNode | grep -oP "\d+\.\d+\.\d+\.\d+" | sort | uniq`; do onNodeIPs+=($ip); done matchString="" ; for ip in ${onNodeIPs[@]}; do matchString+=" \"${ip}\""; done matchString=`echo $matchString | tr ' ' ','` log "using matchString: $matchString" - ipsetCount=`kubectl exec -n kube-system $npmPod $kc -- powershell.exe "(Get-HNSNetwork | ? Name -Like Calico).Policies | convertto-json > setpols.txt ; (type .\setpols.txt | select-string '\"PolicyType\": \"IPSET\"').count" | tr -d '\r'` + ipsetCount=`kubectl exec -n kube-system $npmPod -- powershell.exe "(Get-HNSNetwork | ? 
Name -Like Calico).Policies | convertto-json > setpols.txt ; (type .\setpols.txt | select-string '\"PolicyType\": \"IPSET\"').count" | tr -d '\r'` log "HNS IPSET count: $ipsetCount" - kubectl exec -n kube-system $npmPod $kc -- powershell.exe 'echo "attempting to delete previous results if they exist" ; Remove-Item -path vfptags -recurse ; mkdir vfptags' - kubectl exec -n kube-system $npmPod $kc -- powershell.exe '$endpoints = (Get-HnsEndpoint | ? IPAddress -In '"$matchString"').Id ; foreach ($port in $endpoints) { vfpctrl /port $port /list-tag > vfptags\$port.txt ; (type vfptags\$port.txt | select-string -context 2 "TAG :").count }' > vfp-tag-counts.txt + kubectl exec -n kube-system $npmPod -- powershell.exe 'echo "attempting to delete previous results if they exist" ; Remove-Item -path vfptags -recurse ; mkdir vfptags' + kubectl exec -n kube-system $npmPod -- powershell.exe '$endpoints = (Get-HnsEndpoint | ? IPAddress -In '"$matchString"').Id ; foreach ($port in $endpoints) { vfpctrl /port $port /list-tag > vfptags\$port.txt ; (type vfptags\$port.txt | select-string -context 2 "TAG :").count }' > vfp-tag-counts.txt hadEndpoints=false hadFailure=false From f6a71bb9b6affc2c6b4ec4b42f7b2797cc3ee5db Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Wed, 12 Apr 2023 17:58:56 -0700 Subject: [PATCH 07/21] longer sleep befor checking vfp tags --- test/capz/npm/npm-e2e.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index cd280b3f29..ae672bafa2 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -86,8 +86,8 @@ npm_e2e () { run_npm_cyclonus echo "" > cyclonus.success - log "sleeping 3m to allow VFP to update tags after cyclonus..." - sleep 3m + log "sleeping 5m to allow VFP to update tags after cyclonus..." + sleep 5m log "verifying VFP tags after cyclonus..." verify_vfp_tags_using_npm vfp-state-after-cyclonus.ran echo "" > vfp-state-after-cyclonus.success @@ -96,8 +96,8 @@ npm_e2e () { run_npm_conformance echo "" > conformance.success - log "sleeping 3m to allow VFP to update tags after conformance..." - sleep 3m + log "sleeping 5m to allow VFP to update tags after conformance..." + sleep 5m log "verifying VFP tags after conformance..." verify_vfp_tags_using_npm vfp-state-after-conformance.ran echo "" > vfp-state-after-conformance.success @@ -105,8 +105,8 @@ npm_e2e () { ## NPM scale run_npm_scale $kubeconfigFile echo "" > scale-connectivity.success - log "sleeping 3m to allow VFP to update tags after scale test..." - sleep 3m + log "sleeping 5m to allow VFP to update tags after scale test..." + sleep 5m log "verifying VFP tags after scale test..." verify_vfp_tags_using_npm vfp-state-after-scale.ran echo "" > vfp-state-after-scale.success From 52e515f4977b404a36da8cf7636ff7b6e72f4997 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Thu, 13 Apr 2023 13:09:41 -0700 Subject: [PATCH 08/21] delete cyc pods and remove other code --- test/capz/npm/npm-e2e.sh | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index ae672bafa2..5f3068b291 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -91,6 +91,8 @@ npm_e2e () { log "verifying VFP tags after cyclonus..." verify_vfp_tags_using_npm vfp-state-after-cyclonus.ran echo "" > vfp-state-after-cyclonus.success + log "deleting cyclonus pods..." 
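+    # x, y, and z are the namespaces cyclonus creates for its probe Pods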
+ kubectl delete ns x y z ## NPM conformance run_npm_conformance @@ -347,14 +349,6 @@ run_npm_scale () { return 1 fi - log "waiting up to 10m for all Pods to be running..." - kubectl wait --for=condition=Ready -n scale-test --all pods --timeout=10m > ../../../waiting-for-pods.log || { - log "ERROR: not all scale Pods are running" - kill $kwok_pid - cd ../../../ - return 1 - } - log "beginning npm scale connectivity test..." cd connectivity/ From 8bfa60d08546e0461bb86c5b4b8206e6f6b6e8ee Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Thu, 13 Apr 2023 13:48:41 -0700 Subject: [PATCH 09/21] add new scale parameters --- test/capz/npm/npm-e2e.sh | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index 5f3068b291..1c862f0bde 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -300,7 +300,7 @@ run_npm_scale () { log "beginning npm scale test with kubeconfig [$kubeconfigFile]..." rm -rf azure-container-networking/ || true - git clone --depth=1 --branch=master https://github.com/Azure/azure-container-networking.git + git clone https://github.com/Azure/azure-container-networking.git --depth=1 --branch=hgregory/edit-scale cd azure-container-networking/test/scale/ @@ -337,9 +337,20 @@ run_npm_scale () { --num-real-deployments=5 \ --num-real-replicas=2 \ --num-network-policies=1 \ + --num-unapplied-network-policies=10 \ --num-unique-labels-per-pod=2 \ --num-unique-labels-per-deployment=2 \ - --num-shared-labels-per-pod=10 | tee ../../../npm-scale.log || true + --num-shared-labels-per-pod=10 \ + --delete-kwok-pods=10 \ + --delete-real-pods=5 \ + --delete-pods-interval=120 \ + --delete-pods-times=2 \ + --delete-labels \ + --delete-labels-interval=60 \ + --delete-labels-times=1 \ + --delete-netpols \ + --delete-netpols-interval=60 \ + --delete-netpols-times=1 | tee ../../../npm-scale.log || true rc=0; cat ../../../npm-scale.log | grep "FINISHED" > /dev/null 2>&1 || rc=$? if [[ $rc != 0 ]]; then @@ -356,7 +367,7 @@ run_npm_scale () { minutesToWaitForInitialConnectivity=30 minutesToWaitAfterAddingNetPol=10 echo "" > ../../../../scale-connectivity.ran - ./test-connectivity.sh --num-scale-pods-to-verify=10 --max-wait-for-initial-connectivity=$((60*minutesToWaitForInitialConnectivity)) --max-wait-after-adding-netpol=$((60*minutesToWaitAfterAddingNetPol)) | tee ../../../../npm-scale-connectivity.log || true + ./test-connectivity.sh --num-scale-pods-to-verify=all --max-wait-for-initial-connectivity=$((60*minutesToWaitForInitialConnectivity)) --max-wait-after-adding-netpol=$((60*minutesToWaitAfterAddingNetPol)) | tee ../../../../npm-scale-connectivity.log || true cd ../../../../ rc=0; cat npm-scale-connectivity.log | grep "FINISHED" > /dev/null 2>&1 || rc=$? From 0f98481a73d4ca65db5f04d8504dbf95c25539a6 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Fri, 14 Apr 2023 10:12:05 -0700 Subject: [PATCH 10/21] switch branch --- test/capz/npm/npm-e2e.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index 1c862f0bde..fbacc8bb69 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -300,7 +300,7 @@ run_npm_scale () { log "beginning npm scale test with kubeconfig [$kubeconfigFile]..." 
rm -rf azure-container-networking/ || true - git clone https://github.com/Azure/azure-container-networking.git --depth=1 --branch=hgregory/edit-scale + git clone https://github.com/Azure/azure-container-networking.git --depth=1 --branch=master cd azure-container-networking/test/scale/ From c253da07ba26cdefc71f0581f172559e47beb852 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Fri, 14 Apr 2023 10:13:43 -0700 Subject: [PATCH 11/21] define log --- test/capz/npm/npm-e2e.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index fbacc8bb69..5651549b2c 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -1,6 +1,11 @@ # this is required so that if a step fails, following steps are not run set -o errexit +log() { + local msg=$1 + echo "$(date -R): $msg" +} + # Installs NPM + a long-running Pod and does the following tests: # 1. Check VFP is in sync with HNS (filename: vfp-state-prior.ran) # 2. Run Cyclonus (filename: cyclonus.ran) From 23db567053709f4ac5e08dad67e7f2963511fd71 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Fri, 14 Apr 2023 10:16:20 -0700 Subject: [PATCH 12/21] move installation to func --- test/capz/npm/npm-e2e.sh | 79 +++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index 5651549b2c..70bb4ad9b1 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -42,6 +42,48 @@ npm_e2e () { echo "" > npm-e2e.ran + install_npm + + log "sleeping 8m for NPM to bootup, then verifying VFP tags after bootup..." + sleep 8m + verify_vfp_tags_using_npm vfp-state-prior.ran + echo "" > vfp-state-prior.success + + ## NPM cyclonus + run_npm_cyclonus + echo "" > cyclonus.success + + log "sleeping 5m to allow VFP to update tags after cyclonus..." + sleep 5m + log "verifying VFP tags after cyclonus..." + verify_vfp_tags_using_npm vfp-state-after-cyclonus.ran + echo "" > vfp-state-after-cyclonus.success + log "deleting cyclonus pods..." + kubectl delete ns x y z + + ## NPM conformance + run_npm_conformance + echo "" > conformance.success + + log "sleeping 5m to allow VFP to update tags after conformance..." + sleep 5m + log "verifying VFP tags after conformance..." + verify_vfp_tags_using_npm vfp-state-after-conformance.ran + echo "" > vfp-state-after-conformance.success + + ## NPM scale + run_npm_scale $kubeconfigFile + echo "" > scale-connectivity.success + log "sleeping 5m to allow VFP to update tags after scale test..." + sleep 5m + log "verifying VFP tags after scale test..." + verify_vfp_tags_using_npm vfp-state-after-scale.ran + echo "" > vfp-state-after-scale.success + + echo "" > npm-e2e.success +} + +install_npm () { ## disable Calico NetPol log "running helm uninstall on calico (this will remove the tigera-operator and prevent reconciling of the calico-node ClusterRole)..." helm uninstall calico -n tigera-operator @@ -82,43 +124,6 @@ npm_e2e () { sleep 3m kubectl wait --for=condition=Ready pod -l k8s-app=azure-npm -n kube-system --timeout=15m kubectl wait --for=condition=Ready pod -l app=long-runner -n npm-e2e-longrunner --timeout=15m - log "sleeping 8m for NPM to bootup, then verifying VFP tags after bootup..." - sleep 8m - verify_vfp_tags_using_npm vfp-state-prior.ran - echo "" > vfp-state-prior.success - - ## NPM cyclonus - run_npm_cyclonus - echo "" > cyclonus.success - - log "sleeping 5m to allow VFP to update tags after cyclonus..." - sleep 5m - log "verifying VFP tags after cyclonus..." 
- verify_vfp_tags_using_npm vfp-state-after-cyclonus.ran - echo "" > vfp-state-after-cyclonus.success - log "deleting cyclonus pods..." - kubectl delete ns x y z - - ## NPM conformance - run_npm_conformance - echo "" > conformance.success - - log "sleeping 5m to allow VFP to update tags after conformance..." - sleep 5m - log "verifying VFP tags after conformance..." - verify_vfp_tags_using_npm vfp-state-after-conformance.ran - echo "" > vfp-state-after-conformance.success - - ## NPM scale - run_npm_scale $kubeconfigFile - echo "" > scale-connectivity.success - log "sleeping 5m to allow VFP to update tags after scale test..." - sleep 5m - log "verifying VFP tags after scale test..." - verify_vfp_tags_using_npm vfp-state-after-scale.ran - echo "" > vfp-state-after-scale.success - - echo "" > npm-e2e.success } verify_vfp_tags_using_npm () { From 9cc775a8a70e64d2a728fc12ff31626781f1d2ef Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Fri, 14 Apr 2023 13:06:35 -0700 Subject: [PATCH 13/21] set registry keys for npm fixes --- test/capz/npm/npm-e2e.sh | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index 70bb4ad9b1..dbba946959 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -124,6 +124,33 @@ install_npm () { sleep 3m kubectl wait --for=condition=Ready pod -l k8s-app=azure-npm -n kube-system --timeout=15m kubectl wait --for=condition=Ready pod -l app=long-runner -n npm-e2e-longrunner --timeout=15m + + ## set registry keys for NPM fixes + log "updating registry keys and restarting HNS for NPM fixes..." + npmNode=`kubectl get node -owide | grep "Windows Server 2022 Datacenter" | grep -v SchedulingDisabled | grep -v kwok-node | awk '{print $1}' | tail -n 1` || true + if [[ -z $npmNode ]]; then + log "ERROR: unable to find uncordoned node for NPM" + return 1 + fi + npmPod=`kubectl get pod -n kube-system -o wide | grep azure-npm-win | grep $npmNode | grep Running | awk '{print $1}'` || true + if [[ -z "$npmPod" ]]; then + log "ERROR: unable to find running azure-npm-win pod on node $npmNode" + kubectl get pod -n kube-system -o wide + kubectl logs -n kube-system -l k8s-app=azure-npm + return 1 + fi + cmd="reg add HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Services\hns\State /v HnsAclUpdateChange /t REG_DWORD /d 1 /f" + cmd="$cmd ; reg query HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Services\hns\State /v HnsAclUpdateChange" + cmd="$cmd ; reg add HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Services\hns\State /v HnsNpmRefresh /t REG_DWORD /d 1 /f" + cmd="$cmd ; reg query HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Services\hns\State /v HnsNpmRefresh" + cmd="$cmd ; Restart-Service HNS" + cmd="$cmd ; sleep 10" + kubectl exec -it -n kube-system $npmPod -- powershell.exe "$cmd" + log "sleeping 3m to let HNS restart..." + sleep 3m + log "making sure NPM and long-runner are running..." 
+ kubectl wait --for=condition=Ready pod -l k8s-app=azure-npm -n kube-system --timeout=15m + kubectl wait --for=condition=Ready pod -l app=long-runner -n npm-e2e-longrunner --timeout=15m } verify_vfp_tags_using_npm () { From 72b2b4d0e23e69a863f98c1bb5f29f0e21cf8163 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Fri, 14 Apr 2023 13:16:04 -0700 Subject: [PATCH 14/21] remove -it --- test/capz/npm/npm-e2e.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index dbba946959..cb6da51580 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -145,7 +145,7 @@ install_npm () { cmd="$cmd ; reg query HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Services\hns\State /v HnsNpmRefresh" cmd="$cmd ; Restart-Service HNS" cmd="$cmd ; sleep 10" - kubectl exec -it -n kube-system $npmPod -- powershell.exe "$cmd" + kubectl exec -n kube-system $npmPod -- powershell.exe "$cmd" log "sleeping 3m to let HNS restart..." sleep 3m log "making sure NPM and long-runner are running..." From 65ebebd141cedbe21f842597f765e4fff1cce118 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Fri, 14 Apr 2023 14:22:49 -0700 Subject: [PATCH 15/21] check for rehydration error --- test/capz/npm/npm-e2e.sh | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index cb6da51580..a358902ea9 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -112,18 +112,12 @@ install_npm () { npmURL=https://raw.githubusercontent.com/Azure/azure-container-networking/0ea4e9ac3d287f7abb15a34a88beb87697fbbcdd/npm/examples/windows/azure-npm-capz.yaml #https://raw.githubusercontent.com/Azure/azure-container-networking/master/npm/examples/windows/azure-npm-capz.yaml kubectl apply -f $npmURL - ## install long-running pod - log "creating long-runner pod to ensure there's an endpoint for verifying VFP tags..." - kubectl create ns npm-e2e-longrunner - kubectl apply -f https://raw.githubusercontent.com/Azure/azure-container-networking/master/npm/examples/windows/long-running-pod-for-capz.yaml - # verify VFP tags after NPM boots up # seems like the initial NPM Pods are always deleted and new ones are created (within the first minute of being applied it seems) # sleep for some time to avoid running kubectl wait on pods that get deleted - log "waiting for NPM and long-runner to start running..." + log "waiting for NPM to start running..." sleep 3m kubectl wait --for=condition=Ready pod -l k8s-app=azure-npm -n kube-system --timeout=15m - kubectl wait --for=condition=Ready pod -l app=long-runner -n npm-e2e-longrunner --timeout=15m ## set registry keys for NPM fixes log "updating registry keys and restarting HNS for NPM fixes..." @@ -148,6 +142,21 @@ install_npm () { kubectl exec -n kube-system $npmPod -- powershell.exe "$cmd" log "sleeping 3m to let HNS restart..." sleep 3m + + ## install long-running pod and restart HNS again (must install after restarting HNS because of a fix in rehydrating Endpoints in one of the registry keys) + log "creating long-runner pod to ensure there's an endpoint for verifying VFP tags..." 
+ kubectl create ns npm-e2e-longrunner + kubectl apply -f https://raw.githubusercontent.com/Azure/azure-container-networking/master/npm/examples/windows/long-running-pod-for-capz.yaml + sleep 10s + log "making sure long-runner is running" + kubectl wait --for=condition=Ready pod -l app=long-runner -n npm-e2e-longrunner --timeout=15m + + log "restarting HNS again to make sure Endpoints rehydrate correctly" + kubectl exec -n kube-system $npmPod -- powershell.exe "Restart-Service HNS" + + log "sleeping 3m to let HNS restart..." + sleep 3m + log "making sure NPM and long-runner are running..." kubectl wait --for=condition=Ready pod -l k8s-app=azure-npm -n kube-system --timeout=15m kubectl wait --for=condition=Ready pod -l app=long-runner -n npm-e2e-longrunner --timeout=15m @@ -197,7 +206,8 @@ verify_vfp_tags_using_npm () { echo "" > $ranFilename if [[ $hadEndpoints == false ]]; then - log "WARNING: VFP tags not validated for NPM since no endpoints found on node $npmNode" + log "ERROR: no Endpoints found in HNS for node IPs $matchString on node $npmNode. Rehydration of Endpoints likely failed" + return 1 fi if [[ $hadFailure == true ]]; then From 4519f44c750166457ac6f454808a73d30362309b Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Fri, 14 Apr 2023 16:03:31 -0700 Subject: [PATCH 16/21] cleaner command --- test/capz/npm/npm-e2e.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index a358902ea9..428fcf7f6f 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -102,7 +102,7 @@ install_npm () { ## disable scheduling for all but one node for NPM tests, since intra-node connectivity is broken after disabling Calico NetPol kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | awk '{print $1}' | tail -n +2 | xargs kubectl cordon - kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | grep -v SchedulingDisabled | awk '{print $1}' | xargs -I {} bash -c "kubectl label node {} scale-test=true && kubectl label node {} connectivity-test=true" + kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | grep -v SchedulingDisabled | awk '{print $1}' | xargs -I {} kubectl label node {} scale-test=true connectivity-test=true # sleep for some time to let Calico CNI restart sleep 3m From 95ba9f847c043057b09a79b8a0a3f400f6e0c6df Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Wed, 19 Apr 2023 13:02:06 -0700 Subject: [PATCH 17/21] minor updates --- test/capz/npm/npm-e2e.sh | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index 428fcf7f6f..b09a42cc58 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -89,7 +89,7 @@ install_npm () { helm uninstall calico -n tigera-operator kubectl delete ns tigera-operator log "disabling Calico NetworkPolicy functionality by removing NetPol permission from calico-node ClusterRole..." 
- kubectl get clusterrole calico-node -o yaml > original-clusterrole.yaml + kubectl get clusterrole calico-node -o yaml > original-clusterrole.yaml cat original-clusterrole.yaml | perl -0777 -i.original -pe 's/- apiGroups:\n - networking.k8s.io\n resources:\n - networkpolicies\n verbs:\n - watch\n - list\n//' > new-clusterrole.yaml originalLineCount=`cat original-clusterrole.yaml | wc -l` newLineCount=`cat new-clusterrole.yaml | wc -l` @@ -102,7 +102,7 @@ install_npm () { ## disable scheduling for all but one node for NPM tests, since intra-node connectivity is broken after disabling Calico NetPol kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | awk '{print $1}' | tail -n +2 | xargs kubectl cordon - kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | grep -v SchedulingDisabled | awk '{print $1}' | xargs -I {} kubectl label node {} scale-test=true connectivity-test=true + kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | grep -v SchedulingDisabled | awk '{print $1}' | xargs -n 1 -I {} kubectl label node {} scale-test=true connectivity-test=true # sleep for some time to let Calico CNI restart sleep 3m @@ -369,35 +369,33 @@ run_npm_scale () { # exact counts output from script # Pod Counts: - # - 10 fake Pods - # - 10 real Pods + # - 25 fake Pods + # - 5 real Pods # HNS Counts: # - number of ACLs per Pod Endpoint: 6 (6*numNetworkPolicies) - # - number of SetPolicies: ~40 (2*numUniqueLabelsPerPod*numFakePods) + # - number of SetPolicies: ~100 (2*numUniqueLabelsPerPod*numFakePods) # - max IPs per SetPolicy: number of total Pods - - # NOTE: if editing real pod counts, should update --num-scale-pods-to-verify in test-connectivity.sh to test all those Pods ./test-scale.sh --max-kwok-pods-per-node=50 \ - --num-kwok-deployments=10 \ - --num-kwok-replicas=1 \ + --num-kwok-deployments=5 \ + --num-kwok-replicas=5 \ --max-real-pods-per-node=30 \ --num-real-deployments=5 \ - --num-real-replicas=2 \ + --num-real-replicas=1 \ --num-network-policies=1 \ - --num-unapplied-network-policies=10 \ + --num-unapplied-network-policies=3 \ --num-unique-labels-per-pod=2 \ --num-unique-labels-per-deployment=2 \ --num-shared-labels-per-pod=10 \ - --delete-kwok-pods=10 \ - --delete-real-pods=5 \ - --delete-pods-interval=120 \ - --delete-pods-times=2 \ --delete-labels \ --delete-labels-interval=60 \ --delete-labels-times=1 \ --delete-netpols \ --delete-netpols-interval=60 \ - --delete-netpols-times=1 | tee ../../../npm-scale.log || true + --delete-netpols-times=1 \ + --delete-kwok-pods=1 \ + --delete-real-pods=1 \ + --delete-pods-interval=120 \ + --delete-pods-times=1 | tee ../../../npm-scale.log || true rc=0; cat ../../../npm-scale.log | grep "FINISHED" > /dev/null 2>&1 || rc=$? if [[ $rc != 0 ]]; then From 35d9b16bf399614709f98f8dbd7aaf3f1c85bc3c Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Wed, 19 Apr 2023 14:06:34 -0700 Subject: [PATCH 18/21] restart computer --- test/capz/npm/npm-e2e.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index b09a42cc58..2d21380e8a 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -139,6 +139,7 @@ install_npm () { cmd="$cmd ; reg query HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Services\hns\State /v HnsNpmRefresh" cmd="$cmd ; Restart-Service HNS" cmd="$cmd ; sleep 10" + cmd="$cmd ; Restart-Computer" kubectl exec -n kube-system $npmPod -- powershell.exe "$cmd" log "sleeping 3m to let HNS restart..." 
sleep 3m From a4f553aa7c5a0cbf0169ea75717f9dfce183556a Mon Sep 17 00:00:00 2001 From: Hunter Gregory <42728408+huntergregory@users.noreply.github.com> Date: Mon, 24 Apr 2023 11:21:56 -0700 Subject: [PATCH 19/21] run everything always --- test/capz/npm/npm-e2e.sh | 65 ++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index 2d21380e8a..fab67c44be 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -7,21 +7,20 @@ log() { } # Installs NPM + a long-running Pod and does the following tests: -# 1. Check VFP is in sync with HNS (filename: vfp-state-prior.ran) -# 2. Run Cyclonus (filename: cyclonus.ran) -# 3. Check VFP is in sync with HNS (filename: vfp-state-after-cyclonus.ran) -# 4. Run Conformance (filename: conformance.ran) -# 5. Check VFP is in sync with HNS (filename: vfp-state-after-conformance.ran) -# 6. Run scale + connectivity test (filename: scale-connectivity.ran) -# 7. Check VFP is in sync with HNS (filename: vfp-state-after-scale.ran) +# 1.1. Check if HNS rehydration of endpoints works (filename: rehydration.failed) +# 1.2. Check VFP is in sync with HNS (filename: vfp-state-prior.success) +# 2. Run Cyclonus (filename: cyclonus.success) +# 3. Check VFP is in sync with HNS (filename: vfp-state-after-cyclonus.success) +# 4. Run Conformance (filename: conformance.success) +# 5. Check VFP is in sync with HNS (filename: vfp-state-after-conformance.success) +# 6. Run scale + connectivity test (filename: scale-connectivity.success) +# 7. Check VFP is in sync with HNS (filename: vfp-state-after-scale.success) # -# If any step fails, the script will exit and remaining tests won't be run (because of `set -o errexit`) +# NOTE: each step also has a .ran file that is created if the step is run. # -# NOTE: each step has both: -# - A .ran file that is created if the step is run -# - A .success file that is created if the step succeeds -# -# There is also a npm-e2e.ran file that indicates that the npm e2e was run at all, and npm-e2e.success that indicates that all steps succeeded +# There is also: +# - A npm-e2e.ran file that indicates that the npm e2e was run at all +# - A npm-e2e.success that indicates that all steps succeeded npm_e2e () { local kubeconfigFile=$1 if [ -z "$kubeconfigFile" ]; then @@ -35,10 +34,11 @@ npm_e2e () { } log "setting up npm e2e test" + anyStepFailed=false # make sure there are no previous results log "cleaning up previous npm e2e results..." - rm *.log *.ran *.success || true + rm *.log *.ran *.success *.failed || true echo "" > npm-e2e.ran @@ -46,41 +46,36 @@ npm_e2e () { log "sleeping 8m for NPM to bootup, then verifying VFP tags after bootup..." sleep 8m - verify_vfp_tags_using_npm vfp-state-prior.ran - echo "" > vfp-state-prior.success + verify_vfp_tags_using_npm vfp-state-prior || anyStepFailed=true ## NPM cyclonus - run_npm_cyclonus - echo "" > cyclonus.success + run_npm_cyclonus && echo "" > cyclonus.success || anyStepFailed=true log "sleeping 5m to allow VFP to update tags after cyclonus..." sleep 5m log "verifying VFP tags after cyclonus..." - verify_vfp_tags_using_npm vfp-state-after-cyclonus.ran - echo "" > vfp-state-after-cyclonus.success + verify_vfp_tags_using_npm vfp-state-after-cyclonus || anyStepFailed=true log "deleting cyclonus pods..." 
kubectl delete ns x y z ## NPM conformance - run_npm_conformance - echo "" > conformance.success + run_npm_conformance && echo "" > conformance.success || anyStepFailed=true log "sleeping 5m to allow VFP to update tags after conformance..." sleep 5m log "verifying VFP tags after conformance..." - verify_vfp_tags_using_npm vfp-state-after-conformance.ran - echo "" > vfp-state-after-conformance.success + verify_vfp_tags_using_npm vfp-state-after-conformance || anyStepFailed=true ## NPM scale - run_npm_scale $kubeconfigFile - echo "" > scale-connectivity.success + run_npm_scale $kubeconfigFile && echo "" > scale-connectivity.success || anyStepFailed=true log "sleeping 5m to allow VFP to update tags after scale test..." sleep 5m log "verifying VFP tags after scale test..." - verify_vfp_tags_using_npm vfp-state-after-scale.ran - echo "" > vfp-state-after-scale.success + verify_vfp_tags_using_npm vfp-state-after-scale || anyStepFailed=true - echo "" > npm-e2e.success + if [[ $anyStepFailed == false ]]; then + echo "" > npm-e2e.success + fi } install_npm () { @@ -165,8 +160,8 @@ install_npm () { verify_vfp_tags_using_npm () { local ranFilename=$1 - if [[ $ranFilename != *.ran ]]; then - log "ERROR: need a filename that ends in .ran passed as an argument to verify_vfp_tags_using_npm" + if [[ -z $ranFilename ]]; then + log "ERROR: need a filename passed as an argument to verify_vfp_tags_using_npm" return 1 fi @@ -205,17 +200,21 @@ verify_vfp_tags_using_npm () { fi done - echo "" > $ranFilename + echo "" > rehydration.ran if [[ $hadEndpoints == false ]]; then log "ERROR: no Endpoints found in HNS for node IPs $matchString on node $npmNode. Rehydration of Endpoints likely failed" + echo "" > rehydration.failed return 1 fi - + + echo "" > $ranFilename.ran if [[ $hadFailure == true ]]; then log "ERROR: VFP tags are inconsistent with HNS SetPolicies" capture_npm_hns_state return 1 fi + + echo "" > $ranFilename.success } # results in a file called npm-hns-state.zip From e898f1f966e4e528bc206ddf3cdafd34f02e7df5 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Tue, 25 Apr 2023 10:06:20 -0700 Subject: [PATCH 20/21] cyclonus junit and capture hns state once --- test/capz/npm/npm-e2e.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index fab67c44be..ff758613a2 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -39,6 +39,7 @@ npm_e2e () { # make sure there are no previous results log "cleaning up previous npm e2e results..." rm *.log *.ran *.success *.failed || true + rm -rf npm-hns-state/ || true echo "" > npm-e2e.ran @@ -219,9 +220,13 @@ verify_vfp_tags_using_npm () { # results in a file called npm-hns-state.zip capture_npm_hns_state () { + if [[ -f npm-hns-state.zip ]]; then + log "WARNING: not capturing NPM HNS state since state was previously captured" + return 0 + fi + log "capturing NPM HNS state..." kubectl get pod -owide -A - test -d npm-hns-state/ && rm -rf npm-hns-state/ || true mkdir npm-hns-state cd npm-hns-state curl -LO https://raw.githubusercontent.com/Azure/azure-container-networking/master/debug/windows/npm/win-debug.sh @@ -313,6 +318,7 @@ run_npm_cyclonus () { log "beginning npm cyclonus test..." 
echo "" > cyclonus.ran ./cyclonus_linux_amd64/cyclonus generate \ + --junit-results-file=cyclonus.xml \ --fail-fast \ --noisy=true \ --retries=7 \ From 2e3137c9b0c140ee0d63298dd82dbe19877da44d Mon Sep 17 00:00:00 2001 From: Hunter Gregory <42728408+huntergregory@users.noreply.github.com> Date: Tue, 25 Apr 2023 13:42:10 -0700 Subject: [PATCH 21/21] delete conformance namespaces --- test/capz/npm/npm-e2e.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index ff758613a2..2a33e23d2c 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -57,7 +57,7 @@ npm_e2e () { log "verifying VFP tags after cyclonus..." verify_vfp_tags_using_npm vfp-state-after-cyclonus || anyStepFailed=true log "deleting cyclonus pods..." - kubectl delete ns x y z + kubectl delete ns x y z || true ## NPM conformance run_npm_conformance && echo "" > conformance.success || anyStepFailed=true @@ -66,6 +66,8 @@ npm_e2e () { sleep 5m log "verifying VFP tags after conformance..." verify_vfp_tags_using_npm vfp-state-after-conformance || anyStepFailed=true + log "deleting NPM conformance namespaces if they were leftover from a failure..." + kubectl delete ns -l pod-security.kubernetes.io/enforce=baseline || true ## NPM scale run_npm_scale $kubeconfigFile && echo "" > scale-connectivity.success || anyStepFailed=true