From 0bc064a111059566825e0c246e658a9b1dc441c1 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Tue, 11 Apr 2023 17:50:49 -0700 Subject: [PATCH 01/21] wip --- .../windows-validation-on-capz/npm/npm-e2e.sh | 360 ++++++++++++++++++ 1 file changed, 360 insertions(+) create mode 100644 test/windows-validation-on-capz/npm/npm-e2e.sh diff --git a/test/windows-validation-on-capz/npm/npm-e2e.sh b/test/windows-validation-on-capz/npm/npm-e2e.sh new file mode 100644 index 0000000000..81504bde2e --- /dev/null +++ b/test/windows-validation-on-capz/npm/npm-e2e.sh @@ -0,0 +1,360 @@ +# assumes that the following is set: +# set -o errexit + +# Installs NPM + a long-running Pod and does the following tests: +# 1. Check VFP is in sync with HNS (filename: vfp-state-prior.ran) +# 2. Run Cyclonus (filename: cyclonus.ran) +# 3. Check VFP is in sync with HNS (filename: vfp-state-after-cyclonus.ran) +# 4. Run Conformance (filename: conformance.ran) +# 5. Check VFP is in sync with HNS (filename: vfp-state-after-conformance.ran) +# 6. Run scale + connectivity test (filename: scale-connectivity.ran) +# 7. Check VFP is in sync with HNS (filename: vfp-state-after-scale.ran) +# +# If any step fails, the script will exit and remaining tests won't be run (because of `set -o errexit`) +# +# NOTE: each step has both: +# - A .ran file that is created if the step is run +# - A .success file that is created if the step succeeds +# +# There is also a npm-e2e.ran file that indicates that the npm e2e was run at all +npm_e2e () { + local kubeconfigFile=$1 + if [ -z "$kubeconfigFile" ]; then + log "ERROR: kubeconfigFile not set. can't run NPM e2e" + return 1 + fi + + test -f $kubeconfigFile || { + log "ERROR: kubeconfigFile $kubeconfigFile not found. can't run NPM e2e" + return 1 + } + + log "setting up npm e2e test" + + # make sure there are no previous results + log "cleaning up previous npm e2e results..." + rm npm-e2e.log npm-cyclonus.log npm-scale.log npm-scale-connectivity.log kwok.log || true + rm *.ran || true + rm *.success || true + + echo "" > npm-e2e.ran + + ## disable Calico NetPol + log "running helm uninstall on calico (this will remove the tigera-operator and prevent reconciling of the calico-node ClusterRole)..." + helm uninstall calico -n tigera-operator + kubectl delete ns tigera-operator + log "disabling Calico NetworkPolicy functionality by removing NetPol permission from calico-node ClusterRole..." + kubectl get clusterrole calico-node -o yaml > original-clusterrole.yaml + cat original-clusterrole.yaml | perl -0777 -i.original -pe 's/- apiGroups:\n - networking.k8s.io\n resources:\n - networkpolicies\n verbs:\n - watch\n - list\n//' > new-clusterrole.yaml + originalLineCount=`cat original-clusterrole.yaml | wc -l` + newLineCount=`cat new-clusterrole.yaml | wc -l` + if [ $originalLineCount != $(($newLineCount + 7)) ]; then + # NOTE: this check will only work the first time this script is run, since the original-clusterrole.yaml will be modified + log "ERROR: unable to run NPM e2e. unexpected line count difference between original and new calico-node clusterrole. 
original: $originalLineCount, new: $newLineCount" + return 1 + fi + kubectl rollout restart ds -n calico-system calico-node-windows + + ## disable scheduling for all but one node for NPM tests, since intra-node connectivity is broken after disabling Calico NetPol + kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | awk '{print $1}' | tail -n +2 | xargs kubectl cordon + kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | grep -v SchedulingDisabled | awk '{print $1}' | xargs -I {} bash -c "kubectl label {} scale-test=true && kubectl label {} connectivity-test=true" + + # sleep for some time to let Calico CNI restart + sleep 3m + + ## install Azure NPM + log "installing Azure NPM..." + npmURL=https://raw.githubusercontent.com/Azure/azure-container-networking/0ea4e9ac3d287f7abb15a34a88beb87697fbbcdd/npm/examples/windows/azure-npm-capz.yaml #https://raw.githubusercontent.com/Azure/azure-container-networking/master/npm/examples/windows/azure-npm-capz.yaml + kubectl apply -f $npmURL + + ## install long-running pod + log "creating long-runner pod to ensure there's an endpoint for verifying VFP tags..." + kubectl create ns npm-e2e-longrunner + kubectl apply -f https://raw.githubusercontent.com/Azure/azure-container-networking/master/npm/examples/windows/long-running-pod-for-capz.yaml + + # verify VFP tags after NPM boots up + # seems like the initial NPM Pods are always deleted and new ones are created (within the first minute of being applied it seems) + # sleep for some time to avoid running kubectl wait on pods that get deleted + log "waiting for NPM and long-runner to start running..." + sleep 3m + kubectl wait --for=condition=Ready pod -l k8s-app=azure-npm -n kube-system --timeout=15m + kubectl wait --for=condition=Ready pod -l app=long-runner -n npm-e2e-longrunner --timeout=15m + log "sleeping 8m for NPM to bootup, then verifying VFP tags after bootup..." + sleep 8m + verify_vfp_tags_using_npm vfp-state-prior.ran + echo "" > vfp-state-prior.success + + ## NPM cyclonus + run_npm_cyclonus + echo "" > cyclonus.success + + log "sleeping 3m to allow VFP to update tags after cyclonus..." + sleep 3m + log "verifying VFP tags after cyclonus..." + verify_vfp_tags_using_npm vfp-state-after-cyclonus.ran + echo "" > vfp-state-after-cyclonus.success + + ## NPM conformance + run_npm_conformance + echo "" > conformance.success + + log "sleeping 3m to allow VFP to update tags after conformance..." + sleep 3m + log "verifying VFP tags after conformance..." + verify_vfp_tags_using_npm vfp-state-after-conformance.ran + echo "" > vfp-state-after-conformance.success + + ## NPM scale + run_npm_scale $kubeconfigFile + echo "" > scale.success + log "sleeping 3m to allow VFP to update tags after scale test..." + sleep 3m + log "verifying VFP tags after scale test..." + verify_vfp_tags_using_npm vfp-state-after-scale.ran +} + +verify_vfp_tags_using_npm () { + local ranFilename=$1 + if [[ $ranFilename != *.ran ]]; then + log "ERROR: need a filename that ends in .ran passed as an argument to verify_vfp_tags_using_npm" + return 1 + fi + + log "verifying VFP tags are equal to HNS SetPolicies..." 
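    # High-level flow of this check:
    #   1. pick the single uncordoned Windows node and the Running azure-npm-win Pod on it
    #   2. count HNS SetPolicies of type IPSET on the (Calico) HNS network from inside the NPM Pod
    #   3. for each HNS Endpoint whose IP belongs to a Pod on that node, count VFP tags via vfpctrl /list-tag
    #   4. every per-endpoint VFP tag count must equal the HNS IPSET count, otherwise VFP is out of sync with HNS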
+ npmNode=`kubectl get node -owide | grep "Windows Server 2022 Datacenter" | grep -v SchedulingDisabled | grep -v kwok-node | awk '{print $1}' | tail -n 1` || true + if [[ -z $npmNode ]]; then + log "ERROR: unable to find uncordoned node for NPM" + return 1 + fi + npmPod=`kubectl get pod -n kube-system -o wide | grep azure-npm-win | grep $npmNode | grep Running | awk '{print $1}'` || true + if [[ -z "$npmPod" ]]; then + log "ERROR: unable to find running azure-npm-win pod on node $npmNode" + kubectl get pod -n kube-system -o wide + kubectl logs -n kube-system -l k8s-app=azure-npm + return 1 + fi + + onNodeIPs=() ; for ip in `kubectl get pod -owide $kc -A | grep $npmNode | grep -oP "\d+\.\d+\.\d+\.\d+" | sort | uniq`; do onNodeIPs+=($ip); done + matchString="" ; for ip in ${onNodeIPs[@]}; do matchString+=" \"${ip}\""; done + matchString=`echo $matchString | tr ' ' ','` + log "using matchString: $matchString" + ipsetCount=`kubectl exec -n kube-system $npmPod $kc -- powershell.exe "(Get-HNSNetwork | ? Name -Like Calico).Policies | convertto-json > setpols.txt ; (type .\setpols.txt | select-string '\"PolicyType\": \"IPSET\"').count" | tr -d '\r'` + log "HNS IPSET count: $ipsetCount" + kubectl exec -n kube-system $npmPod $kc -- powershell.exe 'echo "attempting to delete previous results if they exist" ; Remove-Item -path vfptags -recurse ; mkdir vfptags' + kubectl exec -n kube-system $npmPod $kc -- powershell.exe '$endpoints = (Get-HnsEndpoint | ? IPAddress -In '"$matchString"').Id ; foreach ($port in $endpoints) { vfpctrl /port $port /list-tag > vfptags\$port.txt ; (type vfptags\$port.txt | select-string -context 2 "TAG :").count }' > vfp-tag-counts.txt + + hadEndpoints=false + hadFailure=false + for count in `cat vfp-tag-counts.txt | xargs -n 1 echo`; do + hadEndpoints=true + count=`echo $count | tr -d '\r'` + log "VFP tag count: $count" + if [[ $count != $ipsetCount ]]; then + log "WARNING: VFP tag count $count does not match HNS IPSET count $ipsetCount" + hadFailure=true + fi + done + + echo "" > $ranFilename + if [[ $hadEndpoints == false ]]; then + log "WARNING: VFP tags not validated for NPM since no endpoints found on node $npmNode" + fi + + if [[ $hadFailure == true ]]; then + log "ERROR: VFP tags are inconsistent with HNS SetPolicies" + capture_npm_hns_state + return 1 + fi +} + +# results in a file called npm-hns-state.zip +capture_npm_hns_state () { + log "capturing NPM HNS state..." + kubectl get pod -owide -A + test -d npm-hns-state/ && rm -rf npm-hns-state/ || true + mkdir npm-hns-state + cd npm-hns-state + curl -LO https://raw.githubusercontent.com/Azure/azure-container-networking/master/debug/windows/npm/win-debug.sh + chmod u+x ./win-debug.sh + curl -LO https://raw.githubusercontent.com/Azure/azure-container-networking/master/debug/windows/npm/pod_exec.ps1 + ./win-debug.sh + cd .. + zip -9qr npm-hns-state.zip npm-hns-state + # to unzip: + # unzip npm-hns-state.zip -d npm-hns-state +} + +# currently takes ~3 hours to run +# e.g. 19:37:05 to 22:32:44 and 19:16:18 to 22:29:13 +run_npm_conformance () { + ## install NPM e2e binary + log "ensuring NPM e2e binary is installed" + rc=0; test -f npm-e2e.test || rc=$? + if [[ $rc == 0 ]]; then + log "NPM e2e binary found, skipping install" + else + log "NPM e2e binary not found, installing..." 
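        # the block below clones a kubernetes fork (branch quit-on-failure, which per its name is intended to
        # bail out of the suite on the first failure), builds test/e2e/e2e.test, and saves it as ./npm-e2e.test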
+ test -d npm-kubernetes/ && rm -rf npm-kubernetes/ || true + mkdir npm-kubernetes + cd npm-kubernetes + # NOTE: if this is not downloaded every run, then probably need to sleep before the VFP tag verification + git clone https://github.com/huntergregory/kubernetes.git --depth=1 --branch=quit-on-failure + cd kubernetes + make WHAT=test/e2e/e2e.test + cd ../.. + mv npm-kubernetes/kubernetes/_output/local/bin/linux/amd64/e2e.test ./npm-e2e.test + rm -rf npm-kubernetes/ + fi + + log "beginning npm conformance test..." + + toRun="NetworkPolicy" + + nomatch1="should enforce policy based on PodSelector or NamespaceSelector" + nomatch2="should enforce policy based on NamespaceSelector with MatchExpressions using default ns label" + nomatch3="should enforce policy based on PodSelector and NamespaceSelector" + nomatch4="should enforce policy based on Multiple PodSelectors and NamespaceSelectors" + cidrExcept1="should ensure an IP overlapping both IPBlock.CIDR and IPBlock.Except is allowed" + cidrExcept2="should enforce except clause while egress access to server in CIDR block" + namedPorts="named port" + wrongK8sVersion="Netpol API" + toSkip="\[LinuxOnly\]|$nomatch1|$nomatch2|$nomatch3|$nomatch4|$cidrExcept1|$cidrExcept2|$namedPorts|$wrongK8sVersion|SCTP" + + echo "" > conformance.ran + KUBERNETES_SERVICE_PORT=443 ./npm-e2e.test \ + --provider=skeleton \ + --ginkgo.noColor \ + --ginkgo.focus="$toRun" \ + --ginkgo.skip="$toSkip" \ + --allowed-not-ready-nodes=1 \ + --node-os-distro="windows" \ + --disable-log-dump \ + --ginkgo.progress=true \ + --ginkgo.slowSpecThreshold=120.0 \ + --ginkgo.flakeAttempts=0 \ + --ginkgo.trace=true \ + --ginkgo.v=true \ + --dump-logs-on-failure=true \ + --report-dir="${ARTIFACTS}" \ + --prepull-images=true \ + --v=5 "${ADDITIONAL_E2E_ARGS[@]}" | tee npm-e2e.log || true + + # grep "FAIL: unable to initialize resources: after 10 tries, 2 HTTP servers are not ready + + log "finished npm conformance test" + ## report if there's a failure + rc=0; cat npm-e2e.log | grep '"failed":1' > /dev/null 2>&1 || rc=$? + if [ $rc -eq 0 ]; then + log "ERROR: found failure in npm e2e test log" + capture_npm_hns_state + return 1 + fi +} + +# currently takes ~3.5 hours to run +# e.g. 20:49:05 to 00:21:12 +run_npm_cyclonus () { + log "installing cyclonus binary..." + curl -fsSL github.com/mattfenwick/cyclonus/releases/latest/download/cyclonus_linux_amd64.tar.gz | tar -zxv + + log "beginning npm cyclonus test..." + echo "" > cyclonus.ran + ./cyclonus_linux_amd64/cyclonus generate \ + --fail-fast \ + --noisy=true \ + --retries=7 \ + --ignore-loopback=true \ + --perturbation-wait-seconds=20 \ + --pod-creation-timeout-seconds=480 \ + --job-timeout-seconds=15 \ + --server-protocol=TCP,UDP \ + --exclude sctp,named-port,ip-block-with-except,multi-peer,upstream-e2e,example,end-port,namespaces-by-default-label,update-policy | tee npm-cyclonus.log || true + + # for debugging with a smaller set of tests, use: + # --exclude sctp,named-port,ip-block-with-except,multi-peer,upstream-e2e,example,end-port,namespaces-by-default-label,update-policy,all-namespaces,all-pods,allow-all,any-peer,any-port,any-port-protocol,deny-all,ip-block-no-except,multi-port/protocol,namespaces-by-label,numbered-port,pathological,peer-ipblock,peer-pods,pods-by-label,policy-namespace,port,protocol,rule,tcp,udp --include conflict,direction,egress,ingress,miscellaneous + + rc=0; cat npm-cyclonus.log | grep "failed" > /dev/null 2>&1 || rc=$? 
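    # grep exits 0 when the word "failed" appears anywhere in the cyclonus output, so rc == 0 means at least one test case failed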
+ if [[ $rc == 0 ]]; then + echo "ERROR: failures encountered in npm cyclonus test" + capture_npm_hns_state + return 1 + fi + + rc=0; cat npm-cyclonus.log | grep "SummaryTable:" > /dev/null 2>&1 || rc=$? + if [[ $rc != 0 ]]; then + log "ERROR: npm cyclonus test did not finish for some reason" + capture_npm_hns_state + return 1 + fi +} + +run_npm_scale () { + local kubeconfigFile=$1 + + log "beginning npm scale test with kubeconfig [$kubeconfigFile]..." + + rm -rf azure-container-networking/ || true + git clone --depth=1 --branch=master https://github.com/Azure/azure-container-networking.git + + cd azure-container-networking/test/scale/ + + chmod u+x test-scale.sh + chmod u+x run-kwok.sh + cd connectivity/ + chmod u+x test-connectivity.sh + cd ../ + + ./run-kwok.sh $kubeconfigFile > ../../../kwok.log & + kwok_pid=$! + + # exact counts output from script + # Pod Counts: + # - 10 fake Pods + # - 10 real Pods + # HNS Counts: + # - number of ACLs per Pod Endpoint: 6 (6*numNetworkPolicies) + # - number of SetPolicies: ~40 (2*numUniqueLabelsPerPod*numFakePods) + # - max IPs per SetPolicy: number of total Pods + + # NOTE: if editing real pod counts, should update --num-scale-pods-to-verify in test-connectivity.sh to test all those Pods + ./test-scale.sh \ + --max-kwok-pods-per-node=50 \ + --num-kwok-deployments=10 \ + --num-kwok-replicas=1 \ + --max-real-pods-per-node=30 \ + --num-real-deployments=5 \ + --num-real-replicas=2 \ + --num-network-policies=1 \ + --num-unique-labels-per-pod=2 \ + --num-unique-labels-per-deployment=2 \ + --num-shared-labels-per-pod=10 | tee ../../../npm-scale.log || true + + rc=0; cat ../../../npm-scale.log | grep "FINISHED" > /dev/null 2>&1 || rc=$? + if [[ $rc != 0 ]]; then + log "ERROR: npm scale test did not properly scale" + kill $kwok_pid + cd ../../../ + return 1 + fi + + log "beginning npm scale connectivity test..." + + cd connectivity/ + + minutesToWaitForInitialConnectivity=30 + minutesToWaitAfterAddingNetPol=10 + echo "" > ../../../../scale-connectivity.ran + ./test-connectivity.sh --num-scale-pods-to-verify=10 --max-wait-for-initial-connectivity=$((60*minutesToWaitForInitialConnectivity)) --max-wait-after-adding-netpol=$((60*minutesToWaitAfterAddingNetPol)) | tee ../../../../npm-scale-connectivity.log || true + + cd ../../../../ + rc=0; cat npm-scale-connectivity.log | grep "FINISHED" > /dev/null 2>&1 || rc=$? + if [[ $rc != 0 ]]; then + log "ERROR: npm scale test connectivity failed" + kill $kwok_pid + capture_npm_hns_state + return 1 + fi + + echo "" > scale-connectivity.success + kill $kwok_pid +} From f9dcfb655f68232df119b45512d9aabcfcef084c Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Tue, 11 Apr 2023 18:21:48 -0700 Subject: [PATCH 02/21] wip2 --- .../windows-validation-on-capz/npm/npm-e2e.sh | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/test/windows-validation-on-capz/npm/npm-e2e.sh b/test/windows-validation-on-capz/npm/npm-e2e.sh index 81504bde2e..80f65f5912 100644 --- a/test/windows-validation-on-capz/npm/npm-e2e.sh +++ b/test/windows-validation-on-capz/npm/npm-e2e.sh @@ -33,9 +33,7 @@ npm_e2e () { # make sure there are no previous results log "cleaning up previous npm e2e results..." 
- rm npm-e2e.log npm-cyclonus.log npm-scale.log npm-scale-connectivity.log kwok.log || true - rm *.ran || true - rm *.success || true + rm *.log *.ran *.success || true echo "" > npm-e2e.ran @@ -299,12 +297,19 @@ run_npm_scale () { cd azure-container-networking/test/scale/ chmod u+x test-scale.sh - chmod u+x run-kwok.sh cd connectivity/ chmod u+x test-connectivity.sh cd ../ - ./run-kwok.sh $kubeconfigFile > ../../../kwok.log & + # run kwok + kwok --kubeconfig=$kubeconfigFile \ + --cidr=155.0.0.0/16 \ + --node-ip=155.0.0.1 \ + --manage-all-nodes=false \ + --manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \ + --manage-nodes-with-label-selector= \ + --disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \ + --disregard-status-with-label-selector= > ../../../kwok.log & kwok_pid=$! # exact counts output from script @@ -317,8 +322,7 @@ run_npm_scale () { # - max IPs per SetPolicy: number of total Pods # NOTE: if editing real pod counts, should update --num-scale-pods-to-verify in test-connectivity.sh to test all those Pods - ./test-scale.sh \ - --max-kwok-pods-per-node=50 \ + ./test-scale.sh --max-kwok-pods-per-node=50 \ --num-kwok-deployments=10 \ --num-kwok-replicas=1 \ --max-real-pods-per-node=30 \ @@ -337,6 +341,14 @@ run_npm_scale () { return 1 fi + log "waiting up to 10m for all Pods to be running..." + kubectl wait --for=condition=Ready -n scale-test --all pods --timeout=10m > ../../../waiting-for-pods.log || { + log "ERROR: not all scale Pods are running" + kill $kwok_pid + cd ../../../ + return 1 + } + log "beginning npm scale connectivity test..." cd connectivity/ From de062fb10cc6e99cd704441a52a366f51b069471 Mon Sep 17 00:00:00 2001 From: Hunter Gregory <42728408+huntergregory@users.noreply.github.com> Date: Wed, 12 Apr 2023 11:11:51 -0700 Subject: [PATCH 03/21] set -o errexit --- test/windows-validation-on-capz/npm/npm-e2e.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/windows-validation-on-capz/npm/npm-e2e.sh b/test/windows-validation-on-capz/npm/npm-e2e.sh index 80f65f5912..9984617231 100644 --- a/test/windows-validation-on-capz/npm/npm-e2e.sh +++ b/test/windows-validation-on-capz/npm/npm-e2e.sh @@ -1,5 +1,5 @@ -# assumes that the following is set: -# set -o errexit +# this is required so that if a step fails, following steps are not run +set -o errexit # Installs NPM + a long-running Pod and does the following tests: # 1. 
Check VFP is in sync with HNS (filename: vfp-state-prior.ran) From 744fd5b51467ae52fc5f201b8cf2cddef0f0f80c Mon Sep 17 00:00:00 2001 From: Hunter Gregory <42728408+huntergregory@users.noreply.github.com> Date: Wed, 12 Apr 2023 14:38:25 -0700 Subject: [PATCH 04/21] note about debugging --- test/windows-validation-on-capz/npm/npm-e2e.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/windows-validation-on-capz/npm/npm-e2e.sh b/test/windows-validation-on-capz/npm/npm-e2e.sh index 9984617231..19f1de4967 100644 --- a/test/windows-validation-on-capz/npm/npm-e2e.sh +++ b/test/windows-validation-on-capz/npm/npm-e2e.sh @@ -217,6 +217,9 @@ run_npm_conformance () { namedPorts="named port" wrongK8sVersion="Netpol API" toSkip="\[LinuxOnly\]|$nomatch1|$nomatch2|$nomatch3|$nomatch4|$cidrExcept1|$cidrExcept2|$namedPorts|$wrongK8sVersion|SCTP" + + # to debug with one test case, uncomment this + # toRun="NetworkPolicy API should support creating NetworkPolicy API operations" echo "" > conformance.ran KUBERNETES_SERVICE_PORT=443 ./npm-e2e.test \ @@ -268,8 +271,8 @@ run_npm_cyclonus () { --server-protocol=TCP,UDP \ --exclude sctp,named-port,ip-block-with-except,multi-peer,upstream-e2e,example,end-port,namespaces-by-default-label,update-policy | tee npm-cyclonus.log || true - # for debugging with a smaller set of tests, use: - # --exclude sctp,named-port,ip-block-with-except,multi-peer,upstream-e2e,example,end-port,namespaces-by-default-label,update-policy,all-namespaces,all-pods,allow-all,any-peer,any-port,any-port-protocol,deny-all,ip-block-no-except,multi-port/protocol,namespaces-by-label,numbered-port,pathological,peer-ipblock,peer-pods,pods-by-label,policy-namespace,port,protocol,rule,tcp,udp --include conflict,direction,egress,ingress,miscellaneous + # for debugging with a smaller set of tests, use this as the last line instead + # --exclude sctp,named-port,ip-block-with-except,multi-peer,upstream-e2e,example,end-port,namespaces-by-default-label,update-policy,all-namespaces,all-pods,allow-all,any-peer,any-port,any-port-protocol,deny-all,ip-block-no-except,multi-port/protocol,namespaces-by-label,numbered-port,pathological,peer-ipblock,peer-pods,pods-by-label,policy-namespace,port,protocol,rule,tcp,udp --include conflict,direction,egress,ingress,miscellaneous | tee npm-cyclonus.log || true rc=0; cat npm-cyclonus.log | grep "failed" > /dev/null 2>&1 || rc=$? 
if [[ $rc == 0 ]]; then From 9861d970fffb22f7d57d48aa1138c335186600d3 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Wed, 12 Apr 2023 16:55:32 -0700 Subject: [PATCH 05/21] rename folder --- test/{windows-validation-on-capz => capz}/npm/npm-e2e.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/{windows-validation-on-capz => capz}/npm/npm-e2e.sh (100%) diff --git a/test/windows-validation-on-capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh similarity index 100% rename from test/windows-validation-on-capz/npm/npm-e2e.sh rename to test/capz/npm/npm-e2e.sh From 504ce17ef5b84459c18d6cf3b46decf78cb87dad Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Wed, 12 Apr 2023 17:58:21 -0700 Subject: [PATCH 06/21] small fixes --- test/capz/npm/npm-e2e.sh | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index 19f1de4967..cd280b3f29 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -16,7 +16,7 @@ set -o errexit # - A .ran file that is created if the step is run # - A .success file that is created if the step succeeds # -# There is also a npm-e2e.ran file that indicates that the npm e2e was run at all +# There is also a npm-e2e.ran file that indicates that the npm e2e was run at all, and npm-e2e.success that indicates that all steps succeeded npm_e2e () { local kubeconfigFile=$1 if [ -z "$kubeconfigFile" ]; then @@ -55,7 +55,7 @@ npm_e2e () { ## disable scheduling for all but one node for NPM tests, since intra-node connectivity is broken after disabling Calico NetPol kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | awk '{print $1}' | tail -n +2 | xargs kubectl cordon - kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | grep -v SchedulingDisabled | awk '{print $1}' | xargs -I {} bash -c "kubectl label {} scale-test=true && kubectl label {} connectivity-test=true" + kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | grep -v SchedulingDisabled | awk '{print $1}' | xargs -I {} bash -c "kubectl label node {} scale-test=true && kubectl label node {} connectivity-test=true" # sleep for some time to let Calico CNI restart sleep 3m @@ -104,11 +104,14 @@ npm_e2e () { ## NPM scale run_npm_scale $kubeconfigFile - echo "" > scale.success + echo "" > scale-connectivity.success log "sleeping 3m to allow VFP to update tags after scale test..." sleep 3m log "verifying VFP tags after scale test..." verify_vfp_tags_using_npm vfp-state-after-scale.ran + echo "" > vfp-state-after-scale.success + + echo "" > npm-e2e.success } verify_vfp_tags_using_npm () { @@ -132,14 +135,14 @@ verify_vfp_tags_using_npm () { return 1 fi - onNodeIPs=() ; for ip in `kubectl get pod -owide $kc -A | grep $npmNode | grep -oP "\d+\.\d+\.\d+\.\d+" | sort | uniq`; do onNodeIPs+=($ip); done + onNodeIPs=() ; for ip in `kubectl get pod -owide -A | grep $npmNode | grep -oP "\d+\.\d+\.\d+\.\d+" | sort | uniq`; do onNodeIPs+=($ip); done matchString="" ; for ip in ${onNodeIPs[@]}; do matchString+=" \"${ip}\""; done matchString=`echo $matchString | tr ' ' ','` log "using matchString: $matchString" - ipsetCount=`kubectl exec -n kube-system $npmPod $kc -- powershell.exe "(Get-HNSNetwork | ? Name -Like Calico).Policies | convertto-json > setpols.txt ; (type .\setpols.txt | select-string '\"PolicyType\": \"IPSET\"').count" | tr -d '\r'` + ipsetCount=`kubectl exec -n kube-system $npmPod -- powershell.exe "(Get-HNSNetwork | ? 
Name -Like Calico).Policies | convertto-json > setpols.txt ; (type .\setpols.txt | select-string '\"PolicyType\": \"IPSET\"').count" | tr -d '\r'` log "HNS IPSET count: $ipsetCount" - kubectl exec -n kube-system $npmPod $kc -- powershell.exe 'echo "attempting to delete previous results if they exist" ; Remove-Item -path vfptags -recurse ; mkdir vfptags' - kubectl exec -n kube-system $npmPod $kc -- powershell.exe '$endpoints = (Get-HnsEndpoint | ? IPAddress -In '"$matchString"').Id ; foreach ($port in $endpoints) { vfpctrl /port $port /list-tag > vfptags\$port.txt ; (type vfptags\$port.txt | select-string -context 2 "TAG :").count }' > vfp-tag-counts.txt + kubectl exec -n kube-system $npmPod -- powershell.exe 'echo "attempting to delete previous results if they exist" ; Remove-Item -path vfptags -recurse ; mkdir vfptags' + kubectl exec -n kube-system $npmPod -- powershell.exe '$endpoints = (Get-HnsEndpoint | ? IPAddress -In '"$matchString"').Id ; foreach ($port in $endpoints) { vfpctrl /port $port /list-tag > vfptags\$port.txt ; (type vfptags\$port.txt | select-string -context 2 "TAG :").count }' > vfp-tag-counts.txt hadEndpoints=false hadFailure=false From f6a71bb9b6affc2c6b4ec4b42f7b2797cc3ee5db Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Wed, 12 Apr 2023 17:58:56 -0700 Subject: [PATCH 07/21] longer sleep befor checking vfp tags --- test/capz/npm/npm-e2e.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index cd280b3f29..ae672bafa2 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -86,8 +86,8 @@ npm_e2e () { run_npm_cyclonus echo "" > cyclonus.success - log "sleeping 3m to allow VFP to update tags after cyclonus..." - sleep 3m + log "sleeping 5m to allow VFP to update tags after cyclonus..." + sleep 5m log "verifying VFP tags after cyclonus..." verify_vfp_tags_using_npm vfp-state-after-cyclonus.ran echo "" > vfp-state-after-cyclonus.success @@ -96,8 +96,8 @@ npm_e2e () { run_npm_conformance echo "" > conformance.success - log "sleeping 3m to allow VFP to update tags after conformance..." - sleep 3m + log "sleeping 5m to allow VFP to update tags after conformance..." + sleep 5m log "verifying VFP tags after conformance..." verify_vfp_tags_using_npm vfp-state-after-conformance.ran echo "" > vfp-state-after-conformance.success @@ -105,8 +105,8 @@ npm_e2e () { ## NPM scale run_npm_scale $kubeconfigFile echo "" > scale-connectivity.success - log "sleeping 3m to allow VFP to update tags after scale test..." - sleep 3m + log "sleeping 5m to allow VFP to update tags after scale test..." + sleep 5m log "verifying VFP tags after scale test..." verify_vfp_tags_using_npm vfp-state-after-scale.ran echo "" > vfp-state-after-scale.success From 52e515f4977b404a36da8cf7636ff7b6e72f4997 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Thu, 13 Apr 2023 13:09:41 -0700 Subject: [PATCH 08/21] delete cyc pods and remove other code --- test/capz/npm/npm-e2e.sh | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index ae672bafa2..5f3068b291 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -91,6 +91,8 @@ npm_e2e () { log "verifying VFP tags after cyclonus..." verify_vfp_tags_using_npm vfp-state-after-cyclonus.ran echo "" > vfp-state-after-cyclonus.success + log "deleting cyclonus pods..." 
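+    # x, y, and z are the namespaces cyclonus creates for its probe Pods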
+ kubectl delete ns x y z ## NPM conformance run_npm_conformance @@ -347,14 +349,6 @@ run_npm_scale () { return 1 fi - log "waiting up to 10m for all Pods to be running..." - kubectl wait --for=condition=Ready -n scale-test --all pods --timeout=10m > ../../../waiting-for-pods.log || { - log "ERROR: not all scale Pods are running" - kill $kwok_pid - cd ../../../ - return 1 - } - log "beginning npm scale connectivity test..." cd connectivity/ From 8bfa60d08546e0461bb86c5b4b8206e6f6b6e8ee Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Thu, 13 Apr 2023 13:48:41 -0700 Subject: [PATCH 09/21] add new scale parameters --- test/capz/npm/npm-e2e.sh | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index 5f3068b291..1c862f0bde 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -300,7 +300,7 @@ run_npm_scale () { log "beginning npm scale test with kubeconfig [$kubeconfigFile]..." rm -rf azure-container-networking/ || true - git clone --depth=1 --branch=master https://github.com/Azure/azure-container-networking.git + git clone https://github.com/Azure/azure-container-networking.git --depth=1 --branch=hgregory/edit-scale cd azure-container-networking/test/scale/ @@ -337,9 +337,20 @@ run_npm_scale () { --num-real-deployments=5 \ --num-real-replicas=2 \ --num-network-policies=1 \ + --num-unapplied-network-policies=10 \ --num-unique-labels-per-pod=2 \ --num-unique-labels-per-deployment=2 \ - --num-shared-labels-per-pod=10 | tee ../../../npm-scale.log || true + --num-shared-labels-per-pod=10 \ + --delete-kwok-pods=10 \ + --delete-real-pods=5 \ + --delete-pods-interval=120 \ + --delete-pods-times=2 \ + --delete-labels \ + --delete-labels-interval=60 \ + --delete-labels-times=1 \ + --delete-netpols \ + --delete-netpols-interval=60 \ + --delete-netpols-times=1 | tee ../../../npm-scale.log || true rc=0; cat ../../../npm-scale.log | grep "FINISHED" > /dev/null 2>&1 || rc=$? if [[ $rc != 0 ]]; then @@ -356,7 +367,7 @@ run_npm_scale () { minutesToWaitForInitialConnectivity=30 minutesToWaitAfterAddingNetPol=10 echo "" > ../../../../scale-connectivity.ran - ./test-connectivity.sh --num-scale-pods-to-verify=10 --max-wait-for-initial-connectivity=$((60*minutesToWaitForInitialConnectivity)) --max-wait-after-adding-netpol=$((60*minutesToWaitAfterAddingNetPol)) | tee ../../../../npm-scale-connectivity.log || true + ./test-connectivity.sh --num-scale-pods-to-verify=all --max-wait-for-initial-connectivity=$((60*minutesToWaitForInitialConnectivity)) --max-wait-after-adding-netpol=$((60*minutesToWaitAfterAddingNetPol)) | tee ../../../../npm-scale-connectivity.log || true cd ../../../../ rc=0; cat npm-scale-connectivity.log | grep "FINISHED" > /dev/null 2>&1 || rc=$? From 0f98481a73d4ca65db5f04d8504dbf95c25539a6 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Fri, 14 Apr 2023 10:12:05 -0700 Subject: [PATCH 10/21] switch branch --- test/capz/npm/npm-e2e.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index 1c862f0bde..fbacc8bb69 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -300,7 +300,7 @@ run_npm_scale () { log "beginning npm scale test with kubeconfig [$kubeconfigFile]..." 
rm -rf azure-container-networking/ || true - git clone https://github.com/Azure/azure-container-networking.git --depth=1 --branch=hgregory/edit-scale + git clone https://github.com/Azure/azure-container-networking.git --depth=1 --branch=master cd azure-container-networking/test/scale/ From c253da07ba26cdefc71f0581f172559e47beb852 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Fri, 14 Apr 2023 10:13:43 -0700 Subject: [PATCH 11/21] define log --- test/capz/npm/npm-e2e.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index fbacc8bb69..5651549b2c 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -1,6 +1,11 @@ # this is required so that if a step fails, following steps are not run set -o errexit +log() { + local msg=$1 + echo "$(date -R): $msg" +} + # Installs NPM + a long-running Pod and does the following tests: # 1. Check VFP is in sync with HNS (filename: vfp-state-prior.ran) # 2. Run Cyclonus (filename: cyclonus.ran) From 23db567053709f4ac5e08dad67e7f2963511fd71 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Fri, 14 Apr 2023 10:16:20 -0700 Subject: [PATCH 12/21] move installation to func --- test/capz/npm/npm-e2e.sh | 79 +++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index 5651549b2c..70bb4ad9b1 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -42,6 +42,48 @@ npm_e2e () { echo "" > npm-e2e.ran + install_npm + + log "sleeping 8m for NPM to bootup, then verifying VFP tags after bootup..." + sleep 8m + verify_vfp_tags_using_npm vfp-state-prior.ran + echo "" > vfp-state-prior.success + + ## NPM cyclonus + run_npm_cyclonus + echo "" > cyclonus.success + + log "sleeping 5m to allow VFP to update tags after cyclonus..." + sleep 5m + log "verifying VFP tags after cyclonus..." + verify_vfp_tags_using_npm vfp-state-after-cyclonus.ran + echo "" > vfp-state-after-cyclonus.success + log "deleting cyclonus pods..." + kubectl delete ns x y z + + ## NPM conformance + run_npm_conformance + echo "" > conformance.success + + log "sleeping 5m to allow VFP to update tags after conformance..." + sleep 5m + log "verifying VFP tags after conformance..." + verify_vfp_tags_using_npm vfp-state-after-conformance.ran + echo "" > vfp-state-after-conformance.success + + ## NPM scale + run_npm_scale $kubeconfigFile + echo "" > scale-connectivity.success + log "sleeping 5m to allow VFP to update tags after scale test..." + sleep 5m + log "verifying VFP tags after scale test..." + verify_vfp_tags_using_npm vfp-state-after-scale.ran + echo "" > vfp-state-after-scale.success + + echo "" > npm-e2e.success +} + +install_npm () { ## disable Calico NetPol log "running helm uninstall on calico (this will remove the tigera-operator and prevent reconciling of the calico-node ClusterRole)..." helm uninstall calico -n tigera-operator @@ -82,43 +124,6 @@ npm_e2e () { sleep 3m kubectl wait --for=condition=Ready pod -l k8s-app=azure-npm -n kube-system --timeout=15m kubectl wait --for=condition=Ready pod -l app=long-runner -n npm-e2e-longrunner --timeout=15m - log "sleeping 8m for NPM to bootup, then verifying VFP tags after bootup..." - sleep 8m - verify_vfp_tags_using_npm vfp-state-prior.ran - echo "" > vfp-state-prior.success - - ## NPM cyclonus - run_npm_cyclonus - echo "" > cyclonus.success - - log "sleeping 5m to allow VFP to update tags after cyclonus..." - sleep 5m - log "verifying VFP tags after cyclonus..." 
- verify_vfp_tags_using_npm vfp-state-after-cyclonus.ran - echo "" > vfp-state-after-cyclonus.success - log "deleting cyclonus pods..." - kubectl delete ns x y z - - ## NPM conformance - run_npm_conformance - echo "" > conformance.success - - log "sleeping 5m to allow VFP to update tags after conformance..." - sleep 5m - log "verifying VFP tags after conformance..." - verify_vfp_tags_using_npm vfp-state-after-conformance.ran - echo "" > vfp-state-after-conformance.success - - ## NPM scale - run_npm_scale $kubeconfigFile - echo "" > scale-connectivity.success - log "sleeping 5m to allow VFP to update tags after scale test..." - sleep 5m - log "verifying VFP tags after scale test..." - verify_vfp_tags_using_npm vfp-state-after-scale.ran - echo "" > vfp-state-after-scale.success - - echo "" > npm-e2e.success } verify_vfp_tags_using_npm () { From 9cc775a8a70e64d2a728fc12ff31626781f1d2ef Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Fri, 14 Apr 2023 13:06:35 -0700 Subject: [PATCH 13/21] set registry keys for npm fixes --- test/capz/npm/npm-e2e.sh | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index 70bb4ad9b1..dbba946959 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -124,6 +124,33 @@ install_npm () { sleep 3m kubectl wait --for=condition=Ready pod -l k8s-app=azure-npm -n kube-system --timeout=15m kubectl wait --for=condition=Ready pod -l app=long-runner -n npm-e2e-longrunner --timeout=15m + + ## set registry keys for NPM fixes + log "updating registry keys and restarting HNS for NPM fixes..." + npmNode=`kubectl get node -owide | grep "Windows Server 2022 Datacenter" | grep -v SchedulingDisabled | grep -v kwok-node | awk '{print $1}' | tail -n 1` || true + if [[ -z $npmNode ]]; then + log "ERROR: unable to find uncordoned node for NPM" + return 1 + fi + npmPod=`kubectl get pod -n kube-system -o wide | grep azure-npm-win | grep $npmNode | grep Running | awk '{print $1}'` || true + if [[ -z "$npmPod" ]]; then + log "ERROR: unable to find running azure-npm-win pod on node $npmNode" + kubectl get pod -n kube-system -o wide + kubectl logs -n kube-system -l k8s-app=azure-npm + return 1 + fi + cmd="reg add HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Services\hns\State /v HnsAclUpdateChange /t REG_DWORD /d 1 /f" + cmd="$cmd ; reg query HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Services\hns\State /v HnsAclUpdateChange" + cmd="$cmd ; reg add HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Services\hns\State /v HnsNpmRefresh /t REG_DWORD /d 1 /f" + cmd="$cmd ; reg query HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Services\hns\State /v HnsNpmRefresh" + cmd="$cmd ; Restart-Service HNS" + cmd="$cmd ; sleep 10" + kubectl exec -it -n kube-system $npmPod -- powershell.exe "$cmd" + log "sleeping 3m to let HNS restart..." + sleep 3m + log "making sure NPM and long-runner are running..." 
+ kubectl wait --for=condition=Ready pod -l k8s-app=azure-npm -n kube-system --timeout=15m + kubectl wait --for=condition=Ready pod -l app=long-runner -n npm-e2e-longrunner --timeout=15m } verify_vfp_tags_using_npm () { From 72b2b4d0e23e69a863f98c1bb5f29f0e21cf8163 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Fri, 14 Apr 2023 13:16:04 -0700 Subject: [PATCH 14/21] remove -it --- test/capz/npm/npm-e2e.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index dbba946959..cb6da51580 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -145,7 +145,7 @@ install_npm () { cmd="$cmd ; reg query HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Services\hns\State /v HnsNpmRefresh" cmd="$cmd ; Restart-Service HNS" cmd="$cmd ; sleep 10" - kubectl exec -it -n kube-system $npmPod -- powershell.exe "$cmd" + kubectl exec -n kube-system $npmPod -- powershell.exe "$cmd" log "sleeping 3m to let HNS restart..." sleep 3m log "making sure NPM and long-runner are running..." From 65ebebd141cedbe21f842597f765e4fff1cce118 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Fri, 14 Apr 2023 14:22:49 -0700 Subject: [PATCH 15/21] check for rehydration error --- test/capz/npm/npm-e2e.sh | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index cb6da51580..a358902ea9 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -112,18 +112,12 @@ install_npm () { npmURL=https://raw.githubusercontent.com/Azure/azure-container-networking/0ea4e9ac3d287f7abb15a34a88beb87697fbbcdd/npm/examples/windows/azure-npm-capz.yaml #https://raw.githubusercontent.com/Azure/azure-container-networking/master/npm/examples/windows/azure-npm-capz.yaml kubectl apply -f $npmURL - ## install long-running pod - log "creating long-runner pod to ensure there's an endpoint for verifying VFP tags..." - kubectl create ns npm-e2e-longrunner - kubectl apply -f https://raw.githubusercontent.com/Azure/azure-container-networking/master/npm/examples/windows/long-running-pod-for-capz.yaml - # verify VFP tags after NPM boots up # seems like the initial NPM Pods are always deleted and new ones are created (within the first minute of being applied it seems) # sleep for some time to avoid running kubectl wait on pods that get deleted - log "waiting for NPM and long-runner to start running..." + log "waiting for NPM to start running..." sleep 3m kubectl wait --for=condition=Ready pod -l k8s-app=azure-npm -n kube-system --timeout=15m - kubectl wait --for=condition=Ready pod -l app=long-runner -n npm-e2e-longrunner --timeout=15m ## set registry keys for NPM fixes log "updating registry keys and restarting HNS for NPM fixes..." @@ -148,6 +142,21 @@ install_npm () { kubectl exec -n kube-system $npmPod -- powershell.exe "$cmd" log "sleeping 3m to let HNS restart..." sleep 3m + + ## install long-running pod and restart HNS again (must install after restarting HNS because of a fix in rehydrating Endpoints in one of the registry keys) + log "creating long-runner pod to ensure there's an endpoint for verifying VFP tags..." 
+ kubectl create ns npm-e2e-longrunner + kubectl apply -f https://raw.githubusercontent.com/Azure/azure-container-networking/master/npm/examples/windows/long-running-pod-for-capz.yaml + sleep 10s + log "making sure long-runner is running" + kubectl wait --for=condition=Ready pod -l app=long-runner -n npm-e2e-longrunner --timeout=15m + + log "restarting HNS again to make sure Endpoints rehydrate correctly" + kubectl exec -n kube-system $npmPod -- powershell.exe "Restart-Service HNS" + + log "sleeping 3m to let HNS restart..." + sleep 3m + log "making sure NPM and long-runner are running..." kubectl wait --for=condition=Ready pod -l k8s-app=azure-npm -n kube-system --timeout=15m kubectl wait --for=condition=Ready pod -l app=long-runner -n npm-e2e-longrunner --timeout=15m @@ -197,7 +206,8 @@ verify_vfp_tags_using_npm () { echo "" > $ranFilename if [[ $hadEndpoints == false ]]; then - log "WARNING: VFP tags not validated for NPM since no endpoints found on node $npmNode" + log "ERROR: no Endpoints found in HNS for node IPs $matchString on node $npmNode. Rehydration of Endpoints likely failed" + return 1 fi if [[ $hadFailure == true ]]; then From 4519f44c750166457ac6f454808a73d30362309b Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Fri, 14 Apr 2023 16:03:31 -0700 Subject: [PATCH 16/21] cleaner command --- test/capz/npm/npm-e2e.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index a358902ea9..428fcf7f6f 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -102,7 +102,7 @@ install_npm () { ## disable scheduling for all but one node for NPM tests, since intra-node connectivity is broken after disabling Calico NetPol kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | awk '{print $1}' | tail -n +2 | xargs kubectl cordon - kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | grep -v SchedulingDisabled | awk '{print $1}' | xargs -I {} bash -c "kubectl label node {} scale-test=true && kubectl label node {} connectivity-test=true" + kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | grep -v SchedulingDisabled | awk '{print $1}' | xargs -I {} kubectl label node {} scale-test=true connectivity-test=true # sleep for some time to let Calico CNI restart sleep 3m From 95ba9f847c043057b09a79b8a0a3f400f6e0c6df Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Wed, 19 Apr 2023 13:02:06 -0700 Subject: [PATCH 17/21] minor updates --- test/capz/npm/npm-e2e.sh | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index 428fcf7f6f..b09a42cc58 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -89,7 +89,7 @@ install_npm () { helm uninstall calico -n tigera-operator kubectl delete ns tigera-operator log "disabling Calico NetworkPolicy functionality by removing NetPol permission from calico-node ClusterRole..." 
- kubectl get clusterrole calico-node -o yaml > original-clusterrole.yaml + kubectl get clusterrole calico-node -o yaml > original-clusterrole.yaml cat original-clusterrole.yaml | perl -0777 -i.original -pe 's/- apiGroups:\n - networking.k8s.io\n resources:\n - networkpolicies\n verbs:\n - watch\n - list\n//' > new-clusterrole.yaml originalLineCount=`cat original-clusterrole.yaml | wc -l` newLineCount=`cat new-clusterrole.yaml | wc -l` @@ -102,7 +102,7 @@ install_npm () { ## disable scheduling for all but one node for NPM tests, since intra-node connectivity is broken after disabling Calico NetPol kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | awk '{print $1}' | tail -n +2 | xargs kubectl cordon - kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | grep -v SchedulingDisabled | awk '{print $1}' | xargs -I {} kubectl label node {} scale-test=true connectivity-test=true + kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | grep -v SchedulingDisabled | awk '{print $1}' | xargs -n 1 -I {} kubectl label node {} scale-test=true connectivity-test=true # sleep for some time to let Calico CNI restart sleep 3m @@ -369,35 +369,33 @@ run_npm_scale () { # exact counts output from script # Pod Counts: - # - 10 fake Pods - # - 10 real Pods + # - 25 fake Pods + # - 5 real Pods # HNS Counts: # - number of ACLs per Pod Endpoint: 6 (6*numNetworkPolicies) - # - number of SetPolicies: ~40 (2*numUniqueLabelsPerPod*numFakePods) + # - number of SetPolicies: ~100 (2*numUniqueLabelsPerPod*numFakePods) # - max IPs per SetPolicy: number of total Pods - - # NOTE: if editing real pod counts, should update --num-scale-pods-to-verify in test-connectivity.sh to test all those Pods ./test-scale.sh --max-kwok-pods-per-node=50 \ - --num-kwok-deployments=10 \ - --num-kwok-replicas=1 \ + --num-kwok-deployments=5 \ + --num-kwok-replicas=5 \ --max-real-pods-per-node=30 \ --num-real-deployments=5 \ - --num-real-replicas=2 \ + --num-real-replicas=1 \ --num-network-policies=1 \ - --num-unapplied-network-policies=10 \ + --num-unapplied-network-policies=3 \ --num-unique-labels-per-pod=2 \ --num-unique-labels-per-deployment=2 \ --num-shared-labels-per-pod=10 \ - --delete-kwok-pods=10 \ - --delete-real-pods=5 \ - --delete-pods-interval=120 \ - --delete-pods-times=2 \ --delete-labels \ --delete-labels-interval=60 \ --delete-labels-times=1 \ --delete-netpols \ --delete-netpols-interval=60 \ - --delete-netpols-times=1 | tee ../../../npm-scale.log || true + --delete-netpols-times=1 \ + --delete-kwok-pods=1 \ + --delete-real-pods=1 \ + --delete-pods-interval=120 \ + --delete-pods-times=1 | tee ../../../npm-scale.log || true rc=0; cat ../../../npm-scale.log | grep "FINISHED" > /dev/null 2>&1 || rc=$? if [[ $rc != 0 ]]; then From 35d9b16bf399614709f98f8dbd7aaf3f1c85bc3c Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Wed, 19 Apr 2023 14:06:34 -0700 Subject: [PATCH 18/21] restart computer --- test/capz/npm/npm-e2e.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index b09a42cc58..2d21380e8a 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -139,6 +139,7 @@ install_npm () { cmd="$cmd ; reg query HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Services\hns\State /v HnsNpmRefresh" cmd="$cmd ; Restart-Service HNS" cmd="$cmd ; sleep 10" + cmd="$cmd ; Restart-Computer" kubectl exec -n kube-system $npmPod -- powershell.exe "$cmd" log "sleeping 3m to let HNS restart..." 
sleep 3m From a4f553aa7c5a0cbf0169ea75717f9dfce183556a Mon Sep 17 00:00:00 2001 From: Hunter Gregory <42728408+huntergregory@users.noreply.github.com> Date: Mon, 24 Apr 2023 11:21:56 -0700 Subject: [PATCH 19/21] run everything always --- test/capz/npm/npm-e2e.sh | 65 ++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index 2d21380e8a..fab67c44be 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -7,21 +7,20 @@ log() { } # Installs NPM + a long-running Pod and does the following tests: -# 1. Check VFP is in sync with HNS (filename: vfp-state-prior.ran) -# 2. Run Cyclonus (filename: cyclonus.ran) -# 3. Check VFP is in sync with HNS (filename: vfp-state-after-cyclonus.ran) -# 4. Run Conformance (filename: conformance.ran) -# 5. Check VFP is in sync with HNS (filename: vfp-state-after-conformance.ran) -# 6. Run scale + connectivity test (filename: scale-connectivity.ran) -# 7. Check VFP is in sync with HNS (filename: vfp-state-after-scale.ran) +# 1.1. Check if HNS rehydration of endpoints works (filename: rehydration.failed) +# 1.2. Check VFP is in sync with HNS (filename: vfp-state-prior.success) +# 2. Run Cyclonus (filename: cyclonus.success) +# 3. Check VFP is in sync with HNS (filename: vfp-state-after-cyclonus.success) +# 4. Run Conformance (filename: conformance.success) +# 5. Check VFP is in sync with HNS (filename: vfp-state-after-conformance.success) +# 6. Run scale + connectivity test (filename: scale-connectivity.success) +# 7. Check VFP is in sync with HNS (filename: vfp-state-after-scale.success) # -# If any step fails, the script will exit and remaining tests won't be run (because of `set -o errexit`) +# NOTE: each step also has a .ran file that is created if the step is run. # -# NOTE: each step has both: -# - A .ran file that is created if the step is run -# - A .success file that is created if the step succeeds -# -# There is also a npm-e2e.ran file that indicates that the npm e2e was run at all, and npm-e2e.success that indicates that all steps succeeded +# There is also: +# - A npm-e2e.ran file that indicates that the npm e2e was run at all +# - A npm-e2e.success that indicates that all steps succeeded npm_e2e () { local kubeconfigFile=$1 if [ -z "$kubeconfigFile" ]; then @@ -35,10 +34,11 @@ npm_e2e () { } log "setting up npm e2e test" + anyStepFailed=false # make sure there are no previous results log "cleaning up previous npm e2e results..." - rm *.log *.ran *.success || true + rm *.log *.ran *.success *.failed || true echo "" > npm-e2e.ran @@ -46,41 +46,36 @@ npm_e2e () { log "sleeping 8m for NPM to bootup, then verifying VFP tags after bootup..." sleep 8m - verify_vfp_tags_using_npm vfp-state-prior.ran - echo "" > vfp-state-prior.success + verify_vfp_tags_using_npm vfp-state-prior || anyStepFailed=true ## NPM cyclonus - run_npm_cyclonus - echo "" > cyclonus.success + run_npm_cyclonus && echo "" > cyclonus.success || anyStepFailed=true log "sleeping 5m to allow VFP to update tags after cyclonus..." sleep 5m log "verifying VFP tags after cyclonus..." - verify_vfp_tags_using_npm vfp-state-after-cyclonus.ran - echo "" > vfp-state-after-cyclonus.success + verify_vfp_tags_using_npm vfp-state-after-cyclonus || anyStepFailed=true log "deleting cyclonus pods..." 
kubectl delete ns x y z ## NPM conformance - run_npm_conformance - echo "" > conformance.success + run_npm_conformance && echo "" > conformance.success || anyStepFailed=true log "sleeping 5m to allow VFP to update tags after conformance..." sleep 5m log "verifying VFP tags after conformance..." - verify_vfp_tags_using_npm vfp-state-after-conformance.ran - echo "" > vfp-state-after-conformance.success + verify_vfp_tags_using_npm vfp-state-after-conformance || anyStepFailed=true ## NPM scale - run_npm_scale $kubeconfigFile - echo "" > scale-connectivity.success + run_npm_scale $kubeconfigFile && echo "" > scale-connectivity.success || anyStepFailed=true log "sleeping 5m to allow VFP to update tags after scale test..." sleep 5m log "verifying VFP tags after scale test..." - verify_vfp_tags_using_npm vfp-state-after-scale.ran - echo "" > vfp-state-after-scale.success + verify_vfp_tags_using_npm vfp-state-after-scale || anyStepFailed=true - echo "" > npm-e2e.success + if [[ $anyStepFailed == false ]]; then + echo "" > npm-e2e.success + fi } install_npm () { @@ -165,8 +160,8 @@ install_npm () { verify_vfp_tags_using_npm () { local ranFilename=$1 - if [[ $ranFilename != *.ran ]]; then - log "ERROR: need a filename that ends in .ran passed as an argument to verify_vfp_tags_using_npm" + if [[ -z $ranFilename ]]; then + log "ERROR: need a filename passed as an argument to verify_vfp_tags_using_npm" return 1 fi @@ -205,17 +200,21 @@ verify_vfp_tags_using_npm () { fi done - echo "" > $ranFilename + echo "" > rehydration.ran if [[ $hadEndpoints == false ]]; then log "ERROR: no Endpoints found in HNS for node IPs $matchString on node $npmNode. Rehydration of Endpoints likely failed" + echo "" > rehydration.failed return 1 fi - + + echo "" > $ranFilename.ran if [[ $hadFailure == true ]]; then log "ERROR: VFP tags are inconsistent with HNS SetPolicies" capture_npm_hns_state return 1 fi + + echo "" > $ranFilename.success } # results in a file called npm-hns-state.zip From e898f1f966e4e528bc206ddf3cdafd34f02e7df5 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Tue, 25 Apr 2023 10:06:20 -0700 Subject: [PATCH 20/21] cyclonus junit and capture hns state once --- test/capz/npm/npm-e2e.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index fab67c44be..ff758613a2 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -39,6 +39,7 @@ npm_e2e () { # make sure there are no previous results log "cleaning up previous npm e2e results..." rm *.log *.ran *.success *.failed || true + rm -rf npm-hns-state/ || true echo "" > npm-e2e.ran @@ -219,9 +220,13 @@ verify_vfp_tags_using_npm () { # results in a file called npm-hns-state.zip capture_npm_hns_state () { + if [[ -f npm-hns-state.zip ]]; then + log "WARNING: not capturing NPM HNS state since state was previously captured" + return 0 + fi + log "capturing NPM HNS state..." kubectl get pod -owide -A - test -d npm-hns-state/ && rm -rf npm-hns-state/ || true mkdir npm-hns-state cd npm-hns-state curl -LO https://raw.githubusercontent.com/Azure/azure-container-networking/master/debug/windows/npm/win-debug.sh @@ -313,6 +318,7 @@ run_npm_cyclonus () { log "beginning npm cyclonus test..." 
echo "" > cyclonus.ran ./cyclonus_linux_amd64/cyclonus generate \ + --junit-results-file=cyclonus.xml \ --fail-fast \ --noisy=true \ --retries=7 \ From 2e3137c9b0c140ee0d63298dd82dbe19877da44d Mon Sep 17 00:00:00 2001 From: Hunter Gregory <42728408+huntergregory@users.noreply.github.com> Date: Tue, 25 Apr 2023 13:42:10 -0700 Subject: [PATCH 21/21] delete conformance namespaces --- test/capz/npm/npm-e2e.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/capz/npm/npm-e2e.sh b/test/capz/npm/npm-e2e.sh index ff758613a2..2a33e23d2c 100644 --- a/test/capz/npm/npm-e2e.sh +++ b/test/capz/npm/npm-e2e.sh @@ -57,7 +57,7 @@ npm_e2e () { log "verifying VFP tags after cyclonus..." verify_vfp_tags_using_npm vfp-state-after-cyclonus || anyStepFailed=true log "deleting cyclonus pods..." - kubectl delete ns x y z + kubectl delete ns x y z || true ## NPM conformance run_npm_conformance && echo "" > conformance.success || anyStepFailed=true @@ -66,6 +66,8 @@ npm_e2e () { sleep 5m log "verifying VFP tags after conformance..." verify_vfp_tags_using_npm vfp-state-after-conformance || anyStepFailed=true + log "deleting NPM conformance namespaces if they were leftover from a failure..." + kubectl delete ns -l pod-security.kubernetes.io/enforce=baseline || true ## NPM scale run_npm_scale $kubeconfigFile && echo "" > scale-connectivity.success || anyStepFailed=true