Merged

24 commits
027e313  ci: check if directory is empty before applying it (huntergregory, May 23, 2023)
403ce2a  ci: don't wait for pods if they weren't created (huntergregory, May 23, 2023)
33c2f16  docs: fix script name (huntergregory, May 23, 2023)
028699f  ci: wip for enabling linux scale test (huntergregory, May 23, 2023)
b21d6e6  ci: parameters for linux vs windows (huntergregory, May 23, 2023)
6483ffd  ci: adjust params (huntergregory, May 23, 2023)
9022154  ci: fix bash typo (huntergregory, May 23, 2023)
16aa6a4  Merge branch 'master' into hgregory/05-23-linux-scale (huntergregory, Jun 1, 2023)
c20a389  ci: fix cp (huntergregory, Jun 1, 2023)
48fe587  ci: fix npm url (huntergregory, Jun 1, 2023)
a3cc904  ci: increase max pods for linux nodepool (huntergregory, Jun 1, 2023)
b7bbffb  ci: start building windows image again (huntergregory, Jun 2, 2023)
eff43c5  tmp: use apply netpol in background image (huntergregory, Jun 2, 2023)
b96c0fe  Revert "tmp: use apply netpol in background image" (huntergregory, Jun 2, 2023)
74761da  refactor: use CLUSTER_NAME variable (huntergregory, Jun 2, 2023)
2411d2f  ci: require succeeded() for scale & conformance tests (huntergregory, Jun 2, 2023)
f998904  test: fix vars used in test-scale.sh checks (huntergregory, Jun 2, 2023)
c0a59a0  Merge branch 'master' into hgregory/05-23-linux-scale (huntergregory, Jun 2, 2023)
7480bdb  ci: disable linux, reenable windows (huntergregory, Jun 7, 2023)
2446719  Merge branch 'master' into hgregory/05-23-linux-scale (huntergregory, Jun 7, 2023)
ced1030  ci: increase sleep before waiting for NPM to start & log info when it… (huntergregory, Jun 8, 2023)
c8cdb3a  ci: better log capture & remove command from other pipeline (huntergregory, Jun 8, 2023)
b73781c  ci: do not get logs of npm on kwok nodes (huntergregory, Jun 8, 2023)
903ae42  ci: do not get logs of npm on kwok nodes (part 2) (huntergregory, Jun 9, 2023)
1 change: 1 addition & 0 deletions .pipelines/npm/npm-conformance-tests.yaml
@@ -509,6 +509,7 @@ jobs:
azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
scriptType: "bash"
scriptLocation: "inlineScript"
condition: succeeded()
inlineScript: |
echo Deleting $(RESOURCE_GROUP)
az group delete -n $(RESOURCE_GROUP) --yes
135 changes: 84 additions & 51 deletions .pipelines/npm/npm-scale-test.yaml
@@ -46,6 +46,10 @@ jobs:
name: "$(BUILD_POOL_NAME_DEFAULT)"
strategy:
matrix:
npm_linux_amd64:
arch: amd64
name: npm
os: linux
npm_windows2022_amd64:
arch: amd64
name: npm
@@ -74,8 +78,14 @@ jobs:
FQDN: empty
strategy:
matrix:
v2-windows:
PROFILE: "scale-win"
# v2-linux:
# PROFILE: "sc-lin"
# NUM_NETPOLS: 800
# INITIAL_CONNECTIVITY_TIMEOUT: 60
ws22:
PROFILE: "sc-ws22"
NUM_NETPOLS: 50
INITIAL_CONNECTIVITY_TIMEOUT: 720
steps:
- checkout: self
- bash: |
@@ -115,44 +125,46 @@ jobs:
az extension add --name aks-preview
az extension update --name aks-preview

export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)

echo "Creating resource group named $CLUSTER_NAME"
az group create --name $CLUSTER_NAME -l $(LOCATION) -o table
echo "Creating resource group named $(RESOURCE_GROUP)"
az group create --name $(RESOURCE_GROUP) -l $(LOCATION) -o table

echo "Creating resource group named $CLUSTER_NAME"
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
echo "Creating cluster named $CLUSTER_NAME"
az aks create \
--resource-group $CLUSTER_NAME \
--resource-group $(RESOURCE_GROUP) \
--name $CLUSTER_NAME \
--generate-ssh-keys \
--windows-admin-username e2eadmin \
--windows-admin-password alpha@numeric!password2 \
--network-plugin azure \
--vm-set-type VirtualMachineScaleSets \
--node-vm-size Standard_D4s_v3 \
--node-count 1

# don't schedule anything on the linux system pool
echo "Updating $CLUSTER_NAME to not schedule anything on linux pool..."
az aks nodepool update \
--cluster-name $CLUSTER_NAME \
-g $CLUSTER_NAME \
-n nodepool1 \
--node-taints CriticalAddonsOnly=true:NoSchedule

echo "Adding Windows nodepool to $CLUSTER_NAME"
az aks nodepool add \
--resource-group $CLUSTER_NAME \
--cluster-name $CLUSTER_NAME \
--name awin22 \
--os-type Windows \
--os-sku Windows2022 \
--node-vm-size Standard_D4s_v3 \
--node-count 1 \
--max-pods 100

if [[ $(PROFILE) == *ws22 ]]; then
# don't schedule anything on the linux system pool
echo "Updating $CLUSTER_NAME to not schedule anything on linux pool..."
az aks nodepool update \
--cluster-name $CLUSTER_NAME \
-g $(RESOURCE_GROUP) \
-n nodepool1 \
--node-taints CriticalAddonsOnly=true:NoSchedule

echo "Adding Windows nodepool to $CLUSTER_NAME"
az aks nodepool add \
--resource-group $(RESOURCE_GROUP) \
--cluster-name $CLUSTER_NAME \
--name awin22 \
--os-type Windows \
--os-sku Windows2022 \
--node-vm-size Standard_D4s_v3 \
--node-count 1 \
--max-pods 100
fi

echo "Getting credentials to $CLUSTER_NAME"
az aks get-credentials -g $CLUSTER_NAME -n $CLUSTER_NAME --overwrite-existing --file ./kubeconfig
az aks get-credentials -g $(RESOURCE_GROUP) -n $CLUSTER_NAME --overwrite-existing --file ./kubeconfig
mkdir -p ~/.kube/
cp ./kubeconfig ~/.kube/config

@@ -168,28 +180,42 @@ jobs:
set -e

# deploy azure-npm
cp $(Pipeline.Workspace)/s/npm/examples/windows/azure-npm.yaml azure-npm.yaml
# set higher memory limit
cp $(Pipeline.Workspace)/s/npm/azure-npm.yaml azure-npm.yaml
sed -i 's/memory: 300Mi/memory: 1000Mi/g' azure-npm.yaml
kubectl apply -f azure-npm.yaml

cp $(Pipeline.Workspace)/s/npm/examples/windows/azure-npm.yaml azure-npm-win.yaml
# set higher memory limit
sed -i 's/memory: 300Mi/memory: 1000Mi/g' azure-npm-win.yaml
kubectl apply -f azure-npm-win.yaml

# swap azure-npm image with one built during run
kubectl set image daemonset/azure-npm -n kube-system azure-npm=$IMAGE_REGISTRY/azure-npm:linux-amd64-$(TAG)
kubectl set image daemonset/azure-npm-win -n kube-system azure-npm=$IMAGE_REGISTRY/azure-npm:windows-amd64-ltsc2022-$(TAG)

sleep 5s
sleep 30s
echo "waiting for NPM to start running..."
kubectl wait --for=condition=Ready pod -l k8s-app=azure-npm -n kube-system --timeout=20m
kubectl wait --for=condition=Ready pod -l k8s-app=azure-npm -n kube-system --timeout=15m || {
kubectl describe pod -n kube-system -l k8s-app=azure-npm
echo "##vso[task.logissue type=error]NPM failed to start running"
exit 1
}
echo "sleep 3m to let NPM restart in case of bootup failure due to HNS errors"
sleep 3m

kubectl get po -n kube-system -owide -A

echo "labeling Windows nodes for scale test"
kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | awk '{print $1}' | xargs -n 1 -I {} kubectl label node {} scale-test=true connectivity-test=true
if [[ $(PROFILE) == *ws22 ]]; then
echo "labeling Windows nodes for scale test"
kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | awk '{print $1}' | xargs -n 1 -I {} kubectl label node {} scale-test=true connectivity-test=true
else
echo "labeling Linux nodes for scale test"
kubectl get node -o wide | grep "Ubuntu" | awk '{print $1}' | xargs -n 1 -I {} kubectl label node {} scale-test=true connectivity-test=true
fi

export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
echo "Showing cluster status for $CLUSTER_NAME"
FQDN=`az aks show -n $CLUSTER_NAME -g $CLUSTER_NAME --query fqdn -o tsv`
FQDN=`az aks show -n $CLUSTER_NAME -g $(RESOURCE_GROUP) --query fqdn -o tsv`
echo "##vso[task.setvariable variable=FQDN]$FQDN"

- task: AzureCLI@2
@@ -202,15 +228,16 @@ jobs:
condition: succeeded()
inlineScript: |
set -e
mkdir -p $(System.DefaultWorkingDirectory)/$(RESOURCE_GROUP)-$(PROFILE)
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
./kwok --kubeconfig ~/.kube/config \
--cidr=155.0.0.0/16 \
--node-ip=155.0.0.1 \
--manage-all-nodes=false \
--manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
--manage-nodes-with-label-selector= \
--disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$(RESOURCE_GROUP)-$(PROFILE)/kwok-scale-up.log &
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-scale-up.log &
kwok_pid=$!

# 20 kwok nodes
Expand All @@ -229,8 +256,8 @@ jobs:
--max-real-pods-per-node=30 \
--num-real-deployments=10 \
--num-real-replicas=3 \
--num-network-policies=50 \
--num-unapplied-network-policies=50 \
--num-network-policies=$(NUM_NETPOLS) \
--num-unapplied-network-policies=$(NUM_NETPOLS) \
--num-unique-labels-per-pod=2 \
--num-unique-labels-per-deployment=2 \
--num-shared-labels-per-pod=10
@@ -248,28 +275,30 @@ jobs:
condition: succeeded()
inlineScript: |
set -e
mkdir -p $(System.DefaultWorkingDirectory)/$(RESOURCE_GROUP)-$(PROFILE)
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
./kwok --kubeconfig ~/.kube/config \
--cidr=155.0.0.0/16 \
--node-ip=155.0.0.1 \
--manage-all-nodes=false \
--manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
--manage-nodes-with-label-selector= \
--disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$(RESOURCE_GROUP)-$(PROFILE)/kwok-bootup-latency.log &
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-bootup-latency.log &
kwok_pid=$!

kubectl rollout restart -n kube-system ds azure-npm-win
echo "sleeping 3 minutes to allow NPM pods to restart after scale-up..."
sleep 3m

cd $(Pipeline.Workspace)/s/test/scale/connectivity/
# notes for Windows:
# initial connectivity should be established within 15 minutes of NPM restart (12 minute timeout since we already waited 3 minutes above)
# adding new network policy to all 30 Pods should happen within 30 seconds
set +e
./test-connectivity.sh --kubectl-binary=$kubectlPath \
--num-scale-pods-to-verify=all \
--max-wait-for-initial-connectivity=$((12*60)) \
--max-wait-for-initial-connectivity=$(INITIAL_CONNECTIVITY_TIMEOUT) \
--max-wait-after-adding-netpol=30
rc=$?
if [[ $rc != 0 ]]; then
@@ -286,18 +315,19 @@ jobs:
scriptType: "bash"
scriptLocation: "inlineScript"
failOnStderr: true
# condition: succeeded()
condition: succeeded()
inlineScript: |
set -e
mkdir -p $(System.DefaultWorkingDirectory)/$(RESOURCE_GROUP)-$(PROFILE)
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
./kwok --kubeconfig ~/.kube/config \
--cidr=155.0.0.0/16 \
--node-ip=155.0.0.1 \
--manage-all-nodes=false \
--manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
--manage-nodes-with-label-selector= \
--disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$(RESOURCE_GROUP)-$(PROFILE)/kwok-crud.log &
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-crud.log &
kwok_pid=$!

# will delete scale-test and connectivity-test namespaces from previous run
@@ -342,15 +372,16 @@ jobs:
condition: succeeded()
inlineScript: |
set -e
mkdir -p $(System.DefaultWorkingDirectory)/$(RESOURCE_GROUP)-$(PROFILE)
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
./kwok --kubeconfig ~/.kube/config \
--cidr=155.0.0.0/16 \
--node-ip=155.0.0.1 \
--manage-all-nodes=false \
--manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
--manage-nodes-with-label-selector= \
--disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$(RESOURCE_GROUP)-$(PROFILE)/kwok-crud-connectivity.log &
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-crud-connectivity.log &
kwok_pid=$!

cd $(Pipeline.Workspace)/s/test/scale/connectivity/
@@ -371,14 +402,15 @@ jobs:

- bash: |
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
cp cyclonus-$CLUSTER_NAME $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/cyclonus-$CLUSTER_NAME
echo "Getting cluster state for $CLUSTER_NAME"
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
kubectl get pods -n kube-system | grep npm
kubectl logs -n kube-system -l k8s-app=azure-npm --tail -1 --prefix > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/npm-logs_$(PROFILE).txt
# capture any previous logs in case there was a crash
npmPodList=`kubectl get pods -n kube-system | grep npm | awk '{print $1}'`
kubectl get pods -n kube-system -owide | grep npm | grep -v kwok
npmPodList=`kubectl get pods -n kube-system -owide | grep npm | grep -v kwok | awk '{print $1}'`
for npmPod in $npmPodList; do
logFile=$(System.DefaultWorkingDirectory)/$CLUSTER_NAME/npm-logs_$(PROFILE)-$npmPod.txt
kubectl logs -n kube-system $npmPod > $logFile

# capture any previous logs in case there was a crash
previousLogFile=$(System.DefaultWorkingDirectory)/$CLUSTER_NAME/previous-npm-logs_$(PROFILE).txt
kubectl logs -n kube-system $npmPod -p > $previousLogFile
if [[ $? -ne 0 ]]; then
@@ -413,6 +445,7 @@ jobs:
azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
scriptType: "bash"
scriptLocation: "inlineScript"
condition: succeeded()
inlineScript: |
echo Deleting $(RESOURCE_GROUP)
az group delete -n $(RESOURCE_GROUP) --yes
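
For reference, every Linux/Windows fork this PR adds to the pipeline keys off the `PROFILE` matrix variable. Below is a minimal standalone sketch of that branching pattern, with the profile value, grep filters, and node labels taken from the diff above; `$(PROFILE)` is Azure Pipelines macro syntax, so plain bash reads it from the environment here instead.

```
#!/usr/bin/env bash
# PROFILE is assumed to come from the job's matrix entry,
# e.g. "sc-ws22" (Windows Server 2022) or "sc-lin" (Linux).
PROFILE="${PROFILE:-sc-ws22}"

if [[ $PROFILE == *ws22 ]]; then
  echo "labeling Windows nodes for scale test"
  node_filter="Windows Server 2022 Datacenter"
else
  echo "labeling Linux nodes for scale test"
  node_filter="Ubuntu"
fi

# Label each matching node so the scale and connectivity tests can schedule onto it.
kubectl get node -o wide | grep "$node_filter" | awk '{print $1}' \
  | xargs -n 1 -I {} kubectl label node {} scale-test=true connectivity-test=true
```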
2 changes: 1 addition & 1 deletion test/scale/README.md
@@ -41,7 +41,7 @@ This saves us from:

Note: you must run `./test-scale.sh` first with `--num-network-policies=1` or more, and `--num-shared-labels-per-pod=3` or more.
```
./test-connectivity --num-scale-pods-to-verify=all \
./test-connectivity.sh --num-scale-pods-to-verify=all \
--max-wait-for-initial-connectivity=600 \
--max-wait-after-adding-netpol=120
```
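
For example, a hypothetical prerequisite run that satisfies those minimums, using only flags that appear in this PR's pipeline invocation of `test-scale.sh` (the counts are illustrative, and additional flags, e.g. for kwok nodes, may be needed for a full run):
```
./test-scale.sh --max-real-pods-per-node=30 \
    --num-real-deployments=5 \
    --num-real-replicas=2 \
    --num-network-policies=1 \
    --num-unapplied-network-policies=0 \
    --num-shared-labels-per-pod=3 \
    --num-unique-labels-per-pod=2 \
    --num-unique-labels-per-deployment=2
```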
36 changes: 27 additions & 9 deletions test/scale/test-scale.sh
@@ -261,13 +261,17 @@ wait_for_pods() {
# wait for all pods to run
minutesToWaitForRealPods=$(( 10 + $numRealPods / 250 ))
set -x
$KUBECTL $KUBECONFIG_ARG wait --for=condition=Ready pods -n scale-test -l is-real=true --all --timeout="${minutesToWaitForRealPods}m"
if [[ $numRealPods -gt 0 ]]; then
$KUBECTL $KUBECONFIG_ARG wait --for=condition=Ready pods -n scale-test -l is-real=true --all --timeout="${minutesToWaitForRealPods}m"
fi
set +x

# just make sure kwok pods are Running, not necessarily Ready (sometimes kwok pods have NodeNotReady even though the node is ready)
minutesToWaitForKwokPods=$(( 1 + $numKwokPods / 500 ))
set -x
$KUBECTL $KUBECONFIG_ARG wait --for=condition=Initialized pods -n scale-test -l is-kwok=true --all --timeout="${minutesToWaitForKwokPods}m"
if [[ $numKwokPods -gt 0 ]]; then
$KUBECTL $KUBECONFIG_ARG wait --for=condition=Initialized pods -n scale-test -l is-kwok=true --all --timeout="${minutesToWaitForKwokPods}m"
fi
set +x
}

@@ -404,9 +408,15 @@ echo

set -x
$KUBECTL $KUBECONFIG_ARG create ns scale-test
$KUBECTL $KUBECONFIG_ARG apply -f generated/kwok-nodes/
$KUBECTL $KUBECONFIG_ARG apply -f generated/deployments/real/
$KUBECTL $KUBECONFIG_ARG apply -f generated/deployments/kwok/
if [[ $numKwokNodes -gt 0 ]]; then
$KUBECTL $KUBECONFIG_ARG apply -f generated/kwok-nodes/
fi
if [[ $numRealPods -gt 0 ]]; then
$KUBECTL $KUBECONFIG_ARG apply -f generated/deployments/real/
fi
if [[ $numKwokPods -gt 0 ]]; then
$KUBECTL $KUBECONFIG_ARG apply -f generated/deployments/kwok/
fi
set +x

add_shared_labels() {
@@ -441,8 +451,12 @@ if [[ $numUniqueLabelsPerPod -gt 0 ]]; then
fi

set -x
$KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/unapplied
$KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/applied
if [[ $numUnappliedNetworkPolicies -gt 0 ]]; then
$KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/unapplied
fi
if [[ $numNetworkPolicies -gt 0 ]]; then
$KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/applied
fi
set +x

wait_for_pods
@@ -470,8 +484,12 @@ if [[ $deleteNetpols == true ]]; then

echo "re-adding network policies. round $i/$deleteNetpolsTimes..."
set -x
$KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/unapplied
$KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/applied
if [[ $numUnappliedNetworkPolicies -gt 0 ]]; then
$KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/unapplied
fi
if [[ $numNetworkPolicies -gt 0 ]]; then
$KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/applied
fi
set +x
echo "sleeping $deleteNetpolsInterval seconds after readding network policies (end of round $i/$deleteNetpolsTimes)..."
sleep $deleteNetpolsInterval
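
The `if [[ $count -gt 0 ]]` guards this PR adds around each apply and wait could also be factored into one helper. A sketch under that assumption — `run_if_any` is hypothetical, not part of test-scale.sh, and reuses the script's own `$KUBECTL` and `$KUBECONFIG_ARG` variables:
```
# Run a kubectl subcommand only when the associated resource count is nonzero,
# so empty generated/ directories and zero-count runs don't fail the script.
run_if_any() {
  local count=$1; shift
  if [[ $count -gt 0 ]]; then
    set -x
    $KUBECTL $KUBECONFIG_ARG "$@"
    set +x
  fi
}

# Usage mirroring the guarded calls above:
# run_if_any "$numKwokPods" apply -f generated/deployments/kwok/
# run_if_any "$numNetworkPolicies" apply -f generated/networkpolicies/applied
```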