In [None]:
# Check prerequisites
# Prints the version of each CLI used by this notebook (or a "missing" notice),
# then exports ROOT_DIR for the following cells.
echo "Checking prerequisites..."
echo "==========================================="

# check_tool LABEL CMD [ARGS...]
# Runs CMD ARGS when CMD is on PATH, otherwise prints "LABEL missing".
# An explicit if/else is used instead of `a && b || c` so that a version
# command that exits non-zero is not misreported as a missing tool.
check_tool() {
    local label=$1
    shift
    if command -v "$1" >/dev/null; then
        "$@"
    else
        echo "$label missing"
    fi
}

check_tool 'terraform' terraform version
check_tool 'azure cli' az version --output table
check_tool 'jq' jq --version
check_tool 'kubectl' kubectl version --client

# Set ROOT_DIR to the repository root.
# git is queried once; when it fails (not a git checkout, or git not installed)
# fall back to going up 3 levels from scenarios/perf-eval/image-pull-test.
# The assignment is kept separate from `export` so the command's exit status
# is what drives the fallback.
ROOT_DIR=$(git rev-parse --show-toplevel 2>/dev/null) || ROOT_DIR=$(cd ../../.. && pwd)
export ROOT_DIR

echo "Repository Root: $ROOT_DIR"

In [None]:
# Install Python dependencies
# Requires ROOT_DIR to be set; stops with an error otherwise.
if [ -z "$ROOT_DIR" ]; then
    echo "Error: ROOT_DIR is not set. Please run the first cell to initialize variables."
    exit 1
fi

echo "Installing python requirements..."
# The path is quoted so a repository location containing spaces or glob
# characters reaches pip as a single argument.
python3 -m pip install --user -r "$ROOT_DIR/modules/python/requirements.txt"

In [None]:
# Scenario configuration
# All values are assigned first and exported together at the end.
# RUN_ID keeps a value already present in the environment; otherwise it
# defaults to the current epoch time in seconds.
SCENARIO_TYPE=perf-eval
SCENARIO_NAME=image-pull-test
OWNER=$(whoami)
RUN_ID=${RUN_ID:-$(date +%s)}

CLOUD=azure
REGION=eastus2
AZURE_SUBSCRIPTION_ID="c0d4b923-b5ea-4f8f-9b56-5390a9bf2248"

SKU_TIER=Standard
KUBERNETES_VERSION=1.31
NETWORK_POLICY=""
NETWORK_DATAPLANE=azure
SYSTEM_NODE_POOL=null
USER_NODE_POOL=null

# Paths derived from ROOT_DIR and the scenario/cloud names above
TERRAFORM_MODULES_DIR=$ROOT_DIR/modules/terraform/$CLOUD
TERRAFORM_INPUT_FILE=$ROOT_DIR/scenarios/$SCENARIO_TYPE/$SCENARIO_NAME/terraform-inputs/${CLOUD}.tfvars

export SCENARIO_TYPE SCENARIO_NAME OWNER RUN_ID \
       CLOUD REGION AZURE_SUBSCRIPTION_ID \
       SKU_TIER KUBERNETES_VERSION NETWORK_POLICY NETWORK_DATAPLANE \
       SYSTEM_NODE_POOL USER_NODE_POOL \
       TERRAFORM_MODULES_DIR TERRAFORM_INPUT_FILE

# Summary of the values most relevant to this run
echo "Scenario: $SCENARIO_TYPE/$SCENARIO_NAME"
echo "Run ID: $RUN_ID"
echo "Terraform Input: $TERRAFORM_INPUT_FILE"


In [None]:
# Azure login
# Reuses an existing Azure CLI session or starts a device-code login, selects
# AZURE_SUBSCRIPTION_ID, and exports ARM_SUBSCRIPTION_ID / ARM_TENANT_ID read
# from the active account.
echo "Azure Authentication"
if az account show >/dev/null 2>&1; then
  echo "Already logged in"
else
  echo "Logging into Azure..."
  az login --use-device-code
fi

# Selecting the subscription is needed on both paths, so it is done once here.
# Quoted so an empty or malformed value is passed as one argument.
az account set -s "$AZURE_SUBSCRIPTION_ID"

# Assign first, export afterwards: `export VAR=$(cmd)` always returns success,
# which would hide a failing az call and leave the variable silently empty.
ARM_SUBSCRIPTION_ID=$(az account show --query id -o tsv)
ARM_TENANT_ID=$(az account show --query tenantId -o tsv)
export ARM_SUBSCRIPTION_ID ARM_TENANT_ID

if [ -z "$ARM_SUBSCRIPTION_ID" ] || [ -z "$ARM_TENANT_ID" ]; then
  echo "Error: could not read the active Azure account; ARM_SUBSCRIPTION_ID / ARM_TENANT_ID are empty." >&2
fi

az account show --query '{Name:name, Id:id}' --output table

In [None]:
# Create resource group
# The group is named after RUN_ID, created in REGION, and tagged with the run
# id, the scenario, and the owner.
echo "Creating resource group $RUN_ID in $REGION"
# Every value is quoted so whitespace or glob characters (e.g. in OWNER) cannot
# split a tag into several arguments.
az group create \
  --name "$RUN_ID" \
  --location "$REGION" \
  --tags "run_id=$RUN_ID" "scenario=${SCENARIO_TYPE}-${SCENARIO_NAME}" "owner=$OWNER"

In [None]:
# Prepare Terraform input JSON
# Builds INPUT_JSON from the scenario variables and drops every entry whose
# value is null or the empty string.
echo "Preparing Terraform input JSON"

# Ensure variables are set to defaults if empty to avoid jq errors.
# The node pool values are passed with --argjson and must therefore be valid JSON.
: "${NETWORK_POLICY:=}"
: "${NETWORK_DATAPLANE:=}"
: "${SYSTEM_NODE_POOL:=null}"
: "${USER_NODE_POOL:=null}"

# A single jq invocation both builds the object and filters it.
# The assignment is separate from `export` so a jq failure is not masked.
INPUT_JSON=$(jq -n \
  --arg run_id "$RUN_ID" \
  --arg region "$REGION" \
  --arg aks_sku_tier "$SKU_TIER" \
  --arg aks_kubernetes_version "$KUBERNETES_VERSION" \
  --arg aks_network_policy "$NETWORK_POLICY" \
  --arg aks_network_dataplane "$NETWORK_DATAPLANE" \
  --arg k8s_machine_type "Standard_D4s_v3" \
  --arg k8s_os_disk_type "Managed" \
  --argjson aks_cli_system_node_pool "$SYSTEM_NODE_POOL" \
  --argjson aks_cli_user_node_pool "$USER_NODE_POOL" \
  '{
     run_id: $run_id,
     region: $region,
     aks_sku_tier: $aks_sku_tier,
     aks_kubernetes_version: $aks_kubernetes_version,
     aks_network_policy: $aks_network_policy,
     aks_network_dataplane: $aks_network_dataplane,
     k8s_machine_type: $k8s_machine_type,
     k8s_os_disk_type: $k8s_os_disk_type,
     aks_cli_system_node_pool: $aks_cli_system_node_pool,
     aks_cli_user_node_pool: $aks_cli_user_node_pool
   }
   | with_entries(select(.value != null and .value != ""))')
export INPUT_JSON

# printf with a quoted argument avoids the word-splitting and glob expansion
# that an unquoted `echo $INPUT_JSON` applies to the JSON text.
printf '%s\n' "$INPUT_JSON" | jq .

In [None]:
# Terraform init & plan
# Runs inside TERRAFORM_MODULES_DIR. Terraform is only invoked when the
# directory change succeeded, and `plan` only runs after a successful `init`.
if [ -n "$TERRAFORM_MODULES_DIR" ] && pushd "$TERRAFORM_MODULES_DIR"; then
  # INPUT_JSON is compacted to one line and passed as the json_input variable
  terraform init &&
    terraform plan \
      -var "json_input=$(printf '%s' "$INPUT_JSON" | jq -c .)" \
      -var-file "$TERRAFORM_INPUT_FILE"
  popd
else
  echo "Error: cannot enter Terraform modules directory '$TERRAFORM_MODULES_DIR'" >&2
fi


In [None]:
# Terraform apply
# Applies without an interactive prompt (--auto-approve), so it is only run
# when the change into TERRAFORM_MODULES_DIR succeeded; otherwise it could act
# on whatever Terraform configuration the current directory holds.
if [ -n "$TERRAFORM_MODULES_DIR" ] && pushd "$TERRAFORM_MODULES_DIR"; then
  # INPUT_JSON is compacted to one line and passed as the json_input variable
  terraform apply \
    -var "json_input=$(printf '%s' "$INPUT_JSON" | jq -c .)" \
    -var-file "$TERRAFORM_INPUT_FILE" \
    --auto-approve
  popd
else
  echo "Error: cannot enter Terraform modules directory '$TERRAFORM_MODULES_DIR'" >&2
fi


In [None]:
# Attach ACR permissions so AKS can pull private images
# TODO: Set ACR_NAME (required) and, if the registry is looked up in a specific
# subscription, ACR_SUBSCRIPTION_ID (optional). Either export them before
# running this cell or fill in the empty defaults below.
export ACR_NAME="${ACR_NAME:-}"
export ACR_SUBSCRIPTION_ID="${ACR_SUBSCRIPTION_ID:-}"

# Reject an unset name, or a leftover "<your-acr-name>" style template
# placeholder, before any az call is made with it.
case "$ACR_NAME" in
  "" | "<"*)
    echo "Error: ACR_NAME is not set. Set it to your registry name and re-run this cell."
    exit 1
    ;;
esac

# A leftover "<your-subscription-id>" placeholder means "not provided", so the
# current-subscription lookup below is used instead.
case "$ACR_SUBSCRIPTION_ID" in
  "<"*) ACR_SUBSCRIPTION_ID="" ;;
esac

# Automatically find the cluster name in the resource group (first cluster listed)
CLUSTER_NAME=$(az aks list --resource-group "$RUN_ID" --query "[0].name" -o tsv)
export CLUSTER_NAME

if [ -z "$CLUSTER_NAME" ]; then
  echo "Error: No AKS cluster found in resource group $RUN_ID"
  exit 1
fi

# Resolve ACR Resource ID (Required for cross-subscription attach)
if [ -n "$ACR_SUBSCRIPTION_ID" ]; then
    echo "Looking up ACR '$ACR_NAME' in subscription '$ACR_SUBSCRIPTION_ID'..."
    ACR_ID=$(az acr show --name "$ACR_NAME" --subscription "$ACR_SUBSCRIPTION_ID" --query id -o tsv)
else
    echo "Looking up ACR '$ACR_NAME' in current subscription..."
    ACR_ID=$(az acr show --name "$ACR_NAME" --query id -o tsv 2>/dev/null)
fi

# When the lookup returned nothing, fall back to passing the registry name
if [ -z "$ACR_ID" ]; then
    echo "Warning: Could not find ACR ID. Attempting to attach by name..."
    ACR_ID=$ACR_NAME
else
    echo "Found ACR ID: $ACR_ID"
fi

echo "Attaching registry to cluster $CLUSTER_NAME..."
az aks update --resource-group "$RUN_ID" --name "$CLUSTER_NAME" --attach-acr "$ACR_ID"

In [None]:
# Get kubeconfig
# Uses CLUSTER_NAME when it is already set; otherwise looks up the first AKS
# cluster listed in resource group RUN_ID. Then fetches credentials and lists
# the nodes.
if [ -z "$CLUSTER_NAME" ]; then
    # Assignment kept separate from `export` so the az exit status is not masked
    CLUSTER_NAME=$(az aks list --resource-group "$RUN_ID" --query "[0].name" -o tsv)
    export CLUSTER_NAME
fi

# Stop here rather than calling get-credentials with an empty cluster name
if [ -z "$CLUSTER_NAME" ]; then
    echo "Error: No AKS cluster found in resource group $RUN_ID"
    exit 1
fi

echo "Getting credentials for $CLUSTER_NAME..."
az aks get-credentials --resource-group "$RUN_ID" --name "$CLUSTER_NAME" --overwrite-existing
kubectl get nodes

In [None]:
# Remove leftovers of an earlier Prometheus operator installation:
# its monitoring.coreos.com CRDs and its cluster-scoped RBAC objects.
# Each delete tolerates the object being absent (--ignore-not-found).
echo "Removing stale monitoring.coreos.com resources"

stale_crd_kinds=(alertmanagers podmonitors prometheuses servicemonitors thanosrulers probes)
for crd_kind in "${stale_crd_kinds[@]}"; do
  kubectl delete crd "${crd_kind}.monitoring.coreos.com" --ignore-not-found
done

stale_rbac_names=(prometheus-operator prometheus-operator-psp prometheus-operator-cm)
for rbac_name in "${stale_rbac_names[@]}"; do
  # For every name, the ClusterRole is removed first, then its binding
  for rbac_kind in clusterrole clusterrolebinding; do
    kubectl delete "$rbac_kind" "$rbac_name" --ignore-not-found
  done
done

In [None]:
# Run ClusterLoader2 image-pull scenario
# The path is quoted so a repository location containing spaces is not split,
# and ${ROOT_DIR:?} aborts with a message instead of resolving to
# /scenarios/... when ROOT_DIR is unset or empty.
"${ROOT_DIR:?ROOT_DIR is not set. Please run the first cell to initialize variables.}/scenarios/perf-eval/image-pull-test/run_cl2.sh"


In [None]:
# Show result files
# The path is quoted so a repository location containing spaces is not split,
# and ${ROOT_DIR:?} aborts with a message instead of listing /scenarios/...
# when ROOT_DIR is unset or empty.
ls -lah "${ROOT_DIR:?ROOT_DIR is not set. Please run the first cell to initialize variables.}/scenarios/perf-eval/image-pull-test/results"

In [None]:
# Debug: Check if Prometheus has the raw containerd metrics
echo "Checking Prometheus for containerd histogram metrics..."

# Port-forward to Prometheus (runs in the background; stopped at the end of this cell)
kubectl port-forward -n monitoring svc/prometheus-operated 9090:9090 &
PF_PID=$!
# Give the port-forward a moment to start listening
sleep 3

# count_series PROMQL
# Runs an instant query against the forwarded Prometheus and prints how many
# time series it returned.
# The expression is sent with -G/--data-urlencode so characters such as
# { } " = in label selectors are percent-encoded, and -g switches off curl's
# URL globbing; placing a selector like {a="b"} raw in the URL lets curl
# rewrite the braces and corrupt the query.
count_series() {
  curl -s -g -G --data-urlencode "query=$1" 'http://localhost:9090/api/v1/query' \
    | jq -r '.data.result | length' \
    | xargs -I {} echo "Found {} time series"
}

echo ""
echo "=== containerd_cri_image_pull_duration_seconds_bucket ==="
count_series 'containerd_cri_image_pull_duration_seconds_bucket'

echo ""
echo "=== kubelet_runtime_operations_duration_seconds_bucket ==="
count_series 'kubelet_runtime_operations_duration_seconds_bucket{operation_type="pull_image"}'

# Kill port-forward
kill "$PF_PID" 2>/dev/null

In [None]:
# Cleanup resources
# Destroys the Terraform-managed resources, then deletes resource group RUN_ID.
# `terraform destroy --auto-approve` is only run when the change into
# TERRAFORM_MODULES_DIR succeeded; otherwise it could act on whatever Terraform
# configuration the current directory holds.
if [ -n "$TERRAFORM_MODULES_DIR" ] && pushd "$TERRAFORM_MODULES_DIR"; then
  # INPUT_JSON is compacted to one line and passed as the json_input variable
  terraform destroy \
    -var "json_input=$(printf '%s' "$INPUT_JSON" | jq -c .)" \
    -var-file "$TERRAFORM_INPUT_FILE" \
    --auto-approve
  popd
else
  echo "Error: cannot enter Terraform modules directory '$TERRAFORM_MODULES_DIR'; skipping terraform destroy" >&2
fi

# The resource group is deleted regardless of the Terraform outcome
az group delete --name "$RUN_ID" -y


In [None]:
# Analyze Results
# The path is quoted so a repository location containing spaces is not split,
# and ${ROOT_DIR:?} aborts with a message instead of resolving to
# /scenarios/... when ROOT_DIR is unset or empty.
"${ROOT_DIR:?ROOT_DIR is not set. Please run the first cell to initialize variables.}/scenarios/perf-eval/image-pull-test/analyze_results.sh"
