From 0e0707faeb1e35f9003ee97bd4414a81b79d77e7 Mon Sep 17 00:00:00 2001
From: Ivan Sikiric
Date: Tue, 11 Nov 2025 12:22:19 +0100
Subject: [PATCH 1/4] Fix single-host TPU nodepools, always set num-nodes if not flex.

---
 goldens.yaml | 2 +
 goldens/Basic_cluster_create.txt | 2 +-
 ...Cluster_create_for_multi-host_nodepool.txt | 201 ++++++++++++++++++
 ...reate_for_single-host_single-slice_TPU.txt | 199 +++++++++++++++++
 goldens/Cluster_create_private.txt | 2 +-
 ...h_CPU_and_memory_limits_above_capacity.txt | 2 +-
 ...h_CPU_and_memory_limits_below_capacity.txt | 2 +-
 goldens/NAP_cluster-create.txt | 2 +-
 goldens/NAP_cluster-create_with_pathways.txt | 2 +-
 src/xpk/core/nodepool.py | 5 +-
 10 files changed, 411 insertions(+), 8 deletions(-)
 create mode 100644 goldens/Cluster_create_for_multi-host_nodepool.txt
 create mode 100644 goldens/Cluster_create_for_single-host_single-slice_TPU.txt

diff --git a/goldens.yaml b/goldens.yaml
index 692f659f4..bc65c7567 100644
--- a/goldens.yaml
+++ b/goldens.yaml
@@ -5,6 +5,8 @@ goldens:
     command: xpk cluster create --project=golden-project --zone=us-central1-a --enable-autoprovisioning --cluster=golden-cluster --tpu-type=tpu7x-8 --on-demand --dry-run
   "Basic cluster create":
     command: xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-8 --spot --dry-run
+  "Cluster create for multi-host nodepool":
+    command: xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-16 --spot --dry-run
   "Cluster create with CPU and memory limits below capacity":
     command: xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-8 --spot --cpu-limit=1 --memory-limit=1Mi --dry-run
   "Cluster create with CPU and memory limits above capacity":
diff --git a/goldens/Basic_cluster_create.txt b/goldens/Basic_cluster_create.txt
index e08282ca2..5b2657e3e 100644
--- a/goldens/Basic_cluster_create.txt
+++ b/goldens/Basic_cluster_create.txt
@@ -52,7 +52,7 @@ kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="Conf
 [XPK] Existing node pool names ['0']
 [XPK] Task: `Retrieve resource policy` is implemented by the following command not running since it is a dry run.
 gcloud compute resource-policies describe tpu7x-8-2x2x1-placement-policy --project=golden-project --region=us-central1
-[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --spot --placement-policy=tpu7x-8-2x2x1-placement-policy --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15
+[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --spot --placement-policy=tpu7x-8-2x2x1-placement-policy --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform"
 [XPK] Breaking up a total of 1 commands into 1 batches
 [XPK] Pretending all the jobs succeeded
 [XPK] Create or delete node pool request complete.
diff --git a/goldens/Cluster_create_for_multi-host_nodepool.txt b/goldens/Cluster_create_for_multi-host_nodepool.txt
new file mode 100644
index 000000000..eae794c59
--- /dev/null
+++ b/goldens/Cluster_create_for_multi-host_nodepool.txt
@@ -0,0 +1,201 @@
+$ xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-16 --spot --dry-run
+[XPK] Starting xpk v0.14.3
+[XPK] Starting cluster create for cluster golden-cluster:
+[XPK] Working on golden-project and us-central1-a
+[XPK] Task: `Determine server supported GKE versions for default rapid gke version` is implemented by the following command not running since it is a dry run.
+gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)"
+[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run.
+gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.validVersions)"
+[XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run.
+gcloud container clusters list --project=golden-project --filter=location~"us-central1.*" --format="csv[no-heading](name)"
+[XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run.
+gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --location-policy=BALANCED --scopes=storage-full,gke-default
+[XPK] Task: `Find cluster region or zone` is implemented by the following command not running since it is a dry run.
+gcloud container clusters list --project=golden-project --filter=name=golden-cluster --format="value(location)"
+[XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run.
+gcloud container clusters describe golden-cluster --project=golden-project --location=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)"
+[XPK] Private Nodes is not enabled on the cluster.
+[XPK] Cluster is public and no need to authorize networks.
+[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster
+[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run.
+gcloud container clusters get-credentials golden-cluster --location=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default
+[XPK] Testing credentials with kubectl...
+[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run.
+kubectl get pods
+[XPK] Credentials test succeeded.
+[XPK] Finished get-credentials and kubectl setup.
+[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system
+[XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run.
+kubectl get deployment coredns -n kube-system +[XPK] Now verifying CoreDNS readiness... +[XPK] Task: `Waiting for kubeDNS to be checked.` is implemented by the following command not running since it is a dry run. +kubectl get deployment kube-dns -n kube-system --ignore-not-found +[XPK] kube-dns deployment not found. +[XPK] Verifying if CoreDNS is available... +[XPK] Task: `Wait for coredns available` is implemented by the following command not running since it is a dry run. +kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-system --timeout=240s +[XPK] CoreDNS has successfully started and passed verification. +[XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'. +[XPK] Skipping CoreDNS deployment since it already exists. +[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" +[XPK] Creating 1 node pool or pools of tpu7x-16 +We assume that the underlying system is: SystemCharacteristics(topology='2x2x2', vms_per_slice=2, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-16', supports_sub_slicing=False, requires_workload_policy=True) +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" +[XPK] Creating 1 node pool or pools of tpu7x-16 +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x2', vms_per_slice=2, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-16', supports_sub_slicing=False, requires_workload_policy=True) +[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Existing node pool names ['0'] +[XPK] Task: `Retrieve resource policy` is implemented by the following command not running since it is a dry run. +gcloud compute resource-policies describe tpu7x-16-2x2x2-placement-policy --project=golden-project --region=us-central1 +[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --spot --placement-policy=tpu7x-16-2x2x2-placement-policy --enable-gvnic --node-version=0 --num-nodes=2 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15 +[XPK] Breaking up a total of 1 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Create or delete node pool request complete. 
+[XPK] Creating ConfigMap for cluster +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available +[XPK] Try 1: Install Jobset on golden-cluster +[XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating jobset Controller Manager resources +[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 +[XPK] Try 1: Install PathwaysJob on golden-cluster +[XPK] Task: `Install PathwaysJob on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.4/install.yaml +[XPK] Enabling Kueue on the cluster +[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run. +kubectl get deployment kueue-controller-manager -n kueue-system -o jsonpath='{.spec.template.spec.containers[0].image}' +[XPK] Installing Kueue version v0.14.3... +[XPK] Try 1: Install Kueue +[XPK] Task: `Install Kueue` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.3/manifests.yaml +[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. +kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m +[XPK] Applying following Kueue resources: +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ResourceFlavor +metadata: + name: "1xtpu7x-16" +spec: + nodeLabels: {"cloud.google.com/gke-tpu-accelerator": "tpu7x", "cloud.google.com/gke-tpu-topology": "2x2x2"} + +--- + +apiVersion: kueue.x-k8s.io/v1beta1 +kind: AdmissionCheck +metadata: + name: dws-prov +spec: + controllerName: kueue.x-k8s.io/provisioning-request + parameters: + apiGroup: kueue.x-k8s.io + kind: ProvisioningRequestConfig + name: dws-config +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ProvisioningRequestConfig +metadata: + name: dws-config +spec: + provisioningClassName: queued-provisioning.gke.io + podSetUpdates: + nodeSelector: + - key: autoscaling.gke.io/provisioning-request + valueFromProvisioningClassDetail: ResizeRequestName + managedResources: + - google.com/tpu +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ClusterQueue +metadata: + name: "cluster-queue" +spec: + preemption: + reclaimWithinCohort: Never # Don't preempt other queues in the cohort. + withinClusterQueue: LowerPriority + namespaceSelector: {} # match all. 
+ resourceGroups: [{'coveredResources': ['google.com/tpu'], 'flavors': [{'name': '1xtpu7x-16', 'resources': [{'name': 'google.com/tpu', 'nominalQuota': 8}]}]}] + +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: LocalQueue +metadata: + namespace: default + name: multislice-queue +spec: + clusterQueue: cluster-queue +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: very-low +value: 100 +globalDefault: false +description: "Very Low" +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: low +value: 250 +globalDefault: false +description: "Low" +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: medium +value: 500 +globalDefault: false +description: "Medium" +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: high +value: 750 +globalDefault: false +description: "High" +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: very-high +value: 1000 +globalDefault: false +description: "Very High" +[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f f0a510ac08b9c6d8f549478c49836dca41a72a347c491acac1fa70272d531056 +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating Kueue Controller Manager resources +[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}' +[XPK] Verifying kjob installation +[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run. +kubectl-kjob help +[XPK] kjob found +[XPK] Applying kjob CDRs +[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run. +kubectl kjob printcrds | kubectl apply --server-side -f - +[XPK] Creating kjob CRDs succeeded +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 +[XPK] GKE commands done! Resources are created. 
+[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
+[XPK] Exiting XPK cleanly
diff --git a/goldens/Cluster_create_for_single-host_single-slice_TPU.txt b/goldens/Cluster_create_for_single-host_single-slice_TPU.txt
new file mode 100644
index 000000000..60ff930fa
--- /dev/null
+++ b/goldens/Cluster_create_for_single-host_single-slice_TPU.txt
@@ -0,0 +1,199 @@
+$ xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=v4-8 --spot --num-slices=1 --dry-run
+[XPK] Starting xpk v0.14.3
+[XPK] Starting cluster create for cluster golden-cluster:
+[XPK] Working on golden-project and us-central1-a
+[XPK] Task: `Determine server supported GKE versions for default rapid gke version` is implemented by the following command not running since it is a dry run.
+gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)"
+[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run.
+gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.validVersions)"
+[XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run.
+gcloud container clusters list --project=golden-project --filter=location~"us-central1.*" --format="csv[no-heading](name)"
+[XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run.
+gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --location-policy=BALANCED --scopes=storage-full,gke-default
+[XPK] Task: `Find cluster region or zone` is implemented by the following command not running since it is a dry run.
+gcloud container clusters list --project=golden-project --filter=name=golden-cluster --format="value(location)"
+[XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run.
+gcloud container clusters describe golden-cluster --project=golden-project --location=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)"
+[XPK] Private Nodes is not enabled on the cluster.
+[XPK] Cluster is public and no need to authorize networks.
+[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster
+[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run.
+gcloud container clusters get-credentials golden-cluster --location=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default
+[XPK] Testing credentials with kubectl...
+[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run.
+kubectl get pods
+[XPK] Credentials test succeeded.
+[XPK] Finished get-credentials and kubectl setup.
+[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system +[XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run. +kubectl get deployment coredns -n kube-system +[XPK] Now verifying CoreDNS readiness... +[XPK] Task: `Waiting for kubeDNS to be checked.` is implemented by the following command not running since it is a dry run. +kubectl get deployment kube-dns -n kube-system --ignore-not-found +[XPK] kube-dns deployment not found. +[XPK] Verifying if CoreDNS is available... +[XPK] Task: `Wait for coredns available` is implemented by the following command not running since it is a dry run. +kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-system --timeout=240s +[XPK] CoreDNS has successfully started and passed verification. +[XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'. +[XPK] Skipping CoreDNS deployment since it already exists. +[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" +[XPK] Creating 1 node pool or pools of v4-8 +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v4-podslice', gce_machine_type='ct4p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v4-8', supports_sub_slicing=False, requires_workload_policy=False) +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" +[XPK] Creating 1 node pool or pools of v4-8 +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v4-podslice', gce_machine_type='ct4p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v4-8', supports_sub_slicing=False, requires_workload_policy=False) +[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Existing node pool names ['0'] +[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=ct4p-hightpu-4t --host-maintenance-interval=AS_NEEDED --spot --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" +[XPK] Breaking up a total of 1 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Create or delete node pool request complete. 
+[XPK] Creating ConfigMap for cluster +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available +[XPK] Try 1: Install Jobset on golden-cluster +[XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating jobset Controller Manager resources +[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 +[XPK] Try 1: Install PathwaysJob on golden-cluster +[XPK] Task: `Install PathwaysJob on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.4/install.yaml +[XPK] Enabling Kueue on the cluster +[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run. +kubectl get deployment kueue-controller-manager -n kueue-system -o jsonpath='{.spec.template.spec.containers[0].image}' +[XPK] Installing Kueue version v0.14.3... +[XPK] Try 1: Install Kueue +[XPK] Task: `Install Kueue` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.3/manifests.yaml +[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. +kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m +[XPK] Applying following Kueue resources: +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ResourceFlavor +metadata: + name: "1xv4-8" +spec: + nodeLabels: {"cloud.google.com/gke-tpu-accelerator": "tpu-v4-podslice", "cloud.google.com/gke-tpu-topology": "2x2x1"} + +--- + +apiVersion: kueue.x-k8s.io/v1beta1 +kind: AdmissionCheck +metadata: + name: dws-prov +spec: + controllerName: kueue.x-k8s.io/provisioning-request + parameters: + apiGroup: kueue.x-k8s.io + kind: ProvisioningRequestConfig + name: dws-config +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ProvisioningRequestConfig +metadata: + name: dws-config +spec: + provisioningClassName: queued-provisioning.gke.io + podSetUpdates: + nodeSelector: + - key: autoscaling.gke.io/provisioning-request + valueFromProvisioningClassDetail: ResizeRequestName + managedResources: + - google.com/tpu +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ClusterQueue +metadata: + name: "cluster-queue" +spec: + preemption: + reclaimWithinCohort: Never # Don't preempt other queues in the cohort. + withinClusterQueue: LowerPriority + namespaceSelector: {} # match all. 
+ resourceGroups: [{'coveredResources': ['google.com/tpu'], 'flavors': [{'name': '1xv4-8', 'resources': [{'name': 'google.com/tpu', 'nominalQuota': 4}]}]}] + +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: LocalQueue +metadata: + namespace: default + name: multislice-queue +spec: + clusterQueue: cluster-queue +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: very-low +value: 100 +globalDefault: false +description: "Very Low" +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: low +value: 250 +globalDefault: false +description: "Low" +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: medium +value: 500 +globalDefault: false +description: "Medium" +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: high +value: 750 +globalDefault: false +description: "High" +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: very-high +value: 1000 +globalDefault: false +description: "Very High" +[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f a3364905145decc397944f1b959444704898536e2a069bc443200feb2e3459fd +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating Kueue Controller Manager resources +[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}' +[XPK] Verifying kjob installation +[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run. +kubectl-kjob help +[XPK] kjob found +[XPK] Applying kjob CDRs +[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run. +kubectl kjob printcrds | kubectl apply --server-side -f - +[XPK] Creating kjob CRDs succeeded +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 +[XPK] GKE commands done! Resources are created. 
+[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
+[XPK] Exiting XPK cleanly
diff --git a/goldens/Cluster_create_private.txt b/goldens/Cluster_create_private.txt
index cd0daa4c2..7902db28f 100644
--- a/goldens/Cluster_create_private.txt
+++ b/goldens/Cluster_create_private.txt
@@ -54,7 +54,7 @@ gcloud beta container node-pools describe 0 --cluster golden-cluster-private --p
 [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run.
 kubectl get configmap golden-cluster-private-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true
 [XPK] Existing node pool names ['0']
-[XPK] To complete NodepoolCreate-golden-cluster-private-np-0 we are executing gcloud beta container node-pools create golden-cluster-private-np-0 --location=us-central1 --cluster=golden-cluster-private --project=golden-project --node-locations=us-central1-a --machine-type=ct5p-hightpu-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --tpu-topology=2x2x1 --max-pods-per-node 15
+[XPK] To complete NodepoolCreate-golden-cluster-private-np-0 we are executing gcloud beta container node-pools create golden-cluster-private-np-0 --location=us-central1 --cluster=golden-cluster-private --project=golden-project --node-locations=us-central1-a --machine-type=ct5p-hightpu-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform"
 [XPK] To complete NodepoolCreate-cpu-np we are executing gcloud beta container node-pools create cpu-np --node-version=0 --cluster=golden-cluster-private --project=golden-project --node-locations=us-central1-a --location=us-central1 --num-nodes=1 --machine-type=n2-standard-64 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --enable-autoscaling --min-nodes=1 --max-nodes=20
 [XPK] Breaking up a total of 2 commands into 1 batches
 [XPK] Pretending all the jobs succeeded
diff --git a/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt b/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt
index 0e660f4ec..fa3b63ae5 100644
--- a/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt
+++ b/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt
@@ -52,7 +52,7 @@ kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="Conf
 [XPK] Existing node pool names ['0']
 [XPK] Task: `Retrieve resource policy` is implemented by the following command not running since it is a dry run.
 gcloud compute resource-policies describe tpu7x-8-2x2x1-placement-policy --project=golden-project --region=us-central1
-[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --spot --placement-policy=tpu7x-8-2x2x1-placement-policy --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15
+[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --spot --placement-policy=tpu7x-8-2x2x1-placement-policy --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform"
 [XPK] Breaking up a total of 1 commands into 1 batches
 [XPK] Pretending all the jobs succeeded
 [XPK] Create or delete node pool request complete.
diff --git a/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt b/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt
index 51ebd6b9a..b7ce142c8 100644
--- a/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt
+++ b/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt
@@ -52,7 +52,7 @@ kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="Conf
 [XPK] Existing node pool names ['0']
 [XPK] Task: `Retrieve resource policy` is implemented by the following command not running since it is a dry run.
 gcloud compute resource-policies describe tpu7x-8-2x2x1-placement-policy --project=golden-project --region=us-central1
-[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --spot --placement-policy=tpu7x-8-2x2x1-placement-policy --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15
+[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --spot --placement-policy=tpu7x-8-2x2x1-placement-policy --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform"
 [XPK] Breaking up a total of 1 commands into 1 batches
 [XPK] Pretending all the jobs succeeded
 [XPK] Create or delete node pool request complete.
diff --git a/goldens/NAP_cluster-create.txt b/goldens/NAP_cluster-create.txt
index 6fd391b8b..df6d80416 100644
--- a/goldens/NAP_cluster-create.txt
+++ b/goldens/NAP_cluster-create.txt
@@ -52,7 +52,7 @@ kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="Conf
 [XPK] Existing node pool names ['0']
 [XPK] Task: `Retrieve resource policy` is implemented by the following command not running since it is a dry run.
 gcloud compute resource-policies describe tpu7x-8-2x2x1-placement-policy --project=golden-project --region=us-central1
-[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --placement-policy=tpu7x-8-2x2x1-placement-policy --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15
+[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --placement-policy=tpu7x-8-2x2x1-placement-policy --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform"
 [XPK] Breaking up a total of 1 commands into 1 batches
 [XPK] Pretending all the jobs succeeded
 [XPK] Create or delete node pool request complete.
diff --git a/goldens/NAP_cluster-create_with_pathways.txt b/goldens/NAP_cluster-create_with_pathways.txt
index 9c2173504..2d7166f9b 100644
--- a/goldens/NAP_cluster-create_with_pathways.txt
+++ b/goldens/NAP_cluster-create_with_pathways.txt
@@ -52,7 +52,7 @@ kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="Conf
 [XPK] Existing node pool names ['0']
 [XPK] Task: `Retrieve resource policy` is implemented by the following command not running since it is a dry run.
 gcloud compute resource-policies describe tpu7x-8-2x2x1-placement-policy --project=golden-project --region=us-central1
-[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --placement-policy=tpu7x-8-2x2x1-placement-policy --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15
+[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --placement-policy=tpu7x-8-2x2x1-placement-policy --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform"
 [XPK] To complete NodepoolCreate-cpu-np we are executing gcloud beta container node-pools create cpu-np --node-version=0 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --location=us-central1 --num-nodes=1 --machine-type=n2-standard-64 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --enable-autoscaling --min-nodes=1 --max-nodes=20
 [XPK] Breaking up a total of 2 commands into 1 batches
 [XPK] Pretending all the jobs succeeded
diff --git a/src/xpk/core/nodepool.py b/src/xpk/core/nodepool.py
index 66ac9aed6..fea2471dd 100644
--- a/src/xpk/core/nodepool.py
+++ b/src/xpk/core/nodepool.py
@@ -287,13 +287,14 @@ def run_gke_node_pool_create_command(
       topology_product = get_topology_product(system.topology)
       if capacity_type == CapacityType.FLEX_START:
         command += ' --num-nodes=0'
-      elif topology_product > 1:
+      else:
         command += f' --num-nodes={system.vms_per_slice}'
       command += (
           f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
       )
 
-      if topology_product > 1:
+      # --tpu-topology should not be set for single-host single-slice node pools
+      if system.vms_per_slice > 1 or args.num_slices > 1:
         # --placement-type=COMPACT enables group placement policy which
         # is mutually exclusive with workload policy, --tpu-topology should
         # also not be passed when workload policy is used

From fab1cc0b26cb671f311dad5424930a5b8965286c Mon Sep 17 00:00:00 2001
From: Ivan Sikiric
Date: Tue, 11 Nov 2025 12:46:27 +0100
Subject: [PATCH 2/4] clean up unused topology product

---
 src/xpk/core/nodepool.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/xpk/core/nodepool.py b/src/xpk/core/nodepool.py
index fea2471dd..cb888c5d1 100644
--- a/src/xpk/core/nodepool.py
+++ b/src/xpk/core/nodepool.py
@@ -16,7 +16,7 @@
 from typing import List
 
 from ..utils.console import ask_for_user_consent, xpk_print
-from ..utils.topology import get_topology_product, is_topology_valid
+from ..utils.topology import is_topology_valid
 from .capacity import (
     AUTOPROVISIONING_CONFIG_VALUE,
     H100_MEGA_DEVICE_TYPE,
@@ -284,7 +284,6 @@ def run_gke_node_pool_create_command(
     )
     if system.accelerator_type == AcceleratorType.TPU:
       command += f' --node-version={gke_node_pool_version}'
-      topology_product = get_topology_product(system.topology)
       if capacity_type == CapacityType.FLEX_START:
         command += ' --num-nodes=0'
       else:

From a1a59bd23f6df139d40fcb700b76151f4427c85e Mon Sep 17 00:00:00 2001
From: Ivan Sikiric
Date: Wed, 12 Nov 2025 10:30:58 +0100
Subject: [PATCH 3/4] update golden

---
 goldens/Cluster_create_with_shared_reservation.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/goldens/Cluster_create_with_shared_reservation.txt b/goldens/Cluster_create_with_shared_reservation.txt
index 490da9aba..5a9555228 100644
--- a/goldens/Cluster_create_with_shared_reservation.txt
+++ b/goldens/Cluster_create_with_shared_reservation.txt
@@ -54,7 +54,7 @@ kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="Conf
 [XPK] Existing node pool names ['0']
 [XPK] Task: `Retrieve resource policy` is implemented by the following command not running since it is a dry run.
 gcloud compute resource-policies describe tpu7x-8-2x2x1-placement-policy --project=golden-project --region=us-central1
-[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=projects/reservation-project/reservations/golden-reservation --placement-policy=tpu7x-8-2x2x1-placement-policy --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15
+[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=projects/reservation-project/reservations/golden-reservation --placement-policy=tpu7x-8-2x2x1-placement-policy --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform"
 [XPK] Breaking up a total of 1 commands into 1 batches
 [XPK] Pretending all the jobs succeeded
 [XPK] Create or delete node pool request complete.

From faab40006039c03d5beda589545cc4b546ebe039 Mon Sep 17 00:00:00 2001
From: Ivan Sikiric
Date: Wed, 12 Nov 2025 10:41:51 +0100
Subject: [PATCH 4/4] Also extend the fix to multi-slice single-host nodepools.

---
 src/xpk/core/nodepool.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/xpk/core/nodepool.py b/src/xpk/core/nodepool.py
index cb888c5d1..c760439e2 100644
--- a/src/xpk/core/nodepool.py
+++ b/src/xpk/core/nodepool.py
@@ -292,8 +292,8 @@ def run_gke_node_pool_create_command(
           f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
       )
 
-      # --tpu-topology should not be set for single-host single-slice node pools
-      if system.vms_per_slice > 1 or args.num_slices > 1:
+      # --tpu-topology should not be set for single-host node pools
+      if system.vms_per_slice > 1:
         # --placement-type=COMPACT enables group placement policy which
         # is mutually exclusive with workload policy, --tpu-topology should
         # also not be passed when workload policy is used
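
Note for reviewers: the net effect of this series on the TPU branch of run_gke_node_pool_create_command can be summarized with the small standalone sketch below. It is illustrative only: CapacityType is reduced to the cases that matter here, and TpuSystem and tpu_nodepool_sizing_flags are hypothetical stand-ins, not the real xpk types or API. Flags the series does not touch (reservations, placement policies, node version, scopes) are elided.

# Illustrative sketch of the post-series gating; hypothetical names, not xpk's API.
from dataclasses import dataclass
from enum import Enum, auto


class CapacityType(Enum):
  FLEX_START = auto()
  SPOT = auto()
  ON_DEMAND = auto()


@dataclass
class TpuSystem:
  topology: str            # e.g. '2x2x1' (single host) or '2x2x2' (multi host)
  vms_per_slice: int       # 1 means a single-host slice
  requires_workload_policy: bool


def tpu_nodepool_sizing_flags(system: TpuSystem, capacity_type: CapacityType) -> list[str]:
  """Sketch of the flag gating after PATCH 1 and PATCH 4; not the real implementation."""
  flags = []
  # PATCH 1: --num-nodes is now always set unless flex-start provisions the
  # nodes itself, so single-host pools no longer omit it.
  if capacity_type == CapacityType.FLEX_START:
    flags.append('--num-nodes=0')
  else:
    flags.append(f'--num-nodes={system.vms_per_slice}')
  # PATCH 1 + PATCH 4: topology and placement flags are gated on the pool
  # being multi-host; the number of slices no longer matters.
  if system.vms_per_slice > 1:
    if not system.requires_workload_policy:
      # Group placement is mutually exclusive with a workload policy.
      flags.append('--placement-type=COMPACT')
      flags.append(f'--tpu-topology={system.topology}')
    flags.append('--max-pods-per-node 15')
  return flags


if __name__ == '__main__':
  # v4-8-style single-host slice: only --num-nodes=1, no topology flags,
  # matching the new single-host golden.
  print(tpu_nodepool_sizing_flags(TpuSystem('2x2x1', 1, False), CapacityType.SPOT))
  # tpu7x-16-style multi-host slice on flex-start: empty pool plus the
  # multi-host-only flags.
  print(tpu_nodepool_sizing_flags(TpuSystem('2x2x2', 2, True), CapacityType.FLEX_START))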