From 3144f4ba0fd4bfabfc60a2dc9c248978f884000f Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Mon, 22 Sep 2025 15:50:14 +0000 Subject: [PATCH 1/4] feat: add a4x support --- Makefile | 8 +++- pyproject.toml | 1 + src/xpk/core/nodepool.py | 51 +++++++++++++++++++++++--- src/xpk/core/nodepool_test.py | 38 ++++++++++++++++++- src/xpk/core/system_characteristics.py | 18 +++++++++ src/xpk/utils/topology.py | 37 +++++++++++++++++++ src/xpk/utils/topology_test.py | 43 ++++++++++++++++++++++ 7 files changed, 188 insertions(+), 8 deletions(-) create mode 100644 src/xpk/utils/topology.py create mode 100644 src/xpk/utils/topology_test.py diff --git a/Makefile b/Makefile index 025bcf69c..58797b5b5 100644 --- a/Makefile +++ b/Makefile @@ -20,11 +20,15 @@ BIN_PATH=$(PROJECT_DIR)/bin install: check-python check-gcloud install-gcloud-auth-plugin install-kueuectl install-kjobctl pip-install .PHONY: install-dev -install-dev: check-python check-gcloud mkdir-bin install-kueuectl install-kjobctl pip-install install-pytest install-lint +install-dev: check-python check-gcloud mkdir-bin install-kueuectl install-kjobctl pip-install pip-install-dev install-pytest install-lint + +.PHONY: pip-install-dev +pip-install-dev: + pip install -e ".[dev]" .PHONY: pip-install pip-install: - pip install . + pip install -e . .PHONY: install-pytest install-pytest: diff --git a/pyproject.toml b/pyproject.toml index 5cf0fc826..35b212432 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,6 +62,7 @@ dev = [ "pylint>=2.6.0", "pre-commit", "pytest", + "pytest-mock==3.15.1", "docker==7.1.0", "mypy ~= 1.17", "types-PyYAML == 6.0.2", diff --git a/src/xpk/core/nodepool.py b/src/xpk/core/nodepool.py index 85ab6aba9..c33ef9f78 100644 --- a/src/xpk/core/nodepool.py +++ b/src/xpk/core/nodepool.py @@ -16,6 +16,7 @@ from typing import List from ..utils.console import get_user_input, xpk_print +from ..utils.topology import get_topology_product, is_topology_valid from .capacity import ( AUTOPROVISIONING_CONFIG_VALUE, H100_MEGA_DEVICE_TYPE, @@ -33,8 +34,7 @@ create_or_update_cluster_configmap, ) from .system_characteristics import AcceleratorType -from functools import reduce -from operator import mul + CLOUD_PLATFORM_AUTH_SCOPE_URL = ( '"https://www.googleapis.com/auth/cloud-platform"' @@ -271,6 +271,14 @@ def run_gke_node_pool_create_command( if return_code != 0: return 1 + placement_args = '' + if system.accelerator_type == AcceleratorType['GPU'] and is_topology_valid( + system.topology + ): + placement_policy = f'{args.cluster}-placement-policy' + ensure_resource_policy_exists(placement_policy, args, system.topology) + placement_args = f'--placement-policy={placement_policy}' + create_commands = [] create_task_names = [] for node_pool_name in desired_node_pool_names: @@ -285,13 +293,12 @@ def run_gke_node_pool_create_command( f' --machine-type={system.gce_machine_type}' f' --host-maintenance-interval={args.host_maintenance_interval}' f' {capacity_args}' + f' {placement_args}' ' --enable-gvnic' ) if system.accelerator_type == AcceleratorType['TPU']: command += f' --node-version={gke_node_pool_version}' - topology_product = reduce( - mul, (int(x) for x in system.topology.split('x')), 1 - ) + topology_product = get_topology_product(system.topology) if capacity_type == CapacityType.FLEX_START: command += ' --num-nodes=0' elif topology_product > 1: @@ -632,3 +639,37 @@ def get_desired_node_pool_names( result.add(f'{cluster_name}-np-{i}') i += 1 return list(result) + + +def ensure_resource_policy_exists( + resource_policy_name: str, args, topology: str +) -> None: + return_code, _ = run_command_for_value( + ( + 'gcloud compute resource-policies describe' + f' {resource_policy_name} ' + f'--project={args.project} ' + f'--region={zone_to_region(args.zone)}' + ), + 'Retrieve resource policy', + args, + ) + + if return_code == 0: + return + + return_code, _ = run_command_for_value( + ( + 'gcloud compute resource-policies create workload-policy ' + f'{resource_policy_name} ' + f'--project={args.project} ' + f'--region={zone_to_region(args.zone)} ' + '--type=HIGH_THROUGHPUT ' + f'--accelerator-topology={topology}' + ), + 'Create resource policy', + args, + ) + + if return_code != 0: + raise RuntimeError('Unable to create resource policy') diff --git a/src/xpk/core/nodepool_test.py b/src/xpk/core/nodepool_test.py index 71cc540c3..ac98922de 100644 --- a/src/xpk/core/nodepool_test.py +++ b/src/xpk/core/nodepool_test.py @@ -14,7 +14,8 @@ limitations under the License. """ -from xpk.core.nodepool import get_desired_node_pool_names +import pytest +from xpk.core.nodepool import get_desired_node_pool_names, ensure_resource_policy_exists CLUSTER_NAME = "running-cucumber" @@ -80,3 +81,38 @@ def test_compute_desired_node_pool_names_with_unknown_node_pools(): expected_result = [node_pool_name(0), node_pool_name(3)] assert set(result) == set(expected_result) + + +def test_ensure_resource_policy_exists_with_existing_policy_retrieves_existing_policy( + mocker, +): + args = mocker.Mock(project="test-project", zone="us-central1-a") + mock = mocker.patch( + "xpk.core.nodepool.run_command_for_value", return_value=(0, "") + ) + ensure_resource_policy_exists("resource-policy", args, "2x2x1") + mock.assert_called_once() + + +def test_ensure_resource_policy_exists_without_existing_policy_creates_policy( + mocker, +): + args = mocker.Mock(project="test-project", zone="us-central1-a") + mock = mocker.patch( + "xpk.core.nodepool.run_command_for_value", side_effect=[(1, ""), (0, "")] + ) + ensure_resource_policy_exists("resource-policy", args, "2x2x1") + assert mock.call_count == 2 + assert mock.call_args_list[0].args[1] == "Retrieve resource policy" + + +def test_ensure_resource_policy_exits_without_existing_policy_throws_when_creation_fails( + mocker, +): + with pytest.raises(RuntimeError): + args = mocker.Mock(project="test-project", zone="us-central1-a") + mocker.patch( + "xpk.core.nodepool.run_command_for_value", + side_effect=[(1, ""), (1, "")], + ) + ensure_resource_policy_exists("resource-policy", args, "2x2x1") diff --git a/src/xpk/core/system_characteristics.py b/src/xpk/core/system_characteristics.py index 0f627078b..b9ccdd4aa 100644 --- a/src/xpk/core/system_characteristics.py +++ b/src/xpk/core/system_characteristics.py @@ -203,6 +203,24 @@ def get_tpu_system_characteristics_map( AcceleratorType['GPU'], 'a100-40gb-8', ), + 'gb200-4': SystemCharacteristics( + '1x72', + 1, + 'nvidia-gb200', + 'a4x-highgpu-4g', + 4, + AcceleratorType['GPU'], + 'gb200-4', + ), + 'gb200-4-nolssd': SystemCharacteristics( + '1x72', + 1, + 'nvidia-gb200', + 'a4x-highgpu-4g-nolssd', + 4, + AcceleratorType['GPU'], + 'gb200-4', + ), 'b200-8': SystemCharacteristics( 'N/A', 1, diff --git a/src/xpk/utils/topology.py b/src/xpk/utils/topology.py new file mode 100644 index 000000000..3d9f6c474 --- /dev/null +++ b/src/xpk/utils/topology.py @@ -0,0 +1,37 @@ +""" +Copyright 2025 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from functools import reduce +from operator import mul + + +def is_topology_valid(topology: str) -> bool: + try: + parse_topology(topology) + return True + except ValueError: + return False + + +def get_topology_product(topology: str) -> int: + return reduce(mul, parse_topology(topology), 1) + + +def parse_topology(topology: str) -> list[int]: + if len(topology) <= 0: + raise ValueError("Topology is an empty string") + + return [int(el) for el in topology.lower().split("x")] diff --git a/src/xpk/utils/topology_test.py b/src/xpk/utils/topology_test.py new file mode 100644 index 000000000..78d6764a6 --- /dev/null +++ b/src/xpk/utils/topology_test.py @@ -0,0 +1,43 @@ +""" +Copyright 2025 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import pytest +from .topology import is_topology_valid, get_topology_product, parse_topology + + +def test_is_topology_valid_with_invalid_topology(): + result = is_topology_valid("N/A") + assert result is False + + +def test_is_topology_valid_with_valid_topology(): + result = is_topology_valid("1x1x1") + assert result is True + + +def test_parse_topology_with_valid_topology(): + result = parse_topology("1x2x3") + assert result == [1, 2, 3] + + +def test_parse_topology_with_empty_input(): + with pytest.raises(ValueError): + parse_topology("") + + +def test_get_topology_product(): + result = get_topology_product("1x2x3") + assert result == 6 From 56236acf64ed35cf63ed958884db055267935d8a Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Mon, 22 Sep 2025 15:55:30 +0000 Subject: [PATCH 2/4] feat: add goldens for a4x --- goldens.yaml | 2 + goldens/Cluster_create_with_gb200-4.txt | 110 ++++++++++++++++++++++++ src/xpk/core/nodepool.py | 4 +- 3 files changed, 114 insertions(+), 2 deletions(-) create mode 100644 goldens/Cluster_create_with_gb200-4.txt diff --git a/goldens.yaml b/goldens.yaml index c06f2883e..91f360dfd 100644 --- a/goldens.yaml +++ b/goldens.yaml @@ -5,6 +5,8 @@ goldens: command: python3 xpk.py cluster create --project=golden-project --zone=us-central1-a --enable-autoprovisioning --cluster=golden-cluster --tpu-type=tpu7x-8 --on-demand --dry-run "Basic cluster create": command: python3 xpk.py cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-8 --spot --dry-run + "Cluster create with gb200-4": + command: python3 xpk.py cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --device-type=gb200-4 --reservation=golden-reservation --dry-run "Cluster create private": command: python3 xpk.py cluster create-pathways --project=golden-project --zone=us-central1-a --cluster=golden-cluster-private --private --tpu-type=v5p-8 --num-slices=1 --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation=golden-reservation --dry-run "Cluster delete": diff --git a/goldens/Cluster_create_with_gb200-4.txt b/goldens/Cluster_create_with_gb200-4.txt new file mode 100644 index 000000000..42e4dcf00 --- /dev/null +++ b/goldens/Cluster_create_with_gb200-4.txt @@ -0,0 +1,110 @@ +[XPK] Starting xpk +[XPK] Starting cluster create for cluster golden-cluster: +[XPK] Working on golden-project and us-central1-a +[XPK] Task: `Determine server supported GKE versions for default rapid gke version` is implemented by the following command not running since it is a dry run. +gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)" +[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run. +gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.validVersions)" +[XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run. +gcloud container clusters list --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --enable-dataplane-v2 --enable-multi-networking --no-enable-autoupgrade --enable-ip-alias +[XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run. +gcloud container clusters describe golden-cluster --project=golden-project --region=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)" +[XPK] Private Nodes is not enabled on the cluster. +[XPK] Cluster is public and no need to authorize networks. +[XPK] Try 1: get-credentials to cluster golden-cluster +[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system +[XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run. +kubectl get deployment coredns -n kube-system +[XPK] Now verifying CoreDNS readiness... +[XPK] Task: `Waiting for kubeDNS to be checked.` is implemented by the following command not running since it is a dry run. +kubectl get deployment kube-dns -n kube-system --ignore-not-found +[XPK] kube-dns deployment not found. +[XPK] Verifying if CoreDNS is available... +[XPK] Task: `Wait for coredns available` is implemented by the following command not running since it is a dry run. +kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-system --timeout=240s +[XPK] CoreDNS has successfully started and passed verification. +[XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'. +[XPK] Skipping CoreDNS deployment since it already exists. +[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters describe golden-cluster --region us-central1 --project golden-project --format="value(currentMasterVersion)" +[XPK] Creating 1 node pool or pools of gb200-4 +We assume that the underlying system is: SystemCharacteristics(topology='1x72', vms_per_slice=1, gke_accelerator='nvidia-gb200', gce_machine_type='a4x-highgpu-4g', chips_per_vm=4, accelerator_type=2, device_type='gb200-4') +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. +gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a +[XPK] Creating 1 node pool with 2 nodes of gb200-4 +Underlyingly, we assume that means: SystemCharacteristics(topology='1x72', vms_per_slice=1, gke_accelerator='nvidia-gb200', gce_machine_type='a4x-highgpu-4g', chips_per_vm=4, accelerator_type=2, device_type='gb200-4') +[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --region=us-central1 --format="value(locations)" +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Existing node pool names ['0'] +[XPK] Task: `Retrieve resource policy` is implemented by the following command not running since it is a dry run. +gcloud compute resource-policies describe golden-cluster-placement-policy --project=golden-project --region=us-central1 +[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --region=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=a4x-highgpu-4g --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --placement-policy=golden-cluster-placement-policy --enable-gvnic --num-nodes=2 --accelerator type=nvidia-gb200,count=4,gpu-driver-version=latest --no-enable-autoupgrade --scopes="https://www.googleapis.com/auth/cloud-platform" +[XPK] Breaking up a total of 1 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Create or delete node pool request complete. +[XPK] Creating ConfigMap for cluster +[XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. +gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available +[XPK] Try 1: Install Jobset on golden-cluster +[XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating jobset Controller Manager resources +[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 +[XPK] Try 1: Install PathwaysJob on golden-cluster +[XPK] Task: `Install PathwaysJob on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.2/install.yaml +[XPK] Enabling Kueue on the cluster +[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run. +kubectl kueue version +[XPK] Try 1: Set Kueue On Cluster +[XPK] Task: `Set Kueue On Cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml +[XPK] Wait for Kueue to be fully available +[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. +kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m +[XPK] Install Kueue Custom Resources +[XPK] Try 1: Applying Kueue Custom Resources +[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 4cd56d148e911790cb27b2e75c1962022433407faf1127d45fb180cbe13f6853 +[XPK] Update Kueue Controller Manager resources +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating Kueue Controller Manager resources +[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 012e1b15b6941e9d47cb2cdb35488d57c2f3ce0ef0b18093d2759f2e02ed81dc +[XPK] Verifying kjob installation +[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run. +kubectl-kjob help +[XPK] kjob found +[XPK] Applying kjob CDRs +[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run. +kubectl kjob printcrds | kubectl apply --server-side -f - +[XPK] Creating kjob CRDs succeeded +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 +[XPK] Installing NCCL Plugin for cluster +[XPK] Task: `Install NCCL Plugin On Cluster` is implemented by the following command not running since it is a dry run. +kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml +[XPK] GKE commands done! Resources are created. +[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project +[XPK] Exiting XPK cleanly diff --git a/src/xpk/core/nodepool.py b/src/xpk/core/nodepool.py index c33ef9f78..5a2d69d7b 100644 --- a/src/xpk/core/nodepool.py +++ b/src/xpk/core/nodepool.py @@ -277,7 +277,7 @@ def run_gke_node_pool_create_command( ): placement_policy = f'{args.cluster}-placement-policy' ensure_resource_policy_exists(placement_policy, args, system.topology) - placement_args = f'--placement-policy={placement_policy}' + placement_args = f' --placement-policy={placement_policy}' create_commands = [] create_task_names = [] @@ -293,7 +293,7 @@ def run_gke_node_pool_create_command( f' --machine-type={system.gce_machine_type}' f' --host-maintenance-interval={args.host_maintenance_interval}' f' {capacity_args}' - f' {placement_args}' + f'{placement_args}' ' --enable-gvnic' ) if system.accelerator_type == AcceleratorType['TPU']: From c994f6b2f7c1a07a49a7ff794dcec7c6e63a273a Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Wed, 24 Sep 2025 07:14:54 +0000 Subject: [PATCH 3/4] feat: updates after develop changes --- goldens/Cluster_create_with_gb200-4.txt | 2 +- src/xpk/core/nodepool.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/goldens/Cluster_create_with_gb200-4.txt b/goldens/Cluster_create_with_gb200-4.txt index 42e4dcf00..915a48f50 100644 --- a/goldens/Cluster_create_with_gb200-4.txt +++ b/goldens/Cluster_create_with_gb200-4.txt @@ -79,7 +79,7 @@ kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=avai [XPK] Install Kueue Custom Resources [XPK] Try 1: Applying Kueue Custom Resources [XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f 4cd56d148e911790cb27b2e75c1962022433407faf1127d45fb180cbe13f6853 +kubectl apply -f 7aee1635a549cbab3308e64e5f973f49f1b09f0ea7c3633a60b69828be981fc5 [XPK] Update Kueue Controller Manager resources [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. kubectl get node --no-headers | wc -l diff --git a/src/xpk/core/nodepool.py b/src/xpk/core/nodepool.py index e1013cce0..2ed9d9fad 100644 --- a/src/xpk/core/nodepool.py +++ b/src/xpk/core/nodepool.py @@ -652,7 +652,6 @@ def ensure_resource_policy_exists( f'--region={zone_to_region(args.zone)}' ), 'Retrieve resource policy', - args, ) if return_code == 0: @@ -668,7 +667,6 @@ def ensure_resource_policy_exists( f'--accelerator-topology={topology}' ), 'Create resource policy', - args, ) if return_code != 0: From 49009a227a6e0f6614d89ddfcbc4678a3383c649 Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Wed, 24 Sep 2025 08:13:04 +0000 Subject: [PATCH 4/4] feat: update goldens --- goldens/Cluster_create_with_gb200-4.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/goldens/Cluster_create_with_gb200-4.txt b/goldens/Cluster_create_with_gb200-4.txt index 915a48f50..9b8fe82a6 100644 --- a/goldens/Cluster_create_with_gb200-4.txt +++ b/goldens/Cluster_create_with_gb200-4.txt @@ -1,3 +1,4 @@ +$ python3 xpk.py cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --device-type=gb200-4 --reservation=golden-reservation --dry-run [XPK] Starting xpk [XPK] Starting cluster create for cluster golden-cluster: [XPK] Working on golden-project and us-central1-a