From 9554a411fa02cee1dc39cace505f34f9189606f0 Mon Sep 17 00:00:00 2001 From: Konrad Kaim <31181410+scaliby@users.noreply.github.com> Date: Wed, 22 Oct 2025 14:32:23 +0200 Subject: [PATCH 01/10] Modular system_characteristics computation (#717) feat: modular system_characteristics computation --- src/xpk/core/system_characteristics.py | 20 ++++-- src/xpk/core/system_characteristics_test.py | 73 +++++++++++++++++++++ 2 files changed, 89 insertions(+), 4 deletions(-) create mode 100644 src/xpk/core/system_characteristics_test.py diff --git a/src/xpk/core/system_characteristics.py b/src/xpk/core/system_characteristics.py index edb9e75f9..b81224fa6 100644 --- a/src/xpk/core/system_characteristics.py +++ b/src/xpk/core/system_characteristics.py @@ -135,10 +135,9 @@ def get_tpu_system_characteristics_map( ) -> dict[str, SystemCharacteristics]: system_characteristics_map = {} for topology in supported_topologies: - total_chips = get_topology_product(topology) - num_tensorcores = total_chips * tensorcores_per_chip - chips_per_vm = 1 if total_chips == 1 else 4 - vms_per_slice = total_chips // chips_per_vm + chips_per_vm = compute_chips_per_vm(topology) + vms_per_slice = compute_vms_per_slice(topology) + num_tensorcores = compute_num_tensorcores(tensorcores_per_chip, topology) system = SystemCharacteristics( topology=topology, vms_per_slice=vms_per_slice, @@ -156,6 +155,19 @@ def get_tpu_system_characteristics_map( return system_characteristics_map +def compute_chips_per_vm(topology: str) -> int: + return 1 if get_topology_product(topology) == 1 else 4 + + +def compute_num_tensorcores(tensorcores_per_chip: int, topology: str) -> int: + return get_topology_product(topology) * tensorcores_per_chip + + +def compute_vms_per_slice(topology: str) -> int: + chips_per_vm = compute_chips_per_vm(topology) + return get_topology_product(topology) // chips_per_vm + + ################### Subcommand Helper Functions ############################# """ !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! IF YOU MODIFY THE BELOW UserFacingNameToSystemCharacteristics MAP YOU SHOULD diff --git a/src/xpk/core/system_characteristics_test.py b/src/xpk/core/system_characteristics_test.py new file mode 100644 index 000000000..f0933f0e0 --- /dev/null +++ b/src/xpk/core/system_characteristics_test.py @@ -0,0 +1,73 @@ +""" +Copyright 2025 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from .system_characteristics import get_tpu_system_characteristics_map, SystemCharacteristics + + +def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topology(): + result = get_tpu_system_characteristics_map( + prefix="test", + tensorcores_per_chip=1, + gke_accelerator="test", + machine_type="test", + supported_topologies=["1x1"], + supports_sub_slicing=False, + requires_workload_policy=True, + ) + + expected_system_characteristics = SystemCharacteristics( + topology="1x1", + vms_per_slice=1, + gke_accelerator="test", + gce_machine_type="test", + chips_per_vm=1, + accelerator_type=1, + device_type="test-1", + supports_sub_slicing=False, + requires_workload_policy=True, + ) + assert result == { + "test-1": expected_system_characteristics, + "test-1x1": expected_system_characteristics, + } + + +def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topology(): + result = get_tpu_system_characteristics_map( + prefix="test", + tensorcores_per_chip=2, + gke_accelerator="test", + machine_type="test", + supported_topologies=["2x2"], + supports_sub_slicing=False, + requires_workload_policy=True, + ) + + expected_system_characteristics = SystemCharacteristics( + topology="2x2", + vms_per_slice=1, + gke_accelerator="test", + gce_machine_type="test", + chips_per_vm=4, + accelerator_type=1, + device_type="test-8", + supports_sub_slicing=False, + requires_workload_policy=True, + ) + assert result == { + "test-8": expected_system_characteristics, + "test-2x2": expected_system_characteristics, + } From 70de626558a9436c1e7e283c92f66cfbeb5f9547 Mon Sep 17 00:00:00 2001 From: Konrad Kaim <31181410+scaliby@users.noreply.github.com> Date: Wed, 22 Oct 2025 15:05:34 +0200 Subject: [PATCH 02/10] Dynamic vms_per_slice computation for subslicing (#718) * feat: dynamic vms_per_slice computation for subslicing * style: apply peer review feedback * update goldens --- goldens/Workload_create.txt | 2 +- src/xpk/commands/workload.py | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/goldens/Workload_create.txt b/goldens/Workload_create.txt index c99ffd136..c2ca07090 100644 --- a/goldens/Workload_create.txt +++ b/goldens/Workload_create.txt @@ -25,7 +25,7 @@ docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. docker push gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. -kubectl apply -f 492d3bb4e1055d9d47679ed9c3ba617c304f47bac9b83fea3c14507b04a65453 +kubectl apply -f abc33690f7a11b2ba50a8f949970fd3ba812f088367b7f64260729f01f41a231 [XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error [XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard. diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index 68241f05d..41aa35547 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -78,6 +78,7 @@ from ..core.system_characteristics import ( AcceleratorType, get_system_characteristics, + compute_vms_per_slice, ) from ..core.vertex import create_vertex_experiment from ..core.workload import ( @@ -120,8 +121,8 @@ replicas: {args.num_slices} template: spec: - parallelism: {system.vms_per_slice} # Equal to the number of VMs per slice - completions: {system.vms_per_slice} # Same as the above. + parallelism: {vms_per_slice} # Equal to the number of VMs per slice (or sub-slice). + completions: {vms_per_slice} # Same as the above. backoffLimit: 0 # When any pod fails, the job is failed {pod_failure_policy} template: @@ -558,8 +559,14 @@ def workload_create(args) -> None: ) yml_string = WORKLOAD_CREATE_YAML.format( args=args, - system=system, container=container, + vms_per_slice=( + compute_vms_per_slice(args.sub_slicing_topology) + if system.accelerator_type == AcceleratorType['TPU'] + and FeatureFlags.SUB_SLICING_ENABLED + and args.sub_slicing_topology is not None + else system.vms_per_slice + ), affinity=get_cpu_affinity(system.accelerator_type), accelerator_label=create_accelerator_label( system.accelerator_type, system From fc4642c12b4dd4f875874b08a91e1c3c33d57211 Mon Sep 17 00:00:00 2001 From: Dominik Rabij Date: Wed, 22 Oct 2025 17:37:57 +0200 Subject: [PATCH 03/10] feat: Add and use sub-slicing topology during Kueue configuration (#719) * feat: Add and use sub-slicing topology * style: fix circular dependency and make goldens * fix kind KueueConfig * revert: re-add assertNotIn Topology * style: remove unnecessary brackets * style: apply suggestion --- goldens/Basic_cluster_create.txt | 5 +- goldens/Cluster_create_private.txt | 7 +- goldens/Cluster_create_with_gb200-4.txt | 5 +- goldens/NAP_cluster-create.txt | 5 +- goldens/NAP_cluster-create_with_pathways.txt | 7 +- src/xpk/commands/cluster.py | 4 ++ src/xpk/commands/cluster_gcluster.py | 4 ++ src/xpk/commands/kind.py | 1 + src/xpk/core/kueue_manager.py | 67 ++++++++++++++----- src/xpk/core/kueue_manager_test.py | 46 ++++++++++++- src/xpk/templates/kueue_config.yaml.j2 | 3 +- ....j2 => kueue_gke_default_topology.yaml.j2} | 0 .../kueue_sub_slicing_topology.yaml.j2 | 14 ++++ 13 files changed, 130 insertions(+), 38 deletions(-) rename src/xpk/templates/{kueue_topology.yaml.j2 => kueue_gke_default_topology.yaml.j2} (100%) create mode 100644 src/xpk/templates/kueue_sub_slicing_topology.yaml.j2 diff --git a/goldens/Basic_cluster_create.txt b/goldens/Basic_cluster_create.txt index 9087adfdf..359117b8a 100644 --- a/goldens/Basic_cluster_create.txt +++ b/goldens/Basic_cluster_create.txt @@ -81,7 +81,6 @@ kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-s [XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m [XPK] Applying following Kueue resources: ---- apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: @@ -89,8 +88,8 @@ metadata: spec: nodeLabels: {"cloud.google.com/gke-tpu-accelerator": "tpu7x", "cloud.google.com/gke-tpu-topology": "2x2x1"} - --- + apiVersion: kueue.x-k8s.io/v1beta1 kind: AdmissionCheck metadata: @@ -175,7 +174,7 @@ value: 1000 globalDefault: false description: "Very High" [XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f a1fe8e014a200d6489b8871301a9e80de7e6f45e94b61ad0e60f40f254711bec +kubectl apply -f ce52d2868b681f478f3f12e5696b1609e68b442a32f7f82603ba7064b825cf4f [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. kubectl get node --no-headers | wc -l [XPK] Try 1: Updating Kueue Controller Manager resources diff --git a/goldens/Cluster_create_private.txt b/goldens/Cluster_create_private.txt index 1a9128cbc..d17a4a9ae 100644 --- a/goldens/Cluster_create_private.txt +++ b/goldens/Cluster_create_private.txt @@ -86,7 +86,6 @@ kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-s [XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m [XPK] Applying following Kueue resources: ---- apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: @@ -94,8 +93,8 @@ metadata: spec: nodeLabels: {"cloud.google.com/gke-tpu-accelerator": "tpu-v5p-slice", "cloud.google.com/gke-tpu-topology": "2x2x1"} - --- + apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: @@ -103,8 +102,8 @@ metadata: spec: nodeLabels: {"cloud.google.com/gke-nodepool": "cpu-np"} - --- + apiVersion: kueue.x-k8s.io/v1beta1 kind: AdmissionCheck metadata: @@ -189,7 +188,7 @@ value: 1000 globalDefault: false description: "Very High" [XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f 0ff2bce892606d1497f21fc7b2cea78a4ee103094ce0f509211f3f9730536ad6 +kubectl apply -f 1838a525f73d2cfd19aa616665e8b33fcc4ea10ba8d6015a9307109a6be6d372 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. kubectl get node --no-headers | wc -l [XPK] Try 1: Updating Kueue Controller Manager resources diff --git a/goldens/Cluster_create_with_gb200-4.txt b/goldens/Cluster_create_with_gb200-4.txt index 70c76fab9..dc3afe117 100644 --- a/goldens/Cluster_create_with_gb200-4.txt +++ b/goldens/Cluster_create_with_gb200-4.txt @@ -85,7 +85,6 @@ kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-s [XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m [XPK] Applying following Kueue resources: ---- apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: @@ -93,8 +92,8 @@ metadata: spec: nodeLabels: {"cloud.google.com/gke-accelerator": "nvidia-gb200"} - --- + apiVersion: kueue.x-k8s.io/v1beta1 kind: AdmissionCheck metadata: @@ -179,7 +178,7 @@ value: 1000 globalDefault: false description: "Very High" [XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f f807069b73747a423ec0d1915b2e919cfde400b01654de15746b566709b80f7e +kubectl apply -f 5c5d70f8d2bbedea9acccd9c1a153e2f55efd31cc61d2b55ecdd4a8f009fab11 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. kubectl get node --no-headers | wc -l [XPK] Try 1: Updating Kueue Controller Manager resources diff --git a/goldens/NAP_cluster-create.txt b/goldens/NAP_cluster-create.txt index be3d5f03f..2210b47ca 100644 --- a/goldens/NAP_cluster-create.txt +++ b/goldens/NAP_cluster-create.txt @@ -92,7 +92,6 @@ kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-s [XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m [XPK] Applying following Kueue resources: ---- apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: @@ -100,8 +99,8 @@ metadata: spec: nodeLabels: {"cloud.google.com/gke-tpu-accelerator": "tpu7x"} - --- + apiVersion: kueue.x-k8s.io/v1beta1 kind: AdmissionCheck metadata: @@ -186,7 +185,7 @@ value: 1000 globalDefault: false description: "Very High" [XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f e5fccf0957dcb7f60400bb4e28ce8c5fc251a9aeb6d67793dc119554d13dc900 +kubectl apply -f 40a7aac5b047c750ee98477984af3d46acd60d164d852eccd1b47a21c4155f2d [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. kubectl get node --no-headers | wc -l [XPK] Try 1: Updating Kueue Controller Manager resources diff --git a/goldens/NAP_cluster-create_with_pathways.txt b/goldens/NAP_cluster-create_with_pathways.txt index 8e55957df..a202bc3a9 100644 --- a/goldens/NAP_cluster-create_with_pathways.txt +++ b/goldens/NAP_cluster-create_with_pathways.txt @@ -93,7 +93,6 @@ kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-s [XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m [XPK] Applying following Kueue resources: ---- apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: @@ -101,8 +100,8 @@ metadata: spec: nodeLabels: {"cloud.google.com/gke-tpu-accelerator": "tpu7x"} - --- + apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: @@ -110,8 +109,8 @@ metadata: spec: nodeLabels: {"cloud.google.com/gke-nodepool": "cpu-np"} - --- + apiVersion: kueue.x-k8s.io/v1beta1 kind: AdmissionCheck metadata: @@ -196,7 +195,7 @@ value: 1000 globalDefault: false description: "Very High" [XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f edda4d2ffefedaabd6f77eb6d07a05c1bac829ed90a4c3983b21ded280136557 +kubectl apply -f f89effb1f55aef327018037d75f743b5c62d59f1f62fddadaaa31f72e5e07bdf [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. kubectl get node --no-headers | wc -l [XPK] Try 1: Updating Kueue Controller Manager resources diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index 60a9b0b83..5ff9ce277 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -16,6 +16,7 @@ from tabulate import tabulate +from ..utils.feature_flags import FeatureFlags from ..core.capacity import H100_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE from ..core.cluster import ( get_all_clusters_programmatic, @@ -1251,6 +1252,9 @@ def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config): memory_limit=args.memory_limit, cpu_limit=args.cpu_limit, is_pathways_cluster=args.enable_pathways, + configure_sub_slicing=( + FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing + ), ), ) diff --git a/src/xpk/commands/cluster_gcluster.py b/src/xpk/commands/cluster_gcluster.py index 0ac48c35c..195e22bb2 100644 --- a/src/xpk/commands/cluster_gcluster.py +++ b/src/xpk/commands/cluster_gcluster.py @@ -16,6 +16,7 @@ import os +from ..utils.feature_flags import FeatureFlags from ..utils.execution_context import is_dry_run from ..core.kueue_manager import KueueConfig, KueueManager from ..core.nap import enable_autoprovisioning_on_cluster @@ -159,6 +160,9 @@ def __install_kueue(args) -> int: cpu_limit=args.cpu_limit, is_pathways_cluster=args.enable_pathways, flex=args.flex, + configure_sub_slicing=( + FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing + ), ), tolerations=tolerations, ) diff --git a/src/xpk/commands/kind.py b/src/xpk/commands/kind.py index fe40e342b..93f01a5bf 100644 --- a/src/xpk/commands/kind.py +++ b/src/xpk/commands/kind.py @@ -110,6 +110,7 @@ def cluster_create(args) -> None: cpu_limit=0, is_pathways_cluster=False, flex=False, + configure_sub_slicing=False, ), ) diff --git a/src/xpk/core/kueue_manager.py b/src/xpk/core/kueue_manager.py index 73583b4cf..582e27519 100644 --- a/src/xpk/core/kueue_manager.py +++ b/src/xpk/core/kueue_manager.py @@ -44,9 +44,11 @@ WAIT_FOR_KUEUE_TIMEOUT = "10m" CLUSTER_QUEUE_NAME = "cluster-queue" LOCAL_QUEUE_NAME = "multislice-queue" +SUB_SLICE_TOPOLOGY_NAME = "sub-slice-topology" KUEUE_CONFIG_JINJA_FILE = "kueue_config.yaml.j2" -KUEUE_TOPOLOGY_JINJA_FILE = "kueue_topology.yaml.j2" +KUEUE_GKE_DEFAULT_TOPOLOGY_JINJA_FILE = "kueue_gke_default_topology.yaml.j2" KUEUE_CONTROLLER_MANAGER_JINJA_FILE = "kueue_controller_manager.yaml.j2" +KUEUE_SUB_SLICING_TOPOLOGY_JINJA_FILE = "kueue_sub_slicing_topology.yaml.j2" MEMORY_SIZE_PER_VM = 1.2 MIN_MEMORY_LIMIT_SIZE = 4096 KUEUE_VERSION = "v0.14.1" @@ -58,12 +60,19 @@ class KueueConfig: total_chips: int cpu_limit: int memory_limit: str + configure_sub_slicing: bool is_pathways_cluster: bool = False autoprovisioning_enabled: bool = False flex: bool = False num_slices: int = 1 +@dataclass +class _NameAndYaml: + name: str + yaml: str + + class KueueManager: """Manages the installation and configuration of Kueue on an XPK cluster.""" @@ -208,6 +217,13 @@ def __configure( """ template = self.template_env.get_template(KUEUE_CONFIG_JINJA_FILE) + topology_name_and_yaml = self.__get_topology_name_and_yaml( + kueue_config.system, kueue_config.configure_sub_slicing + ) + topology_name = ( + topology_name_and_yaml.name if topology_name_and_yaml else None + ) + # The manager builds the context internally based on its opinionated logic context = self.__build_template_context( system=kueue_config.system, @@ -218,18 +234,16 @@ def __configure( num_slices=kueue_config.num_slices, cpu_limit=kueue_config.cpu_limit, memory_limit=kueue_config.memory_limit, + topology_name=topology_name, ) - rendered_manifest = template.render(context) + config_yaml = template.render(context) + yamls = [config_yaml] - if kueue_config.system.device_type in [ - H100_MEGA_DEVICE_TYPE, - H200_DEVICE_TYPE, - B200_DEVICE_TYPE, - ]: - topology_yaml = self.template_env.get_template(KUEUE_TOPOLOGY_JINJA_FILE) - rendered_manifest = topology_yaml.render() + rendered_manifest + if topology_name_and_yaml: + yamls.append(topology_name_and_yaml.yaml) + rendered_manifest = "\n---\n".join(yamls) return_code = self.__apply_manifest(rendered_manifest) if return_code != 0: return return_code @@ -246,6 +260,7 @@ def __build_template_context( num_slices: int, cpu_limit: int, memory_limit: str, + topology_name: str | None, ) -> Dict[str, Any]: """Prepares the context for the Jinja2 template.""" # Main accelerator flavor @@ -267,13 +282,7 @@ def __build_template_context( key, value = machine_label.split(":", 1) node_labels_dict[key] = value.strip() - topology_label = "" - if system.device_type in [ - H100_MEGA_DEVICE_TYPE, - H200_DEVICE_TYPE, - B200_DEVICE_TYPE, - ]: - topology_label = 'topologyName: "gke-default"' + topology_label = f"topologyName: {topology_name}" if topology_name else "" flavors = [{ "name": main_flavor_name, @@ -335,6 +344,32 @@ def __build_template_context( "admission_checks": admission_checks, } + def __get_topology_name_and_yaml( + self, system: SystemCharacteristics, configure_sub_slicing: bool + ) -> _NameAndYaml | None: + if system.device_type in [ + H100_MEGA_DEVICE_TYPE, + H200_DEVICE_TYPE, + B200_DEVICE_TYPE, + ]: + return _NameAndYaml( + name="gke-default", + yaml=self.template_env.get_template( + KUEUE_GKE_DEFAULT_TOPOLOGY_JINJA_FILE + ).render(), + ) + elif configure_sub_slicing: + return _NameAndYaml( + name=SUB_SLICE_TOPOLOGY_NAME, + yaml=self.template_env.get_template( + KUEUE_SUB_SLICING_TOPOLOGY_JINJA_FILE + ).render({ + "sub_slice_topology_name": SUB_SLICE_TOPOLOGY_NAME, + }), + ) + else: + return None + def __apply_manifest(self, manifest: str) -> int: task = "Applying Kueue Custom Resources" if is_dry_run(): diff --git a/src/xpk/core/kueue_manager_test.py b/src/xpk/core/kueue_manager_test.py index 37f05c30e..7a793aedc 100644 --- a/src/xpk/core/kueue_manager_test.py +++ b/src/xpk/core/kueue_manager_test.py @@ -240,6 +240,7 @@ def test_configuration_updates_resources( total_chips=8, cpu_limit=100, memory_limit="100Gi", + configure_sub_slicing=False, ) with ( @@ -265,6 +266,7 @@ def test_resource_update_for_small_cluster(self, mock_run_retry): total_chips=8, cpu_limit=100, memory_limit="100Gi", + configure_sub_slicing=False, ) with ( @@ -307,6 +309,7 @@ def test_resource_update_for_large_cluster(self, mock_run_retry): total_chips=8, cpu_limit=100, memory_limit="100Gi", + configure_sub_slicing=False, ) with ( @@ -344,7 +347,7 @@ def test_resource_update_for_large_cluster(self, mock_run_retry): @patch( "xpk.core.kueue_manager.KueueManager._KueueManager__update_kueue_resources_if_necessary" ) - def test_configure_generates_correct_manifest( + def test_configure_generates_correct_manifest_for_tpu( self, mock_update_resources, mock_install ): """Test that __configure generates the correct manifest content for TPUs.""" @@ -357,6 +360,7 @@ def test_configure_generates_correct_manifest( memory_limit="100Gi", autoprovisioning_enabled=False, num_slices=2, + configure_sub_slicing=False, ) rendered_manifest = self._trigger_installation(kueue_config) @@ -413,6 +417,7 @@ def test_configure_generates_manifest_with_admission_checks_for_flex_single_slic autoprovisioning_enabled=False, num_slices=1, flex=True, + configure_sub_slicing=False, ) rendered_manifest = self._trigger_installation(kueue_config) @@ -432,7 +437,7 @@ def test_configure_generates_manifest_with_admission_checks_for_flex_single_slic @patch( "xpk.core.kueue_manager.KueueManager._KueueManager__update_kueue_resources_if_necessary" ) - def test_configure_generates_correct_manifest_with_topology( + def test_configure_generates_correct_manifest_with_gke_default_topology( self, mock_update_resources, mock_install ): """Test that __configure generates correct manifest for GPUs.""" @@ -444,11 +449,11 @@ def test_configure_generates_correct_manifest_with_topology( cpu_limit=100, memory_limit="100Gi", num_slices=2, + configure_sub_slicing=False, ) rendered_manifest = self._trigger_installation(kueue_config) - self.assertIn("kind: Topology", rendered_manifest) manifest_docs = list(yaml.safe_load_all(rendered_manifest)) resource_flavor = _first( doc for doc in manifest_docs if doc["kind"] == "ResourceFlavor" @@ -459,6 +464,40 @@ def test_configure_generates_correct_manifest_with_topology( ], "h100-mega-80gb-8", ) + self.assertEqual(resource_flavor["spec"]["topologyName"], "gke-default") + topology = _first(doc for doc in manifest_docs if doc["kind"] == "Topology") + self.assertEqual(topology["metadata"]["name"], "gke-default") + + @patch("xpk.core.kueue_manager.KueueManager._KueueManager__install") + @patch( + "xpk.core.kueue_manager.KueueManager._KueueManager__update_kueue_resources_if_necessary" + ) + def test_configure_generates_correct_manifest_with_sub_slicing( + self, mock_update_resources, mock_install + ): + """Test that __configure generates correct manifest with sub-slicing topology.""" + mock_install.return_value = 0 + mock_update_resources.return_value = 0 + kueue_config = KueueConfig( + system=self.mock_system_chars, + total_chips=16, + cpu_limit=100, + memory_limit="100Gi", + num_slices=2, + configure_sub_slicing=True, + ) + + rendered_manifest = self._trigger_installation(kueue_config) + + manifest_docs = list(yaml.safe_load_all(rendered_manifest)) + resource_flavor = _first( + doc for doc in manifest_docs if doc["kind"] == "ResourceFlavor" + ) + self.assertEqual( + resource_flavor["spec"]["topologyName"], "sub-slice-topology" + ) + topology = _first(doc for doc in manifest_docs if doc["kind"] == "Topology") + self.assertEqual(topology["metadata"]["name"], "sub-slice-topology") @patch("xpk.core.kueue_manager.KueueManager._KueueManager__install") @patch( @@ -477,6 +516,7 @@ def test_configure_generates_correct_manifest_with_pathways( memory_limit="100Gi", is_pathways_cluster=True, num_slices=2, + configure_sub_slicing=False, ) rendered_manifest = self._trigger_installation(kueue_config) diff --git a/src/xpk/templates/kueue_config.yaml.j2 b/src/xpk/templates/kueue_config.yaml.j2 index ecb31320a..6b25350bc 100644 --- a/src/xpk/templates/kueue_config.yaml.j2 +++ b/src/xpk/templates/kueue_config.yaml.j2 @@ -1,5 +1,4 @@ {% for flavor in flavors %} ---- apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: @@ -9,8 +8,8 @@ spec: {% if flavor.topologyLabel %} {{ flavor.topologyLabel }} {% endif %} -{% endfor %} --- +{% endfor %} apiVersion: kueue.x-k8s.io/v1beta1 kind: AdmissionCheck metadata: diff --git a/src/xpk/templates/kueue_topology.yaml.j2 b/src/xpk/templates/kueue_gke_default_topology.yaml.j2 similarity index 100% rename from src/xpk/templates/kueue_topology.yaml.j2 rename to src/xpk/templates/kueue_gke_default_topology.yaml.j2 diff --git a/src/xpk/templates/kueue_sub_slicing_topology.yaml.j2 b/src/xpk/templates/kueue_sub_slicing_topology.yaml.j2 new file mode 100644 index 000000000..87e874e57 --- /dev/null +++ b/src/xpk/templates/kueue_sub_slicing_topology.yaml.j2 @@ -0,0 +1,14 @@ +apiVersion: kueue.x-k8s.io/v1beta1 +kind: Topology +metadata: + name: {{ sub_slice_topology_name }} +spec: + levels: + - nodeLabel: "cloud.google.com/gke-tpu-slice-16x16-id" + - nodeLabel: "cloud.google.com/gke-tpu-slice-8x16-id" + - nodeLabel: "cloud.google.com/gke-tpu-slice-8x8-id" + - nodeLabel: "cloud.google.com/gke-tpu-slice-4x8-id" + - nodeLabel: "cloud.google.com/gke-tpu-slice-4x4-id" + - nodeLabel: "cloud.google.com/gke-tpu-slice-2x4-id" + - nodeLabel: "cloud.google.com/gke-tpu-slice-2x2-id" + - nodeLabel: "kubernetes.io/hostname" From 15548be2b76f5c0d66c1d6f3b74f25f3ab887b9c Mon Sep 17 00:00:00 2001 From: FIoannides Date: Thu, 23 Oct 2025 10:34:52 +0200 Subject: [PATCH 04/10] Merge main back to develop (#724) * Release v0.13.0 * Release v0.13.0 * Release v0.14.0 --- src/xpk/core/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpk/core/config.py b/src/xpk/core/config.py index b67a6026a..9873608e9 100644 --- a/src/xpk/core/config.py +++ b/src/xpk/core/config.py @@ -22,7 +22,7 @@ from ..utils.console import xpk_print # This is the version for XPK PyPI package -__version__ = 'v0.13.0' +__version__ = 'v0.14.0' XPK_CURRENT_VERSION = __version__ XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml') From ce2db901ef9a87947192c508a32b0d57a16d3db6 Mon Sep 17 00:00:00 2001 From: Konrad Kaim <31181410+scaliby@users.noreply.github.com> Date: Thu, 23 Oct 2025 13:00:21 +0200 Subject: [PATCH 05/10] Subslicing topology validation (#722) * feat: add topology validation util * feat: subslicing topology validation * style: peer review fixes * feat: rename method --- src/xpk/commands/workload.py | 29 ++++++++++++++- src/xpk/commands/workload_test.py | 62 +++++++++++++++++++++++++++++++ src/xpk/utils/topology.py | 9 +++++ src/xpk/utils/topology_test.py | 22 ++++++++++- 4 files changed, 120 insertions(+), 2 deletions(-) create mode 100644 src/xpk/commands/workload_test.py diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index 41aa35547..05021f3fa 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -52,7 +52,7 @@ get_user_workload_for_pathways, try_to_delete_pathwaysjob_first, ) -from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics +from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics, SystemCharacteristics from ..core.resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap from ..core.scheduling import ( check_if_workload_can_schedule, @@ -100,6 +100,7 @@ from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies from . import cluster_gcluster from .common import is_TAS_possible +from ..utils.topology import is_topology_contained from ..utils.feature_flags import FeatureFlags WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2 @@ -281,6 +282,8 @@ {user_workload} """ +SUB_SLICING_TOPOLOGIES = ['2x2', '2x4', '4x4', '4x8', '8x8', '8x16', '16x16'] + def workload_create_pathways(args) -> None: """Run jobset apply command for a file, specifically for Pathways. @@ -338,6 +341,9 @@ def workload_create(args) -> None: xpk_print('Fetching system characteristics failed!') xpk_exit(return_code) + if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing_topology is not None: + _validate_sub_slicing_topology(system, args.sub_slicing_topology) + if not check_if_workload_can_schedule(args, system): xpk_exit(1) @@ -674,6 +680,27 @@ def workload_create(args) -> None: xpk_exit(0) +def _validate_sub_slicing_topology( + system_characteristics: SystemCharacteristics, sub_slicing_topology: str +) -> None: + if sub_slicing_topology not in SUB_SLICING_TOPOLOGIES: + xpk_print( + f'Error: --sub-slicing-topology={sub_slicing_topology} shape is' + f' invalid. It has to be one of: {", ".join(SUB_SLICING_TOPOLOGIES)}.' + ) + xpk_exit(1) + + if not is_topology_contained( + contained=sub_slicing_topology, container=system_characteristics.topology + ): + xpk_print( + f'Error: --sub-slicing-topology={sub_slicing_topology} shape is too' + ' large. The shape cannot be bigger than' + f' {system_characteristics.topology}.' + ) + xpk_exit(1) + + def get_restart_exit_codes(args) -> list: exit_codes = [42] exit_codes.extend(range(127, 256, 1)) diff --git a/src/xpk/commands/workload_test.py b/src/xpk/commands/workload_test.py new file mode 100644 index 000000000..94d2e8d49 --- /dev/null +++ b/src/xpk/commands/workload_test.py @@ -0,0 +1,62 @@ +""" +Copyright 2025 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import pytest +from ..core.system_characteristics import SystemCharacteristics +from .workload import _validate_sub_slicing_topology + + +SYSTEM_CHARACTERISTICS = SystemCharacteristics( + topology='8x8', + vms_per_slice=1, + gke_accelerator='nvidia-l4', + gce_machine_type='g2-standard-12', + chips_per_vm=1, + accelerator_type=1, + device_type='l4-1', + supports_sub_slicing=True, + requires_workload_policy=False, +) + + +@pytest.fixture(autouse=True) +def xpk_print(mocker): + return mocker.patch('xpk.commands.workload.xpk_print') + + +def test_validate_sub_slicing_topology_exits_for_unsupported_topology( + xpk_print, +): + with pytest.raises(SystemExit): + _validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '2x1') + + assert ( + 'shape is invalid. It has to be one of' in xpk_print.mock_calls[0].args[0] + ) + + +def test_validate_sub_slicing_topology_exits_for_too_large_topology(xpk_print): + with pytest.raises(SystemExit): + _validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '16x16') + + assert ( + 'shape is too large. The shape cannot be' + in xpk_print.mock_calls[0].args[0] + ) + + +def test_validate_sub_slicing_topology_does_nothing_for_supported_topology(): + _validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '4x4') diff --git a/src/xpk/utils/topology.py b/src/xpk/utils/topology.py index 3d9f6c474..f65d1722e 100644 --- a/src/xpk/utils/topology.py +++ b/src/xpk/utils/topology.py @@ -35,3 +35,12 @@ def parse_topology(topology: str) -> list[int]: raise ValueError("Topology is an empty string") return [int(el) for el in topology.lower().split("x")] + + +def is_topology_contained(contained: str, container: str) -> bool: + contained_parsed = parse_topology(contained) + container_parsed = parse_topology(container) + return len(contained_parsed) == len(container_parsed) and all( + contained <= container + for contained, container in zip(contained_parsed, container_parsed) + ) diff --git a/src/xpk/utils/topology_test.py b/src/xpk/utils/topology_test.py index 78d6764a6..885c4ba3a 100644 --- a/src/xpk/utils/topology_test.py +++ b/src/xpk/utils/topology_test.py @@ -15,7 +15,7 @@ """ import pytest -from .topology import is_topology_valid, get_topology_product, parse_topology +from .topology import is_topology_valid, get_topology_product, parse_topology, is_topology_contained def test_is_topology_valid_with_invalid_topology(): @@ -41,3 +41,23 @@ def test_parse_topology_with_empty_input(): def test_get_topology_product(): result = get_topology_product("1x2x3") assert result == 6 + + +def test_is_topology_contained_with_container_smaller_than_contained_returns_false(): + result = is_topology_contained(contained="3x3x3", container="2x2x2") + assert result is False + + +def test_is_topology_contained_with_container_larger_than_contained_returns_true(): + result = is_topology_contained(contained="1x1x1", container="2x2x2") + assert result is True + + +def test_is_topology_contained_with_container_equal_to_contained_returns_true(): + result = is_topology_contained(contained="2x2x2", container="2x2x2") + assert result is True + + +def test_is_topology_contained_with_different_topologies_dimensions_returns_false(): + result = is_topology_contained(contained="2x2", container="2x2x2") + assert result is False From c99ccfa7175e9c1cfb50bc2d78567e0de42311c4 Mon Sep 17 00:00:00 2001 From: Dominik Rabij Date: Thu, 23 Oct 2025 13:21:47 +0200 Subject: [PATCH 06/10] feat: Add sub-slicing system validation (#723) --- src/xpk/commands/cluster.py | 11 +++- src/xpk/commands/cluster_test.py | 92 ++++++++++++++++++++++++++++++++ src/xpk/commands/common.py | 6 +++ 3 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 src/xpk/commands/cluster_test.py diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index 5ff9ce277..ef2715061 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -76,7 +76,7 @@ from ..utils.execution_context import is_dry_run from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies from . import cluster_gcluster -from .common import set_cluster_command +from .common import set_cluster_command, validate_sub_slicing_system from jinja2 import Environment, FileSystemLoader from ..utils.templates import TEMPLATE_PATH import shutil @@ -201,6 +201,11 @@ def cluster_adapt(args) -> None: xpk_exit(0) +def _validate_cluster_create_args(args, system: SystemCharacteristics): + if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing: + validate_sub_slicing_system(system) + + def cluster_create(args) -> None: """Function around cluster creation. @@ -213,12 +218,14 @@ def cluster_create(args) -> None: SystemDependency.KJOB, SystemDependency.GCLOUD, ]) - system, return_code = get_system_characteristics(args) + system, return_code = get_system_characteristics(args) if return_code > 0 or system is None: xpk_print('Fetching system characteristics failed!') xpk_exit(return_code) + _validate_cluster_create_args(args, system) + xpk_print(f'Starting cluster create for cluster {args.cluster}:', flush=True) add_zone_and_project(args) diff --git a/src/xpk/commands/cluster_test.py b/src/xpk/commands/cluster_test.py new file mode 100644 index 000000000..77010d3d3 --- /dev/null +++ b/src/xpk/commands/cluster_test.py @@ -0,0 +1,92 @@ +""" +Copyright 2025 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from argparse import Namespace +from dataclasses import dataclass +from unittest.mock import MagicMock +import pytest + +from xpk.commands.cluster import _validate_cluster_create_args +from xpk.core.system_characteristics import SystemCharacteristics, UserFacingNameToSystemCharacteristics +from xpk.utils.feature_flags import FeatureFlags + + +@dataclass +class _Mocks: + common_print_mock: MagicMock + common_exit_mock: MagicMock + + +@pytest.fixture +def mock_common_print_and_exit(mocker): + common_print_mock = mocker.patch( + 'xpk.commands.common.xpk_print', + return_value=None, + ) + common_exit_mock = mocker.patch( + 'xpk.commands.common.xpk_exit', + return_value=None, + ) + return _Mocks( + common_print_mock=common_print_mock, common_exit_mock=common_exit_mock + ) + + +DEFAULT_TEST_SYSTEM: SystemCharacteristics = ( + UserFacingNameToSystemCharacteristics['l4-1'] +) +SUB_SLICING_SYSTEM: SystemCharacteristics = ( + UserFacingNameToSystemCharacteristics['v6e-4x4'] +) + + +def test_validate_cluster_create_args_for_correct_args_pass( + mock_common_print_and_exit: _Mocks, +): + args = Namespace() + + _validate_cluster_create_args(args, DEFAULT_TEST_SYSTEM) + + assert mock_common_print_and_exit.common_print_mock.call_count == 0 + assert mock_common_print_and_exit.common_exit_mock.call_count == 0 + + +def test_validate_cluster_create_args_for_correct_sub_slicing_args_pass( + mock_common_print_and_exit: _Mocks, +): + FeatureFlags.SUB_SLICING_ENABLED = True + args = Namespace(sub_slicing=True) + + _validate_cluster_create_args(args, SUB_SLICING_SYSTEM) + + assert mock_common_print_and_exit.common_print_mock.call_count == 0 + assert mock_common_print_and_exit.common_exit_mock.call_count == 0 + + +def test_validate_cluster_create_args_for_not_supported_system_throws( + mock_common_print_and_exit: _Mocks, +): + FeatureFlags.SUB_SLICING_ENABLED = True + args = Namespace(sub_slicing=True) + + _validate_cluster_create_args(args, DEFAULT_TEST_SYSTEM) + + assert mock_common_print_and_exit.common_print_mock.call_count == 1 + assert ( + mock_common_print_and_exit.common_print_mock.call_args[0][0] + == 'Error: l4-1 does not support Sub-slicing.' + ) + assert mock_common_print_and_exit.common_exit_mock.call_count == 1 diff --git a/src/xpk/commands/common.py b/src/xpk/commands/common.py index ca2c1be69..e4113936c 100644 --- a/src/xpk/commands/common.py +++ b/src/xpk/commands/common.py @@ -67,3 +67,9 @@ def is_TAS_possible( system_characteristics.device_type != H100_MEGA_DEVICE_TYPE or capacity_type == CapacityType.RESERVATION ) + + +def validate_sub_slicing_system(system: SystemCharacteristics): + if not system.supports_sub_slicing: + xpk_print(f'Error: {system.device_type} does not support Sub-slicing.') + xpk_exit(1) From e3aacebd9b324e474246c0c48e42419a8c9ea769 Mon Sep 17 00:00:00 2001 From: Dominik Rabij Date: Thu, 23 Oct 2025 15:28:31 +0200 Subject: [PATCH 07/10] feat: Add sub-slicing system validation in workload create (#727) * feat: Add sub-slicing system validation in workload create * make goldens --- goldens/Workload_create.txt | 1 - goldens/Workload_create_pathways.txt | 1 - src/xpk/commands/workload.py | 6 +++--- src/xpk/commands/workload_test.py | 19 +++++++++++++++++++ 4 files changed, 22 insertions(+), 5 deletions(-) diff --git a/goldens/Workload_create.txt b/goldens/Workload_create.txt index c2ca07090..98e4314ae 100644 --- a/goldens/Workload_create.txt +++ b/goldens/Workload_create.txt @@ -2,7 +2,6 @@ $ python3 xpk.py workload create --project=golden-project --zone=us-central1-a - [XPK] Starting xpk [XPK] Task: `Check if Workload Already Exists` is implemented by the following command not running since it is a dry run. kubectl get workloads -o=custom-columns='Jobset:.metadata.ownerReferences[0].name' -[XPK] Starting workload create [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true [XPK] Starting workload create diff --git a/goldens/Workload_create_pathways.txt b/goldens/Workload_create_pathways.txt index 773b344da..cba88585d 100644 --- a/goldens/Workload_create_pathways.txt +++ b/goldens/Workload_create_pathways.txt @@ -2,7 +2,6 @@ $ python3 xpk.py workload create-pathways --project=golden-project --zone=us-cen [XPK] Starting xpk [XPK] Task: `Check if Workload Already Exists` is implemented by the following command not running since it is a dry run. kubectl get workloads -o=custom-columns='Jobset:.metadata.ownerReferences[0].name' -[XPK] Starting workload create [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true [XPK] Starting workload create diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index 05021f3fa..add9d5d9e 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -99,7 +99,7 @@ from ..utils.execution_context import is_dry_run from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies from . import cluster_gcluster -from .common import is_TAS_possible +from .common import is_TAS_possible, validate_sub_slicing_system from ..utils.topology import is_topology_contained from ..utils.feature_flags import FeatureFlags @@ -334,9 +334,7 @@ def workload_create(args) -> None: ) xpk_exit(1) - xpk_print('Starting workload create', flush=True) system, return_code = get_system_characteristics(args) - if return_code > 0 or system is None: xpk_print('Fetching system characteristics failed!') xpk_exit(return_code) @@ -700,6 +698,8 @@ def _validate_sub_slicing_topology( ) xpk_exit(1) + validate_sub_slicing_system(system_characteristics) + def get_restart_exit_codes(args) -> list: exit_codes = [42] diff --git a/src/xpk/commands/workload_test.py b/src/xpk/commands/workload_test.py index 94d2e8d49..843e42d23 100644 --- a/src/xpk/commands/workload_test.py +++ b/src/xpk/commands/workload_test.py @@ -14,6 +14,8 @@ limitations under the License. """ +import dataclasses +from unittest.mock import MagicMock, patch import pytest from ..core.system_characteristics import SystemCharacteristics from .workload import _validate_sub_slicing_topology @@ -60,3 +62,20 @@ def test_validate_sub_slicing_topology_exits_for_too_large_topology(xpk_print): def test_validate_sub_slicing_topology_does_nothing_for_supported_topology(): _validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '4x4') + + +@patch('xpk.commands.common.xpk_print') +def test_validate_sub_slicing_topology_fails_for_unsupported_system( + common_xpk_print: MagicMock, +): + unsupported_system = dataclasses.replace( + SYSTEM_CHARACTERISTICS, supports_sub_slicing=False + ) + + with pytest.raises(SystemExit): + _validate_sub_slicing_topology(unsupported_system, '4x4') + + assert ( + 'l4-1 does not support Sub-slicing.' + in common_xpk_print.mock_calls[0].args[0] + ) From 49b4abc025c062a4f0b65233a76981b780262e06 Mon Sep 17 00:00:00 2001 From: FIoannides Date: Thu, 23 Oct 2025 15:36:38 +0200 Subject: [PATCH 08/10] Add packages and access templates instead of absolute path (#726) * Add packages and access templates instead of absolute path * Fix dev mode for goldens and local dev as well * textual change --- pyproject.toml | 7 +++++-- src/xpk/commands/cluster.py | 6 ++++-- src/xpk/core/kueue_manager.py | 9 +++++++-- src/xpk/utils/templates.py | 15 ++++++++++++++- 4 files changed, 30 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 495086681..464c64ee6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,9 +75,12 @@ dev = [ version = {attr = "xpk.core.config.__version__"} [tool.setuptools] -packages = ["xpk", "xpk.parser", "xpk.core", "xpk.commands", "xpk.api", "xpk.templates", "xpk.utils", "xpk.core.blueprint", "xpk.core.remote_state", "xpk.core.workload_decorators"] package-dir = {"" = "src"} -package-data = {"xpk.api" = ["storage_crd.yaml"], "xpk.templates" = ["storage.yaml"]} +packages = { find = { where = ["src"] } } + +[tool.setuptools.package-data] +"xpk" = ["templates/*"] +"xpk.api" = ["*.yaml"] [tool.pyink] # Formatting configuration to follow Google style-guide. diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index ef2715061..f1a02bda0 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -78,7 +78,7 @@ from . import cluster_gcluster from .common import set_cluster_command, validate_sub_slicing_system from jinja2 import Environment, FileSystemLoader -from ..utils.templates import TEMPLATE_PATH +from ..utils.templates import get_templates_absolute_path import shutil import os @@ -434,7 +434,9 @@ def cluster_cacheimage(args) -> None: system.accelerator_type ].accelerator_label - template_env = Environment(loader=FileSystemLoader(TEMPLATE_PATH)) + template_env = Environment( + loader=FileSystemLoader(searchpath=get_templates_absolute_path()) + ) cluster_preheat_yaml = template_env.get_template(CLUSTER_PREHEAT_JINJA_FILE) rendered_yaml = cluster_preheat_yaml.render( cachekey=args.cache_key, diff --git a/src/xpk/core/kueue_manager.py b/src/xpk/core/kueue_manager.py index 582e27519..921350db6 100644 --- a/src/xpk/core/kueue_manager.py +++ b/src/xpk/core/kueue_manager.py @@ -39,7 +39,7 @@ ) from ..utils.file import write_tmp_file from ..utils.console import xpk_print, xpk_exit -from ..utils.templates import TEMPLATE_PATH +from ..utils.templates import TEMPLATE_PATH, get_templates_absolute_path WAIT_FOR_KUEUE_TIMEOUT = "10m" CLUSTER_QUEUE_NAME = "cluster-queue" @@ -82,7 +82,12 @@ def __init__( template_path=TEMPLATE_PATH, ): self.kueue_version = kueue_version - self.template_env = Environment(loader=FileSystemLoader(template_path)) + + self.template_env = Environment( + loader=FileSystemLoader( + searchpath=get_templates_absolute_path(template_path) + ) + ) def install_or_upgrade( self, diff --git a/src/xpk/utils/templates.py b/src/xpk/utils/templates.py index 11e9cba24..eca9d2063 100644 --- a/src/xpk/utils/templates.py +++ b/src/xpk/utils/templates.py @@ -18,7 +18,7 @@ import ruamel.yaml -TEMPLATE_PATH = "src/xpk/templates/" +TEMPLATE_PATH = "templates" yaml = ruamel.yaml.YAML() @@ -28,3 +28,16 @@ def load(path: str) -> dict: with open(template_path, "r", encoding="utf-8") as file: data: dict = yaml.load(file) return data + + +def get_templates_absolute_path(templates_path: str = TEMPLATE_PATH) -> str: + """ + Return the absolute path to the templates folder + + Args: + templates_path: The path to the templates folder relative to the src/xpk directory + """ + current_file_path = os.path.abspath(__file__) + current_dir = os.path.dirname(current_file_path) + xpk_package_dir = os.path.dirname(current_dir) + return os.path.join(xpk_package_dir, templates_path) From f920ee22a4b248b8606b6325340047d74806e3dc Mon Sep 17 00:00:00 2001 From: Konrad Kaim <31181410+scaliby@users.noreply.github.com> Date: Thu, 23 Oct 2025 15:47:49 +0200 Subject: [PATCH 09/10] Expose get_installed_kueue_version (#728) feat: expose get_installed_kueue_version --- src/xpk/core/kueue_manager.py | 4 ++-- src/xpk/core/kueue_manager_test.py | 26 +++++++++----------------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/src/xpk/core/kueue_manager.py b/src/xpk/core/kueue_manager.py index 921350db6..f1d6ddc1b 100644 --- a/src/xpk/core/kueue_manager.py +++ b/src/xpk/core/kueue_manager.py @@ -101,7 +101,7 @@ def install_or_upgrade( Args: tolerations: An optional list of tolerations to apply to the kueue-controller-manager. """ - return_code, installed_version = self.__get_installed_kueue_version() + return_code, installed_version = self.get_installed_kueue_version() if return_code == 0: if installed_version and installed_version > self.kueue_version: @@ -121,7 +121,7 @@ def install_or_upgrade( return self.__configure(kueue_config) - def __get_installed_kueue_version(self) -> tuple[int, str | None]: + def get_installed_kueue_version(self) -> tuple[int, str | None]: command = ( "kubectl get deployment kueue-controller-manager -n kueue-system -o" " jsonpath='{.spec.template.spec.containers[0].image}'" diff --git a/src/xpk/core/kueue_manager_test.py b/src/xpk/core/kueue_manager_test.py index 7a793aedc..7f340eacd 100644 --- a/src/xpk/core/kueue_manager_test.py +++ b/src/xpk/core/kueue_manager_test.py @@ -76,9 +76,7 @@ def test_version_check_when_kueue_not_installed(self, mock_run_for_value): mock_install.assert_called_once() mock_configure.assert_called_once() - @patch( - "xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version" - ) + @patch("xpk.core.kueue_manager.KueueManager.get_installed_kueue_version") @patch("xpk.core.kueue_manager.KueueManager._KueueManager__install") @patch("xpk.core.kueue_manager.KueueManager._KueueManager__configure") def test_install_or_upgrade_when_newer_version_already_installed( @@ -95,9 +93,7 @@ def test_install_or_upgrade_when_newer_version_already_installed( mock_install.assert_not_called() mock_configure.assert_not_called() - @patch( - "xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version" - ) + @patch("xpk.core.kueue_manager.KueueManager.get_installed_kueue_version") def test_install_or_upgrade_when_outdated( self, mock_get_version, @@ -121,9 +117,7 @@ def test_install_or_upgrade_when_outdated( mock_install.assert_called_once() mock_configure.assert_called_once() - @patch( - "xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version" - ) + @patch("xpk.core.kueue_manager.KueueManager.get_installed_kueue_version") def test_install_or_upgrade_when_not_installed( self, mock_get_version, @@ -155,7 +149,7 @@ def test_installation_with_tolerations(self): return_value=0, ) as mock_run_retry, patch( - "xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version", + "xpk.core.kueue_manager.KueueManager.get_installed_kueue_version", return_value=(1, None), ), patch( @@ -199,7 +193,7 @@ def test_installation_without_tolerations(self): return_value=0, ) as mock_run_retry, patch( - "xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version", + "xpk.core.kueue_manager.KueueManager.get_installed_kueue_version", return_value=(1, None), ), patch( @@ -224,9 +218,7 @@ def test_installation_without_tolerations(self): self.assertEqual(result, 0) self.assertEqual(mock_run_retry.call_count, 0) - @patch( - "xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version" - ) + @patch("xpk.core.kueue_manager.KueueManager.get_installed_kueue_version") @patch("xpk.core.kueue_manager.KueueManager._KueueManager__apply_manifest") def test_configuration_updates_resources( self, mock_apply_manifest, mock_get_version @@ -276,7 +268,7 @@ def test_resource_update_for_small_cluster(self, mock_run_retry): ), patch.object( self.kueue_manager, - "_KueueManager__get_installed_kueue_version", + "get_installed_kueue_version", return_value=(1, None), ), patch.object( @@ -319,7 +311,7 @@ def test_resource_update_for_large_cluster(self, mock_run_retry): ), patch.object( self.kueue_manager, - "_KueueManager__get_installed_kueue_version", + "get_installed_kueue_version", return_value=(1, None), ), patch.object( @@ -553,7 +545,7 @@ def _trigger_installation(self, kueue_config: KueueConfig) -> str: """Calls Kueue installation and returns the rendered manifest.""" with ( patch.object( - self.kueue_manager, "_KueueManager__get_installed_kueue_version" + self.kueue_manager, "get_installed_kueue_version" ) as mock_get_version, patch.object( self.kueue_manager, "_KueueManager__apply_manifest" From 8ebe1fee96b7e7d51de74c4afc9036543c3fb041 Mon Sep 17 00:00:00 2001 From: Feidias Ioannidis Date: Thu, 23 Oct 2025 14:05:21 +0000 Subject: [PATCH 10/10] Release v0.14.1 --- src/xpk/core/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpk/core/config.py b/src/xpk/core/config.py index 9873608e9..26fc14bba 100644 --- a/src/xpk/core/config.py +++ b/src/xpk/core/config.py @@ -22,7 +22,7 @@ from ..utils.console import xpk_print # This is the version for XPK PyPI package -__version__ = 'v0.14.0' +__version__ = 'v0.14.1' XPK_CURRENT_VERSION = __version__ XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')