From f963a902823e5c554bf4e11037013642e1e6fca6 Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Tue, 21 Oct 2025 15:11:18 +0000 Subject: [PATCH 1/4] feat: subslicing workload annotations --- src/xpk/commands/workload.py | 9 +++++++++ src/xpk/core/scheduling.py | 18 ++++++++++++++++++ src/xpk/core/scheduling_test.py | 31 +++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+) create mode 100644 src/xpk/core/scheduling_test.py diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index 76ee8cc82..6df5b2e4c 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -62,6 +62,7 @@ create_tpu_topology, get_cpu_affinity, get_gpu_scheduler, + create_sub_slicing_annotations, ) from ..core.storage import ( GCE_PD_TYPE, @@ -128,6 +129,7 @@ xpk.google.com/workload: {args.workload} annotations: {storage_annotations} + {subslicing_annotations} spec: schedulerName: {args.scheduler} imagePullSecrets: @@ -561,6 +563,13 @@ def workload_create(args) -> None: accelerator_label=create_accelerator_label( system.accelerator_type, system ), + subslicing_annotations=( + '' + if args.sub_slicing_topology is None + else ('\n' + (' ' * 16)).join( + create_sub_slicing_annotations(args.sub_slicing_topology) + ) + ), machine_label=create_machine_label(system.accelerator_type, system), local_queue_name=LOCAL_QUEUE_NAME, autoprovisioning_args=autoprovisioning_args, diff --git a/src/xpk/core/scheduling.py b/src/xpk/core/scheduling.py index 3032eb086..0fde27172 100644 --- a/src/xpk/core/scheduling.py +++ b/src/xpk/core/scheduling.py @@ -291,3 +291,21 @@ def create_tpu_topology( ): return f'{system.topology}' return '' + + +def create_sub_slicing_annotations(subslicing_topology: str) -> list[str]: + """Generates subslicing annotations. + + Args: + subslicing_topology: subslice topology. + + Returns: + Annotations to be rendered in deployment yaml. + """ + return [ + ( + 'kueue.x-k8s.io/podset-required-topology:' + f' "google.com/gke-tpu-slice-{subslicing_topology}-id"' + ), + f'cloud.google.com/gke-tpu-slice-topology: {subslicing_topology}', + ] diff --git a/src/xpk/core/scheduling_test.py b/src/xpk/core/scheduling_test.py new file mode 100644 index 000000000..ee7aea43f --- /dev/null +++ b/src/xpk/core/scheduling_test.py @@ -0,0 +1,31 @@ +""" +Copyright 2025 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from .scheduling import create_sub_slicing_annotations + + +def test_create_sub_slicing_annotations_returns_valid_annotations(): + subslicing_topology = '2x2' + + result = create_sub_slicing_annotations(subslicing_topology) + + assert result == [ + ( + f'kueue.x-k8s.io/podset-required-topology:' + f' "google.com/gke-tpu-slice-2x2-id"' + ), + f'cloud.google.com/gke-tpu-slice-topology: 2x2', + ] From ec57ade40002fdff608095dccbba9d66dfe26bc3 Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Tue, 21 Oct 2025 15:17:38 +0000 Subject: [PATCH 2/4] fix: update goldens --- goldens/Workload_create.txt | 2 +- src/xpk/commands/workload.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/goldens/Workload_create.txt b/goldens/Workload_create.txt index f54bc1e08..c99ffd136 100644 --- a/goldens/Workload_create.txt +++ b/goldens/Workload_create.txt @@ -25,7 +25,7 @@ docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. docker push gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. -kubectl apply -f 635bfd38f34d48a6cc3863a2a2b00acfabe36ea1b6737e0cc816467a41fca144 +kubectl apply -f 492d3bb4e1055d9d47679ed9c3ba617c304f47bac9b83fea3c14507b04a65453 [XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error [XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard. diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index 6df5b2e4c..098e0afd0 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -99,6 +99,7 @@ from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies from . import cluster_gcluster from .common import is_TAS_possible +from ..utils.feature_flags import FeatureFlags WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2 kind: JobSet @@ -565,7 +566,8 @@ def workload_create(args) -> None: ), subslicing_annotations=( '' - if args.sub_slicing_topology is None + if not FeatureFlags.SUB_SLICING_ENABLED + or args.sub_slicing_topology is None else ('\n' + (' ' * 16)).join( create_sub_slicing_annotations(args.sub_slicing_topology) ) From 64ccb633ce8ccdd7eb48427cb9f4babe376c963c Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Tue, 21 Oct 2025 15:28:20 +0000 Subject: [PATCH 3/4] style: lint --- src/xpk/core/scheduling_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/xpk/core/scheduling_test.py b/src/xpk/core/scheduling_test.py index ee7aea43f..5f7663067 100644 --- a/src/xpk/core/scheduling_test.py +++ b/src/xpk/core/scheduling_test.py @@ -24,8 +24,8 @@ def test_create_sub_slicing_annotations_returns_valid_annotations(): assert result == [ ( - f'kueue.x-k8s.io/podset-required-topology:' - f' "google.com/gke-tpu-slice-2x2-id"' + 'kueue.x-k8s.io/podset-required-topology:' + ' "google.com/gke-tpu-slice-2x2-id"' ), - f'cloud.google.com/gke-tpu-slice-topology: 2x2', + 'cloud.google.com/gke-tpu-slice-topology: 2x2', ] From 1a934d648168b68d034d12298cd2c0ff746890a3 Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Wed, 22 Oct 2025 08:34:06 +0000 Subject: [PATCH 4/4] style: typo --- src/xpk/commands/workload.py | 4 ++-- src/xpk/core/scheduling.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index 098e0afd0..9b8329241 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -130,7 +130,7 @@ xpk.google.com/workload: {args.workload} annotations: {storage_annotations} - {subslicing_annotations} + {sub_slicing_annotations} spec: schedulerName: {args.scheduler} imagePullSecrets: @@ -564,7 +564,7 @@ def workload_create(args) -> None: accelerator_label=create_accelerator_label( system.accelerator_type, system ), - subslicing_annotations=( + sub_slicing_annotations=( '' if not FeatureFlags.SUB_SLICING_ENABLED or args.sub_slicing_topology is None diff --git a/src/xpk/core/scheduling.py b/src/xpk/core/scheduling.py index 0fde27172..ae11cff80 100644 --- a/src/xpk/core/scheduling.py +++ b/src/xpk/core/scheduling.py @@ -293,11 +293,11 @@ def create_tpu_topology( return '' -def create_sub_slicing_annotations(subslicing_topology: str) -> list[str]: +def create_sub_slicing_annotations(sub_slicing_topology: str) -> list[str]: """Generates subslicing annotations. Args: - subslicing_topology: subslice topology. + sub_slicing_topology: subslice topology. Returns: Annotations to be rendered in deployment yaml. @@ -305,7 +305,7 @@ def create_sub_slicing_annotations(subslicing_topology: str) -> list[str]: return [ ( 'kueue.x-k8s.io/podset-required-topology:' - f' "google.com/gke-tpu-slice-{subslicing_topology}-id"' + f' "google.com/gke-tpu-slice-{sub_slicing_topology}-id"' ), - f'cloud.google.com/gke-tpu-slice-topology: {subslicing_topology}', + f'cloud.google.com/gke-tpu-slice-topology: {sub_slicing_topology}', ]