Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion goldens/Workload_create.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current
[XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run.
docker push gcr.io/golden-project/dry-run-runner:prefix-current
[XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run.
kubectl apply -f 635bfd38f34d48a6cc3863a2a2b00acfabe36ea1b6737e0cc816467a41fca144
kubectl apply -f 492d3bb4e1055d9d47679ed9c3ba617c304f47bac9b83fea3c14507b04a65453
[XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run.
gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error
[XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard.
Expand Down
11 changes: 11 additions & 0 deletions src/xpk/commands/workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
create_tpu_topology,
get_cpu_affinity,
get_gpu_scheduler,
create_sub_slicing_annotations,
)
from ..core.storage import (
GCE_PD_TYPE,
Expand Down Expand Up @@ -98,6 +99,7 @@
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
from . import cluster_gcluster
from .common import is_TAS_possible
from ..utils.feature_flags import FeatureFlags

WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
kind: JobSet
Expand Down Expand Up @@ -128,6 +130,7 @@
xpk.google.com/workload: {args.workload}
annotations:
{storage_annotations}
{sub_slicing_annotations}
spec:
schedulerName: {args.scheduler}
imagePullSecrets:
Expand Down Expand Up @@ -561,6 +564,14 @@ def workload_create(args) -> None:
accelerator_label=create_accelerator_label(
system.accelerator_type, system
),
sub_slicing_annotations=(
''
if not FeatureFlags.SUB_SLICING_ENABLED
or args.sub_slicing_topology is None
else ('\n' + (' ' * 16)).join(
create_sub_slicing_annotations(args.sub_slicing_topology)
)
),
machine_label=create_machine_label(system.accelerator_type, system),
local_queue_name=LOCAL_QUEUE_NAME,
autoprovisioning_args=autoprovisioning_args,
Expand Down
18 changes: 18 additions & 0 deletions src/xpk/core/scheduling.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,3 +291,21 @@ def create_tpu_topology(
):
return f'{system.topology}'
return ''


def create_sub_slicing_annotations(sub_slicing_topology: str) -> list[str]:
  """Generates the annotations that request a sub-slice of a given topology.

  The topology string is interpolated verbatim into both annotations; no
  validation or normalisation is performed here.

  Args:
    sub_slicing_topology: sub-slice topology, e.g. '2x2'.

  Returns:
    A two-element list of `key: value` annotation lines to be rendered in the
    deployment yaml, in this order:
      1. `kueue.x-k8s.io/podset-required-topology`, whose value is the
         double-quoted label name `google.com/gke-tpu-slice-<topology>-id`.
      2. `cloud.google.com/gke-tpu-slice-topology`, whose value is the
         topology itself (unquoted).
    The lines carry no leading indentation; the caller is expected to indent
    them when splicing them into a template.
  """
  # The value is wrapped in double quotes inside the emitted line.
  required_topology_annotation = (
      'kueue.x-k8s.io/podset-required-topology:'
      f' "google.com/gke-tpu-slice-{sub_slicing_topology}-id"'
  )
  # The topology value is emitted bare (no surrounding quotes).
  slice_topology_annotation = (
      f'cloud.google.com/gke-tpu-slice-topology: {sub_slicing_topology}'
  )
  return [required_topology_annotation, slice_topology_annotation]
31 changes: 31 additions & 0 deletions src/xpk/core/scheduling_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""
Copyright 2025 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from .scheduling import create_sub_slicing_annotations


def test_create_sub_slicing_annotations_returns_valid_annotations():
  """Checks the exact annotation lines produced for more than one topology.

  Using two different topologies verifies that the argument is actually
  interpolated into both annotations rather than a fixed value being returned.
  """
  expected_by_topology = {
      '2x2': [
          (
              'kueue.x-k8s.io/podset-required-topology:'
              ' "google.com/gke-tpu-slice-2x2-id"'
          ),
          'cloud.google.com/gke-tpu-slice-topology: 2x2',
      ],
      '4x4': [
          (
              'kueue.x-k8s.io/podset-required-topology:'
              ' "google.com/gke-tpu-slice-4x4-id"'
          ),
          'cloud.google.com/gke-tpu-slice-topology: 4x4',
      ],
  }

  for subslicing_topology, expected in expected_by_topology.items():
    result = create_sub_slicing_annotations(subslicing_topology)

    assert result == expected
Loading