From 8d02f7ed47576f9c4b369ea4178177ef040d3ecd Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Wed, 22 Oct 2025 11:30:43 +0000 Subject: [PATCH 1/3] feat: dynamic vms_per_slice computation for subslicing --- src/xpk/commands/workload.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index 68241f05d..bfce2b43a 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -78,6 +78,7 @@ from ..core.system_characteristics import ( AcceleratorType, get_system_characteristics, + compute_vms_per_slice, ) from ..core.vertex import create_vertex_experiment from ..core.workload import ( @@ -120,8 +121,8 @@ replicas: {args.num_slices} template: spec: - parallelism: {system.vms_per_slice} # Equal to the number of VMs per slice - completions: {system.vms_per_slice} # Same as the above. + parallelism: {vms_per_slice} # Equal to the number of VMs per slice + completions: {vms_per_slice} # Same as the above. backoffLimit: 0 # When any pod fails, the job is failed {pod_failure_policy} template: @@ -560,6 +561,13 @@ def workload_create(args) -> None: args=args, system=system, container=container, + vms_per_slice=( + compute_vms_per_slice(args.sub_slicing_topology) + if system.accelerator_type == AcceleratorType['TPU'] + and FeatureFlags.SUB_SLICING_ENABLED + and args.sub_slicing_topology is not None + else system.vms_per_slice + ), affinity=get_cpu_affinity(system.accelerator_type), accelerator_label=create_accelerator_label( system.accelerator_type, system From ced6074b13f364a3511f7daa8b855317aaeb109a Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Wed, 22 Oct 2025 12:45:19 +0000 Subject: [PATCH 2/3] style: apply peer review feedback --- src/xpk/commands/workload.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index bfce2b43a..41aa35547 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -121,7 +121,7 @@ replicas: {args.num_slices} template: spec: - parallelism: {vms_per_slice} # Equal to the number of VMs per slice + parallelism: {vms_per_slice} # Equal to the number of VMs per slice (or sub-slice). completions: {vms_per_slice} # Same as the above. backoffLimit: 0 # When any pod fails, the job is failed {pod_failure_policy} @@ -559,7 +559,6 @@ def workload_create(args) -> None: ) yml_string = WORKLOAD_CREATE_YAML.format( args=args, - system=system, container=container, vms_per_slice=( compute_vms_per_slice(args.sub_slicing_topology) From 3c7f37fa40e3f000261c46237ec61bf2f23cee4c Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Wed, 22 Oct 2025 12:58:14 +0000 Subject: [PATCH 3/3] update goldens --- goldens/Workload_create.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goldens/Workload_create.txt b/goldens/Workload_create.txt index c99ffd136..c2ca07090 100644 --- a/goldens/Workload_create.txt +++ b/goldens/Workload_create.txt @@ -25,7 +25,7 @@ docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. docker push gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. -kubectl apply -f 492d3bb4e1055d9d47679ed9c3ba617c304f47bac9b83fea3c14507b04a65453 +kubectl apply -f abc33690f7a11b2ba50a8f949970fd3ba812f088367b7f64260729f01f41a231 [XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error [XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard.