Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion goldens/Workload_create.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current
[XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run.
docker push gcr.io/golden-project/dry-run-runner:prefix-current
[XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run.
- kubectl apply -f 492d3bb4e1055d9d47679ed9c3ba617c304f47bac9b83fea3c14507b04a65453
+ kubectl apply -f abc33690f7a11b2ba50a8f949970fd3ba812f088367b7f64260729f01f41a231
[XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run.
gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error
[XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard.
Expand Down
13 changes: 10 additions & 3 deletions src/xpk/commands/workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
from ..core.system_characteristics import (
AcceleratorType,
get_system_characteristics,
+ compute_vms_per_slice,
)
from ..core.vertex import create_vertex_experiment
from ..core.workload import (
Expand Down Expand Up @@ -120,8 +121,8 @@
replicas: {args.num_slices}
template:
spec:
- parallelism: {system.vms_per_slice} # Equal to the number of VMs per slice
- completions: {system.vms_per_slice} # Same as the above.
+ parallelism: {vms_per_slice} # Equal to the number of VMs per slice (or sub-slice).
+ completions: {vms_per_slice} # Same as the above.
backoffLimit: 0 # When any pod fails, the job is failed
{pod_failure_policy}
template:
Expand Down Expand Up @@ -558,8 +559,14 @@ def workload_create(args) -> None:
)
yml_string = WORKLOAD_CREATE_YAML.format(
args=args,
- system=system,
container=container,
+ vms_per_slice=(
+ compute_vms_per_slice(args.sub_slicing_topology)
+ if system.accelerator_type == AcceleratorType['TPU']
+ and FeatureFlags.SUB_SLICING_ENABLED
+ and args.sub_slicing_topology is not None
+ else system.vms_per_slice
+ ),
affinity=get_cpu_affinity(system.accelerator_type),
accelerator_label=create_accelerator_label(
system.accelerator_type, system
Expand Down
Loading