Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions goldens/Basic_cluster_create.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,16 +81,15 @@ kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-s
[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run.
kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m
[XPK] Applying following Kueue resources:
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
name: "1xtpu7x-8"
spec:
nodeLabels: {"cloud.google.com/gke-tpu-accelerator": "tpu7x", "cloud.google.com/gke-tpu-topology": "2x2x1"}


---

apiVersion: kueue.x-k8s.io/v1beta1
kind: AdmissionCheck
metadata:
Expand Down Expand Up @@ -175,7 +174,7 @@ value: 1000
globalDefault: false
description: "Very High"
[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run.
kubectl apply -f a1fe8e014a200d6489b8871301a9e80de7e6f45e94b61ad0e60f40f254711bec
kubectl apply -f ce52d2868b681f478f3f12e5696b1609e68b442a32f7f82603ba7064b825cf4f
[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
kubectl get node --no-headers | wc -l
[XPK] Try 1: Updating Kueue Controller Manager resources
Expand Down
7 changes: 3 additions & 4 deletions goldens/Cluster_create_private.txt
Original file line number Diff line number Diff line change
Expand Up @@ -86,25 +86,24 @@ kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-s
[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run.
kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m
[XPK] Applying following Kueue resources:
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
name: "1xv5p-8"
spec:
nodeLabels: {"cloud.google.com/gke-tpu-accelerator": "tpu-v5p-slice", "cloud.google.com/gke-tpu-topology": "2x2x1"}


---

apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
name: "cpu-user"
spec:
nodeLabels: {"cloud.google.com/gke-nodepool": "cpu-np"}


---

apiVersion: kueue.x-k8s.io/v1beta1
kind: AdmissionCheck
metadata:
Expand Down Expand Up @@ -189,7 +188,7 @@ value: 1000
globalDefault: false
description: "Very High"
[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run.
kubectl apply -f 0ff2bce892606d1497f21fc7b2cea78a4ee103094ce0f509211f3f9730536ad6
kubectl apply -f 1838a525f73d2cfd19aa616665e8b33fcc4ea10ba8d6015a9307109a6be6d372
[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
kubectl get node --no-headers | wc -l
[XPK] Try 1: Updating Kueue Controller Manager resources
Expand Down
5 changes: 2 additions & 3 deletions goldens/Cluster_create_with_gb200-4.txt
Original file line number Diff line number Diff line change
Expand Up @@ -85,16 +85,15 @@ kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-s
[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run.
kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m
[XPK] Applying following Kueue resources:
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
name: "1xgb200-4"
spec:
nodeLabels: {"cloud.google.com/gke-accelerator": "nvidia-gb200"}


---

apiVersion: kueue.x-k8s.io/v1beta1
kind: AdmissionCheck
metadata:
Expand Down Expand Up @@ -179,7 +178,7 @@ value: 1000
globalDefault: false
description: "Very High"
[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run.
kubectl apply -f f807069b73747a423ec0d1915b2e919cfde400b01654de15746b566709b80f7e
kubectl apply -f 5c5d70f8d2bbedea9acccd9c1a153e2f55efd31cc61d2b55ecdd4a8f009fab11
[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
kubectl get node --no-headers | wc -l
[XPK] Try 1: Updating Kueue Controller Manager resources
Expand Down
5 changes: 2 additions & 3 deletions goldens/NAP_cluster-create.txt
Original file line number Diff line number Diff line change
Expand Up @@ -92,16 +92,15 @@ kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-s
[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run.
kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m
[XPK] Applying following Kueue resources:
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
name: "1xtpu7x-8"
spec:
nodeLabels: {"cloud.google.com/gke-tpu-accelerator": "tpu7x"}


---

apiVersion: kueue.x-k8s.io/v1beta1
kind: AdmissionCheck
metadata:
Expand Down Expand Up @@ -186,7 +185,7 @@ value: 1000
globalDefault: false
description: "Very High"
[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run.
kubectl apply -f e5fccf0957dcb7f60400bb4e28ce8c5fc251a9aeb6d67793dc119554d13dc900
kubectl apply -f 40a7aac5b047c750ee98477984af3d46acd60d164d852eccd1b47a21c4155f2d
[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
kubectl get node --no-headers | wc -l
[XPK] Try 1: Updating Kueue Controller Manager resources
Expand Down
7 changes: 3 additions & 4 deletions goldens/NAP_cluster-create_with_pathways.txt
Original file line number Diff line number Diff line change
Expand Up @@ -93,25 +93,24 @@ kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-s
[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run.
kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m
[XPK] Applying following Kueue resources:
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
name: "1xtpu7x-8"
spec:
nodeLabels: {"cloud.google.com/gke-tpu-accelerator": "tpu7x"}


---

apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
name: "cpu-user"
spec:
nodeLabels: {"cloud.google.com/gke-nodepool": "cpu-np"}


---

apiVersion: kueue.x-k8s.io/v1beta1
kind: AdmissionCheck
metadata:
Expand Down Expand Up @@ -196,7 +195,7 @@ value: 1000
globalDefault: false
description: "Very High"
[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run.
kubectl apply -f edda4d2ffefedaabd6f77eb6d07a05c1bac829ed90a4c3983b21ded280136557
kubectl apply -f f89effb1f55aef327018037d75f743b5c62d59f1f62fddadaaa31f72e5e07bdf
[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
kubectl get node --no-headers | wc -l
[XPK] Try 1: Updating Kueue Controller Manager resources
Expand Down
3 changes: 1 addition & 2 deletions goldens/Workload_create.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ $ python3 xpk.py workload create --project=golden-project --zone=us-central1-a -
[XPK] Starting xpk
[XPK] Task: `Check if Workload Already Exists` is implemented by the following command not running since it is a dry run.
kubectl get workloads -o=custom-columns='Jobset:.metadata.ownerReferences[0].name'
[XPK] Starting workload create
[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run.
kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true
[XPK] Starting workload create
Expand All @@ -25,7 +24,7 @@ docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current
[XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run.
docker push gcr.io/golden-project/dry-run-runner:prefix-current
[XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run.
kubectl apply -f 492d3bb4e1055d9d47679ed9c3ba617c304f47bac9b83fea3c14507b04a65453
kubectl apply -f abc33690f7a11b2ba50a8f949970fd3ba812f088367b7f64260729f01f41a231
[XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run.
gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error
[XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard.
Expand Down
1 change: 0 additions & 1 deletion goldens/Workload_create_pathways.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ $ python3 xpk.py workload create-pathways --project=golden-project --zone=us-cen
[XPK] Starting xpk
[XPK] Task: `Check if Workload Already Exists` is implemented by the following command not running since it is a dry run.
kubectl get workloads -o=custom-columns='Jobset:.metadata.ownerReferences[0].name'
[XPK] Starting workload create
[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run.
kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true
[XPK] Starting workload create
Expand Down
7 changes: 5 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,12 @@ dev = [
version = {attr = "xpk.core.config.__version__"}

[tool.setuptools]
packages = ["xpk", "xpk.parser", "xpk.core", "xpk.commands", "xpk.api", "xpk.templates", "xpk.utils", "xpk.core.blueprint", "xpk.core.remote_state", "xpk.core.workload_decorators"]
package-dir = {"" = "src"}
package-data = {"xpk.api" = ["storage_crd.yaml"], "xpk.templates" = ["storage.yaml"]}
packages = { find = { where = ["src"] } }

[tool.setuptools.package-data]
"xpk" = ["templates/*"]
"xpk.api" = ["*.yaml"]

[tool.pyink]
# Formatting configuration to follow Google style-guide.
Expand Down
21 changes: 17 additions & 4 deletions src/xpk/commands/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from tabulate import tabulate

from ..utils.feature_flags import FeatureFlags
from ..core.capacity import H100_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE
from ..core.cluster import (
get_all_clusters_programmatic,
Expand Down Expand Up @@ -75,9 +76,9 @@
from ..utils.execution_context import is_dry_run
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
from . import cluster_gcluster
from .common import set_cluster_command
from .common import set_cluster_command, validate_sub_slicing_system
from jinja2 import Environment, FileSystemLoader
from ..utils.templates import TEMPLATE_PATH
from ..utils.templates import get_templates_absolute_path
import shutil
import os

Expand Down Expand Up @@ -200,6 +201,11 @@ def cluster_adapt(args) -> None:
xpk_exit(0)


def _validate_cluster_create_args(args, system: SystemCharacteristics):
if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing:
validate_sub_slicing_system(system)


def cluster_create(args) -> None:
"""Function around cluster creation.

Expand All @@ -212,12 +218,14 @@ def cluster_create(args) -> None:
SystemDependency.KJOB,
SystemDependency.GCLOUD,
])
system, return_code = get_system_characteristics(args)

system, return_code = get_system_characteristics(args)
if return_code > 0 or system is None:
xpk_print('Fetching system characteristics failed!')
xpk_exit(return_code)

_validate_cluster_create_args(args, system)

xpk_print(f'Starting cluster create for cluster {args.cluster}:', flush=True)
add_zone_and_project(args)

Expand Down Expand Up @@ -426,7 +434,9 @@ def cluster_cacheimage(args) -> None:
system.accelerator_type
].accelerator_label

template_env = Environment(loader=FileSystemLoader(TEMPLATE_PATH))
template_env = Environment(
loader=FileSystemLoader(searchpath=get_templates_absolute_path())
)
cluster_preheat_yaml = template_env.get_template(CLUSTER_PREHEAT_JINJA_FILE)
rendered_yaml = cluster_preheat_yaml.render(
cachekey=args.cache_key,
Expand Down Expand Up @@ -1251,6 +1261,9 @@ def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
memory_limit=args.memory_limit,
cpu_limit=args.cpu_limit,
is_pathways_cluster=args.enable_pathways,
configure_sub_slicing=(
FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing
),
),
)

Expand Down
4 changes: 4 additions & 0 deletions src/xpk/commands/cluster_gcluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import os

from ..utils.feature_flags import FeatureFlags
from ..utils.execution_context import is_dry_run
from ..core.kueue_manager import KueueConfig, KueueManager
from ..core.nap import enable_autoprovisioning_on_cluster
Expand Down Expand Up @@ -159,6 +160,9 @@ def __install_kueue(args) -> int:
cpu_limit=args.cpu_limit,
is_pathways_cluster=args.enable_pathways,
flex=args.flex,
configure_sub_slicing=(
FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing
),
),
tolerations=tolerations,
)
Expand Down
92 changes: 92 additions & 0 deletions src/xpk/commands/cluster_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""
Copyright 2025 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from argparse import Namespace
from dataclasses import dataclass
from unittest.mock import MagicMock
import pytest

from xpk.commands.cluster import _validate_cluster_create_args
from xpk.core.system_characteristics import SystemCharacteristics, UserFacingNameToSystemCharacteristics
from xpk.utils.feature_flags import FeatureFlags


@dataclass
class _Mocks:
common_print_mock: MagicMock
common_exit_mock: MagicMock


@pytest.fixture
def mock_common_print_and_exit(mocker):
common_print_mock = mocker.patch(
'xpk.commands.common.xpk_print',
return_value=None,
)
common_exit_mock = mocker.patch(
'xpk.commands.common.xpk_exit',
return_value=None,
)
return _Mocks(
common_print_mock=common_print_mock, common_exit_mock=common_exit_mock
)


DEFAULT_TEST_SYSTEM: SystemCharacteristics = (
UserFacingNameToSystemCharacteristics['l4-1']
)
SUB_SLICING_SYSTEM: SystemCharacteristics = (
UserFacingNameToSystemCharacteristics['v6e-4x4']
)


def test_validate_cluster_create_args_for_correct_args_pass(
mock_common_print_and_exit: _Mocks,
):
args = Namespace()

_validate_cluster_create_args(args, DEFAULT_TEST_SYSTEM)

assert mock_common_print_and_exit.common_print_mock.call_count == 0
assert mock_common_print_and_exit.common_exit_mock.call_count == 0


def test_validate_cluster_create_args_for_correct_sub_slicing_args_pass(
mock_common_print_and_exit: _Mocks,
):
FeatureFlags.SUB_SLICING_ENABLED = True
args = Namespace(sub_slicing=True)

_validate_cluster_create_args(args, SUB_SLICING_SYSTEM)

assert mock_common_print_and_exit.common_print_mock.call_count == 0
assert mock_common_print_and_exit.common_exit_mock.call_count == 0


def test_validate_cluster_create_args_for_not_supported_system_throws(
mock_common_print_and_exit: _Mocks,
):
FeatureFlags.SUB_SLICING_ENABLED = True
args = Namespace(sub_slicing=True)

_validate_cluster_create_args(args, DEFAULT_TEST_SYSTEM)

assert mock_common_print_and_exit.common_print_mock.call_count == 1
assert (
mock_common_print_and_exit.common_print_mock.call_args[0][0]
== 'Error: l4-1 does not support Sub-slicing.'
)
assert mock_common_print_and_exit.common_exit_mock.call_count == 1
6 changes: 6 additions & 0 deletions src/xpk/commands/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,9 @@ def is_TAS_possible(
system_characteristics.device_type != H100_MEGA_DEVICE_TYPE
or capacity_type == CapacityType.RESERVATION
)


def validate_sub_slicing_system(system: SystemCharacteristics):
if not system.supports_sub_slicing:
xpk_print(f'Error: {system.device_type} does not support Sub-slicing.')
xpk_exit(1)
1 change: 1 addition & 0 deletions src/xpk/commands/kind.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def cluster_create(args) -> None:
cpu_limit=0,
is_pathways_cluster=False,
flex=False,
configure_sub_slicing=False,
),
)

Expand Down
Loading
Loading