diff --git a/src/integration/gcluster_a3mega_test.py b/src/integration/gcluster_a3mega_test.py index 3087a49eb..ce2521a58 100644 --- a/src/integration/gcluster_a3mega_test.py +++ b/src/integration/gcluster_a3mega_test.py @@ -119,11 +119,6 @@ def test_create_a3_mega_deployment_files(setup_tests): assert os.path.isfile( os.path.join(blueprint_deps_test_path, "config-map.yaml.tftpl") ) - assert os.path.isfile( - os.path.join( - blueprint_deps_test_path, "kueue-xpk-configuration.yaml.tftpl" - ) - ) gcluster_manager = GclusterManager( gcluster_command_runner=docker_manager, remote_state_client=None diff --git a/src/xpk/blueprints/a3mega/kueue-xpk-configuration.yaml.tftpl b/src/xpk/blueprints/a3mega/kueue-xpk-configuration.yaml.tftpl deleted file mode 100644 index e15bc7e71..000000000 --- a/src/xpk/blueprints/a3mega/kueue-xpk-configuration.yaml.tftpl +++ /dev/null @@ -1,111 +0,0 @@ -apiVersion: kueue.x-k8s.io/v1alpha1 -kind: Topology -metadata: - name: "gke-default" -spec: - levels: - - nodeLabel: "cloud.google.com/gce-topology-block" - - nodeLabel: "cloud.google.com/gce-topology-subblock" - - nodeLabel: "cloud.google.com/gce-topology-host" - - nodeLabel: "kubernetes.io/hostname" ---- -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ResourceFlavor -metadata: - name: "1xh100-mega-80gb-8" -spec: - nodeLabels: - cloud.google.com/gke-accelerator: "nvidia-h100-mega-80gb" - %{~ if reservation==1 ~} - topologyName: "gke-default" - %{ endif } ---- -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ProvisioningRequestConfig -metadata: - name: dws-config -spec: - provisioningClassName: queued-provisioning.gke.io - managedResources: - - nvidia.com/gpu ---- -apiVersion: kueue.x-k8s.io/v1beta1 -kind: AdmissionCheck -metadata: - name: dws-prov -spec: - controllerName: kueue.x-k8s.io/provisioning-request - parameters: - apiGroup: kueue.x-k8s.io - kind: ProvisioningRequestConfig - name: dws-config ---- -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ClusterQueue -metadata: - name: cluster-queue -spec: - namespaceSelector: {} # match all. - resourceGroups: - - coveredResources: ["nvidia.com/gpu", "cpu", "memory"] - flavors: - - name: "1xh100-mega-80gb-8" - resources: - - name: "nvidia.com/gpu" - nominalQuota: ${num_chips} - - name: "cpu" - nominalQuota: 10000 - - name: "memory" - nominalQuota: 10000Gi - %{~ if flex_start==1 ~} - admissionChecks: - - dws-prov - %{ endif } ---- -apiVersion: kueue.x-k8s.io/v1beta1 -kind: LocalQueue -metadata: - namespace: default - name: multislice-queue -spec: - clusterQueue: cluster-queue ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: very-low -value: 100 -globalDefault: false -description: "Very Low" ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: low -value: 250 -globalDefault: false -description: "Low" ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: medium -value: 500 -globalDefault: false -description: "Medium" ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: high -value: 750 -globalDefault: false -description: "High" ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: very-high -value: 1000 -globalDefault: false -description: "Very High" \ No newline at end of file diff --git a/src/xpk/blueprints/a3ultra/kueue-xpk-configuration.yaml.tftpl b/src/xpk/blueprints/a3ultra/kueue-xpk-configuration.yaml.tftpl deleted file mode 100644 index 0af9664ed..000000000 --- a/src/xpk/blueprints/a3ultra/kueue-xpk-configuration.yaml.tftpl +++ /dev/null @@ -1,111 +0,0 @@ -apiVersion: kueue.x-k8s.io/v1alpha1 -kind: Topology -metadata: - name: "gke-default" -spec: - levels: - - nodeLabel: "cloud.google.com/gce-topology-block" - - nodeLabel: "cloud.google.com/gce-topology-subblock" - - nodeLabel: "cloud.google.com/gce-topology-host" - - nodeLabel: "kubernetes.io/hostname" ---- -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ResourceFlavor -metadata: - name: "1xh200-141gb-8" -spec: - nodeLabels: - cloud.google.com/gke-accelerator: "nvidia-h200-141gb" - %{~ if flex_start==0 ~} - topologyName: "gke-default" - %{ endif } ---- -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ProvisioningRequestConfig -metadata: - name: dws-config -spec: - provisioningClassName: queued-provisioning.gke.io - managedResources: - - nvidia.com/gpu ---- -apiVersion: kueue.x-k8s.io/v1beta1 -kind: AdmissionCheck -metadata: - name: dws-prov -spec: - controllerName: kueue.x-k8s.io/provisioning-request - parameters: - apiGroup: kueue.x-k8s.io - kind: ProvisioningRequestConfig - name: dws-config ---- -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ClusterQueue -metadata: - name: cluster-queue -spec: - namespaceSelector: {} # match all. - resourceGroups: - - coveredResources: ["nvidia.com/gpu", "cpu", "memory"] - flavors: - - name: "1xh200-141gb-8" - resources: - - name: "nvidia.com/gpu" - nominalQuota: ${num_chips} - - name: "cpu" - nominalQuota: 10000 - - name: "memory" - nominalQuota: 10000Gi - %{~ if flex_start==1 ~} - admissionChecks: - - dws-prov - %{ endif } ---- -apiVersion: kueue.x-k8s.io/v1beta1 -kind: LocalQueue -metadata: - namespace: default - name: multislice-queue -spec: - clusterQueue: cluster-queue ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: very-low -value: 100 -globalDefault: false -description: "Very Low" ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: low -value: 250 -globalDefault: false -description: "Low" ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: medium -value: 500 -globalDefault: false -description: "Medium" ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: high -value: 750 -globalDefault: false -description: "High" ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: very-high -value: 1000 -globalDefault: false -description: "Very High" \ No newline at end of file diff --git a/src/xpk/blueprints/a4/kueue-xpk-configuration.yaml.tftpl b/src/xpk/blueprints/a4/kueue-xpk-configuration.yaml.tftpl deleted file mode 100644 index e6638c375..000000000 --- a/src/xpk/blueprints/a4/kueue-xpk-configuration.yaml.tftpl +++ /dev/null @@ -1,111 +0,0 @@ -apiVersion: kueue.x-k8s.io/v1alpha1 -kind: Topology -metadata: - name: "gke-default" -spec: - levels: - - nodeLabel: "cloud.google.com/gce-topology-block" - - nodeLabel: "cloud.google.com/gce-topology-subblock" - - nodeLabel: "cloud.google.com/gce-topology-host" - - nodeLabel: "kubernetes.io/hostname" ---- -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ProvisioningRequestConfig -metadata: - name: dws-config -spec: - provisioningClassName: queued-provisioning.gke.io - managedResources: - - nvidia.com/gpu ---- -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ResourceFlavor -metadata: - name: "1xb200-8" -spec: - nodeLabels: - cloud.google.com/gke-accelerator: "nvidia-b200" - %{~ if flex_start==0 ~} - topologyName: "gke-default" - %{ endif } ---- -apiVersion: kueue.x-k8s.io/v1beta1 -kind: AdmissionCheck -metadata: - name: dws-prov -spec: - controllerName: kueue.x-k8s.io/provisioning-request - parameters: - apiGroup: kueue.x-k8s.io - kind: ProvisioningRequestConfig - name: dws-config ---- -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ClusterQueue -metadata: - name: cluster-queue -spec: - namespaceSelector: {} # match all. - resourceGroups: - - coveredResources: ["nvidia.com/gpu", "cpu", "memory"] - flavors: - - name: "1xb200-8" - resources: - - name: "nvidia.com/gpu" - nominalQuota: ${num_chips} - - name: "cpu" - nominalQuota: 10000 - - name: "memory" - nominalQuota: 10000Gi - %{~ if flex_start==1 ~} - admissionChecks: - - dws-prov - %{ endif } ---- -apiVersion: kueue.x-k8s.io/v1beta1 -kind: LocalQueue -metadata: - namespace: default - name: multislice-queue -spec: - clusterQueue: cluster-queue ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: very-low -value: 100 -globalDefault: false -description: "Very Low" ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: low -value: 250 -globalDefault: false -description: "Low" ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: medium -value: 500 -globalDefault: false -description: "Medium" ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: high -value: 750 -globalDefault: false -description: "High" ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: very-high -value: 1000 -globalDefault: false -description: "Very High" \ No newline at end of file diff --git a/src/xpk/commands/cluster_gcluster.py b/src/xpk/commands/cluster_gcluster.py index 9cad0175a..0ac48c35c 100644 --- a/src/xpk/commands/cluster_gcluster.py +++ b/src/xpk/commands/cluster_gcluster.py @@ -16,6 +16,12 @@ import os +from ..utils.execution_context import is_dry_run +from ..core.kueue_manager import KueueConfig, KueueManager +from ..core.nap import enable_autoprovisioning_on_cluster +from ..core.scheduling import get_total_chips_requested_from_args +from ..core.system_characteristics import get_system_characteristics + from ..core.blueprint.blueprint_generator import ( BlueprintGenerator, BlueprintGeneratorOutput, @@ -75,21 +81,28 @@ def cluster_create(args) -> None: bp = generate_blueprint(blueprint_name=unique_name, args=args, prefix=prefix) # staging: sending the blueprint file(s) to gcluster's working directory - bp_staged_path = gcm.stage_files( - blueprint_file=bp.blueprint_file, - blueprint_dependencies=bp.blueprint_dependencies, - prefix=prefix, - ) - gcm.deploy( - blueprint_path=bp_staged_path, - deployment_name=unique_name, - prefix=prefix, - ) - if args.cluster_state_gcs_bucket is not None: - gcm.upload_state() + if is_dry_run(): + xpk_print(f'Blueprint file: {bp.blueprint_file}') + else: + bp_staged_path = gcm.stage_files( + blueprint_file=bp.blueprint_file, + blueprint_dependencies=bp.blueprint_dependencies, + prefix=prefix, + ) + gcm.deploy( + blueprint_path=bp_staged_path, + deployment_name=unique_name, + prefix=prefix, + ) + if args.cluster_state_gcs_bucket is not None: + gcm.upload_state() get_cluster_credentials(args) + err_code = __install_kueue(args) + if err_code > 0: + xpk_exit(err_code) + err_code = apply_kjob_crds() if err_code > 0: xpk_exit(err_code) @@ -101,6 +114,57 @@ def cluster_create(args) -> None: xpk_exit(0) +def __install_kueue(args) -> int: + system, return_code = get_system_characteristics(args) + + if return_code > 0 or system is None: + xpk_print('Fetching system characteristics failed!') + return return_code + + # Provision node pools dynamically based on incoming workloads: + # Currently autoprovisioning is not supported with Pathways. + autoprovisioning_config = None + if args.enable_autoprovisioning: + xpk_print('Enabling Autoprovisioning') + autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster( + args, system + ) + if return_code != 0: + return return_code + + autoprovisioning_enabled = False + if autoprovisioning_config: + # Determine total resources available based on autoprovisioning max chips. + autoprovisioning_enabled = True + total_chips = autoprovisioning_config.maximum_chips + else: + # Determine total chips based on user specified topology. + total_chips = get_total_chips_requested_from_args(args, system) + kueue_manager = KueueManager() + + tolerations = [{ + 'key': 'components.gke.io/gke-managed-components', + 'operator': 'Equal', + 'value': 'true', + 'effect': 'NoSchedule', + }] + + kueue_manager.install_or_upgrade( + KueueConfig( + system, + total_chips=total_chips, + autoprovisioning_enabled=autoprovisioning_enabled, + num_slices=args.num_slices, + memory_limit=args.memory_limit, + cpu_limit=args.cpu_limit, + is_pathways_cluster=args.enable_pathways, + flex=args.flex, + ), + tolerations=tolerations, + ) + return 0 + + def cluster_delete(args) -> None: """Function around cluster delete for the clusters created by Cluster toolkit. diff --git a/src/xpk/commands/cluster_gcluster_test.py b/src/xpk/commands/cluster_gcluster_test.py new file mode 100644 index 000000000..e1873b823 --- /dev/null +++ b/src/xpk/commands/cluster_gcluster_test.py @@ -0,0 +1,177 @@ +""" +Copyright 2024 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from unittest.mock import MagicMock, patch + +import pytest + +from xpk.commands.cluster_gcluster import cluster_create +from xpk.core.kueue_manager import KueueConfig +from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics + + +@pytest.fixture +def mock_args(): + """Provides a mock for args.""" + args = MagicMock() + args.enable_autoprovisioning = False + args.num_slices = 1 + args.memory_limit = "200G" + args.cpu_limit = "50" + args.enable_pathways = False + args.flex = False + args.project = "test-project" + args.cluster = "test-cluster" + args.zone = "us-central1-c" + args.cluster_state_gcs_bucket = None + return args + + +@pytest.fixture +def mock_cluster_create_deps(request): + """Mocks dependencies for cluster_create.""" + with ( + patch("xpk.commands.cluster_gcluster.xpk_exit") as mock_exit, + patch("xpk.commands.cluster_gcluster.prepare_kjob") as mock_prep_kjob, + patch("xpk.commands.cluster_gcluster.apply_kjob_crds") as mock_apply_kjob, + patch( + "xpk.commands.cluster_gcluster.get_cluster_credentials" + ) as mock_get_creds, + patch("xpk.commands.cluster_gcluster.generate_blueprint") as mock_gen_bp, + patch( + "xpk.commands.cluster_gcluster.prepare_gcluster_manager" + ) as mock_prep_gcm, + patch( + "xpk.commands.cluster_gcluster.prepare_directories" + ) as mock_prep_dirs, + patch( + "xpk.commands.cluster_gcluster.check_gcloud_authenticated" + ) as mock_check_auth, + patch( + "xpk.commands.cluster_gcluster.get_system_characteristics" + ) as mock_get_sys_char, + patch("xpk.commands.cluster_gcluster.KueueManager") as mock_kueue_manager, + ): + yield { + "xpk_exit": mock_exit, + "prepare_kjob": mock_prep_kjob, + "apply_kjob_crds": mock_apply_kjob, + "get_cluster_credentials": mock_get_creds, + "generate_blueprint": mock_gen_bp, + "prepare_gcluster_manager": mock_prep_gcm, + "prepare_directories": mock_prep_dirs, + "check_gcloud_authenticated": mock_check_auth, + "get_system_characteristics": mock_get_sys_char, + "KueueManager": mock_kueue_manager, + } + + +@patch("xpk.commands.cluster_gcluster.get_total_chips_requested_from_args") +def test_install_kueue_standard( + mock_get_total_chips, mock_args, mock_cluster_create_deps +): + """Tests __install_kueue for a standard installation.""" + mock_cluster_create_deps["prepare_kjob"].return_value = 0 + mock_cluster_create_deps["apply_kjob_crds"].return_value = 0 + + mock_system = SystemCharacteristics( + topology="N/A", + vms_per_slice=1, + gke_accelerator="nvidia-h100-mega-80gb", + gce_machine_type="a3-megagpu-8g", + chips_per_vm=8, + accelerator_type=AcceleratorType["GPU"], + device_type="h100-mega-80gb-8", + supports_sub_slicing=False, + ) + mock_cluster_create_deps["get_system_characteristics"].return_value = ( + mock_system, + 0, + ) + mock_get_total_chips.return_value = 16 + + cluster_create(mock_args) + + mock_cluster_create_deps["xpk_exit"].assert_called_with(0) + mock_kueue_manager = mock_cluster_create_deps["KueueManager"] + mock_kueue_manager.return_value.install_or_upgrade.assert_called_once() + call_args, call_kwargs = ( + mock_kueue_manager.return_value.install_or_upgrade.call_args + ) + kueue_config: KueueConfig = call_args[0] + + assert kueue_config.system == mock_system + assert kueue_config.total_chips == 16 + assert not kueue_config.autoprovisioning_enabled + assert "tolerations" in call_kwargs + tolerations = call_kwargs["tolerations"] + assert any( + t.get("key") == "components.gke.io/gke-managed-components" + and t.get("effect") == "NoSchedule" + for t in tolerations + ) + + +@patch("xpk.commands.cluster_gcluster.enable_autoprovisioning_on_cluster") +def test_install_kueue_with_autoprovisioning( + mock_enable_autoprovisioning, mock_args, mock_cluster_create_deps +): + """Tests __install_kueue with autoprovisioning enabled.""" + mock_cluster_create_deps["prepare_kjob"].return_value = 0 + mock_cluster_create_deps["apply_kjob_crds"].return_value = 0 + + mock_args.enable_autoprovisioning = True + mock_system = SystemCharacteristics( + topology="N/A", + vms_per_slice=1, + gke_accelerator="nvidia-h100-mega-80gb", + gce_machine_type="a3-megagpu-8g", + chips_per_vm=8, + accelerator_type=AcceleratorType["GPU"], + device_type="h100-mega-80gb-8", + supports_sub_slicing=False, + ) + mock_cluster_create_deps["get_system_characteristics"].return_value = ( + mock_system, + 0, + ) + + mock_autoprovisioning_config = MagicMock() + mock_autoprovisioning_config.maximum_chips = 128 + mock_enable_autoprovisioning.return_value = (mock_autoprovisioning_config, 0) + + cluster_create(mock_args) + + mock_cluster_create_deps["xpk_exit"].assert_called_with(0) + mock_enable_autoprovisioning.assert_called_once_with(mock_args, mock_system) + mock_kueue_manager = mock_cluster_create_deps["KueueManager"] + mock_kueue_manager.return_value.install_or_upgrade.assert_called_once() + + call_args, call_kwargs = ( + mock_kueue_manager.return_value.install_or_upgrade.call_args + ) + kueue_config: KueueConfig = call_args[0] + + assert kueue_config.system == mock_system + assert kueue_config.total_chips == 128 + assert kueue_config.autoprovisioning_enabled + assert "tolerations" in call_kwargs + tolerations = call_kwargs["tolerations"] + assert any( + t.get("key") == "components.gke.io/gke-managed-components" + and t.get("effect") == "NoSchedule" + for t in tolerations + ) diff --git a/src/xpk/core/blueprint/blueprint_generator.py b/src/xpk/core/blueprint/blueprint_generator.py index 1692e72f8..ed13c1cb0 100644 --- a/src/xpk/core/blueprint/blueprint_generator.py +++ b/src/xpk/core/blueprint/blueprint_generator.py @@ -32,7 +32,6 @@ ) from ..system_characteristics import get_system_characteristics_by_device_type from .blueprint_definitions import Blueprint, DeploymentGroup, DeploymentModule -from ..kueue import KUEUE_VERSION yaml_parser = yaml.YAML() @@ -217,26 +216,11 @@ def generate_a3_mega_blueprint( a3_megagpu_pool_0.settings.update({"static_node_count": num_nodes}) set_placement_policy = capacity_type != CapacityType.SPOT - num_chips = num_nodes * system.chips_per_vm workload = DeploymentModule( id="workload_component_install", source="modules/management/kubectl-apply", use=["gke_cluster"], settings={ - "kueue": { - "install": True, - "version": KUEUE_VERSION, # TAS feature-gates is enabled in CT - "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl', - "config_template_vars": { - "num_chips": num_chips, - "reservation": ( - 1 if capacity_type == CapacityType.RESERVATION else 0 - ), - "flex_start": ( - 1 if capacity_type == CapacityType.FLEX_START else 0 - ), - }, - }, "jobset": {"install": True, "version": "v0.7.2"}, "apply_manifests": [{ "source": f'$(ghpc_stage("{blueprint_name}"))/storage_crd.yaml' @@ -600,24 +584,12 @@ def generate_a3_ultra_blueprint( else: gpu_pool.settings.update({"static_node_count": num_nodes}) - num_chips = num_nodes * system.chips_per_vm workload_manager_install_id = "workload-manager-install" workload_manager_install = DeploymentModule( id=workload_manager_install_id, source="modules/management/kubectl-apply", use=[cluster_id], settings={ - "kueue": { - "install": True, - "version": KUEUE_VERSION, # TAS feature-gates is enabled in CT - "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl', - "config_template_vars": { - "num_chips": num_chips, - "flex_start": ( - 1 if capacity_type == CapacityType.FLEX_START else 0 - ), - }, - }, "jobset": {"install": True, "version": "v0.7.2"}, "apply_manifests": [ {"source": nccl_installer_path}, @@ -887,24 +859,12 @@ def generate_a4_blueprint( else: gpu_pool.settings.update({"static_node_count": num_nodes}) - num_chips = num_nodes * system.chips_per_vm workload_manager_install_id = "workload-manager-install" workload_manager_install = DeploymentModule( id=workload_manager_install_id, source="modules/management/kubectl-apply", use=[cluster_id], settings={ - "kueue": { - "install": True, - "version": KUEUE_VERSION, # TAS feature-gates is enabled in CT - "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl', - "config_template_vars": { - "num_chips": num_chips, - "flex_start": ( - 1 if capacity_type == CapacityType.FLEX_START else 0 - ), - }, - }, "jobset": {"install": True, "version": "v0.7.2"}, "apply_manifests": [ {"source": nccl_installer_path}, diff --git a/src/xpk/core/blueprint/blueprint_test.py b/src/xpk/core/blueprint/blueprint_test.py index cfd26075f..be98a0ee1 100644 --- a/src/xpk/core/blueprint/blueprint_test.py +++ b/src/xpk/core/blueprint/blueprint_test.py @@ -32,7 +32,6 @@ a3_ultra_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_ultra.yaml" a4_yaml_test_path = "src/xpk/core/blueprint/testing/data/a4.yaml" config_map_filename = "config-map.yaml.tftpl" -kueue_conf_filename = "kueue-xpk-configuration.yaml.tftpl" tmp_test_dir = "/tmp/xpk_test" @@ -82,11 +81,6 @@ def test_generate_a3_mega_blueprint(): tmp_test_dir, "prefix", blueprint_name, config_map_filename ) ) - assert os.path.exists( - os.path.join( - tmp_test_dir, "prefix", blueprint_name, kueue_conf_filename - ) - ) shutil.rmtree(tmp_test_dir) diff --git a/src/xpk/core/blueprint/testing/data/a3_mega.yaml b/src/xpk/core/blueprint/testing/data/a3_mega.yaml index 679989cc0..9d9c919d3 100644 --- a/src/xpk/core/blueprint/testing/data/a3_mega.yaml +++ b/src/xpk/core/blueprint/testing/data/a3_mega.yaml @@ -94,7 +94,7 @@ deployment_groups: gpu_driver_version: "LATEST" auto_upgrade: true static_node_count: 2 - placement_policy: + placement_policy: type: COMPACT name: test-reservation-placement outputs: [instructions] @@ -104,18 +104,10 @@ deployment_groups: source: modules/management/kubectl-apply use: [gke_cluster] settings: - kueue: - install: true - version: "v0.12.2" - config_path: $(ghpc_stage("xpk-gke-a3-megagpu"))/kueue-xpk-configuration.yaml.tftpl - config_template_vars: - num_chips: 16 - reservation: 1 - flex_start: 0 jobset: install: true version: v0.7.2 - apply_manifests: + apply_manifests: - source: $(ghpc_stage("xpk-gke-a3-megagpu"))/storage_crd.yaml - !DeploymentModule @@ -131,4 +123,4 @@ deployment_groups: cluster_config_name: "bar-metadata-configmap", capacity_type: "reservation", reservation: "test-reservation", - } \ No newline at end of file + } diff --git a/src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml b/src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml index 48ae3db16..a9441242a 100644 --- a/src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml +++ b/src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml @@ -100,18 +100,10 @@ deployment_groups: source: modules/management/kubectl-apply use: [gke_cluster] settings: - kueue: - install: true - version: "v0.12.2" - config_path: $(ghpc_stage("xpk-gke-a3-megagpu"))/kueue-xpk-configuration.yaml.tftpl - config_template_vars: - num_chips: 16 - reservation: 0 - flex_start: 0 jobset: install: true version: v0.7.2 - apply_manifests: + apply_manifests: - source: $(ghpc_stage("xpk-gke-a3-megagpu"))/storage_crd.yaml - !DeploymentModule @@ -127,4 +119,4 @@ deployment_groups: cluster_config_name: "bar-metadata-configmap", capacity_type: "spot", reservation: "None", - } \ No newline at end of file + } diff --git a/src/xpk/core/blueprint/testing/data/a3_ultra.yaml b/src/xpk/core/blueprint/testing/data/a3_ultra.yaml index 8d3643dbb..908146118 100644 --- a/src/xpk/core/blueprint/testing/data/a3_ultra.yaml +++ b/src/xpk/core/blueprint/testing/data/a3_ultra.yaml @@ -153,13 +153,6 @@ deployment_groups: source: modules/management/kubectl-apply use: [gke-a3-ultra-a3-ultragpu-cluster] settings: - kueue: - install: true - version: v0.12.2 # TAS feature-gates is enabled in CT - config_path: $(ghpc_stage("xpk-gke-a3-ultra"))/kueue-xpk-configuration.yaml.tftpl - config_template_vars: - num_chips: 16 - flex_start: 0 jobset: install: true version: v0.7.2 @@ -167,13 +160,13 @@ deployment_groups: - source: $(ghpc_stage("xpk-gke-a3-ultra"))/nccl-installer.yaml - source: $(ghpc_stage("xpk-gke-a3-ultra"))/mlgru-disable.yaml - source: $(ghpc_stage("xpk-gke-a3-ultra"))/storage_crd.yaml - + - !DeploymentModule id: workload_configmap source: 'modules/management/kubectl-apply' use: ['gke-a3-ultra-a3-ultragpu-cluster'] settings: - apply_manifests: + apply_manifests: - source: '$(ghpc_stage("xpk-gke-a3-ultra"))/config-map.yaml.tftpl' template_vars: { resource_config_name: "gke-a3-ultra-resources-configmap", diff --git a/src/xpk/core/blueprint/testing/data/a4.yaml b/src/xpk/core/blueprint/testing/data/a4.yaml index f626f4e34..7843fdd81 100644 --- a/src/xpk/core/blueprint/testing/data/a4.yaml +++ b/src/xpk/core/blueprint/testing/data/a4.yaml @@ -88,7 +88,7 @@ deployment_groups: settings: network_name: gke-a4-rdma-net mtu: 8896 - network_profile: + network_profile: https://www.googleapis.com/compute/beta/projects/foo/global/networkProfiles/us-central1-c-vpc-roce network_routing_mode: REGIONAL subnetworks_template: @@ -165,13 +165,6 @@ deployment_groups: use: - gke-a4-a4-cluster settings: - kueue: - install: true - version: v0.12.2 - config_path: $(ghpc_stage("xpk-gke-a4"))/kueue-xpk-configuration.yaml.tftpl - config_template_vars: - num_chips: 16 - flex_start: 0 jobset: install: true version: v0.7.2 diff --git a/src/xpk/core/gcluster_manager.py b/src/xpk/core/gcluster_manager.py index 9b7a789b5..b67926a9b 100644 --- a/src/xpk/core/gcluster_manager.py +++ b/src/xpk/core/gcluster_manager.py @@ -27,9 +27,6 @@ deployment_module = '/out/xpk-deployment' a3_utils_dir_name = 'a3-mega-xpk' config_map_repo_path = 'src/xpk/blueprints/a3-mega-xpk/config-map.yaml.tftpl' -kueue_config_repo_path = ( - 'src/xpk/blueprints/a3-mega-xpk/kueue-xpk-configuration.yaml.tftpl' -) class GclusterManager: