diff --git a/goldens/Basic_cluster_create.txt b/goldens/Basic_cluster_create.txt index 2f392c676..f0f20ae0e 100644 --- a/goldens/Basic_cluster_create.txt +++ b/goldens/Basic_cluster_create.txt @@ -39,17 +39,19 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --region us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8', requires_placement_policy=True) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" [XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8', requires_placement_policy=True) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --region=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true [XPK] Existing node pool names ['0'] -[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --region=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --spot --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 +[XPK] Task: `Retrieve resource policy` is implemented by the following command not running since it is a dry run. +gcloud compute resource-policies describe golden-cluster-placement-policy --project=golden-project --region=us-central1 +[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --region=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --spot --placement-policy=golden-cluster-placement-policy --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 [XPK] Breaking up a total of 1 commands into 1 batches [XPK] Pretending all the jobs succeeded [XPK] Create or delete node pool request complete. diff --git a/goldens/Cluster_create_private.txt b/goldens/Cluster_create_private.txt index 92302d9eb..9bb1eb1d6 100644 --- a/goldens/Cluster_create_private.txt +++ b/goldens/Cluster_create_private.txt @@ -41,13 +41,13 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster-private --region us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of v5p-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=1, device_type='v5p-8') +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=1, device_type='v5p-8', requires_placement_policy=False) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster-private --project=golden-project --region=us-central1 --format="csv[no-heading](name)" [XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a [XPK] Creating 1 node pool or pools of v5p-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=1, device_type='v5p-8') +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=1, device_type='v5p-8', requires_placement_policy=False) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster-private --project=golden-project --region=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_with_gb200-4.txt b/goldens/Cluster_create_with_gb200-4.txt index 2ee918ac9..81aab69f8 100644 --- a/goldens/Cluster_create_with_gb200-4.txt +++ b/goldens/Cluster_create_with_gb200-4.txt @@ -39,13 +39,13 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --region us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of gb200-4 -We assume that the underlying system is: SystemCharacteristics(topology='1x72', vms_per_slice=1, gke_accelerator='nvidia-gb200', gce_machine_type='a4x-highgpu-4g', chips_per_vm=4, accelerator_type=2, device_type='gb200-4') +We assume that the underlying system is: SystemCharacteristics(topology='1x72', vms_per_slice=1, gke_accelerator='nvidia-gb200', gce_machine_type='a4x-highgpu-4g', chips_per_vm=4, accelerator_type=2, device_type='gb200-4', requires_placement_policy=True) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" [XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a [XPK] Creating 1 node pool with 2 nodes of gb200-4 -Underlyingly, we assume that means: SystemCharacteristics(topology='1x72', vms_per_slice=1, gke_accelerator='nvidia-gb200', gce_machine_type='a4x-highgpu-4g', chips_per_vm=4, accelerator_type=2, device_type='gb200-4') +Underlyingly, we assume that means: SystemCharacteristics(topology='1x72', vms_per_slice=1, gke_accelerator='nvidia-gb200', gce_machine_type='a4x-highgpu-4g', chips_per_vm=4, accelerator_type=2, device_type='gb200-4', requires_placement_policy=True) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --region=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/NAP_cluster-create.txt b/goldens/NAP_cluster-create.txt index 2eb9e9868..5367a1d86 100644 --- a/goldens/NAP_cluster-create.txt +++ b/goldens/NAP_cluster-create.txt @@ -39,17 +39,19 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --region us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8', requires_placement_policy=True) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" [XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8', requires_placement_policy=True) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --region=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true [XPK] Existing node pool names ['0'] -[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --region=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 +[XPK] Task: `Retrieve resource policy` is implemented by the following command not running since it is a dry run. +gcloud compute resource-policies describe golden-cluster-placement-policy --project=golden-project --region=us-central1 +[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --region=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --placement-policy=golden-cluster-placement-policy --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 [XPK] Breaking up a total of 1 commands into 1 batches [XPK] Pretending all the jobs succeeded [XPK] Create or delete node pool request complete. diff --git a/goldens/NAP_cluster-create_with_pathways.txt b/goldens/NAP_cluster-create_with_pathways.txt index 8adfff2bd..efae9f29b 100644 --- a/goldens/NAP_cluster-create_with_pathways.txt +++ b/goldens/NAP_cluster-create_with_pathways.txt @@ -39,17 +39,19 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --region us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8', requires_placement_policy=True) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" [XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8', requires_placement_policy=True) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --region=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true [XPK] Existing node pool names ['0'] -[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --region=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 +[XPK] Task: `Retrieve resource policy` is implemented by the following command not running since it is a dry run. +gcloud compute resource-policies describe golden-cluster-placement-policy --project=golden-project --region=us-central1 +[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --region=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --placement-policy=golden-cluster-placement-policy --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 [XPK] To complete NodepoolCreate-cpu-np we are executing gcloud beta container node-pools create cpu-np --node-version=0 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --region=us-central1 --num-nodes=1 --machine-type=n2-standard-64 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --enable-autoscaling --min-nodes=1 --max-nodes=20 [XPK] Breaking up a total of 2 commands into 1 batches [XPK] Pretending all the jobs succeeded diff --git a/src/xpk/core/nodepool.py b/src/xpk/core/nodepool.py index bc8df6b99..aaaf11172 100644 --- a/src/xpk/core/nodepool.py +++ b/src/xpk/core/nodepool.py @@ -268,9 +268,7 @@ def run_gke_node_pool_create_command( return 1 placement_args = '' - if system.accelerator_type == AcceleratorType['GPU'] and is_topology_valid( - system.topology - ): + if system.requires_placement_policy and is_topology_valid(system.topology): placement_policy = f'{args.cluster}-placement-policy' ensure_resource_policy_exists(placement_policy, args, system.topology) placement_args = f' --placement-policy={placement_policy}' diff --git a/src/xpk/core/nodepool_test.py b/src/xpk/core/nodepool_test.py index ac98922de..c05da1233 100644 --- a/src/xpk/core/nodepool_test.py +++ b/src/xpk/core/nodepool_test.py @@ -15,7 +15,12 @@ """ import pytest -from xpk.core.nodepool import get_desired_node_pool_names, ensure_resource_policy_exists +from xpk.core.nodepool import ( + ensure_resource_policy_exists, + get_desired_node_pool_names, + run_gke_node_pool_create_command, +) +from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics CLUSTER_NAME = "running-cucumber" @@ -116,3 +121,147 @@ def test_ensure_resource_policy_exits_without_existing_policy_throws_when_creati side_effect=[(1, ""), (1, "")], ) ensure_resource_policy_exists("resource-policy", args, "2x2x1") + + +@pytest.fixture +def mock_nodepool_dependencies(mocker): + """Mocks dependencies for run_gke_node_pool_create_command.""" + mocker.patch( + "xpk.core.nodepool.get_all_nodepools_programmatic", return_value=([], 0) + ) + mocker.patch( + "xpk.core.nodepool.get_capacity_type", return_value=("on-demand", 0) + ) + mocker.patch( + "xpk.core.nodepool.get_capacity_arguments_from_capacity_type", + return_value=("--on-demand", 0), + ) + mocker.patch("xpk.core.nodepool.run_commands", return_value=0) + mocker.patch("xpk.core.nodepool.get_user_input", return_value=True) + mock_is_topology_valid = mocker.patch("xpk.core.nodepool.is_topology_valid") + mock_ensure_resource_policy = mocker.patch( + "xpk.core.nodepool.ensure_resource_policy_exists" + ) + return mock_is_topology_valid, mock_ensure_resource_policy + + +def test_placement_policy_created_for_gpu_with_valid_topology( + mocker, mock_nodepool_dependencies # pylint: disable=redefined-outer-name +): + """Tests that placement policy is created for GPUs with a valid topology.""" + mock_is_topology_valid, mock_ensure_resource_policy = ( + mock_nodepool_dependencies + ) + mock_is_topology_valid.return_value = True + args = mocker.Mock( + tpu_type=None, + device_type="h100-80gb-8", + cluster="test-cluster", + project="test-project", + zone="us-central1-a", + ) + system = SystemCharacteristics( + topology="N/A", + vms_per_slice=1, + gke_accelerator="nvidia-h100-80gb", + gce_machine_type="a3-highgpu-8g", + chips_per_vm=8, + accelerator_type=AcceleratorType["GPU"], + device_type="h100-80gb-8", + ) + + run_gke_node_pool_create_command(args, system, "1.2.3") + + mock_ensure_resource_policy.assert_called_once() + + +def test_placement_policy_not_created_for_gpu_with_invalid_topology( + mocker, mock_nodepool_dependencies # pylint: disable=redefined-outer-name +): + """Tests that placement policy is not created for GPUs with an invalid topology.""" + mock_is_topology_valid, mock_ensure_resource_policy = ( + mock_nodepool_dependencies + ) + mock_is_topology_valid.return_value = False + args = mocker.Mock( + tpu_type=None, + device_type="h100-80gb-8", + cluster="test-cluster", + zone="us-central1-a", + ) + system = SystemCharacteristics( + topology="N/A", + vms_per_slice=1, + gke_accelerator="nvidia-h100-80gb", + gce_machine_type="a3-highgpu-8g", + chips_per_vm=8, + accelerator_type=AcceleratorType["GPU"], + device_type="h100-80gb-8", + ) + + run_gke_node_pool_create_command(args, system, "1.2.3") + + mock_ensure_resource_policy.assert_not_called() + + +def test_placement_policy_created_for_tpu7x_with_valid_topology( + mocker, mock_nodepool_dependencies # pylint: disable=redefined-outer-name +): + """Tests that placement policy is created for tpu7x with a valid topology.""" + mock_is_topology_valid, mock_ensure_resource_policy = ( + mock_nodepool_dependencies + ) + mock_is_topology_valid.return_value = True + args = mocker.Mock( + tpu_type="tpu7x-8", + device_type=None, + num_slices=1, + cluster="test-cluster", + project="test-project", + zone="us-central1-a", + ) + system = SystemCharacteristics( + topology="2x2x1", + vms_per_slice=1, + gke_accelerator="tpu7x", + gce_machine_type="tpu7x-standard-4t", + chips_per_vm=4, + accelerator_type=AcceleratorType["TPU"], + device_type="tpu7x-8", + requires_placement_policy=True, + ) + + run_gke_node_pool_create_command(args, system, "1.2.3") + + mock_ensure_resource_policy.assert_called_once() + + +def test_placement_policy_not_created_for_non7x_tpu( + mocker, mock_nodepool_dependencies # pylint: disable=redefined-outer-name +): + """Tests that placement policy is not created for non-tpu7x TPUs.""" + mock_is_topology_valid, mock_ensure_resource_policy = ( + mock_nodepool_dependencies + ) + mock_is_topology_valid.return_value = True + args = mocker.Mock( + tpu_type="v6e", + device_type=None, + num_slices=1, + cluster="test-cluster", + project="test-project", + zone="us-central1-a", + ) + system = SystemCharacteristics( + topology="2x2", + vms_per_slice=1, + gke_accelerator="v6e", + gce_machine_type="tpu-v6e-slice", + chips_per_vm=4, + accelerator_type=AcceleratorType["TPU"], + device_type="v6e-4", + ) + + run_gke_node_pool_create_command(args, system, "1.2.3") + + mock_ensure_resource_policy.assert_not_called() diff --git a/src/xpk/core/system_characteristics.py b/src/xpk/core/system_characteristics.py index 9fa873d77..c9ca7db70 100644 --- a/src/xpk/core/system_characteristics.py +++ b/src/xpk/core/system_characteristics.py @@ -50,6 +50,31 @@ class AcceleratorCharacteristics: @dataclass class SystemCharacteristics: + """Contains the defining characteristics of a specific accelerator system. + + This dataclass holds the hardware and configuration details for a given + accelerator type, such as its topology, machine type, and chip count. It + provides a standardized way to access system-specific information throughout + the application. + + Attributes: + topology: The physical or logical layout of the accelerator chips (e.g., + '2x2x1' for TPUs, 'N/A' for single-VM GPUs). + vms_per_slice: The number of Virtual Machines that constitute a single + accelerator slice. + gke_accelerator: The name of the accelerator as recognized by GKE (e.g., + 'nvidia-l4', 'tpu7x'). + gce_machine_type: The GCE machine type that hosts the accelerator (e.g., + 'g2-standard-12'). + chips_per_vm: The number of accelerator chips attached to a single VM. + accelerator_type: The category of the accelerator (e.g., TPU, GPU, CPU) + from the AcceleratorType enum. + device_type: A user-facing name for the specific hardware configuration + (e.g., 'l4-1', 'h100-80gb-8'). + requires_placement_policy: A boolean indicating if a GCE resource + placement policy is required. This is automatically set to True for GPUs. + """ + topology: str vms_per_slice: int gke_accelerator: str @@ -57,6 +82,11 @@ class SystemCharacteristics: chips_per_vm: int accelerator_type: int # TODO: use enums device_type: str + requires_placement_policy: bool = False + + def __post_init__(self): + if self.accelerator_type == AcceleratorType['GPU']: + self.requires_placement_policy = True def get_system_characteristics( @@ -99,6 +129,7 @@ def get_tpu_system_characteristics_map( gke_accelerator: str, machine_type: str, supported_topologies: list[str], + requires_placement_policy: bool = False, ) -> dict[str, SystemCharacteristics]: system_characteristics_map = {} for topology in supported_topologies: @@ -114,6 +145,7 @@ def get_tpu_system_characteristics_map( chips_per_vm=chips_per_vm, accelerator_type=AcceleratorType['TPU'], device_type=f'{prefix}-{num_tensorcores}', + requires_placement_policy=requires_placement_policy, ) system_characteristics_map[f'{prefix}-{topology}'] = system system_characteristics_map[f'{prefix}-{num_tensorcores}'] = system @@ -266,6 +298,7 @@ def get_tpu_system_characteristics_map( gke_accelerator='tpu7x', machine_type='tpu7x-standard-1t', supported_topologies=['1x1x1'], + requires_placement_policy=True, ), **get_tpu_system_characteristics_map( prefix='tpu7x', @@ -372,6 +405,7 @@ def get_tpu_system_characteristics_map( '8x8x8', '8x8x92', ], + requires_placement_policy=True, ), **get_tpu_system_characteristics_map( prefix='v6e',