diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 47feda594..f6af5039d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,2 +1,2 @@ -* @Obliviour @44past4 @sharabiani @pawloch00 @BluValor @gcie @RoshaniN +* @Obliviour @44past4 @sharabiani @pawloch00 @BluValor @gcie @RoshaniN @scaliby @jamOne- @SikaGrr @FIoannides @fatoshoti slice/ @mwysokin @mimowo @gabesaba @PBundyra @mwielgus @pajakd \ No newline at end of file diff --git a/.github/workflows/build_tests.yaml b/.github/workflows/build_tests.yaml index b3359852f..58b795ca7 100644 --- a/.github/workflows/build_tests.yaml +++ b/.github/workflows/build_tests.yaml @@ -40,6 +40,7 @@ jobs: group-name: ${{ steps.set-group-name.outputs.group-name }} zone: ${{ steps.set-zone.outputs.zone }} tpu-type: ${{ steps.set-tpu-type.outputs.tpu-type }} + tpu-type-topology: ${{ steps.set-tpu-type-topology.outputs.tpu-type-topology }} location: ${{steps.set-location.outputs.location}} run-id: ${{steps.set-run-id.outputs.run-id}} steps: @@ -76,6 +77,10 @@ jobs: id: set-tpu-type run: | echo tpu-type=v4-8 >> $GITHUB_OUTPUT + - name: set tpu-type-topology + id: set-tpu-type-topology + run: | + echo tpu-type-topology=v4-2x2x1 >> $GITHUB_OUTPUT - name: set location id: set-location run: | @@ -152,7 +157,7 @@ jobs: with: run-id: '${{needs.set-variables.outputs.run-id}}' cluster-name: '${{needs.set-variables.outputs.cluster-name}}' - tpu-type: '${{needs.set-variables.outputs.tpu-type || inputs.tpu-type}}' + tpu-type: '${{needs.set-variables.outputs.tpu-type-topology || inputs.tpu-type}}' zone: '${{needs.set-variables.outputs.zone}}' location: '${{needs.set-variables.outputs.location}}' secrets: inherit @@ -165,7 +170,7 @@ jobs: with: cluster-name-dws: '${{needs.set-variables.outputs.cluster-name-dws}}' cluster-name: '${{needs.set-variables.outputs.cluster-name}}' - tpu-type: '${{needs.set-variables.outputs.tpu-type || inputs.tpu-type}}' + tpu-type: '${{needs.set-variables.outputs.tpu-type-topology || inputs.tpu-type}}' 
zone: '${{needs.set-variables.outputs.zone}}' location: '${{needs.set-variables.outputs.location}}' run-id: '${{needs.set-variables.outputs.run-id}}' @@ -180,6 +185,7 @@ jobs: cluster-name: ${{needs.set-variables.outputs.cluster-name}} cluster-name-dws: '${{needs.set-variables.outputs.cluster-name-dws}}' tpu-type: ${{needs.set-variables.outputs.tpu-type}} + tpu-type-topology: ${{needs.set-variables.outputs.tpu-type-topology}} zone: ${{needs.set-variables.outputs.zone}} run-id: '${{needs.set-variables.outputs.run-id}}' secrets: inherit diff --git a/.github/workflows/reusable_workload_tests.yaml b/.github/workflows/reusable_workload_tests.yaml index 6bf5f14d6..fad5034cd 100644 --- a/.github/workflows/reusable_workload_tests.yaml +++ b/.github/workflows/reusable_workload_tests.yaml @@ -24,6 +24,9 @@ on: tpu-type: required: true type: string + tpu-type-topology: + required: true + type: string tpu-type-dws: required: false type: string @@ -108,7 +111,7 @@ jobs: --docker-password='${{secrets.GCP_SA_KEY}}' \ --docker-email='${{secrets.GCP_SA_EMAIL}}' - name: Run workload with private image - run: python xpk.py workload create --cluster ${{inputs.cluster-name}} --workload $PRIVATE_IMAGE_WORKLOAD_NAME --command "echo foo" --tpu-type=${{inputs.tpu-type}} --num-slices=1 --zone=${{inputs.zone}} --docker-image=${{secrets.DOCKER_REPO_SERVER}}ubuntu2004 --docker-image-pull-secret=gcr-key + run: python xpk.py workload create --cluster ${{inputs.cluster-name}} --workload $PRIVATE_IMAGE_WORKLOAD_NAME --command "echo foo" --tpu-type=${{inputs.tpu-type-topology}} --num-slices=1 --zone=${{inputs.zone}} --docker-image=${{secrets.DOCKER_REPO_SERVER}}ubuntu2004 --docker-image-pull-secret=gcr-key - name: Wait for private image workload completion and confirm it succeeded run: python3 xpk.py workload list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --wait-for-job-completion $PRIVATE_IMAGE_WORKLOAD_NAME --timeout 300 - name: Delete kubectl secret diff --git 
a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index 99126db65..e4cb29119 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -78,6 +78,8 @@ from ..utils.file import write_tmp_file from . import cluster_gcluster from .common import set_cluster_command +import shutil +import os def cluster_adapt(args) -> None: @@ -247,6 +249,10 @@ def cluster_create(args) -> None: get_cluster_credentials(args) + update_coredns_command_code = update_coredns_if_necessary(args) + if update_coredns_command_code != 0: + xpk_exit(update_coredns_command_code) + k8s_client = setup_k8s_env(args) install_storage_crd(k8s_client) @@ -702,6 +708,262 @@ def cluster_create_ray_cluster(args) -> None: cluster_create(args) +def install_jq(args): + """Installs 'jq' utility.""" + if shutil.which('jq'): + xpk_print("Task: 'Install jq' skipped, jq already installed.") + return + command_jq_install = 'sudo apt install jq -y' + xpk_print("Task: 'Install jq' in progress.") + return_code = run_command_with_updates(command_jq_install, 'Install jq', args) + if return_code != 0: + xpk_print(f'Install jq error {return_code}') + xpk_exit(return_code) + + +def clone_coredns_deployment_repo(args, coredns_repo_full_path: str): + """Clones the CoreDNS deployment repository if it doesn't exist.""" + if os.path.exists(coredns_repo_full_path): + xpk_print( + f"Directory '{coredns_repo_full_path}' already exists, skip git clone." + ) + return + command_git_clone = ( + 'git clone https://github.com/coredns/deployment.git' + f' {coredns_repo_full_path}' + ) + xpk_print( + "Task: 'Clone deployment' in progress, Target" + f' directory:{coredns_repo_full_path}.' 
+ ) + return_code = run_command_with_updates( + command_git_clone, 'Clone deployment', args + ) + if return_code != 0: + xpk_print(f'Clone deployment error {return_code}') + xpk_exit(return_code) + + +def deploy_coredns_manifests(args, coredns_k8s_path: str): + """Deploys CoreDNS manifests to the cluster.""" + if not os.path.isdir(coredns_k8s_path): + xpk_print( + f"Error:CoreDNS Kubernetes path '{coredns_k8s_path}' does not exist." + ' Has git clone been successful?' + ) + xpk_exit(1) + original_cwd = os.getcwd() + try: + os.chdir(coredns_k8s_path) + xpk_print(f'Current working directory changed to: {os.getcwd()}') + + command_deploy_coredns = './deploy.sh | kubectl apply -f -' + xpk_print( + f"Task: 'Deploy CoreDNS' in progress, Located at '{coredns_k8s_path}'" + ) + return_code = run_command_with_updates( + command_deploy_coredns, 'Deploy CoreDNS', args + ) + if return_code != 0: + xpk_print(f'Deploy CoreDNS error {return_code}') + + finally: + xpk_print(f'Restoring working directory to: {original_cwd}') + os.chdir(original_cwd) + if return_code != 0: + xpk_exit(return_code) + + +def scale_down_deployment( + args, deployment_name: str, namespace: str = 'kube-system' +): + """Scales down a specified Kubernetes deployment to 0 replicas.""" + command = ( + f'kubectl scale deployment {deployment_name} --replicas=0' + f' --namespace={namespace}' + ) + xpk_print(f"Task: 'Scaling down {deployment_name}' in progress") + return_code = run_command_with_updates( + command, f'Scale down {deployment_name}', args + ) + if return_code != 0: + xpk_print(f'Scale down {deployment_name} error {return_code}') + xpk_exit(return_code) + xpk_print(f'\n{deployment_name} has been scaled down.') + + +def scale_up_coredns(args, replicas: int = 15, namespace: str = 'kube-system'): + """Scales up the CoreDNS deployment to a specified number of replicas.""" + command_coredns_scale = ( + f'kubectl scale deployment coredns --replicas={replicas} -n {namespace}' + ) + xpk_print(f"Task: 'Scale 
CoreDNS' in progress (to {replicas} replicas)") + return_code = run_command_with_updates( + command_coredns_scale, 'Scale CoreDNS', args + ) + if return_code != 0: + xpk_print(f'Scale CoreDNS error {return_code}') + xpk_exit(return_code) + + +def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool: + """Check for the existence of a specific Deployment in a given namespace.""" + command = ( + f'kubectl get deployment {deployment_name} -n' + f' {namespace} --ignore-not-found' + ) + result = run_command_with_updates( + command, 'Waiting for kubeDNS to be checked.', args + ) + return result + + +def verify_coredns_readiness( + args, timeout: int = 120, namespace: str = 'kube-system' +): + """Verifies CoreDNS readiness using kubectl wait commands.""" + xpk_print('Now verifying CoreDNS readiness...') + kube_dns_exists = check_deployment_exists(args, 'kube-dns', namespace) + if kube_dns_exists: + # Wait for kube-dns to be fully scaled down + command_kube_dns_wait_scaled_down = ( + 'kubectl wait deployment/kube-dns' + " --for=jsonpath='{.status.replicas}'=0" + f' --namespace={namespace} --timeout={timeout}s' + ) + xpk_print('Verifying if kube-dns has scaled down...') + return_code_kube_dns = run_command_with_updates( + command_kube_dns_wait_scaled_down, 'Wait for kube-dns scale down', args + ) + if return_code_kube_dns != 0: + xpk_print('kube-dns did not scale down successfully within the timeout.') + xpk_exit(1) # Exit if kube-dns cannot scale down + else: + xpk_print('kube-dns has successfully scaled down.') + else: + xpk_print('kube-dns deployment not found.') + # Wait for CoreDNS to be fully scaled up and available + command_coredns_wait_available = ( + 'kubectl wait deployment/coredns --for=condition=Available=true' + f' --namespace={namespace} --timeout={timeout}s' + ) + xpk_print('Verifying if CoreDNS is available...') + return_code_coredns = run_command_with_updates( + command_coredns_wait_available, 'Wait for coredns available', args + ) + 
if return_code_coredns != 0: + xpk_print( + 'CoreDNS verification failed, it might not have fully started within' + ' the timeout.' + ) + xpk_exit(1) # Exit if coredns cannot become available + + xpk_print('CoreDNS has successfully started and passed verification.') + + +def cleanup_coredns_repo(coredns_repo_full_path: str): + """Deletes the cloned CoreDNS deployment directory.""" + xpk_print( + "Task: 'Deleting CoreDNS deployment directory' in progress:" + f' {coredns_repo_full_path}' + ) + try: + shutil.rmtree(coredns_repo_full_path) + xpk_print(f'Successfully deleted directory: {coredns_repo_full_path}') + except OSError as e: + xpk_print(f'Error deleting directory {coredns_repo_full_path}: {e}') + + +def update_coredns(args): + """Updates and deploys CoreDNS within a cluster. + + Args: + args: user provided arguments for running the command. + + Returns: + 0 if successful and 1 otherwise. + """ + coredns_repo_dir = os.path.expanduser('/tmp/') + coredns_repo_dir_name = 'deployment' + coredns_repo_full_path = os.path.join(coredns_repo_dir, coredns_repo_dir_name) + coredns_k8s_path = os.path.join(coredns_repo_full_path, 'kubernetes') + # 1. Install jq + install_jq(args) + + # 2. Clone CoreDNS deployment repository + clone_coredns_deployment_repo(args, coredns_repo_full_path) + + # 3. Deploy CoreDNS to the cluster + deploy_coredns_manifests(args, coredns_k8s_path) + + # 4. Scale down kube-dns-autoscaler + scale_down_deployment(args, 'kube-dns-autoscaler') + + # 5. Scale down kube-dns + scale_down_deployment(args, 'kube-dns') + + # 6. Scale up coredns and verify readiness + scale_up_coredns(args, replicas=15) + verify_coredns_readiness(args, timeout=120) + + xpk_print('The CoreDNS setup process has been completed.') + + # 7. Cleanup + cleanup_coredns_repo(coredns_repo_full_path) + + return 0 + + +def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool: + """Checks if the CoreDNS deployment exists in the given namespace. 
+ + Args: + namespace: The Kubernetes namespace to check for the CoreDNS deployment. + + Returns: + True if the 'coredns' deployment exists, False otherwise. + """ + command = f'kubectl get deployment coredns -n {namespace}' + xpk_print( + "Task: 'Checking CoreDNS deployment existence' in progress for" + f' namespace: {namespace}' + ) + return_code = run_command_with_updates( + command, f'Check CoreDNS deployment in {namespace}', args + ) + if return_code == 0: + verify_coredns_readiness(args) + xpk_print(f"CoreDNS deployment 'coredns' found in namespace '{namespace}'.") + return True + else: + xpk_print( + f"CoreDNS deployment 'coredns' NOT found in namespace '{namespace}' or" + ' an error occurred.' + ) + return False + + +def update_coredns_if_necessary(args) -> int: + """Updates and deploys CoreDNS within the cluster if it's not already present. + + This function checks for the existence of the CoreDNS deployment. + If it's not found, it proceeds to deploy and configure CoreDNS. + + Args: + args: User-provided arguments for running the command. + + Returns: + 0 if successful (CoreDNS was already present or successfully deployed), + and 1 otherwise. + """ + if coredns_deployment_exists(args, namespace='kube-system'): + xpk_print('Skipping CoreDNS deployment since it already exists.') + return 0 + else: + xpk_print('CoreDNS deployment not found. 
Proceeding with CoreDNS setup.') + return update_coredns(args) + + def create_cluster_if_necessary( args, gke_control_plane_version: str, system: SystemCharacteristics ) -> int: @@ -842,6 +1104,7 @@ def run_gke_cluster_create_command( f' {args.custom_cluster_arguments}' f' {rapid_release_cmd}' ' --enable-dns-access' + ' --autoscaling-profile=optimize-utilization' ) enable_ip_alias = False diff --git a/src/xpk/core/capacity.py b/src/xpk/core/capacity.py index 08d17c09b..93f2d672c 100644 --- a/src/xpk/core/capacity.py +++ b/src/xpk/core/capacity.py @@ -232,9 +232,9 @@ def get_capacity_node_selectors_from_capacity_type( case CapacityType.ON_DEMAND.name: node_selector = '' case CapacityType.FLEX_START.name: - node_selector = 'cloud.google.com/gke-queued="true"' + node_selector = 'cloud.google.com/gke-queued: "true"' case CapacityType.SPOT.name: - node_selector = 'cloud.google.com/gke-spot="true"' + node_selector = 'cloud.google.com/gke-spot: "true"' case CapacityType.RESERVATION.name: node_selector = f'cloud.google.com/reservation-name: {args.reservation}' case _: diff --git a/src/xpk/core/config.py b/src/xpk/core/config.py index 67e71fa56..279f6b8d1 100644 --- a/src/xpk/core/config.py +++ b/src/xpk/core/config.py @@ -22,7 +22,7 @@ from ..utils.console import xpk_print # This is the version for XPK PyPI package -__version__ = 'v0.10.1' +__version__ = 'v0.11.0' XPK_CURRENT_VERSION = __version__ XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml') diff --git a/src/xpk/core/jobset.py b/src/xpk/core/jobset.py index 3b53c6a58..135cfda63 100644 --- a/src/xpk/core/jobset.py +++ b/src/xpk/core/jobset.py @@ -81,7 +81,7 @@ limits: memory: {memory_limit_size} requests: - cpu: 500m + cpu: 1000m memory: 128Mi securityContext: allowPrivilegeEscalation: false diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index 31430838d..257ed2bf1 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -244,14 +244,16 @@ periodSeconds: 10 resources: limits: - cpu: 
500m + cpu: 1000m memory: {memory_limit_size} requests: - cpu: 500m + cpu: 1000m memory: 512Mi securityContext: allowPrivilegeEscalation: false volumeMounts: + - mountPath: /visibility + name: visibility - mountPath: /tmp/k8s-webhook-server/serving-certs name: cert readOnly: true @@ -263,6 +265,8 @@ serviceAccountName: kueue-controller-manager terminationGracePeriodSeconds: 10 volumes: + - name: visibility + emptyDir: {{}} - name: cert secret: defaultMode: 420 diff --git a/src/xpk/core/nap.py b/src/xpk/core/nap.py index 9c788d6b4..8ba0d24ce 100644 --- a/src/xpk/core/nap.py +++ b/src/xpk/core/nap.py @@ -42,6 +42,8 @@ management: autoRepair: true autoUpgrade: true +scopes: + - "https://www.googleapis.com/auth/devstorage.read_write" autoprovisioningLocations: {zones} {resource_limits} @@ -106,6 +108,18 @@ def enable_autoprovisioning_on_cluster( xpk_print(f'{task} request returned ERROR {return_code}') return autoprovisioning_config, return_code + command = ( + 'gcloud container clusters update' + f' {args.cluster} --project={args.project}' + f' --region={zone_to_region(args.zone)}' + ' --autoscaling-profile=optimize-utilization' + ) + task = 'Update cluster with autoscaling-profile' + return_code = run_command_with_updates(command, task, args) + if return_code != 0: + xpk_print(f'{task} request returned ERROR {return_code}') + return autoprovisioning_config, return_code + # Update created accelerator node pools to support autoprovisioning. existing_node_pool_names, return_code = get_all_nodepools_programmatic(args) if return_code != 0: @@ -171,11 +185,11 @@ def create_autoprovisioning_config( # is not controlled by NAP. 
cpu_limits = """ minimum: 1 - maximum: 10000 + maximum: 1000000 """ memory_limits = """ minimum: 1 - maximum: 10000 + maximum: 10000000 """ # By default, the maximum chips is set to be the current number of resources used diff --git a/src/xpk/core/nodepool.py b/src/xpk/core/nodepool.py index fccd7c886..cab159f15 100644 --- a/src/xpk/core/nodepool.py +++ b/src/xpk/core/nodepool.py @@ -14,6 +14,7 @@ limitations under the License. """ +from typing import List from ..utils.console import get_user_input, xpk_print from .capacity import ( AUTOPROVISIONING_CONFIG_VALUE, @@ -32,6 +33,8 @@ create_or_update_cluster_configmap, ) from .system_characteristics import AcceleratorType +from functools import reduce +from operator import mul CLOUD_PLATFORM_AUTH_SCOPE_URL = ( '"https://www.googleapis.com/auth/cloud-platform"' @@ -88,20 +91,26 @@ def run_gke_node_pool_create_command( xpk_print('Parsing capacity arguments failed!') return return_code - if system.accelerator_type == AcceleratorType['GPU']: - xpk_print( - f'Creating 1 node pool with {args.num_nodes} nodes of' - f' {system.device_type}\nUnderlyingly, we assume that means: {system}' - ) - desired_node_pool_names = [f'{args.cluster}-np-0'] - else: - xpk_print( - f'Creating {args.num_slices} node pool or pools of' - f' {system.device_type}\nUnderlyingly, we assume that means: {system}' - ) - desired_node_pool_names = [ - f'{args.cluster}-np-{slice_num}' for slice_num in range(args.num_slices) - ] + desired_node_pool_count = ( + 1 + if system.accelerator_type == AcceleratorType['GPU'] + else args.num_slices + ) + message = ( + ( + f'Creating 1 node pool with {args.num_nodes} nodes of' + f' {system.device_type}\nUnderlyingly, we assume that means: {system}' + ) + if system.accelerator_type == AcceleratorType['GPU'] + else ( + f'Creating {args.num_slices} node pool or pools of' + f' {system.device_type}\nUnderlyingly, we assume that means: {system}' + ) + ) + xpk_print(message) + desired_node_pool_names = 
get_desired_node_pool_names( + existing_node_pool_names, args.cluster, desired_node_pool_count + ) node_pools_to_remain = [] delete_commands = [] @@ -275,20 +284,24 @@ def run_gke_node_pool_create_command( f' --host-maintenance-interval={args.host_maintenance_interval}' f' {capacity_args}' ' --enable-gvnic' - f' {args.custom_nodepool_arguments}' ) if system.accelerator_type == AcceleratorType['TPU']: command += f' --node-version={gke_node_pool_version}' + topology_product = reduce( + mul, (int(x) for x in system.topology.split('x')), 1 + ) if capacity_type == CapacityType.FLEX_START: command += ' --num-nodes=0' - else: + elif topology_product > 1: command += f' --num-nodes={system.vms_per_slice}' - command += ' --placement-type=COMPACT --max-pods-per-node 15' command += ( f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}' ) - command += f' --tpu-topology={system.topology}' - command += f' {args.custom_tpu_nodepool_arguments}' + + if topology_product > 1: + command += ' --placement-type=COMPACT --max-pods-per-node 15' + command += f' --tpu-topology={system.topology}' + command += f' {args.custom_tpu_nodepool_arguments}' elif system.accelerator_type == AcceleratorType['GPU']: subnet_prefix = f'{args.cluster}-{zone_to_region(args.zone)}' if capacity_type == CapacityType.FLEX_START: @@ -319,6 +332,8 @@ def run_gke_node_pool_create_command( if args.enable_workload_identity or args.enable_gcsfuse_csi_driver: command += ' --workload-metadata=GKE_METADATA' + command += f' {args.custom_nodepool_arguments}' + task = f'NodepoolCreate-{node_pool_name}' create_commands.append(command) create_task_names.append(task) @@ -594,3 +609,21 @@ def get_nodepool_workload_metadata_mode( return 1, None return 0, nodepool_WI_mode.strip() + + +def get_desired_node_pool_names( + existing_node_pool_names: List[str], + cluster_name: str, + desired_node_pool_count: int, +) -> List[str]: + cluster_node_pools = [ + np + for np in existing_node_pool_names + if 
np.startswith(f'{cluster_name}-np-') + ] + result = set(cluster_node_pools[:desired_node_pool_count]) + i = 0 + while len(result) < desired_node_pool_count: + result.add(f'{cluster_name}-np-{i}') + i += 1 + return list(result) diff --git a/src/xpk/core/scheduling.py b/src/xpk/core/scheduling.py index 8bc18c66d..d8957e133 100644 --- a/src/xpk/core/scheduling.py +++ b/src/xpk/core/scheduling.py @@ -49,7 +49,7 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool: missing_gke_accelerator_type = False if not cluster_config_map.get(system.gke_accelerator): xpk_print( - f'Gke Accelerator Type Check: {args.workload} is requesting' + f'GKE Accelerator Type Check: {args.workload} is requesting' f' {system.gke_accelerator} but cluster only contains' f' {cluster_config_map.keys()}. ' ) diff --git a/src/xpk/core/system_characteristics.py b/src/xpk/core/system_characteristics.py index 48fd2c6f3..5ae4d9746 100644 --- a/src/xpk/core/system_characteristics.py +++ b/src/xpk/core/system_characteristics.py @@ -15,6 +15,8 @@ """ from dataclasses import dataclass +from functools import reduce +from operator import mul AcceleratorType = {'TPU': 1, 'GPU': 2, 'CPU': 3} @@ -91,6 +93,34 @@ def get_system_characteristics_by_device_type( return None, 1 +def get_tpu_system_characteristics_map( + prefix: str, + tensorcores_per_chip: int, + gke_accelerator: str, + machine_type: str, + supported_topologies: list[str], +) -> dict[str, SystemCharacteristics]: + system_characteristics_map = {} + for topology in supported_topologies: + total_chips = reduce(mul, (int(x) for x in topology.split('x')), 1) + num_tensorcores = total_chips * tensorcores_per_chip + chips_per_vm = 1 if total_chips == 1 else 4 + vms_per_slice = total_chips // chips_per_vm + system = SystemCharacteristics( + topology, + vms_per_slice, + gke_accelerator, + machine_type, + chips_per_vm, + AcceleratorType['TPU'], + f'{prefix}-{num_tensorcores}', + ) + system_characteristics_map[f'{prefix}-{topology}'] 
= system + system_characteristics_map[f'{prefix}-{num_tensorcores}'] = system + + return system_characteristics_map + + ################### Subcommand Helper Functions ############################# """ !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! IF YOU MODIFY THE BELOW UserFacingNameToSystemCharacteristics MAP YOU SHOULD @@ -212,1098 +242,254 @@ def get_system_characteristics_by_device_type( 'h100-mega-80gb-8', ), # TPU system characteristics - # v6e - 'v6e-1': SystemCharacteristics( - '1x1', - 1, - 'tpu-v6e-slice', - 'ct6e-standard-1t', - 1, - AcceleratorType['TPU'], - 'v6e-1', + **get_tpu_system_characteristics_map( + 'tpu7x', 2, 'tpu7x', 'tpu7x-standard-1t', ['1x1x1'] ), - 'v6e-4': SystemCharacteristics( - '2x2', + **get_tpu_system_characteristics_map( + 'tpu7x', + 2, + 'tpu7x', + 'tpu7x-standard-4t', + [ + '12x12x12', + '12x12x16', + '12x12x20', + '12x12x24', + '12x12x28', + '12x12x36', + '12x16x16', + '12x16x20', + '12x16x24', + '12x16x28', + '12x20x20', + '12x20x24', + '12x24x24', + '16x16x16', + '16x16x20', + '16x16x24', + '16x16x32', + '16x20x28', + '16x24x24', + '2x2x1', + '2x2x2', + '2x2x4', + '2x4x4', + '4x12x116', + '4x12x12', + '4x12x124', + '4x12x20', + '4x12x28', + '4x12x44', + '4x12x52', + '4x12x68', + '4x12x76', + '4x12x92', + '4x20x20', + '4x20x28', + '4x20x44', + '4x20x52', + '4x20x68', + '4x20x76', + '4x28x28', + '4x28x44', + '4x28x52', + '4x4x116', + '4x4x12', + '4x4x124', + '4x4x148', + '4x4x164', + '4x4x172', + '4x4x188', + '4x4x20', + '4x4x212', + '4x4x236', + '4x4x244', + '4x4x28', + '4x4x4', + '4x4x44', + '4x4x52', + '4x4x68', + '4x4x76', + '4x4x8', + '4x4x92', + '4x8x116', + '4x8x12', + '4x8x124', + '4x8x148', + '4x8x164', + '4x8x172', + '4x8x188', + '4x8x20', + '4x8x28', + '4x8x44', + '4x8x52', + '4x8x68', + '4x8x76', + '4x8x8', + '4x8x92', + '8x12x12', + '8x12x16', + '8x12x20', + '8x12x28', + '8x12x44', + '8x12x52', + '8x16x16', + '8x16x20', + '8x16x28', + '8x16x44', + '8x20x20', + '8x20x28', + '8x8x12', + 
'8x8x16', + '8x8x20', + '8x8x28', + '8x8x44', + '8x8x52', + '8x8x68', + '8x8x76', + '8x8x8', + '8x8x92', + ], + ), + **get_tpu_system_characteristics_map( + 'v6e', 1, 'tpu-v6e-slice', 'ct6e-standard-1t', ['1x1'] + ), + **get_tpu_system_characteristics_map( + 'v6e', 1, 'tpu-v6e-slice', 'ct6e-standard-4t', - 4, - AcceleratorType['TPU'], - 'v6e-4', + ['2x2', '2x4', '4x4', '4x8', '8x8', '8x16', '16x16'], ), - 'v6e-8': SystemCharacteristics( - '2x4', + **get_tpu_system_characteristics_map( + 'v5p', 2, - 'tpu-v6e-slice', - 'ct6e-standard-4t', - 4, - AcceleratorType['TPU'], - 'v6e-8', - ), - 'v6e-16': SystemCharacteristics( - '4x4', - 4, - 'tpu-v6e-slice', - 'ct6e-standard-4t', - 4, - AcceleratorType['TPU'], - 'v6e-16', - ), - 'v6e-32': SystemCharacteristics( - '4x8', - 8, - 'tpu-v6e-slice', - 'ct6e-standard-4t', - 4, - AcceleratorType['TPU'], - 'v6e-32', - ), - 'v6e-64': SystemCharacteristics( - '8x8', - 16, - 'tpu-v6e-slice', - 'ct6e-standard-4t', - 4, - AcceleratorType['TPU'], - 'v6e-64', - ), - 'v6e-128': SystemCharacteristics( - '8x16', - 32, - 'tpu-v6e-slice', - 'ct6e-standard-4t', - 4, - AcceleratorType['TPU'], - 'v6e-128', - ), - 'v6e-256': SystemCharacteristics( - '16x16', - 64, - 'tpu-v6e-slice', - 'ct6e-standard-4t', - 4, - AcceleratorType['TPU'], - 'v6e-256', - ), - # v5p - 'v5p-8': SystemCharacteristics( - '2x2x1', - 1, 'tpu-v5p-slice', 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8', + [ + '2x2x1', + '2x2x2', + '2x2x4', + '2x4x4', + '4x4x4', + '4x4x8', + '4x4x12', + '4x8x8', + '4x4x20', + '4x8x12', + '4x4x28', + '8x8x8', + '4x12x12', + '4x8x20', + '4x4x44', + '8x8x12', + '4x4x52', + '4x8x28', + '4x12x20', + '8x8x16', + '4x4x68', + '8x12x12', + '4x4x76', + '8x8x20', + '4x12x28', + '4x8x44', + '4x4x92', + '8x12x16', + '4x20x20', + '4x8x52', + '12x12x12', + '8x8x28', + '4x4x116', + '8x12x20', + '4x4x124', + '8x16x16', + '4x12x44', + '4x8x68', + '4x20x28', + '12x12x16', + '4x4x148', + '4x8x76', + '4x12x52', + '8x16x20', + '4x4x164', + '8x12x28', + 
'4x4x172', + '8x8x44', + '12x12x20', + '4x8x92', + '4x4x188', + '12x16x16', + '4x28x28', + '8x20x20', + '4x12x68', + '8x8x52', + '4x4x212', + '12x12x24', + '4x20x44', + '8x16x28', + '4x12x76', + '4x8x116', + '4x4x236', + '12x16x20', + '4x4x244', + '4x8x124', + '12x12x28', + '16x16x16', + '4x20x52', + '8x12x44', + '8x8x68', + '4x12x92', + '8x20x28', + '12x16x24', + '4x8x148', + '12x20x20', + '8x8x76', + '4x28x44', + '8x12x52', + '16x16x20', + '12x12x36', + '4x8x164', + '12x16x28', + '4x20x68', + '4x8x172', + '4x12x116', + '8x16x44', + '12x20x24', + '4x28x52', + '8x8x92', + '4x12x124', + '4x8x188', + '4x20x76', + '16x16x24', + '12x24x24', + '16x20x28', + ], + ), + **get_tpu_system_characteristics_map( + 'v5litepod', + 1, + 'tpu-v5-lite-podslice', + 'ct5lp-hightpu-4t', + ['2x4', '4x4', '4x8', '8x8', '8x16', '16x16'], ), - 'v5p-16': SystemCharacteristics( - '2x2x2', + **get_tpu_system_characteristics_map( + 'v4', 2, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-16', - ), - 'v5p-32': SystemCharacteristics( - '2x2x4', - 4, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-32', - ), - 'v5p-64': SystemCharacteristics( - '2x4x4', - 8, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-64', - ), - 'v5p-128': SystemCharacteristics( - '4x4x4', - 16, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-128', - ), - 'v5p-256': SystemCharacteristics( - '4x4x8', - 32, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-256', - ), - 'v5p-384': SystemCharacteristics( - '4x4x12', - 48, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-384', - ), - 'v5p-512': SystemCharacteristics( - '4x8x8', - 64, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-512', - ), - 'v5p-640': SystemCharacteristics( - '4x4x20', - 80, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-640', 
- ), - 'v5p-768': SystemCharacteristics( - '4x8x12', - 96, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-768', - ), - 'v5p-896': SystemCharacteristics( - '4x4x28', - 112, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-896', - ), - 'v5p-1024': SystemCharacteristics( - '8x8x8', - 128, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1024', - ), - 'v5p-1152': SystemCharacteristics( - '4x12x12', - 144, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1152', - ), - 'v5p-1280': SystemCharacteristics( - '4x8x20', - 160, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1280', - ), - 'v5p-1408': SystemCharacteristics( - '4x4x44', - 176, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1408', - ), - 'v5p-1536': SystemCharacteristics( - '8x8x12', - 192, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1536', - ), - 'v5p-1664': SystemCharacteristics( - '4x4x52', - 208, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1664', - ), - 'v5p-1792': SystemCharacteristics( - '4x8x28', - 224, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1792', - ), - 'v5p-1920': SystemCharacteristics( - '4x12x20', - 240, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1920', - ), - 'v5p-2048': SystemCharacteristics( - '8x8x16', - 256, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2048', - ), - 'v5p-2176': SystemCharacteristics( - '4x4x68', - 272, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2176', - ), - 'v5p-2304': SystemCharacteristics( - '8x12x12', - 288, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2304', - ), - 'v5p-2432': SystemCharacteristics( - '4x4x76', - 304, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 
4, - AcceleratorType['TPU'], - 'v5p-2432', - ), - 'v5p-2560': SystemCharacteristics( - '8x8x20', - 320, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2560', - ), - 'v5p-2688': SystemCharacteristics( - '4x12x28', - 336, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2688', - ), - 'v5p-2816': SystemCharacteristics( - '4x8x44', - 352, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2816', - ), - 'v5p-2944': SystemCharacteristics( - '4x4x92', - 368, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2944', - ), - 'v5p-3072': SystemCharacteristics( - '8x12x16', - 384, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3072', - ), - 'v5p-3200': SystemCharacteristics( - '4x20x20', - 400, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3200', - ), - 'v5p-3328': SystemCharacteristics( - '4x8x52', - 416, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3328', - ), - 'v5p-3456': SystemCharacteristics( - '12x12x12', - 432, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3456', - ), - 'v5p-3584': SystemCharacteristics( - '8x8x28', - 448, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3584', - ), - 'v5p-3712': SystemCharacteristics( - '4x4x116', - 464, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3712', - ), - 'v5p-3840': SystemCharacteristics( - '8x12x20', - 480, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3840', - ), - 'v5p-3968': SystemCharacteristics( - '4x4x124', - 496, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3968', - ), - 'v5p-4096': SystemCharacteristics( - '8x16x16', - 512, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4096', - ), - 'v5p-4224': SystemCharacteristics( - 
'4x12x44', - 528, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4224', - ), - 'v5p-4352': SystemCharacteristics( - '4x8x68', - 544, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4352', - ), - 'v5p-4480': SystemCharacteristics( - '4x20x28', - 560, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4480', - ), - 'v5p-4608': SystemCharacteristics( - '12x12x16', - 576, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4608', - ), - 'v5p-4736': SystemCharacteristics( - '4x4x148', - 592, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4736', - ), - 'v5p-4864': SystemCharacteristics( - '4x8x76', - 608, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4864', - ), - 'v5p-4992': SystemCharacteristics( - '4x12x52', - 624, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4992', - ), - 'v5p-5120': SystemCharacteristics( - '8x16x20', - 640, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-5120', - ), - 'v5p-5248': SystemCharacteristics( - '4x4x164', - 656, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-5248', - ), - 'v5p-5376': SystemCharacteristics( - '8x12x28', - 672, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-5376', - ), - 'v5p-5504': SystemCharacteristics( - '4x4x172', - 688, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-5504', - ), - 'v5p-5632': SystemCharacteristics( - '8x8x44', - 704, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-5632', - ), - 'v5p-5760': SystemCharacteristics( - '12x12x20', - 720, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-5760', - ), - 'v5p-5888': SystemCharacteristics( - '4x8x92', - 736, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 
'v5p-5888', - ), - 'v5p-6016': SystemCharacteristics( - '4x4x188', - 752, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6016', - ), - 'v5p-6144': SystemCharacteristics( - '12x16x16', - 768, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6144', - ), - 'v5p-6272': SystemCharacteristics( - '4x28x28', - 784, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6272', - ), - 'v5p-6400': SystemCharacteristics( - '8x20x20', - 800, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6400', - ), - 'v5p-6528': SystemCharacteristics( - '4x12x68', - 816, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6528', - ), - 'v5p-6656': SystemCharacteristics( - '8x8x52', - 832, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6656', - ), - 'v5p-6784': SystemCharacteristics( - '4x4x212', - 848, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6784', - ), - 'v5p-6912': SystemCharacteristics( - '12x12x24', - 864, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6912', - ), - 'v5p-7040': SystemCharacteristics( - '4x20x44', - 880, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7040', - ), - 'v5p-7168': SystemCharacteristics( - '8x16x28', - 896, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7168', - ), - 'v5p-7296': SystemCharacteristics( - '4x12x76', - 912, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7296', - ), - 'v5p-7424': SystemCharacteristics( - '4x8x116', - 928, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7424', - ), - 'v5p-7552': SystemCharacteristics( - '4x4x236', - 944, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7552', - ), - 'v5p-7680': SystemCharacteristics( - '12x16x20', - 960, - 
'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7680', - ), - 'v5p-7808': SystemCharacteristics( - '4x4x244', - 976, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7808', - ), - 'v5p-7936': SystemCharacteristics( - '4x8x124', - 992, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7936', - ), - 'v5p-8064': SystemCharacteristics( - '12x12x28', - 1008, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8064', - ), - 'v5p-8192': SystemCharacteristics( - '16x16x16', - 1024, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8192', - ), - 'v5p-8320': SystemCharacteristics( - '4x20x52', - 1040, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8320', - ), - 'v5p-8448': SystemCharacteristics( - '8x12x44', - 1056, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8448', - ), - 'v5p-8704': SystemCharacteristics( - '8x8x68', - 1088, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8704', - ), - 'v5p-8832': SystemCharacteristics( - '4x12x92', - 1104, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8832', - ), - 'v5p-8960': SystemCharacteristics( - '8x20x28', - 1120, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8960', - ), - 'v5p-9216': SystemCharacteristics( - '12x16x24', - 1152, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-9216', - ), - 'v5p-9472': SystemCharacteristics( - '4x8x148', - 1184, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-9472', - ), - 'v5p-9600': SystemCharacteristics( - '12x20x20', - 1200, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-9600', - ), - 'v5p-9728': SystemCharacteristics( - '8x8x76', - 1216, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 
'v5p-9728', - ), - 'v5p-9856': SystemCharacteristics( - '4x28x44', - 1232, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-9856', - ), - 'v5p-9984': SystemCharacteristics( - '8x12x52', - 1248, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-9984', - ), - 'v5p-10240': SystemCharacteristics( - '16x16x20', - 1280, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-10240', - ), - 'v5p-10368': SystemCharacteristics( - '12x12x36', - 1296, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-10368', - ), - 'v5p-10496': SystemCharacteristics( - '4x8x164', - 1312, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-10496', - ), - 'v5p-10752': SystemCharacteristics( - '12x16x28', - 1344, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-10752', - ), - 'v5p-10880': SystemCharacteristics( - '4x20x68', - 1360, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-10880', - ), - 'v5p-11008': SystemCharacteristics( - '4x8x172', - 1376, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-11008', - ), - 'v5p-11136': SystemCharacteristics( - '4x12x116', - 1392, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-11136', - ), - 'v5p-11264': SystemCharacteristics( - '8x16x44', - 1408, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-11264', - ), - 'v5p-11520': SystemCharacteristics( - '12x20x24', - 1440, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-11520', - ), - 'v5p-11648': SystemCharacteristics( - '4x28x52', - 1456, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-11648', - ), - 'v5p-11776': SystemCharacteristics( - '8x8x92', - 1472, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-11776', - ), - 'v5p-11904': 
SystemCharacteristics( - '4x12x124', - 1488, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-11904', - ), - 'v5p-12032': SystemCharacteristics( - '4x8x188', - 1504, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-12032', - ), - 'v5p-12160': SystemCharacteristics( - '4x20x76', - 1520, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-12160', - ), - 'v5p-12288': SystemCharacteristics( - '16x16x24', - 1536, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-12288', - ), - 'v5p-13824': SystemCharacteristics( - '12x24x24', - 1728, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-13824', - ), - 'v5p-17920': SystemCharacteristics( - '16x20x28', - 2240, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-17920', - ), - # v5litepod - 'v5litepod-8': SystemCharacteristics( - '2x4', - 2, - 'tpu-v5-lite-podslice', - 'ct5lp-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5litepod-8', - ), - 'v5litepod-16': SystemCharacteristics( - '4x4', - 4, - 'tpu-v5-lite-podslice', - 'ct5lp-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5litepod-16', - ), - 'v5litepod-32': SystemCharacteristics( - '4x8', - 8, - 'tpu-v5-lite-podslice', - 'ct5lp-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5litepod-32', - ), - 'v5litepod-64': SystemCharacteristics( - '8x8', - 16, - 'tpu-v5-lite-podslice', - 'ct5lp-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5litepod-64', - ), - 'v5litepod-128': SystemCharacteristics( - '8x16', - 32, - 'tpu-v5-lite-podslice', - 'ct5lp-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5litepod-128', - ), - 'v5litepod-256': SystemCharacteristics( - '16x16', - 64, - 'tpu-v5-lite-podslice', - 'ct5lp-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5litepod-256', - ), - # v4 - 'v4-8': SystemCharacteristics( - '2x2x1', - 1, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-8', - ), - 
'v4-16': SystemCharacteristics( - '2x2x2', - 2, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-16', - ), - 'v4-32': SystemCharacteristics( - '2x2x4', - 4, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-32', - ), - 'v4-64': SystemCharacteristics( - '2x4x4', - 8, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-64', - ), - 'v4-128': SystemCharacteristics( - '4x4x4', - 16, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-128', - ), - 'v4-256': SystemCharacteristics( - '4x4x8', - 32, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-256', - ), - 'v4-512': SystemCharacteristics( - '4x8x8', - 64, 'tpu-v4-podslice', 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-512', - ), - 'v4-1024': SystemCharacteristics( - '8x8x8', - 128, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-1024', - ), - 'v4-1536': SystemCharacteristics( - '8x8x12', - 192, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-1536', - ), - 'v4-2048': SystemCharacteristics( - '8x8x16', - 256, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-2048', - ), - 'v4-4096': SystemCharacteristics( - '8x16x16', - 512, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-4096', + [ + '2x2x1', + '2x2x2', + '2x2x4', + '2x4x4', + '4x4x4', + '4x4x8', + '4x8x8', + '8x8x8', + '8x8x12', + '8x8x16', + '8x16x16', + ], ), # CPU system characteristics. # Note that chips_per_vm is actually the number of vCPUs in that CPU. 
diff --git a/src/xpk/core/tests/unit/test_nodepool.py b/src/xpk/core/tests/unit/test_nodepool.py new file mode 100644 index 000000000..71cc540c3 --- /dev/null +++ b/src/xpk/core/tests/unit/test_nodepool.py @@ -0,0 +1,82 @@ +""" +Copyright 2025 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from xpk.core.nodepool import get_desired_node_pool_names + +CLUSTER_NAME = "running-cucumber" + + +def node_pool_name(number: int) -> str: + return f"{CLUSTER_NAME}-np-{number}" + + +def test_compute_desired_node_pool_names_with_desired_larger_than_existing(): + result = get_desired_node_pool_names( + existing_node_pool_names=[node_pool_name(0)], + cluster_name=CLUSTER_NAME, + desired_node_pool_count=2, + ) + + expected_result = [node_pool_name(0), node_pool_name(1)] + assert set(result) == set(expected_result) + + +def test_compute_desired_node_pool_names_with_desired_smaller_than_existing(): + result = get_desired_node_pool_names( + existing_node_pool_names=[node_pool_name(0), node_pool_name(1)], + cluster_name=CLUSTER_NAME, + desired_node_pool_count=1, + ) + + expected_result = [node_pool_name(0)] + assert set(result) == set(expected_result) + + +def test_compute_desired_node_pool_names_with_consecutive_numbers_missing(): + result = get_desired_node_pool_names( + existing_node_pool_names=[node_pool_name(0), node_pool_name(3)], + cluster_name=CLUSTER_NAME, + desired_node_pool_count=3, + ) + + expected_result = [node_pool_name(0), node_pool_name(1), node_pool_name(3)] + assert 
set(result) == set(expected_result) + + +def test_compute_desired_node_pool_names_with_consecutive_numbers_missing_and_desired_equal_to_existing(): + result = get_desired_node_pool_names( + existing_node_pool_names=[node_pool_name(0), node_pool_name(3)], + cluster_name=CLUSTER_NAME, + desired_node_pool_count=2, + ) + + expected_result = [node_pool_name(0), node_pool_name(3)] + assert set(result) == set(expected_result) + + +def test_compute_desired_node_pool_names_with_unknown_node_pools(): + result = get_desired_node_pool_names( + existing_node_pool_names=[ + "unknown-node-pool", + node_pool_name(0), + node_pool_name(3), + ], + cluster_name=CLUSTER_NAME, + desired_node_pool_count=2, + ) + + expected_result = [node_pool_name(0), node_pool_name(3)] + assert set(result) == set(expected_result)