diff --git a/goldens.yaml b/goldens.yaml
index 692f659f4..61d299935 100644
--- a/goldens.yaml
+++ b/goldens.yaml
@@ -9,6 +9,8 @@ goldens:
     command: xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-8 --spot --cpu-limit=1 --memory-limit=1Mi --dry-run
   "Cluster create with CPU and memory limits above capacity":
     command: xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-8 --spot --cpu-limit=20 --memory-limit=1Gi --dry-run
+  "Cluster create with shared reservation":
+    command: xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-8 --reservation=projects/reservation-project/reservations/golden-reservation --dry-run
   "Cluster create with gb200-4":
     command: xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --device-type=gb200-4 --reservation=golden-reservation --dry-run
   "Cluster create private":
diff --git a/goldens/Cluster_create_with_shared_reservation.txt b/goldens/Cluster_create_with_shared_reservation.txt
new file mode 100644
index 000000000..490da9aba
--- /dev/null
+++ b/goldens/Cluster_create_with_shared_reservation.txt
@@ -0,0 +1,205 @@
+$ xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-8 --reservation=projects/reservation-project/reservations/golden-reservation --dry-run
+[XPK] Starting xpk v0.14.3
+[XPK] Starting cluster create for cluster golden-cluster:
+[XPK] Working on golden-project and us-central1-a
+[XPK] Task: `Determine server supported GKE versions for default rapid gke version` is implemented by the following command not running since it is a dry run.
+gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)"
+[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run.
+gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.validVersions)"
+[XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run.
+gcloud container clusters list --project=golden-project --filter=location~"us-central1.*" --format="csv[no-heading](name)"
+[XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run.
+gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --location-policy=BALANCED --scopes=storage-full,gke-default
+[XPK] Task: `Find cluster region or zone` is implemented by the following command not running since it is a dry run.
+gcloud container clusters list --project=golden-project --filter=name=golden-cluster --format="value(location)"
+[XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run.
+gcloud container clusters describe golden-cluster --project=golden-project --location=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)"
+[XPK] Private Nodes is not enabled on the cluster.
+[XPK] Cluster is public and no need to authorize networks.
+[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster
+[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run.
+gcloud container clusters get-credentials golden-cluster --location=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default
+[XPK] Testing credentials with kubectl...
+[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run.
+kubectl get pods
+[XPK] Credentials test succeeded.
+[XPK] Finished get-credentials and kubectl setup.
+[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system
+[XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run.
+kubectl get deployment coredns -n kube-system
+[XPK] Now verifying CoreDNS readiness...
+[XPK] Task: `Waiting for kubeDNS to be checked.` is implemented by the following command not running since it is a dry run.
+kubectl get deployment kube-dns -n kube-system --ignore-not-found
+[XPK] kube-dns deployment not found.
+[XPK] Verifying if CoreDNS is available...
+[XPK] Task: `Wait for coredns available` is implemented by the following command not running since it is a dry run.
+kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-system --timeout=240s
+[XPK] CoreDNS has successfully started and passed verification.
+[XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'.
+[XPK] Skipping CoreDNS deployment since it already exists.
+[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run.
+gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)"
+[XPK] Creating 1 node pool or pools of tpu7x-8
+We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, requires_workload_policy=True)
+[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run.
+gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)"
+[XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run.
+gcloud beta compute reservations describe golden-reservation --project=reservation-project --zone=us-central1-a
+[XPK] Creating 1 node pool or pools of tpu7x-8
+Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, requires_workload_policy=True)
+[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run.
+gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)"
+[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run.
+kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true
+[XPK] Existing node pool names ['0']
+[XPK] Task: `Retrieve resource policy` is implemented by the following command not running since it is a dry run.
+gcloud compute resource-policies describe tpu7x-8-2x2x1-placement-policy --project=golden-project --region=us-central1
+[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=projects/reservation-project/reservations/golden-reservation --placement-policy=tpu7x-8-2x2x1-placement-policy --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15
+[XPK] Breaking up a total of 1 commands into 1 batches
+[XPK] Pretending all the jobs succeeded
+[XPK] Create or delete node pool request complete.
+[XPK] Creating ConfigMap for cluster
+[XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run.
+gcloud beta compute reservations describe golden-reservation --project=reservation-project --zone=us-central1-a
+[XPK] Breaking up a total of 2 commands into 1 batches
+[XPK] Pretending all the jobs succeeded
+[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available
+[XPK] Try 1: Install Jobset on golden-cluster
+[XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run.
+kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml
+[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
+kubectl get node --no-headers | wc -l
+[XPK] Try 1: Updating jobset Controller Manager resources
+[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run.
+kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95
+[XPK] Try 1: Install PathwaysJob on golden-cluster
+[XPK] Task: `Install PathwaysJob on golden-cluster` is implemented by the following command not running since it is a dry run.
+kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.4/install.yaml
+[XPK] Enabling Kueue on the cluster
+[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run.
+kubectl get deployment kueue-controller-manager -n kueue-system -o jsonpath='{.spec.template.spec.containers[0].image}'
+[XPK] Installing Kueue version v0.14.3...
+[XPK] Try 1: Install Kueue
+[XPK] Task: `Install Kueue` is implemented by the following command not running since it is a dry run.
+kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.3/manifests.yaml
+[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run.
+kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m
+[XPK] Applying following Kueue resources:
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ResourceFlavor
+metadata:
+  name: "1xtpu7x-8"
+spec:
+  nodeLabels: {"cloud.google.com/gke-tpu-accelerator": "tpu7x", "cloud.google.com/gke-tpu-topology": "2x2x1"}
+
+---
+
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: AdmissionCheck
+metadata:
+  name: dws-prov
+spec:
+  controllerName: kueue.x-k8s.io/provisioning-request
+  parameters:
+    apiGroup: kueue.x-k8s.io
+    kind: ProvisioningRequestConfig
+    name: dws-config
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ProvisioningRequestConfig
+metadata:
+  name: dws-config
+spec:
+  provisioningClassName: queued-provisioning.gke.io
+  podSetUpdates:
+    nodeSelector:
+      - key: autoscaling.gke.io/provisioning-request
+        valueFromProvisioningClassDetail: ResizeRequestName
+  managedResources:
+    - google.com/tpu
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ClusterQueue
+metadata:
+  name: "cluster-queue"
+spec:
+  preemption:
+    reclaimWithinCohort: Never # Don't preempt other queues in the cohort.
+    withinClusterQueue: LowerPriority
+  namespaceSelector: {} # match all.
+  resourceGroups: [{'coveredResources': ['google.com/tpu'], 'flavors': [{'name': '1xtpu7x-8', 'resources': [{'name': 'google.com/tpu', 'nominalQuota': 4}]}]}]
+
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: LocalQueue
+metadata:
+  namespace: default
+  name: multislice-queue
+spec:
+  clusterQueue: cluster-queue
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: very-low
+value: 100
+globalDefault: false
+description: "Very Low"
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: low
+value: 250
+globalDefault: false
+description: "Low"
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: medium
+value: 500
+globalDefault: false
+description: "Medium"
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: high
+value: 750
+globalDefault: false
+description: "High"
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: very-high
+value: 1000
+globalDefault: false
+description: "Very High"
+[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run.
+kubectl apply -f ce52d2868b681f478f3f12e5696b1609e68b442a32f7f82603ba7064b825cf4f
+[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
+kubectl get node --no-headers | wc -l
+[XPK] Try 1: Updating Kueue Controller Manager resources
+[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
+kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
+[XPK] Verifying kjob installation
+[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run.
+kubectl-kjob help
+[XPK] kjob found
+[XPK] Applying kjob CDRs
+[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run.
+kubectl kjob printcrds | kubectl apply --server-side -f -
+[XPK] Creating kjob CRDs succeeded
+[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run.
+kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true
+[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run.
+kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61
+[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run.
+kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8
+[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run.
+kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486
+[XPK] GKE commands done! Resources are created.
+[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
+[XPK] Exiting XPK cleanly
diff --git a/src/xpk/core/capacity.py b/src/xpk/core/capacity.py
index fdf8033c2..ffe44d152 100644
--- a/src/xpk/core/capacity.py
+++ b/src/xpk/core/capacity.py
@@ -115,9 +115,12 @@ def get_reservation_maintenance_interval(
   Returns:
     0 if successful and 1 otherwise.
   """
+  reservation_project, reservation_name = get_reservation_project_and_name(
+      reservation, project
+  )
   command = (
-      f'gcloud beta compute reservations describe {reservation}'
-      f' --project={project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"'
+      f'gcloud beta compute reservations describe {reservation_name}'
+      f' --project={reservation_project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"'
   )
   return_code, output = run_command_for_value(
       command, 'Get reservation maintenance interval'
   )
@@ -139,9 +142,12 @@ def get_reservation_placement_policy(
   Returns:
     0 if successful and 1 otherwise.
   """
+  reservation_project, reservation_name = get_reservation_project_and_name(
+      reservation, project
+  )
   command = (
-      f'gcloud beta compute reservations describe {reservation}'
-      f' --project={project} --zone={zone} --format="value(resourcePolicies.policy)"'
+      f'gcloud beta compute reservations describe {reservation_name}'
+      f' --project={reservation_project} --zone={zone} --format="value(resourcePolicies.policy)"'
   )
   return_code, output = run_command_for_value(
       command, 'Get reservation placement policy'
   )
@@ -156,9 +162,12 @@ def get_reservation_deployment_type(
     reservation: str, zone: str, project: str
 ) -> str:
   """Get reservation deployment type."""
+  reservation_project, reservation_name = get_reservation_project_and_name(
+      reservation, project
+  )
   command = (
-      f'gcloud beta compute reservations describe {reservation}'
-      f' --project={project} --zone={zone} --format="value(deploymentType)"'
+      f'gcloud beta compute reservations describe {reservation_name}'
+      f' --project={reservation_project} --zone={zone} --format="value(deploymentType)"'
   )
   return_code, output = run_command_for_value(
       command, 'Get reservation deployment type', dry_run_return_val='DENSE'
   )
@@ -178,9 +187,12 @@ def verify_reservation_exists(args) -> int:
   Returns:
     0 if successful and 1 otherwise.
""" + reservation_project, reservation_name = get_reservation_project_and_name( + args.reservation, args.project + ) command = ( - f'gcloud beta compute reservations describe {args.reservation}' - f' --project={args.project} --zone={args.zone}' + f'gcloud beta compute reservations describe {reservation_name}' + f' --project={reservation_project} --zone={args.zone}' ) return_code = run_command_with_updates(command, 'Describe reservation') if return_code != 0: @@ -264,3 +276,29 @@ def get_capacity_node_selectors_from_capacity_type( ) return_code = 1 return node_selector, return_code + + +def get_reservation_project_and_name( + reservation_name_or_path: str, cluster_project: str +) -> tuple[str, str]: + """Get the reservation project and name. + + Args: + reservation_name_or_path: either reservation name or reservation path in format + projects/RESERVATION_PROJECT_ID/reservations/RESERVATION_NAME + cluster_project: the cluster project + + Returns: + Tuple with reservation project and reservation name. + """ + if '/' not in reservation_name_or_path: + return cluster_project, reservation_name_or_path + reservation_parts = reservation_name_or_path.split('/') + if ( + len(reservation_parts) != 4 + or reservation_parts[0] != 'projects' + or reservation_parts[2] != 'reservations' + ): + xpk_print('Unable to parse reservation: ', reservation_name_or_path) + xpk_exit(1) + return reservation_parts[1], reservation_parts[3] diff --git a/src/xpk/core/capacity_test.py b/src/xpk/core/capacity_test.py index 302608b1d..bdb09af49 100644 --- a/src/xpk/core/capacity_test.py +++ b/src/xpk/core/capacity_test.py @@ -16,7 +16,7 @@ import pytest from unittest.mock import MagicMock, patch -from .capacity import get_reservation_deployment_type +from .capacity import get_reservation_deployment_type, get_reservation_project_and_name @patch('xpk.core.capacity.xpk_print') @@ -48,3 +48,34 @@ def test_get_reservation_deployment_type_returns_deployment_type_when_command_su reservation='reservation', zone='zone', project='project' ) assert result == 'DENSE' + + +def test_get_reservation_project_and_name_parses_local_reservation(): + project, name = get_reservation_project_and_name( + 'test-reservation', 'cluster-project' + ) + + assert project == 'cluster-project' + assert name == 'test-reservation' + + +def test_get_reservation_project_and_name_parses_shared_reservation(): + project, name = get_reservation_project_and_name( + 'projects/reservation-project/reservations/test-reservation', + 'cluster-project', + ) + + assert project == 'reservation-project' + assert name == 'test-reservation' + + +@patch('xpk.core.capacity.xpk_print') +def test_get_reservation_project_and_name_fails_for_invalid_reservation( + xpk_print: MagicMock, mocker +): + with pytest.raises(SystemExit): + get_reservation_project_and_name( + 'invalid/reservation', + 'cluster-project', + ) + assert 'Unable to parse reservation' in xpk_print.mock_calls[0].args[0]