2 changes: 2 additions & 0 deletions goldens.yaml
@@ -9,6 +9,8 @@ goldens:
     command: xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-8 --spot --cpu-limit=1 --memory-limit=1Mi --dry-run
   "Cluster create with CPU and memory limits above capacity":
     command: xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-8 --spot --cpu-limit=20 --memory-limit=1Gi --dry-run
+  "Cluster create with shared reservation":
+    command: xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-8 --reservation=projects/reservation-project/reservations/golden-reservation --dry-run
   "Cluster create with gb200-4":
     command: xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --device-type=gb200-4 --reservation=golden-reservation --dry-run
   "Cluster create private":
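The new golden exercises the shared form of --reservation, which names the owning project explicitly; the neighboring goldens keep the bare-name form, which is resolved against the cluster's own project. A minimal sketch of the two accepted shapes, reusing the fixture names from the goldens above (not real resources):

# The two --reservation shapes covered by these goldens: a bare name
# resolves in the cluster's project, a full path pins the owning project.
LOCAL_RESERVATION = 'golden-reservation'
SHARED_RESERVATION = 'projects/reservation-project/reservations/golden-reservation'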
205 changes: 205 additions & 0 deletions goldens/Cluster_create_with_shared_reservation.txt
@@ -0,0 +1,205 @@
$ xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-8 --reservation=projects/reservation-project/reservations/golden-reservation --dry-run
[XPK] Starting xpk v0.14.3
[XPK] Starting cluster create for cluster golden-cluster:
[XPK] Working on golden-project and us-central1-a
[XPK] Task: `Determine server supported GKE versions for default rapid gke version` is implemented by the following command not running since it is a dry run.
gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)"
[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run.
gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.validVersions)"
[XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run.
gcloud container clusters list --project=golden-project --filter=location~"us-central1.*" --format="csv[no-heading](name)"
[XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run.
gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --location-policy=BALANCED --scopes=storage-full,gke-default
[XPK] Task: `Find cluster region or zone` is implemented by the following command not running since it is a dry run.
gcloud container clusters list --project=golden-project --filter=name=golden-cluster --format="value(location)"
[XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run.
gcloud container clusters describe golden-cluster --project=golden-project --location=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)"
[XPK] Private Nodes is not enabled on the cluster.
[XPK] Cluster is public and no need to authorize networks.
[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster
[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run.
gcloud container clusters get-credentials golden-cluster --location=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default
[XPK] Testing credentials with kubectl...
[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run.
kubectl get pods
[XPK] Credentials test succeeded.
[XPK] Finished get-credentials and kubectl setup.
[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system
[XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run.
kubectl get deployment coredns -n kube-system
[XPK] Now verifying CoreDNS readiness...
[XPK] Task: `Waiting for kubeDNS to be checked.` is implemented by the following command not running since it is a dry run.
kubectl get deployment kube-dns -n kube-system --ignore-not-found
[XPK] kube-dns deployment not found.
[XPK] Verifying if CoreDNS is available...
[XPK] Task: `Wait for coredns available` is implemented by the following command not running since it is a dry run.
kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-system --timeout=240s
[XPK] CoreDNS has successfully started and passed verification.
[XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'.
[XPK] Skipping CoreDNS deployment since it already exists.
[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run.
gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)"
[XPK] Creating 1 node pool or pools of tpu7x-8
We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, requires_workload_policy=True)
[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run.
gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)"
[XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run.
gcloud beta compute reservations describe golden-reservation --project=reservation-project --zone=us-central1-a
[XPK] Creating 1 node pool or pools of tpu7x-8
Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, requires_workload_policy=True)
[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run.
gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)"
[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run.
kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true
[XPK] Existing node pool names ['0']
[XPK] Task: `Retrieve resource policy` is implemented by the following command not running since it is a dry run.
gcloud compute resource-policies describe tpu7x-8-2x2x1-placement-policy --project=golden-project --region=us-central1
[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=projects/reservation-project/reservations/golden-reservation --placement-policy=tpu7x-8-2x2x1-placement-policy --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15
[XPK] Breaking up a total of 1 commands into 1 batches
[XPK] Pretending all the jobs succeeded
[XPK] Create or delete node pool request complete.
[XPK] Creating ConfigMap for cluster
[XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run.
gcloud beta compute reservations describe golden-reservation --project=reservation-project --zone=us-central1-a
[XPK] Breaking up a total of 2 commands into 1 batches
[XPK] Pretending all the jobs succeeded
[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available
[XPK] Try 1: Install Jobset on golden-cluster
[XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run.
kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml
[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
kubectl get node --no-headers | wc -l
[XPK] Try 1: Updating jobset Controller Manager resources
[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run.
kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95
[XPK] Try 1: Install PathwaysJob on golden-cluster
[XPK] Task: `Install PathwaysJob on golden-cluster` is implemented by the following command not running since it is a dry run.
kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.4/install.yaml
[XPK] Enabling Kueue on the cluster
[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run.
kubectl get deployment kueue-controller-manager -n kueue-system -o jsonpath='{.spec.template.spec.containers[0].image}'
[XPK] Installing Kueue version v0.14.3...
[XPK] Try 1: Install Kueue
[XPK] Task: `Install Kueue` is implemented by the following command not running since it is a dry run.
kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.3/manifests.yaml
[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run.
kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m
[XPK] Applying following Kueue resources:
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
name: "1xtpu7x-8"
spec:
nodeLabels: {"cloud.google.com/gke-tpu-accelerator": "tpu7x", "cloud.google.com/gke-tpu-topology": "2x2x1"}

---

apiVersion: kueue.x-k8s.io/v1beta1
kind: AdmissionCheck
metadata:
name: dws-prov
spec:
controllerName: kueue.x-k8s.io/provisioning-request
parameters:
apiGroup: kueue.x-k8s.io
kind: ProvisioningRequestConfig
name: dws-config
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ProvisioningRequestConfig
metadata:
name: dws-config
spec:
provisioningClassName: queued-provisioning.gke.io
podSetUpdates:
nodeSelector:
- key: autoscaling.gke.io/provisioning-request
valueFromProvisioningClassDetail: ResizeRequestName
managedResources:
- google.com/tpu
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
name: "cluster-queue"
spec:
preemption:
reclaimWithinCohort: Never # Don't preempt other queues in the cohort.
withinClusterQueue: LowerPriority
namespaceSelector: {} # match all.
resourceGroups: [{'coveredResources': ['google.com/tpu'], 'flavors': [{'name': '1xtpu7x-8', 'resources': [{'name': 'google.com/tpu', 'nominalQuota': 4}]}]}]

---
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
namespace: default
name: multislice-queue
spec:
clusterQueue: cluster-queue
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
name: very-low
value: 100
globalDefault: false
description: "Very Low"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
name: low
value: 250
globalDefault: false
description: "Low"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
name: medium
value: 500
globalDefault: false
description: "Medium"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
name: high
value: 750
globalDefault: false
description: "High"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
name: very-high
value: 1000
globalDefault: false
description: "Very High"
[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run.
kubectl apply -f ce52d2868b681f478f3f12e5696b1609e68b442a32f7f82603ba7064b825cf4f
[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
kubectl get node --no-headers | wc -l
[XPK] Try 1: Updating Kueue Controller Manager resources
[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
[XPK] Verifying kjob installation
[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run.
kubectl-kjob help
[XPK] kjob found
[XPK] Applying kjob CDRs
[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run.
kubectl kjob printcrds | kubectl apply --server-side -f -
[XPK] Creating kjob CRDs succeeded
[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run.
kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true
[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run.
kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61
[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run.
kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8
[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run.
kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486
[XPK] GKE commands done! Resources are created.
[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
[XPK] Exiting XPK cleanly
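Two lines in this transcript show the shared-reservation handling end to end: the `Describe reservation` task now runs with --project=reservation-project, the owning project parsed out of the path, while node-pool creation forwards the full projects/.../reservations/... path unchanged to --reservation= alongside --reservation-affinity=specific. A sketch of that split, assuming only what the dry run above shows:

# Sketch of the behavior visible in the transcript: the path is split for
# the describe call but passed through verbatim to node-pool create.
path = 'projects/reservation-project/reservations/golden-reservation'
_, owner_project, _, name = path.split('/')
describe_cmd = (
    f'gcloud beta compute reservations describe {name}'
    f' --project={owner_project} --zone=us-central1-a'
)
nodepool_flag = f'--reservation={path}'  # forwarded unchanged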
54 changes: 46 additions & 8 deletions src/xpk/core/capacity.py
@@ -115,9 +115,12 @@ def get_reservation_maintenance_interval(
   Returns:
     0 if successful and 1 otherwise.
   """
+  reservation_project, reservation_name = get_reservation_project_and_name(
+      reservation, project
+  )
   command = (
-      f'gcloud beta compute reservations describe {reservation}'
-      f' --project={project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"'
+      f'gcloud beta compute reservations describe {reservation_name}'
+      f' --project={reservation_project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"'
   )
   return_code, output = run_command_for_value(
       command, 'Get reservation maintenance interval'
@@ -139,9 +142,12 @@ def get_reservation_placement_policy(
   Returns:
     0 if successful and 1 otherwise.
   """
+  reservation_project, reservation_name = get_reservation_project_and_name(
+      reservation, project
+  )
   command = (
-      f'gcloud beta compute reservations describe {reservation}'
-      f' --project={project} --zone={zone} --format="value(resourcePolicies.policy)"'
+      f'gcloud beta compute reservations describe {reservation_name}'
+      f' --project={reservation_project} --zone={zone} --format="value(resourcePolicies.policy)"'
   )
   return_code, output = run_command_for_value(
       command, 'Get reservation placement policy'
@@ -156,9 +162,12 @@ def get_reservation_deployment_type(
     reservation: str, zone: str, project: str
 ) -> str:
   """Get reservation deployment type."""
+  reservation_project, reservation_name = get_reservation_project_and_name(
+      reservation, project
+  )
   command = (
-      f'gcloud beta compute reservations describe {reservation}'
-      f' --project={project} --zone={zone} --format="value(deploymentType)"'
+      f'gcloud beta compute reservations describe {reservation_name}'
+      f' --project={reservation_project} --zone={zone} --format="value(deploymentType)"'
   )
   return_code, output = run_command_for_value(
       command, 'Get reservation deployment type', dry_run_return_val='DENSE'
@@ -178,9 +187,12 @@ def verify_reservation_exists(args) -> int:
   Returns:
     0 if successful and 1 otherwise.
   """
+  reservation_project, reservation_name = get_reservation_project_and_name(
+      args.reservation, args.project
+  )
   command = (
-      f'gcloud beta compute reservations describe {args.reservation}'
-      f' --project={args.project} --zone={args.zone}'
+      f'gcloud beta compute reservations describe {reservation_name}'
+      f' --project={reservation_project} --zone={args.zone}'
   )
   return_code = run_command_with_updates(command, 'Describe reservation')
   if return_code != 0:
@@ -264,3 +276,29 @@ def get_capacity_node_selectors_from_capacity_type(
       )
       return_code = 1
   return node_selector, return_code
+
+
+def get_reservation_project_and_name(
+    reservation_name_or_path: str, cluster_project: str
+) -> tuple[str, str]:
+  """Get the reservation project and name.
+
+  Args:
+    reservation_name_or_path: either a reservation name or a reservation path
+      in the format projects/RESERVATION_PROJECT_ID/reservations/RESERVATION_NAME
+    cluster_project: the cluster project
+
+  Returns:
+    Tuple with reservation project and reservation name.
+  """
+  if '/' not in reservation_name_or_path:
+    return cluster_project, reservation_name_or_path
+  reservation_parts = reservation_name_or_path.split('/')
+  if (
+      len(reservation_parts) != 4
+      or reservation_parts[0] != 'projects'
+      or reservation_parts[2] != 'reservations'
+  ):
+    xpk_print('Unable to parse reservation: ', reservation_name_or_path)
+    xpk_exit(1)
+  return reservation_parts[1], reservation_parts[3]
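For quick local experimentation, the new helper can be exercised standalone. A minimal sketch with xpk_print and xpk_exit stubbed out (the stubs are illustrative stand-ins, not xpk's real helpers):

# Standalone sketch of get_reservation_project_and_name with stubbed
# xpk helpers, runnable outside the xpk package.
import sys


def xpk_print(*args) -> None:  # illustrative stub for xpk's logger
  print(*args)


def xpk_exit(code: int) -> None:  # illustrative stub; exits like xpk's version
  sys.exit(code)


def get_reservation_project_and_name(
    reservation_name_or_path: str, cluster_project: str
) -> tuple[str, str]:
  """Split a reservation name or full path into (project, name)."""
  if '/' not in reservation_name_or_path:
    return cluster_project, reservation_name_or_path
  parts = reservation_name_or_path.split('/')
  if len(parts) != 4 or parts[0] != 'projects' or parts[2] != 'reservations':
    xpk_print('Unable to parse reservation: ', reservation_name_or_path)
    xpk_exit(1)
  return parts[1], parts[3]


# A bare name resolves to the cluster's project; a path names its owner.
assert get_reservation_project_and_name(
    'golden-reservation', 'golden-project'
) == ('golden-project', 'golden-reservation')
assert get_reservation_project_and_name(
    'projects/reservation-project/reservations/golden-reservation',
    'golden-project',
) == ('reservation-project', 'golden-reservation')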
33 changes: 32 additions & 1 deletion src/xpk/core/capacity_test.py
@@ -16,7 +16,7 @@
 
 import pytest
 from unittest.mock import MagicMock, patch
-from .capacity import get_reservation_deployment_type
+from .capacity import get_reservation_deployment_type, get_reservation_project_and_name
 
 
 @patch('xpk.core.capacity.xpk_print')
@@ -48,3 +48,34 @@ def test_get_reservation_deployment_type_returns_deployment_type_when_command_su
       reservation='reservation', zone='zone', project='project'
   )
   assert result == 'DENSE'
+
+
+def test_get_reservation_project_and_name_parses_local_reservation():
+  project, name = get_reservation_project_and_name(
+      'test-reservation', 'cluster-project'
+  )
+
+  assert project == 'cluster-project'
+  assert name == 'test-reservation'
+
+
+def test_get_reservation_project_and_name_parses_shared_reservation():
+  project, name = get_reservation_project_and_name(
+      'projects/reservation-project/reservations/test-reservation',
+      'cluster-project',
+  )
+
+  assert project == 'reservation-project'
+  assert name == 'test-reservation'
+
+
+@patch('xpk.core.capacity.xpk_print')
+def test_get_reservation_project_and_name_fails_for_invalid_reservation(
+    xpk_print: MagicMock, mocker
+):
+  with pytest.raises(SystemExit):
+    get_reservation_project_and_name(
+        'invalid/reservation',
+        'cluster-project',
+    )
+  assert 'Unable to parse reservation' in xpk_print.mock_calls[0].args[0]
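The invalid-path test asserts SystemExit, matching xpk_exit's behavior of terminating the process. If more path shapes need coverage later, a parametrized variant is a natural extension (a sketch, not part of this change, assuming the same relative import used above):

# Sketch: parametrized coverage of both accepted --reservation shapes.
import pytest
from .capacity import get_reservation_project_and_name


@pytest.mark.parametrize(
    'path, expected',
    [
        ('r', ('cluster-project', 'r')),
        ('projects/p/reservations/r', ('p', 'r')),
    ],
)
def test_get_reservation_project_and_name_parametrized(path, expected):
  assert get_reservation_project_and_name(path, 'cluster-project') == expected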