From 3b675c3306d2bc0ebdc368e6dcb8678147835d1a Mon Sep 17 00:00:00 2001 From: FIoannides Date: Fri, 19 Sep 2025 17:09:12 +0200 Subject: [PATCH 01/12] Update .gitignore (#637) --- .gitignore | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 244a9969c..8d71a5482 100644 --- a/.gitignore +++ b/.gitignore @@ -140,4 +140,10 @@ dmypy.json **/.DS_Store # XPK/Cluster Toolkit working directory -xpkclusters/* \ No newline at end of file +xpkclusters/* + +# gemini-cli settings +.gemini/ + +# GitHub App credentials +gha-creds-*.json From 77f14619d3f4b3cfd7f931cda80ddb1e69e2ae5a Mon Sep 17 00:00:00 2001 From: Konrad Kaim <31181410+scaliby@users.noreply.github.com> Date: Fri, 19 Sep 2025 19:20:24 +0200 Subject: [PATCH 02/12] Add golden tests to github actions (#635) * feat: add execution_context * fix: dry run * feat: do not validate deps in dry_run mode * feat: deterministic file names in dry run * build: integrate golden testing with github actions --- .github/workflows/build_tests.yaml | 5 + .github/workflows/reusable_goldens.yaml | 48 ++++++++++ goldens/NAP_cluster-create.txt | 97 +++++++++++++++---- goldens/NAP_cluster-create_with_pathways.txt | 98 ++++++++++++++++---- src/xpk/commands/cluster.py | 17 ++-- src/xpk/commands/inspector.py | 2 +- src/xpk/commands/workload.py | 8 +- src/xpk/core/cluster.py | 6 +- src/xpk/core/commands.py | 17 ++-- src/xpk/core/docker_image.py | 2 +- src/xpk/core/jobset.py | 2 +- src/xpk/core/kjob.py | 7 +- src/xpk/core/kueue.py | 4 +- src/xpk/core/nap.py | 2 +- src/xpk/core/network.py | 2 +- src/xpk/core/nodepool.py | 11 ++- src/xpk/core/pathways.py | 2 +- src/xpk/core/ray.py | 2 +- src/xpk/core/resources.py | 24 +++-- src/xpk/main.py | 5 +- src/xpk/utils/execution_context.py | 28 ++++++ src/xpk/utils/file.py | 35 +++++-- 22 files changed, 342 insertions(+), 82 deletions(-) create mode 100644 .github/workflows/reusable_goldens.yaml create mode 100644 src/xpk/utils/execution_context.py diff --git a/.github/workflows/build_tests.yaml b/.github/workflows/build_tests.yaml index 4962cc7d1..61ded0a56 100644 --- a/.github/workflows/build_tests.yaml +++ b/.github/workflows/build_tests.yaml @@ -131,6 +131,11 @@ jobs: uses: ./.github/workflows/reusable_lint_and_format.yml with: run-id: '${{needs.set-variables.outputs.run-id}}' + verify-goldens: + needs: [install-dependencies, set-variables] + uses: ./.github/workflows/reusable_goldens.yaml + with: + run-id: '${{needs.set-variables.outputs.run-id}}' run-unit-tests: needs: [install-dependencies, set-variables] uses: ./.github/workflows/reusable_unit_tests.yaml diff --git a/.github/workflows/reusable_goldens.yaml b/.github/workflows/reusable_goldens.yaml new file mode 100644 index 000000000..df77e76be --- /dev/null +++ b/.github/workflows/reusable_goldens.yaml @@ -0,0 +1,48 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +on: + workflow_call: + inputs: + run-id: + required: true + type: string + +permissions: + contents: read + +jobs: + verify-goldens: + runs-on: [ubuntu-22.04] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Prepare directories + run: mkdir -p ~/.cache/pip + - name: Restore cached dependencies + uses: actions/cache@v4 + with: + path: | + /usr/local/bin/kubectl-kueue + /usr/local/bin/kubectl-kjob + ~/.cache/pip + ${{env.pythonLocation}} + key: xpk-deps-3.10-${{github.run_id}}-${{github.run_attempt}} + restore-keys: xpk-deps-3.10- + - name: Verify goldens + run: ./golden_buddy.sh verify goldens.yaml goldens + env: + UPDATE_GOLDEN_COMMAND: make goldens diff --git a/goldens/NAP_cluster-create.txt b/goldens/NAP_cluster-create.txt index 93adad125..dd1df293b 100644 --- a/goldens/NAP_cluster-create.txt +++ b/goldens/NAP_cluster-create.txt @@ -29,19 +29,84 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] CoreDNS has successfully started and passed verification. [XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'. [XPK] Skipping CoreDNS deployment since it already exists. -[XPK] Working on golden-project and us-central1-a -[XPK] Try 1: get-credentials to cluster golden-cluster -[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. -gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default -[XPK] Couldn't translate project id: golden-project to project number. Error: 403 Permission 'resourcemanager.projects.get' denied on resource '//cloudresourcemanager.googleapis.com/projects/golden-project' (or it may not exist). [reason: "IAM_PERMISSION_DENIED" -domain: "cloudresourcemanager.googleapis.com" -metadata { - key: "resource" - value: "projects/golden-project" -} -metadata { - key: "permission" - value: "resourcemanager.projects.get" -} -] -[XPK] XPK failed, error code 1 +[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters describe golden-cluster --region us-central1 --project golden-project --format="value(currentMasterVersion)" +[XPK] Creating 1 node pool or pools of tpu7x-8 +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Creating 1 node pool or pools of tpu7x-8 +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --region=us-central1 --format="value(locations)" +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Existing node pool names ['0'] +[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --region=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 +[XPK] Breaking up a total of 1 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Create or delete node pool request complete. +[XPK] Enabling Autoprovisioning +[XPK] Default Chips quota is minimum: 0, maximum: 4. +[XPK] Chips quota is minimum: 0, maximum: 4. XPK will autoprovision 4 chips based on incoming workload requests, keeping at least 0 available at all times, and maximum of 4. If the difference (4 chips) is small, rescaling will not work well. +[XPK] Task: `Update cluster with autoprovisioning enabled` is implemented by the following command not running since it is a dry run. +gcloud container clusters update golden-cluster --project=golden-project --region=us-central1 --enable-autoprovisioning --autoprovisioning-config-file 6062bfee91f21efca86f2c3261129f06b1896ad9b68d2ecdba9589bea9e15ddf +[XPK] Task: `Update cluster with autoscaling-profile` is implemented by the following command not running since it is a dry run. +gcloud container clusters update golden-cluster --project=golden-project --region=us-central1 --autoscaling-profile=optimize-utilization +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Breaking up a total of 0 commands into 0 batches +[XPK] Pretending all the jobs succeeded +[XPK] Creating ConfigMap for cluster +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available +[XPK] Try 1: Install Jobset on golden-cluster +[XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating jobset Controller Manager resources +[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 +[XPK] Try 1: Install PathwaysJob on golden-cluster +[XPK] Task: `Install PathwaysJob on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.2/install.yaml +[XPK] Enabling Kueue on the cluster +[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run. +kubectl kueue version +[XPK] Try 1: Set Kueue On Cluster +[XPK] Task: `Set Kueue On Cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml +[XPK] Wait for Kueue to be fully available +[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. +kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m +[XPK] Install Kueue Custom Resources +[XPK] Try 1: Applying Kueue Custom Resources +[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f b3843453fb19ae7105126245bac5b63930f46861462cd3a557aea44801a99280 +[XPK] Update Kueue Controller Manager resources +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating Kueue Controller Manager resources +[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 012e1b15b6941e9d47cb2cdb35488d57c2f3ce0ef0b18093d2759f2e02ed81dc +[XPK] Verifying kjob installation +[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run. +kubectl-kjob help +[XPK] kjob found +[XPK] Applying kjob CDRs +[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run. +kubectl kjob printcrds | kubectl apply --server-side -f - +[XPK] Creating kjob CRDs succeeded +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 +[XPK] GKE commands done! Resources are created. +[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project +[XPK] Exiting XPK cleanly diff --git a/goldens/NAP_cluster-create_with_pathways.txt b/goldens/NAP_cluster-create_with_pathways.txt index cfa84b36b..f0c8ba7f6 100644 --- a/goldens/NAP_cluster-create_with_pathways.txt +++ b/goldens/NAP_cluster-create_with_pathways.txt @@ -29,19 +29,85 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] CoreDNS has successfully started and passed verification. [XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'. [XPK] Skipping CoreDNS deployment since it already exists. -[XPK] Working on golden-project and us-central1-a -[XPK] Try 1: get-credentials to cluster golden-cluster -[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. -gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default -[XPK] Couldn't translate project id: golden-project to project number. Error: 403 Permission 'resourcemanager.projects.get' denied on resource '//cloudresourcemanager.googleapis.com/projects/golden-project' (or it may not exist). [reason: "IAM_PERMISSION_DENIED" -domain: "cloudresourcemanager.googleapis.com" -metadata { - key: "resource" - value: "projects/golden-project" -} -metadata { - key: "permission" - value: "resourcemanager.projects.get" -} -] -[XPK] XPK failed, error code 1 +[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters describe golden-cluster --region us-central1 --project golden-project --format="value(currentMasterVersion)" +[XPK] Creating 1 node pool or pools of tpu7x-8 +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Creating 1 node pool or pools of tpu7x-8 +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --region=us-central1 --format="value(locations)" +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Existing node pool names ['0'] +[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --region=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 +[XPK] To complete NodepoolCreate-cpu-np we are executing gcloud beta container node-pools create cpu-np --node-version=0 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --region=us-central1 --num-nodes=1 --machine-type=n2-standard-64 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --enable-autoscaling --min-nodes=1 --max-nodes=20 +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Create or delete node pool request complete. +[XPK] Enabling Autoprovisioning +[XPK] Default Chips quota is minimum: 0, maximum: 4. +[XPK] Chips quota is minimum: 0, maximum: 4. XPK will autoprovision 4 chips based on incoming workload requests, keeping at least 0 available at all times, and maximum of 4. If the difference (4 chips) is small, rescaling will not work well. +[XPK] Task: `Update cluster with autoprovisioning enabled` is implemented by the following command not running since it is a dry run. +gcloud container clusters update golden-cluster --project=golden-project --region=us-central1 --enable-autoprovisioning --autoprovisioning-config-file 6062bfee91f21efca86f2c3261129f06b1896ad9b68d2ecdba9589bea9e15ddf +[XPK] Task: `Update cluster with autoscaling-profile` is implemented by the following command not running since it is a dry run. +gcloud container clusters update golden-cluster --project=golden-project --region=us-central1 --autoscaling-profile=optimize-utilization +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Breaking up a total of 0 commands into 0 batches +[XPK] Pretending all the jobs succeeded +[XPK] Creating ConfigMap for cluster +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available +[XPK] Try 1: Install Jobset on golden-cluster +[XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating jobset Controller Manager resources +[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 +[XPK] Try 1: Install PathwaysJob on golden-cluster +[XPK] Task: `Install PathwaysJob on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.2/install.yaml +[XPK] Enabling Kueue on the cluster +[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run. +kubectl kueue version +[XPK] Try 1: Set Kueue On Cluster +[XPK] Task: `Set Kueue On Cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml +[XPK] Wait for Kueue to be fully available +[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. +kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m +[XPK] Install Kueue Custom Resources +[XPK] Try 1: Applying Kueue Custom Resources +[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 898c7686cc5ef7f026f74e55b73b5843767e3f1abb9639169f02ebc44d06af73 +[XPK] Update Kueue Controller Manager resources +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating Kueue Controller Manager resources +[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 012e1b15b6941e9d47cb2cdb35488d57c2f3ce0ef0b18093d2759f2e02ed81dc +[XPK] Verifying kjob installation +[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run. +kubectl-kjob help +[XPK] kjob found +[XPK] Applying kjob CDRs +[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run. +kubectl kjob printcrds | kubectl apply --server-side -f - +[XPK] Creating kjob CRDs succeeded +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 +[XPK] GKE commands done! Resources are created. +[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project +[XPK] Exiting XPK cleanly diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index a57d071bb..d8bf77d45 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -76,6 +76,7 @@ from ..core.workload import get_workload_list from ..utils.console import get_user_input, xpk_exit, xpk_print from ..utils.file import write_tmp_file +from ..utils.execution_context import is_dry_run from . import cluster_gcluster from .common import set_cluster_command import shutil @@ -128,9 +129,10 @@ def cluster_adapt(args) -> None: get_cluster_credentials(args) - k8s_client = setup_k8s_env(args) + if not is_dry_run(): + k8s_client = setup_k8s_env(args) + install_storage_crd(k8s_client) - install_storage_crd(k8s_client) install_storage_csis(args) # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set @@ -251,9 +253,10 @@ def cluster_create(args) -> None: if update_coredns_command_code != 0: xpk_exit(update_cluster_command_code) - k8s_client = setup_k8s_env(args) + if not is_dry_run(): + k8s_client = setup_k8s_env(args) + install_storage_crd(k8s_client) - install_storage_crd(k8s_client) install_storage_csis(args) # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set @@ -409,10 +412,8 @@ def cluster_cacheimage(args) -> None: nodeSelectorKey=node_selector_key, ) tmp = write_tmp_file(yml_string) - command_apply = f'kubectl apply -f {str(tmp.file.name)}' - command_delete = ( - f'kubectl delete -f {str(tmp.file.name)} --ignore-not-found=true' - ) + command_apply = f'kubectl apply -f {str(tmp)}' + command_delete = f'kubectl delete -f {str(tmp)} --ignore-not-found=true' return_code = run_command_with_updates( command_delete, 'Deleting Cached Image', args diff --git a/src/xpk/commands/inspector.py b/src/xpk/commands/inspector.py index 580aeff36..3e8d783f0 100644 --- a/src/xpk/commands/inspector.py +++ b/src/xpk/commands/inspector.py @@ -346,7 +346,7 @@ def inspector(args) -> None: ) # Summarize inspector: - xpk_print(f'Find xpk inspector output file: {inspector_file.name}') + xpk_print(f'Find xpk inspector output file: {inspector_file}') if final_return_code != 0: xpk_print( diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index 364d1b961..548d5c47f 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -569,7 +569,7 @@ def workload_create(args) -> None: pod_failure_policy=pod_failure_policy, ) tmp = write_tmp_file(yml_string) - command = f'kubectl apply -f {str(tmp.file.name)}' + command = f'kubectl apply -f {str(tmp)}' return_code = run_command_with_updates(command, 'Creating Workload', args) if return_code != 0: @@ -725,7 +725,11 @@ def workload_delete(args) -> None: ) else: return_code = run_commands( - commands, 'Delete Workload', task_names, batch=100 + commands, + 'Delete Workload', + task_names, + batch=100, + dry_run=args.dry_run, ) if return_code != 0: diff --git a/src/xpk/core/cluster.py b/src/xpk/core/cluster.py index 41c7ec452..b2c9d8e1c 100644 --- a/src/xpk/core/cluster.py +++ b/src/xpk/core/cluster.py @@ -442,7 +442,11 @@ def setup_k8s_env(args) -> k8s_client.ApiClient: if not getattr(args, 'kind_cluster', False): add_zone_and_project(args) get_cluster_credentials(args) - args.project_number = project_id_to_project_number(args.project) + args.project_number = ( + project_id_to_project_number(args.project) + if not args.dry_run + else abs(hash(args.project) % (10**12)) # 12 digit hash + ) config.load_kube_config() return k8s_client.ApiClient() diff --git a/src/xpk/core/commands.py b/src/xpk/core/commands.py index ba8cb1191..cc3b266b7 100644 --- a/src/xpk/core/commands.py +++ b/src/xpk/core/commands.py @@ -78,14 +78,13 @@ def run_command_batch(commands, jobname, per_command_name, output_logs): The max return code and a list of all the return codes. """ + files = [open(f, 'w', encoding='utf-8') for f in output_logs] children = [] start_time = datetime.datetime.now() - for i, command in enumerate(commands): + for command, file in zip(commands, files): children.append( # subprocess managed by list pylint: disable=consider-using-with - subprocess.Popen( - command, stdout=output_logs[i], stderr=output_logs[i], shell=True - ) + subprocess.Popen(command, stdout=file, stderr=file, shell=True) ) while True: @@ -99,7 +98,7 @@ def run_command_batch(commands, jobname, per_command_name, output_logs): slow_worker_text = per_command_name[slow_worker_index] slow_str = ( f', task {slow_worker_text} still working, logfile' - f' {output_logs[slow_worker_index].name}' + f' {output_logs[slow_worker_index]}' ) else: slow_str = '' @@ -116,7 +115,7 @@ def run_command_batch(commands, jobname, per_command_name, output_logs): ) xpk_print( f'Failure is {per_command_name[failing_index]}' - f' and logfile {output_logs[failing_index].name}' + f' and logfile {output_logs[failing_index]}' ) for child in children: child.terminate() @@ -126,6 +125,10 @@ def run_command_batch(commands, jobname, per_command_name, output_logs): break time.sleep(1) + + for file in files: + file.close() + return max_returncode, returncodes @@ -351,6 +354,6 @@ def run_command_with_full_controls( def run_kubectl_apply(yml_string: str, task: str, args: Namespace) -> int: tmp = write_tmp_file(yml_string) - command = f'kubectl apply -f {str(tmp.file.name)}' + command = f'kubectl apply -f {str(tmp)}' err_code = run_command_with_updates(command, task, args) return err_code diff --git a/src/xpk/core/docker_image.py b/src/xpk/core/docker_image.py index 7425b0fd6..c31b2c9fc 100644 --- a/src/xpk/core/docker_image.py +++ b/src/xpk/core/docker_image.py @@ -94,7 +94,7 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]: ) tmp = write_tmp_file(docker_file) docker_build_command = ( - f'docker buildx build --platform={PLATFORM} -f {str(tmp.file.name)} -t' + f'docker buildx build --platform={PLATFORM} -f {str(tmp)} -t' f' {docker_name} {args.script_dir}' ) xpk_print(f'Building {args.script_dir} into docker image.') diff --git a/src/xpk/core/jobset.py b/src/xpk/core/jobset.py index 135cfda63..e47346796 100644 --- a/src/xpk/core/jobset.py +++ b/src/xpk/core/jobset.py @@ -134,7 +134,7 @@ def update_jobset_resources_if_necessary(args): memory_limit_size=new_memory_limit, ) tmp = write_tmp_file(yml_string) - command = f'kubectl apply -f {str(tmp.file.name)}' + command = f'kubectl apply -f {str(tmp)}' task = 'Updating jobset Controller Manager resources' return_code = run_command_with_updates_retry(command, task, args) diff --git a/src/xpk/core/kjob.py b/src/xpk/core/kjob.py index 59388b732..c2c43f27b 100644 --- a/src/xpk/core/kjob.py +++ b/src/xpk/core/kjob.py @@ -23,6 +23,7 @@ from kubernetes.client.rest import ApiException from ..utils import templates +from ..utils.execution_context import is_dry_run from ..utils.console import xpk_exit, xpk_print from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE from .cluster import DEFAULT_NAMESPACE, XPK_SA, setup_k8s_env @@ -368,8 +369,10 @@ def create_pod_template_instance(args: Namespace, service_account: str) -> int: def prepare_kjob(args: Namespace) -> int: system = get_cluster_system_characteristics(args) - k8s_api_client = setup_k8s_env(args) - storages = get_auto_mount_storages(k8s_api_client) + storages = [] + if not is_dry_run(): + k8s_api_client = setup_k8s_env(args) + storages = get_auto_mount_storages(k8s_api_client) service_account = "" if len(storages) > 0: diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index 8f1d434c1..49f57a4fd 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -474,7 +474,7 @@ def install_kueue_crs( yml_string = topology_yaml + yml_string tmp = write_tmp_file(yml_string) - command = f'kubectl apply -f {str(tmp.file.name)}' + command = f'kubectl apply -f {str(tmp)}' task = 'Applying Kueue Custom Resources' return_code = run_command_with_updates_retry(command, task, args) @@ -536,7 +536,7 @@ def update_kueue_resources_if_necessary(args): memory_limit_size=new_memory_limit, KUEUE_VERSION=KUEUE_VERSION ) tmp = write_tmp_file(yml_string) - command = f'kubectl apply -f {str(tmp.file.name)}' + command = f'kubectl apply -f {str(tmp)}' task = 'Updating Kueue Controller Manager resources' return_code = run_command_with_updates_retry(command, task, args) diff --git a/src/xpk/core/nap.py b/src/xpk/core/nap.py index 6a628eb83..0b21e8ee8 100644 --- a/src/xpk/core/nap.py +++ b/src/xpk/core/nap.py @@ -250,7 +250,7 @@ def create_autoprovisioning_config( zones=f'- {args.zone}', ) autoprovisioning_config = AutoprovisioningConfig( - config_filename=write_tmp_file(yml_string).name, + config_filename=write_tmp_file(yml_string), minimum_chips=minimum, maximum_chips=maximum, ) diff --git a/src/xpk/core/network.py b/src/xpk/core/network.py index e42ca76c6..18f844c59 100644 --- a/src/xpk/core/network.py +++ b/src/xpk/core/network.py @@ -221,7 +221,7 @@ def create_cluster_network_config(args) -> int: """ yml_string = CLUSTER_NETWORK_YAML.format(cluster_name=args.cluster) tmp = write_tmp_file(yml_string) - command = f'kubectl apply -f {str(tmp.file.name)}' + command = f'kubectl apply -f {str(tmp)}' return_code = run_command_with_updates( command, 'GKE Cluster Create Network Config', args diff --git a/src/xpk/core/nodepool.py b/src/xpk/core/nodepool.py index cab159f15..85ab6aba9 100644 --- a/src/xpk/core/nodepool.py +++ b/src/xpk/core/nodepool.py @@ -265,7 +265,9 @@ def run_gke_node_pool_create_command( ) configmap_yml = {} configmap_yml[resources_configmap_name] = resources_yml - return_code = create_or_update_cluster_configmap(configmap_yml) + return_code = create_or_update_cluster_configmap( + configmap_yml, args.dry_run + ) if return_code != 0: return 1 @@ -461,7 +463,7 @@ def get_nodepool_zone(args, nodepool_name) -> tuple[int, str | None]: f' --region={zone_to_region(args.zone)} --format="value(locations)"' ) return_code, nodepool_zone = run_command_for_value( - command, 'Get Node Pool Zone', args + command, 'Get Node Pool Zone', args, dry_run_return_val=args.zone ) if return_code != 0: xpk_print(f'Get Node Pool Zone returned ERROR {return_code}') @@ -570,7 +572,10 @@ def upgrade_gke_nodepools_version(args, default_rapid_gke_version) -> int: for i, command in enumerate(commands): xpk_print(f'To complete {task_names[i]} we are executing {command}') max_return_code = run_commands( - commands, 'Update GKE node pools to default RAPID GKE version', task_names + commands, + 'Update GKE node pools to default RAPID GKE version', + task_names, + dry_run=args.dry_run, ) if max_return_code != 0: xpk_print( diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py index 81770eb04..291afef68 100644 --- a/src/xpk/core/pathways.py +++ b/src/xpk/core/pathways.py @@ -322,7 +322,7 @@ def try_to_delete_pathwaysjob_first(args, workloads) -> bool: return_code = run_command_with_updates(commands[0], 'Delete Workload', args) else: return_code = run_commands( - commands, 'Delete Workload', task_names, batch=100 + commands, 'Delete Workload', task_names, batch=100, dry_run=args.dry_run ) if return_code != 0: diff --git a/src/xpk/core/ray.py b/src/xpk/core/ray.py index 2266d52ab..50e391025 100644 --- a/src/xpk/core/ray.py +++ b/src/xpk/core/ray.py @@ -132,7 +132,7 @@ def install_ray_cluster(args, system) -> int: ) tmp = write_tmp_file(yml_string) - command = f'kubectl apply -f {str(tmp.file.name)}' + command = f'kubectl apply -f {str(tmp)}' task = 'Applying RayCluster' retry_attempts = 1 return_code = run_command_with_updates_retry( diff --git a/src/xpk/core/resources.py b/src/xpk/core/resources.py index 85b266c70..f215e1063 100644 --- a/src/xpk/core/resources.py +++ b/src/xpk/core/resources.py @@ -66,7 +66,10 @@ def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None: ) return_code, return_value = run_command_for_value( - command, 'GKE Cluster Get ConfigMap', args + command, + 'GKE Cluster Get ConfigMap', + args, + dry_run_return_val='map[]', ) if return_code != 0: xpk_print(f'GKE Cluster Get ConfigMap request returned ERROR {return_code}') @@ -81,8 +84,10 @@ def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None: configs = return_value[4:-1].split(' ') for config in configs: - key, value = config.strip().split(':') - config_map[key] = value + parts = config.strip().split(':') + if len(parts) != 2: + continue + config_map[parts[0]] = parts[1] return config_map @@ -150,10 +155,12 @@ def create_cluster_configmaps( args=args, name=metadata_configmap_name, data=metadata ) configmap_yml[metadata_configmap_name] = metadata_yml - return create_or_update_cluster_configmap(configmap_yml) + return create_or_update_cluster_configmap(configmap_yml, args.dry_run) -def create_or_update_cluster_configmap(configmap_yml: dict) -> int: +def create_or_update_cluster_configmap( + configmap_yml: dict, dry_run: bool +) -> int: """ Args: configmap_yml: dict containing ConfigMap name and yml string. @@ -165,13 +172,16 @@ def create_or_update_cluster_configmap(configmap_yml: dict) -> int: task_names = [] for configmap_name, yml_string in configmap_yml.items(): tmp = write_tmp_file(yml_string) - command = f'kubectl apply -f {str(tmp.file.name)}' + command = f'kubectl apply -f {str(tmp)}' commands.append(command) task_name = f'ConfigMap CreateOrUpdate-{configmap_name}' task_names.append(task_name) return_code = run_commands( - commands, 'GKE Cluster CreateOrUpdate ConfigMap(s)', task_names + commands, + 'GKE Cluster CreateOrUpdate ConfigMap(s)', + task_names, + dry_run=dry_run, ) if return_code != 0: xpk_print( diff --git a/src/xpk/main.py b/src/xpk/main.py index 9662080ca..d166b9833 100644 --- a/src/xpk/main.py +++ b/src/xpk/main.py @@ -37,6 +37,7 @@ from .parser.core import set_parser from .utils.console import xpk_print from .utils.validation import validate_dependencies +from .utils.execution_context import set_dry_run ################### Compatibility Check ################### # Check that the user runs the below version or greater. @@ -63,9 +64,11 @@ def main() -> None: set_parser(parser=parser) xpk_print('Starting xpk', flush=True) - validate_dependencies() main_args = parser.parse_args() main_args.enable_ray_cluster = False + set_dry_run('dry_run' in main_args and main_args.dry_run) + if not main_args.dry_run: + validate_dependencies() main_args.func(main_args) xpk_print('XPK Done.', flush=True) diff --git a/src/xpk/utils/execution_context.py b/src/xpk/utils/execution_context.py new file mode 100644 index 000000000..d38088306 --- /dev/null +++ b/src/xpk/utils/execution_context.py @@ -0,0 +1,28 @@ +""" +Copyright 2025 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +dry_run = False + + +def set_dry_run(value: bool) -> None: + """Sets the dry_run flag.""" + global dry_run + dry_run = value + + +def is_dry_run() -> bool: + """Returns the current value of the dry_run flag.""" + return dry_run diff --git a/src/xpk/utils/file.py b/src/xpk/utils/file.py index 57321cf43..f5242e2d3 100644 --- a/src/xpk/utils/file.py +++ b/src/xpk/utils/file.py @@ -16,10 +16,11 @@ import tempfile import os -from .console import xpk_print +import hashlib +from .execution_context import is_dry_run -def make_tmp_files(per_command_name): +def make_tmp_files(per_command_name: list[str]) -> list[str]: """Make temporary files for each command. Args: @@ -28,16 +29,19 @@ def make_tmp_files(per_command_name): Returns: A list of temporary files for each command. """ + if is_dry_run(): + return [_hash_filename(command) for command in per_command_name] + # Supports removal of spaces from command names before converting to file name. return [ tempfile.NamedTemporaryFile( delete=False, prefix=command.replace(' ', '-') + '-' - ) + ).file.name for command in per_command_name ] -def write_tmp_file(payload): +def write_tmp_file(payload: str) -> str: """Writes `payload` to a temporary file. Args: @@ -46,14 +50,17 @@ def write_tmp_file(payload): Returns: A file object that was written to. """ + if is_dry_run(): + return _hash_filename(payload) + with tempfile.NamedTemporaryFile(delete=False) as tmp: with open(file=tmp.name, mode='w', encoding='utf=8') as f: f.write(payload) f.flush() - return tmp + return tmp.file.name -def append_tmp_file(payload, file): +def append_tmp_file(payload: str, file: str) -> str: """Appends `payload` to an already created file. Use `write_temporary_file` to create a file. @@ -65,18 +72,26 @@ def append_tmp_file(payload, file): Returns: A file object that was written to. """ - with open(file=file.name, mode='a', encoding='utf=8') as f: + if is_dry_run(): + return file + + with open(file=file, mode='a', encoding='utf=8') as f: f.write(payload) f.flush() return file -def ensure_directory_exists(directory_path): +def ensure_directory_exists(directory_path: str) -> None: """Checks if a directory exists and creates it if it doesn't. Args: directory_path: The path to the directory. """ - if not os.path.exists(directory_path): + if not is_dry_run() and not os.path.exists(directory_path): os.makedirs(directory_path) - xpk_print(f"Directory '{directory_path}' created successfully.") + + +def _hash_filename(seed: str) -> str: + m = hashlib.sha256() + m.update(seed.encode('utf-8')) + return m.hexdigest() From 1185b1d3dea3f63c063e6caf0d12cb690e13a126 Mon Sep 17 00:00:00 2001 From: Konrad Kaim <31181410+scaliby@users.noreply.github.com> Date: Mon, 22 Sep 2025 11:12:19 +0200 Subject: [PATCH 03/12] build: cleanup codeowners (#639) --- .github/CODEOWNERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index f6af5039d..7d5908436 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,2 +1 @@ -* @Obliviour @44past4 @sharabiani @pawloch00 @BluValor @gcie @RoshaniN @scaliby @jamOne- @SikaGrr @FIoannides @fatoshoti -slice/ @mwysokin @mimowo @gabesaba @PBundyra @mwielgus @pajakd \ No newline at end of file +* @scaliby @jamOne- @SikaGrr @FIoannides From e2295e3f101fbdc82c9731bae8cb4bbbcc204cf3 Mon Sep 17 00:00:00 2001 From: Konrad Kaim <31181410+scaliby@users.noreply.github.com> Date: Mon, 22 Sep 2025 13:55:22 +0200 Subject: [PATCH 04/12] feat: add more goldens (#640) --- golden_buddy.sh | 5 +- goldens.yaml | 26 +++++++ goldens/Basic_cluster_create.txt | 101 +++++++++++++++++++++++++ goldens/Batch.txt | 14 ++++ goldens/Cluster_create_private.txt | 108 +++++++++++++++++++++++++++ goldens/Cluster_delete.txt | 16 ++++ goldens/Cluster_delete_force.txt | 13 ++++ goldens/Job_cancel.txt | 9 +++ goldens/Job_info.txt | 20 +++++ goldens/Job_list.txt | 9 +++ goldens/Storage_list.txt | 4 + goldens/Workload_create.txt | 33 ++++++++ goldens/Workload_create_pathways.txt | 32 ++++++++ goldens/Workload_delete.txt | 12 +++ goldens/Workload_list.txt | 12 +++ src/xpk/commands/batch.py | 16 ++-- src/xpk/commands/common.py | 4 + src/xpk/commands/job.py | 32 +++++++- src/xpk/commands/storage.py | 7 +- src/xpk/commands/workload.py | 17 +++-- src/xpk/core/cluster_private.py | 4 +- src/xpk/core/docker_image.py | 17 ++++- src/xpk/core/docker_resources.py | 13 +++- src/xpk/core/pathways.py | 6 +- src/xpk/core/scheduling.py | 4 + src/xpk/utils/network.py | 4 + 26 files changed, 507 insertions(+), 31 deletions(-) create mode 100644 goldens/Basic_cluster_create.txt create mode 100644 goldens/Batch.txt create mode 100644 goldens/Cluster_create_private.txt create mode 100644 goldens/Cluster_delete.txt create mode 100644 goldens/Cluster_delete_force.txt create mode 100644 goldens/Job_cancel.txt create mode 100644 goldens/Job_info.txt create mode 100644 goldens/Job_list.txt create mode 100644 goldens/Storage_list.txt create mode 100644 goldens/Workload_create.txt create mode 100644 goldens/Workload_create_pathways.txt create mode 100644 goldens/Workload_delete.txt create mode 100644 goldens/Workload_list.txt diff --git a/golden_buddy.sh b/golden_buddy.sh index 4b3745f8c..bfae97dbb 100755 --- a/golden_buddy.sh +++ b/golden_buddy.sh @@ -90,8 +90,9 @@ fi mkdir -p "$GOLDENS_DIR" -cat "$GOLDENS_FILE" | yq -r '.goldens | to_entries[] | [.key, .value.command] | @tsv' | \ - while IFS=$'\t' read -r key command; do +yq -r '.goldens | keys[]' "$GOLDENS_FILE" | \ + while read -r key; do + command=$(yq -r '.goldens["'"$key"'"].command' "$GOLDENS_FILE") if [[ "$MODE" = "update" ]]; then printf "${YELLOW}Updating: %s${NC}\n" "$key" fi diff --git a/goldens.yaml b/goldens.yaml index f07091d78..c06f2883e 100644 --- a/goldens.yaml +++ b/goldens.yaml @@ -3,3 +3,29 @@ goldens: command: python3 xpk.py cluster create-pathways --project=golden-project --zone=us-central1-a --enable-autoprovisioning --cluster=golden-cluster --tpu-type=tpu7x-8 --on-demand --dry-run "NAP cluster-create": command: python3 xpk.py cluster create --project=golden-project --zone=us-central1-a --enable-autoprovisioning --cluster=golden-cluster --tpu-type=tpu7x-8 --on-demand --dry-run + "Basic cluster create": + command: python3 xpk.py cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-8 --spot --dry-run + "Cluster create private": + command: python3 xpk.py cluster create-pathways --project=golden-project --zone=us-central1-a --cluster=golden-cluster-private --private --tpu-type=v5p-8 --num-slices=1 --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation=golden-reservation --dry-run + "Cluster delete": + command: python3 xpk.py cluster delete --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run + "Cluster delete force": + command: python3 xpk.py cluster delete --project=golden-project --zone=us-central1-a --cluster=golden-cluster --force --dry-run + "Workload create": + command: python3 xpk.py workload create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --workload=golden-workload --command "bash hello" --tpu-type=v5p-8 --num-slices=1 --script-dir=/tmp --dry-run + "Workload create pathways": + command: python3 xpk.py workload create-pathways --project=golden-project --zone=us-central1-a --cluster=golden-cluster --workload=golden-workload --command "bash hello" --tpu-type=v5p-8 --num-slices=1 --script-dir=/tmp --dry-run + "Workload delete": + command: python3 xpk.py workload delete --project=golden-project --zone=us-central1-a --cluster=golden-cluster --workload=golden-workload --dry-run + "Workload list": + command: python3 xpk.py workload list --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run + "Storage list": + command: python3 xpk.py storage list --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run + "Job cancel": + command: python3 xpk.py job cancel golden-job --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run + "Batch": + command: python3 xpk.py batch --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run batch-read.sh + "Job list": + command: python3 xpk.py job ls --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run + "Job info": + command: python3 xpk.py job info golden-job --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run diff --git a/goldens/Basic_cluster_create.txt b/goldens/Basic_cluster_create.txt new file mode 100644 index 000000000..df7685a99 --- /dev/null +++ b/goldens/Basic_cluster_create.txt @@ -0,0 +1,101 @@ +[XPK] Starting xpk +[XPK] Starting cluster create for cluster golden-cluster: +[XPK] Working on golden-project and us-central1-a +[XPK] Task: `Determine server supported GKE versions for default rapid gke version` is implemented by the following command not running since it is a dry run. +gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)" +[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run. +gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.validVersions)" +[XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run. +gcloud container clusters list --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --location-policy=BALANCED --scopes=storage-full,gke-default +[XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run. +gcloud container clusters describe golden-cluster --project=golden-project --region=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)" +[XPK] Private Nodes is not enabled on the cluster. +[XPK] Cluster is public and no need to authorize networks. +[XPK] Try 1: get-credentials to cluster golden-cluster +[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system +[XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run. +kubectl get deployment coredns -n kube-system +[XPK] Now verifying CoreDNS readiness... +[XPK] Task: `Waiting for kubeDNS to be checked.` is implemented by the following command not running since it is a dry run. +kubectl get deployment kube-dns -n kube-system --ignore-not-found +[XPK] kube-dns deployment not found. +[XPK] Verifying if CoreDNS is available... +[XPK] Task: `Wait for coredns available` is implemented by the following command not running since it is a dry run. +kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-system --timeout=240s +[XPK] CoreDNS has successfully started and passed verification. +[XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'. +[XPK] Skipping CoreDNS deployment since it already exists. +[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters describe golden-cluster --region us-central1 --project golden-project --format="value(currentMasterVersion)" +[XPK] Creating 1 node pool or pools of tpu7x-8 +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Creating 1 node pool or pools of tpu7x-8 +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --region=us-central1 --format="value(locations)" +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Existing node pool names ['0'] +[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --region=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --spot --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 +[XPK] Breaking up a total of 1 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Create or delete node pool request complete. +[XPK] Creating ConfigMap for cluster +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available +[XPK] Try 1: Install Jobset on golden-cluster +[XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating jobset Controller Manager resources +[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 +[XPK] Try 1: Install PathwaysJob on golden-cluster +[XPK] Task: `Install PathwaysJob on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.2/install.yaml +[XPK] Enabling Kueue on the cluster +[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run. +kubectl kueue version +[XPK] Try 1: Set Kueue On Cluster +[XPK] Task: `Set Kueue On Cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml +[XPK] Wait for Kueue to be fully available +[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. +kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m +[XPK] Install Kueue Custom Resources +[XPK] Try 1: Applying Kueue Custom Resources +[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 0f9af4fcbf6d012aed0c8e09d49827dc34495d3eed2893af6b68b2d121a519ac +[XPK] Update Kueue Controller Manager resources +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating Kueue Controller Manager resources +[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 012e1b15b6941e9d47cb2cdb35488d57c2f3ce0ef0b18093d2759f2e02ed81dc +[XPK] Verifying kjob installation +[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run. +kubectl-kjob help +[XPK] kjob found +[XPK] Applying kjob CDRs +[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run. +kubectl kjob printcrds | kubectl apply --server-side -f - +[XPK] Creating kjob CRDs succeeded +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 +[XPK] GKE commands done! Resources are created. +[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project +[XPK] Exiting XPK cleanly diff --git a/goldens/Batch.txt b/goldens/Batch.txt new file mode 100644 index 000000000..483cd914f --- /dev/null +++ b/goldens/Batch.txt @@ -0,0 +1,14 @@ +[XPK] Starting xpk +[XPK] Working on golden-project and us-central1-a +[XPK] Try 1: get-credentials to cluster golden-cluster +[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-metadata-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `submit job` is implemented by the following command not running since it is a dry run. +kubectl kjob create slurm --profile xpk-def-app-profile --localqueue multislice-queue --worker-container xpk-batch-container --first-node-ip --pod-template-annotation kueue.x-k8s.io/podset-preferred-topology=cloud.google.com/gce-topology-host -- batch-read.sh --partition multislice-queue +[XPK] XPK Done. diff --git a/goldens/Cluster_create_private.txt b/goldens/Cluster_create_private.txt new file mode 100644 index 000000000..400b59adb --- /dev/null +++ b/goldens/Cluster_create_private.txt @@ -0,0 +1,108 @@ +[XPK] Starting xpk +[XPK] Starting cluster create for cluster golden-cluster-private: +[XPK] Working on golden-project and us-central1-a +[XPK] Task: `Determine server supported GKE versions for default rapid gke version` is implemented by the following command not running since it is a dry run. +gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)" +[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run. +gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.validVersions)" +[XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run. +gcloud container clusters list --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters create golden-cluster-private --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=n1-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 4 --enable-dns-access --autoscaling-profile=optimize-utilization --enable-master-authorized-networks --enable-private-nodes --location-policy=BALANCED --scopes=storage-full,gke-default --enable-ip-alias +[XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run. +gcloud container clusters describe golden-cluster-private --project=golden-project --region=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)" +[XPK] Private Nodes is not enabled on the cluster. +[XPK] Task: `Fetching the list of authorized network from cluster describe.` is implemented by the following command not running since it is a dry run. +gcloud container clusters describe golden-cluster-private --project=golden-project --region=us-central1 --format="value(masterAuthorizedNetworksConfig.cidrBlocks[].cidrBlock)" +[XPK] Current machine's IP adrress is already authorized. +[XPK] Try 1: get-credentials to cluster golden-cluster-private +[XPK] Task: `get-credentials to cluster golden-cluster-private` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster-private --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system +[XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run. +kubectl get deployment coredns -n kube-system +[XPK] Now verifying CoreDNS readiness... +[XPK] Task: `Waiting for kubeDNS to be checked.` is implemented by the following command not running since it is a dry run. +kubectl get deployment kube-dns -n kube-system --ignore-not-found +[XPK] kube-dns deployment not found. +[XPK] Verifying if CoreDNS is available... +[XPK] Task: `Wait for coredns available` is implemented by the following command not running since it is a dry run. +kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-system --timeout=240s +[XPK] CoreDNS has successfully started and passed verification. +[XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'. +[XPK] Skipping CoreDNS deployment since it already exists. +[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters describe golden-cluster-private --region us-central1 --project golden-project --format="value(currentMasterVersion)" +[XPK] Creating 1 node pool or pools of v5p-8 +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=1, device_type='v5p-8') +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster-private --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. +gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a +[XPK] Creating 1 node pool or pools of v5p-8 +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=1, device_type='v5p-8') +[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools describe 0 --cluster golden-cluster-private --project=golden-project --region=us-central1 --format="value(locations)" +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-private-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Existing node pool names ['0'] +[XPK] To complete NodepoolCreate-golden-cluster-private-np-0 we are executing gcloud beta container node-pools create golden-cluster-private-np-0 --region=us-central1 --cluster=golden-cluster-private --project=golden-project --node-locations=us-central1-a --machine-type=ct5p-hightpu-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 +[XPK] To complete NodepoolCreate-cpu-np we are executing gcloud beta container node-pools create cpu-np --node-version=0 --cluster=golden-cluster-private --project=golden-project --node-locations=us-central1-a --region=us-central1 --num-nodes=1 --machine-type=n2-standard-64 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --enable-autoscaling --min-nodes=1 --max-nodes=20 +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Create or delete node pool request complete. +[XPK] Creating ConfigMap for cluster +[XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. +gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available +[XPK] Try 1: Install Jobset on golden-cluster-private +[XPK] Task: `Install Jobset on golden-cluster-private` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating jobset Controller Manager resources +[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 +[XPK] Try 1: Install PathwaysJob on golden-cluster-private +[XPK] Task: `Install PathwaysJob on golden-cluster-private` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.2/install.yaml +[XPK] Enabling Kueue on the cluster +[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run. +kubectl kueue version +[XPK] Try 1: Set Kueue On Cluster +[XPK] Task: `Set Kueue On Cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml +[XPK] Wait for Kueue to be fully available +[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. +kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m +[XPK] Install Kueue Custom Resources +[XPK] Try 1: Applying Kueue Custom Resources +[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f e528cf13756aaba6ef43789e09518fb9bcc6d43a945b51fb76d16e9869c73eec +[XPK] Update Kueue Controller Manager resources +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating Kueue Controller Manager resources +[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 012e1b15b6941e9d47cb2cdb35488d57c2f3ce0ef0b18093d2759f2e02ed81dc +[XPK] Verifying kjob installation +[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run. +kubectl-kjob help +[XPK] kjob found +[XPK] Applying kjob CDRs +[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run. +kubectl kjob printcrds | kubectl apply --server-side -f - +[XPK] Creating kjob CRDs succeeded +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-private-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 +[XPK] GKE commands done! Resources are created. +[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster-private/details?project=golden-project +[XPK] Exiting XPK cleanly diff --git a/goldens/Cluster_delete.txt b/goldens/Cluster_delete.txt new file mode 100644 index 000000000..1ae8aa955 --- /dev/null +++ b/goldens/Cluster_delete.txt @@ -0,0 +1,16 @@ +[XPK] Starting xpk +[XPK] Starting cluster delete for cluster: golden-cluster +[XPK] Working on golden-project and us-central1-a +[XPK] Try 1: get-credentials to cluster golden-cluster +[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Get the name of the workloads in the cluster. +[XPK] Task: `List Jobs with filter-by-status=EVERYTHING` is implemented by the following command not running since it is a dry run. +kubectl get workloads --ignore-not-found -o=custom-columns="Jobset Name:.metadata.ownerReferences[0].name,Created Time:.metadata.creationTimestamp,Priority:.spec.priorityClassName,TPU VMs Needed:.spec.podSets[0].count,TPU VMs Running/Ran:.status.admission.podSetAssignments[-1].count,TPU VMs Done:.status.reclaimablePods[0].count,Status:.status.conditions[-1].type,Status Message:.status.conditions[-1].message,Status Time:.status.conditions[-1].lastTransitionTime" +[XPK] Task: `Cluster Delete` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters delete golden-cluster --project=golden-project --region=us-central1 --quiet +[XPK] Task: `Get All Subnets` is implemented by the following command not running since it is a dry run. +gcloud compute networks subnets list --filter=name~"golden-cluster-us-central1-sub-*" --project=golden-project +[XPK] GKE commands done! Cluster golden-cluster deleted. + +[XPK] Exiting XPK cleanly diff --git a/goldens/Cluster_delete_force.txt b/goldens/Cluster_delete_force.txt new file mode 100644 index 000000000..060b5e5f0 --- /dev/null +++ b/goldens/Cluster_delete_force.txt @@ -0,0 +1,13 @@ +[XPK] Starting xpk +[XPK] Starting cluster delete for cluster: golden-cluster +[XPK] Working on golden-project and us-central1-a +[XPK] Try 1: get-credentials to cluster golden-cluster +[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Task: `Cluster Delete` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters delete golden-cluster --project=golden-project --region=us-central1 --quiet +[XPK] Task: `Get All Subnets` is implemented by the following command not running since it is a dry run. +gcloud compute networks subnets list --filter=name~"golden-cluster-us-central1-sub-*" --project=golden-project +[XPK] GKE commands done! Cluster golden-cluster deleted. + +[XPK] Exiting XPK cleanly diff --git a/goldens/Job_cancel.txt b/goldens/Job_cancel.txt new file mode 100644 index 000000000..2e97a48aa --- /dev/null +++ b/goldens/Job_cancel.txt @@ -0,0 +1,9 @@ +[XPK] Starting xpk +[XPK] Starting job cancel for job: ['golden-job'] +[XPK] Working on golden-project and us-central1-a +[XPK] Try 1: get-credentials to cluster golden-cluster +[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Task: `delete job` is implemented by the following command not running since it is a dry run. +kubectl-kjob delete slurm golden-job +[XPK] Exiting XPK cleanly diff --git a/goldens/Job_info.txt b/goldens/Job_info.txt new file mode 100644 index 000000000..4e8765887 --- /dev/null +++ b/goldens/Job_info.txt @@ -0,0 +1,20 @@ +[XPK] Starting xpk +[XPK] Task: `Getting job data` is implemented by the following command not running since it is a dry run. +kubectl-kjob describe slurm golden-job +[XPK] Task: `Getting job info` is implemented by the following command not running since it is a dry run. +kubectl-kjob list slurm -o yaml --field-selector metadata.name==golden-job +[XPK] Task: `Getting pods list` is implemented by the following command not running since it is a dry run. +kubectl get pods -l=job-name=golden-job --no-headers +Job name: golden-job +Script name: echo hello +Profile: '' +Labels: + kjobctl.x-k8s.io/app-profile: default +Mounts: [] +Pods: +- Name: foo-pod + Status: Running +- Name: bar-pod + Status: Evicted +Entrypoint environment variables template: [] +[XPK] XPK Done. diff --git a/goldens/Job_list.txt b/goldens/Job_list.txt new file mode 100644 index 000000000..76053723d --- /dev/null +++ b/goldens/Job_list.txt @@ -0,0 +1,9 @@ +[XPK] Starting xpk +[XPK] Working on golden-project and us-central1-a +[XPK] Try 1: get-credentials to cluster golden-cluster +[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Listing jobs for project golden-project and zone us-central1-a: +[XPK] Task: `list jobs` is implemented by the following command not running since it is a dry run. +kubectl-kjob list slurm --profile xpk-def-app-profile +[XPK] Exiting XPK cleanly diff --git a/goldens/Storage_list.txt b/goldens/Storage_list.txt new file mode 100644 index 000000000..8da11f27e --- /dev/null +++ b/goldens/Storage_list.txt @@ -0,0 +1,4 @@ +[XPK] Starting xpk +NAME TYPE AUTO MOUNT MOUNT POINT READONLY MANIFEST +------ ------ ------------ ------------- ---------- ---------- +[XPK] XPK Done. diff --git a/goldens/Workload_create.txt b/goldens/Workload_create.txt new file mode 100644 index 000000000..849c85e56 --- /dev/null +++ b/goldens/Workload_create.txt @@ -0,0 +1,33 @@ +[XPK] Starting xpk +[XPK] Task: `Check if Workload Already Exists` is implemented by the following command not running since it is a dry run. +kubectl get workloads -o=custom-columns='Jobset:.metadata.ownerReferences[0].name' +[XPK] Starting workload create +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Starting workload create +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-metadata-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] gke_accelerator type not found in config map: golden-cluster-resources-configmap. Autoprovisioning is not enabled. +[XPK] No gcsfuse Storages to add detected +[XPK] No gcp filestore instances to add detected. +[XPK] No gcp parallelstore instances to add detected. +[XPK] No gce persistent disk instances to add detected. +[XPK] No managed lustre instances to add detected. +[XPK] Building /tmp into docker image. +[XPK] Task: `Building script_dir into docker image` is implemented by the following command not running since it is a dry run. +docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94c652303f1dc0fecad085ea0993f688 -t dry-run-runner /tmp +[XPK] Adding Docker Image: gcr.io/golden-project/dry-run-runner:prefix-current to golden-project +[XPK] Task: `Tag Docker Image` is implemented by the following command not running since it is a dry run. +docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current +[XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. +docker push gcr.io/golden-project/dry-run-runner:prefix-current +[XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. +kubectl apply -f 635bfd38f34d48a6cc3863a2a2b00acfabe36ea1b6737e0cc816467a41fca144 +[XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. +gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error +[XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard. +[XPK] Follow your workload here: https://console.cloud.google.com/kubernetes/service/us-central1/golden-cluster/default/golden-workload/details?project=golden-project +[XPK] Follow your worker 0, slice 0 logs here: Adjust the pod name ([prefix]-slice-job-[slice_number]-[worker_number]) after clicking the url if you want other worker logs. https://console.cloud.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.project_id%3D%22golden-project%22%0Aresource.labels.location%3D%22us-central1%22%0Aresource.labels.cluster_name%3D%22golden-cluster%22%0Aresource.labels.namespace_name%3D%22default%22%0Aresource.labels.pod_name:%22golden-workload-slice-job-0-0-%22%20severity%3E%3DDEFAULT;storageScope=project;duration=P1D?e=13802955&mods=allow_workbench_image_override&project=golden-project +[XPK] Exiting XPK cleanly diff --git a/goldens/Workload_create_pathways.txt b/goldens/Workload_create_pathways.txt new file mode 100644 index 000000000..e05b6840d --- /dev/null +++ b/goldens/Workload_create_pathways.txt @@ -0,0 +1,32 @@ +[XPK] Starting xpk +[XPK] Task: `Check if Workload Already Exists` is implemented by the following command not running since it is a dry run. +kubectl get workloads -o=custom-columns='Jobset:.metadata.ownerReferences[0].name' +[XPK] Starting workload create +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Starting workload create +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-metadata-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] gke_accelerator type not found in config map: golden-cluster-resources-configmap. Autoprovisioning is not enabled. +[XPK] Task: `Check if PathwaysJob is installed on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl get pods -n pathways-job-system --no-headers -o custom-columns=NAME:.metadata.name +[XPK] check_if_pathways_job_is_installed 0 0 +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Building /tmp into docker image. +[XPK] Task: `Building script_dir into docker image` is implemented by the following command not running since it is a dry run. +docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94c652303f1dc0fecad085ea0993f688 -t dry-run-runner /tmp +[XPK] Adding Docker Image: gcr.io/golden-project/dry-run-runner:prefix-current to golden-project +[XPK] Task: `Tag Docker Image` is implemented by the following command not running since it is a dry run. +docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current +[XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. +docker push gcr.io/golden-project/dry-run-runner:prefix-current +[XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. +kubectl apply -f 871fa8b4813a0c43d7d5f0088986e20d11d4f093d6986a542d92a9420afa632b +[XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. +gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error +[XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard. +[XPK] Follow your Pathways workload and other resources here : https://console.cloud.google.com/logs/query;query=resource.type%3D"k8s_container"%0Aresource.labels.project_id%3D"golden-project"%0Aresource.labels.location%3D"us-central1"%0Aresource.labels.cluster_name%3D"golden-cluster"%0Aresource.labels.pod_name:"golden-workload-"%0Aseverity>%3DDEFAULT +[XPK] Exiting XPK cleanly diff --git a/goldens/Workload_delete.txt b/goldens/Workload_delete.txt new file mode 100644 index 000000000..c719630aa --- /dev/null +++ b/goldens/Workload_delete.txt @@ -0,0 +1,12 @@ +[XPK] Starting xpk +[XPK] Starting Workload delete +[XPK] Working on golden-project and us-central1-a +[XPK] Try 1: get-credentials to cluster golden-cluster +[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Task: `Check if PathwaysJob is installed on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl get pods -n pathways-job-system --no-headers -o custom-columns=NAME:.metadata.name +[XPK] check_if_pathways_job_is_installed 0 0 +[XPK] Task: `Delete Workload` is implemented by the following command not running since it is a dry run. +kubectl delete pathwaysjob golden-workload -n default +[XPK] Exiting XPK cleanly diff --git a/goldens/Workload_list.txt b/goldens/Workload_list.txt new file mode 100644 index 000000000..4c608fdfb --- /dev/null +++ b/goldens/Workload_list.txt @@ -0,0 +1,12 @@ +[XPK] Starting xpk +[XPK] Starting workload list +[XPK] Working on golden-project and us-central1-a +[XPK] Try 1: get-credentials to cluster golden-cluster +[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Task: `List Jobs with filter-by-status=EVERYTHING with filter-by-job=None` is implemented by the following command not running since it is a dry run. +kubectl get workloads --ignore-not-found -o=custom-columns="Jobset Name:.metadata.ownerReferences[0].name,Created Time:.metadata.creationTimestamp,Priority:.spec.priorityClassName,TPU VMs Needed:.spec.podSets[0].count,TPU VMs Running/Ran:.status.admission.podSetAssignments[-1].count,TPU VMs Done:.status.reclaimablePods[0].count,Status:.status.conditions[-1].type,Status Message:.status.conditions[-1].message,Status Time:.status.conditions[-1].lastTransitionTime" +[XPK] Workload List Output: +0 +[XPK] See your workloads in Cloud Console: https://console.cloud.google.com/kubernetes/aiml/deployments/jobs?project=golden-project +[XPK] Exiting XPK cleanly diff --git a/src/xpk/commands/batch.py b/src/xpk/commands/batch.py index 9759e23e3..e52edb9a8 100644 --- a/src/xpk/commands/batch.py +++ b/src/xpk/commands/batch.py @@ -31,6 +31,7 @@ ) from ..core.kueue import LOCAL_QUEUE_NAME from ..utils.console import xpk_exit, xpk_print +from ..utils.execution_context import is_dry_run from .kind import set_local_cluster_command from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command @@ -51,18 +52,16 @@ def batch(args: Namespace) -> None: if set_cluster_command_code != 0: xpk_exit(set_cluster_command_code) - err_code = prepare_kjob(args) - if err_code > 0: - xpk_exit(err_code) - setup_k8s_service_accounts() + if not is_dry_run(): + err_code = prepare_kjob(args) + if err_code > 0: + xpk_exit(err_code) + setup_k8s_service_accounts() submit_job(args) def submit_job(args: Namespace) -> None: - - setup_k8s_service_accounts() - cmd = ( 'kubectl kjob create slurm' f' --profile {AppProfileDefaults.NAME.value}' @@ -73,7 +72,8 @@ def submit_job(args: Namespace) -> None: cmd = add_gpu_networking_annotations_to_command(args, cmd) cmd = add_TAS_annotations_to_command(args, cmd) - for annotation in get_storage_annotations(args): + annotations = [] if is_dry_run() else get_storage_annotations(args) + for annotation in annotations: cmd += f' --pod-template-annotation {annotation}' if args.ignore_unknown_flags: diff --git a/src/xpk/commands/common.py b/src/xpk/commands/common.py index 7020a817f..53a377ac5 100644 --- a/src/xpk/commands/common.py +++ b/src/xpk/commands/common.py @@ -18,6 +18,7 @@ from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType from ..core.gcloud_context import zone_to_region from ..utils.console import xpk_print, xpk_exit +from ..utils.execution_context import is_dry_run from ..core.system_characteristics import ( SystemCharacteristics, ) @@ -63,6 +64,9 @@ def is_TAS_possible( True if possible and False otherwise. """ + if is_dry_run(): + return True + if system_characteristics is None: xpk_print('system_characteristics data was not found in configmaps.') xpk_exit(1) diff --git a/src/xpk/commands/job.py b/src/xpk/commands/job.py index 250fa9946..4d9d21457 100644 --- a/src/xpk/commands/job.py +++ b/src/xpk/commands/job.py @@ -28,6 +28,28 @@ from .kind import set_local_cluster_command +JOBS_DRY_RUN_YAML = """ +items: +- apiVersion: slurm.k8s.io/v1alpha1 + kind: SlurmJob + metadata: + annotations: + kjobctl.x-k8s.io/script: echo hello + creationTimestamp: '2024-04-29T12:00:00Z' + labels: + kjobctl.x-k8s.io/app-profile: default + name: golden-job + namespace: default + spec: + script: echo hello +""" + +PODS_DRY_RUN_RESULT = """ +foo-pod 2/2 Running 0 2d +bar-pod 1/1 Evicted 0 1d +""" + + def job_info(args): """Run commands obtaining information about a job given by name. @@ -52,7 +74,10 @@ def job_info(args): f' metadata.name=={job_name}' ) job_code, job_text = run_command_for_value( - job_command, 'Getting job info', args + job_command, + 'Getting job info', + args, + dry_run_return_val=JOBS_DRY_RUN_YAML, ) if job_code != 0: xpk_print(f'Job info request returned ERROR {job_code}') @@ -60,7 +85,10 @@ def job_info(args): pods_command = f'kubectl get pods -l=job-name={job_name} --no-headers' pods_code, pods_text = run_command_for_value( - pods_command, 'Getting pods list', args + pods_command, + 'Getting pods list', + args, + dry_run_return_val=PODS_DRY_RUN_RESULT, ) if pods_code != 0: xpk_print(f'Pods list request returned ERROR {pods_code}') diff --git a/src/xpk/commands/storage.py b/src/xpk/commands/storage.py index fb3ba85ca..e27cdcfbc 100644 --- a/src/xpk/commands/storage.py +++ b/src/xpk/commands/storage.py @@ -58,6 +58,7 @@ ) from ..utils.console import get_user_input, xpk_exit, xpk_print from ..utils.kubectl import apply_kubectl_manifest +from ..utils.execution_context import is_dry_run def storage_create(args: Namespace) -> None: @@ -243,8 +244,10 @@ def enable_csi_drivers_if_necessary(args: Namespace) -> None: def storage_list(args: Namespace) -> None: - k8s_api_client = setup_k8s_env(args) - storages = list_storages(k8s_api_client) + storages = [] + if not is_dry_run(): + k8s_api_client = setup_k8s_env(args) + storages = list_storages(k8s_api_client) print_storages_for_cluster(storages) diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index 548d5c47f..66e061de8 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -97,6 +97,7 @@ ) from ..utils.console import get_user_input, xpk_exit, xpk_print from ..utils.file import write_tmp_file +from ..utils.execution_context import is_dry_run from . import cluster_gcluster from .common import is_TAS_possible @@ -306,8 +307,10 @@ def workload_create(args) -> None: Returns: 0 if successful and 1 otherwise. """ - k8s_api_client = setup_k8s_env(args) - setup_k8s_service_accounts() + k8s_api_client = None + if not is_dry_run(): + k8s_api_client = setup_k8s_env(args) + setup_k8s_service_accounts() workload_exists = check_if_workload_exists(args) @@ -383,8 +386,10 @@ def workload_create(args) -> None: all_storages = [] # Currently storage customization is not supported for Pathways workloads. b/408468941 if not args.use_pathways: - storages: list[Storage] = get_storages_to_mount( - k8s_api_client, args.storage + storages: list[Storage] = ( + [] + if k8s_api_client is None + else get_storages_to_mount(k8s_api_client, args.storage) ) gcs_fuse_storages = list( filter(lambda storage: storage.type == GCS_FUSE_TYPE, storages) @@ -576,7 +581,7 @@ def workload_create(args) -> None: xpk_print(f'Create Workload request returned ERROR {return_code}') xpk_exit(return_code) - if not args.use_pathways: + if not args.use_pathways and not is_dry_run(): add_bucket_iam_members(args, storages) # Get GKE outlier dashboard for TPU @@ -747,8 +752,6 @@ def workload_list(args) -> None: Returns: 0 if successful and 1 otherwise. """ - xpk_print(args) - xpk_print('Starting workload list', flush=True) add_zone_and_project(args) get_cluster_credentials(args) diff --git a/src/xpk/core/cluster_private.py b/src/xpk/core/cluster_private.py index b5212b6da..3dd4b7f8e 100644 --- a/src/xpk/core/cluster_private.py +++ b/src/xpk/core/cluster_private.py @@ -19,6 +19,7 @@ add_current_machine_to_networks, is_current_machine_in_any_network, ) +from ..utils.execution_context import is_dry_run from ..utils.objects import is_text_true from .commands import run_command_for_value, run_command_with_updates from .gcloud_context import zone_to_region @@ -37,7 +38,7 @@ def authorize_private_cluster_access_if_necessary(args) -> int: if not args.private and args.authorized_networks is None: xpk_print('Cluster is public and no need to authorize networks.') return 0 - else: + elif not is_dry_run(): xpk_print( 'Cannot convert an existing public cluster to private. The arguments' ' --private and --authorized-networks are not acceptable for public' @@ -164,6 +165,7 @@ def get_cluster_authorized_networks(args) -> list[str]: command, 'Fetching the list of authorized network from cluster describe.', args, + dry_run_return_val='127.0.0.1/32', ) if return_code != 0: diff --git a/src/xpk/core/docker_image.py b/src/xpk/core/docker_image.py index c31b2c9fc..75050eb8d 100644 --- a/src/xpk/core/docker_image.py +++ b/src/xpk/core/docker_image.py @@ -21,6 +21,7 @@ from ..utils.console import xpk_exit, xpk_print from ..utils.file import write_tmp_file +from ..utils.execution_context import is_dry_run from .commands import run_command_with_updates DEFAULT_DOCKER_IMAGE = 'python:3.10' @@ -75,7 +76,9 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]: """ # Pick a name for the docker image. - docker_image_prefix = os.getenv('USER', 'unknown') + docker_image_prefix = ( + 'dry-run' if is_dry_run() else os.getenv('USER', 'unknown') + ) docker_name = f'{docker_image_prefix}-runner' script_dir_dockerfile = """FROM {base_docker_image} @@ -114,10 +117,16 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]: # Pick a randomly generated `tag_length` character docker tag. tag_length = 4 - tag_random_prefix = ''.join( - random.choices(string.ascii_lowercase, k=tag_length) + tag_random_prefix = ( + 'prefix' + if is_dry_run() + else ''.join(random.choices(string.ascii_lowercase, k=tag_length)) + ) + tag_datetime = ( + 'current' + if is_dry_run() + else datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') ) - tag_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') tag_name = f'{tag_random_prefix}-{tag_datetime}' cloud_docker_image = f'gcr.io/{args.project}/{docker_name}:{tag_name}' xpk_print(f'Adding Docker Image: {cloud_docker_image} to {args.project}') diff --git a/src/xpk/core/docker_resources.py b/src/xpk/core/docker_resources.py index a36651e47..0519845e9 100644 --- a/src/xpk/core/docker_resources.py +++ b/src/xpk/core/docker_resources.py @@ -20,6 +20,7 @@ from .cluster import setup_k8s_env from .storage import GCS_FUSE_TYPE, GCP_FILESTORE_TYPE, PARALLELSTORE_TYPE, GCE_PD_TYPE, LUSTRE_TYPE, Storage, get_storages_to_mount from .system_characteristics import AcceleratorType, SystemCharacteristics +from ..utils.execution_context import is_dry_run def get_main_container_resources( @@ -272,8 +273,10 @@ def get_volumes(args, system: SystemCharacteristics) -> str: - name: shared-data """ - storages: list[Storage] = get_storages_to_mount( - setup_k8s_env(args), args.storage + storages: list[Storage] = ( + [] + if is_dry_run() + else get_storages_to_mount(setup_k8s_env(args), args.storage) ) for storage in storages: if storage.type in { @@ -325,8 +328,10 @@ def get_volume_mounts(args, system: SystemCharacteristics) -> str: elif system.accelerator_type == AcceleratorType['GPU']: volume_mount_yaml = '' - storages: list[Storage] = get_storages_to_mount( - setup_k8s_env(args), args.storage + storages: list[Storage] = ( + [] + if is_dry_run() + else get_storages_to_mount(setup_k8s_env(args), args.storage) ) for storage in storages: if storage.type in { diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py index 291afef68..017fb885e 100644 --- a/src/xpk/core/pathways.py +++ b/src/xpk/core/pathways.py @@ -19,6 +19,7 @@ from ..core.gcloud_context import zone_to_region from ..core.nodepool import get_all_nodepools_programmatic from ..utils.console import xpk_exit, xpk_print +from ..utils.execution_context import is_dry_run from .system_characteristics import AcceleratorType, SystemCharacteristics @@ -79,7 +80,10 @@ def ensure_pathways_workload_prerequisites(args, system) -> bool: # Ensure the cluster and CPU nodepools were created with create-pathways all_node_pools = get_all_nodepools_programmatic(args) desired_pw_cpu_node_pools = {'cpu-np'} - if not desired_pw_cpu_node_pools.issubset(set(all_node_pools[0])): + if ( + not desired_pw_cpu_node_pools.issubset(set(all_node_pools[0])) + and not is_dry_run() + ): xpk_print( 'Cluster needs to be created with `xpk create-pathways` to run' ' Pathways workloads.' diff --git a/src/xpk/core/scheduling.py b/src/xpk/core/scheduling.py index d8957e133..ef6b469ce 100644 --- a/src/xpk/core/scheduling.py +++ b/src/xpk/core/scheduling.py @@ -15,6 +15,7 @@ """ from ..utils.console import xpk_print +from ..utils.execution_context import is_dry_run from .capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE from .resources import CLUSTER_RESOURCES_CONFIGMAP, get_cluster_configmap from .system_characteristics import ( @@ -45,6 +46,9 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool: ) return True + if is_dry_run(): + return True + # Check for gke accelerator type: missing_gke_accelerator_type = False if not cluster_config_map.get(system.gke_accelerator): diff --git a/src/xpk/utils/network.py b/src/xpk/utils/network.py index cd506f760..509da276e 100644 --- a/src/xpk/utils/network.py +++ b/src/xpk/utils/network.py @@ -18,6 +18,7 @@ import socket import requests from .console import xpk_print +from .execution_context import is_dry_run # Retrives machine's external IP address ip_resolver_url = "http://api.ipify.org" @@ -36,6 +37,9 @@ def get_current_machine_ip(external_ip=True): The IP address as a string. """ + if is_dry_run(): + return 0, "127.0.0.1" + try: if external_ip: # Get external IP address From 1c603d79c45f612b9299af2658c0e14f84906323 Mon Sep 17 00:00:00 2001 From: Konrad Kaim <31181410+scaliby@users.noreply.github.com> Date: Mon, 22 Sep 2025 15:33:10 +0200 Subject: [PATCH 05/12] build: simplify dependencies on build_test so it runs faster (#642) --- .github/workflows/build_tests.yaml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_tests.yaml b/.github/workflows/build_tests.yaml index 61ded0a56..f756418e5 100644 --- a/.github/workflows/build_tests.yaml +++ b/.github/workflows/build_tests.yaml @@ -28,6 +28,9 @@ on: branches: ["main","develop"] pull_request: # By default this runs for types assigned, opened and synchronize. +permissions: + contents: read + jobs: set-variables: runs-on: [ubuntu-22.04] @@ -124,7 +127,7 @@ jobs: ${{env.pythonLocation}} key: xpk-deps-${{ matrix.python-version }}-${{github.run_id}}-${{github.run_attempt}} linter: - needs: [install-dependencies, set-variables] + needs: [set-variables] concurrency: # We support one build or nightly test to run at a time currently. group: linter-${{needs.set-variables.outputs.run-id}} cancel-in-progress: true @@ -132,12 +135,12 @@ jobs: with: run-id: '${{needs.set-variables.outputs.run-id}}' verify-goldens: - needs: [install-dependencies, set-variables] + needs: [set-variables] uses: ./.github/workflows/reusable_goldens.yaml with: run-id: '${{needs.set-variables.outputs.run-id}}' run-unit-tests: - needs: [install-dependencies, set-variables] + needs: [set-variables] uses: ./.github/workflows/reusable_unit_tests.yaml with: run-id: ${{needs.set-variables.outputs.run-id}} From e5c646a60d36a47f272e23cd6e8090732bf173a9 Mon Sep 17 00:00:00 2001 From: Konrad Kaim <31181410+scaliby@users.noreply.github.com> Date: Tue, 23 Sep 2025 10:56:15 +0200 Subject: [PATCH 06/12] Revert "build: simplify dependencies on build_test so it runs faster (#642)" (#644) This reverts commit 1c603d79c45f612b9299af2658c0e14f84906323. --- .github/workflows/build_tests.yaml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build_tests.yaml b/.github/workflows/build_tests.yaml index f756418e5..61ded0a56 100644 --- a/.github/workflows/build_tests.yaml +++ b/.github/workflows/build_tests.yaml @@ -28,9 +28,6 @@ on: branches: ["main","develop"] pull_request: # By default this runs for types assigned, opened and synchronize. -permissions: - contents: read - jobs: set-variables: runs-on: [ubuntu-22.04] @@ -127,7 +124,7 @@ jobs: ${{env.pythonLocation}} key: xpk-deps-${{ matrix.python-version }}-${{github.run_id}}-${{github.run_attempt}} linter: - needs: [set-variables] + needs: [install-dependencies, set-variables] concurrency: # We support one build or nightly test to run at a time currently. group: linter-${{needs.set-variables.outputs.run-id}} cancel-in-progress: true @@ -135,12 +132,12 @@ jobs: with: run-id: '${{needs.set-variables.outputs.run-id}}' verify-goldens: - needs: [set-variables] + needs: [install-dependencies, set-variables] uses: ./.github/workflows/reusable_goldens.yaml with: run-id: '${{needs.set-variables.outputs.run-id}}' run-unit-tests: - needs: [set-variables] + needs: [install-dependencies, set-variables] uses: ./.github/workflows/reusable_unit_tests.yaml with: run-id: ${{needs.set-variables.outputs.run-id}} From 7e0bcd6d59398914be77b96324a1989cb500deed Mon Sep 17 00:00:00 2001 From: FIoannides Date: Tue, 23 Sep 2025 14:55:48 +0200 Subject: [PATCH 07/12] Add cpu and memory limit flags and use them in Kueue configuration (#648) * Add cpu and memory limit flags and use them in Kueue configuration * Remove print line --- src/xpk/core/kueue.py | 25 +++++++++++++++++++++---- src/xpk/parser/cluster.py | 26 ++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index 49f57a4fd..8f8e63c00 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -436,6 +436,8 @@ def install_kueue_crs( cluster_hardware_name=cluster_hardware_name, resource_type=resource_type, total_chips=total_chips, + cpu_limit=args.cpu_limit, + memory_limit=args.memory_limit, ) topology_label = '' if system.device_type in [ @@ -473,6 +475,7 @@ def install_kueue_crs( ]: yml_string = topology_yaml + yml_string + print(yml_string) tmp = write_tmp_file(yml_string) command = f'kubectl apply -f {str(tmp)}' @@ -484,7 +487,7 @@ def install_kueue_crs( def get_kueue_covered_resources_config( - cluster_hardware_name, resource_type, total_chips + cluster_hardware_name, resource_type, total_chips, cpu_limit, memory_limit ) -> str: """Gets Kueue covered resources configuration. @@ -497,17 +500,31 @@ def get_kueue_covered_resources_config( A string of Kueue covered resources configuration. """ config_format = """ - - coveredResources: ["{resource_type}"] + - coveredResources: {resource_types} flavors: - name: {cluster_hardware_name} resources: - name: "{resource_type}" - nominalQuota: {total_chips} - """ + nominalQuota: {total_chips}""" + resource_types = [resource_type] + if cpu_limit: + config_format = config_format + """ + - name: "cpu" + nominalQuota: {cpu_limit}""" + resource_types.append('cpu') + if memory_limit: + config_format = config_format + """ + - name: "memory" + nominalQuota: {memory_limit}""" + resource_types.append('memory') + config_string = config_format.format( cluster_hardware_name=cluster_hardware_name, + resource_types=resource_types, resource_type=resource_type, total_chips=total_chips, + cpu_limit=cpu_limit, + memory_limit=memory_limit, ) return config_string diff --git a/src/xpk/parser/cluster.py b/src/xpk/parser/cluster.py index 663a6bd3b..ce2fd8260 100644 --- a/src/xpk/parser/cluster.py +++ b/src/xpk/parser/cluster.py @@ -176,6 +176,12 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser): add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments) cluster_create_parser.set_defaults(func=cluster_create) + cluster_create_resource_limits = cluster_create_parser.add_argument_group( + 'Optional Resource Limits Arguments', + 'Arguments for configuring resource limits in cluster create.', + ) + add_resource_limits(cluster_create_resource_limits) + def set_cluster_create_pathways_parser( cluster_create_pathways_parser: ArgumentParser, @@ -887,3 +893,23 @@ def add_shared_cluster_create_mtc_arguments( ' checkpointing. By default, it is set to "google.com/tpu".' ), ) + + +def add_resource_limits(parser_or_group: ParserOrArgumentGroup): + """Add resource limits arguments in cluster create. + + Args: + List of cluster create resource limits arguments parsers or group + """ + parser_or_group.add_argument( + '--memory-limit', + type=str, + default=None, + help='The memory limit for the Kueue controller manager.', + ) + parser_or_group.add_argument( + '--cpu-limit', + type=int, + default=None, + help='The CPU limit for the Kueue controller manager.', + ) From f64143ceadbaea482005b841a8e0328fe8a318ba Mon Sep 17 00:00:00 2001 From: Feidias Ioannidis Date: Tue, 23 Sep 2025 13:04:35 +0000 Subject: [PATCH 08/12] Release v0.13.0 --- src/xpk/core/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpk/core/config.py b/src/xpk/core/config.py index 3a3992657..b67a6026a 100644 --- a/src/xpk/core/config.py +++ b/src/xpk/core/config.py @@ -22,7 +22,7 @@ from ..utils.console import xpk_print # This is the version for XPK PyPI package -__version__ = 'v0.12.0' +__version__ = 'v0.13.0' XPK_CURRENT_VERSION = __version__ XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml') From 386c2c7bdf977124b797cadbf2371db0d76d9c3d Mon Sep 17 00:00:00 2001 From: FIoannides Date: Tue, 23 Sep 2025 17:16:19 +0200 Subject: [PATCH 09/12] Cpu and memory limits (#651) * Add cpu and memory limit flags and use them in Kueue configuration * Remove print line * Add cpu-limit and memory-limit for other types of cluster creation as well --- goldens/Basic_cluster_create.txt | 2 +- goldens/Cluster_create_private.txt | 2 +- goldens/NAP_cluster-create.txt | 2 +- goldens/NAP_cluster-create_with_pathways.txt | 2 +- src/xpk/core/kueue.py | 1 - src/xpk/parser/cluster.py | 19 ++++++++++++++++++- 6 files changed, 22 insertions(+), 6 deletions(-) diff --git a/goldens/Basic_cluster_create.txt b/goldens/Basic_cluster_create.txt index df7685a99..03fce8269 100644 --- a/goldens/Basic_cluster_create.txt +++ b/goldens/Basic_cluster_create.txt @@ -73,7 +73,7 @@ kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=avai [XPK] Install Kueue Custom Resources [XPK] Try 1: Applying Kueue Custom Resources [XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f 0f9af4fcbf6d012aed0c8e09d49827dc34495d3eed2893af6b68b2d121a519ac +kubectl apply -f c49da377b542c14a80a64a13236f8d3a1c8e022dc7c82cc6f6f0560d980ee9e7 [XPK] Update Kueue Controller Manager resources [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. kubectl get node --no-headers | wc -l diff --git a/goldens/Cluster_create_private.txt b/goldens/Cluster_create_private.txt index 400b59adb..4c399efab 100644 --- a/goldens/Cluster_create_private.txt +++ b/goldens/Cluster_create_private.txt @@ -80,7 +80,7 @@ kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=avai [XPK] Install Kueue Custom Resources [XPK] Try 1: Applying Kueue Custom Resources [XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f e528cf13756aaba6ef43789e09518fb9bcc6d43a945b51fb76d16e9869c73eec +kubectl apply -f ec56970df5766f33e470374e087b3061d9960c171fce12fdb2d75170eb75fe55 [XPK] Update Kueue Controller Manager resources [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. kubectl get node --no-headers | wc -l diff --git a/goldens/NAP_cluster-create.txt b/goldens/NAP_cluster-create.txt index dd1df293b..863da3314 100644 --- a/goldens/NAP_cluster-create.txt +++ b/goldens/NAP_cluster-create.txt @@ -84,7 +84,7 @@ kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=avai [XPK] Install Kueue Custom Resources [XPK] Try 1: Applying Kueue Custom Resources [XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f b3843453fb19ae7105126245bac5b63930f46861462cd3a557aea44801a99280 +kubectl apply -f eaa77bda2c85901c627ae9bb4baacdb37df006d6bf267b319b6bc8b2cbf7ca7e [XPK] Update Kueue Controller Manager resources [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. kubectl get node --no-headers | wc -l diff --git a/goldens/NAP_cluster-create_with_pathways.txt b/goldens/NAP_cluster-create_with_pathways.txt index f0c8ba7f6..58bedcf6f 100644 --- a/goldens/NAP_cluster-create_with_pathways.txt +++ b/goldens/NAP_cluster-create_with_pathways.txt @@ -85,7 +85,7 @@ kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=avai [XPK] Install Kueue Custom Resources [XPK] Try 1: Applying Kueue Custom Resources [XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f 898c7686cc5ef7f026f74e55b73b5843767e3f1abb9639169f02ebc44d06af73 +kubectl apply -f 7ffd24a656c1ec9c1d331862e352cefd5348637b0f776a8e3db888b04fa7fad6 [XPK] Update Kueue Controller Manager resources [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. kubectl get node --no-headers | wc -l diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index 8f8e63c00..fd4b8437d 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -475,7 +475,6 @@ def install_kueue_crs( ]: yml_string = topology_yaml + yml_string - print(yml_string) tmp = write_tmp_file(yml_string) command = f'kubectl apply -f {str(tmp)}' diff --git a/src/xpk/parser/cluster.py b/src/xpk/parser/cluster.py index ce2fd8260..18dd2d04f 100644 --- a/src/xpk/parser/cluster.py +++ b/src/xpk/parser/cluster.py @@ -174,7 +174,6 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser): 'Arguments for configuring MTC in cluster create.', ) add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments) - cluster_create_parser.set_defaults(func=cluster_create) cluster_create_resource_limits = cluster_create_parser.add_argument_group( 'Optional Resource Limits Arguments', @@ -182,6 +181,8 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser): ) add_resource_limits(cluster_create_resource_limits) + cluster_create_parser.set_defaults(func=cluster_create) + def set_cluster_create_pathways_parser( cluster_create_pathways_parser: ArgumentParser, @@ -251,6 +252,15 @@ def set_cluster_create_pathways_parser( ) ) add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments) + + cluster_create_resource_limits = ( + cluster_create_pathways_parser.add_argument_group( + 'Optional Resource Limits Arguments', + 'Arguments for configuring resource limits in cluster create.', + ) + ) + add_resource_limits(cluster_create_resource_limits) + cluster_create_pathways_parser.set_defaults(func=cluster_create_pathways) @@ -326,6 +336,13 @@ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser): 'Arguments for configuring MTC in cluster create.', ) add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments) + + cluster_create_resource_limits = cluster_create_ray_parser.add_argument_group( + 'Optional Resource Limits Arguments', + 'Arguments for configuring resource limits in cluster create.', + ) + add_resource_limits(cluster_create_resource_limits) + cluster_create_ray_parser.set_defaults(func=cluster_create_ray_cluster) From 7b8782471d8c7242c05bbe9efcac066741103dda Mon Sep 17 00:00:00 2001 From: Feidias Ioannidis Date: Tue, 23 Sep 2025 13:04:35 +0000 Subject: [PATCH 10/12] Release v0.13.0 --- src/xpk/core/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpk/core/config.py b/src/xpk/core/config.py index 3a3992657..b67a6026a 100644 --- a/src/xpk/core/config.py +++ b/src/xpk/core/config.py @@ -22,7 +22,7 @@ from ..utils.console import xpk_print # This is the version for XPK PyPI package -__version__ = 'v0.12.0' +__version__ = 'v0.13.0' XPK_CURRENT_VERSION = __version__ XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml') From 2e2745e676782acfe0f5a63dcbd0d7ad06a84677 Mon Sep 17 00:00:00 2001 From: Konrad Kaim <31181410+scaliby@users.noreply.github.com> Date: Tue, 23 Sep 2025 18:37:52 +0200 Subject: [PATCH 11/12] Remove args from run_command_for_value (#647) * feat: remove args from run_command_for_value * feat: remove redundant args --- src/xpk/commands/batch.py | 2 +- src/xpk/commands/cluster.py | 45 +++++++++++++--------------- src/xpk/commands/cluster_gcluster.py | 1 - src/xpk/commands/info.py | 13 ++++---- src/xpk/commands/inspector.py | 2 +- src/xpk/commands/job.py | 6 +--- src/xpk/commands/kind.py | 13 ++++---- src/xpk/commands/kjob_common.py | 6 ++-- src/xpk/commands/shell.py | 1 - src/xpk/commands/workload.py | 4 +-- src/xpk/core/capacity.py | 4 +-- src/xpk/core/cluster.py | 17 +++-------- src/xpk/core/cluster_private.py | 2 -- src/xpk/core/commands.py | 5 ++-- src/xpk/core/gcloud_context.py | 1 - src/xpk/core/jobset.py | 2 +- src/xpk/core/kjob.py | 20 +++++-------- src/xpk/core/kueue.py | 14 ++++----- src/xpk/core/monitoring.py | 2 +- src/xpk/core/nap.py | 6 ++-- src/xpk/core/network.py | 15 ++++------ src/xpk/core/nodepool.py | 8 ++--- src/xpk/core/pathways.py | 2 +- src/xpk/core/ray.py | 13 ++++---- src/xpk/core/resources.py | 10 +++---- src/xpk/core/scheduling.py | 2 +- src/xpk/core/vertex.py | 2 +- src/xpk/core/workload.py | 9 +++--- src/xpk/utils/validation.py | 4 +-- 29 files changed, 93 insertions(+), 138 deletions(-) diff --git a/src/xpk/commands/batch.py b/src/xpk/commands/batch.py index e52edb9a8..65c333907 100644 --- a/src/xpk/commands/batch.py +++ b/src/xpk/commands/batch.py @@ -126,7 +126,7 @@ def submit_job(args: Namespace) -> None: if args.time is not None: cmd += f' --time {args.time}' - return_code, return_value = run_command_for_value(cmd, 'submit job', args) + return_code, return_value = run_command_for_value(cmd, 'submit job') if return_code != 0: xpk_print(f'Running batch job returned ERROR {return_code}') diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index d8bf77d45..b6b28ab14 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -109,7 +109,7 @@ def cluster_adapt(args) -> None: 'Argument --num-nodes was not provided, trying to determine number of' ' nodes based on the available nodes in the cluster...' ) - args.num_nodes = count_nodes_on_cluster(args, system) + args.num_nodes = count_nodes_on_cluster(system) if args.num_nodes == 0: xpk_print( 'Found unexpected number of nodes. Is the --device-type correct?' @@ -445,7 +445,7 @@ def cluster_describe(args) -> None: get_cluster_credentials(args) - return_code, data_table = nodepools_build_table(args) + return_code, data_table = nodepools_build_table() if return_code != 0: xpk_exit(return_code) @@ -461,7 +461,6 @@ def cluster_describe(args) -> None: r'kubectl get node --no-headers=true' r" --selector='cloud.google.com/gke-tpu-accelerator' | wc -l", 'Count TPU Nodes', - args, ) if return_code_node_output != 0: xpk_exit(return_code_node_output) @@ -472,7 +471,6 @@ def cluster_describe(args) -> None: "kubectl get pod -o=custom-columns='Status:.status.phase' | grep -i" ' Running | wc -l', 'Count TPU Pods', - args, ) if return_code_pod_output != 0: xpk_exit(return_code_pod_output) @@ -487,7 +485,7 @@ def cluster_describe(args) -> None: xpk_exit(0) -def nodepools_build_table(args) -> tuple[int, list[list]]: +def nodepools_build_table() -> tuple[int, list[list]]: table = [[ 'NODEPOOL_NAME', 'SLICE', @@ -499,14 +497,14 @@ def nodepools_build_table(args) -> tuple[int, list[list]]: nodepools_data = {} - nodepools, return_code = get_node_pools_name(args) + nodepools, return_code = get_node_pools_name() if return_code != 0: xpk_print(f'Get node pools name returned ERROR {return_code}') for name in nodepools: nodepools_data[name] = [name] - slices, return_code = get_slice_node_pool_size(args) + slices, return_code = get_slice_node_pool_size() if return_code != 0: xpk_print(f'Get slice node pool size returned ERROR {return_code}') @@ -515,7 +513,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]: count, nodepool_name = s[0], s[1] nodepools_data[nodepool_name].append(count) - type_nodepool, return_code = get_node_pool_instance_type(args) + type_nodepool, return_code = get_node_pool_instance_type() if return_code != 0: xpk_print(f'Get node pool instance type returned ERROR {return_code}') @@ -524,7 +522,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]: nodepool_name, instance_type = tn[0], tn[1] nodepools_data[nodepool_name].append(instance_type) - expected_healthy_nodes, return_code = get_expected_healthy_nodes(args) + expected_healthy_nodes, return_code = get_expected_healthy_nodes() if return_code != 0: xpk_print(f'Get expected healthy nodes returned ERROR {return_code}') @@ -533,7 +531,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]: count, nodepool_name = ehn[0], ehn[1] nodepools_data[nodepool_name].append(count) - actual_healthy_nodes, return_code = get_actual_healthy_nodes(args) + actual_healthy_nodes, return_code = get_actual_healthy_nodes() if return_code != 0: xpk_print(f'Get actual healthy nodes returned ERROR {return_code}') @@ -542,7 +540,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]: count, nodepool_name = ahn[0], ahn[1] nodepools_data[nodepool_name].append(count) - total_nodes, return_code = get_total_nodes_per_node_pool(args) + total_nodes, return_code = get_total_nodes_per_node_pool() if return_code != 0: xpk_print(f'Get total nodes per node pool returned ERROR {return_code}') @@ -557,20 +555,20 @@ def nodepools_build_table(args) -> tuple[int, list[list]]: return 0, table -def get_node_pools_name(args) -> tuple[list[str], int]: +def get_node_pools_name() -> tuple[list[str], int]: cmd_nodepools = ( 'kubectl get node --no-headers=true -o' " custom-columns='NODEPOOL:.metadata.labels.cloud\\.google\\.com/gke-nodepool'" " | grep -v 'none' | sort | uniq" ) - return_code, out = run_command_for_value(cmd_nodepools, 'Nodepool list', args) + return_code, out = run_command_for_value(cmd_nodepools, 'Nodepool list') if return_code != 0: return [], return_code return out.splitlines(), 0 -def get_slice_node_pool_size(args) -> tuple[list[str], int]: +def get_slice_node_pool_size() -> tuple[list[str], int]: cmd_slices = ( 'kubectl get node --no-headers=true -o' " custom-columns=':metadata.labels.cloud\\.google\\.com/gke-nodepool'" @@ -579,7 +577,7 @@ def get_slice_node_pool_size(args) -> tuple[list[str], int]: ' | uniq -c' ) return_code, out = run_command_for_value( - cmd_slices, 'Count nodes per nodepool slice', args + cmd_slices, 'Count nodes per nodepool slice' ) if return_code != 0: return [], return_code @@ -587,7 +585,7 @@ def get_slice_node_pool_size(args) -> tuple[list[str], int]: return out.splitlines(), 0 -def get_node_pool_instance_type(args) -> tuple[list[str], int]: +def get_node_pool_instance_type() -> tuple[list[str], int]: cmd_type_nodepool = ( 'kubectl get node --no-headers=true -o' " custom-columns='NODEPOOL:.metadata.labels.cloud\\.google\\.com/gke-nodepool," @@ -595,7 +593,7 @@ def get_node_pool_instance_type(args) -> tuple[list[str], int]: " 'none' | sort | uniq" ) return_code, out = run_command_for_value( - cmd_type_nodepool, 'Instance type of nodepools', args + cmd_type_nodepool, 'Instance type of nodepools' ) if return_code != 0: return [], return_code @@ -603,7 +601,7 @@ def get_node_pool_instance_type(args) -> tuple[list[str], int]: return out.splitlines(), 0 -def get_expected_healthy_nodes(args) -> tuple[list[str], int]: +def get_expected_healthy_nodes() -> tuple[list[str], int]: cmd_expected_healthy_nodes = ( 'kubectl get node --no-headers=true -o' " custom-columns=':metadata.labels.cloud\\.google\\.com/gke-nodepool'" @@ -614,7 +612,6 @@ def get_expected_healthy_nodes(args) -> tuple[list[str], int]: return_code, out = run_command_for_value( cmd_expected_healthy_nodes, 'Count expected healthy nodes per nodepool', - args, ) if return_code != 0: return [], return_code @@ -622,7 +619,7 @@ def get_expected_healthy_nodes(args) -> tuple[list[str], int]: return out.splitlines(), 0 -def get_actual_healthy_nodes(args) -> tuple[list[str], int]: +def get_actual_healthy_nodes() -> tuple[list[str], int]: cmd_actual_healthy_nodes = ( 'kubectl get node --no-headers=true -o' " custom-columns='NODE_NAME:metadata.name," @@ -635,7 +632,7 @@ def get_actual_healthy_nodes(args) -> tuple[list[str], int]: ' | uniq -c' ) return_code, out = run_command_for_value( - cmd_actual_healthy_nodes, 'Count actual healthy nodes per nodepool', args + cmd_actual_healthy_nodes, 'Count actual healthy nodes per nodepool' ) if return_code != 0: return [], return_code @@ -643,7 +640,7 @@ def get_actual_healthy_nodes(args) -> tuple[list[str], int]: return out.splitlines(), 0 -def get_total_nodes_per_node_pool(args) -> tuple[list[str], int]: +def get_total_nodes_per_node_pool() -> tuple[list[str], int]: cmd_total_nodes = ( 'kubectl get node --no-headers=true -o' " custom-columns='NODE_NAME:metadata.name," @@ -655,7 +652,7 @@ def get_total_nodes_per_node_pool(args) -> tuple[list[str], int]: ' | uniq -c' ) return_code, out = run_command_for_value( - cmd_total_nodes, 'Count total nodes per nodepool', args + cmd_total_nodes, 'Count total nodes per nodepool' ) if return_code != 0: return [], return_code @@ -1204,7 +1201,7 @@ def install_storage_csis(args): def install_kjob(args): xpk_print('Verifying kjob installation') - err_code = verify_kjob_installed(args) + err_code = verify_kjob_installed() if err_code > 0: xpk_exit(err_code) diff --git a/src/xpk/commands/cluster_gcluster.py b/src/xpk/commands/cluster_gcluster.py index 600a58bba..cb98d8d90 100644 --- a/src/xpk/commands/cluster_gcluster.py +++ b/src/xpk/commands/cluster_gcluster.py @@ -213,7 +213,6 @@ def validate_state_gcs_bucket(args): err_code, _ = run_command_for_value( bucket_validate_cmd, 'Validate remote state bucket existence.', - global_args=args, ) if err_code != 0: xpk_exit(err_code) diff --git a/src/xpk/commands/info.py b/src/xpk/commands/info.py index 5dcc0ba86..fce87ce14 100644 --- a/src/xpk/commands/info.py +++ b/src/xpk/commands/info.py @@ -39,7 +39,7 @@ def info(args: Namespace) -> None: add_zone_and_project(args) get_cluster_credentials(args) - verify_kueuectl(args) + verify_kueuectl() lq, cq = bool(args.localqueue), bool(args.clusterqueue) if not lq and not cq: lq, cq = True, True @@ -48,7 +48,7 @@ def info(args: Namespace) -> None: if lq: lqs = run_kueuectl_list_localqueue(args) - cqs = run_kueuectl_list_clusterqueue(args) + cqs = run_kueuectl_list_clusterqueue() quotas = get_nominal_quotas(cqs) if lq and lqs is not None: @@ -214,7 +214,7 @@ def run_kueuectl_list_localqueue(args: Namespace) -> str: command = 'kubectl kueue list localqueue -o json' if args.namespace != '': command += f' --namespace {args.namespace}' - return_code, val = run_command_for_value(command, 'list localqueue', args) + return_code, val = run_command_for_value(command, 'list localqueue') if return_code != 0: xpk_print(f'Cluster info request returned ERROR {return_code}') @@ -222,18 +222,15 @@ def run_kueuectl_list_localqueue(args: Namespace) -> str: return val -def run_kueuectl_list_clusterqueue(args: Namespace) -> str: +def run_kueuectl_list_clusterqueue() -> str: """Run the kueuectl list clusterqueue command. - Args: - args: user provided arguments for running the command. - Returns: kueuectl list clusterqueue formatted as json string """ command = 'kubectl kueue list clusterqueue -o json' - return_code, val = run_command_for_value(command, 'list clusterqueue', args) + return_code, val = run_command_for_value(command, 'list clusterqueue') if return_code != 0: xpk_print(f'Cluster info request returned ERROR {return_code}') diff --git a/src/xpk/commands/inspector.py b/src/xpk/commands/inspector.py index 3e8d783f0..8c3c8673f 100644 --- a/src/xpk/commands/inspector.py +++ b/src/xpk/commands/inspector.py @@ -41,7 +41,7 @@ def inspector_run_command_helper( prefix = f'Command: {command}\nCommand Description: {command_description}\n' postfix = '========================================================' return_code, command_output = run_command_for_value( - command, f'{command_description}', args + command, f'{command_description}' ) if return_code != 0: diff --git a/src/xpk/commands/job.py b/src/xpk/commands/job.py index 4d9d21457..284655a6b 100644 --- a/src/xpk/commands/job.py +++ b/src/xpk/commands/job.py @@ -62,9 +62,7 @@ def job_info(args): job_name = args.name desc_command = f'kubectl-kjob describe slurm {job_name}' - desc_code, desc_text = run_command_for_value( - desc_command, 'Getting job data', args - ) + desc_code, desc_text = run_command_for_value(desc_command, 'Getting job data') if desc_code != 0: xpk_print(f'Data info request returned ERROR {desc_code}') xpk_exit(desc_code) @@ -76,7 +74,6 @@ def job_info(args): job_code, job_text = run_command_for_value( job_command, 'Getting job info', - args, dry_run_return_val=JOBS_DRY_RUN_YAML, ) if job_code != 0: @@ -87,7 +84,6 @@ def job_info(args): pods_code, pods_text = run_command_for_value( pods_command, 'Getting pods list', - args, dry_run_return_val=PODS_DRY_RUN_RESULT, ) if pods_code != 0: diff --git a/src/xpk/commands/kind.py b/src/xpk/commands/kind.py index 274a2bb43..90a7fd870 100644 --- a/src/xpk/commands/kind.py +++ b/src/xpk/commands/kind.py @@ -70,7 +70,7 @@ def cluster_create(args) -> None: xpk_exit(install_kueue_on_cluster_code) xpk_print('Verifying kjob installation') - err_code = verify_kjob_installed(args) + err_code = verify_kjob_installed() if err_code > 0: xpk_exit(err_code) @@ -154,7 +154,7 @@ def create_cluster_if_necessary(args) -> int: Returns: 0 if successful and 1 otherwise. """ - all_clusters, return_code = get_all_local_clusters_programmatic(args) + all_clusters, return_code = get_all_local_clusters_programmatic() if return_code > 0: xpk_print('Listing all clusters failed!') return 1 @@ -229,18 +229,15 @@ def run_kind_cluster_create_command(args) -> int: return 0 -def get_all_local_clusters_programmatic(args) -> tuple[list[str], int]: +def get_all_local_clusters_programmatic() -> tuple[list[str], int]: """Gets all the local clusters. - Args: - args: user provided arguments for running the command. - Returns: List of cluster names and 0 if successful and 1 otherwise. """ command = 'kind get clusters' return_code, raw_cluster_output = run_command_for_value( - command, 'Find if Cluster Exists', args + command, 'Find if Cluster Exists' ) if return_code != 0: xpk_print(f'Find if Cluster Exists returned ERROR {return_code}') @@ -261,7 +258,7 @@ def set_local_cluster_command(args) -> int: if not args.cluster: command = 'kubectl config current-context' return_code, current_context = run_command_for_value( - command, 'get current-context', args + command, 'get current-context' ) xpk_print( 'No local cluster name specified. Using current-context' diff --git a/src/xpk/commands/kjob_common.py b/src/xpk/commands/kjob_common.py index a13485c91..5df178ab1 100644 --- a/src/xpk/commands/kjob_common.py +++ b/src/xpk/commands/kjob_common.py @@ -35,11 +35,11 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str: annotations: tuple if gpu_type == H100_MEGA_DEVICE_TYPE: - annotations = get_a3mega_pod_template_annotations(args) + annotations = get_a3mega_pod_template_annotations() elif gpu_type == H200_DEVICE_TYPE: - annotations = get_a3ultra_pod_template_annotations(args) + annotations = get_a3ultra_pod_template_annotations() elif gpu_type == B200_DEVICE_TYPE: - annotations = get_a4_pod_template_annotations(args) + annotations = get_a4_pod_template_annotations() else: annotations = tuple() diff --git a/src/xpk/commands/shell.py b/src/xpk/commands/shell.py index 4974b0c3d..e1f235161 100644 --- a/src/xpk/commands/shell.py +++ b/src/xpk/commands/shell.py @@ -60,7 +60,6 @@ def get_existing_shell_pod_name(args: Namespace) -> str | None: ' -o custom-columns=":metadata.name"' ), task='Get existing interactive shell pod name.', - global_args=args, ) if return_code != 0: xpk_print( diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index 66e061de8..205e4bb4e 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -334,7 +334,7 @@ def workload_create(args) -> None: xpk_print('Starting workload create', flush=True) metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}' - cluster_config_map = get_cluster_configmap(args, metadata_configmap_name) + cluster_config_map = get_cluster_configmap(metadata_configmap_name) cluster_xpk_version = None if cluster_config_map is None: xpk_print( @@ -507,7 +507,7 @@ def workload_create(args) -> None: annotations=annotations, ) - sub_networks = get_cluster_subnetworks(args) + sub_networks = get_cluster_subnetworks() if args.device_type == a3high_device_type: yml_string = tcpx_decorator.decorate_jobset(yml_string) elif args.device_type == a3mega_device_type: diff --git a/src/xpk/core/capacity.py b/src/xpk/core/capacity.py index f07e660b7..b8dfaaff9 100644 --- a/src/xpk/core/capacity.py +++ b/src/xpk/core/capacity.py @@ -119,7 +119,7 @@ def get_reservation_maintenance_interval( f' --project={project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"' ) return_code, output = run_command_for_value( - command, 'Get reservation maintenance interval', None + command, 'Get reservation maintenance interval' ) if return_code != 0: xpk_print(f'Get reservation maintenance interval ERROR {return_code}') @@ -143,7 +143,7 @@ def get_reservation_placement_policy( f' --project={project} --zone={zone} --format="value(resourcePolicies.policy)"' ) return_code, output = run_command_for_value( - command, 'Get reservation placement policy', None + command, 'Get reservation placement policy' ) if return_code != 0: xpk_print(f'Get reservation placement policy ERROR {return_code}') diff --git a/src/xpk/core/cluster.py b/src/xpk/core/cluster.py index b2c9d8e1c..29696ef4a 100644 --- a/src/xpk/core/cluster.py +++ b/src/xpk/core/cluster.py @@ -199,12 +199,9 @@ def install_nri_on_cluster(args) -> int: return 0 -def get_cluster_nodes_info(args) -> list[dict]: +def get_cluster_nodes_info() -> list[dict]: """Get list of cluster's nodes descrition in yaml format - Args: - args: user provided arguments for running the command. - Returns: List of nodes info yaml objects. """ @@ -213,7 +210,6 @@ def get_cluster_nodes_info(args) -> list[dict]: err_code, val = run_command_for_value( command=command, task='Get cluster nodes info', - global_args=args, ) if err_code != 0: xpk_exit(err_code) @@ -221,9 +217,9 @@ def get_cluster_nodes_info(args) -> list[dict]: return data['items'] -def count_nodes_on_cluster(args, system: SystemCharacteristics) -> int: +def count_nodes_on_cluster(system: SystemCharacteristics) -> int: """Count cluster nodes by accelerator type""" - nodes_info = get_cluster_nodes_info(args) + nodes_info = get_cluster_nodes_info() accelerators = [ node['metadata']['labels']['cloud.google.com/gke-accelerator'] for node in nodes_info @@ -248,7 +244,6 @@ def get_cluster_network(args) -> str: err_code, val = run_command_for_value( command=cluster_network_cmd, task='Get network cluster is in', - global_args=args, ) if err_code != 0: xpk_exit(err_code) @@ -361,7 +356,6 @@ def is_driver_enabled_on_cluster( command, f"Checks if {driver} driver's {config_key} is enabled in cluster" ' describe.', - args, ) if return_code != 0: xpk_exit(return_code) @@ -412,7 +406,7 @@ def get_all_clusters_programmatic(args) -> tuple[list[str], int]: ' --format="csv[no-heading](name)"' ) return_code, raw_cluster_output = run_command_for_value( - command, 'Find if Cluster Exists', args + command, 'Find if Cluster Exists' ) if return_code != 0: xpk_print(f'Find if Cluster Exists returned ERROR {return_code}') @@ -734,7 +728,6 @@ def is_cluster_using_clouddns(args) -> bool: return_code, _ = run_command_for_value( command, 'Check if Cloud DNS is enabled in cluster describe.', - args, ) if return_code == 0: xpk_print('Cloud DNS is enabled on the cluster, no update needed.') @@ -757,7 +750,6 @@ def is_workload_identity_enabled_on_cluster(args) -> bool: return_code, workload_pool = run_command_for_value( command, 'Checks if Workload Identity Federation is enabled in cluster describe.', - args, ) if return_code != 0: xpk_exit(return_code) @@ -785,7 +777,6 @@ def is_gcsfuse_driver_enabled_on_cluster(args) -> bool: return_code, gcsfuse_driver_enabled = run_command_for_value( command, 'Checks if GCSFuse CSI driver is enabled in cluster describe.', - args, ) if return_code != 0: xpk_exit(return_code) diff --git a/src/xpk/core/cluster_private.py b/src/xpk/core/cluster_private.py index 3dd4b7f8e..16169ef0a 100644 --- a/src/xpk/core/cluster_private.py +++ b/src/xpk/core/cluster_private.py @@ -133,7 +133,6 @@ def is_cluster_private(args) -> bool: return_code, private_nodes_enabled = run_command_for_value( command, 'Check if Private Nodes is enabled in cluster.', - args, ) if return_code != 0: @@ -164,7 +163,6 @@ def get_cluster_authorized_networks(args) -> list[str]: return_code, authorized_networks = run_command_for_value( command, 'Fetching the list of authorized network from cluster describe.', - args, dry_run_return_val='127.0.0.1/32', ) diff --git a/src/xpk/core/commands.py b/src/xpk/core/commands.py index cc3b266b7..f00c2d18b 100644 --- a/src/xpk/core/commands.py +++ b/src/xpk/core/commands.py @@ -23,6 +23,7 @@ from ..utils.objects import chunks from ..utils.file import make_tmp_files, write_tmp_file from ..utils.console import xpk_print +from ..utils.execution_context import is_dry_run def run_commands(commands, jobname, per_command_name, batch=10, dry_run=False): @@ -226,7 +227,6 @@ def run_command_with_updates(command, task, global_args, verbose=True) -> int: def run_command_for_value( command, task, - global_args, dry_run_return_val='0', print_timer=False, hide_error=False, @@ -239,7 +239,6 @@ def run_command_for_value( Args: command: user provided command to run. task: user provided task name for running the command. - global_args: user provided arguments for running the command. dry_run_return_val: return value of this command for dry run. print_timer: print out the time the command is running. hide_error: hide the error from the command output upon success. @@ -249,7 +248,7 @@ def run_command_for_value( int: return_code, default is 0 str: return_val, default is '0' """ - if global_args is not None and global_args.dry_run: + if is_dry_run(): xpk_print( f'Task: `{task}` is implemented by the following command' ' not running since it is a dry run.' diff --git a/src/xpk/core/gcloud_context.py b/src/xpk/core/gcloud_context.py index a96fa2ee3..0d3688f1d 100644 --- a/src/xpk/core/gcloud_context.py +++ b/src/xpk/core/gcloud_context.py @@ -139,7 +139,6 @@ def get_gke_server_config(args) -> tuple[int, GkeServerConfig | None]: return_code, cmd_output = run_command_for_value( command, command_description, - args, hide_error=True, ) if return_code != 0: diff --git a/src/xpk/core/jobset.py b/src/xpk/core/jobset.py index e47346796..d20ea6e74 100644 --- a/src/xpk/core/jobset.py +++ b/src/xpk/core/jobset.py @@ -122,7 +122,7 @@ def update_jobset_resources_if_necessary(args): # Get total number of nodes cmd_total_node_num = 'kubectl get node --no-headers | wc -l' return_code, out = run_command_for_value( - cmd_total_node_num, 'Count total nodes', args + cmd_total_node_num, 'Count total nodes' ) if return_code != 0: xpk_exit(1) diff --git a/src/xpk/core/kjob.py b/src/xpk/core/kjob.py index c2c43f27b..16cc735e2 100644 --- a/src/xpk/core/kjob.py +++ b/src/xpk/core/kjob.py @@ -167,8 +167,8 @@ class PodTemplateDefaults(Enum): default_interface_annotation = "networking.gke.io/default-interface=eth0" -def get_a4_pod_template_annotations(args) -> tuple[str, str]: - sub_networks = get_cluster_subnetworks(args) +def get_a4_pod_template_annotations() -> tuple[str, str]: + sub_networks = get_cluster_subnetworks() interfaces_key, interfaces_value = rdma_decorator.get_interfaces_entry( sub_networks ) @@ -179,8 +179,8 @@ def get_a4_pod_template_annotations(args) -> tuple[str, str]: ) -def get_a3ultra_pod_template_annotations(args: Namespace) -> tuple[str, str]: - sub_networks = get_cluster_subnetworks(args) +def get_a3ultra_pod_template_annotations() -> tuple[str, str]: + sub_networks = get_cluster_subnetworks() interfaces_key, interfaces_value = rdma_decorator.get_interfaces_entry( sub_networks ) @@ -191,11 +191,9 @@ def get_a3ultra_pod_template_annotations(args: Namespace) -> tuple[str, str]: ) -def get_a3mega_pod_template_annotations( - args: Namespace, -) -> tuple[str, str, str]: +def get_a3mega_pod_template_annotations() -> tuple[str, str, str]: """Adds or updates annotations in the Pod template.""" - sub_networks = get_cluster_subnetworks(args) + sub_networks = get_cluster_subnetworks() tcpxo_deamon_key, tcpxo_deamon_paths = get_tcpxo_deamon_entry() interfaces_key, interfaces_value = tcpxo_decorator.get_interfaces_entry( sub_networks @@ -205,16 +203,14 @@ def get_a3mega_pod_template_annotations( return tcpxo, interfaces, default_interface_annotation -def verify_kjob_installed(args: Namespace) -> int: +def verify_kjob_installed() -> int: """Check if kjob is installed. If not provide user with proper communicate and exit. - Args: - args - user provided arguments. Returns: error code > if kjob not installed, otherwise 0 """ command = "kubectl-kjob help" task = "Verify kjob installation " - verify_kjob_installed_code, _ = run_command_for_value(command, task, args) + verify_kjob_installed_code, _ = run_command_for_value(command, task) if verify_kjob_installed_code == 0: xpk_print("kjob found") diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index fd4b8437d..0b499e2b2 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -281,10 +281,8 @@ """ -def verify_kueuectl(args: Namespace) -> None: +def verify_kueuectl() -> None: """Verify if kueuectl is installed. - Args: - args: user provided arguments. Returns: None """ @@ -292,7 +290,7 @@ def verify_kueuectl(args: Namespace) -> None: command = 'kubectl kueue version' task = 'Verify kueuectl installation on cluster' - verify_kueuectl_installed_code, _ = run_command_for_value(command, task, args) + verify_kueuectl_installed_code, _ = run_command_for_value(command, task) if verify_kueuectl_installed_code == 0: xpk_print('kueuectl found') @@ -324,10 +322,10 @@ def delete_multikueueclusters_definitions(args) -> int: return return_code -def get_kueue_version(args) -> tuple[int, str]: +def get_kueue_version() -> tuple[int, str]: command = 'kubectl kueue version' task = 'Get kueue version on server' - return_code, val = run_command_for_value(command, task, args) + return_code, val = run_command_for_value(command, task) if return_code != 0: return return_code, '' lines = val.splitlines() @@ -348,7 +346,7 @@ def install_kueue_on_cluster(args) -> int: 0 if successful and 1 otherwise. """ - err_code, kueue_version_installed = get_kueue_version(args) + err_code, kueue_version_installed = get_kueue_version() if err_code == 0: if Version(kueue_version_installed) < Version('v0.9.0') and Version( KUEUE_VERSION @@ -540,7 +538,7 @@ def update_kueue_resources_if_necessary(args): # Get total number of nodes cmd_total_node_num = 'kubectl get node --no-headers | wc -l' return_code, out = run_command_for_value( - cmd_total_node_num, 'Count total nodes', args + cmd_total_node_num, 'Count total nodes' ) if return_code != 0: xpk_exit(1) diff --git a/src/xpk/core/monitoring.py b/src/xpk/core/monitoring.py index a1a791824..41726674e 100644 --- a/src/xpk/core/monitoring.py +++ b/src/xpk/core/monitoring.py @@ -40,7 +40,7 @@ def get_gke_dashboard(args, dashboard_filter) -> tuple[bool, str | None]: ) return_code, return_value = run_command_for_value( - command, 'GKE Dashboard List', args + command, 'GKE Dashboard List' ) if return_code != 0: diff --git a/src/xpk/core/nap.py b/src/xpk/core/nap.py index 0b21e8ee8..bccb50c90 100644 --- a/src/xpk/core/nap.py +++ b/src/xpk/core/nap.py @@ -272,7 +272,7 @@ def is_autoprovisioning_enabled( """ resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}' - cluster_config_map = get_cluster_configmap(args, resources_configmap_name) + cluster_config_map = get_cluster_configmap(resources_configmap_name) if cluster_config_map is None: xpk_print( @@ -325,7 +325,7 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]: if capacity_type_str == CapacityType.UNKNOWN.name: # Use default settings from cluster creation. metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}' - cluster_config_map = get_cluster_configmap(args, metadata_configmap_name) + cluster_config_map = get_cluster_configmap(metadata_configmap_name) # Error out if the metadata config map doesn't exist, and is attempting to use # autoprovisioning. @@ -369,7 +369,7 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]: def get_cluster_provisioner(args) -> str: metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}' - cluster_config_map = get_cluster_configmap(args, metadata_configmap_name) + cluster_config_map = get_cluster_configmap(metadata_configmap_name) cluster_provisioner = 'gcloud' if not cluster_config_map is None: provisioner = cluster_config_map.get('provisioner') diff --git a/src/xpk/core/network.py b/src/xpk/core/network.py index 18f844c59..58f2849e4 100644 --- a/src/xpk/core/network.py +++ b/src/xpk/core/network.py @@ -235,19 +235,14 @@ def create_cluster_network_config(args) -> int: return 0 -def get_cluster_subnetworks(args) -> list[str]: +def get_cluster_subnetworks() -> list[str]: """Gets the list of cluster networks. - Args: - args: user provided arguments for running the command. - Returns: list[str]: list of cluster networks """ command = 'kubectl get GKENetworkParamSet' - return_code, stdout = run_command_for_value( - command, 'Get Cluster Networks', args - ) + return_code, stdout = run_command_for_value(command, 'Get Cluster Networks') if return_code != 0: xpk_print('GKE Cluster Get NetworkParamSet failed') xpk_exit(return_code) @@ -328,7 +323,7 @@ def get_all_networks_programmatic(args) -> tuple[list[str], int]: f' --project={args.project}' ) return_code, raw_network_output = run_command_for_value( - command, 'Get All Networks', args + command, 'Get All Networks' ) if return_code != 0: xpk_print(f'Get All Networks returned ERROR {return_code}') @@ -353,7 +348,7 @@ def get_all_subnets_programmatic(args) -> tuple[list[str], int]: f' --filter=name~"{subnet_name_filter}" --project={args.project}' ) return_code, raw_subnets_output = run_command_for_value( - command, 'Get All Subnets', args + command, 'Get All Subnets' ) if return_code != 0: xpk_print(f'Get All Subnets returned ERROR {return_code}') @@ -380,7 +375,7 @@ def get_all_firewall_rules_programmatic(args) -> tuple[list[str], int]: f' --project={args.project}' ) return_code, raw_subnets_output = run_command_for_value( - command, 'Get All Firewall Rules', args + command, 'Get All Firewall Rules' ) if return_code != 0: xpk_print(f'Get All Firewall Rules returned ERROR {return_code}') diff --git a/src/xpk/core/nodepool.py b/src/xpk/core/nodepool.py index 85ab6aba9..4dd25014c 100644 --- a/src/xpk/core/nodepool.py +++ b/src/xpk/core/nodepool.py @@ -436,7 +436,7 @@ def get_all_nodepools_programmatic(args) -> tuple[list[str], int]: ' --format="csv[no-heading](name)"' ) return_code, raw_nodepool_output = run_command_for_value( - command, 'Get All Node Pools', args + command, 'Get All Node Pools' ) if return_code != 0: xpk_print(f'Get All Node Pools returned ERROR {return_code}') @@ -463,7 +463,7 @@ def get_nodepool_zone(args, nodepool_name) -> tuple[int, str | None]: f' --region={zone_to_region(args.zone)} --format="value(locations)"' ) return_code, nodepool_zone = run_command_for_value( - command, 'Get Node Pool Zone', args, dry_run_return_val=args.zone + command, 'Get Node Pool Zone', dry_run_return_val=args.zone ) if return_code != 0: xpk_print(f'Get Node Pool Zone returned ERROR {return_code}') @@ -496,7 +496,7 @@ def get_gke_node_pool_version( ) return_code, current_gke_master_version = run_command_for_value( - command, command_description, args + command, command_description ) if return_code != 0: xpk_print( @@ -604,7 +604,7 @@ def get_nodepool_workload_metadata_mode( f' --region={zone_to_region(args.zone)} --format="value(config.workloadMetadataConfig.mode)"' ) return_code, nodepool_WI_mode = run_command_for_value( - command, 'Get Node Pool Workload Identity Metadata Mode', args + command, 'Get Node Pool Workload Identity Metadata Mode' ) if return_code != 0: xpk_print( diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py index 017fb885e..d62b60c62 100644 --- a/src/xpk/core/pathways.py +++ b/src/xpk/core/pathways.py @@ -116,7 +116,7 @@ def check_if_pathways_job_is_installed(args) -> bool: ' custom-columns=NAME:.metadata.name' ) task = f'Check if PathwaysJob is installed on {args.cluster}' - return_code, return_msg = run_command_for_value(command, task, args) + return_code, return_msg = run_command_for_value(command, task) # return_msg contains the name of the controller pod, if found. xpk_print('check_if_pathways_job_is_installed', return_code, return_msg) diff --git a/src/xpk/core/ray.py b/src/xpk/core/ray.py index 50e391025..4fff877c0 100644 --- a/src/xpk/core/ray.py +++ b/src/xpk/core/ray.py @@ -106,12 +106,12 @@ def install_ray_cluster(args, system) -> int: label = 'cloud.google.com/gke-nodepool=default-pool' available_head_cpu, available_head_mem = generate_available_resources( - label, args, HEAD_CPU + label, HEAD_CPU ) label = f'cloud.google.com/gke-tpu-accelerator={system.gke_accelerator}' available_worker_cpu, available_worker_mem = generate_available_resources( - label, args, WORKER_CPU + label, WORKER_CPU ) yml_string = ray_cluster_crd_yaml.format( @@ -168,12 +168,11 @@ def delete_ray_cluster(args) -> None: return -def generate_available_resources(label, args, percent) -> tuple: +def generate_available_resources(label, percent) -> tuple: """Generate the available resources for the nodes that match the given label Args: label: the label used to match the appropriate nodes - args: user provided arguments for running the command percent: the percent of the available resources to use Returns: @@ -184,13 +183,13 @@ def generate_available_resources(label, args, percent) -> tuple: f"kubectl get nodes -l {label} -o jsonpath='{{.items[0].metadata.name}}'" ) task = f'Getting nodes with label {label}' - _, node_name = run_command_for_value(command, task, args) + _, node_name = run_command_for_value(command, task) command = ( f"kubectl get node {node_name} -o jsonpath='{{.status.allocatable.cpu}}'" ) task = 'Fetching available CPU on node' - _, available_cpu = run_command_for_value(command, task, args) + _, available_cpu = run_command_for_value(command, task) match = re.match(r'(\d+)([a-zA-Z]+)', available_cpu) if not match: xpk_print( @@ -207,7 +206,7 @@ def generate_available_resources(label, args, percent) -> tuple: " jsonpath='{.status.allocatable.memory}'" ) task = 'Fetching available memory on node' - _, available_memory = run_command_for_value(command, task, args) + _, available_memory = run_command_for_value(command, task) match = re.match(r'(\d+)([a-zA-Z]+)', available_memory) if not match: xpk_print( diff --git a/src/xpk/core/resources.py b/src/xpk/core/resources.py index f215e1063..2e034d35e 100644 --- a/src/xpk/core/resources.py +++ b/src/xpk/core/resources.py @@ -50,11 +50,10 @@ class AutoprovisioningConfig: maximum_chips: int -def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None: +def get_cluster_configmap(configmap_name) -> dict[str, str] | None: """Run the Get GKE Cluster ConfigMap request. Args: - args: user provided arguments for running the command. configmap_name: name of the configmap. Returns: @@ -68,7 +67,6 @@ def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None: return_code, return_value = run_command_for_value( command, 'GKE Cluster Get ConfigMap', - args, dry_run_return_val='map[]', ) if return_code != 0: @@ -206,7 +204,7 @@ def check_cluster_resources(args, system) -> tuple[bool, bool]: True if device_type/gke_accelerator exists in the cluster, False otherwise. """ resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}' - resources_config_map = get_cluster_configmap(args, resources_configmap_name) + resources_config_map = get_cluster_configmap(resources_configmap_name) if resources_config_map is None: xpk_print( f'No ConfigMap exist for cluster with the name {resources_config_map}.' @@ -229,7 +227,7 @@ def get_cluster_system_characteristics(args) -> SystemCharacteristics | None: returns system characteristics """ resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}' - cluster_config_map = get_cluster_configmap(args, resources_configmap_name) + cluster_config_map = get_cluster_configmap(resources_configmap_name) if cluster_config_map is None: return None @@ -251,7 +249,7 @@ def get_cluster_capacity_type(args) -> CapacityType | None: returns system characteristics """ metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}' - cluster_config_map = get_cluster_configmap(args, metadata_configmap_name) + cluster_config_map = get_cluster_configmap(metadata_configmap_name) if cluster_config_map is None: return None diff --git a/src/xpk/core/scheduling.py b/src/xpk/core/scheduling.py index ef6b469ce..3032eb086 100644 --- a/src/xpk/core/scheduling.py +++ b/src/xpk/core/scheduling.py @@ -36,7 +36,7 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool: returns true if workload can schedule, otherwise returns false. """ resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}' - cluster_config_map = get_cluster_configmap(args, resources_configmap_name) + cluster_config_map = get_cluster_configmap(resources_configmap_name) # Prevents workload creation failure for existing clusters with no ConfigMap if cluster_config_map is None: diff --git a/src/xpk/core/vertex.py b/src/xpk/core/vertex.py index 6507e1856..8fe47257a 100644 --- a/src/xpk/core/vertex.py +++ b/src/xpk/core/vertex.py @@ -66,7 +66,7 @@ def create_vertex_experiment(args) -> dict | None: ) metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}' - cluster_config_map = get_cluster_configmap(args, metadata_configmap_name) + cluster_config_map = get_cluster_configmap(metadata_configmap_name) if cluster_config_map is None or 'tensorboard_name' not in cluster_config_map: xpk_print( diff --git a/src/xpk/core/workload.py b/src/xpk/core/workload.py index 5ec937941..cebc1b455 100644 --- a/src/xpk/core/workload.py +++ b/src/xpk/core/workload.py @@ -131,7 +131,7 @@ def get_workload_list(args) -> tuple[int, str]: if hasattr(args, 'filter_by_job'): task += f' with filter-by-job={args.filter_by_job}' - return_code, return_value = run_command_for_value(command, task, args) + return_code, return_value = run_command_for_value(command, task) return return_code, return_value @@ -152,7 +152,7 @@ def check_if_workload_exists(args) -> bool: command = f"kubectl get workloads -o=custom-columns='{s}'" return_code, return_msg = run_command_for_value( - command, 'Check if Workload Already Exists', args + command, 'Check if Workload Already Exists' ) if return_code != 0: @@ -186,7 +186,7 @@ def wait_for_job_completion(args) -> int: # Get the full workload name get_workload_name_cmd = f'kubectl get workloads | grep jobset-{args.workload}' return_code, return_value = run_command_for_value( - get_workload_name_cmd, 'Get full workload name', args + get_workload_name_cmd, 'Get full workload name' ) if return_code != 0: xpk_print(f'Get full workload name request returned ERROR {return_code}') @@ -205,7 +205,6 @@ def wait_for_job_completion(args) -> int: return_code, return_value = run_command_for_value( wait_cmd, f'Wait for workload to finish with timeout of {timeout_msg}', - args, print_timer=True, ) if return_code != 0: @@ -231,7 +230,7 @@ def wait_for_job_completion(args) -> int: " jsonpath='{.status.conditions[-1].type}'" ) return_code, return_value = run_command_for_value( - status_cmd, 'Get jobset status', args + status_cmd, 'Get jobset status' ) if return_code != 0: xpk_print(f'Get workload status request returned ERROR {return_code}') diff --git a/src/xpk/utils/validation.py b/src/xpk/utils/validation.py index 87c12befb..a572cd923 100644 --- a/src/xpk/utils/validation.py +++ b/src/xpk/utils/validation.py @@ -71,9 +71,7 @@ def validate_dependencies(): if deps_version is None or deps_version != xpk_version: for name, check in validation_commands.items(): cmd, message = check['command'], check['message'] - code, _ = run_command_for_value( - cmd, f'Validate {name} installation.', None - ) + code, _ = run_command_for_value(cmd, f'Validate {name} installation.') if code != 0: xpk_print(message) xpk_exit(code) From ebaec5d65f9026cdd39dd0f82e10c6e6d814d310 Mon Sep 17 00:00:00 2001 From: Konrad Kaim <31181410+scaliby@users.noreply.github.com> Date: Wed, 24 Sep 2025 10:10:01 +0200 Subject: [PATCH 12/12] Golden buddy improvements (#645) * feat: add command to goldens files * feat: improve output of golden_buddy.sh script * feat: colorful git diff --- golden_buddy.sh | 59 ++++++++++++-------- goldens/Basic_cluster_create.txt | 1 + goldens/Batch.txt | 1 + goldens/Cluster_create_private.txt | 1 + goldens/Cluster_delete.txt | 1 + goldens/Cluster_delete_force.txt | 1 + goldens/Job_cancel.txt | 1 + goldens/Job_info.txt | 1 + goldens/Job_list.txt | 1 + goldens/NAP_cluster-create.txt | 1 + goldens/NAP_cluster-create_with_pathways.txt | 1 + goldens/Storage_list.txt | 1 + goldens/Workload_create.txt | 1 + goldens/Workload_create_pathways.txt | 1 + goldens/Workload_delete.txt | 1 + goldens/Workload_list.txt | 1 + 16 files changed, 52 insertions(+), 22 deletions(-) diff --git a/golden_buddy.sh b/golden_buddy.sh index bfae97dbb..8a77ac29f 100755 --- a/golden_buddy.sh +++ b/golden_buddy.sh @@ -90,31 +90,46 @@ fi mkdir -p "$GOLDENS_DIR" -yq -r '.goldens | keys[]' "$GOLDENS_FILE" | \ - while read -r key; do - command=$(yq -r '.goldens["'"$key"'"].command' "$GOLDENS_FILE") - if [[ "$MODE" = "update" ]]; then - printf "${YELLOW}Updating: %s${NC}\n" "$key" - fi - if [[ "$MODE" = "verify" ]]; then - printf "${YELLOW}Evaluating: %s${NC}\n" "$key" - fi - eval "$command" > "$GOLDENS_DIR/${key// /_}.txt" 2>&1 -done +has_diffs=false +while read -r key; do + command=$(yq -r '.goldens["'"$key"'"].command' "$GOLDENS_FILE") + if [[ "$MODE" = "update" ]]; then + printf "${YELLOW}Updating: %s...${NC} " "$key" + fi + if [[ "$MODE" = "verify" ]]; then + printf "${YELLOW}Evaluating: %s...${NC} " "$key" + fi -if [[ "$MODE" = "verify" ]]; then - git add "$GOLDENS_DIR" - DIFF_OUTPUT=$(git diff HEAD -- "$GOLDENS_DIR" | cat) + REFERENCE_FILE="$GOLDENS_DIR/${key// /_}.txt" + echo "\$ $command" > $REFERENCE_FILE + eval "$command" >> $REFERENCE_FILE 2>&1 + if [[ "$MODE" = "update" ]]; then + printf "${GREEN}DONE${NC}\n" + fi + + if [[ "$MODE" = "verify" ]]; then + git add $REFERENCE_FILE + + DIFF_OUTPUT=$(git diff --color=always HEAD -- $REFERENCE_FILE | cat) + + git reset HEAD -- $REFERENCE_FILE &> /dev/null + git restore $REFERENCE_FILE &> /dev/null + git clean -fd -- $REFERENCE_FILE &> /dev/null - git reset HEAD -- "$GOLDENS_DIR" &> /dev/null - git restore "$GOLDENS_DIR" &> /dev/null - git clean -fd -- "$GOLDENS_DIR" &> /dev/null + if [[ -n "$DIFF_OUTPUT" ]]; then + printf "${RED}FAIL${NC}\n" - if [[ -n "$DIFF_OUTPUT" ]]; then - printf "%s\n" "$DIFF_OUTPUT" >&2 + has_diffs=true + echo "\$ $command" $REFERENCE_FILE + printf "%s\n" "$DIFF_OUTPUT" >&2 + else + printf "${GREEN}OK${NC}\n" + fi + fi +done < <(yq -r '.goldens | keys[]' "$GOLDENS_FILE") - echo "" >&2 - +if [[ "$MODE" = "verify" ]]; then + if [[ "$has_diffs" == true ]]; then printf "${RED}Golden diffs found! Please use the following command to regenerate goldens:${NC}\n" >&2 printf "${YELLOW}\n\t%s${NC}\n\n" "${UPDATE_GOLDEN_COMMAND:-"golden_buddy.sh update $GOLDENS_FILE $GOLDENS_DIR"}" >&2 exit 1 @@ -127,4 +142,4 @@ fi if [[ "$MODE" = "update" ]]; then printf "${GREEN}Goldens updated!${NC}\n" exit 0 -fi \ No newline at end of file +fi diff --git a/goldens/Basic_cluster_create.txt b/goldens/Basic_cluster_create.txt index 03fce8269..d23c248f8 100644 --- a/goldens/Basic_cluster_create.txt +++ b/goldens/Basic_cluster_create.txt @@ -1,3 +1,4 @@ +$ python3 xpk.py cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-8 --spot --dry-run [XPK] Starting xpk [XPK] Starting cluster create for cluster golden-cluster: [XPK] Working on golden-project and us-central1-a diff --git a/goldens/Batch.txt b/goldens/Batch.txt index 483cd914f..c87e66ef6 100644 --- a/goldens/Batch.txt +++ b/goldens/Batch.txt @@ -1,3 +1,4 @@ +$ python3 xpk.py batch --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run batch-read.sh [XPK] Starting xpk [XPK] Working on golden-project and us-central1-a [XPK] Try 1: get-credentials to cluster golden-cluster diff --git a/goldens/Cluster_create_private.txt b/goldens/Cluster_create_private.txt index 4c399efab..c9a62c1b9 100644 --- a/goldens/Cluster_create_private.txt +++ b/goldens/Cluster_create_private.txt @@ -1,3 +1,4 @@ +$ python3 xpk.py cluster create-pathways --project=golden-project --zone=us-central1-a --cluster=golden-cluster-private --private --tpu-type=v5p-8 --num-slices=1 --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation=golden-reservation --dry-run [XPK] Starting xpk [XPK] Starting cluster create for cluster golden-cluster-private: [XPK] Working on golden-project and us-central1-a diff --git a/goldens/Cluster_delete.txt b/goldens/Cluster_delete.txt index 1ae8aa955..f50a2154c 100644 --- a/goldens/Cluster_delete.txt +++ b/goldens/Cluster_delete.txt @@ -1,3 +1,4 @@ +$ python3 xpk.py cluster delete --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run [XPK] Starting xpk [XPK] Starting cluster delete for cluster: golden-cluster [XPK] Working on golden-project and us-central1-a diff --git a/goldens/Cluster_delete_force.txt b/goldens/Cluster_delete_force.txt index 060b5e5f0..3bb64477a 100644 --- a/goldens/Cluster_delete_force.txt +++ b/goldens/Cluster_delete_force.txt @@ -1,3 +1,4 @@ +$ python3 xpk.py cluster delete --project=golden-project --zone=us-central1-a --cluster=golden-cluster --force --dry-run [XPK] Starting xpk [XPK] Starting cluster delete for cluster: golden-cluster [XPK] Working on golden-project and us-central1-a diff --git a/goldens/Job_cancel.txt b/goldens/Job_cancel.txt index 2e97a48aa..7e71ece4a 100644 --- a/goldens/Job_cancel.txt +++ b/goldens/Job_cancel.txt @@ -1,3 +1,4 @@ +$ python3 xpk.py job cancel golden-job --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run [XPK] Starting xpk [XPK] Starting job cancel for job: ['golden-job'] [XPK] Working on golden-project and us-central1-a diff --git a/goldens/Job_info.txt b/goldens/Job_info.txt index 4e8765887..82076c7ac 100644 --- a/goldens/Job_info.txt +++ b/goldens/Job_info.txt @@ -1,3 +1,4 @@ +$ python3 xpk.py job info golden-job --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run [XPK] Starting xpk [XPK] Task: `Getting job data` is implemented by the following command not running since it is a dry run. kubectl-kjob describe slurm golden-job diff --git a/goldens/Job_list.txt b/goldens/Job_list.txt index 76053723d..e659f2a6a 100644 --- a/goldens/Job_list.txt +++ b/goldens/Job_list.txt @@ -1,3 +1,4 @@ +$ python3 xpk.py job ls --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run [XPK] Starting xpk [XPK] Working on golden-project and us-central1-a [XPK] Try 1: get-credentials to cluster golden-cluster diff --git a/goldens/NAP_cluster-create.txt b/goldens/NAP_cluster-create.txt index 863da3314..ed57b1de8 100644 --- a/goldens/NAP_cluster-create.txt +++ b/goldens/NAP_cluster-create.txt @@ -1,3 +1,4 @@ +$ python3 xpk.py cluster create --project=golden-project --zone=us-central1-a --enable-autoprovisioning --cluster=golden-cluster --tpu-type=tpu7x-8 --on-demand --dry-run [XPK] Starting xpk [XPK] Starting cluster create for cluster golden-cluster: [XPK] Working on golden-project and us-central1-a diff --git a/goldens/NAP_cluster-create_with_pathways.txt b/goldens/NAP_cluster-create_with_pathways.txt index 58bedcf6f..787bfe0ef 100644 --- a/goldens/NAP_cluster-create_with_pathways.txt +++ b/goldens/NAP_cluster-create_with_pathways.txt @@ -1,3 +1,4 @@ +$ python3 xpk.py cluster create-pathways --project=golden-project --zone=us-central1-a --enable-autoprovisioning --cluster=golden-cluster --tpu-type=tpu7x-8 --on-demand --dry-run [XPK] Starting xpk [XPK] Starting cluster create for cluster golden-cluster: [XPK] Working on golden-project and us-central1-a diff --git a/goldens/Storage_list.txt b/goldens/Storage_list.txt index 8da11f27e..81db59e91 100644 --- a/goldens/Storage_list.txt +++ b/goldens/Storage_list.txt @@ -1,3 +1,4 @@ +$ python3 xpk.py storage list --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run [XPK] Starting xpk NAME TYPE AUTO MOUNT MOUNT POINT READONLY MANIFEST ------ ------ ------------ ------------- ---------- ---------- diff --git a/goldens/Workload_create.txt b/goldens/Workload_create.txt index 849c85e56..294810256 100644 --- a/goldens/Workload_create.txt +++ b/goldens/Workload_create.txt @@ -1,3 +1,4 @@ +$ python3 xpk.py workload create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --workload=golden-workload --command "bash hello" --tpu-type=v5p-8 --num-slices=1 --script-dir=/tmp --dry-run [XPK] Starting xpk [XPK] Task: `Check if Workload Already Exists` is implemented by the following command not running since it is a dry run. kubectl get workloads -o=custom-columns='Jobset:.metadata.ownerReferences[0].name' diff --git a/goldens/Workload_create_pathways.txt b/goldens/Workload_create_pathways.txt index e05b6840d..0071893f3 100644 --- a/goldens/Workload_create_pathways.txt +++ b/goldens/Workload_create_pathways.txt @@ -1,3 +1,4 @@ +$ python3 xpk.py workload create-pathways --project=golden-project --zone=us-central1-a --cluster=golden-cluster --workload=golden-workload --command "bash hello" --tpu-type=v5p-8 --num-slices=1 --script-dir=/tmp --dry-run [XPK] Starting xpk [XPK] Task: `Check if Workload Already Exists` is implemented by the following command not running since it is a dry run. kubectl get workloads -o=custom-columns='Jobset:.metadata.ownerReferences[0].name' diff --git a/goldens/Workload_delete.txt b/goldens/Workload_delete.txt index c719630aa..683c8cca6 100644 --- a/goldens/Workload_delete.txt +++ b/goldens/Workload_delete.txt @@ -1,3 +1,4 @@ +$ python3 xpk.py workload delete --project=golden-project --zone=us-central1-a --cluster=golden-cluster --workload=golden-workload --dry-run [XPK] Starting xpk [XPK] Starting Workload delete [XPK] Working on golden-project and us-central1-a diff --git a/goldens/Workload_list.txt b/goldens/Workload_list.txt index 4c608fdfb..e23d631fb 100644 --- a/goldens/Workload_list.txt +++ b/goldens/Workload_list.txt @@ -1,3 +1,4 @@ +$ python3 xpk.py workload list --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run [XPK] Starting xpk [XPK] Starting workload list [XPK] Working on golden-project and us-central1-a