From 03cd86a96402eed979dd3db6ca520058f268d15a Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Mon, 22 Sep 2025 08:26:55 +0000 Subject: [PATCH] feat: add more goldens --- golden_buddy.sh | 5 +- goldens.yaml | 26 +++++++ goldens/Basic_cluster_create.txt | 101 +++++++++++++++++++++++++ goldens/Batch.txt | 14 ++++ goldens/Cluster_create_private.txt | 108 +++++++++++++++++++++++++++ goldens/Cluster_delete.txt | 16 ++++ goldens/Cluster_delete_force.txt | 13 ++++ goldens/Job_cancel.txt | 9 +++ goldens/Job_info.txt | 20 +++++ goldens/Job_list.txt | 9 +++ goldens/Storage_list.txt | 4 + goldens/Workload_create.txt | 33 ++++++++ goldens/Workload_create_pathways.txt | 32 ++++++++ goldens/Workload_delete.txt | 12 +++ goldens/Workload_list.txt | 12 +++ src/xpk/commands/batch.py | 16 ++-- src/xpk/commands/common.py | 4 + src/xpk/commands/job.py | 32 +++++++- src/xpk/commands/storage.py | 7 +- src/xpk/commands/workload.py | 17 +++-- src/xpk/core/cluster_private.py | 4 +- src/xpk/core/docker_image.py | 17 ++++- src/xpk/core/docker_resources.py | 13 +++- src/xpk/core/pathways.py | 6 +- src/xpk/core/scheduling.py | 4 + src/xpk/utils/network.py | 4 + 26 files changed, 507 insertions(+), 31 deletions(-) create mode 100644 goldens/Basic_cluster_create.txt create mode 100644 goldens/Batch.txt create mode 100644 goldens/Cluster_create_private.txt create mode 100644 goldens/Cluster_delete.txt create mode 100644 goldens/Cluster_delete_force.txt create mode 100644 goldens/Job_cancel.txt create mode 100644 goldens/Job_info.txt create mode 100644 goldens/Job_list.txt create mode 100644 goldens/Storage_list.txt create mode 100644 goldens/Workload_create.txt create mode 100644 goldens/Workload_create_pathways.txt create mode 100644 goldens/Workload_delete.txt create mode 100644 goldens/Workload_list.txt diff --git a/golden_buddy.sh b/golden_buddy.sh index 4b3745f8c..bfae97dbb 100755 --- a/golden_buddy.sh +++ b/golden_buddy.sh @@ -90,8 +90,9 @@ fi mkdir -p "$GOLDENS_DIR" -cat "$GOLDENS_FILE" | yq -r '.goldens | to_entries[] | [.key, .value.command] | @tsv' | \ - while IFS=$'\t' read -r key command; do +yq -r '.goldens | keys[]' "$GOLDENS_FILE" | \ + while read -r key; do + command=$(yq -r '.goldens["'"$key"'"].command' "$GOLDENS_FILE") if [[ "$MODE" = "update" ]]; then printf "${YELLOW}Updating: %s${NC}\n" "$key" fi diff --git a/goldens.yaml b/goldens.yaml index f07091d78..c06f2883e 100644 --- a/goldens.yaml +++ b/goldens.yaml @@ -3,3 +3,29 @@ goldens: command: python3 xpk.py cluster create-pathways --project=golden-project --zone=us-central1-a --enable-autoprovisioning --cluster=golden-cluster --tpu-type=tpu7x-8 --on-demand --dry-run "NAP cluster-create": command: python3 xpk.py cluster create --project=golden-project --zone=us-central1-a --enable-autoprovisioning --cluster=golden-cluster --tpu-type=tpu7x-8 --on-demand --dry-run + "Basic cluster create": + command: python3 xpk.py cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-8 --spot --dry-run + "Cluster create private": + command: python3 xpk.py cluster create-pathways --project=golden-project --zone=us-central1-a --cluster=golden-cluster-private --private --tpu-type=v5p-8 --num-slices=1 --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation=golden-reservation --dry-run + "Cluster delete": + command: python3 xpk.py cluster delete --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run + "Cluster delete force": + command: python3 xpk.py cluster delete --project=golden-project --zone=us-central1-a --cluster=golden-cluster --force --dry-run + "Workload create": + command: python3 xpk.py workload create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --workload=golden-workload --command "bash hello" --tpu-type=v5p-8 --num-slices=1 --script-dir=/tmp --dry-run + "Workload create pathways": + command: python3 xpk.py workload create-pathways --project=golden-project --zone=us-central1-a --cluster=golden-cluster --workload=golden-workload --command "bash hello" --tpu-type=v5p-8 --num-slices=1 --script-dir=/tmp --dry-run + "Workload delete": + command: python3 xpk.py workload delete --project=golden-project --zone=us-central1-a --cluster=golden-cluster --workload=golden-workload --dry-run + "Workload list": + command: python3 xpk.py workload list --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run + "Storage list": + command: python3 xpk.py storage list --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run + "Job cancel": + command: python3 xpk.py job cancel golden-job --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run + "Batch": + command: python3 xpk.py batch --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run batch-read.sh + "Job list": + command: python3 xpk.py job ls --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run + "Job info": + command: python3 xpk.py job info golden-job --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run diff --git a/goldens/Basic_cluster_create.txt b/goldens/Basic_cluster_create.txt new file mode 100644 index 000000000..df7685a99 --- /dev/null +++ b/goldens/Basic_cluster_create.txt @@ -0,0 +1,101 @@ +[XPK] Starting xpk +[XPK] Starting cluster create for cluster golden-cluster: +[XPK] Working on golden-project and us-central1-a +[XPK] Task: `Determine server supported GKE versions for default rapid gke version` is implemented by the following command not running since it is a dry run. +gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)" +[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run. +gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.validVersions)" +[XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run. +gcloud container clusters list --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --location-policy=BALANCED --scopes=storage-full,gke-default +[XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run. +gcloud container clusters describe golden-cluster --project=golden-project --region=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)" +[XPK] Private Nodes is not enabled on the cluster. +[XPK] Cluster is public and no need to authorize networks. +[XPK] Try 1: get-credentials to cluster golden-cluster +[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system +[XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run. +kubectl get deployment coredns -n kube-system +[XPK] Now verifying CoreDNS readiness... +[XPK] Task: `Waiting for kubeDNS to be checked.` is implemented by the following command not running since it is a dry run. +kubectl get deployment kube-dns -n kube-system --ignore-not-found +[XPK] kube-dns deployment not found. +[XPK] Verifying if CoreDNS is available... +[XPK] Task: `Wait for coredns available` is implemented by the following command not running since it is a dry run. +kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-system --timeout=240s +[XPK] CoreDNS has successfully started and passed verification. +[XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'. +[XPK] Skipping CoreDNS deployment since it already exists. +[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters describe golden-cluster --region us-central1 --project golden-project --format="value(currentMasterVersion)" +[XPK] Creating 1 node pool or pools of tpu7x-8 +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Creating 1 node pool or pools of tpu7x-8 +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --region=us-central1 --format="value(locations)" +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Existing node pool names ['0'] +[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --region=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --spot --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 +[XPK] Breaking up a total of 1 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Create or delete node pool request complete. +[XPK] Creating ConfigMap for cluster +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available +[XPK] Try 1: Install Jobset on golden-cluster +[XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating jobset Controller Manager resources +[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 +[XPK] Try 1: Install PathwaysJob on golden-cluster +[XPK] Task: `Install PathwaysJob on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.2/install.yaml +[XPK] Enabling Kueue on the cluster +[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run. +kubectl kueue version +[XPK] Try 1: Set Kueue On Cluster +[XPK] Task: `Set Kueue On Cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml +[XPK] Wait for Kueue to be fully available +[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. +kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m +[XPK] Install Kueue Custom Resources +[XPK] Try 1: Applying Kueue Custom Resources +[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 0f9af4fcbf6d012aed0c8e09d49827dc34495d3eed2893af6b68b2d121a519ac +[XPK] Update Kueue Controller Manager resources +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating Kueue Controller Manager resources +[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 012e1b15b6941e9d47cb2cdb35488d57c2f3ce0ef0b18093d2759f2e02ed81dc +[XPK] Verifying kjob installation +[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run. +kubectl-kjob help +[XPK] kjob found +[XPK] Applying kjob CDRs +[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run. +kubectl kjob printcrds | kubectl apply --server-side -f - +[XPK] Creating kjob CRDs succeeded +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 +[XPK] GKE commands done! Resources are created. +[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project +[XPK] Exiting XPK cleanly diff --git a/goldens/Batch.txt b/goldens/Batch.txt new file mode 100644 index 000000000..483cd914f --- /dev/null +++ b/goldens/Batch.txt @@ -0,0 +1,14 @@ +[XPK] Starting xpk +[XPK] Working on golden-project and us-central1-a +[XPK] Try 1: get-credentials to cluster golden-cluster +[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-metadata-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `submit job` is implemented by the following command not running since it is a dry run. +kubectl kjob create slurm --profile xpk-def-app-profile --localqueue multislice-queue --worker-container xpk-batch-container --first-node-ip --pod-template-annotation kueue.x-k8s.io/podset-preferred-topology=cloud.google.com/gce-topology-host -- batch-read.sh --partition multislice-queue +[XPK] XPK Done. diff --git a/goldens/Cluster_create_private.txt b/goldens/Cluster_create_private.txt new file mode 100644 index 000000000..400b59adb --- /dev/null +++ b/goldens/Cluster_create_private.txt @@ -0,0 +1,108 @@ +[XPK] Starting xpk +[XPK] Starting cluster create for cluster golden-cluster-private: +[XPK] Working on golden-project and us-central1-a +[XPK] Task: `Determine server supported GKE versions for default rapid gke version` is implemented by the following command not running since it is a dry run. +gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)" +[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run. +gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.validVersions)" +[XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run. +gcloud container clusters list --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters create golden-cluster-private --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=n1-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 4 --enable-dns-access --autoscaling-profile=optimize-utilization --enable-master-authorized-networks --enable-private-nodes --location-policy=BALANCED --scopes=storage-full,gke-default --enable-ip-alias +[XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run. +gcloud container clusters describe golden-cluster-private --project=golden-project --region=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)" +[XPK] Private Nodes is not enabled on the cluster. +[XPK] Task: `Fetching the list of authorized network from cluster describe.` is implemented by the following command not running since it is a dry run. +gcloud container clusters describe golden-cluster-private --project=golden-project --region=us-central1 --format="value(masterAuthorizedNetworksConfig.cidrBlocks[].cidrBlock)" +[XPK] Current machine's IP adrress is already authorized. +[XPK] Try 1: get-credentials to cluster golden-cluster-private +[XPK] Task: `get-credentials to cluster golden-cluster-private` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster-private --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system +[XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run. +kubectl get deployment coredns -n kube-system +[XPK] Now verifying CoreDNS readiness... +[XPK] Task: `Waiting for kubeDNS to be checked.` is implemented by the following command not running since it is a dry run. +kubectl get deployment kube-dns -n kube-system --ignore-not-found +[XPK] kube-dns deployment not found. +[XPK] Verifying if CoreDNS is available... +[XPK] Task: `Wait for coredns available` is implemented by the following command not running since it is a dry run. +kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-system --timeout=240s +[XPK] CoreDNS has successfully started and passed verification. +[XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'. +[XPK] Skipping CoreDNS deployment since it already exists. +[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters describe golden-cluster-private --region us-central1 --project golden-project --format="value(currentMasterVersion)" +[XPK] Creating 1 node pool or pools of v5p-8 +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=1, device_type='v5p-8') +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster-private --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. +gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a +[XPK] Creating 1 node pool or pools of v5p-8 +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=1, device_type='v5p-8') +[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools describe 0 --cluster golden-cluster-private --project=golden-project --region=us-central1 --format="value(locations)" +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-private-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Existing node pool names ['0'] +[XPK] To complete NodepoolCreate-golden-cluster-private-np-0 we are executing gcloud beta container node-pools create golden-cluster-private-np-0 --region=us-central1 --cluster=golden-cluster-private --project=golden-project --node-locations=us-central1-a --machine-type=ct5p-hightpu-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 +[XPK] To complete NodepoolCreate-cpu-np we are executing gcloud beta container node-pools create cpu-np --node-version=0 --cluster=golden-cluster-private --project=golden-project --node-locations=us-central1-a --region=us-central1 --num-nodes=1 --machine-type=n2-standard-64 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --enable-autoscaling --min-nodes=1 --max-nodes=20 +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Create or delete node pool request complete. +[XPK] Creating ConfigMap for cluster +[XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. +gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available +[XPK] Try 1: Install Jobset on golden-cluster-private +[XPK] Task: `Install Jobset on golden-cluster-private` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating jobset Controller Manager resources +[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 +[XPK] Try 1: Install PathwaysJob on golden-cluster-private +[XPK] Task: `Install PathwaysJob on golden-cluster-private` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.2/install.yaml +[XPK] Enabling Kueue on the cluster +[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run. +kubectl kueue version +[XPK] Try 1: Set Kueue On Cluster +[XPK] Task: `Set Kueue On Cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml +[XPK] Wait for Kueue to be fully available +[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. +kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m +[XPK] Install Kueue Custom Resources +[XPK] Try 1: Applying Kueue Custom Resources +[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f e528cf13756aaba6ef43789e09518fb9bcc6d43a945b51fb76d16e9869c73eec +[XPK] Update Kueue Controller Manager resources +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating Kueue Controller Manager resources +[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 012e1b15b6941e9d47cb2cdb35488d57c2f3ce0ef0b18093d2759f2e02ed81dc +[XPK] Verifying kjob installation +[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run. +kubectl-kjob help +[XPK] kjob found +[XPK] Applying kjob CDRs +[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run. +kubectl kjob printcrds | kubectl apply --server-side -f - +[XPK] Creating kjob CRDs succeeded +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-private-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 +[XPK] GKE commands done! Resources are created. +[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster-private/details?project=golden-project +[XPK] Exiting XPK cleanly diff --git a/goldens/Cluster_delete.txt b/goldens/Cluster_delete.txt new file mode 100644 index 000000000..1ae8aa955 --- /dev/null +++ b/goldens/Cluster_delete.txt @@ -0,0 +1,16 @@ +[XPK] Starting xpk +[XPK] Starting cluster delete for cluster: golden-cluster +[XPK] Working on golden-project and us-central1-a +[XPK] Try 1: get-credentials to cluster golden-cluster +[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Get the name of the workloads in the cluster. +[XPK] Task: `List Jobs with filter-by-status=EVERYTHING` is implemented by the following command not running since it is a dry run. +kubectl get workloads --ignore-not-found -o=custom-columns="Jobset Name:.metadata.ownerReferences[0].name,Created Time:.metadata.creationTimestamp,Priority:.spec.priorityClassName,TPU VMs Needed:.spec.podSets[0].count,TPU VMs Running/Ran:.status.admission.podSetAssignments[-1].count,TPU VMs Done:.status.reclaimablePods[0].count,Status:.status.conditions[-1].type,Status Message:.status.conditions[-1].message,Status Time:.status.conditions[-1].lastTransitionTime" +[XPK] Task: `Cluster Delete` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters delete golden-cluster --project=golden-project --region=us-central1 --quiet +[XPK] Task: `Get All Subnets` is implemented by the following command not running since it is a dry run. +gcloud compute networks subnets list --filter=name~"golden-cluster-us-central1-sub-*" --project=golden-project +[XPK] GKE commands done! Cluster golden-cluster deleted. + +[XPK] Exiting XPK cleanly diff --git a/goldens/Cluster_delete_force.txt b/goldens/Cluster_delete_force.txt new file mode 100644 index 000000000..060b5e5f0 --- /dev/null +++ b/goldens/Cluster_delete_force.txt @@ -0,0 +1,13 @@ +[XPK] Starting xpk +[XPK] Starting cluster delete for cluster: golden-cluster +[XPK] Working on golden-project and us-central1-a +[XPK] Try 1: get-credentials to cluster golden-cluster +[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Task: `Cluster Delete` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters delete golden-cluster --project=golden-project --region=us-central1 --quiet +[XPK] Task: `Get All Subnets` is implemented by the following command not running since it is a dry run. +gcloud compute networks subnets list --filter=name~"golden-cluster-us-central1-sub-*" --project=golden-project +[XPK] GKE commands done! Cluster golden-cluster deleted. + +[XPK] Exiting XPK cleanly diff --git a/goldens/Job_cancel.txt b/goldens/Job_cancel.txt new file mode 100644 index 000000000..2e97a48aa --- /dev/null +++ b/goldens/Job_cancel.txt @@ -0,0 +1,9 @@ +[XPK] Starting xpk +[XPK] Starting job cancel for job: ['golden-job'] +[XPK] Working on golden-project and us-central1-a +[XPK] Try 1: get-credentials to cluster golden-cluster +[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Task: `delete job` is implemented by the following command not running since it is a dry run. +kubectl-kjob delete slurm golden-job +[XPK] Exiting XPK cleanly diff --git a/goldens/Job_info.txt b/goldens/Job_info.txt new file mode 100644 index 000000000..4e8765887 --- /dev/null +++ b/goldens/Job_info.txt @@ -0,0 +1,20 @@ +[XPK] Starting xpk +[XPK] Task: `Getting job data` is implemented by the following command not running since it is a dry run. +kubectl-kjob describe slurm golden-job +[XPK] Task: `Getting job info` is implemented by the following command not running since it is a dry run. +kubectl-kjob list slurm -o yaml --field-selector metadata.name==golden-job +[XPK] Task: `Getting pods list` is implemented by the following command not running since it is a dry run. +kubectl get pods -l=job-name=golden-job --no-headers +Job name: golden-job +Script name: echo hello +Profile: '' +Labels: + kjobctl.x-k8s.io/app-profile: default +Mounts: [] +Pods: +- Name: foo-pod + Status: Running +- Name: bar-pod + Status: Evicted +Entrypoint environment variables template: [] +[XPK] XPK Done. diff --git a/goldens/Job_list.txt b/goldens/Job_list.txt new file mode 100644 index 000000000..76053723d --- /dev/null +++ b/goldens/Job_list.txt @@ -0,0 +1,9 @@ +[XPK] Starting xpk +[XPK] Working on golden-project and us-central1-a +[XPK] Try 1: get-credentials to cluster golden-cluster +[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Listing jobs for project golden-project and zone us-central1-a: +[XPK] Task: `list jobs` is implemented by the following command not running since it is a dry run. +kubectl-kjob list slurm --profile xpk-def-app-profile +[XPK] Exiting XPK cleanly diff --git a/goldens/Storage_list.txt b/goldens/Storage_list.txt new file mode 100644 index 000000000..8da11f27e --- /dev/null +++ b/goldens/Storage_list.txt @@ -0,0 +1,4 @@ +[XPK] Starting xpk +NAME TYPE AUTO MOUNT MOUNT POINT READONLY MANIFEST +------ ------ ------------ ------------- ---------- ---------- +[XPK] XPK Done. diff --git a/goldens/Workload_create.txt b/goldens/Workload_create.txt new file mode 100644 index 000000000..849c85e56 --- /dev/null +++ b/goldens/Workload_create.txt @@ -0,0 +1,33 @@ +[XPK] Starting xpk +[XPK] Task: `Check if Workload Already Exists` is implemented by the following command not running since it is a dry run. +kubectl get workloads -o=custom-columns='Jobset:.metadata.ownerReferences[0].name' +[XPK] Starting workload create +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Starting workload create +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-metadata-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] gke_accelerator type not found in config map: golden-cluster-resources-configmap. Autoprovisioning is not enabled. +[XPK] No gcsfuse Storages to add detected +[XPK] No gcp filestore instances to add detected. +[XPK] No gcp parallelstore instances to add detected. +[XPK] No gce persistent disk instances to add detected. +[XPK] No managed lustre instances to add detected. +[XPK] Building /tmp into docker image. +[XPK] Task: `Building script_dir into docker image` is implemented by the following command not running since it is a dry run. +docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94c652303f1dc0fecad085ea0993f688 -t dry-run-runner /tmp +[XPK] Adding Docker Image: gcr.io/golden-project/dry-run-runner:prefix-current to golden-project +[XPK] Task: `Tag Docker Image` is implemented by the following command not running since it is a dry run. +docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current +[XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. +docker push gcr.io/golden-project/dry-run-runner:prefix-current +[XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. +kubectl apply -f 635bfd38f34d48a6cc3863a2a2b00acfabe36ea1b6737e0cc816467a41fca144 +[XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. +gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error +[XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard. +[XPK] Follow your workload here: https://console.cloud.google.com/kubernetes/service/us-central1/golden-cluster/default/golden-workload/details?project=golden-project +[XPK] Follow your worker 0, slice 0 logs here: Adjust the pod name ([prefix]-slice-job-[slice_number]-[worker_number]) after clicking the url if you want other worker logs. https://console.cloud.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.project_id%3D%22golden-project%22%0Aresource.labels.location%3D%22us-central1%22%0Aresource.labels.cluster_name%3D%22golden-cluster%22%0Aresource.labels.namespace_name%3D%22default%22%0Aresource.labels.pod_name:%22golden-workload-slice-job-0-0-%22%20severity%3E%3DDEFAULT;storageScope=project;duration=P1D?e=13802955&mods=allow_workbench_image_override&project=golden-project +[XPK] Exiting XPK cleanly diff --git a/goldens/Workload_create_pathways.txt b/goldens/Workload_create_pathways.txt new file mode 100644 index 000000000..e05b6840d --- /dev/null +++ b/goldens/Workload_create_pathways.txt @@ -0,0 +1,32 @@ +[XPK] Starting xpk +[XPK] Task: `Check if Workload Already Exists` is implemented by the following command not running since it is a dry run. +kubectl get workloads -o=custom-columns='Jobset:.metadata.ownerReferences[0].name' +[XPK] Starting workload create +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Starting workload create +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-metadata-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] gke_accelerator type not found in config map: golden-cluster-resources-configmap. Autoprovisioning is not enabled. +[XPK] Task: `Check if PathwaysJob is installed on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl get pods -n pathways-job-system --no-headers -o custom-columns=NAME:.metadata.name +[XPK] check_if_pathways_job_is_installed 0 0 +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Building /tmp into docker image. +[XPK] Task: `Building script_dir into docker image` is implemented by the following command not running since it is a dry run. +docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94c652303f1dc0fecad085ea0993f688 -t dry-run-runner /tmp +[XPK] Adding Docker Image: gcr.io/golden-project/dry-run-runner:prefix-current to golden-project +[XPK] Task: `Tag Docker Image` is implemented by the following command not running since it is a dry run. +docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current +[XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. +docker push gcr.io/golden-project/dry-run-runner:prefix-current +[XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. +kubectl apply -f 871fa8b4813a0c43d7d5f0088986e20d11d4f093d6986a542d92a9420afa632b +[XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. +gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error +[XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard. +[XPK] Follow your Pathways workload and other resources here : https://console.cloud.google.com/logs/query;query=resource.type%3D"k8s_container"%0Aresource.labels.project_id%3D"golden-project"%0Aresource.labels.location%3D"us-central1"%0Aresource.labels.cluster_name%3D"golden-cluster"%0Aresource.labels.pod_name:"golden-workload-"%0Aseverity>%3DDEFAULT +[XPK] Exiting XPK cleanly diff --git a/goldens/Workload_delete.txt b/goldens/Workload_delete.txt new file mode 100644 index 000000000..c719630aa --- /dev/null +++ b/goldens/Workload_delete.txt @@ -0,0 +1,12 @@ +[XPK] Starting xpk +[XPK] Starting Workload delete +[XPK] Working on golden-project and us-central1-a +[XPK] Try 1: get-credentials to cluster golden-cluster +[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Task: `Check if PathwaysJob is installed on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl get pods -n pathways-job-system --no-headers -o custom-columns=NAME:.metadata.name +[XPK] check_if_pathways_job_is_installed 0 0 +[XPK] Task: `Delete Workload` is implemented by the following command not running since it is a dry run. +kubectl delete pathwaysjob golden-workload -n default +[XPK] Exiting XPK cleanly diff --git a/goldens/Workload_list.txt b/goldens/Workload_list.txt new file mode 100644 index 000000000..4c608fdfb --- /dev/null +++ b/goldens/Workload_list.txt @@ -0,0 +1,12 @@ +[XPK] Starting xpk +[XPK] Starting workload list +[XPK] Working on golden-project and us-central1-a +[XPK] Try 1: get-credentials to cluster golden-cluster +[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Task: `List Jobs with filter-by-status=EVERYTHING with filter-by-job=None` is implemented by the following command not running since it is a dry run. +kubectl get workloads --ignore-not-found -o=custom-columns="Jobset Name:.metadata.ownerReferences[0].name,Created Time:.metadata.creationTimestamp,Priority:.spec.priorityClassName,TPU VMs Needed:.spec.podSets[0].count,TPU VMs Running/Ran:.status.admission.podSetAssignments[-1].count,TPU VMs Done:.status.reclaimablePods[0].count,Status:.status.conditions[-1].type,Status Message:.status.conditions[-1].message,Status Time:.status.conditions[-1].lastTransitionTime" +[XPK] Workload List Output: +0 +[XPK] See your workloads in Cloud Console: https://console.cloud.google.com/kubernetes/aiml/deployments/jobs?project=golden-project +[XPK] Exiting XPK cleanly diff --git a/src/xpk/commands/batch.py b/src/xpk/commands/batch.py index 9759e23e3..e52edb9a8 100644 --- a/src/xpk/commands/batch.py +++ b/src/xpk/commands/batch.py @@ -31,6 +31,7 @@ ) from ..core.kueue import LOCAL_QUEUE_NAME from ..utils.console import xpk_exit, xpk_print +from ..utils.execution_context import is_dry_run from .kind import set_local_cluster_command from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command @@ -51,18 +52,16 @@ def batch(args: Namespace) -> None: if set_cluster_command_code != 0: xpk_exit(set_cluster_command_code) - err_code = prepare_kjob(args) - if err_code > 0: - xpk_exit(err_code) - setup_k8s_service_accounts() + if not is_dry_run(): + err_code = prepare_kjob(args) + if err_code > 0: + xpk_exit(err_code) + setup_k8s_service_accounts() submit_job(args) def submit_job(args: Namespace) -> None: - - setup_k8s_service_accounts() - cmd = ( 'kubectl kjob create slurm' f' --profile {AppProfileDefaults.NAME.value}' @@ -73,7 +72,8 @@ def submit_job(args: Namespace) -> None: cmd = add_gpu_networking_annotations_to_command(args, cmd) cmd = add_TAS_annotations_to_command(args, cmd) - for annotation in get_storage_annotations(args): + annotations = [] if is_dry_run() else get_storage_annotations(args) + for annotation in annotations: cmd += f' --pod-template-annotation {annotation}' if args.ignore_unknown_flags: diff --git a/src/xpk/commands/common.py b/src/xpk/commands/common.py index 7020a817f..53a377ac5 100644 --- a/src/xpk/commands/common.py +++ b/src/xpk/commands/common.py @@ -18,6 +18,7 @@ from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType from ..core.gcloud_context import zone_to_region from ..utils.console import xpk_print, xpk_exit +from ..utils.execution_context import is_dry_run from ..core.system_characteristics import ( SystemCharacteristics, ) @@ -63,6 +64,9 @@ def is_TAS_possible( True if possible and False otherwise. """ + if is_dry_run(): + return True + if system_characteristics is None: xpk_print('system_characteristics data was not found in configmaps.') xpk_exit(1) diff --git a/src/xpk/commands/job.py b/src/xpk/commands/job.py index 250fa9946..4d9d21457 100644 --- a/src/xpk/commands/job.py +++ b/src/xpk/commands/job.py @@ -28,6 +28,28 @@ from .kind import set_local_cluster_command +JOBS_DRY_RUN_YAML = """ +items: +- apiVersion: slurm.k8s.io/v1alpha1 + kind: SlurmJob + metadata: + annotations: + kjobctl.x-k8s.io/script: echo hello + creationTimestamp: '2024-04-29T12:00:00Z' + labels: + kjobctl.x-k8s.io/app-profile: default + name: golden-job + namespace: default + spec: + script: echo hello +""" + +PODS_DRY_RUN_RESULT = """ +foo-pod 2/2 Running 0 2d +bar-pod 1/1 Evicted 0 1d +""" + + def job_info(args): """Run commands obtaining information about a job given by name. @@ -52,7 +74,10 @@ def job_info(args): f' metadata.name=={job_name}' ) job_code, job_text = run_command_for_value( - job_command, 'Getting job info', args + job_command, + 'Getting job info', + args, + dry_run_return_val=JOBS_DRY_RUN_YAML, ) if job_code != 0: xpk_print(f'Job info request returned ERROR {job_code}') @@ -60,7 +85,10 @@ def job_info(args): pods_command = f'kubectl get pods -l=job-name={job_name} --no-headers' pods_code, pods_text = run_command_for_value( - pods_command, 'Getting pods list', args + pods_command, + 'Getting pods list', + args, + dry_run_return_val=PODS_DRY_RUN_RESULT, ) if pods_code != 0: xpk_print(f'Pods list request returned ERROR {pods_code}') diff --git a/src/xpk/commands/storage.py b/src/xpk/commands/storage.py index fb3ba85ca..e27cdcfbc 100644 --- a/src/xpk/commands/storage.py +++ b/src/xpk/commands/storage.py @@ -58,6 +58,7 @@ ) from ..utils.console import get_user_input, xpk_exit, xpk_print from ..utils.kubectl import apply_kubectl_manifest +from ..utils.execution_context import is_dry_run def storage_create(args: Namespace) -> None: @@ -243,8 +244,10 @@ def enable_csi_drivers_if_necessary(args: Namespace) -> None: def storage_list(args: Namespace) -> None: - k8s_api_client = setup_k8s_env(args) - storages = list_storages(k8s_api_client) + storages = [] + if not is_dry_run(): + k8s_api_client = setup_k8s_env(args) + storages = list_storages(k8s_api_client) print_storages_for_cluster(storages) diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index 548d5c47f..66e061de8 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -97,6 +97,7 @@ ) from ..utils.console import get_user_input, xpk_exit, xpk_print from ..utils.file import write_tmp_file +from ..utils.execution_context import is_dry_run from . import cluster_gcluster from .common import is_TAS_possible @@ -306,8 +307,10 @@ def workload_create(args) -> None: Returns: 0 if successful and 1 otherwise. """ - k8s_api_client = setup_k8s_env(args) - setup_k8s_service_accounts() + k8s_api_client = None + if not is_dry_run(): + k8s_api_client = setup_k8s_env(args) + setup_k8s_service_accounts() workload_exists = check_if_workload_exists(args) @@ -383,8 +386,10 @@ def workload_create(args) -> None: all_storages = [] # Currently storage customization is not supported for Pathways workloads. b/408468941 if not args.use_pathways: - storages: list[Storage] = get_storages_to_mount( - k8s_api_client, args.storage + storages: list[Storage] = ( + [] + if k8s_api_client is None + else get_storages_to_mount(k8s_api_client, args.storage) ) gcs_fuse_storages = list( filter(lambda storage: storage.type == GCS_FUSE_TYPE, storages) @@ -576,7 +581,7 @@ def workload_create(args) -> None: xpk_print(f'Create Workload request returned ERROR {return_code}') xpk_exit(return_code) - if not args.use_pathways: + if not args.use_pathways and not is_dry_run(): add_bucket_iam_members(args, storages) # Get GKE outlier dashboard for TPU @@ -747,8 +752,6 @@ def workload_list(args) -> None: Returns: 0 if successful and 1 otherwise. """ - xpk_print(args) - xpk_print('Starting workload list', flush=True) add_zone_and_project(args) get_cluster_credentials(args) diff --git a/src/xpk/core/cluster_private.py b/src/xpk/core/cluster_private.py index b5212b6da..3dd4b7f8e 100644 --- a/src/xpk/core/cluster_private.py +++ b/src/xpk/core/cluster_private.py @@ -19,6 +19,7 @@ add_current_machine_to_networks, is_current_machine_in_any_network, ) +from ..utils.execution_context import is_dry_run from ..utils.objects import is_text_true from .commands import run_command_for_value, run_command_with_updates from .gcloud_context import zone_to_region @@ -37,7 +38,7 @@ def authorize_private_cluster_access_if_necessary(args) -> int: if not args.private and args.authorized_networks is None: xpk_print('Cluster is public and no need to authorize networks.') return 0 - else: + elif not is_dry_run(): xpk_print( 'Cannot convert an existing public cluster to private. The arguments' ' --private and --authorized-networks are not acceptable for public' @@ -164,6 +165,7 @@ def get_cluster_authorized_networks(args) -> list[str]: command, 'Fetching the list of authorized network from cluster describe.', args, + dry_run_return_val='127.0.0.1/32', ) if return_code != 0: diff --git a/src/xpk/core/docker_image.py b/src/xpk/core/docker_image.py index c31b2c9fc..75050eb8d 100644 --- a/src/xpk/core/docker_image.py +++ b/src/xpk/core/docker_image.py @@ -21,6 +21,7 @@ from ..utils.console import xpk_exit, xpk_print from ..utils.file import write_tmp_file +from ..utils.execution_context import is_dry_run from .commands import run_command_with_updates DEFAULT_DOCKER_IMAGE = 'python:3.10' @@ -75,7 +76,9 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]: """ # Pick a name for the docker image. - docker_image_prefix = os.getenv('USER', 'unknown') + docker_image_prefix = ( + 'dry-run' if is_dry_run() else os.getenv('USER', 'unknown') + ) docker_name = f'{docker_image_prefix}-runner' script_dir_dockerfile = """FROM {base_docker_image} @@ -114,10 +117,16 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]: # Pick a randomly generated `tag_length` character docker tag. tag_length = 4 - tag_random_prefix = ''.join( - random.choices(string.ascii_lowercase, k=tag_length) + tag_random_prefix = ( + 'prefix' + if is_dry_run() + else ''.join(random.choices(string.ascii_lowercase, k=tag_length)) + ) + tag_datetime = ( + 'current' + if is_dry_run() + else datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') ) - tag_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') tag_name = f'{tag_random_prefix}-{tag_datetime}' cloud_docker_image = f'gcr.io/{args.project}/{docker_name}:{tag_name}' xpk_print(f'Adding Docker Image: {cloud_docker_image} to {args.project}') diff --git a/src/xpk/core/docker_resources.py b/src/xpk/core/docker_resources.py index a36651e47..0519845e9 100644 --- a/src/xpk/core/docker_resources.py +++ b/src/xpk/core/docker_resources.py @@ -20,6 +20,7 @@ from .cluster import setup_k8s_env from .storage import GCS_FUSE_TYPE, GCP_FILESTORE_TYPE, PARALLELSTORE_TYPE, GCE_PD_TYPE, LUSTRE_TYPE, Storage, get_storages_to_mount from .system_characteristics import AcceleratorType, SystemCharacteristics +from ..utils.execution_context import is_dry_run def get_main_container_resources( @@ -272,8 +273,10 @@ def get_volumes(args, system: SystemCharacteristics) -> str: - name: shared-data """ - storages: list[Storage] = get_storages_to_mount( - setup_k8s_env(args), args.storage + storages: list[Storage] = ( + [] + if is_dry_run() + else get_storages_to_mount(setup_k8s_env(args), args.storage) ) for storage in storages: if storage.type in { @@ -325,8 +328,10 @@ def get_volume_mounts(args, system: SystemCharacteristics) -> str: elif system.accelerator_type == AcceleratorType['GPU']: volume_mount_yaml = '' - storages: list[Storage] = get_storages_to_mount( - setup_k8s_env(args), args.storage + storages: list[Storage] = ( + [] + if is_dry_run() + else get_storages_to_mount(setup_k8s_env(args), args.storage) ) for storage in storages: if storage.type in { diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py index 291afef68..017fb885e 100644 --- a/src/xpk/core/pathways.py +++ b/src/xpk/core/pathways.py @@ -19,6 +19,7 @@ from ..core.gcloud_context import zone_to_region from ..core.nodepool import get_all_nodepools_programmatic from ..utils.console import xpk_exit, xpk_print +from ..utils.execution_context import is_dry_run from .system_characteristics import AcceleratorType, SystemCharacteristics @@ -79,7 +80,10 @@ def ensure_pathways_workload_prerequisites(args, system) -> bool: # Ensure the cluster and CPU nodepools were created with create-pathways all_node_pools = get_all_nodepools_programmatic(args) desired_pw_cpu_node_pools = {'cpu-np'} - if not desired_pw_cpu_node_pools.issubset(set(all_node_pools[0])): + if ( + not desired_pw_cpu_node_pools.issubset(set(all_node_pools[0])) + and not is_dry_run() + ): xpk_print( 'Cluster needs to be created with `xpk create-pathways` to run' ' Pathways workloads.' diff --git a/src/xpk/core/scheduling.py b/src/xpk/core/scheduling.py index d8957e133..ef6b469ce 100644 --- a/src/xpk/core/scheduling.py +++ b/src/xpk/core/scheduling.py @@ -15,6 +15,7 @@ """ from ..utils.console import xpk_print +from ..utils.execution_context import is_dry_run from .capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE from .resources import CLUSTER_RESOURCES_CONFIGMAP, get_cluster_configmap from .system_characteristics import ( @@ -45,6 +46,9 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool: ) return True + if is_dry_run(): + return True + # Check for gke accelerator type: missing_gke_accelerator_type = False if not cluster_config_map.get(system.gke_accelerator): diff --git a/src/xpk/utils/network.py b/src/xpk/utils/network.py index cd506f760..509da276e 100644 --- a/src/xpk/utils/network.py +++ b/src/xpk/utils/network.py @@ -18,6 +18,7 @@ import socket import requests from .console import xpk_print +from .execution_context import is_dry_run # Retrives machine's external IP address ip_resolver_url = "http://api.ipify.org" @@ -36,6 +37,9 @@ def get_current_machine_ip(external_ip=True): The IP address as a string. """ + if is_dry_run(): + return 0, "127.0.0.1" + try: if external_ip: # Get external IP address