From 18cb1ecb4e5ff36ec64ea126148e9ce8acc614dd Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Fri, 19 Sep 2025 12:52:46 +0000 Subject: [PATCH 1/5] feat: add execution_context --- src/xpk/utils/execution_context.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 src/xpk/utils/execution_context.py diff --git a/src/xpk/utils/execution_context.py b/src/xpk/utils/execution_context.py new file mode 100644 index 000000000..d38088306 --- /dev/null +++ b/src/xpk/utils/execution_context.py @@ -0,0 +1,28 @@ +""" +Copyright 2025 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +dry_run = False + + +def set_dry_run(value: bool) -> None: + """Sets the dry_run flag.""" + global dry_run + dry_run = value + + +def is_dry_run() -> bool: + """Returns the current value of the dry_run flag.""" + return dry_run From cebceb033ead147fa4a7dd462f99306b8644f32c Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Fri, 19 Sep 2025 12:56:43 +0000 Subject: [PATCH 2/5] fix: dry run --- src/xpk/commands/cluster.py | 11 +++++++---- src/xpk/commands/workload.py | 6 +++++- src/xpk/core/cluster.py | 6 +++++- src/xpk/core/kjob.py | 7 +++++-- src/xpk/core/nodepool.py | 11 ++++++++--- src/xpk/core/pathways.py | 2 +- src/xpk/core/resources.py | 22 ++++++++++++++++------ 7 files changed, 47 insertions(+), 18 deletions(-) diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index a57d071bb..dc8634d6b 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -76,6 +76,7 @@ from ..core.workload import get_workload_list from ..utils.console import get_user_input, xpk_exit, xpk_print from ..utils.file import write_tmp_file +from ..utils.execution_context import is_dry_run from . 
import cluster_gcluster from .common import set_cluster_command import shutil @@ -128,9 +129,10 @@ def cluster_adapt(args) -> None: get_cluster_credentials(args) - k8s_client = setup_k8s_env(args) + if not is_dry_run(): + k8s_client = setup_k8s_env(args) + install_storage_crd(k8s_client) - install_storage_crd(k8s_client) install_storage_csis(args) # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set @@ -251,9 +253,10 @@ def cluster_create(args) -> None: if update_coredns_command_code != 0: xpk_exit(update_cluster_command_code) - k8s_client = setup_k8s_env(args) + if not is_dry_run(): + k8s_client = setup_k8s_env(args) + install_storage_crd(k8s_client) - install_storage_crd(k8s_client) install_storage_csis(args) # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index 364d1b961..03f372c7d 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -725,7 +725,11 @@ def workload_delete(args) -> None: ) else: return_code = run_commands( - commands, 'Delete Workload', task_names, batch=100 + commands, + 'Delete Workload', + task_names, + batch=100, + dry_run=args.dry_run, ) if return_code != 0: diff --git a/src/xpk/core/cluster.py b/src/xpk/core/cluster.py index 41c7ec452..b2c9d8e1c 100644 --- a/src/xpk/core/cluster.py +++ b/src/xpk/core/cluster.py @@ -442,7 +442,11 @@ def setup_k8s_env(args) -> k8s_client.ApiClient: if not getattr(args, 'kind_cluster', False): add_zone_and_project(args) get_cluster_credentials(args) - args.project_number = project_id_to_project_number(args.project) + args.project_number = ( + project_id_to_project_number(args.project) + if not args.dry_run + else abs(hash(args.project) % (10**12)) # 12 digit hash + ) config.load_kube_config() return k8s_client.ApiClient() diff --git a/src/xpk/core/kjob.py b/src/xpk/core/kjob.py index 59388b732..c2c43f27b 100644 --- 
a/src/xpk/core/kjob.py +++ b/src/xpk/core/kjob.py @@ -23,6 +23,7 @@ from kubernetes.client.rest import ApiException from ..utils import templates +from ..utils.execution_context import is_dry_run from ..utils.console import xpk_exit, xpk_print from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE from .cluster import DEFAULT_NAMESPACE, XPK_SA, setup_k8s_env @@ -368,8 +369,10 @@ def create_pod_template_instance(args: Namespace, service_account: str) -> int: def prepare_kjob(args: Namespace) -> int: system = get_cluster_system_characteristics(args) - k8s_api_client = setup_k8s_env(args) - storages = get_auto_mount_storages(k8s_api_client) + storages = [] + if not is_dry_run(): + k8s_api_client = setup_k8s_env(args) + storages = get_auto_mount_storages(k8s_api_client) service_account = "" if len(storages) > 0: diff --git a/src/xpk/core/nodepool.py b/src/xpk/core/nodepool.py index cab159f15..85ab6aba9 100644 --- a/src/xpk/core/nodepool.py +++ b/src/xpk/core/nodepool.py @@ -265,7 +265,9 @@ def run_gke_node_pool_create_command( ) configmap_yml = {} configmap_yml[resources_configmap_name] = resources_yml - return_code = create_or_update_cluster_configmap(configmap_yml) + return_code = create_or_update_cluster_configmap( + configmap_yml, args.dry_run + ) if return_code != 0: return 1 @@ -461,7 +463,7 @@ def get_nodepool_zone(args, nodepool_name) -> tuple[int, str | None]: f' --region={zone_to_region(args.zone)} --format="value(locations)"' ) return_code, nodepool_zone = run_command_for_value( - command, 'Get Node Pool Zone', args + command, 'Get Node Pool Zone', args, dry_run_return_val=args.zone ) if return_code != 0: xpk_print(f'Get Node Pool Zone returned ERROR {return_code}') @@ -570,7 +572,10 @@ def upgrade_gke_nodepools_version(args, default_rapid_gke_version) -> int: for i, command in enumerate(commands): xpk_print(f'To complete {task_names[i]} we are executing {command}') max_return_code = run_commands( - commands, 'Update GKE node pools 
to default RAPID GKE version', task_names + commands, + 'Update GKE node pools to default RAPID GKE version', + task_names, + dry_run=args.dry_run, ) if max_return_code != 0: xpk_print( diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py index 81770eb04..291afef68 100644 --- a/src/xpk/core/pathways.py +++ b/src/xpk/core/pathways.py @@ -322,7 +322,7 @@ def try_to_delete_pathwaysjob_first(args, workloads) -> bool: return_code = run_command_with_updates(commands[0], 'Delete Workload', args) else: return_code = run_commands( - commands, 'Delete Workload', task_names, batch=100 + commands, 'Delete Workload', task_names, batch=100, dry_run=args.dry_run ) if return_code != 0: diff --git a/src/xpk/core/resources.py b/src/xpk/core/resources.py index 85b266c70..cd3667583 100644 --- a/src/xpk/core/resources.py +++ b/src/xpk/core/resources.py @@ -66,7 +66,10 @@ def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None: ) return_code, return_value = run_command_for_value( - command, 'GKE Cluster Get ConfigMap', args + command, + 'GKE Cluster Get ConfigMap', + args, + dry_run_return_val='map[]', ) if return_code != 0: xpk_print(f'GKE Cluster Get ConfigMap request returned ERROR {return_code}') @@ -81,8 +84,10 @@ def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None: configs = return_value[4:-1].split(' ') for config in configs: - key, value = config.strip().split(':') - config_map[key] = value + parts = config.strip().split(':') + if len(parts) != 2: + continue + config_map[parts[0]] = parts[1] return config_map @@ -150,10 +155,12 @@ def create_cluster_configmaps( args=args, name=metadata_configmap_name, data=metadata ) configmap_yml[metadata_configmap_name] = metadata_yml - return create_or_update_cluster_configmap(configmap_yml) + return create_or_update_cluster_configmap(configmap_yml, args.dry_run) -def create_or_update_cluster_configmap(configmap_yml: dict) -> int: +def create_or_update_cluster_configmap( + configmap_yml: dict, 
dry_run: bool +) -> int: """ Args: configmap_yml: dict containing ConfigMap name and yml string. @@ -171,7 +178,10 @@ def create_or_update_cluster_configmap(configmap_yml: dict) -> int: task_names.append(task_name) return_code = run_commands( - commands, 'GKE Cluster CreateOrUpdate ConfigMap(s)', task_names + commands, + 'GKE Cluster CreateOrUpdate ConfigMap(s)', + task_names, + dry_run=dry_run, ) if return_code != 0: xpk_print( From 9b34f1aa7e396923ee998c195852d6adb34f8e81 Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Fri, 19 Sep 2025 13:04:18 +0000 Subject: [PATCH 3/5] feat: do not validate deps in dry_run mode --- src/xpk/main.py | 3 ++- src/xpk/utils/file.py | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/xpk/main.py b/src/xpk/main.py index 9662080ca..f59593748 100644 --- a/src/xpk/main.py +++ b/src/xpk/main.py @@ -63,9 +63,10 @@ def main() -> None: set_parser(parser=parser) xpk_print('Starting xpk', flush=True) - validate_dependencies() main_args = parser.parse_args() main_args.enable_ray_cluster = False + if not main_args.dry_run: + validate_dependencies() main_args.func(main_args) xpk_print('XPK Done.', flush=True) diff --git a/src/xpk/utils/file.py b/src/xpk/utils/file.py index 57321cf43..85f230986 100644 --- a/src/xpk/utils/file.py +++ b/src/xpk/utils/file.py @@ -16,7 +16,6 @@ import tempfile import os -from .console import xpk_print def make_tmp_files(per_command_name): @@ -79,4 +78,3 @@ def ensure_directory_exists(directory_path): """ if not os.path.exists(directory_path): os.makedirs(directory_path) - xpk_print(f"Directory '{directory_path}' created successfully.") From 2d786f8ef5d7a60a4343a3582628c2ef8556de53 Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Fri, 19 Sep 2025 13:04:37 +0000 Subject: [PATCH 4/5] feat: deterministic file names in dry run --- goldens/NAP_cluster-create.txt | 97 +++++++++++++++---- goldens/NAP_cluster-create_with_pathways.txt | 98 ++++++++++++++++---- src/xpk/commands/cluster.py | 6 +- 
src/xpk/commands/inspector.py | 2 +- src/xpk/commands/workload.py | 2 +- src/xpk/core/commands.py | 17 ++-- src/xpk/core/docker_image.py | 2 +- src/xpk/core/jobset.py | 2 +- src/xpk/core/kueue.py | 4 +- src/xpk/core/nap.py | 2 +- src/xpk/core/network.py | 2 +- src/xpk/core/ray.py | 2 +- src/xpk/core/resources.py | 2 +- src/xpk/main.py | 2 + src/xpk/utils/file.py | 33 +++++-- 15 files changed, 212 insertions(+), 61 deletions(-) diff --git a/goldens/NAP_cluster-create.txt b/goldens/NAP_cluster-create.txt index 93adad125..dd1df293b 100644 --- a/goldens/NAP_cluster-create.txt +++ b/goldens/NAP_cluster-create.txt @@ -29,19 +29,84 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] CoreDNS has successfully started and passed verification. [XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'. [XPK] Skipping CoreDNS deployment since it already exists. -[XPK] Working on golden-project and us-central1-a -[XPK] Try 1: get-credentials to cluster golden-cluster -[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. -gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default -[XPK] Couldn't translate project id: golden-project to project number. Error: 403 Permission 'resourcemanager.projects.get' denied on resource '//cloudresourcemanager.googleapis.com/projects/golden-project' (or it may not exist). [reason: "IAM_PERMISSION_DENIED" -domain: "cloudresourcemanager.googleapis.com" -metadata { - key: "resource" - value: "projects/golden-project" -} -metadata { - key: "permission" - value: "resourcemanager.projects.get" -} -] -[XPK] XPK failed, error code 1 +[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. 
+gcloud beta container clusters describe golden-cluster --region us-central1 --project golden-project --format="value(currentMasterVersion)" +[XPK] Creating 1 node pool or pools of tpu7x-8 +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Creating 1 node pool or pools of tpu7x-8 +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --region=us-central1 --format="value(locations)" +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. 
+kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Existing node pool names ['0'] +[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --region=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 +[XPK] Breaking up a total of 1 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Create or delete node pool request complete. +[XPK] Enabling Autoprovisioning +[XPK] Default Chips quota is minimum: 0, maximum: 4. +[XPK] Chips quota is minimum: 0, maximum: 4. XPK will autoprovision 4 chips based on incoming workload requests, keeping at least 0 available at all times, and maximum of 4. If the difference (4 chips) is small, rescaling will not work well. +[XPK] Task: `Update cluster with autoprovisioning enabled` is implemented by the following command not running since it is a dry run. +gcloud container clusters update golden-cluster --project=golden-project --region=us-central1 --enable-autoprovisioning --autoprovisioning-config-file 6062bfee91f21efca86f2c3261129f06b1896ad9b68d2ecdba9589bea9e15ddf +[XPK] Task: `Update cluster with autoscaling-profile` is implemented by the following command not running since it is a dry run. +gcloud container clusters update golden-cluster --project=golden-project --region=us-central1 --autoscaling-profile=optimize-utilization +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. 
+gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Breaking up a total of 0 commands into 0 batches +[XPK] Pretending all the jobs succeeded +[XPK] Creating ConfigMap for cluster +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available +[XPK] Try 1: Install Jobset on golden-cluster +[XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating jobset Controller Manager resources +[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 +[XPK] Try 1: Install PathwaysJob on golden-cluster +[XPK] Task: `Install PathwaysJob on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.2/install.yaml +[XPK] Enabling Kueue on the cluster +[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run. +kubectl kueue version +[XPK] Try 1: Set Kueue On Cluster +[XPK] Task: `Set Kueue On Cluster` is implemented by the following command not running since it is a dry run. 
+kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml +[XPK] Wait for Kueue to be fully available +[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. +kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m +[XPK] Install Kueue Custom Resources +[XPK] Try 1: Applying Kueue Custom Resources +[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f b3843453fb19ae7105126245bac5b63930f46861462cd3a557aea44801a99280 +[XPK] Update Kueue Controller Manager resources +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating Kueue Controller Manager resources +[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 012e1b15b6941e9d47cb2cdb35488d57c2f3ce0ef0b18093d2759f2e02ed81dc +[XPK] Verifying kjob installation +[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run. +kubectl-kjob help +[XPK] kjob found +[XPK] Applying kjob CDRs +[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run. +kubectl kjob printcrds | kubectl apply --server-side -f - +[XPK] Creating kjob CRDs succeeded +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. 
+kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 +[XPK] GKE commands done! Resources are created. +[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project +[XPK] Exiting XPK cleanly diff --git a/goldens/NAP_cluster-create_with_pathways.txt b/goldens/NAP_cluster-create_with_pathways.txt index cfa84b36b..f0c8ba7f6 100644 --- a/goldens/NAP_cluster-create_with_pathways.txt +++ b/goldens/NAP_cluster-create_with_pathways.txt @@ -29,19 +29,85 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] CoreDNS has successfully started and passed verification. [XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'. [XPK] Skipping CoreDNS deployment since it already exists. -[XPK] Working on golden-project and us-central1-a -[XPK] Try 1: get-credentials to cluster golden-cluster -[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. -gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default -[XPK] Couldn't translate project id: golden-project to project number. Error: 403 Permission 'resourcemanager.projects.get' denied on resource '//cloudresourcemanager.googleapis.com/projects/golden-project' (or it may not exist). 
[reason: "IAM_PERMISSION_DENIED" -domain: "cloudresourcemanager.googleapis.com" -metadata { - key: "resource" - value: "projects/golden-project" -} -metadata { - key: "permission" - value: "resourcemanager.projects.get" -} -] -[XPK] XPK failed, error code 1 +[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters describe golden-cluster --region us-central1 --project golden-project --format="value(currentMasterVersion)" +[XPK] Creating 1 node pool or pools of tpu7x-8 +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Creating 1 node pool or pools of tpu7x-8 +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --region=us-central1 --format="value(locations)" +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. 
+kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Existing node pool names ['0'] +[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --region=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 +[XPK] To complete NodepoolCreate-cpu-np we are executing gcloud beta container node-pools create cpu-np --node-version=0 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --region=us-central1 --num-nodes=1 --machine-type=n2-standard-64 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --enable-autoscaling --min-nodes=1 --max-nodes=20 +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Create or delete node pool request complete. +[XPK] Enabling Autoprovisioning +[XPK] Default Chips quota is minimum: 0, maximum: 4. +[XPK] Chips quota is minimum: 0, maximum: 4. XPK will autoprovision 4 chips based on incoming workload requests, keeping at least 0 available at all times, and maximum of 4. If the difference (4 chips) is small, rescaling will not work well. +[XPK] Task: `Update cluster with autoprovisioning enabled` is implemented by the following command not running since it is a dry run. 
+gcloud container clusters update golden-cluster --project=golden-project --region=us-central1 --enable-autoprovisioning --autoprovisioning-config-file 6062bfee91f21efca86f2c3261129f06b1896ad9b68d2ecdba9589bea9e15ddf +[XPK] Task: `Update cluster with autoscaling-profile` is implemented by the following command not running since it is a dry run. +gcloud container clusters update golden-cluster --project=golden-project --region=us-central1 --autoscaling-profile=optimize-utilization +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Breaking up a total of 0 commands into 0 batches +[XPK] Pretending all the jobs succeeded +[XPK] Creating ConfigMap for cluster +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available +[XPK] Try 1: Install Jobset on golden-cluster +[XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating jobset Controller Manager resources +[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 +[XPK] Try 1: Install PathwaysJob on golden-cluster +[XPK] Task: `Install PathwaysJob on golden-cluster` is implemented by the following command not running since it is a dry run. 
+kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.2/install.yaml +[XPK] Enabling Kueue on the cluster +[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run. +kubectl kueue version +[XPK] Try 1: Set Kueue On Cluster +[XPK] Task: `Set Kueue On Cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml +[XPK] Wait for Kueue to be fully available +[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. +kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m +[XPK] Install Kueue Custom Resources +[XPK] Try 1: Applying Kueue Custom Resources +[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 898c7686cc5ef7f026f74e55b73b5843767e3f1abb9639169f02ebc44d06af73 +[XPK] Update Kueue Controller Manager resources +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating Kueue Controller Manager resources +[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 012e1b15b6941e9d47cb2cdb35488d57c2f3ce0ef0b18093d2759f2e02ed81dc +[XPK] Verifying kjob installation +[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run. +kubectl-kjob help +[XPK] kjob found +[XPK] Applying kjob CDRs +[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run. 
+kubectl kjob printcrds | kubectl apply --server-side -f - +[XPK] Creating kjob CRDs succeeded +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 +[XPK] GKE commands done! Resources are created. +[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project +[XPK] Exiting XPK cleanly diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index dc8634d6b..d8bf77d45 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -412,10 +412,8 @@ def cluster_cacheimage(args) -> None: nodeSelectorKey=node_selector_key, ) tmp = write_tmp_file(yml_string) - command_apply = f'kubectl apply -f {str(tmp.file.name)}' - command_delete = ( - f'kubectl delete -f {str(tmp.file.name)} --ignore-not-found=true' - ) + command_apply = f'kubectl apply -f {str(tmp)}' + command_delete = f'kubectl delete -f {str(tmp)} --ignore-not-found=true' return_code = run_command_with_updates( command_delete, 'Deleting Cached Image', args diff --git a/src/xpk/commands/inspector.py b/src/xpk/commands/inspector.py index 580aeff36..3e8d783f0 100644 --- a/src/xpk/commands/inspector.py +++ b/src/xpk/commands/inspector.py @@ -346,7 +346,7 @@ 
def inspector(args) -> None: ) # Summarize inspector: - xpk_print(f'Find xpk inspector output file: {inspector_file.name}') + xpk_print(f'Find xpk inspector output file: {inspector_file}') if final_return_code != 0: xpk_print( diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index 03f372c7d..548d5c47f 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -569,7 +569,7 @@ def workload_create(args) -> None: pod_failure_policy=pod_failure_policy, ) tmp = write_tmp_file(yml_string) - command = f'kubectl apply -f {str(tmp.file.name)}' + command = f'kubectl apply -f {str(tmp)}' return_code = run_command_with_updates(command, 'Creating Workload', args) if return_code != 0: diff --git a/src/xpk/core/commands.py b/src/xpk/core/commands.py index ba8cb1191..cc3b266b7 100644 --- a/src/xpk/core/commands.py +++ b/src/xpk/core/commands.py @@ -78,14 +78,13 @@ def run_command_batch(commands, jobname, per_command_name, output_logs): The max return code and a list of all the return codes. 
""" + files = [open(f, 'w', encoding='utf-8') for f in output_logs] children = [] start_time = datetime.datetime.now() - for i, command in enumerate(commands): + for command, file in zip(commands, files): children.append( # subprocess managed by list pylint: disable=consider-using-with - subprocess.Popen( - command, stdout=output_logs[i], stderr=output_logs[i], shell=True - ) + subprocess.Popen(command, stdout=file, stderr=file, shell=True) ) while True: @@ -99,7 +98,7 @@ def run_command_batch(commands, jobname, per_command_name, output_logs): slow_worker_text = per_command_name[slow_worker_index] slow_str = ( f', task {slow_worker_text} still working, logfile' - f' {output_logs[slow_worker_index].name}' + f' {output_logs[slow_worker_index]}' ) else: slow_str = '' @@ -116,7 +115,7 @@ def run_command_batch(commands, jobname, per_command_name, output_logs): ) xpk_print( f'Failure is {per_command_name[failing_index]}' - f' and logfile {output_logs[failing_index].name}' + f' and logfile {output_logs[failing_index]}' ) for child in children: child.terminate() @@ -126,6 +125,10 @@ def run_command_batch(commands, jobname, per_command_name, output_logs): break time.sleep(1) + + for file in files: + file.close() + return max_returncode, returncodes @@ -351,6 +354,6 @@ def run_command_with_full_controls( def run_kubectl_apply(yml_string: str, task: str, args: Namespace) -> int: tmp = write_tmp_file(yml_string) - command = f'kubectl apply -f {str(tmp.file.name)}' + command = f'kubectl apply -f {str(tmp)}' err_code = run_command_with_updates(command, task, args) return err_code diff --git a/src/xpk/core/docker_image.py b/src/xpk/core/docker_image.py index 7425b0fd6..c31b2c9fc 100644 --- a/src/xpk/core/docker_image.py +++ b/src/xpk/core/docker_image.py @@ -94,7 +94,7 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]: ) tmp = write_tmp_file(docker_file) docker_build_command = ( - f'docker buildx build --platform={PLATFORM} -f {str(tmp.file.name)} 
-t' + f'docker buildx build --platform={PLATFORM} -f {str(tmp)} -t' f' {docker_name} {args.script_dir}' ) xpk_print(f'Building {args.script_dir} into docker image.') diff --git a/src/xpk/core/jobset.py b/src/xpk/core/jobset.py index 135cfda63..e47346796 100644 --- a/src/xpk/core/jobset.py +++ b/src/xpk/core/jobset.py @@ -134,7 +134,7 @@ def update_jobset_resources_if_necessary(args): memory_limit_size=new_memory_limit, ) tmp = write_tmp_file(yml_string) - command = f'kubectl apply -f {str(tmp.file.name)}' + command = f'kubectl apply -f {str(tmp)}' task = 'Updating jobset Controller Manager resources' return_code = run_command_with_updates_retry(command, task, args) diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index 8f1d434c1..49f57a4fd 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -474,7 +474,7 @@ def install_kueue_crs( yml_string = topology_yaml + yml_string tmp = write_tmp_file(yml_string) - command = f'kubectl apply -f {str(tmp.file.name)}' + command = f'kubectl apply -f {str(tmp)}' task = 'Applying Kueue Custom Resources' return_code = run_command_with_updates_retry(command, task, args) @@ -536,7 +536,7 @@ def update_kueue_resources_if_necessary(args): memory_limit_size=new_memory_limit, KUEUE_VERSION=KUEUE_VERSION ) tmp = write_tmp_file(yml_string) - command = f'kubectl apply -f {str(tmp.file.name)}' + command = f'kubectl apply -f {str(tmp)}' task = 'Updating Kueue Controller Manager resources' return_code = run_command_with_updates_retry(command, task, args) diff --git a/src/xpk/core/nap.py b/src/xpk/core/nap.py index 6a628eb83..0b21e8ee8 100644 --- a/src/xpk/core/nap.py +++ b/src/xpk/core/nap.py @@ -250,7 +250,7 @@ def create_autoprovisioning_config( zones=f'- {args.zone}', ) autoprovisioning_config = AutoprovisioningConfig( - config_filename=write_tmp_file(yml_string).name, + config_filename=write_tmp_file(yml_string), minimum_chips=minimum, maximum_chips=maximum, ) diff --git a/src/xpk/core/network.py 
b/src/xpk/core/network.py index e42ca76c6..18f844c59 100644 --- a/src/xpk/core/network.py +++ b/src/xpk/core/network.py @@ -221,7 +221,7 @@ def create_cluster_network_config(args) -> int: """ yml_string = CLUSTER_NETWORK_YAML.format(cluster_name=args.cluster) tmp = write_tmp_file(yml_string) - command = f'kubectl apply -f {str(tmp.file.name)}' + command = f'kubectl apply -f {str(tmp)}' return_code = run_command_with_updates( command, 'GKE Cluster Create Network Config', args diff --git a/src/xpk/core/ray.py b/src/xpk/core/ray.py index 2266d52ab..50e391025 100644 --- a/src/xpk/core/ray.py +++ b/src/xpk/core/ray.py @@ -132,7 +132,7 @@ def install_ray_cluster(args, system) -> int: ) tmp = write_tmp_file(yml_string) - command = f'kubectl apply -f {str(tmp.file.name)}' + command = f'kubectl apply -f {str(tmp)}' task = 'Applying RayCluster' retry_attempts = 1 return_code = run_command_with_updates_retry( diff --git a/src/xpk/core/resources.py b/src/xpk/core/resources.py index cd3667583..f215e1063 100644 --- a/src/xpk/core/resources.py +++ b/src/xpk/core/resources.py @@ -172,7 +172,7 @@ def create_or_update_cluster_configmap( task_names = [] for configmap_name, yml_string in configmap_yml.items(): tmp = write_tmp_file(yml_string) - command = f'kubectl apply -f {str(tmp.file.name)}' + command = f'kubectl apply -f {str(tmp)}' commands.append(command) task_name = f'ConfigMap CreateOrUpdate-{configmap_name}' task_names.append(task_name) diff --git a/src/xpk/main.py b/src/xpk/main.py index f59593748..d166b9833 100644 --- a/src/xpk/main.py +++ b/src/xpk/main.py @@ -37,6 +37,7 @@ from .parser.core import set_parser from .utils.console import xpk_print from .utils.validation import validate_dependencies +from .utils.execution_context import set_dry_run ################### Compatibility Check ################### # Check that the user runs the below version or greater. 
@@ -65,6 +66,7 @@ def main() -> None:
   xpk_print('Starting xpk', flush=True)
   main_args = parser.parse_args()
   main_args.enable_ray_cluster = False
+  set_dry_run('dry_run' in main_args and main_args.dry_run)
   if not main_args.dry_run:
     validate_dependencies()
   main_args.func(main_args)
diff --git a/src/xpk/utils/file.py b/src/xpk/utils/file.py
index 85f230986..f5242e2d3 100644
--- a/src/xpk/utils/file.py
+++ b/src/xpk/utils/file.py
@@ -16,9 +16,11 @@
 
 import tempfile
 import os
+import hashlib
+from .execution_context import is_dry_run
 
 
-def make_tmp_files(per_command_name):
+def make_tmp_files(per_command_name: list[str]) -> list[str]:
   """Make temporary files for each command.
 
   Args:
@@ -27,16 +29,19 @@
   Returns:
-    A list of temporary files for each command.
+    A list of temporary file paths, one for each command.
   """
+  if is_dry_run():
+    return [_hash_filename(command) for command in per_command_name]
+
   # Supports removal of spaces from command names before converting to file name.
   return [
       tempfile.NamedTemporaryFile(
           delete=False, prefix=command.replace(' ', '-') + '-'
-      )
+      ).file.name
       for command in per_command_name
   ]
 
 
-def write_tmp_file(payload):
+def write_tmp_file(payload: str) -> str:
   """Writes `payload` to a temporary file.
 
   Args:
@@ -45,14 +50,17 @@
   Returns:
-    A file object that was written to.
+    The path of the file that was written to.
   """
+  if is_dry_run():
+    return _hash_filename(payload)
+
   with tempfile.NamedTemporaryFile(delete=False) as tmp:
     with open(file=tmp.name, mode='w', encoding='utf=8') as f:
       f.write(payload)
       f.flush()
-      return tmp
+      return tmp.file.name
 
 
-def append_tmp_file(payload, file):
+def append_tmp_file(payload: str, file: str) -> str:
   """Appends `payload` to an already created file.
 
   Use `write_temporary_file` to create a file.
 
@@ -64,17 +72,26 @@
   Returns:
-    A file object that was written to.
+    The path of the file that was written to.
""" - with open(file=file.name, mode='a', encoding='utf=8') as f: + if is_dry_run(): + return file + + with open(file=file, mode='a', encoding='utf=8') as f: f.write(payload) f.flush() return file -def ensure_directory_exists(directory_path): +def ensure_directory_exists(directory_path: str) -> None: """Checks if a directory exists and creates it if it doesn't. Args: directory_path: The path to the directory. """ - if not os.path.exists(directory_path): + if not is_dry_run() and not os.path.exists(directory_path): os.makedirs(directory_path) + + +def _hash_filename(seed: str) -> str: + m = hashlib.sha256() + m.update(seed.encode('utf-8')) + return m.hexdigest() From fd90a08e1cf6c9250599b4b7501f84512eeb2650 Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Thu, 18 Sep 2025 15:12:55 +0000 Subject: [PATCH 5/5] build: integrate golden testing with github actions --- .github/workflows/build_tests.yaml | 5 +++ .github/workflows/reusable_goldens.yaml | 48 +++++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 .github/workflows/reusable_goldens.yaml diff --git a/.github/workflows/build_tests.yaml b/.github/workflows/build_tests.yaml index 4962cc7d1..61ded0a56 100644 --- a/.github/workflows/build_tests.yaml +++ b/.github/workflows/build_tests.yaml @@ -131,6 +131,11 @@ jobs: uses: ./.github/workflows/reusable_lint_and_format.yml with: run-id: '${{needs.set-variables.outputs.run-id}}' + verify-goldens: + needs: [install-dependencies, set-variables] + uses: ./.github/workflows/reusable_goldens.yaml + with: + run-id: '${{needs.set-variables.outputs.run-id}}' run-unit-tests: needs: [install-dependencies, set-variables] uses: ./.github/workflows/reusable_unit_tests.yaml diff --git a/.github/workflows/reusable_goldens.yaml b/.github/workflows/reusable_goldens.yaml new file mode 100644 index 000000000..df77e76be --- /dev/null +++ b/.github/workflows/reusable_goldens.yaml @@ -0,0 +1,48 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, 
Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +on: + workflow_call: + inputs: + run-id: + required: true + type: string + +permissions: + contents: read + +jobs: + verify-goldens: + runs-on: [ubuntu-22.04] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Prepare directories + run: mkdir -p ~/.cache/pip + - name: Restore cached dependencies + uses: actions/cache@v4 + with: + path: | + /usr/local/bin/kubectl-kueue + /usr/local/bin/kubectl-kjob + ~/.cache/pip + ${{env.pythonLocation}} + key: xpk-deps-3.10-${{github.run_id}}-${{github.run_attempt}} + restore-keys: xpk-deps-3.10- + - name: Verify goldens + run: ./golden_buddy.sh verify goldens.yaml goldens + env: + UPDATE_GOLDEN_COMMAND: make goldens