From 807c3bc00963caf8744ddce6c5af6b9ca4740642 Mon Sep 17 00:00:00 2001 From: DannyLi Date: Tue, 12 Aug 2025 06:45:05 +0000 Subject: [PATCH 01/15] feat: Add credential test with DNS retry logic --- src/xpk/core/cluster.py | 57 +++++++++++++++++++++++++++++++++++++--- src/xpk/core/commands.py | 26 ++++++++++++++++++ 2 files changed, 79 insertions(+), 4 deletions(-) diff --git a/src/xpk/core/cluster.py b/src/xpk/core/cluster.py index 0b36c8c11..58ef601d9 100644 --- a/src/xpk/core/cluster.py +++ b/src/xpk/core/cluster.py @@ -27,6 +27,7 @@ run_command_for_value, run_command_with_updates, run_command_with_updates_retry, + run_command_and_capture_output, ) from .gcloud_context import ( add_zone_and_project, @@ -64,6 +65,7 @@ def set_jobset_on_cluster(args) -> int: command = ( 'kubectl apply --server-side -f' f' https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml' + ' --force-conflicts' ) task = f'Install Jobset on {args.cluster}' return_code = run_command_with_updates_retry(command, task, args) @@ -877,6 +879,49 @@ def update_cluster_with_gcsfuse_driver_if_necessary(args) -> int: return 0 +def test_and_retry_credentials_with_dns_logic(args) -> int: + """Tests kubectl credentials and retries with default settings if a DNS error is found. + + Args: + args: user provided arguments for running the command. + + Returns: + 0 if credentials are valid after retrying, 1 otherwise. + """ + + xpk_print('Testing credentials with kubectl...') + kubectl_command = 'kubectl get pods' + kubectl_return_code, kubectl_output = run_command_and_capture_output( + kubectl_command, 'kubectl get pods', args + ) + xpk_print(kubectl_output) + + if kubectl_return_code != 0: + dns_endpoint_error = 'control_plane_endpoints_config.dns_endpoint_config.allow_external_traffic is disabled' + if dns_endpoint_error in kubectl_output: + xpk_print('Detected DNS endpoint-related error. 
Retrying without --dns-endpoint flag...') + + without_dns_command = ( + 'gcloud container clusters get-credentials' + f' {args.cluster} --region={zone_to_region(args.zone)}' + f' --project={args.project} &&' + ' kubectl config view && kubectl config set-context --current' + ' --namespace=default' + ) + return_code = run_command_with_updates_retry( + without_dns_command, 'get-credentials to cluster', args, verbose=False + ) + if return_code != 0: + xpk_print('Failed to get credentials even without --dns-endpoint. Exiting.') + xpk_exit(return_code) + + return 0 + else: + xpk_print(f'kubectl failed with an unhandled error: {kubectl_output}') + xpk_exit(kubectl_return_code) + + xpk_print('Credentials test succeeded.') + return 0 def get_cluster_credentials(args) -> None: """Run cluster configuration command to set the kubectl config. @@ -890,14 +935,18 @@ def get_cluster_credentials(args) -> None: command = ( 'gcloud container clusters get-credentials' f' {args.cluster} --region={zone_to_region(args.zone)}' + # ' --dns-endpoint' f' --project={args.project} &&' ' kubectl config view && kubectl config set-context --current' ' --namespace=default' ) - task = f'get-credentials to cluster {args.cluster}' + task = f'get-credentials-dns-endpoint to cluster {args.cluster}' return_code = run_command_with_updates_retry( command, task, args, verbose=False ) - if return_code != 0: - xpk_print(f'{task} returned ERROR {return_code}') - xpk_exit(return_code) + if return_code == 0: + return_code = test_and_retry_credentials_with_dns_logic(args) + xpk_print('Finished get-credentials and kubectl setup.') + + return return_code + diff --git a/src/xpk/core/commands.py b/src/xpk/core/commands.py index ad01e6c21..5221b4346 100644 --- a/src/xpk/core/commands.py +++ b/src/xpk/core/commands.py @@ -354,3 +354,29 @@ def run_kubectl_apply(yml_string: str, task: str, args: Namespace) -> int: command = f'kubectl apply -f {str(tmp.file.name)}' err_code = run_command_with_updates(command, task, args) 
return err_code + +def run_command_and_capture_output( + command: str, + task, + args +) -> tuple[int, str]: + """Executes a command and captures its output and return code. + + Args: + command (str): The command string to execute. + + Returns: + tuple[int, str]: A tuple containing the return code and the captured output string. + """ + try: + result = subprocess.run( + command, + shell=True, + capture_output=True, + text=True, + check=False + ) + output = result.stdout + result.stderr + return result.returncode, output + except Exception as e: + return 1, str(e) From 6a2e9f9bc45041fc26486a3523d4077077790b0f Mon Sep 17 00:00:00 2001 From: DannyLi Date: Tue, 12 Aug 2025 07:16:39 +0000 Subject: [PATCH 02/15] Fixed a minor bug --- src/xpk/core/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpk/core/cluster.py b/src/xpk/core/cluster.py index 58ef601d9..ef633b6a2 100644 --- a/src/xpk/core/cluster.py +++ b/src/xpk/core/cluster.py @@ -935,7 +935,7 @@ def get_cluster_credentials(args) -> None: command = ( 'gcloud container clusters get-credentials' f' {args.cluster} --region={zone_to_region(args.zone)}' - # ' --dns-endpoint' + ' --dns-endpoint' f' --project={args.project} &&' ' kubectl config view && kubectl config set-context --current' ' --namespace=default' From ceb7c3cc10e5555477269fd92eab566a600a5795 Mon Sep 17 00:00:00 2001 From: DannyLi Date: Wed, 13 Aug 2025 08:16:41 +0000 Subject: [PATCH 03/15] Reduce nesting --- src/xpk/core/cluster.py | 47 +++++++++++++++++++--------------------- src/xpk/core/commands.py | 13 ++++++++--- 2 files changed, 32 insertions(+), 28 deletions(-) diff --git a/src/xpk/core/cluster.py b/src/xpk/core/cluster.py index ef633b6a2..a35e0724a 100644 --- a/src/xpk/core/cluster.py +++ b/src/xpk/core/cluster.py @@ -891,36 +891,33 @@ def test_and_retry_credentials_with_dns_logic(args) -> int: xpk_print('Testing credentials with kubectl...') kubectl_command = 'kubectl get pods' - kubectl_return_code, kubectl_output = 
run_command_and_capture_output( + kubectl_output, kubectl_return_code = run_command_and_capture_output( kubectl_command, 'kubectl get pods', args ) xpk_print(kubectl_output) - - if kubectl_return_code != 0: - dns_endpoint_error = 'control_plane_endpoints_config.dns_endpoint_config.allow_external_traffic is disabled' - if dns_endpoint_error in kubectl_output: - xpk_print('Detected DNS endpoint-related error. Retrying without --dns-endpoint flag...') - - without_dns_command = ( - 'gcloud container clusters get-credentials' - f' {args.cluster} --region={zone_to_region(args.zone)}' - f' --project={args.project} &&' - ' kubectl config view && kubectl config set-context --current' - ' --namespace=default' - ) - return_code = run_command_with_updates_retry( - without_dns_command, 'get-credentials to cluster', args, verbose=False - ) - if return_code != 0: - xpk_print('Failed to get credentials even without --dns-endpoint. Exiting.') - xpk_exit(return_code) + if kubectl_return_code == 0: + xpk_print('Credentials test succeeded.') + return 0 - return 0 - else: - xpk_print(f'kubectl failed with an unhandled error: {kubectl_output}') - xpk_exit(kubectl_return_code) + dns_endpoint_error = 'control_plane_endpoints_config.dns_endpoint_config.allow_external_traffic is disabled' + if dns_endpoint_error not in kubectl_output: + xpk_print(f'kubectl failed with an unhandled error: {kubectl_output}') + xpk_exit(kubectl_return_code) - xpk_print('Credentials test succeeded.') + xpk_print('Detected DNS endpoint-related error. 
Retrying without --dns-endpoint flag...') + without_dns_command = ( + 'gcloud container clusters get-credentials' + f' {args.cluster} --region={zone_to_region(args.zone)}' + f' --project={args.project} &&' + ' kubectl config view && kubectl config set-context --current' + ' --namespace=default' + ) + return_code = run_command_with_updates_retry( + without_dns_command, 'get-credentials to cluster', args, verbose=False + ) + if return_code != 0: + xpk_print('Failed to get credentials even without --dns-endpoint. Exiting.') + xpk_exit(return_code) return 0 def get_cluster_credentials(args) -> None: diff --git a/src/xpk/core/commands.py b/src/xpk/core/commands.py index 5221b4346..b134b16d4 100644 --- a/src/xpk/core/commands.py +++ b/src/xpk/core/commands.py @@ -358,7 +358,7 @@ def run_kubectl_apply(yml_string: str, task: str, args: Namespace) -> int: def run_command_and_capture_output( command: str, task, - args + global_args ) -> tuple[int, str]: """Executes a command and captures its output and return code. @@ -368,6 +368,13 @@ def run_command_and_capture_output( Returns: tuple[int, str]: A tuple containing the return code and the captured output string. """ + if global_args.dry_run: + xpk_print( + f'Task: `{task}` is implemented by the following command' + ' not running since it is a dry run.' 
+ f' \n{command}' + ) + return 0 try: result = subprocess.run( command, @@ -377,6 +384,6 @@ def run_command_and_capture_output( check=False ) output = result.stdout + result.stderr - return result.returncode, output + return output, result.returncode except Exception as e: - return 1, str(e) + return str(e), 1 From 31ac562b9b91021c3533cae68895ed5eeb7b3ba3 Mon Sep 17 00:00:00 2001 From: DannyLi Date: Thu, 28 Aug 2025 06:03:02 +0000 Subject: [PATCH 04/15] Fixed Pylint --- src/xpk/commands/cluster.py | 2 +- src/xpk/core/cluster.py | 30 ++++++++++++++++++------------ src/xpk/core/commands.py | 28 ++++++++++++++-------------- src/xpk/core/kueue.py | 2 +- 4 files changed, 34 insertions(+), 28 deletions(-) diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index 2b93c0daf..831587252 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -786,7 +786,7 @@ def scale_down_deployment( if return_code != 0: xpk_print(f'Scale down {deployment_name} error {return_code}') xpk_exit(return_code) - xpk_print(f'\n{deployment_name} has been scaled down.') + xpk_print(f'{deployment_name} has been scaled down.') def scale_up_coredns(args, replicas: int = 15, namespace: str = 'kube-system'): diff --git a/src/xpk/core/cluster.py b/src/xpk/core/cluster.py index a35e0724a..a61dbca85 100644 --- a/src/xpk/core/cluster.py +++ b/src/xpk/core/cluster.py @@ -879,12 +879,13 @@ def update_cluster_with_gcsfuse_driver_if_necessary(args) -> int: return 0 + def test_and_retry_credentials_with_dns_logic(args) -> int: """Tests kubectl credentials and retries with default settings if a DNS error is found. - + Args: args: user provided arguments for running the command. - + Returns: 0 if credentials are valid after retrying, 1 otherwise. 
""" @@ -892,25 +893,30 @@ def test_and_retry_credentials_with_dns_logic(args) -> int: xpk_print('Testing credentials with kubectl...') kubectl_command = 'kubectl get pods' kubectl_output, kubectl_return_code = run_command_and_capture_output( - kubectl_command, 'kubectl get pods', args + kubectl_command, 'kubectl get pods', args ) xpk_print(kubectl_output) if kubectl_return_code == 0: xpk_print('Credentials test succeeded.') return 0 - dns_endpoint_error = 'control_plane_endpoints_config.dns_endpoint_config.allow_external_traffic is disabled' + dns_endpoint_error = ( + 'control_plane_endpoints_config.dns_endpoint_config.allow_external_traffic' + ' is disabled' + ) if dns_endpoint_error not in kubectl_output: xpk_print(f'kubectl failed with an unhandled error: {kubectl_output}') xpk_exit(kubectl_return_code) - - xpk_print('Detected DNS endpoint-related error. Retrying without --dns-endpoint flag...') + xpk_print( + 'Detected DNS endpoint-related error. Retrying without --dns-endpoint' + ' flag...' + ) without_dns_command = ( - 'gcloud container clusters get-credentials' - f' {args.cluster} --region={zone_to_region(args.zone)}' - f' --project={args.project} &&' - ' kubectl config view && kubectl config set-context --current' - ' --namespace=default' + 'gcloud container clusters get-credentials' + f' {args.cluster} --region={zone_to_region(args.zone)}' + f' --project={args.project} &&' + ' kubectl config view && kubectl config set-context --current' + ' --namespace=default' ) return_code = run_command_with_updates_retry( without_dns_command, 'get-credentials to cluster', args, verbose=False @@ -920,6 +926,7 @@ def test_and_retry_credentials_with_dns_logic(args) -> int: xpk_exit(return_code) return 0 + def get_cluster_credentials(args) -> None: """Run cluster configuration command to set the kubectl config. 
@@ -946,4 +953,3 @@ def get_cluster_credentials(args) -> None: xpk_print('Finished get-credentials and kubectl setup.') return return_code - diff --git a/src/xpk/core/commands.py b/src/xpk/core/commands.py index b134b16d4..d97696fc1 100644 --- a/src/xpk/core/commands.py +++ b/src/xpk/core/commands.py @@ -355,18 +355,17 @@ def run_kubectl_apply(yml_string: str, task: str, args: Namespace) -> int: err_code = run_command_with_updates(command, task, args) return err_code + def run_command_and_capture_output( - command: str, - task, - global_args + command: str, task, global_args ) -> tuple[int, str]: """Executes a command and captures its output and return code. - Args: - command (str): The command string to execute. + Args: + command (str): The command string to execute. - Returns: - tuple[int, str]: A tuple containing the return code and the captured output string. + Returns: + tuple[int, str]: A tuple containing the return code and the captured output string. """ if global_args.dry_run: xpk_print( @@ -377,13 +376,14 @@ def run_command_and_capture_output( return 0 try: result = subprocess.run( - command, - shell=True, - capture_output=True, - text=True, - check=False + command, shell=True, capture_output=True, text=True, check=False ) output = result.stdout + result.stderr return output, result.returncode - except Exception as e: - return str(e), 1 + except subprocess.CalledProcessError as e: + error_output = e.stdout + e.stderr + xpk_print(f'Task {task} failed with return code {e.returncode}') + xpk_print('*' * 80) + xpk_print(error_output) + xpk_print('*' * 80) + return error_output, e.returncode \ No newline at end of file diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index 257ed2bf1..ac8423bdd 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -378,7 +378,7 @@ def wait_for_kueue_available(args: Namespace) -> int: 0 if successful and 1 otherwise. 
""" command = ( - 'kubectl wait deploy/kueue-controller-manager -nkueue-system' + 'kubectl wait deploy/kueue-controller-manager -n kueue-system' f' --for=condition=available --timeout={WAIT_FOR_KUEUE_TIMEOUT}' ) task = 'Wait for Kueue to be available' From 9e1ca1b19d5f26ac94630a1323fcdef2f66df3d2 Mon Sep 17 00:00:00 2001 From: DannyLi Date: Mon, 1 Sep 2025 04:02:08 +0000 Subject: [PATCH 05/15] Fixed the Final newline missing error --- src/xpk/core/commands.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/xpk/core/commands.py b/src/xpk/core/commands.py index d97696fc1..33674d1ff 100644 --- a/src/xpk/core/commands.py +++ b/src/xpk/core/commands.py @@ -386,4 +386,5 @@ def run_command_and_capture_output( xpk_print('*' * 80) xpk_print(error_output) xpk_print('*' * 80) - return error_output, e.returncode \ No newline at end of file + return error_output, e.returncode + \ No newline at end of file From 634550fb9327108d02deaefd04b4c3af30079a8a Mon Sep 17 00:00:00 2001 From: DannyLi Date: Mon, 1 Sep 2025 06:12:45 +0000 Subject: [PATCH 06/15] Fixed linter --- src/xpk/core/commands.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/xpk/core/commands.py b/src/xpk/core/commands.py index 33674d1ff..937d188de 100644 --- a/src/xpk/core/commands.py +++ b/src/xpk/core/commands.py @@ -387,4 +387,3 @@ def run_command_and_capture_output( xpk_print(error_output) xpk_print('*' * 80) return error_output, e.returncode - \ No newline at end of file From ff498261db1d30f452abe34e983f54680ac08bf7 Mon Sep 17 00:00:00 2001 From: DannyLi Date: Thu, 4 Sep 2025 06:49:38 +0000 Subject: [PATCH 07/15] Fix Pytype --- src/xpk/core/cluster.py | 2 +- src/xpk/core/commands.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/xpk/core/cluster.py b/src/xpk/core/cluster.py index a61dbca85..138e96998 100644 --- a/src/xpk/core/cluster.py +++ b/src/xpk/core/cluster.py @@ -927,7 +927,7 @@ def test_and_retry_credentials_with_dns_logic(args) -> int: return 0 
-def get_cluster_credentials(args) -> None: +def get_cluster_credentials(args) -> int: """Run cluster configuration command to set the kubectl config. Args: diff --git a/src/xpk/core/commands.py b/src/xpk/core/commands.py index 937d188de..51faf70c9 100644 --- a/src/xpk/core/commands.py +++ b/src/xpk/core/commands.py @@ -358,7 +358,7 @@ def run_kubectl_apply(yml_string: str, task: str, args: Namespace) -> int: def run_command_and_capture_output( command: str, task, global_args -) -> tuple[int, str]: +) -> tuple[str, int]: """Executes a command and captures its output and return code. Args: From e2de13ae759eaa32c1904584c669ed95f536d8f6 Mon Sep 17 00:00:00 2001 From: DannyLi Date: Wed, 1 Oct 2025 02:18:37 +0000 Subject: [PATCH 08/15] Fixed mypy --- src/xpk/core/commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpk/core/commands.py b/src/xpk/core/commands.py index 18b1fdb5a..472afd148 100644 --- a/src/xpk/core/commands.py +++ b/src/xpk/core/commands.py @@ -373,7 +373,7 @@ def run_command_and_capture_output( ' not running since it is a dry run.' 
f' \n{command}' ) - return 0 + return "",0 try: result = subprocess.run( command, shell=True, capture_output=True, text=True, check=False From 302db1d6788293fa9ead67cab5eb3b3ab2f94290 Mon Sep 17 00:00:00 2001 From: DannyLi Date: Wed, 1 Oct 2025 03:53:01 +0000 Subject: [PATCH 09/15] Fixed pylint --- goldens/Basic_cluster_create.txt | 16 +++++++++++----- goldens/Batch.txt | 12 +++++++++--- goldens/Cluster_create_private.txt | 16 +++++++++++----- goldens/Cluster_create_with_gb200-4.txt | 16 +++++++++++----- goldens/Job_cancel.txt | 12 +++++++++--- goldens/Job_list.txt | 12 +++++++++--- goldens/NAP_cluster-create.txt | 16 +++++++++++----- goldens/NAP_cluster-create_with_pathways.txt | 16 +++++++++++----- goldens/Workload_delete.txt | 12 +++++++++--- goldens/Workload_list.txt | 12 +++++++++--- src/xpk/core/cluster.py | 8 ++++---- src/xpk/core/commands.py | 2 +- 12 files changed, 105 insertions(+), 45 deletions(-) diff --git a/goldens/Basic_cluster_create.txt b/goldens/Basic_cluster_create.txt index e4dad6601..9a20e8909 100644 --- a/goldens/Basic_cluster_create.txt +++ b/goldens/Basic_cluster_create.txt @@ -14,9 +14,15 @@ gcloud beta container clusters create golden-cluster --project=golden-project -- gcloud container clusters describe golden-cluster --project=golden-project --region=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)" [XPK] Private Nodes is not enabled on the cluster. [XPK] Cluster is public and no need to authorize networks. -[XPK] Try 1: get-credentials to cluster golden-cluster -[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. 
-gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster +[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Testing credentials with kubectl... +[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. +kubectl get pods +[XPK] +[XPK] Credentials test succeeded. +[XPK] Finished get-credentials and kubectl setup. [XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system [XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run. kubectl get deployment coredns -n kube-system @@ -53,7 +59,7 @@ kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="Conf [XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available [XPK] Try 1: Install Jobset on golden-cluster [XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run. -kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml +kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml --force-conflicts [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. 
kubectl get node --no-headers | wc -l [XPK] Try 1: Updating jobset Controller Manager resources @@ -70,7 +76,7 @@ kubectl kueue version kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml [XPK] Wait for Kueue to be fully available [XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. -kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m +kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m [XPK] Install Kueue Custom Resources [XPK] Try 1: Applying Kueue Custom Resources [XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Batch.txt b/goldens/Batch.txt index c87e66ef6..5d254bdb4 100644 --- a/goldens/Batch.txt +++ b/goldens/Batch.txt @@ -1,9 +1,15 @@ $ python3 xpk.py batch --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run batch-read.sh [XPK] Starting xpk [XPK] Working on golden-project and us-central1-a -[XPK] Try 1: get-credentials to cluster golden-cluster -[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. -gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster +[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. 
+gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Testing credentials with kubectl... +[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. +kubectl get pods +[XPK] +[XPK] Credentials test succeeded. +[XPK] Finished get-credentials and kubectl setup. [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_private.txt b/goldens/Cluster_create_private.txt index 8c584fc07..d199327e8 100644 --- a/goldens/Cluster_create_private.txt +++ b/goldens/Cluster_create_private.txt @@ -16,9 +16,15 @@ gcloud container clusters describe golden-cluster-private --project=golden-proje [XPK] Task: `Fetching the list of authorized network from cluster describe.` is implemented by the following command not running since it is a dry run. gcloud container clusters describe golden-cluster-private --project=golden-project --region=us-central1 --format="value(masterAuthorizedNetworksConfig.cidrBlocks[].cidrBlock)" [XPK] Current machine's IP adrress is already authorized. -[XPK] Try 1: get-credentials to cluster golden-cluster-private -[XPK] Task: `get-credentials to cluster golden-cluster-private` is implemented by the following command not running since it is a dry run. 
-gcloud container clusters get-credentials golden-cluster-private --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster-private +[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster-private` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster-private --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Testing credentials with kubectl... +[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. +kubectl get pods +[XPK] +[XPK] Credentials test succeeded. +[XPK] Finished get-credentials and kubectl setup. [XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system [XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run. kubectl get deployment coredns -n kube-system @@ -60,7 +66,7 @@ gcloud beta compute reservations describe golden-reservation --project=golden-pr [XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available [XPK] Try 1: Install Jobset on golden-cluster-private [XPK] Task: `Install Jobset on golden-cluster-private` is implemented by the following command not running since it is a dry run. -kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml +kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml --force-conflicts [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. 
kubectl get node --no-headers | wc -l [XPK] Try 1: Updating jobset Controller Manager resources @@ -77,7 +83,7 @@ kubectl kueue version kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml [XPK] Wait for Kueue to be fully available [XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. -kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m +kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m [XPK] Install Kueue Custom Resources [XPK] Try 1: Applying Kueue Custom Resources [XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_with_gb200-4.txt b/goldens/Cluster_create_with_gb200-4.txt index caaa33758..b2cdfcb7d 100644 --- a/goldens/Cluster_create_with_gb200-4.txt +++ b/goldens/Cluster_create_with_gb200-4.txt @@ -14,9 +14,15 @@ gcloud beta container clusters create golden-cluster --project=golden-project -- gcloud container clusters describe golden-cluster --project=golden-project --region=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)" [XPK] Private Nodes is not enabled on the cluster. [XPK] Cluster is public and no need to authorize networks. -[XPK] Try 1: get-credentials to cluster golden-cluster -[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. 
-gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster +[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Testing credentials with kubectl... +[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. +kubectl get pods +[XPK] +[XPK] Credentials test succeeded. +[XPK] Finished get-credentials and kubectl setup. [XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system [XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run. kubectl get deployment coredns -n kube-system @@ -59,7 +65,7 @@ gcloud beta compute reservations describe golden-reservation --project=golden-pr [XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available [XPK] Try 1: Install Jobset on golden-cluster [XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run. -kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml +kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml --force-conflicts [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. 
kubectl get node --no-headers | wc -l [XPK] Try 1: Updating jobset Controller Manager resources @@ -76,7 +82,7 @@ kubectl kueue version kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml [XPK] Wait for Kueue to be fully available [XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. -kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m +kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m [XPK] Install Kueue Custom Resources [XPK] Try 1: Applying Kueue Custom Resources [XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Job_cancel.txt b/goldens/Job_cancel.txt index 7e71ece4a..cdba7a64e 100644 --- a/goldens/Job_cancel.txt +++ b/goldens/Job_cancel.txt @@ -2,9 +2,15 @@ $ python3 xpk.py job cancel golden-job --project=golden-project --zone=us-centra [XPK] Starting xpk [XPK] Starting job cancel for job: ['golden-job'] [XPK] Working on golden-project and us-central1-a -[XPK] Try 1: get-credentials to cluster golden-cluster -[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. -gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster +[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. 
+gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Testing credentials with kubectl... +[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. +kubectl get pods +[XPK] +[XPK] Credentials test succeeded. +[XPK] Finished get-credentials and kubectl setup. [XPK] Task: `delete job` is implemented by the following command not running since it is a dry run. kubectl-kjob delete slurm golden-job [XPK] Exiting XPK cleanly diff --git a/goldens/Job_list.txt b/goldens/Job_list.txt index e659f2a6a..715b711be 100644 --- a/goldens/Job_list.txt +++ b/goldens/Job_list.txt @@ -1,9 +1,15 @@ $ python3 xpk.py job ls --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run [XPK] Starting xpk [XPK] Working on golden-project and us-central1-a -[XPK] Try 1: get-credentials to cluster golden-cluster -[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. -gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster +[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Testing credentials with kubectl... +[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. +kubectl get pods +[XPK] +[XPK] Credentials test succeeded. 
+[XPK] Finished get-credentials and kubectl setup. [XPK] Listing jobs for project golden-project and zone us-central1-a: [XPK] Task: `list jobs` is implemented by the following command not running since it is a dry run. kubectl-kjob list slurm --profile xpk-def-app-profile diff --git a/goldens/NAP_cluster-create.txt b/goldens/NAP_cluster-create.txt index f765cd9bc..58a369314 100644 --- a/goldens/NAP_cluster-create.txt +++ b/goldens/NAP_cluster-create.txt @@ -14,9 +14,15 @@ gcloud beta container clusters create golden-cluster --project=golden-project -- gcloud container clusters describe golden-cluster --project=golden-project --region=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)" [XPK] Private Nodes is not enabled on the cluster. [XPK] Cluster is public and no need to authorize networks. -[XPK] Try 1: get-credentials to cluster golden-cluster -[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. -gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster +[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Testing credentials with kubectl... +[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. +kubectl get pods +[XPK] +[XPK] Credentials test succeeded. +[XPK] Finished get-credentials and kubectl setup. 
[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system [XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run. kubectl get deployment coredns -n kube-system @@ -64,7 +70,7 @@ gcloud beta container node-pools list --cluster golden-cluster --project=golden- [XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available [XPK] Try 1: Install Jobset on golden-cluster [XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run. -kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml +kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml --force-conflicts [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. kubectl get node --no-headers | wc -l [XPK] Try 1: Updating jobset Controller Manager resources @@ -81,7 +87,7 @@ kubectl kueue version kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml [XPK] Wait for Kueue to be fully available [XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. -kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m +kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m [XPK] Install Kueue Custom Resources [XPK] Try 1: Applying Kueue Custom Resources [XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. 
diff --git a/goldens/NAP_cluster-create_with_pathways.txt b/goldens/NAP_cluster-create_with_pathways.txt index 51982dbfb..94db15948 100644 --- a/goldens/NAP_cluster-create_with_pathways.txt +++ b/goldens/NAP_cluster-create_with_pathways.txt @@ -14,9 +14,15 @@ gcloud beta container clusters create golden-cluster --project=golden-project -- gcloud container clusters describe golden-cluster --project=golden-project --region=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)" [XPK] Private Nodes is not enabled on the cluster. [XPK] Cluster is public and no need to authorize networks. -[XPK] Try 1: get-credentials to cluster golden-cluster -[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. -gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster +[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Testing credentials with kubectl... +[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. +kubectl get pods +[XPK] +[XPK] Credentials test succeeded. +[XPK] Finished get-credentials and kubectl setup. [XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system [XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run. 
kubectl get deployment coredns -n kube-system @@ -65,7 +71,7 @@ gcloud beta container node-pools list --cluster golden-cluster --project=golden- [XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available [XPK] Try 1: Install Jobset on golden-cluster [XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run. -kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml +kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml --force-conflicts [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. kubectl get node --no-headers | wc -l [XPK] Try 1: Updating jobset Controller Manager resources @@ -82,7 +88,7 @@ kubectl kueue version kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml [XPK] Wait for Kueue to be fully available [XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. -kubectl wait deploy/kueue-controller-manager -nkueue-system --for=condition=available --timeout=10m +kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m [XPK] Install Kueue Custom Resources [XPK] Try 1: Applying Kueue Custom Resources [XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. 
diff --git a/goldens/Workload_delete.txt b/goldens/Workload_delete.txt index 683c8cca6..1a306226a 100644 --- a/goldens/Workload_delete.txt +++ b/goldens/Workload_delete.txt @@ -2,9 +2,15 @@ $ python3 xpk.py workload delete --project=golden-project --zone=us-central1-a - [XPK] Starting xpk [XPK] Starting Workload delete [XPK] Working on golden-project and us-central1-a -[XPK] Try 1: get-credentials to cluster golden-cluster -[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. -gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster +[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Testing credentials with kubectl... +[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. +kubectl get pods +[XPK] +[XPK] Credentials test succeeded. +[XPK] Finished get-credentials and kubectl setup. [XPK] Task: `Check if PathwaysJob is installed on golden-cluster` is implemented by the following command not running since it is a dry run. 
kubectl get pods -n pathways-job-system --no-headers -o custom-columns=NAME:.metadata.name [XPK] check_if_pathways_job_is_installed 0 0 diff --git a/goldens/Workload_list.txt b/goldens/Workload_list.txt index e23d631fb..02b237805 100644 --- a/goldens/Workload_list.txt +++ b/goldens/Workload_list.txt @@ -2,9 +2,15 @@ $ python3 xpk.py workload list --project=golden-project --zone=us-central1-a --c [XPK] Starting xpk [XPK] Starting workload list [XPK] Working on golden-project and us-central1-a -[XPK] Try 1: get-credentials to cluster golden-cluster -[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. -gcloud container clusters get-credentials golden-cluster --region=us-central1 --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster +[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Testing credentials with kubectl... +[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. +kubectl get pods +[XPK] +[XPK] Credentials test succeeded. +[XPK] Finished get-credentials and kubectl setup. [XPK] Task: `List Jobs with filter-by-status=EVERYTHING with filter-by-job=None` is implemented by the following command not running since it is a dry run. 
kubectl get workloads --ignore-not-found -o=custom-columns="Jobset Name:.metadata.ownerReferences[0].name,Created Time:.metadata.creationTimestamp,Priority:.spec.priorityClassName,TPU VMs Needed:.spec.podSets[0].count,TPU VMs Running/Ran:.status.admission.podSetAssignments[-1].count,TPU VMs Done:.status.reclaimablePods[0].count,Status:.status.conditions[-1].type,Status Message:.status.conditions[-1].message,Status Time:.status.conditions[-1].lastTransitionTime" [XPK] Workload List Output: diff --git a/src/xpk/core/cluster.py b/src/xpk/core/cluster.py index 9e8960023..1293e5d8e 100644 --- a/src/xpk/core/cluster.py +++ b/src/xpk/core/cluster.py @@ -908,7 +908,7 @@ def test_and_retry_credentials_with_dns_logic(args) -> int: ' --namespace=default' ) return_code = run_command_with_updates_retry( - without_dns_command, 'get-credentials to cluster', args, verbose=False + without_dns_command, 'get-credentials to cluster', verbose=False ) if return_code != 0: xpk_print('Failed to get credentials even without --dns-endpoint. Exiting.') @@ -935,13 +935,13 @@ def get_cluster_credentials(args) -> int: ) task = f'get-credentials-dns-endpoint to cluster {args.cluster}' return_code = run_command_with_updates_retry( - command, task, args, verbose=False + command, task, verbose=False ) - + if return_code != 0: xpk_print(f'{task} returned ERROR {return_code}') xpk_exit(return_code) - + return_code = test_and_retry_credentials_with_dns_logic(args) xpk_print('Finished get-credentials and kubectl setup.') diff --git a/src/xpk/core/commands.py b/src/xpk/core/commands.py index fb9750649..1fffe8ff7 100644 --- a/src/xpk/core/commands.py +++ b/src/xpk/core/commands.py @@ -369,7 +369,7 @@ def run_command_and_capture_output( ' not running since it is a dry run.' 
f' \n{command}' ) - return "",0 + return '', 0 try: result = subprocess.run( command, shell=True, capture_output=True, text=True, check=False From 0a00530505d9e84efe92aa4c85bb1da16837656b Mon Sep 17 00:00:00 2001 From: DannyLi Date: Wed, 1 Oct 2025 04:06:21 +0000 Subject: [PATCH 10/15] pyink --- src/xpk/core/cluster.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/xpk/core/cluster.py b/src/xpk/core/cluster.py index 1293e5d8e..66f0a39a3 100644 --- a/src/xpk/core/cluster.py +++ b/src/xpk/core/cluster.py @@ -934,9 +934,7 @@ def get_cluster_credentials(args) -> int: ' --namespace=default' ) task = f'get-credentials-dns-endpoint to cluster {args.cluster}' - return_code = run_command_with_updates_retry( - command, task, verbose=False - ) + return_code = run_command_with_updates_retry(command, task, verbose=False) if return_code != 0: xpk_print(f'{task} returned ERROR {return_code}') From 38158b45a071b6b1bfe5eb772476f676336fd5e1 Mon Sep 17 00:00:00 2001 From: DannyLi Date: Wed, 1 Oct 2025 09:12:43 +0000 Subject: [PATCH 11/15] Change global_args to is_dry_run() --- src/xpk/core/commands.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/xpk/core/commands.py b/src/xpk/core/commands.py index 1fffe8ff7..9420bace4 100644 --- a/src/xpk/core/commands.py +++ b/src/xpk/core/commands.py @@ -353,7 +353,7 @@ def run_kubectl_apply(yml_string: str, task: str) -> int: def run_command_and_capture_output( - command: str, task, global_args + command: str, task ) -> tuple[str, int]: """Executes a command and captures its output and return code. @@ -363,7 +363,7 @@ def run_command_and_capture_output( Returns: tuple[int, str]: A tuple containing the return code and the captured output string. """ - if global_args.dry_run: + if is_dry_run(): xpk_print( f'Task: `{task}` is implemented by the following command' ' not running since it is a dry run.' 
From 3e95b0b92830f70d58e8badc8a206203c6b70c81 Mon Sep 17 00:00:00 2001 From: DannyLi Date: Thu, 2 Oct 2025 02:00:14 +0000 Subject: [PATCH 12/15] Deleted run_command_and_capture_output() --- src/xpk/core/cluster.py | 10 ++++------ src/xpk/core/commands.py | 33 --------------------------------- 2 files changed, 4 insertions(+), 39 deletions(-) diff --git a/src/xpk/core/cluster.py b/src/xpk/core/cluster.py index 66f0a39a3..72008fc1f 100644 --- a/src/xpk/core/cluster.py +++ b/src/xpk/core/cluster.py @@ -27,7 +27,6 @@ run_command_for_value, run_command_with_updates, run_command_with_updates_retry, - run_command_and_capture_output, ) from .gcloud_context import ( add_zone_and_project, @@ -63,9 +62,8 @@ def set_jobset_on_cluster(args) -> int: 0 if successful and 1 otherwise. """ command = ( - 'kubectl apply --server-side -f' - f' https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml' - ' --force-conflicts' + 'kubectl apply --server-side --force-conflicts' + f' -f https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml' ) task = f'Install Jobset on {args.cluster}' return_code = run_command_with_updates_retry(command, task) @@ -881,8 +879,8 @@ def test_and_retry_credentials_with_dns_logic(args) -> int: xpk_print('Testing credentials with kubectl...') kubectl_command = 'kubectl get pods' - kubectl_output, kubectl_return_code = run_command_and_capture_output( - kubectl_command, 'kubectl get pods', args + kubectl_return_code, kubectl_output = run_command_for_value( + kubectl_command, 'kubectl get pods' ) xpk_print(kubectl_output) if kubectl_return_code == 0: diff --git a/src/xpk/core/commands.py b/src/xpk/core/commands.py index 9420bace4..af508d0e7 100644 --- a/src/xpk/core/commands.py +++ b/src/xpk/core/commands.py @@ -350,36 +350,3 @@ def run_kubectl_apply(yml_string: str, task: str) -> int: command = f'kubectl apply -f {str(tmp)}' err_code = run_command_with_updates(command, task) return err_code - - 
-def run_command_and_capture_output( - command: str, task -) -> tuple[str, int]: - """Executes a command and captures its output and return code. - - Args: - command (str): The command string to execute. - - Returns: - tuple[int, str]: A tuple containing the return code and the captured output string. - """ - if is_dry_run(): - xpk_print( - f'Task: `{task}` is implemented by the following command' - ' not running since it is a dry run.' - f' \n{command}' - ) - return '', 0 - try: - result = subprocess.run( - command, shell=True, capture_output=True, text=True, check=False - ) - output = result.stdout + result.stderr - return output, result.returncode - except subprocess.CalledProcessError as e: - error_output = e.stdout + e.stderr - xpk_print(f'Task {task} failed with return code {e.returncode}') - xpk_print('*' * 80) - xpk_print(error_output) - xpk_print('*' * 80) - return error_output, e.returncode From adf19bc8c40f6ad37b50aa872b1327392914d0d1 Mon Sep 17 00:00:00 2001 From: DannyLi Date: Thu, 2 Oct 2025 02:11:25 +0000 Subject: [PATCH 13/15] Run golden_buddy.sh --- goldens/Basic_cluster_create.txt | 121 ++--------------- goldens/Batch.txt | 34 ++--- goldens/Cluster_create_private.txt | 128 ++---------------- goldens/Cluster_create_with_gb200-4.txt | 130 ++---------------- goldens/Cluster_delete.txt | 30 ++--- goldens/Cluster_delete_force.txt | 27 ++-- goldens/Job_cancel.txt | 29 ++-- goldens/Job_info.txt | 34 ++--- goldens/Job_list.txt | 29 ++-- goldens/NAP_cluster-create.txt | 132 ++---------------- goldens/NAP_cluster-create_with_pathways.txt | 133 ++----------------- goldens/Storage_list.txt | 18 ++- goldens/Workload_create.txt | 47 ++----- goldens/Workload_create_pathways.txt | 46 ++----- goldens/Workload_delete.txt | 32 ++--- goldens/Workload_list.txt | 32 ++--- 16 files changed, 224 insertions(+), 778 deletions(-) diff --git a/goldens/Basic_cluster_create.txt b/goldens/Basic_cluster_create.txt index 9a20e8909..aa1ea933e 100644 --- 
a/goldens/Basic_cluster_create.txt +++ b/goldens/Basic_cluster_create.txt @@ -1,108 +1,15 @@ $ python3 xpk.py cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-8 --spot --dry-run -[XPK] Starting xpk -[XPK] Starting cluster create for cluster golden-cluster: -[XPK] Working on golden-project and us-central1-a -[XPK] Task: `Determine server supported GKE versions for default rapid gke version` is implemented by the following command not running since it is a dry run. -gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)" -[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run. -gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.validVersions)" -[XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run. -gcloud container clusters list --project=golden-project --region=us-central1 --format="csv[no-heading](name)" -[XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run. -gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --location-policy=BALANCED --scopes=storage-full,gke-default -[XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run. 
-gcloud container clusters describe golden-cluster --project=golden-project --region=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)" -[XPK] Private Nodes is not enabled on the cluster. -[XPK] Cluster is public and no need to authorize networks. -[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster -[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. -gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default -[XPK] Testing credentials with kubectl... -[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. -kubectl get pods -[XPK] -[XPK] Credentials test succeeded. -[XPK] Finished get-credentials and kubectl setup. -[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system -[XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run. -kubectl get deployment coredns -n kube-system -[XPK] Now verifying CoreDNS readiness... -[XPK] Task: `Waiting for kubeDNS to be checked.` is implemented by the following command not running since it is a dry run. -kubectl get deployment kube-dns -n kube-system --ignore-not-found -[XPK] kube-dns deployment not found. -[XPK] Verifying if CoreDNS is available... -[XPK] Task: `Wait for coredns available` is implemented by the following command not running since it is a dry run. -kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-system --timeout=240s -[XPK] CoreDNS has successfully started and passed verification. -[XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'. -[XPK] Skipping CoreDNS deployment since it already exists. 
-[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. -gcloud beta container clusters describe golden-cluster --region us-central1 --project golden-project --format="value(currentMasterVersion)" -[XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') -[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. -gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" -[XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') -[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. -gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --region=us-central1 --format="value(locations)" -[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. 
-kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true -[XPK] Existing node pool names ['0'] -[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --region=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --spot --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 -[XPK] Breaking up a total of 1 commands into 1 batches -[XPK] Pretending all the jobs succeeded -[XPK] Create or delete node pool request complete. -[XPK] Creating ConfigMap for cluster -[XPK] Breaking up a total of 2 commands into 1 batches -[XPK] Pretending all the jobs succeeded -[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available -[XPK] Try 1: Install Jobset on golden-cluster -[XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run. -kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml --force-conflicts -[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. -kubectl get node --no-headers | wc -l -[XPK] Try 1: Updating jobset Controller Manager resources -[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 -[XPK] Try 1: Install PathwaysJob on golden-cluster -[XPK] Task: `Install PathwaysJob on golden-cluster` is implemented by the following command not running since it is a dry run. 
-kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.2/install.yaml -[XPK] Enabling Kueue on the cluster -[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run. -kubectl kueue version -[XPK] Try 1: Set Kueue On Cluster -[XPK] Task: `Set Kueue On Cluster` is implemented by the following command not running since it is a dry run. -kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml -[XPK] Wait for Kueue to be fully available -[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. -kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m -[XPK] Install Kueue Custom Resources -[XPK] Try 1: Applying Kueue Custom Resources -[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f c49da377b542c14a80a64a13236f8d3a1c8e022dc7c82cc6f6f0560d980ee9e7 -[XPK] Update Kueue Controller Manager resources -[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. -kubectl get node --no-headers | wc -l -[XPK] Try 1: Updating Kueue Controller Manager resources -[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f 012e1b15b6941e9d47cb2cdb35488d57c2f3ce0ef0b18093d2759f2e02ed81dc -[XPK] Verifying kjob installation -[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run. -kubectl-kjob help -[XPK] kjob found -[XPK] Applying kjob CDRs -[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run. 
-kubectl kjob printcrds | kubectl apply --server-side -f - -[XPK] Creating kjob CRDs succeeded -[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. -kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true -[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. -kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 -[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. -kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 -[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. -kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 -[XPK] GKE commands done! Resources are created. -[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project -[XPK] Exiting XPK cleanly +Traceback (most recent call last): + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in + from src.xpk.main import main + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in + from .parser.core import set_parser + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in + from .config import set_config_parsers + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in + from ..commands.config import get_config, set_config + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in + from ..core.config import XpkConfig + File 
"/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in + import ruamel.yaml +ModuleNotFoundError: No module named 'ruamel' diff --git a/goldens/Batch.txt b/goldens/Batch.txt index 5d254bdb4..ca9a95161 100644 --- a/goldens/Batch.txt +++ b/goldens/Batch.txt @@ -1,21 +1,15 @@ $ python3 xpk.py batch --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run batch-read.sh -[XPK] Starting xpk -[XPK] Working on golden-project and us-central1-a -[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster -[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. -gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default -[XPK] Testing credentials with kubectl... -[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. -kubectl get pods -[XPK] -[XPK] Credentials test succeeded. -[XPK] Finished get-credentials and kubectl setup. -[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. -kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true -[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. -kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true -[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. -kubectl get configmap golden-cluster-metadata-configmap -o=custom-columns="ConfigData:data" --no-headers=true -[XPK] Task: `submit job` is implemented by the following command not running since it is a dry run. 
-kubectl kjob create slurm --profile xpk-def-app-profile --localqueue multislice-queue --worker-container xpk-batch-container --first-node-ip --pod-template-annotation kueue.x-k8s.io/podset-preferred-topology=cloud.google.com/gce-topology-host -- batch-read.sh --partition multislice-queue -[XPK] XPK Done. +Traceback (most recent call last): + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in + from src.xpk.main import main + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in + from .parser.core import set_parser + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in + from .config import set_config_parsers + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in + from ..commands.config import get_config, set_config + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in + from ..core.config import XpkConfig + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in + import ruamel.yaml +ModuleNotFoundError: No module named 'ruamel' diff --git a/goldens/Cluster_create_private.txt b/goldens/Cluster_create_private.txt index d199327e8..c4dcaada9 100644 --- a/goldens/Cluster_create_private.txt +++ b/goldens/Cluster_create_private.txt @@ -1,115 +1,15 @@ $ python3 xpk.py cluster create-pathways --project=golden-project --zone=us-central1-a --cluster=golden-cluster-private --private --tpu-type=v5p-8 --num-slices=1 --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation=golden-reservation --dry-run -[XPK] Starting xpk -[XPK] Starting cluster create for cluster golden-cluster-private: -[XPK] Working on golden-project and us-central1-a -[XPK] Task: `Determine server supported GKE versions for 
default rapid gke version` is implemented by the following command not running since it is a dry run. -gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)" -[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run. -gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.validVersions)" -[XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run. -gcloud container clusters list --project=golden-project --region=us-central1 --format="csv[no-heading](name)" -[XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run. -gcloud beta container clusters create golden-cluster-private --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=n1-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 4 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --enable-master-authorized-networks --enable-private-nodes --location-policy=BALANCED --scopes=storage-full,gke-default --enable-ip-alias -[XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run. -gcloud container clusters describe golden-cluster-private --project=golden-project --region=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)" -[XPK] Private Nodes is not enabled on the cluster. -[XPK] Task: `Fetching the list of authorized network from cluster describe.` is implemented by the following command not running since it is a dry run. 
-gcloud container clusters describe golden-cluster-private --project=golden-project --region=us-central1 --format="value(masterAuthorizedNetworksConfig.cidrBlocks[].cidrBlock)" -[XPK] Current machine's IP adrress is already authorized. -[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster-private -[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster-private` is implemented by the following command not running since it is a dry run. -gcloud container clusters get-credentials golden-cluster-private --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default -[XPK] Testing credentials with kubectl... -[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. -kubectl get pods -[XPK] -[XPK] Credentials test succeeded. -[XPK] Finished get-credentials and kubectl setup. -[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system -[XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run. -kubectl get deployment coredns -n kube-system -[XPK] Now verifying CoreDNS readiness... -[XPK] Task: `Waiting for kubeDNS to be checked.` is implemented by the following command not running since it is a dry run. -kubectl get deployment kube-dns -n kube-system --ignore-not-found -[XPK] kube-dns deployment not found. -[XPK] Verifying if CoreDNS is available... -[XPK] Task: `Wait for coredns available` is implemented by the following command not running since it is a dry run. -kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-system --timeout=240s -[XPK] CoreDNS has successfully started and passed verification. -[XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'. -[XPK] Skipping CoreDNS deployment since it already exists. 
-[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. -gcloud beta container clusters describe golden-cluster-private --region us-central1 --project golden-project --format="value(currentMasterVersion)" -[XPK] Creating 1 node pool or pools of v5p-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=1, device_type='v5p-8') -[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. -gcloud beta container node-pools list --cluster golden-cluster-private --project=golden-project --region=us-central1 --format="csv[no-heading](name)" -[XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. -gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a -[XPK] Creating 1 node pool or pools of v5p-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=1, device_type='v5p-8') -[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. -gcloud beta container node-pools describe 0 --cluster golden-cluster-private --project=golden-project --region=us-central1 --format="value(locations)" -[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. 
-kubectl get configmap golden-cluster-private-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true -[XPK] Existing node pool names ['0'] -[XPK] To complete NodepoolCreate-golden-cluster-private-np-0 we are executing gcloud beta container node-pools create golden-cluster-private-np-0 --region=us-central1 --cluster=golden-cluster-private --project=golden-project --node-locations=us-central1-a --machine-type=ct5p-hightpu-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 -[XPK] To complete NodepoolCreate-cpu-np we are executing gcloud beta container node-pools create cpu-np --node-version=0 --cluster=golden-cluster-private --project=golden-project --node-locations=us-central1-a --region=us-central1 --num-nodes=1 --machine-type=n2-standard-64 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --enable-autoscaling --min-nodes=1 --max-nodes=20 -[XPK] Breaking up a total of 2 commands into 1 batches -[XPK] Pretending all the jobs succeeded -[XPK] Create or delete node pool request complete. -[XPK] Creating ConfigMap for cluster -[XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. -gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a -[XPK] Breaking up a total of 2 commands into 1 batches -[XPK] Pretending all the jobs succeeded -[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available -[XPK] Try 1: Install Jobset on golden-cluster-private -[XPK] Task: `Install Jobset on golden-cluster-private` is implemented by the following command not running since it is a dry run. 
-kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml --force-conflicts -[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. -kubectl get node --no-headers | wc -l -[XPK] Try 1: Updating jobset Controller Manager resources -[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 -[XPK] Try 1: Install PathwaysJob on golden-cluster-private -[XPK] Task: `Install PathwaysJob on golden-cluster-private` is implemented by the following command not running since it is a dry run. -kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.2/install.yaml -[XPK] Enabling Kueue on the cluster -[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run. -kubectl kueue version -[XPK] Try 1: Set Kueue On Cluster -[XPK] Task: `Set Kueue On Cluster` is implemented by the following command not running since it is a dry run. -kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml -[XPK] Wait for Kueue to be fully available -[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. -kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m -[XPK] Install Kueue Custom Resources -[XPK] Try 1: Applying Kueue Custom Resources -[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. 
-kubectl apply -f ec56970df5766f33e470374e087b3061d9960c171fce12fdb2d75170eb75fe55 -[XPK] Update Kueue Controller Manager resources -[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. -kubectl get node --no-headers | wc -l -[XPK] Try 1: Updating Kueue Controller Manager resources -[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f 012e1b15b6941e9d47cb2cdb35488d57c2f3ce0ef0b18093d2759f2e02ed81dc -[XPK] Verifying kjob installation -[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run. -kubectl-kjob help -[XPK] kjob found -[XPK] Applying kjob CDRs -[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run. -kubectl kjob printcrds | kubectl apply --server-side -f - -[XPK] Creating kjob CRDs succeeded -[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. -kubectl get configmap golden-cluster-private-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true -[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. -kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 -[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. -kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 -[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. -kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 -[XPK] GKE commands done! Resources are created. 
-[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster-private/details?project=golden-project -[XPK] Exiting XPK cleanly +Traceback (most recent call last): + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in + from src.xpk.main import main + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in + from .parser.core import set_parser + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in + from .config import set_config_parsers + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in + from ..commands.config import get_config, set_config + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in + from ..core.config import XpkConfig + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in + import ruamel.yaml +ModuleNotFoundError: No module named 'ruamel' diff --git a/goldens/Cluster_create_with_gb200-4.txt b/goldens/Cluster_create_with_gb200-4.txt index b2cdfcb7d..93bed11bb 100644 --- a/goldens/Cluster_create_with_gb200-4.txt +++ b/goldens/Cluster_create_with_gb200-4.txt @@ -1,117 +1,15 @@ $ python3 xpk.py cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --device-type=gb200-4 --reservation=golden-reservation --dry-run -[XPK] Starting xpk -[XPK] Starting cluster create for cluster golden-cluster: -[XPK] Working on golden-project and us-central1-a -[XPK] Task: `Determine server supported GKE versions for default rapid gke version` is implemented by the following command not running since it is a dry run. 
-gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)" -[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run. -gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.validVersions)" -[XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run. -gcloud container clusters list --project=golden-project --region=us-central1 --format="csv[no-heading](name)" -[XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run. -gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --enable-dataplane-v2 --enable-multi-networking --no-enable-autoupgrade --enable-ip-alias -[XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run. -gcloud container clusters describe golden-cluster --project=golden-project --region=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)" -[XPK] Private Nodes is not enabled on the cluster. -[XPK] Cluster is public and no need to authorize networks. -[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster -[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. 
-gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default -[XPK] Testing credentials with kubectl... -[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. -kubectl get pods -[XPK] -[XPK] Credentials test succeeded. -[XPK] Finished get-credentials and kubectl setup. -[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system -[XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run. -kubectl get deployment coredns -n kube-system -[XPK] Now verifying CoreDNS readiness... -[XPK] Task: `Waiting for kubeDNS to be checked.` is implemented by the following command not running since it is a dry run. -kubectl get deployment kube-dns -n kube-system --ignore-not-found -[XPK] kube-dns deployment not found. -[XPK] Verifying if CoreDNS is available... -[XPK] Task: `Wait for coredns available` is implemented by the following command not running since it is a dry run. -kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-system --timeout=240s -[XPK] CoreDNS has successfully started and passed verification. -[XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'. -[XPK] Skipping CoreDNS deployment since it already exists. -[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. 
-gcloud beta container clusters describe golden-cluster --region us-central1 --project golden-project --format="value(currentMasterVersion)" -[XPK] Creating 1 node pool or pools of gb200-4 -We assume that the underlying system is: SystemCharacteristics(topology='1x72', vms_per_slice=1, gke_accelerator='nvidia-gb200', gce_machine_type='a4x-highgpu-4g', chips_per_vm=4, accelerator_type=2, device_type='gb200-4') -[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. -gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" -[XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. -gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a -[XPK] Creating 1 node pool with 2 nodes of gb200-4 -Underlyingly, we assume that means: SystemCharacteristics(topology='1x72', vms_per_slice=1, gke_accelerator='nvidia-gb200', gce_machine_type='a4x-highgpu-4g', chips_per_vm=4, accelerator_type=2, device_type='gb200-4') -[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. -gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --region=us-central1 --format="value(locations)" -[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. -kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true -[XPK] Existing node pool names ['0'] -[XPK] Task: `Retrieve resource policy` is implemented by the following command not running since it is a dry run. 
-gcloud compute resource-policies describe golden-cluster-placement-policy --project=golden-project --region=us-central1 -[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --region=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=a4x-highgpu-4g --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --placement-policy=golden-cluster-placement-policy --enable-gvnic --num-nodes=2 --accelerator type=nvidia-gb200,count=4,gpu-driver-version=latest --no-enable-autoupgrade --scopes="https://www.googleapis.com/auth/cloud-platform" -[XPK] Breaking up a total of 1 commands into 1 batches -[XPK] Pretending all the jobs succeeded -[XPK] Create or delete node pool request complete. -[XPK] Creating ConfigMap for cluster -[XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. -gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a -[XPK] Breaking up a total of 2 commands into 1 batches -[XPK] Pretending all the jobs succeeded -[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available -[XPK] Try 1: Install Jobset on golden-cluster -[XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run. -kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml --force-conflicts -[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. -kubectl get node --no-headers | wc -l -[XPK] Try 1: Updating jobset Controller Manager resources -[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. 
-kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 -[XPK] Try 1: Install PathwaysJob on golden-cluster -[XPK] Task: `Install PathwaysJob on golden-cluster` is implemented by the following command not running since it is a dry run. -kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.2/install.yaml -[XPK] Enabling Kueue on the cluster -[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run. -kubectl kueue version -[XPK] Try 1: Set Kueue On Cluster -[XPK] Task: `Set Kueue On Cluster` is implemented by the following command not running since it is a dry run. -kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml -[XPK] Wait for Kueue to be fully available -[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. -kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m -[XPK] Install Kueue Custom Resources -[XPK] Try 1: Applying Kueue Custom Resources -[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f 7aee1635a549cbab3308e64e5f973f49f1b09f0ea7c3633a60b69828be981fc5 -[XPK] Update Kueue Controller Manager resources -[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. -kubectl get node --no-headers | wc -l -[XPK] Try 1: Updating Kueue Controller Manager resources -[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run. 
-kubectl apply -f 012e1b15b6941e9d47cb2cdb35488d57c2f3ce0ef0b18093d2759f2e02ed81dc -[XPK] Verifying kjob installation -[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run. -kubectl-kjob help -[XPK] kjob found -[XPK] Applying kjob CDRs -[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run. -kubectl kjob printcrds | kubectl apply --server-side -f - -[XPK] Creating kjob CRDs succeeded -[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. -kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true -[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. -kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 -[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. -kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 -[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. -kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 -[XPK] Installing NCCL Plugin for cluster -[XPK] Task: `Install NCCL Plugin On Cluster` is implemented by the following command not running since it is a dry run. -kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml -[XPK] GKE commands done! Resources are created. 
-[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project -[XPK] Exiting XPK cleanly +Traceback (most recent call last): + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in + from src.xpk.main import main + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in + from .parser.core import set_parser + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in + from .config import set_config_parsers + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in + from ..commands.config import get_config, set_config + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in + from ..core.config import XpkConfig + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in + import ruamel.yaml +ModuleNotFoundError: No module named 'ruamel' diff --git a/goldens/Cluster_delete.txt b/goldens/Cluster_delete.txt index f50a2154c..137c0d093 100644 --- a/goldens/Cluster_delete.txt +++ b/goldens/Cluster_delete.txt @@ -1,17 +1,15 @@ $ python3 xpk.py cluster delete --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run -[XPK] Starting xpk -[XPK] Starting cluster delete for cluster: golden-cluster -[XPK] Working on golden-project and us-central1-a -[XPK] Try 1: get-credentials to cluster golden-cluster -[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. 
-gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default -[XPK] Get the name of the workloads in the cluster. -[XPK] Task: `List Jobs with filter-by-status=EVERYTHING` is implemented by the following command not running since it is a dry run. -kubectl get workloads --ignore-not-found -o=custom-columns="Jobset Name:.metadata.ownerReferences[0].name,Created Time:.metadata.creationTimestamp,Priority:.spec.priorityClassName,TPU VMs Needed:.spec.podSets[0].count,TPU VMs Running/Ran:.status.admission.podSetAssignments[-1].count,TPU VMs Done:.status.reclaimablePods[0].count,Status:.status.conditions[-1].type,Status Message:.status.conditions[-1].message,Status Time:.status.conditions[-1].lastTransitionTime" -[XPK] Task: `Cluster Delete` is implemented by the following command not running since it is a dry run. -gcloud beta container clusters delete golden-cluster --project=golden-project --region=us-central1 --quiet -[XPK] Task: `Get All Subnets` is implemented by the following command not running since it is a dry run. -gcloud compute networks subnets list --filter=name~"golden-cluster-us-central1-sub-*" --project=golden-project -[XPK] GKE commands done! Cluster golden-cluster deleted. 
- -[XPK] Exiting XPK cleanly +Traceback (most recent call last): + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in + from src.xpk.main import main + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in + from .parser.core import set_parser + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in + from .config import set_config_parsers + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in + from ..commands.config import get_config, set_config + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in + from ..core.config import XpkConfig + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in + import ruamel.yaml +ModuleNotFoundError: No module named 'ruamel' diff --git a/goldens/Cluster_delete_force.txt b/goldens/Cluster_delete_force.txt index 3bb64477a..6d57fc448 100644 --- a/goldens/Cluster_delete_force.txt +++ b/goldens/Cluster_delete_force.txt @@ -1,14 +1,15 @@ $ python3 xpk.py cluster delete --project=golden-project --zone=us-central1-a --cluster=golden-cluster --force --dry-run -[XPK] Starting xpk -[XPK] Starting cluster delete for cluster: golden-cluster -[XPK] Working on golden-project and us-central1-a -[XPK] Try 1: get-credentials to cluster golden-cluster -[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. -gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default -[XPK] Task: `Cluster Delete` is implemented by the following command not running since it is a dry run. 
-gcloud beta container clusters delete golden-cluster --project=golden-project --region=us-central1 --quiet -[XPK] Task: `Get All Subnets` is implemented by the following command not running since it is a dry run. -gcloud compute networks subnets list --filter=name~"golden-cluster-us-central1-sub-*" --project=golden-project -[XPK] GKE commands done! Cluster golden-cluster deleted. - -[XPK] Exiting XPK cleanly +Traceback (most recent call last): + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in + from src.xpk.main import main + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in + from .parser.core import set_parser + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in + from .config import set_config_parsers + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in + from ..commands.config import get_config, set_config + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in + from ..core.config import XpkConfig + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in + import ruamel.yaml +ModuleNotFoundError: No module named 'ruamel' diff --git a/goldens/Job_cancel.txt b/goldens/Job_cancel.txt index cdba7a64e..708874030 100644 --- a/goldens/Job_cancel.txt +++ b/goldens/Job_cancel.txt @@ -1,16 +1,15 @@ $ python3 xpk.py job cancel golden-job --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run -[XPK] Starting xpk -[XPK] Starting job cancel for job: ['golden-job'] -[XPK] Working on golden-project and us-central1-a -[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster -[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command 
not running since it is a dry run. -gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default -[XPK] Testing credentials with kubectl... -[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. -kubectl get pods -[XPK] -[XPK] Credentials test succeeded. -[XPK] Finished get-credentials and kubectl setup. -[XPK] Task: `delete job` is implemented by the following command not running since it is a dry run. -kubectl-kjob delete slurm golden-job -[XPK] Exiting XPK cleanly +Traceback (most recent call last): + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in + from src.xpk.main import main + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in + from .parser.core import set_parser + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in + from .config import set_config_parsers + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in + from ..commands.config import get_config, set_config + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in + from ..core.config import XpkConfig + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in + import ruamel.yaml +ModuleNotFoundError: No module named 'ruamel' diff --git a/goldens/Job_info.txt b/goldens/Job_info.txt index 82076c7ac..78a28f3d2 100644 --- a/goldens/Job_info.txt +++ b/goldens/Job_info.txt @@ -1,21 +1,15 @@ $ python3 xpk.py job info golden-job --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run -[XPK] Starting xpk -[XPK] Task: `Getting job data` is 
implemented by the following command not running since it is a dry run. -kubectl-kjob describe slurm golden-job -[XPK] Task: `Getting job info` is implemented by the following command not running since it is a dry run. -kubectl-kjob list slurm -o yaml --field-selector metadata.name==golden-job -[XPK] Task: `Getting pods list` is implemented by the following command not running since it is a dry run. -kubectl get pods -l=job-name=golden-job --no-headers -Job name: golden-job -Script name: echo hello -Profile: '' -Labels: - kjobctl.x-k8s.io/app-profile: default -Mounts: [] -Pods: -- Name: foo-pod - Status: Running -- Name: bar-pod - Status: Evicted -Entrypoint environment variables template: [] -[XPK] XPK Done. +Traceback (most recent call last): + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in + from src.xpk.main import main + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in + from .parser.core import set_parser + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in + from .config import set_config_parsers + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in + from ..commands.config import get_config, set_config + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in + from ..core.config import XpkConfig + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in + import ruamel.yaml +ModuleNotFoundError: No module named 'ruamel' diff --git a/goldens/Job_list.txt b/goldens/Job_list.txt index 715b711be..9ccffe617 100644 --- a/goldens/Job_list.txt +++ b/goldens/Job_list.txt @@ -1,16 +1,15 @@ $ python3 xpk.py job ls --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run -[XPK] Starting xpk 
-[XPK] Working on golden-project and us-central1-a -[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster -[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. -gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default -[XPK] Testing credentials with kubectl... -[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. -kubectl get pods -[XPK] -[XPK] Credentials test succeeded. -[XPK] Finished get-credentials and kubectl setup. -[XPK] Listing jobs for project golden-project and zone us-central1-a: -[XPK] Task: `list jobs` is implemented by the following command not running since it is a dry run. -kubectl-kjob list slurm --profile xpk-def-app-profile -[XPK] Exiting XPK cleanly +Traceback (most recent call last): + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in + from src.xpk.main import main + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in + from .parser.core import set_parser + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in + from .config import set_config_parsers + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in + from ..commands.config import get_config, set_config + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in + from ..core.config import XpkConfig + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in + import ruamel.yaml +ModuleNotFoundError: No module named 'ruamel' diff --git 
a/goldens/NAP_cluster-create.txt b/goldens/NAP_cluster-create.txt index 58a369314..6ab372f65 100644 --- a/goldens/NAP_cluster-create.txt +++ b/goldens/NAP_cluster-create.txt @@ -1,119 +1,15 @@ $ python3 xpk.py cluster create --project=golden-project --zone=us-central1-a --enable-autoprovisioning --cluster=golden-cluster --tpu-type=tpu7x-8 --on-demand --dry-run -[XPK] Starting xpk -[XPK] Starting cluster create for cluster golden-cluster: -[XPK] Working on golden-project and us-central1-a -[XPK] Task: `Determine server supported GKE versions for default rapid gke version` is implemented by the following command not running since it is a dry run. -gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)" -[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run. -gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.validVersions)" -[XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run. -gcloud container clusters list --project=golden-project --region=us-central1 --format="csv[no-heading](name)" -[XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run. 
-gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --location-policy=BALANCED --scopes=storage-full,gke-default -[XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run. -gcloud container clusters describe golden-cluster --project=golden-project --region=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)" -[XPK] Private Nodes is not enabled on the cluster. -[XPK] Cluster is public and no need to authorize networks. -[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster -[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. -gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default -[XPK] Testing credentials with kubectl... -[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. -kubectl get pods -[XPK] -[XPK] Credentials test succeeded. -[XPK] Finished get-credentials and kubectl setup. -[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system -[XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run. -kubectl get deployment coredns -n kube-system -[XPK] Now verifying CoreDNS readiness... -[XPK] Task: `Waiting for kubeDNS to be checked.` is implemented by the following command not running since it is a dry run. 
-kubectl get deployment kube-dns -n kube-system --ignore-not-found -[XPK] kube-dns deployment not found. -[XPK] Verifying if CoreDNS is available... -[XPK] Task: `Wait for coredns available` is implemented by the following command not running since it is a dry run. -kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-system --timeout=240s -[XPK] CoreDNS has successfully started and passed verification. -[XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'. -[XPK] Skipping CoreDNS deployment since it already exists. -[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. -gcloud beta container clusters describe golden-cluster --region us-central1 --project golden-project --format="value(currentMasterVersion)" -[XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') -[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. -gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" -[XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') -[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. -gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --region=us-central1 --format="value(locations)" -[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. 
-kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true -[XPK] Existing node pool names ['0'] -[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --region=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 -[XPK] Breaking up a total of 1 commands into 1 batches -[XPK] Pretending all the jobs succeeded -[XPK] Create or delete node pool request complete. -[XPK] Enabling Autoprovisioning -[XPK] Default Chips quota is minimum: 0, maximum: 4. -[XPK] Chips quota is minimum: 0, maximum: 4. XPK will autoprovision 4 chips based on incoming workload requests, keeping at least 0 available at all times, and maximum of 4. If the difference (4 chips) is small, rescaling will not work well. -[XPK] Task: `Update cluster with autoprovisioning enabled` is implemented by the following command not running since it is a dry run. -gcloud container clusters update golden-cluster --project=golden-project --region=us-central1 --enable-autoprovisioning --autoprovisioning-config-file 6062bfee91f21efca86f2c3261129f06b1896ad9b68d2ecdba9589bea9e15ddf -[XPK] Task: `Update cluster with autoscaling-profile` is implemented by the following command not running since it is a dry run. -gcloud container clusters update golden-cluster --project=golden-project --region=us-central1 --autoscaling-profile=optimize-utilization -[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. 
-gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" -[XPK] Breaking up a total of 0 commands into 0 batches -[XPK] Pretending all the jobs succeeded -[XPK] Creating ConfigMap for cluster -[XPK] Breaking up a total of 2 commands into 1 batches -[XPK] Pretending all the jobs succeeded -[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available -[XPK] Try 1: Install Jobset on golden-cluster -[XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run. -kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml --force-conflicts -[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. -kubectl get node --no-headers | wc -l -[XPK] Try 1: Updating jobset Controller Manager resources -[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 -[XPK] Try 1: Install PathwaysJob on golden-cluster -[XPK] Task: `Install PathwaysJob on golden-cluster` is implemented by the following command not running since it is a dry run. -kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.2/install.yaml -[XPK] Enabling Kueue on the cluster -[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run. -kubectl kueue version -[XPK] Try 1: Set Kueue On Cluster -[XPK] Task: `Set Kueue On Cluster` is implemented by the following command not running since it is a dry run. 
-kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml -[XPK] Wait for Kueue to be fully available -[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. -kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m -[XPK] Install Kueue Custom Resources -[XPK] Try 1: Applying Kueue Custom Resources -[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f eaa77bda2c85901c627ae9bb4baacdb37df006d6bf267b319b6bc8b2cbf7ca7e -[XPK] Update Kueue Controller Manager resources -[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. -kubectl get node --no-headers | wc -l -[XPK] Try 1: Updating Kueue Controller Manager resources -[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f 012e1b15b6941e9d47cb2cdb35488d57c2f3ce0ef0b18093d2759f2e02ed81dc -[XPK] Verifying kjob installation -[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run. -kubectl-kjob help -[XPK] kjob found -[XPK] Applying kjob CDRs -[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run. -kubectl kjob printcrds | kubectl apply --server-side -f - -[XPK] Creating kjob CRDs succeeded -[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. -kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true -[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. 
-kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 -[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. -kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 -[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. -kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 -[XPK] GKE commands done! Resources are created. -[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project -[XPK] Exiting XPK cleanly +Traceback (most recent call last): + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in + from src.xpk.main import main + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in + from .parser.core import set_parser + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in + from .config import set_config_parsers + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in + from ..commands.config import get_config, set_config + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in + from ..core.config import XpkConfig + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in + import ruamel.yaml +ModuleNotFoundError: No module named 'ruamel' diff --git a/goldens/NAP_cluster-create_with_pathways.txt b/goldens/NAP_cluster-create_with_pathways.txt index 94db15948..076047b72 100644 --- a/goldens/NAP_cluster-create_with_pathways.txt +++ b/goldens/NAP_cluster-create_with_pathways.txt @@ -1,120 +1,15 @@ $ python3 xpk.py cluster create-pathways 
--project=golden-project --zone=us-central1-a --enable-autoprovisioning --cluster=golden-cluster --tpu-type=tpu7x-8 --on-demand --dry-run -[XPK] Starting xpk -[XPK] Starting cluster create for cluster golden-cluster: -[XPK] Working on golden-project and us-central1-a -[XPK] Task: `Determine server supported GKE versions for default rapid gke version` is implemented by the following command not running since it is a dry run. -gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)" -[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run. -gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.validVersions)" -[XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run. -gcloud container clusters list --project=golden-project --region=us-central1 --format="csv[no-heading](name)" -[XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run. -gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --location-policy=BALANCED --scopes=storage-full,gke-default --enable-ip-alias -[XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run. 
-gcloud container clusters describe golden-cluster --project=golden-project --region=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)" -[XPK] Private Nodes is not enabled on the cluster. -[XPK] Cluster is public and no need to authorize networks. -[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster -[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. -gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default -[XPK] Testing credentials with kubectl... -[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. -kubectl get pods -[XPK] -[XPK] Credentials test succeeded. -[XPK] Finished get-credentials and kubectl setup. -[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system -[XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run. -kubectl get deployment coredns -n kube-system -[XPK] Now verifying CoreDNS readiness... -[XPK] Task: `Waiting for kubeDNS to be checked.` is implemented by the following command not running since it is a dry run. -kubectl get deployment kube-dns -n kube-system --ignore-not-found -[XPK] kube-dns deployment not found. -[XPK] Verifying if CoreDNS is available... -[XPK] Task: `Wait for coredns available` is implemented by the following command not running since it is a dry run. -kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-system --timeout=240s -[XPK] CoreDNS has successfully started and passed verification. -[XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'. -[XPK] Skipping CoreDNS deployment since it already exists. 
-[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. -gcloud beta container clusters describe golden-cluster --region us-central1 --project golden-project --format="value(currentMasterVersion)" -[XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') -[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. -gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" -[XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') -[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. -gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --region=us-central1 --format="value(locations)" -[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. 
-kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true -[XPK] Existing node pool names ['0'] -[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --region=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 -[XPK] To complete NodepoolCreate-cpu-np we are executing gcloud beta container node-pools create cpu-np --node-version=0 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --region=us-central1 --num-nodes=1 --machine-type=n2-standard-64 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --enable-autoscaling --min-nodes=1 --max-nodes=20 -[XPK] Breaking up a total of 2 commands into 1 batches -[XPK] Pretending all the jobs succeeded -[XPK] Create or delete node pool request complete. -[XPK] Enabling Autoprovisioning -[XPK] Default Chips quota is minimum: 0, maximum: 4. -[XPK] Chips quota is minimum: 0, maximum: 4. XPK will autoprovision 4 chips based on incoming workload requests, keeping at least 0 available at all times, and maximum of 4. If the difference (4 chips) is small, rescaling will not work well. -[XPK] Task: `Update cluster with autoprovisioning enabled` is implemented by the following command not running since it is a dry run. 
-gcloud container clusters update golden-cluster --project=golden-project --region=us-central1 --enable-autoprovisioning --autoprovisioning-config-file 6062bfee91f21efca86f2c3261129f06b1896ad9b68d2ecdba9589bea9e15ddf -[XPK] Task: `Update cluster with autoscaling-profile` is implemented by the following command not running since it is a dry run. -gcloud container clusters update golden-cluster --project=golden-project --region=us-central1 --autoscaling-profile=optimize-utilization -[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. -gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" -[XPK] Breaking up a total of 0 commands into 0 batches -[XPK] Pretending all the jobs succeeded -[XPK] Creating ConfigMap for cluster -[XPK] Breaking up a total of 2 commands into 1 batches -[XPK] Pretending all the jobs succeeded -[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available -[XPK] Try 1: Install Jobset on golden-cluster -[XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run. -kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml --force-conflicts -[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. -kubectl get node --no-headers | wc -l -[XPK] Try 1: Updating jobset Controller Manager resources -[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 -[XPK] Try 1: Install PathwaysJob on golden-cluster -[XPK] Task: `Install PathwaysJob on golden-cluster` is implemented by the following command not running since it is a dry run. 
-kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.2/install.yaml -[XPK] Enabling Kueue on the cluster -[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run. -kubectl kueue version -[XPK] Try 1: Set Kueue On Cluster -[XPK] Task: `Set Kueue On Cluster` is implemented by the following command not running since it is a dry run. -kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml -[XPK] Wait for Kueue to be fully available -[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. -kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m -[XPK] Install Kueue Custom Resources -[XPK] Try 1: Applying Kueue Custom Resources -[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f 7ffd24a656c1ec9c1d331862e352cefd5348637b0f776a8e3db888b04fa7fad6 -[XPK] Update Kueue Controller Manager resources -[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. -kubectl get node --no-headers | wc -l -[XPK] Try 1: Updating Kueue Controller Manager resources -[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run. -kubectl apply -f 012e1b15b6941e9d47cb2cdb35488d57c2f3ce0ef0b18093d2759f2e02ed81dc -[XPK] Verifying kjob installation -[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run. -kubectl-kjob help -[XPK] kjob found -[XPK] Applying kjob CDRs -[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run. 
-kubectl kjob printcrds | kubectl apply --server-side -f - -[XPK] Creating kjob CRDs succeeded -[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. -kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true -[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. -kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 -[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. -kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 -[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. -kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 -[XPK] GKE commands done! Resources are created. -[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project -[XPK] Exiting XPK cleanly +Traceback (most recent call last): + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in + from src.xpk.main import main + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in + from .parser.core import set_parser + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in + from .config import set_config_parsers + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in + from ..commands.config import get_config, set_config + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in + from ..core.config import XpkConfig + File 
"/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in + import ruamel.yaml +ModuleNotFoundError: No module named 'ruamel' diff --git a/goldens/Storage_list.txt b/goldens/Storage_list.txt index 81db59e91..dd33960aa 100644 --- a/goldens/Storage_list.txt +++ b/goldens/Storage_list.txt @@ -1,5 +1,15 @@ $ python3 xpk.py storage list --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run -[XPK] Starting xpk -NAME TYPE AUTO MOUNT MOUNT POINT READONLY MANIFEST ------- ------ ------------ ------------- ---------- ---------- -[XPK] XPK Done. +Traceback (most recent call last): + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in + from src.xpk.main import main + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in + from .parser.core import set_parser + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in + from .config import set_config_parsers + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in + from ..commands.config import get_config, set_config + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in + from ..core.config import XpkConfig + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in + import ruamel.yaml +ModuleNotFoundError: No module named 'ruamel' diff --git a/goldens/Workload_create.txt b/goldens/Workload_create.txt index 294810256..639e7c319 100644 --- a/goldens/Workload_create.txt +++ b/goldens/Workload_create.txt @@ -1,34 +1,15 @@ $ python3 xpk.py workload create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --workload=golden-workload --command "bash hello" --tpu-type=v5p-8 --num-slices=1 
--script-dir=/tmp --dry-run -[XPK] Starting xpk -[XPK] Task: `Check if Workload Already Exists` is implemented by the following command not running since it is a dry run. -kubectl get workloads -o=custom-columns='Jobset:.metadata.ownerReferences[0].name' -[XPK] Starting workload create -[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. -kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true -[XPK] Starting workload create -[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. -kubectl get configmap golden-cluster-metadata-configmap -o=custom-columns="ConfigData:data" --no-headers=true -[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. -kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true -[XPK] gke_accelerator type not found in config map: golden-cluster-resources-configmap. Autoprovisioning is not enabled. -[XPK] No gcsfuse Storages to add detected -[XPK] No gcp filestore instances to add detected. -[XPK] No gcp parallelstore instances to add detected. -[XPK] No gce persistent disk instances to add detected. -[XPK] No managed lustre instances to add detected. -[XPK] Building /tmp into docker image. -[XPK] Task: `Building script_dir into docker image` is implemented by the following command not running since it is a dry run. -docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94c652303f1dc0fecad085ea0993f688 -t dry-run-runner /tmp -[XPK] Adding Docker Image: gcr.io/golden-project/dry-run-runner:prefix-current to golden-project -[XPK] Task: `Tag Docker Image` is implemented by the following command not running since it is a dry run. 
-docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current -[XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. -docker push gcr.io/golden-project/dry-run-runner:prefix-current -[XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. -kubectl apply -f 635bfd38f34d48a6cc3863a2a2b00acfabe36ea1b6737e0cc816467a41fca144 -[XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. -gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error -[XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard. -[XPK] Follow your workload here: https://console.cloud.google.com/kubernetes/service/us-central1/golden-cluster/default/golden-workload/details?project=golden-project -[XPK] Follow your worker 0, slice 0 logs here: Adjust the pod name ([prefix]-slice-job-[slice_number]-[worker_number]) after clicking the url if you want other worker logs. 
https://console.cloud.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.project_id%3D%22golden-project%22%0Aresource.labels.location%3D%22us-central1%22%0Aresource.labels.cluster_name%3D%22golden-cluster%22%0Aresource.labels.namespace_name%3D%22default%22%0Aresource.labels.pod_name:%22golden-workload-slice-job-0-0-%22%20severity%3E%3DDEFAULT;storageScope=project;duration=P1D?e=13802955&mods=allow_workbench_image_override&project=golden-project -[XPK] Exiting XPK cleanly +Traceback (most recent call last): + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in + from src.xpk.main import main + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in + from .parser.core import set_parser + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in + from .config import set_config_parsers + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in + from ..commands.config import get_config, set_config + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in + from ..core.config import XpkConfig + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in + import ruamel.yaml +ModuleNotFoundError: No module named 'ruamel' diff --git a/goldens/Workload_create_pathways.txt b/goldens/Workload_create_pathways.txt index fdff927db..1175579e7 100644 --- a/goldens/Workload_create_pathways.txt +++ b/goldens/Workload_create_pathways.txt @@ -1,33 +1,15 @@ $ python3 xpk.py workload create-pathways --project=golden-project --zone=us-central1-a --cluster=golden-cluster --workload=golden-workload --command "bash hello" --tpu-type=v5p-8 --num-slices=1 --script-dir=/tmp --dry-run -[XPK] Starting xpk -[XPK] Task: `Check if 
Workload Already Exists` is implemented by the following command not running since it is a dry run. -kubectl get workloads -o=custom-columns='Jobset:.metadata.ownerReferences[0].name' -[XPK] Starting workload create -[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. -kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true -[XPK] Starting workload create -[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. -kubectl get configmap golden-cluster-metadata-configmap -o=custom-columns="ConfigData:data" --no-headers=true -[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. -kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true -[XPK] gke_accelerator type not found in config map: golden-cluster-resources-configmap. Autoprovisioning is not enabled. -[XPK] Task: `Check if PathwaysJob is installed on golden-cluster` is implemented by the following command not running since it is a dry run. -kubectl get pods -n pathways-job-system --no-headers -o custom-columns=NAME:.metadata.name -[XPK] check_if_pathways_job_is_installed 0 0 -[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. -gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" -[XPK] Building /tmp into docker image. -[XPK] Task: `Building script_dir into docker image` is implemented by the following command not running since it is a dry run. 
-docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94c652303f1dc0fecad085ea0993f688 -t dry-run-runner /tmp -[XPK] Adding Docker Image: gcr.io/golden-project/dry-run-runner:prefix-current to golden-project -[XPK] Task: `Tag Docker Image` is implemented by the following command not running since it is a dry run. -docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current -[XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. -docker push gcr.io/golden-project/dry-run-runner:prefix-current -[XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. -kubectl apply -f bfdb43fce214301b0be1d293cb623b61df6e14c376a0032cdc3273ed14f5a6f7 -[XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. -gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error -[XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard. 
-[XPK] Follow your Pathways workload and other resources here : https://console.cloud.google.com/logs/query;query=resource.type%3D"k8s_container"%0Aresource.labels.project_id%3D"golden-project"%0Aresource.labels.location%3D"us-central1"%0Aresource.labels.cluster_name%3D"golden-cluster"%0Aresource.labels.pod_name:"golden-workload-"%0Aseverity>%3DDEFAULT -[XPK] Exiting XPK cleanly +Traceback (most recent call last): + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in + from src.xpk.main import main + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in + from .parser.core import set_parser + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in + from .config import set_config_parsers + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in + from ..commands.config import get_config, set_config + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in + from ..core.config import XpkConfig + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in + import ruamel.yaml +ModuleNotFoundError: No module named 'ruamel' diff --git a/goldens/Workload_delete.txt b/goldens/Workload_delete.txt index 1a306226a..5f832a1c7 100644 --- a/goldens/Workload_delete.txt +++ b/goldens/Workload_delete.txt @@ -1,19 +1,15 @@ $ python3 xpk.py workload delete --project=golden-project --zone=us-central1-a --cluster=golden-cluster --workload=golden-workload --dry-run -[XPK] Starting xpk -[XPK] Starting Workload delete -[XPK] Working on golden-project and us-central1-a -[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster -[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not 
running since it is a dry run. -gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default -[XPK] Testing credentials with kubectl... -[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. -kubectl get pods -[XPK] -[XPK] Credentials test succeeded. -[XPK] Finished get-credentials and kubectl setup. -[XPK] Task: `Check if PathwaysJob is installed on golden-cluster` is implemented by the following command not running since it is a dry run. -kubectl get pods -n pathways-job-system --no-headers -o custom-columns=NAME:.metadata.name -[XPK] check_if_pathways_job_is_installed 0 0 -[XPK] Task: `Delete Workload` is implemented by the following command not running since it is a dry run. -kubectl delete pathwaysjob golden-workload -n default -[XPK] Exiting XPK cleanly +Traceback (most recent call last): + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in + from src.xpk.main import main + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in + from .parser.core import set_parser + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in + from .config import set_config_parsers + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in + from ..commands.config import get_config, set_config + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in + from ..core.config import XpkConfig + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in + import ruamel.yaml +ModuleNotFoundError: No module named 'ruamel' diff --git a/goldens/Workload_list.txt 
b/goldens/Workload_list.txt index 02b237805..a3f0d389f 100644 --- a/goldens/Workload_list.txt +++ b/goldens/Workload_list.txt @@ -1,19 +1,15 @@ $ python3 xpk.py workload list --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run -[XPK] Starting xpk -[XPK] Starting workload list -[XPK] Working on golden-project and us-central1-a -[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster -[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. -gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default -[XPK] Testing credentials with kubectl... -[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. -kubectl get pods -[XPK] -[XPK] Credentials test succeeded. -[XPK] Finished get-credentials and kubectl setup. -[XPK] Task: `List Jobs with filter-by-status=EVERYTHING with filter-by-job=None` is implemented by the following command not running since it is a dry run. 
-kubectl get workloads --ignore-not-found -o=custom-columns="Jobset Name:.metadata.ownerReferences[0].name,Created Time:.metadata.creationTimestamp,Priority:.spec.priorityClassName,TPU VMs Needed:.spec.podSets[0].count,TPU VMs Running/Ran:.status.admission.podSetAssignments[-1].count,TPU VMs Done:.status.reclaimablePods[0].count,Status:.status.conditions[-1].type,Status Message:.status.conditions[-1].message,Status Time:.status.conditions[-1].lastTransitionTime" -[XPK] Workload List Output: -0 -[XPK] See your workloads in Cloud Console: https://console.cloud.google.com/kubernetes/aiml/deployments/jobs?project=golden-project -[XPK] Exiting XPK cleanly +Traceback (most recent call last): + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in + from src.xpk.main import main + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in + from .parser.core import set_parser + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in + from .config import set_config_parsers + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in + from ..commands.config import get_config, set_config + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in + from ..core.config import XpkConfig + File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in + import ruamel.yaml +ModuleNotFoundError: No module named 'ruamel' From c59d437387709cf86c5b2caa2bec718acbe95bf3 Mon Sep 17 00:00:00 2001 From: DannyLi Date: Thu, 2 Oct 2025 07:53:58 +0000 Subject: [PATCH 14/15] Fixed ruamel --- goldens/Basic_cluster_create.txt | 121 +++++++++++++++-- goldens/Batch.txt | 34 +++-- goldens/Cluster_create_private.txt | 128 ++++++++++++++++-- goldens/Cluster_create_with_gb200-4.txt 
| 130 ++++++++++++++++-- goldens/Cluster_delete.txt | 30 +++-- goldens/Cluster_delete_force.txt | 27 ++-- goldens/Job_cancel.txt | 29 ++-- goldens/Job_info.txt | 34 +++-- goldens/Job_list.txt | 29 ++-- goldens/NAP_cluster-create.txt | 132 ++++++++++++++++-- goldens/NAP_cluster-create_with_pathways.txt | 133 +++++++++++++++++-- goldens/Storage_list.txt | 18 +-- goldens/Workload_create.txt | 47 +++++-- goldens/Workload_create_pathways.txt | 46 +++++-- goldens/Workload_delete.txt | 32 +++-- goldens/Workload_list.txt | 32 +++-- 16 files changed, 778 insertions(+), 224 deletions(-) diff --git a/goldens/Basic_cluster_create.txt b/goldens/Basic_cluster_create.txt index aa1ea933e..1062ae3aa 100644 --- a/goldens/Basic_cluster_create.txt +++ b/goldens/Basic_cluster_create.txt @@ -1,15 +1,108 @@ $ python3 xpk.py cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-8 --spot --dry-run -Traceback (most recent call last): - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in - from src.xpk.main import main - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in - from .parser.core import set_parser - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in - from .config import set_config_parsers - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in - from ..commands.config import get_config, set_config - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in - from ..core.config import XpkConfig - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in - import ruamel.yaml -ModuleNotFoundError: No module named 'ruamel' +[XPK] Starting xpk +[XPK] Starting cluster create for cluster 
golden-cluster: +[XPK] Working on golden-project and us-central1-a +[XPK] Task: `Determine server supported GKE versions for default rapid gke version` is implemented by the following command not running since it is a dry run. +gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)" +[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run. +gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.validVersions)" +[XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run. +gcloud container clusters list --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --location-policy=BALANCED --scopes=storage-full,gke-default +[XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run. +gcloud container clusters describe golden-cluster --project=golden-project --region=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)" +[XPK] Private Nodes is not enabled on the cluster. +[XPK] Cluster is public and no need to authorize networks. 
+[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster +[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Testing credentials with kubectl... +[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. +kubectl get pods +[XPK] 0 +[XPK] Credentials test succeeded. +[XPK] Finished get-credentials and kubectl setup. +[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system +[XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run. +kubectl get deployment coredns -n kube-system +[XPK] Now verifying CoreDNS readiness... +[XPK] Task: `Waiting for kubeDNS to be checked.` is implemented by the following command not running since it is a dry run. +kubectl get deployment kube-dns -n kube-system --ignore-not-found +[XPK] kube-dns deployment not found. +[XPK] Verifying if CoreDNS is available... +[XPK] Task: `Wait for coredns available` is implemented by the following command not running since it is a dry run. +kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-system --timeout=240s +[XPK] CoreDNS has successfully started and passed verification. +[XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'. +[XPK] Skipping CoreDNS deployment since it already exists. +[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. 
+gcloud beta container clusters describe golden-cluster --region us-central1 --project golden-project --format="value(currentMasterVersion)" +[XPK] Creating 1 node pool or pools of tpu7x-8 +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Creating 1 node pool or pools of tpu7x-8 +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --region=us-central1 --format="value(locations)" +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. 
+kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Existing node pool names ['0'] +[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --region=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --spot --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 +[XPK] Breaking up a total of 1 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Create or delete node pool request complete. +[XPK] Creating ConfigMap for cluster +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available +[XPK] Try 1: Install Jobset on golden-cluster +[XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating jobset Controller Manager resources +[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 +[XPK] Try 1: Install PathwaysJob on golden-cluster +[XPK] Task: `Install PathwaysJob on golden-cluster` is implemented by the following command not running since it is a dry run. 
+kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.2/install.yaml +[XPK] Enabling Kueue on the cluster +[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run. +kubectl kueue version +[XPK] Try 1: Set Kueue On Cluster +[XPK] Task: `Set Kueue On Cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml +[XPK] Wait for Kueue to be fully available +[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. +kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m +[XPK] Install Kueue Custom Resources +[XPK] Try 1: Applying Kueue Custom Resources +[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f c49da377b542c14a80a64a13236f8d3a1c8e022dc7c82cc6f6f0560d980ee9e7 +[XPK] Update Kueue Controller Manager resources +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating Kueue Controller Manager resources +[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 012e1b15b6941e9d47cb2cdb35488d57c2f3ce0ef0b18093d2759f2e02ed81dc +[XPK] Verifying kjob installation +[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run. +kubectl-kjob help +[XPK] kjob found +[XPK] Applying kjob CDRs +[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run. 
+kubectl kjob printcrds | kubectl apply --server-side -f - +[XPK] Creating kjob CRDs succeeded +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 +[XPK] GKE commands done! Resources are created. +[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project +[XPK] Exiting XPK cleanly diff --git a/goldens/Batch.txt b/goldens/Batch.txt index ca9a95161..4852edd0c 100644 --- a/goldens/Batch.txt +++ b/goldens/Batch.txt @@ -1,15 +1,21 @@ $ python3 xpk.py batch --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run batch-read.sh -Traceback (most recent call last): - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in - from src.xpk.main import main - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in - from .parser.core import set_parser - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in - from .config import set_config_parsers - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in - 
from ..commands.config import get_config, set_config - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in - from ..core.config import XpkConfig - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in - import ruamel.yaml -ModuleNotFoundError: No module named 'ruamel' +[XPK] Starting xpk +[XPK] Working on golden-project and us-central1-a +[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster +[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Testing credentials with kubectl... +[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. +kubectl get pods +[XPK] 0 +[XPK] Credentials test succeeded. +[XPK] Finished get-credentials and kubectl setup. +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-metadata-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `submit job` is implemented by the following command not running since it is a dry run. 
+kubectl kjob create slurm --profile xpk-def-app-profile --localqueue multislice-queue --worker-container xpk-batch-container --first-node-ip --pod-template-annotation kueue.x-k8s.io/podset-preferred-topology=cloud.google.com/gce-topology-host -- batch-read.sh --partition multislice-queue +[XPK] XPK Done. diff --git a/goldens/Cluster_create_private.txt b/goldens/Cluster_create_private.txt index c4dcaada9..8adaa48e5 100644 --- a/goldens/Cluster_create_private.txt +++ b/goldens/Cluster_create_private.txt @@ -1,15 +1,115 @@ $ python3 xpk.py cluster create-pathways --project=golden-project --zone=us-central1-a --cluster=golden-cluster-private --private --tpu-type=v5p-8 --num-slices=1 --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation=golden-reservation --dry-run -Traceback (most recent call last): - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in - from src.xpk.main import main - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in - from .parser.core import set_parser - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in - from .config import set_config_parsers - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in - from ..commands.config import get_config, set_config - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in - from ..core.config import XpkConfig - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in - import ruamel.yaml -ModuleNotFoundError: No module named 'ruamel' +[XPK] Starting xpk +[XPK] Starting cluster create for cluster golden-cluster-private: +[XPK] Working on golden-project and us-central1-a +[XPK] Task: `Determine server supported GKE versions for 
default rapid gke version` is implemented by the following command not running since it is a dry run. +gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)" +[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run. +gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.validVersions)" +[XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run. +gcloud container clusters list --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters create golden-cluster-private --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=n1-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 4 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --enable-master-authorized-networks --enable-private-nodes --location-policy=BALANCED --scopes=storage-full,gke-default --enable-ip-alias +[XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run. +gcloud container clusters describe golden-cluster-private --project=golden-project --region=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)" +[XPK] Private Nodes is not enabled on the cluster. +[XPK] Task: `Fetching the list of authorized network from cluster describe.` is implemented by the following command not running since it is a dry run. 
+gcloud container clusters describe golden-cluster-private --project=golden-project --region=us-central1 --format="value(masterAuthorizedNetworksConfig.cidrBlocks[].cidrBlock)" +[XPK] Current machine's IP address is already authorized. +[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster-private +[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster-private` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster-private --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Testing credentials with kubectl... +[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. +kubectl get pods +[XPK] 0 +[XPK] Credentials test succeeded. +[XPK] Finished get-credentials and kubectl setup. +[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system +[XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run. +kubectl get deployment coredns -n kube-system +[XPK] Now verifying CoreDNS readiness... +[XPK] Task: `Waiting for kubeDNS to be checked.` is implemented by the following command not running since it is a dry run. +kubectl get deployment kube-dns -n kube-system --ignore-not-found +[XPK] kube-dns deployment not found. +[XPK] Verifying if CoreDNS is available... +[XPK] Task: `Wait for coredns available` is implemented by the following command not running since it is a dry run. +kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-system --timeout=240s +[XPK] CoreDNS has successfully started and passed verification. +[XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'. +[XPK] Skipping CoreDNS deployment since it already exists. 
+[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters describe golden-cluster-private --region us-central1 --project golden-project --format="value(currentMasterVersion)" +[XPK] Creating 1 node pool or pools of v5p-8 +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=1, device_type='v5p-8') +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster-private --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. +gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a +[XPK] Creating 1 node pool or pools of v5p-8 +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=1, device_type='v5p-8') +[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools describe 0 --cluster golden-cluster-private --project=golden-project --region=us-central1 --format="value(locations)" +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. 
+kubectl get configmap golden-cluster-private-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Existing node pool names ['0'] +[XPK] To complete NodepoolCreate-golden-cluster-private-np-0 we are executing gcloud beta container node-pools create golden-cluster-private-np-0 --region=us-central1 --cluster=golden-cluster-private --project=golden-project --node-locations=us-central1-a --machine-type=ct5p-hightpu-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 +[XPK] To complete NodepoolCreate-cpu-np we are executing gcloud beta container node-pools create cpu-np --node-version=0 --cluster=golden-cluster-private --project=golden-project --node-locations=us-central1-a --region=us-central1 --num-nodes=1 --machine-type=n2-standard-64 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --enable-autoscaling --min-nodes=1 --max-nodes=20 +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Create or delete node pool request complete. +[XPK] Creating ConfigMap for cluster +[XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. +gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available +[XPK] Try 1: Install Jobset on golden-cluster-private +[XPK] Task: `Install Jobset on golden-cluster-private` is implemented by the following command not running since it is a dry run. 
+kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating jobset Controller Manager resources +[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 +[XPK] Try 1: Install PathwaysJob on golden-cluster-private +[XPK] Task: `Install PathwaysJob on golden-cluster-private` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.2/install.yaml +[XPK] Enabling Kueue on the cluster +[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run. +kubectl kueue version +[XPK] Try 1: Set Kueue On Cluster +[XPK] Task: `Set Kueue On Cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml +[XPK] Wait for Kueue to be fully available +[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. +kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m +[XPK] Install Kueue Custom Resources +[XPK] Try 1: Applying Kueue Custom Resources +[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. 
+kubectl apply -f ec56970df5766f33e470374e087b3061d9960c171fce12fdb2d75170eb75fe55 +[XPK] Update Kueue Controller Manager resources +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating Kueue Controller Manager resources +[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 012e1b15b6941e9d47cb2cdb35488d57c2f3ce0ef0b18093d2759f2e02ed81dc +[XPK] Verifying kjob installation +[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run. +kubectl-kjob help +[XPK] kjob found +[XPK] Applying kjob CRDs +[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run. +kubectl kjob printcrds | kubectl apply --server-side -f - +[XPK] Creating kjob CRDs succeeded +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-private-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 +[XPK] GKE commands done! Resources are created. 
+[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster-private/details?project=golden-project +[XPK] Exiting XPK cleanly diff --git a/goldens/Cluster_create_with_gb200-4.txt b/goldens/Cluster_create_with_gb200-4.txt index 93bed11bb..9df1f3ef7 100644 --- a/goldens/Cluster_create_with_gb200-4.txt +++ b/goldens/Cluster_create_with_gb200-4.txt @@ -1,15 +1,117 @@ $ python3 xpk.py cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --device-type=gb200-4 --reservation=golden-reservation --dry-run -Traceback (most recent call last): - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in - from src.xpk.main import main - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in - from .parser.core import set_parser - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in - from .config import set_config_parsers - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in - from ..commands.config import get_config, set_config - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in - from ..core.config import XpkConfig - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in - import ruamel.yaml -ModuleNotFoundError: No module named 'ruamel' +[XPK] Starting xpk +[XPK] Starting cluster create for cluster golden-cluster: +[XPK] Working on golden-project and us-central1-a +[XPK] Task: `Determine server supported GKE versions for default rapid gke version` is implemented by the following command not running since it is a dry run. 
+gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)" +[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run. +gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.validVersions)" +[XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run. +gcloud container clusters list --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --enable-dataplane-v2 --enable-multi-networking --no-enable-autoupgrade --enable-ip-alias +[XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run. +gcloud container clusters describe golden-cluster --project=golden-project --region=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)" +[XPK] Private Nodes is not enabled on the cluster. +[XPK] Cluster is public and no need to authorize networks. +[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster +[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. 
+gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Testing credentials with kubectl... +[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. +kubectl get pods +[XPK] 0 +[XPK] Credentials test succeeded. +[XPK] Finished get-credentials and kubectl setup. +[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system +[XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run. +kubectl get deployment coredns -n kube-system +[XPK] Now verifying CoreDNS readiness... +[XPK] Task: `Waiting for kubeDNS to be checked.` is implemented by the following command not running since it is a dry run. +kubectl get deployment kube-dns -n kube-system --ignore-not-found +[XPK] kube-dns deployment not found. +[XPK] Verifying if CoreDNS is available... +[XPK] Task: `Wait for coredns available` is implemented by the following command not running since it is a dry run. +kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-system --timeout=240s +[XPK] CoreDNS has successfully started and passed verification. +[XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'. +[XPK] Skipping CoreDNS deployment since it already exists. +[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. 
+gcloud beta container clusters describe golden-cluster --region us-central1 --project golden-project --format="value(currentMasterVersion)" +[XPK] Creating 1 node pool or pools of gb200-4 +We assume that the underlying system is: SystemCharacteristics(topology='1x72', vms_per_slice=1, gke_accelerator='nvidia-gb200', gce_machine_type='a4x-highgpu-4g', chips_per_vm=4, accelerator_type=2, device_type='gb200-4') +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. +gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a +[XPK] Creating 1 node pool with 2 nodes of gb200-4 +Underlyingly, we assume that means: SystemCharacteristics(topology='1x72', vms_per_slice=1, gke_accelerator='nvidia-gb200', gce_machine_type='a4x-highgpu-4g', chips_per_vm=4, accelerator_type=2, device_type='gb200-4') +[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --region=us-central1 --format="value(locations)" +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Existing node pool names ['0'] +[XPK] Task: `Retrieve resource policy` is implemented by the following command not running since it is a dry run. 
+gcloud compute resource-policies describe golden-cluster-placement-policy --project=golden-project --region=us-central1 +[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --region=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=a4x-highgpu-4g --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --placement-policy=golden-cluster-placement-policy --enable-gvnic --num-nodes=2 --accelerator type=nvidia-gb200,count=4,gpu-driver-version=latest --no-enable-autoupgrade --scopes="https://www.googleapis.com/auth/cloud-platform" +[XPK] Breaking up a total of 1 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Create or delete node pool request complete. +[XPK] Creating ConfigMap for cluster +[XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. +gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available +[XPK] Try 1: Install Jobset on golden-cluster +[XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating jobset Controller Manager resources +[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. 
+kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 +[XPK] Try 1: Install PathwaysJob on golden-cluster +[XPK] Task: `Install PathwaysJob on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.2/install.yaml +[XPK] Enabling Kueue on the cluster +[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run. +kubectl kueue version +[XPK] Try 1: Set Kueue On Cluster +[XPK] Task: `Set Kueue On Cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml +[XPK] Wait for Kueue to be fully available +[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. +kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m +[XPK] Install Kueue Custom Resources +[XPK] Try 1: Applying Kueue Custom Resources +[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 7aee1635a549cbab3308e64e5f973f49f1b09f0ea7c3633a60b69828be981fc5 +[XPK] Update Kueue Controller Manager resources +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating Kueue Controller Manager resources +[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run. 
+kubectl apply -f 012e1b15b6941e9d47cb2cdb35488d57c2f3ce0ef0b18093d2759f2e02ed81dc +[XPK] Verifying kjob installation +[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run. +kubectl-kjob help +[XPK] kjob found +[XPK] Applying kjob CRDs +[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run. +kubectl kjob printcrds | kubectl apply --server-side -f - +[XPK] Creating kjob CRDs succeeded +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 +[XPK] Installing NCCL Plugin for cluster +[XPK] Task: `Install NCCL Plugin On Cluster` is implemented by the following command not running since it is a dry run. +kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml +[XPK] GKE commands done! Resources are created. 
+[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project +[XPK] Exiting XPK cleanly diff --git a/goldens/Cluster_delete.txt b/goldens/Cluster_delete.txt index 137c0d093..f50a2154c 100644 --- a/goldens/Cluster_delete.txt +++ b/goldens/Cluster_delete.txt @@ -1,15 +1,17 @@ $ python3 xpk.py cluster delete --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run -Traceback (most recent call last): - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in - from src.xpk.main import main - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in - from .parser.core import set_parser - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in - from .config import set_config_parsers - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in - from ..commands.config import get_config, set_config - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in - from ..core.config import XpkConfig - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in - import ruamel.yaml -ModuleNotFoundError: No module named 'ruamel' +[XPK] Starting xpk +[XPK] Starting cluster delete for cluster: golden-cluster +[XPK] Working on golden-project and us-central1-a +[XPK] Try 1: get-credentials to cluster golden-cluster +[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. 
+gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Get the name of the workloads in the cluster. +[XPK] Task: `List Jobs with filter-by-status=EVERYTHING` is implemented by the following command not running since it is a dry run. +kubectl get workloads --ignore-not-found -o=custom-columns="Jobset Name:.metadata.ownerReferences[0].name,Created Time:.metadata.creationTimestamp,Priority:.spec.priorityClassName,TPU VMs Needed:.spec.podSets[0].count,TPU VMs Running/Ran:.status.admission.podSetAssignments[-1].count,TPU VMs Done:.status.reclaimablePods[0].count,Status:.status.conditions[-1].type,Status Message:.status.conditions[-1].message,Status Time:.status.conditions[-1].lastTransitionTime" +[XPK] Task: `Cluster Delete` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters delete golden-cluster --project=golden-project --region=us-central1 --quiet +[XPK] Task: `Get All Subnets` is implemented by the following command not running since it is a dry run. +gcloud compute networks subnets list --filter=name~"golden-cluster-us-central1-sub-*" --project=golden-project +[XPK] GKE commands done! Cluster golden-cluster deleted. 
+ +[XPK] Exiting XPK cleanly diff --git a/goldens/Cluster_delete_force.txt b/goldens/Cluster_delete_force.txt index 6d57fc448..3bb64477a 100644 --- a/goldens/Cluster_delete_force.txt +++ b/goldens/Cluster_delete_force.txt @@ -1,15 +1,14 @@ $ python3 xpk.py cluster delete --project=golden-project --zone=us-central1-a --cluster=golden-cluster --force --dry-run -Traceback (most recent call last): - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in - from src.xpk.main import main - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in - from .parser.core import set_parser - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in - from .config import set_config_parsers - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in - from ..commands.config import get_config, set_config - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in - from ..core.config import XpkConfig - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in - import ruamel.yaml -ModuleNotFoundError: No module named 'ruamel' +[XPK] Starting xpk +[XPK] Starting cluster delete for cluster: golden-cluster +[XPK] Working on golden-project and us-central1-a +[XPK] Try 1: get-credentials to cluster golden-cluster +[XPK] Task: `get-credentials to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Task: `Cluster Delete` is implemented by the following command not running since it is a dry run. 
+gcloud beta container clusters delete golden-cluster --project=golden-project --region=us-central1 --quiet +[XPK] Task: `Get All Subnets` is implemented by the following command not running since it is a dry run. +gcloud compute networks subnets list --filter=name~"golden-cluster-us-central1-sub-*" --project=golden-project +[XPK] GKE commands done! Cluster golden-cluster deleted. + +[XPK] Exiting XPK cleanly diff --git a/goldens/Job_cancel.txt b/goldens/Job_cancel.txt index 708874030..9153f379b 100644 --- a/goldens/Job_cancel.txt +++ b/goldens/Job_cancel.txt @@ -1,15 +1,16 @@ $ python3 xpk.py job cancel golden-job --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run -Traceback (most recent call last): - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in - from src.xpk.main import main - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in - from .parser.core import set_parser - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in - from .config import set_config_parsers - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in - from ..commands.config import get_config, set_config - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in - from ..core.config import XpkConfig - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in - import ruamel.yaml -ModuleNotFoundError: No module named 'ruamel' +[XPK] Starting xpk +[XPK] Starting job cancel for job: ['golden-job'] +[XPK] Working on golden-project and us-central1-a +[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster +[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command 
not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Testing credentials with kubectl... +[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. +kubectl get pods +[XPK] 0 +[XPK] Credentials test succeeded. +[XPK] Finished get-credentials and kubectl setup. +[XPK] Task: `delete job` is implemented by the following command not running since it is a dry run. +kubectl-kjob delete slurm golden-job +[XPK] Exiting XPK cleanly diff --git a/goldens/Job_info.txt b/goldens/Job_info.txt index 78a28f3d2..82076c7ac 100644 --- a/goldens/Job_info.txt +++ b/goldens/Job_info.txt @@ -1,15 +1,21 @@ $ python3 xpk.py job info golden-job --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run -Traceback (most recent call last): - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in - from src.xpk.main import main - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in - from .parser.core import set_parser - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in - from .config import set_config_parsers - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in - from ..commands.config import get_config, set_config - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in - from ..core.config import XpkConfig - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in - import ruamel.yaml -ModuleNotFoundError: No module named 'ruamel' +[XPK] Starting xpk +[XPK] Task: `Getting job data` is 
implemented by the following command not running since it is a dry run. +kubectl-kjob describe slurm golden-job +[XPK] Task: `Getting job info` is implemented by the following command not running since it is a dry run. +kubectl-kjob list slurm -o yaml --field-selector metadata.name==golden-job +[XPK] Task: `Getting pods list` is implemented by the following command not running since it is a dry run. +kubectl get pods -l=job-name=golden-job --no-headers +Job name: golden-job +Script name: echo hello +Profile: '' +Labels: + kjobctl.x-k8s.io/app-profile: default +Mounts: [] +Pods: +- Name: foo-pod + Status: Running +- Name: bar-pod + Status: Evicted +Entrypoint environment variables template: [] +[XPK] XPK Done. diff --git a/goldens/Job_list.txt b/goldens/Job_list.txt index 9ccffe617..e3a27e964 100644 --- a/goldens/Job_list.txt +++ b/goldens/Job_list.txt @@ -1,15 +1,16 @@ $ python3 xpk.py job ls --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run -Traceback (most recent call last): - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in - from src.xpk.main import main - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in - from .parser.core import set_parser - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in - from .config import set_config_parsers - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in - from ..commands.config import get_config, set_config - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in - from ..core.config import XpkConfig - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in - import ruamel.yaml -ModuleNotFoundError: No module named 'ruamel' +[XPK] Starting xpk 
+[XPK] Working on golden-project and us-central1-a +[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster +[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Testing credentials with kubectl... +[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. +kubectl get pods +[XPK] 0 +[XPK] Credentials test succeeded. +[XPK] Finished get-credentials and kubectl setup. +[XPK] Listing jobs for project golden-project and zone us-central1-a: +[XPK] Task: `list jobs` is implemented by the following command not running since it is a dry run. +kubectl-kjob list slurm --profile xpk-def-app-profile +[XPK] Exiting XPK cleanly diff --git a/goldens/NAP_cluster-create.txt b/goldens/NAP_cluster-create.txt index 6ab372f65..7daa87828 100644 --- a/goldens/NAP_cluster-create.txt +++ b/goldens/NAP_cluster-create.txt @@ -1,15 +1,119 @@ $ python3 xpk.py cluster create --project=golden-project --zone=us-central1-a --enable-autoprovisioning --cluster=golden-cluster --tpu-type=tpu7x-8 --on-demand --dry-run -Traceback (most recent call last): - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in - from src.xpk.main import main - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in - from .parser.core import set_parser - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in - from .config import set_config_parsers - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in - from ..commands.config import get_config, 
set_config - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in - from ..core.config import XpkConfig - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in - import ruamel.yaml -ModuleNotFoundError: No module named 'ruamel' +[XPK] Starting xpk +[XPK] Starting cluster create for cluster golden-cluster: +[XPK] Working on golden-project and us-central1-a +[XPK] Task: `Determine server supported GKE versions for default rapid gke version` is implemented by the following command not running since it is a dry run. +gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)" +[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run. +gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.validVersions)" +[XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run. +gcloud container clusters list --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run. 
+gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --location-policy=BALANCED --scopes=storage-full,gke-default +[XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run. +gcloud container clusters describe golden-cluster --project=golden-project --region=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)" +[XPK] Private Nodes is not enabled on the cluster. +[XPK] Cluster is public and no need to authorize networks. +[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster +[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Testing credentials with kubectl... +[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. +kubectl get pods +[XPK] 0 +[XPK] Credentials test succeeded. +[XPK] Finished get-credentials and kubectl setup. +[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system +[XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run. +kubectl get deployment coredns -n kube-system +[XPK] Now verifying CoreDNS readiness... +[XPK] Task: `Waiting for kubeDNS to be checked.` is implemented by the following command not running since it is a dry run. 
+kubectl get deployment kube-dns -n kube-system --ignore-not-found +[XPK] kube-dns deployment not found. +[XPK] Verifying if CoreDNS is available... +[XPK] Task: `Wait for coredns available` is implemented by the following command not running since it is a dry run. +kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-system --timeout=240s +[XPK] CoreDNS has successfully started and passed verification. +[XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'. +[XPK] Skipping CoreDNS deployment since it already exists. +[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters describe golden-cluster --region us-central1 --project golden-project --format="value(currentMasterVersion)" +[XPK] Creating 1 node pool or pools of tpu7x-8 +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Creating 1 node pool or pools of tpu7x-8 +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --region=us-central1 --format="value(locations)" +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. 
+kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Existing node pool names ['0'] +[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --region=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 +[XPK] Breaking up a total of 1 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Create or delete node pool request complete. +[XPK] Enabling Autoprovisioning +[XPK] Default Chips quota is minimum: 0, maximum: 4. +[XPK] Chips quota is minimum: 0, maximum: 4. XPK will autoprovision 4 chips based on incoming workload requests, keeping at least 0 available at all times, and maximum of 4. If the difference (4 chips) is small, rescaling will not work well. +[XPK] Task: `Update cluster with autoprovisioning enabled` is implemented by the following command not running since it is a dry run. +gcloud container clusters update golden-cluster --project=golden-project --region=us-central1 --enable-autoprovisioning --autoprovisioning-config-file 6062bfee91f21efca86f2c3261129f06b1896ad9b68d2ecdba9589bea9e15ddf +[XPK] Task: `Update cluster with autoscaling-profile` is implemented by the following command not running since it is a dry run. +gcloud container clusters update golden-cluster --project=golden-project --region=us-central1 --autoscaling-profile=optimize-utilization +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. 
+gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Breaking up a total of 0 commands into 0 batches +[XPK] Pretending all the jobs succeeded +[XPK] Creating ConfigMap for cluster +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available +[XPK] Try 1: Install Jobset on golden-cluster +[XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating jobset Controller Manager resources +[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 +[XPK] Try 1: Install PathwaysJob on golden-cluster +[XPK] Task: `Install PathwaysJob on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.2/install.yaml +[XPK] Enabling Kueue on the cluster +[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run. +kubectl kueue version +[XPK] Try 1: Set Kueue On Cluster +[XPK] Task: `Set Kueue On Cluster` is implemented by the following command not running since it is a dry run. 
+kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml +[XPK] Wait for Kueue to be fully available +[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. +kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m +[XPK] Install Kueue Custom Resources +[XPK] Try 1: Applying Kueue Custom Resources +[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f eaa77bda2c85901c627ae9bb4baacdb37df006d6bf267b319b6bc8b2cbf7ca7e +[XPK] Update Kueue Controller Manager resources +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating Kueue Controller Manager resources +[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 012e1b15b6941e9d47cb2cdb35488d57c2f3ce0ef0b18093d2759f2e02ed81dc +[XPK] Verifying kjob installation +[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run. +kubectl-kjob help +[XPK] kjob found +[XPK] Applying kjob CDRs +[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run. +kubectl kjob printcrds | kubectl apply --server-side -f - +[XPK] Creating kjob CRDs succeeded +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. 
+kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 +[XPK] GKE commands done! Resources are created. +[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project +[XPK] Exiting XPK cleanly diff --git a/goldens/NAP_cluster-create_with_pathways.txt b/goldens/NAP_cluster-create_with_pathways.txt index 076047b72..27536e3ad 100644 --- a/goldens/NAP_cluster-create_with_pathways.txt +++ b/goldens/NAP_cluster-create_with_pathways.txt @@ -1,15 +1,120 @@ $ python3 xpk.py cluster create-pathways --project=golden-project --zone=us-central1-a --enable-autoprovisioning --cluster=golden-cluster --tpu-type=tpu7x-8 --on-demand --dry-run -Traceback (most recent call last): - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in - from src.xpk.main import main - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in - from .parser.core import set_parser - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in - from .config import set_config_parsers - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in - from ..commands.config import get_config, set_config - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in - from ..core.config import XpkConfig - File 
"/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in - import ruamel.yaml -ModuleNotFoundError: No module named 'ruamel' +[XPK] Starting xpk +[XPK] Starting cluster create for cluster golden-cluster: +[XPK] Working on golden-project and us-central1-a +[XPK] Task: `Determine server supported GKE versions for default rapid gke version` is implemented by the following command not running since it is a dry run. +gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)" +[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run. +gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.validVersions)" +[XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run. +gcloud container clusters list --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --location-policy=BALANCED --scopes=storage-full,gke-default --enable-ip-alias +[XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run. 
+gcloud container clusters describe golden-cluster --project=golden-project --region=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)" +[XPK] Private Nodes is not enabled on the cluster. +[XPK] Cluster is public and no need to authorize networks. +[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster +[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Testing credentials with kubectl... +[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. +kubectl get pods +[XPK] 0 +[XPK] Credentials test succeeded. +[XPK] Finished get-credentials and kubectl setup. +[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system +[XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run. +kubectl get deployment coredns -n kube-system +[XPK] Now verifying CoreDNS readiness... +[XPK] Task: `Waiting for kubeDNS to be checked.` is implemented by the following command not running since it is a dry run. +kubectl get deployment kube-dns -n kube-system --ignore-not-found +[XPK] kube-dns deployment not found. +[XPK] Verifying if CoreDNS is available... +[XPK] Task: `Wait for coredns available` is implemented by the following command not running since it is a dry run. +kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-system --timeout=240s +[XPK] CoreDNS has successfully started and passed verification. +[XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'. +[XPK] Skipping CoreDNS deployment since it already exists. 
+[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. +gcloud beta container clusters describe golden-cluster --region us-central1 --project golden-project --format="value(currentMasterVersion)" +[XPK] Creating 1 node pool or pools of tpu7x-8 +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Creating 1 node pool or pools of tpu7x-8 +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=1, device_type='tpu7x-8') +[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --region=us-central1 --format="value(locations)" +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. 
+kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Existing node pool names ['0'] +[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --region=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --max-pods-per-node 15 --tpu-topology=2x2x1 +[XPK] To complete NodepoolCreate-cpu-np we are executing gcloud beta container node-pools create cpu-np --node-version=0 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --region=us-central1 --num-nodes=1 --machine-type=n2-standard-64 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --enable-autoscaling --min-nodes=1 --max-nodes=20 +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Create or delete node pool request complete. +[XPK] Enabling Autoprovisioning +[XPK] Default Chips quota is minimum: 0, maximum: 4. +[XPK] Chips quota is minimum: 0, maximum: 4. XPK will autoprovision 4 chips based on incoming workload requests, keeping at least 0 available at all times, and maximum of 4. If the difference (4 chips) is small, rescaling will not work well. +[XPK] Task: `Update cluster with autoprovisioning enabled` is implemented by the following command not running since it is a dry run. 
+gcloud container clusters update golden-cluster --project=golden-project --region=us-central1 --enable-autoprovisioning --autoprovisioning-config-file 6062bfee91f21efca86f2c3261129f06b1896ad9b68d2ecdba9589bea9e15ddf +[XPK] Task: `Update cluster with autoscaling-profile` is implemented by the following command not running since it is a dry run. +gcloud container clusters update golden-cluster --project=golden-project --region=us-central1 --autoscaling-profile=optimize-utilization +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Breaking up a total of 0 commands into 0 batches +[XPK] Pretending all the jobs succeeded +[XPK] Creating ConfigMap for cluster +[XPK] Breaking up a total of 2 commands into 1 batches +[XPK] Pretending all the jobs succeeded +[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available +[XPK] Try 1: Install Jobset on golden-cluster +[XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating jobset Controller Manager resources +[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 +[XPK] Try 1: Install PathwaysJob on golden-cluster +[XPK] Task: `Install PathwaysJob on golden-cluster` is implemented by the following command not running since it is a dry run. 
+kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.2/install.yaml +[XPK] Enabling Kueue on the cluster +[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run. +kubectl kueue version +[XPK] Try 1: Set Kueue On Cluster +[XPK] Task: `Set Kueue On Cluster` is implemented by the following command not running since it is a dry run. +kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.12.2/manifests.yaml +[XPK] Wait for Kueue to be fully available +[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. +kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m +[XPK] Install Kueue Custom Resources +[XPK] Try 1: Applying Kueue Custom Resources +[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 7ffd24a656c1ec9c1d331862e352cefd5348637b0f776a8e3db888b04fa7fad6 +[XPK] Update Kueue Controller Manager resources +[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. +kubectl get node --no-headers | wc -l +[XPK] Try 1: Updating Kueue Controller Manager resources +[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run. +kubectl apply -f 012e1b15b6941e9d47cb2cdb35488d57c2f3ce0ef0b18093d2759f2e02ed81dc +[XPK] Verifying kjob installation +[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run. +kubectl-kjob help +[XPK] kjob found +[XPK] Applying kjob CDRs +[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run. 
+kubectl kjob printcrds | kubectl apply --server-side -f - +[XPK] Creating kjob CRDs succeeded +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. +kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. +kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 +[XPK] GKE commands done! Resources are created. +[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project +[XPK] Exiting XPK cleanly diff --git a/goldens/Storage_list.txt b/goldens/Storage_list.txt index dd33960aa..81db59e91 100644 --- a/goldens/Storage_list.txt +++ b/goldens/Storage_list.txt @@ -1,15 +1,5 @@ $ python3 xpk.py storage list --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run -Traceback (most recent call last): - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in - from src.xpk.main import main - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in - from .parser.core import set_parser - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in - from .config import set_config_parsers - File 
"/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in - from ..commands.config import get_config, set_config - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in - from ..core.config import XpkConfig - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in - import ruamel.yaml -ModuleNotFoundError: No module named 'ruamel' +[XPK] Starting xpk +NAME TYPE AUTO MOUNT MOUNT POINT READONLY MANIFEST +------ ------ ------------ ------------- ---------- ---------- +[XPK] XPK Done. diff --git a/goldens/Workload_create.txt b/goldens/Workload_create.txt index 639e7c319..294810256 100644 --- a/goldens/Workload_create.txt +++ b/goldens/Workload_create.txt @@ -1,15 +1,34 @@ $ python3 xpk.py workload create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --workload=golden-workload --command "bash hello" --tpu-type=v5p-8 --num-slices=1 --script-dir=/tmp --dry-run -Traceback (most recent call last): - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in - from src.xpk.main import main - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in - from .parser.core import set_parser - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in - from .config import set_config_parsers - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in - from ..commands.config import get_config, set_config - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in - from ..core.config import XpkConfig - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in - 
import ruamel.yaml -ModuleNotFoundError: No module named 'ruamel' +[XPK] Starting xpk +[XPK] Task: `Check if Workload Already Exists` is implemented by the following command not running since it is a dry run. +kubectl get workloads -o=custom-columns='Jobset:.metadata.ownerReferences[0].name' +[XPK] Starting workload create +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Starting workload create +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-metadata-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] gke_accelerator type not found in config map: golden-cluster-resources-configmap. Autoprovisioning is not enabled. +[XPK] No gcsfuse Storages to add detected +[XPK] No gcp filestore instances to add detected. +[XPK] No gcp parallelstore instances to add detected. +[XPK] No gce persistent disk instances to add detected. +[XPK] No managed lustre instances to add detected. +[XPK] Building /tmp into docker image. +[XPK] Task: `Building script_dir into docker image` is implemented by the following command not running since it is a dry run. +docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94c652303f1dc0fecad085ea0993f688 -t dry-run-runner /tmp +[XPK] Adding Docker Image: gcr.io/golden-project/dry-run-runner:prefix-current to golden-project +[XPK] Task: `Tag Docker Image` is implemented by the following command not running since it is a dry run. 
+docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current +[XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. +docker push gcr.io/golden-project/dry-run-runner:prefix-current +[XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. +kubectl apply -f 635bfd38f34d48a6cc3863a2a2b00acfabe36ea1b6737e0cc816467a41fca144 +[XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. +gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error +[XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard. +[XPK] Follow your workload here: https://console.cloud.google.com/kubernetes/service/us-central1/golden-cluster/default/golden-workload/details?project=golden-project +[XPK] Follow your worker 0, slice 0 logs here: Adjust the pod name ([prefix]-slice-job-[slice_number]-[worker_number]) after clicking the url if you want other worker logs. 
https://console.cloud.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.project_id%3D%22golden-project%22%0Aresource.labels.location%3D%22us-central1%22%0Aresource.labels.cluster_name%3D%22golden-cluster%22%0Aresource.labels.namespace_name%3D%22default%22%0Aresource.labels.pod_name:%22golden-workload-slice-job-0-0-%22%20severity%3E%3DDEFAULT;storageScope=project;duration=P1D?e=13802955&mods=allow_workbench_image_override&project=golden-project +[XPK] Exiting XPK cleanly diff --git a/goldens/Workload_create_pathways.txt b/goldens/Workload_create_pathways.txt index 1175579e7..fdff927db 100644 --- a/goldens/Workload_create_pathways.txt +++ b/goldens/Workload_create_pathways.txt @@ -1,15 +1,33 @@ $ python3 xpk.py workload create-pathways --project=golden-project --zone=us-central1-a --cluster=golden-cluster --workload=golden-workload --command "bash hello" --tpu-type=v5p-8 --num-slices=1 --script-dir=/tmp --dry-run -Traceback (most recent call last): - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in - from src.xpk.main import main - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in - from .parser.core import set_parser - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in - from .config import set_config_parsers - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in - from ..commands.config import get_config, set_config - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in - from ..core.config import XpkConfig - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in - import ruamel.yaml -ModuleNotFoundError: No module named 'ruamel' +[XPK] Starting xpk +[XPK] Task: `Check if 
Workload Already Exists` is implemented by the following command not running since it is a dry run. +kubectl get workloads -o=custom-columns='Jobset:.metadata.ownerReferences[0].name' +[XPK] Starting workload create +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Starting workload create +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-metadata-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. +kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] gke_accelerator type not found in config map: golden-cluster-resources-configmap. Autoprovisioning is not enabled. +[XPK] Task: `Check if PathwaysJob is installed on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl get pods -n pathways-job-system --no-headers -o custom-columns=NAME:.metadata.name +[XPK] check_if_pathways_job_is_installed 0 0 +[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. +gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --region=us-central1 --format="csv[no-heading](name)" +[XPK] Building /tmp into docker image. +[XPK] Task: `Building script_dir into docker image` is implemented by the following command not running since it is a dry run. 
+docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94c652303f1dc0fecad085ea0993f688 -t dry-run-runner /tmp +[XPK] Adding Docker Image: gcr.io/golden-project/dry-run-runner:prefix-current to golden-project +[XPK] Task: `Tag Docker Image` is implemented by the following command not running since it is a dry run. +docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current +[XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. +docker push gcr.io/golden-project/dry-run-runner:prefix-current +[XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. +kubectl apply -f bfdb43fce214301b0be1d293cb623b61df6e14c376a0032cdc3273ed14f5a6f7 +[XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. +gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error +[XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard. 
+[XPK] Follow your Pathways workload and other resources here : https://console.cloud.google.com/logs/query;query=resource.type%3D"k8s_container"%0Aresource.labels.project_id%3D"golden-project"%0Aresource.labels.location%3D"us-central1"%0Aresource.labels.cluster_name%3D"golden-cluster"%0Aresource.labels.pod_name:"golden-workload-"%0Aseverity>%3DDEFAULT +[XPK] Exiting XPK cleanly diff --git a/goldens/Workload_delete.txt b/goldens/Workload_delete.txt index 5f832a1c7..722ca5bf8 100644 --- a/goldens/Workload_delete.txt +++ b/goldens/Workload_delete.txt @@ -1,15 +1,19 @@ $ python3 xpk.py workload delete --project=golden-project --zone=us-central1-a --cluster=golden-cluster --workload=golden-workload --dry-run -Traceback (most recent call last): - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in - from src.xpk.main import main - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in - from .parser.core import set_parser - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in - from .config import set_config_parsers - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in - from ..commands.config import get_config, set_config - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in - from ..core.config import XpkConfig - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in - import ruamel.yaml -ModuleNotFoundError: No module named 'ruamel' +[XPK] Starting xpk +[XPK] Starting Workload delete +[XPK] Working on golden-project and us-central1-a +[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster +[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not 
running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Testing credentials with kubectl... +[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. +kubectl get pods +[XPK] 0 +[XPK] Credentials test succeeded. +[XPK] Finished get-credentials and kubectl setup. +[XPK] Task: `Check if PathwaysJob is installed on golden-cluster` is implemented by the following command not running since it is a dry run. +kubectl get pods -n pathways-job-system --no-headers -o custom-columns=NAME:.metadata.name +[XPK] check_if_pathways_job_is_installed 0 0 +[XPK] Task: `Delete Workload` is implemented by the following command not running since it is a dry run. +kubectl delete pathwaysjob golden-workload -n default +[XPK] Exiting XPK cleanly diff --git a/goldens/Workload_list.txt b/goldens/Workload_list.txt index a3f0d389f..1ec8782e8 100644 --- a/goldens/Workload_list.txt +++ b/goldens/Workload_list.txt @@ -1,15 +1,19 @@ $ python3 xpk.py workload list --project=golden-project --zone=us-central1-a --cluster=golden-cluster --dry-run -Traceback (most recent call last): - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/xpk.py", line 34, in - from src.xpk.main import main - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/main.py", line 37, in - from .parser.core import set_parser - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/core.py", line 19, in - from .config import set_config_parsers - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/parser/config.py", line 17, in - from ..commands.config import get_config, set_config - File 
"/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/commands/config.py", line 17, in - from ..core.config import XpkConfig - File "/usr/local/google/home/lidanny/Desktop/Project/xpk-config-test/fork/xpk/src/xpk/core/config.py", line 19, in - import ruamel.yaml -ModuleNotFoundError: No module named 'ruamel' +[XPK] Starting xpk +[XPK] Starting workload list +[XPK] Working on golden-project and us-central1-a +[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster +[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run. +gcloud container clusters get-credentials golden-cluster --region=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default +[XPK] Testing credentials with kubectl... +[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. +kubectl get pods +[XPK] 0 +[XPK] Credentials test succeeded. +[XPK] Finished get-credentials and kubectl setup. +[XPK] Task: `List Jobs with filter-by-status=EVERYTHING with filter-by-job=None` is implemented by the following command not running since it is a dry run. 
+kubectl get workloads --ignore-not-found -o=custom-columns="Jobset Name:.metadata.ownerReferences[0].name,Created Time:.metadata.creationTimestamp,Priority:.spec.priorityClassName,TPU VMs Needed:.spec.podSets[0].count,TPU VMs Running/Ran:.status.admission.podSetAssignments[-1].count,TPU VMs Done:.status.reclaimablePods[0].count,Status:.status.conditions[-1].type,Status Message:.status.conditions[-1].message,Status Time:.status.conditions[-1].lastTransitionTime" +[XPK] Workload List Output: +0 +[XPK] See your workloads in Cloud Console: https://console.cloud.google.com/kubernetes/aiml/deployments/jobs?project=golden-project +[XPK] Exiting XPK cleanly From fe71f6e270e0b9418804c6c3b22db2b8de5278b8 Mon Sep 17 00:00:00 2001 From: DannyLi Date: Thu, 2 Oct 2025 08:23:01 +0000 Subject: [PATCH 15/15] Remove xpk_print(kubectl_output) --- goldens/Basic_cluster_create.txt | 1 - goldens/Batch.txt | 1 - goldens/Cluster_create_private.txt | 1 - goldens/Cluster_create_with_gb200-4.txt | 1 - goldens/Job_cancel.txt | 1 - goldens/Job_list.txt | 1 - goldens/NAP_cluster-create.txt | 1 - goldens/NAP_cluster-create_with_pathways.txt | 1 - goldens/Workload_delete.txt | 1 - goldens/Workload_list.txt | 1 - src/xpk/core/cluster.py | 1 - 11 files changed, 11 deletions(-) diff --git a/goldens/Basic_cluster_create.txt b/goldens/Basic_cluster_create.txt index 1062ae3aa..a3a62c57c 100644 --- a/goldens/Basic_cluster_create.txt +++ b/goldens/Basic_cluster_create.txt @@ -20,7 +20,6 @@ gcloud container clusters get-credentials golden-cluster --region=us-central1 -- [XPK] Testing credentials with kubectl... [XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. kubectl get pods -[XPK] 0 [XPK] Credentials test succeeded. [XPK] Finished get-credentials and kubectl setup. 
[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system diff --git a/goldens/Batch.txt b/goldens/Batch.txt index 4852edd0c..a7f1b5d15 100644 --- a/goldens/Batch.txt +++ b/goldens/Batch.txt @@ -7,7 +7,6 @@ gcloud container clusters get-credentials golden-cluster --region=us-central1 -- [XPK] Testing credentials with kubectl... [XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. kubectl get pods -[XPK] 0 [XPK] Credentials test succeeded. [XPK] Finished get-credentials and kubectl setup. [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_private.txt b/goldens/Cluster_create_private.txt index 8adaa48e5..3a74d4047 100644 --- a/goldens/Cluster_create_private.txt +++ b/goldens/Cluster_create_private.txt @@ -22,7 +22,6 @@ gcloud container clusters get-credentials golden-cluster-private --region=us-cen [XPK] Testing credentials with kubectl... [XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. kubectl get pods -[XPK] 0 [XPK] Credentials test succeeded. [XPK] Finished get-credentials and kubectl setup. [XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system diff --git a/goldens/Cluster_create_with_gb200-4.txt b/goldens/Cluster_create_with_gb200-4.txt index 9df1f3ef7..e1fc53681 100644 --- a/goldens/Cluster_create_with_gb200-4.txt +++ b/goldens/Cluster_create_with_gb200-4.txt @@ -20,7 +20,6 @@ gcloud container clusters get-credentials golden-cluster --region=us-central1 -- [XPK] Testing credentials with kubectl... [XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. kubectl get pods -[XPK] 0 [XPK] Credentials test succeeded. [XPK] Finished get-credentials and kubectl setup. 
[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system diff --git a/goldens/Job_cancel.txt b/goldens/Job_cancel.txt index 9153f379b..60118f0a8 100644 --- a/goldens/Job_cancel.txt +++ b/goldens/Job_cancel.txt @@ -8,7 +8,6 @@ gcloud container clusters get-credentials golden-cluster --region=us-central1 -- [XPK] Testing credentials with kubectl... [XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. kubectl get pods -[XPK] 0 [XPK] Credentials test succeeded. [XPK] Finished get-credentials and kubectl setup. [XPK] Task: `delete job` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Job_list.txt b/goldens/Job_list.txt index e3a27e964..8b1c00c49 100644 --- a/goldens/Job_list.txt +++ b/goldens/Job_list.txt @@ -7,7 +7,6 @@ gcloud container clusters get-credentials golden-cluster --region=us-central1 -- [XPK] Testing credentials with kubectl... [XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. kubectl get pods -[XPK] 0 [XPK] Credentials test succeeded. [XPK] Finished get-credentials and kubectl setup. [XPK] Listing jobs for project golden-project and zone us-central1-a: diff --git a/goldens/NAP_cluster-create.txt b/goldens/NAP_cluster-create.txt index 7daa87828..3f29115f5 100644 --- a/goldens/NAP_cluster-create.txt +++ b/goldens/NAP_cluster-create.txt @@ -20,7 +20,6 @@ gcloud container clusters get-credentials golden-cluster --region=us-central1 -- [XPK] Testing credentials with kubectl... [XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. kubectl get pods -[XPK] 0 [XPK] Credentials test succeeded. [XPK] Finished get-credentials and kubectl setup. 
[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system diff --git a/goldens/NAP_cluster-create_with_pathways.txt b/goldens/NAP_cluster-create_with_pathways.txt index 27536e3ad..9b83bba68 100644 --- a/goldens/NAP_cluster-create_with_pathways.txt +++ b/goldens/NAP_cluster-create_with_pathways.txt @@ -20,7 +20,6 @@ gcloud container clusters get-credentials golden-cluster --region=us-central1 -- [XPK] Testing credentials with kubectl... [XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. kubectl get pods -[XPK] 0 [XPK] Credentials test succeeded. [XPK] Finished get-credentials and kubectl setup. [XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system diff --git a/goldens/Workload_delete.txt b/goldens/Workload_delete.txt index 722ca5bf8..632beab4e 100644 --- a/goldens/Workload_delete.txt +++ b/goldens/Workload_delete.txt @@ -8,7 +8,6 @@ gcloud container clusters get-credentials golden-cluster --region=us-central1 -- [XPK] Testing credentials with kubectl... [XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. kubectl get pods -[XPK] 0 [XPK] Credentials test succeeded. [XPK] Finished get-credentials and kubectl setup. [XPK] Task: `Check if PathwaysJob is installed on golden-cluster` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Workload_list.txt b/goldens/Workload_list.txt index 1ec8782e8..10abaebd4 100644 --- a/goldens/Workload_list.txt +++ b/goldens/Workload_list.txt @@ -8,7 +8,6 @@ gcloud container clusters get-credentials golden-cluster --region=us-central1 -- [XPK] Testing credentials with kubectl... [XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run. kubectl get pods -[XPK] 0 [XPK] Credentials test succeeded. [XPK] Finished get-credentials and kubectl setup. 
[XPK] Task: `List Jobs with filter-by-status=EVERYTHING with filter-by-job=None` is implemented by the following command not running since it is a dry run. diff --git a/src/xpk/core/cluster.py b/src/xpk/core/cluster.py index 72008fc1f..524100fbd 100644 --- a/src/xpk/core/cluster.py +++ b/src/xpk/core/cluster.py @@ -882,7 +882,6 @@ def test_and_retry_credentials_with_dns_logic(args) -> int: kubectl_return_code, kubectl_output = run_command_for_value( kubectl_command, 'kubectl get pods' ) - xpk_print(kubectl_output) if kubectl_return_code == 0: xpk_print('Credentials test succeeded.') return 0