From e72aa34c7c306a3f5d0a9282012f747760d121c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Paw=C5=82owski?= Date: Mon, 3 Mar 2025 08:16:58 +0000 Subject: [PATCH 01/41] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Piotr Pawłowski --- src/xpk/core/kjob.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/xpk/core/kjob.py b/src/xpk/core/kjob.py index 713c72828..de5d51017 100644 --- a/src/xpk/core/kjob.py +++ b/src/xpk/core/kjob.py @@ -225,7 +225,7 @@ def create_job_template_instance( args=args, ) - +# this may be moved to shell command def create_pod_template_instance(args: Namespace) -> int: """Create new PodTemplate instance on cluster with default settings. @@ -239,6 +239,7 @@ def create_pod_template_instance(args: Namespace) -> int: if pod_image is None or len(pod_image) == 0: pod_image = PodTemplateDefaults.IMAGE.value working_directory = config.get(KJOB_SHELL_WORKING_DIRECTORY) + xpk_print('working directory is: ', working_directory) if working_directory is None or len(working_directory) == 0: working_directory = PodTemplateDefaults.WORKING_DIRECTORY.value @@ -263,7 +264,7 @@ def prepare_kjob(args) -> int: job_err_code = create_job_template_instance(args, system) if job_err_code > 0: return job_err_code - + xpk_print("Creating pod template instance") pod_err_code = create_pod_template_instance(args) if pod_err_code > 0: return pod_err_code From de9f7a15bc0bff091c0197a5d50ef9dc8d0ea9cf Mon Sep 17 00:00:00 2001 From: Danny LI Date: Mon, 16 Jun 2025 02:42:02 +0000 Subject: [PATCH 02/41] feat: Added an update to CoreDNS, and when python3 xpk/xpk.py cluster create-pathways is used, it will default to CoreDNS. --- src/xpk/commands/cluster.py | 130 +++++++++++++++++++++++++++++++++++- 1 file changed, 129 insertions(+), 1 deletion(-) diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index 83eb2b07b..c363b5758 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -75,7 +75,7 @@ from ..utils.file import write_tmp_file from . import cluster_gcluster from .common import set_cluster_command - +import os def cluster_adapt(args) -> None: """Function that performs cluster adaptation. @@ -696,6 +696,131 @@ def cluster_create_ray_cluster(args) -> None: args.enable_autoprovisioning = False cluster_create(args) +def update_coredns(args): + + home_dir = os.path.expanduser("~") + coredns_repo_dir_name = "deployment" + coredns_repo_full_path = os.path.join(home_dir, coredns_repo_dir_name) + coredns_k8s_path = os.path.join(coredns_repo_full_path, "kubernetes") + + command_jq_install = ( + f'sudo apt install jq -y' + ) + return_code = run_command_with_updates( + command_jq_install, 'Install jq', args + ) + if return_code != 0: + xpk_print(f'[XPK] Install jq error {return_code}') + xpk_exit(return_code) + + # Check if the target directory already exists to avoid errors caused by duplicate cloning + if os.path.exists(coredns_repo_full_path): + xpk_print(f"[XPK] Directory '{coredns_repo_full_path}' already exists, skip git clone.") + else: + command_git_clone = ( + f'git clone https://github.com/coredns/deployment.git {coredns_repo_full_path}' + ) + xpk_print(f"[XPK] Task: 'Clone deployment ' in progress, Target directory:{coredns_repo_full_path}.") + return_code = run_command_with_updates( + command_git_clone, 'Clone deployment', args + ) + if return_code != 0: + xpk_print(f'[XPK] Clone deployment error {return_code}') + xpk_exit(return_code) + + + cluster_name = os.environ.get('CLUSTER_NAME') + region = os.environ.get('REGION') + project = os.environ.get('PROJECT') + + if not all([cluster_name, region, project]): + missing_vars = [] + if not cluster_name: missing_vars.append('CLUSTER_NAME') + if not region: missing_vars.append('REGION') + if not project: missing_vars.append('PROJECT') + xpk_print(f"[XPK] Error: Missing required environment variable:{', '.join(missing_vars)}. Please ensure these variables are set.") + xpk_exit(1) + + command_get_credentials = ( + f'gcloud container clusters get-credentials {cluster_name} --dns-endpoint ' + f'--region={region} --project={project} && kubectl config view ' + f'&& kubectl config set-context --current --namespace=default' + ) + xpk_print(f"[XPK] Task: 'Get cluster credentials' in progress.") + return_code = run_command_with_updates( + command_get_credentials, 'Get cluster credentials', args + ) + if return_code != 0: + xpk_print(f'[XPK] Failed to get cluster credentials {return_code}') + xpk_exit(return_code) + + if not os.path.isdir(coredns_k8s_path): + xpk_print(f"[XPK] Error:CoreDNS Kubernetes path '{coredns_k8s_path}' does not exist. Has git clone been successful?") + xpk_exit(1) + + # Remember the current directory so that you can restore it later + original_cwd = os.getcwd() + try: + # Change the current working directory to the path of the CoreDNS deployment + os.chdir(coredns_k8s_path) + + command_deploy_coredns = ( + f'./deploy.sh | kubectl apply -f -' + ) + xpk_print(f"[XPK] Task: 'Deploy CoreDNS' in progress, Located at '{coredns_k8s_path}'") + + return_code = run_command_with_updates( + command_deploy_coredns, 'Deploy CoreDNS', args + ) + if return_code != 0: + xpk_print(f'[XPK] Deploy CoreDNS error {return_code}') + xpk_exit(return_code) + + finally: + # Whether it succeeds or fails, always restore to the original directory + os.chdir(original_cwd) + + # Scale down kube-dns-autoscaler + command_autoscaler_scale_down = ( + 'kubectl scale deployment kube-dns-autoscaler --replicas=0 --namespace=kube-system' + ) + xpk_print(f"[XPK] Task: 'Scaling down kube-dns-autoscaler' in progress") + return_code = run_command_with_updates( + command_autoscaler_scale_down, 'Scale down kube-dns-autoscaler', args + ) + if return_code != 0: + xpk_print(f'[XPK] Scale down kube-dns-autoscaler error {return_code}') + xpk_exit(return_code) + xpk_print("\n[XPK] kube-dns-autoscaler has been scaled down.") + + # Scale down kube-dns (or CoreDNS) + command_dns_scale_down = ( + 'kubectl scale deployment kube-dns --replicas=0 --namespace=kube-system' + ) + xpk_print(f"[XPK] Task: 'Scaling down kube-dns' in progress") + return_code = run_command_with_updates( + command_dns_scale_down, 'Scale down kube-dns', args + ) + if return_code != 0: + xpk_print(f'[XPK] Scale down kube-dns error {return_code}') + xpk_exit(return_code) + xpk_print("\n[XPK] kube-dns has been scaled down.") + + command_coredns_scale = ( + 'kubectl scale deployment coredns --replicas=15 -n kube-system' + ) + xpk_print(f"[XPK] Task: 'Scale CoreDNS' in progress") + return_code = run_command_with_updates( + command_coredns_scale, 'Scale CoreDNS', args + ) + if return_code != 0: + xpk_print(f'[XPK] Scale CoreDNS error {return_code}') + xpk_exit(return_code) + + xpk_print("\n[XPK] The CoreDNS setup process has been completed.") + + return 0 + def create_cluster_if_necessary( args, gke_control_plane_version: str, system: SystemCharacteristics @@ -890,6 +1015,9 @@ def run_gke_cluster_create_command( if return_code != 0: xpk_print(f'GKE Cluster Create request returned ERROR {return_code}') return 1 + else: + if args.enable_pathways == True: + update_coredns(args) return 0 From 0d5729aa7a922efa2ea3ef5c8412a92ddde6f2dc Mon Sep 17 00:00:00 2001 From: DannyLi Date: Tue, 17 Jun 2025 10:21:49 +0800 Subject: [PATCH 03/41] Update cluster.py --- src/xpk/commands/cluster.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index c363b5758..ce170661f 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -75,6 +75,7 @@ from ..utils.file import write_tmp_file from . import cluster_gcluster from .common import set_cluster_command + import os def cluster_adapt(args) -> None: From 7392e316f1c1165c42b656df00d8ca61056b59f7 Mon Sep 17 00:00:00 2001 From: DannyLi Date: Tue, 17 Jun 2025 14:58:52 +0800 Subject: [PATCH 04/41] Update cluster.py --- src/xpk/commands/cluster.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index ce170661f..4b764bcb5 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -698,7 +698,22 @@ def cluster_create_ray_cluster(args) -> None: cluster_create(args) def update_coredns(args): + """Updates and deploys CoreDNS within a cluster. + This function performs the following steps: + 1. Installs 'jq'. + 2. Clones the CoreDNS deployment repository from GitHub if it doesn't already exist. + 3. Retrieves Google Kubernetes Engine (GKE) cluster credentials. + 4. Deploys CoreDNS to the cluster. + 5. Scales down the 'kube-dns-autoscaler' and 'kube-dns' deployments. + 6. Scales up the 'coredns' deployment to 15 replicas. + + Args: + args: user provided arguments for running the command. + + Returns: + 0 if successful and 1 otherwise. + """ home_dir = os.path.expanduser("~") coredns_repo_dir_name = "deployment" coredns_repo_full_path = os.path.join(home_dir, coredns_repo_dir_name) From 1c57afe6f69884ae7f7a3887ac387a09e8384bb6 Mon Sep 17 00:00:00 2001 From: DannyLi Date: Wed, 18 Jun 2025 09:00:07 +0000 Subject: [PATCH 05/41] feat: Remaining: Verify CoreDNS startup and add 'update_coredns_if_necessary' function. --- src/xpk/commands/cluster.py | 104 +++++++++++++++++++++++++++--------- 1 file changed, 78 insertions(+), 26 deletions(-) diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index 4b764bcb5..4422735ea 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -75,8 +75,9 @@ from ..utils.file import write_tmp_file from . import cluster_gcluster from .common import set_cluster_command - +import shutil import os +import subprocess def cluster_adapt(args) -> None: """Function that performs cluster adaptation. @@ -714,20 +715,20 @@ def update_coredns(args): Returns: 0 if successful and 1 otherwise. """ - home_dir = os.path.expanduser("~") + coredns_repo_dir = os.path.expanduser("/tmp/") coredns_repo_dir_name = "deployment" - coredns_repo_full_path = os.path.join(home_dir, coredns_repo_dir_name) + coredns_repo_full_path = os.path.join(coredns_repo_dir, coredns_repo_dir_name) coredns_k8s_path = os.path.join(coredns_repo_full_path, "kubernetes") command_jq_install = ( f'sudo apt install jq -y' ) return_code = run_command_with_updates( - command_jq_install, 'Install jq', args - ) + command_jq_install, 'Install jq', args + ) if return_code != 0: - xpk_print(f'[XPK] Install jq error {return_code}') - xpk_exit(return_code) + xpk_print(f'[XPK] Install jq error {return_code}') + xpk_exit(return_code) # Check if the target directory already exists to avoid errors caused by duplicate cloning if os.path.exists(coredns_repo_full_path): @@ -744,25 +745,13 @@ def update_coredns(args): xpk_print(f'[XPK] Clone deployment error {return_code}') xpk_exit(return_code) - - cluster_name = os.environ.get('CLUSTER_NAME') - region = os.environ.get('REGION') - project = os.environ.get('PROJECT') - - if not all([cluster_name, region, project]): - missing_vars = [] - if not cluster_name: missing_vars.append('CLUSTER_NAME') - if not region: missing_vars.append('REGION') - if not project: missing_vars.append('PROJECT') - xpk_print(f"[XPK] Error: Missing required environment variable:{', '.join(missing_vars)}. Please ensure these variables are set.") - xpk_exit(1) - command_get_credentials = ( - f'gcloud container clusters get-credentials {cluster_name} --dns-endpoint ' - f'--region={region} --project={project} && kubectl config view ' + f'gcloud container clusters get-credentials {args.cluster} --dns-endpoint ' + f'--region={zone_to_region(args.zone)} --project={args.project} && kubectl config view ' f'&& kubectl config set-context --current --namespace=default' ) xpk_print(f"[XPK] Task: 'Get cluster credentials' in progress.") + return_code = get_cluster_credentials(args) return_code = run_command_with_updates( command_get_credentials, 'Get cluster credentials', args ) @@ -790,16 +779,19 @@ def update_coredns(args): ) if return_code != 0: xpk_print(f'[XPK] Deploy CoreDNS error {return_code}') - xpk_exit(return_code) + pass finally: # Whether it succeeds or fails, always restore to the original directory os.chdir(original_cwd) - + if return_code != 0: + xpk_exit(return_code) + # Scale down kube-dns-autoscaler command_autoscaler_scale_down = ( 'kubectl scale deployment kube-dns-autoscaler --replicas=0 --namespace=kube-system' ) + # Note: The scaling down command has been issued, but the actual scaling process may take some time. xpk_print(f"[XPK] Task: 'Scaling down kube-dns-autoscaler' in progress") return_code = run_command_with_updates( command_autoscaler_scale_down, 'Scale down kube-dns-autoscaler', args @@ -809,10 +801,11 @@ def update_coredns(args): xpk_exit(return_code) xpk_print("\n[XPK] kube-dns-autoscaler has been scaled down.") - # Scale down kube-dns (or CoreDNS) + # Scale down kube-dns command_dns_scale_down = ( 'kubectl scale deployment kube-dns --replicas=0 --namespace=kube-system' ) + # Note: The scaling down command has been issued, but the actual scaling process may take some time. xpk_print(f"[XPK] Task: 'Scaling down kube-dns' in progress") return_code = run_command_with_updates( command_dns_scale_down, 'Scale down kube-dns', args @@ -822,8 +815,9 @@ def update_coredns(args): xpk_exit(return_code) xpk_print("\n[XPK] kube-dns has been scaled down.") + # Scale up core-dns command_coredns_scale = ( - 'kubectl scale deployment coredns --replicas=15 -n kube-system' + 'kubectl scale deployment coredns --replicas=1 -n kube-system' ) xpk_print(f"[XPK] Task: 'Scale CoreDNS' in progress") return_code = run_command_with_updates( @@ -833,10 +827,68 @@ def update_coredns(args): xpk_print(f'[XPK] Scale CoreDNS error {return_code}') xpk_exit(return_code) + # Waiting for CoreDNS to scale up and reach a Ready state. + command_wait_coredns_ready = ( + f'kubectl wait --for=condition=ready deployment/coredns ' + f'--namespace=kube-system --timeout=180s' + ) + xpk_print(f"[XPK] Task: 'Waiting for CoreDNS to become ready'...") + return_code = run_command_with_updates( + command_wait_coredns_ready, 'Wait for CoreDNS Ready', args + ) + if return_code != 0: + xpk_print(f'[XPK] Error: CoreDNS did not become ready within the timeout.') + xpk_exit(1) + xpk_print("\n[XPK] The CoreDNS setup process has been completed.") + xpk_print(f"[XPK] Task: 'Deleting CoreDNS deployment directory' in progress: {coredns_repo_full_path}") + try: + shutil.rmtree(coredns_repo_full_path) + xpk_print(f"[XPK] Successfully deleted directory: {coredns_repo_full_path}") + except OSError as e: + xpk_print(f"[XPK] Error deleting directory {coredns_repo_full_path}: {e}") + return 0 +def coredns_deployment_exists(namespace: str = 'kube-system') -> bool: + """Checks if the CoreDNS deployment exists in the given namespace. + + Args: + namespace: The Kubernetes namespace to check for the CoreDNS deployment. + + Returns: + True if the 'coredns' deployment exists, False otherwise. + """ + command = f"kubectl get deployment coredns -n {namespace}" + try: + # Use subprocess.run with check=False to capture exit code without raising an exception + result = subprocess.run(command, shell=True, check=False, capture_output=True) + return result.returncode == 0 + except Exception as e: + xpk_print(f"Error checking CoreDNS deployment existence: {e}") + return False + +def update_coredns_if_necessary(args) -> int: + """Updates and deploys CoreDNS within the cluster if it's not already present. + + This function checks for the existence of the CoreDNS deployment. + If it's not found, it proceeds to deploy and configure CoreDNS. + + Args: + args: User-provided arguments for running the command. + + Returns: + 0 if successful (CoreDNS was already present or successfully deployed), + and 1 otherwise. + """ + if coredns_deployment_exists(namespace='kube-system'): + xpk_print('Skipping CoreDNS deployment since it already exists.') + return 0 + else: + xpk_print('CoreDNS deployment not found. Proceeding with CoreDNS setup.') + return update_coredns(args) + def create_cluster_if_necessary( args, gke_control_plane_version: str, system: SystemCharacteristics From 807e7c91acfe5cdc4aef55f227e6a6f5ef4b6a02 Mon Sep 17 00:00:00 2001 From: DannyLi Date: Thu, 19 Jun 2025 08:49:17 +0000 Subject: [PATCH 06/41] feat: Added CoreDNS status check and update_coredns_if_necessary functions, along with some helper functions. --- src/xpk/commands/cluster.py | 159 ++++++++++++++++++++++++++++-------- 1 file changed, 124 insertions(+), 35 deletions(-) diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index 4422735ea..059b6f216 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -79,6 +79,11 @@ import os import subprocess + +from kubernetes import client, config +from kubernetes.stream import stream +import time + def cluster_adapt(args) -> None: """Function that performs cluster adaptation. @@ -698,6 +703,93 @@ def cluster_create_ray_cluster(args) -> None: args.enable_autoprovisioning = False cluster_create(args) + +# -- CoreDNS Check Function -- +def check_coredns_status(namespace="kube-system", deployment_name="coredns", timeout=120): + """ + Checks the operational status of CoreDNS, including its Deployment and Pods capabilities. + """ + xpk_print(f"Checking CoreDNS status (Namespace: {namespace}, Deployment: {deployment_name})...") + + try: + config.load_kube_config() + except config.ConfigException: + xpk_print("Failed to load kubeconfig, attempting to use in-cluster configuration...") + try: + config.load_incluster_config() + except config.ConfigException: + xpk_print("Error: Could not find Kubernetes configuration. Please ensure you are running inside a cluster or have configured kubeconfig.") + return False + + v1 = client.CoreV1Api() + app_v1 = client.AppsV1Api() + + start_time = time.time() + + while time.time() - start_time < timeout: + # Check the Deployment's Ready status + try: + deployment = app_v1.read_namespaced_deployment_status(name=deployment_name, namespace=namespace) + if deployment.status.ready_replicas is not None and \ + deployment.status.replicas is not None and \ + deployment.status.ready_replicas == deployment.status.replicas: + xpk_print(f"Deployment '{deployment_name}' is ready ({deployment.status.ready_replicas}/{deployment.status.replicas}).") + else: + xpk_print(f"Deployment '{deployment_name}' is not fully ready yet ({deployment.status.ready_replicas or 0}/{deployment.status.replicas or 0}). Waiting...") + time.sleep(5) + continue + except client.ApiException as e: + if e.status == 404: + xpk_print(f"Error: Deployment '{deployment_name}' not found in namespace '{namespace}'.") + else: + xpk_print(f"API error when reading deployment status: {e}") + return False + except Exception as e: + xpk_print(f"Unknown error occurred while checking deployment: {e}") + return False + + # Checking the status of all CoreDNS Pods + try: + pods = v1.list_namespaced_pod(namespace=namespace, label_selector="k8s-app=kube-dns") + all_pods_ready = True + if not pods.items: + xpk_print("No CoreDNS Pods found. Waiting...") + all_pods_ready = False + + for pod in pods.items: + if pod.status.phase != "Running": + xpk_print(f"Pod '{pod.metadata.name}' status is '{pod.status.phase}'. Waiting...") + all_pods_ready = False + break + + ready_condition = False + if pod.status.conditions: + for condition in pod.status.conditions: + if condition.type == "Ready" and condition.status == "True": + ready_condition = True + break + + if not ready_condition: + xpk_print(f"Pod '{pod.metadata.name}' is not fully ready yet. Waiting...") + all_pods_ready = False + break + + if not all_pods_ready: + time.sleep(5) + continue + + return True + + except client.ApiException as e: + xpk_print(f"API error when listing Pods: {e}") + return False + except Exception as e: + xpk_print(f"Unknown error occurred while checking Pods: {e}") + return False + + return False + + def update_coredns(args): """Updates and deploys CoreDNS within a cluster. @@ -727,22 +819,22 @@ def update_coredns(args): command_jq_install, 'Install jq', args ) if return_code != 0: - xpk_print(f'[XPK] Install jq error {return_code}') + xpk_print(f'Install jq error {return_code}') xpk_exit(return_code) # Check if the target directory already exists to avoid errors caused by duplicate cloning if os.path.exists(coredns_repo_full_path): - xpk_print(f"[XPK] Directory '{coredns_repo_full_path}' already exists, skip git clone.") + xpk_print(f"Directory '{coredns_repo_full_path}' already exists, skip git clone.") else: command_git_clone = ( f'git clone https://github.com/coredns/deployment.git {coredns_repo_full_path}' ) - xpk_print(f"[XPK] Task: 'Clone deployment ' in progress, Target directory:{coredns_repo_full_path}.") + xpk_print(f"Task: 'Clone deployment ' in progress, Target directory:{coredns_repo_full_path}.") return_code = run_command_with_updates( command_git_clone, 'Clone deployment', args ) if return_code != 0: - xpk_print(f'[XPK] Clone deployment error {return_code}') + xpk_print(f'Clone deployment error {return_code}') xpk_exit(return_code) command_get_credentials = ( @@ -750,17 +842,17 @@ def update_coredns(args): f'--region={zone_to_region(args.zone)} --project={args.project} && kubectl config view ' f'&& kubectl config set-context --current --namespace=default' ) - xpk_print(f"[XPK] Task: 'Get cluster credentials' in progress.") + xpk_print(f"Task: 'Get cluster credentials' in progress.") return_code = get_cluster_credentials(args) return_code = run_command_with_updates( command_get_credentials, 'Get cluster credentials', args ) if return_code != 0: - xpk_print(f'[XPK] Failed to get cluster credentials {return_code}') + xpk_print(f'Failed to get cluster credentials {return_code}') xpk_exit(return_code) if not os.path.isdir(coredns_k8s_path): - xpk_print(f"[XPK] Error:CoreDNS Kubernetes path '{coredns_k8s_path}' does not exist. Has git clone been successful?") + xpk_print(f"Error:CoreDNS Kubernetes path '{coredns_k8s_path}' does not exist. Has git clone been successful?") xpk_exit(1) # Remember the current directory so that you can restore it later @@ -772,13 +864,13 @@ def update_coredns(args): command_deploy_coredns = ( f'./deploy.sh | kubectl apply -f -' ) - xpk_print(f"[XPK] Task: 'Deploy CoreDNS' in progress, Located at '{coredns_k8s_path}'") + xpk_print(f"Task: 'Deploy CoreDNS' in progress, Located at '{coredns_k8s_path}'") return_code = run_command_with_updates( command_deploy_coredns, 'Deploy CoreDNS', args ) if return_code != 0: - xpk_print(f'[XPK] Deploy CoreDNS error {return_code}') + xpk_print(f'Deploy CoreDNS error {return_code}') pass finally: @@ -792,62 +884,58 @@ def update_coredns(args): 'kubectl scale deployment kube-dns-autoscaler --replicas=0 --namespace=kube-system' ) # Note: The scaling down command has been issued, but the actual scaling process may take some time. - xpk_print(f"[XPK] Task: 'Scaling down kube-dns-autoscaler' in progress") + xpk_print(f"Task: 'Scaling down kube-dns-autoscaler' in progress") return_code = run_command_with_updates( command_autoscaler_scale_down, 'Scale down kube-dns-autoscaler', args ) if return_code != 0: - xpk_print(f'[XPK] Scale down kube-dns-autoscaler error {return_code}') + xpk_print(f'Scale down kube-dns-autoscaler error {return_code}') xpk_exit(return_code) - xpk_print("\n[XPK] kube-dns-autoscaler has been scaled down.") + xpk_print("\n kube-dns-autoscaler has been scaled down.") # Scale down kube-dns command_dns_scale_down = ( 'kubectl scale deployment kube-dns --replicas=0 --namespace=kube-system' ) # Note: The scaling down command has been issued, but the actual scaling process may take some time. - xpk_print(f"[XPK] Task: 'Scaling down kube-dns' in progress") + xpk_print(f"Task: 'Scaling down kube-dns' in progress") return_code = run_command_with_updates( command_dns_scale_down, 'Scale down kube-dns', args ) if return_code != 0: - xpk_print(f'[XPK] Scale down kube-dns error {return_code}') + xpk_print(f'Scale down kube-dns error {return_code}') xpk_exit(return_code) - xpk_print("\n[XPK] kube-dns has been scaled down.") + xpk_print("\nkube-dns has been scaled down.") # Scale up core-dns command_coredns_scale = ( - 'kubectl scale deployment coredns --replicas=1 -n kube-system' + 'kubectl scale deployment coredns --replicas=15 -n kube-system' ) - xpk_print(f"[XPK] Task: 'Scale CoreDNS' in progress") + xpk_print(f"Task: 'Scale CoreDNS' in progress") return_code = run_command_with_updates( command_coredns_scale, 'Scale CoreDNS', args ) if return_code != 0: - xpk_print(f'[XPK] Scale CoreDNS error {return_code}') + xpk_print(f'Scale CoreDNS error {return_code}') xpk_exit(return_code) - # Waiting for CoreDNS to scale up and reach a Ready state. - command_wait_coredns_ready = ( - f'kubectl wait --for=condition=ready deployment/coredns ' - f'--namespace=kube-system --timeout=180s' - ) - xpk_print(f"[XPK] Task: 'Waiting for CoreDNS to become ready'...") - return_code = run_command_with_updates( - command_wait_coredns_ready, 'Wait for CoreDNS Ready', args - ) - if return_code != 0: - xpk_print(f'[XPK] Error: CoreDNS did not become ready within the timeout.') - xpk_exit(1) + xpk_print("CoreDNS scale up command sent. Now verifying CoreDNS readiness...") + + # Call the check function here + if not check_coredns_status(timeout=300): + xpk_print("CoreDNS verification failed, it might not have fully started.") + xpk_exit(1) + + xpk_print("CoreDNS has successfully started and passed verification.") - xpk_print("\n[XPK] The CoreDNS setup process has been completed.") + xpk_print("\nThe CoreDNS setup process has been completed.") - xpk_print(f"[XPK] Task: 'Deleting CoreDNS deployment directory' in progress: {coredns_repo_full_path}") + xpk_print(f"Task: 'Deleting CoreDNS deployment directory' in progress: {coredns_repo_full_path}") try: shutil.rmtree(coredns_repo_full_path) - xpk_print(f"[XPK] Successfully deleted directory: {coredns_repo_full_path}") + xpk_print(f"Successfully deleted directory: {coredns_repo_full_path}") except OSError as e: - xpk_print(f"[XPK] Error deleting directory {coredns_repo_full_path}: {e}") + xpk_print(f"Error deleting directory {coredns_repo_full_path}: {e}") return 0 @@ -909,6 +997,7 @@ def create_cluster_if_necessary( return 1 if args.cluster in all_clusters: xpk_print('Skipping cluster creation since it already exists.') + update_coredns_if_necessary(args) return 0 else: return run_gke_cluster_create_command( @@ -1085,7 +1174,7 @@ def run_gke_cluster_create_command( return 1 else: if args.enable_pathways == True: - update_coredns(args) + update_coredns_if_necessary(args) return 0 From 18ed2c05f073f6956f343bf7732fc5c8f75a381c Mon Sep 17 00:00:00 2001 From: DannyLi Date: Fri, 20 Jun 2025 02:50:45 +0000 Subject: [PATCH 07/41] refactor: Organize code --- src/xpk/commands/cluster.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index 059b6f216..df8b70e4b 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -862,20 +862,20 @@ def update_coredns(args): os.chdir(coredns_k8s_path) command_deploy_coredns = ( - f'./deploy.sh | kubectl apply -f -' + f'./deploy.sh | kubectl apply -f -' ) xpk_print(f"Task: 'Deploy CoreDNS' in progress, Located at '{coredns_k8s_path}'") return_code = run_command_with_updates( - command_deploy_coredns, 'Deploy CoreDNS', args + command_deploy_coredns, 'Deploy CoreDNS', args ) if return_code != 0: xpk_print(f'Deploy CoreDNS error {return_code}') pass finally: - # Whether it succeeds or fails, always restore to the original directory - os.chdir(original_cwd) + # Whether it succeeds or fails, always restore to the original directory + os.chdir(original_cwd) if return_code != 0: xpk_exit(return_code) @@ -923,8 +923,8 @@ def update_coredns(args): # Call the check function here if not check_coredns_status(timeout=300): - xpk_print("CoreDNS verification failed, it might not have fully started.") - xpk_exit(1) + xpk_print("CoreDNS verification failed, it might not have fully started.") + xpk_exit(1) xpk_print("CoreDNS has successfully started and passed verification.") @@ -1001,7 +1001,7 @@ def create_cluster_if_necessary( return 0 else: return run_gke_cluster_create_command( - args, gke_control_plane_version, system + args, gke_control_plane_version, system ) From 3b3c98f157679e4206103d8c206477eda0d47f18 Mon Sep 17 00:00:00 2001 From: DannyLi Date: Mon, 23 Jun 2025 08:54:08 +0000 Subject: [PATCH 08/41] Refactor check_coredns_status() into multiple smaller functions. --- src/xpk/commands/cluster.py | 227 ++++++++++++++++++++++-------------- 1 file changed, 140 insertions(+), 87 deletions(-) diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index df8b70e4b..b3dd9fb40 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -81,7 +81,7 @@ from kubernetes import client, config -from kubernetes.stream import stream +from kubernetes.client.rest import ApiException import time def cluster_adapt(args) -> None: @@ -235,6 +235,11 @@ def cluster_create(args) -> None: if create_cluster_command_code != 0: xpk_exit(create_cluster_command_code) + if args.enable_pathways == True: + update_coredns_command_code = update_coredns_if_necessary(args) + if update_coredns_command_code != 0: + xpk_exit(update_cluster_command_code) + authorize_private_cluster_access_command_code = ( authorize_private_cluster_access_if_necessary(args) ) @@ -704,92 +709,137 @@ def cluster_create_ray_cluster(args) -> None: cluster_create(args) -# -- CoreDNS Check Function -- -def check_coredns_status(namespace="kube-system", deployment_name="coredns", timeout=120): +def _load_kubernetes_config(): """ - Checks the operational status of CoreDNS, including its Deployment and Pods capabilities. + Loads Kubernetes configuration, trying kubeconfig first, then in-cluster. + Returns True on success, False on failure. """ - xpk_print(f"Checking CoreDNS status (Namespace: {namespace}, Deployment: {deployment_name})...") - try: config.load_kube_config() + xpk_print("Kubeconfig loaded successfully.") + return True except config.ConfigException: xpk_print("Failed to load kubeconfig, attempting to use in-cluster configuration...") try: config.load_incluster_config() + xpk_print("In-cluster configuration loaded successfully.") + return True except config.ConfigException: xpk_print("Error: Could not find Kubernetes configuration. Please ensure you are running inside a cluster or have configured kubeconfig.") return False + except Exception as e: + xpk_print(f"Unknown error during Kubernetes config loading: {e}") + return False + +def _check_deployment_readiness(args, app_v1_api, namespace, deployment_name): + """ + Checks if the specified deployment is fully ready. + Returns True if ready, False otherwise (including not found or API errors). + """ + try: + deployment = app_v1_api.read_namespaced_deployment_status(name=deployment_name, namespace=namespace) + ready_replicas = deployment.status.ready_replicas or 0 + total_replicas = deployment.status.replicas or 0 - v1 = client.CoreV1Api() - app_v1 = client.AppsV1Api() + if ready_replicas == total_replicas and total_replicas > 0: + xpk_print(f"Deployment '{deployment_name}' is ready ({ready_replicas}/{total_replicas}).") + return True + else: + xpk_print(f"Deployment '{deployment_name}' is not fully ready yet ({ready_replicas}/{total_replicas}).") + return False + except ApiException as e: + if e.status == 404: + xpk_print(f"Error: Deployment '{deployment_name}' not found in namespace '{namespace}'.") + update_coredns(args) + else: + xpk_print(f"API error when reading deployment status for '{deployment_name}': {e}") + xpk_exit(1) + except Exception as e: + xpk_print(f"Unknown error occurred while checking deployment '{deployment_name}': {e}") + xpk_exit(1) - start_time = time.time() - while time.time() - start_time < timeout: - # Check the Deployment's Ready status - try: - deployment = app_v1.read_namespaced_deployment_status(name=deployment_name, namespace=namespace) - if deployment.status.ready_replicas is not None and \ - deployment.status.replicas is not None and \ - deployment.status.ready_replicas == deployment.status.replicas: - xpk_print(f"Deployment '{deployment_name}' is ready ({deployment.status.ready_replicas}/{deployment.status.replicas}).") - else: - xpk_print(f"Deployment '{deployment_name}' is not fully ready yet ({deployment.status.ready_replicas or 0}/{deployment.status.replicas or 0}). Waiting...") - time.sleep(5) - continue - except client.ApiException as e: - if e.status == 404: - xpk_print(f"Error: Deployment '{deployment_name}' not found in namespace '{namespace}'.") - else: - xpk_print(f"API error when reading deployment status: {e}") - return False - except Exception as e: - xpk_print(f"Unknown error occurred while checking deployment: {e}") +def _check_coredns_pods_status(core_v1_api, namespace): + """ + Checks if all CoreDNS Pods are in a Running phase and Ready condition. + Returns True if all pods are ready, False otherwise. + """ + try: + # Assuming "k8s-app=kube-dns" is the correct label selector for CoreDNS pods + pods = core_v1_api.list_namespaced_pod(namespace=namespace, label_selector="k8s-app=kube-dns") + + if not pods.items: + xpk_print("No CoreDNS Pods found with label 'k8s-app=kube-dns'. Waiting...") return False - # Checking the status of all CoreDNS Pods - try: - pods = v1.list_namespaced_pod(namespace=namespace, label_selector="k8s-app=kube-dns") - all_pods_ready = True - if not pods.items: - xpk_print("No CoreDNS Pods found. Waiting...") + all_pods_ready = True + for pod in pods.items: + pod_name = pod.metadata.name + # Check Pod Phase + if pod.status.phase != "Running": + xpk_print(f"Pod '{pod_name}' status is '{pod.status.phase}'. Waiting...") + all_pods_ready = False + break # Exit early if one pod isn't running + + # Check Ready condition + ready_condition_found = False + if pod.status.conditions: + for condition in pod.status.conditions: + if condition.type == "Ready" and condition.status == "True": + ready_condition_found = True + break + + if not ready_condition_found: + xpk_print(f"Pod '{pod_name}' is not fully ready yet (Ready condition not True). Waiting...") all_pods_ready = False + break # Exit early if one pod isn't ready - for pod in pods.items: - if pod.status.phase != "Running": - xpk_print(f"Pod '{pod.metadata.name}' status is '{pod.status.phase}'. Waiting...") - all_pods_ready = False - break - - ready_condition = False - if pod.status.conditions: - for condition in pod.status.conditions: - if condition.type == "Ready" and condition.status == "True": - ready_condition = True - break - - if not ready_condition: - xpk_print(f"Pod '{pod.metadata.name}' is not fully ready yet. Waiting...") - all_pods_ready = False - break - - if not all_pods_ready: - time.sleep(5) - continue + return all_pods_ready + + except ApiException as e: + xpk_print(f"API error when listing CoreDNS Pods: {e}") + return False + except Exception as e: + xpk_print(f"Unknown error occurred while checking CoreDNS Pods: {e}") + return False - return True - - except client.ApiException as e: - xpk_print(f"API error when listing Pods: {e}") - return False - except Exception as e: - xpk_print(f"Unknown error occurred while checking Pods: {e}") - return False +# -- CoreDNS Check Function -- +def check_coredns_status(args, namespace="kube-system", deployment_name="coredns", timeout=120): + """ + Checks the operational status of CoreDNS, including its Deployment and Pods capabilities. + """ + xpk_print(f"Checking CoreDNS status (Namespace: {namespace}, Deployment: {deployment_name})...") + + # 1. Load Kubernetes configuration + if not _load_kubernetes_config(): + return False + + v1 = client.CoreV1Api() + app_v1 = client.AppsV1Api() + + start_time = time.time() + while time.time() - start_time < timeout: + # 2. Check Deployment readiness + deployment_ok = _check_deployment_readiness(args, app_v1, namespace, deployment_name) + if not deployment_ok: + time.sleep(5) # Wait before retrying deployment check + continue + + # 3. Check all CoreDNS Pods status + pods_ok = _check_coredns_pods_status(v1, namespace) + if not pods_ok: + time.sleep(5) # Wait before retrying pod check + continue + + # If both deployment and pods are OK, return True + return True + + xpk_print(f"Timeout reached. CoreDNS did not become fully ready within {timeout} seconds.") return False + def update_coredns(args): """Updates and deploys CoreDNS within a cluster. @@ -837,19 +887,19 @@ def update_coredns(args): xpk_print(f'Clone deployment error {return_code}') xpk_exit(return_code) - command_get_credentials = ( - f'gcloud container clusters get-credentials {args.cluster} --dns-endpoint ' - f'--region={zone_to_region(args.zone)} --project={args.project} && kubectl config view ' - f'&& kubectl config set-context --current --namespace=default' - ) - xpk_print(f"Task: 'Get cluster credentials' in progress.") - return_code = get_cluster_credentials(args) - return_code = run_command_with_updates( - command_get_credentials, 'Get cluster credentials', args - ) - if return_code != 0: - xpk_print(f'Failed to get cluster credentials {return_code}') - xpk_exit(return_code) + # command_get_credentials = ( + # f'gcloud container clusters get-credentials {args.cluster} --dns-endpoint ' + # f'--region={zone_to_region(args.zone)} --project={args.project} && kubectl config view ' + # f'&& kubectl config set-context --current --namespace=default' + # ) + # xpk_print(f"Task: 'Get cluster credentials' in progress.") + # return_code = get_cluster_credentials(args) + # return_code = run_command_with_updates( + # command_get_credentials, 'Get cluster credentials', args + # ) + # if return_code != 0: + # xpk_print(f'Failed to get cluster credentials {return_code}') + # xpk_exit(return_code) if not os.path.isdir(coredns_k8s_path): xpk_print(f"Error:CoreDNS Kubernetes path '{coredns_k8s_path}' does not exist. Has git clone been successful?") @@ -922,7 +972,7 @@ def update_coredns(args): xpk_print("CoreDNS scale up command sent. Now verifying CoreDNS readiness...") # Call the check function here - if not check_coredns_status(timeout=300): + if not check_coredns_status(args, timeout=120): xpk_print("CoreDNS verification failed, it might not have fully started.") xpk_exit(1) @@ -939,7 +989,7 @@ def update_coredns(args): return 0 -def coredns_deployment_exists(namespace: str = 'kube-system') -> bool: +def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool: """Checks if the CoreDNS deployment exists in the given namespace. Args: @@ -948,11 +998,17 @@ def coredns_deployment_exists(namespace: str = 'kube-system') -> bool: Returns: True if the 'coredns' deployment exists, False otherwise. """ + command = f"kubectl get deployment coredns -n {namespace}" try: - # Use subprocess.run with check=False to capture exit code without raising an exception - result = subprocess.run(command, shell=True, check=False, capture_output=True) - return result.returncode == 0 + xpk_print(f"Task: 'Checking CoreDNS deployment existence' in progress for namespace: {namespace}") + return_code = run_command_with_updates( + command, f"Check CoreDNS deployment in {namespace}", args + ) + if not check_coredns_status(args, timeout=120): + xpk_print("CoreDNS verification failed, it might not have fully started.") + xpk_exit(1) + return return_code == 0 except Exception as e: xpk_print(f"Error checking CoreDNS deployment existence: {e}") return False @@ -970,7 +1026,8 @@ def update_coredns_if_necessary(args) -> int: 0 if successful (CoreDNS was already present or successfully deployed), and 1 otherwise. """ - if coredns_deployment_exists(namespace='kube-system'): + get_cluster_credentials(args) + if coredns_deployment_exists(args, namespace='kube-system'): xpk_print('Skipping CoreDNS deployment since it already exists.') return 0 else: @@ -997,7 +1054,6 @@ def create_cluster_if_necessary( return 1 if args.cluster in all_clusters: xpk_print('Skipping cluster creation since it already exists.') - update_coredns_if_necessary(args) return 0 else: return run_gke_cluster_create_command( @@ -1172,9 +1228,6 @@ def run_gke_cluster_create_command( if return_code != 0: xpk_print(f'GKE Cluster Create request returned ERROR {return_code}') return 1 - else: - if args.enable_pathways == True: - update_coredns_if_necessary(args) return 0 From 9670da13446776beeb8c68bbd44c3b50b1d4b81d Mon Sep 17 00:00:00 2001 From: DannyLi Date: Tue, 24 Jun 2025 07:14:28 +0000 Subject: [PATCH 09/41] Refactor update_coredns(args) and add _verify_coredns_readiness(). --- src/xpk/commands/cluster.py | 373 ++++++++++++------------------------ 1 file changed, 126 insertions(+), 247 deletions(-) diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index b3dd9fb40..42c475b12 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -77,7 +77,6 @@ from .common import set_cluster_command import shutil import os -import subprocess from kubernetes import client, config @@ -235,10 +234,7 @@ def cluster_create(args) -> None: if create_cluster_command_code != 0: xpk_exit(create_cluster_command_code) - if args.enable_pathways == True: - update_coredns_command_code = update_coredns_if_necessary(args) - if update_coredns_command_code != 0: - xpk_exit(update_cluster_command_code) + authorize_private_cluster_access_command_code = ( authorize_private_cluster_access_if_necessary(args) @@ -257,6 +253,11 @@ def cluster_create(args) -> None: get_cluster_credentials(args) + if args.enable_pathways == True: + update_coredns_command_code = update_coredns_if_necessary(args) + if update_coredns_command_code != 0: + xpk_exit(update_cluster_command_code) + k8s_client = setup_k8s_env(args) install_storage_crd(k8s_client) @@ -694,7 +695,6 @@ def cluster_create_pathways(args) -> None: args.enable_ray_cluster = False cluster_create(args) - def cluster_create_ray_cluster(args) -> None: """Function around cluster creation for RayCluster. @@ -708,278 +708,108 @@ def cluster_create_ray_cluster(args) -> None: args.enable_autoprovisioning = False cluster_create(args) - -def _load_kubernetes_config(): - """ - Loads Kubernetes configuration, trying kubeconfig first, then in-cluster. - Returns True on success, False on failure. - """ - try: - config.load_kube_config() - xpk_print("Kubeconfig loaded successfully.") - return True - except config.ConfigException: - xpk_print("Failed to load kubeconfig, attempting to use in-cluster configuration...") - try: - config.load_incluster_config() - xpk_print("In-cluster configuration loaded successfully.") - return True - except config.ConfigException: - xpk_print("Error: Could not find Kubernetes configuration. Please ensure you are running inside a cluster or have configured kubeconfig.") - return False - except Exception as e: - xpk_print(f"Unknown error during Kubernetes config loading: {e}") - return False - -def _check_deployment_readiness(args, app_v1_api, namespace, deployment_name): - """ - Checks if the specified deployment is fully ready. - Returns True if ready, False otherwise (including not found or API errors). - """ - try: - deployment = app_v1_api.read_namespaced_deployment_status(name=deployment_name, namespace=namespace) - ready_replicas = deployment.status.ready_replicas or 0 - total_replicas = deployment.status.replicas or 0 - - if ready_replicas == total_replicas and total_replicas > 0: - xpk_print(f"Deployment '{deployment_name}' is ready ({ready_replicas}/{total_replicas}).") - return True - else: - xpk_print(f"Deployment '{deployment_name}' is not fully ready yet ({ready_replicas}/{total_replicas}).") - return False - except ApiException as e: - if e.status == 404: - xpk_print(f"Error: Deployment '{deployment_name}' not found in namespace '{namespace}'.") - update_coredns(args) - else: - xpk_print(f"API error when reading deployment status for '{deployment_name}': {e}") - xpk_exit(1) - except Exception as e: - xpk_print(f"Unknown error occurred while checking deployment '{deployment_name}': {e}") - xpk_exit(1) - - -def _check_coredns_pods_status(core_v1_api, namespace): - """ - Checks if all CoreDNS Pods are in a Running phase and Ready condition. - Returns True if all pods are ready, False otherwise. - """ - try: - # Assuming "k8s-app=kube-dns" is the correct label selector for CoreDNS pods - pods = core_v1_api.list_namespaced_pod(namespace=namespace, label_selector="k8s-app=kube-dns") - - if not pods.items: - xpk_print("No CoreDNS Pods found with label 'k8s-app=kube-dns'. Waiting...") - return False - - all_pods_ready = True - for pod in pods.items: - pod_name = pod.metadata.name - # Check Pod Phase - if pod.status.phase != "Running": - xpk_print(f"Pod '{pod_name}' status is '{pod.status.phase}'. Waiting...") - all_pods_ready = False - break # Exit early if one pod isn't running - - # Check Ready condition - ready_condition_found = False - if pod.status.conditions: - for condition in pod.status.conditions: - if condition.type == "Ready" and condition.status == "True": - ready_condition_found = True - break - - if not ready_condition_found: - xpk_print(f"Pod '{pod_name}' is not fully ready yet (Ready condition not True). Waiting...") - all_pods_ready = False - break # Exit early if one pod isn't ready - - return all_pods_ready - - except ApiException as e: - xpk_print(f"API error when listing CoreDNS Pods: {e}") - return False - except Exception as e: - xpk_print(f"Unknown error occurred while checking CoreDNS Pods: {e}") - return False - -# -- CoreDNS Check Function -- -def check_coredns_status(args, namespace="kube-system", deployment_name="coredns", timeout=120): - """ - Checks the operational status of CoreDNS, including its Deployment and Pods capabilities. - """ - xpk_print(f"Checking CoreDNS status (Namespace: {namespace}, Deployment: {deployment_name})...") - - # 1. Load Kubernetes configuration - if not _load_kubernetes_config(): - return False - - v1 = client.CoreV1Api() - app_v1 = client.AppsV1Api() - - start_time = time.time() - - while time.time() - start_time < timeout: - # 2. Check Deployment readiness - deployment_ok = _check_deployment_readiness(args, app_v1, namespace, deployment_name) - if not deployment_ok: - time.sleep(5) # Wait before retrying deployment check - continue - - # 3. Check all CoreDNS Pods status - pods_ok = _check_coredns_pods_status(v1, namespace) - if not pods_ok: - time.sleep(5) # Wait before retrying pod check - continue - - # If both deployment and pods are OK, return True - return True - - xpk_print(f"Timeout reached. CoreDNS did not become fully ready within {timeout} seconds.") - return False - - - -def update_coredns(args): - """Updates and deploys CoreDNS within a cluster. - - This function performs the following steps: - 1. Installs 'jq'. - 2. Clones the CoreDNS deployment repository from GitHub if it doesn't already exist. - 3. Retrieves Google Kubernetes Engine (GKE) cluster credentials. - 4. Deploys CoreDNS to the cluster. - 5. Scales down the 'kube-dns-autoscaler' and 'kube-dns' deployments. - 6. Scales up the 'coredns' deployment to 15 replicas. - - Args: - args: user provided arguments for running the command. - - Returns: - 0 if successful and 1 otherwise. - """ - coredns_repo_dir = os.path.expanduser("/tmp/") - coredns_repo_dir_name = "deployment" - coredns_repo_full_path = os.path.join(coredns_repo_dir, coredns_repo_dir_name) - coredns_k8s_path = os.path.join(coredns_repo_full_path, "kubernetes") - - command_jq_install = ( - f'sudo apt install jq -y' - ) - return_code = run_command_with_updates( - command_jq_install, 'Install jq', args - ) +def _install_jq(args): + """Installs 'jq' utility.""" + command_jq_install = 'sudo apt install jq -y' + xpk_print("Task: 'Install jq' in progress.") + return_code = run_command_with_updates(command_jq_install, 'Install jq', args) if return_code != 0: xpk_print(f'Install jq error {return_code}') xpk_exit(return_code) - # Check if the target directory already exists to avoid errors caused by duplicate cloning +def _clone_coredns_deployment_repo(args, coredns_repo_full_path: str): + """Clones the CoreDNS deployment repository if it doesn't exist.""" if os.path.exists(coredns_repo_full_path): xpk_print(f"Directory '{coredns_repo_full_path}' already exists, skip git clone.") - else: - command_git_clone = ( - f'git clone https://github.com/coredns/deployment.git {coredns_repo_full_path}' - ) - xpk_print(f"Task: 'Clone deployment ' in progress, Target directory:{coredns_repo_full_path}.") - return_code = run_command_with_updates( - command_git_clone, 'Clone deployment', args - ) - if return_code != 0: - xpk_print(f'Clone deployment error {return_code}') - xpk_exit(return_code) - - # command_get_credentials = ( - # f'gcloud container clusters get-credentials {args.cluster} --dns-endpoint ' - # f'--region={zone_to_region(args.zone)} --project={args.project} && kubectl config view ' - # f'&& kubectl config set-context --current --namespace=default' - # ) - # xpk_print(f"Task: 'Get cluster credentials' in progress.") - # return_code = get_cluster_credentials(args) - # return_code = run_command_with_updates( - # command_get_credentials, 'Get cluster credentials', args - # ) - # if return_code != 0: - # xpk_print(f'Failed to get cluster credentials {return_code}') - # xpk_exit(return_code) + return + + command_git_clone = f'git clone https://github.com/coredns/deployment.git {coredns_repo_full_path}' + xpk_print(f"Task: 'Clone deployment' in progress, Target directory:{coredns_repo_full_path}.") + return_code = run_command_with_updates(command_git_clone, 'Clone deployment', args) + if return_code != 0: + xpk_print(f'Clone deployment error {return_code}') + xpk_exit(return_code) +def _deploy_coredns_manifests(args, coredns_k8s_path: str): + """Deploys CoreDNS manifests to the cluster.""" if not os.path.isdir(coredns_k8s_path): xpk_print(f"Error:CoreDNS Kubernetes path '{coredns_k8s_path}' does not exist. Has git clone been successful?") xpk_exit(1) - - # Remember the current directory so that you can restore it later + original_cwd = os.getcwd() try: - # Change the current working directory to the path of the CoreDNS deployment os.chdir(coredns_k8s_path) + xpk_print(f"Current working directory changed to: {os.getcwd()}") - command_deploy_coredns = ( - f'./deploy.sh | kubectl apply -f -' - ) + command_deploy_coredns = './deploy.sh | kubectl apply -f -' xpk_print(f"Task: 'Deploy CoreDNS' in progress, Located at '{coredns_k8s_path}'") - return_code = run_command_with_updates( - command_deploy_coredns, 'Deploy CoreDNS', args - ) + return_code = run_command_with_updates(command_deploy_coredns, 'Deploy CoreDNS', args) if return_code != 0: xpk_print(f'Deploy CoreDNS error {return_code}') pass finally: - # Whether it succeeds or fails, always restore to the original directory + xpk_print(f"Restoring working directory to: {original_cwd}") os.chdir(original_cwd) - if return_code != 0: - xpk_exit(return_code) - # Scale down kube-dns-autoscaler - command_autoscaler_scale_down = ( - 'kubectl scale deployment kube-dns-autoscaler --replicas=0 --namespace=kube-system' - ) - # Note: The scaling down command has been issued, but the actual scaling process may take some time. - xpk_print(f"Task: 'Scaling down kube-dns-autoscaler' in progress") - return_code = run_command_with_updates( - command_autoscaler_scale_down, 'Scale down kube-dns-autoscaler', args - ) if return_code != 0: - xpk_print(f'Scale down kube-dns-autoscaler error {return_code}') xpk_exit(return_code) - xpk_print("\n kube-dns-autoscaler has been scaled down.") - # Scale down kube-dns - command_dns_scale_down = ( - 'kubectl scale deployment kube-dns --replicas=0 --namespace=kube-system' - ) - # Note: The scaling down command has been issued, but the actual scaling process may take some time. - xpk_print(f"Task: 'Scaling down kube-dns' in progress") - return_code = run_command_with_updates( - command_dns_scale_down, 'Scale down kube-dns', args - ) +def _scale_down_deployment(args, deployment_name: str, namespace: str = 'kube-system'): + """Scales down a specified Kubernetes deployment to 0 replicas.""" + command = f'kubectl scale deployment {deployment_name} --replicas=0 --namespace={namespace}' + xpk_print(f"Task: 'Scaling down {deployment_name}' in progress") + return_code = run_command_with_updates(command, f'Scale down {deployment_name}', args) if return_code != 0: - xpk_print(f'Scale down kube-dns error {return_code}') + xpk_print(f'Scale down {deployment_name} error {return_code}') xpk_exit(return_code) - xpk_print("\nkube-dns has been scaled down.") + xpk_print(f"\n{deployment_name} has been scaled down.") - # Scale up core-dns - command_coredns_scale = ( - 'kubectl scale deployment coredns --replicas=15 -n kube-system' - ) - xpk_print(f"Task: 'Scale CoreDNS' in progress") - return_code = run_command_with_updates( - command_coredns_scale, 'Scale CoreDNS', args - ) +def _scale_up_coredns(args, replicas: int = 15, namespace: str = 'kube-system'): + """Scales up the CoreDNS deployment to a specified number of replicas.""" + command_coredns_scale = f'kubectl scale deployment coredns --replicas={replicas} -n {namespace}' + xpk_print(f"Task: 'Scale CoreDNS' in progress (to {replicas} replicas)") + return_code = run_command_with_updates(command_coredns_scale, 'Scale CoreDNS', args) if return_code != 0: xpk_print(f'Scale CoreDNS error {return_code}') xpk_exit(return_code) - xpk_print("CoreDNS scale up command sent. Now verifying CoreDNS readiness...") +def _verify_coredns_readiness(args, timeout: int = 120, namespace: str = 'kube-system'): + """Verifies CoreDNS readiness using kubectl wait commands.""" + xpk_print("Now verifying CoreDNS readiness...") + + # Wait for kube-dns to be fully scaled down + command_kube_dns_wait_scaled_down = ( + f"kubectl wait deployment/kube-dns --for=jsonpath='{{.status.replicas}}'=0 " + f"--namespace={namespace} --timeout={timeout}s" + ) + xpk_print(f"Verifying if kube-dns has scaled down...") + return_code_kube_dns = run_command_with_updates( + command_kube_dns_wait_scaled_down, "Wait for kube-dns scale down", args + ) + if return_code_kube_dns == 0: + xpk_print("kube-dns did not scale down successfully within the timeout.") + xpk_exit(1) # Exit if kube-dns cannot scale down + + # Wait for CoreDNS to be fully scaled up and available + command_coredns_wait_available = ( + f"kubectl wait deployment/coredns --for=condition=Available=true " + f"--namespace={namespace} --timeout={timeout}s" + ) + xpk_print(f"Verifying if CoreDNS is available...") + return_code_coredns = run_command_with_updates( + command_coredns_wait_available, "Wait for coredns available", args + ) + if return_code_coredns != 0: + xpk_print("CoreDNS verification failed, it might not have fully started within the timeout.") + xpk_exit(1) # Exit if coredns cannot become available - # Call the check function here - if not check_coredns_status(args, timeout=120): - xpk_print("CoreDNS verification failed, it might not have fully started.") - xpk_exit(1) - xpk_print("CoreDNS has successfully started and passed verification.") - xpk_print("\nThe CoreDNS setup process has been completed.") - + +def _cleanup_coredns_repo(coredns_repo_full_path: str): + """Deletes the cloned CoreDNS deployment directory.""" xpk_print(f"Task: 'Deleting CoreDNS deployment directory' in progress: {coredns_repo_full_path}") try: shutil.rmtree(coredns_repo_full_path) @@ -987,6 +817,53 @@ def update_coredns(args): except OSError as e: xpk_print(f"Error deleting directory {coredns_repo_full_path}: {e}") +def update_coredns(args): + """Updates and deploys CoreDNS within a cluster. + + This function performs the following steps: + 1. Installs 'jq'. + 2. Clones the CoreDNS deployment repository from GitHub if it doesn't already exist. + 3. Deploys CoreDNS to the cluster. + 4. Scales down the 'kube-dns-autoscaler' and 'kube-dns' deployments. + 5. Scales up the 'coredns' deployment to 15 replicas. + 6. Waits for kube-dns to scale down and coredns to be ready using kubectl wait. + 7. Cleans up the cloned repository. + + Args: + args: user provided arguments for running the command. + + Returns: + 0 if successful and 1 otherwise. + """ + coredns_repo_dir = os.path.expanduser("/tmp/") + coredns_repo_dir_name = "deployment" + coredns_repo_full_path = os.path.join(coredns_repo_dir, coredns_repo_dir_name) + coredns_k8s_path = os.path.join(coredns_repo_full_path, "kubernetes") + + # 1. Install jq + _install_jq(args) + + # 2. Clone CoreDNS deployment repository + _clone_coredns_deployment_repo(args, coredns_repo_full_path) + + # 3. Deploy CoreDNS to the cluster + _deploy_coredns_manifests(args, coredns_k8s_path) + + # 4. Scale down kube-dns-autoscaler + _scale_down_deployment(args, "kube-dns-autoscaler") + + # 5. Scale down kube-dns + _scale_down_deployment(args, "kube-dns") + + # 6. Scale up coredns and verify readiness + _scale_up_coredns(args, replicas=15) + _verify_coredns_readiness(args, timeout=120) + + xpk_print("The CoreDNS setup process has been completed.") + + # 7. Cleanup + _cleanup_coredns_repo(coredns_repo_full_path) + return 0 def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool: @@ -1005,10 +882,13 @@ def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool: return_code = run_command_with_updates( command, f"Check CoreDNS deployment in {namespace}", args ) - if not check_coredns_status(args, timeout=120): - xpk_print("CoreDNS verification failed, it might not have fully started.") - xpk_exit(1) - return return_code == 0 + if return_code == 0: + _verify_coredns_readiness(args) + xpk_print(f"CoreDNS deployment 'coredns' found in namespace '{namespace}'.") + return True + else: + xpk_print(f"CoreDNS deployment 'coredns' NOT found in namespace '{namespace}' or an error occurred.") + return False except Exception as e: xpk_print(f"Error checking CoreDNS deployment existence: {e}") return False @@ -1026,7 +906,6 @@ def update_coredns_if_necessary(args) -> int: 0 if successful (CoreDNS was already present or successfully deployed), and 1 otherwise. """ - get_cluster_credentials(args) if coredns_deployment_exists(args, namespace='kube-system'): xpk_print('Skipping CoreDNS deployment since it already exists.') return 0 From b58c54d8f638f0524e8989d1d5cace74f44d3605 Mon Sep 17 00:00:00 2001 From: DannyLi Date: Tue, 24 Jun 2025 07:22:06 +0000 Subject: [PATCH 10/41] Organize code --- src/xpk/commands/cluster.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index 42c475b12..de52a1b3e 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -791,7 +791,8 @@ def _verify_coredns_readiness(args, timeout: int = 120, namespace: str = 'kube-s if return_code_kube_dns == 0: xpk_print("kube-dns did not scale down successfully within the timeout.") xpk_exit(1) # Exit if kube-dns cannot scale down - + else: + xpk_print("kube-dns did not scale down successfully within the timeout.") # Wait for CoreDNS to be fully scaled up and available command_coredns_wait_available = ( f"kubectl wait deployment/coredns --for=condition=Available=true " From edb54a7cc697c68f1d6f2cb4a1f7cebc682b0feb Mon Sep 17 00:00:00 2001 From: DannyLi Date: Thu, 26 Jun 2025 06:49:08 +0000 Subject: [PATCH 11/41] Organize code --- src/xpk/commands/cluster.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index de52a1b3e..b0960522e 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -79,10 +79,6 @@ import os -from kubernetes import client, config -from kubernetes.client.rest import ApiException -import time - def cluster_adapt(args) -> None: """Function that performs cluster adaptation. @@ -234,8 +230,6 @@ def cluster_create(args) -> None: if create_cluster_command_code != 0: xpk_exit(create_cluster_command_code) - - authorize_private_cluster_access_command_code = ( authorize_private_cluster_access_if_necessary(args) ) @@ -747,7 +741,6 @@ def _deploy_coredns_manifests(args, coredns_k8s_path: str): return_code = run_command_with_updates(command_deploy_coredns, 'Deploy CoreDNS', args) if return_code != 0: xpk_print(f'Deploy CoreDNS error {return_code}') - pass finally: xpk_print(f"Restoring working directory to: {original_cwd}") From f6dde4fd5c286f9270e15d7e6b6fbed3e72b38fe Mon Sep 17 00:00:00 2001 From: DannyLi Date: Thu, 3 Jul 2025 06:03:34 +0000 Subject: [PATCH 12/41] Remove the arg.enable_pathways condition. --- src/xpk/commands/cluster.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index b0960522e..2ae07830f 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -247,10 +247,9 @@ def cluster_create(args) -> None: get_cluster_credentials(args) - if args.enable_pathways == True: - update_coredns_command_code = update_coredns_if_necessary(args) - if update_coredns_command_code != 0: - xpk_exit(update_cluster_command_code) + update_coredns_command_code = update_coredns_if_necessary(args) + if update_coredns_command_code != 0: + xpk_exit(update_cluster_command_code) k8s_client = setup_k8s_env(args) From 04e5a1de1959737a0fef4870651043485cbb15ee Mon Sep 17 00:00:00 2001 From: DannyLi Date: Wed, 9 Jul 2025 13:31:35 +0000 Subject: [PATCH 13/41] Resolve lint issue and added a function to fix a bug when validating kubeDNS. --- src/xpk/commands/cluster.py | 212 +++++++++++++++++++++++------------- 1 file changed, 135 insertions(+), 77 deletions(-) diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index 2ae07830f..5fbf3ad00 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -688,6 +688,7 @@ def cluster_create_pathways(args) -> None: args.enable_ray_cluster = False cluster_create(args) + def cluster_create_ray_cluster(args) -> None: """Function around cluster creation for RayCluster. @@ -701,7 +702,8 @@ def cluster_create_ray_cluster(args) -> None: args.enable_autoprovisioning = False cluster_create(args) -def _install_jq(args): + +def install_jq(args): """Installs 'jq' utility.""" command_jq_install = 'sudo apt install jq -y' xpk_print("Task: 'Install jq' in progress.") @@ -710,105 +712,159 @@ def _install_jq(args): xpk_print(f'Install jq error {return_code}') xpk_exit(return_code) -def _clone_coredns_deployment_repo(args, coredns_repo_full_path: str): + +def clone_coredns_deployment_repo(args, coredns_repo_full_path: str): """Clones the CoreDNS deployment repository if it doesn't exist.""" if os.path.exists(coredns_repo_full_path): - xpk_print(f"Directory '{coredns_repo_full_path}' already exists, skip git clone.") + xpk_print( + f"Directory '{coredns_repo_full_path}' already exists, skip git clone." + ) return - - command_git_clone = f'git clone https://github.com/coredns/deployment.git {coredns_repo_full_path}' - xpk_print(f"Task: 'Clone deployment' in progress, Target directory:{coredns_repo_full_path}.") - return_code = run_command_with_updates(command_git_clone, 'Clone deployment', args) + command_git_clone = ( + 'git clone https://github.com/coredns/deployment.git' + f' {coredns_repo_full_path}' + ) + xpk_print( + "Task: 'Clone deployment' in progress, Target" + f' directory:{coredns_repo_full_path}.' + ) + return_code = run_command_with_updates( + command_git_clone, 'Clone deployment', args + ) if return_code != 0: xpk_print(f'Clone deployment error {return_code}') xpk_exit(return_code) -def _deploy_coredns_manifests(args, coredns_k8s_path: str): + +def deploy_coredns_manifests(args, coredns_k8s_path: str): """Deploys CoreDNS manifests to the cluster.""" if not os.path.isdir(coredns_k8s_path): - xpk_print(f"Error:CoreDNS Kubernetes path '{coredns_k8s_path}' does not exist. Has git clone been successful?") + xpk_print( + f"Error:CoreDNS Kubernetes path '{coredns_k8s_path}' does not exist." + ' Has git clone been successful?' + ) xpk_exit(1) - original_cwd = os.getcwd() try: os.chdir(coredns_k8s_path) - xpk_print(f"Current working directory changed to: {os.getcwd()}") + xpk_print(f'Current working directory changed to: {os.getcwd()}') command_deploy_coredns = './deploy.sh | kubectl apply -f -' - xpk_print(f"Task: 'Deploy CoreDNS' in progress, Located at '{coredns_k8s_path}'") - - return_code = run_command_with_updates(command_deploy_coredns, 'Deploy CoreDNS', args) + xpk_print( + f"Task: 'Deploy CoreDNS' in progress, Located at '{coredns_k8s_path}'" + ) + return_code = run_command_with_updates( + command_deploy_coredns, 'Deploy CoreDNS', args + ) if return_code != 0: xpk_print(f'Deploy CoreDNS error {return_code}') finally: - xpk_print(f"Restoring working directory to: {original_cwd}") + xpk_print(f'Restoring working directory to: {original_cwd}') os.chdir(original_cwd) - if return_code != 0: xpk_exit(return_code) -def _scale_down_deployment(args, deployment_name: str, namespace: str = 'kube-system'): + +def scale_down_deployment( + args, deployment_name: str, namespace: str = 'kube-system' +): """Scales down a specified Kubernetes deployment to 0 replicas.""" - command = f'kubectl scale deployment {deployment_name} --replicas=0 --namespace={namespace}' + command = ( + f'kubectl scale deployment {deployment_name} --replicas=0' + f' --namespace={namespace}' + ) xpk_print(f"Task: 'Scaling down {deployment_name}' in progress") - return_code = run_command_with_updates(command, f'Scale down {deployment_name}', args) + return_code = run_command_with_updates( + command, f'Scale down {deployment_name}', args + ) if return_code != 0: xpk_print(f'Scale down {deployment_name} error {return_code}') xpk_exit(return_code) - xpk_print(f"\n{deployment_name} has been scaled down.") + xpk_print(f'\n{deployment_name} has been scaled down.') + -def _scale_up_coredns(args, replicas: int = 15, namespace: str = 'kube-system'): +def scale_up_coredns(args, replicas: int = 15, namespace: str = 'kube-system'): """Scales up the CoreDNS deployment to a specified number of replicas.""" - command_coredns_scale = f'kubectl scale deployment coredns --replicas={replicas} -n {namespace}' + command_coredns_scale = ( + f'kubectl scale deployment coredns --replicas={replicas} -n {namespace}' + ) xpk_print(f"Task: 'Scale CoreDNS' in progress (to {replicas} replicas)") - return_code = run_command_with_updates(command_coredns_scale, 'Scale CoreDNS', args) + return_code = run_command_with_updates( + command_coredns_scale, 'Scale CoreDNS', args + ) if return_code != 0: xpk_print(f'Scale CoreDNS error {return_code}') xpk_exit(return_code) -def _verify_coredns_readiness(args, timeout: int = 120, namespace: str = 'kube-system'): - """Verifies CoreDNS readiness using kubectl wait commands.""" - xpk_print("Now verifying CoreDNS readiness...") - # Wait for kube-dns to be fully scaled down - command_kube_dns_wait_scaled_down = ( - f"kubectl wait deployment/kube-dns --for=jsonpath='{{.status.replicas}}'=0 " - f"--namespace={namespace} --timeout={timeout}s" +def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool: + """Check for the existence of a specific Deployment in a given namespace.""" + command = ( + f'kubectl get deployment {deployment_name} -n' + f' {namespace} --ignore-not-found' ) - xpk_print(f"Verifying if kube-dns has scaled down...") - return_code_kube_dns = run_command_with_updates( - command_kube_dns_wait_scaled_down, "Wait for kube-dns scale down", args + result = run_command_with_updates( + command, 'Waiting for kubeDNS to be checked.', args ) - if return_code_kube_dns == 0: - xpk_print("kube-dns did not scale down successfully within the timeout.") - xpk_exit(1) # Exit if kube-dns cannot scale down + return result + + +def verify_coredns_readiness( + args, timeout: int = 120, namespace: str = 'kube-system' +): + """Verifies CoreDNS readiness using kubectl wait commands.""" + xpk_print('Now verifying CoreDNS readiness...') + kube_dns_exists = check_deployment_exists(args, 'kube-dns', namespace) + if kube_dns_exists: + # Wait for kube-dns to be fully scaled down + command_kube_dns_wait_scaled_down = ( + 'kubectl wait deployment/kube-dns' + " --for=jsonpath='{.status.replicas}'=0" + f' --namespace={namespace} --timeout={timeout}s' + ) + xpk_print('Verifying if kube-dns has scaled down...') + return_code_kube_dns = run_command_with_updates( + command_kube_dns_wait_scaled_down, 'Wait for kube-dns scale down', args + ) + if return_code_kube_dns != 0: + xpk_print('kube-dns did not scale down successfully within the timeout.') + xpk_exit(1) # Exit if kube-dns cannot scale down + else: + xpk_print('kube-dns has successfully scaled down.') else: - xpk_print("kube-dns did not scale down successfully within the timeout.") + xpk_print('kube-dns deployment not found.') # Wait for CoreDNS to be fully scaled up and available command_coredns_wait_available = ( - f"kubectl wait deployment/coredns --for=condition=Available=true " - f"--namespace={namespace} --timeout={timeout}s" + 'kubectl wait deployment/coredns --for=condition=Available=true' + f' --namespace={namespace} --timeout={timeout}s' ) - xpk_print(f"Verifying if CoreDNS is available...") + xpk_print('Verifying if CoreDNS is available...') return_code_coredns = run_command_with_updates( - command_coredns_wait_available, "Wait for coredns available", args + command_coredns_wait_available, 'Wait for coredns available', args ) if return_code_coredns != 0: - xpk_print("CoreDNS verification failed, it might not have fully started within the timeout.") - xpk_exit(1) # Exit if coredns cannot become available + xpk_print( + 'CoreDNS verification failed, it might not have fully started within' + ' the timeout.' + ) + xpk_exit(1) # Exit if coredns cannot become available - xpk_print("CoreDNS has successfully started and passed verification.") + xpk_print('CoreDNS has successfully started and passed verification.') -def _cleanup_coredns_repo(coredns_repo_full_path: str): +def cleanup_coredns_repo(coredns_repo_full_path: str): """Deletes the cloned CoreDNS deployment directory.""" - xpk_print(f"Task: 'Deleting CoreDNS deployment directory' in progress: {coredns_repo_full_path}") + xpk_print( + "Task: 'Deleting CoreDNS deployment directory' in progress:" + f' {coredns_repo_full_path}' + ) try: shutil.rmtree(coredns_repo_full_path) - xpk_print(f"Successfully deleted directory: {coredns_repo_full_path}") + xpk_print(f'Successfully deleted directory: {coredns_repo_full_path}') except OSError as e: - xpk_print(f"Error deleting directory {coredns_repo_full_path}: {e}") + xpk_print(f'Error deleting directory {coredns_repo_full_path}: {e}') + def update_coredns(args): """Updates and deploys CoreDNS within a cluster. @@ -828,37 +884,37 @@ def update_coredns(args): Returns: 0 if successful and 1 otherwise. """ - coredns_repo_dir = os.path.expanduser("/tmp/") - coredns_repo_dir_name = "deployment" + coredns_repo_dir = os.path.expanduser('/tmp/') + coredns_repo_dir_name = 'deployment' coredns_repo_full_path = os.path.join(coredns_repo_dir, coredns_repo_dir_name) - coredns_k8s_path = os.path.join(coredns_repo_full_path, "kubernetes") - + coredns_k8s_path = os.path.join(coredns_repo_full_path, 'kubernetes') # 1. Install jq - _install_jq(args) + install_jq(args) # 2. Clone CoreDNS deployment repository - _clone_coredns_deployment_repo(args, coredns_repo_full_path) + clone_coredns_deployment_repo(args, coredns_repo_full_path) # 3. Deploy CoreDNS to the cluster - _deploy_coredns_manifests(args, coredns_k8s_path) + deploy_coredns_manifests(args, coredns_k8s_path) # 4. Scale down kube-dns-autoscaler - _scale_down_deployment(args, "kube-dns-autoscaler") + scale_down_deployment(args, 'kube-dns-autoscaler') # 5. Scale down kube-dns - _scale_down_deployment(args, "kube-dns") + scale_down_deployment(args, 'kube-dns') # 6. Scale up coredns and verify readiness - _scale_up_coredns(args, replicas=15) - _verify_coredns_readiness(args, timeout=120) + scale_up_coredns(args, replicas=15) + verify_coredns_readiness(args, timeout=120) - xpk_print("The CoreDNS setup process has been completed.") + xpk_print('The CoreDNS setup process has been completed.') # 7. Cleanup - _cleanup_coredns_repo(coredns_repo_full_path) + cleanup_coredns_repo(coredns_repo_full_path) return 0 + def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool: """Checks if the CoreDNS deployment exists in the given namespace. @@ -868,24 +924,26 @@ def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool: Returns: True if the 'coredns' deployment exists, False otherwise. """ - - command = f"kubectl get deployment coredns -n {namespace}" - try: - xpk_print(f"Task: 'Checking CoreDNS deployment existence' in progress for namespace: {namespace}") - return_code = run_command_with_updates( - command, f"Check CoreDNS deployment in {namespace}", args + command = f'kubectl get deployment coredns -n {namespace}' + xpk_print( + "Task: 'Checking CoreDNS deployment existence' in progress for" + f' namespace: {namespace}' + ) + return_code = run_command_with_updates( + command, f'Check CoreDNS deployment in {namespace}', args + ) + if return_code == 0: + verify_coredns_readiness(args) + xpk_print(f"CoreDNS deployment 'coredns' found in namespace '{namespace}'.") + return True + else: + xpk_print( + f"CoreDNS deployment 'coredns' NOT found in namespace '{namespace}' or" + ' an error occurred.' ) - if return_code == 0: - _verify_coredns_readiness(args) - xpk_print(f"CoreDNS deployment 'coredns' found in namespace '{namespace}'.") - return True - else: - xpk_print(f"CoreDNS deployment 'coredns' NOT found in namespace '{namespace}' or an error occurred.") - return False - except Exception as e: - xpk_print(f"Error checking CoreDNS deployment existence: {e}") return False + def update_coredns_if_necessary(args) -> int: """Updates and deploys CoreDNS within the cluster if it's not already present. @@ -929,7 +987,7 @@ def create_cluster_if_necessary( return 0 else: return run_gke_cluster_create_command( - args, gke_control_plane_version, system + args, gke_control_plane_version, system ) From 1b296f7b9fd2af032a5674245fb7a76594b17d9e Mon Sep 17 00:00:00 2001 From: DannyLi Date: Thu, 10 Jul 2025 08:26:49 +0000 Subject: [PATCH 14/41] Delete this steps listing. --- src/xpk/commands/cluster.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index 5fbf3ad00..12138cd54 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -869,15 +869,6 @@ def cleanup_coredns_repo(coredns_repo_full_path: str): def update_coredns(args): """Updates and deploys CoreDNS within a cluster. - This function performs the following steps: - 1. Installs 'jq'. - 2. Clones the CoreDNS deployment repository from GitHub if it doesn't already exist. - 3. Deploys CoreDNS to the cluster. - 4. Scales down the 'kube-dns-autoscaler' and 'kube-dns' deployments. - 5. Scales up the 'coredns' deployment to 15 replicas. - 6. Waits for kube-dns to scale down and coredns to be ready using kubectl wait. - 7. Cleans up the cloned repository. - Args: args: user provided arguments for running the command. From 133f2a4a10ad912ed99d7603f41b9dbfaa97f860 Mon Sep 17 00:00:00 2001 From: pawloch00 Date: Fri, 18 Jul 2025 14:41:17 +0200 Subject: [PATCH 15/41] Fix max-nodes when creating flex queued nodepool of tpus (#541) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix max-nodes when creating tpu dws flex queued nodepools Signed-off-by: Piotr Pawłowski --------- Signed-off-by: Piotr Pawłowski --- src/xpk/core/capacity.py | 4 ++-- src/xpk/core/kjob.py | 1 - src/xpk/core/nodepool.py | 6 +++++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/xpk/core/capacity.py b/src/xpk/core/capacity.py index 3e3567412..08d17c09b 100644 --- a/src/xpk/core/capacity.py +++ b/src/xpk/core/capacity.py @@ -173,7 +173,7 @@ def verify_reservation_exists(args) -> int: def get_capacity_arguments_from_capacity_type( - args, capacity_type: CapacityType + args, capacity_type: CapacityType, max_nodes: int ) -> tuple[str, int]: """Determine the Nodepool creation capacity arguments needed. @@ -197,7 +197,7 @@ def get_capacity_arguments_from_capacity_type( capacity_args = ( ' --flex-start --enable-queued-provisioning --enable-autoscaling' ' --location-policy=ANY --reservation-affinity=none' - ' --no-enable-autorepair --max-nodes=1' + f' --no-enable-autorepair --max-nodes={max_nodes}' ) case CapacityType.RESERVATION: capacity_args = ( diff --git a/src/xpk/core/kjob.py b/src/xpk/core/kjob.py index 5fbe06f12..318fe19d2 100644 --- a/src/xpk/core/kjob.py +++ b/src/xpk/core/kjob.py @@ -347,7 +347,6 @@ def create_pod_template_instance(args: Namespace, service_account: str) -> int: if pod_image is None or len(pod_image) == 0: pod_image = PodTemplateDefaults.IMAGE.value working_directory = config.get(KJOB_SHELL_WORKING_DIRECTORY) - xpk_print("working directory is: ", working_directory) if working_directory is None or len(working_directory) == 0: working_directory = PodTemplateDefaults.WORKING_DIRECTORY.value diff --git a/src/xpk/core/nodepool.py b/src/xpk/core/nodepool.py index 681f62a8d..fccd7c886 100644 --- a/src/xpk/core/nodepool.py +++ b/src/xpk/core/nodepool.py @@ -77,8 +77,12 @@ def run_gke_node_pool_create_command( if return_code > 0: xpk_print('Listing all reservations failed!') return_code = 1 + if system.accelerator_type == AcceleratorType['TPU']: + max_nodes = system.vms_per_slice + else: + max_nodes = 1000 capacity_args, return_code = get_capacity_arguments_from_capacity_type( - args, capacity_type + args, capacity_type, max_nodes ) if return_code > 0: xpk_print('Parsing capacity arguments failed!') From 8b8f767e9f218f2b5c388420b5d43bff092110de Mon Sep 17 00:00:00 2001 From: pawloch00 Date: Tue, 22 Jul 2025 17:24:52 +0200 Subject: [PATCH 16/41] Fix kueue version in yaml string and loosen dependecy on cloud-storage (#546) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix kueue version Signed-off-by: Piotr Pawłowski --------- Signed-off-by: Piotr Pawłowski --- pyproject.toml | 2 +- src/xpk/core/kueue.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 19a36365e..a658d4634 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ dependencies = [ "google-api-core==2.24.1", "packaging==24.2", "google-cloud-filestore==1.12.0", - "google-cloud-storage==2.19.0" + "google-cloud-storage" ] [project.urls] diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index 9812a7508..a6c6872b4 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -220,7 +220,7 @@ - --zap-log-level=2 command: - /manager - image: registry.k8s.io/kueue/kueue:v0.10.0 + image: registry.k8s.io/kueue/kueue:v0.12.2 imagePullPolicy: Always livenessProbe: httpGet: From e39a7a7bd85d159b6abcb031f2a313898bb4d70d Mon Sep 17 00:00:00 2001 From: pawloch00 Date: Wed, 23 Jul 2025 14:45:51 +0200 Subject: [PATCH 17/41] Remove RBAC container (#547) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * remove rbac contaier Signed-off-by: Piotr Pawłowski --------- Signed-off-by: Piotr Pawłowski --- src/xpk/core/kueue.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index a6c6872b4..31430838d 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -220,7 +220,7 @@ - --zap-log-level=2 command: - /manager - image: registry.k8s.io/kueue/kueue:v0.12.2 + image: registry.k8s.io/kueue/kueue:{KUEUE_VERSION} imagePullPolicy: Always livenessProbe: httpGet: @@ -258,17 +258,6 @@ - mountPath: /controller_manager_config.yaml name: manager-config subPath: controller_manager_config.yaml - - args: - - --secure-listen-address=0.0.0.0:8443 - - --upstream=http://127.0.0.1:8080/ - - --logtostderr=true - - --v=10 - image: registry.k8s.io/kubebuilder/kube-rbac-proxy:v0.16.0 - name: kube-rbac-proxy - ports: - - containerPort: 8443 - name: https - protocol: TCP securityContext: runAsNonRoot: true serviceAccountName: kueue-controller-manager @@ -536,7 +525,7 @@ def update_kueue_resources_if_necessary(args): f'{max(math.ceil(int(out) * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE)}Mi' ) yml_string = kueue_controller_manager_yml.format( - memory_limit_size=new_memory_limit, + memory_limit_size=new_memory_limit, KUEUE_VERSION=KUEUE_VERSION ) tmp = write_tmp_file(yml_string) command = f'kubectl apply -f {str(tmp.file.name)}' From ab5bc712fb688c342e3f15c5b09df0e71dde8e19 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Wed, 23 Jul 2025 16:44:19 +0200 Subject: [PATCH 18/41] Merge main to develop (#542) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * version updated to 0.10.0 * Fix max-nodes when creating flex queued nodepool of tpus (#541) * fix max-nodes when creating tpu dws flex queued nodepools Signed-off-by: Piotr Pawłowski --------- Signed-off-by: Piotr Pawłowski --------- Signed-off-by: Piotr Pawłowski Co-authored-by: pawloch00 --- src/xpk/core/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpk/core/config.py b/src/xpk/core/config.py index c15c401a8..428f523de 100644 --- a/src/xpk/core/config.py +++ b/src/xpk/core/config.py @@ -22,7 +22,7 @@ from ..utils.console import xpk_print # This is the version for XPK PyPI package -__version__ = 'v0.9.0' +__version__ = 'v0.10.0' XPK_CURRENT_VERSION = __version__ XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml') From 4c12e2ad3a0a3cac30d3cb95b6fb0f09c4c375d4 Mon Sep 17 00:00:00 2001 From: pawloch00 Date: Wed, 23 Jul 2025 18:02:51 +0200 Subject: [PATCH 19/41] fix kjob.py pyink (#552) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Piotr Pawłowski --- src/xpk/core/kjob.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/xpk/core/kjob.py b/src/xpk/core/kjob.py index ab082e1e4..318fe19d2 100644 --- a/src/xpk/core/kjob.py +++ b/src/xpk/core/kjob.py @@ -333,6 +333,7 @@ def create_job_template_instance( args=args, ) + def create_pod_template_instance(args: Namespace, service_account: str) -> int: """Create new PodTemplate instance on cluster with default settings. From 2b7c5f528655656ae4b60b8a7f5c0678c072271c Mon Sep 17 00:00:00 2001 From: Sujeeth Jinesh Date: Fri, 25 Jul 2025 00:41:14 -0700 Subject: [PATCH 20/41] Update Kueue to create visibility folder (#556) --- src/xpk/core/kueue.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index 31430838d..8f69c40c8 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -252,6 +252,8 @@ securityContext: allowPrivilegeEscalation: false volumeMounts: + - mountPath: /visibility + name: visibility - mountPath: /tmp/k8s-webhook-server/serving-certs name: cert readOnly: true @@ -263,6 +265,8 @@ serviceAccountName: kueue-controller-manager terminationGracePeriodSeconds: 10 volumes: + - name: visibility + emptyDir: {{}} - name: cert secret: defaultMode: 420 From c0fb3f68c739df0e049bb36e1eec9d03d13633ec Mon Sep 17 00:00:00 2001 From: Sujeeth Jinesh Date: Sun, 27 Jul 2025 23:54:46 -0700 Subject: [PATCH 21/41] Update CPU limits to 750m (#558) --- src/xpk/core/kueue.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index 8f69c40c8..d7bbdfbf8 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -244,10 +244,10 @@ periodSeconds: 10 resources: limits: - cpu: 500m + cpu: 750m memory: {memory_limit_size} requests: - cpu: 500m + cpu: 750m memory: 512Mi securityContext: allowPrivilegeEscalation: false From 03b2b3b40739fa9b63e9b9c3469f2496eccd0a86 Mon Sep 17 00:00:00 2001 From: pawloch00 Date: Mon, 28 Jul 2025 10:32:50 +0200 Subject: [PATCH 22/41] Merge main release 0.10.1 (#555) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * version updated to 0.10.0 * Fix max-nodes when creating flex queued nodepool of tpus (#541) * fix max-nodes when creating tpu dws flex queued nodepools Signed-off-by: Piotr Pawłowski --------- Signed-off-by: Piotr Pawłowski * Release 0.10.1 (#553) * fix Signed-off-by: Piotr Pawłowski * Merge develop to release-0.10 (#551) * Fix max-nodes when creating flex queued nodepool of tpus (#541) * fix max-nodes when creating tpu dws flex queued nodepools Signed-off-by: Piotr Pawłowski --------- Signed-off-by: Piotr Pawłowski * Fix kueue version in yaml string and loosen dependecy on cloud-storage (#546) * fix kueue version Signed-off-by: Piotr Pawłowski --------- Signed-off-by: Piotr Pawłowski * Remove RBAC container (#547) * remove rbac contaier Signed-off-by: Piotr Pawłowski --------- Signed-off-by: Piotr Pawłowski * Merge main to develop (#542) * version updated to 0.10.0 * Fix max-nodes when creating flex queued nodepool of tpus (#541) * fix max-nodes when creating tpu dws flex queued nodepools Signed-off-by: Piotr Pawłowski --------- Signed-off-by: Piotr Pawłowski --------- Signed-off-by: Piotr Pawłowski Co-authored-by: pawloch00 * fix kjob.py pyink (#552) Signed-off-by: Piotr Pawłowski --------- Signed-off-by: Piotr Pawłowski Co-authored-by: Farhad Sharabiani * bump xpk version Signed-off-by: Piotr Pawłowski --------- Signed-off-by: Piotr Pawłowski Co-authored-by: Farhad Sharabiani --------- Signed-off-by: Piotr Pawłowski Co-authored-by: Farhad Sharabiani --- src/xpk/core/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpk/core/config.py b/src/xpk/core/config.py index 428f523de..67e71fa56 100644 --- a/src/xpk/core/config.py +++ b/src/xpk/core/config.py @@ -22,7 +22,7 @@ from ..utils.console import xpk_print # This is the version for XPK PyPI package -__version__ = 'v0.10.0' +__version__ = 'v0.10.1' XPK_CURRENT_VERSION = __version__ XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml') From 44b6d7be5e1ddc7868aa167e984d4ccf6b7e2f57 Mon Sep 17 00:00:00 2001 From: pawloch00 Date: Mon, 28 Jul 2025 10:50:09 +0200 Subject: [PATCH 23/41] Revert "Merge main release 0.10.1 (#555)" (#559) This reverts commit 03b2b3b40739fa9b63e9b9c3469f2496eccd0a86. --- src/xpk/core/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpk/core/config.py b/src/xpk/core/config.py index 67e71fa56..428f523de 100644 --- a/src/xpk/core/config.py +++ b/src/xpk/core/config.py @@ -22,7 +22,7 @@ from ..utils.console import xpk_print # This is the version for XPK PyPI package -__version__ = 'v0.10.1' +__version__ = 'v0.10.0' XPK_CURRENT_VERSION = __version__ XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml') From 14b33f23d0a5acdb2fd4b46129fe16af0533c745 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Tue, 29 Jul 2025 11:24:01 +0200 Subject: [PATCH 24/41] AutoscalingProfile was set to optimize_utilization (#565) --- src/xpk/commands/cluster.py | 1 + src/xpk/core/nap.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index 99126db65..143cee9c8 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -842,6 +842,7 @@ def run_gke_cluster_create_command( f' {args.custom_cluster_arguments}' f' {rapid_release_cmd}' ' --enable-dns-access' + ' --autoscaling-profile=optimize-utilization' ) enable_ip_alias = False diff --git a/src/xpk/core/nap.py b/src/xpk/core/nap.py index 9c788d6b4..11c314e97 100644 --- a/src/xpk/core/nap.py +++ b/src/xpk/core/nap.py @@ -99,6 +99,7 @@ def enable_autoprovisioning_on_cluster( f' --region={zone_to_region(args.zone)} --enable-autoprovisioning' ' --autoprovisioning-config-file' f' {autoprovisioning_config.config_filename}' + ' --autoscaling-profile=optimize-utilization' ) task = 'Update cluster with autoprovisioning enabled' return_code = run_command_with_updates(command, task, args) From 25fe39948d380529234ddc6e0890f0c71f7621ee Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Wed, 30 Jul 2025 09:03:29 +0200 Subject: [PATCH 25/41] "Select TPU by topology (#525)" + Fix errors (#563) * fix yaml formatting for workloads with TPU and NAP * refactor tpu system characteristics internal representation of TPU machines has changed, so now grep that used the old format fails * fix device_type issue --------- Co-authored-by: gcie --- .github/workflows/build_tests.yaml | 10 +- .../workflows/reusable_workload_tests.yaml | 5 +- src/xpk/core/capacity.py | 4 +- src/xpk/core/scheduling.py | 2 +- src/xpk/core/system_characteristics.py | 1237 +++-------------- 5 files changed, 172 insertions(+), 1086 deletions(-) diff --git a/.github/workflows/build_tests.yaml b/.github/workflows/build_tests.yaml index b3359852f..58b795ca7 100644 --- a/.github/workflows/build_tests.yaml +++ b/.github/workflows/build_tests.yaml @@ -40,6 +40,7 @@ jobs: group-name: ${{ steps.set-group-name.outputs.group-name }} zone: ${{ steps.set-zone.outputs.zone }} tpu-type: ${{ steps.set-tpu-type.outputs.tpu-type }} + tpu-type-topology: ${{ steps.set-tpu-type-topology.outputs.tpu-type-topology }} location: ${{steps.set-location.outputs.location}} run-id: ${{steps.set-run-id.outputs.run-id}} steps: @@ -76,6 +77,10 @@ jobs: id: set-tpu-type run: | echo tpu-type=v4-8 >> $GITHUB_OUTPUT + - name: set tpu-type-topology + id: set-tpu-type-topology + run: | + echo tpu-type-topology=v4-2x2x1 >> $GITHUB_OUTPUT - name: set location id: set-location run: | @@ -152,7 +157,7 @@ jobs: with: run-id: '${{needs.set-variables.outputs.run-id}}' cluster-name: '${{needs.set-variables.outputs.cluster-name}}' - tpu-type: '${{needs.set-variables.outputs.tpu-type || inputs.tpu-type}}' + tpu-type: '${{needs.set-variables.outputs.tpu-type-topology || inputs.tpu-type}}' zone: '${{needs.set-variables.outputs.zone}}' location: '${{needs.set-variables.outputs.location}}' secrets: inherit @@ -165,7 +170,7 @@ jobs: with: cluster-name-dws: '${{needs.set-variables.outputs.cluster-name-dws}}' cluster-name: '${{needs.set-variables.outputs.cluster-name}}' - tpu-type: '${{needs.set-variables.outputs.tpu-type || inputs.tpu-type}}' + tpu-type: '${{needs.set-variables.outputs.tpu-type-topology || inputs.tpu-type}}' zone: '${{needs.set-variables.outputs.zone}}' location: '${{needs.set-variables.outputs.location}}' run-id: '${{needs.set-variables.outputs.run-id}}' @@ -180,6 +185,7 @@ jobs: cluster-name: ${{needs.set-variables.outputs.cluster-name}} cluster-name-dws: '${{needs.set-variables.outputs.cluster-name-dws}}' tpu-type: ${{needs.set-variables.outputs.tpu-type}} + tpu-type-topology: ${{needs.set-variables.outputs.tpu-type-topology}} zone: ${{needs.set-variables.outputs.zone}} run-id: '${{needs.set-variables.outputs.run-id}}' secrets: inherit diff --git a/.github/workflows/reusable_workload_tests.yaml b/.github/workflows/reusable_workload_tests.yaml index 6bf5f14d6..fad5034cd 100644 --- a/.github/workflows/reusable_workload_tests.yaml +++ b/.github/workflows/reusable_workload_tests.yaml @@ -24,6 +24,9 @@ on: tpu-type: required: true type: string + tpu-type-topology: + required: true + type: string tpu-type-dws: required: false type: string @@ -108,7 +111,7 @@ jobs: --docker-password='${{secrets.GCP_SA_KEY}}' \ --docker-email='${{secrets.GCP_SA_EMAIL}}' - name: Run workload with private image - run: python xpk.py workload create --cluster ${{inputs.cluster-name}} --workload $PRIVATE_IMAGE_WORKLOAD_NAME --command "echo foo" --tpu-type=${{inputs.tpu-type}} --num-slices=1 --zone=${{inputs.zone}} --docker-image=${{secrets.DOCKER_REPO_SERVER}}ubuntu2004 --docker-image-pull-secret=gcr-key + run: python xpk.py workload create --cluster ${{inputs.cluster-name}} --workload $PRIVATE_IMAGE_WORKLOAD_NAME --command "echo foo" --tpu-type=${{inputs.tpu-type-topology}} --num-slices=1 --zone=${{inputs.zone}} --docker-image=${{secrets.DOCKER_REPO_SERVER}}ubuntu2004 --docker-image-pull-secret=gcr-key - name: Wait for private image workload completion and confirm it succeeded run: python3 xpk.py workload list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --wait-for-job-completion $PRIVATE_IMAGE_WORKLOAD_NAME --timeout 300 - name: Delete kubectl secret diff --git a/src/xpk/core/capacity.py b/src/xpk/core/capacity.py index 08d17c09b..93f2d672c 100644 --- a/src/xpk/core/capacity.py +++ b/src/xpk/core/capacity.py @@ -232,9 +232,9 @@ def get_capacity_node_selectors_from_capacity_type( case CapacityType.ON_DEMAND.name: node_selector = '' case CapacityType.FLEX_START.name: - node_selector = 'cloud.google.com/gke-queued="true"' + node_selector = 'cloud.google.com/gke-queued: "true"' case CapacityType.SPOT.name: - node_selector = 'cloud.google.com/gke-spot="true"' + node_selector = 'cloud.google.com/gke-spot: "true"' case CapacityType.RESERVATION.name: node_selector = f'cloud.google.com/reservation-name: {args.reservation}' case _: diff --git a/src/xpk/core/scheduling.py b/src/xpk/core/scheduling.py index 8bc18c66d..d8957e133 100644 --- a/src/xpk/core/scheduling.py +++ b/src/xpk/core/scheduling.py @@ -49,7 +49,7 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool: missing_gke_accelerator_type = False if not cluster_config_map.get(system.gke_accelerator): xpk_print( - f'Gke Accelerator Type Check: {args.workload} is requesting' + f'GKE Accelerator Type Check: {args.workload} is requesting' f' {system.gke_accelerator} but cluster only contains' f' {cluster_config_map.keys()}. ' ) diff --git a/src/xpk/core/system_characteristics.py b/src/xpk/core/system_characteristics.py index 48fd2c6f3..68a5b89c2 100644 --- a/src/xpk/core/system_characteristics.py +++ b/src/xpk/core/system_characteristics.py @@ -15,6 +15,8 @@ """ from dataclasses import dataclass +from functools import reduce +from operator import mul AcceleratorType = {'TPU': 1, 'GPU': 2, 'CPU': 3} @@ -91,6 +93,34 @@ def get_system_characteristics_by_device_type( return None, 1 +def get_tpu_system_characteristics_map( + prefix: str, + tensorcores_per_chip: int, + gke_accelerator: str, + machine_type: str, + supported_topologies: list[str], +) -> dict[str, SystemCharacteristics]: + system_characteristics_map = {} + for topology in supported_topologies: + total_chips = reduce(mul, (int(x) for x in topology.split('x')), 1) + num_tensorcores = total_chips * tensorcores_per_chip + chips_per_vm = 1 if total_chips == 1 else 4 + vms_per_slice = total_chips // chips_per_vm + system = SystemCharacteristics( + topology, + vms_per_slice, + gke_accelerator, + machine_type, + chips_per_vm, + AcceleratorType['TPU'], + f'{prefix}-{num_tensorcores}', + ) + system_characteristics_map[f'{prefix}-{topology}'] = system + system_characteristics_map[f'{prefix}-{num_tensorcores}'] = system + + return system_characteristics_map + + ################### Subcommand Helper Functions ############################# """ !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! IF YOU MODIFY THE BELOW UserFacingNameToSystemCharacteristics MAP YOU SHOULD @@ -212,1098 +242,145 @@ def get_system_characteristics_by_device_type( 'h100-mega-80gb-8', ), # TPU system characteristics - # v6e - 'v6e-1': SystemCharacteristics( - '1x1', - 1, - 'tpu-v6e-slice', - 'ct6e-standard-1t', - 1, - AcceleratorType['TPU'], - 'v6e-1', + **get_tpu_system_characteristics_map( + 'v6e', 1, 'tpu-v6e-slice', 'ct6e-standard-1t', ['1x1'] ), - 'v6e-4': SystemCharacteristics( - '2x2', + **get_tpu_system_characteristics_map( + 'v6e', 1, 'tpu-v6e-slice', 'ct6e-standard-4t', - 4, - AcceleratorType['TPU'], - 'v6e-4', + ['2x2', '2x4', '4x4', '4x8', '8x8', '8x16', '16x16'], ), - 'v6e-8': SystemCharacteristics( - '2x4', + **get_tpu_system_characteristics_map( + 'v5p', 2, - 'tpu-v6e-slice', - 'ct6e-standard-4t', - 4, - AcceleratorType['TPU'], - 'v6e-8', - ), - 'v6e-16': SystemCharacteristics( - '4x4', - 4, - 'tpu-v6e-slice', - 'ct6e-standard-4t', - 4, - AcceleratorType['TPU'], - 'v6e-16', - ), - 'v6e-32': SystemCharacteristics( - '4x8', - 8, - 'tpu-v6e-slice', - 'ct6e-standard-4t', - 4, - AcceleratorType['TPU'], - 'v6e-32', - ), - 'v6e-64': SystemCharacteristics( - '8x8', - 16, - 'tpu-v6e-slice', - 'ct6e-standard-4t', - 4, - AcceleratorType['TPU'], - 'v6e-64', - ), - 'v6e-128': SystemCharacteristics( - '8x16', - 32, - 'tpu-v6e-slice', - 'ct6e-standard-4t', - 4, - AcceleratorType['TPU'], - 'v6e-128', - ), - 'v6e-256': SystemCharacteristics( - '16x16', - 64, - 'tpu-v6e-slice', - 'ct6e-standard-4t', - 4, - AcceleratorType['TPU'], - 'v6e-256', - ), - # v5p - 'v5p-8': SystemCharacteristics( - '2x2x1', - 1, 'tpu-v5p-slice', 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8', + [ + '2x2x1', + '2x2x2', + '2x2x4', + '2x4x4', + '4x4x4', + '4x4x8', + '4x4x12', + '4x8x8', + '4x4x20', + '4x8x12', + '4x4x28', + '8x8x8', + '4x12x12', + '4x8x20', + '4x4x44', + '8x8x12', + '4x4x52', + '4x8x28', + '4x12x20', + '8x8x16', + '4x4x68', + '8x12x12', + '4x4x76', + '8x8x20', + '4x12x28', + '4x8x44', + '4x4x92', + '8x12x16', + '4x20x20', + '4x8x52', + '12x12x12', + '8x8x28', + '4x4x116', + '8x12x20', + '4x4x124', + '8x16x16', + '4x12x44', + '4x8x68', + '4x20x28', + '12x12x16', + '4x4x148', + '4x8x76', + '4x12x52', + '8x16x20', + '4x4x164', + '8x12x28', + '4x4x172', + '8x8x44', + '12x12x20', + '4x8x92', + '4x4x188', + '12x16x16', + '4x28x28', + '8x20x20', + '4x12x68', + '8x8x52', + '4x4x212', + '12x12x24', + '4x20x44', + '8x16x28', + '4x12x76', + '4x8x116', + '4x4x236', + '12x16x20', + '4x4x244', + '4x8x124', + '12x12x28', + '16x16x16', + '4x20x52', + '8x12x44', + '8x8x68', + '4x12x92', + '8x20x28', + '12x16x24', + '4x8x148', + '12x20x20', + '8x8x76', + '4x28x44', + '8x12x52', + '16x16x20', + '12x12x36', + '4x8x164', + '12x16x28', + '4x20x68', + '4x8x172', + '4x12x116', + '8x16x44', + '12x20x24', + '4x28x52', + '8x8x92', + '4x12x124', + '4x8x188', + '4x20x76', + '16x16x24', + '12x24x24', + '16x20x28', + ], + ), + **get_tpu_system_characteristics_map( + 'v5litepod', + 1, + 'tpu-v5-lite-podslice', + 'ct5lp-hightpu-4t', + ['2x4', '4x4', '4x8', '8x8', '8x16', '16x16'], ), - 'v5p-16': SystemCharacteristics( - '2x2x2', + **get_tpu_system_characteristics_map( + 'v4', 2, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-16', - ), - 'v5p-32': SystemCharacteristics( - '2x2x4', - 4, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-32', - ), - 'v5p-64': SystemCharacteristics( - '2x4x4', - 8, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-64', - ), - 'v5p-128': SystemCharacteristics( - '4x4x4', - 16, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-128', - ), - 'v5p-256': SystemCharacteristics( - '4x4x8', - 32, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-256', - ), - 'v5p-384': SystemCharacteristics( - '4x4x12', - 48, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-384', - ), - 'v5p-512': SystemCharacteristics( - '4x8x8', - 64, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-512', - ), - 'v5p-640': SystemCharacteristics( - '4x4x20', - 80, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-640', - ), - 'v5p-768': SystemCharacteristics( - '4x8x12', - 96, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-768', - ), - 'v5p-896': SystemCharacteristics( - '4x4x28', - 112, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-896', - ), - 'v5p-1024': SystemCharacteristics( - '8x8x8', - 128, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1024', - ), - 'v5p-1152': SystemCharacteristics( - '4x12x12', - 144, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1152', - ), - 'v5p-1280': SystemCharacteristics( - '4x8x20', - 160, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1280', - ), - 'v5p-1408': SystemCharacteristics( - '4x4x44', - 176, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1408', - ), - 'v5p-1536': SystemCharacteristics( - '8x8x12', - 192, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1536', - ), - 'v5p-1664': SystemCharacteristics( - '4x4x52', - 208, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1664', - ), - 'v5p-1792': SystemCharacteristics( - '4x8x28', - 224, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1792', - ), - 'v5p-1920': SystemCharacteristics( - '4x12x20', - 240, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1920', - ), - 'v5p-2048': SystemCharacteristics( - '8x8x16', - 256, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2048', - ), - 'v5p-2176': SystemCharacteristics( - '4x4x68', - 272, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2176', - ), - 'v5p-2304': SystemCharacteristics( - '8x12x12', - 288, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2304', - ), - 'v5p-2432': SystemCharacteristics( - '4x4x76', - 304, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2432', - ), - 'v5p-2560': SystemCharacteristics( - '8x8x20', - 320, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2560', - ), - 'v5p-2688': SystemCharacteristics( - '4x12x28', - 336, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2688', - ), - 'v5p-2816': SystemCharacteristics( - '4x8x44', - 352, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2816', - ), - 'v5p-2944': SystemCharacteristics( - '4x4x92', - 368, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2944', - ), - 'v5p-3072': SystemCharacteristics( - '8x12x16', - 384, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3072', - ), - 'v5p-3200': SystemCharacteristics( - '4x20x20', - 400, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3200', - ), - 'v5p-3328': SystemCharacteristics( - '4x8x52', - 416, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3328', - ), - 'v5p-3456': SystemCharacteristics( - '12x12x12', - 432, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3456', - ), - 'v5p-3584': SystemCharacteristics( - '8x8x28', - 448, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3584', - ), - 'v5p-3712': SystemCharacteristics( - '4x4x116', - 464, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3712', - ), - 'v5p-3840': SystemCharacteristics( - '8x12x20', - 480, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3840', - ), - 'v5p-3968': SystemCharacteristics( - '4x4x124', - 496, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3968', - ), - 'v5p-4096': SystemCharacteristics( - '8x16x16', - 512, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4096', - ), - 'v5p-4224': SystemCharacteristics( - '4x12x44', - 528, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4224', - ), - 'v5p-4352': SystemCharacteristics( - '4x8x68', - 544, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4352', - ), - 'v5p-4480': SystemCharacteristics( - '4x20x28', - 560, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4480', - ), - 'v5p-4608': SystemCharacteristics( - '12x12x16', - 576, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4608', - ), - 'v5p-4736': SystemCharacteristics( - '4x4x148', - 592, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4736', - ), - 'v5p-4864': SystemCharacteristics( - '4x8x76', - 608, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4864', - ), - 'v5p-4992': SystemCharacteristics( - '4x12x52', - 624, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4992', - ), - 'v5p-5120': SystemCharacteristics( - '8x16x20', - 640, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-5120', - ), - 'v5p-5248': SystemCharacteristics( - '4x4x164', - 656, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-5248', - ), - 'v5p-5376': SystemCharacteristics( - '8x12x28', - 672, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-5376', - ), - 'v5p-5504': SystemCharacteristics( - '4x4x172', - 688, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-5504', - ), - 'v5p-5632': SystemCharacteristics( - '8x8x44', - 704, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-5632', - ), - 'v5p-5760': SystemCharacteristics( - '12x12x20', - 720, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-5760', - ), - 'v5p-5888': SystemCharacteristics( - '4x8x92', - 736, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-5888', - ), - 'v5p-6016': SystemCharacteristics( - '4x4x188', - 752, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6016', - ), - 'v5p-6144': SystemCharacteristics( - '12x16x16', - 768, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6144', - ), - 'v5p-6272': SystemCharacteristics( - '4x28x28', - 784, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6272', - ), - 'v5p-6400': SystemCharacteristics( - '8x20x20', - 800, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6400', - ), - 'v5p-6528': SystemCharacteristics( - '4x12x68', - 816, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6528', - ), - 'v5p-6656': SystemCharacteristics( - '8x8x52', - 832, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6656', - ), - 'v5p-6784': SystemCharacteristics( - '4x4x212', - 848, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6784', - ), - 'v5p-6912': SystemCharacteristics( - '12x12x24', - 864, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6912', - ), - 'v5p-7040': SystemCharacteristics( - '4x20x44', - 880, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7040', - ), - 'v5p-7168': SystemCharacteristics( - '8x16x28', - 896, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7168', - ), - 'v5p-7296': SystemCharacteristics( - '4x12x76', - 912, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7296', - ), - 'v5p-7424': SystemCharacteristics( - '4x8x116', - 928, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7424', - ), - 'v5p-7552': SystemCharacteristics( - '4x4x236', - 944, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7552', - ), - 'v5p-7680': SystemCharacteristics( - '12x16x20', - 960, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7680', - ), - 'v5p-7808': SystemCharacteristics( - '4x4x244', - 976, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7808', - ), - 'v5p-7936': SystemCharacteristics( - '4x8x124', - 992, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7936', - ), - 'v5p-8064': SystemCharacteristics( - '12x12x28', - 1008, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8064', - ), - 'v5p-8192': SystemCharacteristics( - '16x16x16', - 1024, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8192', - ), - 'v5p-8320': SystemCharacteristics( - '4x20x52', - 1040, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8320', - ), - 'v5p-8448': SystemCharacteristics( - '8x12x44', - 1056, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8448', - ), - 'v5p-8704': SystemCharacteristics( - '8x8x68', - 1088, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8704', - ), - 'v5p-8832': SystemCharacteristics( - '4x12x92', - 1104, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8832', - ), - 'v5p-8960': SystemCharacteristics( - '8x20x28', - 1120, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8960', - ), - 'v5p-9216': SystemCharacteristics( - '12x16x24', - 1152, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-9216', - ), - 'v5p-9472': SystemCharacteristics( - '4x8x148', - 1184, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-9472', - ), - 'v5p-9600': SystemCharacteristics( - '12x20x20', - 1200, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-9600', - ), - 'v5p-9728': SystemCharacteristics( - '8x8x76', - 1216, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-9728', - ), - 'v5p-9856': SystemCharacteristics( - '4x28x44', - 1232, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-9856', - ), - 'v5p-9984': SystemCharacteristics( - '8x12x52', - 1248, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-9984', - ), - 'v5p-10240': SystemCharacteristics( - '16x16x20', - 1280, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-10240', - ), - 'v5p-10368': SystemCharacteristics( - '12x12x36', - 1296, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-10368', - ), - 'v5p-10496': SystemCharacteristics( - '4x8x164', - 1312, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-10496', - ), - 'v5p-10752': SystemCharacteristics( - '12x16x28', - 1344, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-10752', - ), - 'v5p-10880': SystemCharacteristics( - '4x20x68', - 1360, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-10880', - ), - 'v5p-11008': SystemCharacteristics( - '4x8x172', - 1376, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-11008', - ), - 'v5p-11136': SystemCharacteristics( - '4x12x116', - 1392, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-11136', - ), - 'v5p-11264': SystemCharacteristics( - '8x16x44', - 1408, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-11264', - ), - 'v5p-11520': SystemCharacteristics( - '12x20x24', - 1440, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-11520', - ), - 'v5p-11648': SystemCharacteristics( - '4x28x52', - 1456, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-11648', - ), - 'v5p-11776': SystemCharacteristics( - '8x8x92', - 1472, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-11776', - ), - 'v5p-11904': SystemCharacteristics( - '4x12x124', - 1488, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-11904', - ), - 'v5p-12032': SystemCharacteristics( - '4x8x188', - 1504, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-12032', - ), - 'v5p-12160': SystemCharacteristics( - '4x20x76', - 1520, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-12160', - ), - 'v5p-12288': SystemCharacteristics( - '16x16x24', - 1536, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-12288', - ), - 'v5p-13824': SystemCharacteristics( - '12x24x24', - 1728, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-13824', - ), - 'v5p-17920': SystemCharacteristics( - '16x20x28', - 2240, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-17920', - ), - # v5litepod - 'v5litepod-8': SystemCharacteristics( - '2x4', - 2, - 'tpu-v5-lite-podslice', - 'ct5lp-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5litepod-8', - ), - 'v5litepod-16': SystemCharacteristics( - '4x4', - 4, - 'tpu-v5-lite-podslice', - 'ct5lp-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5litepod-16', - ), - 'v5litepod-32': SystemCharacteristics( - '4x8', - 8, - 'tpu-v5-lite-podslice', - 'ct5lp-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5litepod-32', - ), - 'v5litepod-64': SystemCharacteristics( - '8x8', - 16, - 'tpu-v5-lite-podslice', - 'ct5lp-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5litepod-64', - ), - 'v5litepod-128': SystemCharacteristics( - '8x16', - 32, - 'tpu-v5-lite-podslice', - 'ct5lp-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5litepod-128', - ), - 'v5litepod-256': SystemCharacteristics( - '16x16', - 64, - 'tpu-v5-lite-podslice', - 'ct5lp-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5litepod-256', - ), - # v4 - 'v4-8': SystemCharacteristics( - '2x2x1', - 1, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-8', - ), - 'v4-16': SystemCharacteristics( - '2x2x2', - 2, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-16', - ), - 'v4-32': SystemCharacteristics( - '2x2x4', - 4, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-32', - ), - 'v4-64': SystemCharacteristics( - '2x4x4', - 8, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-64', - ), - 'v4-128': SystemCharacteristics( - '4x4x4', - 16, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-128', - ), - 'v4-256': SystemCharacteristics( - '4x4x8', - 32, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-256', - ), - 'v4-512': SystemCharacteristics( - '4x8x8', - 64, 'tpu-v4-podslice', 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-512', - ), - 'v4-1024': SystemCharacteristics( - '8x8x8', - 128, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-1024', - ), - 'v4-1536': SystemCharacteristics( - '8x8x12', - 192, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-1536', - ), - 'v4-2048': SystemCharacteristics( - '8x8x16', - 256, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-2048', - ), - 'v4-4096': SystemCharacteristics( - '8x16x16', - 512, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-4096', + [ + '2x2x1', + '2x2x2', + '2x2x4', + '2x4x4', + '4x4x4', + '4x4x8', + '4x8x8', + '8x8x8', + '8x8x12', + '8x8x16', + '8x16x16', + ], ), # CPU system characteristics. # Note that chips_per_vm is actually the number of vCPUs in that CPU. From f2340d036e3ea7a5e7b32c7d5c0c2b72232248c0 Mon Sep 17 00:00:00 2001 From: Sujeeth Jinesh Date: Wed, 30 Jul 2025 16:45:40 -0700 Subject: [PATCH 26/41] Update CPU limit for large scale clusters (#571) --- src/xpk/core/jobset.py | 2 +- src/xpk/core/kueue.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/xpk/core/jobset.py b/src/xpk/core/jobset.py index 3b53c6a58..135cfda63 100644 --- a/src/xpk/core/jobset.py +++ b/src/xpk/core/jobset.py @@ -81,7 +81,7 @@ limits: memory: {memory_limit_size} requests: - cpu: 500m + cpu: 1000m memory: 128Mi securityContext: allowPrivilegeEscalation: false diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index d7bbdfbf8..257ed2bf1 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -244,10 +244,10 @@ periodSeconds: 10 resources: limits: - cpu: 750m + cpu: 1000m memory: {memory_limit_size} requests: - cpu: 750m + cpu: 1000m memory: 512Mi securityContext: allowPrivilegeEscalation: false From 5eaffd6d7c9dfaa37881939a61a4f787012c8ebb Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Thu, 7 Aug 2025 09:53:42 +0000 Subject: [PATCH 27/41] Update CODEOWNERS --- .github/CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 47feda594..f6af5039d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,2 +1,2 @@ -* @Obliviour @44past4 @sharabiani @pawloch00 @BluValor @gcie @RoshaniN +* @Obliviour @44past4 @sharabiani @pawloch00 @BluValor @gcie @RoshaniN @scaliby @jamOne- @SikaGrr @FIoannides @fatoshoti slice/ @mwysokin @mimowo @gabesaba @PBundyra @mwielgus @pajakd \ No newline at end of file From 2faa737e8b9df9ddde2f93057b7a9a7249d48644 Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Tue, 12 Aug 2025 09:39:13 +0000 Subject: [PATCH 28/41] fix: autoprovisioning cluster create --- src/xpk/core/nap.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/xpk/core/nap.py b/src/xpk/core/nap.py index 11c314e97..1eb4e76b0 100644 --- a/src/xpk/core/nap.py +++ b/src/xpk/core/nap.py @@ -99,7 +99,6 @@ def enable_autoprovisioning_on_cluster( f' --region={zone_to_region(args.zone)} --enable-autoprovisioning' ' --autoprovisioning-config-file' f' {autoprovisioning_config.config_filename}' - ' --autoscaling-profile=optimize-utilization' ) task = 'Update cluster with autoprovisioning enabled' return_code = run_command_with_updates(command, task, args) @@ -107,6 +106,18 @@ def enable_autoprovisioning_on_cluster( xpk_print(f'{task} request returned ERROR {return_code}') return autoprovisioning_config, return_code + command = ( + 'gcloud container clusters update' + f' {args.cluster} --project={args.project}' + f' --region={zone_to_region(args.zone)}' + ' --autoscaling-profile=optimize-utilization' + ) + task = 'Update cluster with autoscaling-profile' + return_code = run_command_with_updates(command, task, args) + if return_code != 0: + xpk_print(f'{task} request returned ERROR {return_code}') + return autoprovisioning_config, return_code + # Update created accelerator node pools to support autoprovisioning. existing_node_pool_names, return_code = get_all_nodepools_programmatic(args) if return_code != 0: From 42cb07ba263636cc2f1b2e4c45d46fbe9f81d9bb Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Mon, 18 Aug 2025 15:57:23 +0000 Subject: [PATCH 29/41] fix: provisioning 1t tpu topologies --- src/xpk/core/nodepool.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/xpk/core/nodepool.py b/src/xpk/core/nodepool.py index fccd7c886..b043f70df 100644 --- a/src/xpk/core/nodepool.py +++ b/src/xpk/core/nodepool.py @@ -32,6 +32,8 @@ create_or_update_cluster_configmap, ) from .system_characteristics import AcceleratorType +from functools import reduce +from operator import mul CLOUD_PLATFORM_AUTH_SCOPE_URL = ( '"https://www.googleapis.com/auth/cloud-platform"' @@ -279,16 +281,19 @@ def run_gke_node_pool_create_command( ) if system.accelerator_type == AcceleratorType['TPU']: command += f' --node-version={gke_node_pool_version}' + topology_product = reduce(mul, (int(x) for x in system.topology.split('x')), 1) if capacity_type == CapacityType.FLEX_START: command += ' --num-nodes=0' - else: + elif topology_product > 1: command += f' --num-nodes={system.vms_per_slice}' - command += ' --placement-type=COMPACT --max-pods-per-node 15' command += ( f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}' ) - command += f' --tpu-topology={system.topology}' - command += f' {args.custom_tpu_nodepool_arguments}' + + if topology_product > 1: + command += ' --placement-type=COMPACT --max-pods-per-node 15' + command += f' --tpu-topology={system.topology}' + command += f' {args.custom_tpu_nodepool_arguments}' elif system.accelerator_type == AcceleratorType['GPU']: subnet_prefix = f'{args.cluster}-{zone_to_region(args.zone)}' if capacity_type == CapacityType.FLEX_START: From 91c9127579d200297d0c49d09ef63cf6e8c84139 Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Mon, 18 Aug 2025 16:18:22 +0000 Subject: [PATCH 30/41] style: reformat --- src/xpk/core/nodepool.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/xpk/core/nodepool.py b/src/xpk/core/nodepool.py index b043f70df..e5d8eda7a 100644 --- a/src/xpk/core/nodepool.py +++ b/src/xpk/core/nodepool.py @@ -281,7 +281,9 @@ def run_gke_node_pool_create_command( ) if system.accelerator_type == AcceleratorType['TPU']: command += f' --node-version={gke_node_pool_version}' - topology_product = reduce(mul, (int(x) for x in system.topology.split('x')), 1) + topology_product = reduce( + mul, (int(x) for x in system.topology.split('x')), 1 + ) if capacity_type == CapacityType.FLEX_START: command += ' --num-nodes=0' elif topology_product > 1: From 5c3b87b082bd84cd9a9d74f81bd529485f52fc47 Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Mon, 18 Aug 2025 16:55:49 +0000 Subject: [PATCH 31/41] fix: reorder custom_nodepool_arguments for node-pool create command --- src/xpk/core/nodepool.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/xpk/core/nodepool.py b/src/xpk/core/nodepool.py index fccd7c886..ea0f6fb59 100644 --- a/src/xpk/core/nodepool.py +++ b/src/xpk/core/nodepool.py @@ -275,7 +275,6 @@ def run_gke_node_pool_create_command( f' --host-maintenance-interval={args.host_maintenance_interval}' f' {capacity_args}' ' --enable-gvnic' - f' {args.custom_nodepool_arguments}' ) if system.accelerator_type == AcceleratorType['TPU']: command += f' --node-version={gke_node_pool_version}' @@ -319,6 +318,8 @@ def run_gke_node_pool_create_command( if args.enable_workload_identity or args.enable_gcsfuse_csi_driver: command += ' --workload-metadata=GKE_METADATA' + command += args.custom_nodepool_arguments + task = f'NodepoolCreate-{node_pool_name}' create_commands.append(command) create_task_names.append(task) From 880d83f7bf3466c1842cf5d1c35f8cc230480a46 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Mon, 18 Aug 2025 22:38:23 +0000 Subject: [PATCH 32/41] NAP memory limit increased --- src/xpk/core/nap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpk/core/nap.py b/src/xpk/core/nap.py index 1eb4e76b0..1f3700438 100644 --- a/src/xpk/core/nap.py +++ b/src/xpk/core/nap.py @@ -187,7 +187,7 @@ def create_autoprovisioning_config( """ memory_limits = """ minimum: 1 - maximum: 10000 + maximum: 10000000 """ # By default, the maximum chips is set to be the current number of resources used From 81e1babb3ffde751ccbbbece611779e1a6b653b5 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Mon, 18 Aug 2025 22:52:28 +0000 Subject: [PATCH 33/41] Revert "NAP memory limit increased" This reverts commit 880d83f7bf3466c1842cf5d1c35f8cc230480a46. --- src/xpk/core/nap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpk/core/nap.py b/src/xpk/core/nap.py index 1f3700438..1eb4e76b0 100644 --- a/src/xpk/core/nap.py +++ b/src/xpk/core/nap.py @@ -187,7 +187,7 @@ def create_autoprovisioning_config( """ memory_limits = """ minimum: 1 - maximum: 10000000 + maximum: 10000 """ # By default, the maximum chips is set to be the current number of resources used From 1f6d137a21f54cf98be69a9219a63cf613080d31 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Mon, 18 Aug 2025 22:55:57 +0000 Subject: [PATCH 34/41] NAP memory limit increased --- src/xpk/core/nap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpk/core/nap.py b/src/xpk/core/nap.py index 1eb4e76b0..1f3700438 100644 --- a/src/xpk/core/nap.py +++ b/src/xpk/core/nap.py @@ -187,7 +187,7 @@ def create_autoprovisioning_config( """ memory_limits = """ minimum: 1 - maximum: 10000 + maximum: 10000000 """ # By default, the maximum chips is set to be the current number of resources used From 9a41e4eb148179ecc3ecb2cfed97d2260a7d9381 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Mon, 18 Aug 2025 23:02:27 +0000 Subject: [PATCH 35/41] NAP cpu limit increased --- src/xpk/core/nap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpk/core/nap.py b/src/xpk/core/nap.py index 1f3700438..f53f9dbb8 100644 --- a/src/xpk/core/nap.py +++ b/src/xpk/core/nap.py @@ -183,7 +183,7 @@ def create_autoprovisioning_config( # is not controlled by NAP. cpu_limits = """ minimum: 1 - maximum: 10000 + maximum: 1000000 """ memory_limits = """ minimum: 1 From e80a686e6cd9db3d4b15108cd75352cd92c6c2fb Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Mon, 18 Aug 2025 21:57:30 -0700 Subject: [PATCH 36/41] fix #598 only install JQ when not installed --- src/xpk/commands/cluster.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index 2b93c0daf..e4cb29119 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -710,6 +710,9 @@ def cluster_create_ray_cluster(args) -> None: def install_jq(args): """Installs 'jq' utility.""" + if shutil.which('jq'): + xpk_print("Task: 'Install jq' skipped, jq already installed.") + return command_jq_install = 'sudo apt install jq -y' xpk_print("Task: 'Install jq' in progress.") return_code = run_command_with_updates(command_jq_install, 'Install jq', args) From b503f54a8c34cd66ff1a1d0bf963dd58f4f8d97f Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Thu, 21 Aug 2025 08:19:17 +0000 Subject: [PATCH 37/41] fix: custom nodepool arguments append --- src/xpk/core/nodepool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpk/core/nodepool.py b/src/xpk/core/nodepool.py index 931a04c8c..58ab038c4 100644 --- a/src/xpk/core/nodepool.py +++ b/src/xpk/core/nodepool.py @@ -325,7 +325,7 @@ def run_gke_node_pool_create_command( if args.enable_workload_identity or args.enable_gcsfuse_csi_driver: command += ' --workload-metadata=GKE_METADATA' - command += args.custom_nodepool_arguments + command += f' {args.custom_nodepool_arguments}' task = f'NodepoolCreate-{node_pool_name}' create_commands.append(command) From 86573a37789e848d84c8840a0408a058cd3993a3 Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Mon, 11 Aug 2025 10:47:22 +0000 Subject: [PATCH 38/41] feat: add tpu7x support --- src/xpk/core/system_characteristics.py | 109 +++++++++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/src/xpk/core/system_characteristics.py b/src/xpk/core/system_characteristics.py index 68a5b89c2..5ae4d9746 100644 --- a/src/xpk/core/system_characteristics.py +++ b/src/xpk/core/system_characteristics.py @@ -242,6 +242,115 @@ def get_tpu_system_characteristics_map( 'h100-mega-80gb-8', ), # TPU system characteristics + **get_tpu_system_characteristics_map( + 'tpu7x', 2, 'tpu7x', 'tpu7x-standard-1t', ['1x1x1'] + ), + **get_tpu_system_characteristics_map( + 'tpu7x', + 2, + 'tpu7x', + 'tpu7x-standard-4t', + [ + '12x12x12', + '12x12x16', + '12x12x20', + '12x12x24', + '12x12x28', + '12x12x36', + '12x16x16', + '12x16x20', + '12x16x24', + '12x16x28', + '12x20x20', + '12x20x24', + '12x24x24', + '16x16x16', + '16x16x20', + '16x16x24', + '16x16x32', + '16x20x28', + '16x24x24', + '2x2x1', + '2x2x2', + '2x2x4', + '2x4x4', + '4x12x116', + '4x12x12', + '4x12x124', + '4x12x20', + '4x12x28', + '4x12x44', + '4x12x52', + '4x12x68', + '4x12x76', + '4x12x92', + '4x20x20', + '4x20x28', + '4x20x44', + '4x20x52', + '4x20x68', + '4x20x76', + '4x28x28', + '4x28x44', + '4x28x52', + '4x4x116', + '4x4x12', + '4x4x124', + '4x4x148', + '4x4x164', + '4x4x172', + '4x4x188', + '4x4x20', + '4x4x212', + '4x4x236', + '4x4x244', + '4x4x28', + '4x4x4', + '4x4x44', + '4x4x52', + '4x4x68', + '4x4x76', + '4x4x8', + '4x4x92', + '4x8x116', + '4x8x12', + '4x8x124', + '4x8x148', + '4x8x164', + '4x8x172', + '4x8x188', + '4x8x20', + '4x8x28', + '4x8x44', + '4x8x52', + '4x8x68', + '4x8x76', + '4x8x8', + '4x8x92', + '8x12x12', + '8x12x16', + '8x12x20', + '8x12x28', + '8x12x44', + '8x12x52', + '8x16x16', + '8x16x20', + '8x16x28', + '8x16x44', + '8x20x20', + '8x20x28', + '8x8x12', + '8x8x16', + '8x8x20', + '8x8x28', + '8x8x44', + '8x8x52', + '8x8x68', + '8x8x76', + '8x8x8', + '8x8x92', + ], + ), **get_tpu_system_characteristics_map( 'v6e', 1, 'tpu-v6e-slice', 'ct6e-standard-1t', ['1x1'] ), From c8af3c431a767f6458d44ce9df99b439718aa5d0 Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Mon, 25 Aug 2025 08:06:11 +0000 Subject: [PATCH 39/41] fix: provisioning scopes for nap --- src/xpk/core/nap.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/xpk/core/nap.py b/src/xpk/core/nap.py index f53f9dbb8..8ba0d24ce 100644 --- a/src/xpk/core/nap.py +++ b/src/xpk/core/nap.py @@ -42,6 +42,8 @@ management: autoRepair: true autoUpgrade: true +scopes: + - "https://www.googleapis.com/auth/devstorage.read_write" autoprovisioningLocations: {zones} {resource_limits} From 0d4c8602324e1d2fdab35c2ae5a815e289c899e6 Mon Sep 17 00:00:00 2001 From: Konrad Kaim <31181410+scaliby@users.noreply.github.com> Date: Thu, 28 Aug 2025 10:41:23 +0200 Subject: [PATCH 40/41] Merge pull request #606 from AI-Hypercomputer/scaliby/b/434405026 Fix nodepool creation --- src/xpk/core/nodepool.py | 53 +++++++++++---- src/xpk/core/tests/unit/test_nodepool.py | 82 ++++++++++++++++++++++++ 2 files changed, 121 insertions(+), 14 deletions(-) create mode 100644 src/xpk/core/tests/unit/test_nodepool.py diff --git a/src/xpk/core/nodepool.py b/src/xpk/core/nodepool.py index 58ab038c4..cab159f15 100644 --- a/src/xpk/core/nodepool.py +++ b/src/xpk/core/nodepool.py @@ -14,6 +14,7 @@ limitations under the License. """ +from typing import List from ..utils.console import get_user_input, xpk_print from .capacity import ( AUTOPROVISIONING_CONFIG_VALUE, @@ -90,20 +91,26 @@ def run_gke_node_pool_create_command( xpk_print('Parsing capacity arguments failed!') return return_code - if system.accelerator_type == AcceleratorType['GPU']: - xpk_print( - f'Creating 1 node pool with {args.num_nodes} nodes of' - f' {system.device_type}\nUnderlyingly, we assume that means: {system}' - ) - desired_node_pool_names = [f'{args.cluster}-np-0'] - else: - xpk_print( - f'Creating {args.num_slices} node pool or pools of' - f' {system.device_type}\nUnderlyingly, we assume that means: {system}' - ) - desired_node_pool_names = [ - f'{args.cluster}-np-{slice_num}' for slice_num in range(args.num_slices) - ] + desired_node_pool_count = ( + 1 + if system.accelerator_type == AcceleratorType['GPU'] + else args.num_slices + ) + message = ( + ( + f'Creating 1 node pool with {args.num_nodes} nodes of' + f' {system.device_type}\nUnderlyingly, we assume that means: {system}' + ) + if system.accelerator_type == AcceleratorType['GPU'] + else ( + f'Creating {args.num_slices} node pool or pools of' + f' {system.device_type}\nUnderlyingly, we assume that means: {system}' + ) + ) + xpk_print(message) + desired_node_pool_names = get_desired_node_pool_names( + existing_node_pool_names, args.cluster, desired_node_pool_count + ) node_pools_to_remain = [] delete_commands = [] @@ -602,3 +609,21 @@ def get_nodepool_workload_metadata_mode( return 1, None return 0, nodepool_WI_mode.strip() + + +def get_desired_node_pool_names( + existing_node_pool_names: List[str], + cluster_name: str, + desired_node_pool_count: int, +) -> List[str]: + cluster_node_pools = [ + np + for np in existing_node_pool_names + if np.startswith(f'{cluster_name}-np-') + ] + result = set(cluster_node_pools[:desired_node_pool_count]) + i = 0 + while len(result) < desired_node_pool_count: + result.add(f'{cluster_name}-np-{i}') + i += 1 + return list(result) diff --git a/src/xpk/core/tests/unit/test_nodepool.py b/src/xpk/core/tests/unit/test_nodepool.py new file mode 100644 index 000000000..71cc540c3 --- /dev/null +++ b/src/xpk/core/tests/unit/test_nodepool.py @@ -0,0 +1,82 @@ +""" +Copyright 2025 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from xpk.core.nodepool import get_desired_node_pool_names + +CLUSTER_NAME = "running-cucumber" + + +def node_pool_name(number: int) -> str: + return f"{CLUSTER_NAME}-np-{number}" + + +def test_compute_desired_node_pool_names_with_desired_larger_than_existing(): + result = get_desired_node_pool_names( + existing_node_pool_names=[node_pool_name(0)], + cluster_name=CLUSTER_NAME, + desired_node_pool_count=2, + ) + + expected_result = [node_pool_name(0), node_pool_name(1)] + assert set(result) == set(expected_result) + + +def test_compute_desired_node_pool_names_with_desired_smaller_than_existing(): + result = get_desired_node_pool_names( + existing_node_pool_names=[node_pool_name(0), node_pool_name(1)], + cluster_name=CLUSTER_NAME, + desired_node_pool_count=1, + ) + + expected_result = [node_pool_name(0)] + assert set(result) == set(expected_result) + + +def test_compute_desired_node_pool_names_with_consecutive_numbers_missing(): + result = get_desired_node_pool_names( + existing_node_pool_names=[node_pool_name(0), node_pool_name(3)], + cluster_name=CLUSTER_NAME, + desired_node_pool_count=3, + ) + + expected_result = [node_pool_name(0), node_pool_name(1), node_pool_name(3)] + assert set(result) == set(expected_result) + + +def test_compute_desired_node_pool_names_with_consecutive_numbers_missing_and_desired_equal_to_existing(): + result = get_desired_node_pool_names( + existing_node_pool_names=[node_pool_name(0), node_pool_name(3)], + cluster_name=CLUSTER_NAME, + desired_node_pool_count=2, + ) + + expected_result = [node_pool_name(0), node_pool_name(3)] + assert set(result) == set(expected_result) + + +def test_compute_desired_node_pool_names_with_unknown_node_pools(): + result = get_desired_node_pool_names( + existing_node_pool_names=[ + "unknown-node-pool", + node_pool_name(0), + node_pool_name(3), + ], + cluster_name=CLUSTER_NAME, + desired_node_pool_count=2, + ) + + expected_result = [node_pool_name(0), node_pool_name(3)] + assert set(result) == set(expected_result) From 21c1c13f0829f42ed2bf99e84345d7bc85051c8c Mon Sep 17 00:00:00 2001 From: Konrad Kaim Date: Fri, 29 Aug 2025 09:08:20 +0000 Subject: [PATCH 41/41] Release v0.11.0 --- src/xpk/core/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpk/core/config.py b/src/xpk/core/config.py index 67e71fa56..279f6b8d1 100644 --- a/src/xpk/core/config.py +++ b/src/xpk/core/config.py @@ -22,7 +22,7 @@ from ..utils.console import xpk_print # This is the version for XPK PyPI package -__version__ = 'v0.10.1' +__version__ = 'v0.11.0' XPK_CURRENT_VERSION = __version__ XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')