Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/xpk/commands/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
get_storage_annotations,
prepare_kjob,
)
from ..core.kueue import LOCAL_QUEUE_NAME
from ..core.kueue_manager import LOCAL_QUEUE_NAME
from ..utils.console import xpk_exit, xpk_print
from ..utils.execution_context import is_dry_run
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
Expand Down
14 changes: 9 additions & 5 deletions src/xpk/commands/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,6 @@
)
from ..core.jobset import update_jobset_resources_if_necessary
from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
from ..core.kueue import (
cluster_preheat_yml,
)
from ..core.kueue_manager import (KueueConfig, KueueManager)
from ..core.nap import enable_autoprovisioning_on_cluster
from ..core.network import (
Expand Down Expand Up @@ -79,9 +76,13 @@
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
from . import cluster_gcluster
from .common import set_cluster_command
from jinja2 import Environment, FileSystemLoader
from ..utils.templates import TEMPLATE_PATH
import shutil
import os

CLUSTER_PREHEAT_JINJA_FILE = 'cluster_preheat.yaml.j2'


def cluster_adapt(args) -> None:
"""Function that performs cluster adaptation.
Expand Down Expand Up @@ -424,12 +425,15 @@ def cluster_cacheimage(args) -> None:
node_selector_key = AcceleratorTypeToAcceleratorCharacteristics[
system.accelerator_type
].accelerator_label
yml_string = cluster_preheat_yml.format(

template_env = Environment(loader=FileSystemLoader(TEMPLATE_PATH))
cluster_preheat_yaml = template_env.get_template(CLUSTER_PREHEAT_JINJA_FILE)
rendered_yaml = cluster_preheat_yaml.render(
cachekey=args.cache_key,
image_name=args.docker_image,
nodeSelectorKey=node_selector_key,
)
tmp = write_tmp_file(yml_string)
tmp = write_tmp_file(rendered_yaml)
command_apply = f'kubectl apply -f {str(tmp)}'
command_delete = f'kubectl delete -f {str(tmp)} --ignore-not-found=true'

Expand Down
2 changes: 0 additions & 2 deletions src/xpk/commands/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
from ..core.commands import run_command_for_value
from ..core.cluster import get_cluster_credentials
from ..core.gcloud_context import add_zone_and_project
from ..core.kueue import verify_kueuectl
from ..utils.console import xpk_exit, xpk_print
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies

Expand All @@ -46,7 +45,6 @@ def info(args: Namespace) -> None:
add_zone_and_project(args)
get_cluster_credentials(args)

verify_kueuectl()
lq, cq = bool(args.localqueue), bool(args.clusterqueue)
if not lq and not cq:
lq, cq = True, True
Expand Down
2 changes: 1 addition & 1 deletion src/xpk/commands/inspector.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from ..core.cluster import get_cluster_credentials
from ..core.commands import run_command_for_value
from ..core.gcloud_context import add_zone_and_project, get_cluster_location
from ..core.kueue import CLUSTER_QUEUE_NAME, LOCAL_QUEUE_NAME
from ..core.kueue_manager import CLUSTER_QUEUE_NAME, LOCAL_QUEUE_NAME
from ..core.resources import CLUSTER_METADATA_CONFIGMAP, CLUSTER_RESOURCES_CONFIGMAP
from ..utils.console import xpk_exit, xpk_print
from ..utils.file import append_tmp_file, write_tmp_file
Expand Down
34 changes: 15 additions & 19 deletions src/xpk/commands/kind.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
limitations under the License.
"""

from ..core.kueue_manager import (KueueConfig, KueueManager)
from ..core.commands import (
run_command_for_value,
run_command_with_updates,
Expand All @@ -24,11 +25,7 @@
prepare_kjob,
apply_kjob_crds,
)
from ..core.kueue import (
install_kueue_on_cluster,
install_kueue_crs,
wait_for_kueue_available,
)
from ..core.scheduling import get_total_chips_requested_from_args
from ..core.storage import install_storage_crd
from ..core.system_characteristics import (
SystemCharacteristics,
Expand Down Expand Up @@ -71,11 +68,6 @@ def cluster_create(args) -> None:
if set_jobset_on_cluster_code != 0:
xpk_exit(set_jobset_on_cluster_code)

xpk_print('Enabling Kueue on the cluster')
install_kueue_on_cluster_code = install_kueue_on_cluster()
if install_kueue_on_cluster_code != 0:
xpk_exit(install_kueue_on_cluster_code)

xpk_print('Verifying kjob installation')
err_code = verify_kjob_installed()
if err_code > 0:
Expand All @@ -94,11 +86,6 @@ def cluster_create(args) -> None:
k8s_client = setup_k8s_env(args)
install_storage_crd(k8s_client)

xpk_print('Wait for Kueue to be fully available')
wait_for_kueue_available_code = wait_for_kueue_available()
if wait_for_kueue_available_code != 0:
xpk_exit(wait_for_kueue_available_code)

args.num_slices = 1
args.enable_pathways = False
system = SystemCharacteristics(
Expand All @@ -112,10 +99,19 @@ def cluster_create(args) -> None:
supports_sub_slicing=False,
)

xpk_print('Install Kueue Custom Resources')
enable_kueue_credentials_code = install_kueue_crs(args, system, None)
if enable_kueue_credentials_code != 0:
xpk_exit(enable_kueue_credentials_code)
kueue_manager = KueueManager()
kueue_manager.install_or_upgrade(
KueueConfig(
system,
total_chips=get_total_chips_requested_from_args(args, system),
autoprovisioning_enabled=False,
num_slices=args.num_slices,
memory_limit='',
cpu_limit=0,
is_pathways_cluster=False,
flex=False,
),
)

xpk_print('Kind commands done! Resources are created.')
xpk_exit(0)
Expand Down
2 changes: 1 addition & 1 deletion src/xpk/commands/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
get_storage_annotations,
prepare_kjob,
)
from ..core.kueue import LOCAL_QUEUE_NAME
from ..core.kueue_manager import LOCAL_QUEUE_NAME
from ..utils.console import xpk_exit, xpk_print
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
from .kind import set_local_cluster_command
Expand Down
2 changes: 1 addition & 1 deletion src/xpk/commands/workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
)
from ..core.docker_resources import get_volumes, parse_env_config
from ..core.gcloud_context import add_zone_and_project
from ..core.kueue import LOCAL_QUEUE_NAME
from ..core.kueue_manager import LOCAL_QUEUE_NAME
from ..core.monitoring import get_gke_outlier_dashboard
from ..core.nap import (
get_autoprovisioning_node_selector_args,
Expand Down
2 changes: 1 addition & 1 deletion src/xpk/core/jobset.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

from ..utils.console import xpk_exit, xpk_print
from ..utils.file import write_tmp_file
from ..core.kueue import (
from ..core.kueue_manager import (
MEMORY_SIZE_PER_VM,
MIN_MEMORY_LIMIT_SIZE,
)
Expand Down
Loading
Loading