def get_kueue_covered_resources_config(
    cluster_hardware_name,
    resource_type,
    total_chips,
    cpu_limit=None,
    memory_limit=None,
) -> str:
  """Gets Kueue covered resources configuration.

  Renders a single resource-group entry with one flavor. The accelerator
  resource is always listed; "cpu" and "memory" are listed only when a quota
  was supplied for them.

  Args:
    cluster_hardware_name: name written as the flavor's `name`.
    resource_type: accelerator resource name; always the first covered
      resource, with `total_chips` as its nominal quota.
    total_chips: nominal quota for `resource_type`.
    cpu_limit: optional nominal quota for the "cpu" resource. A falsy value
      (None or 0) leaves "cpu" out of the configuration.
    memory_limit: optional nominal quota for the "memory" resource. A falsy
      value (None or an empty string) leaves "memory" out.

  Returns:
    A string of Kueue covered resources configuration.
  """
  # Local import so this block does not depend on the module's import list.
  import json

  # (resource name, nominal quota) pairs, in the order they are emitted.
  quotas = [(resource_type, total_chips)]
  if cpu_limit:
    quotas.append(('cpu', cpu_limit))
  if memory_limit:
    quotas.append(('memory', memory_limit))

  # json.dumps yields a double-quoted flow sequence (valid YAML), instead of
  # relying on the Python list repr.
  covered_resources = json.dumps([name for name, _ in quotas])

  # NOTE(review): the indentation of this YAML snippet was reconstructed from
  # a whitespace-collapsed view of the file - confirm it lines up with the
  # template this string is substituted into.
  lines = [
      '',
      f'  - coveredResources: {covered_resources}',
      '    flavors:',
      f'    - name: {cluster_hardware_name}',
      '      resources:',
  ]
  for name, quota in quotas:
    lines.append(f'      - name: "{name}"')
    lines.append(f'        nominalQuota: {quota}')
  return '\n'.join(lines)
def add_resource_limits(parser_or_group: ParserOrArgumentGroup):
  """Add optional CPU and memory quota arguments in cluster create.

  Registers `--memory-limit` (string) and `--cpu-limit` (integer); both
  default to None. The parsed values are exposed as `args.memory_limit` and
  `args.cpu_limit`, which the Kueue setup passes on as the nominal quota of
  the "memory" and "cpu" covered resources.

  Args:
    parser_or_group: the parser or argument group that receives the resource
      limit arguments.
  """
  parser_or_group.add_argument(
      '--memory-limit',
      type=str,
      default=None,
      help=(
          'The memory quota set as the nominalQuota of the "memory" resource'
          ' in the Kueue cluster queue, e.g. "100Gi". If not set, memory is'
          ' not added to the covered resources.'
      ),
  )
  parser_or_group.add_argument(
      '--cpu-limit',
      type=int,
      default=None,
      help=(
          'The CPU quota, in whole CPUs, set as the nominalQuota of the "cpu"'
          ' resource in the Kueue cluster queue. If not set, cpu is not'
          ' added to the covered resources.'
      ),
  )