From 9749763eb3c3323a3f467856687dc920ca3af212 Mon Sep 17 00:00:00 2001 From: Feidias Ioannidis Date: Tue, 23 Sep 2025 12:24:46 +0000 Subject: [PATCH 1/3] Add cpu and memory limit flags and use them in Kueue configuration --- src/xpk/core/kueue.py | 25 +++++++++++++++++++++---- src/xpk/parser/cluster.py | 26 ++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index 49f57a4fd..8f8e63c00 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -436,6 +436,8 @@ def install_kueue_crs( cluster_hardware_name=cluster_hardware_name, resource_type=resource_type, total_chips=total_chips, + cpu_limit=args.cpu_limit, + memory_limit=args.memory_limit, ) topology_label = '' if system.device_type in [ @@ -473,6 +475,7 @@ def install_kueue_crs( ]: yml_string = topology_yaml + yml_string + print(yml_string) tmp = write_tmp_file(yml_string) command = f'kubectl apply -f {str(tmp)}' @@ -484,7 +487,7 @@ def install_kueue_crs( def get_kueue_covered_resources_config( - cluster_hardware_name, resource_type, total_chips + cluster_hardware_name, resource_type, total_chips, cpu_limit, memory_limit ) -> str: """Gets Kueue covered resources configuration. @@ -497,17 +500,31 @@ def get_kueue_covered_resources_config( A string of Kueue covered resources configuration. """ config_format = """ - - coveredResources: ["{resource_type}"] + - coveredResources: {resource_types} flavors: - name: {cluster_hardware_name} resources: - name: "{resource_type}" - nominalQuota: {total_chips} - """ + nominalQuota: {total_chips}""" + resource_types = [resource_type] + if cpu_limit: + config_format = config_format + """ + - name: "cpu" + nominalQuota: {cpu_limit}""" + resource_types.append('cpu') + if memory_limit: + config_format = config_format + """ + - name: "memory" + nominalQuota: {memory_limit}""" + resource_types.append('memory') + config_string = config_format.format( cluster_hardware_name=cluster_hardware_name, + resource_types=resource_types, resource_type=resource_type, total_chips=total_chips, + cpu_limit=cpu_limit, + memory_limit=memory_limit, ) return config_string diff --git a/src/xpk/parser/cluster.py b/src/xpk/parser/cluster.py index 663a6bd3b..ce2fd8260 100644 --- a/src/xpk/parser/cluster.py +++ b/src/xpk/parser/cluster.py @@ -176,6 +176,12 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser): add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments) cluster_create_parser.set_defaults(func=cluster_create) + cluster_create_resource_limits = cluster_create_parser.add_argument_group( + 'Optional Resource Limits Arguments', + 'Arguments for configuring resource limits in cluster create.', + ) + add_resource_limits(cluster_create_resource_limits) + def set_cluster_create_pathways_parser( cluster_create_pathways_parser: ArgumentParser, @@ -887,3 +893,23 @@ def add_shared_cluster_create_mtc_arguments( ' checkpointing. By default, it is set to "google.com/tpu".' ), ) + + +def add_resource_limits(parser_or_group: ParserOrArgumentGroup): + """Add resource limits arguments in cluster create. + + Args: + List of cluster create resource limits arguments parsers or group + """ + parser_or_group.add_argument( + '--memory-limit', + type=str, + default=None, + help='The memory limit for the Kueue controller manager.', + ) + parser_or_group.add_argument( + '--cpu-limit', + type=int, + default=None, + help='The CPU limit for the Kueue controller manager.', + ) From 545bb3112b12ae6df157ed3468a116e97f3a9b40 Mon Sep 17 00:00:00 2001 From: Feidias Ioannidis Date: Tue, 23 Sep 2025 12:24:46 +0000 Subject: [PATCH 2/3] Remove print line --- src/xpk/core/kueue.py | 24 ++++++++++++++++++++---- src/xpk/parser/cluster.py | 26 ++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index 49f57a4fd..fd4b8437d 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -436,6 +436,8 @@ def install_kueue_crs( cluster_hardware_name=cluster_hardware_name, resource_type=resource_type, total_chips=total_chips, + cpu_limit=args.cpu_limit, + memory_limit=args.memory_limit, ) topology_label = '' if system.device_type in [ @@ -484,7 +486,7 @@ def install_kueue_crs( def get_kueue_covered_resources_config( - cluster_hardware_name, resource_type, total_chips + cluster_hardware_name, resource_type, total_chips, cpu_limit, memory_limit ) -> str: """Gets Kueue covered resources configuration. @@ -497,17 +499,31 @@ def get_kueue_covered_resources_config( A string of Kueue covered resources configuration. """ config_format = """ - - coveredResources: ["{resource_type}"] + - coveredResources: {resource_types} flavors: - name: {cluster_hardware_name} resources: - name: "{resource_type}" - nominalQuota: {total_chips} - """ + nominalQuota: {total_chips}""" + resource_types = [resource_type] + if cpu_limit: + config_format = config_format + """ + - name: "cpu" + nominalQuota: {cpu_limit}""" + resource_types.append('cpu') + if memory_limit: + config_format = config_format + """ + - name: "memory" + nominalQuota: {memory_limit}""" + resource_types.append('memory') + config_string = config_format.format( cluster_hardware_name=cluster_hardware_name, + resource_types=resource_types, resource_type=resource_type, total_chips=total_chips, + cpu_limit=cpu_limit, + memory_limit=memory_limit, ) return config_string diff --git a/src/xpk/parser/cluster.py b/src/xpk/parser/cluster.py index 663a6bd3b..ce2fd8260 100644 --- a/src/xpk/parser/cluster.py +++ b/src/xpk/parser/cluster.py @@ -176,6 +176,12 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser): add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments) cluster_create_parser.set_defaults(func=cluster_create) + cluster_create_resource_limits = cluster_create_parser.add_argument_group( + 'Optional Resource Limits Arguments', + 'Arguments for configuring resource limits in cluster create.', + ) + add_resource_limits(cluster_create_resource_limits) + def set_cluster_create_pathways_parser( cluster_create_pathways_parser: ArgumentParser, @@ -887,3 +893,23 @@ def add_shared_cluster_create_mtc_arguments( ' checkpointing. By default, it is set to "google.com/tpu".' ), ) + + +def add_resource_limits(parser_or_group: ParserOrArgumentGroup): + """Add resource limits arguments in cluster create. + + Args: + List of cluster create resource limits arguments parsers or group + """ + parser_or_group.add_argument( + '--memory-limit', + type=str, + default=None, + help='The memory limit for the Kueue controller manager.', + ) + parser_or_group.add_argument( + '--cpu-limit', + type=int, + default=None, + help='The CPU limit for the Kueue controller manager.', + ) From 48c5ebdb2776ed70f24e2fb476967bc54e4c3499 Mon Sep 17 00:00:00 2001 From: Feidias Ioannidis Date: Tue, 23 Sep 2025 13:42:16 +0000 Subject: [PATCH 3/3] Add cpu-limit and memory-limit for other types of cluster creation as well --- src/xpk/parser/cluster.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/xpk/parser/cluster.py b/src/xpk/parser/cluster.py index ce2fd8260..18dd2d04f 100644 --- a/src/xpk/parser/cluster.py +++ b/src/xpk/parser/cluster.py @@ -174,7 +174,6 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser): 'Arguments for configuring MTC in cluster create.', ) add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments) - cluster_create_parser.set_defaults(func=cluster_create) cluster_create_resource_limits = cluster_create_parser.add_argument_group( 'Optional Resource Limits Arguments', @@ -182,6 +181,8 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser): ) add_resource_limits(cluster_create_resource_limits) + cluster_create_parser.set_defaults(func=cluster_create) + def set_cluster_create_pathways_parser( cluster_create_pathways_parser: ArgumentParser, @@ -251,6 +252,15 @@ def set_cluster_create_pathways_parser( ) ) add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments) + + cluster_create_resource_limits = ( + cluster_create_pathways_parser.add_argument_group( + 'Optional Resource Limits Arguments', + 'Arguments for configuring resource limits in cluster create.', + ) + ) + add_resource_limits(cluster_create_resource_limits) + cluster_create_pathways_parser.set_defaults(func=cluster_create_pathways) @@ -326,6 +336,13 @@ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser): 'Arguments for configuring MTC in cluster create.', ) add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments) + + cluster_create_resource_limits = cluster_create_ray_parser.add_argument_group( + 'Optional Resource Limits Arguments', + 'Arguments for configuring resource limits in cluster create.', + ) + add_resource_limits(cluster_create_resource_limits) + cluster_create_ray_parser.set_defaults(func=cluster_create_ray_cluster)