diff --git a/config/github_actions.py b/config/github_actions.py index 9371e376..b2196b6b 100644 --- a/config/github_actions.py +++ b/config/github_actions.py @@ -28,7 +28,13 @@ 'options': ['--mem={size}'], } ], - 'max_jobs': 1 + 'max_jobs': 1, + 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + # This is a fictional amount; GH Actions runners probably have less, but the CI only performs a --dry-run + 'mem_per_node': 30 # in GiB + }, } ] } diff --git a/config/it4i_karolina.py b/config/it4i_karolina.py index cdd6957e..2bdfa035 100644 --- a/config/it4i_karolina.py +++ b/config/it4i_karolina.py @@ -59,6 +59,11 @@ 'features': [ FEATURES[CPU], ] + list(SCALES.keys()), + 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 219.345 # in GiB + }, 'descr': 'CPU Universal Compute Nodes, see https://docs.it4i.cz/karolina/hardware-overview/' }, # We don't have GPU budget on Karolina at this time diff --git a/config/izum_vega.py b/config/izum_vega.py index 4c67792b..f7193aed 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -59,47 +59,57 @@ 'features': [ FEATURES[CPU], ] + list(SCALES.keys()), + 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 238.418 # in GiB + }, 'descr': 'CPU partition Standard, see https://en-doc.vega.izum.si/architecture/' }, - { - 'name': 'gpu', - 'scheduler': 'slurm', - 'prepare_cmds': [ - 'source %s' % common_eessi_init(), - # Pass job environment variables like $PATH, etc., into job steps - 'export SLURM_EXPORT_ENV=ALL', - # Needed when using srun launcher - # 'export SLURM_MPI_TYPE=pmix', # WARNING: this broke the GROMACS on Vega - # Avoid https://github.com/EESSI/software-layer/issues/136 - # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) - 'export OMPI_MCA_pml=ucx', - ], - 'launcher': 'mpirun', - # Use --export=None to avoid that login environment is passed down to submitted jobs - 'access': ['-p gpu', '--export=None'], - 'environs': ['default'], - 'max_jobs': 60, - 'devices': [ - { - 'type': DEVICE_TYPES[GPU], - 'num_devices': 4, - } - ], - 'resources': [ - { - 'name': '_rfm_gpu', - 'options': ['--gpus-per-node={num_gpus_per_node}'], - }, - { - 'name': 'memory', - 'options': ['--mem={size}'], - } - ], - 'features': [ - FEATURES[GPU], - ] + list(SCALES.keys()), - 'descr': 'GPU partition, see https://en-doc.vega.izum.si/architecture/' - }, + # { + # 'name': 'gpu', + # 'scheduler': 'slurm', + # 'prepare_cmds': [ + # 'source %s' % common_eessi_init(), + # # Pass job environment variables like $PATH, etc., into job steps + # 'export SLURM_EXPORT_ENV=ALL', + # # Needed when using srun launcher + # # 'export SLURM_MPI_TYPE=pmix', # WARNING: this broke the GROMACS on Vega + # # Avoid https://github.com/EESSI/software-layer/issues/136 + # # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) + # 'export OMPI_MCA_pml=ucx', + # ], + # 'launcher': 'mpirun', + # # Use --export=None to avoid that login environment is passed down to submitted jobs + # 'access': ['-p gpu', '--export=None'], + # 'environs': ['default'], + # 'max_jobs': 60, + # 'devices': [ + # { + # 'type': DEVICE_TYPES[GPU], + # 'num_devices': 4, + # } + # ], + # 'resources': [ + # { + # 'name': '_rfm_gpu', + # 'options': ['--gpus-per-node={num_gpus_per_node}'], + # }, + # { + # 'name': 'memory', + # 'options': ['--mem={size}'], + # } + # ], + # 
'features': [ + # FEATURES[GPU], + # ] + list(SCALES.keys()), + # 'extras': { + # # Make sure to round down, otherwise a job might ask for more mem than is available + # # per node + # 'mem_per_node': 476.837 # in GiB (should be checked, it's unclear from slurm.conf) + # }, + # 'descr': 'GPU partition, see https://en-doc.vega.izum.si/architecture/' + # }, ] }, ], diff --git a/config/surf_snellius.py b/config/surf_snellius.py index 9e4ee269..d8bcc36c 100644 --- a/config/surf_snellius.py +++ b/config/surf_snellius.py @@ -53,6 +53,11 @@ 'features': [ FEATURES[CPU], ] + list(SCALES.keys()), + 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 213.623 # in GiB + }, 'descr': 'AMD Rome CPU partition with native EESSI stack' }, { @@ -72,6 +77,11 @@ 'features': [ FEATURES[CPU], ] + list(SCALES.keys()), + 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 320.434 # in GiB + }, 'descr': 'AMD Genoa CPU partition with native EESSI stack' }, @@ -105,6 +115,9 @@ ] + valid_scales_snellius_gpu, 'extras': { GPU_VENDOR: GPU_VENDORS[NVIDIA], + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 457.763 # in GiB }, 'descr': 'Nvidia A100 GPU partition with native EESSI stack' }, diff --git a/config/vsc_hortense.py b/config/vsc_hortense.py index fbfa9e4c..f349bf60 100644 --- a/config/vsc_hortense.py +++ b/config/vsc_hortense.py @@ -6,7 +6,9 @@ from reframe.core.backends import register_launcher from reframe.core.launchers import JobLauncher -from eessi.testsuite.common_config import common_logging_config, common_general_config, common_eessi_init +from eessi.testsuite.common_config import (common_eessi_init, + common_general_config, + common_logging_config) from eessi.testsuite.constants import * # noqa: F403 account = "my-slurm-account" @@ -54,6 +56,11 @@ def command(self, job): 'features': [ FEATURES[CPU], ] + list(SCALES.keys()), + 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 256.000 # in GiB (should be checked, it's unclear from slurm.conf) + }, }, { 'name': 'cpu_rome_512gb', @@ -81,6 +88,11 @@ def command(self, job): 'features': [ FEATURES[CPU], ] + list(SCALES.keys()), + 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 511.983 # in GiB + }, }, { 'name': 'cpu_milan', @@ -108,6 +120,11 @@ def command(self, job): 'features': [ FEATURES[CPU], ] + list(SCALES.keys()), + 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 256.000 # in GiB (should be checked, it's unclear from slurm.conf) + }, }, { 'name': 'gpu_rome_a100_40gb', @@ -131,6 +148,9 @@ def command(self, job): ] + list(SCALES.keys()), 'extras': { GPU_VENDOR: GPU_VENDORS[NVIDIA], + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 256.000 # in GiB }, 'resources': [ { @@ -172,6 +192,9 @@ def command(self, job): ] + list(SCALES.keys()), 'extras': { GPU_VENDOR: GPU_VENDORS[NVIDIA], + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 511.983 # in GiB }, 'resources': [ { diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index ad1741b8..5dd98a7f 100644 --- 
a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -10,7 +10,7 @@ from eessi.testsuite.constants import * from eessi.testsuite.utils import (get_max_avail_gpus_per_node, is_cuda_required_module, log, - check_proc_attribute_defined) + check_proc_attribute_defined, check_extras_key_defined) def _assign_default_num_cpus_per_node(test: rfm.RegressionTest): @@ -383,6 +383,92 @@ def filter_valid_systems_by_device_type(test: rfm.RegressionTest, required_devic log(f'valid_systems set to {test.valid_systems}') +def req_memory_per_node(test: rfm.RegressionTest, app_mem_req): + """ + This hook will request a specific amount of memory per node from the batch scheduler. + First, it computes which fraction of CPUs is requested from a node, and what the corresponding (proportional) + amount of memory would be. + Then, the hook compares this to how much memory the application claims to need per node (app_mem_req). + It then passes the maximum of these two numbers to the batch scheduler as a memory request. + + Note: using this hook requires that the ReFrame configuration defines systems.partitions.extras['mem_per_node'] + That field should be defined in GiB. + + Arguments: + - test: the ReFrame test to which this hook should apply + - app_mem_req: the amount of memory this application needs (per node) in GiB + + Example 1: + - A system with 128 cores and 64 GiB per node. + - The test is launched on 64 cores + - The app_mem_req is 40 (GiB) + In this case, the test requests 50% of the CPUs. Thus, the proportional amount of memory is 32 GiB. + The app_mem_req is higher. Thus, 40 GiB (per node) is requested from the batch scheduler. + + Example 2: + - A system with 128 cores and 128 GiB memory per node is used. + - The test is launched on 64 cores + - The app_mem_req is 40 (GiB) + In this case, the test requests 50% of the CPUs. Thus, the proportional amount of memory is 64 GiB. + This is higher than the app_mem_req. Thus, 64 GiB (per node) is requested from the batch scheduler. + """ + # Check that the systems.partitions.extras dict in the ReFrame config contains mem_per_node + check_extras_key_defined(test, 'mem_per_node') + # Skip if the current partition doesn't have sufficient memory to run the application + msg = f"Skipping test: nodes in this partition only have {test.current_partition.extras['mem_per_node']} GiB" + msg += " memory available (per node) according to the current ReFrame configuration," + msg += f" but {app_mem_req} GiB is needed" + test.skip_if(test.current_partition.extras['mem_per_node'] < app_mem_req, msg) + + # Compute what is higher: the requested memory, or the memory available proportional to requested CPUs + # Fraction of CPU cores requested + check_proc_attribute_defined(test, 'num_cpus') + cpu_fraction = test.num_tasks_per_node * test.num_cpus_per_task / test.current_partition.processor.num_cpus + proportional_mem = cpu_fraction * test.current_partition.extras['mem_per_node'] + + scheduler_name = test.current_partition.scheduler.registered_name + if scheduler_name == 'slurm' or scheduler_name == 'squeue': + # SLURM's --mem defines memory per node, see https://slurm.schedmd.com/sbatch.html + # SLURM uses megabytes and gigabytes, i.e. 
base-10, so conversion is 1000, not 1024 + # Thus, we convert from GiB (gibibytes) to MB (megabytes) (1024 * 1024 * 1024 / (1000 * 1000) = 1073.741824) + app_mem_req = math.ceil(1073.741824 * app_mem_req) + log(f"Memory requested by application: {app_mem_req} MB") + proportional_mem = math.floor(1073.741824 * proportional_mem) + log(f"Memory proportional to the core count: {proportional_mem} MB") + + # Request the maximum of proportional_mem and app_mem_req from the scheduler + req_mem_per_node = max(proportional_mem, app_mem_req) + + test.extra_resources = {'memory': {'size': f'{req_mem_per_node}M'}} + log(f"Requested {req_mem_per_node} MB per node from the SLURM batch scheduler") + + elif scheduler_name == 'torque': + # Torque/Moab requires asking for --pmem (--mem only works single-node and thus doesn't generalize) + # See https://docs.adaptivecomputing.com/10-0-1/Torque/torque.htm#topics/torque/3-jobs/3.1.3-requestingRes.htm + # Units are MiB according to the documentation, thus we simply multiply by 1024 + # We immediately divide by num_tasks_per_node (before rounding), since --pmem specifies memory _per process_ + app_mem_req_task = math.ceil(1024 * app_mem_req / test.num_tasks_per_node) + proportional_mem_task = math.floor(1024 * proportional_mem / test.num_tasks_per_node) + + # Request the maximum of proportional_mem and app_mem_req from the scheduler + req_mem_per_task = max(proportional_mem_task, app_mem_req_task) + + # We assume here that the ReFrame config defines the extra resource 'memory' as asking for pmem + # i.e. 'options': ['--pmem={size}'] + test.extra_resources = {'memory': {'size': f'{req_mem_per_task}mb'}} + log(f"Requested {req_mem_per_task} MiB per task from the Torque batch scheduler") + + else: + logger = rflog.getlogger() + msg = "hooks.req_memory_per_node does not support the scheduler you configured" + msg += f" ({test.current_partition.scheduler.registered_name})." + msg += " The test will run, but since it doesn't request the required amount of memory explicitly," + msg += " it may result in an out-of-memory error." + msg += " Please expand the functionality of hooks.req_memory_per_node for your scheduler." + # Warnings will, at default loglevel, be printed on stdout when executing the ReFrame command + logger.warning(msg) + + def set_modules(test: rfm.RegressionTest): """ Skip current test if module_name is not among a list of modules, diff --git a/eessi/testsuite/tests/apps/QuantumESPRESSO.py b/eessi/testsuite/tests/apps/QuantumESPRESSO.py new file mode 100644 index 00000000..050e43d3 --- /dev/null +++ b/eessi/testsuite/tests/apps/QuantumESPRESSO.py @@ -0,0 +1,113 @@ +""" +This module tests the binary 'pw.x' in available modules containing substring 'QuantumESPRESSO'. +Test input files are defined in the ReFrame test library, +see https://github.com/reframe-hpc/reframe/blob/develop/hpctestlib/sciapps/qespresso/benchmarks.py + +ReFrame terminology: + +"pipeline stages": +https://reframe-hpc.readthedocs.io/en/stable/regression_test_api.html#pipeline-hooks + +"test parameter": a list of values, which will generate different test variants. 
+https://reframe-hpc.readthedocs.io/en/stable/regression_test_api.html#reframe.core.builtins.parameter + +"test variant": a version of a test with a specific value for each test parameter +https://reframe-hpc.readthedocs.io/en/stable/regression_test_api.html#test-variants + +"concrete test cases": all test combinations that will actually run: +- test variants +- valid system:partition+programming environment combinations +https://reframe-hpc.readthedocs.io/en/stable/tutorial_deps.html#listing-dependencies + +Tests can be filtered by name, tag, programming environment, system, partition, or maintainer, +see https://reframe-hpc.readthedocs.io/en/stable/manpage.html#test-filtering + +Hooks acting on all possible test combinations (before filtering) are called after the 'init' stage. +Hooks acting on concrete test cases (after filtering) are called after the 'setup' stage. + +See also https://reframe-hpc.readthedocs.io/en/stable/pipeline.html +""" + +import reframe as rfm +from hpctestlib.sciapps.qespresso.benchmarks import QEspressoPWCheck +from reframe.core.builtins import ( # added only to make the linter happy + parameter, run_after) + +from eessi.testsuite import hooks +from eessi.testsuite.constants import (COMPUTE_UNIT, CPU, DEVICE_TYPES, GPU, + SCALES, TAGS) +from eessi.testsuite.utils import find_modules, log + + +@rfm.simple_test +class EESSI_QuantumESPRESSO_PW(QEspressoPWCheck): + scale = parameter(SCALES.keys()) + valid_prog_environs = ['default'] + valid_systems = ['*'] + time_limit = '30m' + module_name = parameter(find_modules('QuantumESPRESSO')) + # For now, QE is only being built for CPU targets + # compute_device = parameter([DEVICE_TYPES[CPU], DEVICE_TYPES[GPU]]) + compute_device = parameter([DEVICE_TYPES[CPU], ]) + + @run_after('init') + def run_after_init(self): + """Hooks to run after the init phase""" + + # Filter on which scales are supported by the partitions defined in the ReFrame configuration + hooks.filter_supported_scales(self) + + # Make sure that GPU tests run in partitions that support running on a GPU, + # and that CPU-only tests run in partitions that support running CPU-only. + # Also support setting valid_systems on the cmd line. + hooks.filter_valid_systems_by_device_type(self, required_device_type=self.compute_device) + + # Support selecting modules on the cmd line. + hooks.set_modules(self) + + # Support selecting scales on the cmd line via tags. + hooks.set_tag_scale(self) + + @run_after('init') + def set_tag_ci(self): + """Set the CI tag on the smallest benchmark, so it can be selected on the cmd line via --tag CI""" + min_ecut = min(QEspressoPWCheck.ecut.values) + min_nbnd = min(QEspressoPWCheck.nbnd.values) + if self.ecut == min_ecut and self.nbnd == min_nbnd: + self.tags.add(TAGS['CI']) + log(f'tags set to {self.tags}') + + @run_after('init') + def set_increased_walltime(self): + """Increase the time limit for the largest benchmark, so it can complete successfully.""" + max_ecut = max(QEspressoPWCheck.ecut.values) + max_nbnd = max(QEspressoPWCheck.nbnd.values) + if self.ecut == max_ecut and self.nbnd == max_nbnd: + self.time_limit = '60m' + + @run_after('setup') + def run_after_setup(self): + """Hooks to run after the setup phase""" + + # Calculate default requested resources based on the scale: + # 1 task per CPU for CPU-only tests, 1 task per GPU for GPU tests. + # Also support setting the resources on the cmd line. 
+ if self.compute_device == DEVICE_TYPES[GPU]: + hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[GPU]) + else: + hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[CPU]) + + @run_after('setup') + def request_mem(self): + memory_required = self.num_tasks_per_node * 0.9 + 4  # in GiB: 0.9 GiB per task plus 4 GiB per node + hooks.req_memory_per_node(test=self, app_mem_req=memory_required) + + @run_after('setup') + def set_omp_num_threads(self): + """ + Set number of OpenMP threads via OMP_NUM_THREADS. + Set default number of OpenMP threads equal to number of CPUs per task. + """ + + self.env_vars['OMP_NUM_THREADS'] = self.num_cpus_per_task + log(f'env_vars set to {self.env_vars}') diff --git a/eessi/testsuite/utils.py b/eessi/testsuite/utils.py index be9dec4d..ee679295 100644 --- a/eessi/testsuite/utils.py +++ b/eessi/testsuite/utils.py @@ -145,7 +145,41 @@ def check_proc_attribute_defined(test: rfm.RegressionTest, attribute) -> bool: else: msg = ( "This test's current_partition is not set yet. " - "The function utils.proc_attribute_defined should only be called after the setup() phase of ReFrame." + "The function utils.check_proc_attribute_defined should only be called after the setup() phase of ReFrame. " "This is a programming error, please report this issue." ) raise AttributeError(msg) + + +def check_extras_key_defined(test: rfm.RegressionTest, extra_key) -> bool: + """ + Checks if a specific key is defined in the 'extras' dictionary for the current partition + (i.e. if test.current_partition.extras[extra_key] is defined) + If not, it raises an error with an informative message. + Note that partition extras are defined by free text keys, so any string is (potentially) valid. + + Arguments: + - test: the ReFrame regression test instance for which to check whether the key is defined in 'extras' + - extra_key: the key to check for in the 'extras' dictionary + + Return: + - True (bool) if the key is defined + - Function does not return (but raises an error) if the key is undefined + """ + + if test.current_partition: + if extra_key in test.current_partition.extras: + return True + else: + msg = ( + f"Key '{extra_key}' missing in the 'extras' dictionary for partition '{test.current_partition.name}'. " + "Please define this key for the relevant partition in the ReFrame configuration file (see " + "https://reframe-hpc.readthedocs.io/en/stable/config_reference.html#config.systems.partitions.extras)." + ) + else: + msg = ( + "This test's current_partition is not set yet. " + "The function utils.check_extras_key_defined should only be called after the setup() phase of ReFrame. " + "This is a programming error, please report this issue." + ) raise AttributeError(msg)
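Review note (not part of the patch): the arithmetic of the new `req_memory_per_node` hook for the Slurm case can be reproduced with the short standalone sketch below, using the numbers from Example 1 in its docstring. The partition values (128 cores, 64 GiB `mem_per_node`) and the 64-core/40 GiB request come from that example; everything else is illustration only, not EESSI test-suite code.

```python
import math

# Assumed partition, mirroring Example 1 in the req_memory_per_node docstring:
# 128 cores and extras['mem_per_node'] = 64 (GiB) per node; the test uses 64 cores.
num_cpus_per_node = 128
mem_per_node_gib = 64        # would come from test.current_partition.extras['mem_per_node']
num_tasks_per_node = 64
num_cpus_per_task = 1
app_mem_req_gib = 40         # what the application claims to need per node

# Memory proportional to the fraction of cores requested
cpu_fraction = num_tasks_per_node * num_cpus_per_task / num_cpus_per_node
proportional_mem_gib = cpu_fraction * mem_per_node_gib                # 0.5 * 64 = 32 GiB

# Slurm's --mem takes MB, so convert GiB -> MB (1 GiB = 1073.741824 MB),
# rounding the application request up and the proportional amount down, as the hook does
app_mem_req_mb = math.ceil(1073.741824 * app_mem_req_gib)             # 42950 MB
proportional_mem_mb = math.floor(1073.741824 * proportional_mem_gib)  # 34359 MB

# The hook passes the larger of the two to the scheduler via the 'memory' extra resource
req_mem_per_node = max(proportional_mem_mb, app_mem_req_mb)
print(f"--mem={req_mem_per_node}M")  # --mem=42950M
```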