From 33393f54f1898c0a25e03fe64be3f2a1c7a1a103 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Aug 2025 16:08:22 +0200 Subject: [PATCH 1/6] These settings were introduced in EB 5.1.0, so the check should be >= 5.1 --- EESSI-extend-easybuild.eb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EESSI-extend-easybuild.eb b/EESSI-extend-easybuild.eb index 0abb6ce9..547710ac 100644 --- a/EESSI-extend-easybuild.eb +++ b/EESSI-extend-easybuild.eb @@ -212,7 +212,7 @@ easybuild_version = os.getenv("EBVERSIONEASYBUILD") or easybuild_version eessi_version = os.getenv("EESSI_VERSION") or "2023.06" -- Set environment variables that are EasyBuild version specific -if convertToCanonical(easybuild_version) > convertToCanonical("4") then +if convertToCanonical(easybuild_version) >= convertToCanonical("5.1") then setenv ("EASYBUILD_STRICT_RPATH_SANITY_CHECK", "1") setenv ("EASYBUILD_CUDA_SANITY_CHECK_ERROR_ON_FAILED_CHECKS", "1") setenv ("EASYBUILD_FAIL_ON_MOD_FILES_GCCCORE", "1") From d668ada856e24971ab5169298c3047a455694d95 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Aug 2025 16:16:19 +0200 Subject: [PATCH 2/6] Move installation of CUDA SDK back down, reverting the move from #54. This will make sure the rebuild of EESSI-extend is done before building the CUDA in host-injections. That's essential, as the fix in EESSI-extend is needed to make the CUDA in host-injections step pass --- EESSI-install-software.sh | 47 ++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 48e03f94..d462cad2 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -247,29 +247,6 @@ if [ ! 
-f ${_lmod_sitepackage_file} ]; then python3 ${TOPDIR}/create_lmodsitepackage.py ${_eessi_software_path} fi -# Install full CUDA SDK and cu* libraries in host_injections -# (This is done *before* configuring EasyBuild as it may rely on an older EB version) -# Hardcode this for now, see if it works -# TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install -# Allow skipping CUDA SDK install in e.g. CI environments -echo "Going to install full CUDA SDK and cu* libraries under host_injections if necessary" -temp_install_storage=${TMPDIR}/temp_install_storage -mkdir -p ${temp_install_storage} -if [ -z "${skip_cuda_install}" ] || [ ! "${skip_cuda_install}" ]; then - ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh \ - -t ${temp_install_storage} \ - --accept-cuda-eula \ - --accept-cudnn-eula -else - echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed" -fi - -# Install NVIDIA drivers in host_injections (if they exist) -if nvidia_gpu_available; then - echo "Installing NVIDIA drivers for use in prefix shell..." - ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh -fi - echo ">> Configuring EasyBuild..." # Make sure EESSI-extend is not loaded, and configure location variables for a @@ -316,6 +293,30 @@ echo "DEBUG: before loading EESSI-extend // EASYBUILD_INSTALLPATH='${EASYBUILD_I source $TOPDIR/load_eessi_extend_module.sh ${EESSI_VERSION} echo "DEBUG: after loading EESSI-extend // EASYBUILD_INSTALLPATH='${EASYBUILD_INSTALLPATH}'" +# Install full CUDA SDK and cu* libraries in host_injections +# (This is done *after* loading EESSI-extend, as the CUDA installation in host_injections relies on the rebuilt EESSI-extend) +# Hardcode this for now, see if it works +# TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install +# Allow skipping CUDA SDK install in e.g. 
CI environments +echo "Going to install full CUDA SDK and cu* libraries under host_injections if necessary" +temp_install_storage=${TMPDIR}/temp_install_storage +mkdir -p ${temp_install_storage} +if [ -z "${skip_cuda_install}" ] || [ ! "${skip_cuda_install}" ]; then + ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh \ + -t ${temp_install_storage} \ + --accept-cuda-eula \ + --accept-cudnn-eula +else + echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed" +fi + +# Install NVIDIA drivers in host_injections (if they exist) +if nvidia_gpu_available; then + echo "Installing NVIDIA drivers for use in prefix shell..." + ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +fi + + if [ ! -z "${shared_fs_path}" ]; then shared_eb_sourcepath=${shared_fs_path}/easybuild/sources echo ">> Using ${shared_eb_sourcepath} as shared EasyBuild source path" From 1e0a49a4ac4957e712dda60d48499705a347795b Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Aug 2025 16:41:13 +0200 Subject: [PATCH 3/6] Add debugging output --- scripts/gpu_support/nvidia/install_cuda_and_libraries.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index 5123a7c1..04d49b8a 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -115,6 +115,7 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do unset EESSI_PROJECT_INSTALL unset EESSI_USER_INSTALL export EESSI_SITE_INSTALL=1 + echo "BEFORE UNLOADING EESSI-EXTEND, EASYBUILD_CUDA_SANITY_CHECK_ERROR_ON_FAILED_CHECKS = $EASYBUILD_CUDA_SANITY_CHECK_ERROR_ON_FAILED_CHECKS" module unload EESSI-extend ml_av_eessi_extend_out=${tmpdir}/ml_av_eessi_extend.out # need to use --ignore_cache to avoid the case that the module was removed (to 
be @@ -127,7 +128,9 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do error="\nNo module for EESSI-extend/${EESSI_EXTEND_VERSION} found\nwhile EESSI has been initialised to use software under ${EESSI_SOFTWARE_PATH}\n" fatal_error "${error}" fi + echo "BEFORE RELOADING EESSI-EXTEND, EASYBUILD_CUDA_SANITY_CHECK_ERROR_ON_FAILED_CHECKS = $EASYBUILD_CUDA_SANITY_CHECK_ERROR_ON_FAILED_CHECKS" module --ignore_cache load EESSI-extend/${EESSI_EXTEND_VERSION} + echo "AFTER RELOADING EESSI-EXTEND, EASYBUILD_CUDA_SANITY_CHECK_ERROR_ON_FAILED_CHECKS = $EASYBUILD_CUDA_SANITY_CHECK_ERROR_ON_FAILED_CHECKS" unset EESSI_EXTEND_VERSION # If there is a GPU on the node, the installation path will by default have an From e080044083850bdb50d85d20077cc0827cf23319 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Aug 2025 17:02:43 +0200 Subject: [PATCH 4/6] Fix that we unset EB config vars unconditionally upon unload --- EESSI-extend-easybuild.eb | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/EESSI-extend-easybuild.eb b/EESSI-extend-easybuild.eb index 547710ac..bc3695e6 100644 --- a/EESSI-extend-easybuild.eb +++ b/EESSI-extend-easybuild.eb @@ -211,8 +211,23 @@ end easybuild_version = os.getenv("EBVERSIONEASYBUILD") or easybuild_version eessi_version = os.getenv("EESSI_VERSION") or "2023.06" +if (mode() == "unload") then + -- unload unconditionally, so that even if EB versions were switched in the meantime, this gets unset + -- This avoids issues where EESSI-extend is first loaded with EB => 5.1 (which set these vars) + -- but then EB is swapped for a version < 5.1 and then EESSI-extend is unloaded (which would not unset + -- these vars if we did it conditional on the EB version) + setenv ("EASYBUILD_STRICT_RPATH_SANITY_CHECK", "1") + setenv ("EASYBUILD_CUDA_SANITY_CHECK_ERROR_ON_FAILED_CHECKS", "1") + setenv ("EASYBUILD_FAIL_ON_MOD_FILES_GCCCORE", "1") + setenv ("EASYBUILD_LOCAL_VAR_NAMING_CHECK", "error") + -- This can 
still be conditional, eessi_version is always set + if convertToCanonical(eessi_version) > convertToCanonical("2023.06") then + setenv ("EASYBUILD_PREFER_PYTHON_SEARCH_PATH", "EBPYTHONPREFIXES") + setenv ("EASYBUILD_MODULE_SEARCH_PATH_HEADERS", "include_paths") + setenv ("EASYBUILD_SEARCH_PATH_CPP_HEADERS", "include_paths") + end -- Set environment variables that are EasyBuild version specific -if convertToCanonical(easybuild_version) >= convertToCanonical("5.1") then +elseif convertToCanonical(easybuild_version) >= convertToCanonical("5.1") then setenv ("EASYBUILD_STRICT_RPATH_SANITY_CHECK", "1") setenv ("EASYBUILD_CUDA_SANITY_CHECK_ERROR_ON_FAILED_CHECKS", "1") setenv ("EASYBUILD_FAIL_ON_MOD_FILES_GCCCORE", "1") From 454f7576514bfdd64750daf488d6e9a3f34da535 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Aug 2025 17:14:36 +0200 Subject: [PATCH 5/6] Do the same thing with less duplication --- EESSI-extend-easybuild.eb | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/EESSI-extend-easybuild.eb b/EESSI-extend-easybuild.eb index bc3695e6..eaf59540 100644 --- a/EESSI-extend-easybuild.eb +++ b/EESSI-extend-easybuild.eb @@ -211,23 +211,12 @@ end easybuild_version = os.getenv("EBVERSIONEASYBUILD") or easybuild_version eessi_version = os.getenv("EESSI_VERSION") or "2023.06" -if (mode() == "unload") then - -- unload unconditionally, so that even if EB versions were switched in the meantime, this gets unset - -- This avoids issues where EESSI-extend is first loaded with EB => 5.1 (which set these vars) - -- but then EB is swapped for a version < 5.1 and then EESSI-extend is unloaded (which would not unset - -- these vars if we did it conditional on the EB version) - setenv ("EASYBUILD_STRICT_RPATH_SANITY_CHECK", "1") - setenv ("EASYBUILD_CUDA_SANITY_CHECK_ERROR_ON_FAILED_CHECKS", "1") - setenv ("EASYBUILD_FAIL_ON_MOD_FILES_GCCCORE", "1") - setenv ("EASYBUILD_LOCAL_VAR_NAMING_CHECK", "error") - -- This can still be 
conditional, eessi_version is always set - if convertToCanonical(eessi_version) > convertToCanonical("2023.06") then - setenv ("EASYBUILD_PREFER_PYTHON_SEARCH_PATH", "EBPYTHONPREFIXES") - setenv ("EASYBUILD_MODULE_SEARCH_PATH_HEADERS", "include_paths") - setenv ("EASYBUILD_SEARCH_PATH_CPP_HEADERS", "include_paths") - end -- Set environment variables that are EasyBuild version specific -elseif convertToCanonical(easybuild_version) >= convertToCanonical("5.1") then +-- Do unload unconditionally, so that even if EB versions were switched in the meantime, this gets unset +-- This avoids issues where EESSI-extend is first loaded with EB >= 5.1 (which sets these vars) +-- but then EB is swapped for a version < 5.1 and then EESSI-extend is unloaded (which would not unset +-- these vars if we did it conditional on the EB version) +if convertToCanonical(easybuild_version) >= convertToCanonical("5.1") or mode() == "unload" then setenv ("EASYBUILD_STRICT_RPATH_SANITY_CHECK", "1") setenv ("EASYBUILD_CUDA_SANITY_CHECK_ERROR_ON_FAILED_CHECKS", "1") setenv ("EASYBUILD_FAIL_ON_MOD_FILES_GCCCORE", "1") From 728c638403efe8c316b2546a3ec402c6e106301f Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Aug 2025 17:20:32 +0200 Subject: [PATCH 6/6] Remove debugging output --- scripts/gpu_support/nvidia/install_cuda_and_libraries.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index 04d49b8a..5123a7c1 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -115,7 +115,6 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do unset EESSI_PROJECT_INSTALL unset EESSI_USER_INSTALL export EESSI_SITE_INSTALL=1 - echo "BEFORE UNLOADING EESSI-EXTEND, EASYBUILD_CUDA_SANITY_CHECK_ERROR_ON_FAILED_CHECKS = $EASYBUILD_CUDA_SANITY_CHECK_ERROR_ON_FAILED_CHECKS" module unload 
EESSI-extend ml_av_eessi_extend_out=${tmpdir}/ml_av_eessi_extend.out # need to use --ignore_cache to avoid the case that the module was removed (to be @@ -128,9 +127,7 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do error="\nNo module for EESSI-extend/${EESSI_EXTEND_VERSION} found\nwhile EESSI has been initialised to use software under ${EESSI_SOFTWARE_PATH}\n" fatal_error "${error}" fi - echo "BEFORE RELOADING EESSI-EXTEND, EASYBUILD_CUDA_SANITY_CHECK_ERROR_ON_FAILED_CHECKS = $EASYBUILD_CUDA_SANITY_CHECK_ERROR_ON_FAILED_CHECKS" module --ignore_cache load EESSI-extend/${EESSI_EXTEND_VERSION} - echo "AFTER RELOADING EESSI-EXTEND, EASYBUILD_CUDA_SANITY_CHECK_ERROR_ON_FAILED_CHECKS = $EASYBUILD_CUDA_SANITY_CHECK_ERROR_ON_FAILED_CHECKS" unset EESSI_EXTEND_VERSION # If there is a GPU on the node, the installation path will by default have an