From e010c2dbf3652da0d228b321a9bf5bd6852be762 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 14 Apr 2026 23:49:55 +0200 Subject: [PATCH 1/8] Initial attempt at dynamically determining CUDA version --- create_lmodsitepackage.py | 25 +++++++++++++------ .../nvidia/get_cuda_driver_version.sh | 4 +++ 2 files changed, 22 insertions(+), 7 deletions(-) create mode 100644 scripts/gpu_support/nvidia/get_cuda_driver_version.sh diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index b1be54ee..e14fc5d1 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -172,7 +172,6 @@ else cudaDriverDir = eessi_eprefix .. "/lib/nvidia" end - local cudaVersionFile = cudaDriverDir .. "/cuda_version.txt" local cudaDriverFile = cudaDriverDir .. "/libcuda.so" local cudaDriverExists = isFile(cudaDriverFile) local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so") @@ -189,19 +188,31 @@ else -- CUDA driver exists, now we check its version to see if an update is needed if cudaDriverExists then - local cudaVersion = read_file(cudaVersionFile) - if not cudaVersion then - LmodError("No CUDA version file\\n" .. cudaVersionFile .. "\\nfound. " .. refer_to_docs) + LmodMessage("EESSI_CUDA_DRIVER_VERSION initial: " .. os.getenv("EESSI_CUDA_DRIVER_VERSION")) + local cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION") + if not cudaVersion or cudaVersion == "" then + -- Hardcode for local testing + -- local eessi_prefix = os.getenv("EESSI_PREFIX") + local eessi_prefix = pathJoin('/home', 'casparl', 'EESSI', 'software-layer-scripts') + local script = pathJoin(eessi_prefix, 'scripts', 'gpu_support', 'nvidia', 'get_cuda_driver_version.sh') + LmodMessage("Getting version") + source_sh("bash", script) + end + cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION") + LmodMessage("CUDA VERSION" .. cudaVersion) + if not cudaVersion or cudaVersion == "" then + -- Change to warning? + LmodError("Environment variable EESSI_CUDA_DRIVER_VERSION not found. 
" .. refer_to_docs) end local cudaVersion_req = os.getenv("EESSICUDAVERSION") -- driver CUDA versions don't give a patch version for CUDA local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)") local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)") local driver_libs_need_update = false - if major < major_req then + if tonumber(major) < tonumber(major_req) then driver_libs_need_update = true - elseif major == major_req then - if minor < minor_req then + elseif tonumber(major) == tonumber(major_req) then + if tonumber(minor) < tonumber(minor_req) then driver_libs_need_update = true end end diff --git a/scripts/gpu_support/nvidia/get_cuda_driver_version.sh b/scripts/gpu_support/nvidia/get_cuda_driver_version.sh new file mode 100644 index 00000000..b4a8ebbd --- /dev/null +++ b/scripts/gpu_support/nvidia/get_cuda_driver_version.sh @@ -0,0 +1,4 @@ +# This can be leveraged by the source_sh() feature of Lmod +set -o pipefail +EESSI_CUDA_DRIVER_VERSION=$(nvidia-smi | grep -oP 'CUDA Version:\s*\K[0-9]+\.[0-9]+') || return $? +export EESSI_CUDA_DRIVER_VERSION From 8611fcbce5d06f10e8c8eaf0cc6bb72456e88d97 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 15 Apr 2026 00:03:14 +0200 Subject: [PATCH 2/8] --query is typically faster --- create_lmodsitepackage.py | 1 - scripts/gpu_support/nvidia/get_cuda_driver_version.sh | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index e14fc5d1..e78cedbe 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -188,7 +188,6 @@ else -- CUDA driver exists, now we check its version to see if an update is needed if cudaDriverExists then - LmodMessage("EESSI_CUDA_DRIVER_VERSION initial: " .. 
os.getenv("EESSI_CUDA_DRIVER_VERSION")) local cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION") if not cudaVersion or cudaVersion == "" then -- Hardcode for local testing diff --git a/scripts/gpu_support/nvidia/get_cuda_driver_version.sh b/scripts/gpu_support/nvidia/get_cuda_driver_version.sh index b4a8ebbd..65ff2a1a 100644 --- a/scripts/gpu_support/nvidia/get_cuda_driver_version.sh +++ b/scripts/gpu_support/nvidia/get_cuda_driver_version.sh @@ -1,4 +1,4 @@ # This can be leveraged by the source_sh() feature of Lmod set -o pipefail -EESSI_CUDA_DRIVER_VERSION=$(nvidia-smi | grep -oP 'CUDA Version:\s*\K[0-9]+\.[0-9]+') || return $? +EESSI_CUDA_DRIVER_VERSION=$(nvidia-smi --query | grep -oP 'CUDA Version\s*:\s*\K[0-9.]+') || return $? export EESSI_CUDA_DRIVER_VERSION From daac4f5b0208ac5157ef23fbe4324cecbe68e531 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 15 Apr 2026 00:24:17 +0200 Subject: [PATCH 3/8] Make sure we conditionally print warning, and make it suppressable --- create_lmodsitepackage.py | 55 +++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index e78cedbe..b01a3fe4 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -194,33 +194,44 @@ -- local eessi_prefix = os.getenv("EESSI_PREFIX") local eessi_prefix = pathJoin('/home', 'casparl', 'EESSI', 'software-layer-scripts') local script = pathJoin(eessi_prefix, 'scripts', 'gpu_support', 'nvidia', 'get_cuda_driver_version.sh') - LmodMessage("Getting version") - source_sh("bash", script) + -- Check return code first. We don't want source_sh to raise an LmodError, we just print + -- an LmodWarning stating we couldn't do a proper version compatibility check + local rc = os.execute("bash -c 'source " .. script .. "'") + if rc == 0 then + source_sh("bash", script) + end end cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION") - LmodMessage("CUDA VERSION" .. 
cudaVersion) - if not cudaVersion or cudaVersion == "" then - -- Change to warning? - LmodError("Environment variable EESSI_CUDA_DRIVER_VERSION not found. " .. refer_to_docs) - end local cudaVersion_req = os.getenv("EESSICUDAVERSION") - -- driver CUDA versions don't give a patch version for CUDA - local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)") - local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)") - local driver_libs_need_update = false - if tonumber(major) < tonumber(major_req) then - driver_libs_need_update = true - elseif tonumber(major) == tonumber(major_req) then - if tonumber(minor) < tonumber(minor_req) then + if not cudaVersion or cudaVersion == "" then + local suppress_var = "EESSI_CUDA_DRIVER_VERSION_SUPPRESS_WARNING" + local warn = "Environment variable EESSI_CUDA_DRIVER_VERSION not found. " + warn = warn .. "Cannot ensure that driver version is new enough for CUDA toolkit version: '" + warn = warn .. cudaVersion_req .. "'. This module will still be loaded, but may not function " + warn = warn .. "as expected. Export " .. suppress_var .. "=1" + local suppress_warn = os.getenv(suppress_var) + if not suppress_warn or suppress_warn == 1 then + LmodWarning(warn) + end + else + -- driver CUDA versions don't give a patch version for CUDA + local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)") + local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)") + local driver_libs_need_update = false + if tonumber(major) < tonumber(major_req) then driver_libs_need_update = true + elseif tonumber(major) == tonumber(major_req) then + if tonumber(minor) < tonumber(minor_req) then + driver_libs_need_update = true + end + end + if driver_libs_need_update == true then + local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". " + advice = advice .. "Please update your CUDA driver libraries and then " + advice = advice .. 
"let EESSI know about the update.\\n" + advice = advice .. refer_to_docs + LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice) end - end - if driver_libs_need_update == true then - local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". " - advice = advice .. "Please update your CUDA driver libraries and then " - advice = advice .. "let EESSI know about the update.\\n" - advice = advice .. refer_to_docs - LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice) end end end From 64e17c4bf5d85933e8284d48674d15bf0b84a5bf Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 15 Apr 2026 00:27:41 +0200 Subject: [PATCH 4/8] Remove logic for local testing --- create_lmodsitepackage.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index b01a3fe4..9fb1b547 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -190,9 +190,7 @@ if cudaDriverExists then local cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION") if not cudaVersion or cudaVersion == "" then - -- Hardcode for local testing - -- local eessi_prefix = os.getenv("EESSI_PREFIX") - local eessi_prefix = pathJoin('/home', 'casparl', 'EESSI', 'software-layer-scripts') + local eessi_prefix = os.getenv("EESSI_PREFIX") local script = pathJoin(eessi_prefix, 'scripts', 'gpu_support', 'nvidia', 'get_cuda_driver_version.sh') -- Check return code first. 
We don't want source_sh to raise an LmodError, we just print -- an LmodWarning stating we couldn't do a proper version compatibility check From ee037b1e7f1db53dabc930cfcace5d1a9e099330 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 16 Apr 2026 16:52:49 +0200 Subject: [PATCH 5/8] Make compatible with Lua 5.2 and onwards as well --- create_lmodsitepackage.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index 9fb1b547..1e49970d 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -194,8 +194,16 @@ local script = pathJoin(eessi_prefix, 'scripts', 'gpu_support', 'nvidia', 'get_cuda_driver_version.sh') -- Check return code first. We don't want source_sh to raise an LmodError, we just print -- an LmodWarning stating we couldn't do a proper version compatibility check - local rc = os.execute("bash -c 'source " .. script .. "'") - if rc == 0 then + local r1, r2, r3 = os.execute("bash -c 'source " .. script .. 
"'") + local exit_code = 0 + if type(r1) == "number" then + -- Lua 5.1 or earlier, this is our exit code + exit_code = r1 + else + -- Lua 5.2 or later, r3 is our exit code + exit_code = r3 + end + if exit_code == 0 then source_sh("bash", script) end end From 05a69e4801172cbab742cd9eaf0a04acd7464560 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 16 Apr 2026 17:06:45 +0200 Subject: [PATCH 6/8] Make sure get_cuda_driver_version.sh gets installed --- install_scripts.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/install_scripts.sh b/install_scripts.sh index 022c0f9f..004ac075 100755 --- a/install_scripts.sh +++ b/install_scripts.sh @@ -211,6 +211,7 @@ nvidia_files=( install_cuda_and_libraries.sh install_cuda_host_injections.sh link_nvidia_host_libraries.sh + get_cuda_driver_version.sh ) copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_files[@]}" From 08e0cbc6ee972c1c5015217bae2f902606230aec Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 16 Apr 2026 17:25:52 +0200 Subject: [PATCH 7/8] Improve message --- create_lmodsitepackage.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index 1e49970d..f6e508de 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -192,8 +192,13 @@ if not cudaVersion or cudaVersion == "" then local eessi_prefix = os.getenv("EESSI_PREFIX") local script = pathJoin(eessi_prefix, 'scripts', 'gpu_support', 'nvidia', 'get_cuda_driver_version.sh') - -- Check return code first. 
We don't want source_sh to raise an LmodError, we just print - -- an LmodWarning stating we couldn't do a proper version compatibility check + -- We cannot immedately use source_sh, since lmod has no way of catching a potential error + -- and we don't want this to raise an LmodError just because nvidia-smi doesn't exist or + -- doesn't print the right output (happens on a node with nvidia-smi but no driver installed). + -- The only way to catch this is to source the script first with os.execute and make sure it + -- returns with a zero exit code. Unfortunately, this means we have to run nvidia-smi twice, which + -- is a bit slow. Since the result is then cached in the EESSI_CUDA_DRIVER_VERSION environment + -- variable, this is probably acceptable local r1, r2, r3 = os.execute("bash -c 'source " .. script .. "'") local exit_code = 0 if type(r1) == "number" then From 22291703886247f4770afb3c5b9970b49efa9e8e Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 16 Apr 2026 17:52:41 +0200 Subject: [PATCH 8/8] Change strategy in order to avoid an LmodError: we don't like executing the get_cuda_driver_script twice, as it's costly. 
We simply adapt the script to always return a 0 exit, and then do any handling of the case where EESSI_CUDA_DRIVER_VERSION is NOT set by the end in the calling Lmod hook --- create_lmodsitepackage.py | 23 ++++--------------- .../nvidia/get_cuda_driver_version.sh | 8 ++++--- 2 files changed, 9 insertions(+), 22 deletions(-) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index f6e508de..2fd4f78d 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -192,28 +192,13 @@ if not cudaVersion or cudaVersion == "" then local eessi_prefix = os.getenv("EESSI_PREFIX") local script = pathJoin(eessi_prefix, 'scripts', 'gpu_support', 'nvidia', 'get_cuda_driver_version.sh') - -- We cannot immedately use source_sh, since lmod has no way of catching a potential error - -- and we don't want this to raise an LmodError just because nvidia-smi doesn't exist or - -- doesn't print the right output (happens on a node with nvidia-smi but no driver installed). - -- The only way to catch this is to source the script first with os.execute and make sure it - -- returns with a zero exit code. Unfortunately, this means we have to run nvidia-smi twice, which - -- is a bit slow. Since the result is then cached in the EESSI_CUDA_DRIVER_VERSION environment - -- variable, this is probably acceptable - local r1, r2, r3 = os.execute("bash -c 'source " .. script .. 
"'") - local exit_code = 0 - if type(r1) == "number" then - -- Lua 5.1 or earlier, this is our exit code - exit_code = r1 - else - -- Lua 5.2 or later, r3 is our exit code - exit_code = r3 - end - if exit_code == 0 then - source_sh("bash", script) - end + source_sh("bash", script) end cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION") local cudaVersion_req = os.getenv("EESSICUDAVERSION") + -- Account for the fact that the script sourced above was designed to never return a non-zero exit + -- even if it fails to set EESSI_CUDA_DRIVER_VERSION + -- Essentially, we handle that case here by emitting a warning, which can be suppressed if not cudaVersion or cudaVersion == "" then local suppress_var = "EESSI_CUDA_DRIVER_VERSION_SUPPRESS_WARNING" local warn = "Environment variable EESSI_CUDA_DRIVER_VERSION not found. " diff --git a/scripts/gpu_support/nvidia/get_cuda_driver_version.sh b/scripts/gpu_support/nvidia/get_cuda_driver_version.sh index 65ff2a1a..d92493f6 100644 --- a/scripts/gpu_support/nvidia/get_cuda_driver_version.sh +++ b/scripts/gpu_support/nvidia/get_cuda_driver_version.sh @@ -1,4 +1,6 @@ # This can be leveraged by the source_sh() feature of Lmod -set -o pipefail -EESSI_CUDA_DRIVER_VERSION=$(nvidia-smi --query | grep -oP 'CUDA Version\s*:\s*\K[0-9.]+') || return $? -export EESSI_CUDA_DRIVER_VERSION +# Because we want to source this without immediately raising an LmodError upon failure, this script +# is designed to ALWAYS return a 0 exit code +EESSI_CUDA_DRIVER_VERSION=$(nvidia-smi --query | grep -oP 'CUDA Version\s*:\s*\K[0-9.]+') || return 0 +# The || return 0 shouldn't be needed, but just to be overly sure that this script always returns 0 +export EESSI_CUDA_DRIVER_VERSION || return 0