Determine the CUDA driver version at module-load time instead of reading
cuda_version.txt.

Summary of the patch:
  * create_lmodsitepackage.py: the embedded Lua no longer reads
    cudaDriverDir/cuda_version.txt. It reads EESSI_CUDA_DRIVER_VERSION from
    the environment; when that is unset or empty it first runs
    get_cuda_driver_version.sh through os.execute and, only if that exits
    with status 0, sources it with source_sh. If the version is still
    unknown, a warning is emitted (unless suppressed) instead of an error.
    Version components are now compared as numbers via tonumber().
  * install_scripts.sh: get_cuda_driver_version.sh is added to nvidia_files.
  * scripts/gpu_support/nvidia/get_cuda_driver_version.sh: new script that
    exports EESSI_CUDA_DRIVER_VERSION parsed from `nvidia-smi --query`.

Review changes relative to the submitted patch:
  * The suppression check compared the os.getenv() string against the
    number 1 (never equal in Lua); it now warns unless the variable is "1".
  * The warning text now states what exporting the variable does.
  * Fixed the "immedately" typo in a comment.

NOTE(review): the submitted patch had lost its line breaks and indentation.
Indentation below is reconstructed, whitespace-only context lines could not
be recovered, and hunk line counts were recomputed for the lines shown, so
whitespace-tolerant application (e.g. `git apply --ignore-whitespace`) may be
needed. The index hashes are those of the submitted patch.

diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py
index b1be54ee..f6e508de 100755
--- a/create_lmodsitepackage.py
+++ b/create_lmodsitepackage.py
@@ -172,6 +172,5 @@ else
             cudaDriverDir = eessi_eprefix .. "/lib/nvidia"
         end
-        local cudaVersionFile = cudaDriverDir .. "/cuda_version.txt"
         local cudaDriverFile = cudaDriverDir .. "/libcuda.so"
         local cudaDriverExists = isFile(cudaDriverFile)
         local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so")
@@ -189,27 +188,61 @@ else
             -- CUDA driver exists, now we check its version to see if an update is needed
             if cudaDriverExists then
-                local cudaVersion = read_file(cudaVersionFile)
-                if not cudaVersion then
-                    LmodError("No CUDA version file\\n" .. cudaVersionFile .. "\\nfound. " .. refer_to_docs)
+                local cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION")
+                if not cudaVersion or cudaVersion == "" then
+                    local eessi_prefix = os.getenv("EESSI_PREFIX")
+                    local script = pathJoin(eessi_prefix, 'scripts', 'gpu_support', 'nvidia', 'get_cuda_driver_version.sh')
+                    -- We cannot immediately use source_sh, since lmod has no way of catching a potential error
+                    -- and we don't want this to raise an LmodError just because nvidia-smi doesn't exist or
+                    -- doesn't print the right output (happens on a node with nvidia-smi but no driver installed).
+                    -- The only way to catch this is to source the script first with os.execute and make sure it
+                    -- returns with a zero exit code. Unfortunately, this means we have to run nvidia-smi twice, which
+                    -- is a bit slow. Since the result is then cached in the EESSI_CUDA_DRIVER_VERSION environment
+                    -- variable, this is probably acceptable
+                    local r1, r2, r3 = os.execute("bash -c 'source " .. script .. "'")
+                    local exit_code = 0
+                    if type(r1) == "number" then
+                        -- Lua 5.1 or earlier, this is our exit code
+                        exit_code = r1
+                    else
+                        -- Lua 5.2 or later, r3 is our exit code
+                        exit_code = r3
+                    end
+                    if exit_code == 0 then
+                        source_sh("bash", script)
+                    end
                 end
+                cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION")
                 local cudaVersion_req = os.getenv("EESSICUDAVERSION")
-                -- driver CUDA versions don't give a patch version for CUDA
-                local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)")
-                local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)")
-                local driver_libs_need_update = false
-                if major < major_req then
-                    driver_libs_need_update = true
-                elseif major == major_req then
-                    if minor < minor_req then
+                if not cudaVersion or cudaVersion == "" then
+                    local suppress_var = "EESSI_CUDA_DRIVER_VERSION_SUPPRESS_WARNING"
+                    local warn = "Environment variable EESSI_CUDA_DRIVER_VERSION not found. "
+                    warn = warn .. "Cannot ensure that driver version is new enough for CUDA toolkit version: '"
+                    warn = warn .. cudaVersion_req .. "'. This module will still be loaded, but may not function "
+                    warn = warn .. "as expected. Export " .. suppress_var .. "=1 to suppress this warning."
+                    local suppress_warn = os.getenv(suppress_var)
+                    -- os.getenv returns a string (or nil), so compare against "1", never the number 1
+                    if suppress_warn ~= "1" then
+                        LmodWarning(warn)
+                    end
+                else
+                    -- driver CUDA versions don't give a patch version for CUDA
+                    local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)")
+                    local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)")
+                    local driver_libs_need_update = false
+                    if tonumber(major) < tonumber(major_req) then
                         driver_libs_need_update = true
+                    elseif tonumber(major) == tonumber(major_req) then
+                        if tonumber(minor) < tonumber(minor_req) then
+                            driver_libs_need_update = true
+                        end
+                    end
+                    if driver_libs_need_update == true then
+                        local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". "
+                        advice = advice .. "Please update your CUDA driver libraries and then "
+                        advice = advice .. "let EESSI know about the update.\\n"
+                        advice = advice .. refer_to_docs
+                        LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice)
                     end
-                end
-                if driver_libs_need_update == true then
-                    local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". "
-                    advice = advice .. "Please update your CUDA driver libraries and then "
-                    advice = advice .. "let EESSI know about the update.\\n"
-                    advice = advice .. refer_to_docs
-                    LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice)
                 end
             end
         end
diff --git a/install_scripts.sh b/install_scripts.sh
index 022c0f9f..004ac075 100755
--- a/install_scripts.sh
+++ b/install_scripts.sh
@@ -211,5 +211,6 @@ nvidia_files=(
     install_cuda_and_libraries.sh
     install_cuda_host_injections.sh
     link_nvidia_host_libraries.sh
+    get_cuda_driver_version.sh
 )
 copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_files[@]}"
diff --git a/scripts/gpu_support/nvidia/get_cuda_driver_version.sh b/scripts/gpu_support/nvidia/get_cuda_driver_version.sh
new file mode 100644
index 00000000..65ff2a1a
--- /dev/null
+++ b/scripts/gpu_support/nvidia/get_cuda_driver_version.sh
@@ -0,0 +1,4 @@
+# This can be leveraged by the source_sh() feature of Lmod
+set -o pipefail
+EESSI_CUDA_DRIVER_VERSION=$(nvidia-smi --query | grep -oP 'CUDA Version\s*:\s*\K[0-9.]+') || return $?
+export EESSI_CUDA_DRIVER_VERSION