Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 38 additions & 19 deletions create_lmodsitepackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,6 @@
else
cudaDriverDir = eessi_eprefix .. "/lib/nvidia"
end
local cudaVersionFile = cudaDriverDir .. "/cuda_version.txt"
local cudaDriverFile = cudaDriverDir .. "/libcuda.so"
local cudaDriverExists = isFile(cudaDriverFile)
local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so")
Expand All @@ -189,28 +188,48 @@
else
-- CUDA driver exists, now we check its version to see if an update is needed
if cudaDriverExists then
local cudaVersion = read_file(cudaVersionFile)
if not cudaVersion then
LmodError("No CUDA version file\\n" .. cudaVersionFile .. "\\nfound. " .. refer_to_docs)
local cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION")
if not cudaVersion or cudaVersion == "" then
local eessi_prefix = os.getenv("EESSI_PREFIX")
local script = pathJoin(eessi_prefix, 'scripts', 'gpu_support', 'nvidia', 'get_cuda_driver_version.sh')
-- Check return code first. We don't want source_sh to raise an LmodError, we just print
-- an LmodWarning stating we couldn't do a proper version compatibility check
local rc = os.execute("bash -c 'source " .. script .. "'")
if rc == 0 then
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For me this does not seem to work. Even though the script exists, the condition always evaluates to false. By adding some debugging statements, I found that `rc` is a boolean (or nil?), and `if rc then` works for me.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://www.lua.org/pil/22.2.html is not really clear, but my AI friend told me:

In Lua, os.execute() returns different values depending on the Lua version.

✅ Lua 5.2 and newer (5.2, 5.3, 5.4)

os.execute(command) returns three values:
success, exit_type, code = os.execute("your_command")

Meaning:
success → true if the command terminated successfully (exit status 0), nil otherwise
exit_type → string:
"exit" → program exited normally
"signal" → program was killed by a signal (Unix)
code → numeric:
exit status (if "exit")
signal number (if "signal")

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lua 5.1 apparently returns only a number...

💡 Portable pattern

If you want code that works across versions:

-- os.execute() returns a single status number on Lua 5.1, but the triple
-- (success, exit_type, code) on Lua 5.2+. Asking for three results is safe on
-- both: on Lua 5.1, r2 and r3 are simply nil.
local r1, r2, r3 = os.execute(cmd)

-- Declared outside the branches so both values are still in scope after the
-- `if` and can be checked by the caller (e.g. `exit_code == 0`).
local success, exit_code
if type(r1) == "number" then
    -- Lua 5.1: r1 is the raw, system-dependent status of the C system() call.
    -- On POSIX systems the exit code sits in the high byte, hence the division
    -- by 256. Deliberately not floored: a command killed by a signal then
    -- yields a non-zero fraction instead of a misleading 0.
    exit_code = r1 / 256
    success = (r1 == 0)
else
    -- Lua 5.2+: r1 is true only when the command exited with status 0,
    -- and r3 carries the exit status (or the signal number).
    success = r1
    exit_code = r3
end

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm confused, so I can catch 3 returns, even if the command only returns a single one (i.e. r1)? And this does not lead to errors?

Also: does this change then solve your original issue of this thing always evaluating to false? I guess I then just check exit_code == 0 and that would work across both, right?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, apparently that works in Lua:

-- Demo function that returns exactly one value (42).
function test()
    return 42
end

> test()
42
> r1, r2, r3 = test()
> r1
42
> r2
nil
> r3
nil

Checking the exit code should work in that case, indeed.

source_sh("bash", script)
end
end
cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION")
local cudaVersion_req = os.getenv("EESSICUDAVERSION")
-- driver CUDA versions don't give a patch version for CUDA
local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)")
local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)")
local driver_libs_need_update = false
if major < major_req then
driver_libs_need_update = true
elseif major == major_req then
if minor < minor_req then
if not cudaVersion or cudaVersion == "" then
local suppress_var = "EESSI_CUDA_DRIVER_VERSION_SUPPRESS_WARNING"
local warn = "Environment variable EESSI_CUDA_DRIVER_VERSION not found. "
warn = warn .. "Cannot ensure that driver version is new enough for CUDA toolkit version: '"
warn = warn .. cudaVersion_req .. "'. This module will still be loaded, but may not function "
warn = warn .. "as expected. Export " .. suppress_var .. "=1"
local suppress_warn = os.getenv(suppress_var)
if not suppress_warn or suppress_warn == 1 then
LmodWarning(warn)
end
else
-- driver CUDA versions don't give a patch version for CUDA
local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)")
local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)")
local driver_libs_need_update = false
if tonumber(major) < tonumber(major_req) then
driver_libs_need_update = true
elseif tonumber(major) == tonumber(major_req) then
if tonumber(minor) < tonumber(minor_req) then
driver_libs_need_update = true
end
end
if driver_libs_need_update == true then
local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". "
advice = advice .. "Please update your CUDA driver libraries and then "
advice = advice .. "let EESSI know about the update.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice)
end
end
if driver_libs_need_update == true then
local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". "
advice = advice .. "Please update your CUDA driver libraries and then "
advice = advice .. "let EESSI know about the update.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice)
end
end
end
Expand Down
4 changes: 4 additions & 0 deletions scripts/gpu_support/nvidia/get_cuda_driver_version.sh
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file will have to be added to the following list to ensure that it gets deployed to cvmfs:
https://github.com/EESSI/software-layer-scripts/blob/main/install_scripts.sh#L210

Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Export EESSI_CUDA_DRIVER_VERSION with the CUDA version reported by nvidia-smi.
# This script is meant to be sourced, not executed: a top-level `return` is
# only valid in a sourced script.
# This can be leveraged by the source_sh() feature of Lmod
#
# Let the pipeline below fail when nvidia-smi fails, not only when grep finds
# no match.
set -o pipefail
# grep -oP with \K prints only the version number that follows
# "CUDA Version :". The assignment is kept separate from `export` so that `||`
# tests the pipeline's exit status; on failure that status is returned to the
# sourcing shell and the variable is not exported.
EESSI_CUDA_DRIVER_VERSION=$(nvidia-smi --query | grep -oP 'CUDA Version\s*:\s*\K[0-9.]+') || return $?
export EESSI_CUDA_DRIVER_VERSION
Loading