From 350d520e219efdf2789bc1fb2b3658bfe5b11e58 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 20 Apr 2026 19:18:46 -0700 Subject: [PATCH 1/2] meta-nvidia: add 580.95.05 recipes for nvidia, fabricmanager, nscq Required for RTX PRO 6000 Blackwell Server Edition (10de:2bb5) Confidential Compute support: the 570.172.08 NVIDIA Open Kernel Module predates Pro 6000 SE and refuses to attach with "GPU confidential compute capability is not enabled" inside a TDX guest. 580.95.05 recognizes GB202 and brings the full TDX guest CC stack online (verified end-to-end: nvidia-smi conf-compute reports CC State=ON, GPU CC Capabilities=CC Capable, Protected memory=99461312 KiB; PyTorch CUDA workloads run inside the TD). The nscq archive at 580 no longer ships bin/nscq-cli, so the new recipe drops it from do_install and FILES. --- .../nvidia/libnvidia-nscq_580.95.05.bb | 32 +++++++ .../nvidia/nvidia-fabricmanager_580.95.05.bb | 86 +++++++++++++++++++ .../nvidia/nvidia_580.95.05.bb | 24 ++++++ 3 files changed, 142 insertions(+) create mode 100644 meta-nvidia/recipes-graphics/nvidia/libnvidia-nscq_580.95.05.bb create mode 100644 meta-nvidia/recipes-graphics/nvidia/nvidia-fabricmanager_580.95.05.bb create mode 100644 meta-nvidia/recipes-graphics/nvidia/nvidia_580.95.05.bb diff --git a/meta-nvidia/recipes-graphics/nvidia/libnvidia-nscq_580.95.05.bb b/meta-nvidia/recipes-graphics/nvidia/libnvidia-nscq_580.95.05.bb new file mode 100644 index 0000000..7ba3538 --- /dev/null +++ b/meta-nvidia/recipes-graphics/nvidia/libnvidia-nscq_580.95.05.bb @@ -0,0 +1,32 @@ +SUMMARY = "NVIDIA NSCQ library" +DESCRIPTION = "NVIDIA NSCQ (NVIDIA System Communication Queue) library for NVIDIA GPU systems" +HOMEPAGE = "https://developer.nvidia.com/" +LICENSE = "NVIDIA-Proprietary" +LIC_FILES_CHKSUM = "file://LICENSE;md5=2cc00be68c1227a7c42ff3620ef75d05" + +SRC_URI = "https://developer.download.nvidia.com/compute/nvidia-driver/redist/libnvidia_nscq/linux-x86_64/libnvidia_nscq-linux-x86_64-${PV}-archive.tar.xz" 
+SRC_URI[md5sum] = "6bc20061ebdae98fadd7a76110b44430" +SRC_URI[sha256sum] = "c2285c12f10ec2afc0ad2949f7fcc282b6fd37f32165c1df241451ccabb1067a" + +S = "${WORKDIR}/libnvidia_nscq-linux-x86_64-${PV}-archive" + +INSANE_SKIP:${PN} = "already-stripped ldflags" + +do_configure[noexec] = "1" +do_compile[noexec] = "1" + +do_install() { + install -d ${D}${libdir} + + install -m 0755 ${S}/lib/libnvidia-nscq.so.${PV} ${D}${libdir} + ln -sf libnvidia-nscq.so.${PV} ${D}${libdir}/libnvidia-nscq.so.2.0 + ln -sf libnvidia-nscq.so.2.0 ${D}${libdir}/libnvidia-nscq.so.2 + ln -sf libnvidia-nscq.so.2 ${D}${libdir}/libnvidia-nscq.so +} + +FILES:${PN} = "\ + ${libdir}/libnvidia-nscq.so.${PV} \ + ${libdir}/libnvidia-nscq.so.2.0 \ + ${libdir}/libnvidia-nscq.so.2 \ + ${libdir}/libnvidia-nscq.so \ +" diff --git a/meta-nvidia/recipes-graphics/nvidia/nvidia-fabricmanager_580.95.05.bb b/meta-nvidia/recipes-graphics/nvidia/nvidia-fabricmanager_580.95.05.bb new file mode 100644 index 0000000..6225e7d --- /dev/null +++ b/meta-nvidia/recipes-graphics/nvidia/nvidia-fabricmanager_580.95.05.bb @@ -0,0 +1,86 @@ +SUMMARY = "NVIDIA Fabric Manager for NVSwitch systems" +DESCRIPTION = "NVIDIA Fabric Manager provides NVSwitch management for NVIDIA HGX and DGX systems" +HOMEPAGE = "https://developer.nvidia.com/" +LICENSE = "NVIDIA-Proprietary" +LIC_FILES_CHKSUM = "file://LICENSE;md5=2cc00be68c1227a7c42ff3620ef75d05" + +SRC_URI = "https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/fabricmanager-linux-x86_64-${PV}-archive.tar.xz" +SRC_URI[md5sum] = "a6568aa288cb4784b85ba6826463f918" +SRC_URI[sha256sum] = "f0220bfb67d04b4107acf00cc95abe5a9268fd8f8b5bae26971f4df232e4369c" + +S = "${WORKDIR}/fabricmanager-linux-x86_64-${PV}-archive" + +DEPENDS = "" +RDEPENDS:${PN} = "bash zlib" + +INSANE_SKIP:${PN} = "already-stripped ldflags" + +do_configure[noexec] = "1" +do_compile[noexec] = "1" + +inherit systemd + +SYSTEMD_AUTO_ENABLE = "enable" +SYSTEMD_SERVICE:${PN} = 
"nvidia-fabricmanager.service" + +do_install() { + # Create directories + install -d ${D}${bindir} + install -d ${D}${libdir} + install -d ${D}${datadir}/nvidia/nvswitch + install -d ${D}${systemd_system_unitdir} + + # Install binaries + install -m 0755 ${S}/bin/nv-fabricmanager ${D}${bindir} + install -m 0755 ${S}/bin/nvidia-fabricmanager-start.sh ${D}${bindir} + install -m 0755 ${S}/bin/nvswitch-audit ${D}${bindir} + + # Install libraries + install -m 0644 ${S}/lib/libnvfm.so.1 ${D}${libdir} + ln -sf libnvfm.so.1 ${D}${libdir}/libnvfm.so + + # Install config files + install -m 0644 ${S}/etc/fabricmanager.cfg ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/etc/fabricmanager_multinode.cfg ${D}${datadir}/nvidia/nvswitch/ + + # Install topology files + install -m 0644 ${S}/share/nvidia/nvswitch/dgx2_hgx2_topology ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/dgxa100_hgxa100_topology ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/dgxh100_hgxh100_topology ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/dgxh800_hgxh800_topology ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/mgxh20_nvl16_topology ${D}${datadir}/nvidia/nvswitch/ + + # Install multi-node topology files + install -m 0644 ${S}/share/nvidia/nvswitch/dgxgh200_hgxgh200_8gpus_topology ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/dgxgh200_hgxgh200_16gpus_topology ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/dgxgh200_hgxgh200_16gpus_trunk_connections.csv ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/dgxgh200_hgxgh200_16gpus_osfp_connections.csv ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/dgxgh200_hgxgh200_16gpus_osfp_cable_connections.csv ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/dgxgh200_hgxgh200_32gpus_topology 
${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/dgxgh200_hgxgh200_32gpus_trunk_connections.csv ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/dgxgh200_hgxgh200_32gpus_osfp_connections.csv ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/dgxgh200_hgxgh200_32gpus_osfp_cable_connections.csv ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/gh200_nvlink_32gpus_topology ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/gb200_nvl36r1_c2g4_topology ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/gb200_nvl36r1_c2g2_topology ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/gb200_nvl72r1_c2g4_topology ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/gb200_nvl72r2_c2g4_topology ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/gb200_nvl72r2_c2g2_topology ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/gb200_nvl576r16_c2g4_topology ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/gb200_nvl8r1_c2g4_etf_topology ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/gb200_nvl8r1_c2g4_etf_nso_topology ${D}${datadir}/nvidia/nvswitch/ + install -m 0644 ${S}/share/nvidia/nvswitch/gb200_nvl4r1_c2g2_etf_topology ${D}${datadir}/nvidia/nvswitch/ + + # Install systemd service + install -m 0644 ${S}/systemd/nvidia-fabricmanager.service ${D}${systemd_system_unitdir} +} + +FILES:${PN} = "\ + ${bindir}/nv-fabricmanager \ + ${bindir}/nvidia-fabricmanager-start.sh \ + ${bindir}/nvswitch-audit \ + ${libdir}/libnvfm.so.1 \ + ${libdir}/libnvfm.so \ + ${datadir}/nvidia/nvswitch/* \ + ${systemd_system_unitdir}/nvidia-fabricmanager.service \ +" diff --git a/meta-nvidia/recipes-graphics/nvidia/nvidia_580.95.05.bb b/meta-nvidia/recipes-graphics/nvidia/nvidia_580.95.05.bb new 
file mode 100644 index 0000000..3f0677a --- /dev/null +++ b/meta-nvidia/recipes-graphics/nvidia/nvidia_580.95.05.bb @@ -0,0 +1,24 @@ +SUMMARY = "NVidia Graphics Driver" +LICENSE = "NVIDIA-Proprietary" +LIC_FILES_CHKSUM = "file://../LICENSE;md5=92aa2e2af6aa0bcba1c3fe49da021937" + +NVIDIA_ARCHIVE_NAME = "NVIDIA-Linux-${TARGET_ARCH}-${PV}" +NVIDIA_SRC = "${WORKDIR}/${NVIDIA_ARCHIVE_NAME}" +SRC_URI = " \ + https://us.download.nvidia.com/tesla/${PV}/${NVIDIA_ARCHIVE_NAME}.run \ +" +SRC_URI[md5sum] = "3d23653c4898d08b1f3f031ea8cdaa93" +SRC_URI[sha256sum] = "849ef0ef8e842b9806b2cde9f11c1303d54f1a9a769467e4e5d961b2fe1182a7" + +RDEPENDS:${PN} = "nvidia-modprobe-config" + +do_unpack() { + chmod +x ${DL_DIR}/${NVIDIA_ARCHIVE_NAME}.run + rm -rf ${NVIDIA_SRC} + ${DL_DIR}/${NVIDIA_ARCHIVE_NAME}.run -x --target ${NVIDIA_SRC} +} + +do_make_scripts[noexec] = "1" + +include nvidia-kernel-module.inc +include nvidia-libs.inc From 85e7e171657725e7ca2d3701b1e77ca6241c1939 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 20 Apr 2026 19:30:37 -0700 Subject: [PATCH 2/2] meta-nvidia: glob topology files + pin nvidia stack via NVIDIA_VERSION - nvidia-fabricmanager: install all files from share/nvidia/nvswitch/ via glob loop instead of an explicit list. The 580.95.05 archive ships new GB300 topologies (gb300_nvl72r{1,2}_c2g4_topology) that the old hard-coded list missed; future archives stay covered automatically. (Addresses Copilot review feedback on PR #56.) - dstack.conf: introduce NVIDIA_VERSION = "580.95.05" and pin PREFERRED_VERSION_{nvidia,nvidia-fabricmanager,libnvidia-nscq} from it so the kernel module ABI and userspace libs always move together. 
--- meta-dstack/conf/distro/dstack.conf | 7 ++++ .../nvidia/nvidia-fabricmanager_580.95.05.bb | 33 +++---------------- 2 files changed, 11 insertions(+), 29 deletions(-) diff --git a/meta-dstack/conf/distro/dstack.conf b/meta-dstack/conf/distro/dstack.conf index 7cd225e..edaece7 100644 --- a/meta-dstack/conf/distro/dstack.conf +++ b/meta-dstack/conf/distro/dstack.conf @@ -24,4 +24,11 @@ SERIAL_CONSOLES = "115200;ttyS0" PREFERRED_VERSION_rust-bin-cross-x86_64 = "1.92.0" PREFERRED_VERSION_cargo-bin-cross-x86_64 = "1.92.0" +# NVIDIA driver stack (only consulted when nvidia flavor is built). +# Bump NVIDIA_VERSION only; all three recipes must exist at that version — the kernel module ABI is paired with the userspace libs. +NVIDIA_VERSION = "580.95.05" +PREFERRED_VERSION_nvidia = "${NVIDIA_VERSION}" +PREFERRED_VERSION_nvidia-fabricmanager = "${NVIDIA_VERSION}" +PREFERRED_VERSION_libnvidia-nscq = "${NVIDIA_VERSION}" + BAD_RECOMMENDATIONS = "busybox-syslog" diff --git a/meta-nvidia/recipes-graphics/nvidia/nvidia-fabricmanager_580.95.05.bb b/meta-nvidia/recipes-graphics/nvidia/nvidia-fabricmanager_580.95.05.bb index 6225e7d..7187848 100644 --- a/meta-nvidia/recipes-graphics/nvidia/nvidia-fabricmanager_580.95.05.bb +++ b/meta-nvidia/recipes-graphics/nvidia/nvidia-fabricmanager_580.95.05.bb @@ -39,37 +39,12 @@ do_install() { install -m 0644 ${S}/lib/libnvfm.so.1 ${D}${libdir} ln -sf libnvfm.so.1 ${D}${libdir}/libnvfm.so - # Install config files + # Install config + topology files (glob picks up new SKUs in future archives) install -m 0644 ${S}/etc/fabricmanager.cfg ${D}${datadir}/nvidia/nvswitch/ install -m 0644 ${S}/etc/fabricmanager_multinode.cfg ${D}${datadir}/nvidia/nvswitch/ - - # Install topology files - install -m 0644 ${S}/share/nvidia/nvswitch/dgx2_hgx2_topology ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 ${S}/share/nvidia/nvswitch/dgxa100_hgxa100_topology ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 ${S}/share/nvidia/nvswitch/dgxh100_hgxh100_topology ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 
${S}/share/nvidia/nvswitch/dgxh800_hgxh800_topology ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 ${S}/share/nvidia/nvswitch/mgxh20_nvl16_topology ${D}${datadir}/nvidia/nvswitch/ - - # Install multi-node topology files - install -m 0644 ${S}/share/nvidia/nvswitch/dgxgh200_hgxgh200_8gpus_topology ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 ${S}/share/nvidia/nvswitch/dgxgh200_hgxgh200_16gpus_topology ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 ${S}/share/nvidia/nvswitch/dgxgh200_hgxgh200_16gpus_trunk_connections.csv ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 ${S}/share/nvidia/nvswitch/dgxgh200_hgxgh200_16gpus_osfp_connections.csv ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 ${S}/share/nvidia/nvswitch/dgxgh200_hgxgh200_16gpus_osfp_cable_connections.csv ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 ${S}/share/nvidia/nvswitch/dgxgh200_hgxgh200_32gpus_topology ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 ${S}/share/nvidia/nvswitch/dgxgh200_hgxgh200_32gpus_trunk_connections.csv ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 ${S}/share/nvidia/nvswitch/dgxgh200_hgxgh200_32gpus_osfp_connections.csv ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 ${S}/share/nvidia/nvswitch/dgxgh200_hgxgh200_32gpus_osfp_cable_connections.csv ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 ${S}/share/nvidia/nvswitch/gh200_nvlink_32gpus_topology ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 ${S}/share/nvidia/nvswitch/gb200_nvl36r1_c2g4_topology ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 ${S}/share/nvidia/nvswitch/gb200_nvl36r1_c2g2_topology ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 ${S}/share/nvidia/nvswitch/gb200_nvl72r1_c2g4_topology ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 ${S}/share/nvidia/nvswitch/gb200_nvl72r2_c2g4_topology ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 ${S}/share/nvidia/nvswitch/gb200_nvl72r2_c2g2_topology ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 
${S}/share/nvidia/nvswitch/gb200_nvl576r16_c2g4_topology ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 ${S}/share/nvidia/nvswitch/gb200_nvl8r1_c2g4_etf_topology ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 ${S}/share/nvidia/nvswitch/gb200_nvl8r1_c2g4_etf_nso_topology ${D}${datadir}/nvidia/nvswitch/ - install -m 0644 ${S}/share/nvidia/nvswitch/gb200_nvl4r1_c2g2_etf_topology ${D}${datadir}/nvidia/nvswitch/ + for f in ${S}/share/nvidia/nvswitch/*; do + [ -f "$f" ] && install -m 0644 "$f" ${D}${datadir}/nvidia/nvswitch/ + done # Install systemd service install -m 0644 ${S}/systemd/nvidia-fabricmanager.service ${D}${systemd_system_unitdir}