Azure · ganeshkumarashok · May 31, 2026
diff --git a/entrypoint.sh b/entrypoint.sh
@@ -23,11 +23,30 @@ if [[ "${1}" == "copy" ]]; then
     exit 0
 fi
 
-if [[ "${1}" == "install" ]]; then
-    echo "copying gpu cache files"
-    cp -a /opt/gpu/. /mnt/gpu/
-    echo "copied successfully!"
-fi
+# Map the requested action to the VAR=1 assignment passed to install.sh via env; an action matching no arm copies nothing here and leaves it empty.
+#   install            -> no assignment: full compile + device init (legacy behaviour)
+#   build-only         -> AKSGPU_BUILD_ONLY=1: compile/cache the kernel module only (VHD build, no GPU)
+#   install-skip-build -> AKSGPU_SKIP_KERNEL_BUILD=1: device init only, reusing the module prebuilt into the VHD
+GPU_INSTALL_MODE_ENV=""
+case "${1}" in
+    install)
+        echo "copying gpu cache files"
+        cp -a /opt/gpu/. /mnt/gpu/
+        echo "copied successfully!"
+        ;;
+    build-only)
+        echo "copying gpu cache files (build-only)"
+        cp -a /opt/gpu/. /mnt/gpu/
+        echo "copied successfully!"
+        GPU_INSTALL_MODE_ENV="AKSGPU_BUILD_ONLY=1"
+        ;;
+    install-skip-build)
+        echo "copying gpu cache files (install-skip-build)"
+        cp -a /opt/gpu/. /mnt/gpu/
+        echo "copied successfully!"
+        GPU_INSTALL_MODE_ENV="AKSGPU_SKIP_KERNEL_BUILD=1"
+        ;;
+esac
 
 ACTION_FILE="/opt/actions/install.sh"
 
@@ -46,7 +65,11 @@ cp -R /opt/actions/. /mnt/actions
 
 echo "Executing nsenter"
 
-nsenter -t 1 -m bash "${ACTION_FILE}"
+if [[ -n "${GPU_INSTALL_MODE_ENV}" ]]; then
+    nsenter -t 1 -m env "${GPU_INSTALL_MODE_ENV}" bash "${ACTION_FILE}"
+else
+    nsenter -t 1 -m bash "${ACTION_FILE}"
+fi
 RESULT="${PIPESTATUS[0]}"
 
 if [ $RESULT -eq 0 ]; then

diff --git a/install.sh b/install.sh
@@ -7,18 +7,54 @@ source /opt/gpu/package_manager_helpers.sh
 trap 'PS4="+ "' exit
 PS4='+ $(date -u -I"seconds" | cut -c1-19) '
 
+# Install mode flags (set by entrypoint.sh based on the requested action):
+#   AKSGPU_BUILD_ONLY=1        -> compile/cache the kernel module + userspace libs only.
+#                                 Runs on a GPU-less host (e.g. the Packer VHD builder).
+#                                 Skips every device-dependent step (modprobe, nvidia-smi,
+#                                 fabric manager, persistence) and writes a marker.
+#   AKSGPU_SKIP_KERNEL_BUILD=1 -> the kernel module + libs were prebuilt into the VHD for
+#                                 this exact kernel+driver; skip recompilation and only run
+#                                 the device-dependent steps at node boot.
+#   (neither set)              -> legacy behaviour: full compile + device init in one shot.
+AKSGPU_BUILD_ONLY="${AKSGPU_BUILD_ONLY:-0}"
+AKSGPU_SKIP_KERNEL_BUILD="${AKSGPU_SKIP_KERNEL_BUILD:-0}"
+
+# Host-side marker describing what was baked into the VHD at build time. AgentBaker reads
+# this (plus its own image-digest record) to decide whether the boot-time fast path is safe.
+DKMS_MARKER_FILE="/opt/azure/aks-gpu/dkms-marker"
+
 KERNEL_NAME=$(uname -r)
 LOG_FILE_NAME="/var/log/nvidia-installer-$(date +%s).log"
 ARCH=$(uname -m)
 
-set +euo pipefail
-open_devices="$(lsof /dev/nvidia* 2>/dev/null)"
-echo "Open devices: $open_devices"
-
-open_gridd="$(lsof /usr/bin/nvidia-gridd 2>/dev/null)"
-echo "Open gridd: $open_gridd"
-
-set -euo pipefail
+# cleanup_overlay unmounts the overlay and tmpfs and removes /tmp/overlay, but only when OVERLAY_MOUNTED is 1; every step is best-effort.
+OVERLAY_MOUNTED=0
+cleanup_overlay() {
+    set +e
+    if [[ "${OVERLAY_MOUNTED}" == "1" ]]; then
+        umount -l "/usr/lib/${ARCH}-linux-gnu" || true
+        umount /tmp/overlay || true
+        rm -r /tmp/overlay || true
+        OVERLAY_MOUNTED=0
+    fi
+    set -e
+}
+trap 'cleanup_overlay; PS4="+ "' EXIT # bash keeps one EXIT handler: chain the PS4 reset registered earlier in this file instead of replacing it
+
+resolve_runfile() {
+    if [[ "${DRIVER_KIND}" == "cuda" ]]; then
+        RUNFILE="NVIDIA-Linux-${ARCH}-${DRIVER_VERSION}"
+    elif [[ "${DRIVER_KIND}" == "grid" ]]; then
+        if [[ "${ARCH}" != "x86_64" ]]; then
+            echo "GRID driver is only supported on x86_64 architecture"
+            exit 1
+        fi
+        RUNFILE="NVIDIA-Linux-x86_64-${DRIVER_VERSION}-grid-azure"
+    else
+        echo "Invalid driver kind: ${DRIVER_KIND}"
+        exit 1
+    fi
+}
 
 # install cached nvidia debian packages for container runtime compatibility
 install_cached_nvidia_packages() {
@@ -27,87 +63,126 @@ for apt_package in $NVIDIA_PACKAGES; do
 done
 }
 
-use_package_manager_with_retries wait_for_dpkg_lock install_cached_nvidia_packages 10 3
-
-# blacklist nouveau driver, nvidia driver dependency
-cp /opt/gpu/blacklist-nouveau.conf /etc/modprobe.d/blacklist-nouveau.conf
-update-initramfs -u
-
-# clean up lingering files from previous install
-set +e
-umount -l /usr/lib/$(uname -m)-linux-gnu || true
-umount -l /tmp/overlay || true
-rm -r /tmp/overlay || true
-set -e
-
-# set up overlayfs to change install location of nvidia libs from /usr/lib/$ARCH-linux-gnu to /usr/local/nvidia
-# add an extra layer of indirection via tmpfs because it's not possible to have an overlayfs on an overlayfs (i.e., inside a container)
-mkdir /tmp/overlay
-mount -t tmpfs tmpfs /tmp/overlay
-mkdir /tmp/overlay/{workdir,lib64}
-mkdir -p ${GPU_DEST}/lib64
-mount -t overlay overlay -o lowerdir=/usr/lib/$(uname -m)-linux-gnu,upperdir=/tmp/overlay/lib64,workdir=/tmp/overlay/workdir /usr/lib/$(uname -m)-linux-gnu
-
-if [[ "${DRIVER_KIND}" == "cuda" ]]; then
-    RUNFILE="NVIDIA-Linux-$(uname -m)-${DRIVER_VERSION}"
-elif [[ "${DRIVER_KIND}" == "grid" ]]; then
-    if [[ $(uname -m) != "x86_64" ]]; then
-        echo "GRID driver is only supported on x86_64 architecture"
-        exit 1
-    fi
-    RUNFILE="NVIDIA-Linux-x86_64-${DRIVER_VERSION}-grid-azure"
-else
-    echo "Invalid driver kind: ${DRIVER_KIND}"
-    exit 1
-fi
+install_nvidia_container_toolkit() {
+    use_package_manager_with_retries wait_for_dpkg_lock install_cached_nvidia_packages 10 3
+}
 
-# install nvidia drivers
-pushd /opt/gpu
-/opt/gpu/${RUNFILE}/nvidia-installer -s -k=$KERNEL_NAME --log-file-name=${LOG_FILE_NAME} -a --no-drm --dkms
-nvidia-smi
-popd
-
-# move nvidia libs to correct location from temporary overlayfs
-cp -a /tmp/overlay/lib64 ${GPU_DEST}/lib64
-
-# configure system to know about nvidia lib paths
-echo "${GPU_DEST}/lib64" > /etc/ld.so.conf.d/nvidia.conf
-ldconfig 
-
-# unmount, cleanup
-set +e
-umount -l /usr/lib/$(uname -m)-linux-gnu
-umount /tmp/overlay
-rm -r /tmp/overlay
-set -e
-
-# validate that nvidia driver is working
-dkms status
-nvidia-modprobe -u -c0
-
-# configure persistence daemon
-# decreases latency for later driver loads
-# reduces nvidia-smi invocation time 10x from 30 to 2 sec 
-# notable on large VM sizes with multiple GPUs
-# especially when nvidia-smi process is in CPU cgroup
-cp -r /usr/bin/lib64/lib64/* /usr/lib/$(uname -m)-linux-gnu/
-nvidia-smi
-
-# install fabricmanager for nvlink based systems
-if [[ "${DRIVER_KIND}" == "cuda" ]]; then
-    NVIDIA_FM_ARCH=$(uname -m)
-    if [ $NVIDIA_FM_ARCH = "arm64" ]; then
-        # NVIDIA uses the name "SBSA" for ARM64 platforms for the fabric manager. See https://en.wikipedia.org/wiki/Server_Base_System_Architecture
-        NVIDIA_FM_ARCH="sbsa"
+# build_kernel_module compiles the NVIDIA kernel module (the expensive step) and stages the
+# userspace libraries. It performs NO device access, so it is safe to run at VHD build time on
+# a host without a GPU.
+build_kernel_module() {
+    # blacklist nouveau driver, nvidia driver dependency
+    cp /opt/gpu/blacklist-nouveau.conf /etc/modprobe.d/blacklist-nouveau.conf
+    update-initramfs -u
+
+    # clean up lingering files from previous install
+    set +e
+    umount -l "/usr/lib/${ARCH}-linux-gnu" || true
+    umount -l /tmp/overlay || true
+    rm -r /tmp/overlay || true
+    set -e
+
+    # set up overlayfs to change install location of nvidia libs from /usr/lib/$ARCH-linux-gnu to /usr/local/nvidia
+    # add an extra layer of indirection via tmpfs because it's not possible to have an overlayfs on an overlayfs (i.e., inside a container)
+    mkdir /tmp/overlay
+    # arm cleanup_overlay before the first mount: if a later step here fails, the EXIT trap must still unmount the tmpfs
+    OVERLAY_MOUNTED=1
+    mount -t tmpfs tmpfs /tmp/overlay
+    mkdir /tmp/overlay/{workdir,lib64}
+    mkdir -p "${GPU_DEST}/lib64"
+    mount -t overlay overlay -o lowerdir="/usr/lib/${ARCH}-linux-gnu",upperdir=/tmp/overlay/lib64,workdir=/tmp/overlay/workdir "/usr/lib/${ARCH}-linux-gnu"
+
+    resolve_runfile
+
+    # install nvidia drivers (DKMS build is the dominant cost we are hoisting to VHD build time)
+    pushd /opt/gpu
+    "/opt/gpu/${RUNFILE}/nvidia-installer" -s -k="${KERNEL_NAME}" --log-file-name="${LOG_FILE_NAME}" -a --no-drm --dkms
+    popd
+
+    # move nvidia libs to correct location from temporary overlayfs
+    cp -a /tmp/overlay/lib64 "${GPU_DEST}/lib64"
+
+    # configure system to know about nvidia lib paths
+    echo "${GPU_DEST}/lib64" > /etc/ld.so.conf.d/nvidia.conf
+    ldconfig
+    cleanup_overlay
+
+    # validate that the kernel module was built and registered (no device access required)
+    dkms status
+    modinfo -k "$KERNEL_NAME" nvidia
+}
+
+# device_init loads the driver, runs nvidia-smi, installs fabric manager for cuda drivers, and
+# installs the containerd runtime drop-in and udev rule; it is run on every non-build-only path.
+device_init() {
+    nvidia-modprobe -u -c0
+
+    # copy the libs under /usr/bin/lib64/lib64 into the system lib dir, then run nvidia-smi
+    # NOTE(review): the legacy comment here described configuring the persistence daemon
+    # (faster later driver loads, nvidia-smi ~30s -> ~2s on large multi-GPU VM sizes), but no
+    # persistence-daemon step is visible in this function; confirm whether one is expected.
+    # NOTE(review): the source path presumably equals ${GPU_DEST}/lib64/lib64; confirm GPU_DEST.
+    cp -r /usr/bin/lib64/lib64/* "/usr/lib/${ARCH}-linux-gnu/"
+    nvidia-smi
+
+    # install fabricmanager for nvlink based systems
+    if [[ "${DRIVER_KIND}" == "cuda" ]]; then
+        NVIDIA_FM_ARCH=$ARCH
+        if [ "$NVIDIA_FM_ARCH" = "arm64" ]; then
+            # NVIDIA names ARM64 fabric manager packages "SBSA" (https://en.wikipedia.org/wiki/Server_Base_System_Architecture). NOTE(review): ARCH comes from uname -m, which typically reports "aarch64" rather than "arm64" on Linux; confirm this branch can match.
+            NVIDIA_FM_ARCH="sbsa"
+        fi
+        bash /opt/gpu/fabricmanager-linux-${NVIDIA_FM_ARCH}-${DRIVER_VERSION}/sbin/fm_run_package_installer.sh
     fi
-    bash /opt/gpu/fabricmanager-linux-${NVIDIA_FM_ARCH}-${DRIVER_VERSION}/sbin/fm_run_package_installer.sh
+
+    mkdir -p /etc/containerd/config.d
+    cp /opt/gpu/10-nvidia-runtime.toml /etc/containerd/config.d/10-nvidia-runtime.toml
+
+    mkdir -p "$(dirname /lib/udev/rules.d/71-nvidia-dev-char.rules)"
+    cp /opt/gpu/71-nvidia-char-dev.rules /lib/udev/rules.d/71-nvidia-dev-char.rules
+    /usr/bin/nvidia-ctk system create-dev-char-symlinks --create-all
+}
+
+# write_dkms_marker records kernel, driver version, driver kind and arch as key=value lines in DKMS_MARKER_FILE.
+write_dkms_marker() {
+    mkdir -p "$(dirname "${DKMS_MARKER_FILE}")"
+    printf '%s\n' \
+        "kernel=${KERNEL_NAME}" \
+        "driver_version=${DRIVER_VERSION}" \
+        "driver_kind=${DRIVER_KIND}" \
+        "arch=${ARCH}" > "${DKMS_MARKER_FILE}"
+}
+
+set +euo pipefail # relax strict mode so the two lsof probes below cannot abort the script
+open_devices="$(lsof /dev/nvidia* 2>/dev/null)"
+echo "Open devices: $open_devices"
+
+open_gridd="$(lsof /usr/bin/nvidia-gridd 2>/dev/null)"
+echo "Open gridd: $open_gridd"
+set -euo pipefail
+
+if [ "${AKSGPU_BUILD_ONLY}" = "1" ]; then
+    # VHD build time: compile + cache only, no device access; the marker is written only after build_kernel_module returns.
+    echo "aks-gpu: build-only mode (prebuilding kernel module for kernel ${KERNEL_NAME})"
+    build_kernel_module
+    write_dkms_marker
+    rm -r /opt/gpu
+    exit 0
 fi
 
-mkdir -p /etc/containerd/config.d
-cp /opt/gpu/10-nvidia-runtime.toml /etc/containerd/config.d/10-nvidia-runtime.toml
+install_nvidia_container_toolkit
+
+if [ "${AKSGPU_SKIP_KERNEL_BUILD}" = "1" ]; then
+    # Node boot with a module prebuilt for this kernel+driver: skip recompilation and only check that
+    # the module is registered. Under set -e a failing modinfo aborts here; there is no fallback build.
+    echo "aks-gpu: skip-kernel-build mode (using module prebuilt in VHD for kernel ${KERNEL_NAME})"
+    ldconfig
+    dkms status
+    modinfo -k "$KERNEL_NAME" nvidia
+else
+    build_kernel_module
+fi
 
-mkdir -p "$(dirname /lib/udev/rules.d/71-nvidia-dev-char.rules)"
-cp /opt/gpu/71-nvidia-char-dev.rules /lib/udev/rules.d/71-nvidia-dev-char.rules
-/usr/bin/nvidia-ctk system create-dev-char-symlinks --create-all
+device_init
 
 rm -r /opt/gpu