Skip to content
This repository has been archived by the owner on Oct 24, 2023. It is now read-only.

feat: only download nvidia drivers if needed #4797

Merged
merged 5 commits into from
Jan 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions examples/no_outbound.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,13 @@
"availabilityProfile": "VirtualMachineScaleSets",
"distro": "aks-ubuntu-18.04"
},
{
"name": "poolgpu",
"count": 1,
"vmSize": "Standard_NC6",
"availabilityProfile": "VirtualMachineScaleSets",
"distro": "aks-ubuntu-18.04"
},
{
"name": "poolwinvhd",
"count": 1,
Expand Down
9 changes: 5 additions & 4 deletions parts/k8s/cloud-init/artifacts/cse_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -622,10 +622,10 @@ configGPUDrivers() {
echo blacklist nouveau >>/etc/modprobe.d/blacklist.conf
retrycmd_no_stats 120 5 25 update-initramfs -u || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_CONFIG"}}
wait_for_apt_locks
dpkg -i $(ls ${APT_CACHE_DIR}libnvidia-container1*) || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_CONFIG"}}
dpkg -i $(ls ${APT_CACHE_DIR}libnvidia-container-tools*) || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_CONFIG"}}
dpkg -i $(ls ${APT_CACHE_DIR}nvidia-container-toolkit*) || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_CONFIG"}}
dpkg -i $(ls ${APT_CACHE_DIR}nvidia-container-runtime*) || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_CONFIG"}}
for apt_package in $NVIDIA_PACKAGES; do
dpkg -i ${PERMANENT_CACHE_DIR}${apt_package}* || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_CONFIG"}}
done
dpkg -i ${PERMANENT_CACHE_DIR}nvidia-container-runtime* || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_CONFIG"}}
mkdir -p $GPU_DEST/lib64 $GPU_DEST/overlay-workdir
retrycmd 120 5 25 mount -t overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=${GPU_DEST}/lib64,workdir=${GPU_DEST}/overlay-workdir none /usr/lib/x86_64-linux-gnu || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_CONFIG"}}
export -f installNvidiaDrivers
Expand All @@ -637,6 +637,7 @@ configGPUDrivers() {
retrycmd 120 5 25 nvidia-modprobe -u -c0 || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_START_FAIL"}}
retrycmd 120 5 25 nvidia-smi || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_START_FAIL"}}
retrycmd 120 5 25 ldconfig || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_START_FAIL"}}
rm -Rf ${PERMANENT_CACHE_DIR}
}
ensureGPUDrivers() {
configGPUDrivers
Expand Down
1 change: 1 addition & 0 deletions parts/k8s/cloud-init/artifacts/cse_helpers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ if ! [[ $(echo -n "$PRIVATE_IP" | grep -c '^') == 1 ]]; then
fi
export PRIVATE_IP
APT_CACHE_DIR=/var/cache/apt/archives/
PERMANENT_CACHE_DIR=/root/aptcache/

configure_prerequisites() {
ip_forward_path=/proc/sys/net/ipv4/ip_forward
Expand Down
18 changes: 16 additions & 2 deletions parts/k8s/cloud-init/artifacts/cse_install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ CONTAINERD_DOWNLOADS_DIR="/opt/containerd/downloads"
APMZ_DOWNLOADS_DIR="/opt/apmz/downloads"
UBUNTU_RELEASE=$(lsb_release -r -s)
UBUNTU_CODENAME=$(lsb_release -c -s)
NVIDIA_PACKAGES="libnvidia-container1 libnvidia-container-tools nvidia-container-toolkit"
NVIDIA_CONTAINER_TOOLKIT_VER=1.6.0
NVIDIA_RUNTIME_VER=3.6.0

disableTimeSyncd() {
systemctl_stop 20 5 10 systemd-timesyncd || exit 3
Expand Down Expand Up @@ -67,6 +70,12 @@ installDeps() {
fi
fi
}
#######################################
# Report whether every NVIDIA container package is already present in the
# permanent apt cache, so the download step can be skipped.
# Globals:
#   NVIDIA_PACKAGES     (read) space-separated list of package name prefixes
#   PERMANENT_CACHE_DIR (read) cache directory path, including trailing slash
# Outputs:
#   Nothing; this is a silent predicate.
# Returns:
#   0 if at least one file matches "<cache dir><package>*" for each entry in
#   NVIDIA_PACKAGES and for nvidia-container-runtime; 1 otherwise.
#######################################
gpuDriversDownloaded() {
  local apt_package

  # With an empty cache dir the patterns below would be matched against the
  # current working directory, which could yield a false positive.
  [[ -n "${PERMANENT_CACHE_DIR:-}" ]] || return 1

  # NVIDIA_PACKAGES is a space-separated list: word-splitting is intentional.
  for apt_package in ${NVIDIA_PACKAGES:-} nvidia-container-runtime; do
    # compgen -G succeeds only when the glob matches at least one path; the
    # quoted pattern keeps a cache path containing whitespace intact.
    compgen -G "${PERMANENT_CACHE_DIR}${apt_package}*" >/dev/null || return 1
  done
}
downloadGPUDrivers() {
mkdir -p $GPU_DEST/tmp
retrycmd_no_stats 120 5 25 curl -fsSL https://nvidia.github.io/nvidia-docker/gpgkey >$GPU_DEST/tmp/aptnvidia.gpg || exit 85
Expand All @@ -78,8 +87,13 @@ downloadGPUDrivers() {
retrycmd_no_stats 120 5 25 cat $GPU_DEST/tmp/nvidia-docker.list >/etc/apt/sources.list.d/nvidia-docker.list || exit 85
apt_get_update
retrycmd 30 5 60 curl -fLS https://us.download.nvidia.com/tesla/$GPU_DV/NVIDIA-Linux-x86_64-${GPU_DV}.run -o ${GPU_DEST}/nvidia-drivers-${GPU_DV} || exit 85
tmpDir=$GPU_DEST/tmp
apt_get_download 20 30 libnvidia-container1=1.6.0* libnvidia-container-tools=1.6.0* nvidia-container-toolkit=1.6.0* nvidia-container-runtime=3.6.0* || exit 85
mkdir -p $PERMANENT_CACHE_DIR
for apt_package in $NVIDIA_PACKAGES; do
apt_get_download 20 30 "${apt_package}=${NVIDIA_CONTAINER_TOOLKIT_VER}*" || exit 85
cp -al ${APT_CACHE_DIR}${apt_package}_${NVIDIA_CONTAINER_TOOLKIT_VER}* $PERMANENT_CACHE_DIR || exit 85
done
apt_get_download 20 30 nvidia-container-runtime=${NVIDIA_RUNTIME_VER}* || exit 85
cp -al ${APT_CACHE_DIR}nvidia-container-runtime_${NVIDIA_RUNTIME_VER}* $PERMANENT_CACHE_DIR || exit 85
}
removeMoby() {
apt_get_purge moby-engine moby-cli || exit 27
Expand Down
2 changes: 1 addition & 1 deletion parts/k8s/cloud-init/artifacts/cse_main.sh
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ time_metric "InstallNetworkPlugin" installNetworkPlugin

{{- if and HasNSeriesSKU IsNvidiaDevicePluginAddonEnabled}}
if [[ ${GPU_NODE} == true ]]; then
time_metric "DownloadGPUDrivers" downloadGPUDrivers
gpuDriversDownloaded || time_metric "DownloadGPUDrivers" downloadGPUDrivers
jackfrancis marked this conversation as resolved.
Show resolved Hide resolved
time_metric "EnsureGPUDrivers" ensureGPUDrivers
fi
{{end}}
Expand Down
30 changes: 23 additions & 7 deletions pkg/engine/templates_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.