Skip to content
This repository has been archived by the owner on Oct 24, 2023. It is now read-only.

Commit

Permalink
refactor: store CSE exit codes in code (#3012)
Browse files Browse the repository at this point in the history
  • Loading branch information
jackfrancis committed Apr 2, 2020
1 parent 7f301d2 commit f6ed0e5
Show file tree
Hide file tree
Showing 14 changed files with 502 additions and 422 deletions.
4 changes: 2 additions & 2 deletions docs/howto/troubleshooting.md
Expand Up @@ -31,7 +31,7 @@ From a vm node that did not provision successfully:

How to determine the above?

1. Look at the deployment error message. The error should include which VM extension failed the deployment. For example, `cse-master-0` means that the CSE extension of VM master 0 failed. It should also include the date that the extension started running at and the name of the VM it was running on.
1. Look at the deployment error message. The error should include which VM extension failed the deployment. For example, `cse-master-0` means that the CSE extension of VM master 0 failed. It should also include the date that the extension started running at and the name of the VM it was running on.

2. From a master node: `kubectl get nodes`

Expand All @@ -48,7 +48,7 @@ How to determine the above?
execute command: command terminated with exit status=20\n[stdout]\n\n[stderr]\n"."
```

Look for the exit code. In the above example, the exit code is `20`. The list of exit codes and their meaning can be found [here](../../parts/k8s/cloud-init/artifacts/cse_helpers.sh).
Look for the exit code. In the above example, the exit code is `20`. The list of exit codes and their meaning can be found [here](../../pkg/engine/cse.go).

If after following the above you are still unable to troubleshoot your deployment error, please open a Github issue with title "CSE error: exit code <INSERT_YOUR_EXIT_CODE>" and include the following in the description:

Expand Down
42 changes: 21 additions & 21 deletions parts/k8s/cloud-init/artifacts/cis.sh
Expand Up @@ -7,7 +7,7 @@ assignRootPW() {
CMD="import crypt, getpass, pwd; print crypt.crypt('$SECRET', '\$6\$$SALT\$')"
HASH=$(python -c "$CMD")

echo 'root:'$HASH | /usr/sbin/chpasswd -e || exit $ERR_CIS_ASSIGN_FILE_PERMISSION
echo 'root:'$HASH | /usr/sbin/chpasswd -e || exit 112
fi
}

Expand Down Expand Up @@ -37,33 +37,33 @@ assignFilePermissions() {
for FILE in ${FILES}; do
FILEPATH="/var/log/${FILE}"
DIR=$(dirname "${FILEPATH}")
mkdir -p ${DIR} || exit $ERR_CIS_ASSIGN_FILE_PERMISSION
touch ${FILEPATH} || exit $ERR_CIS_ASSIGN_FILE_PERMISSION
chmod 640 ${FILEPATH} || exit $ERR_CIS_ASSIGN_FILE_PERMISSION
mkdir -p ${DIR} || exit 112
touch ${FILEPATH} || exit 112
chmod 640 ${FILEPATH} || exit 112
done
find /var/log -type f -perm '/o+r' -exec chmod 'g-wx,o-rwx' {} \;
chmod 600 /etc/passwd- || exit $ERR_CIS_ASSIGN_FILE_PERMISSION
chmod 600 /etc/shadow- || exit $ERR_CIS_ASSIGN_FILE_PERMISSION
chmod 600 /etc/group- || exit $ERR_CIS_ASSIGN_FILE_PERMISSION
chmod 644 /etc/default/grub || exit $ERR_CIS_ASSIGN_FILE_PERMISSION
chmod 600 /etc/passwd- || exit 112
chmod 600 /etc/shadow- || exit 112
chmod 600 /etc/group- || exit 112
chmod 644 /etc/default/grub || exit 112
for filepath in /etc/crontab /etc/cron.hourly /etc/cron.daily /etc/cron.weekly /etc/cron.monthly /etc/cron.d; do
chmod 0600 $filepath || exit $ERR_CIS_ASSIGN_FILE_PERMISSION
chmod 0600 $filepath || exit 112
done
}

setPWExpiration() {
sed -i "s|PASS_MAX_DAYS||g" /etc/login.defs || exit $ERR_CIS_APPLY_PASSWORD_CONFIG
grep 'PASS_MAX_DAYS' /etc/login.defs && exit $ERR_CIS_APPLY_PASSWORD_CONFIG
sed -i "s|PASS_MIN_DAYS||g" /etc/login.defs || exit $ERR_CIS_APPLY_PASSWORD_CONFIG
grep 'PASS_MIN_DAYS' /etc/login.defs && exit $ERR_CIS_APPLY_PASSWORD_CONFIG
sed -i "s|INACTIVE=||g" /etc/default/useradd || exit $ERR_CIS_APPLY_PASSWORD_CONFIG
grep 'INACTIVE=' /etc/default/useradd && exit $ERR_CIS_APPLY_PASSWORD_CONFIG
echo 'PASS_MAX_DAYS 90' >> /etc/login.defs || exit $ERR_CIS_APPLY_PASSWORD_CONFIG
grep 'PASS_MAX_DAYS 90' /etc/login.defs || exit $ERR_CIS_APPLY_PASSWORD_CONFIG
echo 'PASS_MIN_DAYS 7' >> /etc/login.defs || exit $ERR_CIS_APPLY_PASSWORD_CONFIG
grep 'PASS_MIN_DAYS 7' /etc/login.defs || exit $ERR_CIS_APPLY_PASSWORD_CONFIG
echo 'INACTIVE=30' >> /etc/default/useradd || exit $ERR_CIS_APPLY_PASSWORD_CONFIG
grep 'INACTIVE=30' /etc/default/useradd || exit $ERR_CIS_APPLY_PASSWORD_CONFIG
sed -i "s|PASS_MAX_DAYS||g" /etc/login.defs || exit 115
grep 'PASS_MAX_DAYS' /etc/login.defs && exit 115
sed -i "s|PASS_MIN_DAYS||g" /etc/login.defs || exit 115
grep 'PASS_MIN_DAYS' /etc/login.defs && exit 115
sed -i "s|INACTIVE=||g" /etc/default/useradd || exit 115
grep 'INACTIVE=' /etc/default/useradd && exit 115
echo 'PASS_MAX_DAYS 90' >> /etc/login.defs || exit 115
grep 'PASS_MAX_DAYS 90' /etc/login.defs || exit 115
echo 'PASS_MIN_DAYS 7' >> /etc/login.defs || exit 115
grep 'PASS_MIN_DAYS 7' /etc/login.defs || exit 115
echo 'INACTIVE=30' >> /etc/default/useradd || exit 115
grep 'INACTIVE=30' /etc/default/useradd || exit 115
}

applyCIS() {
Expand Down
106 changes: 53 additions & 53 deletions parts/k8s/cloud-init/artifacts/cse_config.sh
Expand Up @@ -72,7 +72,7 @@ configureEtcd() {
set -x

ETCD_SETUP_FILE=/opt/azure/containers/setup-etcd.sh
wait_for_file 1200 1 $ETCD_SETUP_FILE || exit $ERR_ETCD_CONFIG_FAIL
wait_for_file 1200 1 $ETCD_SETUP_FILE || exit {{GetCSEErrorCode "ERR_ETCD_CONFIG_FAIL"}}
$ETCD_SETUP_FILE > /opt/azure/containers/setup-etcd.log 2>&1
RET=$?
if [ $RET -ne 0 ]; then
Expand All @@ -88,9 +88,9 @@ configureEtcd() {
fi

MOUNT_ETCD_FILE=/opt/azure/containers/mountetcd.sh
wait_for_file 1200 1 $MOUNT_ETCD_FILE || exit $ERR_ETCD_CONFIG_FAIL
$MOUNT_ETCD_FILE || exit $ERR_ETCD_VOL_MOUNT_FAIL
systemctlEnableAndStart etcd || exit $ERR_ETCD_START_TIMEOUT
wait_for_file 1200 1 $MOUNT_ETCD_FILE || exit {{GetCSEErrorCode "ERR_ETCD_CONFIG_FAIL"}}
$MOUNT_ETCD_FILE || exit {{GetCSEErrorCode "ERR_ETCD_VOL_MOUNT_FAIL"}}
systemctlEnableAndStart etcd || exit {{GetCSEErrorCode "ERR_ETCD_START_TIMEOUT"}}
for i in $(seq 1 600); do
MEMBER="$(sudo -E etcdctl member list | grep -E ${NODE_NAME} | cut -d':' -f 1)"
if [ "$MEMBER" != "" ]; then
Expand All @@ -99,29 +99,29 @@ configureEtcd() {
sleep 1
fi
done
retrycmd_if_failure 120 5 25 sudo -E etcdctl member update $MEMBER ${ETCD_PEER_URL} || exit $ERR_ETCD_CONFIG_FAIL
retrycmd_if_failure 120 5 25 sudo -E etcdctl member update $MEMBER ${ETCD_PEER_URL} || exit {{GetCSEErrorCode "ERR_ETCD_CONFIG_FAIL"}}
}

ensureNTP() {
systemctlEnableAndStart ntp || exit $ERR_SYSTEMCTL_START_FAIL
systemctlEnableAndStart ntp || exit {{GetCSEErrorCode "ERR_SYSTEMCTL_START_FAIL"}}
}

ensureRPC() {
systemctlEnableAndStart rpcbind || exit $ERR_SYSTEMCTL_START_FAIL
systemctlEnableAndStart rpc-statd || exit $ERR_SYSTEMCTL_START_FAIL
systemctlEnableAndStart rpcbind || exit {{GetCSEErrorCode "ERR_SYSTEMCTL_START_FAIL"}}
systemctlEnableAndStart rpc-statd || exit {{GetCSEErrorCode "ERR_SYSTEMCTL_START_FAIL"}}
}

ensureAuditD() {
if [[ "${AUDITD_ENABLED}" == true ]]; then
systemctlEnableAndStart auditd || exit $ERR_SYSTEMCTL_START_FAIL
systemctlEnableAndStart auditd || exit {{GetCSEErrorCode "ERR_SYSTEMCTL_START_FAIL"}}
else
apt_get_purge auditd mlocate &
fi
}

generateAggregatedAPICerts() {
AGGREGATED_API_CERTS_SETUP_FILE=/etc/kubernetes/generate-proxy-certs.sh
wait_for_file 1200 1 $AGGREGATED_API_CERTS_SETUP_FILE || exit $ERR_FILE_WATCH_TIMEOUT
wait_for_file 1200 1 $AGGREGATED_API_CERTS_SETUP_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
$AGGREGATED_API_CERTS_SETUP_FILE
}

Expand Down Expand Up @@ -208,7 +208,7 @@ EOF

configureCNI() {
{{/* needed for the iptables rules to work on bridges */}}
retrycmd_if_failure 120 5 25 modprobe br_netfilter || exit $ERR_MODPROBE_FAIL
retrycmd_if_failure 120 5 25 modprobe br_netfilter || exit {{GetCSEErrorCode "ERR_MODPROBE_FAIL"}}
echo -n "br_netfilter" > /etc/modules-load.d/br_netfilter.conf
configureCNIIPTables
{{if HasCiliumNetworkPlugin}}
Expand Down Expand Up @@ -240,16 +240,16 @@ configureCNIIPTables() {

{{- if NeedsContainerd}}
ensureContainerd() {
wait_for_file 1200 1 /etc/systemd/system/containerd.service.d/exec_start.conf || exit $ERR_FILE_WATCH_TIMEOUT
wait_for_file 1200 1 /etc/containerd/config.toml || exit $ERR_FILE_WATCH_TIMEOUT
systemctlEnableAndStart containerd || exit $ERR_SYSTEMCTL_START_FAIL
wait_for_file 1200 1 /etc/systemd/system/containerd.service.d/exec_start.conf || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
wait_for_file 1200 1 /etc/containerd/config.toml || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
systemctlEnableAndStart containerd || exit {{GetCSEErrorCode "ERR_SYSTEMCTL_START_FAIL"}}
}
{{end}}

{{- if IsDockerContainerRuntime}}
ensureDocker() {
DOCKER_SERVICE_EXEC_START_FILE=/etc/systemd/system/docker.service.d/exec_start.conf
wait_for_file 1200 1 $DOCKER_SERVICE_EXEC_START_FILE || exit $ERR_FILE_WATCH_TIMEOUT
wait_for_file 1200 1 $DOCKER_SERVICE_EXEC_START_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
usermod -aG docker ${ADMINUSER}
DOCKER_MOUNT_FLAGS_SYSTEMD_FILE=/etc/systemd/system/docker.service.d/clear_mount_propagation_flags.conf
DOCKER_JSON_FILE=/etc/docker/daemon.json
Expand All @@ -258,46 +258,46 @@ ensureDocker() {
jq '.' < $DOCKER_JSON_FILE && break
fi
if [ $i -eq 1200 ]; then
exit $ERR_FILE_WATCH_TIMEOUT
exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
else
sleep 1
fi
done
systemctlEnableAndStart docker || exit $ERR_DOCKER_START_FAIL
systemctlEnableAndStart docker || exit {{GetCSEErrorCode "ERR_DOCKER_START_FAIL"}}
{{/* Delay start of docker-monitor for 30 mins after booting */}}
DOCKER_MONITOR_SYSTEMD_TIMER_FILE=/etc/systemd/system/docker-monitor.timer
wait_for_file 1200 1 $DOCKER_MONITOR_SYSTEMD_TIMER_FILE || exit $ERR_FILE_WATCH_TIMEOUT
wait_for_file 1200 1 $DOCKER_MONITOR_SYSTEMD_TIMER_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
DOCKER_MONITOR_SYSTEMD_FILE=/etc/systemd/system/docker-monitor.service
wait_for_file 1200 1 $DOCKER_MONITOR_SYSTEMD_FILE || exit $ERR_FILE_WATCH_TIMEOUT
systemctlEnableAndStart docker-monitor.timer || exit $ERR_SYSTEMCTL_START_FAIL
wait_for_file 1200 1 $DOCKER_MONITOR_SYSTEMD_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
systemctlEnableAndStart docker-monitor.timer || exit {{GetCSEErrorCode "ERR_SYSTEMCTL_START_FAIL"}}
}
{{end}}

{{- if EnableEncryptionWithExternalKms}}
ensureKMS() {
systemctlEnableAndStart kms || exit $ERR_SYSTEMCTL_START_FAIL
systemctlEnableAndStart kms || exit {{GetCSEErrorCode "ERR_SYSTEMCTL_START_FAIL"}}
}
{{end}}

{{- if IsIPv6Enabled}}
ensureDHCPv6() {
wait_for_file 3600 1 {{GetDHCPv6ServiceCSEScriptFilepath}} || exit $ERR_FILE_WATCH_TIMEOUT
wait_for_file 3600 1 {{GetDHCPv6ConfigCSEScriptFilepath}} || exit $ERR_FILE_WATCH_TIMEOUT
systemctlEnableAndStart dhcpv6 || exit $ERR_SYSTEMCTL_START_FAIL
retrycmd_if_failure 120 5 25 modprobe ip6_tables || exit $ERR_MODPROBE_FAIL
wait_for_file 3600 1 {{GetDHCPv6ServiceCSEScriptFilepath}} || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
wait_for_file 3600 1 {{GetDHCPv6ConfigCSEScriptFilepath}} || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
systemctlEnableAndStart dhcpv6 || exit {{GetCSEErrorCode "ERR_SYSTEMCTL_START_FAIL"}}
retrycmd_if_failure 120 5 25 modprobe ip6_tables || exit {{GetCSEErrorCode "ERR_MODPROBE_FAIL"}}
}
{{end}}

ensureKubelet() {
wait_for_file 1200 1 /etc/sysctl.d/11-aks-engine.conf || exit $ERR_FILE_WATCH_TIMEOUT
sysctl_reload 10 5 120 || exit $ERR_SYSCTL_RELOAD
wait_for_file 1200 1 /etc/sysctl.d/11-aks-engine.conf || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
sysctl_reload 10 5 120 || exit {{GetCSEErrorCode "ERR_SYSCTL_RELOAD"}}
KUBELET_DEFAULT_FILE=/etc/default/kubelet
wait_for_file 1200 1 $KUBELET_DEFAULT_FILE || exit $ERR_FILE_WATCH_TIMEOUT
wait_for_file 1200 1 $KUBELET_DEFAULT_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
KUBECONFIG_FILE=/var/lib/kubelet/kubeconfig
wait_for_file 1200 1 $KUBECONFIG_FILE || exit $ERR_FILE_WATCH_TIMEOUT
wait_for_file 1200 1 $KUBECONFIG_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
KUBELET_RUNTIME_CONFIG_SCRIPT_FILE=/opt/azure/containers/kubelet.sh
wait_for_file 1200 1 $KUBELET_RUNTIME_CONFIG_SCRIPT_FILE || exit $ERR_FILE_WATCH_TIMEOUT
systemctlEnableAndStart kubelet || exit $ERR_KUBELET_START_FAIL
wait_for_file 1200 1 $KUBELET_RUNTIME_CONFIG_SCRIPT_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
systemctlEnableAndStart kubelet || exit {{GetCSEErrorCode "ERR_KUBELET_START_FAIL"}}
{{if HasCiliumNetworkPolicy}}
while [ ! -f /etc/cni/net.d/05-cilium.conf ]; do
sleep 3
Expand All @@ -317,10 +317,10 @@ ensureKubelet() {

ensureLabelNodes() {
LABEL_NODES_SCRIPT_FILE=/opt/azure/containers/label-nodes.sh
wait_for_file 1200 1 $LABEL_NODES_SCRIPT_FILE || exit $ERR_FILE_WATCH_TIMEOUT
wait_for_file 1200 1 $LABEL_NODES_SCRIPT_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
LABEL_NODES_SYSTEMD_FILE=/etc/systemd/system/label-nodes.service
wait_for_file 1200 1 $LABEL_NODES_SYSTEMD_FILE || exit $ERR_FILE_WATCH_TIMEOUT
systemctlEnableAndStart label-nodes || exit $ERR_SYSTEMCTL_START_FAIL
wait_for_file 1200 1 $LABEL_NODES_SYSTEMD_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
systemctlEnableAndStart label-nodes || exit {{GetCSEErrorCode "ERR_SYSTEMCTL_START_FAIL"}}
}

ensureJournal() {
Expand All @@ -330,27 +330,27 @@ ensureJournal() {
echo "RuntimeMaxUse=1G"
echo "ForwardToSyslog=yes"
} >> /etc/systemd/journald.conf
systemctlEnableAndStart systemd-journald || exit $ERR_SYSTEMCTL_START_FAIL
systemctlEnableAndStart systemd-journald || exit {{GetCSEErrorCode "ERR_SYSTEMCTL_START_FAIL"}}
}

ensureK8sControlPlane() {
if $REBOOTREQUIRED || [ "$NO_OUTBOUND" = "true" ]; then
return
fi
retrycmd_if_failure 120 5 25 $KUBECTL 2>/dev/null cluster-info || exit $ERR_K8S_RUNNING_TIMEOUT
retrycmd_if_failure 120 5 25 $KUBECTL 2>/dev/null cluster-info || exit {{GetCSEErrorCode "ERR_K8S_RUNNING_TIMEOUT"}}
}

{{- if IsAzurePolicyAddonEnabled}}
ensureLabelExclusionForAzurePolicyAddon() {
GATEKEEPER_NAMESPACE="gatekeeper-system"
retrycmd_if_failure 120 5 25 $KUBECTL create ns --save-config $GATEKEEPER_NAMESPACE 2>/dev/null || exit $ERR_K8S_RUNNING_TIMEOUT
retrycmd_if_failure 120 5 25 $KUBECTL create ns --save-config $GATEKEEPER_NAMESPACE 2>/dev/null || exit {{GetCSEErrorCode "ERR_K8S_RUNNING_TIMEOUT"}}

retrycmd_if_failure 120 5 25 $KUBECTL label ns kube-system control-plane=controller-manager 2>/dev/null || exit $ERR_K8S_RUNNING_TIMEOUT
retrycmd_if_failure 120 5 25 $KUBECTL label ns kube-system control-plane=controller-manager 2>/dev/null || exit {{GetCSEErrorCode "ERR_K8S_RUNNING_TIMEOUT"}}
}
{{end}}

ensureEtcd() {
retrycmd_if_failure 120 5 25 curl --cacert /etc/kubernetes/certs/ca.crt --cert /etc/kubernetes/certs/etcdclient.crt --key /etc/kubernetes/certs/etcdclient.key ${ETCD_CLIENT_URL}/v2/machines || exit $ERR_ETCD_RUNNING_TIMEOUT
retrycmd_if_failure 120 5 25 curl --cacert /etc/kubernetes/certs/ca.crt --cert /etc/kubernetes/certs/etcdclient.crt --key /etc/kubernetes/certs/etcdclient.key ${ETCD_CLIENT_URL}/v2/machines || exit {{GetCSEErrorCode "ERR_ETCD_RUNNING_TIMEOUT"}}
}

createKubeManifestDir() {
Expand Down Expand Up @@ -395,7 +395,7 @@ users:
{{- if IsClusterAutoscalerAddonEnabled}}
configClusterAutoscalerAddon() {
CLUSTER_AUTOSCALER_ADDON_FILE=/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml
wait_for_file 1200 1 $CLUSTER_AUTOSCALER_ADDON_FILE || exit $ERR_FILE_WATCH_TIMEOUT
wait_for_file 1200 1 $CLUSTER_AUTOSCALER_ADDON_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
sed -i "s|<clientID>|$(echo $SERVICE_PRINCIPAL_CLIENT_ID | base64)|g" $CLUSTER_AUTOSCALER_ADDON_FILE
sed -i "s|<clientSec>|$(echo $SERVICE_PRINCIPAL_CLIENT_SECRET | base64)|g" $CLUSTER_AUTOSCALER_ADDON_FILE
sed -i "s|<subID>|$(echo $SUBSCRIPTION_ID | base64)|g" $CLUSTER_AUTOSCALER_ADDON_FILE
Expand All @@ -413,7 +413,7 @@ configACIConnectorAddon() {
ACI_CONNECTOR_CERT=$(base64 /etc/kubernetes/certs/aci-connector-cert.pem -w0)

ACI_CONNECTOR_ADDON_FILE=/etc/kubernetes/addons/aci-connector-deployment.yaml
wait_for_file 1200 1 $ACI_CONNECTOR_ADDON_FILE || exit $ERR_FILE_WATCH_TIMEOUT
wait_for_file 1200 1 $ACI_CONNECTOR_ADDON_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
sed -i "s|<creds>|$ACI_CONNECTOR_CREDENTIALS|g" $ACI_CONNECTOR_ADDON_FILE
sed -i "s|<rgName>|$RESOURCE_GROUP|g" $ACI_CONNECTOR_ADDON_FILE
sed -i "s|<cert>|$ACI_CONNECTOR_CERT|g" $ACI_CONNECTOR_ADDON_FILE
Expand Down Expand Up @@ -452,32 +452,32 @@ configGPUDrivers() {
{{/* we will manually install nvidia-docker2 */}}
rmmod nouveau
echo blacklist nouveau >> /etc/modprobe.d/blacklist.conf
retrycmd_if_failure_no_stats 120 5 25 update-initramfs -u || exit $ERR_GPU_DRIVERS_CONFIG
retrycmd_if_failure_no_stats 120 5 25 update-initramfs -u || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_CONFIG"}}
wait_for_apt_locks
retrycmd_if_failure 30 5 3600 apt-get -o Dpkg::Options::="--force-confold" install -y nvidia-container-runtime="${NVIDIA_CONTAINER_RUNTIME_VERSION}+docker18.09.2-1" || exit $ERR_GPU_DRIVERS_CONFIG
retrycmd_if_failure 30 5 3600 apt-get -o Dpkg::Options::="--force-confold" install -y nvidia-container-runtime="${NVIDIA_CONTAINER_RUNTIME_VERSION}+docker18.09.2-1" || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_CONFIG"}}
tmpDir=$GPU_DEST/tmp
(
set -e -o pipefail
cd "${tmpDir}"
wait_for_apt_locks
dpkg-deb -R ./nvidia-docker2*.deb "${tmpDir}/pkg" || exit $ERR_GPU_DRIVERS_CONFIG
cp -r ${tmpDir}/pkg/usr/* /usr/ || exit $ERR_GPU_DRIVERS_CONFIG
dpkg-deb -R ./nvidia-docker2*.deb "${tmpDir}/pkg" || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_CONFIG"}}
cp -r ${tmpDir}/pkg/usr/* /usr/ || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_CONFIG"}}
)
rm -rf $GPU_DEST/tmp
retrycmd_if_failure 120 5 25 pkill -SIGHUP dockerd || exit $ERR_GPU_DRIVERS_CONFIG
retrycmd_if_failure 120 5 25 pkill -SIGHUP dockerd || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_CONFIG"}}
mkdir -p $GPU_DEST/lib64 $GPU_DEST/overlay-workdir
retrycmd_if_failure 120 5 25 mount -t overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=${GPU_DEST}/lib64,workdir=${GPU_DEST}/overlay-workdir none /usr/lib/x86_64-linux-gnu || exit $ERR_GPU_DRIVERS_CONFIG
retrycmd_if_failure 3 1 600 sh $GPU_DEST/nvidia-drivers-$GPU_DV --silent --accept-license --no-drm --dkms --utility-prefix="${GPU_DEST}" --opengl-prefix="${GPU_DEST}" || exit $ERR_GPU_DRIVERS_START_FAIL
retrycmd_if_failure 120 5 25 mount -t overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=${GPU_DEST}/lib64,workdir=${GPU_DEST}/overlay-workdir none /usr/lib/x86_64-linux-gnu || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_CONFIG"}}
retrycmd_if_failure 3 1 600 sh $GPU_DEST/nvidia-drivers-$GPU_DV --silent --accept-license --no-drm --dkms --utility-prefix="${GPU_DEST}" --opengl-prefix="${GPU_DEST}" || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_START_FAIL"}}
echo "${GPU_DEST}/lib64" > /etc/ld.so.conf.d/nvidia.conf
retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL
retrycmd_if_failure 120 5 25 ldconfig || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_START_FAIL"}}
umount -l /usr/lib/x86_64-linux-gnu
retrycmd_if_failure 120 5 25 nvidia-modprobe -u -c0 || exit $ERR_GPU_DRIVERS_START_FAIL
retrycmd_if_failure 120 5 25 $GPU_DEST/bin/nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL
retrycmd_if_failure 120 5 25 nvidia-modprobe -u -c0 || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_START_FAIL"}}
retrycmd_if_failure 120 5 25 $GPU_DEST/bin/nvidia-smi || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_START_FAIL"}}
retrycmd_if_failure 120 5 25 ldconfig || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_START_FAIL"}}
}
ensureGPUDrivers() {
configGPUDrivers
systemctlEnableAndStart nvidia-modprobe || exit $ERR_GPU_DRIVERS_START_FAIL
systemctlEnableAndStart nvidia-modprobe || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_START_FAIL"}}
}
{{end}}
#EOF

0 comments on commit f6ed0e5

Please sign in to comment.