Skip to content
This repository has been archived by the owner on Oct 24, 2023. It is now read-only.

refactor: store CSE exit codes in code #3012

Merged
merged 5 commits into from Apr 2, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
43 changes: 22 additions & 21 deletions parts/k8s/cloud-init/artifacts/cis.sh
@@ -1,4 +1,5 @@
#!/bin/bash
# exit codes defined in /pkg/engine/cse.go

assignRootPW() {
if grep '^root:[!*]:' /etc/shadow; then
Expand All @@ -7,7 +8,7 @@ assignRootPW() {
CMD="import crypt, getpass, pwd; print crypt.crypt('$SECRET', '\$6\$$SALT\$')"
HASH=$(python -c "$CMD")

echo 'root:'$HASH | /usr/sbin/chpasswd -e || exit $ERR_CIS_ASSIGN_FILE_PERMISSION
echo 'root:'$HASH | /usr/sbin/chpasswd -e || exit 112
fi
}

Expand Down Expand Up @@ -37,33 +38,33 @@ assignFilePermissions() {
for FILE in ${FILES}; do
FILEPATH="/var/log/${FILE}"
DIR=$(dirname "${FILEPATH}")
mkdir -p ${DIR} || exit $ERR_CIS_ASSIGN_FILE_PERMISSION
touch ${FILEPATH} || exit $ERR_CIS_ASSIGN_FILE_PERMISSION
chmod 640 ${FILEPATH} || exit $ERR_CIS_ASSIGN_FILE_PERMISSION
mkdir -p ${DIR} || exit 112
touch ${FILEPATH} || exit 112
chmod 640 ${FILEPATH} || exit 112
done
find /var/log -type f -perm '/o+r' -exec chmod 'g-wx,o-rwx' {} \;
chmod 600 /etc/passwd- || exit $ERR_CIS_ASSIGN_FILE_PERMISSION
chmod 600 /etc/shadow- || exit $ERR_CIS_ASSIGN_FILE_PERMISSION
chmod 600 /etc/group- || exit $ERR_CIS_ASSIGN_FILE_PERMISSION
chmod 644 /etc/default/grub || exit $ERR_CIS_ASSIGN_FILE_PERMISSION
chmod 600 /etc/passwd- || exit 112
chmod 600 /etc/shadow- || exit 112
chmod 600 /etc/group- || exit 112
chmod 644 /etc/default/grub || exit 112
for filepath in /etc/crontab /etc/cron.hourly /etc/cron.daily /etc/cron.weekly /etc/cron.monthly /etc/cron.d; do
chmod 0600 $filepath || exit $ERR_CIS_ASSIGN_FILE_PERMISSION
chmod 0600 $filepath || exit 112
done
}

# setPWExpiration: rewrites the password-aging settings.
# For PASS_MAX_DAYS and PASS_MIN_DAYS (in /etc/login.defs) and INACTIVE=
# (in /etc/default/useradd): sed blanks out the keyword text, grep confirms the
# keyword is gone, then the new setting (90 / 7 / 30) is appended and grep
# confirms it is present. Any step that fails exits the script.
# NOTE(review): this text is a rendered diff, so every statement appears twice:
# first the pre-change forms (exit $ERR_CIS_APPLY_PASSWORD_CONFIG), then the
# post-change forms (exit 115). Presumably only the `exit 115` lines exist in
# the merged file - confirm against the repository.
# NOTE(review): the sed expressions remove only the keyword itself, not the
# rest of the line, so a previous value would be left behind as a bare token -
# verify this is intended.
setPWExpiration() {
sed -i "s|PASS_MAX_DAYS||g" /etc/login.defs || exit $ERR_CIS_APPLY_PASSWORD_CONFIG
grep 'PASS_MAX_DAYS' /etc/login.defs && exit $ERR_CIS_APPLY_PASSWORD_CONFIG
sed -i "s|PASS_MIN_DAYS||g" /etc/login.defs || exit $ERR_CIS_APPLY_PASSWORD_CONFIG
grep 'PASS_MIN_DAYS' /etc/login.defs && exit $ERR_CIS_APPLY_PASSWORD_CONFIG
sed -i "s|INACTIVE=||g" /etc/default/useradd || exit $ERR_CIS_APPLY_PASSWORD_CONFIG
grep 'INACTIVE=' /etc/default/useradd && exit $ERR_CIS_APPLY_PASSWORD_CONFIG
echo 'PASS_MAX_DAYS 90' >> /etc/login.defs || exit $ERR_CIS_APPLY_PASSWORD_CONFIG
grep 'PASS_MAX_DAYS 90' /etc/login.defs || exit $ERR_CIS_APPLY_PASSWORD_CONFIG
echo 'PASS_MIN_DAYS 7' >> /etc/login.defs || exit $ERR_CIS_APPLY_PASSWORD_CONFIG
grep 'PASS_MIN_DAYS 7' /etc/login.defs || exit $ERR_CIS_APPLY_PASSWORD_CONFIG
echo 'INACTIVE=30' >> /etc/default/useradd || exit $ERR_CIS_APPLY_PASSWORD_CONFIG
grep 'INACTIVE=30' /etc/default/useradd || exit $ERR_CIS_APPLY_PASSWORD_CONFIG
sed -i "s|PASS_MAX_DAYS||g" /etc/login.defs || exit 115
grep 'PASS_MAX_DAYS' /etc/login.defs && exit 115
sed -i "s|PASS_MIN_DAYS||g" /etc/login.defs || exit 115
grep 'PASS_MIN_DAYS' /etc/login.defs && exit 115
sed -i "s|INACTIVE=||g" /etc/default/useradd || exit 115
grep 'INACTIVE=' /etc/default/useradd && exit 115
echo 'PASS_MAX_DAYS 90' >> /etc/login.defs || exit 115
grep 'PASS_MAX_DAYS 90' /etc/login.defs || exit 115
echo 'PASS_MIN_DAYS 7' >> /etc/login.defs || exit 115
grep 'PASS_MIN_DAYS 7' /etc/login.defs || exit 115
echo 'INACTIVE=30' >> /etc/default/useradd || exit 115
grep 'INACTIVE=30' /etc/default/useradd || exit 115
}

applyCIS() {
Expand Down
106 changes: 53 additions & 53 deletions parts/k8s/cloud-init/artifacts/cse_config.sh
Expand Up @@ -72,7 +72,7 @@ configureEtcd() {
set -x

ETCD_SETUP_FILE=/opt/azure/containers/setup-etcd.sh
wait_for_file 1200 1 $ETCD_SETUP_FILE || exit $ERR_ETCD_CONFIG_FAIL
wait_for_file 1200 1 $ETCD_SETUP_FILE || exit {{GetCSEErrorCode "ERR_ETCD_CONFIG_FAIL"}}
$ETCD_SETUP_FILE > /opt/azure/containers/setup-etcd.log 2>&1
RET=$?
if [ $RET -ne 0 ]; then
Expand All @@ -88,9 +88,9 @@ configureEtcd() {
fi

MOUNT_ETCD_FILE=/opt/azure/containers/mountetcd.sh
wait_for_file 1200 1 $MOUNT_ETCD_FILE || exit $ERR_ETCD_CONFIG_FAIL
$MOUNT_ETCD_FILE || exit $ERR_ETCD_VOL_MOUNT_FAIL
systemctlEnableAndStart etcd || exit $ERR_ETCD_START_TIMEOUT
wait_for_file 1200 1 $MOUNT_ETCD_FILE || exit {{GetCSEErrorCode "ERR_ETCD_CONFIG_FAIL"}}
$MOUNT_ETCD_FILE || exit {{GetCSEErrorCode "ERR_ETCD_VOL_MOUNT_FAIL"}}
systemctlEnableAndStart etcd || exit {{GetCSEErrorCode "ERR_ETCD_START_TIMEOUT"}}
for i in $(seq 1 600); do
MEMBER="$(sudo -E etcdctl member list | grep -E ${NODE_NAME} | cut -d':' -f 1)"
if [ "$MEMBER" != "" ]; then
Expand All @@ -99,29 +99,29 @@ configureEtcd() {
sleep 1
fi
done
retrycmd_if_failure 120 5 25 sudo -E etcdctl member update $MEMBER ${ETCD_PEER_URL} || exit $ERR_ETCD_CONFIG_FAIL
retrycmd_if_failure 120 5 25 sudo -E etcdctl member update $MEMBER ${ETCD_PEER_URL} || exit {{GetCSEErrorCode "ERR_ETCD_CONFIG_FAIL"}}
}

# ensureNTP: enables and starts the ntp unit through systemctlEnableAndStart
# (defined outside this view) and exits the script if that helper fails.
# NOTE(review): rendered diff - the first statement is the pre-change form
# ($ERR_SYSTEMCTL_START_FAIL shell variable), the second the post-change form
# (GetCSEErrorCode template call); presumably only the latter is in the merged
# file - confirm.
ensureNTP() {
systemctlEnableAndStart ntp || exit $ERR_SYSTEMCTL_START_FAIL
systemctlEnableAndStart ntp || exit {{GetCSEErrorCode "ERR_SYSTEMCTL_START_FAIL"}}
}

# ensureRPC: enables and starts the rpcbind and rpc-statd units through
# systemctlEnableAndStart, exiting the script if either fails.
# NOTE(review): rendered diff - the two $ERR_SYSTEMCTL_START_FAIL lines are the
# pre-change side, the two GetCSEErrorCode lines the post-change side;
# presumably only the latter pair is in the merged file - confirm.
ensureRPC() {
systemctlEnableAndStart rpcbind || exit $ERR_SYSTEMCTL_START_FAIL
systemctlEnableAndStart rpc-statd || exit $ERR_SYSTEMCTL_START_FAIL
systemctlEnableAndStart rpcbind || exit {{GetCSEErrorCode "ERR_SYSTEMCTL_START_FAIL"}}
systemctlEnableAndStart rpc-statd || exit {{GetCSEErrorCode "ERR_SYSTEMCTL_START_FAIL"}}
}

# ensureAuditD: when AUDITD_ENABLED equals "true", enables and starts auditd
# (exiting the script on failure); otherwise launches apt_get_purge for the
# auditd and mlocate packages in the background without waiting for it.
# NOTE(review): rendered diff - the $ERR_SYSTEMCTL_START_FAIL line is the
# pre-change form and the GetCSEErrorCode line the post-change form; presumably
# only the latter is in the merged file - confirm.
ensureAuditD() {
if [[ "${AUDITD_ENABLED}" == true ]]; then
systemctlEnableAndStart auditd || exit $ERR_SYSTEMCTL_START_FAIL
systemctlEnableAndStart auditd || exit {{GetCSEErrorCode "ERR_SYSTEMCTL_START_FAIL"}}
else
apt_get_purge auditd mlocate &
fi
}

# generateAggregatedAPICerts: waits (via wait_for_file) for
# /etc/kubernetes/generate-proxy-certs.sh and then executes it. The function's
# status is that of the script, since it is the last command.
# NOTE(review): the meaning of the "1200 1" arguments is set by wait_for_file,
# which is not visible here - presumably a retry count and a sleep interval.
# NOTE(review): rendered diff - the $ERR_FILE_WATCH_TIMEOUT line is the
# pre-change form and the GetCSEErrorCode line the post-change form; presumably
# only the latter is in the merged file - confirm.
generateAggregatedAPICerts() {
AGGREGATED_API_CERTS_SETUP_FILE=/etc/kubernetes/generate-proxy-certs.sh
wait_for_file 1200 1 $AGGREGATED_API_CERTS_SETUP_FILE || exit $ERR_FILE_WATCH_TIMEOUT
wait_for_file 1200 1 $AGGREGATED_API_CERTS_SETUP_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
$AGGREGATED_API_CERTS_SETUP_FILE
}

Expand Down Expand Up @@ -208,7 +208,7 @@ EOF

configureCNI() {
{{/* needed for the iptables rules to work on bridges */}}
retrycmd_if_failure 120 5 25 modprobe br_netfilter || exit $ERR_MODPROBE_FAIL
retrycmd_if_failure 120 5 25 modprobe br_netfilter || exit {{GetCSEErrorCode "ERR_MODPROBE_FAIL"}}
echo -n "br_netfilter" > /etc/modules-load.d/br_netfilter.conf
configureCNIIPTables
{{if HasCiliumNetworkPlugin}}
Expand Down Expand Up @@ -240,16 +240,16 @@ configureCNIIPTables() {

{{- if NeedsContainerd}}
# ensureContainerd: waits for the containerd systemd drop-in (exec_start.conf)
# and /etc/containerd/config.toml to appear, then enables and starts
# containerd. Each step exits the script on failure.
# NOTE(review): rendered diff - the three $ERR_* lines are the pre-change side,
# the three GetCSEErrorCode lines the post-change side; presumably only the
# latter are in the merged file - confirm.
ensureContainerd() {
wait_for_file 1200 1 /etc/systemd/system/containerd.service.d/exec_start.conf || exit $ERR_FILE_WATCH_TIMEOUT
wait_for_file 1200 1 /etc/containerd/config.toml || exit $ERR_FILE_WATCH_TIMEOUT
systemctlEnableAndStart containerd || exit $ERR_SYSTEMCTL_START_FAIL
wait_for_file 1200 1 /etc/systemd/system/containerd.service.d/exec_start.conf || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
wait_for_file 1200 1 /etc/containerd/config.toml || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
systemctlEnableAndStart containerd || exit {{GetCSEErrorCode "ERR_SYSTEMCTL_START_FAIL"}}
}
{{end}}

{{- if IsDockerContainerRuntime}}
ensureDocker() {
DOCKER_SERVICE_EXEC_START_FILE=/etc/systemd/system/docker.service.d/exec_start.conf
wait_for_file 1200 1 $DOCKER_SERVICE_EXEC_START_FILE || exit $ERR_FILE_WATCH_TIMEOUT
wait_for_file 1200 1 $DOCKER_SERVICE_EXEC_START_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
usermod -aG docker ${ADMINUSER}
DOCKER_MOUNT_FLAGS_SYSTEMD_FILE=/etc/systemd/system/docker.service.d/clear_mount_propagation_flags.conf
DOCKER_JSON_FILE=/etc/docker/daemon.json
Expand All @@ -258,46 +258,46 @@ ensureDocker() {
jq '.' < $DOCKER_JSON_FILE && break
fi
if [ $i -eq 1200 ]; then
exit $ERR_FILE_WATCH_TIMEOUT
exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
else
sleep 1
fi
done
systemctlEnableAndStart docker || exit $ERR_DOCKER_START_FAIL
systemctlEnableAndStart docker || exit {{GetCSEErrorCode "ERR_DOCKER_START_FAIL"}}
{{/* Delay start of docker-monitor for 30 mins after booting */}}
DOCKER_MONITOR_SYSTEMD_TIMER_FILE=/etc/systemd/system/docker-monitor.timer
wait_for_file 1200 1 $DOCKER_MONITOR_SYSTEMD_TIMER_FILE || exit $ERR_FILE_WATCH_TIMEOUT
wait_for_file 1200 1 $DOCKER_MONITOR_SYSTEMD_TIMER_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
DOCKER_MONITOR_SYSTEMD_FILE=/etc/systemd/system/docker-monitor.service
wait_for_file 1200 1 $DOCKER_MONITOR_SYSTEMD_FILE || exit $ERR_FILE_WATCH_TIMEOUT
systemctlEnableAndStart docker-monitor.timer || exit $ERR_SYSTEMCTL_START_FAIL
wait_for_file 1200 1 $DOCKER_MONITOR_SYSTEMD_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
systemctlEnableAndStart docker-monitor.timer || exit {{GetCSEErrorCode "ERR_SYSTEMCTL_START_FAIL"}}
}
{{end}}

{{- if EnableEncryptionWithExternalKms}}
# ensureKMS: enables and starts the kms unit through systemctlEnableAndStart
# and exits the script if that helper fails.
# NOTE(review): rendered diff - the $ERR_SYSTEMCTL_START_FAIL line is the
# pre-change form and the GetCSEErrorCode line the post-change form; presumably
# only the latter is in the merged file - confirm.
ensureKMS() {
systemctlEnableAndStart kms || exit $ERR_SYSTEMCTL_START_FAIL
systemctlEnableAndStart kms || exit {{GetCSEErrorCode "ERR_SYSTEMCTL_START_FAIL"}}
}
{{end}}

{{- if IsIPv6Enabled}}
# ensureDHCPv6: waits for the two files whose paths the template fills in
# (GetDHCPv6ServiceCSEScriptFilepath / GetDHCPv6ConfigCSEScriptFilepath),
# enables and starts the dhcpv6 unit, then loads the ip6_tables kernel module
# through retrycmd_if_failure. Each step exits the script on failure.
# NOTE(review): rendered diff - the four $ERR_* lines are the pre-change side,
# the four GetCSEErrorCode lines the post-change side; presumably only the
# latter are in the merged file - confirm.
ensureDHCPv6() {
wait_for_file 3600 1 {{GetDHCPv6ServiceCSEScriptFilepath}} || exit $ERR_FILE_WATCH_TIMEOUT
wait_for_file 3600 1 {{GetDHCPv6ConfigCSEScriptFilepath}} || exit $ERR_FILE_WATCH_TIMEOUT
systemctlEnableAndStart dhcpv6 || exit $ERR_SYSTEMCTL_START_FAIL
retrycmd_if_failure 120 5 25 modprobe ip6_tables || exit $ERR_MODPROBE_FAIL
wait_for_file 3600 1 {{GetDHCPv6ServiceCSEScriptFilepath}} || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
wait_for_file 3600 1 {{GetDHCPv6ConfigCSEScriptFilepath}} || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
systemctlEnableAndStart dhcpv6 || exit {{GetCSEErrorCode "ERR_SYSTEMCTL_START_FAIL"}}
retrycmd_if_failure 120 5 25 modprobe ip6_tables || exit {{GetCSEErrorCode "ERR_MODPROBE_FAIL"}}
}
{{end}}

ensureKubelet() {
wait_for_file 1200 1 /etc/sysctl.d/11-aks-engine.conf || exit $ERR_FILE_WATCH_TIMEOUT
sysctl_reload 10 5 120 || exit $ERR_SYSCTL_RELOAD
wait_for_file 1200 1 /etc/sysctl.d/11-aks-engine.conf || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
sysctl_reload 10 5 120 || exit {{GetCSEErrorCode "ERR_SYSCTL_RELOAD"}}
KUBELET_DEFAULT_FILE=/etc/default/kubelet
wait_for_file 1200 1 $KUBELET_DEFAULT_FILE || exit $ERR_FILE_WATCH_TIMEOUT
wait_for_file 1200 1 $KUBELET_DEFAULT_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
KUBECONFIG_FILE=/var/lib/kubelet/kubeconfig
wait_for_file 1200 1 $KUBECONFIG_FILE || exit $ERR_FILE_WATCH_TIMEOUT
wait_for_file 1200 1 $KUBECONFIG_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
KUBELET_RUNTIME_CONFIG_SCRIPT_FILE=/opt/azure/containers/kubelet.sh
wait_for_file 1200 1 $KUBELET_RUNTIME_CONFIG_SCRIPT_FILE || exit $ERR_FILE_WATCH_TIMEOUT
systemctlEnableAndStart kubelet || exit $ERR_KUBELET_START_FAIL
wait_for_file 1200 1 $KUBELET_RUNTIME_CONFIG_SCRIPT_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
systemctlEnableAndStart kubelet || exit {{GetCSEErrorCode "ERR_KUBELET_START_FAIL"}}
{{if HasCiliumNetworkPolicy}}
while [ ! -f /etc/cni/net.d/05-cilium.conf ]; do
sleep 3
Expand All @@ -317,10 +317,10 @@ ensureKubelet() {

# ensureLabelNodes: waits for the label-nodes script
# (/opt/azure/containers/label-nodes.sh) and its systemd unit file to appear,
# then enables and starts the label-nodes unit. Each step exits the script on
# failure.
# NOTE(review): rendered diff - for each step the $ERR_* line is the pre-change
# form and the GetCSEErrorCode line(s) the post-change form; presumably only
# the latter are in the merged file - confirm.
ensureLabelNodes() {
LABEL_NODES_SCRIPT_FILE=/opt/azure/containers/label-nodes.sh
wait_for_file 1200 1 $LABEL_NODES_SCRIPT_FILE || exit $ERR_FILE_WATCH_TIMEOUT
wait_for_file 1200 1 $LABEL_NODES_SCRIPT_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
LABEL_NODES_SYSTEMD_FILE=/etc/systemd/system/label-nodes.service
wait_for_file 1200 1 $LABEL_NODES_SYSTEMD_FILE || exit $ERR_FILE_WATCH_TIMEOUT
systemctlEnableAndStart label-nodes || exit $ERR_SYSTEMCTL_START_FAIL
wait_for_file 1200 1 $LABEL_NODES_SYSTEMD_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
systemctlEnableAndStart label-nodes || exit {{GetCSEErrorCode "ERR_SYSTEMCTL_START_FAIL"}}
}

ensureJournal() {
Expand All @@ -330,27 +330,27 @@ ensureJournal() {
echo "RuntimeMaxUse=1G"
echo "ForwardToSyslog=yes"
} >> /etc/systemd/journald.conf
systemctlEnableAndStart systemd-journald || exit $ERR_SYSTEMCTL_START_FAIL
systemctlEnableAndStart systemd-journald || exit {{GetCSEErrorCode "ERR_SYSTEMCTL_START_FAIL"}}
}

# ensureK8sControlPlane: returns immediately when $REBOOTREQUIRED (run as a
# command, so it is expected to hold `true` or `false`) succeeds or when
# NO_OUTBOUND is "true". Otherwise retries `$KUBECTL cluster-info` through
# retrycmd_if_failure with stderr discarded, and exits the script if it never
# succeeds.
# NOTE(review): rendered diff - the $ERR_K8S_RUNNING_TIMEOUT line is the
# pre-change form and the GetCSEErrorCode line the post-change form; presumably
# only the latter is in the merged file - confirm.
ensureK8sControlPlane() {
if $REBOOTREQUIRED || [ "$NO_OUTBOUND" = "true" ]; then
return
fi
retrycmd_if_failure 120 5 25 $KUBECTL 2>/dev/null cluster-info || exit $ERR_K8S_RUNNING_TIMEOUT
retrycmd_if_failure 120 5 25 $KUBECTL 2>/dev/null cluster-info || exit {{GetCSEErrorCode "ERR_K8S_RUNNING_TIMEOUT"}}
}

{{- if IsAzurePolicyAddonEnabled}}
# ensureLabelExclusionForAzurePolicyAddon: creates the gatekeeper-system
# namespace and labels the kube-system namespace with
# control-plane=controller-manager, each through retrycmd_if_failure with
# stderr discarded. Either command failing after its retries exits the script.
# NOTE(review): rendered diff - for each command the $ERR_K8S_RUNNING_TIMEOUT
# line is the pre-change form and the GetCSEErrorCode line the post-change
# form; presumably only the latter are in the merged file - confirm.
ensureLabelExclusionForAzurePolicyAddon() {
GATEKEEPER_NAMESPACE="gatekeeper-system"
retrycmd_if_failure 120 5 25 $KUBECTL create ns --save-config $GATEKEEPER_NAMESPACE 2>/dev/null || exit $ERR_K8S_RUNNING_TIMEOUT
retrycmd_if_failure 120 5 25 $KUBECTL create ns --save-config $GATEKEEPER_NAMESPACE 2>/dev/null || exit {{GetCSEErrorCode "ERR_K8S_RUNNING_TIMEOUT"}}

retrycmd_if_failure 120 5 25 $KUBECTL label ns kube-system control-plane=controller-manager 2>/dev/null || exit $ERR_K8S_RUNNING_TIMEOUT
retrycmd_if_failure 120 5 25 $KUBECTL label ns kube-system control-plane=controller-manager 2>/dev/null || exit {{GetCSEErrorCode "ERR_K8S_RUNNING_TIMEOUT"}}
}
{{end}}

# ensureEtcd: retries, through retrycmd_if_failure, a curl request to
# ${ETCD_CLIENT_URL}/v2/machines authenticated with the etcd client certificate
# and key under /etc/kubernetes/certs; exits the script if it never succeeds.
# NOTE(review): rendered diff - the $ERR_ETCD_RUNNING_TIMEOUT line is the
# pre-change form and the GetCSEErrorCode line the post-change form; presumably
# only the latter is in the merged file - confirm.
ensureEtcd() {
retrycmd_if_failure 120 5 25 curl --cacert /etc/kubernetes/certs/ca.crt --cert /etc/kubernetes/certs/etcdclient.crt --key /etc/kubernetes/certs/etcdclient.key ${ETCD_CLIENT_URL}/v2/machines || exit $ERR_ETCD_RUNNING_TIMEOUT
retrycmd_if_failure 120 5 25 curl --cacert /etc/kubernetes/certs/ca.crt --cert /etc/kubernetes/certs/etcdclient.crt --key /etc/kubernetes/certs/etcdclient.key ${ETCD_CLIENT_URL}/v2/machines || exit {{GetCSEErrorCode "ERR_ETCD_RUNNING_TIMEOUT"}}
}

createKubeManifestDir() {
Expand Down Expand Up @@ -395,7 +395,7 @@ users:
{{- if IsClusterAutoscalerAddonEnabled}}
configClusterAutoscalerAddon() {
CLUSTER_AUTOSCALER_ADDON_FILE=/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml
wait_for_file 1200 1 $CLUSTER_AUTOSCALER_ADDON_FILE || exit $ERR_FILE_WATCH_TIMEOUT
wait_for_file 1200 1 $CLUSTER_AUTOSCALER_ADDON_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
sed -i "s|<clientID>|$(echo $SERVICE_PRINCIPAL_CLIENT_ID | base64)|g" $CLUSTER_AUTOSCALER_ADDON_FILE
sed -i "s|<clientSec>|$(echo $SERVICE_PRINCIPAL_CLIENT_SECRET | base64)|g" $CLUSTER_AUTOSCALER_ADDON_FILE
sed -i "s|<subID>|$(echo $SUBSCRIPTION_ID | base64)|g" $CLUSTER_AUTOSCALER_ADDON_FILE
Expand All @@ -413,7 +413,7 @@ configACIConnectorAddon() {
ACI_CONNECTOR_CERT=$(base64 /etc/kubernetes/certs/aci-connector-cert.pem -w0)

ACI_CONNECTOR_ADDON_FILE=/etc/kubernetes/addons/aci-connector-deployment.yaml
wait_for_file 1200 1 $ACI_CONNECTOR_ADDON_FILE || exit $ERR_FILE_WATCH_TIMEOUT
wait_for_file 1200 1 $ACI_CONNECTOR_ADDON_FILE || exit {{GetCSEErrorCode "ERR_FILE_WATCH_TIMEOUT"}}
sed -i "s|<creds>|$ACI_CONNECTOR_CREDENTIALS|g" $ACI_CONNECTOR_ADDON_FILE
sed -i "s|<rgName>|$RESOURCE_GROUP|g" $ACI_CONNECTOR_ADDON_FILE
sed -i "s|<cert>|$ACI_CONNECTOR_CERT|g" $ACI_CONNECTOR_ADDON_FILE
Expand Down Expand Up @@ -452,32 +452,32 @@ configGPUDrivers() {
{{/* we will manually install nvidia-docker2 */}}
rmmod nouveau
echo blacklist nouveau >> /etc/modprobe.d/blacklist.conf
retrycmd_if_failure_no_stats 120 5 25 update-initramfs -u || exit $ERR_GPU_DRIVERS_CONFIG
retrycmd_if_failure_no_stats 120 5 25 update-initramfs -u || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_CONFIG"}}
wait_for_apt_locks
retrycmd_if_failure 30 5 3600 apt-get -o Dpkg::Options::="--force-confold" install -y nvidia-container-runtime="${NVIDIA_CONTAINER_RUNTIME_VERSION}+docker18.09.2-1" || exit $ERR_GPU_DRIVERS_CONFIG
retrycmd_if_failure 30 5 3600 apt-get -o Dpkg::Options::="--force-confold" install -y nvidia-container-runtime="${NVIDIA_CONTAINER_RUNTIME_VERSION}+docker18.09.2-1" || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_CONFIG"}}
tmpDir=$GPU_DEST/tmp
(
set -e -o pipefail
cd "${tmpDir}"
wait_for_apt_locks
dpkg-deb -R ./nvidia-docker2*.deb "${tmpDir}/pkg" || exit $ERR_GPU_DRIVERS_CONFIG
cp -r ${tmpDir}/pkg/usr/* /usr/ || exit $ERR_GPU_DRIVERS_CONFIG
dpkg-deb -R ./nvidia-docker2*.deb "${tmpDir}/pkg" || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_CONFIG"}}
cp -r ${tmpDir}/pkg/usr/* /usr/ || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_CONFIG"}}
)
rm -rf $GPU_DEST/tmp
retrycmd_if_failure 120 5 25 pkill -SIGHUP dockerd || exit $ERR_GPU_DRIVERS_CONFIG
retrycmd_if_failure 120 5 25 pkill -SIGHUP dockerd || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_CONFIG"}}
mkdir -p $GPU_DEST/lib64 $GPU_DEST/overlay-workdir
retrycmd_if_failure 120 5 25 mount -t overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=${GPU_DEST}/lib64,workdir=${GPU_DEST}/overlay-workdir none /usr/lib/x86_64-linux-gnu || exit $ERR_GPU_DRIVERS_CONFIG
retrycmd_if_failure 3 1 600 sh $GPU_DEST/nvidia-drivers-$GPU_DV --silent --accept-license --no-drm --dkms --utility-prefix="${GPU_DEST}" --opengl-prefix="${GPU_DEST}" || exit $ERR_GPU_DRIVERS_START_FAIL
retrycmd_if_failure 120 5 25 mount -t overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=${GPU_DEST}/lib64,workdir=${GPU_DEST}/overlay-workdir none /usr/lib/x86_64-linux-gnu || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_CONFIG"}}
retrycmd_if_failure 3 1 600 sh $GPU_DEST/nvidia-drivers-$GPU_DV --silent --accept-license --no-drm --dkms --utility-prefix="${GPU_DEST}" --opengl-prefix="${GPU_DEST}" || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_START_FAIL"}}
echo "${GPU_DEST}/lib64" > /etc/ld.so.conf.d/nvidia.conf
retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL
retrycmd_if_failure 120 5 25 ldconfig || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_START_FAIL"}}
umount -l /usr/lib/x86_64-linux-gnu
retrycmd_if_failure 120 5 25 nvidia-modprobe -u -c0 || exit $ERR_GPU_DRIVERS_START_FAIL
retrycmd_if_failure 120 5 25 $GPU_DEST/bin/nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL
retrycmd_if_failure 120 5 25 nvidia-modprobe -u -c0 || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_START_FAIL"}}
retrycmd_if_failure 120 5 25 $GPU_DEST/bin/nvidia-smi || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_START_FAIL"}}
retrycmd_if_failure 120 5 25 ldconfig || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_START_FAIL"}}
}
# ensureGPUDrivers: runs configGPUDrivers, then enables and starts the
# nvidia-modprobe unit, exiting the script if the unit fails to start.
# NOTE(review): rendered diff - the $ERR_GPU_DRIVERS_START_FAIL line is the
# pre-change form and the GetCSEErrorCode line the post-change form; presumably
# only the latter is in the merged file - confirm.
ensureGPUDrivers() {
configGPUDrivers
systemctlEnableAndStart nvidia-modprobe || exit $ERR_GPU_DRIVERS_START_FAIL
systemctlEnableAndStart nvidia-modprobe || exit {{GetCSEErrorCode "ERR_GPU_DRIVERS_START_FAIL"}}
}
{{end}}
#EOF
7 changes: 4 additions & 3 deletions parts/k8s/cloud-init/artifacts/cse_customcloud.sh
@@ -1,4 +1,5 @@
#!/bin/bash
# exit codes defined in /pkg/engine/cse.go

ensureCertificates() {
AZURESTACK_ENVIRONMENT_JSON_PATH="/etc/kubernetes/azurestackcloud.json"
Expand Down Expand Up @@ -119,7 +120,7 @@ configureAzureStackInterfaces() {

if [[ -z "$TOKEN" ]]; then
echo "Error generating token for Azure Resource Manager"
exit ${ERR_AZURE_STACK_GET_ARM_TOKEN}
exit 120
fi

echo "Fetching network interface configuration for node"
Expand All @@ -139,7 +140,7 @@ configureAzureStackInterfaces() {

if [[ ! -s ${NETWORK_INTERFACES_FILE} ]]; then
echo "Error fetching network interface configuration for node"
exit ${ERR_AZURE_STACK_GET_NETWORK_CONFIGURATION}
exit 121
fi

echo "Generating Azure CNI interface file"
Expand All @@ -161,7 +162,7 @@ configureAzureStackInterfaces() {

if [[ -z "$SUBNET_PREFIX" ]]; then
echo "Error fetching the subnet address prefix for a subnet ID"
exit ${ERR_AZURE_STACK_GET_SUBNET_PREFIX}
exit 122
fi

# shellcheck disable=SC2001
Expand Down