diff --git a/cmd/agent/internal/goalstates/constants.go b/cmd/agent/internal/goalstates/constants.go
--- a/cmd/agent/internal/goalstates/constants.go
+++ b/cmd/agent/internal/goalstates/constants.go
@@ -17,7 +17,14 @@ const (
 	SystemdSystemDir = "/etc/systemd/system"
 
 	// DaemonUnit is the systemd unit name for the unbounded-agent daemon.
 	DaemonUnit = "unbounded-agent-daemon.service"
+
+	// DaemonRecoveryUnit is the systemd unit name for the oneshot service that
+	// runs the daemon recovery script.
+	DaemonRecoveryUnit = "unbounded-agent-daemon-recovery.service"
+
+	// DaemonRecoveryPath is the path the daemon recovery script is installed to.
+	DaemonRecoveryPath = "/usr/local/bin/unbounded-agent-daemon-recovery.sh"
 )
 
 // NSpawn machine names used for alternating in-place upgrades.
diff --git a/cmd/agent/internal/phases/host/assets/unbounded-agent-daemon-recovery.service b/cmd/agent/internal/phases/host/assets/unbounded-agent-daemon-recovery.service
new file mode 100644
index 00000000..15ea55b5
--- /dev/null
+++ b/cmd/agent/internal/phases/host/assets/unbounded-agent-daemon-recovery.service
@@ -0,0 +1,9 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+[Unit]
+Description=Recover Unbounded Agent Daemon to last known good binary
+
+[Service]
+Type=oneshot
+ExecStart=/usr/local/bin/unbounded-agent-daemon-recovery.sh
diff --git a/cmd/agent/internal/phases/host/assets/unbounded-agent-daemon-recovery.sh b/cmd/agent/internal/phases/host/assets/unbounded-agent-daemon-recovery.sh
new file mode 100644
index 00000000..04e28c39
--- /dev/null
+++ b/cmd/agent/internal/phases/host/assets/unbounded-agent-daemon-recovery.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+set -euo pipefail
+
+current="/usr/local/bin/unbounded-agent-current"
+last_good="$(readlink -f /usr/local/bin/unbounded-agent-last-good || true)"
+
+if [ -z "${last_good}" ] || [ ! -x "${last_good}" ]; then
+    echo "no valid last-known-good agent binary found" >&2
+    exit 1
+fi
+
+ln -sfn "${last_good}" "${current}"
+systemctl reset-failed unbounded-agent-daemon.service
+systemctl restart unbounded-agent-daemon.service
diff --git a/cmd/agent/internal/phases/host/assets/unbounded-agent-daemon.service b/cmd/agent/internal/phases/host/assets/unbounded-agent-daemon.service
--- a/cmd/agent/internal/phases/host/assets/unbounded-agent-daemon.service
+++ b/cmd/agent/internal/phases/host/assets/unbounded-agent-daemon.service
@@ -7,11 +7,15 @@
 Wants=network-online.target machines.target
+# OnFailure= and the start-limit settings are [Unit] options; systemd ignores them in [Service].
+# A finite start limit lets the unit reach the "failed" state, which is what triggers OnFailure=.
+StartLimitIntervalSec=300
+StartLimitBurst=5
+OnFailure=unbounded-agent-daemon-recovery.service
 
 [Service]
 Type=simple
-ExecStart=/usr/local/bin/unbounded-agent daemon
+ExecStart=/usr/local/bin/unbounded-agent-current daemon
 Restart=always
 RestartSec=10
-StartLimitIntervalSec=0
 
 [Install]
 WantedBy=multi-user.target
diff --git a/cmd/agent/internal/phases/host/enable_daemon.go b/cmd/agent/internal/phases/host/enable_daemon.go
index 8f71f647..b206b3b3 100644
--- a/cmd/agent/internal/phases/host/enable_daemon.go
+++ b/cmd/agent/internal/phases/host/enable_daemon.go
@@ -19,6 +19,12 @@ import (
 //go:embed assets/unbounded-agent-daemon.service
 var daemonServiceContent []byte
 
+//go:embed assets/unbounded-agent-daemon-recovery.service
+var daemonRecoveryServiceContent []byte
+
+//go:embed assets/unbounded-agent-daemon-recovery.sh
+var daemonRecoveryScriptContent []byte
+
 type enableDaemon struct {
 	log *slog.Logger
 }
@@ -35,11 +41,20 @@ func (d *enableDaemon) Name() string { return "enable-daemon" }
 
 func (d *enableDaemon) Do(ctx context.Context) error {
 	unitPath := filepath.Join(goalstates.SystemdSystemDir, goalstates.DaemonUnit)
+	recoveryUnitPath := filepath.Join(goalstates.SystemdSystemDir, goalstates.DaemonRecoveryUnit)
 
 	if err := utilio.WriteFile(unitPath, daemonServiceContent, 0o644); err != nil {
 		return fmt.Errorf("writing %s: %w", unitPath, err)
 	}
 
+	if err := utilio.WriteFile(recoveryUnitPath, daemonRecoveryServiceContent, 0o644); err != nil {
+		return fmt.Errorf("writing %s: %w", recoveryUnitPath, err)
+	}
+
+	if err := utilio.WriteFile(goalstates.DaemonRecoveryPath, daemonRecoveryScriptContent, 0o755); err != nil {
+		return fmt.Errorf("writing %s: %w", goalstates.DaemonRecoveryPath, err)
+	}
+
 	systemctl := utilexec.Systemctl()
 
 	if err := utilexec.RunCmd(ctx, d.log, systemctl, "daemon-reload"); err != nil {
diff --git a/cmd/agent/internal/phases/reset/cleanup.go b/cmd/agent/internal/phases/reset/cleanup.go
index 6f2a42dc..1e8813c4 100644
--- a/cmd/agent/internal/phases/reset/cleanup.go
+++ b/cmd/agent/internal/phases/reset/cleanup.go
@@ -8,6 +8,7 @@ import (
 	"log/slog"
 	"path/filepath"
 
+	"github.com/Azure/unbounded-kube/cmd/agent/internal/goalstates"
 	"github.com/Azure/unbounded-kube/cmd/agent/internal/phases"
 )
 
@@ -29,6 +30,11 @@ func (t *removeAgentArtifacts) Do(_ context.Context) error {
 	// Remove known file paths.
 	for _, path := range []string{
 		"/usr/local/bin/unbounded-agent",
+		"/usr/local/bin/unbounded-agent-blue",
+		"/usr/local/bin/unbounded-agent-green",
+		"/usr/local/bin/unbounded-agent-current",
+		"/usr/local/bin/unbounded-agent-last-good",
+		goalstates.DaemonRecoveryPath,
 		"/usr/local/bin/unbounded-agent-install.sh",
 		"/usr/local/bin/unbounded-agent-uninstall.sh",
 	} {
diff --git a/cmd/agent/internal/phases/reset/daemon.go b/cmd/agent/internal/phases/reset/daemon.go
index 29b0a75d..e3cbaeb5 100644
--- a/cmd/agent/internal/phases/reset/daemon.go
+++ b/cmd/agent/internal/phases/reset/daemon.go
@@ -18,8 +18,8 @@ type stopDaemon struct {
 }
 
 // StopDaemon returns a task that stops, disables, and removes the
-// unbounded-agent-daemon systemd unit. Errors from stop and disable are
-// logged but do not fail the task since the unit may not be present.
+// unbounded-agent-daemon and recovery systemd units. Errors from stop and
+// disable are logged but do not fail the task since the units may not be present.
 func StopDaemon(log *slog.Logger) phases.Task {
 	return &stopDaemon{log: log}
 }
@@ -37,8 +37,13 @@ func (t *stopDaemon) Do(ctx context.Context) error {
 		t.log.Warn("failed to disable daemon (may not be enabled)", "error", err)
 	}
 
-	unitPath := filepath.Join(goalstates.SystemdSystemDir, goalstates.DaemonUnit)
-	removeFileIfExists(t.log, unitPath)
+	if err := utilexec.RunCmd(ctx, t.log, systemctl, "disable", goalstates.DaemonRecoveryUnit); err != nil {
+		t.log.Warn("failed to disable daemon recovery unit (may not be enabled)", "error", err)
+	}
+
+	for _, unit := range []string{goalstates.DaemonUnit, goalstates.DaemonRecoveryUnit} {
+		removeFileIfExists(t.log, filepath.Join(goalstates.SystemdSystemDir, unit))
+	}
 
 	return nil
 }
diff --git a/internal/provision/assets/unbounded-agent-install.sh b/internal/provision/assets/unbounded-agent-install.sh
index 62db1e18..effa385f 100644
--- a/internal/provision/assets/unbounded-agent-install.sh
+++ b/internal/provision/assets/unbounded-agent-install.sh
@@ -41,11 +41,42 @@ esac
 if [ -z "${AGENT_URL}" ]; then
     AGENT_URL="https://github.com/Azure/unbounded-kube/releases/download/${AGENT_VERSION}/unbounded-agent-linux-${arch}.tar.gz"
 fi
-AGENT_BIN="/usr/local/bin/unbounded-agent"
+AGENT_BIN_BLUE="/usr/local/bin/unbounded-agent-blue"
+AGENT_BIN_GREEN="/usr/local/bin/unbounded-agent-green"
+AGENT_BIN_CURRENT="/usr/local/bin/unbounded-agent-current"
+AGENT_BIN_LAST_GOOD="/usr/local/bin/unbounded-agent-last-good"
 
 echo "Downloading unbounded-agent ${AGENT_VERSION} for ${arch}..."
-curl -fsSL "${AGENT_URL}" | tar -xz -C /usr/local/bin unbounded-agent
-chmod +x "${AGENT_BIN}"
+ACTIVE_BIN="$(readlink -f "${AGENT_BIN_CURRENT}" || true)"
+if [ "${ACTIVE_BIN}" = "${AGENT_BIN_BLUE}" ]; then
+    NEXT_BIN="${AGENT_BIN_GREEN}"
+else
+    NEXT_BIN="${AGENT_BIN_BLUE}"
+fi
+
+tmp_dir="$(mktemp -d)"
+trap 'rm -rf "${tmp_dir}"' EXIT
+
+archive_path="${tmp_dir}/unbounded-agent.tar.gz"
+if ! curl -fsSL "${AGENT_URL}" -o "${archive_path}"; then
+    echo "failed to download unbounded-agent archive: ${AGENT_URL}" >&2
+    exit 1
+fi
+
+if ! tar -xzf "${archive_path}" -C "${tmp_dir}" unbounded-agent; then
+    echo "failed to extract unbounded-agent from archive: ${archive_path}" >&2
+    exit 1
+fi
+
+install -m 0755 "${tmp_dir}/unbounded-agent" "${NEXT_BIN}"
+
+if [ -x "${ACTIVE_BIN}" ]; then
+    ln -sfn "${ACTIVE_BIN}" "${AGENT_BIN_LAST_GOOD}"
+elif [ -x "${NEXT_BIN}" ]; then
+    ln -sfn "${NEXT_BIN}" "${AGENT_BIN_LAST_GOOD}"
+fi
+
+ln -sfn "${NEXT_BIN}" "${AGENT_BIN_CURRENT}"
 
 _START_ARGS=""
 case "${AGENT_DEBUG}" in
@@ -53,4 +84,4 @@ case "${AGENT_DEBUG}" in
 esac
 
 echo "Running unbounded-agent start..."
-"${AGENT_BIN}" start ${_START_ARGS}
+"${AGENT_BIN_CURRENT}" start ${_START_ARGS}
diff --git a/internal/provision/assets/unbounded-agent-uninstall.sh b/internal/provision/assets/unbounded-agent-uninstall.sh
index 7f8d2cdc..e5834793 100644
--- a/internal/provision/assets/unbounded-agent-uninstall.sh
+++ b/internal/provision/assets/unbounded-agent-uninstall.sh
@@ -97,7 +97,11 @@ rm -rf "/var/lib/machines/${MACHINE_NAME}"
 # -----------------------------------------------------------------
 echo "Removing nftables flush service..."
 systemctl disable --now nftables-flush.service 2>/dev/null || true
+systemctl disable --now unbounded-agent-daemon.service 2>/dev/null || true
+systemctl disable --now unbounded-agent-daemon-recovery.service 2>/dev/null || true
 rm -f /etc/systemd/system/nftables-flush.service
+rm -f /etc/systemd/system/unbounded-agent-daemon.service
+rm -f /etc/systemd/system/unbounded-agent-daemon-recovery.service
 rm -rf /etc/unbounded/kube
 
 # -----------------------------------------------------------------
@@ -152,6 +156,11 @@ done
 # -----------------------------------------------------------------
 echo "Removing agent binaries and configuration..."
 rm -f /usr/local/bin/unbounded-agent
+rm -f /usr/local/bin/unbounded-agent-blue
+rm -f /usr/local/bin/unbounded-agent-green
+rm -f /usr/local/bin/unbounded-agent-current
+rm -f /usr/local/bin/unbounded-agent-last-good
+rm -f /usr/local/bin/unbounded-agent-daemon-recovery.sh
 rm -f /usr/local/bin/unbounded-agent-install.sh
 rm -f /usr/local/bin/unbounded-agent-uninstall.sh
 rm -rf /etc/unbounded/agent
diff --git a/internal/provision/script_test.go b/internal/provision/script_test.go
index 717935f0..0c3a3faf 100644
--- a/internal/provision/script_test.go
+++ b/internal/provision/script_test.go
@@ -17,6 +17,11 @@ func TestUnboundedAgentInstallScript(t *testing.T) {
 	require.NotEmpty(t, script)
 	require.Contains(t, script, "#!/bin/bash")
 	require.Contains(t, script, "unbounded-agent")
+	require.Contains(t, script, "unbounded-agent-blue")
+	require.Contains(t, script, "unbounded-agent-green")
+	require.Contains(t, script, "unbounded-agent-current")
+	require.Contains(t, script, "unbounded-agent-last-good")
+	require.Contains(t, script, "ln -sfn")
 }
 
 func TestUnboundedAgentUninstallScript(t *testing.T) {
@@ -40,6 +45,8 @@ func TestUnboundedAgentUninstallScript(t *testing.T) {
 	require.Contains(t, script, "/etc/systemd/nspawn/${MACHINE_NAME}.nspawn")
 	require.Contains(t, script, "/var/lib/machines/${MACHINE_NAME}")
 	require.Contains(t, script, "nftables-flush.service")
+	require.Contains(t, script, "unbounded-agent-daemon-recovery.service")
+	require.Contains(t, script, "unbounded-agent-daemon-recovery.sh")
 	require.Contains(t, script, "99-kubernetes.conf")
 	require.Contains(t, script, "sysctl --system")
 	require.Contains(t, script, "docker.service")