Skip to content

Implement termination grace period support for the runner #3830

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/Runner.Common/Constants.cs
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,7 @@ public static class Agent
public static readonly string ForcedActionsNodeVersion = "ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION";
public static readonly string PrintLogToStdout = "ACTIONS_RUNNER_PRINT_LOG_TO_STDOUT";
public static readonly string ActionArchiveCacheDirectory = "ACTIONS_RUNNER_ACTION_ARCHIVE_CACHE";
public static readonly string ActionsTerminationGracePeriodSeconds = "ACTIONS_RUNNER_TERMINATION_GRACE_PERIOD_SECONDS";
}

public static class System
Expand Down
30 changes: 24 additions & 6 deletions src/Runner.Common/HostContext.cs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ public interface IHostContext : IDisposable
T GetService<T>() where T : class, IRunnerService;
void SetDefaultCulture(string name);
event EventHandler Unloading;
void ShutdownRunner(ShutdownReason reason);
void ShutdownRunner(ShutdownReason reason, TimeSpan delay = default);
void WritePerfCounter(string counter);
void LoadDefaultUserAgents();

Expand Down Expand Up @@ -74,6 +74,8 @@ public sealed class HostContext : EventListener, IObserver<DiagnosticListener>,
private string _perfFile;
private RunnerWebProxy _webProxy = new();
private string _hostType = string.Empty;
private ShutdownReason _shutdownReason = ShutdownReason.UserCancelled;
private int _shutdownReasonSet = 0;

// disable auth migration by default
private readonly ManualResetEventSlim _allowAuthMigration = new ManualResetEventSlim(false);
Expand All @@ -85,7 +87,7 @@ public sealed class HostContext : EventListener, IObserver<DiagnosticListener>,
public event EventHandler Unloading;
public event EventHandler<AuthMigrationEventArgs> AuthMigrationChanged;
public CancellationToken RunnerShutdownToken => _runnerShutdownTokenSource.Token;
public ShutdownReason RunnerShutdownReason { get; private set; }
public ShutdownReason RunnerShutdownReason => _shutdownReason;
public ISecretMasker SecretMasker => _secretMasker;
public List<ProductInfoHeaderValue> UserAgents => _userAgents;
public RunnerWebProxy WebProxy => _webProxy;
Expand Down Expand Up @@ -573,12 +575,28 @@ public void SetDefaultCulture(string name)
}


/// <summary>
/// Requests shutdown of the runner by cancelling the runner shutdown token,
/// either immediately or after <paramref name="delay"/> has elapsed.
/// </summary>
/// <param name="reason">
/// Why the runner is shutting down. Only the reason from the first call is
/// recorded; later calls leave the stored reason untouched.
/// </param>
/// <param name="delay">
/// How long to wait before the shutdown token is cancelled. The default
/// (<see cref="TimeSpan.Zero"/>) and any non-positive value cancel immediately.
/// </param>
public void ShutdownRunner(ShutdownReason reason, TimeSpan delay = default)
{
    ArgUtil.NotNull(reason, nameof(reason));
    _trace.Info($"Runner will be shutdown for {reason.ToString()} after {delay.TotalSeconds} seconds.");

    // First caller wins: a later call (e.g. the immediate shutdown issued once
    // the running job finishes) must not overwrite the original reason.
    if (Interlocked.CompareExchange(ref _shutdownReasonSet, 1, 0) == 0)
    {
        _shutdownReason = reason;
    }
    else
    {
        _trace.Verbose($"Runner shutdown reason already set to {_shutdownReason.ToString()}.");
    }

    // Compare the TimeSpan itself rather than testing the floating-point
    // TotalSeconds for equality, and treat negative delays as "now":
    // CancelAfter rejects negative values (other than -1 ms, which it
    // interprets as "never").
    if (delay <= TimeSpan.Zero)
    {
        _runnerShutdownTokenSource.Cancel();
    }
    else
    {
        _runnerShutdownTokenSource.CancelAfter(delay);
    }
}

public override void Dispose()
Expand Down
66 changes: 62 additions & 4 deletions src/Runner.Listener/Runner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ public sealed class Runner : RunnerService, IRunner
private readonly object _authMigrationTelemetryLock = new();
private IRunnerServer _runnerServer;
private CancellationTokenSource _authMigrationTelemetryTokenSource = new();
private bool _runnerExiting = false;
private bool _hasTerminationGracePeriod = false;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I use `_hasTerminationGracePeriod` as the feature flag, so unless the new environment variable is set, none of the new code is executed.


// <summary>
// Helps avoid excessive calls to Run Service when encountering non-retriable errors from /acquirejob.
Expand Down Expand Up @@ -309,6 +311,12 @@ public async Task<int> ExecuteCommand(CommandSettings command)
_term.WriteLine("https://docs.github.com/en/actions/hosting-your-own-runners/autoscaling-with-self-hosted-runners#using-ephemeral-runners-for-autoscaling", ConsoleColor.Yellow);
}

if (!string.IsNullOrEmpty(Environment.GetEnvironmentVariable(Constants.Variables.Agent.ActionsTerminationGracePeriodSeconds)))
{
_hasTerminationGracePeriod = true;
Trace.Verbose($"Runner has termination grace period set");
}

var cred = store.GetCredentials();
if (cred != null &&
cred.Scheme == Constants.Configuration.OAuth &&
Expand Down Expand Up @@ -339,16 +347,18 @@ public async Task<int> ExecuteCommand(CommandSettings command)

/// <summary>
/// Handles the host <c>Unloading</c> event: marks the runner as exiting and,
/// unless it is still in the configuration stage or shutdown was already
/// requested, asks the host context to shut down and waits for the current
/// command to complete.
/// </summary>
private void Runner_Unloading(object sender, EventArgs e)
{
    // Remember that an exit was requested so the job status handler can
    // finish the shutdown as soon as the runner stops being busy.
    _runnerExiting = true;
    if ((!_inConfigStage) && (!HostContext.RunnerShutdownToken.IsCancellationRequested))
    {
        // Request shutdown exactly once, with the grace-period delay. An
        // additional undelayed call here would cancel the token immediately
        // and defeat the grace period.
        HostContext.ShutdownRunner(ShutdownReason.UserCancelled, GetShutdownDelay());

        // NOTE(review): this wait is bounded by ExitOnUnloadTimeout; confirm it
        // is long enough to cover the configured termination grace period.
        _completedCommand.WaitOne(Constants.Runner.ExitOnUnloadTimeout);
    }
}

private void CtrlCHandler(object sender, EventArgs e)
{
_term.WriteLine("Exiting...");
_runnerExiting = true;
if (_inConfigStage)
{
HostContext.Dispose();
Expand All @@ -371,15 +381,27 @@ private void CtrlCHandler(object sender, EventArgs e)
reason = ShutdownReason.UserCancelled;
}

HostContext.ShutdownRunner(reason);
HostContext.ShutdownRunner(reason, GetShutdownDelay());
}
else
{
HostContext.ShutdownRunner(ShutdownReason.UserCancelled);
HostContext.ShutdownRunner(ShutdownReason.UserCancelled, GetShutdownDelay());
}
}
}

/// <summary>
/// Job status callback used when a termination grace period is configured.
/// Once an exit has been requested and the reported status is no longer
/// busy, it triggers an immediate runner shutdown instead of waiting for the
/// remaining grace period.
/// </summary>
private void HandleJobStatusEvent(object sender, JobStatusEventArgs e)
{
    // Nothing to do unless the feature is enabled and an exit is pending.
    if (!_hasTerminationGracePeriod || !_runnerExiting)
    {
        return;
    }

    // Ignore empty notifications and those reporting the runner is still busy.
    if (e == null || e.Status == TaskAgentStatus.Busy)
    {
        return;
    }

    Trace.Info("Runner is no longer busy, shutting down.");
    HostContext.ShutdownRunner(ShutdownReason.UserCancelled);
}

private IMessageListener GetMessageListener(RunnerSettings settings)
{
if (settings.UseV2Flow)
Expand Down Expand Up @@ -430,9 +452,13 @@ private async Task<int> RunAsync(RunnerSettings settings, bool runOnce = false)
bool autoUpdateInProgress = false;
Task<bool> selfUpdateTask = null;
bool runOnceJobReceived = false;
jobDispatcher = HostContext.CreateService<IJobDispatcher>();
jobDispatcher = HostContext.GetService<IJobDispatcher>();
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed to GetService so I can fetch the same instance in the event handler.
We never need more than one jobDispatcher in the listener, so switching to GetService should be safe.


jobDispatcher.JobStatus += _listener.OnJobStatus;
if (_hasTerminationGracePeriod)
{
jobDispatcher.JobStatus += HandleJobStatusEvent;
}

while (!HostContext.RunnerShutdownToken.IsCancellationRequested)
{
Expand Down Expand Up @@ -703,6 +729,10 @@ await configUpdater.UpdateRunnerConfigAsync(
{
if (jobDispatcher != null)
{
if (_hasTerminationGracePeriod)
{
jobDispatcher.JobStatus -= HandleJobStatusEvent;
}
jobDispatcher.JobStatus -= _listener.OnJobStatus;
await jobDispatcher.ShutdownAsync();
}
Expand Down Expand Up @@ -810,6 +840,34 @@ private async Task ReportAuthMigrationTelemetryAsync(CancellationToken token)
}
}

/// <summary>
/// Determines how long runner shutdown should be deferred.
/// </summary>
/// <returns>
/// The configured termination grace period when the feature is enabled, the
/// job dispatcher is busy, and the environment variable holds an integer
/// number of seconds greater than 0 and less than one hour; otherwise
/// <see cref="TimeSpan.Zero"/>.
/// </returns>
private TimeSpan GetShutdownDelay()
{
    // Exclusive upper bound for the grace period: 1 hour.
    const int MaxGracePeriodSeconds = 60 * 60;

    if (!_hasTerminationGracePeriod)
    {
        return TimeSpan.Zero;
    }

    var jobDispatcher = HostContext.GetService<IJobDispatcher>();
    if (!jobDispatcher.Busy)
    {
        Trace.Verbose("Runner is not busy, no grace period.");
        return TimeSpan.Zero;
    }

    Trace.Info("Runner is busy, checking for grace period.");
    var delayEnv = Environment.GetEnvironmentVariable(Constants.Variables.Agent.ActionsTerminationGracePeriodSeconds);

    // int.TryParse returns false for null/empty input, so no separate check is needed.
    if (int.TryParse(delayEnv, out int delaySeconds) &&
        delaySeconds > 0 &&
        delaySeconds < MaxGracePeriodSeconds)
    {
        Trace.Info($"Waiting for {delaySeconds} seconds before shutting down.");
        return TimeSpan.FromSeconds(delaySeconds);
    }

    // Make a rejected value visible instead of silently shutting down with no delay.
    Trace.Info($"Ignoring termination grace period '{delayEnv}': expected an integer number of seconds greater than 0 and less than {MaxGracePeriodSeconds}.");
    return TimeSpan.Zero;
}

private void PrintUsage(CommandSettings command)
{
string separator;
Expand Down
16 changes: 8 additions & 8 deletions src/Test/L0/Listener/RunnerL0.cs
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ public async Task TestRunAsync()

});

hc.EnqueueInstance<IJobDispatcher>(_jobDispatcher.Object);
hc.SetSingleton<IJobDispatcher>(_jobDispatcher.Object);

_configStore.Setup(x => x.IsServiceConfigured()).Returns(false);
//Act
Expand Down Expand Up @@ -309,7 +309,7 @@ public async Task TestRunOnce()

});

hc.EnqueueInstance<IJobDispatcher>(_jobDispatcher.Object);
hc.SetSingleton<IJobDispatcher>(_jobDispatcher.Object);

_configStore.Setup(x => x.IsServiceConfigured()).Returns(false);
//Act
Expand Down Expand Up @@ -413,7 +413,7 @@ public async Task TestRunOnceOnlyTakeOneJobMessage()

});

hc.EnqueueInstance<IJobDispatcher>(_jobDispatcher.Object);
hc.SetSingleton<IJobDispatcher>(_jobDispatcher.Object);

_configStore.Setup(x => x.IsServiceConfigured()).Returns(false);
//Act
Expand Down Expand Up @@ -503,7 +503,7 @@ public async Task TestRunOnceHandleUpdateMessage()

});

hc.EnqueueInstance<IJobDispatcher>(_jobDispatcher.Object);
hc.SetSingleton<IJobDispatcher>(_jobDispatcher.Object);

_configStore.Setup(x => x.IsServiceConfigured()).Returns(false);
//Act
Expand Down Expand Up @@ -578,7 +578,7 @@ public async Task TestReportAuthMigrationTelemetry()
hc.SetSingleton<IConfigurationStore>(_configStore.Object);
hc.SetSingleton<ICredentialManager>(_credentialManager.Object);
hc.EnqueueInstance<IErrorThrottler>(_acquireJobThrottler.Object);
hc.EnqueueInstance<IJobDispatcher>(_jobDispatcher.Object);
hc.SetSingleton<IJobDispatcher>(_jobDispatcher.Object);

runner.Initialize(hc);
var settings = new RunnerSettings
Expand Down Expand Up @@ -679,7 +679,7 @@ public async Task TestRunnerJobRequestMessageFromPipeline()
hc.SetSingleton<ICredentialManager>(_credentialManager.Object);
hc.EnqueueInstance<IErrorThrottler>(_acquireJobThrottler.Object);
hc.EnqueueInstance<IActionsRunServer>(_actionsRunServer.Object);
hc.EnqueueInstance<IJobDispatcher>(_jobDispatcher.Object);
hc.SetSingleton<IJobDispatcher>(_jobDispatcher.Object);

runner.Initialize(hc);
var settings = new RunnerSettings
Expand Down Expand Up @@ -780,7 +780,7 @@ public async Task TestRunnerJobRequestMessageFromRunService()
hc.SetSingleton<ICredentialManager>(_credentialManager.Object);
hc.EnqueueInstance<IErrorThrottler>(_acquireJobThrottler.Object);
hc.EnqueueInstance<IRunServer>(_runServer.Object);
hc.EnqueueInstance<IJobDispatcher>(_jobDispatcher.Object);
hc.SetSingleton<IJobDispatcher>(_jobDispatcher.Object);

runner.Initialize(hc);
var settings = new RunnerSettings
Expand Down Expand Up @@ -880,7 +880,7 @@ public async Task TestRunnerJobRequestMessageFromRunService_AuthMigrationFallbac
hc.SetSingleton<ISelfUpdater>(_updater.Object);
hc.SetSingleton<ICredentialManager>(_credentialManager.Object);
hc.EnqueueInstance<IErrorThrottler>(_acquireJobThrottler.Object);
hc.EnqueueInstance<IJobDispatcher>(_jobDispatcher.Object);
hc.SetSingleton<IJobDispatcher>(_jobDispatcher.Object);
hc.EnqueueInstance<IRunServer>(_runServer.Object);
hc.EnqueueInstance<IRunServer>(_runServer.Object);

Expand Down
2 changes: 1 addition & 1 deletion src/Test/L0/TestHostContext.cs
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,7 @@ public Tracing GetTrace(string name)
return _traceManager[name];
}

public void ShutdownRunner(ShutdownReason reason)
public void ShutdownRunner(ShutdownReason reason, TimeSpan delay = default)
{
ArgUtil.NotNull(reason, nameof(reason));
RunnerShutdownReason = reason;
Expand Down