Commit
Delete idle runners, even if they started a long time after the step function did. Runners can be delayed in various ways, depending on the provider. One example is a provider that uses spot pricing when no spot capacity is available: the provider will keep retrying for an hour (configurable) until spot capacity becomes available. That's why we can't assume when the runner started. Instead, we have to keep checking until the runner actually starts, figure out when it started, and make sure the idle timeout hasn't passed yet. We now expect runner providers to add a label telling the reaper when the runner was started. This helps us figure out how long a runner has been idle. The label format is `cdkghr:started:<seconds since epoch>`. This makes `IRunnerProvider` harder to implement, but I don't think a lot of people were going to do that anyway. Nothing major breaks if the `cdkghr:started:xxx` label is not sent. The reaper will just not be able to stop idle runners and will keep retrying until the runner is done. See #139 for the test plan and more details on why runner deletion on error still exists. We can't leave dead runners behind, as there is a limited number of runners that can be registered to a repo and we can run out. Fixes #190 by taking this logic out of the step function and into a separate Lambda function driven by SQS.
- Loading branch information
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import * as AWSLambda from 'aws-lambda'; | ||
import * as AWS from 'aws-sdk'; | ||
import { getOctokit, getRunner } from './lambda-github'; | ||
|
||
/**
 * Payload carried in the body of each SQS message handled by the idle reaper.
 *
 * One message describes one runner that should be checked and, if it has
 * been idle for too long, removed from the repository.
 */
interface IdleReaperLambdaInput {
  /** ARN of the Step Functions execution for this runner; the check stops once it is no longer `RUNNING`. */
  readonly executionArn: string;
  /** Name used to look the runner up among the repository's self-hosted runners. */
  readonly runnerName: string;
  /** Owner of the repository the runner is registered to. */
  readonly owner: string;
  /** Name of the repository the runner is registered to. */
  readonly repo: string;
  /** Installation id passed to `getOctokit` to obtain GitHub API access. */
  readonly installationId: string;
  /** Maximum number of seconds since the runner's start label before an idle runner is deleted. */
  readonly maxIdleSeconds: number;
}
|
||
// Step Functions client, created once at module load and shared by every record the handler processes.
const sfn = new AWS.StepFunctions();

/** Prefix of the label that carries the runner start time; the suffix is seconds since epoch. */
const STARTED_LABEL_PREFIX = 'cdkghr:started:';

/**
 * Extracts the runner start time from its `cdkghr:started:<seconds since epoch>` label.
 *
 * The prefix is matched case-insensitively. Labels with the prefix but without a
 * parsable number are logged and skipped instead of producing a `NaN` idle time.
 *
 * @param labels labels reported for the runner
 * @returns start time in milliseconds since epoch, or `undefined` when no label carries a usable timestamp
 */
function findRunnerStartedMs(labels: readonly { readonly name: string }[]): number | undefined {
  for (const label of labels) {
    if (!label.name.toLowerCase().startsWith(STARTED_LABEL_PREFIX)) {
      continue;
    }

    const startedSeconds = parseFloat(label.name.slice(STARTED_LABEL_PREFIX.length));
    if (Number.isFinite(startedSeconds)) {
      return startedSeconds * 1000;
    }

    console.error(`Ignoring malformed started label: ${label.name}`);
  }

  return undefined;
}

/**
 * Checks a single runner and deletes it from the repository when it has been idle for too long.
 *
 * @param input parsed SQS message body describing the runner to check
 * @returns `true` when the message should be retried later (runner not registered yet, no usable
 *          start label, or idle timeout not reached), `false` when no further check is needed
 *          (execution finished, runner busy, or runner deleted)
 */
async function checkRunner(input: IdleReaperLambdaInput): Promise<boolean> {
  // check if step function is still running
  const execution = await sfn.describeExecution({ executionArn: input.executionArn }).promise();
  if (execution.status !== 'RUNNING') {
    // no need to test again as runner already finished
    console.log('Runner already finished');
    return false;
  }

  // get github access
  const { octokit } = await getOctokit(input.installationId);

  // find runner
  const runner = await getRunner(octokit, input.owner, input.repo, input.runnerName);
  if (!runner) {
    console.error(`Runner not running yet for ${input.owner}/${input.repo}:${input.runnerName}`);
    return true;
  }

  // if not idle, we're done
  if (runner.busy) {
    console.log('Runner is not idle');
    return false;
  }

  // figure out when the runner started
  const startedMs = findRunnerStartedMs(runner.labels);
  if (startedMs === undefined) {
    // no started label? retry later (it won't retry forever as eventually the runner will stop and the step function will finish)
    console.error('No `cdkghr:started:xxx` label found???');
    return true;
  }

  const diffMs = Date.now() - startedMs;
  console.log(`Runner ${input.runnerName} started ${diffMs / 1000} seconds ago`);

  if (diffMs <= 1000 * input.maxIdleSeconds) {
    // still idle, timeout not reached -- retry later
    return true;
  }

  // max idle time reached, delete runner
  console.log(`Runner ${input.runnerName} is idle for too long, deleting...`);
  await octokit.rest.actions.deleteSelfHostedRunnerFromRepo({
    owner: input.owner,
    repo: input.repo,
    runner_id: runner.id,
  });
  return false;
}

/**
 * SQS-triggered Lambda entry point that reaps idle runners.
 *
 * Each record is checked independently. Records that need another check later are
 * returned in `batchItemFailures`; records not listed there are reported as handled.
 * A record whose check throws (bad message body, AWS or GitHub API error) is logged
 * and reported as failed on its own, so one bad record does not fail the invocation
 * and force every other record in the batch to be processed again.
 *
 * @param event batch of SQS messages, each body being a JSON-encoded `IdleReaperLambdaInput`
 * @returns the identifiers of the messages that should be retried later
 */
exports.handler = async function (event: AWSLambda.SQSEvent): Promise<AWSLambda.SQSBatchResponse> {
  const result: AWSLambda.SQSBatchResponse = { batchItemFailures: [] };

  for (const record of event.Records) {
    let retryLater: boolean;

    try {
      // NOTE(review): the body is trusted to match IdleReaperLambdaInput; it is not validated here
      const input = JSON.parse(record.body) as IdleReaperLambdaInput;
      console.log(`Checking runner for ${input.owner}/${input.repo} [execution-id=${input.runnerName}]`);

      retryLater = await checkRunner(input);
    } catch (err) {
      console.error(`Failed checking runner for message ${record.messageId}, will retry later`, err);
      retryLater = true;
    }

    if (retryLater) {
      result.batchItemFailures.push({ itemIdentifier: record.messageId });
    }
  }

  return result;
};