Skip to content

Commit

Permalink
fix: Better idle runner deleter (#314)
Browse files Browse the repository at this point in the history
Delete idle runners, even if they started a long time after the step function did. Runners can be delayed in various ways, depending on the provider. One example is a provider using spot pricing when there is no availability. The provider will keep retrying for an hour (configurable) until there is spot availability. That's why we can't assume when the runner started and we have to continually check until the runner actually starts, figure out when it started, and make sure the idle timeout hasn't passed yet.

We now expect runner providers to add a label telling the reaper when the runner was started. This helps us figure out how long a runner has been idle. The label format is `cdkghr:started:<seconds since epoch>`. This makes `IRunnerProvider` harder to implement, but I don't think a lot of people were going to do that anyway. Nothing major breaks if a provider doesn't send the `cdkghr:started:xxx` label: the reaper just won't be able to stop idle runners and will keep retrying until the runner is done.

See #139 for the test plan and more details on why runner deletion on error still exists. We can't leave dead runners behind, as there is a limited number of runners that can be registered to a repo and we can run out.

Fixes #190 by taking this logic outside of the step function and into a separate Lambda function triggered by SQS.
  • Loading branch information
kichik committed Apr 30, 2023
1 parent c616694 commit 9c61574
Show file tree
Hide file tree
Showing 22 changed files with 564 additions and 129 deletions.
1 change: 1 addition & 0 deletions .eslintrc.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions .gitattributes

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions .gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions .projen/files.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 21 additions & 0 deletions .projen/tasks.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 11 additions & 4 deletions API.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions package.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

47 changes: 6 additions & 41 deletions src/delete-runner.lambda.ts
@@ -1,42 +1,7 @@
import { RequestError } from '@octokit/request-error';
import { getOctokit } from './lambda-github';
import { getOctokit, getRunner } from './lambda-github';
import { StepFunctionLambdaInput } from './lambda-helpers';

/**
 * Event type of the delete-runner handler: the common step function input plus a flag controlling
 * whether busy runners are left alone.
 */
interface DeleteRunnerInput extends StepFunctionLambdaInput {
// When true, the runner lookup returns no id for a runner that is busy, so only idle runners get deleted.
readonly idleOnly: boolean;
}

/**
 * Looks up the id of the repository runner with the given name.
 *
 * Runner pages (100 per page) are requested one after another until the runner is found or an empty
 * page is returned.
 *
 * @param octokit client used to send the runner listing requests
 * @param owner repository owner
 * @param repo repository name
 * @param name runner name to look for
 * @param idleOnly when true, a busy runner is reported as not found
 * @returns the runner id, or `undefined` if no runner has that name or it is busy while `idleOnly` is set
 */
async function getRunnerId(octokit: any, owner: string, repo: string, name: string, idleOnly: boolean) {
  for (let pageNumber = 1; ; pageNumber++) {
    const response = await octokit.request('GET /repos/{owner}/{repo}/actions/runners?per_page=100&page={page}', {
      page: pageNumber,
      owner,
      repo,
    });
    const listed = response.data.runners;

    // an empty page means we went past the last runner without a match
    if (listed.length == 0) {
      return;
    }

    const match = listed.find((candidate: { name: string }) => candidate.name == name);
    if (match === undefined) {
      continue;
    }

    if (idleOnly && match.busy) {
      console.log('Runner is busy, no need to delete.');
      return;
    }

    return match.id;
  }
}

class RunnerBusy extends Error {
constructor(msg: string) {
super(msg);
Expand All @@ -45,17 +10,17 @@ class RunnerBusy extends Error {
}
}

exports.handler = async function (event: DeleteRunnerInput) {
exports.handler = async function (event: StepFunctionLambdaInput) {
const { octokit } = await getOctokit(event.installationId);

// find runner id
const runnerId = await getRunnerId(octokit, event.owner, event.repo, event.runnerName, event.idleOnly);
if (!runnerId) {
const runner = await getRunner(octokit, event.owner, event.repo, event.runnerName);
if (!runner) {
console.error(`Unable to find runner id for ${event.owner}/${event.repo}:${event.runnerName}`);
return;
}

console.log(`Runner ${event.runnerName} has id #${runnerId}`);
console.log(`Runner ${event.runnerName} has id #${runner.id}`);

// delete runner (it usually gets deleted by ./run.sh, but it stopped prematurely if we're here).
// it seems like runners are automatically removed after a timeout, if they first accepted a job.
Expand All @@ -65,7 +30,7 @@ exports.handler = async function (event: DeleteRunnerInput) {
await octokit.rest.actions.deleteSelfHostedRunnerFromRepo({
owner: event.owner,
repo: event.repo,
runner_id: runnerId,
runner_id: runner.id,
});
} catch (e) {
const reqError = <RequestError>e;
Expand Down
26 changes: 26 additions & 0 deletions src/idle-runner-repear-function.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

88 changes: 88 additions & 0 deletions src/idle-runner-repear.lambda.ts
@@ -0,0 +1,88 @@
import * as AWSLambda from 'aws-lambda';
import * as AWS from 'aws-sdk';
import { getOctokit, getRunner } from './lambda-github';

/**
 * Message payload of the idle reaper. The handler parses one of these from the JSON body of every SQS record.
 */
interface IdleReaperLambdaInput {
// ARN of the step function execution; the handler stops checking once this execution is no longer RUNNING
readonly executionArn: string;
// name used to look up the runner in the repository
readonly runnerName: string;
// repository owner
readonly owner: string;
// repository name
readonly repo: string;
// passed to getOctokit() to obtain GitHub access
readonly installationId: string;
// the runner is deleted when it is not busy and more than this many seconds passed since its `cdkghr:started:` label time
readonly maxIdleSeconds: number;
}

// Step Functions client created at module scope; the handler uses it to query the execution status.
const sfn = new AWS.StepFunctions();

/**
 * Reads the runner start time out of its labels.
 *
 * Providers are expected to register the runner with a `cdkghr:started:<seconds since epoch>` label.
 * Only the first matching label is considered.
 *
 * @param labels labels of the runner
 * @returns seconds since epoch taken from the label, `NaN` if the label exists but its value is not a
 * number, or `undefined` if there is no such label
 */
function getStartedSeconds(labels: readonly { readonly name: string }[]): number | undefined {
  const startedLabel = labels.find(label => label.name.toLowerCase().startsWith('cdkghr:started:'));
  if (startedLabel === undefined) {
    return undefined;
  }
  return parseFloat(startedLabel.name.split(':')[2]);
}

/**
 * Checks a single runner and deletes it from GitHub if it has been idle for longer than allowed.
 *
 * @param input reaper request describing the runner and its step function execution
 * @returns `true` if the same check has to be repeated later, `false` if nothing more needs to be done
 * @throws any error raised by the Step Functions or GitHub calls
 */
async function reapIfIdle(input: IdleReaperLambdaInput): Promise<boolean> {
  // check if step function is still running
  const execution = await sfn.describeExecution({ executionArn: input.executionArn }).promise();
  if (execution.status != 'RUNNING') {
    // no need to test again as runner already finished
    console.log('Runner already finished');
    return false;
  }

  // get github access
  const { octokit } = await getOctokit(input.installationId);

  // find runner
  const runner = await getRunner(octokit, input.owner, input.repo, input.runnerName);
  if (!runner) {
    console.error(`Runner not running yet for ${input.owner}/${input.repo}:${input.runnerName}`);
    return true;
  }

  // if not idle, we're done
  if (runner.busy) {
    console.log('Runner is not idle');
    return false;
  }

  // figure out when the runner started
  const started = getStartedSeconds(runner.labels);
  if (started === undefined) {
    // no started label? retry later (it won't retry forever as eventually the runner will stop and the step function will finish)
    console.error('No `cdkghr:started:xxx` label found???');
    return true;
  }
  if (!Number.isFinite(started)) {
    // the idle time can't be computed from a malformed label, so treat it like a missing label
    console.error(`Unable to parse \`cdkghr:started:xxx\` label of runner ${input.runnerName}`);
    return true;
  }

  // check if max idle timeout has reached
  const diffMs = Date.now() - started * 1000;
  console.log(`Runner ${input.runnerName} started ${diffMs/1000} seconds ago`);

  if (diffMs <= 1000 * input.maxIdleSeconds) {
    // still idle, timeout not reached -- retry later
    return true;
  }

  // max idle time reached, delete runner
  console.log(`Runner ${input.runnerName} is idle for too long, deleting...`);
  await octokit.rest.actions.deleteSelfHostedRunnerFromRepo({
    owner: input.owner,
    repo: input.repo,
    runner_id: runner.id,
  });
  return false;
}

/**
 * SQS-triggered handler that checks every runner named in the batch and deletes the ones that have been
 * idle for too long.
 *
 * Records that need another check later are reported as batch item failures. A record whose check throws
 * is reported the same way, so one failing record does not make the whole batch fail.
 *
 * @param event batch of SQS records, each carrying a JSON encoded `IdleReaperLambdaInput`
 * @returns the identifiers of the messages that have to be processed again
 */
exports.handler = async function (event: AWSLambda.SQSEvent): Promise<AWSLambda.SQSBatchResponse> {
  const result: AWSLambda.SQSBatchResponse = { batchItemFailures: [] };

  for (const record of event.Records) {
    let retryLater: boolean;

    try {
      const input = JSON.parse(record.body) as IdleReaperLambdaInput;
      console.log(`Checking runner for ${input.owner}/${input.repo} [execution-id=${input.runnerName}]`);
      retryLater = await reapIfIdle(input);
    } catch (e) {
      // only this message is reported as failed; records already handled in this batch stay done
      console.error(`Failed checking runner for message ${record.messageId}, will retry later`, e);
      retryLater = true;
    }

    if (retryLater) {
      result.batchItemFailures.push({ itemIdentifier: record.messageId });
    }
  }

  return result;
};
36 changes: 36 additions & 0 deletions src/lambda-github.ts
Expand Up @@ -49,3 +49,39 @@ export async function getOctokit(installationId?: string) {
octokit,
};
}

/**
 * Fields of a runner entry from the repository runner listing that callers of `getRunner` rely on.
 */
interface GitHubRunner {
  readonly id: number;
  readonly name: string;
  readonly os: string;
  readonly status: string;
  readonly busy: boolean;
  readonly labels: {
    readonly id: number;
    readonly name: string;
    readonly type: string;
  }[];
}

/**
 * Finds the repository runner with the given name.
 *
 * Runner pages (100 per page) are requested one after another until the runner is found or an empty
 * page is returned.
 *
 * @param octokit client used to send the runner listing requests
 * @param owner repository owner
 * @param repo repository name
 * @param name runner name to look for
 * @returns the matching runner entry as returned by the listing, or `undefined` if there is none
 */
export async function getRunner(octokit: any, owner: string, repo: string, name: string): Promise<GitHubRunner | undefined> {
  for (let pageNumber = 1; ; pageNumber++) {
    const response = await octokit.request('GET /repos/{owner}/{repo}/actions/runners?per_page=100&page={page}', {
      page: pageNumber,
      owner,
      repo,
    });
    const listed = response.data.runners;

    // an empty page means we went past the last runner without a match
    if (listed.length == 0) {
      return undefined;
    }

    const match = listed.find((candidate: GitHubRunner) => candidate.name == name);
    if (match !== undefined) {
      return match;
    }
  }
}
1 change: 0 additions & 1 deletion src/lambda-helpers.ts
Expand Up @@ -3,7 +3,6 @@ import * as AWS from 'aws-sdk';
export interface StepFunctionLambdaInput {
readonly owner: string;
readonly repo: string;
readonly runId: string;
readonly runnerName: string;
readonly installationId: string;
readonly labels: string[];
Expand Down
4 changes: 2 additions & 2 deletions src/providers/codebuild.ts
Expand Up @@ -249,7 +249,7 @@ export class CodeBuildRunnerProvider extends BaseProvider implements IRunnerProv
this.dind ? 'nohup dockerd --host=unix:///var/run/docker.sock --host=tcp://127.0.0.1:2375 --storage-driver=overlay2 &' : '',
this.dind ? 'timeout 15 sh -c "until docker info; do echo .; sleep 1; done"' : '',
'if [ "${RUNNER_VERSION}" = "latest" ]; then RUNNER_FLAGS=""; else RUNNER_FLAGS="--disableupdate"; fi',
'sudo -Hu runner /home/runner/config.sh --unattended --url "https://${GITHUB_DOMAIN}/${OWNER}/${REPO}" --token "${RUNNER_TOKEN}" --ephemeral --work _work --labels "${RUNNER_LABEL}" ${RUNNER_FLAGS} --name "${RUNNER_NAME}"',
'sudo -Hu runner /home/runner/config.sh --unattended --url "https://${GITHUB_DOMAIN}/${OWNER}/${REPO}" --token "${RUNNER_TOKEN}" --ephemeral --work _work --labels "${RUNNER_LABEL},cdkghr:started:`date +%s`" ${RUNNER_FLAGS} --name "${RUNNER_NAME}"',
],
},
build: {
Expand All @@ -269,7 +269,7 @@ export class CodeBuildRunnerProvider extends BaseProvider implements IRunnerProv
buildSpec.phases.install.commands = [
'cd \\actions',
'if (${Env:RUNNER_VERSION} -eq "latest") { $RunnerFlags = "" } else { $RunnerFlags = "--disableupdate" }',
'./config.cmd --unattended --url "https://${Env:GITHUB_DOMAIN}/${Env:OWNER}/${Env:REPO}" --token "${Env:RUNNER_TOKEN}" --ephemeral --work _work --labels "${Env:RUNNER_LABEL}" ${RunnerFlags} --name "${Env:RUNNER_NAME}"',
'./config.cmd --unattended --url "https://${Env:GITHUB_DOMAIN}/${Env:OWNER}/${Env:REPO}" --token "${Env:RUNNER_TOKEN}" --ephemeral --work _work --labels "${Env:RUNNER_LABEL},cdkghr:started:$(Get-Date -UFormat %s)" ${RunnerFlags} --name "${Env:RUNNER_NAME}"',
];
buildSpec.phases.build.commands = [
'cd \\actions',
Expand Down
2 changes: 1 addition & 1 deletion src/providers/docker-images/lambda/linux-arm64/runner.sh
Expand Up @@ -13,7 +13,7 @@ export HOME=/tmp/home

# start runner
if [ "${RUNNER_VERSION}" = "latest" ]; then RUNNER_FLAGS=""; else RUNNER_FLAGS="--disableupdate"; fi
./config.sh --unattended --url "https://${GITHUB_DOMAIN}/${OWNER}/${REPO}" --token "${RUNNER_TOKEN}" --ephemeral --work _work --labels "${RUNNER_LABEL}" --name "${RUNNER_NAME}" ${RUNNER_FLAGS}
./config.sh --unattended --url "https://${GITHUB_DOMAIN}/${OWNER}/${REPO}" --token "${RUNNER_TOKEN}" --ephemeral --work _work --labels "${RUNNER_LABEL},cdkghr:started:`date +%s`" --name "${RUNNER_NAME}" ${RUNNER_FLAGS}
echo Config done
./run.sh
echo Run done
Expand Down
2 changes: 1 addition & 1 deletion src/providers/docker-images/lambda/linux-x64/runner.sh
Expand Up @@ -13,7 +13,7 @@ export HOME=/tmp/home

# start runner
if [ "${RUNNER_VERSION}" = "latest" ]; then RUNNER_FLAGS=""; else RUNNER_FLAGS="--disableupdate"; fi
./config.sh --unattended --url "https://${GITHUB_DOMAIN}/${OWNER}/${REPO}" --token "${RUNNER_TOKEN}" --ephemeral --work _work --labels "${RUNNER_LABEL}" --name "${RUNNER_NAME}" ${RUNNER_FLAGS}
./config.sh --unattended --url "https://${GITHUB_DOMAIN}/${OWNER}/${REPO}" --token "${RUNNER_TOKEN}" --ephemeral --work _work --labels "${RUNNER_LABEL},cdkghr:started:`date +%s`" --name "${RUNNER_NAME}" ${RUNNER_FLAGS}
echo Config done
./run.sh
echo Run done
Expand Down

0 comments on commit 9c61574

Please sign in to comment.