Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Don't cancel workflow on errors #139

Merged
merged 5 commits into from Oct 31, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions .projen/deps.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions .projenrc.js
Expand Up @@ -17,6 +17,7 @@ const project = new awscdk.AwsCdkConstructLibrary({
'esbuild', // for faster NodejsFunction bundling
'@octokit/core',
'@octokit/auth-app',
'@octokit/request-error',
'@octokit/rest',
'aws-sdk',
'@aws-sdk/types',
Expand Down
16 changes: 16 additions & 0 deletions API.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

63 changes: 43 additions & 20 deletions src/lambdas/delete-runner/index.ts
@@ -1,7 +1,13 @@
// eslint-disable-next-line import/no-extraneous-dependencies
import { RequestError } from '@octokit/request-error';
import { getOctokit } from '../github';
import { StepFunctionLambdaInput } from '../helpers';

async function getRunnerId(octokit: any, owner: string, repo: string, name: string) {
/**
 * Event payload accepted by the delete-runner handler: everything in
 * StepFunctionLambdaInput plus a flag restricting deletion to idle runners.
 */
interface DeleteRunnerInput extends StepFunctionLambdaInput {
/**
 * When true, the runner lookup only yields an id if the runner is not busy;
 * a busy runner is logged and left alone.
 */
readonly idleOnly: boolean;
}

async function getRunnerId(octokit: any, owner: string, repo: string, name: string, idleOnly: boolean) {
let page = 1;
while (true) {
const runners = await octokit.request('GET /repos/{owner}/{repo}/actions/runners?per_page=100&page={page}', {
Expand All @@ -16,6 +22,14 @@ async function getRunnerId(octokit: any, owner: string, repo: string, name: stri

for (const runner of runners.data.runners) {
if (runner.name == name) {
if (idleOnly) {
if (!runner.busy) {
return runner.id;
} else {
console.log('Runner is busy, no need to delete.');
return;
}
}
return runner.id;
}
}
Expand All @@ -24,33 +38,42 @@ async function getRunnerId(octokit: any, owner: string, repo: string, name: stri
}
}

exports.handler = async function (event: StepFunctionLambdaInput) {
const { octokit } = await getOctokit(event.installationId);

// cancel job so it doesn't get assigned to other runners by mistake or just sit there waiting
try {
await octokit.request('POST /repos/{owner}/{repo}/actions/runs/{runId}/cancel', {
owner: event.owner,
repo: event.repo,
runId: event.runId,
});
} catch (e) {
console.error(`Unable to cancel workflow: ${e}`);
/**
 * Error thrown when GitHub refuses to delete a runner because it is still
 * running a job.
 *
 * The `name` value is what the state machine's retry rule matches on
 * ('RunnerBusy'), so it must stay in sync with that configuration.
 */
class RunnerBusy extends Error {
  constructor(msg: string) {
    super(msg);

    // Pin the prototype explicitly so `instanceof RunnerBusy` holds
    // (presumably a guard for down-levelled Error subclasses — confirm target).
    Object.setPrototypeOf(this, RunnerBusy.prototype);

    // Error name reported to callers / the step function.
    this.name = 'RunnerBusy';
  }
}

// Lambda entry point for runner deletion.
// Looks up the runner's id by name and asks GitHub to remove it from the repo.
// Returns without deleting when no id is found (getRunnerId also yields nothing
// for a busy runner when event.idleOnly is set).
// Throws RunnerBusy when the delete call fails with a message containing
// 'is still running a job'; every other error is rethrown unchanged.
exports.handler = async function (event: DeleteRunnerInput) {
const { octokit } = await getOctokit(event.installationId);

// find runner id
// NOTE(review): two consecutive `const runnerId` declarations follow — this looks like the
// before/after pair of a rendered diff; confirm only the idleOnly variant exists in the real file.
const runnerId = await getRunnerId(octokit, event.owner, event.repo, event.runnerName);
const runnerId = await getRunnerId(octokit, event.owner, event.repo, event.runnerName, event.idleOnly);
if (!runnerId) {
console.error(`Unable to find runner id for ${event.owner}/${event.repo}:${event.runnerName}`);
return;
}

console.log(`Runner ${event.runnerName} has id #${runnerId}`);

// NOTE(review): the raw `octokit.request('DELETE ...')` call below appears to be the pre-change
// version of the `deleteSelfHostedRunnerFromRepo` call further down — confirm it was removed.
// delete runner (it usually gets deleted by ./run.sh, but it stopped prematurely if we're here)
await octokit.request('DELETE /repos/{owner}/{repo}/actions/runners/{runnerId}', {
owner: event.owner,
repo: event.repo,
runnerId,
});
// delete runner (it usually gets deleted by ./run.sh, but it stopped prematurely if we're here).
// it seems like runners are automatically removed after a timeout, if they first accepted a job.
// we try removing it anyway for cases where a job wasn't accepted, and just in case it wasn't removed.
// repos have a limited number of self-hosted runners, so we can't leave dead ones behind.
try {
await octokit.rest.actions.deleteSelfHostedRunnerFromRepo({
owner: event.owner,
repo: event.repo,
runner_id: runnerId,
});
} catch (e) {
// NOTE(review): this cast is unchecked. If `e` is not an Error-like object, `reqError.message`
// is undefined and `.includes` throws a TypeError that masks the original failure —
// consider narrowing with `e instanceof RequestError` first.
const reqError = <RequestError>e;
// Surface "still running a job" as RunnerBusy so the caller can tell it apart from other failures.
if (reqError.message.includes('is still running a job')) {
throw new RunnerBusy(reqError.message);
} else {
throw e;
}
}
};
4 changes: 2 additions & 2 deletions src/lambdas/github.ts
@@ -1,6 +1,6 @@
/* eslint-disable import/no-extraneous-dependencies */
import { createAppAuth } from '@octokit/auth-app';
import { Octokit } from '@octokit/core';
import { Octokit } from '@octokit/rest';
import { getSecretValue, getSecretJsonValue } from './helpers';

export function baseUrlFromDomain(domain: string): string {
Expand Down Expand Up @@ -49,4 +49,4 @@ export async function getOctokit(installationId?: string) {
githubSecrets,
octokit,
};
}
}
2 changes: 1 addition & 1 deletion src/lambdas/token-retriever/index.ts
Expand Up @@ -4,7 +4,7 @@ import { StepFunctionLambdaInput } from '../helpers';
exports.handler = async function (event: StepFunctionLambdaInput) {
const { githubSecrets, octokit } = await getOctokit(event.installationId);

const response = await octokit.request('POST /repos/{owner}/{repo}/actions/runners/registration-token', {
const response = await octokit.rest.actions.createRegistrationTokenForRepo({
owner: event.owner,
repo: event.repo,
});
Expand Down
45 changes: 42 additions & 3 deletions src/runner.ts
Expand Up @@ -75,6 +75,13 @@ export interface GitHubRunnersProps {
* ```
*/
readonly extraCertificates?: string;

/**
* Time to wait before stopping a runner that remains idle. If the user cancelled the job, or if another runner stole it, this stops the runner to avoid wasting resources.
*
* @default 10 minutes
*/
readonly idleTimeout?: cdk.Duration;
}

/**
Expand Down Expand Up @@ -166,7 +173,7 @@ export class GitHubRunners extends Construct {
];
}

this.orchestrator = this.stateMachine();
this.orchestrator = this.stateMachine(props);
this.webhook = new GithubWebhookHandler(this, 'Webhook Handler', {
orchestrator: this.orchestrator,
secrets: this.secrets,
Expand All @@ -176,7 +183,7 @@ export class GitHubRunners extends Construct {
this.statusFunction();
}

private stateMachine() {
private stateMachine(props?: GitHubRunnersProps) {
const tokenRetrieverTask = new stepfunctions_tasks.LambdaInvoke(
this,
'Get Runner Token',
Expand All @@ -187,11 +194,41 @@ export class GitHubRunners extends Construct {
},
);

let deleteRunnerFunction = this.deleteRunner();
const deleteRunnerTask = new stepfunctions_tasks.LambdaInvoke(
this,
'Delete Runner',
{
lambdaFunction: this.deleteRunner(),
lambdaFunction: deleteRunnerFunction,
payloadResponseOnly: true,
resultPath: '$.delete',
payload: stepfunctions.TaskInput.fromObject({
runnerName: stepfunctions.JsonPath.stringAt('$$.Execution.Name'),
owner: stepfunctions.JsonPath.stringAt('$.owner'),
repo: stepfunctions.JsonPath.stringAt('$.repo'),
runId: stepfunctions.JsonPath.stringAt('$.runId'),
installationId: stepfunctions.JsonPath.stringAt('$.installationId'),
idleOnly: false,
}),
},
);
deleteRunnerTask.addRetry({
errors: [
'RunnerBusy',
],
interval: cdk.Duration.minutes(1),
backoffRate: 1,
maxAttempts: 60,
});

const waitForIdleRunner = new stepfunctions.Wait(this, 'Wait', {
time: stepfunctions.WaitTime.duration(props?.idleTimeout ?? cdk.Duration.minutes(10)),
});
const deleteIdleRunnerTask = new stepfunctions_tasks.LambdaInvoke(
this,
'Delete Idle Runner',
{
lambdaFunction: deleteRunnerFunction,
payloadResponseOnly: true,
resultPath: '$.delete',
payload: stepfunctions.TaskInput.fromObject({
Expand All @@ -200,6 +237,7 @@ export class GitHubRunners extends Construct {
repo: stepfunctions.JsonPath.stringAt('$.repo'),
runId: stepfunctions.JsonPath.stringAt('$.runId'),
installationId: stepfunctions.JsonPath.stringAt('$.installationId'),
idleOnly: true,
}),
},
);
Expand All @@ -226,6 +264,7 @@ export class GitHubRunners extends Construct {
const work = tokenRetrieverTask.next(
new stepfunctions.Parallel(this, 'Error Catcher', { resultPath: '$.result' })
.branch(providerChooser)
.branch(waitForIdleRunner.next(deleteIdleRunnerTask))
.addCatch(
deleteRunnerTask
.next(new stepfunctions.Fail(this, 'Runner Failed')),
Expand Down
28 changes: 14 additions & 14 deletions test/default.integ.snapshot/github-runners-test.assets.json
Expand Up @@ -144,28 +144,28 @@
}
}
},
"3a48f2b55b764c585fa481c83e119860d9d64dd4655fa7ff6e45ae0e94cf91ed": {
"89fb355a5751903f14f49c4f49a73774f27c2a8caf67ae82f5900bec2207968f": {
"source": {
"path": "asset.3a48f2b55b764c585fa481c83e119860d9d64dd4655fa7ff6e45ae0e94cf91ed",
"path": "asset.89fb355a5751903f14f49c4f49a73774f27c2a8caf67ae82f5900bec2207968f",
"packaging": "zip"
},
"destinations": {
"current_account-current_region": {
"bucketName": "cdk-hnb659fds-assets-${AWS::AccountId}-${AWS::Region}",
"objectKey": "3a48f2b55b764c585fa481c83e119860d9d64dd4655fa7ff6e45ae0e94cf91ed.zip",
"objectKey": "89fb355a5751903f14f49c4f49a73774f27c2a8caf67ae82f5900bec2207968f.zip",
"assumeRoleArn": "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/cdk-hnb659fds-file-publishing-role-${AWS::AccountId}-${AWS::Region}"
}
}
},
"6e55d76fa08197b5d130bca3a8643a21da2d8e1f74f73aed49e52c25e2a2215c": {
"ac1208cbebd205ffd017f3b0f80e919958634e61c36f32b78cb3af4e012b9a94": {
"source": {
"path": "asset.6e55d76fa08197b5d130bca3a8643a21da2d8e1f74f73aed49e52c25e2a2215c",
"path": "asset.ac1208cbebd205ffd017f3b0f80e919958634e61c36f32b78cb3af4e012b9a94",
"packaging": "zip"
},
"destinations": {
"current_account-current_region": {
"bucketName": "cdk-hnb659fds-assets-${AWS::AccountId}-${AWS::Region}",
"objectKey": "6e55d76fa08197b5d130bca3a8643a21da2d8e1f74f73aed49e52c25e2a2215c.zip",
"objectKey": "ac1208cbebd205ffd017f3b0f80e919958634e61c36f32b78cb3af4e012b9a94.zip",
"assumeRoleArn": "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/cdk-hnb659fds-file-publishing-role-${AWS::AccountId}-${AWS::Region}"
}
}
Expand All @@ -183,41 +183,41 @@
}
}
},
"548e48d52cf7a8c7b238c646708f1549cb0bb88151b4ffa727612d23a34d0701": {
"f4a33003d7218cfbd37b7b92b417c1c1e422550c1f612cf8b5c727c8d94d35a3": {
"source": {
"path": "asset.548e48d52cf7a8c7b238c646708f1549cb0bb88151b4ffa727612d23a34d0701",
"path": "asset.f4a33003d7218cfbd37b7b92b417c1c1e422550c1f612cf8b5c727c8d94d35a3",
"packaging": "zip"
},
"destinations": {
"current_account-current_region": {
"bucketName": "cdk-hnb659fds-assets-${AWS::AccountId}-${AWS::Region}",
"objectKey": "548e48d52cf7a8c7b238c646708f1549cb0bb88151b4ffa727612d23a34d0701.zip",
"objectKey": "f4a33003d7218cfbd37b7b92b417c1c1e422550c1f612cf8b5c727c8d94d35a3.zip",
"assumeRoleArn": "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/cdk-hnb659fds-file-publishing-role-${AWS::AccountId}-${AWS::Region}"
}
}
},
"e41db675dd3aec882949c6d38560f47559abd70d4961d0afd1af5b0e8b2f5c1e": {
"94a34b46fc08e3882c9ca0796ed08bdb4041104e97b0838983f29ab8c255c4d7": {
"source": {
"path": "asset.e41db675dd3aec882949c6d38560f47559abd70d4961d0afd1af5b0e8b2f5c1e",
"path": "asset.94a34b46fc08e3882c9ca0796ed08bdb4041104e97b0838983f29ab8c255c4d7",
"packaging": "zip"
},
"destinations": {
"current_account-current_region": {
"bucketName": "cdk-hnb659fds-assets-${AWS::AccountId}-${AWS::Region}",
"objectKey": "e41db675dd3aec882949c6d38560f47559abd70d4961d0afd1af5b0e8b2f5c1e.zip",
"objectKey": "94a34b46fc08e3882c9ca0796ed08bdb4041104e97b0838983f29ab8c255c4d7.zip",
"assumeRoleArn": "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/cdk-hnb659fds-file-publishing-role-${AWS::AccountId}-${AWS::Region}"
}
}
},
"781489eb6e2e3af062d2923ea547d79ea0a76323db5e5fb134e92ef9f458fcd4": {
"4cf176ef992d3071f376b6da2d8b20416e9b11fc20f385d6e9d76a16ed235c13": {
"source": {
"path": "github-runners-test.template.json",
"packaging": "file"
},
"destinations": {
"current_account-current_region": {
"bucketName": "cdk-hnb659fds-assets-${AWS::AccountId}-${AWS::Region}",
"objectKey": "781489eb6e2e3af062d2923ea547d79ea0a76323db5e5fb134e92ef9f458fcd4.json",
"objectKey": "4cf176ef992d3071f376b6da2d8b20416e9b11fc20f385d6e9d76a16ed235c13.json",
"assumeRoleArn": "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/cdk-hnb659fds-file-publishing-role-${AWS::AccountId}-${AWS::Region}"
}
}
Expand Down