Skip to content

Commit

Permalink
chore(ci): stability after spot changes (#6367)
Browse files Browse the repository at this point in the history
- lenient SSH keepalive settings for when builds are just waiting
- retry loop for when the EC2 request limit is exceeded (RequestLimitExceeded)
- don't specify AZ or subnet for pure spot with no cache
- more instance types for spot
  • Loading branch information
ludamad committed May 13, 2024
1 parent 1f8fd1c commit 7ad4179
Show file tree
Hide file tree
Showing 8 changed files with 51 additions and 74 deletions.
3 changes: 2 additions & 1 deletion .github/ensure-tester-with-images/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ runs:
for image in ${{ inputs.builder_images_to_copy }} ; do
docker images --no-trunc --quiet \$image
done" > .success_key
echo "IMAGE_KEY=$(cat .success_key | md5sum | awk '{print $1}')" >> $GITHUB_ENV
# TODO better image key
echo "IMAGE_KEY=$(git rev-parse HEAD)" >> $GITHUB_ENV
echo "${{ inputs.run }}" >> .success_key
echo "SUCCESS_KEY=$(cat .success_key | md5sum | awk '{print $1}') " >> $GITHUB_ENV
Expand Down
14 changes: 8 additions & 6 deletions .github/ensure-tester/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,21 @@ runs:
# no github runners, 'bare spot' in action code
echo "runner_concurrency=0" >> $GITHUB_OUTPUT
echo "ttl=30" >> $GITHUB_OUTPUT
SIZE=large
if [[ $TYPE == 4core-* ]]; then
echo "instance_type=m6a.large" >> $GITHUB_OUTPUT
SIZE=large
elif [[ $TYPE == 8core-* ]]; then
echo "instance_type=m6a.2xlarge" >> $GITHUB_OUTPUT
SIZE=2xlarge
elif [[ $TYPE == 16core-* ]]; then
echo "instance_type=m6a.4xlarge" >> $GITHUB_OUTPUT
SIZE=4xlarge
elif [[ $TYPE == 32core-* ]]; then
echo "instance_type=m6a.8xlarge" >> $GITHUB_OUTPUT
SIZE=8xlarge
elif [[ $TYPE == 64core-* ]]; then
echo "instance_type=m6a.16xlarge" >> $GITHUB_OUTPUT
SIZE=16xlarge
elif [[ $TYPE == 128core-* ]]; then
echo "instance_type=m6a.32xlarge" >> $GITHUB_OUTPUT
SIZE=32xlarge
fi
echo "instance_type=m6a.$SIZE r6in.$SIZE r6a.$SIZE i4i.$SIZE r7iz.$SIZE" >> $GITHUB_OUTPUT
- name: Start Tester
uses: ./.github/spot-runner-action
Expand Down
50 changes: 19 additions & 31 deletions .github/spot-runner-action/dist/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ class Ec2Instance {
return __awaiter(this, void 0, void 0, function* () {
const client = yield this.getEc2Client();
const userData = yield new userdata_1.UserData(this.config);
const userDataScript = this.config.githubActionRunnerConcurrency !== 0 ? yield userData.getUserDataForBuilder() : yield userData.getUserDataForBareSpot();
const userDataScript = yield userData.getUserData();
const ec2InstanceTypeHash = this.getHashOfStringArray(this.config.ec2InstanceType.concat([
userDataScript,
JSON.stringify(this.tags),
Expand Down Expand Up @@ -318,8 +318,8 @@ class Ec2Instance {
},
Overrides: this.config.ec2InstanceType.map((instanceType) => ({
InstanceType: instanceType,
AvailabilityZone: availabilityZone,
SubnetId: this.config.ec2SubnetId,
AvailabilityZone: this.config.githubActionRunnerConcurrency > 0 ? availabilityZone : undefined,
SubnetId: this.config.githubActionRunnerConcurrency > 0 ? this.config.ec2SubnetId : undefined,
})),
};
const createFleetRequest = {
Expand All @@ -336,10 +336,15 @@ class Ec2Instance {
const client = yield this.getEc2Client();
const fleet = yield client.createFleet(createFleetRequest).promise();
if (fleet.Errors && fleet.Errors.length > 0) {
for (const error of fleet.Errors) {
if (error.ErrorCode === "RequestLimitExceeded") {
return "RequestLimitExceeded";
}
}
core.error(JSON.stringify(fleet.Errors, null, 2));
}
const instances = ((fleet === null || fleet === void 0 ? void 0 : fleet.Instances) || [])[0] || {};
return (instances.InstanceIds || [])[0];
return (instances.InstanceIds || [])[0] || "";
});
}
getInstanceStatus(instanceId) {
Expand Down Expand Up @@ -722,21 +727,21 @@ function requestAndWaitForSpot(config) {
}
let instanceId = "";
for (const ec2Strategy of ec2SpotStrategies) {
let backoff = 1;
core.info(`Starting instance with ${ec2Strategy} strategy`);
// 6 * 10000ms = 1 minute per strategy
// 6 * 10000ms = 1 minute per strategy, unless we hit RequestLimitExceeded, then we do exponential backoff
// TODO make longer lived spot request?
for (let i = 0; i < 6; i++) {
try {
// Start instance
instanceId =
(yield ec2Client.requestMachine(
yield ec2Client.requestMachine(
// we fallback to on-demand
ec2Strategy.toLocaleLowerCase() === "none")) || "";
if (instanceId) {
ec2Strategy.toLocaleLowerCase() === "none");
// let's exit, only loop on InsufficientInstanceCapacity
if (instanceId !== "RequestLimitExceeded") {
break;
}
// let's exit, only loop on InsufficientInstanceCapacity
break;
}
catch (error) {
// TODO is this still the relevant error?
Expand All @@ -752,7 +757,7 @@ function requestAndWaitForSpot(config) {
}
}
// wait 10 seconds
yield new Promise((r) => setTimeout(r, 10000));
yield new Promise((r) => setTimeout(r, 10000 * Math.pow(2, backoff)));
}
if (instanceId) {
core.info("Successfully requested instance with ID " + instanceId);
Expand Down Expand Up @@ -1020,25 +1025,7 @@ class UserData {
constructor(config) {
this.config = config;
}
getUserDataForBareSpot() {
return __awaiter(this, void 0, void 0, function* () {
const cmds = [
"#!/bin/bash",
`exec 1>/run/log.out 2>&1`,
`shutdown -P +${this.config.ec2InstanceTtl}`,
`echo '{"default-address-pools":[{"base":"172.17.0.0/12","size":20}, {"base":"10.99.0.0/12","size":20}, {"base":"192.168.0.0/16","size":24}]}' > /etc/docker/daemon.json`,
`sudo service docker restart`,
"sudo apt install -y brotli",
// NOTE also update versions below and in .github/ci-setup-action/action.yml
"sudo wget -q https://github.com/earthly/earthly/releases/download/v0.8.9/earthly-linux-$(dpkg --print-architecture) -O /usr/local/bin/earthly",
"sudo chmod +x /usr/local/bin/earthly",
"touch /home/ubuntu/.user-data-finished",
];
console.log("Sending: ", cmds.filter((x) => !x.startsWith("TOKENS")).join("\n"));
return Buffer.from(cmds.join("\n")).toString("base64");
});
}
getUserDataForBuilder() {
getUserData() {
return __awaiter(this, void 0, void 0, function* () {
if (!this.config.githubActionRunnerLabel)
throw Error("failed to object job ID for label");
Expand All @@ -1053,9 +1040,10 @@ class UserData {
`sudo service docker restart`,
"sudo wget -q https://github.com/earthly/earthly/releases/download/v0.8.9/earthly-linux-$(dpkg --print-architecture) -O /usr/local/bin/earthly",
"sudo chmod +x /usr/local/bin/earthly",
"cd /run",
"sudo apt install -y brotli",
'echo "MaxStartups 1000" >> /etc/ssh/sshd_config',
'echo "ClientAliveInterval=30" >> /etc/ssh/sshd_config',
'echo "ClientAliveCountMax=20" >> /etc/ssh/sshd_config',
"sudo service sshd restart",
"touch /home/ubuntu/.user-data-finished",
];
Expand Down
15 changes: 10 additions & 5 deletions .github/spot-runner-action/src/ec2.ts
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ export class Ec2Instance {
const userData = await new UserData(
this.config
);
const userDataScript = this.config.githubActionRunnerConcurrency !== 0 ? await userData.getUserDataForBuilder() : await userData.getUserDataForBareSpot();
const userDataScript = await userData.getUserData();
const ec2InstanceTypeHash = this.getHashOfStringArray(
this.config.ec2InstanceType.concat([
userDataScript,
Expand Down Expand Up @@ -225,7 +225,7 @@ export class Ec2Instance {
return launchTemplateName;
}

async requestMachine(useOnDemand: boolean): Promise<string | undefined> {
async requestMachine(useOnDemand: boolean): Promise<string> {
// Note advice re max bid: "If you specify a maximum price, your instances will be interrupted more frequently than if you do not specify this parameter."
const launchTemplateName = await this.getLaunchTemplate();
// Launch template name already in use
Expand All @@ -237,8 +237,8 @@ export class Ec2Instance {
},
Overrides: this.config.ec2InstanceType.map((instanceType) => ({
InstanceType: instanceType,
AvailabilityZone: availabilityZone,
SubnetId: this.config.ec2SubnetId,
AvailabilityZone: this.config.githubActionRunnerConcurrency > 0 ? availabilityZone : undefined,
SubnetId: this.config.githubActionRunnerConcurrency > 0 ? this.config.ec2SubnetId : undefined,
})),
};
const createFleetRequest: CreateFleetRequest = {
Expand All @@ -255,10 +255,15 @@ export class Ec2Instance {
const client = await this.getEc2Client();
const fleet = await client.createFleet(createFleetRequest).promise();
if (fleet.Errors && fleet.Errors.length > 0) {
for (const error of fleet.Errors) {
if (error.ErrorCode === "RequestLimitExceeded") {
return "RequestLimitExceeded";
}
}
core.error(JSON.stringify(fleet.Errors, null, 2));
}
const instances: CreateFleetInstance = (fleet?.Instances || [])[0] || {};
return (instances.InstanceIds || [])[0];
return (instances.InstanceIds || [])[0] || "";
}

async getInstanceStatus(instanceId: string) {
Expand Down
14 changes: 7 additions & 7 deletions .github/spot-runner-action/src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,22 +61,22 @@ async function requestAndWaitForSpot(config: ActionConfig): Promise<string> {

let instanceId = "";
for (const ec2Strategy of ec2SpotStrategies) {
let backoff = 1;
core.info(`Starting instance with ${ec2Strategy} strategy`);
// 6 * 10000ms = 1 minute per strategy
// 6 * 10000ms = 1 minute per strategy, unless we hit RequestLimitExceeded, then we do exponential backoff
// TODO make longer lived spot request?
for (let i = 0; i < 6; i++) {
try {
// Start instance
instanceId =
(await ec2Client.requestMachine(
await ec2Client.requestMachine(
// we fallback to on-demand
ec2Strategy.toLocaleLowerCase() === "none"
)) || "";
if (instanceId) {
);
// let's exit, only loop on InsufficientInstanceCapacity
if (instanceId !== "RequestLimitExceeded") {
break;
}
// let's exit, only loop on InsufficientInstanceCapacity
break;
} catch (error) {
// TODO is this still the relevant error?
if (
Expand All @@ -94,7 +94,7 @@ async function requestAndWaitForSpot(config: ActionConfig): Promise<string> {
}
}
// wait 10 seconds
await new Promise((r) => setTimeout(r, 10000));
await new Promise((r) => setTimeout(r, 10000 * 2 ** backoff));
}
if (instanceId) {
core.info("Successfully requested instance with ID " + instanceId);
Expand Down
25 changes: 3 additions & 22 deletions .github/spot-runner-action/src/userdata.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,7 @@ export class UserData {
this.config = config;
}

async getUserDataForBareSpot(): Promise<string> {
const cmds = [
"#!/bin/bash",
`exec 1>/run/log.out 2>&1`, // Log to /run/log.out
`shutdown -P +${this.config.ec2InstanceTtl}`,
`echo '{"default-address-pools":[{"base":"172.17.0.0/12","size":20}, {"base":"10.99.0.0/12","size":20}, {"base":"192.168.0.0/16","size":24}]}' > /etc/docker/daemon.json`,
`sudo service docker restart`,
"sudo apt install -y brotli",
// NOTE also update versions below and in .github/ci-setup-action/action.yml
"sudo wget -q https://github.com/earthly/earthly/releases/download/v0.8.9/earthly-linux-$(dpkg --print-architecture) -O /usr/local/bin/earthly",
"sudo chmod +x /usr/local/bin/earthly",
"touch /home/ubuntu/.user-data-finished",
];
console.log(
"Sending: ",
cmds.filter((x) => !x.startsWith("TOKENS")).join("\n")
);
return Buffer.from(cmds.join("\n")).toString("base64");
}

async getUserDataForBuilder(): Promise<string> {
async getUserData(): Promise<string> {
if (!this.config.githubActionRunnerLabel)
throw Error("failed to object job ID for label");
// Note, we dont make the runner ephemeral as we start fresh runners as needed
Expand All @@ -42,9 +22,10 @@ export class UserData {
`sudo service docker restart`,
"sudo wget -q https://github.com/earthly/earthly/releases/download/v0.8.9/earthly-linux-$(dpkg --print-architecture) -O /usr/local/bin/earthly",
"sudo chmod +x /usr/local/bin/earthly",
"cd /run",
"sudo apt install -y brotli",
'echo "MaxStartups 1000" >> /etc/ssh/sshd_config',
'echo "ClientAliveInterval=30" >> /etc/ssh/sshd_config',
'echo "ClientAliveCountMax=20" >> /etc/ssh/sshd_config',
"sudo service sshd restart",
"touch /home/ubuntu/.user-data-finished",
];
Expand Down
2 changes: 1 addition & 1 deletion scripts/run_on_builder
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ set -eu
# Enter the repo root
cd "$(dirname "$0")/.."

ssh -o StrictHostKeyChecking=no -i "$BUILDER_SPOT_KEY" ubuntu@"$BUILDER_SPOT_IP" "$@"
ssh -o TCPKeepAlive=no -o ServerAliveCountMax=20 -o ServerAliveInterval=30 -o StrictHostKeyChecking=no -i "$BUILDER_SPOT_KEY" ubuntu@"$BUILDER_SPOT_IP" "$@"
2 changes: 1 addition & 1 deletion scripts/run_on_tester
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ set -eu
# Enter the repo root
cd "$(dirname "$0")/.."

ssh -o StrictHostKeyChecking=no -i "$SPOT_KEY" ubuntu@"$SPOT_IP" "$@"
ssh -o TCPKeepAlive=no -o ServerAliveCountMax=20 -o ServerAliveInterval=30 -o StrictHostKeyChecking=no -i "$SPOT_KEY" ubuntu@"$SPOT_IP" "$@"

0 comments on commit 7ad4179

Please sign in to comment.