Skip to content

Commit

Permalink
chore(ci): spot health fix, earthly workarounds (#6379)
Browse files Browse the repository at this point in the history
- Fix backoff with instance requirements being set incorrectly
- Add more instance types to take advantage of this
- First pass at earthly workarounds, namely one for the rare case where all
earthly-ci calls try to bootstrap at once
- [s]Switch to 'systemctl restart docker', which seemed to recover my
spot from a bad case of 'pull ping' errors; still not sure of the root cause, but
running that fixed it while multiple fresh spots didn't[/s] Scratch
that: the cause was our use of the /var/lib/docker/tmp path, which is used internally
(our image temp files now live in /var/lib/docker/tmp-images instead)
  • Loading branch information
ludamad committed May 13, 2024
1 parent fee7649 commit da7573c
Show file tree
Hide file tree
Showing 9 changed files with 38 additions and 40 deletions.
2 changes: 1 addition & 1 deletion .github/ensure-builder/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ runs:
echo "runner_label=$USERNAME-$runner_type" >> $GITHUB_OUTPUT
if [[ $TYPE == builder-x86 ]]; then
# 128-core x86 instance types with least evictions
echo "instance_type=r6in.32xlarge r6a.32xlarge i4i.32xlarge r7iz.32xlarge" >> $GITHUB_OUTPUT
echo "instance_type=i4i.32xlarge m6a.32xlarge m6i.32xlarge m6id.32xlarge m6idn.32xlarge m6in.32xlarge m7a.32xlarge r6a.32xlarge r6i.32xlarge r6id.32xlarge r6in.32xlarge r7iz.32xlarge" >> $GITHUB_OUTPUT
echo "ami_id=ami-04d8422a9ba4de80f" >> $GITHUB_OUTPUT
echo "ebs_cache_size=256" >> $GITHUB_OUTPUT
echo "runner_concurrency=20" >> $GITHUB_OUTPUT
Expand Down
12 changes: 6 additions & 6 deletions .github/ensure-tester-with-images/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,14 +60,14 @@ runs:
export BUILDER_SPOT_IP=${{ env.BUILDER_SPOT_IP }}
export BUILDER_SPOT_KEY=~/.ssh/build_instance_key
scripts/run_on_builder "
sudo mkdir -p /var/lib/docker/tmp
sudo mkdir -p /var/lib/docker/tmp-images
sudo flock /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.lock bash -c '
if ! [ -f /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli ] ; then
docker save aztecprotocol/aztec:${{ env.IMAGE_KEY }} aztecprotocol/end-to-end:${{ env.IMAGE_KEY }} | brotli -2 > /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli.tmp
mv /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli.tmp /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli
sudo flock /var/lib/docker/tmp-images/${{ env.IMAGE_KEY }}.lock bash -c '
if ! [ -f /var/lib/docker/tmp-images/${{ env.IMAGE_KEY }}.brotli ] ; then
docker save aztecprotocol/aztec:${{ env.IMAGE_KEY }} aztecprotocol/end-to-end:${{ env.IMAGE_KEY }} | brotli -2 > /var/lib/docker/tmp-images/${{ env.IMAGE_KEY }}.brotli.tmp
mv /var/lib/docker/tmp-images/${{ env.IMAGE_KEY }}.brotli.tmp /var/lib/docker/tmp-images/${{ env.IMAGE_KEY }}.brotli
fi'
sudo cat /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli
sudo cat /var/lib/docker/tmp-images/${{ env.IMAGE_KEY }}.brotli
" | brotli --decompress | docker load
- name: Test
Expand Down
2 changes: 1 addition & 1 deletion .github/ensure-tester/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ runs:
elif [[ $TYPE == 128core-* ]]; then
SIZE=32xlarge
fi
echo "instance_type=m6a.$SIZE r6in.$SIZE r6a.$SIZE i4i.$SIZE r7iz.$SIZE" >> $GITHUB_OUTPUT
echo "instance_type=i4i.$SIZE m6a.$SIZE m6i.$SIZE m6id.$SIZE m6idn.$SIZE m6in.$SIZE m7a.$SIZE r6a.$SIZE r6i.$SIZE r6id.$SIZE r6in.$SIZE r7iz.$SIZE" >> $GITHUB_OUTPUT
- name: Start Tester
uses: ./.github/spot-runner-action
Expand Down
25 changes: 10 additions & 15 deletions .github/spot-runner-action/dist/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -260,13 +260,6 @@ class Ec2Instance {
LaunchTemplateData: {
ImageId: this.config.ec2AmiId,
InstanceInitiatedShutdownBehavior: "terminate",
InstanceRequirements: {
// We do not know what the instance types correspond to
// just let the user send a list of allowed instance types
VCpuCount: { Min: 0 },
MemoryMiB: { Min: 0 },
AllowedInstanceTypes: this.config.ec2InstanceType,
},
SecurityGroupIds: [this.config.ec2SecurityGroupId],
KeyName: this.config.ec2KeyName,
UserData: userDataScript,
Expand Down Expand Up @@ -326,6 +319,9 @@ class Ec2Instance {
Type: "instant",
LaunchTemplateConfigs: [fleetLaunchConfig],
ClientToken: this.config.clientToken || undefined,
SpotOptions: {
AllocationStrategy: "price-capacity-optimized",
},
TargetCapacitySpecification: {
TotalTargetCapacity: 1,
OnDemandTargetCapacity: useOnDemand ? 1 : 0,
Expand All @@ -336,13 +332,13 @@ class Ec2Instance {
const client = yield this.getEc2Client();
const fleet = yield client.createFleet(createFleetRequest).promise();
if (fleet.Errors && fleet.Errors.length > 0) {
core.warning(JSON.stringify(fleet.Errors, null, 2));
for (const error of fleet.Errors) {
if (error.ErrorCode === "RequestLimitExceeded" ||
error.ErrorCode === "InsufficientInstanceCapacity") {
return error.ErrorCode;
}
}
core.error(JSON.stringify(fleet.Errors, null, 2));
}
const instances = ((fleet === null || fleet === void 0 ? void 0 : fleet.Instances) || [])[0] || {};
return (instances.InstanceIds || [])[0] || "";
Expand Down Expand Up @@ -728,11 +724,10 @@ function requestAndWaitForSpot(config) {
}
let instanceId = "";
for (const ec2Strategy of ec2SpotStrategies) {
let backoff = 1;
let backoff = 0;
core.info(`Starting instance with ${ec2Strategy} strategy`);
// 6 * 10000ms = 1 minute per strategy, unless we hit RequestLimitExceeded, then we do exponential backoff
// TODO make longer lived spot request?
for (let i = 0; i < 6; i++) {
const MAX_ATTEMPTS = 3; // uses exponential backoff
for (let i = 0; i < MAX_ATTEMPTS; i++) {
// Start instance
const instanceIdOrError = yield ec2Client.requestMachine(
// we fallback to on-demand
Expand All @@ -742,15 +737,15 @@ function requestAndWaitForSpot(config) {
instanceIdOrError === "InsufficientInstanceCapacity") {
core.info("Failed to create instance due to " +
instanceIdOrError +
" , waiting 10 seconds and trying again.");
backoff += 1;
", waiting " + 5 * Math.pow(2, backoff) + " seconds and trying again.");
}
else {
instanceId = instanceIdOrError;
break;
}
// wait 5 * 2^backoff seconds (exponential backoff: 5s, 10s, 20s, ...)
yield new Promise((r) => setTimeout(r, 10000 * Math.pow(2, backoff)));
yield new Promise((r) => setTimeout(r, 5000 * Math.pow(2, backoff)));
backoff += 1;
}
if (instanceId) {
core.info("Successfully requested instance with ID " + instanceId);
Expand Down
12 changes: 4 additions & 8 deletions .github/spot-runner-action/src/ec2.ts
Original file line number Diff line number Diff line change
Expand Up @@ -179,13 +179,6 @@ export class Ec2Instance {
LaunchTemplateData: {
ImageId: this.config.ec2AmiId,
InstanceInitiatedShutdownBehavior: "terminate",
InstanceRequirements: {
// We do not know what the instance types correspond to
// just let the user send a list of allowed instance types
VCpuCount: { Min: 0 },
MemoryMiB: { Min: 0 },
AllowedInstanceTypes: this.config.ec2InstanceType,
},
SecurityGroupIds: [this.config.ec2SecurityGroupId],
KeyName: this.config.ec2KeyName,
UserData: userDataScript,
Expand Down Expand Up @@ -245,6 +238,9 @@ export class Ec2Instance {
Type: "instant",
LaunchTemplateConfigs: [fleetLaunchConfig],
ClientToken: this.config.clientToken || undefined,
SpotOptions: {
AllocationStrategy: "price-capacity-optimized",
},
TargetCapacitySpecification: {
TotalTargetCapacity: 1,
OnDemandTargetCapacity: useOnDemand ? 1 : 0,
Expand All @@ -255,6 +251,7 @@ export class Ec2Instance {
const client = await this.getEc2Client();
const fleet = await client.createFleet(createFleetRequest).promise();
if (fleet.Errors && fleet.Errors.length > 0) {
core.warning(JSON.stringify(fleet.Errors, null, 2));
for (const error of fleet.Errors) {
if (
error.ErrorCode === "RequestLimitExceeded" ||
Expand All @@ -263,7 +260,6 @@ export class Ec2Instance {
return error.ErrorCode;
}
}
core.error(JSON.stringify(fleet.Errors, null, 2));
}
const instances: CreateFleetInstance = (fleet?.Instances || [])[0] || {};
return (instances.InstanceIds || [])[0] || "";
Expand Down
9 changes: 5 additions & 4 deletions .github/spot-runner-action/src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ async function requestAndWaitForSpot(config: ActionConfig): Promise<string> {
for (const ec2Strategy of ec2SpotStrategies) {
let backoff = 0;
core.info(`Starting instance with ${ec2Strategy} strategy`);
for (let i = 0; i < 6; i++) {
const MAX_ATTEMPTS = 3; // uses exponential backoff
for (let i = 0; i < MAX_ATTEMPTS; i++) {
// Start instance
const instanceIdOrError =
await ec2Client.requestMachine(
Expand All @@ -75,18 +76,18 @@ async function requestAndWaitForSpot(config: ActionConfig): Promise<string> {
instanceIdOrError === "RequestLimitExceeded" ||
instanceIdOrError === "InsufficientInstanceCapacity"
) {
backoff += 1;
core.info(
"Failed to create instance due to " +
instanceIdOrError +
" , waiting " + 10000 * 2 ** backoff + " seconds and trying again."
", waiting " + 5 * 2 ** backoff + " seconds and trying again."
);
} else {
instanceId = instanceIdOrError;
break;
}
// wait 5 * 2^backoff seconds (exponential backoff: 5s, 10s, 20s, ...)
await new Promise((r) => setTimeout(r, 10000 * 2 ** backoff));
await new Promise((r) => setTimeout(r, 5000 * 2 ** backoff));
backoff += 1;
}
if (instanceId) {
core.info("Successfully requested instance with ID " + instanceId);
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ jobs:
concurrency_key: docs-preview-${{ inputs.username || github.actor }}-x86
- name: "Docs Preview"
timeout-minutes: 30
run: earthly --no-output ./docs/+deploy-preview --PR=${{ github.event.number }} --AZTEC_BOT_COMMENTER_GITHUB_TOKEN=${{ secrets.AZTEC_BOT_GITHUB_TOKEN }} --NETLIFY_AUTH_TOKEN=${{ secrets.NETLIFY_AUTH_TOKEN }} --NETLIFY_SITE_ID=${{ secrets.NETLIFY_SITE_ID }}
run: earthly-ci --no-output ./docs/+deploy-preview --PR=${{ github.event.number }} --AZTEC_BOT_COMMENTER_GITHUB_TOKEN=${{ secrets.AZTEC_BOT_GITHUB_TOKEN }} --NETLIFY_AUTH_TOKEN=${{ secrets.NETLIFY_AUTH_TOKEN }} --NETLIFY_SITE_ID=${{ secrets.NETLIFY_SITE_ID }}

bb-bench:
runs-on: ubuntu-latest
Expand Down
4 changes: 2 additions & 2 deletions scripts/ci/attach_ebs_cache.sh
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,9 @@ fi
# Create a mount point and mount the volume
mkdir -p /var/lib/docker
mount $BLKDEVICE /var/lib/docker
service docker restart
# clear our images temp folder
rm -rf /var/lib/docker/tmp
rm -rf /var/lib/docker/tmp-images
systemctl restart docker
# important: everything (except earthly ls) should go through earthly-ci
scripts/earthly-ci bootstrap
touch /home/ubuntu/.setup-complete
10 changes: 8 additions & 2 deletions scripts/earthly-ci
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ OUTPUT_FILE=$(mktemp)
INCONSISTENT_GRAPH_STATE_COUNT=0 # Counter for 'inconsistent graph state' errors

# Maximum attempts
MAX_ATTEMPTS=8
MAX_ATTEMPTS=5
ATTEMPT_COUNT=0

# earthly settings
Expand Down Expand Up @@ -45,9 +45,15 @@ while [ $ATTEMPT_COUNT -lt $MAX_ATTEMPTS ]; do
echo "Got 'inconsistent graph state' or 'failed to get state for index'. Sleeping for 30 seconds and retrying."
sleep 30
elif grep 'Error: pull ping error: pull ping response' $OUTPUT_FILE >/dev/null; then
echo "Got 'Error: pull ping error: pull ping response', intermittent failure when writing out images to docker"
echo "Got 'Error: pull ping error: pull ping response', intermittent failure when writing out images to docker. If this persists, try 'systemctl restart docker' on the spot instance."
elif grep '================================= System Info ==================================' $OUTPUT_FILE >/dev/null; then
echo "Detected an Earthly daemon restart, possibly due to it (mis)detecting a cache setting change, trying again..."
elif grep 'dial unix /run/buildkit/buildkitd.sock' $OUTPUT_FILE >/dev/null; then
echo "Detected earthly unable to find buildkit, waiting and trying again..."
sleep 20
elif grep 'The container name "/earthly-buildkitd" is already in use by container' $OUTPUT_FILE >/dev/null; then
echo "Detected earthly bootstrap happening in parallel and failing, waiting and trying again."
sleep 20
else
# If other errors, exit the script
exit 1
Expand Down

0 comments on commit da7573c

Please sign in to comment.