Skip to content

Commit

Permalink
fix(ci): bigger cache disk, cache+prune docker images, disable Client…
Browse files Browse the repository at this point in the history
…IvcTests.Full (#5729)

We improve our caching by also caching /var/lib/docker
Move to new cache disk with bigger size (TODO cleanup old ones)
Earthly does not need a docker prune, but a long living spot instance
will accumulate lots of stuff from image ferrying. free it
Disabled ClientIvcTests.Full until investigation
  • Loading branch information
ludamad committed Apr 12, 2024
1 parent 825c455 commit 5dcbd75
Show file tree
Hide file tree
Showing 9 changed files with 45 additions and 40 deletions.
16 changes: 3 additions & 13 deletions .github/ci-setup-action/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ runs:
- name: Cache Submodules
id: cache-submodules
uses: actions/cache@v3
uses: actions/cache@v4
with:
path: .git/modules
key: submodules-${{ hashFiles('.gitmodules') }}-spot-ebs
Expand All @@ -52,16 +52,6 @@ runs:
shell: bash
run: ./scripts/setup_env.sh ${{ inputs.dockerhub_password }}

- name: Setup Docker
shell: bash
run: |
if ! [ -f /etc/docker/daemon.json ] ; then
echo '{"default-address-pools":[{"base":"172.17.0.0/12","size":20}, {"base":"10.99.0.0/12","size":20}, {"base":"192.168.0.0/16","size":24}]}' > /etc/docker/daemon.json
sudo service docker restart
echo "Configured docker daemon for making many networks."
else
echo "Docker daemon already configured."
fi
# As detailed in https://github.com/ben-z/gh-action-mutex
# things do not become 'pending' in github actions, and instead just cancel one another
# so we can't use the native concurrency in GA. We use a simple file-lock since we're on the same machine.
Expand All @@ -70,8 +60,8 @@ runs:
if: ${{ inputs.concurrency_key }}
with:
run: |
while [ -f "/run/${{ inputs.concurrency_key }}.lock" ]; do sleep 1 ; echo "Lock is currently held, waiting..." ; done
touch "/run/${{ inputs.concurrency_key }}.lock"
while [ -f "/run/${{ inputs.concurrency_key }}.lock" ]; do sleep 1 ; echo "Lock is currently held by $(cat '/run/${{ inputs.concurrency_key }}.lock'), waiting..." ; done
echo "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" > "/run/${{ inputs.concurrency_key }}.lock"
echo "/run/${{ inputs.concurrency_key }}.lock acquired."
post: |
rm "/run/${{ inputs.concurrency_key }}.lock"
Expand Down
2 changes: 1 addition & 1 deletion .github/earthly-ci-config.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
global:
cache_size_pct: 75
cache_size_pct: 50
buildkit_max_parallelism: 50
container_frontend: docker-shell
buildkit_additional_args: ["-e", "BUILDKIT_STEP_LOG_MAX_SIZE=-1"]
3 changes: 2 additions & 1 deletion .github/workflows/ci-arm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ jobs:
uses: ./.github/workflows/setup-runner.yml
with:
runner_label: master-arm
ebs_cache_size_gb: 128
ebs_cache_size_gb: 256
runner_concurrency: 8
subaction: ${{ github.event.inputs.runner_action || 'start' }}
ec2_instance_type: r6g.16xlarge
ec2_ami_id: ami-0d8a9b0419ddb331a
ec2_instance_ttl: 40 # refreshed by jobs
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
uses: ./.github/workflows/setup-runner.yml
with:
runner_label: ${{ github.actor }}-x86
ebs_cache_size_gb: 128
ebs_cache_size_gb: 256
runner_concurrency: 50
subaction: ${{ github.event.inputs.runner_action || 'start' }}
ec2_instance_type: m6a.32xlarge
Expand Down Expand Up @@ -121,7 +121,7 @@ jobs:
needs: bb-bench-binaries
with:
runner_label: ${{ github.actor }}-bench-x86
ebs_cache_size_gb: 32
ebs_cache_size_gb: 64
runner_concurrency: 1
subaction: ${{ github.event.inputs.runner_action || 'start' }}
ec2_instance_type: m6a.4xlarge
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/protocol-circuits-gate-diff.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
sudo cp -r clang+llvm-16.0.0-x86_64-linux-gnu-ubuntu-18.04/share/* /usr/local/share/
rm -rf clang+llvm-16.0.0-x86_64-linux-gnu-ubuntu-18.04.tar.xz clang+llvm-16.0.0-x86_64-linux-gnu-ubuntu-18.04
- uses: actions/cache@v3
- uses: actions/cache@v4
with:
path: |
barretenberg/cpp/build
Expand All @@ -60,7 +60,7 @@ jobs:
INSTALL_URL: https://raw.githubusercontent.com/noir-lang/noirup/main/install
NOIRUP_BIN_URL: https://raw.githubusercontent.com/noir-lang/noirup/main/noirup

- uses: actions/cache@v3
- uses: actions/cache@v4
with:
path: |
~/.cargo/bin/
Expand Down
18 changes: 18 additions & 0 deletions .github/workflows/setup-runner.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ jobs:
setup:
needs: start-builder
runs-on: ${{ inputs.runner_label }}
if: ${{inputs.subaction != 'stop'}}
steps:
- name: Checkout Repository
uses: actions/checkout@v4
Expand All @@ -91,5 +92,22 @@ jobs:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
run: ./scripts/attach_ebs_cache.sh ${{ inputs.runner_label }} 128

- name: Configure and Restart Docker
shell: bash
run: |
# We need to restart after attaching disk cache
# Both only happen once, so we just make sure this happens once
if ! [ -f /etc/docker/daemon.json ] ; then
echo '{"default-address-pools":[{"base":"172.17.0.0/12","size":20}, {"base":"10.99.0.0/12","size":20}, {"base":"192.168.0.0/16","size":24}]}' > /etc/docker/daemon.json
sudo service docker restart
echo "Configured docker daemon for making many networks."
else
echo "Docker daemon already configured."
fi
- name: Run Docker Prune
# helps to not overuse space
run: docker system prune -f

- name: Run Earthly Bootstrap
run: earthly bootstrap
13 changes: 0 additions & 13 deletions .github/workflows/stop-spot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,6 @@ name: Stop Personal Spot
on:
workflow_dispatch: {}
jobs:
stop-build-arm:
uses: ./.github/workflows/setup-runner.yml
with:
runner_label: ${{ github.actor }}-arm
subaction: stop
# not used:
ebs_cache_size_gb: 128
runner_concurrency: 8
ec2_instance_type: r6g.16xlarge
ec2_ami_id: ami-0d8a9b0419ddb331a
ec2_instance_ttl: 40
secrets: inherit

stop-build-x86:
uses: ./.github/workflows/setup-runner.yml
with:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,9 @@ class ClientIVCTests : public ::testing::Test {
* @brief A full Goblin test using PG that mimicks the basic aztec client architecture
*
*/
TEST_F(ClientIVCTests, Full)
// TODO fix with https://github.com/AztecProtocol/barretenberg/issues/930
// intermittent failures, presumably due to uninitialized memory
TEST_F(ClientIVCTests, DISABLED_Full)
{
using VerificationKey = Flavor::VerificationKey;

Expand Down
21 changes: 14 additions & 7 deletions scripts/attach_ebs_cache.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,15 @@ INSTANCE_ID=$(curl http://169.254.169.254/latest/meta-data/instance-id)
# TODO also mount various other aspects of docker image metadata

# Check for existing mount, assume we can continue if existing
if mount | grep -q /var/lib/docker/volumes; then
echo "Detected mount existing on /var/lib/docker/volumes already"
if mount | grep -q "/var/lib/docker/volumes type ext4"; then
echo "Detected mount existing on /var/lib/docker/volumes. This is our old mount."
echo "Run the stop spot workflow https://github.com/AztecProtocol/aztec-packages/actions/workflows/stop-spot.yml and rerun all steps in this workflow."
exit 0
fi

# Check for existing mount, assume we can continue if existing
if mount | grep -q "/var/lib/docker type ext4"; then
echo "Detected mount existing on /var/lib/docker already"
echo "Continuing..."
exit 0
fi
Expand All @@ -22,7 +29,7 @@ fi
# this means we are in a weird state (two spot instances running etc)
EXISTING_VOLUME=$(aws ec2 describe-volumes \
--region $REGION \
--filters "Name=tag:username,Values=$EBS_CACHE_TAG" \
--filters "Name=tag:username,Values=$EBS_CACHE_TAG-$SIZE" \
--query "Volumes[0].VolumeId" \
--output text)

Expand All @@ -33,7 +40,7 @@ if [ "$EXISTING_VOLUME" == "None" ]; then
--availability-zone $AVAILABILITY_ZONE \
--size $SIZE \
--volume-type $VOLUME_TYPE \
--tag-specifications "ResourceType=volume,Tags=[{Key=username,Value=$EBS_CACHE_TAG}]" \
--tag-specifications "ResourceType=volume,Tags=[{Key=username,Value=$EBS_CACHE_TAG-$SIZE}]" \
--query "VolumeId" \
--output text)
else
Expand Down Expand Up @@ -77,7 +84,7 @@ while [ "$(aws ec2 describe-volumes \
sleep 1
done

# We are expecting the device to come up as /dev/nvme1n1, but include generic code from
# We are expecting the device to come up as /dev/nvme1n1, but include generic code from
# https://github.com/slavivanov/ec2-spotter/blob/master/ec2spotter-remount-root
while true; do
if lsblk /dev/nvme1n1; then
Expand All @@ -100,5 +107,5 @@ if ! file -s $BLKDEVICE | grep -q ext4; then
fi

# Create a mount point and mount the volume
mkdir -p /var/lib/docker/volumes
mount $BLKDEVICE /var/lib/docker/volumes
mkdir -p /var/lib/docker
mount $BLKDEVICE /var/lib/docker

0 comments on commit 5dcbd75

Please sign in to comment.