Skip to content

Commit

Permalink
chore(ci): use 128 cores for x86 and add timeouts (#5665)
Browse files Browse the repository at this point in the history
ARM stays at 64 cores because it does less work. If we take (much) longer than ARM,
it's a bad sign.
  • Loading branch information
ludamad committed Apr 11, 2024
1 parent ea3884e commit 0c5dc0a
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 31 deletions.
59 changes: 37 additions & 22 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ concurrency:
cancel-in-progress: true
jobs:
# Start cheap (~1/8th the cost of on demand, ~1/13th the cost of large GA runners) spot builders
# just for the CI job. These are specced per user and run the entire CI.
# just for the CI job. These are specced per user and run the entire CI.
# TODO These have a persistent EBS volume that forms a fast-online docker image cache (used by Earthly), meaning
# TODO build steps that ran in previous invocations are quickly run from cache.
start-builder:
Expand All @@ -30,8 +30,8 @@ jobs:
strategy:
matrix:
config:
- {ec2_instance_type: m6a.16xlarge, runner_concurrency: 50, ec2_ami_id: ami-04d8422a9ba4de80f, runner_label_suffix: x86}
- {ec2_instance_type: r6g.16xlarge, runner_concurrency: 2, ec2_ami_id: ami-0d8a9b0419ddb331a, runner_label_suffix: arm}
- {ec2_instance_type: m6a.32xlarge, runner_concurrency: 50, ec2_ami_id: ami-04d8422a9ba4de80f, runner_label_suffix: x86}
- {ec2_instance_type: r6g.16xlarge, runner_concurrency: 8, ec2_ami_id: ami-0d8a9b0419ddb331a, runner_label_suffix: arm}
steps:
- name: Start EC2 runner
id: start-ec2-runner
Expand Down Expand Up @@ -60,6 +60,8 @@ jobs:
# prevents concurrency issues with multiple (implicit) earthly bootstraps
setup-arm:
needs: start-builder
timeout-minutes: 5
if: ${{ github.event.inputs.just_start_spot != 'true' }}
runs-on: ${{ github.actor }}-arm
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
Expand All @@ -74,7 +76,7 @@ jobs:
build-arm:
needs: setup-arm
runs-on: ${{ github.actor }}-arm
if: ${{ github.event.inputs.just_start_spot != 'true' }}
timeout-minutes: 25
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
steps:
Expand All @@ -87,6 +89,7 @@ jobs:
e2e-arm:
needs: build-arm
runs-on: ${{ github.actor }}-arm
timeout-minutes: 15
strategy:
fail-fast: false
matrix:
Expand All @@ -106,7 +109,9 @@ jobs:
# prevents concurrency issues with multiple (implicit) earthly bootstraps
setup-x86:
needs: start-builder
if: ${{ github.event.inputs.just_start_spot != 'true' }}
runs-on: ${{ github.actor }}-x86
timeout-minutes: 5
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
Expand All @@ -120,12 +125,20 @@ jobs:
build-x86:
needs: setup-x86
runs-on: ${{ github.actor }}-x86
if: ${{ github.event.inputs.just_start_spot != 'true' }}
timeout-minutes: 25
outputs:
e2e_list: ${{ steps.e2e_list.outputs.list }}
steps:
- {uses: actions/checkout@v4, with: { ref: "${{ github.event.pull_request.head.sha }}"}}
- {uses: ./.github/ci-setup-action, with: { dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"}}
# Only allow one memory-hungry prover test to use this runner
# As detailed in https://github.com/ben-z/gh-action-mutex
# things do not become 'pending' in github actions, and instead just cancel one another
# so we can't use the native concurrency in GA
- name: Set up mutex
uses: ben-z/gh-action-mutex@v1.0.0-alpha.9
with:
branch: gh-action-mutex-build-x86-${{ github.actor }}
# prepare images locally, tagged by commit hash
- run: earthly ./yarn-project+export-end-to-end
# We base our e2e list used in e2e-x86 off the targets in ./yarn-project/end-to-end
Expand All @@ -138,6 +151,7 @@ jobs:
e2e-x86:
needs: build-x86
runs-on: ${{ github.actor }}-x86
timeout-minutes: 15
strategy:
fail-fast: false
matrix:
Expand All @@ -153,45 +167,44 @@ jobs:
# run: BRANCH=${{ github.ref_name }} PULL_REQUEST=${{ github.event.number }} scripts/ci/upload_logs_to_s3 ./yarn-project/end-to-end/log

# barretenberg (prover) native tests
# only run on x86 for resource reasons (memory intensive)
bb-native-tests:
needs: setup-x86
runs-on: ${{ github.actor }}-${{matrix.environment}}
runs-on: ${{ github.actor }}-x86
timeout-minutes: 15
strategy:
fail-fast: false
matrix:
environment: [x86]
# pending fix for intermittent test
# environment: [x86, arm]
steps:
- {uses: actions/checkout@v4, with: { ref: "${{ github.event.pull_request.head.sha }}"}}
- {uses: ./.github/ci-setup-action, with: { dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"}}
# # Only allow one memory-hunger prover test to use this runner
# # As detailed in https://github.com/ben-z/gh-action-mutex
# # things do not become 'pending' in github actions, and instead just cancel one another
# # so we can't use the native concurrency in GA
# - name: Set up mutex
# uses: ben-z/gh-action-mutex@v1.0.0-alpha.9
# with:
# branch: gh-action-mutex-bench-${{ github.actor }}
# Only allow one memory-hungry prover test to use this runner
# As detailed in https://github.com/ben-z/gh-action-mutex
# things do not become 'pending' in github actions, and instead just cancel one another
# so we can't use the native concurrency in GA
- name: Set up mutex
uses: ben-z/gh-action-mutex@v1.0.0-alpha.9
with:
branch: gh-action-mutex-bench-${{ github.actor }}
- working-directory: ./barretenberg/cpp/
run: earthly --no-output +test

# push benchmarking binaries to dockerhub registry
bb-bench-base:
bb-bench-binaries:
runs-on: ${{ github.actor }}-x86
timeout-minutes: 15
needs: setup-x86
steps:
- {uses: actions/checkout@v4, with: { ref: "${{ github.event.pull_request.head.sha }}"}}
- {uses: ./.github/ci-setup-action, with: { dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"}}
- name: Build and Push Binaries
if: ${{ github.event.inputs.just_start_spot != 'true' }}
working-directory: ./barretenberg/cpp/
run: earthly --push +bench-base
run: earthly --push +bench-binaries

start-bb-bench-runner:
timeout-minutes: 5
# We wait for binaries to be done for kickoff
needs: bb-bench-base
needs: bb-bench-binaries
name: Start Bench Runner
runs-on: ubuntu-latest
permissions:
Expand Down Expand Up @@ -223,6 +236,7 @@ jobs:
setup-bb-bench:
runs-on: ${{ github.actor }}-bench-x86
needs: start-bb-bench-runner
timeout-minutes: 5
if: ${{ github.event.inputs.just_start_spot != 'true' }}
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
Expand All @@ -237,6 +251,7 @@ jobs:
bb-bench:
runs-on: ${{ github.actor }}-bench-x86
needs: setup-bb-bench
timeout-minutes: 15
steps:
- {uses: actions/checkout@v4, with: { ref: "${{ github.event.pull_request.head.sha }}"}}
- {uses: ./.github/ci-setup-action, with: { dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"}}
Expand All @@ -251,7 +266,7 @@ jobs:

# # Post actions, deploy and summarize logs
# aztec-bench-summary:
# runs-on: ${{ github.actor }}
# runs-on: ${{ github.actor }}
# # IMPORTANT security flaw if we don't need 'check-run-condition'
# needs: e2e-x86
# concurrency:
Expand Down
42 changes: 33 additions & 9 deletions barretenberg/cpp/Earthfile
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ source:
COPY --dir src/barretenberg src/CMakeLists.txt src
# cmake source
COPY --dir cmake CMakeLists.txt CMakePresets.json .
RUN touch hey
# for debugging rebuilds
RUN echo CONTENT HASH $(find . -type f -exec sha256sum {} ';' | sort | sha256sum | awk '{print $1}') | tee .content-hash

Expand Down Expand Up @@ -162,30 +163,33 @@ preset-wasm-bench:
preset-release-assert-test:
FROM +preset-release-assert
# build all targets for tests
RUN cmake --build build
RUN cmake --build build
SAVE ARTIFACT build/bin

# Sent to the bench runner using a earthly-cloud build x86 --push +bench-base --bench_mode=true
# then we can run earthly-cloud bench x86 +bench-ultra-honk etc
bench-base:
# Sent to the bench runner using an earthly --push +bench-binaries
# then we can run earthly +bench-ultra-honk --bench_mode=cache
bench-binaries:
ARG EARTHLY_GIT_HASH
ARG TARGETARCH
ARG bench_mode=build
LOCALLY
IF [ $bench_mode = cache ]
FROM aztecprotocol/bb-bench-base:$TARGETARCH-$EARTHLY_GIT_HASH
FROM aztecprotocol/bb-bench-binaries:$TARGETARCH-$EARTHLY_GIT_HASH
SAVE ARTIFACT ./*
ELSE
FROM +source
FROM scratch
COPY +preset-op-count-time-bench/bin/*_bench op-count-time/bin/
COPY +preset-op-count-bench/bin/*_bench op-count/bin/
COPY +preset-release-bench/bin/*_bench release/bin/
COPY +preset-wasm-bench/bin/*_bench wasm/bin/
SAVE IMAGE --push aztecprotocol/bb-bench-base:$TARGETARCH-$EARTHLY_GIT_HASH
SAVE ARTIFACT ./*
SAVE IMAGE --push aztecprotocol/bb-bench-binaries:$TARGETARCH-$EARTHLY_GIT_HASH
END

# Runs on the bench image, sent from the builder runner
bench-ultra-honk:
FROM +bench-base
FROM +source
COPY --dir +bench-binaries/* .
# install SRS needed for proving
COPY --dir ./srs_db/+build/. srs_db
RUN cd release && ./bin/ultra_honk_bench --benchmark_filter="construct_proof_ultrahonk_power_of_2/20$"
Expand All @@ -195,7 +199,8 @@ bench-ultra-honk:
RUN cd wasm && wasmtime run --env HARDWARE_CONCURRENCY=16 -Wthreads=y -Sthreads=y --dir=".." ./bin/ultra_honk_bench --benchmark_filter="construct_proof_ultrahonk_power_of_2/20$"

bench-client-ivc:
FROM +bench-base
FROM +source
COPY --dir +bench-binaries/* .
# install SRS needed for proving
COPY --dir ./srs_db/+build/. srs_db
RUN cd release && ./bin/client_ivc_bench --benchmark_filter="ClientIVCBench/Full/6$"
Expand All @@ -204,13 +209,32 @@ bench-client-ivc:
COPY +wasmtime/wasmtime /usr/bin/wasmtime
RUN cd wasm && wasmtime run --env HARDWARE_CONCURRENCY=16 -Wthreads=y -Sthreads=y --dir=".." ./bin/client_ivc_bench --benchmark_filter="ClientIVCBench/Full/6$"

# Sent to the bench runner using an earthly --push +test-binaries
# then we can run earthly +test --test_mode=cache
test-binaries:
# Packages the compiled *_tests binaries as an image and as a `build` artifact.
# EARTHLY_GIT_HASH and TARGETARCH together form the image tag; test_mode
# (default: build) selects which branch below is taken.
ARG EARTHLY_GIT_HASH
ARG TARGETARCH
ARG test_mode=build
LOCALLY
IF [ $test_mode = cache ]
# test_mode=cache: start from the image carrying this arch-and-hash tag (the
# same tag the build branch pushes) and expose its `build` directory.
FROM aztecprotocol/bb-test-binaries:$TARGETARCH-$EARTHLY_GIT_HASH
SAVE ARTIFACT build
ELSE
# Any other test_mode: start from an empty image, copy in the *_tests binaries
# produced by +preset-release-assert-test, expose `build` as an artifact, and
# push the resulting image under the arch-and-hash tag.
FROM scratch
COPY +preset-release-assert-test/bin/*_tests build/bin/
SAVE ARTIFACT build
SAVE IMAGE --push aztecprotocol/bb-test-binaries:$TARGETARCH-$EARTHLY_GIT_HASH
END

test-clang-format:
# Formatting check: starts from the +source target, adds the clang-format
# configuration and the format script, then runs the script with the `check`
# argument.
FROM +source
COPY .clang-format .
COPY format.sh .
RUN ./format.sh check

test:
FROM +source
COPY --dir +test-binaries/build build
BUILD +test-clang-format
FROM +preset-release-assert-test
COPY --dir ./srs_db/+build/. srs_db
Expand Down

0 comments on commit 0c5dc0a

Please sign in to comment.