Skip to content

Commit

Permalink
feat: benchmark Protogalaxy rounds (#4316)
Browse files Browse the repository at this point in the history
Splits folding into multiple functions for benchmarking.

Parallelizes the loop in compute_full_honk_evaluations, which results in
a 5x speedup in the perturbator round.

Before optimization:
```
Benchmarking lock created at ~/BENCHMARK_IN_PROGRESS.
protogalaxy_bench                                                             100% 8050KB  37.1MB/s   00:00    
2024-02-09T21:51:45+00:00
Running ./protogalaxy_bench
Run on (16 X 3000 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 1024 KiB (x8)
  L3 Unified 36608 KiB (x1)
Load Average: 0.00, 0.00, 0.00
------------------------------------------------------
Benchmark            Time             CPU   Iterations
------------------------------------------------------
fold_one/14        311 ms          302 ms            2
fold_one/15        601 ms          582 ms            1
fold_one/16       1159 ms         1125 ms            1
fold_one/17       2317 ms         2248 ms            1
fold_one/18       4578 ms         4423 ms            1
fold_one/19       8997 ms         8694 ms            1
fold_one/20      17837 ms        17273 ms            1
```

After optimization (comparison only with UltraHonk) and addition of new
benchmarks:
```
Benchmarking lock created at ~/BENCHMARK_IN_PROGRESS.
protogalaxy_bench                                                             100% 8038KB  36.8MB/s   00:00    
2024-02-09T21:54:16+00:00
Running ./protogalaxy_bench
Run on (16 X 3000 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 1024 KiB (x8)
  L3 Unified 36608 KiB (x1)
Load Average: 1.06, 1.10, 0.47
---------------------------------------------------------------------------
Benchmark                                 Time             CPU   Iterations
---------------------------------------------------------------------------
fold_one<UltraComposer>/14              234 ms          223 ms            3
fold_one<UltraComposer>/15              438 ms          418 ms            2
fold_one<UltraComposer>/16              836 ms          800 ms            1
fold_one<UltraComposer>/17             1657 ms         1585 ms            1
fold_one<UltraComposer>/18             3316 ms         3131 ms            1
fold_one<UltraComposer>/19             6471 ms         6131 ms            1
fold_one<UltraComposer>/20            12638 ms        12022 ms            1
fold_one<GoblinUltraComposer>/14        633 ms          553 ms            1
fold_one<GoblinUltraComposer>/15       1175 ms         1069 ms            1
fold_one<GoblinUltraComposer>/16       2271 ms         2092 ms            1
fold_one<GoblinUltraComposer>/17       4794 ms         4218 ms            1
fold_one<GoblinUltraComposer>/18       9317 ms         8415 ms            1
fold_one<GoblinUltraComposer>/19      18220 ms        16743 ms            1
fold_one<GoblinUltraComposer>/20      38419 ms        34026 ms            1
```

The round benchmarks:
```
Benchmarking lock created at ~/BENCHMARK_IN_PROGRESS.
protogalaxy_round_bench                                                                                                                                                                                                                                                                                    100% 8049KB  40.3MB/s   00:00    
2024-02-12T21:51:43+00:00
Running ./protogalaxy_round_bench
Run on (16 X 3000 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 1024 KiB (x8)
  L3 Unified 36608 KiB (x1)
Load Average: 0.26, 1.03, 0.92
-----------------------------------------------------------------------------------------
Benchmark                                               Time             CPU   Iterations
-----------------------------------------------------------------------------------------
bench_round_goblin_ultra/preparation/14               345 ms          234 ms            3
bench_round_goblin_ultra/preparation/15               575 ms          446 ms            2
bench_round_goblin_ultra/preparation/16              1045 ms          861 ms            1
bench_round_goblin_ultra/preparation/17              2376 ms         1606 ms            1
bench_round_goblin_ultra/preparation/18              4161 ms         3160 ms            1
bench_round_goblin_ultra/preparation/19              7819 ms         6411 ms            1
bench_round_goblin_ultra/preparation/20             18265 ms        12433 ms            1
bench_round_goblin_ultra/perturbator/14              39.2 ms         39.0 ms           18
bench_round_goblin_ultra/perturbator/15              78.8 ms         78.3 ms            9
bench_round_goblin_ultra/perturbator/16               160 ms          159 ms            4
bench_round_goblin_ultra/perturbator/17               325 ms          324 ms            2
bench_round_goblin_ultra/perturbator/18               660 ms          657 ms            1
bench_round_goblin_ultra/perturbator/19              1327 ms         1319 ms            1
bench_round_goblin_ultra/perturbator/20              2680 ms         2659 ms            1
bench_round_goblin_ultra/combiner_quotient/14         189 ms          187 ms            4
bench_round_goblin_ultra/combiner_quotient/15         380 ms          372 ms            2
bench_round_goblin_ultra/combiner_quotient/16         748 ms          742 ms            1
bench_round_goblin_ultra/combiner_quotient/17        1509 ms         1488 ms            1
bench_round_goblin_ultra/combiner_quotient/18        3017 ms         2998 ms            1
bench_round_goblin_ultra/combiner_quotient/19        6024 ms         5978 ms            1
bench_round_goblin_ultra/combiner_quotient/20       12055 ms        11989 ms            1
bench_round_goblin_ultra/accumulator_update/14       89.0 ms         89.0 ms            8
bench_round_goblin_ultra/accumulator_update/15        169 ms          169 ms            4
bench_round_goblin_ultra/accumulator_update/16        327 ms          327 ms            2
bench_round_goblin_ultra/accumulator_update/17        686 ms          686 ms            1
bench_round_goblin_ultra/accumulator_update/18       1371 ms         1371 ms            1
bench_round_goblin_ultra/accumulator_update/19       3102 ms         3102 ms            1
bench_round_goblin_ultra/accumulator_update/20       6552 ms         6552 ms            1
```

---------

Co-authored-by: lucasxia01 <lucasxia01@gmail.com>
Co-authored-by: ludamad <adam@aztecprotocol.com>
  • Loading branch information
3 people committed Feb 15, 2024
1 parent a2942b7 commit 91af28d
Show file tree
Hide file tree
Showing 15 changed files with 240 additions and 67 deletions.
17 changes: 17 additions & 0 deletions barretenberg/cpp/CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,18 @@
"DISABLE_ASM": "ON"
}
},
{
"name": "clang16-dbg-fast",
"displayName": "Optimized debug build with Clang-16",
"description": "Build with globally installed Clang-16 in optimized debug mode",
"inherits": "clang16-dbg",
"environment": {
"CMAKE_BUILD_TYPE": "Debug",
"CFLAGS": "-O2 -gdwarf",
"CXXFLAGS": "-O2 -gdwarf-4",
"LDFLAGS": "-O2 -gdwarf-4"
}
},
{
"name": "asan",
"displayName": "Debugging build with address sanitizer on Clang-16",
Expand Down Expand Up @@ -339,6 +351,11 @@
"inherits": "default",
"configurePreset": "clang16-dbg"
},
{
"name": "clang16-dbg-fast",
"inherits": "default",
"configurePreset": "clang16-dbg-fast"
},
{
"name": "asan",
"inherits": "default",
Expand Down
1 change: 1 addition & 0 deletions barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ add_subdirectory(ivc_bench)
add_subdirectory(pippenger_bench)
add_subdirectory(plonk_bench)
add_subdirectory(protogalaxy_bench)
add_subdirectory(protogalaxy_rounds_bench)
add_subdirectory(relations_bench)
add_subdirectory(widgets_bench)
add_subdirectory(poseidon2_bench)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,37 +7,44 @@
using namespace benchmark;

namespace bb {
using Flavor = UltraFlavor;
using Instance = ProverInstance_<Flavor>;
using Instances = ProverInstances_<Flavor, 2>;
using ProtoGalaxyProver = ProtoGalaxyProver_<Instances>;
using Builder = Flavor::CircuitBuilder;

// Fold one instance into an accumulator.
void fold_one(State& state) noexcept
template <typename Composer> void fold_one(State& state) noexcept
{
using Flavor = typename Composer::Flavor;
using Instance = ProverInstance_<Flavor>;
using Instances = ProverInstances_<Flavor, 2>;
using ProtoGalaxyProver = ProtoGalaxyProver_<Instances>;
using Builder = typename Flavor::CircuitBuilder;

bb::srs::init_crs_factory("../srs_db/ignition");

auto log2_num_gates = static_cast<size_t>(state.range(0));
auto composer = UltraComposer();
Composer composer;

const auto construct_instance = [&]() {
Builder builder;
bb::mock_proofs::generate_basic_arithmetic_circuit(builder, log2_num_gates);
if constexpr (std::same_as<Flavor, GoblinUltraFlavor>) {
GoblinMockCircuits::construct_arithmetic_circuit(builder, log2_num_gates);
} else {
static_assert(std::same_as<Flavor, UltraFlavor>);
bb::mock_proofs::generate_basic_arithmetic_circuit(builder, log2_num_gates);
}
return composer.create_instance(builder);
};

std::shared_ptr<Instance> instance_1 = construct_instance();
std::shared_ptr<Instance> instance_2 = construct_instance();

auto folding_prover = composer.create_folding_prover({ instance_1, instance_2 });
ProtoGalaxyProver folding_prover = composer.create_folding_prover({ instance_1, instance_2 });

for (auto _ : state) {
auto proof = folding_prover.fold_instances();
}
}

BENCHMARK(fold_one)->/* vary the circuit size */ DenseRange(14, 20)->Unit(kMillisecond);
BENCHMARK(fold_one<UltraComposer>)->/* vary the circuit size */ DenseRange(14, 20)->Unit(kMillisecond);
BENCHMARK(fold_one<GoblinUltraComposer>)->/* vary the circuit size */ DenseRange(14, 20)->Unit(kMillisecond);
} // namespace bb

BENCHMARK_MAIN();
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
barretenberg_module(protogalaxy_rounds_bench ultra_honk protogalaxy stdlib_primitives)
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
barretenberg_module(protogalaxy_round_bench ultra_honk protogalaxy stdlib_primitives)
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#include <benchmark/benchmark.h>

#include "barretenberg/benchmark/ultra_bench/mock_proofs.hpp"
#include "barretenberg/proof_system/circuit_builder/ultra_circuit_builder.hpp"
#include "barretenberg/ultra_honk/ultra_composer.hpp"

using namespace benchmark;

namespace bb {

/**
 * @brief Benchmark a single ProtoGalaxy prover round on a two-instance folding prover.
 *
 * @details Constructs two circuits whose size is derived from state.range(0), turns them into prover instances,
 * creates a folding prover over both, fills in the prover's accumulator, deltas, perturbator and transcript, runs the
 * preparation round once, and then calls F on that prover once per benchmark iteration.
 *
 * @tparam Composer Composer whose Flavor selects the circuit builder and the mock circuit. Only UltraFlavor and
 * GoblinUltraFlavor are accepted; any other flavor trips the static_assert below.
 * @param state The google benchmark state; state.range(0) is interpreted as log2 of the number of gates.
 * @param F The round under measurement, invoked on the prepared folding prover inside the timed loop.
 */
template <typename Composer>
void _bench_round(::benchmark::State& state,
                  void (*F)(ProtoGalaxyProver_<ProverInstances_<typename Composer::Flavor, 2>>&))
{
    using Flavor = typename Composer::Flavor;
    using Instance = ProverInstance_<Flavor>;
    using Builder = typename Flavor::CircuitBuilder;

    bb::srs::init_crs_factory("../srs_db/ignition");
    const auto log2_num_gates = static_cast<size_t>(state.range(0));
    // Shift a size_t, not an int literal, so the size is computed at full width and cannot overflow int.
    const size_t num_gates = size_t{ 1 } << log2_num_gates;
    auto composer = Composer();

    // Builds a fresh circuit of the requested size and wraps it in a prover instance.
    const auto construct_instance = [&]() {
        Builder builder;
        if constexpr (std::same_as<Flavor, GoblinUltraFlavor>) {
            GoblinMockCircuits::construct_arithmetic_circuit(builder, log2_num_gates);
        } else {
            static_assert(std::same_as<Flavor, UltraFlavor>);
            bb::mock_proofs::generate_basic_arithmetic_circuit(builder, log2_num_gates);
        }
        return composer.create_instance(builder);
    };

    std::shared_ptr<Instance> instance_1 = construct_instance();
    std::shared_ptr<Instance> instance_2 = construct_instance();

    auto folding_prover = composer.create_folding_prover({ instance_1, instance_2 });

    // prepare the prover state
    folding_prover.state.accumulator = instance_1;
    folding_prover.state.deltas.resize(log2_num_gates);
    std::fill_n(folding_prover.state.deltas.begin(), log2_num_gates, 0);
    folding_prover.state.perturbator = Flavor::Polynomial::random(num_gates);
    folding_prover.transcript = Flavor::Transcript::prover_init_empty();
    folding_prover.preparation_round();

    // Only the selected round is executed inside the timed loop.
    for (auto _ : state) {
        F(folding_prover);
    }
}

/**
 * @brief Non-template entry point that runs _bench_round with the UltraComposer.
 * @param state The google benchmark state, forwarded unchanged.
 * @param F The prover round to measure, forwarded unchanged.
 */
void bench_round_ultra(::benchmark::State& state, void (*F)(ProtoGalaxyProver_<ProverInstances_<UltraFlavor, 2>>&))
{
    using Composer = UltraComposer;
    _bench_round<Composer>(state, F);
}

/**
 * @brief Non-template entry point that runs _bench_round with the GoblinUltraComposer.
 * @param state The google benchmark state, forwarded unchanged.
 * @param F The prover round to measure, forwarded unchanged.
 */
void bench_round_goblin_ultra(::benchmark::State& state,
                              void (*F)(ProtoGalaxyProver_<ProverInstances_<GoblinUltraFlavor, 2>>&))
{
    using Composer = GoblinUltraComposer;
    _bench_round<Composer>(state, F);
}

// Register one benchmark per ProtoGalaxy round for the Ultra flavor. Each captureless generic lambda is passed as the
// function-pointer argument F of bench_round_ultra and calls exactly one round on the prover.
// DenseRange(14, 20) supplies every value from 14 to 20 as state.range(0); times are reported in milliseconds.
BENCHMARK_CAPTURE(bench_round_ultra, preparation, [](auto& prover) { prover.preparation_round(); })
-> DenseRange(14, 20) -> Unit(kMillisecond);
BENCHMARK_CAPTURE(bench_round_ultra, perturbator, [](auto& prover) { prover.perturbator_round(); })
-> DenseRange(14, 20) -> Unit(kMillisecond);
BENCHMARK_CAPTURE(bench_round_ultra, combiner_quotient, [](auto& prover) { prover.combiner_quotient_round(); })
-> DenseRange(14, 20) -> Unit(kMillisecond);
BENCHMARK_CAPTURE(bench_round_ultra, accumulator_update, [](auto& prover) { prover.accumulator_update_round(); })
-> DenseRange(14, 20) -> Unit(kMillisecond);

// Register the same four per-round benchmarks for the Goblin Ultra flavor. Each captureless generic lambda is passed
// as the function-pointer argument F of bench_round_goblin_ultra and calls exactly one round on the prover.
// DenseRange(14, 20) supplies every value from 14 to 20 as state.range(0); times are reported in milliseconds.
BENCHMARK_CAPTURE(bench_round_goblin_ultra, preparation, [](auto& prover) { prover.preparation_round(); })
-> DenseRange(14, 20) -> Unit(kMillisecond);
BENCHMARK_CAPTURE(bench_round_goblin_ultra, perturbator, [](auto& prover) { prover.perturbator_round(); })
-> DenseRange(14, 20) -> Unit(kMillisecond);
BENCHMARK_CAPTURE(bench_round_goblin_ultra, combiner_quotient, [](auto& prover) { prover.combiner_quotient_round(); })
-> DenseRange(14, 20) -> Unit(kMillisecond);
BENCHMARK_CAPTURE(bench_round_goblin_ultra, accumulator_update, [](auto& prover) { prover.accumulator_update_round(); })
-> DenseRange(14, 20) -> Unit(kMillisecond);

} // namespace bb

BENCHMARK_MAIN();
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,16 @@ inline UltraProver get_prover(UltraComposer& composer,
return composer.create_prover(instance);
}

/**
 * @brief Populate a Goblin Ultra circuit with the supplied function and return a prover for it.
 * @param composer Composer used to create both the instance and the prover.
 * @param test_circuit_function Callback that fills the circuit builder; it receives the builder and num_iterations.
 * @param num_iterations Value forwarded unchanged to test_circuit_function.
 * @return The prover the composer creates from the instance built over the populated circuit.
 */
inline GoblinUltraProver get_prover(GoblinUltraComposer& composer,
                                    void (*test_circuit_function)(GoblinUltraComposer::CircuitBuilder&, size_t),
                                    size_t num_iterations)
{
    using CircuitBuilderType = GoblinUltraComposer::CircuitBuilder;
    using InstanceType = GoblinUltraComposer::Instance;

    CircuitBuilderType circuit;
    test_circuit_function(circuit, num_iterations);

    std::shared_ptr<InstanceType> prover_instance = composer.create_instance(circuit);
    return composer.create_prover(prover_instance);
}

// standard plonk
inline plonk::Prover get_prover(plonk::StandardComposer& composer,
void (*test_circuit_function)(StandardCircuitBuilder&, size_t),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@ enum {
};

/**
* @details Benchmark ultrahonk by performing all the rounds, but only measuring one.
* @details Benchmark Goblin ultrahonk by performing all the rounds, but only measuring one.
* Note: As a result the very short rounds take a long time for statistical significance, so recommended to set their
* iterations to 1.
* @param state - The google benchmark state.
* @param prover - The ultrahonk prover.
* @param prover - The Goblin ultrahonk prover.
* @param index - The pass to measure.
**/
BB_PROFILE static void test_round_inner(State& state, UltraProver& prover, size_t index) noexcept
BB_PROFILE static void test_round_inner(State& state, GoblinUltraProver& prover, size_t index) noexcept
{
auto time_if_index = [&](size_t target_index, auto&& func) -> void {
BB_REPORT_OP_COUNT_IN_BENCH(state);
Expand All @@ -55,25 +55,23 @@ BB_PROFILE static void test_round_inner(State& state, UltraProver& prover, size_
}
BB_PROFILE static void test_round(State& state, size_t index) noexcept
{
auto log2_num_gates = static_cast<size_t>(state.range(0));
bb::srs::init_crs_factory("../srs_db/ignition");

GoblinUltraComposer composer;
// TODO(https://github.com/AztecProtocol/barretenberg/issues/761) benchmark both sparse and dense circuits
GoblinUltraProver prover = bb::mock_proofs::get_prover(
composer, &bb::mock_proofs::generate_basic_arithmetic_circuit<GoblinUltraCircuitBuilder>, log2_num_gates);
for (auto _ : state) {
state.PauseTiming();
UltraComposer composer;
// TODO(https://github.com/AztecProtocol/barretenberg/issues/761) benchmark both sparse and dense circuits
UltraProver prover = bb::mock_proofs::get_prover(
composer, &bb::stdlib::generate_ecdsa_verification_test_circuit<UltraCircuitBuilder>, 10);
test_round_inner(state, prover, index);
state.ResumeTiming();
// NOTE: google bench is very finicky, must end in ResumeTiming() for correctness
}
}
#define ROUND_BENCHMARK(round) \
static void ROUND_##round(State& state) noexcept \
{ \
test_round(state, round); \
} \
BENCHMARK(ROUND_##round)->Unit(kMillisecond)
BENCHMARK(ROUND_##round)->DenseRange(17, 19)->Unit(kMillisecond)

// Fast rounds take a long time to benchmark because of how we compute statistical significance.
// Limit to one iteration so we don't spend a lot of time redoing full proofs just to measure this part.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,10 @@ class ClientIVCTests : public ::testing::Test {
* @details Currently default sized to 2^16 to match kernel. (Note: op gates will bump size to next power of 2)
*
*/
static Builder create_mock_circuit(ClientIVC& ivc, size_t num_gates = 1 << 15)
static Builder create_mock_circuit(ClientIVC& ivc, size_t log2_num_gates = 15)
{
Builder circuit{ ivc.goblin.op_queue };
GoblinMockCircuits::construct_arithmetic_circuit(circuit, num_gates);
GoblinMockCircuits::construct_arithmetic_circuit(circuit, log2_num_gates);
GoblinMockCircuits::construct_goblin_ecc_op_circuit(circuit);
return circuit;
}
Expand Down
15 changes: 15 additions & 0 deletions barretenberg/cpp/src/barretenberg/flavor/goblin_ultra.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,21 @@ class GoblinUltraFlavor {
: NativeTranscript(proof)
{}

/**
 * @brief Create a fresh prover transcript that already contains a single "Init" element.
 * @return Shared pointer to the new transcript after an arbitrary uint32_t value has been sent under the label
 * "Init".
 */
static std::shared_ptr<Transcript_> prover_init_empty()
{
    constexpr uint32_t init_value{ 42 }; // arbitrary
    auto prover_transcript = std::make_shared<Transcript_>();
    prover_transcript->send_to_verifier("Init", init_value);
    return prover_transcript;
}

/**
 * @brief Create a verifier transcript from a prover transcript's proof data and consume its "Init" element.
 * @param transcript Transcript whose proof_data is used to construct the verifier transcript.
 * @return Shared pointer to the new verifier transcript after the uint32_t labelled "Init" has been received.
 */
static std::shared_ptr<Transcript_> verifier_init_empty(const std::shared_ptr<Transcript_>& transcript)
{
    auto verifier_transcript = std::make_shared<Transcript_>(transcript->proof_data);
    // The received value itself is not needed; it is read and deliberately discarded.
    static_cast<void>(verifier_transcript->template receive_from_prover<uint32_t>("Init"));
    return verifier_transcript;
}

void deserialize_full_transcript()
{
// take current proof and put them into the struct
Expand Down
22 changes: 12 additions & 10 deletions barretenberg/cpp/src/barretenberg/goblin/mock_circuits.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,9 @@ class GoblinMockCircuits {
* @param builder
* @param log2_num_gates Base-2 logarithm of the number of gates to construct (num_gates = 1 << log2_num_gates)
*/
static void construct_arithmetic_circuit(GoblinUltraBuilder& builder, size_t num_gates = 1)
static void construct_arithmetic_circuit(GoblinUltraBuilder& builder, size_t log2_num_gates = 0)
{
size_t num_gates = 1 << log2_num_gates;
// For good measure, include a gate with some public inputs
{
FF a = FF::random_element();
Expand All @@ -53,17 +54,18 @@ class GoblinMockCircuits {

builder.create_big_add_gate({ a_idx, b_idx, c_idx, d_idx, FF(1), FF(1), FF(1), FF(-1), FF(0) });
}

// Add arbitrary arithmetic gates to obtain a total of num_gates-many gates
for (size_t i = 0; i < num_gates - 1; ++i) {
FF a = FF::random_element();
FF b = FF::random_element();
FF c = FF::random_element();
FF d = a + b + c;
uint32_t a_idx = builder.add_variable(a);
uint32_t b_idx = builder.add_variable(b);
uint32_t c_idx = builder.add_variable(c);
uint32_t d_idx = builder.add_variable(d);
FF a = FF::random_element();
FF b = FF::random_element();
FF c = FF::random_element();
FF d = a + b + c;
uint32_t a_idx = builder.add_variable(a);
uint32_t b_idx = builder.add_variable(b);
uint32_t c_idx = builder.add_variable(c);
uint32_t d_idx = builder.add_variable(d);

for (size_t i = 0; i < num_gates - 1; ++i) {
builder.create_big_add_gate({ a_idx, b_idx, c_idx, d_idx, FF(1), FF(1), FF(1), FF(-1), FF(0) });
}
}
Expand Down
Loading

0 comments on commit 91af28d

Please sign in to comment.