integrate assembly implementations into the blake3 crate

oconnor663 committed Feb 11, 2020
1 parent b6b3c27 commit efbfa0463c793dc1319db10ca4e3b809937b227d
@@ -24,22 +24,30 @@ jobs:
toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }}
profile: minimal
override: true
# Default tests.
- run: cargo test
# No-default-features tests.
# Default tests plus Rayon.
- run: cargo test --features=rayon
# no_std tests.
- run: cargo test --no-default-features
# More features tests. Note that "c_avx512" participates in dynamic feature
# detection, so it'll be built, but it probably won't run.
- run: cargo test --features=c_avx512,rayon
# Test the x86 assembly implementations. Use -vv to log compiler commands.
- run: cargo test --features=c -vv
# Test the C intrinsics implementations. Use -vv to log compiler commands.
- run: cargo test --features=c,c_prefer_intrinsics -vv
# Test release mode. This does more iterations in test_fuzz_hasher.
- run: cargo test --release
# Test benchmarks. Nightly only.
- run: cargo test --benches
if: matrix.rust_version == 'nightly'
# Test benchmarks. RUSTC_BOOTSTRAP=1 lets this run on non-nightly toolchains.
- run: cargo test --benches --features=c
env:
RUSTC_BOOTSTRAP: 1
# Test vectors.
- name: test vectors
run: cargo test
working-directory: ./test_vectors
- name: test vectors
run: cargo test --features=c
working-directory: ./test_vectors
- name: test vectors
run: cargo test --features=c,c_prefer_intrinsics
working-directory: ./test_vectors
# Test b3sum.
- name: test b3sum
run: cargo test
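
The RUSTC_BOOTSTRAP=1 step above builds the crate's `#[bench]` targets on a non-nightly toolchain. As a rough illustration of what that step compiles (not one of the crate's actual benchmarks; the bench name here is made up), a minimal standalone bench target looks like this:

```rust
// Hypothetical bench target, only to show what the --benches CI step builds.
// RUSTC_BOOTSTRAP=1 lets the unstable `test` crate compile on a stable
// toolchain; it does not change the code itself.
#![feature(test)]
extern crate test;

use test::Bencher;

#[bench]
fn bench_hash_one_block(b: &mut Bencher) {
    let input = [0u8; 64];
    b.iter(|| blake3::hash(&input));
}
```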
@@ -11,10 +11,21 @@ edition = "2018"

[features]
default = ["std"]
# Like SSE4.1 and AVX2, the AVX-512 implementation participates in dynamic CPU
# feature detection. A binary with "c_avx512" on is still cross-platform. This
# feature has no effect on non-x86.
c_avx512 = []
# The "c" feature includes C and assembly SIMD implementations of the
# compression function for x86 platforms, called via FFI. (Currently it has no
# effect on other platforms.) This requires a C toolchain on the build machine.
# This is necessary for AVX-512 support, which is not yet stable in Rust, and
# the assembly implementations also perform better than those using Rust/LLVM
# intrinsics. As with the Rust implementations, these C and assembly
# implementations participate in runtime CPU feature detection, and the
# resulting binary is portable.
c = []
# Normally x86-64 builds prefer assembly implementations over C intrinsics. The
# assembly implementations perform better, perform most consistently across
# compilers, and are much faster to build. However, this feature makes the
# build use the C intrinsics implementations instead. This is mainly for
# testing purposes, and most callers will not want to use it.
c_prefer_intrinsics = []
# The NEON implementation does not participate in dynamic feature detection,
# which is currently x86-only. If "c_neon" is on, NEON support is assumed. Note
# that AArch64 always supports NEON, but support on ARMv7 varies.
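
The feature comments above repeatedly mention runtime CPU feature detection. As a very rough sketch of that general mechanism (the crate's real dispatch lives in its `platform` module and is more involved; the function name below is hypothetical), x86 detection in Rust looks something like this:

```rust
// Illustration of runtime x86 feature detection, not the crate's actual
// dispatch code. The degree values match BLAKE3's SIMD widths: wider vectors
// mean more chunks compressed per call.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn detected_simd_degree() -> usize {
    if is_x86_feature_detected!("avx2") {
        8 // AVX2 compresses 8 chunks at a time
    } else if is_x86_feature_detected!("sse4.1") {
        4 // SSE4.1 compresses 4
    } else {
        1 // portable fallback, one at a time
    }
}
```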
@@ -33,19 +33,18 @@ with BLAKE3.
This repository is the official implementation of BLAKE3. It includes:

* The [`blake3`](https://crates.io/crates/blake3) Rust crate, which
includes optimized SIMD implementations, with dynamic CPU feature
detection on x86. SSE4.1 and AVX2 support are implemented in Rust,
while AVX-512 and ARM NEON support are imported from the C
implementation and controlled by the `c_avx512` and `c_neon` features.
Multi-threading is implemented with
[Rayon](https://github.com/rayon-rs/rayon) and controlled by the
`rayon` feature.
includes optimized SIMD implementations, with runtime CPU feature
detection on x86. SSE4.1 and AVX2 are supported in pure Rust. The `c`
feature enables C/assembly implementations and AVX-512 support. The
`c_neon` feature enables ARM NEON support. Multi-threading is also
supported, and the `rayon` feature provides a
[Rayon](https://github.com/rayon-rs/rayon)-based implementation.

* The [`b3sum`](https://crates.io/crates/b3sum) Rust crate, which
provides a command line interface. You can install it from
[crates.io](https://crates.io/crates/b3sum) with `cargo install
b3sum`. It enables the multi-threading and AVX-512 features of the
`blake3` crate by default.
b3sum`. It enables the `rayon` and `c` features of the `blake3` crate
by default.

* The [C implementation](c), which like the Rust implementation includes
SIMD code and dynamic CPU feature detection on x86. Unlike the Rust
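
For the `blake3` crate described in the README text above, the high-level hashing API is independent of which SIMD backend is selected at runtime. A minimal usage sketch (standard public API, nothing feature-specific):

```rust
// Minimal usage of the blake3 crate's public API. Which SIMD implementation
// runs underneath (Rust intrinsics, C/assembly, or portable) is chosen at
// runtime and does not change this code or its output.
fn main() {
    // One-shot hashing.
    let hash1 = blake3::hash(b"foobarbaz");

    // Incremental hashing over multiple updates.
    let mut hasher = blake3::Hasher::new();
    hasher.update(b"foobar");
    hasher.update(b"baz");
    let hash2 = hasher.finalize();

    assert_eq!(hash1, hash2);
}
```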
@@ -80,9 +79,6 @@ we recommend [Argon2](https://github.com/P-H-C/phc-winner-argon2).*

## Usage

This repository provides the `b3sum` command line utility and the
`blake3` Rust crate.

### The `b3sum` utility

The `b3sum` utility allows you to process files and data from standard
@@ -9,8 +9,8 @@ readme = "README.md"
edition = "2018"

[features]
default = ["c_avx512", "rayon"]
c_avx512 = ["blake3/c_avx512"]
default = ["c", "rayon"]
c = ["blake3/c"]
c_neon = ["blake3/c_neon"]
rayon = ["blake3/rayon", "memmap"]

@@ -4,7 +4,7 @@ extern crate test;

use arrayref::array_ref;
use arrayvec::ArrayVec;
use blake3::platform::MAX_SIMD_DEGREE;
use blake3::platform::{Platform, MAX_SIMD_DEGREE};
use blake3::{BLOCK_LEN, CHUNK_LEN, OUT_LEN};
use rand::prelude::*;
use test::Bencher;
@@ -48,173 +48,149 @@ impl RandomInput {
}
}

type CompressInPlaceFn =
unsafe fn(cv: &mut [u32; 8], block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8);

fn bench_single_compression_fn(b: &mut Bencher, f: CompressInPlaceFn) {
fn bench_single_compression_fn(b: &mut Bencher, platform: Platform) {
let mut state = [1u32; 8];
let mut r = RandomInput::new(b, 64);
let input = array_ref!(r.get(), 0, 64);
unsafe {
b.iter(|| f(&mut state, input, 64 as u8, 0, 0));
}
b.iter(|| platform.compress_in_place(&mut state, input, 64 as u8, 0, 0));
}

#[bench]
fn bench_single_compression_portable(b: &mut Bencher) {
bench_single_compression_fn(b, blake3::portable::compress_in_place);
bench_single_compression_fn(b, Platform::portable());
}

#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_single_compression_sse41(b: &mut Bencher) {
if !blake3::platform::sse41_detected() {
return;
if let Some(platform) = Platform::sse41() {
bench_single_compression_fn(b, platform);
}
bench_single_compression_fn(b, blake3::sse41::compress_in_place);
}

#[bench]
#[cfg(feature = "c_avx512")]
#[cfg(feature = "c")]
fn bench_single_compression_avx512(b: &mut Bencher) {
if !blake3::platform::avx512_detected() {
return;
if let Some(platform) = Platform::avx512() {
bench_single_compression_fn(b, platform);
}
bench_single_compression_fn(b, blake3::c_avx512::compress_in_place);
}

type HashManyFn<A> = unsafe fn(
inputs: &[&A],
key: &[u32; 8],
counter: u64,
increment_counter: blake3::IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
out: &mut [u8],
);

fn bench_many_chunks_fn(b: &mut Bencher, f: HashManyFn<[u8; CHUNK_LEN]>, degree: usize) {
fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) {
let degree = platform.simd_degree();
let mut inputs = Vec::new();
for _ in 0..degree {
inputs.push(RandomInput::new(b, CHUNK_LEN));
}
unsafe {
b.iter(|| {
let input_arrays: ArrayVec<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]> = inputs
.iter_mut()
.take(degree)
.map(|i| array_ref!(i.get(), 0, CHUNK_LEN))
.collect();
let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
f(
&input_arrays[..],
&[0; 8],
0,
blake3::IncrementCounter::Yes,
0,
0,
0,
&mut out,
);
});
}
b.iter(|| {
let input_arrays: ArrayVec<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]> = inputs
.iter_mut()
.take(degree)
.map(|i| array_ref!(i.get(), 0, CHUNK_LEN))
.collect();
let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
platform.hash_many(
&input_arrays[..],
&[0; 8],
0,
blake3::IncrementCounter::Yes,
0,
0,
0,
&mut out,
);
});
}

#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_chunks_sse41(b: &mut Bencher) {
if !blake3::platform::sse41_detected() {
return;
if let Some(platform) = Platform::sse41() {
bench_many_chunks_fn(b, platform);
}
bench_many_chunks_fn(b, blake3::sse41::hash_many, blake3::sse41::DEGREE);
}

#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_chunks_avx2(b: &mut Bencher) {
if !blake3::platform::avx2_detected() {
return;
if let Some(platform) = Platform::avx2() {
bench_many_chunks_fn(b, platform);
}
bench_many_chunks_fn(b, blake3::avx2::hash_many, blake3::avx2::DEGREE);
}

#[bench]
#[cfg(feature = "c_avx512")]
#[cfg(feature = "c")]
fn bench_many_chunks_avx512(b: &mut Bencher) {
if !blake3::platform::avx512_detected() {
return;
if let Some(platform) = Platform::avx512() {
bench_many_chunks_fn(b, platform);
}
bench_many_chunks_fn(b, blake3::c_avx512::hash_many, blake3::c_avx512::DEGREE);
}

#[bench]
#[cfg(feature = "c_neon")]
fn bench_many_chunks_neon(b: &mut Bencher) {
// When "c_neon" is on, NEON support is assumed.
bench_many_chunks_fn(b, blake3::c_neon::hash_many, blake3::c_neon::DEGREE);
if let Some(platform) = Platform::neon() {
bench_many_chunks_fn(b, platform);
}
}

// TODO: When we get const generics we can unify this with the chunks code.
fn bench_many_parents_fn(b: &mut Bencher, f: HashManyFn<[u8; BLOCK_LEN]>, degree: usize) {
fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) {
let degree = platform.simd_degree();
let mut inputs = Vec::new();
for _ in 0..degree {
inputs.push(RandomInput::new(b, BLOCK_LEN));
}
unsafe {
b.iter(|| {
let input_arrays: ArrayVec<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE]> = inputs
.iter_mut()
.take(degree)
.map(|i| array_ref!(i.get(), 0, BLOCK_LEN))
.collect();
let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
f(
&input_arrays[..],
&[0; 8],
0,
blake3::IncrementCounter::No,
0,
0,
0,
&mut out,
);
});
}
b.iter(|| {
let input_arrays: ArrayVec<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE]> = inputs
.iter_mut()
.take(degree)
.map(|i| array_ref!(i.get(), 0, BLOCK_LEN))
.collect();
let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
platform.hash_many(
&input_arrays[..],
&[0; 8],
0,
blake3::IncrementCounter::No,
0,
0,
0,
&mut out,
);
});
}

#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_parents_sse41(b: &mut Bencher) {
if !blake3::platform::sse41_detected() {
return;
if let Some(platform) = Platform::sse41() {
bench_many_parents_fn(b, platform);
}
bench_many_parents_fn(b, blake3::sse41::hash_many, blake3::sse41::DEGREE);
}

#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_parents_avx2(b: &mut Bencher) {
if !blake3::platform::avx2_detected() {
return;
if let Some(platform) = Platform::avx2() {
bench_many_parents_fn(b, platform);
}
bench_many_parents_fn(b, blake3::avx2::hash_many, blake3::avx2::DEGREE);
}

#[bench]
#[cfg(feature = "c_avx512")]
#[cfg(feature = "c")]
fn bench_many_parents_avx512(b: &mut Bencher) {
if !blake3::platform::avx512_detected() {
return;
if let Some(platform) = Platform::avx512() {
bench_many_parents_fn(b, platform);
}
bench_many_parents_fn(b, blake3::c_avx512::hash_many, blake3::c_avx512::DEGREE);
}

#[bench]
#[cfg(feature = "c_neon")]
fn bench_many_parents_neon(b: &mut Bencher) {
// When "c_neon" is on, NEON support is assumed.
bench_many_parents_fn(b, blake3::c_neon::hash_many, blake3::c_neon::DEGREE);
if let Some(platform) = Platform::neon() {
bench_many_parents_fn(b, platform);
}
}
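
The rewritten benchmarks above all follow the same pattern: each `Platform` constructor returns an `Option`, so a bench silently skips itself when the CPU (or the enabled feature set) does not provide that implementation. The same Option-returning constructors can be chained; a sketch, purely to illustrate the API shape used above (the helper name is made up, and this is not how the crate picks an implementation internally):

```rust
// Sketch built from the constructors exercised by the benchmarks above.
use blake3::platform::Platform;

fn widest_available() -> Platform {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        // Prefer the wider AVX2 implementation, then SSE4.1.
        if let Some(p) = Platform::avx2().or_else(Platform::sse41) {
            return p;
        }
    }
    // Portable fallback always exists.
    Platform::portable()
}
```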

fn bench_atonce(b: &mut Bencher, len: usize) {
