Skip to content

Commit

Permalink
Add ARM NEON versions of blake2s and blake2b
Browse files Browse the repository at this point in the history
NOTE! The NEON version of blake2s is currently NO FASTER than the
reference implementations.  However, it is retained for reference
and in case it can be further improved.

The NEON version of blake2b is more than twice as fast as the
reference implementation on the Raspberry PI 2 Model B.
  • Loading branch information
leighbb committed Apr 2, 2018
1 parent beb75f4 commit 7965d3e
Show file tree
Hide file tree
Showing 18 changed files with 4,431 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,10 @@ ref/blake2xs
ref/blake2xb
sse/blake2xs
sse/blake2xb
neon/blake2s
neon/blake2b
neon/blake2sp
neon/blake2bp
neon/blake2xs
neon/blake2xb
**tags
160 changes: 160 additions & 0 deletions neon/blake2-impl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
/*
BLAKE2 reference source code package - reference C implementations
Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
your option. The terms of these licenses can be found at:
- CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
- OpenSSL license : https://www.openssl.org/source/license.html
- Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
More information about the BLAKE2 hash function can be found at
https://blake2.net.
*/
#ifndef BLAKE2_IMPL_H
#define BLAKE2_IMPL_H

#include <stdint.h>
#include <string.h>

#if !defined(__cplusplus) && (!defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L)
#if defined(_MSC_VER)
#define BLAKE2_INLINE __inline
#elif defined(__GNUC__)
#define BLAKE2_INLINE __inline__
#else
#define BLAKE2_INLINE
#endif
#else
#define BLAKE2_INLINE inline
#endif

static BLAKE2_INLINE uint32_t load32( const void *src )
{
#if defined(NATIVE_LITTLE_ENDIAN)
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
#else
const uint8_t *p = ( const uint8_t * )src;
return (( uint32_t )( p[0] ) << 0) |
(( uint32_t )( p[1] ) << 8) |
(( uint32_t )( p[2] ) << 16) |
(( uint32_t )( p[3] ) << 24) ;
#endif
}

static BLAKE2_INLINE uint64_t load64( const void *src )
{
#if defined(NATIVE_LITTLE_ENDIAN)
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
#else
const uint8_t *p = ( const uint8_t * )src;
return (( uint64_t )( p[0] ) << 0) |
(( uint64_t )( p[1] ) << 8) |
(( uint64_t )( p[2] ) << 16) |
(( uint64_t )( p[3] ) << 24) |
(( uint64_t )( p[4] ) << 32) |
(( uint64_t )( p[5] ) << 40) |
(( uint64_t )( p[6] ) << 48) |
(( uint64_t )( p[7] ) << 56) ;
#endif
}

static BLAKE2_INLINE uint16_t load16( const void *src )
{
#if defined(NATIVE_LITTLE_ENDIAN)
uint16_t w;
memcpy(&w, src, sizeof w);
return w;
#else
const uint8_t *p = ( const uint8_t * )src;
return (( uint16_t )( p[0] ) << 0) |
(( uint16_t )( p[1] ) << 8) ;
#endif
}

static BLAKE2_INLINE void store16( void *dst, uint16_t w )
{
#if defined(NATIVE_LITTLE_ENDIAN)
memcpy(dst, &w, sizeof w);
#else
uint8_t *p = ( uint8_t * )dst;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w;
#endif
}

static BLAKE2_INLINE void store32( void *dst, uint32_t w )
{
#if defined(NATIVE_LITTLE_ENDIAN)
memcpy(dst, &w, sizeof w);
#else
uint8_t *p = ( uint8_t * )dst;
p[0] = (uint8_t)(w >> 0);
p[1] = (uint8_t)(w >> 8);
p[2] = (uint8_t)(w >> 16);
p[3] = (uint8_t)(w >> 24);
#endif
}

static BLAKE2_INLINE void store64( void *dst, uint64_t w )
{
#if defined(NATIVE_LITTLE_ENDIAN)
memcpy(dst, &w, sizeof w);
#else
uint8_t *p = ( uint8_t * )dst;
p[0] = (uint8_t)(w >> 0);
p[1] = (uint8_t)(w >> 8);
p[2] = (uint8_t)(w >> 16);
p[3] = (uint8_t)(w >> 24);
p[4] = (uint8_t)(w >> 32);
p[5] = (uint8_t)(w >> 40);
p[6] = (uint8_t)(w >> 48);
p[7] = (uint8_t)(w >> 56);
#endif
}

static BLAKE2_INLINE uint64_t load48( const void *src )
{
const uint8_t *p = ( const uint8_t * )src;
return (( uint64_t )( p[0] ) << 0) |
(( uint64_t )( p[1] ) << 8) |
(( uint64_t )( p[2] ) << 16) |
(( uint64_t )( p[3] ) << 24) |
(( uint64_t )( p[4] ) << 32) |
(( uint64_t )( p[5] ) << 40) ;
}

static BLAKE2_INLINE void store48( void *dst, uint64_t w )
{
uint8_t *p = ( uint8_t * )dst;
p[0] = (uint8_t)(w >> 0);
p[1] = (uint8_t)(w >> 8);
p[2] = (uint8_t)(w >> 16);
p[3] = (uint8_t)(w >> 24);
p[4] = (uint8_t)(w >> 32);
p[5] = (uint8_t)(w >> 40);
}

static BLAKE2_INLINE uint32_t rotr32( const uint32_t w, const unsigned c )
{
return ( w >> c ) | ( w << ( 32 - c ) );
}

static BLAKE2_INLINE uint64_t rotr64( const uint64_t w, const unsigned c )
{
return ( w >> c ) | ( w << ( 64 - c ) );
}

/* prevents compiler optimizing out memset() */
static BLAKE2_INLINE void secure_zero_memory(void *v, size_t n)
{
static void *(*const volatile memset_v)(void *, int, size_t) = &memset;
memset_v(v, 0, n);
}

#endif
195 changes: 195 additions & 0 deletions neon/blake2.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
/*
BLAKE2 reference source code package - reference C implementations
Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
your option. The terms of these licenses can be found at:
- CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
- OpenSSL license : https://www.openssl.org/source/license.html
- Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
More information about the BLAKE2 hash function can be found at
https://blake2.net.
*/
#ifndef BLAKE2_H
#define BLAKE2_H

#include <stddef.h>
#include <stdint.h>

#if defined(_MSC_VER)
#define BLAKE2_PACKED(x) __pragma(pack(push, 1)) x __pragma(pack(pop))
#else
#define BLAKE2_PACKED(x) x __attribute__((packed))
#endif

#if defined(__cplusplus)
extern "C" {
#endif

enum blake2s_constant
{
BLAKE2S_BLOCKBYTES = 64,
BLAKE2S_OUTBYTES = 32,
BLAKE2S_KEYBYTES = 32,
BLAKE2S_SALTBYTES = 8,
BLAKE2S_PERSONALBYTES = 8
};

enum blake2b_constant
{
BLAKE2B_BLOCKBYTES = 128,
BLAKE2B_OUTBYTES = 64,
BLAKE2B_KEYBYTES = 64,
BLAKE2B_SALTBYTES = 16,
BLAKE2B_PERSONALBYTES = 16
};

typedef struct blake2s_state__
{
uint32_t h[8];
uint32_t t[2];
uint32_t f[2];
uint8_t buf[BLAKE2S_BLOCKBYTES];
size_t buflen;
size_t outlen;
uint8_t last_node;
} blake2s_state;

typedef struct blake2b_state__
{
uint64_t h[8];
uint64_t t[2];
uint64_t f[2];
uint8_t buf[BLAKE2B_BLOCKBYTES];
size_t buflen;
size_t outlen;
uint8_t last_node;
} blake2b_state;

typedef struct blake2sp_state__
{
blake2s_state S[8][1];
blake2s_state R[1];
uint8_t buf[8 * BLAKE2S_BLOCKBYTES];
size_t buflen;
size_t outlen;
} blake2sp_state;

typedef struct blake2bp_state__
{
blake2b_state S[4][1];
blake2b_state R[1];
uint8_t buf[4 * BLAKE2B_BLOCKBYTES];
size_t buflen;
size_t outlen;
} blake2bp_state;


BLAKE2_PACKED(struct blake2s_param__
{
uint8_t digest_length; /* 1 */
uint8_t key_length; /* 2 */
uint8_t fanout; /* 3 */
uint8_t depth; /* 4 */
uint32_t leaf_length; /* 8 */
uint32_t node_offset; /* 12 */
uint16_t xof_length; /* 14 */
uint8_t node_depth; /* 15 */
uint8_t inner_length; /* 16 */
/* uint8_t reserved[0]; */
uint8_t salt[BLAKE2S_SALTBYTES]; /* 24 */
uint8_t personal[BLAKE2S_PERSONALBYTES]; /* 32 */
});

typedef struct blake2s_param__ blake2s_param;

BLAKE2_PACKED(struct blake2b_param__
{
uint8_t digest_length; /* 1 */
uint8_t key_length; /* 2 */
uint8_t fanout; /* 3 */
uint8_t depth; /* 4 */
uint32_t leaf_length; /* 8 */
uint32_t node_offset; /* 12 */
uint32_t xof_length; /* 16 */
uint8_t node_depth; /* 17 */
uint8_t inner_length; /* 18 */
uint8_t reserved[14]; /* 32 */
uint8_t salt[BLAKE2B_SALTBYTES]; /* 48 */
uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */
});

typedef struct blake2b_param__ blake2b_param;

typedef struct blake2xs_state__
{
blake2s_state S[1];
blake2s_param P[1];
} blake2xs_state;

typedef struct blake2xb_state__
{
blake2b_state S[1];
blake2b_param P[1];
} blake2xb_state;

/* Padded structs result in a compile-time error */
enum {
BLAKE2_DUMMY_1 = 1/(sizeof(blake2s_param) == BLAKE2S_OUTBYTES),
BLAKE2_DUMMY_2 = 1/(sizeof(blake2b_param) == BLAKE2B_OUTBYTES)
};

/* Streaming API */
int blake2s_init( blake2s_state *S, size_t outlen );
int blake2s_init_key( blake2s_state *S, size_t outlen, const void *key, size_t keylen );
int blake2s_init_param( blake2s_state *S, const blake2s_param *P );
int blake2s_update( blake2s_state *S, const void *in, size_t inlen );
int blake2s_final( blake2s_state *S, void *out, size_t outlen );

int blake2b_init( blake2b_state *S, size_t outlen );
int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen );
int blake2b_init_param( blake2b_state *S, const blake2b_param *P );
int blake2b_update( blake2b_state *S, const void *in, size_t inlen );
int blake2b_final( blake2b_state *S, void *out, size_t outlen );

int blake2sp_init( blake2sp_state *S, size_t outlen );
int blake2sp_init_key( blake2sp_state *S, size_t outlen, const void *key, size_t keylen );
int blake2sp_update( blake2sp_state *S, const void *in, size_t inlen );
int blake2sp_final( blake2sp_state *S, void *out, size_t outlen );

int blake2bp_init( blake2bp_state *S, size_t outlen );
int blake2bp_init_key( blake2bp_state *S, size_t outlen, const void *key, size_t keylen );
int blake2bp_update( blake2bp_state *S, const void *in, size_t inlen );
int blake2bp_final( blake2bp_state *S, void *out, size_t outlen );

/* Variable output length API */
int blake2xs_init( blake2xs_state *S, const size_t outlen );
int blake2xs_init_key( blake2xs_state *S, const size_t outlen, const void *key, size_t keylen );
int blake2xs_update( blake2xs_state *S, const void *in, size_t inlen );
int blake2xs_final(blake2xs_state *S, void *out, size_t outlen);

int blake2xb_init( blake2xb_state *S, const size_t outlen );
int blake2xb_init_key( blake2xb_state *S, const size_t outlen, const void *key, size_t keylen );
int blake2xb_update( blake2xb_state *S, const void *in, size_t inlen );
int blake2xb_final(blake2xb_state *S, void *out, size_t outlen);

/* Simple API */
int blake2s( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
int blake2b( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );

int blake2sp( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
int blake2bp( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );

int blake2xs( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
int blake2xb( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );

/* This is simply an alias for blake2b */
int blake2( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );

#if defined(__cplusplus)
}
#endif

#endif
Loading

0 comments on commit 7965d3e

Please sign in to comment.