-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add SSSE3 intrinsics for SPECK-128 (GH #538)
Performance increased by about 100% on a 3.1 GHz Core i5 Skylake. Throughput went from about 7.3 cpb to about 3.5 cpb. Not bad for a software-based implementation of a block cipher
- Loading branch information
Showing
1 changed file
with
367 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,367 @@ | ||
// speck-simd.cpp - written and placed in the public domain by Jeffrey Walton | ||
// | ||
// This source file uses intrinsics and built-ins to gain access to | ||
// AES-NI, ARMv8a AES and Power8 AES instructions. A separate source | ||
// file is needed because additional CXXFLAGS are required to enable | ||
// the appropriate instructions sets in some build configurations. | ||
|
||
#include "pch.h" | ||
#include "config.h" | ||
|
||
#include "speck.h" | ||
#include "misc.h" | ||
|
||
#if (CRYPTOPP_SSSE3_AVAILABLE) | ||
# include <tmmintrin.h> | ||
#endif | ||
|
||
// Hack for SunCC, http://github.com/weidai11/cryptopp/issues/224 | ||
#if (__SUNPRO_CC >= 0x5130) | ||
# define MAYBE_CONST | ||
# define MAYBE_UNCONST_CAST(T, x) const_cast<MAYBE_CONST T>(x) | ||
#else | ||
# define MAYBE_CONST const | ||
# define MAYBE_UNCONST_CAST(T, x) (x) | ||
#endif | ||
|
||
// Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670 | ||
#define M128_CAST(x) ((__m128i *)(void *)(x)) | ||
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) | ||
|
||
ANONYMOUS_NAMESPACE_BEGIN | ||
|
||
using CryptoPP::byte; | ||
using CryptoPP::word32; | ||
using CryptoPP::word64; | ||
using CryptoPP::rotlFixed; | ||
using CryptoPP::rotrFixed; | ||
using CryptoPP::BlockTransformation; | ||
|
||
#if defined(CRYPTOPP_SSSE3_AVAILABLE) | ||
|
||
CRYPTOPP_ALIGN_DATA(16) | ||
const word32 s_one[] = {0, 0, 0, 1<<24}; | ||
|
||
template <unsigned int R> | ||
inline __m128i RotateLeft64(const __m128i& val) | ||
{ | ||
CRYPTOPP_ASSERT(R < 64); | ||
const __m128i a(_mm_slli_epi64(val, R)); | ||
const __m128i b(_mm_srli_epi64(val, 64-R)); | ||
return _mm_or_si128(a, b); | ||
} | ||
|
||
template <unsigned int R> | ||
inline __m128i RotateRight64(const __m128i& val) | ||
{ | ||
CRYPTOPP_ASSERT(R < 64); | ||
const __m128i a(_mm_slli_epi64(val, 64-R)); | ||
const __m128i b(_mm_srli_epi64(val, R)); | ||
return _mm_or_si128(a, b); | ||
} | ||
|
||
inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned int rounds) | ||
{ | ||
// Hack ahead... SPECK128_AdvancedProcessBlocks_SSSE3 loads each SPECK-128 block into a | ||
// __m128i. We can't SSE over them, so we rearrange the data to allow packed operations. | ||
// Its also easier to permute them in SPECK128_Enc_4_Blocks rather than the calling code. | ||
// SPECK128_AdvancedProcessBlocks_SSSE3 is rather messy. | ||
__m128i block1 = _mm_setzero_si128(); | ||
__m128i x1 = _mm_unpacklo_epi64(block0, block1); | ||
__m128i y1 = _mm_unpackhi_epi64(block0, block1); | ||
|
||
const __m128i mask = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7); | ||
x1 = _mm_shuffle_epi8(x1, mask); | ||
y1 = _mm_shuffle_epi8(y1, mask); | ||
|
||
for (size_t i=0; static_cast<int>(i)<rounds; ++i) | ||
{ | ||
const __m128i k1 = _mm_castpd_si128(_mm_loaddup_pd((const double*)(subkeys+i))); | ||
|
||
x1 = RotateRight64<8>(x1); | ||
x1 = _mm_add_epi64(x1, y1); | ||
x1 = _mm_xor_si128(x1, k1); | ||
y1 = RotateLeft64<3>(y1); | ||
y1 = _mm_xor_si128(y1, x1); | ||
} | ||
|
||
x1 = _mm_shuffle_epi8(x1, mask); | ||
y1 = _mm_shuffle_epi8(y1, mask); | ||
|
||
block0 = _mm_unpacklo_epi64(x1, y1); | ||
block1 = _mm_unpackhi_epi64(x1, y1); | ||
} | ||
|
||
inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1, | ||
__m128i &block2, __m128i &block3, const word64 *subkeys, unsigned int rounds) | ||
{ | ||
// Hack ahead... SPECK128_AdvancedProcessBlocks_SSSE3 loads each SPECK-128 block into a | ||
// __m128i. We can't SSE over them, so we rearrange the data to allow packed operations. | ||
// Its also easier to permute them in SPECK128_Enc_4_Blocks rather than the calling code. | ||
// SPECK128_AdvancedProcessBlocks_SSSE3 is rather messy. | ||
__m128i x1 = _mm_unpacklo_epi64(block0, block1); | ||
__m128i y1 = _mm_unpackhi_epi64(block0, block1); | ||
__m128i x2 = _mm_unpacklo_epi64(block2, block3); // x2 | ||
__m128i y2 = _mm_unpackhi_epi64(block2, block3); // y2 | ||
|
||
const __m128i mask = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7); | ||
x1 = _mm_shuffle_epi8(x1, mask); | ||
y1 = _mm_shuffle_epi8(y1, mask); | ||
x2 = _mm_shuffle_epi8(x2, mask); | ||
y2 = _mm_shuffle_epi8(y2, mask); | ||
|
||
for (size_t i=0; static_cast<int>(i)<rounds; ++i) | ||
{ | ||
const __m128i k1 = _mm_castpd_si128(_mm_loaddup_pd((const double*)(subkeys+i))); | ||
|
||
x1 = RotateRight64<8>(x1); | ||
x2 = RotateRight64<8>(x2); | ||
x1 = _mm_add_epi64(x1, y1); | ||
x2 = _mm_add_epi64(x2, y2); | ||
x1 = _mm_xor_si128(x1, k1); | ||
x2 = _mm_xor_si128(x2, k1); | ||
y1 = RotateLeft64<3>(y1); | ||
y2 = RotateLeft64<3>(y2); | ||
y1 = _mm_xor_si128(y1, x1); | ||
y2 = _mm_xor_si128(y2, x2); | ||
} | ||
|
||
x1 = _mm_shuffle_epi8(x1, mask); | ||
y1 = _mm_shuffle_epi8(y1, mask); | ||
x2 = _mm_shuffle_epi8(x2, mask); | ||
y2 = _mm_shuffle_epi8(y2, mask); | ||
|
||
block0 = _mm_unpacklo_epi64(x1, y1); | ||
block1 = _mm_unpackhi_epi64(x1, y1); | ||
block2 = _mm_unpacklo_epi64(x2, y2); | ||
block3 = _mm_unpackhi_epi64(x2, y2); | ||
} | ||
|
||
inline void SPECK128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned int rounds) | ||
{ | ||
// Hack ahead... SPECK128_AdvancedProcessBlocks_SSSE3 loads each SPECK-128 block into a | ||
// __m128i. We can't SSE over them, so we rearrange the data to allow packed operations. | ||
// Its also easier to permute them in SPECK128_Enc_4_Blocks rather than the calling code. | ||
// SPECK128_AdvancedProcessBlocks_SSSE3 is rather messy. | ||
__m128i block1 = _mm_setzero_si128(); | ||
__m128i x1 = _mm_unpacklo_epi64(block0, block1); | ||
__m128i y1 = _mm_unpackhi_epi64(block0, block1); | ||
|
||
const __m128i mask = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7); | ||
x1 = _mm_shuffle_epi8(x1, mask); | ||
y1 = _mm_shuffle_epi8(y1, mask); | ||
|
||
for (size_t i=rounds-1; static_cast<int>(i)>=0; --i) | ||
{ | ||
const __m128i k1 = _mm_castpd_si128(_mm_loaddup_pd((const double*)(subkeys+i))); | ||
|
||
// y ^= x; | ||
y1 = _mm_xor_si128(y1, x1); | ||
// y = rotrFixed(y,3); | ||
y1 = RotateRight64<3>(y1); | ||
// x ^= k; | ||
x1 = _mm_xor_si128(x1, k1); | ||
// x -= y; | ||
x1 = _mm_sub_epi64(x1, y1); | ||
// x = rotlFixed(x,8); | ||
x1 = RotateLeft64<8>(x1); | ||
} | ||
|
||
x1 = _mm_shuffle_epi8(x1, mask); | ||
y1 = _mm_shuffle_epi8(y1, mask); | ||
|
||
block0 = _mm_unpacklo_epi64(x1, y1); | ||
block1 = _mm_unpackhi_epi64(x1, y1); | ||
} | ||
|
||
inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1, | ||
__m128i &block2, __m128i &block3, const word64 *subkeys, unsigned int rounds) | ||
{ | ||
// Hack ahead... SPECK128_AdvancedProcessBlocks_SSSE3 loads each SPECK-128 block into a | ||
// __m128i. We can't SSE over them, so we rearrange the data to allow packed operations. | ||
// Its also easier to permute them in SPECK128_Enc_4_Blocks rather than the calling code. | ||
// SPECK128_AdvancedProcessBlocks_SSSE3 is rather messy. | ||
__m128i x1 = _mm_unpacklo_epi64(block0, block1); | ||
__m128i y1 = _mm_unpackhi_epi64(block0, block1); | ||
__m128i x2 = _mm_unpacklo_epi64(block2, block3); // x2 | ||
__m128i y2 = _mm_unpackhi_epi64(block2, block3); // y2 | ||
|
||
const __m128i mask = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7); | ||
x1 = _mm_shuffle_epi8(x1, mask); | ||
y1 = _mm_shuffle_epi8(y1, mask); | ||
x2 = _mm_shuffle_epi8(x2, mask); | ||
y2 = _mm_shuffle_epi8(y2, mask); | ||
|
||
for (size_t i=rounds-1; static_cast<int>(i)>=0; --i) | ||
{ | ||
const __m128i k1 = _mm_castpd_si128(_mm_loaddup_pd((const double*)(subkeys+i))); | ||
|
||
// y ^= x; | ||
y1 = _mm_xor_si128(y1, x1); | ||
y2 = _mm_xor_si128(y2, x2); | ||
// y = rotrFixed(y,3); | ||
y1 = RotateRight64<3>(y1); | ||
y2 = RotateRight64<3>(y2); | ||
// x ^= k; | ||
x1 = _mm_xor_si128(x1, k1); | ||
x2 = _mm_xor_si128(x2, k1); | ||
// x -= y; | ||
x1 = _mm_sub_epi64(x1, y1); | ||
x2 = _mm_sub_epi64(x2, y2); | ||
// x = rotlFixed(x,8); | ||
x1 = RotateLeft64<8>(x1); | ||
x2 = RotateLeft64<8>(x2); | ||
} | ||
|
||
x1 = _mm_shuffle_epi8(x1, mask); | ||
y1 = _mm_shuffle_epi8(y1, mask); | ||
x2 = _mm_shuffle_epi8(x2, mask); | ||
y2 = _mm_shuffle_epi8(y2, mask); | ||
|
||
block0 = _mm_unpacklo_epi64(x1, y1); | ||
block1 = _mm_unpackhi_epi64(x1, y1); | ||
block2 = _mm_unpacklo_epi64(x2, y2); | ||
block3 = _mm_unpackhi_epi64(x2, y2); | ||
} | ||
|
||
template <typename F1, typename F4> | ||
inline size_t SPECK128_AdvancedProcessBlocks_SSSE3(F1 func1, F4 func4, | ||
const word64 *subKeys, size_t rounds, const byte *inBlocks, | ||
const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) | ||
{ | ||
CRYPTOPP_ASSERT(subKeys); | ||
CRYPTOPP_ASSERT(inBlocks); | ||
CRYPTOPP_ASSERT(outBlocks); | ||
CRYPTOPP_ASSERT(length >= 16); | ||
|
||
const size_t blockSize = 16; | ||
size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize; | ||
size_t xorIncrement = xorBlocks ? blockSize : 0; | ||
size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize; | ||
|
||
if (flags & BlockTransformation::BT_ReverseDirection) | ||
{ | ||
inBlocks += length - blockSize; | ||
xorBlocks += length - blockSize; | ||
outBlocks += length - blockSize; | ||
inIncrement = 0-inIncrement; | ||
xorIncrement = 0-xorIncrement; | ||
outIncrement = 0-outIncrement; | ||
} | ||
|
||
if (flags & BlockTransformation::BT_AllowParallel) | ||
{ | ||
while (length >= 4*blockSize) | ||
{ | ||
__m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1, block2, block3; | ||
if (flags & BlockTransformation::BT_InBlockIsCounter) | ||
{ | ||
const __m128i be1 = *CONST_M128_CAST(s_one); | ||
block1 = _mm_add_epi32(block0, be1); | ||
block2 = _mm_add_epi32(block1, be1); | ||
block3 = _mm_add_epi32(block2, be1); | ||
_mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, be1)); | ||
} | ||
else | ||
{ | ||
inBlocks += inIncrement; | ||
block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); | ||
inBlocks += inIncrement; | ||
block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); | ||
inBlocks += inIncrement; | ||
block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); | ||
inBlocks += inIncrement; | ||
} | ||
|
||
if (flags & BlockTransformation::BT_XorInput) | ||
{ | ||
// Coverity finding, appears to be false positive. Assert the condition. | ||
CRYPTOPP_ASSERT(xorBlocks); | ||
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); | ||
xorBlocks += xorIncrement; | ||
block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); | ||
xorBlocks += xorIncrement; | ||
block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); | ||
xorBlocks += xorIncrement; | ||
block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); | ||
xorBlocks += xorIncrement; | ||
} | ||
|
||
func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds)); | ||
|
||
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) | ||
{ | ||
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); | ||
xorBlocks += xorIncrement; | ||
block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); | ||
xorBlocks += xorIncrement; | ||
block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); | ||
xorBlocks += xorIncrement; | ||
block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); | ||
xorBlocks += xorIncrement; | ||
} | ||
|
||
_mm_storeu_si128(M128_CAST(outBlocks), block0); | ||
outBlocks += outIncrement; | ||
_mm_storeu_si128(M128_CAST(outBlocks), block1); | ||
outBlocks += outIncrement; | ||
_mm_storeu_si128(M128_CAST(outBlocks), block2); | ||
outBlocks += outIncrement; | ||
_mm_storeu_si128(M128_CAST(outBlocks), block3); | ||
outBlocks += outIncrement; | ||
|
||
length -= 4*blockSize; | ||
} | ||
} | ||
|
||
while (length >= blockSize) | ||
{ | ||
__m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); | ||
|
||
if (flags & BlockTransformation::BT_XorInput) | ||
block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); | ||
|
||
if (flags & BlockTransformation::BT_InBlockIsCounter) | ||
const_cast<byte *>(inBlocks)[15]++; | ||
|
||
func1(block, subKeys, static_cast<unsigned int>(rounds)); | ||
|
||
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) | ||
block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); | ||
|
||
_mm_storeu_si128(M128_CAST(outBlocks), block); | ||
|
||
inBlocks += inIncrement; | ||
outBlocks += outIncrement; | ||
xorBlocks += xorIncrement; | ||
length -= blockSize; | ||
} | ||
|
||
return length; | ||
} | ||
|
||
#endif // CRYPTOPP_SSSE3_AVAILABLE | ||
|
||
ANONYMOUS_NAMESPACE_END | ||
|
||
/////////////////////////////////////////////////////////// | ||
|
||
NAMESPACE_BEGIN(CryptoPP) | ||
|
||
#if defined(CRYPTOPP_SSSE3_AVAILABLE) | ||
size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds, | ||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) | ||
{ | ||
return SPECK128_AdvancedProcessBlocks_SSSE3(SPECK128_Enc_Block, SPECK128_Enc_4_Blocks, | ||
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); | ||
} | ||
|
||
size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds, | ||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) | ||
{ | ||
return SPECK128_AdvancedProcessBlocks_SSSE3(SPECK128_Dec_Block, SPECK128_Dec_4_Blocks, | ||
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); | ||
} | ||
#endif | ||
|
||
NAMESPACE_END |