Skip to content

Commit

Permalink
Implement _mm_aeskeygenassist_si128
Browse files Browse the repository at this point in the history
Both crypto extension and NEON implementation were derived from the
original work of Michael G. Kazakov. See
https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
for details.
  • Loading branch information
jserv committed Jul 16, 2020
1 parent c4c5826 commit 6ab1c80
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 24 deletions.
34 changes: 34 additions & 0 deletions sse2neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -4093,6 +4093,27 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
return _mm_xor_si128(out, RoundKey);
#endif
}

// Software (table-lookup) emulation of the x86 AESKEYGENASSIST instruction,
// which assists in expanding an AES cipher key into round keys.
// 32-bit lanes 1 and 3 of `key` are run through the AES S-box; the result
// holds each substituted word together with a copy rotated right by 8 bits
// and XORed with `rcon`. See
// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
// for details.
//
// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
{
#define SSE2NEON_AES_H0(x) (x)
    static const uint8_t sub_table[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
#undef SSE2NEON_AES_H0

    // Broadcast the wanted lane to every position, then read lane 0.
    uint32_t word1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
    uint32_t word3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));

    // Substitute every byte of both words in place.
    uint8_t *bytes1 = (uint8_t *) &word1;
    uint8_t *bytes3 = (uint8_t *) &word3;
    for (int idx = 0; idx < 4; ++idx) {
        bytes1[idx] = sub_table[bytes1[idx]];
        bytes3[idx] = sub_table[bytes3[idx]];
    }

    const uint32_t rotated1 = (word1 >> 8) | (word1 << 24);
    const uint32_t rotated3 = (word3 >> 8) | (word3 << 24);

    // _mm_set_epi32 takes its arguments from the highest lane to the lowest.
    return _mm_set_epi32(rotated3 ^ rcon, word3, rotated1 ^ rcon, word1);
}
#undef SSE2NEON_AES_DATA

#else /* __ARM_FEATURE_CRYPTO */
Expand All @@ -4108,6 +4129,19 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
vreinterpretq_u8_m128i(b));
}

// AESKEYGENASSIST built on the ARMv8 Crypto Extension.
//
// a:    128-bit input; only 32-bit lanes 1 (X1) and 3 (X3) contribute.
// rcon: round constant XORed into the two rotated output words.
//
// Returns { SubBytes(X1), ROT(SubBytes(X1)) ^ rcon,
//           SubBytes(X3), ROT(SubBytes(X3)) ^ rcon } from lane 0 to lane 3.
//
// All byte-level work is done on an explicit uint8x16_t, mirroring
// _mm_aesenc_si128, so the code does not depend on __m128i being a
// 16-lane byte vector or on lax vector conversions.
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
    // AESE with an all-zero round key performs ShiftRows and SubBytes on a.
    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));

    uint8x16_t dest = {
        // Undo ShiftRows step from AESE and extract X1 and X3
        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
    };

    // Only the rotated words (32-bit lanes 1 and 3) receive the constant.
    uint32x4_t rcon_lanes = {0, (uint32_t) rcon, 0, (uint32_t) rcon};

    return vreinterpretq_m128i_u8(
        veorq_u8(dest, vreinterpretq_u8_u32(rcon_lanes)));
}
#endif

/* Streaming Extensions */
Expand Down
92 changes: 68 additions & 24 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,9 @@ const char *SSE2NEONTest::getInstructionTestString(InstructionTest test)
case IT_MM_AESENC_SI128:
ret = "IT_MM_AESENC_SI128";
break;
case IT_MM_AESKEYGENASSIST_SI128:
ret = "IT_MM_AESKEYGENASSIST_SI128";
break;
case IT_MM_CLMULEPI64_SI128:
ret = "IT_MM_CLMULEPI64_SI128";
break;
Expand Down Expand Up @@ -2793,36 +2796,38 @@ bool test_mm_clmulepi64_si128(const uint64_t *_a, const uint64_t *_b)
return true;
}

// AES forward substitution box (S-box): entry i is the SubBytes replacement
// for byte value i. Kept at file scope so that several reference helpers in
// this file (e.g. sub_word and aesenc_128_reference) index the same table.
static const uint8_t crypto_aes_sbox[256] = {
0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b,
0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26,
0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed,
0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f,
0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec,
0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14,
0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f,
0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11,
0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f,
0xb0, 0x54, 0xbb, 0x16,
};

#define XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
inline __m128i aesenc_128_reference(__m128i a, __m128i b)
{
static const uint8_t sbox[256] = {
0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b,
0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26,
0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed,
0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f,
0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec,
0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14,
0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f,
0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11,
0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f,
0xb0, 0x54, 0xbb, 0x16};
uint8_t i, t, u, v[4][4];
for (i = 0; i < 16; ++i) {
v[((i / 4) + 4 - (i % 4)) % 4][i % 4] =
sbox[((SIMDVec *) &a)->m128_u8[i]];
crypto_aes_sbox[((SIMDVec *) &a)->m128_u8[i]];
}
for (i = 0; i < 4; ++i) {
t = v[i][0];
Expand All @@ -2839,6 +2844,27 @@ inline __m128i aesenc_128_reference(__m128i a, __m128i b)
return a;
}

// Applies the AES S-box to each of the four bytes of `key` independently and
// returns the substituted bytes reassembled in their original positions.
static inline uint32_t sub_word(uint32_t key)
{
    // Widen each table entry to uint32_t before shifting: a uint8_t operand
    // would be promoted to signed int, and "<< 24" on an entry >= 0x80 would
    // then shift a one into the sign bit.
    const uint32_t b3 = crypto_aes_sbox[key >> 24];
    const uint32_t b2 = crypto_aes_sbox[(key >> 16) & 0xff];
    const uint32_t b1 = crypto_aes_sbox[(key >> 8) & 0xff];
    const uint32_t b0 = crypto_aes_sbox[key & 0xff];
    return (b3 << 24) | (b2 << 16) | (b1 << 8) | b0;
}

// Rotates right (circular right shift) value by "amount" positions.
// "amount" is reduced modulo 32, so any count is well defined: rotating by
// 0 or by 32 returns value unchanged.
static inline uint32_t rotr(uint32_t value, uint32_t amount)
{
    // Shifting a 32-bit value by 32 or more bits is undefined, so keep the
    // right-shift count in 0..31; the left-shift count is masked the same way
    // to cover amount == 0.
    amount &= 31;
    return (value >> amount) | (value << ((32 - amount) & 31));
}

// Scalar reference model for _mm_aeskeygenassist_si128.
// 32-bit lanes 1 and 3 of `a` are passed through sub_word(); the result
// holds, from lane 0 to lane 3: the substituted lane 1, that word rotated
// right by 8 and XORed with `rcon`, the substituted lane 3, and that word
// rotated right by 8 and XORed with `rcon`.
inline __m128i aeskeygenassist_128_reference(__m128i a, const int rcon)
{
    // Broadcast the wanted lane, then read it back out of lane 0.
    __m128i lane1 = _mm_shuffle_epi32(a, 0x55);
    __m128i lane3 = _mm_shuffle_epi32(a, 0xFF);

    const uint32_t sub1 = sub_word(_mm_cvtsi128_si32(lane1));
    const uint32_t sub3 = sub_word(_mm_cvtsi128_si32(lane3));

    const uint32_t mixed1 = rotr(sub1, 8) ^ rcon;
    const uint32_t mixed3 = rotr(sub3, 8) ^ rcon;

    // _mm_set_epi32 takes its arguments from the highest lane to the lowest.
    return _mm_set_epi32(mixed3, sub3, mixed1, sub1);
}

bool test_mm_aesenc_si128(const int32_t *a, const int32_t *b)
{
__m128i data = _mm_loadu_si128((const __m128i *) a);
Expand All @@ -2850,6 +2876,20 @@ bool test_mm_aesenc_si128(const int32_t *a, const int32_t *b)
return validate128(resultReference, resultIntrinsic);
}

// FIXME: improve the test case for AES-256 key expansion.
// Reference:
// https://github.com/randombit/botan/blob/master/src/lib/block/aes/aes_ni/aes_ni.cpp
bool test_mm_aeskeygenassist_si128(const int32_t *a, const int32_t *b)
{
__m128i data = _mm_loadu_si128((const __m128i *) a);

const int8_t rcon = 0x40; /* must be an 8-bit immediate */
__m128i resultReference = aeskeygenassist_128_reference(data, rcon);
__m128i resultIntrinsic = _mm_aeskeygenassist_si128(data, rcon);

return validate128(resultReference, resultIntrinsic);
}

bool test_mm_malloc(const size_t *a, const size_t *b)
{
size_t size = *a % (1024 * 16) + 1;
Expand Down Expand Up @@ -3465,6 +3505,10 @@ class SSE2NEONTestImpl : public SSE2NEONTest
case IT_MM_AESENC_SI128:
ret = test_mm_aesenc_si128(mTestIntPointer1, mTestIntPointer2);
break;
case IT_MM_AESKEYGENASSIST_SI128:
ret = test_mm_aeskeygenassist_si128(mTestIntPointer1,
mTestIntPointer2);
break;
case IT_MM_CLMULEPI64_SI128:
ret = test_mm_clmulepi64_si128((const uint64_t *) mTestIntPointer1,
(const uint64_t *) mTestIntPointer2);
Expand Down
1 change: 1 addition & 0 deletions tests/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ enum InstructionTest {

// AES
IT_MM_AESENC_SI128,
IT_MM_AESKEYGENASSIST_SI128,

// Others
IT_MM_CLMULEPI64_SI128,
Expand Down

0 comments on commit 6ab1c80

Please sign in to comment.