Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add _mm_aesdec_si128 #559

Merged
merged 1 commit into from
Dec 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
167 changes: 153 additions & 14 deletions sse2neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -9404,7 +9404,7 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)

#if !defined(__ARM_FEATURE_CRYPTO)
/* clang-format off */
#define SSE2NEON_AES_DATA(w) \
#define SSE2NEON_AES_SBOX(w) \
{ \
w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
Expand Down Expand Up @@ -9444,11 +9444,52 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
w(0xb0), w(0x54), w(0xbb), w(0x16) \
}
#define SSE2NEON_AES_RSBOX(w) \
{ \
w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
w(0x55), w(0x21), w(0x0c), w(0x7d) \
}
/* clang-format on */

/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
#define SSE2NEON_AES_H0(x) (x)
static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
static const uint8_t SSE2NEON_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
#undef SSE2NEON_AES_H0

// In the absence of crypto extensions, implement aesenc using regular neon
Expand All @@ -9457,7 +9498,7 @@ static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
// for more information Reproduced with permission of the author.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
Expand All @@ -9467,7 +9508,7 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};

uint8x16_t v;
uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
uint8x16_t w = vreinterpretq_u8_m128i(a);

// shift rows
w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
Expand All @@ -9486,11 +9527,13 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
// add round key
return vreinterpretq_m128i_u8(w) ^ RoundKey;

#else /* ARMv7-A implementation */
#else /* ARMv7-A implementation for a table-based AES */
#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
(((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
((uint32_t) (b1) << 8) | (uint32_t) (b0))
// muliplying 'x' by 2 in GF(2^8)
#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
// muliplying 'x' by 3 in GF(2^8)
#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
#define SSE2NEON_AES_U0(p) \
SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
Expand All @@ -9500,11 +9543,14 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
#define SSE2NEON_AES_U3(p) \
SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))

// this generates a table containing every possible permutation of
// shift_rows() and sub_bytes() with mix_columns().
static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
};
#undef SSE2NEON_AES_B2W
#undef SSE2NEON_AES_F2
Expand All @@ -9514,11 +9560,15 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
#undef SSE2NEON_AES_U2
#undef SSE2NEON_AES_U3

uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
uint32_t x0 = _mm_cvtsi128_si32(a); // get a[31:0]
uint32_t x1 =
_mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); // get a[63:32]
uint32_t x2 =
_mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA)); // get a[95:64]
uint32_t x3 =
_mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); // get a[127:96]

// finish the modulo addition step in mix_columns()
__m128i out = _mm_set_epi32(
(aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
Expand All @@ -9533,6 +9583,80 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
#endif
}

// Perform one round of an AES decryption flow on data (state) in a using the
// round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You shall take the following into consideration:

  1. Armv8-A without Cryptographic extension
  2. Armv8-A and A32 with Cryptographic extension
  3. Armv7-A

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems we don't have tests for function guarded by "__ARM_FEATURE_CRYPTO" macro. I think maybe I should add another makefile target to run test with +crypto?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Exactly, we shall have conditional +crypto in feasible permutations.

static const uint8_t inv_shift_rows[] = {0x0, 0xd, 0xa, 0x7, 0x4, 0x1,
0xe, 0xb, 0x8, 0x5, 0x2, 0xf,
0xc, 0x9, 0x6, 0x3};
static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};

uint8x16_t v;
uint8x16_t w = vreinterpretq_u8_m128i(a);

// shift rows
w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));

// sub bytes
v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_rsbox), w);
v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_rsbox + 0x40), w - 0x40);
v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_rsbox + 0x80), w - 0x80);
v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_rsbox + 0xc0), w - 0xc0);

// muliplying 'v' by 4 in GF(2^8)
w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
v ^= w;
v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);

// mix columns
w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
0x1b); // muliplying 'v' by 2 in GF(2^8)
w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

// add round key
return vreinterpretq_m128i_u8(w) ^ RoundKey;

#else /* ARMv7-A NEON implementation */
/* FIXME: optimized for NEON */
Copy link
Contributor Author

@howjmay howjmay Dec 23, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jserv the tables for table-based AES algorithm are way too many and huge. Therefore, I am using plain pure C implementation here instead of the table-based AES which is used in _mm_aesenc_si128.
I am trying to optimize it.
A simple table-based AES needs tables like this https://github.com/iVishalr/AES-Encryption/blob/master/lookup.h

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The implementation of both helper_aesenc and helper_aesdec are table-based AES. Here are its tables https://github.com/qemu/qemu/blob/266469947161aa10b1d36843580d369d5aa38589/crypto/aes.c

Copy link
Member

@jserv jserv Dec 24, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree. Let's stick to shorter implementation since Armv7 path exits for compatibility purpose.

#define XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
#define MULTIPLY(x, y) \
(((y & 1) * x) ^ ((y >> 1 & 1) * XT(x)) ^ ((y >> 2 & 1) * XT(XT(x))) ^ \
((y >> 3 & 1) * XT(XT(XT(x)))) ^ ((y >> 4 & 1) * XT(XT(XT(XT(x))))))

uint8_t i, e, f, g, h, v[4][4];
uint8_t *_a = (uint8_t *) &a;
for (i = 0; i < 16; ++i) {
v[((i / 4) + (i % 4)) % 4][i % 4] = SSE2NEON_rsbox[_a[i]];
}

for (i = 0; i < 4; ++i) {
e = v[i][0];
f = v[i][1];
g = v[i][2];
h = v[i][3];

v[i][0] = MULTIPLY(e, 0x0e) ^ MULTIPLY(f, 0x0b) ^ MULTIPLY(g, 0x0d) ^
MULTIPLY(h, 0x09);
v[i][1] = MULTIPLY(e, 0x09) ^ MULTIPLY(f, 0x0e) ^ MULTIPLY(g, 0x0b) ^
MULTIPLY(h, 0x0d);
v[i][2] = MULTIPLY(e, 0x0d) ^ MULTIPLY(f, 0x09) ^ MULTIPLY(g, 0x0e) ^
MULTIPLY(h, 0x0b);
v[i][3] = MULTIPLY(e, 0x0b) ^ MULTIPLY(f, 0x0d) ^ MULTIPLY(g, 0x09) ^
MULTIPLY(h, 0x0e);
}
#undef XT
#undef MULTIPLY

return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
#endif
}

// Perform the last round of an AES encryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
Expand Down Expand Up @@ -9601,7 +9725,8 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
}
#undef SSE2NEON_AES_DATA
#undef SSE2NEON_AES_SBOX
jserv marked this conversation as resolved.
Show resolved Hide resolved
#undef SSE2NEON_AES_RSBOX

#else /* __ARM_FEATURE_CRYPTO */
// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
Expand All @@ -9617,6 +9742,16 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
vreinterpretq_u8_m128i(b));
}

// Perform one round of an AES decryption flow on data (state) in a using the
// round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
{
return vreinterpretq_m128i_u8(veorq_u8(
vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
vreinterpretq_u8_m128i(RoundKey)));
}

// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
{
Expand All @@ -9625,6 +9760,10 @@ FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
RoundKey);
}

// Assist in expanding the AES cipher key by computing steps towards generating
// a round key for encryption cipher using data from a and an 8-bit round
// constant specified in imm8, and store the result in dst."
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
// AESE does ShiftRows and SubBytes on A
Expand Down
81 changes: 81 additions & 0 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -581,6 +581,32 @@ static const uint8_t crypto_aes_sbox[256] = {
0xb0, 0x54, 0xbb, 0x16,
};

static const uint8_t crypto_aes_rsbox[256] = {
0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e,
0x81, 0xf3, 0xd7, 0xfb, 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, 0x54, 0x7b, 0x94, 0x32,
0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49,
0x6d, 0x8b, 0xd1, 0x25, 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, 0x6c, 0x70, 0x48, 0x50,
0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05,
0xb8, 0xb3, 0x45, 0x06, 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, 0x3a, 0x91, 0x11, 0x41,
0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8,
0x1c, 0x75, 0xdf, 0x6e, 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, 0xfc, 0x56, 0x3e, 0x4b,
0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59,
0x27, 0x80, 0xec, 0x5f, 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, 0xa0, 0xe0, 0x3b, 0x4d,
0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63,
0x55, 0x21, 0x0c, 0x7d,
};

// XT is x_time function that muliplies 'x' by 2 in GF(2^8)
#define XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
inline __m128i aesenc_128_reference(__m128i a, __m128i b)
{
Expand All @@ -597,6 +623,43 @@ inline __m128i aesenc_128_reference(__m128i a, __m128i b)
v[i][2] ^= u ^ XT(v[i][2] ^ v[i][3]);
v[i][3] ^= u ^ XT(v[i][3] ^ t);
}

for (i = 0; i < 16; ++i) {
((SIMDVec *) &a)->m128_u8[i] =
v[i / 4][i % 4] ^ ((SIMDVec *) &b)->m128_u8[i];
}

return a;
}

#define MULTIPLY(x, y) \
(((y & 1) * x) ^ ((y >> 1 & 1) * XT(x)) ^ ((y >> 2 & 1) * XT(XT(x))) ^ \
((y >> 3 & 1) * XT(XT(XT(x)))) ^ ((y >> 4 & 1) * XT(XT(XT(XT(x))))))

inline __m128i aesdec_128_reference(__m128i a, __m128i b)
{
uint8_t i, e, f, g, h, v[4][4];
for (i = 0; i < 16; ++i) {
v[((i / 4) + (i % 4)) % 4][i % 4] =
crypto_aes_rsbox[((SIMDVec *) &a)->m128_u8[i]];
}

for (i = 0; i < 4; ++i) {
e = v[i][0];
f = v[i][1];
g = v[i][2];
h = v[i][3];

v[i][0] = MULTIPLY(e, 0x0e) ^ MULTIPLY(f, 0x0b) ^ MULTIPLY(g, 0x0d) ^
MULTIPLY(h, 0x09);
v[i][1] = MULTIPLY(e, 0x09) ^ MULTIPLY(f, 0x0e) ^ MULTIPLY(g, 0x0b) ^
MULTIPLY(h, 0x0d);
v[i][2] = MULTIPLY(e, 0x0d) ^ MULTIPLY(f, 0x09) ^ MULTIPLY(g, 0x0e) ^
MULTIPLY(h, 0x0b);
v[i][3] = MULTIPLY(e, 0x0b) ^ MULTIPLY(f, 0x0d) ^ MULTIPLY(g, 0x09) ^
MULTIPLY(h, 0x0e);
}

for (i = 0; i < 16; ++i) {
((SIMDVec *) &a)->m128_u8[i] =
v[i / 4][i % 4] ^ ((SIMDVec *) &b)->m128_u8[i];
Expand Down Expand Up @@ -11552,6 +11615,19 @@ result_t test_mm_aesenc_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
return validate128(resultReference, resultIntrinsic);
}

result_t test_mm_aesdec_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
{
const int32_t *a = (int32_t *) impl.mTestIntPointer1;
const int32_t *b = (int32_t *) impl.mTestIntPointer2;
__m128i data = _mm_loadu_si128((const __m128i *) a);
__m128i rk = _mm_loadu_si128((const __m128i *) b);

__m128i resultReference = aesdec_128_reference(data, rk);
__m128i resultIntrinsic = _mm_aesdec_si128(data, rk);

return validate128(resultReference, resultIntrinsic);
}

result_t test_mm_aesenclast_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
{
const int32_t *a = (const int32_t *) impl.mTestIntPointer1;
Expand All @@ -11565,6 +11641,11 @@ result_t test_mm_aesenclast_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
return validate128(resultReference, resultIntrinsic);
}

result_t test_mm_aesdeclast_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
{
return TEST_UNIMPL;
jserv marked this conversation as resolved.
Show resolved Hide resolved
}

// FIXME: improve the test case for AES-256 key expansion.
// Reference:
// https://github.com/randombit/botan/blob/master/src/lib/block/aes/aes_ni/aes_ni.cpp
Expand Down
2 changes: 2 additions & 0 deletions tests/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -524,7 +524,9 @@
_(mm_crc32_u8) \
/* AES */ \
_(mm_aesenc_si128) \
_(mm_aesdec_si128) \
_(mm_aesenclast_si128) \
_(mm_aesdeclast_si128) \
_(mm_aeskeygenassist_si128) \
/* Others */ \
_(mm_clmulepi64_si128) \
Expand Down