NEON optimizations for ZIP reading (reconstruct and interleave) (#1348)
Pretty much straight ports of the existing SSE2/SSE4.1 SIMD code, just adapted for NEON.

Reading 310Mpix worth of ZIP-compressed EXR files into memory on an Apple M1 Max (Clang 14, RelWithDebInfo build config): 7.50s -> 5.52s.

"reconstruct" part goes 1.55s -> 0.35s, "interleave" 0.80s -> 0.04s.

Signed-off-by: Aras Pranckevicius <aras@nesnausk.org>
aras-p committed Mar 3, 2023
1 parent 0603cc2 commit 677c6a5
Showing 2 changed files with 170 additions and 0 deletions.
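
For reference, the two steps have simple scalar equivalents, modeled on the reconstruct_scalar/interleave_scalar fallbacks that appear as context further down in this diff (a sketch, not the verbatim library code): "reconstruct" undoes the delta filter by turning a stream of 128-biased byte differences back into absolute values, and "interleave" merges the two halves of the buffer back into alternating even/odd output bytes.

void
reconstruct_sketch (char* buf, size_t outSize)
{
    unsigned char* t    = reinterpret_cast<unsigned char*> (buf) + 1;
    unsigned char* stop = reinterpret_cast<unsigned char*> (buf) + outSize;
    while (t < stop)
    {
        // Each byte is stored as (current - previous + 128).
        int d = int (t[-1]) + int (t[0]) - 128;
        t[0]  = d;
        ++t;
    }
}

void
interleave_sketch (const char* source, size_t outSize, char* out)
{
    const char* t1   = source;                          // even output bytes
    const char* t2   = source + (outSize + 1) / 2;      // odd output bytes
    char*       s    = out;
    char* const stop = s + outSize;
    while (true)
    {
        if (s < stop) *(s++) = *(t1++); else break;
        if (s < stop) *(s++) = *(t2++); else break;
    }
}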
src/lib/OpenEXR/ImfZip.cpp (93 additions, 0 deletions)
@@ -160,6 +160,56 @@ reconstruct_sse41 (char* buf, size_t outSize)

#endif

#ifdef IMF_HAVE_NEON

void
reconstruct_neon (char* buf, size_t outSize)
{
    static const size_t bytesPerChunk = sizeof (uint8x16_t);
    const size_t vOutSize = outSize / bytesPerChunk;

    const uint8x16_t c = vdupq_n_u8 (-128);
    const uint8x16_t shuffleMask = vdupq_n_u8 (15);

    // The first element doesn't have its high bit flipped during compression,
    // so it must not be flipped here. To make the SIMD loop nice and
    // uniform, we pre-flip the bit so that the loop will unflip it again.
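    // (Adding -128 in effect flips the high bit, since byte arithmetic
    // wraps modulo 256.)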
    buf[0] += -128;

    unsigned char* vBuf = reinterpret_cast<unsigned char*> (buf);
    uint8x16_t vZero = vdupq_n_u8 (0);
    uint8x16_t vPrev = vdupq_n_u8 (0);
    for (size_t i = 0; i < vOutSize; ++i)
    {
        uint8x16_t d = vaddq_u8 (vld1q_u8 (vBuf), c);

        // Compute the prefix sum of elements.
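        // (Each vextq_u8 (vZero, d, 16 - n) shifts d up by n lanes toward
        // higher indices, filling with zeros, so shift+add rounds of 1, 2,
        // 4 and 8 complete a 16-lane prefix sum in log2(16) = 4 steps.)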
        d = vaddq_u8 (d, vextq_u8 (vZero, d, 16 - 1));
        d = vaddq_u8 (d, vextq_u8 (vZero, d, 16 - 2));
        d = vaddq_u8 (d, vextq_u8 (vZero, d, 16 - 4));
        d = vaddq_u8 (d, vextq_u8 (vZero, d, 16 - 8));
        d = vaddq_u8 (d, vPrev);

        vst1q_u8 (vBuf, d);
        vBuf += sizeof (uint8x16_t);

        // Broadcast the high byte in our result to all lanes of the prev
        // value for the next iteration.
        vPrev = vqtbl1q_u8 (d, shuffleMask);
    }

    unsigned char prev = vgetq_lane_u8 (vPrev, 15);
    for (size_t i = vOutSize * bytesPerChunk; i < outSize; ++i)
    {
        unsigned char d = prev + buf[i] - 128;
        buf[i] = d;
        prev = d;
    }
}

#endif


void
reconstruct_scalar (char* buf, size_t outSize)
{
@@ -212,6 +262,44 @@ interleave_sse2 (const char* source, size_t outSize, char* out)

#endif

#ifdef IMF_HAVE_NEON

void
interleave_neon (const char* source, size_t outSize, char* out)
{
    static const size_t bytesPerChunk = 2 * sizeof (uint8x16_t);

    const size_t vOutSize = outSize / bytesPerChunk;

    const unsigned char* v1 = reinterpret_cast<const unsigned char*> (source);
    const unsigned char* v2 =
        reinterpret_cast<const unsigned char*> (source + (outSize + 1) / 2);
    unsigned char* vOut = reinterpret_cast<unsigned char*> (out);
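    // The first half of source holds the bytes for even output indices,
    // the second half the bytes for odd output indices.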

    for (size_t i = 0; i < vOutSize; ++i)
    {
        uint8x16_t a = vld1q_u8 (v1); v1 += sizeof (uint8x16_t);
        uint8x16_t b = vld1q_u8 (v2); v2 += sizeof (uint8x16_t);

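        // vzip1q/vzip2q interleave the bytes of a and b: lo becomes
        // a0,b0,a1,b1,... from the low halves, hi the same from the high halves.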
        uint8x16_t lo = vzip1q_u8 (a, b);
        uint8x16_t hi = vzip2q_u8 (a, b);

        vst1q_u8 (vOut, lo); vOut += sizeof (uint8x16_t);
        vst1q_u8 (vOut, hi); vOut += sizeof (uint8x16_t);
    }

    const char* t1 = reinterpret_cast<const char*> (v1);
    const char* t2 = reinterpret_cast<const char*> (v2);
    char* sOut = reinterpret_cast<char*> (vOut);

    for (size_t i = vOutSize * bytesPerChunk; i < outSize; ++i)
    {
        *(sOut++) = (i % 2 == 0) ? *(t1++) : *(t2++);
    }
}

#endif

void
interleave_scalar (const char* source, size_t outSize, char* out)
{
@@ -291,6 +379,11 @@ Zip::initializeFuncs ()
        interleave = interleave_sse2;
    }
#endif

#ifdef IMF_HAVE_NEON
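    // Unlike the SSE paths above, NEON needs no runtime CPU check:
    // IMF_HAVE_NEON is a compile-time property of the target (__ARM_NEON).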
    reconstruct = reconstruct_neon;
    interleave = interleave_neon;
#endif
}

OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_EXIT
src/lib/OpenEXRCore/internal_zip.c (77 additions, 0 deletions)
@@ -24,6 +24,11 @@
# define IMF_HAVE_SSE4_1 1
# include <smmintrin.h>
#endif
#if defined(__ARM_NEON)
# define IMF_HAVE_NEON 1
# include <arm_neon.h>
#endif


/**************************************/

@@ -73,6 +78,54 @@ reconstruct (uint8_t* buf, uint64_t outSize)
        prev = d;
    }
}
#elif defined(IMF_HAVE_NEON)
static void
reconstruct (uint8_t* buf, uint64_t outSize)
{
    static const uint64_t bytesPerChunk = sizeof (uint8x16_t);
    const uint64_t vOutSize = outSize / bytesPerChunk;
    const uint8x16_t c = vdupq_n_u8 (-128);
    const uint8x16_t shuffleMask = vdupq_n_u8 (15);
    const uint8x16_t zero = vdupq_n_u8 (0);
    uint8_t*   vBuf;
    uint8x16_t vPrev;
    uint8_t    prev;

    /*
     * The first element doesn't have its high bit flipped during compression,
     * so it must not be flipped here. To make the SIMD loop nice and
     * uniform, we pre-flip the bit so that the loop will unflip it again.
     */
    buf[0] += -128;
    vBuf = buf;
    vPrev = vdupq_n_u8 (0);

    for (uint64_t i = 0; i < vOutSize; ++i)
    {
        uint8x16_t d = vaddq_u8 (vld1q_u8 (vBuf), c);

        /* Compute the prefix sum of elements. */
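        /* (vextq_u8 (zero, d, 16 - n) shifts d up by n lanes, filling with
         * zeros; shifts of 1, 2, 4 and 8 complete the 16-lane prefix sum.) */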
        d = vaddq_u8 (d, vextq_u8 (zero, d, 16 - 1));
        d = vaddq_u8 (d, vextq_u8 (zero, d, 16 - 2));
        d = vaddq_u8 (d, vextq_u8 (zero, d, 16 - 4));
        d = vaddq_u8 (d, vextq_u8 (zero, d, 16 - 8));
        d = vaddq_u8 (d, vPrev);

        vst1q_u8 (vBuf, d); vBuf += sizeof (uint8x16_t);

        /* Broadcast the high byte in our result to all lanes of the prev
         * value for the next iteration. */
        vPrev = vqtbl1q_u8 (d, shuffleMask);
    }

    prev = vgetq_lane_u8 (vPrev, 15);
    for (uint64_t i = vOutSize * bytesPerChunk; i < outSize; ++i)
    {
        uint8_t d = prev + buf[i] - 128;
        buf[i] = d;
        prev = d;
    }
}
#else
static void
reconstruct (uint8_t* buf, uint64_t sz)
@@ -121,6 +174,30 @@ interleave (uint8_t* out, const uint8_t* source, uint64_t outSize)
        *(sOut++) = (i % 2 == 0) ? *(t1++) : *(t2++);
}

#elif defined(IMF_HAVE_NEON)
static void
interleave (uint8_t* out, const uint8_t* source, uint64_t outSize)
{
    static const uint64_t bytesPerChunk = 2 * sizeof (uint8x16_t);
    const uint64_t vOutSize = outSize / bytesPerChunk;
    const uint8_t* v1 = source;
    const uint8_t* v2 = source + (outSize + 1) / 2;
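    /* (First half of source feeds even output indices, second half odd.) */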

    for (uint64_t i = 0; i < vOutSize; ++i)
    {
        uint8x16_t a = vld1q_u8 (v1); v1 += sizeof (uint8x16_t);
        uint8x16_t b = vld1q_u8 (v2); v2 += sizeof (uint8x16_t);
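        /* (vzip1q/vzip2q interleave the bytes of a and b byte by byte.) */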
        uint8x16_t lo = vzip1q_u8 (a, b);
        uint8x16_t hi = vzip2q_u8 (a, b);

        vst1q_u8 (out, lo); out += sizeof (uint8x16_t);
        vst1q_u8 (out, hi); out += sizeof (uint8x16_t);
    }

    for (uint64_t i = vOutSize * bytesPerChunk; i < outSize; ++i)
        *(out++) = (i % 2 == 0) ? *(v1++) : *(v2++);
}

#else

static void
