Skip to content

Commit

Permalink
Add some small arm neon optimizations (#1847)
Browse files Browse the repository at this point in the history
* Remove unused includes

Signed-off-by: Mark Reid <mindmark@gmail.com>

* Use neon hardware support for f16 conversions

Signed-off-by: Mark Reid <mindmark@gmail.com>

* Add some small neon optimizations
use blendv,floor and fma intrinsics were possible

Signed-off-by: Mark Reid <mindmark@gmail.com>

---------

Signed-off-by: Mark Reid <mindmark@gmail.com>
  • Loading branch information
markreidvfx committed Sep 14, 2023
1 parent 16b3157 commit 14f0afa
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 7 deletions.
1 change: 0 additions & 1 deletion src/OpenColorIO/AVX.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#if OCIO_USE_AVX

#include <immintrin.h>
#include <stdio.h>

#include <OpenColorIO/OpenColorIO.h>
#include "BitDepthUtils.h"
Expand Down
1 change: 0 additions & 1 deletion src/OpenColorIO/AVX2.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#if OCIO_USE_AVX2

#include <immintrin.h>
#include <stdio.h>

#include <OpenColorIO/OpenColorIO.h>
#include "BitDepthUtils.h"
Expand Down
2 changes: 2 additions & 0 deletions src/OpenColorIO/CPUInfoConfig.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
// Relevant only for arm64 architecture.
#if defined(__aarch64__)
#cmakedefine01 OCIO_USE_SSE2NEON
#else
#define OCIO_USE_SSE2NEON 0
#endif

// On the Apple platform, a universal build is created for both x86_64 and arm64 architectures.
Expand Down
40 changes: 35 additions & 5 deletions src/OpenColorIO/SSE2.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@
#endif
#endif

#include <stdio.h>

#include <OpenColorIO/OpenColorIO.h>
#include "BitDepthUtils.h"

Expand Down Expand Up @@ -76,6 +74,8 @@ static inline void sse2RGBATranspose_4x4(__m128 row0, __m128 row1, __m128 row2,
out_a = _mm_movehl_ps(tmp3, tmp1);
}

#if !OCIO_USE_SSE2NEON

static inline __m128i sse2_blendv(__m128i a, __m128i b, __m128i mask)
{
return _mm_xor_si128(_mm_and_si128(_mm_xor_si128(a, b), mask), a);
Expand Down Expand Up @@ -164,6 +164,8 @@ static inline __m128 sse2_cvtph_ps(__m128i a)
return _mm_or_ps(o, sign);
}

#endif

// Note Packing functions perform no 0.0 - 1.0 normalization
// but perform 0 - max value clamping for integer formats
template<BitDepth BD> struct SSE2RGBAPack {};
Expand Down Expand Up @@ -290,21 +292,48 @@ struct SSE2RGBAPack<BIT_DEPTH_F16>
__m128i rgba_00_01 = _mm_loadu_si128((const __m128i*)(in + 0));
__m128i rgba_02_03 = _mm_loadu_si128((const __m128i*)(in + 8));

#if OCIO_USE_SSE2NEON
// use neon hardware support for f16 to f32
__m128 rgba0 = vreinterpretq_m128_f32(
vcvt_f32_f16(vget_low_f16(vreinterpretq_f16_s64(vreinterpretq_s64_m128i(rgba_00_01))))
);
__m128 rgba1 = vreinterpretq_m128_f32(
vcvt_f32_f16(vget_high_f16(vreinterpretq_f16_s64(vreinterpretq_s64_m128i(rgba_00_01))))
);
__m128 rgba2 = vreinterpretq_m128_f32(
vcvt_f32_f16(vget_low_f16(vreinterpretq_f16_s64(vreinterpretq_s64_m128i(rgba_02_03))))
);
__m128 rgba3 = vreinterpretq_m128_f32(
vcvt_f32_f16(vget_high_f16(vreinterpretq_f16_s64(vreinterpretq_s64_m128i(rgba_02_03))))
);
#else
__m128 rgba0 = sse2_cvtph_ps(rgba_00_01);
__m128 rgba1 = sse2_cvtph_ps(_mm_shuffle_epi32(rgba_00_01, _MM_SHUFFLE(1,0,3,2)));
__m128 rgba2 = sse2_cvtph_ps(rgba_02_03);
__m128 rgba3 = sse2_cvtph_ps(_mm_shuffle_epi32(rgba_02_03, _MM_SHUFFLE(1,0,3,2)));

#endif
sse2RGBATranspose_4x4(rgba0, rgba1, rgba2, rgba3, r, g, b, a);
}

static inline void Store(half *out, __m128 r, __m128 g, __m128 b, __m128 a)
{
__m128 rgba0, rgba1, rgba2, rgba3;
__m128i rgba;

sse2RGBATranspose_4x4(r, g, b, a, rgba0, rgba1, rgba2, rgba3);

#if OCIO_USE_SSE2NEON
// use neon hardware support for f32 to f16
float16x8_t rgba;
float16x4_t rgba00_01 = vcvt_f16_f32(vreinterpretq_f32_m128(rgba0));
float16x4_t rgba03_03 = vcvt_f16_f32(vreinterpretq_f32_m128(rgba1));
float16x4_t rgba04_05 = vcvt_f16_f32(vreinterpretq_f32_m128(rgba2));
float16x4_t rgba06_07 = vcvt_f16_f32(vreinterpretq_f32_m128(rgba3));
rgba = vcombine_f16(rgba00_01, rgba03_03);
vst1q_f16((float16_t *)(out+0), rgba);

rgba = vcombine_f16(rgba04_05, rgba06_07);
vst1q_f16((float16_t *)(out+8), rgba);
#else
__m128i rgba;
__m128i rgba00_01 = sse2_cvtps_ph(rgba0);
__m128i rgba02_03 = sse2_cvtps_ph(rgba1);
__m128i rgba04_05 = sse2_cvtps_ph(rgba2);
Expand All @@ -315,6 +344,7 @@ struct SSE2RGBAPack<BIT_DEPTH_F16>

rgba = _mm_xor_si128(rgba04_05, _mm_shuffle_epi32(rgba06_07, _MM_SHUFFLE(1,0,3,2)));
_mm_storeu_si128((__m128i*)(out+8), rgba);
#endif
}
};

Expand Down
10 changes: 10 additions & 0 deletions src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,23 @@ namespace {

static inline __m128 fmadd_ps_sse2(__m128 a, __m128 b, __m128 c)
{
#if OCIO_USE_SSE2NEON
return vreinterpretq_m128_f32(
vfmaq_f32(vreinterpretq_f32_m128(c), vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))
);
#else
return _mm_add_ps(_mm_mul_ps(a, b), c);
#endif
}

static inline __m128 floor_ps_sse2(__m128 v)
{
#if OCIO_USE_SSE2NEON
return _mm_floor_ps(v);
#else
// NOTE: using truncate cvtt
return _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
#endif
}


Expand Down
14 changes: 14 additions & 0 deletions src/OpenColorIO/ops/lut3d/Lut3DOpCPU_SSE2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,18 +38,32 @@ struct rgbavec_sse2 {

static inline __m128 floor_ps_sse2(__m128 v)
{
#if OCIO_USE_SSE2NEON
return _mm_floor_ps(v);
#else
// NOTE: using truncate cvtt
return _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
#endif
}

static inline __m128 blendv_ps_sse2(__m128 a, __m128 b, __m128 mask)
{
#if OCIO_USE_SSE2NEON
return _mm_blendv_ps(a, b, mask);
#else
return _mm_xor_ps(_mm_and_ps(_mm_xor_ps(a, b), mask), a);
#endif
}

static inline __m128 fmadd_ps_sse2(__m128 a, __m128 b, __m128 c)
{
#if OCIO_USE_SSE2NEON
return vreinterpretq_m128_f32(
vfmaq_f32(vreinterpretq_f32_m128(c), vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))
);
#else
return _mm_add_ps(_mm_mul_ps(a, b), c);
#endif
}

static inline rgbavec_sse2 interp_tetrahedral_sse2(const Lut3DContextSSE2 &ctx, __m128 r, __m128 g, __m128 b, __m128 a)
Expand Down

0 comments on commit 14f0afa

Please sign in to comment.