Skip to content
This repository has been archived by the owner on Jul 5, 2022. It is now read-only.

Commit

Permalink
Slightliy better performance due to caching changes, added neon detec…
Browse files Browse the repository at this point in the history
…tion code for android platforms
  • Loading branch information
Martin Fleisz committed Jun 28, 2011
1 parent b958b1f commit 1f201c0
Showing 1 changed file with 36 additions and 63 deletions.
99 changes: 36 additions & 63 deletions libfreerdp-rfx/neon/rfx_neon.c
Expand Up @@ -24,59 +24,20 @@

#include "rfx_neon.h"

#if defined(ANDROID_DISABLED)
#if defined(ANDROID)
#include <cpu-features.h>
#include <android/log.h>
#endif

#define CACHE_LINE_BYTES 64

static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
prefetch_buffer(char * buffer, int num_bytes)
prefetch_data(void * buffer1)
{
asm(" mov r3, %0 \t\n"
" add r4, r3, %1 \t\n"
"1: \t\n"
" pld [r3] \t\n"
" add r3, r3, #64 \t\n"
" cmp r3, r4 \t\n"
" bne 1b \t\n"
asm(" pld [%0, #64] \t\n"
: // no output
: "r" (buffer), "r" (num_bytes)
: "r3", "r4" );
: "r" (buffer1)
);
}

static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
prefetch_line(char * buffer)
{
asm(" mov r3, %0 \t\n"
" pld [r3, #0] \t\n"
: // no output
: "r" (buffer)
: "r3" );
}


static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
prefetch_buffers(char * buffer1, char * buffer2, char * buffer3, int num_bytes)
{
asm(" mov r3, %0 \t\n"
" mov r4, %1 \t\n"
" mov r5, %2 \t\n"
" mov r6, #0 \t\n"
"cache_loop2: \t\n"
" pld [r3, r6] \t\n"
" pld [r3, r6] \t\n"
" pld [r3, r6] \t\n"
" add r6, r6, #64 \t\n"
" cmp r6, %3 \t\n"
" bne cache_loop2 \t\n"
: // no output
: "r" (buffer1), "r" (buffer2), "r" (buffer3), "r" (num_bytes)
: "r3", "r4", "r5", "r6" );
}


void rfx_decode_YCbCr_to_RGB_NEON(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer)
{
int16x8_t zero = vdupq_n_s16(0);
Expand All @@ -87,11 +48,13 @@ void rfx_decode_YCbCr_to_RGB_NEON(sint16 * y_r_buffer, sint16 * cb_g_buffer, sin
int16x8_t* cb_g_buf = (int16x8_t*)cb_g_buffer;
int16x8_t* cr_b_buf = (int16x8_t*)cr_b_buffer;

prefetch_buffers((char*)y_r_buf, (char*)cb_g_buf, (char*)cr_b_buf, sizeof(sint16) * 4096);

int i;
for (i = 0; i < 4096 / 8; i++)
{
prefetch_data(&y_r_buf[i]);
prefetch_data(&cr_b_buf[i]);
prefetch_data(&cb_g_buf[i]);

int16x8_t y = vld1q_s16((sint16*)&y_r_buf[i]);
y = vaddq_s16(y, y_add);

Expand Down Expand Up @@ -130,7 +93,7 @@ void rfx_decode_YCbCr_to_RGB_NEON(sint16 * y_r_buffer, sint16 * cb_g_buffer, sin
}

static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_quantization_decode_block_NEON(sint16 * buffer, int buffer_size, uint32 factor)
rfx_quantization_decode_block_NEON(sint16 * buffer, const int buffer_size, const uint32 factor)
{
if (factor <= 6)
return;
Expand All @@ -140,6 +103,7 @@ rfx_quantization_decode_block_NEON(sint16 * buffer, int buffer_size, uint32 fact

do
{
prefetch_data(buf);
int16x8_t val = vld1q_s16((sint16*)buf);
val = vshlq_s16(val, quantFactors);
vst1q_s16((sint16*)buf, val);
Expand All @@ -151,8 +115,6 @@ rfx_quantization_decode_block_NEON(sint16 * buffer, int buffer_size, uint32 fact
void
rfx_quantization_decode_NEON(sint16 * buffer, const uint32 * quantization_values)
{
prefetch_buffer((char *) buffer, 4096 * sizeof(sint16));

rfx_quantization_decode_block_NEON(buffer, 1024, quantization_values[8]); /* HL1 */
rfx_quantization_decode_block_NEON(buffer + 1024, 1024, quantization_values[7]); /* LH1 */
rfx_quantization_decode_block_NEON(buffer + 2048, 1024, quantization_values[9]); /* HH1 */
Expand Down Expand Up @@ -182,9 +144,11 @@ rfx_dwt_2d_decode_block_horiz_NEON(sint16 * l, sint16 * h, sint16 * dst, int sub
{
// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
int16x8_t l_n = vld1q_s16(l_ptr);
prefetch_data(l_ptr);

int16x8_t h_n = vld1q_s16(h_ptr);
int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
prefetch_data(h_ptr);

if (n == 0)
{
Expand Down Expand Up @@ -212,11 +176,13 @@ rfx_dwt_2d_decode_block_horiz_NEON(sint16 * l, sint16 * h, sint16 * dst, int sub
// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);

int16x8_t h_n = vld1q_s16(h_ptr);
prefetch_data(h_ptr);

h_n = vshlq_n_s16(h_n, 1);

int16x8x2_t dst_n;
dst_n.val[0] = vld1q_s16(l_ptr);
prefetch_data(l_ptr);
int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
if (n == subband_width - 8)
{
Expand All @@ -230,6 +196,7 @@ rfx_dwt_2d_decode_block_horiz_NEON(sint16 * l, sint16 * h, sint16 * dst, int sub
dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);

vst2q_s16(dst_ptr, dst_n);
prefetch_data(dst_ptr);

l_ptr+=8;
h_ptr+=8;
Expand All @@ -253,6 +220,8 @@ rfx_dwt_2d_decode_block_vert_NEON(sint16 * l, sint16 * h, sint16 * dst, int subb
{
for (x = 0; x < total_width; x+=8)
{
prefetch_data(l_ptr);
prefetch_data(h_ptr);
// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);

int16x8_t l_n = vld1q_s16(l_ptr);
Expand All @@ -268,6 +237,7 @@ rfx_dwt_2d_decode_block_vert_NEON(sint16 * l, sint16 * h, sint16 * dst, int subb
}
tmp_n = vshrq_n_s16(tmp_n, 1);

prefetch_data(dst_ptr);
int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
vst1q_s16(dst_ptr, dst_n);

Expand All @@ -287,9 +257,12 @@ rfx_dwt_2d_decode_block_vert_NEON(sint16 * l, sint16 * h, sint16 * dst, int subb
for (x = 0; x < total_width; x+=8)
{
// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);

int16x8_t h_n = vld1q_s16(h_ptr);
int16x8_t dst_n_m = vld1q_s16(dst_ptr - total_width);

prefetch_data(h_ptr);
prefetch_data(dst_ptr - total_width);

h_n = vshlq_n_s16(h_n, 1);

int16x8_t tmp_n = dst_n_m;
Expand All @@ -304,6 +277,7 @@ rfx_dwt_2d_decode_block_vert_NEON(sint16 * l, sint16 * h, sint16 * dst, int subb

int16x8_t dst_n = vaddq_s16(tmp_n, h_n);
vst1q_s16(dst_ptr, dst_n);
prefetch_data(dst_ptr);

h_ptr+=8;
dst_ptr+=8;
Expand All @@ -318,8 +292,6 @@ rfx_dwt_2d_decode_block_NEON(sint16 * buffer, sint16 * idwt, int subband_width)
sint16 * hl, * lh, * hh, * ll;
sint16 * l_dst, * h_dst;

prefetch_buffer((char *) idwt, subband_width * 4 * sizeof(sint16));

/* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt. */
/* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
/* The lower part L uses LL(3) and HL(0). */
Expand All @@ -342,35 +314,36 @@ rfx_dwt_2d_decode_block_NEON(sint16 * buffer, sint16 * idwt, int subband_width)
}

void
rfx_dwt_2d_decode_NEON(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32)
rfx_dwt_2d_decode_NEON(sint16 * buffer, sint16 * dwt_buffer)
{
prefetch_buffer((char *) buffer, 4096 * sizeof(sint16));

rfx_dwt_2d_decode_block_NEON(buffer + 3840, dwt_buffer_8, 8);
rfx_dwt_2d_decode_block_NEON(buffer + 3072, dwt_buffer_16, 16);
rfx_dwt_2d_decode_block_NEON(buffer, dwt_buffer_32, 32);
rfx_dwt_2d_decode_block_NEON(buffer + 3840, dwt_buffer, 8);
rfx_dwt_2d_decode_block_NEON(buffer + 3072, dwt_buffer, 16);
rfx_dwt_2d_decode_block_NEON(buffer, dwt_buffer, 32);
}



int isNeonSupported()
{
#if defined(ANDROID_DISABLED)
#if defined(ANDROID)
if (android_getCpuFamily() != ANDROID_CPU_FAMILY_ARM)
{
_android_log_print(ANDROID_LOG_INFO, "freerdp", "NEON optimization disabled - No ARM CPU found");
DEBUG_RFX("NEON optimization disabled - No ARM CPU found");
return 0;
}

features = android_getCpuFeatures();
uint64_t features = android_getCpuFeatures();
if ((features & ANDROID_CPU_ARM_FEATURE_ARMv7))
{
if (features & ANDROID_CPU_ARM_FEATURE_NEON)
{
DEBUG_RFX("NEON optimization enabled!");
return 1;
_android_log_print(ANDROID_LOG_INFO, "freerdp", "NEON optimization disabled - CPU not NEON capable");
}
DEBUG_RFX("NEON optimization disabled - CPU not NEON capable");
}
else
_android_log_print(ANDROID_LOG_INFO, "freerdp", "NEON optimization disabled - No ARMv7 CPU found");
DEBUG_RFX("NEON optimization disabled - No ARMv7 CPU found");

return 0;
#else
Expand Down

0 comments on commit 1f201c0

Please sign in to comment.