@@ -1,4 +1,4 @@
-APP_PLATFORM := android-9
+APP_PLATFORM := android-14
 APP_ABI := armeabi armeabi-v7a
 NDK_TOOLCHAIN_VERSION := 4.9
 APP_STL := gnustl_static
| @@ -0,0 +1,174 @@ | ||
| /* Copyright (c) 2015 Xiph.Org Foundation | ||
| Written by Viswanath Puttagunta */ | ||
| /** | ||
| @file celt_ne10_fft.c | ||
| @brief ARM Neon optimizations for fft using NE10 library | ||
| */ | ||
| /* | ||
| Redistribution and use in source and binary forms, with or without | ||
| modification, are permitted provided that the following conditions | ||
| are met: | ||
| - Redistributions of source code must retain the above copyright | ||
| notice, this list of conditions and the following disclaimer. | ||
| - Redistributions in binary form must reproduce the above copyright | ||
| notice, this list of conditions and the following disclaimer in the | ||
| documentation and/or other materials provided with the distribution. | ||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
| ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
| OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| */ | ||
| #ifndef SKIP_CONFIG_H | ||
| #ifdef HAVE_CONFIG_H | ||
| #include "config.h" | ||
| #endif | ||
| #endif | ||
| #include <NE10_init.h> | ||
| #include <NE10_dsp.h> | ||
| #include "os_support.h" | ||
| #include "kiss_fft.h" | ||
| #include "stack_alloc.h" | ||
/* Map the generic NE10_FFT_* names used below onto the float32 or int32
 * NE10 API depending on whether Opus is built fixed-point.
 * (Fix: the fixed-point branch defined NE10_FFT_DESTROY_C2C_TYPE twice;
 * the duplicate line has been removed.) */
#if !defined(FIXED_POINT)
# define NE10_FFT_ALLOC_C2C_TYPE_NEON ne10_fft_alloc_c2c_float32_neon
# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_float32_t
# define NE10_FFT_STATE_TYPE_T ne10_fft_state_float32_t
# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_float32
# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_float32_t
# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_float32_neon
#else
# define NE10_FFT_ALLOC_C2C_TYPE_NEON(nfft) ne10_fft_alloc_c2c_int32_neon(nfft)
# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_int32_t
# define NE10_FFT_STATE_TYPE_T ne10_fft_state_int32_t
# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_int32
# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_int32_t
# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_int32_neon
#endif
#if defined(CUSTOM_MODES)

/* nfft lengths in NE10 that support scaled fft */
/* Any other transform size falls back to the generic C FFT at runtime
 * (see opus_fft_alloc_arm_neon below). */
# define NE10_FFTSCALED_SUPPORT_MAX 4
static const int ne10_fft_scaled_support[NE10_FFTSCALED_SUPPORT_MAX] = {
   480, 240, 120, 60
};
| int opus_fft_alloc_arm_neon(kiss_fft_state *st) | ||
| { | ||
| int i; | ||
| size_t memneeded = sizeof(struct arch_fft_state); | ||
| st->arch_fft = (arch_fft_state *)opus_alloc(memneeded); | ||
| if (!st->arch_fft) | ||
| return -1; | ||
| for (i = 0; i < NE10_FFTSCALED_SUPPORT_MAX; i++) { | ||
| if(st->nfft == ne10_fft_scaled_support[i]) | ||
| break; | ||
| } | ||
| if (i == NE10_FFTSCALED_SUPPORT_MAX) { | ||
| /* This nfft length (scaled fft) is not supported in NE10 */ | ||
| st->arch_fft->is_supported = 0; | ||
| st->arch_fft->priv = NULL; | ||
| } | ||
| else { | ||
| st->arch_fft->is_supported = 1; | ||
| st->arch_fft->priv = (void *)NE10_FFT_ALLOC_C2C_TYPE_NEON(st->nfft); | ||
| if (st->arch_fft->priv == NULL) { | ||
| return -1; | ||
| } | ||
| } | ||
| return 0; | ||
| } | ||
| void opus_fft_free_arm_neon(kiss_fft_state *st) | ||
| { | ||
| NE10_FFT_CFG_TYPE_T cfg; | ||
| if (!st->arch_fft) | ||
| return; | ||
| cfg = (NE10_FFT_CFG_TYPE_T)st->arch_fft->priv; | ||
| if (cfg) | ||
| NE10_FFT_DESTROY_C2C_TYPE(cfg); | ||
| opus_free(st->arch_fft); | ||
| } | ||
| #endif | ||
/* Forward FFT entry point for the NE10 path.  Falls back to the generic C
 * FFT when this transform size was flagged as unsupported at alloc time. */
void opus_fft_neon(const kiss_fft_state *st,
                   const kiss_fft_cpx *fin,
                   kiss_fft_cpx *fout)
{
   NE10_FFT_STATE_TYPE_T state;
   NE10_FFT_CFG_TYPE_T cfg = &state;
   VARDECL(NE10_FFT_CPX_TYPE_T, buffer);
   SAVE_STACK;

   /* Scratch buffer handed to NE10 via state.buffer below. */
   ALLOC(buffer, st->nfft, NE10_FFT_CPX_TYPE_T);

   if (!st->arch_fft->is_supported) {
      /* This nfft length (scaled fft) not supported in NE10 */
      opus_fft_c(st, fin, fout);
   }
   else {
      /* Copy the shared, preallocated NE10 state onto the stack so we can
       * point its work buffer at our scratch allocation without mutating
       * the configuration stored in st->arch_fft->priv. */
      memcpy((void *)cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));
      state.buffer = (NE10_FFT_CPX_TYPE_T *)&buffer[0];
#if !defined(FIXED_POINT)
      state.is_forward_scaled = 1;

      /* Final argument 0 selects the forward direction (the inverse path
       * in opus_ifft_neon passes 1). */
      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
                                (NE10_FFT_CPX_TYPE_T *)fin,
                                cfg, 0);
#else
      /* Trailing args: direction = 0 (forward), scaling flag = 1
       * (presumably NE10's scaled fixed-point transform — see NE10 docs). */
      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
                                (NE10_FFT_CPX_TYPE_T *)fin,
                                cfg, 0, 1);
#endif
   }
   RESTORE_STACK;
}
/* Inverse FFT entry point for the NE10 path.  Mirrors opus_fft_neon():
 * same state-copy trick, with the direction flag set to inverse. */
void opus_ifft_neon(const kiss_fft_state *st,
                    const kiss_fft_cpx *fin,
                    kiss_fft_cpx *fout)
{
   NE10_FFT_STATE_TYPE_T state;
   NE10_FFT_CFG_TYPE_T cfg = &state;
   VARDECL(NE10_FFT_CPX_TYPE_T, buffer);
   SAVE_STACK;

   /* Scratch buffer handed to NE10 via state.buffer below. */
   ALLOC(buffer, st->nfft, NE10_FFT_CPX_TYPE_T);

   if (!st->arch_fft->is_supported) {
      /* This nfft length (scaled fft) not supported in NE10 */
      opus_ifft_c(st, fin, fout);
   }
   else {
      /* Stack copy of the shared NE10 state; see opus_fft_neon for why. */
      memcpy((void *)cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));
      state.buffer = (NE10_FFT_CPX_TYPE_T *)&buffer[0];
#if !defined(FIXED_POINT)
      /* Opus applies its own scaling, so disable NE10's 1/N on the
       * backward transform. */
      state.is_backward_scaled = 0;

      /* Final argument 1 selects the inverse direction. */
      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
                                (NE10_FFT_CPX_TYPE_T *)fin,
                                cfg, 1);
#else
      /* Trailing args: direction = 1 (inverse), scaling flag = 0. */
      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
                                (NE10_FFT_CPX_TYPE_T *)fin,
                                cfg, 1, 0);
#endif
   }
   RESTORE_STACK;
}
| @@ -0,0 +1,258 @@ | ||
| /* Copyright (c) 2015 Xiph.Org Foundation | ||
| Written by Viswanath Puttagunta */ | ||
| /** | ||
| @file celt_ne10_mdct.c | ||
| @brief ARM Neon optimizations for mdct using NE10 library | ||
| */ | ||
| /* | ||
| Redistribution and use in source and binary forms, with or without | ||
| modification, are permitted provided that the following conditions | ||
| are met: | ||
| - Redistributions of source code must retain the above copyright | ||
| notice, this list of conditions and the following disclaimer. | ||
| - Redistributions in binary form must reproduce the above copyright | ||
| notice, this list of conditions and the following disclaimer in the | ||
| documentation and/or other materials provided with the distribution. | ||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
| ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
| OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| */ | ||
| #ifndef SKIP_CONFIG_H | ||
| #ifdef HAVE_CONFIG_H | ||
| #include "config.h" | ||
| #endif | ||
| #endif | ||
| #include "kiss_fft.h" | ||
| #include "_kiss_fft_guts.h" | ||
| #include "mdct.h" | ||
| #include "stack_alloc.h" | ||
/* Forward MDCT on top of the NE10-backed complex FFT.  Same contract as
 * the generic clt_mdct_forward() in mdct.c: the input array is trashed,
 * and `shift` selects one of the precomputed half-size transforms.
 * Pipeline: window+fold the 4 input blocks into N/2 reals, pre-rotate
 * into N/4 complex values, FFT, then post-rotate into `out`. */
void clt_mdct_forward_neon(const mdct_lookup *l,
                           kiss_fft_scalar *in,
                           kiss_fft_scalar * OPUS_RESTRICT out,
                           const opus_val16 *window,
                           int overlap, int shift, int stride, int arch)
{
   int i;
   int N, N2, N4;
   VARDECL(kiss_fft_scalar, f);
   VARDECL(kiss_fft_cpx, f2);
   const kiss_fft_state *st = l->kfft[shift];
   const kiss_twiddle_scalar *trig;
   SAVE_STACK;

   N = l->n;
   trig = l->trig;
   /* Each shift halves the transform; the twiddle tables for the smaller
    * sizes are stored back-to-back after the full-size table. */
   for (i=0;i<shift;i++)
   {
      N >>= 1;
      trig += N;
   }
   N2 = N>>1;
   N4 = N>>2;

   ALLOC(f, N2, kiss_fft_scalar);
   ALLOC(f2, N4, kiss_fft_cpx);

   /* Consider the input to be composed of four blocks: [a, b, c, d] */
   /* Window, shuffle, fold */
   {
      /* Temp pointers to make it really clear to the compiler what we're doing */
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
      const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
      const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
      /* First quarter: inside the overlap region, so apply the window. */
      for(i=0;i<((overlap+3)>>2);i++)
      {
         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
         *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);
         *yp++ = MULT16_32_Q15(*wp1, *xp1) - MULT16_32_Q15(*wp2, xp2[-N2]);
         xp1+=2;
         xp2-=2;
         wp1+=2;
         wp2-=2;
      }
      wp1 = window;
      wp2 = window+overlap-1;
      /* Middle section: outside the overlap, no windowing required. */
      for(;i<N4-((overlap+3)>>2);i++)
      {
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
         *yp++ = *xp2;
         *yp++ = *xp1;
         xp1+=2;
         xp2-=2;
      }
      /* Last quarter: back inside the overlap, window again. */
      for(;i<N4;i++)
      {
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
         *yp++ = -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2);
         *yp++ = MULT16_32_Q15(*wp2, *xp1) + MULT16_32_Q15(*wp1, xp2[N2]);
         xp1+=2;
         xp2-=2;
         wp1+=2;
         wp2-=2;
      }
   }
   /* Pre-rotation */
   {
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
      const kiss_twiddle_scalar *t = &trig[0];
      for(i=0;i<N4;i++)
      {
         kiss_fft_cpx yc;
         kiss_twiddle_scalar t0, t1;
         kiss_fft_scalar re, im, yr, yi;
         t0 = t[i];
         t1 = t[N4+i];
         re = *yp++;
         im = *yp++;
         /* Complex multiply of (re,im) by the twiddle (t0,t1). */
         yr = S_MUL(re,t0) - S_MUL(im,t1);
         yi = S_MUL(im,t0) + S_MUL(re,t1);
         yc.r = yr;
         yc.i = yi;
         f2[i] = yc;
      }
   }
   /* N/4 complex FFT (dispatches to the NE10 path when supported). */
   opus_fft(st, f2, (kiss_fft_cpx *)f, arch);
   /* Post-rotate */
   {
      /* Temp pointers to make it really clear to the compiler what we're doing */
      const kiss_fft_cpx * OPUS_RESTRICT fp = (kiss_fft_cpx *)f;
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
      const kiss_twiddle_scalar *t = &trig[0];
      /* Write the rotated outputs from both ends of `out` at once. */
      for(i=0;i<N4;i++)
      {
         kiss_fft_scalar yr, yi;
         yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);
         yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);
         *yp1 = yr;
         *yp2 = yi;
         fp++;
         yp1 += 2*stride;
         yp2 -= 2*stride;
      }
   }
   RESTORE_STACK;
}
| void clt_mdct_backward_neon(const mdct_lookup *l, | ||
| kiss_fft_scalar *in, | ||
| kiss_fft_scalar * OPUS_RESTRICT out, | ||
| const opus_val16 * OPUS_RESTRICT window, | ||
| int overlap, int shift, int stride, int arch) | ||
| { | ||
| int i; | ||
| int N, N2, N4; | ||
| VARDECL(kiss_fft_scalar, f); | ||
| const kiss_twiddle_scalar *trig; | ||
| const kiss_fft_state *st = l->kfft[shift]; | ||
| N = l->n; | ||
| trig = l->trig; | ||
| for (i=0;i<shift;i++) | ||
| { | ||
| N >>= 1; | ||
| trig += N; | ||
| } | ||
| N2 = N>>1; | ||
| N4 = N>>2; | ||
| ALLOC(f, N2, kiss_fft_scalar); | ||
| /* Pre-rotate */ | ||
| { | ||
| /* Temp pointers to make it really clear to the compiler what we're doing */ | ||
| const kiss_fft_scalar * OPUS_RESTRICT xp1 = in; | ||
| const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1); | ||
| kiss_fft_scalar * OPUS_RESTRICT yp = f; | ||
| const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0]; | ||
| for(i=0;i<N4;i++) | ||
| { | ||
| kiss_fft_scalar yr, yi; | ||
| yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]); | ||
| yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]); | ||
| yp[2*i] = yr; | ||
| yp[2*i+1] = yi; | ||
| xp1+=2*stride; | ||
| xp2-=2*stride; | ||
| } | ||
| } | ||
| opus_ifft(st, (kiss_fft_cpx *)f, (kiss_fft_cpx*)(out+(overlap>>1)), arch); | ||
| /* Post-rotate and de-shuffle from both ends of the buffer at once to make | ||
| it in-place. */ | ||
| { | ||
| kiss_fft_scalar * yp0 = out+(overlap>>1); | ||
| kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2; | ||
| const kiss_twiddle_scalar *t = &trig[0]; | ||
| /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the | ||
| middle pair will be computed twice. */ | ||
| for(i=0;i<(N4+1)>>1;i++) | ||
| { | ||
| kiss_fft_scalar re, im, yr, yi; | ||
| kiss_twiddle_scalar t0, t1; | ||
| re = yp0[0]; | ||
| im = yp0[1]; | ||
| t0 = t[i]; | ||
| t1 = t[N4+i]; | ||
| /* We'd scale up by 2 here, but instead it's done when mixing the windows */ | ||
| yr = S_MUL(re,t0) + S_MUL(im,t1); | ||
| yi = S_MUL(re,t1) - S_MUL(im,t0); | ||
| re = yp1[0]; | ||
| im = yp1[1]; | ||
| yp0[0] = yr; | ||
| yp1[1] = yi; | ||
| t0 = t[(N4-i-1)]; | ||
| t1 = t[(N2-i-1)]; | ||
| /* We'd scale up by 2 here, but instead it's done when mixing the windows */ | ||
| yr = S_MUL(re,t0) + S_MUL(im,t1); | ||
| yi = S_MUL(re,t1) - S_MUL(im,t0); | ||
| yp1[0] = yr; | ||
| yp0[1] = yi; | ||
| yp0 += 2; | ||
| yp1 -= 2; | ||
| } | ||
| } | ||
| /* Mirror on both sides for TDAC */ | ||
| { | ||
| kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1; | ||
| kiss_fft_scalar * OPUS_RESTRICT yp1 = out; | ||
| const opus_val16 * OPUS_RESTRICT wp1 = window; | ||
| const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1; | ||
| for(i = 0; i < overlap/2; i++) | ||
| { | ||
| kiss_fft_scalar x1, x2; | ||
| x1 = *xp1; | ||
| x2 = *yp1; | ||
| *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1); | ||
| *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1); | ||
| wp1++; | ||
| wp2--; | ||
| } | ||
| } | ||
| RESTORE_STACK; | ||
| } |
| @@ -0,0 +1,311 @@ | ||
| /* Copyright (c) 2014-2015 Xiph.Org Foundation | ||
| Written by Viswanath Puttagunta */ | ||
| /** | ||
| @file celt_neon_intr.c | ||
| @brief ARM Neon Intrinsic optimizations for celt | ||
| */ | ||
| /* | ||
| Redistribution and use in source and binary forms, with or without | ||
| modification, are permitted provided that the following conditions | ||
| are met: | ||
| - Redistributions of source code must retain the above copyright | ||
| notice, this list of conditions and the following disclaimer. | ||
| - Redistributions in binary form must reproduce the above copyright | ||
| notice, this list of conditions and the following disclaimer in the | ||
| documentation and/or other materials provided with the distribution. | ||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
| ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
| OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| */ | ||
| #ifdef HAVE_CONFIG_H | ||
| #include "config.h" | ||
| #endif | ||
#include <arm_neon.h>
#include <stdint.h>

#include "../pitch.h"
| #if defined(FIXED_POINT) | ||
/* Fixed-point NEON kernel: accumulates four cross-correlation lags at once,
 * sum[k] += x[j]*y[j+k] for k = 0..3, over j = 0..len-1.  The main loop
 * consumes 8 samples of x per iteration and uses vext to form the four
 * shifted views of y needed for the four lags. */
void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
{
   int j;
   int32x4_t a = vld1q_s32(sum);
   /* Load y[0...3] */
   /* This requires len>0 to always be valid (which we assert in the C code). */
   int16x4_t y0 = vld1_s16(y);
   y += 4;

   for (j = 0; j + 8 <= len; j += 8)
   {
      /* Load x[0...7] */
      int16x8_t xx = vld1q_s16(x);
      int16x4_t x0 = vget_low_s16(xx);
      int16x4_t x4 = vget_high_s16(xx);
      /* Load y[4...11] */
      int16x8_t yy = vld1q_s16(y);
      int16x4_t y4 = vget_low_s16(yy);
      int16x4_t y8 = vget_high_s16(yy);
      /* Lag-aligned multiply-accumulate: one x lane against a 4-wide
       * window of y, sliding the window by one sample each pair. */
      int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0);
      int32x4_t a1 = vmlal_lane_s16(a0, y4, x4, 0);
      int16x4_t y1 = vext_s16(y0, y4, 1);
      int16x4_t y5 = vext_s16(y4, y8, 1);
      int32x4_t a2 = vmlal_lane_s16(a1, y1, x0, 1);
      int32x4_t a3 = vmlal_lane_s16(a2, y5, x4, 1);
      int16x4_t y2 = vext_s16(y0, y4, 2);
      int16x4_t y6 = vext_s16(y4, y8, 2);
      int32x4_t a4 = vmlal_lane_s16(a3, y2, x0, 2);
      int32x4_t a5 = vmlal_lane_s16(a4, y6, x4, 2);
      int16x4_t y3 = vext_s16(y0, y4, 3);
      int16x4_t y7 = vext_s16(y4, y8, 3);
      int32x4_t a6 = vmlal_lane_s16(a5, y3, x0, 3);
      int32x4_t a7 = vmlal_lane_s16(a6, y7, x4, 3);
      /* Carry the top half of y forward so the next iteration starts with
       * the correct 4-sample window. */
      y0 = y8;
      a = a7;
      x += 8;
      y += 8;
   }

   /* Scalar-ish tail: one x sample per iteration against the current
    * 4-wide y window. */
   for (; j < len; j++)
   {
      int16x4_t x0 = vld1_dup_s16(x);  /* load next x */
      int32x4_t a0 = vmlal_s16(a, y0, x0);

      int16x4_t y4 = vld1_dup_s16(y);  /* load next y */
      y0 = vext_s16(y0, y4, 1);
      a = a0;
      x++;
      y++;
   }

   vst1q_s32(sum, a);
}
| #else | ||
| /* | ||
| * Function: xcorr_kernel_neon_float | ||
| * --------------------------------- | ||
| * Computes 4 correlation values and stores them in sum[4] | ||
| */ | ||
/*
 * Function: xcorr_kernel_neon_float
 * ---------------------------------
 * Computes 4 correlation values and stores them in sum[4]
 * (sum[k] = dot(x, y+k) for k = 0..3, length `len`; len must be > 0).
 */
static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y,
                                    float32_t sum[4], int len) {
   float32x4_t YY[3];
   float32x4_t YEXT[3];
   float32x4_t XX[2];
   float32x2_t XX_2;
   float32x4_t SUMM;
   const float32_t *xi = x;
   const float32_t *yi = y;

   celt_assert(len>0);

   YY[0] = vld1q_f32(yi);
   SUMM = vdupq_n_f32(0);

   /* Consume 8 elements in x vector and 12 elements in y
    * vector. However, the 12'th element never really gets
    * touched in this loop. So, if len == 8, then we only
    * must access y[0] to y[10]. y[11] must not be accessed
    * hence make sure len > 8 and not len >= 8
    */
   while (len > 8) {
      yi += 4;
      YY[1] = vld1q_f32(yi);
      yi += 4;
      YY[2] = vld1q_f32(yi);

      XX[0] = vld1q_f32(xi);
      xi += 4;
      XX[1] = vld1q_f32(xi);
      xi += 4;

      /* For each x lane, accumulate against a y window shifted by the
       * lane index; vextq builds the shifted windows from YY pairs. */
      SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
      YEXT[0] = vextq_f32(YY[0], YY[1], 1);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
      YEXT[1] = vextq_f32(YY[0], YY[1], 2);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
      YEXT[2] = vextq_f32(YY[0], YY[1], 3);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);

      SUMM = vmlaq_lane_f32(SUMM, YY[1], vget_low_f32(XX[1]), 0);
      YEXT[0] = vextq_f32(YY[1], YY[2], 1);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[1]), 1);
      YEXT[1] = vextq_f32(YY[1], YY[2], 2);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[1]), 0);
      YEXT[2] = vextq_f32(YY[1], YY[2], 3);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[1]), 1);

      YY[0] = YY[2];
      len -= 8;
   }

   /* Consume 4 elements in x vector and 8 elements in y
    * vector. However, the 8'th element in y never really gets
    * touched in this loop. So, if len == 4, then we only
    * must access y[0] to y[6]. y[7] must not be accessed
    * hence make sure len>4 and not len>=4
    */
   if (len > 4) {
      yi += 4;
      YY[1] = vld1q_f32(yi);

      XX[0] = vld1q_f32(xi);
      xi += 4;

      SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
      YEXT[0] = vextq_f32(YY[0], YY[1], 1);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
      YEXT[1] = vextq_f32(YY[0], YY[1], 2);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
      YEXT[2] = vextq_f32(YY[0], YY[1], 3);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);

      YY[0] = YY[1];
      len -= 4;
   }

   /* Tail: one x sample at a time, reloading the 4-wide y window each
    * step.  Runs len-1 times; the final sample is handled below. */
   while (--len > 0) {
      XX_2 = vld1_dup_f32(xi++);
      SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
      YY[0]= vld1q_f32(++yi);
   }

   XX_2 = vld1_dup_f32(xi);
   SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);

   vst1q_f32(sum, SUMM);
}
| /* | ||
| * Function: xcorr_kernel_neon_float_process1 | ||
| * --------------------------------- | ||
| * Computes single correlation values and stores in *sum | ||
| */ | ||
/*
 * Function: xcorr_kernel_neon_float_process1
 * ---------------------------------
 * Computes single correlation values and stores in *sum
 * (*sum = dot(x, y) over `len` samples; used for the tail lags when
 * max_pitch is not a multiple of 4).
 */
static void xcorr_kernel_neon_float_process1(const float32_t *x,
                                             const float32_t *y, float32_t *sum, int len) {
   float32x4_t XX[4];
   float32x4_t YY[4];
   float32x2_t XX_2;
   float32x2_t YY_2;
   float32x4_t SUMM;
   float32x2_t SUMM_2[2];
   const float32_t *xi = x;
   const float32_t *yi = y;

   SUMM = vdupq_n_f32(0);

   /* Work on 16 values per iteration */
   while (len >= 16) {
      XX[0] = vld1q_f32(xi);
      xi += 4;
      XX[1] = vld1q_f32(xi);
      xi += 4;
      XX[2] = vld1q_f32(xi);
      xi += 4;
      XX[3] = vld1q_f32(xi);
      xi += 4;

      YY[0] = vld1q_f32(yi);
      yi += 4;
      YY[1] = vld1q_f32(yi);
      yi += 4;
      YY[2] = vld1q_f32(yi);
      yi += 4;
      YY[3] = vld1q_f32(yi);
      yi += 4;

      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
      SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
      SUMM = vmlaq_f32(SUMM, YY[2], XX[2]);
      SUMM = vmlaq_f32(SUMM, YY[3], XX[3]);
      len -= 16;
   }

   /* Work on 8 values */
   if (len >= 8) {
      XX[0] = vld1q_f32(xi);
      xi += 4;
      XX[1] = vld1q_f32(xi);
      xi += 4;

      YY[0] = vld1q_f32(yi);
      yi += 4;
      YY[1] = vld1q_f32(yi);
      yi += 4;

      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
      SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
      len -= 8;
   }

   /* Work on 4 values */
   if (len >= 4) {
      XX[0] = vld1q_f32(xi);
      xi += 4;
      YY[0] = vld1q_f32(yi);
      yi += 4;
      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
      len -= 4;
   }

   /* Start accumulating results: fold the 4-lane partial sums down to
    * a single lane via pairwise adds. */
   SUMM_2[0] = vget_low_f32(SUMM);
   if (len >= 2) {
      /* While at it, consume 2 more values if available */
      XX_2 = vld1_f32(xi);
      xi += 2;
      YY_2 = vld1_f32(yi);
      yi += 2;
      SUMM_2[0] = vmla_f32(SUMM_2[0], YY_2, XX_2);
      len -= 2;
   }
   SUMM_2[1] = vget_high_f32(SUMM);
   SUMM_2[0] = vadd_f32(SUMM_2[0], SUMM_2[1]);
   SUMM_2[0] = vpadd_f32(SUMM_2[0], SUMM_2[0]);
   /* Ok, now we have result accumulated in SUMM_2[0].0 */

   if (len > 0) {
      /* Case when you have one value left */
      XX_2 = vld1_dup_f32(xi);
      YY_2 = vld1_dup_f32(yi);
      SUMM_2[0] = vmla_f32(SUMM_2[0], XX_2, YY_2);
   }

   vst1_lane_f32(sum, SUMM_2[0], 0);
}
| void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y, | ||
| opus_val32 *xcorr, int len, int max_pitch) { | ||
| int i; | ||
| celt_assert(max_pitch > 0); | ||
| celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0); | ||
| for (i = 0; i < (max_pitch-3); i += 4) { | ||
| xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i, | ||
| (float32_t *)xcorr+i, len); | ||
| } | ||
| /* In case max_pitch isn't multiple of 4 | ||
| * compute single correlation value per iteration | ||
| */ | ||
| for (; i < max_pitch; i++) { | ||
| xcorr_kernel_neon_float_process1((const float32_t *)_x, | ||
| (const float32_t *)_y+i, (float32_t *)xcorr+i, len); | ||
| } | ||
| } | ||
| #endif |
| @@ -0,0 +1,72 @@ | ||
| /* Copyright (c) 2015 Xiph.Org Foundation | ||
| Written by Viswanath Puttagunta */ | ||
| /** | ||
| @file fft_arm.h | ||
| @brief ARM Neon Intrinsic optimizations for fft using NE10 library | ||
| */ | ||
| /* | ||
| Redistribution and use in source and binary forms, with or without | ||
| modification, are permitted provided that the following conditions | ||
| are met: | ||
| - Redistributions of source code must retain the above copyright | ||
| notice, this list of conditions and the following disclaimer. | ||
| - Redistributions in binary form must reproduce the above copyright | ||
| notice, this list of conditions and the following disclaimer in the | ||
| documentation and/or other materials provided with the distribution. | ||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
| ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
| OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| */ | ||
#if !defined(FFT_ARM_H)
#define FFT_ARM_H

#include "config.h"
#include "kiss_fft.h"

#if defined(HAVE_ARM_NE10)

/* NE10-backed FFT entry points; implemented in celt_ne10_fft.c. */
int opus_fft_alloc_arm_neon(kiss_fft_state *st);
void opus_fft_free_arm_neon(kiss_fft_state *st);

void opus_fft_neon(const kiss_fft_state *st,
                   const kiss_fft_cpx *fin,
                   kiss_fft_cpx *fout);

void opus_ifft_neon(const kiss_fft_state *st,
                    const kiss_fft_cpx *fin,
                    kiss_fft_cpx *fout);

#if !defined(OPUS_HAVE_RTCD)
/* Without runtime CPU detection, bind the generic FFT names directly to
 * the NEON implementations at compile time (the arch argument becomes a
 * no-op). */
#define OVERRIDE_OPUS_FFT (1)

#define opus_fft_alloc_arch(_st, arch) \
   ((void)(arch), opus_fft_alloc_arm_neon(_st))

#define opus_fft_free_arch(_st, arch) \
   ((void)(arch), opus_fft_free_arm_neon(_st))

#define opus_fft(_st, _fin, _fout, arch) \
   ((void)(arch), opus_fft_neon(_st, _fin, _fout))

#define opus_ifft(_st, _fin, _fout, arch) \
   ((void)(arch), opus_ifft_neon(_st, _fin, _fout))

#endif /* OPUS_HAVE_RTCD */

#endif /* HAVE_ARM_NE10 */

#endif
| @@ -0,0 +1,35 @@ | ||
| /* Copyright (C) 2015 Vidyo */ | ||
| /* | ||
| Redistribution and use in source and binary forms, with or without | ||
| modification, are permitted provided that the following conditions | ||
| are met: | ||
| - Redistributions of source code must retain the above copyright | ||
| notice, this list of conditions and the following disclaimer. | ||
| - Redistributions in binary form must reproduce the above copyright | ||
| notice, this list of conditions and the following disclaimer in the | ||
| documentation and/or other materials provided with the distribution. | ||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
| ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
| OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| */ | ||
#ifndef FIXED_ARM64_H
#define FIXED_ARM64_H

#include <arm_neon.h>

/* Override the generic SIG2WORD16() with a single saturating narrow:
 * vqmovns_s32() clamps the shifted 32-bit signal value into int16 range
 * (AArch64 scalar NEON intrinsic). */
#undef SIG2WORD16
#define SIG2WORD16(x) (vqmovns_s32(PSHR32((x), SIG_SHIFT)))

#endif
| @@ -0,0 +1,60 @@ | ||
| /* Copyright (c) 2015 Xiph.Org Foundation | ||
| Written by Viswanath Puttagunta */ | ||
| /** | ||
| @file arm_mdct.h | ||
| @brief ARM Neon Intrinsic optimizations for mdct using NE10 library | ||
| */ | ||
| /* | ||
| Redistribution and use in source and binary forms, with or without | ||
| modification, are permitted provided that the following conditions | ||
| are met: | ||
| - Redistributions of source code must retain the above copyright | ||
| notice, this list of conditions and the following disclaimer. | ||
| - Redistributions in binary form must reproduce the above copyright | ||
| notice, this list of conditions and the following disclaimer in the | ||
| documentation and/or other materials provided with the distribution. | ||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
| ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
| OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| */ | ||
#if !defined(MDCT_ARM_H)
#define MDCT_ARM_H

#include "config.h"
#include "mdct.h"

#if defined(HAVE_ARM_NE10)

/** Compute a forward MDCT and scale by 4/N, trashes the input array */
void clt_mdct_forward_neon(const mdct_lookup *l, kiss_fft_scalar *in,
                           kiss_fft_scalar * OPUS_RESTRICT out,
                           const opus_val16 *window, int overlap,
                           int shift, int stride, int arch);

/* Inverse counterpart of clt_mdct_forward_neon(); implemented in
 * celt_ne10_mdct.c. */
void clt_mdct_backward_neon(const mdct_lookup *l, kiss_fft_scalar *in,
                            kiss_fft_scalar * OPUS_RESTRICT out,
                            const opus_val16 *window, int overlap,
                            int shift, int stride, int arch);

#if !defined(OPUS_HAVE_RTCD)
/* Without runtime CPU detection, bind the generic MDCT names directly to
 * the NEON implementations at compile time. */
#define OVERRIDE_OPUS_MDCT (1)

#define clt_mdct_forward(_l, _in, _out, _window, _int, _shift, _stride, _arch) \
   clt_mdct_forward_neon(_l, _in, _out, _window, _int, _shift, _stride, _arch)

#define clt_mdct_backward(_l, _in, _out, _window, _int, _shift, _stride, _arch) \
   clt_mdct_backward_neon(_l, _in, _out, _window, _int, _shift, _stride, _arch)

#endif /* OPUS_HAVE_RTCD */

#endif /* HAVE_ARM_NE10 */

#endif