/* ===== File boundary: SSE4.1 fixed-point celt_fir() implementation ===== */
| /* Copyright (c) 2014, Cisco Systems, INC | ||
| Written by XiangMingZhu WeiZhou MinPeng YanWang | ||
| Redistribution and use in source and binary forms, with or without | ||
| modification, are permitted provided that the following conditions | ||
| are met: | ||
| - Redistributions of source code must retain the above copyright | ||
| notice, this list of conditions and the following disclaimer. | ||
| - Redistributions in binary form must reproduce the above copyright | ||
| notice, this list of conditions and the following disclaimer in the | ||
| documentation and/or other materials provided with the distribution. | ||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
| ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
| OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| */ | ||
| #ifdef HAVE_CONFIG_H | ||
| #include "config.h" | ||
| #endif | ||
| #include <xmmintrin.h> | ||
| #include <emmintrin.h> | ||
| #include <smmintrin.h> | ||
| #include "celt_lpc.h" | ||
| #include "stack_alloc.h" | ||
| #include "mathops.h" | ||
| #include "pitch.h" | ||
| #include "x86cpu.h" | ||
#if defined(FIXED_POINT)
/* SSE4.1 implementation of celt_fir() for fixed-point builds.
   Direct-form FIR: _y[i] = _x[i] + sum_{k=0..ord-1} num[k]*x[i-k-1], rounded
   back down by SIG_SHIFT.  "mem" holds the last "ord" input samples (most
   recent first) on entry and is refreshed with the newest samples on exit,
   so consecutive calls filter a continuous signal.  "arch" is forwarded to
   xcorr_kernel() so the correlation can use its own optimized variant. */
void celt_fir_sse4_1(const opus_val16 *_x,
         const opus_val16 *num,
         opus_val16 *_y,
         int N,
         int ord,
         opus_val16 *mem,
         int arch)
{
    int i,j;
    VARDECL(opus_val16, rnum);   /* filter coefficients, reversed */
    VARDECL(opus_val16, x);      /* [history | input], length N+ord */
    __m128i vecNoA;
    opus_int32 noA ;
    SAVE_STACK;

    ALLOC(rnum, ord, opus_val16);
    ALLOC(x, N+ord, opus_val16);
    /* Reverse the coefficients so the convolution becomes a correlation,
       which lets us reuse xcorr_kernel(). */
    for(i=0;i<ord;i++)
       rnum[i] = num[ord-i-1];
    /* Prepend the saved state (reversed back into chronological order). */
    for(i=0;i<ord;i++)
       x[i] = mem[ord-i-1];
    /* Copy the new input after the history; unrolled by 8 then 4 purely as
       a copy-speed optimization. */
    for (i=0;i<N-7;i+=8)
    {
       x[i+ord  ]=_x[i  ];
       x[i+ord+1]=_x[i+1];
       x[i+ord+2]=_x[i+2];
       x[i+ord+3]=_x[i+3];
       x[i+ord+4]=_x[i+4];
       x[i+ord+5]=_x[i+5];
       x[i+ord+6]=_x[i+6];
       x[i+ord+7]=_x[i+7];
    }
    for (;i<N-3;i+=4)
    {
       x[i+ord  ]=_x[i  ];
       x[i+ord+1]=_x[i+1];
       x[i+ord+2]=_x[i+2];
       x[i+ord+3]=_x[i+3];
    }
    for (;i<N;i++)
       x[i+ord]=_x[i];
    /* Save the last "ord" inputs (most recent first) for the next call. */
    for(i=0;i<ord;i++)
       mem[i] = _x[N-i-1];
#ifdef SMALL_FOOTPRINT
    /* Compact scalar path: accumulate in Q(SIG_SHIFT), then round down. */
    for (i=0;i<N;i++)
    {
       opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT);
       for (j=0;j<ord;j++)
       {
          sum = MAC16_16(sum,rnum[j],x[i+j]);
       }
       _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
    }
#else
    /* Rounding offset for the arithmetic shift: 0.5 in Q(SIG_SHIFT). */
    noA = EXTEND32(1) << SIG_SHIFT >> 1;
    vecNoA = _mm_set_epi32(noA, noA, noA, noA);

    /* Four outputs per iteration: correlate, round+shift, add the input
       sample, then saturate-pack to 16 bits. */
    for (i=0;i<N-3;i+=4)
    {
       opus_val32 sums[4] = {0};
       __m128i vecSum, vecX;

       xcorr_kernel(rnum, x+i, sums, ord, arch);

       vecSum = _mm_loadu_si128((__m128i *)sums);
       vecSum = _mm_add_epi32(vecSum, vecNoA);
       vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT);
       vecX = OP_CVTEPI16_EPI32_M64(_x + i);
       vecSum = _mm_add_epi32(vecSum, vecX);
       vecSum = _mm_packs_epi32(vecSum, vecSum);
       _mm_storel_epi64((__m128i *)(_y + i), vecSum);
    }
    /* Scalar tail for the last N%4 samples. */
    for (;i<N;i++)
    {
       opus_val32 sum = 0;
       for (j=0;j<ord;j++)
          sum = MAC16_16(sum, rnum[j], x[i + j]);
       _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT)));
    }
#endif
    RESTORE_STACK;
}
#endif
/* ===== File boundary: header declaring the SSE4.1 celt_fir() override ===== */
| /* Copyright (c) 2014, Cisco Systems, INC | ||
| Written by XiangMingZhu WeiZhou MinPeng YanWang | ||
| Redistribution and use in source and binary forms, with or without | ||
| modification, are permitted provided that the following conditions | ||
| are met: | ||
| - Redistributions of source code must retain the above copyright | ||
| notice, this list of conditions and the following disclaimer. | ||
| - Redistributions in binary form must reproduce the above copyright | ||
| notice, this list of conditions and the following disclaimer in the | ||
| documentation and/or other materials provided with the distribution. | ||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
| ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
| OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| */ | ||
#ifndef CELT_LPC_SSE_H
#define CELT_LPC_SSE_H

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

/* SSE4.1 override of celt_fir(); only used in fixed-point builds. */
#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
#define OVERRIDE_CELT_FIR

void celt_fir_sse4_1(
         const opus_val16 *x,
         const opus_val16 *num,
         opus_val16 *y,
         int N,
         int ord,
         opus_val16 *mem,
         int arch);

#if defined(OPUS_X86_PRESUME_SSE4_1)
/* SSE4.1 guaranteed at build time: call the intrinsic version directly
   (arch is evaluated for side effects only, to avoid unused warnings). */
#define celt_fir(x, num, y, N, ord, mem, arch) \
    ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, mem, arch))

#else
/* Runtime dispatch: index the per-arch implementation table with the
   CPU capabilities detected at init time. */
extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
         const opus_val16 *x,
         const opus_val16 *num,
         opus_val16 *y,
         int N,
         int ord,
         opus_val16 *mem,
         int arch);

#  define celt_fir(x, num, y, N, ord, mem, arch) \
    ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, mem, arch))

#endif
#endif

#endif
/* ===== File boundary: SSE float pitch/correlation kernels ===== */
| /* Copyright (c) 2014, Cisco Systems, INC | ||
| Written by XiangMingZhu WeiZhou MinPeng YanWang | ||
| Redistribution and use in source and binary forms, with or without | ||
| modification, are permitted provided that the following conditions | ||
| are met: | ||
| - Redistributions of source code must retain the above copyright | ||
| notice, this list of conditions and the following disclaimer. | ||
| - Redistributions in binary form must reproduce the above copyright | ||
| notice, this list of conditions and the following disclaimer in the | ||
| documentation and/or other materials provided with the distribution. | ||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
| ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
| OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| */ | ||
| #ifdef HAVE_CONFIG_H | ||
| #include "config.h" | ||
| #endif | ||
| #include "macros.h" | ||
| #include "celt_lpc.h" | ||
| #include "stack_alloc.h" | ||
| #include "mathops.h" | ||
| #include "pitch.h" | ||
| #if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) | ||
| #include <xmmintrin.h> | ||
| #include "arch.h" | ||
/* Four-lag cross-correlation kernel (float build):
   sum[k] += sum_{j<len} x[j]*y[j+k] for k = 0..3.
   NOTE(review): the vector paths read up to 3 floats past y+len (and use
   4-wide loads in the tail); callers are assumed to provide buffers padded
   accordingly -- confirm against the CELT pitch-search callers. */
void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
{
   int j;
   __m128 xsum1, xsum2;   /* two accumulators to shorten the dependency chain */
   xsum1 = _mm_loadu_ps(sum);
   xsum2 = _mm_setzero_ps();

   for (j = 0; j < len-3; j += 4)
   {
      __m128 x0 = _mm_loadu_ps(x+j);
      __m128 yj = _mm_loadu_ps(y+j);
      __m128 y3 = _mm_loadu_ps(y+j+3);

      /* Broadcast each x lane (imm 0x00/0x55/0xaa/0xff) and multiply by y
         shifted by 0..3; the yj/y3 shuffles synthesize the +1 and +2
         shifted windows without extra loads. */
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
                                          _mm_shuffle_ps(yj,y3,0x49)));
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
                                          _mm_shuffle_ps(yj,y3,0x9e)));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
   }
   /* Handle the up-to-three leftover samples, one broadcast-multiply each. */
   if (j < len)
   {
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
      if (++j < len)
      {
         xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
         if (++j < len)
         {
            xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
         }
      }
   }
   /* Merge the two partial accumulators and write the four lag sums back. */
   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
}
/* Computes two inner products sharing one input (float build):
   *xy1 = <x, y01> and *xy2 = <x, y02> over N samples.
   The scalar tail accumulates into *xy1/*xy2 AFTER the vector partial sums
   have been stored, so the final results include all N terms. */
void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
      int N, opus_val32 *xy1, opus_val32 *xy2)
{
   int i;
   __m128 xsum1, xsum2;
   xsum1 = _mm_setzero_ps();
   xsum2 = _mm_setzero_ps();
   /* Main loop: 4 samples per iteration with a single shared load of x. */
   for (i=0;i<N-3;i+=4)
   {
      __m128 xi = _mm_loadu_ps(x+i);
      __m128 y1i = _mm_loadu_ps(y01+i);
      __m128 y2i = _mm_loadu_ps(y02+i);
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
   }
   /* Horizontal sum */
   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
   _mm_store_ss(xy1, xsum1);
   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
   _mm_store_ss(xy2, xsum2);
   /* Scalar tail for the last N%4 samples. */
   for (;i<N;i++)
   {
      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
   }
}
| opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y, | ||
| int N) | ||
| { | ||
| int i; | ||
| float xy; | ||
| __m128 sum; | ||
| sum = _mm_setzero_ps(); | ||
| /* FIXME: We should probably go 8-way and use 2 sums. */ | ||
| for (i=0;i<N-3;i+=4) | ||
| { | ||
| __m128 xi = _mm_loadu_ps(x+i); | ||
| __m128 yi = _mm_loadu_ps(y+i); | ||
| sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi)); | ||
| } | ||
| /* Horizontal sum */ | ||
| sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); | ||
| sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55)); | ||
| _mm_store_ss(&xy, sum); | ||
| for (;i<N;i++) | ||
| { | ||
| xy = MAC16_16(xy, x[i], y[i]); | ||
| } | ||
| return xy; | ||
| } | ||
/* Constant-coefficient comb filter (float build):
   y[i] = x[i] + g10*x[i-T] + g11*(x[i-T+1]+x[i-T-1]) + g12*(x[i-T+2]+x[i-T-2]).
   Reads T+2 samples of history before x[0].
   NOTE(review): without CUSTOM_MODES the scalar tail is compiled out, i.e.
   N is assumed to be a multiple of 4 -- confirm against the callers. */
void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N,
      opus_val16 g10, opus_val16 g11, opus_val16 g12)
{
   int i;
   __m128 x0v;
   __m128 g10v, g11v, g12v;
   g10v = _mm_load1_ps(&g10);
   g11v = _mm_load1_ps(&g11);
   g12v = _mm_load1_ps(&g12);
   /* x0v holds x[i-T-2 .. i-T+1]; it is carried across iterations so only
      one new load (x4v) is needed per loop. */
   x0v = _mm_loadu_ps(&x[-T-2]);
   for (i=0;i<N-3;i+=4)
   {
      __m128 yi, yi2, x1v, x2v, x3v, x4v;
      const opus_val32 *xp = &x[i-T-2];
      yi = _mm_loadu_ps(x+i);
      x4v = _mm_loadu_ps(xp+4);
#if 0
      /* Slower version with all loads */
      x1v = _mm_loadu_ps(xp+1);
      x2v = _mm_loadu_ps(xp+2);
      x3v = _mm_loadu_ps(xp+3);
#else
      /* Build the +1/+2/+3 shifted windows from x0v/x4v with shuffles
         instead of three extra unaligned loads. */
      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
#endif

      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
#else
      /* Use partial sums */
      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
      yi = _mm_add_ps(yi, yi2);
#endif
      x0v=x4v;
      _mm_storeu_ps(y+i, yi);
   }
#ifdef CUSTOM_MODES
   /* Scalar tail, only needed for custom modes where N%4 may be nonzero. */
   for (;i<N;i++)
   {
      y[i] = x[i]
             + MULT16_32_Q15(g10,x[i-T])
             + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
             + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
   }
#endif
}
| #endif |
/* ===== File boundary: SSE2 fixed-point inner product ===== */
| /* Copyright (c) 2014, Cisco Systems, INC | ||
| Written by XiangMingZhu WeiZhou MinPeng YanWang | ||
| Redistribution and use in source and binary forms, with or without | ||
| modification, are permitted provided that the following conditions | ||
| are met: | ||
| - Redistributions of source code must retain the above copyright | ||
| notice, this list of conditions and the following disclaimer. | ||
| - Redistributions in binary form must reproduce the above copyright | ||
| notice, this list of conditions and the following disclaimer in the | ||
| documentation and/or other materials provided with the distribution. | ||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
| ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
| OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| */ | ||
| #ifdef HAVE_CONFIG_H | ||
| #include "config.h" | ||
| #endif | ||
| #include <xmmintrin.h> | ||
| #include <emmintrin.h> | ||
| #include "macros.h" | ||
| #include "celt_lpc.h" | ||
| #include "stack_alloc.h" | ||
| #include "mathops.h" | ||
| #include "pitch.h" | ||
#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT)
/* SSE2 fixed-point inner product: returns sum of x[i]*y[i] on 16-bit
   inputs with the same 32-bit wrap-around accumulation as the scalar
   silk_SMLABB() loop. */
opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y,
      int N)
{
    opus_int i, dataSize16;
    opus_int32 sum;
    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;

    sum = 0;
    dataSize16 = N & ~15;   /* largest multiple of 16 <= N */

    acc1 = _mm_setzero_si128();
    acc2 = _mm_setzero_si128();

    /* Main loop: 16 samples per iteration; madd performs pairwise
       16x16->32 multiply-adds, leaving four 32-bit partial sums per
       accumulator.  Two accumulators shorten the dependency chain. */
    for (i=0;i<dataSize16;i+=16)
    {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);

        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
    }
    acc1 = _mm_add_epi32( acc1, acc2 );

    /* One extra 8-sample chunk, if available. */
    if (N - i >= 8)
    {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);

        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        i += 8;
    }

    /* Horizontal sum: fold the high qword into the low, then fold 32-bit
       lane 1 into lane 0 (shufflelo imm 0x0E moves lane 1 down). */
    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( acc1, acc1));
    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E));
    sum += _mm_cvtsi128_si32(acc1);

    /* Scalar tail for the last N%8 samples. */
    for (;i<N;i++) {
        sum = silk_SMLABB(sum, x[i], y[i]);
    }

    return sum;
}
#endif
/* ===== File boundary: SSE4.1 fixed-point inner product and xcorr kernel ===== */
| /* Copyright (c) 2014, Cisco Systems, INC | ||
| Written by XiangMingZhu WeiZhou MinPeng YanWang | ||
| Redistribution and use in source and binary forms, with or without | ||
| modification, are permitted provided that the following conditions | ||
| are met: | ||
| - Redistributions of source code must retain the above copyright | ||
| notice, this list of conditions and the following disclaimer. | ||
| - Redistributions in binary form must reproduce the above copyright | ||
| notice, this list of conditions and the following disclaimer in the | ||
| documentation and/or other materials provided with the distribution. | ||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
| ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
| OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| */ | ||
| #ifdef HAVE_CONFIG_H | ||
| #include "config.h" | ||
| #endif | ||
| #include <xmmintrin.h> | ||
| #include <emmintrin.h> | ||
| #include "macros.h" | ||
| #include "celt_lpc.h" | ||
| #include "stack_alloc.h" | ||
| #include "mathops.h" | ||
| #include "pitch.h" | ||
| #if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) | ||
| #include <smmintrin.h> | ||
| #include "x86cpu.h" | ||
/* SSE4.1 fixed-point inner product: returns sum of x[i]*y[i] with the same
   32-bit wrap-around accumulation as the scalar silk_SMLABB() loop.
   Like the SSE2 version, plus a 4-sample chunk using SSE4.1's mullo. */
opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
      int N)
{
    opus_int i, dataSize16;
    opus_int32 sum;
    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
    __m128i inVec1_3210, inVec2_3210;

    sum = 0;
    dataSize16 = N & ~15;   /* largest multiple of 16 <= N */

    acc1 = _mm_setzero_si128();
    acc2 = _mm_setzero_si128();

    /* Main loop: 16 samples per iteration; madd does pairwise 16x16->32
       multiply-adds, four 32-bit partial sums per accumulator. */
    for (i=0;i<dataSize16;i+=16) {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);

        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
    }
    acc1 = _mm_add_epi32(acc1, acc2);

    /* One extra 8-sample chunk, if available. */
    if (N - i >= 8)
    {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);

        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        i += 8;
    }

    /* One extra 4-sample chunk: sign-extend 16->32 then 32-bit mullo. */
    if (N - i >= 4)
    {
        inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
        inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);

        inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);

        acc1 = _mm_add_epi32(acc1, inVec1_3210);
        i += 4;
    }

    /* Horizontal sum: fold the high qword into the low, then fold 32-bit
       lane 1 into lane 0 (shufflelo imm 0x0E). */
    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
    sum += _mm_cvtsi128_si32(acc1);

    /* Scalar tail for the last N%4 samples. */
    for (;i<N;i++)
    {
        sum = silk_SMLABB(sum, x[i], y[i]);
    }

    return sum;
}
/* SSE4.1 fixed-point four-lag cross-correlation kernel:
   sum[k] += sum_{j<len} x[j]*y[j+k] for k = 0..3.
   NOTE(review): the 8-wide loads read a few shorts past the current index,
   so y must be readable slightly past len -- assumed padded by callers. */
void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len)
{
    int j;

    __m128i vecX, vecX0, vecX1, vecX2, vecX3;
    __m128i vecY0, vecY1, vecY2, vecY3;
    __m128i sum0, sum1, sum2, sum3, vecSum;
    __m128i initSum;

    celt_assert(len >= 3);

    sum0 = _mm_setzero_si128();
    sum1 = _mm_setzero_si128();
    sum2 = _mm_setzero_si128();
    sum3 = _mm_setzero_si128();

    /* Main loop: 8 samples per iteration; one x vector against four shifted
       y windows, using madd for pairwise 16x16->32 multiply-adds. */
    for (j=0;j<(len-7);j+=8)
    {
        vecX = _mm_loadu_si128((__m128i *)(&x[j + 0]));
        vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0]));
        vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1]));
        vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2]));
        vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3]));

        sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
    }

    /* Horizontal-add each accumulator to a single lane (fold high qword,
       then lane 1 via shufflelo 0x0E), then gather the four scalars into
       vecSum = {sum0, sum1, sum2, sum3}. */
    sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
    sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));

    sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
    sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));

    sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
    sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));

    sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
    sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));

    vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
          _mm_unpacklo_epi32(sum2, sum3));

    /* 4 samples per iteration: sign-extend to 32 bits, broadcast each x
       lane and multiply by the corresponding shifted y window. */
    for (;j<(len-3);j+=4)
    {
        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
        vecX1 = _mm_shuffle_epi32(vecX, 0x55);
        vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
        vecX3 = _mm_shuffle_epi32(vecX, 0xff);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
        vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);
        sum1 = _mm_mullo_epi32(vecX1, vecY1);
        sum2 = _mm_mullo_epi32(vecX2, vecY2);
        sum3 = _mm_mullo_epi32(vecX3, vecY3);

        sum0 = _mm_add_epi32(sum0, sum1);
        sum2 = _mm_add_epi32(sum2, sum3);
        vecSum = _mm_add_epi32(vecSum, sum0);
        vecSum = _mm_add_epi32(vecSum, sum2);
    }

    /* Final 1-3 samples: x[j] broadcast times y[j..j+3] feeds all 4 lags. */
    for (;j<len;j++)
    {
        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
        vecX0 = _mm_shuffle_epi32(vecX, 0x00);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);
        vecSum = _mm_add_epi32(vecSum, sum0);
    }

    /* Add the caller-provided initial sums back in and store. */
    initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
    initSum = _mm_add_epi32(initSum, vecSum);
    _mm_storeu_si128((__m128i *)sum, initSum);
}
| #endif |
/* ===== File boundary: header declaring the SSE2 op_pvq_search() override ===== */
| /* Copyright (c) 2016 Jean-Marc Valin */ | ||
| /* | ||
| Redistribution and use in source and binary forms, with or without | ||
| modification, are permitted provided that the following conditions | ||
| are met: | ||
| - Redistributions of source code must retain the above copyright | ||
| notice, this list of conditions and the following disclaimer. | ||
| - Redistributions in binary form must reproduce the above copyright | ||
| notice, this list of conditions and the following disclaimer in the | ||
| documentation and/or other materials provided with the distribution. | ||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
| ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
| OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| */ | ||
#ifndef VQ_SSE_H
#define VQ_SSE_H

/* SSE2 override of op_pvq_search(); only used in float builds. */
#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)
#define OVERRIDE_OP_PVQ_SEARCH

opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch);

#if defined(OPUS_X86_PRESUME_SSE2)
/* SSE2 guaranteed at build time: call the intrinsic version directly. */
#define op_pvq_search(x, iy, K, N, arch) \
    (op_pvq_search_sse2(x, iy, K, N, arch))

#else
/* Runtime dispatch through the per-arch implementation table. */
extern opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])(
      celt_norm *_X, int *iy, int K, int N, int arch);

#  define op_pvq_search(X, iy, K, N, arch) \
    ((*OP_PVQ_SEARCH_IMPL[(arch) & OPUS_ARCHMASK])(X, iy, K, N, arch))

#endif
#endif

#endif
/* ===== File boundary: SSE2 float PVQ search ===== */
| /* Copyright (c) 2007-2008 CSIRO | ||
| Copyright (c) 2007-2009 Xiph.Org Foundation | ||
| Copyright (c) 2007-2016 Jean-Marc Valin */ | ||
| /* | ||
| Redistribution and use in source and binary forms, with or without | ||
| modification, are permitted provided that the following conditions | ||
| are met: | ||
| - Redistributions of source code must retain the above copyright | ||
| notice, this list of conditions and the following disclaimer. | ||
| - Redistributions in binary form must reproduce the above copyright | ||
| notice, this list of conditions and the following disclaimer in the | ||
| documentation and/or other materials provided with the distribution. | ||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
| ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
| OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| */ | ||
| #ifdef HAVE_CONFIG_H | ||
| #include "config.h" | ||
| #endif | ||
| #include <xmmintrin.h> | ||
| #include <emmintrin.h> | ||
| #include "celt_lpc.h" | ||
| #include "stack_alloc.h" | ||
| #include "mathops.h" | ||
| #include "vq.h" | ||
| #include "x86cpu.h" | ||
#ifndef FIXED_POINT
/* SSE2 PVQ search (float build): finds the vector iy of signed integer
   pulses with sum(|iy[j]|) == K that (approximately) maximizes
   (X.y)^2 / (y.y).  Returns yy = y.y for the chosen codeword; iy receives
   the signed pulse counts.  _X itself is not modified (a working copy is
   made); "arch" is accepted for interface compatibility but unused. */
opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
{
   int i, j;
   int pulsesLeft;
   float xy, yy;              /* running correlation X.y and energy y.y */
   VARDECL(celt_norm, y);     /* current codeword, stored as 2*y[j] */
   VARDECL(celt_norm, X);     /* |X|, padded to a multiple of 4 */
   VARDECL(float, signy);     /* all-ones bit mask where _X was negative */
   __m128 signmask;
   __m128 sums;
   __m128i fours;
   SAVE_STACK;

   (void)arch;
   /* All bits set to zero, except for the sign bit. */
   signmask = _mm_set_ps1(-0.f);
   fours = _mm_set_epi32(4, 4, 4, 4);
   ALLOC(y, N+3, celt_norm);
   ALLOC(X, N+3, celt_norm);
   ALLOC(signy, N+3, float);

   OPUS_COPY(X, _X, N);
   X[N] = X[N+1] = X[N+2] = 0;
   sums = _mm_setzero_ps();
   /* Strip the signs off X (remembering them in signy), accumulate
      sum(|X|), and clear y/iy for the search. */
   for (j=0;j<N;j+=4)
   {
      __m128 x4, s4;
      x4 = _mm_loadu_ps(&X[j]);
      s4 = _mm_cmplt_ps(x4, _mm_setzero_ps());
      /* Get rid of the sign */
      x4 = _mm_andnot_ps(signmask, x4);
      sums = _mm_add_ps(sums, x4);
      /* Clear y and iy in case we don't do the projection. */
      _mm_storeu_ps(&y[j], _mm_setzero_ps());
      _mm_storeu_si128((__m128i*)&iy[j], _mm_setzero_si128());
      _mm_storeu_ps(&X[j], x4);
      _mm_storeu_ps(&signy[j], s4);
   }
   /* Broadcast the horizontal sum of |X| to all four lanes. */
   sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)));
   sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(2, 3, 0, 1)));

   xy = yy = 0;

   pulsesLeft = K;

   /* Do a pre-search by projecting on the pyramid */
   if (K > (N>>1))
   {
      __m128i pulses_sum;
      __m128 yy4, xy4;
      __m128 rcp4;
      opus_val32 sum = _mm_cvtss_f32(sums);
      /* If X is too small, just replace it with a pulse at 0 */
      /* Prevents infinities and NaNs from causing too many pulses
         to be allocated. 64 is an approximation of infinity here. */
      if (!(sum > EPSILON && sum < 64))
      {
         X[0] = QCONST16(1.f,14);
         j=1; do
            X[j]=0;
         while (++j<N);
         sums = _mm_set_ps1(1.f);
      }
      /* Using K+e with e < 1 guarantees we cannot get more than K pulses. */
      rcp4 = _mm_mul_ps(_mm_set_ps1((float)(K+.8)), _mm_rcp_ps(sums));
      xy4 = yy4 = _mm_setzero_ps();
      pulses_sum = _mm_setzero_si128();
      for (j=0;j<N;j+=4)
      {
         __m128 rx4, x4, y4;
         __m128i iy4;
         x4 = _mm_loadu_ps(&X[j]);
         rx4 = _mm_mul_ps(x4, rcp4);
         iy4 = _mm_cvttps_epi32(rx4);   /* truncate toward zero */
         pulses_sum = _mm_add_epi32(pulses_sum, iy4);
         _mm_storeu_si128((__m128i*)&iy[j], iy4);
         y4 = _mm_cvtepi32_ps(iy4);
         xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
         yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
         /* double the y[] vector so we don't have to do it in the search loop. */
         _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
      }
      /* Horizontal sums of the projected pulse count, X.y and y.y. */
      pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(1, 0, 3, 2)));
      pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(2, 3, 0, 1)));
      pulsesLeft -= _mm_cvtsi128_si32(pulses_sum);
      xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(1, 0, 3, 2)));
      xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(2, 3, 0, 1)));
      xy = _mm_cvtss_f32(xy4);
      yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(1, 0, 3, 2)));
      yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(2, 3, 0, 1)));
      yy = _mm_cvtss_f32(yy4);
   }
   /* Pad so the vectorized greedy loop can safely run past N: the X padding
      (-100) keeps the padding lanes from ever winning the max, the y padding
      (100) keeps the rsqrt argument positive. */
   X[N] = X[N+1] = X[N+2] = -100;
   y[N] = y[N+1] = y[N+2] = 100;
   celt_assert2(pulsesLeft>=0, "Allocated too many pulses in the quick pass");

   /* This should never happen, but just in case it does (e.g. on silence)
      we fill the first bin with pulses. */
   if (pulsesLeft > N+3)
   {
      opus_val16 tmp = (opus_val16)pulsesLeft;
      yy = MAC16_16(yy, tmp, tmp);
      yy = MAC16_16(yy, tmp, y[0]);
      iy[0] += pulsesLeft;
      pulsesLeft=0;
   }

   /* Greedy loop: place the remaining pulses one at a time at the position
      maximizing (xy+X[j]) * rsqrt(yy+y[j]), evaluated four lanes at once
      with the (approximate) _mm_rsqrt_ps. */
   for (i=0;i<pulsesLeft;i++)
   {
      int best_id;
      __m128 xy4, yy4;
      __m128 max, max2;
      __m128i count;
      __m128i pos;
      best_id = 0;
      /* The squared magnitude term gets added anyway, so we might as well
         add it outside the loop */
      yy = ADD16(yy, 1);
      xy4 = _mm_load1_ps(&xy);
      yy4 = _mm_load1_ps(&yy);
      max = _mm_setzero_ps();
      pos = _mm_setzero_si128();
      count = _mm_set_epi32(3, 2, 1, 0);
      for (j=0;j<N;j+=4)
      {
         __m128 x4, y4, r4;
         x4 = _mm_loadu_ps(&X[j]);
         y4 = _mm_loadu_ps(&y[j]);
         x4 = _mm_add_ps(x4, xy4);
         y4 = _mm_add_ps(y4, yy4);
         y4 = _mm_rsqrt_ps(y4);
         r4 = _mm_mul_ps(x4, y4);
         /* Update the index of the max. */
         pos = _mm_max_epi16(pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max))));
         /* Update the max. */
         max = _mm_max_ps(max, r4);
         /* Update the indices (+4) */
         count = _mm_add_epi32(count, fours);
      }
      /* Horizontal max */
      max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2)));
      max2 = _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1)));
      /* Now that max2 contains the max at all positions, look at which value(s) of the
         partial max is equal to the global max. */
      pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2)));
      /* NOTE(review): 16-bit signed max on 32-bit indices works only while
         positions fit in a signed 16-bit value -- fine for CELT band sizes,
         confirm if reused elsewhere. */
      pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos));
      pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2)));
      best_id = _mm_cvtsi128_si32(pos);

      /* Updating the sums of the new pulse(s) */
      xy = ADD32(xy, EXTEND32(X[best_id]));
      /* We're multiplying y[j] by two so we don't have to do it here */
      yy = ADD16(yy, y[best_id]);

      /* Only now that we've made the final choice, update y/iy */
      /* Multiplying y[j] by 2 so we don't have to do it everywhere else */
      y[best_id] += 2;
      iy[best_id]++;
   }

   /* Put the original sign back: with s = all-ones where _X was negative,
      (iy + s) ^ s negates iy (two's-complement identity). */
   for (j=0;j<N;j+=4)
   {
      __m128i y4;
      __m128i s4;
      y4 = _mm_loadu_si128((__m128i*)&iy[j]);
      s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j]));
      y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
      _mm_storeu_si128((__m128i*)&iy[j], y4);
   }
   RESTORE_STACK;
   return yy;
}
#endif