@@ -37,16 +37,32 @@
#define MULT16_16SU(a,b) ((opus_val32)(opus_val16)(a)*(opus_val32)(opus_uint16)(b))
/** 16x32 multiplication, followed by a 16-bit shift right. Result fits in 32 bits */
#if OPUS_FAST_INT64
#define MULT16_32_Q16(a,b) ((opus_val32)SHR((opus_int64)((opus_val16)(a))*(b),16))
#else
#define MULT16_32_Q16(a,b) ADD32(MULT16_16((a),SHR((b),16)), SHR(MULT16_16SU((a),((b)&0x0000ffff)),16))
#endif
/** 16x32 multiplication, followed by a 16-bit shift right (round-to-nearest). Result fits in 32 bits */
#if OPUS_FAST_INT64
#define MULT16_32_P16(a,b) ((opus_val32)PSHR((opus_int64)((opus_val16)(a))*(b),16))
#else
#define MULT16_32_P16(a,b) ADD32(MULT16_16((a),SHR((b),16)), PSHR(MULT16_16SU((a),((b)&0x0000ffff)),16))
#endif
/** 16x32 multiplication, followed by a 15-bit shift right. Result fits in 32 bits */
#if OPUS_FAST_INT64
#define MULT16_32_Q15(a,b) ((opus_val32)SHR((opus_int64)((opus_val16)(a))*(b),15))
#else
#define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15))
#endif
/** 32x32 multiplication, followed by a 31-bit shift right. Result fits in 32 bits */
#if OPUS_FAST_INT64
#define MULT32_32_Q31(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),31))
#else
#define MULT32_32_Q31(a,b) ADD32(ADD32(SHL(MULT16_16(SHR((a),16),SHR((b),16)),1), SHR(MULT16_16SU(SHR((a),16),((b)&0x0000ffff)),15)), SHR(MULT16_16SU(SHR((b),16),((a)&0x0000ffff)),15))
#endif
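A minimal standalone sketch (plain C, independent of these headers) of why the fallback decompositions above are exact: b splits into a signed high half and an unsigned low half, and since the high-half partial product is an exact multiple of 2^15 after its shift, the two partial shifts sum to the 64-bit reference. Assumes arithmetic right shift of negative values, as the Opus SHR macros do.

#include <stdint.h>
#include <stdio.h>

/* 64-bit reference for MULT16_32_Q15. */
static int32_t q15_ref(int16_t a, int32_t b)
{
    return (int32_t)(((int64_t)a * b) >> 15);
}

/* Split form mirroring the fallback macro: b == (hi<<16) + lo, with the
   low half treated as unsigned (that is what MULT16_16SU provides). */
static int32_t q15_split(int16_t a, int32_t b)
{
    int32_t  hi = b >> 16;                 /* arithmetic shift */
    uint16_t lo = (uint16_t)(b & 0xffff);
    return 2*((int32_t)a*hi) + (((int32_t)a*lo) >> 15);
}

int main(void)
{
    static const int16_t av[] = { 1, -1, 12345, -20000, 32767, -32768 };
    static const int32_t bv[] = { 0, 98765, -98765, 0x12345678, -0x12345678, 0xffff };
    for (unsigned i = 0; i < sizeof(av)/sizeof(*av); i++)
        for (unsigned j = 0; j < sizeof(bv)/sizeof(*bv); j++)
            if (q15_ref(av[i], bv[j]) != q15_split(av[i], bv[j]))
                printf("mismatch: a=%d b=%ld\n", av[i], (long)bv[j]);
    puts("checked");
    return 0;
}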
/** Compile-time conversion of float constant to 16-bit value */
#define QCONST16(x,bits) ((opus_val16)(.5+(x)*(((opus_val32)1)<<(bits))))
@@ -88,6 +104,9 @@
/** Shift by a and round-to-nearest 32-bit value. Result is a 16-bit value */
#define ROUND16(x,a) (EXTRACT16(PSHR32((x),(a))))
/** Shift by a and round-to-nearest 32-bit value. Result is a saturated 16-bit value */
#define SROUND16(x,a) EXTRACT16(SATURATE(PSHR32(x,a), 32767))
/** Divide by two */
#define HALF16(x) (SHR16(x,1))
#define HALF32(x) (SHR32(x,1))
@@ -101,6 +120,12 @@
/** Subtract two 32-bit values */
#define SUB32(a,b) ((opus_val32)(a)-(opus_val32)(b))
/** Add two 32-bit values, ignore any overflows */
#define ADD32_ovflw(a,b) ((opus_val32)((opus_uint32)(a)+(opus_uint32)(b)))
/** Subtract two 32-bit values, ignore any overflows */
#define SUB32_ovflw(a,b) ((opus_val32)((opus_uint32)(a)-(opus_uint32)(b)))
#define NEG32_ovflw(a) ((opus_val32)(-(opus_uint32)(a)))
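These _ovflw variants exist because signed overflow is undefined behaviour in C: the arithmetic is done in uint32_t, where wraparound modulo 2^32 is fully defined, and the conversion back to a signed type is merely implementation-defined (two's-complement wraparound on the platforms Opus targets). A tiny sketch:

#include <stdint.h>
#include <stdio.h>

/* Mirrors ADD32_ovflw: wraparound add with no undefined behaviour. */
static int32_t add32_ovflw(int32_t a, int32_t b)
{
    return (int32_t)((uint32_t)a + (uint32_t)b);
}

int main(void)
{
    /* INT32_MAX + 1 wraps to INT32_MIN instead of invoking UB. */
    printf("%ld\n", (long)add32_ovflw(INT32_MAX, 1));
    return 0;
}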
/** 16x16 multiplication where the result fits in 16 bits */
#define MULT16_16_16(a,b) ((((opus_val16)(a))*((opus_val16)(b))))
@@ -113,7 +138,11 @@
/** 16x32 multiply, followed by a 15-bit shift right and 32-bit add.
b must fit in 31 bits.
Result fits in 32 bits. */
#define MAC16_32_Q15(c,a,b) ADD32(c,ADD32(MULT16_16((a),SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15)))
#define MAC16_32_Q15(c,a,b) ADD32((c),ADD32(MULT16_16((a),SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15)))
/** 16x32 multiplication, followed by a 16-bit shift right and 32-bit add.
Result fits in 32 bits */
#define MAC16_32_Q16(c,a,b) ADD32((c),ADD32(MULT16_16((a),SHR((b),16)), SHR(MULT16_16SU((a),((b)&0x0000ffff)),16)))
#define MULT16_16_Q11_32(a,b) (SHR(MULT16_16((a),(b)),11))
#define MULT16_16_Q11(a,b) (SHR(MULT16_16((a),(b)),11))
@@ -131,4 +160,17 @@
/** Divide a 32-bit value by a 32-bit value. Result fits in 32 bits */
#define DIV32(a,b) (((opus_val32)(a))/((opus_val32)(b)))
#if defined(MIPSr1_ASM)
#include "mips/fixed_generic_mipsr1.h"
#endif
static OPUS_INLINE opus_val16 SIG2WORD16_generic(celt_sig x)
{
x = PSHR32(x, SIG_SHIFT);
x = MAX32(x, -32768);
x = MIN32(x, 32767);
return EXTRACT16(x);
}
#define SIG2WORD16(x) (SIG2WORD16_generic(x))
#endif
@@ -61,7 +61,13 @@
** the config.h file.
*/
#if (HAVE_LRINTF)
/* With GCC, when SSE is available, the fastest conversion is cvtss2si. */
#if defined(__GNUC__) && defined(__SSE__)
#include <xmmintrin.h>
static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_ss(x));}
#elif defined(HAVE_LRINTF)
/* These defines enable functionality introduced with the 1999 ISO C
** standard. They must be defined before the inclusion of math.h to
@@ -90,14 +96,14 @@
#include <math.h>
#define float2int(x) lrint(x)
#elif (defined(_MSC_VER) && _MSC_VER >= 1400) && (defined (WIN64) || defined (_WIN64))
#elif (defined(_MSC_VER) && _MSC_VER >= 1400) && defined (_M_X64)
#include <xmmintrin.h>
__inline long int float2int(float value)
{
return _mm_cvtss_si32(_mm_load_ss(&value));
}
#elif (defined(_MSC_VER) && _MSC_VER >= 1400) && (defined (WIN32) || defined (_WIN32))
#elif (defined(_MSC_VER) && _MSC_VER >= 1400) && defined (_M_IX86)
#include <math.h>
/* Win32 doesn't seem to have these functions.
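All of these float2int paths round to nearest: cvtss2si and lrint/lrintf both follow the current FP rounding mode, which defaults to round-to-nearest-even, whereas a plain cast truncates toward zero. A small standalone sketch of the difference, assuming the default rounding mode (link with -lm):

#include <math.h>
#include <stdio.h>

int main(void)
{
    const float v[] = { 1.5f, 2.5f, -1.5f, 0.4999f };
    for (int i = 0; i < 4; i++)
        printf("%8.4f  cast=%2d  lrintf=%2ld\n", v[i], (int)v[i], lrintf(v[i]));
    /* Expected: casts give 1, 2, -1, 0; lrintf gives 2, 2, -2, 0
       (ties go to the even integer). */
    return 0;
}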
@@ -32,6 +32,7 @@
#include <stdlib.h>
#include <math.h>
#include "arch.h"
#include "cpu_support.h"
#ifdef __cplusplus
extern "C" {
@@ -77,17 +78,28 @@ typedef struct {
4*4*4*2
*/
typedef struct arch_fft_state{
int is_supported;
void *priv;
} arch_fft_state;
typedef struct kiss_fft_state{
int nfft;
#ifndef FIXED_POINT
kiss_fft_scalar scale;
opus_val16 scale;
#ifdef FIXED_POINT
int scale_shift;
#endif
int shift;
opus_int16 factors[2*MAXFACTORS];
const opus_int16 *bitrev;
const kiss_twiddle_cpx *twiddles;
arch_fft_state *arch_fft;
} kiss_fft_state;
#if defined(HAVE_ARM_NE10)
#include "arm/fft_arm.h"
#endif
/*typedef struct kiss_fft_state* kiss_fft_cfg;*/
/**
@@ -113,9 +125,9 @@ typedef struct kiss_fft_state{
* buffer size in *lenmem.
* */
kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem, const kiss_fft_state *base);
kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem, const kiss_fft_state *base, int arch);
kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem);
kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem, int arch);
/**
* opus_fft(cfg,in_out_buf)
@@ -127,10 +139,59 @@ kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem);
* Note that each element is complex and can be accessed like
f[k].r and f[k].i
* */
void opus_fft(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);
void opus_ifft(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);
void opus_fft_c(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);
void opus_ifft_c(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);
void opus_fft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout);
void opus_ifft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout);
void opus_fft_free(const kiss_fft_state *cfg, int arch);
void opus_fft_free_arch_c(kiss_fft_state *st);
int opus_fft_alloc_arch_c(kiss_fft_state *st);
#if !defined(OVERRIDE_OPUS_FFT)
/* Is run-time CPU detection enabled on this platform? */
#if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10))
extern int (*const OPUS_FFT_ALLOC_ARCH_IMPL[OPUS_ARCHMASK+1])(
kiss_fft_state *st);
#define opus_fft_alloc_arch(_st, arch) \
((*OPUS_FFT_ALLOC_ARCH_IMPL[(arch)&OPUS_ARCHMASK])(_st))
extern void (*const OPUS_FFT_FREE_ARCH_IMPL[OPUS_ARCHMASK+1])(
kiss_fft_state *st);
#define opus_fft_free_arch(_st, arch) \
((*OPUS_FFT_FREE_ARCH_IMPL[(arch)&OPUS_ARCHMASK])(_st))
extern void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
const kiss_fft_cpx *fin, kiss_fft_cpx *fout);
#define opus_fft(_cfg, _fin, _fout, arch) \
((*OPUS_FFT[(arch)&OPUS_ARCHMASK])(_cfg, _fin, _fout))
extern void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
const kiss_fft_cpx *fin, kiss_fft_cpx *fout);
#define opus_ifft(_cfg, _fin, _fout, arch) \
((*OPUS_IFFT[(arch)&OPUS_ARCHMASK])(_cfg, _fin, _fout))
#else /* else for if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10)) */
#define opus_fft_alloc_arch(_st, arch) \
((void)(arch), opus_fft_alloc_arch_c(_st))
#define opus_fft_free_arch(_st, arch) \
((void)(arch), opus_fft_free_arch_c(_st))
#define opus_fft(_cfg, _fin, _fout, arch) \
((void)(arch), opus_fft_c(_cfg, _fin, _fout))
#define opus_ifft(_cfg, _fin, _fout, arch) \
((void)(arch), opus_ifft_c(_cfg, _fin, _fout))
void opus_fft_free(const kiss_fft_state *cfg);
#endif /* end if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10)) */
#endif /* end if !defined(OVERRIDE_OPUS_FFT) */
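A sketch of how a caller threads the new arch flag through this API, assuming the signatures shown above (the opus_fft() macro either indexes the RTCD pointer table or collapses to opus_fft_c with a (void)(arch) cast); fft128 is a hypothetical helper, not part of the diff:

#include "kiss_fft.h"
#include "cpu_support.h"

/* Hypothetical: forward 128-point complex FFT of buf into out. */
static int fft128(const kiss_fft_cpx *buf, kiss_fft_cpx *out)
{
    int arch = opus_select_arch();
    kiss_fft_state *st = opus_fft_alloc(128, NULL, NULL, arch);
    if (st == NULL)
        return -1;
    opus_fft(st, buf, out, arch);   /* C path or NE10 path, per arch */
    opus_fft_free(st, arch);
    return 0;
}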
#ifdef __cplusplus
}
@@ -164,7 +164,7 @@ opus_val16 celt_cos_norm(opus_val32 x)
{
return _celt_cos_pi_2(EXTRACT16(x));
} else {
return NEG32(_celt_cos_pi_2(EXTRACT16(65536-x)));
return NEG16(_celt_cos_pi_2(EXTRACT16(65536-x)));
}
} else {
if (x&0x0000ffff)
@@ -38,11 +38,44 @@
#include "entcode.h"
#include "os_support.h"
#define PI 3.141592653f
/* Multiplies two 16-bit fractional values. Bit-exactness of this macro is important */
#define FRAC_MUL16(a,b) ((16384+((opus_int32)(opus_int16)(a)*(opus_int16)(b)))>>15)
unsigned isqrt32(opus_uint32 _val);
/* CELT doesn't need it for fixed-point, but analysis.c does. */
#if !defined(FIXED_POINT) || defined(ANALYSIS_C)
#define cA 0.43157974f
#define cB 0.67848403f
#define cC 0.08595542f
#define cE ((float)PI/2)
static OPUS_INLINE float fast_atan2f(float y, float x) {
float x2, y2;
x2 = x*x;
y2 = y*y;
/* For very small values, we don't care about the answer, so
we can just return 0. */
if (x2 + y2 < 1e-18f)
{
return 0;
}
if(x2<y2){
float den = (y2 + cB*x2) * (y2 + cC*x2);
return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
}else{
float den = (x2 + cB*y2) * (x2 + cC*y2);
return x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
}
}
#undef cA
#undef cB
#undef cC
#undef cE
#endif
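A standalone accuracy check for the approximation above (it duplicates the constants rather than including this header; link with -lm). The printed maximum error over this grid is small, on the order of a few 1e-3 rad:

#include <math.h>
#include <stdio.h>

/* Local copy of fast_atan2f() above, for side-by-side testing. */
static float fast_atan2f_ref(float y, float x)
{
    const float cA = 0.43157974f, cB = 0.67848403f, cC = 0.08595542f;
    const float cE = 3.141592653f/2;
    float x2 = x*x, y2 = y*y;
    if (x2 + y2 < 1e-18f)
        return 0;
    if (x2 < y2) {
        float den = (y2 + cB*x2) * (y2 + cC*x2);
        return -x*y*(y2 + cA*x2) / den + (y < 0 ? -cE : cE);
    } else {
        float den = (x2 + cB*y2) * (x2 + cC*y2);
        return x*y*(x2 + cA*y2) / den + (y < 0 ? -cE : cE) - (x*y < 0 ? -cE : cE);
    }
}

int main(void)
{
    float worst = 0;
    for (int i = -50; i <= 50; i++)
        for (int j = -50; j <= 50; j++) {
            float x = i/10.f, y = j/10.f;
            if (x == 0.f && y == 0.f)
                continue;
            float e = fabsf(fast_atan2f_ref(y, x) - atan2f(y, x));
            if (e > worst) worst = e;
        }
    printf("max abs error: %g rad\n", worst);
    return 0;
}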
#ifndef OVERRIDE_CELT_MAXABS16
static OPUS_INLINE opus_val32 celt_maxabs16(const opus_val16 *x, int len)
{
@@ -80,7 +113,6 @@ static OPUS_INLINE opus_val32 celt_maxabs32(const opus_val32 *x, int len)
#ifndef FIXED_POINT
#define PI 3.141592653f
#define celt_sqrt(x) ((float)sqrt(x))
#define celt_rsqrt(x) (1.f/celt_sqrt(x))
#define celt_rsqrt_norm(x) (celt_rsqrt(x))
@@ -53,76 +53,100 @@
#include "mathops.h"
#include "stack_alloc.h"
#if defined(MIPSr1_ASM)
#include "mips/mdct_mipsr1.h"
#endif
#ifdef CUSTOM_MODES
int clt_mdct_init(mdct_lookup *l,int N, int maxshift)
int clt_mdct_init(mdct_lookup *l,int N, int maxshift, int arch)
{
int i;
int N4;
kiss_twiddle_scalar *trig;
#if defined(FIXED_POINT)
int shift;
int N2=N>>1;
#endif
l->n = N;
N4 = N>>2;
l->maxshift = maxshift;
for (i=0;i<=maxshift;i++)
{
if (i==0)
l->kfft[i] = opus_fft_alloc(N>>2>>i, 0, 0);
l->kfft[i] = opus_fft_alloc(N>>2>>i, 0, 0, arch);
else
l->kfft[i] = opus_fft_alloc_twiddles(N>>2>>i, 0, 0, l->kfft[0]);
l->kfft[i] = opus_fft_alloc_twiddles(N>>2>>i, 0, 0, l->kfft[0], arch);
#ifndef ENABLE_TI_DSPLIB55
if (l->kfft[i]==NULL)
return 0;
#endif
}
l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N4+1)*sizeof(kiss_twiddle_scalar));
l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N-(N2>>maxshift))*sizeof(kiss_twiddle_scalar));
if (l->trig==NULL)
return 0;
/* We have enough points that sine isn't necessary */
for (shift=0;shift<=maxshift;shift++)
{
/* We have enough points that sine isn't necessary */
#if defined(FIXED_POINT)
for (i=0;i<=N4;i++)
trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2),N));
#if 1
for (i=0;i<N2;i++)
trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2+16384),N));
#else
for (i=0;i<=N4;i++)
trig[i] = (kiss_twiddle_scalar)cos(2*PI*i/N);
for (i=0;i<N2;i++)
trig[i] = (kiss_twiddle_scalar)MAX32(-32767,MIN32(32767,floor(.5+32768*cos(2*M_PI*(i+.125)/N))));
#endif
#else
for (i=0;i<N2;i++)
trig[i] = (kiss_twiddle_scalar)cos(2*PI*(i+.125)/N);
#endif
trig += N2;
N2 >>= 1;
N >>= 1;
}
return 1;
}
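The new trig allocation above is the closed form of a geometric sum: each shift level stores N2 twiddles and N2 halves per level, so maxshift+1 levels hold N/2 + N/4 + ... = N - (N2>>maxshift) entries. A two-line check with e.g. N=1920, maxshift=3 (960+480+240+120 = 1800 = 1920-120):

#include <assert.h>

int main(void)
{
    int N = 1920, maxshift = 3, N2 = N >> 1;
    int total = 0, n2 = N2;
    for (int s = 0; s <= maxshift; s++) { total += n2; n2 >>= 1; }
    assert(total == N - (N2 >> maxshift));  /* 1800 == 1800 */
    return 0;
}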
void clt_mdct_clear(mdct_lookup *l)
void clt_mdct_clear(mdct_lookup *l, int arch)
{
int i;
for (i=0;i<=l->maxshift;i++)
opus_fft_free(l->kfft[i]);
opus_fft_free(l->kfft[i], arch);
opus_free((kiss_twiddle_scalar*)l->trig);
}
#endif /* CUSTOM_MODES */
/* Forward MDCT trashes the input array */
void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
const opus_val16 *window, int overlap, int shift, int stride)
#ifndef OVERRIDE_clt_mdct_forward
void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
const opus_val16 *window, int overlap, int shift, int stride, int arch)
{
int i;
int N, N2, N4;
kiss_twiddle_scalar sine;
VARDECL(kiss_fft_scalar, f);
VARDECL(kiss_fft_scalar, f2);
VARDECL(kiss_fft_cpx, f2);
const kiss_fft_state *st = l->kfft[shift];
const kiss_twiddle_scalar *trig;
opus_val16 scale;
#ifdef FIXED_POINT
/* Allows us to scale with MULT16_32_Q16(), which is faster than
MULT16_32_Q15() on ARM. */
int scale_shift = st->scale_shift-1;
#endif
SAVE_STACK;
(void)arch;
scale = st->scale;
N = l->n;
N >>= shift;
trig = l->trig;
for (i=0;i<shift;i++)
{
N >>= 1;
trig += N;
}
N2 = N>>1;
N4 = N>>2;
ALLOC(f, N2, kiss_fft_scalar);
ALLOC(f2, N2, kiss_fft_scalar);
/* sin(x) ~= x here */
#ifdef FIXED_POINT
sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
#else
sine = (kiss_twiddle_scalar)2*PI*(.125f)/N;
#endif
ALLOC(f2, N4, kiss_fft_cpx);
/* Consider the input to be composed of four blocks: [a, b, c, d] */
/* Window, shuffle, fold */
@@ -167,123 +191,131 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
/* Pre-rotation */
{
kiss_fft_scalar * OPUS_RESTRICT yp = f;
const kiss_twiddle_scalar *t = &l->trig[0];
const kiss_twiddle_scalar *t = &trig[0];
for(i=0;i<N4;i++)
{
kiss_fft_cpx yc;
kiss_twiddle_scalar t0, t1;
kiss_fft_scalar re, im, yr, yi;
re = yp[0];
im = yp[1];
yr = -S_MUL(re,t[i<<shift]) - S_MUL(im,t[(N4-i)<<shift]);
yi = -S_MUL(im,t[i<<shift]) + S_MUL(re,t[(N4-i)<<shift]);
/* works because the cos is nearly one */
*yp++ = yr + S_MUL(yi,sine);
*yp++ = yi - S_MUL(yr,sine);
t0 = t[i];
t1 = t[N4+i];
re = *yp++;
im = *yp++;
yr = S_MUL(re,t0) - S_MUL(im,t1);
yi = S_MUL(im,t0) + S_MUL(re,t1);
yc.r = yr;
yc.i = yi;
yc.r = PSHR32(MULT16_32_Q16(scale, yc.r), scale_shift);
yc.i = PSHR32(MULT16_32_Q16(scale, yc.i), scale_shift);
f2[st->bitrev[i]] = yc;
}
}
/* N/4 complex FFT, down-scales by 4/N */
opus_fft(l->kfft[shift], (kiss_fft_cpx *)f, (kiss_fft_cpx *)f2);
/* N/4 complex FFT, does not downscale anymore */
opus_fft_impl(st, f2);
/* Post-rotate */
{
/* Temp pointers to make it really clear to the compiler what we're doing */
const kiss_fft_scalar * OPUS_RESTRICT fp = f2;
const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
const kiss_twiddle_scalar *t = &l->trig[0];
const kiss_twiddle_scalar *t = &trig[0];
/* Temp pointers to make it really clear to the compiler what we're doing */
for(i=0;i<N4;i++)
{
kiss_fft_scalar yr, yi;
yr = S_MUL(fp[1],t[(N4-i)<<shift]) + S_MUL(fp[0],t[i<<shift]);
yi = S_MUL(fp[0],t[(N4-i)<<shift]) - S_MUL(fp[1],t[i<<shift]);
/* works because the cos is nearly one */
*yp1 = yr - S_MUL(yi,sine);
*yp2 = yi + S_MUL(yr,sine);
fp += 2;
yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);
yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);
*yp1 = yr;
*yp2 = yi;
fp++;
yp1 += 2*stride;
yp2 -= 2*stride;
}
}
RESTORE_STACK;
}
#endif /* OVERRIDE_clt_mdct_forward */
void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride)
#ifndef OVERRIDE_clt_mdct_backward
void clt_mdct_backward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch)
{
int i;
int N, N2, N4;
kiss_twiddle_scalar sine;
VARDECL(kiss_fft_scalar, f2);
SAVE_STACK;
const kiss_twiddle_scalar *trig;
(void) arch;
N = l->n;
N >>= shift;
trig = l->trig;
for (i=0;i<shift;i++)
{
N >>= 1;
trig += N;
}
N2 = N>>1;
N4 = N>>2;
ALLOC(f2, N2, kiss_fft_scalar);
/* sin(x) ~= x here */
#ifdef FIXED_POINT
sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
#else
sine = (kiss_twiddle_scalar)2*PI*(.125f)/N;
#endif
/* Pre-rotate */
{
/* Temp pointers to make it really clear to the compiler what we're doing */
const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
kiss_fft_scalar * OPUS_RESTRICT yp = f2;
const kiss_twiddle_scalar *t = &l->trig[0];
kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
for(i=0;i<N4;i++)
{
int rev;
kiss_fft_scalar yr, yi;
yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]);
yi = -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]);
/* works because the cos is nearly one */
*yp++ = yr - S_MUL(yi,sine);
*yp++ = yi + S_MUL(yr,sine);
rev = *bitrev++;
yr = ADD32_ovflw(S_MUL(*xp2, t[i]), S_MUL(*xp1, t[N4+i]));
yi = SUB32_ovflw(S_MUL(*xp1, t[i]), S_MUL(*xp2, t[N4+i]));
/* We swap real and imag because we use an FFT instead of an IFFT. */
yp[2*rev+1] = yr;
yp[2*rev] = yi;
/* Storing the pre-rotation directly in the bitrev order. */
xp1+=2*stride;
xp2-=2*stride;
}
}
/* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */
opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)(out+(overlap>>1)));
opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)));
/* Post-rotate and de-shuffle from both ends of the buffer at once to make
it in-place. */
{
kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1);
kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2;
const kiss_twiddle_scalar *t = &l->trig[0];
kiss_fft_scalar * yp0 = out+(overlap>>1);
kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
const kiss_twiddle_scalar *t = &trig[0];
/* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
middle pair will be computed twice. */
for(i=0;i<(N4+1)>>1;i++)
{
kiss_fft_scalar re, im, yr, yi;
kiss_twiddle_scalar t0, t1;
re = yp0[0];
im = yp0[1];
t0 = t[i<<shift];
t1 = t[(N4-i)<<shift];
/* We swap real and imag because we're using an FFT instead of an IFFT. */
re = yp0[1];
im = yp0[0];
t0 = t[i];
t1 = t[N4+i];
/* We'd scale up by 2 here, but instead it's done when mixing the windows */
yr = S_MUL(re,t0) - S_MUL(im,t1);
yi = S_MUL(im,t0) + S_MUL(re,t1);
re = yp1[0];
im = yp1[1];
/* works because the cos is nearly one */
yp0[0] = -(yr - S_MUL(yi,sine));
yp1[1] = yi + S_MUL(yr,sine);
yr = ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1));
yi = SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0));
/* We swap real and imag because we're using an FFT instead of an IFFT. */
re = yp1[1];
im = yp1[0];
yp0[0] = yr;
yp1[1] = yi;
t0 = t[(N4-i-1)<<shift];
t1 = t[(i+1)<<shift];
t0 = t[(N4-i-1)];
t1 = t[(N2-i-1)];
/* We'd scale up by 2 here, but instead it's done when mixing the windows */
yr = S_MUL(re,t0) - S_MUL(im,t1);
yi = S_MUL(im,t0) + S_MUL(re,t1);
/* works because the cos is nearly one */
yp1[0] = -(yr - S_MUL(yi,sine));
yp0[1] = yi + S_MUL(yr,sine);
yr = ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1));
yi = SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0));
yp1[0] = yr;
yp0[1] = yi;
yp0 += 2;
yp1 -= 2;
}
@@ -301,11 +333,11 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
kiss_fft_scalar x1, x2;
x1 = *xp1;
x2 = *yp1;
*yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);
*xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);
*yp1++ = SUB32_ovflw(MULT16_32_Q15(*wp2, x2), MULT16_32_Q15(*wp1, x1));
*xp1-- = ADD32_ovflw(MULT16_32_Q15(*wp1, x2), MULT16_32_Q15(*wp2, x1));
wp1++;
wp2--;
}
}
RESTORE_STACK;
}
#endif /* OVERRIDE_clt_mdct_backward */
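The real/imag swapping above relies on a standard identity: with S(a+bi) = b+ai, i.e. S(z) = i*conj(z), the unnormalized inverse DFT is S(DFT(S(x))), which is what lets the decoder reuse the forward FFT kernel. A standalone 4-point check (C99 complex, not Opus code; link with -lm):

#include <complex.h>
#include <stdio.h>

static const double PI = 3.14159265358979323846;

/* Unnormalized forward DFT, 4 points. */
static void dft4(const double complex *in, double complex *out)
{
    for (int k = 0; k < 4; k++) {
        out[k] = 0;
        for (int n = 0; n < 4; n++)
            out[k] += in[n] * cexp(-2*PI*I*k*n/4);
    }
}

static double complex swap_ri(double complex z)
{
    return cimag(z) + creal(z)*I;   /* S(a+bi) = b+ai */
}

int main(void)
{
    double complex x[4] = { 1+2*I, -3+1*I, 0.5, 4-1*I }, s[4], f[4];
    for (int n = 0; n < 4; n++) s[n] = swap_ri(x[n]);
    dft4(s, f);
    for (int k = 0; k < 4; k++) {
        double complex idft = 0;
        for (int n = 0; n < 4; n++) idft += x[n] * cexp(2*PI*I*k*n/4);
        /* Differences are at floating-point rounding level. */
        printf("k=%d  |S(DFT(S(x))) - IDFT(x)| = %g\n", k, cabs(swap_ri(f[k]) - idft));
    }
    return 0;
}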
@@ -53,18 +53,60 @@ typedef struct {
const kiss_twiddle_scalar * OPUS_RESTRICT trig;
} mdct_lookup;
int clt_mdct_init(mdct_lookup *l,int N, int maxshift);
void clt_mdct_clear(mdct_lookup *l);
#if defined(HAVE_ARM_NE10)
#include "arm/mdct_arm.h"
#endif
int clt_mdct_init(mdct_lookup *l,int N, int maxshift, int arch);
void clt_mdct_clear(mdct_lookup *l, int arch);
/** Compute a forward MDCT and scale by 4/N, trashes the input array */
void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in,
kiss_fft_scalar * OPUS_RESTRICT out,
const opus_val16 *window, int overlap, int shift, int stride);
void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in,
kiss_fft_scalar * OPUS_RESTRICT out,
const opus_val16 *window, int overlap,
int shift, int stride, int arch);
/** Compute a backward MDCT (no scaling) and performs weighted overlap-add
(scales implicitly by 1/2) */
void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in,
void clt_mdct_backward_c(const mdct_lookup *l, kiss_fft_scalar *in,
kiss_fft_scalar * OPUS_RESTRICT out,
const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride);
const opus_val16 * OPUS_RESTRICT window,
int overlap, int shift, int stride, int arch);
#if !defined(OVERRIDE_OPUS_MDCT)
/* Is run-time CPU detection enabled on this platform? */
#if defined(OPUS_HAVE_RTCD) && defined(HAVE_ARM_NE10)
extern void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(
const mdct_lookup *l, kiss_fft_scalar *in,
kiss_fft_scalar * OPUS_RESTRICT out, const opus_val16 *window,
int overlap, int shift, int stride, int arch);
#define clt_mdct_forward(_l, _in, _out, _window, _overlap, _shift, _stride, _arch) \
((*CLT_MDCT_FORWARD_IMPL[(arch)&OPUS_ARCHMASK])(_l, _in, _out, \
_window, _overlap, _shift, \
_stride, _arch))
extern void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(
const mdct_lookup *l, kiss_fft_scalar *in,
kiss_fft_scalar * OPUS_RESTRICT out, const opus_val16 *window,
int overlap, int shift, int stride, int arch);
#define clt_mdct_backward(_l, _in, _out, _window, _overlap, _shift, _stride, _arch) \
(*CLT_MDCT_BACKWARD_IMPL[(arch)&OPUS_ARCHMASK])(_l, _in, _out, \
_window, _overlap, _shift, \
_stride, _arch)
#else /* if defined(OPUS_HAVE_RTCD) && defined(HAVE_ARM_NE10) */
#define clt_mdct_forward(_l, _in, _out, _window, _overlap, _shift, _stride, _arch) \
clt_mdct_forward_c(_l, _in, _out, _window, _overlap, _shift, _stride, _arch)
#define clt_mdct_backward(_l, _in, _out, _window, _overlap, _shift, _stride, _arch) \
clt_mdct_backward_c(_l, _in, _out, _window, _overlap, _shift, _stride, _arch)
#endif /* end if defined(OPUS_HAVE_RTCD) && defined(HAVE_ARM_NE10) */
#endif /* end if !defined(OVERRIDE_OPUS_MDCT) */
#endif
@@ -37,6 +37,7 @@
#include "os_support.h"
#include "stack_alloc.h"
#include "quant_bands.h"
#include "cpu_support.h"
static const opus_int16 eband5ms[] = {
/*0 200 400 600 800 1k 1.2 1.4 1.6 2k 2.4 2.8 3.2 4k 4.8 5.6 6.8 8k 9.6 12k 15.6 */
@@ -229,6 +230,7 @@ CELTMode *opus_custom_mode_create(opus_int32 Fs, int frame_size, int *error)
opus_val16 *window;
opus_int16 *logN;
int LM;
int arch = opus_select_arch();
ALLOC_STACK;
#if !defined(VAR_ARRAYS) && !defined(USE_ALLOCA)
if (global_stack==NULL)
@@ -389,7 +391,7 @@ CELTMode *opus_custom_mode_create(opus_int32 Fs, int frame_size, int *error)
compute_pulse_cache(mode, mode->maxLM);
if (clt_mdct_init(&mode->mdct, 2*mode->shortMdctSize*mode->nbShortMdcts,
mode->maxLM) == 0)
mode->maxLM, arch) == 0)
goto failure;
if (error)
@@ -408,6 +410,8 @@ CELTMode *opus_custom_mode_create(opus_int32 Fs, int frame_size, int *error)
#ifdef CUSTOM_MODES
void opus_custom_mode_destroy(CELTMode *mode)
{
int arch = opus_select_arch();
if (mode == NULL)
return;
#ifndef CUSTOM_MODES_ONLY
@@ -431,7 +435,7 @@ void opus_custom_mode_destroy(CELTMode *mode)
opus_free((opus_int16*)mode->cache.index);
opus_free((unsigned char*)mode->cache.bits);
opus_free((unsigned char*)mode->cache.caps);
clt_mdct_clear(&mode->mdct);
clt_mdct_clear(&mode->mdct, arch);
opus_free((CELTMode *)mode);
}
@@ -39,14 +39,6 @@
#define MAX_PERIOD 1024
#ifndef OVERLAP
#define OVERLAP(mode) ((mode)->overlap)
#endif
#ifndef FRAMESIZE
#define FRAMESIZE(mode) ((mode)->mdctSize)
#endif
typedef struct {
int size;
const opus_int16 *index;
@@ -67,18 +67,18 @@ static OPUS_INLINE void opus_free (void *ptr)
}
#endif
/** Copy n bytes of memory from src to dst. The 0* term provides compile-time type checking */
/** Copy n elements from src to dst. The 0* term provides compile-time type checking */
#ifndef OVERRIDE_OPUS_COPY
#define OPUS_COPY(dst, src, n) (memcpy((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
#endif
/** Copy n bytes of memory from src to dst, allowing overlapping regions. The 0* term
/** Copy n elements from src to dst, allowing overlapping regions. The 0* term
provides compile-time type checking */
#ifndef OVERRIDE_OPUS_MOVE
#define OPUS_MOVE(dst, src, n) (memmove((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
#endif
/** Set n elements of dst to zero, starting at address s */
/** Set n elements of dst to zero */
#ifndef OVERRIDE_OPUS_CLEAR
#define OPUS_CLEAR(dst, n) (memset((dst), 0, (n)*sizeof(*(dst))))
#endif
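The otherwise-dead 0*((dst)-(src)) term works because pointer subtraction is only valid between pointers to compatible types, so a mismatched copy fails at compile time instead of silently copying the wrong number of bytes. A sketch of the same trick in isolation:

#include <string.h>

/* Compiles only when dst and src point to compatible types. */
#define COPY_CHECKED(dst, src, n) \
    (memcpy((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src))))

int main(void)
{
    int a[4] = { 1, 2, 3, 4 }, b[4];
    COPY_CHECKED(b, a, 4);                   /* fine */
    /* float f[4]; COPY_CHECKED(b, f, 4);       constraint violation */
    return 0;
}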
@@ -214,25 +214,35 @@ void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x
celt_fir5(x_lp, lpc2, x_lp, len>>1, mem);
}
#if 0 /* This is a simple version of the pitch correlation that should work
well on DSPs like Blackfin and TI C5x/C6x */
/* Pure C implementation. */
#ifdef FIXED_POINT
opus_val32
#else
void
#endif
celt_pitch_xcorr(opus_val16 *x, opus_val16 *y, opus_val32 *xcorr, int len, int max_pitch)
#if defined(OVERRIDE_PITCH_XCORR)
celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch)
#else
celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch, int arch)
#endif
{
#if 0 /* This is a simple version of the pitch correlation that should work
well on DSPs like Blackfin and TI C5x/C6x */
int i, j;
#ifdef FIXED_POINT
opus_val32 maxcorr=1;
#endif
#if !defined(OVERRIDE_PITCH_XCORR)
(void)arch;
#endif
for (i=0;i<max_pitch;i++)
{
opus_val32 sum = 0;
for (j=0;j<len;j++)
sum = MAC16_16(sum, x[j],y[i+j]);
sum = MAC16_16(sum, _x[j], _y[i+j]);
xcorr[i] = sum;
#ifdef FIXED_POINT
maxcorr = MAX32(maxcorr, sum);
@@ -241,30 +251,25 @@ celt_pitch_xcorr(opus_val16 *x, opus_val16 *y, opus_val32 *xcorr, int len, int m
#ifdef FIXED_POINT
return maxcorr;
#endif
}
#else /* Unrolled version of the pitch correlation -- runs faster on x86 and ARM */
#ifdef FIXED_POINT
opus_val32
#else
void
#endif
celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch)
{
int i,j;
int i;
/*The EDSP version requires that max_pitch is at least 1, and that _x is
32-bit aligned.
Since it's hard to put asserts in assembly, put them here.*/
celt_assert(max_pitch>0);
celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
#ifdef FIXED_POINT
opus_val32 maxcorr=1;
#endif
celt_assert(max_pitch>0);
celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
for (i=0;i<max_pitch-3;i+=4)
{
opus_val32 sum[4]={0,0,0,0};
xcorr_kernel(_x, _y+i, sum, len);
#if defined(OVERRIDE_PITCH_XCORR)
xcorr_kernel_c(_x, _y+i, sum, len);
#else
xcorr_kernel(_x, _y+i, sum, len, arch);
#endif
xcorr[i]=sum[0];
xcorr[i+1]=sum[1];
xcorr[i+2]=sum[2];
@@ -279,9 +284,12 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr
/* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
for (;i<max_pitch;i++)
{
opus_val32 sum = 0;
for (j=0;j<len;j++)
sum = MAC16_16(sum, _x[j],_y[i+j]);
opus_val32 sum;
#if defined(OVERRIDE_PITCH_XCORR)
sum = celt_inner_prod_c(_x, _y+i, len);
#else
sum = celt_inner_prod(_x, _y+i, len, arch);
#endif
xcorr[i] = sum;
#ifdef FIXED_POINT
maxcorr = MAX32(maxcorr, sum);
@@ -290,9 +298,9 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr
#ifdef FIXED_POINT
return maxcorr;
#endif
#endif
}
#endif
void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTRICT y,
int len, int max_pitch, int *pitch, int arch)
{
@@ -361,12 +369,17 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
#endif
for (i=0;i<max_pitch>>1;i++)
{
opus_val32 sum=0;
opus_val32 sum;
xcorr[i] = 0;
if (abs(i-2*best_pitch[0])>2 && abs(i-2*best_pitch[1])>2)
continue;
#ifdef FIXED_POINT
sum = 0;
for (j=0;j<len>>1;j++)
sum += SHR32(MULT16_16(x_lp[j],y[i+j]), shift);
#else
sum = celt_inner_prod_c(x_lp, y+i, len>>1);
#endif
xcorr[i] = MAX32(-1, sum);
#ifdef FIXED_POINT
maxcorr = MAX32(maxcorr, sum);
@@ -399,9 +412,44 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
RESTORE_STACK;
}
#ifdef FIXED_POINT
static opus_val16 compute_pitch_gain(opus_val32 xy, opus_val32 xx, opus_val32 yy)
{
opus_val32 x2y2;
int sx, sy, shift;
opus_val32 g;
opus_val16 den;
if (xy == 0 || xx == 0 || yy == 0)
return 0;
sx = celt_ilog2(xx)-14;
sy = celt_ilog2(yy)-14;
shift = sx + sy;
x2y2 = MULT16_16_Q14(VSHR32(xx, sx), VSHR32(yy, sy));
if (shift & 1) {
if (x2y2 < 32768)
{
x2y2 <<= 1;
shift--;
} else {
x2y2 >>= 1;
shift++;
}
}
den = celt_rsqrt_norm(x2y2);
g = MULT16_32_Q15(den, xy);
g = VSHR32(g, (shift>>1)-1);
return EXTRACT16(MIN32(g, Q15ONE));
}
#else
static opus_val16 compute_pitch_gain(opus_val32 xy, opus_val32 xx, opus_val32 yy)
{
return xy/celt_sqrt(1+xx*yy);
}
#endif
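Both branches above compute the normalized correlation g = xy/sqrt(xx*yy) (the float branch adds a small +1 regularizer; the fixed-point branch clamps at Q15ONE). The fixed-point path gets there by normalizing xx and yy with sx and sy, forcing the combined scale sx+sy to be even so that the square root of the scale is itself an exact power of two, then applying celt_rsqrt_norm to the Q14 product. A double-precision sketch of the target value:

#include <math.h>

/* Reference for what compute_pitch_gain() approximates. */
static double pitch_gain_ref(double xy, double xx, double yy)
{
    if (xy == 0 || xx == 0 || yy == 0)
        return 0;
    double g = xy / sqrt(xx * yy);
    return g < 1 ? g : 1;   /* mirrors MIN32(g, Q15ONE) above */
}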
static const int second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2};
opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
int N, int *T0_, int prev_period, opus_val16 prev_gain)
int N, int *T0_, int prev_period, opus_val16 prev_gain, int arch)
{
int k, i, T, T0;
opus_val16 g, g0;
@@ -426,7 +474,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
T = T0 = *T0_;
ALLOC(yy_lookup, maxperiod+1, opus_val32);
dual_inner_prod(x, x, x-T0, N, &xx, &xy);
dual_inner_prod(x, x, x-T0, N, &xx, &xy, arch);
yy_lookup[0] = xx;
yy=xx;
for (i=1;i<=maxperiod;i++)
@@ -437,26 +485,15 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
yy = yy_lookup[T0];
best_xy = xy;
best_yy = yy;
#ifdef FIXED_POINT
{
opus_val32 x2y2;
int sh, t;
x2y2 = 1+HALF32(MULT32_32_Q31(xx,yy));
sh = celt_ilog2(x2y2)>>1;
t = VSHR32(x2y2, 2*(sh-7));
g = g0 = VSHR32(MULT16_32_Q15(celt_rsqrt_norm(t), xy),sh+1);
}
#else
g = g0 = xy/celt_sqrt(1+xx*yy);
#endif
g = g0 = compute_pitch_gain(xy, xx, yy);
/* Look for any pitch at T/k */
for (k=2;k<=15;k++)
{
int T1, T1b;
opus_val16 g1;
opus_val16 cont=0;
opus_val16 thresh;
T1 = (2*T0+k)/(2*k);
T1 = celt_udiv(2*T0+k, 2*k);
if (T1 < minperiod)
break;
/* Look for another strong correlation at T1b */
@@ -468,27 +505,16 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
T1b = T0+T1;
} else
{
T1b = (2*second_check[k]*T0+k)/(2*k);
T1b = celt_udiv(2*second_check[k]*T0+k, 2*k);
}
dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2);
xy += xy2;
yy = yy_lookup[T1] + yy_lookup[T1b];
#ifdef FIXED_POINT
{
opus_val32 x2y2;
int sh, t;
x2y2 = 1+MULT32_32_Q31(xx,yy);
sh = celt_ilog2(x2y2)>>1;
t = VSHR32(x2y2, 2*(sh-7));
g1 = VSHR32(MULT16_32_Q15(celt_rsqrt_norm(t), xy),sh+1);
}
#else
g1 = xy/celt_sqrt(1+2.f*xx*1.f*yy);
#endif
dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2, arch);
xy = HALF32(xy + xy2);
yy = HALF32(yy_lookup[T1] + yy_lookup[T1b]);
g1 = compute_pitch_gain(xy, xx, yy);
if (abs(T1-prev_period)<=1)
cont = prev_gain;
else if (abs(T1-prev_period)<=2 && 5*k*k < T0)
cont = HALF32(prev_gain);
cont = HALF16(prev_gain);
else
cont = 0;
thresh = MAX16(QCONST16(.3f,15), MULT16_16_Q15(QCONST16(.7f,15),g0)-cont);
@@ -513,13 +539,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
pg = SHR32(frac_div32(best_xy,best_yy+1),16);
for (k=0;k<3;k++)
{
int T1 = T+k-1;
xy = 0;
for (i=0;i<N;i++)
xy = MAC16_16(xy, x[i], x[i-T1]);
xcorr[k] = xy;
}
xcorr[k] = celt_inner_prod(x, x-(T+k-1), N, arch);
if ((xcorr[2]-xcorr[0]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[0]))
offset = 1;
else if ((xcorr[0]-xcorr[2]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[2]))
@@ -37,11 +37,17 @@
#include "modes.h"
#include "cpu_support.h"
#if defined(__SSE__) && !defined(FIXED_POINT)
#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) \
|| ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT))
#include "x86/pitch_sse.h"
#endif
#if defined(OPUS_ARM_ASM) && defined(FIXED_POINT)
#if defined(MIPSr1_ASM)
#include "mips/pitch_mipsr1.h"
#endif
#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
|| defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
# include "arm/pitch_arm.h"
#endif
@@ -52,12 +58,12 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
int len, int max_pitch, int *pitch, int arch);
opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
int N, int *T0, int prev_period, opus_val16 prev_gain);
int N, int *T0, int prev_period, opus_val16 prev_gain, int arch);
/* OPT: This is the kernel you really want to optimize. It gets used a lot
by the prefilter and by the PLC. */
#ifndef OVERRIDE_XCORR_KERNEL
static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
static OPUS_INLINE void xcorr_kernel_c(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
{
int j;
opus_val16 y_0, y_1, y_2, y_3;
@@ -122,10 +128,14 @@ static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y,
sum[3] = MAC16_16(sum[3],tmp,y_1);
}
}
#ifndef OVERRIDE_XCORR_KERNEL
#define xcorr_kernel(x, y, sum, len, arch) \
((void)(arch),xcorr_kernel_c(x, y, sum, len))
#endif /* OVERRIDE_XCORR_KERNEL */
#ifndef OVERRIDE_DUAL_INNER_PROD
static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
static OPUS_INLINE void dual_inner_prod_c(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
int N, opus_val32 *xy1, opus_val32 *xy2)
{
int i;
@@ -139,8 +149,35 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y
*xy1 = xy01;
*xy2 = xy02;
}
#ifndef OVERRIDE_DUAL_INNER_PROD
# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
((void)(arch),dual_inner_prod_c(x, y01, y02, N, xy1, xy2))
#endif
/*We make sure a C version is always available for cases where the overhead of
vectorization and passing around an arch flag aren't worth it.*/
static OPUS_INLINE opus_val32 celt_inner_prod_c(const opus_val16 *x,
const opus_val16 *y, int N)
{
int i;
opus_val32 xy=0;
for (i=0;i<N;i++)
xy = MAC16_16(xy, x[i], y[i]);
return xy;
}
#if !defined(OVERRIDE_CELT_INNER_PROD)
# define celt_inner_prod(x, y, N, arch) \
((void)(arch),celt_inner_prod_c(x, y, N))
#endif
#ifdef NON_STATIC_COMB_FILTER_CONST_C
void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
opus_val16 g10, opus_val16 g11, opus_val16 g12);
#endif
#ifdef FIXED_POINT
opus_val32
#else
@@ -150,24 +187,14 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch);
#if !defined(OVERRIDE_PITCH_XCORR)
/*Is run-time CPU detection enabled on this platform?*/
# if defined(OPUS_HAVE_RTCD)
extern
# if defined(FIXED_POINT)
#ifdef FIXED_POINT
opus_val32
# else
#else
void
# endif
(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
const opus_val16 *, opus_val32 *, int, int);
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
xcorr, len, max_pitch))
# else
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
((void)(arch),celt_pitch_xcorr_c(_x, _y, xcorr, len, max_pitch))
# endif
#endif
celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch, int arch);
#endif
#endif
@@ -292,7 +292,7 @@ void quant_coarse_energy(const CELTMode *m, int start, int end, int effEnd,
#endif
}
if (lfe)
max_decay=3;
max_decay = QCONST16(3.f,DB_SHIFT);
enc_start_state = *enc;
ALLOC(oldEBands_intra, C*m->nbEBands, opus_val16);
@@ -418,6 +418,7 @@ void quant_energy_finalise(const CELTMode *m, int start, int end, opus_val16 *ol
offset = (q2-.5f)*(1<<(14-fine_quant[i]-1))*(1.f/16384);
#endif
oldEBands[i+c*m->nbEBands] += offset;
error[i+c*m->nbEBands] -= offset;
bits_left--;
} while (++c < C);
}
@@ -547,9 +548,15 @@ void amp2Log2(const CELTMode *m, int effEnd, int end,
c=0;
do {
for (i=0;i<effEnd;i++)
{
bandLogE[i+c*m->nbEBands] =
celt_log2(SHL32(bandE[i+c*m->nbEBands],2))
celt_log2(bandE[i+c*m->nbEBands])
- SHL16((opus_val16)eMeans[i],6);
#ifdef FIXED_POINT
/* Compensate for bandE[] being Q12 but celt_log2() taking a Q14 input. */
bandLogE[i+c*m->nbEBands] += QCONST16(2.f, DB_SHIFT);
#endif
}
for (i=effEnd;i<end;i++)
bandLogE[c*m->nbEBands+i] = -QCONST16(14.f,DB_SHIFT);
} while (++c < C);
@@ -131,7 +131,7 @@ void compute_pulse_cache(CELTMode *m, int LM)
for (i=0;i<nbEntries;i++)
{
unsigned char *ptr = bits+entryI[i];
opus_int16 tmp[MAX_PULSES+1];
opus_int16 tmp[CELT_MAX_PULSES+1];
get_required_bits(tmp, entryN[i], get_pulses(entryK[i]), BITRES);
for (j=1;j<=entryK[i];j++)
ptr[j] = tmp[get_pulses(j)]-1;
@@ -296,7 +296,7 @@ static OPUS_INLINE int interp_bits2pulses(const CELTMode *m, int start, int end,
done = 0;
for (j=end;j-->start;)
{
int tmp = bits1[j] + (lo*bits2[j]>>ALLOC_STEPS);
int tmp = bits1[j] + ((opus_int32)lo*bits2[j]>>ALLOC_STEPS);
if (tmp < thresh[j] && !done)
{
if (tmp >= alloc_floor)
@@ -333,7 +333,7 @@ static OPUS_INLINE int interp_bits2pulses(const CELTMode *m, int start, int end,
/*Figure out how many left-over bits we would be adding to this band.
This can include bits we've stolen back from higher, skipped bands.*/
left = total-psum;
percoeff = left/(m->eBands[codedBands]-m->eBands[start]);
percoeff = celt_udiv(left, m->eBands[codedBands]-m->eBands[start]);
left -= (m->eBands[codedBands]-m->eBands[start])*percoeff;
rem = IMAX(left-(m->eBands[j]-m->eBands[start]),0);
band_width = m->eBands[codedBands]-m->eBands[j];
@@ -348,12 +348,17 @@ static OPUS_INLINE int interp_bits2pulses(const CELTMode *m, int start, int end,
/*This if() block is the only part of the allocation function that
is not a mandatory part of the bitstream: any bands we choose to
skip here must be explicitly signaled.*/
/*Choose a threshold with some hysteresis to keep bands from
fluctuating in and out.*/
int depth_threshold;
/*We choose a threshold with some hysteresis to keep bands from
fluctuating in and out, but we try not to fold below a certain point. */
if (codedBands > 17)
depth_threshold = j<prev ? 7 : 9;
else
depth_threshold = 0;
#ifdef FUZZING
if ((rand()&0x1) == 0)
#else
if (codedBands<=start+2 || (band_bits > ((j<prev?7:9)*band_width<<LM<<BITRES)>>4 && j<=signalBandwidth))
if (codedBands<=start+2 || (band_bits > (depth_threshold*band_width<<LM<<BITRES)>>4 && j<=signalBandwidth))
#endif
{
ec_enc_bit_logp(ec, 1, 1);
@@ -414,7 +419,7 @@ static OPUS_INLINE int interp_bits2pulses(const CELTMode *m, int start, int end,
/* Allocate the remaining bits */
left = total-psum;
percoeff = left/(m->eBands[codedBands]-m->eBands[start]);
percoeff = celt_udiv(left, m->eBands[codedBands]-m->eBands[start]);
left -= (m->eBands[codedBands]-m->eBands[start])*percoeff;
for (j=start;j<codedBands;j++)
bits[j] += ((int)percoeff*(m->eBands[j+1]-m->eBands[j]));
@@ -465,7 +470,8 @@ static OPUS_INLINE int interp_bits2pulses(const CELTMode *m, int start, int end,
offset += NClogN>>3;
/* Divide with rounding */
ebits[j] = IMAX(0, (bits[j] + offset + (den<<(BITRES-1))) / (den<<BITRES));
ebits[j] = IMAX(0, (bits[j] + offset + (den<<(BITRES-1))));
ebits[j] = celt_udiv(ebits[j], den)>>BITRES;
/* Make sure not to bust */
if (C*ebits[j] > (bits[j]>>BITRES))
@@ -32,7 +32,7 @@
#define MAX_PSEUDO 40
#define LOG_MAX_PSEUDO 6
#define MAX_PULSES 128
#define CELT_MAX_PULSES 128
#define MAX_FINE_BITS 8
@@ -116,9 +116,11 @@
#else
#ifdef CELT_C
char *scratch_ptr=0;
char *global_stack=0;
#else
extern char *global_stack;
extern char *scratch_ptr;
#endif /* CELT_C */
#ifdef ENABLE_VALGRIND
@@ -140,8 +142,12 @@ extern char *global_stack_top;
#define ALIGN(stack, size) ((stack) += ((size) - (long)(stack)) & ((size) - 1))
#define PUSH(stack, size, type) (ALIGN((stack),sizeof(type)/sizeof(char)),(stack)+=(size)*(sizeof(type)/sizeof(char)),(type*)((stack)-(size)*(sizeof(type)/sizeof(char))))
#if 0 /* Set this to 1 to instrument pseudostack usage */
#define RESTORE_STACK (printf("%ld %s:%d\n", global_stack-scratch_ptr, __FILE__, __LINE__),global_stack = _saved_stack)
#else
#define RESTORE_STACK (global_stack = _saved_stack)
#define ALLOC_STACK char *_saved_stack; (global_stack = (global_stack==0) ? opus_alloc_scratch(GLOBAL_STACK_SIZE) : global_stack); _saved_stack = global_stack;
#endif
#define ALLOC_STACK char *_saved_stack; (global_stack = (global_stack==0) ? (scratch_ptr=opus_alloc_scratch(GLOBAL_STACK_SIZE)) : global_stack); _saved_stack = global_stack;
#endif /* ENABLE_VALGRIND */
@@ -37,30 +37,35 @@
#include "os_support.h"
#include "bands.h"
#include "rate.h"
#include "pitch.h"
#ifndef OVERRIDE_vq_exp_rotation1
static void exp_rotation1(celt_norm *X, int len, int stride, opus_val16 c, opus_val16 s)
{
int i;
opus_val16 ms;
celt_norm *Xptr;
Xptr = X;
ms = NEG16(s);
for (i=0;i<len-stride;i++)
{
celt_norm x1, x2;
x1 = Xptr[0];
x2 = Xptr[stride];
Xptr[stride] = EXTRACT16(SHR32(MULT16_16(c,x2) + MULT16_16(s,x1), 15));
*Xptr++ = EXTRACT16(SHR32(MULT16_16(c,x1) - MULT16_16(s,x2), 15));
Xptr[stride] = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x2), s, x1), 15));
*Xptr++ = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x1), ms, x2), 15));
}
Xptr = &X[len-2*stride-1];
for (i=len-2*stride-1;i>=0;i--)
{
celt_norm x1, x2;
x1 = Xptr[0];
x2 = Xptr[stride];
Xptr[stride] = EXTRACT16(SHR32(MULT16_16(c,x2) + MULT16_16(s,x1), 15));
*Xptr-- = EXTRACT16(SHR32(MULT16_16(c,x1) - MULT16_16(s,x2), 15));
Xptr[stride] = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x2), s, x1), 15));
*Xptr-- = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x1), ms, x2), 15));
}
}
#endif /* OVERRIDE_vq_exp_rotation1 */
static void exp_rotation(celt_norm *X, int len, int dir, int stride, int K, int spread)
{
@@ -91,7 +96,7 @@ static void exp_rotation(celt_norm *X, int len, int dir, int stride, int K, int
}
/*NOTE: As a minor optimization, we could be passing around log2(B), not B, for both this and for
extract_collapse_mask().*/
len /= stride;
len = celt_udiv(len, stride);
for (i=0;i<stride;i++)
{
if (dir < 0)
@@ -140,53 +145,40 @@ static unsigned extract_collapse_mask(int *iy, int N, int B)
return 1;
/*NOTE: As a minor optimization, we could be passing around log2(B), not B, for both this and for
exp_rotation().*/
N0 = N/B;
N0 = celt_udiv(N, B);
collapse_mask = 0;
i=0; do {
int j;
unsigned tmp=0;
j=0; do {
collapse_mask |= (iy[i*N0+j]!=0)<<i;
tmp |= iy[i*N0+j];
} while (++j<N0);
collapse_mask |= (tmp!=0)<<i;
} while (++i<B);
return collapse_mask;
}
unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc
#ifdef RESYNTH
, opus_val16 gain
#endif
)
opus_val16 op_pvq_search_c(celt_norm *X, int *iy, int K, int N, int arch)
{
VARDECL(celt_norm, y);
VARDECL(int, iy);
VARDECL(opus_val16, signx);
VARDECL(int, signx);
int i, j;
opus_val16 s;
int pulsesLeft;
opus_val32 sum;
opus_val32 xy;
opus_val16 yy;
unsigned collapse_mask;
SAVE_STACK;
celt_assert2(K>0, "alg_quant() needs at least one pulse");
celt_assert2(N>1, "alg_quant() needs at least two dimensions");
(void)arch;
ALLOC(y, N, celt_norm);
ALLOC(iy, N, int);
ALLOC(signx, N, opus_val16);
exp_rotation(X, N, 1, B, K, spread);
ALLOC(signx, N, int);
/* Get rid of the sign */
sum = 0;
j=0; do {
if (X[j]>0)
signx[j]=1;
else {
signx[j]=-1;
X[j]=-X[j];
}
signx[j] = X[j]<0;
/* OPT: Make sure the compiler doesn't use a branch on ABS16(). */
X[j] = ABS16(X[j]);
iy[j] = 0;
y[j] = 0;
} while (++j<N);
@@ -218,7 +210,12 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc
while (++j<N);
sum = QCONST16(1.f,14);
}
rcp = EXTRACT16(MULT16_32_Q16(K-1, celt_rcp(sum)));
#ifdef FIXED_POINT
rcp = EXTRACT16(MULT16_32_Q16(K, celt_rcp(sum)));
#else
/* Using K+e with e < 1 guarantees we cannot get more than K pulses. */
rcp = EXTRACT16(MULT16_32_Q16(K+0.8, celt_rcp(sum)));
#endif
j=0; do {
#ifdef FIXED_POINT
/* It's really important to round *towards zero* here */
@@ -233,7 +230,7 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc
pulsesLeft -= iy[j];
} while (++j<N);
}
celt_assert2(pulsesLeft>=1, "Allocated too many pulses in the quick pass");
celt_assert2(pulsesLeft>=0, "Allocated too many pulses in the quick pass");
/* This should never happen, but just in case it does (e.g. on silence)
we fill the first bin with pulses. */
@@ -249,12 +246,12 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc
pulsesLeft=0;
}
s = 1;
for (i=0;i<pulsesLeft;i++)
{
opus_val16 Rxy, Ryy;
int best_id;
opus_val32 best_num = -VERY_LARGE16;
opus_val16 best_den = 0;
opus_val32 best_num;
opus_val16 best_den;
#ifdef FIXED_POINT
int rshift;
#endif
@@ -264,10 +261,23 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc
best_id = 0;
/* The squared magnitude term gets added anyway, so we might as well
add it outside the loop */
yy = ADD32(yy, 1);
j=0;
yy = ADD16(yy, 1);
/* Calculations for position 0 are out of the loop, in part to reduce
mispredicted branches (since the if condition is usually false)
in the loop. */
/* Temporary sums of the new pulse(s) */
Rxy = EXTRACT16(SHR32(ADD32(xy, EXTEND32(X[0])),rshift));
/* We're multiplying y[j] by two so we don't have to do it here */
Ryy = ADD16(yy, y[0]);
/* Approximate score: we maximise Rxy/sqrt(Ryy) (we're guaranteed that
Rxy is positive because the sign is pre-computed) */
Rxy = MULT16_16_Q15(Rxy,Rxy);
best_den = Ryy;
best_num = Rxy;
j=1;
do {
opus_val16 Rxy, Ryy;
/* Temporary sums of the new pulse(s) */
Rxy = EXTRACT16(SHR32(ADD32(xy, EXTEND32(X[j])),rshift));
/* We're multiplying y[j] by two so we don't have to do it here */
@@ -278,8 +288,11 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc
Rxy = MULT16_16_Q15(Rxy,Rxy);
/* The idea is to check for num/den >= best_num/best_den, but that way
we can do it without any division */
/* OPT: Make sure to use conditional moves here */
if (MULT16_16(best_den, Rxy) > MULT16_16(Ryy, best_num))
/* OPT: It's not clear whether a cmov is faster than a branch here
since the condition is more often false than true and using
a cmov introduces data dependencies across iterations. The optimal
choice may be architecture-dependent. */
if (opus_unlikely(MULT16_16(best_den, Rxy) > MULT16_16(Ryy, best_num)))
{
best_den = Ryy;
best_num = Rxy;
@@ -294,23 +307,47 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc
/* Only now that we've made the final choice, update y/iy */
/* Multiplying y[j] by 2 so we don't have to do it everywhere else */
y[best_id] += 2*s;
y[best_id] += 2;
iy[best_id]++;
}
/* Put the original sign back */
j=0;
do {
X[j] = MULT16_16(signx[j],X[j]);
if (signx[j] < 0)
iy[j] = -iy[j];
/*iy[j] = signx[j] ? -iy[j] : iy[j];*/
/* OPT: This is more likely to be compiled without a branch than the code above
but has the same performance otherwise. */
iy[j] = (iy[j]^-signx[j]) + signx[j];
} while (++j<N);
RESTORE_STACK;
return yy;
}
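The (iy[j]^-signx[j]) + signx[j] idiom above is a branchless conditional negate: for s in {0,1}, -s is either 0 or an all-ones mask, and (v^-1)+1 == (~v)+1 == -v in two's complement. A standalone check:

#include <assert.h>

/* s ? -v : v, without a branch. */
static int cond_negate(int v, int s)
{
    return (v ^ -s) + s;
}

int main(void)
{
    for (int v = -5; v <= 5; v++) {
        assert(cond_negate(v, 0) == v);
        assert(cond_negate(v, 1) == -v);
    }
    return 0;
}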
unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc,
opus_val16 gain, int resynth, int arch)
{
VARDECL(int, iy);
opus_val16 yy;
unsigned collapse_mask;
SAVE_STACK;
celt_assert2(K>0, "alg_quant() needs at least one pulse");
celt_assert2(N>1, "alg_quant() needs at least two dimensions");
/* Covers vectorization by up to 4. */
ALLOC(iy, N+3, int);
exp_rotation(X, N, 1, B, K, spread);
yy = op_pvq_search(X, iy, K, N, arch);
encode_pulses(iy, N, K, enc);
#ifdef RESYNTH
normalise_residual(iy, X, N, yy, gain);
exp_rotation(X, N, -1, B, K, spread);
#endif
if (resynth)
{
normalise_residual(iy, X, N, yy, gain);
exp_rotation(X, N, -1, B, K, spread);
}
collapse_mask = extract_collapse_mask(iy, N, B);
RESTORE_STACK;
@@ -322,7 +359,6 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc
unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B,
ec_dec *dec, opus_val16 gain)
{
int i;
opus_val32 Ryy;
unsigned collapse_mask;
VARDECL(int, iy);
@@ -331,34 +367,26 @@ unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B,
celt_assert2(K>0, "alg_unquant() needs at least one pulse");
celt_assert2(N>1, "alg_unquant() needs at least two dimensions");
ALLOC(iy, N, int);
decode_pulses(iy, N, K, dec);
Ryy = 0;
i=0;
do {
Ryy = MAC16_16(Ryy, iy[i], iy[i]);
} while (++i < N);
Ryy = decode_pulses(iy, N, K, dec);
normalise_residual(iy, X, N, Ryy, gain);
exp_rotation(X, N, -1, B, K, spread);
collapse_mask = extract_collapse_mask(iy, N, B);
RESTORE_STACK;
return collapse_mask;
}
void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
#ifndef OVERRIDE_renormalise_vector
void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch)
{
int i;
#ifdef FIXED_POINT
int k;
#endif
opus_val32 E = EPSILON;
opus_val32 E;
opus_val16 g;
opus_val32 t;
celt_norm *xptr = X;
for (i=0;i<N;i++)
{
E = MAC16_16(E, *xptr, *xptr);
xptr++;
}
celt_norm *xptr;
E = EPSILON + celt_inner_prod(X, X, N, arch);
#ifdef FIXED_POINT
k = celt_ilog2(E)>>1;
#endif
@@ -373,8 +401,9 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
}
/*return celt_sqrt(E);*/
}
#endif /* OVERRIDE_renormalise_vector */
int stereo_itheta(celt_norm *X, celt_norm *Y, int stereo, int N)
int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N, int arch)
{
int i;
int itheta;
@@ -393,22 +422,16 @@ int stereo_itheta(celt_norm *X, celt_norm *Y, int stereo, int N)
Eside = MAC16_16(Eside, s, s);
}
} else {
for (i=0;i<N;i++)
{
celt_norm m, s;
m = X[i];
s = Y[i];
Emid = MAC16_16(Emid, m, m);
Eside = MAC16_16(Eside, s, s);
}
Emid += celt_inner_prod(X, X, N, arch);
Eside += celt_inner_prod(Y, Y, N, arch);
}
mid = celt_sqrt(Emid);
side = celt_sqrt(Eside);
#ifdef FIXED_POINT
/* 0.63662 = 2/pi */
itheta = MULT16_16_Q15(QCONST16(0.63662f,15),celt_atan2p(side, mid));
#else
itheta = (int)floor(.5f+16384*0.63662f*atan2(side,mid));
itheta = (int)floor(.5f+16384*0.63662f*fast_atan2f(side,mid));
#endif
return itheta;
@@ -37,6 +37,21 @@
#include "entdec.h"
#include "modes.h"
#if (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT))
#include "x86/vq_sse.h"
#endif
#if defined(MIPSr1_ASM)
#include "mips/vq_mipsr1.h"
#endif
opus_val16 op_pvq_search_c(celt_norm *X, int *iy, int K, int N, int arch);
#if !defined(OVERRIDE_OP_PVQ_SEARCH)
#define op_pvq_search(x, iy, K, N, arch) \
(op_pvq_search_c(x, iy, K, N, arch))
#endif
/** Algebraic pulse-vector quantiser. The signal x is replaced by the sum of
* the pitch and a combination of pulses such that its norm is still equal
* to 1. This is the function that will typically require the most CPU.
@@ -46,12 +61,8 @@
* @param enc Entropy encoder state
* @ret A mask indicating which blocks in the band received pulses
*/
unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B,
ec_enc *enc
#ifdef RESYNTH
, opus_val16 gain
#endif
);
unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc,
opus_val16 gain, int resynth, int arch);
/** Algebraic pulse decoder
* @param X Decoded normalised spectrum (returned)
@@ -63,8 +74,8 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B,
unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B,
ec_dec *dec, opus_val16 gain);
void renormalise_vector(celt_norm *X, int N, opus_val16 gain);
void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch);
int stereo_itheta(celt_norm *X, celt_norm *Y, int stereo, int N);
int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N, int arch);
#endif /* VQ_H */
@@ -0,0 +1,132 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include "celt_lpc.h"
#include "stack_alloc.h"
#include "mathops.h"
#include "pitch.h"
#include "x86cpu.h"
#if defined(FIXED_POINT)
void celt_fir_sse4_1(const opus_val16 *_x,
const opus_val16 *num,
opus_val16 *_y,
int N,
int ord,
opus_val16 *mem,
int arch)
{
int i,j;
VARDECL(opus_val16, rnum);
VARDECL(opus_val16, x);
__m128i vecNoA;
opus_int32 noA ;
SAVE_STACK;
ALLOC(rnum, ord, opus_val16);
ALLOC(x, N+ord, opus_val16);
for(i=0;i<ord;i++)
rnum[i] = num[ord-i-1];
for(i=0;i<ord;i++)
x[i] = mem[ord-i-1];
for (i=0;i<N-7;i+=8)
{
x[i+ord ]=_x[i ];
x[i+ord+1]=_x[i+1];
x[i+ord+2]=_x[i+2];
x[i+ord+3]=_x[i+3];
x[i+ord+4]=_x[i+4];
x[i+ord+5]=_x[i+5];
x[i+ord+6]=_x[i+6];
x[i+ord+7]=_x[i+7];
}
for (;i<N-3;i+=4)
{
x[i+ord ]=_x[i ];
x[i+ord+1]=_x[i+1];
x[i+ord+2]=_x[i+2];
x[i+ord+3]=_x[i+3];
}
for (;i<N;i++)
x[i+ord]=_x[i];
for(i=0;i<ord;i++)
mem[i] = _x[N-i-1];
#ifdef SMALL_FOOTPRINT
for (i=0;i<N;i++)
{
opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT);
for (j=0;j<ord;j++)
{
sum = MAC16_16(sum,rnum[j],x[i+j]);
}
_y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
}
#else
noA = EXTEND32(1) << SIG_SHIFT >> 1;
vecNoA = _mm_set_epi32(noA, noA, noA, noA);
for (i=0;i<N-3;i+=4)
{
opus_val32 sums[4] = {0};
__m128i vecSum, vecX;
xcorr_kernel(rnum, x+i, sums, ord, arch);
vecSum = _mm_loadu_si128((__m128i *)sums);
vecSum = _mm_add_epi32(vecSum, vecNoA);
vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT);
vecX = OP_CVTEPI16_EPI32_M64(_x + i);
vecSum = _mm_add_epi32(vecSum, vecX);
vecSum = _mm_packs_epi32(vecSum, vecSum);
_mm_storel_epi64((__m128i *)(_y + i), vecSum);
}
for (;i<N;i++)
{
opus_val32 sum = 0;
for (j=0;j<ord;j++)
sum = MAC16_16(sum, rnum[j], x[i + j]);
_y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT)));
}
#endif
RESTORE_STACK;
}
#endif
@@ -0,0 +1,68 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CELT_LPC_SSE_H
#define CELT_LPC_SSE_H
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
#define OVERRIDE_CELT_FIR
void celt_fir_sse4_1(
const opus_val16 *x,
const opus_val16 *num,
opus_val16 *y,
int N,
int ord,
opus_val16 *mem,
int arch);
#if defined(OPUS_X86_PRESUME_SSE4_1)
#define celt_fir(x, num, y, N, ord, mem, arch) \
((void)arch, celt_fir_sse4_1(x, num, y, N, ord, mem, arch))
#else
extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
const opus_val16 *num,
opus_val16 *y,
int N,
int ord,
opus_val16 *mem,
int arch);
# define celt_fir(x, num, y, N, ord, mem, arch) \
((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, mem, arch))
#endif
#endif
#endif
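To make the dispatch concrete, here is a hypothetical sketch of how the CELT_FIR_IMPL table declared above could be populated. The real definition lives in the x86 CPU-detection sources, and the entry count and order shown here (four architecture levels with a scalar celt_fir_c fallback) are assumptions for illustration only:

/* Hypothetical RTCD table: one entry per arch level, indexed by
   (arch) & OPUS_ARCHMASK in the celt_fir macro above. */
void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
    const opus_val16 *x,
    const opus_val16 *num,
    opus_val16 *y,
    int N,
    int ord,
    opus_val16 *mem,
    int arch) = {
    celt_fir_c,      /* baseline x86 (assumed scalar fallback) */
    celt_fir_c,      /* SSE */
    celt_fir_c,      /* SSE2 */
    celt_fir_sse4_1  /* SSE4.1 */
};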
@@ -0,0 +1,185 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "macros.h"
#include "celt_lpc.h"
#include "stack_alloc.h"
#include "mathops.h"
#include "pitch.h"
#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
#include <xmmintrin.h>
#include "arch.h"
void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
{
int j;
__m128 xsum1, xsum2;
xsum1 = _mm_loadu_ps(sum);
xsum2 = _mm_setzero_ps();
for (j = 0; j < len-3; j += 4)
{
__m128 x0 = _mm_loadu_ps(x+j);
__m128 yj = _mm_loadu_ps(y+j);
__m128 y3 = _mm_loadu_ps(y+j+3);
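/* yj = y[j..j+3] and y3 = y[j+3..j+6]; the shuffles below splice them into
   the shifted windows y[j+1..j+4] (imm 0x49) and y[j+2..j+5] (imm 0x9e),
   saving two unaligned loads per iteration. */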
xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
_mm_shuffle_ps(yj,y3,0x49)));
xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
_mm_shuffle_ps(yj,y3,0x9e)));
xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
}
if (j < len)
{
xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
if (++j < len)
{
xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
if (++j < len)
{
xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
}
}
}
_mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
}
void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
int N, opus_val32 *xy1, opus_val32 *xy2)
{
int i;
__m128 xsum1, xsum2;
xsum1 = _mm_setzero_ps();
xsum2 = _mm_setzero_ps();
for (i=0;i<N-3;i+=4)
{
__m128 xi = _mm_loadu_ps(x+i);
__m128 y1i = _mm_loadu_ps(y01+i);
__m128 y2i = _mm_loadu_ps(y02+i);
xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
}
/* Horizontal sum */
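/* movehl folds the upper two lanes onto the lower two; the 0x55 shuffle
   then adds lane 1 into lane 0, leaving the full sum in the low lane. */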
xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
_mm_store_ss(xy1, xsum1);
xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
_mm_store_ss(xy2, xsum2);
for (;i<N;i++)
{
*xy1 = MAC16_16(*xy1, x[i], y01[i]);
*xy2 = MAC16_16(*xy2, x[i], y02[i]);
}
}
opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y,
int N)
{
int i;
float xy;
__m128 sum;
sum = _mm_setzero_ps();
/* FIXME: We should probably go 8-way and use 2 sums. */
for (i=0;i<N-3;i+=4)
{
__m128 xi = _mm_loadu_ps(x+i);
__m128 yi = _mm_loadu_ps(y+i);
sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));
}
/* Horizontal sum */
sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
_mm_store_ss(&xy, sum);
for (;i<N;i++)
{
xy = MAC16_16(xy, x[i], y[i]);
}
return xy;
}
void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N,
opus_val16 g10, opus_val16 g11, opus_val16 g12)
{
int i;
__m128 x0v;
__m128 g10v, g11v, g12v;
g10v = _mm_load1_ps(&g10);
g11v = _mm_load1_ps(&g11);
g12v = _mm_load1_ps(&g12);
x0v = _mm_loadu_ps(&x[-T-2]);
for (i=0;i<N-3;i+=4)
{
__m128 yi, yi2, x1v, x2v, x3v, x4v;
const opus_val32 *xp = &x[i-T-2];
yi = _mm_loadu_ps(x+i);
x4v = _mm_loadu_ps(xp+4);
#if 0
/* Slower version with all loads */
x1v = _mm_loadu_ps(xp+1);
x2v = _mm_loadu_ps(xp+2);
x3v = _mm_loadu_ps(xp+3);
#else
x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
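/* x0v carries x[i-T-2..i-T+1] over from the previous iteration, so the
   three shuffles rebuild the overlapping windows xp+1, xp+2 and xp+3
   from a single new load (x4v) per iteration. */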
#endif
yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
#else
/* Use partial sums */
yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
_mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
yi = _mm_add_ps(yi, yi2);
#endif
x0v=x4v;
_mm_storeu_ps(y+i, yi);
}
#ifdef CUSTOM_MODES
for (;i<N;i++)
{
y[i] = x[i]
+ MULT16_32_Q15(g10,x[i-T])
+ MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
+ MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
}
#endif
}
#endif
@@ -1,4 +1,5 @@
/* Copyright (c) 2013 Jean-Marc Valin and John Ridges
   Copyright (c) 2014, Cisco Systems, INC MingXiang WeiZhou MinPeng YanWang*/
/**
@file pitch_sse.h
@brief Pitch analysis
@@ -32,125 +33,160 @@
#ifndef PITCH_SSE_H
#define PITCH_SSE_H
#if defined(HAVE_CONFIG_H)
#include "config.h"
#endif
#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
void xcorr_kernel_sse4_1(
                    const opus_int16 *x,
                    const opus_int16 *y,
                    opus_val32 sum[4],
                    int len);
#endif
#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
void xcorr_kernel_sse(
                    const opus_val16 *x,
                    const opus_val16 *y,
                    opus_val32 sum[4],
                    int len);
#endif
#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT)
#define OVERRIDE_XCORR_KERNEL
#define xcorr_kernel(x, y, sum, len, arch) \
    ((void)arch, xcorr_kernel_sse4_1(x, y, sum, len))
#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT)
#define OVERRIDE_XCORR_KERNEL
#define xcorr_kernel(x, y, sum, len, arch) \
    ((void)arch, xcorr_kernel_sse(x, y, sum, len))
#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
                    const opus_val16 *x,
                    const opus_val16 *y,
                    opus_val32 sum[4],
                    int len);
#define OVERRIDE_XCORR_KERNEL
#define xcorr_kernel(x, y, sum, len, arch) \
    ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len))
#endif
#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
opus_val32 celt_inner_prod_sse4_1(
                    const opus_int16 *x,
                    const opus_int16 *y,
                    int N);
#endif
#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT)
opus_val32 celt_inner_prod_sse2(
                    const opus_int16 *x,
                    const opus_int16 *y,
                    int N);
#endif
#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)
opus_val32 celt_inner_prod_sse(
                    const opus_val16 *x,
                    const opus_val16 *y,
                    int N);
#endif
#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT)
#define OVERRIDE_CELT_INNER_PROD
#define celt_inner_prod(x, y, N, arch) \
    ((void)arch, celt_inner_prod_sse4_1(x, y, N))
#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT) && !defined(OPUS_X86_MAY_HAVE_SSE4_1)
#define OVERRIDE_CELT_INNER_PROD
#define celt_inner_prod(x, y, N, arch) \
    ((void)arch, celt_inner_prod_sse2(x, y, N))
#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT)
#define OVERRIDE_CELT_INNER_PROD
#define celt_inner_prod(x, y, N, arch) \
    ((void)arch, celt_inner_prod_sse(x, y, N))
#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
    (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
                    const opus_val16 *x,
                    const opus_val16 *y,
                    int N);
#define OVERRIDE_CELT_INNER_PROD
#define celt_inner_prod(x, y, N, arch) \
    ((*CELT_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N))
#endif
#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
#define OVERRIDE_DUAL_INNER_PROD
#define OVERRIDE_COMB_FILTER_CONST
#undef dual_inner_prod
#undef comb_filter_const
void dual_inner_prod_sse(const opus_val16 *x,
                    const opus_val16 *y01,
                    const opus_val16 *y02,
                    int N,
                    opus_val32 *xy1,
                    opus_val32 *xy2);
void comb_filter_const_sse(opus_val32 *y,
                    opus_val32 *x,
                    int T,
                    int N,
                    opus_val16 g10,
                    opus_val16 g11,
                    opus_val16 g12);
#if defined(OPUS_X86_PRESUME_SSE)
# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
    ((void)(arch),dual_inner_prod_sse(x, y01, y02, N, xy1, xy2))
# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
    ((void)(arch),comb_filter_const_sse(y, x, T, N, g10, g11, g12))
#else
extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
                    const opus_val16 *x,
                    const opus_val16 *y01,
                    const opus_val16 *y02,
                    int N,
                    opus_val32 *xy1,
                    opus_val32 *xy2);
#define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
    ((*DUAL_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2))
extern void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
                    opus_val32 *y,
                    opus_val32 *x,
                    int T,
                    int N,
                    opus_val16 g10,
                    opus_val16 g11,
                    opus_val16 g12);
#define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
    ((*COMB_FILTER_CONST_IMPL[(arch) & OPUS_ARCHMASK])(y, x, T, N, g10, g11, g12))
#define NON_STATIC_COMB_FILTER_CONST_C
#endif
#endif
#endif
@@ -0,0 +1,95 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <xmmintrin.h>
#include <emmintrin.h>
#include "macros.h"
#include "celt_lpc.h"
#include "stack_alloc.h"
#include "mathops.h"
#include "pitch.h"
#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT)
opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y,
int N)
{
opus_int i, dataSize16;
opus_int32 sum;
__m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
__m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
sum = 0;
dataSize16 = N & ~15;
acc1 = _mm_setzero_si128();
acc2 = _mm_setzero_si128();
for (i=0;i<dataSize16;i+=16)
{
inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
acc1 = _mm_add_epi32(acc1, inVec1_76543210);
acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
}
acc1 = _mm_add_epi32( acc1, acc2 );
if (N - i >= 8)
{
inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
acc1 = _mm_add_epi32(acc1, inVec1_76543210);
i += 8;
}
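/* Horizontal sum: fold the two upper 32-bit lanes onto the lower two, then
   (via the 0x0E low-word shuffle) fold lane 1 onto lane 0. */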
acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( acc1, acc1));
acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E));
sum += _mm_cvtsi128_si32(acc1);
for (;i<N;i++) {
sum = silk_SMLABB(sum, x[i], y[i]);
}
return sum;
}
#endif
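Both fixed-point inner products lean on _mm_madd_epi16 (pmaddwd), which multiplies eight pairs of signed 16-bit values and pre-sums adjacent products into four 32-bit lanes, so one instruction advances the dot product by eight samples. A scalar model of a single step (my illustration, not patch code; the lone pmaddwd overflow case, both pair products being -32768 * -32768, does not arise for these signals):

/* lanes[k] matches 32-bit lane k of _mm_madd_epi16 over x[0..7], y[0..7]. */
static void madd_epi16_model(const opus_int16 *x, const opus_int16 *y,
                             opus_int32 lanes[4])
{
   int k;
   for (k=0;k<4;k++)
      lanes[k] = (opus_int32)x[2*k]*y[2*k] + (opus_int32)x[2*k+1]*y[2*k+1];
}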
@@ -0,0 +1,195 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <xmmintrin.h>
#include <emmintrin.h>
#include "macros.h"
#include "celt_lpc.h"
#include "stack_alloc.h"
#include "mathops.h"
#include "pitch.h"
#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
#include <smmintrin.h>
#include "x86cpu.h"
opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
int N)
{
opus_int i, dataSize16;
opus_int32 sum;
__m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
__m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
__m128i inVec1_3210, inVec2_3210;
sum = 0;
dataSize16 = N & ~15;
acc1 = _mm_setzero_si128();
acc2 = _mm_setzero_si128();
for (i=0;i<dataSize16;i+=16) {
inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
acc1 = _mm_add_epi32(acc1, inVec1_76543210);
acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
}
acc1 = _mm_add_epi32(acc1, acc2);
if (N - i >= 8)
{
inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
acc1 = _mm_add_epi32(acc1, inVec1_76543210);
i += 8;
}
if (N - i >= 4)
{
inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);
inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);
acc1 = _mm_add_epi32(acc1, inVec1_3210);
i += 4;
}
acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
sum += _mm_cvtsi128_si32(acc1);
for (;i<N;i++)
{
sum = silk_SMLABB(sum, x[i], y[i]);
}
return sum;
}
void xcorr_kernel_sse4_1(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
{
int j;
__m128i vecX, vecX0, vecX1, vecX2, vecX3;
__m128i vecY0, vecY1, vecY2, vecY3;
__m128i sum0, sum1, sum2, sum3, vecSum;
__m128i initSum;
celt_assert(len >= 3);
sum0 = _mm_setzero_si128();
sum1 = _mm_setzero_si128();
sum2 = _mm_setzero_si128();
sum3 = _mm_setzero_si128();
for (j=0;j<(len-7);j+=8)
{
vecX = _mm_loadu_si128((__m128i *)(&x[j + 0]));
vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0]));
vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1]));
vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2]));
vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3]));
sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
}
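/* Reduce each of the four accumulators to a single 32-bit lane, then pack
   the four results into vecSum in output order. */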
sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));
sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));
sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));
sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));
vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
_mm_unpacklo_epi32(sum2, sum3));
for (;j<(len-3);j+=4)
{
vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
vecX0 = _mm_shuffle_epi32(vecX, 0x00);
vecX1 = _mm_shuffle_epi32(vecX, 0x55);
vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
vecX3 = _mm_shuffle_epi32(vecX, 0xff);
vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);
sum0 = _mm_mullo_epi32(vecX0, vecY0);
sum1 = _mm_mullo_epi32(vecX1, vecY1);
sum2 = _mm_mullo_epi32(vecX2, vecY2);
sum3 = _mm_mullo_epi32(vecX3, vecY3);
sum0 = _mm_add_epi32(sum0, sum1);
sum2 = _mm_add_epi32(sum2, sum3);
vecSum = _mm_add_epi32(vecSum, sum0);
vecSum = _mm_add_epi32(vecSum, sum2);
}
for (;j<len;j++)
{
vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
vecX0 = _mm_shuffle_epi32(vecX, 0x00);
vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
sum0 = _mm_mullo_epi32(vecX0, vecY0);
vecSum = _mm_add_epi32(vecSum, sum0);
}
initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
initSum = _mm_add_epi32(initSum, vecSum);
_mm_storeu_si128((__m128i *)sum, initSum);
}
#endif
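Both the 8-wide pmaddwd loop and the 4-wide pmullo tail loops above implement the same contract as the generic xcorr_kernel(): accumulate correlations at four consecutive lags into sum[0..3], with the caller guaranteeing that y[] has len+3 valid samples. A plain-C restatement for reading (not patch code):

static void xcorr_kernel_model(const opus_val16 *x, const opus_val16 *y,
                               opus_val32 sum[4], int len)
{
   int j;
   for (j=0;j<len;j++)
   {
      sum[0] = MAC16_16(sum[0], x[j], y[j]);
      sum[1] = MAC16_16(sum[1], x[j], y[j+1]);
      sum[2] = MAC16_16(sum[2], x[j], y[j+2]);
      sum[3] = MAC16_16(sum[3], x[j], y[j+3]);
   }
}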
@@ -0,0 +1,50 @@
/* Copyright (c) 2016 Jean-Marc Valin */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef VQ_SSE_H
#define VQ_SSE_H
#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)
#define OVERRIDE_OP_PVQ_SEARCH
opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch);
#if defined(OPUS_X86_PRESUME_SSE2)
#define op_pvq_search(x, iy, K, N, arch) \
(op_pvq_search_sse2(x, iy, K, N, arch))
#else
extern opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])(
celt_norm *_X, int *iy, int K, int N, int arch);
# define op_pvq_search(X, iy, K, N, arch) \
((*OP_PVQ_SEARCH_IMPL[(arch) & OPUS_ARCHMASK])(X, iy, K, N, arch))
#endif
#endif
#endif
@@ -0,0 +1,218 @@
/* Copyright (c) 2007-2008 CSIRO
Copyright (c) 2007-2009 Xiph.Org Foundation
Copyright (c) 2007-2016 Jean-Marc Valin */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <xmmintrin.h>
#include <emmintrin.h>
#include "celt_lpc.h"
#include "stack_alloc.h"
#include "mathops.h"
#include "vq.h"
#include "x86cpu.h"
#ifndef FIXED_POINT
opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
{
int i, j;
int pulsesLeft;
float xy, yy;
VARDECL(celt_norm, y);
VARDECL(celt_norm, X);
VARDECL(float, signy);
__m128 signmask;
__m128 sums;
__m128i fours;
SAVE_STACK;
(void)arch;
/* All bits set to zero, except for the sign bit. */
signmask = _mm_set_ps1(-0.f);
fours = _mm_set_epi32(4, 4, 4, 4);
ALLOC(y, N+3, celt_norm);
ALLOC(X, N+3, celt_norm);
ALLOC(signy, N+3, float);
OPUS_COPY(X, _X, N);
X[N] = X[N+1] = X[N+2] = 0;
sums = _mm_setzero_ps();
for (j=0;j<N;j+=4)
{
__m128 x4, s4;
x4 = _mm_loadu_ps(&X[j]);
s4 = _mm_cmplt_ps(x4, _mm_setzero_ps());
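/* s4 is all-ones (integer -1) in lanes where X[j] < 0; it is kept as raw
   bits in signy[] and used to restore the signs after the search. */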
/* Get rid of the sign */
x4 = _mm_andnot_ps(signmask, x4);
sums = _mm_add_ps(sums, x4);
/* Clear y and iy in case we don't do the projection. */
_mm_storeu_ps(&y[j], _mm_setzero_ps());
_mm_storeu_si128((__m128i*)&iy[j], _mm_setzero_si128());
_mm_storeu_ps(&X[j], x4);
_mm_storeu_ps(&signy[j], s4);
}
sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)));
sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(2, 3, 0, 1)));
xy = yy = 0;
pulsesLeft = K;
/* Do a pre-search by projecting on the pyramid */
if (K > (N>>1))
{
__m128i pulses_sum;
__m128 yy4, xy4;
__m128 rcp4;
opus_val32 sum = _mm_cvtss_f32(sums);
/* If X is too small, just replace it with a pulse at 0 */
/* Prevents infinities and NaNs from causing too many pulses
to be allocated. 64 is an approximation of infinity here. */
if (!(sum > EPSILON && sum < 64))
{
X[0] = QCONST16(1.f,14);
j=1;
do X[j]=0; while (++j<N);
sums = _mm_set_ps1(1.f);
}
/* Using K+e with e < 1 guarantees we cannot get more than K pulses. */
rcp4 = _mm_mul_ps(_mm_set_ps1((float)(K+.8)), _mm_rcp_ps(sums));
xy4 = yy4 = _mm_setzero_ps();
pulses_sum = _mm_setzero_si128();
for (j=0;j<N;j+=4)
{
__m128 rx4, x4, y4;
__m128i iy4;
x4 = _mm_loadu_ps(&X[j]);
rx4 = _mm_mul_ps(x4, rcp4);
iy4 = _mm_cvttps_epi32(rx4);
pulses_sum = _mm_add_epi32(pulses_sum, iy4);
_mm_storeu_si128((__m128i*)&iy[j], iy4);
y4 = _mm_cvtepi32_ps(iy4);
xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
/* double the y[] vector so we don't have to do it in the search loop. */
_mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
}
pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(1, 0, 3, 2)));
pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(2, 3, 0, 1)));
pulsesLeft -= _mm_cvtsi128_si32(pulses_sum);
xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(1, 0, 3, 2)));
xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(2, 3, 0, 1)));
xy = _mm_cvtss_f32(xy4);
yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(1, 0, 3, 2)));
yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(2, 3, 0, 1)));
yy = _mm_cvtss_f32(yy4);
}
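/* Sentinel padding: the search loop below reads 4-wide past N, so make the
   tail positions maximally unattractive and never selectable. */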
X[N] = X[N+1] = X[N+2] = -100;
y[N] = y[N+1] = y[N+2] = 100;
celt_assert2(pulsesLeft>=0, "Allocated too many pulses in the quick pass");
/* This should never happen, but just in case it does (e.g. on silence)
we fill the first bin with pulses. */
if (pulsesLeft > N+3)
{
opus_val16 tmp = (opus_val16)pulsesLeft;
yy = MAC16_16(yy, tmp, tmp);
yy = MAC16_16(yy, tmp, y[0]);
iy[0] += pulsesLeft;
pulsesLeft=0;
}
for (i=0;i<pulsesLeft;i++)
{
int best_id;
__m128 xy4, yy4;
__m128 max, max2;
__m128i count;
__m128i pos;
best_id = 0;
/* The squared magnitude term gets added anyway, so we might as well
add it outside the loop */
yy = ADD16(yy, 1);
xy4 = _mm_load1_ps(&xy);
yy4 = _mm_load1_ps(&yy);
max = _mm_setzero_ps();
pos = _mm_setzero_si128();
count = _mm_set_epi32(3, 2, 1, 0);
for (j=0;j<N;j+=4)
{
__m128 x4, y4, r4;
x4 = _mm_loadu_ps(&X[j]);
y4 = _mm_loadu_ps(&y[j]);
x4 = _mm_add_ps(x4, xy4);
y4 = _mm_add_ps(y4, yy4);
y4 = _mm_rsqrt_ps(y4);
r4 = _mm_mul_ps(x4, y4);
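/* r4 estimates the post-pulse ratio Rxy/sqrt(Ryy) for a pulse at each of
   positions j..j+3; since Rxy >= 0 here, maximizing it is equivalent to
   maximizing the usual Rxy^2/Ryy criterion. */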
/* Update the index of the max. */
pos = _mm_max_epi16(pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max))));
/* Update the max. */
max = _mm_max_ps(max, r4);
/* Update the indices (+4) */
count = _mm_add_epi32(count, fours);
}
/* Horizontal max */
max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2)));
max2 = _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1)));
/* Now that max2 contains the max at all positions, look at which value(s) of the
partial max is equal to the global max. */
pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2)));
pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos));
pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2)));
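/* The 16-bit max is safe on these 32-bit indices: positions are nonnegative
   and far below 2^15, so the high halves are zero and the low halves
   compare correctly. */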
best_id = _mm_cvtsi128_si32(pos);
/* Updating the sums of the new pulse(s) */
xy = ADD32(xy, EXTEND32(X[best_id]));
/* We're multiplying y[j] by two so we don't have to do it here */
yy = ADD16(yy, y[best_id]);
/* Only now that we've made the final choice, update y/iy */
/* Multiplying y[j] by 2 so we don't have to do it everywhere else */
y[best_id] += 2;
iy[best_id]++;
}
/* Put the original sign back */
for (j=0;j<N;j+=4)
{
__m128i y4;
__m128i s4;
y4 = _mm_loadu_si128((__m128i*)&iy[j]);
s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j]));
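/* (iy + s) ^ s is a branchless conditional negation: s == 0 leaves iy
   unchanged, s == -1 yields ~(iy-1) == -iy, undoing the earlier
   sign stripping. */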
y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
_mm_storeu_si128((__m128i*)&iy[j], y4);
}
RESTORE_STACK;
return yy;
}
#endif