View
@@ -108,6 +108,64 @@ include $(BUILD_STATIC_LIBRARY)
include $(CLEAR_VARS)
LOCAL_MODULE := WebRtcAec
LOCAL_SRC_FILES := ./libtgvoip/external/libWebRtcAec_android_$(TARGET_ARCH_ABI).a
include $(PREBUILT_STATIC_LIBRARY)
include $(CLEAR_VARS)
LOCAL_MODULE := voip
LOCAL_CPPFLAGS := -Wall -std=c++11 -DANDROID -finline-functions -ffast-math -Os -fno-strict-aliasing -O3
LOCAL_CFLAGS := -O3 -DUSE_KISS_FFT -fexceptions
ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
# LOCAL_CPPFLAGS += -mfloat-abi=softfp -mfpu=neon
# LOCAL_CFLAGS += -mfloat-abi=softfp -mfpu=neon -DFLOATING_POINT
# LOCAL_ARM_NEON := true
else
LOCAL_CFLAGS += -DFIXED_POINT
ifeq ($(TARGET_ARCH_ABI),armeabi)
# LOCAL_CPPFLAGS += -mfloat-abi=softfp -mfpu=neon
# LOCAL_CFLAGS += -mfloat-abi=softfp -mfpu=neon
else
ifeq ($(TARGET_ARCH_ABI),x86)
endif
endif
endif
MY_DIR := libtgvoip
LOCAL_C_INCLUDES := jni/opus/include jni/boringssl/include/
LOCAL_SRC_FILES := \
./libtgvoip/logging.cpp \
./libtgvoip/VoIPController.cpp \
./libtgvoip/BufferInputStream.cpp \
./libtgvoip/BufferOutputStream.cpp \
./libtgvoip/BlockingQueue.cpp \
./libtgvoip/audio/AudioInput.cpp \
./libtgvoip/os/android/AudioInputOpenSLES.cpp \
./libtgvoip/MediaStreamItf.cpp \
./libtgvoip/audio/AudioOutput.cpp \
./libtgvoip/OpusEncoder.cpp \
./libtgvoip/os/android/AudioOutputOpenSLES.cpp \
./libtgvoip/JitterBuffer.cpp \
./libtgvoip/OpusDecoder.cpp \
./libtgvoip/BufferPool.cpp \
./libtgvoip/os/android/OpenSLEngineWrapper.cpp \
./libtgvoip/os/android/AudioInputAndroid.cpp \
./libtgvoip/os/android/AudioOutputAndroid.cpp \
./libtgvoip/EchoCanceller.cpp \
./libtgvoip/CongestionControl.cpp \
./libtgvoip/VoIPServerConfig.cpp
include $(BUILD_STATIC_LIBRARY)
include $(CLEAR_VARS)
LOCAL_CPPFLAGS := -Wall -std=c++11 -DANDROID -frtti -DHAVE_PTHREAD -finline-functions -ffast-math -O0
LOCAL_C_INCLUDES += ./jni/boringssl/include/
LOCAL_ARM_MODE := arm
@@ -237,13 +295,13 @@ include $(BUILD_STATIC_LIBRARY)
include $(CLEAR_VARS)
LOCAL_PRELINK_MODULE := false
LOCAL_MODULE := tmessages.24
LOCAL_MODULE := tmessages.26
LOCAL_CFLAGS := -w -std=c11 -Os -DNULL=0 -DSOCKLEN_T=socklen_t -DLOCALE_NOT_USED -D_LARGEFILE_SOURCE=1 -D_FILE_OFFSET_BITS=64
LOCAL_CFLAGS += -Drestrict='' -D__EMX__ -DOPUS_BUILD -DFIXED_POINT -DUSE_ALLOCA -DHAVE_LRINT -DHAVE_LRINTF -fno-math-errno
LOCAL_CFLAGS += -DANDROID_NDK -DDISABLE_IMPORTGL -fno-strict-aliasing -fprefetch-loop-arrays -DAVOID_TABLES -DANDROID_TILE_BASED_DECODE -DANDROID_ARMV6_IDCT -ffast-math -D__STDC_CONSTANT_MACROS
LOCAL_CPPFLAGS := -DBSD=1 -ffast-math -Os -funroll-loops -std=c++11
LOCAL_LDLIBS := -ljnigraphics -llog -lz -latomic
LOCAL_STATIC_LIBRARIES := webp sqlite tgnet breakpad avformat avcodec avutil
LOCAL_LDLIBS := -ljnigraphics -llog -lz -latomic -lOpenSLES
LOCAL_STATIC_LIBRARIES := webp sqlite tgnet breakpad avformat avcodec avutil voip WebRtcAec
LOCAL_SRC_FILES := \
./opus/src/opus.c \
@@ -261,6 +319,14 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
LOCAL_ARM_MODE := arm
LOCAL_CPPFLAGS += -DLIBYUV_NEON
LOCAL_CFLAGS += -DLIBYUV_NEON
LOCAL_CFLAGS += -DOPUS_HAVE_RTCD -DOPUS_ARM_ASM
LOCAL_SRC_FILES += \
# ./opus/celt/arm/celt_neon_intr.c \
# ./opus/silk/arm/NSQ_neon.c \
./opus/silk/arm/arm_silk_map.c
# LOCAL_SRC_FILES += ./opus/celt/arm/celt_pitch_xcorr_arm-gnu.S
else
ifeq ($(TARGET_ARCH_ABI),armeabi)
LOCAL_ARM_MODE := arm
@@ -270,8 +336,25 @@ else
LOCAL_CFLAGS += -Dx86fix
LOCAL_CPPFLAGS += -Dx86fix
LOCAL_ARM_MODE := arm
LOCAL_SRC_FILE += \
./libyuv/source/row_x86.asm
# LOCAL_SRC_FILES += \
# ./libyuv/source/row_x86.asm
# LOCAL_SRC_FILES += \
# ./opus/celt/x86/celt_lpc_sse.c \
# ./opus/celt/x86/pitch_sse.c \
# ./opus/celt/x86/pitch_sse2.c \
# ./opus/celt/x86/pitch_sse4_1.c \
# ./opus/celt/x86/vq_sse2.c \
# ./opus/celt/x86/x86_celt_map.c \
# ./opus/celt/x86/x86cpu.c \
# ./opus/silk/fixed/x86/burg_modified_FIX_sse.c \
# ./opus/silk/fixed/x86/vector_ops_FIX_sse.c \
# ./opus/silk/x86/NSQ_del_dec_sse.c \
# ./opus/silk/x86/NSQ_sse.c \
# ./opus/silk/x86/VAD_sse.c \
# ./opus/silk/x86/VQ_WMat_sse.c \
# ./opus/silk/x86/x86_silk_map.c
endif
endif
endif
@@ -352,7 +435,8 @@ LOCAL_SRC_FILES += \
./opus/silk/stereo_decode_pred.c \
./opus/silk/stereo_encode_pred.c \
./opus/silk/stereo_find_predictor.c \
./opus/silk/stereo_quant_pred.c
./opus/silk/stereo_quant_pred.c \
./opus/silk/LPC_fit.c
LOCAL_SRC_FILES += \
./opus/silk/fixed/LTP_analysis_filter_FIX.c \
@@ -364,12 +448,10 @@ LOCAL_SRC_FILES += \
./opus/silk/fixed/find_pitch_lags_FIX.c \
./opus/silk/fixed/find_pred_coefs_FIX.c \
./opus/silk/fixed/noise_shape_analysis_FIX.c \
./opus/silk/fixed/prefilter_FIX.c \
./opus/silk/fixed/process_gains_FIX.c \
./opus/silk/fixed/regularize_correlations_FIX.c \
./opus/silk/fixed/residual_energy16_FIX.c \
./opus/silk/fixed/residual_energy_FIX.c \
./opus/silk/fixed/solve_LS_FIX.c \
./opus/silk/fixed/warped_autocorrelation_FIX.c \
./opus/silk/fixed/apply_sine_window_FIX.c \
./opus/silk/fixed/autocorr_FIX.c \
@@ -483,7 +565,8 @@ LOCAL_SRC_FILES += \
./gifvideo.cpp \
./SqliteWrapper.cpp \
./TgNetWrapper.cpp \
./NativeLoader.cpp
./NativeLoader.cpp \
./libtgvoip/client/android/tg_voip_jni.cpp
include $(BUILD_SHARED_LIBRARY)
View
@@ -1,4 +1,4 @@
APP_PLATFORM := android-9
APP_PLATFORM := android-14
APP_ABI := armeabi armeabi-v7a
NDK_TOOLCHAIN_VERSION := 4.9
APP_STL := gnustl_static
View
@@ -26,6 +26,8 @@ jmethodID jclass_ConnectionsManager_onLogout;
jmethodID jclass_ConnectionsManager_onConnectionStateChanged;
jmethodID jclass_ConnectionsManager_onInternalPushReceived;
jmethodID jclass_ConnectionsManager_onUpdateConfig;
jmethodID jclass_ConnectionsManager_onBytesSent;
jmethodID jclass_ConnectionsManager_onBytesReceived;
jint createLoadOpetation(JNIEnv *env, jclass c, jint dc_id, jlong id, jlong volume_id, jlong access_hash, jint local_id, jbyteArray encKey, jbyteArray encIv, jstring extension, jint version, jint size, jstring dest, jstring temp, jobject delegate) {
if (encKey != nullptr && encIv == nullptr || encKey == nullptr && encIv != nullptr || extension == nullptr || dest == nullptr || temp == nullptr) {
@@ -168,7 +170,7 @@ void sendRequest(JNIEnv *env, jclass c, jint object, jobject onComplete, jobject
if (onQuickAck != nullptr) {
onQuickAck = env->NewGlobalRef(onQuickAck);
}
ConnectionsManager::getInstance().sendRequest(request, ([onComplete](TLObject *response, TL_error *error) {
ConnectionsManager::getInstance().sendRequest(request, ([onComplete](TLObject *response, TL_error *error, int32_t networkType) {
TL_api_response *resp = (TL_api_response *) response;
jint ptr = 0;
jint errorCode = 0;
@@ -180,7 +182,7 @@ void sendRequest(JNIEnv *env, jclass c, jint object, jobject onComplete, jobject
errorText = jniEnv->NewStringUTF(error->text.c_str());
}
if (onComplete != nullptr) {
jniEnv->CallVoidMethod(onComplete, jclass_RequestDelegateInternal_run, ptr, errorCode, errorText);
jniEnv->CallVoidMethod(onComplete, jclass_RequestDelegateInternal_run, ptr, errorCode, errorText, networkType);
}
if (errorText != nullptr) {
jniEnv->DeleteLocalRef(errorText);
@@ -246,8 +248,8 @@ void setUseIpv6(JNIEnv *env, jclass c, bool value) {
ConnectionsManager::getInstance().setUseIpv6(value);
}
void setNetworkAvailable(JNIEnv *env, jclass c, jboolean value) {
ConnectionsManager::getInstance().setNetworkAvailable(value);
void setNetworkAvailable(JNIEnv *env, jclass c, jboolean value, jint networkType) {
ConnectionsManager::getInstance().setNetworkAvailable(value, networkType);
}
void setPushConnectionEnabled(JNIEnv *env, jclass c, jboolean value) {
@@ -289,17 +291,25 @@ class Delegate : public ConnectiosManagerDelegate {
void onInternalPushReceived() {
jniEnv->CallStaticVoidMethod(jclass_ConnectionsManager, jclass_ConnectionsManager_onInternalPushReceived);
}
void onBytesReceived(int32_t amount, int32_t networkType) {
jniEnv->CallStaticVoidMethod(jclass_ConnectionsManager, jclass_ConnectionsManager_onBytesReceived, amount, networkType);
}
void onBytesSent(int32_t amount, int32_t networkType) {
jniEnv->CallStaticVoidMethod(jclass_ConnectionsManager, jclass_ConnectionsManager_onBytesSent, amount, networkType);
}
};
void init(JNIEnv *env, jclass c, jint version, jint layer, jint apiId, jstring deviceModel, jstring systemVersion, jstring appVersion, jstring langCode, jstring configPath, jstring logPath, jint userId, jboolean enablePushConnection) {
void init(JNIEnv *env, jclass c, jint version, jint layer, jint apiId, jstring deviceModel, jstring systemVersion, jstring appVersion, jstring langCode, jstring configPath, jstring logPath, jint userId, jboolean enablePushConnection, jboolean hasNetwork, jint networkType) {
const char *deviceModelStr = env->GetStringUTFChars(deviceModel, 0);
const char *systemVersionStr = env->GetStringUTFChars(systemVersion, 0);
const char *appVersionStr = env->GetStringUTFChars(appVersion, 0);
const char *langCodeStr = env->GetStringUTFChars(langCode, 0);
const char *configPathStr = env->GetStringUTFChars(configPath, 0);
const char *logPathStr = env->GetStringUTFChars(logPath, 0);
ConnectionsManager::getInstance().init(version, layer, apiId, std::string(deviceModelStr), std::string(systemVersionStr), std::string(appVersionStr), std::string(langCodeStr), std::string(configPathStr), std::string(logPathStr), userId, true, enablePushConnection);
ConnectionsManager::getInstance().init(version, layer, apiId, std::string(deviceModelStr), std::string(systemVersionStr), std::string(appVersionStr), std::string(langCodeStr), std::string(configPathStr), std::string(logPathStr), userId, true, enablePushConnection, hasNetwork, networkType);
if (deviceModelStr != 0) {
env->ReleaseStringUTFChars(deviceModel, deviceModelStr);
@@ -339,13 +349,13 @@ static JNINativeMethod ConnectionsManagerMethods[] = {
{"native_applyDatacenterAddress", "(ILjava/lang/String;I)V", (void *) applyDatacenterAddress},
{"native_getConnectionState", "()I", (void *) getConnectionState},
{"native_setUserId", "(I)V", (void *) setUserId},
{"native_init", "(IIILjava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;IZ)V", (void *) init},
{"native_init", "(IIILjava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;IZZI)V", (void *) init},
{"native_switchBackend", "()V", (void *) switchBackend},
{"native_pauseNetwork", "()V", (void *) pauseNetwork},
{"native_resumeNetwork", "(Z)V", (void *) resumeNetwork},
{"native_updateDcSettings", "()V", (void *) updateDcSettings},
{"native_setUseIpv6", "(Z)V", (void *) setUseIpv6},
{"native_setNetworkAvailable", "(Z)V", (void *) setNetworkAvailable},
{"native_setNetworkAvailable", "(ZI)V", (void *) setNetworkAvailable},
{"native_setPushConnectionEnabled", "(Z)V", (void *) setPushConnectionEnabled},
{"native_setJava", "(Z)V", (void *) setJava}
};
@@ -381,7 +391,7 @@ extern "C" int registerNativeTgNetFunctions(JavaVM *vm, JNIEnv *env) {
if (jclass_RequestDelegateInternal == 0) {
return JNI_FALSE;
}
jclass_RequestDelegateInternal_run = env->GetMethodID(jclass_RequestDelegateInternal, "run", "(IILjava/lang/String;)V");
jclass_RequestDelegateInternal_run = env->GetMethodID(jclass_RequestDelegateInternal, "run", "(IILjava/lang/String;I)V");
if (jclass_RequestDelegateInternal_run == 0) {
return JNI_FALSE;
}
@@ -447,6 +457,14 @@ extern "C" int registerNativeTgNetFunctions(JavaVM *vm, JNIEnv *env) {
if (jclass_ConnectionsManager_onUpdateConfig == 0) {
return JNI_FALSE;
}
jclass_ConnectionsManager_onBytesSent = env->GetStaticMethodID(jclass_ConnectionsManager, "onBytesSent", "(II)V");
if (jclass_ConnectionsManager_onBytesSent == 0) {
return JNI_FALSE;
}
jclass_ConnectionsManager_onBytesReceived = env->GetStaticMethodID(jclass_ConnectionsManager, "onBytesReceived", "(II)V");
if (jclass_ConnectionsManager_onBytesReceived == 0) {
return JNI_FALSE;
}
ConnectionsManager::getInstance().setDelegate(new Delegate());
return JNI_TRUE;
View
@@ -127,7 +127,7 @@ void ec_GFp_nistp_points_make_affine_internal(
*
* A left-shift followed by subtraction of the original value yields a new
* representation of the same value, using signed bits s_i = b_(i+1) - b_i.
* This representation from Booth's paper has since appeared in the
* This representation from Booth's paper has since onAppeared in the
* literature under a variety of different names including "reversed binary
* form", "alternating greedy expansion", "mutual opposite form", and
* "sign-alternating {+-1}-representation".
View
@@ -273,6 +273,5 @@ jint Java_org_telegram_ui_Components_AnimatedFileDrawable_getVideoFrame(JNIEnv *
return 1;
}
}
return 0;
}
}
Submodule libtgvoip added at eb813e
View
@@ -58,16 +58,12 @@
# define S_MUL(a,b) MULT16_32_Q15(b, a)
# define C_MUL(m,a,b) \
do{ (m).r = SUB32(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \
(m).i = ADD32(S_MUL((a).r,(b).i) , S_MUL((a).i,(b).r)); }while(0)
do{ (m).r = SUB32_ovflw(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \
(m).i = ADD32_ovflw(S_MUL((a).r,(b).i) , S_MUL((a).i,(b).r)); }while(0)
# define C_MULC(m,a,b) \
do{ (m).r = ADD32(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \
(m).i = SUB32(S_MUL((a).i,(b).r) , S_MUL((a).r,(b).i)); }while(0)
# define C_MUL4(m,a,b) \
do{ (m).r = SHR32(SUB32(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)),2); \
(m).i = SHR32(ADD32(S_MUL((a).r,(b).i) , S_MUL((a).i,(b).r)),2); }while(0)
do{ (m).r = ADD32_ovflw(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \
(m).i = SUB32_ovflw(S_MUL((a).i,(b).r) , S_MUL((a).r,(b).i)); }while(0)
# define C_MULBYSCALAR( c, s ) \
do{ (c).r = S_MUL( (c).r , s ) ;\
@@ -81,17 +77,17 @@
DIVSCALAR( (c).i , div); }while (0)
#define C_ADD( res, a,b)\
do {(res).r=ADD32((a).r,(b).r); (res).i=ADD32((a).i,(b).i); \
do {(res).r=ADD32_ovflw((a).r,(b).r); (res).i=ADD32_ovflw((a).i,(b).i); \
}while(0)
#define C_SUB( res, a,b)\
do {(res).r=SUB32((a).r,(b).r); (res).i=SUB32((a).i,(b).i); \
do {(res).r=SUB32_ovflw((a).r,(b).r); (res).i=SUB32_ovflw((a).i,(b).i); \
}while(0)
#define C_ADDTO( res , a)\
do {(res).r = ADD32((res).r, (a).r); (res).i = ADD32((res).i,(a).i);\
do {(res).r = ADD32_ovflw((res).r, (a).r); (res).i = ADD32_ovflw((res).i,(a).i);\
}while(0)
#define C_SUBFROM( res , a)\
do {(res).r = ADD32((res).r,(a).r); (res).i = SUB32((res).i,(a).i); \
do {(res).r = ADD32_ovflw((res).r,(a).r); (res).i = SUB32_ovflw((res).i,(a).i); \
}while(0)
#if defined(OPUS_ARM_INLINE_ASM)
@@ -101,6 +97,9 @@
#if defined(OPUS_ARM_INLINE_EDSP)
#include "arm/kiss_fft_armv5e.h"
#endif
#if defined(MIPSr1_ASM)
#include "mips/kiss_fft_mipsr1.h"
#endif
#else /* not FIXED_POINT*/
View
@@ -46,6 +46,14 @@
# endif
# endif
#if OPUS_GNUC_PREREQ(3, 0)
#define opus_likely(x) (__builtin_expect(!!(x), 1))
#define opus_unlikely(x) (__builtin_expect(!!(x), 0))
#else
#define opus_likely(x) (!!(x))
#define opus_unlikely(x) (!!(x))
#endif
#define CELT_SIG_SCALE 32768.f
#define celt_fatal(str) _celt_fatal(str, __FILE__, __LINE__);
@@ -69,18 +77,24 @@ static OPUS_INLINE void _celt_fatal(const char *str, const char *file, int line)
#define IMUL32(a,b) ((a)*(b))
#define ABS(x) ((x) < 0 ? (-(x)) : (x)) /**< Absolute integer value. */
#define ABS16(x) ((x) < 0 ? (-(x)) : (x)) /**< Absolute 16-bit value. */
#define MIN16(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum 16-bit value. */
#define MAX16(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum 16-bit value. */
#define ABS32(x) ((x) < 0 ? (-(x)) : (x)) /**< Absolute 32-bit value. */
#define MIN32(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum 32-bit value. */
#define MAX32(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum 32-bit value. */
#define IMIN(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum int value. */
#define IMAX(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum int value. */
#define UADD32(a,b) ((a)+(b))
#define USUB32(a,b) ((a)-(b))
/* Set this if opus_int64 is a native type of the CPU. */
/* Assume that all LP64 architectures have fast 64-bit types; also x86_64
(which can be ILP32 for x32) and Win64 (which is LLP64). */
#if defined(__x86_64__) || defined(__LP64__) || defined(_WIN64)
#define OPUS_FAST_INT64 1
#else
#define OPUS_FAST_INT64 0
#endif
#define PRINT_MIPS(file)
#ifdef FIXED_POINT
@@ -95,6 +109,9 @@ typedef opus_val32 celt_ener;
#define Q15ONE 32767
#define SIG_SHIFT 12
/* Safe saturation value for 32-bit signals. Should be less than
2^31*(1-0.85) to avoid blowing up on DC at deemphasis.*/
#define SIG_SAT (300000000)
#define NORM_SCALING 16384
@@ -108,13 +125,22 @@ typedef opus_val32 celt_ener;
#define SCALEIN(a) (a)
#define SCALEOUT(a) (a)
#define ABS16(x) ((x) < 0 ? (-(x)) : (x))
#define ABS32(x) ((x) < 0 ? (-(x)) : (x))
static OPUS_INLINE opus_int16 SAT16(opus_int32 x) {
return x > 32767 ? 32767 : x < -32768 ? -32768 : (opus_int16)x;
}
#ifdef FIXED_DEBUG
#include "fixed_debug.h"
#else
#include "fixed_generic.h"
#ifdef OPUS_ARM_INLINE_EDSP
#ifdef OPUS_ARM_PRESUME_AARCH64_NEON_INTR
#include "arm/fixed_arm64.h"
#elif OPUS_ARM_INLINE_EDSP
#include "arm/fixed_armv5e.h"
#elif defined (OPUS_ARM_INLINE_ASM)
#include "arm/fixed_armv4.h"
@@ -137,6 +163,22 @@ typedef float celt_sig;
typedef float celt_norm;
typedef float celt_ener;
#ifdef FLOAT_APPROX
/* This code should reliably detect NaN/inf even when -ffast-math is used.
Assumes IEEE 754 format. */
static OPUS_INLINE int celt_isnan(float x)
{
union {float f; opus_uint32 i;} in;
in.f = x;
return ((in.i>>23)&0xFF)==0xFF && (in.i&0x007FFFFF)!=0;
}
#else
#ifdef __FAST_MATH__
#error Cannot build libopus with -ffast-math unless FLOAT_APPROX is defined. This could result in crashes on extreme (e.g. NaN) input
#endif
#define celt_isnan(x) ((x)!=(x))
#endif
#define Q15ONE 1.0f
#define NORM_SCALING 1.f
@@ -146,11 +188,16 @@ typedef float celt_ener;
#define VERY_LARGE16 1e15f
#define Q15_ONE ((opus_val16)1.f)
/* This appears to be the same speed as C99's fabsf() but it's more portable. */
#define ABS16(x) ((float)fabs(x))
#define ABS32(x) ((float)fabs(x))
#define QCONST16(x,bits) (x)
#define QCONST32(x,bits) (x)
#define NEG16(x) (-(x))
#define NEG32(x) (-(x))
#define NEG32_ovflw(x) (-(x))
#define EXTRACT16(x) (x)
#define EXTEND32(x) (x)
#define SHR16(a,shift) (a)
@@ -167,13 +214,16 @@ typedef float celt_ener;
#define SATURATE16(x) (x)
#define ROUND16(a,shift) (a)
#define SROUND16(a,shift) (a)
#define HALF16(x) (.5f*(x))
#define HALF32(x) (.5f*(x))
#define ADD16(a,b) ((a)+(b))
#define SUB16(a,b) ((a)-(b))
#define ADD32(a,b) ((a)+(b))
#define SUB32(a,b) ((a)-(b))
#define ADD32_ovflw(a,b) ((a)+(b))
#define SUB32_ovflw(a,b) ((a)-(b))
#define MULT16_16_16(a,b) ((a)*(b))
#define MULT16_16(a,b) ((opus_val32)(a)*(opus_val32)(b))
#define MAC16_16(c,a,b) ((c)+(opus_val32)(a)*(opus_val32)(b))
@@ -184,6 +234,7 @@ typedef float celt_ener;
#define MULT32_32_Q31(a,b) ((a)*(b))
#define MAC16_32_Q15(c,a,b) ((c)+(a)*(b))
#define MAC16_32_Q16(c,a,b) ((c)+(a)*(b))
#define MULT16_16_Q11_32(a,b) ((a)*(b))
#define MULT16_16_Q11(a,b) ((a)*(b))
@@ -201,6 +252,8 @@ typedef float celt_ener;
#define SCALEIN(a) ((a)*CELT_SIG_SCALE)
#define SCALEOUT(a) ((a)*(1/CELT_SIG_SCALE))
#define SIG2WORD16(x) (x)
#endif /* !FIXED_POINT */
#ifndef GLOBAL_STACK_SIZE
View
@@ -1,7 +1,33 @@
#!/usr/bin/perl
# Copyright (C) 2002-2013 Xiph.org Foundation
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
my $bigend; # little/big endian
my $nxstack;
my $apple = 0;
my $symprefix = "";
$nxstack = 0;
@@ -10,11 +36,16 @@
while ($ARGV[0] =~ /^-/) {
$_ = shift;
last if /^--/;
if (/^-n/) {
last if /^--$/;
if (/^-n$/) {
$nflag++;
next;
}
if (/^--apple$/) {
$apple = 1;
$symprefix = "_";
next;
}
die "I don't recognize this switch: $_\\n";
}
$printit++ unless $nflag;
@@ -25,6 +56,8 @@
$thumb = 0; # ARM mode by default, not Thumb.
@proc_stack = ();
printf (" .syntax unified\n");
LINE:
while (<>) {
@@ -53,7 +86,7 @@
s/\bINCLUDE[ \t]*([^ \t\n]+)/.include \"$1\"/;
s/\bGET[ \t]*([^ \t\n]+)/.include \"${ my $x=$1; $x =~ s|\.s|-gnu.S|; \$x }\"/;
s/\bIMPORT\b/.extern/;
s/\bEXPORT\b/.global/;
s/\bEXPORT\b\s*/.global $symprefix/;
s/^(\s+)\[/$1IF/;
s/^(\s+)\|/$1ELSE/;
s/^(\s+)\]/$1ENDIF/;
@@ -109,7 +142,7 @@
# won't match the original source file (we could use the .line
# directive, which is documented to be obsolete, but then gdb will
# show the wrong line in the translated source file).
s/$/; .arch armv7-a\n .fpu neon\n .object_arch armv4t/;
s/$/; .arch armv7-a\n .fpu neon\n .object_arch armv4t/ unless ($apple);
}
}
@@ -131,9 +164,13 @@
$prefix = "";
if ($proc)
{
$prefix = $prefix.sprintf("\t.type\t%s, %%function; ",$proc);
$prefix = $prefix.sprintf("\t.type\t%s, %%function; ",$proc) unless ($apple);
# Make sure we $prefix isn't empty here (for the $apple case).
# We handle mangling the label here, make sure it doesn't match
# the label handling below (if $prefix would be empty).
$prefix = "; ";
push(@proc_stack, $proc);
s/^[A-Za-z_\.]\w+/$&:/;
s/^[A-Za-z_\.]\w+/$symprefix$&:/;
}
$prefix = $prefix."\t.thumb_func; " if ($thumb);
s/\bPROC\b/@ $&/;
@@ -146,7 +183,7 @@
my $proc;
s/\bENDP\b/@ $&/;
$proc = pop(@proc_stack);
$_ = "\t.size $proc, .-$proc".$_ if ($proc);
$_ = "\t.size $proc, .-$proc".$_ if ($proc && !$apple);
}
s/\bSUBT\b/@ $&/;
s/\bDATA\b/@ $&/; # DATA directive is deprecated -- Asm guide, p.7-25
@@ -311,6 +348,6 @@
}
#If we had a code section, mark that this object doesn't need an executable
# stack.
if ($nxstack) {
if ($nxstack && !$apple) {
printf (" .section\t.note.GNU-stack,\"\",\%\%progbits\n");
}
View
@@ -30,20 +30,114 @@
#endif
#include "pitch.h"
#include "kiss_fft.h"
#include "mdct.h"
#if defined(OPUS_HAVE_RTCD)
# if defined(FIXED_POINT)
# if ((defined(OPUS_ARM_MAY_HAVE_NEON) && !defined(OPUS_ARM_PRESUME_NEON)) || \
(defined(OPUS_ARM_MAY_HAVE_MEDIA) && !defined(OPUS_ARM_PRESUME_MEDIA)) || \
(defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP)))
opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
const opus_val16 *, opus_val32 *, int , int) = {
celt_pitch_xcorr_c, /* ARMv4 */
MAY_HAVE_EDSP(celt_pitch_xcorr), /* EDSP */
MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
MAY_HAVE_NEON(celt_pitch_xcorr) /* NEON */
};
# else
# error "Floating-point implementation is not supported by ARM asm yet." \
"Reconfigure with --disable-rtcd or send patches."
# endif
# endif
# else /* !FIXED_POINT */
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
const opus_val16 *, opus_val32 *, int, int) = {
celt_pitch_xcorr_c, /* ARMv4 */
celt_pitch_xcorr_c, /* EDSP */
celt_pitch_xcorr_c, /* Media */
celt_pitch_xcorr_float_neon /* Neon */
};
# endif
# endif /* FIXED_POINT */
#if defined(FIXED_POINT) && defined(OPUS_HAVE_RTCD) && \
defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
const opus_val16 *y,
opus_val32 sum[4],
int len
) = {
xcorr_kernel_c, /* ARMv4 */
xcorr_kernel_c, /* EDSP */
xcorr_kernel_c, /* Media */
xcorr_kernel_neon_fixed, /* Neon */
};
#endif
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
# if defined(HAVE_ARM_NE10)
# if defined(CUSTOM_MODES)
int (*const OPUS_FFT_ALLOC_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
opus_fft_alloc_arch_c, /* ARMv4 */
opus_fft_alloc_arch_c, /* EDSP */
opus_fft_alloc_arch_c, /* Media */
opus_fft_alloc_arm_neon /* Neon with NE10 library support */
};
void (*const OPUS_FFT_FREE_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
opus_fft_free_arch_c, /* ARMv4 */
opus_fft_free_arch_c, /* EDSP */
opus_fft_free_arch_c, /* Media */
opus_fft_free_arm_neon /* Neon with NE10 */
};
# endif /* CUSTOM_MODES */
void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
const kiss_fft_cpx *fin,
kiss_fft_cpx *fout) = {
opus_fft_c, /* ARMv4 */
opus_fft_c, /* EDSP */
opus_fft_c, /* Media */
opus_fft_neon /* Neon with NE10 */
};
void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
const kiss_fft_cpx *fin,
kiss_fft_cpx *fout) = {
opus_ifft_c, /* ARMv4 */
opus_ifft_c, /* EDSP */
opus_ifft_c, /* Media */
opus_ifft_neon /* Neon with NE10 */
};
void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
kiss_fft_scalar *in,
kiss_fft_scalar * OPUS_RESTRICT out,
const opus_val16 *window,
int overlap, int shift,
int stride, int arch) = {
clt_mdct_forward_c, /* ARMv4 */
clt_mdct_forward_c, /* EDSP */
clt_mdct_forward_c, /* Media */
clt_mdct_forward_neon /* Neon with NE10 */
};
void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
kiss_fft_scalar *in,
kiss_fft_scalar * OPUS_RESTRICT out,
const opus_val16 *window,
int overlap, int shift,
int stride, int arch) = {
clt_mdct_backward_c, /* ARMv4 */
clt_mdct_backward_c, /* EDSP */
clt_mdct_backward_c, /* Media */
clt_mdct_backward_neon /* Neon with NE10 */
};
# endif /* HAVE_ARM_NE10 */
# endif /* OPUS_ARM_MAY_HAVE_NEON_INTR */
#endif /* OPUS_HAVE_RTCD */
View
@@ -37,11 +37,12 @@
#include "cpu_support.h"
#include "os_support.h"
#include "opus_types.h"
#include "arch.h"
#define OPUS_CPU_ARM_V4 (1)
#define OPUS_CPU_ARM_EDSP (1<<1)
#define OPUS_CPU_ARM_MEDIA (1<<2)
#define OPUS_CPU_ARM_NEON (1<<3)
#define OPUS_CPU_ARM_V4_FLAG (1<<OPUS_ARCH_ARM_V4)
#define OPUS_CPU_ARM_EDSP_FLAG (1<<OPUS_ARCH_ARM_EDSP)
#define OPUS_CPU_ARM_MEDIA_FLAG (1<<OPUS_ARCH_ARM_MEDIA)
#define OPUS_CPU_ARM_NEON_FLAG (1<<OPUS_ARCH_ARM_NEON)
#if defined(_MSC_VER)
/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
@@ -55,29 +56,31 @@ static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){
/* MSVC has no OPUS_INLINE __asm support for ARM, but it does let you __emit
* instructions via their assembled hex code.
* All of these instructions should be essentially nops. */
# if defined(OPUS_ARM_MAY_HAVE_EDSP)
# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_MEDIA) \
|| defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
__try{
/*PLD [r13]*/
__emit(0xF5DDF000);
flags|=OPUS_CPU_ARM_EDSP;
flags|=OPUS_CPU_ARM_EDSP_FLAG;
}
__except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
/*Ignore exception.*/
}
# if defined(OPUS_ARM_MAY_HAVE_MEDIA)
# if defined(OPUS_ARM_MAY_HAVE_MEDIA) \
|| defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
__try{
/*SHADD8 r3,r3,r3*/
__emit(0xE6333F93);
flags|=OPUS_CPU_ARM_MEDIA;
flags|=OPUS_CPU_ARM_MEDIA_FLAG;
}
__except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
/*Ignore exception.*/
}
# if defined(OPUS_ARM_MAY_HAVE_NEON)
# if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
__try{
/*VORR q0,q0,q0*/
__emit(0xF2200150);
flags|=OPUS_CPU_ARM_NEON;
flags|=OPUS_CPU_ARM_NEON_FLAG;
}
__except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
/*Ignore exception.*/
@@ -107,34 +110,34 @@ opus_uint32 opus_cpu_capabilities(void)
while(fgets(buf, 512, cpuinfo) != NULL)
{
# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_NEON)
# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_MEDIA) \
|| defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
/* Search for edsp and neon flag */
if(memcmp(buf, "Features", 8) == 0)
{
char *p;
# if defined(OPUS_ARM_MAY_HAVE_EDSP)
p = strstr(buf, " edsp");
if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
flags |= OPUS_CPU_ARM_EDSP;
# endif
flags |= OPUS_CPU_ARM_EDSP_FLAG;
# if defined(OPUS_ARM_MAY_HAVE_NEON)
# if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
p = strstr(buf, " neon");
if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
flags |= OPUS_CPU_ARM_NEON;
flags |= OPUS_CPU_ARM_NEON_FLAG;
# endif
}
# endif
# if defined(OPUS_ARM_MAY_HAVE_MEDIA)
# if defined(OPUS_ARM_MAY_HAVE_MEDIA) \
|| defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
/* Search for media capabilities (>= ARMv6) */
if(memcmp(buf, "CPU architecture:", 17) == 0)
{
int version;
version = atoi(buf+17);
if(version >= 6)
flags |= OPUS_CPU_ARM_MEDIA;
flags |= OPUS_CPU_ARM_MEDIA_FLAG;
}
# endif
}
@@ -156,18 +159,26 @@ int opus_select_arch(void)
opus_uint32 flags = opus_cpu_capabilities();
int arch = 0;
if(!(flags & OPUS_CPU_ARM_EDSP))
if(!(flags & OPUS_CPU_ARM_EDSP_FLAG)) {
/* Asserts ensure arch values are sequential */
celt_assert(arch == OPUS_ARCH_ARM_V4);
return arch;
}
arch++;
if(!(flags & OPUS_CPU_ARM_MEDIA))
if(!(flags & OPUS_CPU_ARM_MEDIA_FLAG)) {
celt_assert(arch == OPUS_ARCH_ARM_EDSP);
return arch;
}
arch++;
if(!(flags & OPUS_CPU_ARM_NEON))
if(!(flags & OPUS_CPU_ARM_NEON_FLAG)) {
celt_assert(arch == OPUS_ARCH_ARM_MEDIA);
return arch;
}
arch++;
celt_assert(arch == OPUS_ARCH_ARM_NEON);
return arch;
}
View
@@ -66,6 +66,12 @@
# if defined(OPUS_HAVE_RTCD)
int opus_select_arch(void);
#define OPUS_ARCH_ARM_V4 (0)
#define OPUS_ARCH_ARM_EDSP (1)
#define OPUS_ARCH_ARM_MEDIA (2)
#define OPUS_ARCH_ARM_NEON (3)
# endif
#endif
View
@@ -0,0 +1,174 @@
/* Copyright (c) 2015 Xiph.Org Foundation
Written by Viswanath Puttagunta */
/**
@file celt_ne10_fft.c
@brief ARM Neon optimizations for fft using NE10 library
*/
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SKIP_CONFIG_H
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#endif
#include <NE10_init.h>
#include <NE10_dsp.h>
#include "os_support.h"
#include "kiss_fft.h"
#include "stack_alloc.h"
#if !defined(FIXED_POINT)
# define NE10_FFT_ALLOC_C2C_TYPE_NEON ne10_fft_alloc_c2c_float32_neon
# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_float32_t
# define NE10_FFT_STATE_TYPE_T ne10_fft_state_float32_t
# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_float32
# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_float32_t
# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_float32_neon
#else
# define NE10_FFT_ALLOC_C2C_TYPE_NEON(nfft) ne10_fft_alloc_c2c_int32_neon(nfft)
# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_int32_t
# define NE10_FFT_STATE_TYPE_T ne10_fft_state_int32_t
# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_int32
# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_int32
# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_int32_t
# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_int32_neon
#endif
#if defined(CUSTOM_MODES)
/* nfft lengths in NE10 that support scaled fft */
# define NE10_FFTSCALED_SUPPORT_MAX 4
static const int ne10_fft_scaled_support[NE10_FFTSCALED_SUPPORT_MAX] = {
480, 240, 120, 60
};
int opus_fft_alloc_arm_neon(kiss_fft_state *st)
{
int i;
size_t memneeded = sizeof(struct arch_fft_state);
st->arch_fft = (arch_fft_state *)opus_alloc(memneeded);
if (!st->arch_fft)
return -1;
for (i = 0; i < NE10_FFTSCALED_SUPPORT_MAX; i++) {
if(st->nfft == ne10_fft_scaled_support[i])
break;
}
if (i == NE10_FFTSCALED_SUPPORT_MAX) {
/* This nfft length (scaled fft) is not supported in NE10 */
st->arch_fft->is_supported = 0;
st->arch_fft->priv = NULL;
}
else {
st->arch_fft->is_supported = 1;
st->arch_fft->priv = (void *)NE10_FFT_ALLOC_C2C_TYPE_NEON(st->nfft);
if (st->arch_fft->priv == NULL) {
return -1;
}
}
return 0;
}
void opus_fft_free_arm_neon(kiss_fft_state *st)
{
NE10_FFT_CFG_TYPE_T cfg;
if (!st->arch_fft)
return;
cfg = (NE10_FFT_CFG_TYPE_T)st->arch_fft->priv;
if (cfg)
NE10_FFT_DESTROY_C2C_TYPE(cfg);
opus_free(st->arch_fft);
}
#endif
void opus_fft_neon(const kiss_fft_state *st,
const kiss_fft_cpx *fin,
kiss_fft_cpx *fout)
{
NE10_FFT_STATE_TYPE_T state;
NE10_FFT_CFG_TYPE_T cfg = &state;
VARDECL(NE10_FFT_CPX_TYPE_T, buffer);
SAVE_STACK;
ALLOC(buffer, st->nfft, NE10_FFT_CPX_TYPE_T);
if (!st->arch_fft->is_supported) {
/* This nfft length (scaled fft) not supported in NE10 */
opus_fft_c(st, fin, fout);
}
else {
memcpy((void *)cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));
state.buffer = (NE10_FFT_CPX_TYPE_T *)&buffer[0];
#if !defined(FIXED_POINT)
state.is_forward_scaled = 1;
NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
(NE10_FFT_CPX_TYPE_T *)fin,
cfg, 0);
#else
NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
(NE10_FFT_CPX_TYPE_T *)fin,
cfg, 0, 1);
#endif
}
RESTORE_STACK;
}
void opus_ifft_neon(const kiss_fft_state *st,
const kiss_fft_cpx *fin,
kiss_fft_cpx *fout)
{
NE10_FFT_STATE_TYPE_T state;
NE10_FFT_CFG_TYPE_T cfg = &state;
VARDECL(NE10_FFT_CPX_TYPE_T, buffer);
SAVE_STACK;
ALLOC(buffer, st->nfft, NE10_FFT_CPX_TYPE_T);
if (!st->arch_fft->is_supported) {
/* This nfft length (scaled fft) not supported in NE10 */
opus_ifft_c(st, fin, fout);
}
else {
memcpy((void *)cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));
state.buffer = (NE10_FFT_CPX_TYPE_T *)&buffer[0];
#if !defined(FIXED_POINT)
state.is_backward_scaled = 0;
NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
(NE10_FFT_CPX_TYPE_T *)fin,
cfg, 1);
#else
NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
(NE10_FFT_CPX_TYPE_T *)fin,
cfg, 1, 0);
#endif
}
RESTORE_STACK;
}
View
@@ -0,0 +1,258 @@
/* Copyright (c) 2015 Xiph.Org Foundation
Written by Viswanath Puttagunta */
/**
@file celt_ne10_mdct.c
@brief ARM Neon optimizations for mdct using NE10 library
*/
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SKIP_CONFIG_H
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#endif
#include "kiss_fft.h"
#include "_kiss_fft_guts.h"
#include "mdct.h"
#include "stack_alloc.h"
void clt_mdct_forward_neon(const mdct_lookup *l,
kiss_fft_scalar *in,
kiss_fft_scalar * OPUS_RESTRICT out,
const opus_val16 *window,
int overlap, int shift, int stride, int arch)
{
int i;
int N, N2, N4;
VARDECL(kiss_fft_scalar, f);
VARDECL(kiss_fft_cpx, f2);
const kiss_fft_state *st = l->kfft[shift];
const kiss_twiddle_scalar *trig;
SAVE_STACK;
N = l->n;
trig = l->trig;
for (i=0;i<shift;i++)
{
N >>= 1;
trig += N;
}
N2 = N>>1;
N4 = N>>2;
ALLOC(f, N2, kiss_fft_scalar);
ALLOC(f2, N4, kiss_fft_cpx);
/* Consider the input to be composed of four blocks: [a, b, c, d] */
/* Window, shuffle, fold */
{
/* Temp pointers to make it really clear to the compiler what we're doing */
const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
kiss_fft_scalar * OPUS_RESTRICT yp = f;
const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
for(i=0;i<((overlap+3)>>2);i++)
{
/* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
*yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);
*yp++ = MULT16_32_Q15(*wp1, *xp1) - MULT16_32_Q15(*wp2, xp2[-N2]);
xp1+=2;
xp2-=2;
wp1+=2;
wp2-=2;
}
wp1 = window;
wp2 = window+overlap-1;
for(;i<N4-((overlap+3)>>2);i++)
{
/* Real part arranged as a-bR, Imag part arranged as -c-dR */
*yp++ = *xp2;
*yp++ = *xp1;
xp1+=2;
xp2-=2;
}
for(;i<N4;i++)
{
/* Real part arranged as a-bR, Imag part arranged as -c-dR */
*yp++ = -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2);
*yp++ = MULT16_32_Q15(*wp2, *xp1) + MULT16_32_Q15(*wp1, xp2[N2]);
xp1+=2;
xp2-=2;
wp1+=2;
wp2-=2;
}
}
/* Pre-rotation */
{
kiss_fft_scalar * OPUS_RESTRICT yp = f;
const kiss_twiddle_scalar *t = &trig[0];
for(i=0;i<N4;i++)
{
kiss_fft_cpx yc;
kiss_twiddle_scalar t0, t1;
kiss_fft_scalar re, im, yr, yi;
t0 = t[i];
t1 = t[N4+i];
re = *yp++;
im = *yp++;
yr = S_MUL(re,t0) - S_MUL(im,t1);
yi = S_MUL(im,t0) + S_MUL(re,t1);
yc.r = yr;
yc.i = yi;
f2[i] = yc;
}
}
opus_fft(st, f2, (kiss_fft_cpx *)f, arch);
/* Post-rotate */
{
/* Temp pointers to make it really clear to the compiler what we're doing */
const kiss_fft_cpx * OPUS_RESTRICT fp = (kiss_fft_cpx *)f;
kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
const kiss_twiddle_scalar *t = &trig[0];
/* Temp pointers to make it really clear to the compiler what we're doing */
for(i=0;i<N4;i++)
{
kiss_fft_scalar yr, yi;
yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);
yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);
*yp1 = yr;
*yp2 = yi;
fp++;
yp1 += 2*stride;
yp2 -= 2*stride;
}
}
RESTORE_STACK;
}
void clt_mdct_backward_neon(const mdct_lookup *l,
kiss_fft_scalar *in,
kiss_fft_scalar * OPUS_RESTRICT out,
const opus_val16 * OPUS_RESTRICT window,
int overlap, int shift, int stride, int arch)
{
int i;
int N, N2, N4;
VARDECL(kiss_fft_scalar, f);
const kiss_twiddle_scalar *trig;
const kiss_fft_state *st = l->kfft[shift];
N = l->n;
trig = l->trig;
for (i=0;i<shift;i++)
{
N >>= 1;
trig += N;
}
N2 = N>>1;
N4 = N>>2;
ALLOC(f, N2, kiss_fft_scalar);
/* Pre-rotate */
{
/* Temp pointers to make it really clear to the compiler what we're doing */
const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
kiss_fft_scalar * OPUS_RESTRICT yp = f;
const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
for(i=0;i<N4;i++)
{
kiss_fft_scalar yr, yi;
yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]);
yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]);
yp[2*i] = yr;
yp[2*i+1] = yi;
xp1+=2*stride;
xp2-=2*stride;
}
}
opus_ifft(st, (kiss_fft_cpx *)f, (kiss_fft_cpx*)(out+(overlap>>1)), arch);
/* Post-rotate and de-shuffle from both ends of the buffer at once to make
it in-place. */
{
kiss_fft_scalar * yp0 = out+(overlap>>1);
kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
const kiss_twiddle_scalar *t = &trig[0];
/* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
middle pair will be computed twice. */
for(i=0;i<(N4+1)>>1;i++)
{
kiss_fft_scalar re, im, yr, yi;
kiss_twiddle_scalar t0, t1;
re = yp0[0];
im = yp0[1];
t0 = t[i];
t1 = t[N4+i];
/* We'd scale up by 2 here, but instead it's done when mixing the windows */
yr = S_MUL(re,t0) + S_MUL(im,t1);
yi = S_MUL(re,t1) - S_MUL(im,t0);
re = yp1[0];
im = yp1[1];
yp0[0] = yr;
yp1[1] = yi;
t0 = t[(N4-i-1)];
t1 = t[(N2-i-1)];
/* We'd scale up by 2 here, but instead it's done when mixing the windows */
yr = S_MUL(re,t0) + S_MUL(im,t1);
yi = S_MUL(re,t1) - S_MUL(im,t0);
yp1[0] = yr;
yp0[1] = yi;
yp0 += 2;
yp1 -= 2;
}
}
/* Mirror on both sides for TDAC */
{
kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
const opus_val16 * OPUS_RESTRICT wp1 = window;
const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
for(i = 0; i < overlap/2; i++)
{
kiss_fft_scalar x1, x2;
x1 = *xp1;
x2 = *yp1;
*yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);
*xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);
wp1++;
wp2--;
}
}
RESTORE_STACK;
}
View
@@ -0,0 +1,311 @@
/* Copyright (c) 2014-2015 Xiph.Org Foundation
Written by Viswanath Puttagunta */
/**
@file celt_neon_intr.c
@brief ARM Neon Intrinsic optimizations for celt
*/
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <arm_neon.h>
#include "../pitch.h"
#if defined(FIXED_POINT)
void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
{
int j;
int32x4_t a = vld1q_s32(sum);
/* Load y[0...3] */
/* This requires len>0 to always be valid (which we assert in the C code). */
int16x4_t y0 = vld1_s16(y);
y += 4;
for (j = 0; j + 8 <= len; j += 8)
{
/* Load x[0...7] */
int16x8_t xx = vld1q_s16(x);
int16x4_t x0 = vget_low_s16(xx);
int16x4_t x4 = vget_high_s16(xx);
/* Load y[4...11] */
int16x8_t yy = vld1q_s16(y);
int16x4_t y4 = vget_low_s16(yy);
int16x4_t y8 = vget_high_s16(yy);
int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0);
int32x4_t a1 = vmlal_lane_s16(a0, y4, x4, 0);
int16x4_t y1 = vext_s16(y0, y4, 1);
int16x4_t y5 = vext_s16(y4, y8, 1);
int32x4_t a2 = vmlal_lane_s16(a1, y1, x0, 1);
int32x4_t a3 = vmlal_lane_s16(a2, y5, x4, 1);
int16x4_t y2 = vext_s16(y0, y4, 2);
int16x4_t y6 = vext_s16(y4, y8, 2);
int32x4_t a4 = vmlal_lane_s16(a3, y2, x0, 2);
int32x4_t a5 = vmlal_lane_s16(a4, y6, x4, 2);
int16x4_t y3 = vext_s16(y0, y4, 3);
int16x4_t y7 = vext_s16(y4, y8, 3);
int32x4_t a6 = vmlal_lane_s16(a5, y3, x0, 3);
int32x4_t a7 = vmlal_lane_s16(a6, y7, x4, 3);
y0 = y8;
a = a7;
x += 8;
y += 8;
}
for (; j < len; j++)
{
int16x4_t x0 = vld1_dup_s16(x); /* load next x */
int32x4_t a0 = vmlal_s16(a, y0, x0);
int16x4_t y4 = vld1_dup_s16(y); /* load next y */
y0 = vext_s16(y0, y4, 1);
a = a0;
x++;
y++;
}
vst1q_s32(sum, a);
}
#else
/*
* Function: xcorr_kernel_neon_float
* ---------------------------------
* Computes 4 correlation values and stores them in sum[4]
*/
static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y,
float32_t sum[4], int len) {
float32x4_t YY[3];
float32x4_t YEXT[3];
float32x4_t XX[2];
float32x2_t XX_2;
float32x4_t SUMM;
const float32_t *xi = x;
const float32_t *yi = y;
celt_assert(len>0);
YY[0] = vld1q_f32(yi);
SUMM = vdupq_n_f32(0);
/* Consume 8 elements in x vector and 12 elements in y
* vector. However, the 12'th element never really gets
* touched in this loop. So, if len == 8, then we only
* must access y[0] to y[10]. y[11] must not be accessed
* hence make sure len > 8 and not len >= 8
*/
while (len > 8) {
yi += 4;
YY[1] = vld1q_f32(yi);
yi += 4;
YY[2] = vld1q_f32(yi);
XX[0] = vld1q_f32(xi);
xi += 4;
XX[1] = vld1q_f32(xi);
xi += 4;
SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
YEXT[0] = vextq_f32(YY[0], YY[1], 1);
SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
YEXT[1] = vextq_f32(YY[0], YY[1], 2);
SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
YEXT[2] = vextq_f32(YY[0], YY[1], 3);
SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);
SUMM = vmlaq_lane_f32(SUMM, YY[1], vget_low_f32(XX[1]), 0);
YEXT[0] = vextq_f32(YY[1], YY[2], 1);
SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[1]), 1);
YEXT[1] = vextq_f32(YY[1], YY[2], 2);
SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[1]), 0);
YEXT[2] = vextq_f32(YY[1], YY[2], 3);
SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[1]), 1);
YY[0] = YY[2];
len -= 8;
}
/* Consume 4 elements in x vector and 8 elements in y
* vector. However, the 8'th element in y never really gets
* touched in this loop. So, if len == 4, then we only
* must access y[0] to y[6]. y[7] must not be accessed
* hence make sure len>4 and not len>=4
*/
if (len > 4) {
yi += 4;
YY[1] = vld1q_f32(yi);
XX[0] = vld1q_f32(xi);
xi += 4;
SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
YEXT[0] = vextq_f32(YY[0], YY[1], 1);
SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
YEXT[1] = vextq_f32(YY[0], YY[1], 2);
SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
YEXT[2] = vextq_f32(YY[0], YY[1], 3);
SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);
YY[0] = YY[1];
len -= 4;
}
while (--len > 0) {
XX_2 = vld1_dup_f32(xi++);
SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
YY[0]= vld1q_f32(++yi);
}
XX_2 = vld1_dup_f32(xi);
SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
vst1q_f32(sum, SUMM);
}
/*
* Function: xcorr_kernel_neon_float_process1
* ---------------------------------
* Computes single correlation values and stores in *sum
*/
static void xcorr_kernel_neon_float_process1(const float32_t *x,
const float32_t *y, float32_t *sum, int len) {
float32x4_t XX[4];
float32x4_t YY[4];
float32x2_t XX_2;
float32x2_t YY_2;
float32x4_t SUMM;
float32x2_t SUMM_2[2];
const float32_t *xi = x;
const float32_t *yi = y;
SUMM = vdupq_n_f32(0);
/* Work on 16 values per iteration */
while (len >= 16) {
XX[0] = vld1q_f32(xi);
xi += 4;
XX[1] = vld1q_f32(xi);
xi += 4;
XX[2] = vld1q_f32(xi);
xi += 4;
XX[3] = vld1q_f32(xi);
xi += 4;
YY[0] = vld1q_f32(yi);
yi += 4;
YY[1] = vld1q_f32(yi);
yi += 4;
YY[2] = vld1q_f32(yi);
yi += 4;
YY[3] = vld1q_f32(yi);
yi += 4;
SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
SUMM = vmlaq_f32(SUMM, YY[2], XX[2]);
SUMM = vmlaq_f32(SUMM, YY[3], XX[3]);
len -= 16;
}
/* Work on 8 values */
if (len >= 8) {
XX[0] = vld1q_f32(xi);
xi += 4;
XX[1] = vld1q_f32(xi);
xi += 4;
YY[0] = vld1q_f32(yi);
yi += 4;
YY[1] = vld1q_f32(yi);
yi += 4;
SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
len -= 8;
}
/* Work on 4 values */
if (len >= 4) {
XX[0] = vld1q_f32(xi);
xi += 4;
YY[0] = vld1q_f32(yi);
yi += 4;
SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
len -= 4;
}
/* Start accumulating results */
SUMM_2[0] = vget_low_f32(SUMM);
if (len >= 2) {
/* While at it, consume 2 more values if available */
XX_2 = vld1_f32(xi);
xi += 2;
YY_2 = vld1_f32(yi);
yi += 2;
SUMM_2[0] = vmla_f32(SUMM_2[0], YY_2, XX_2);
len -= 2;
}
SUMM_2[1] = vget_high_f32(SUMM);
SUMM_2[0] = vadd_f32(SUMM_2[0], SUMM_2[1]);
SUMM_2[0] = vpadd_f32(SUMM_2[0], SUMM_2[0]);
/* Ok, now we have result accumulated in SUMM_2[0].0 */
if (len > 0) {
/* Case when you have one value left */
XX_2 = vld1_dup_f32(xi);
YY_2 = vld1_dup_f32(yi);
SUMM_2[0] = vmla_f32(SUMM_2[0], XX_2, YY_2);
}
vst1_lane_f32(sum, SUMM_2[0], 0);
}
void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch) {
int i;
celt_assert(max_pitch > 0);
celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
for (i = 0; i < (max_pitch-3); i += 4) {
xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i,
(float32_t *)xcorr+i, len);
}
/* In case max_pitch isn't multiple of 4
* compute single correlation value per iteration
*/
for (; i < max_pitch; i++) {
xcorr_kernel_neon_float_process1((const float32_t *)_x,
(const float32_t *)_y+i, (float32_t *)xcorr+i, len);
}
}
#endif
View

Large diffs are not rendered by default.

Oops, something went wrong.
View
@@ -42,6 +42,7 @@ IF OPUS_ARM_MAY_HAVE_NEON
; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
xcorr_kernel_neon PROC
xcorr_kernel_neon_start
; input:
; r3 = int len
; r4 = opus_val16 *x
@@ -181,7 +182,7 @@ celt_pitch_xcorr_neon_process4
VEOR q0, q0, q0
; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
; So we don't save/restore any other registers.
BL xcorr_kernel_neon
BL xcorr_kernel_neon_start
SUBS r6, r6, #4
VST1.32 {q0}, [r2]!
; _y += 4
@@ -257,6 +258,7 @@ IF OPUS_ARM_MAY_HAVE_EDSP
; This will get used on ARMv7 devices without NEON, so it has been optimized
; to take advantage of dual-issuing where possible.
xcorr_kernel_edsp PROC
xcorr_kernel_edsp_start
; input:
; r3 = int len
; r4 = opus_val16 *_x (must be 32-bit aligned)
@@ -309,7 +311,7 @@ xcorr_kernel_edsp_process4_done
SUBS r2, r2, #1 ; j--
; Stall
SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_0)
LDRGTH r14, [r4], #2 ; r14 = *x++
LDRHGT r14, [r4], #2 ; r14 = *x++
SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x,y_1)
SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_2)
SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x,y_3)
@@ -319,15 +321,15 @@ xcorr_kernel_edsp_process4_done
SMLABB r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x,y_2)
LDRH r10, [r5], #2 ; r10 = y_4 = *y++
SMLABT r8, r14, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_3)
LDRGTH r12, [r4], #2 ; r12 = *x++
LDRHGT r12, [r4], #2 ; r12 = *x++
SMLABB r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x,y_4)
BLE xcorr_kernel_edsp_done
SMLABB r6, r12, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_2)
CMP r2, #1 ; j--
SMLABT r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_3)
LDRH r2, [r5], #2 ; r2 = y_5 = *y++
SMLABB r8, r12, r10, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_4)
LDRGTH r14, [r4] ; r14 = *x
LDRHGT r14, [r4] ; r14 = *x
SMLABB r9, r12, r2, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_5)
BLE xcorr_kernel_edsp_done
SMLABT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_3)
@@ -387,11 +389,11 @@ celt_pitch_xcorr_edsp_process1u_loop4
celt_pitch_xcorr_edsp_process1u_loop4_done
ADDS r12, r12, #4
celt_pitch_xcorr_edsp_process1u_loop1
LDRGEH r6, [r4], #2
LDRHGE r6, [r4], #2
; Stall
SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y)
SUBGES r12, r12, #1
LDRGTH r8, [r5], #2
SUBSGE r12, r12, #1
LDRHGT r8, [r5], #2
BGT celt_pitch_xcorr_edsp_process1u_loop1
; Restore _x
SUB r4, r4, r3, LSL #1
@@ -416,7 +418,7 @@ celt_pitch_xcorr_edsp_process4
MOV r7, #0
MOV r8, #0
MOV r9, #0
BL xcorr_kernel_edsp ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
BL xcorr_kernel_edsp_start ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
CMP r0, r6
; _y+=4
@@ -474,7 +476,7 @@ celt_pitch_xcorr_edsp_process2_1
ADDS r12, r12, #1
; Stall
SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
LDRGTH r7, [r4], #2
LDRHGT r7, [r4], #2
SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
BLE celt_pitch_xcorr_edsp_process2_done
LDRH r9, [r5], #2
@@ -527,8 +529,8 @@ celt_pitch_xcorr_edsp_process1a_loop_done
SUBGE r12, r12, #2
SMLATTGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1)
ADDS r12, r12, #1
LDRGEH r6, [r4], #2
LDRGEH r8, [r5], #2
LDRHGE r6, [r4], #2
LDRHGE r8, [r5], #2
; Stall
SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y)
; maxcorr = max(maxcorr, sum)
View
@@ -0,0 +1,72 @@
/* Copyright (c) 2015 Xiph.Org Foundation
Written by Viswanath Puttagunta */
/**
@file fft_arm.h
@brief ARM Neon Intrinsic optimizations for fft using NE10 library
*/
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined(FFT_ARM_H)
#define FFT_ARM_H
#include "config.h"
#include "kiss_fft.h"
#if defined(HAVE_ARM_NE10)
int opus_fft_alloc_arm_neon(kiss_fft_state *st);
void opus_fft_free_arm_neon(kiss_fft_state *st);
void opus_fft_neon(const kiss_fft_state *st,
const kiss_fft_cpx *fin,
kiss_fft_cpx *fout);
void opus_ifft_neon(const kiss_fft_state *st,
const kiss_fft_cpx *fin,
kiss_fft_cpx *fout);
#if !defined(OPUS_HAVE_RTCD)
#define OVERRIDE_OPUS_FFT (1)
#define opus_fft_alloc_arch(_st, arch) \
((void)(arch), opus_fft_alloc_arm_neon(_st))
#define opus_fft_free_arch(_st, arch) \
((void)(arch), opus_fft_free_arm_neon(_st))
#define opus_fft(_st, _fin, _fout, arch) \
((void)(arch), opus_fft_neon(_st, _fin, _fout))
#define opus_ifft(_st, _fin, _fout, arch) \
((void)(arch), opus_ifft_neon(_st, _fin, _fout))
#endif /* OPUS_HAVE_RTCD */
#endif /* HAVE_ARM_NE10 */
#endif
View
@@ -0,0 +1,35 @@
/* Copyright (C) 2015 Vidyo */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef FIXED_ARM64_H
#define FIXED_ARM64_H
#include <arm_neon.h>
#undef SIG2WORD16
#define SIG2WORD16(x) (vqmovns_s32(PSHR32((x), SIG_SHIFT)))
#endif
View
@@ -68,6 +68,10 @@ static OPUS_INLINE opus_val32 MULT16_32_Q15_armv4(opus_val16 a, opus_val32 b)
#undef MAC16_32_Q15
#define MAC16_32_Q15(c, a, b) ADD32(c, MULT16_32_Q15(a, b))
/** 16x32 multiply, followed by a 16-bit shift right and 32-bit add.
Result fits in 32 bits. */
#undef MAC16_32_Q16
#define MAC16_32_Q16(c, a, b) ADD32(c, MULT16_32_Q16(a, b))
/** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */
#undef MULT32_32_Q31
View
@@ -82,6 +82,23 @@ static OPUS_INLINE opus_val32 MAC16_32_Q15_armv5e(opus_val32 c, opus_val16 a,
}
#define MAC16_32_Q15(c, a, b) (MAC16_32_Q15_armv5e(c, a, b))
/** 16x32 multiply, followed by a 16-bit shift right and 32-bit add.
Result fits in 32 bits. */
#undef MAC16_32_Q16
static OPUS_INLINE opus_val32 MAC16_32_Q16_armv5e(opus_val32 c, opus_val16 a,
opus_val32 b)
{
int res;
__asm__(
"#MAC16_32_Q16\n\t"
"smlawb %0, %1, %2, %3;\n"
: "=r"(res)
: "r"(b), "r"(a), "r"(c)
);
return res;
}
#define MAC16_32_Q16(c, a, b) (MAC16_32_Q16_armv5e(c, a, b))
/** 16x16 multiply-add where the result fits in 32 bits */
#undef MAC16_16
static OPUS_INLINE opus_val32 MAC16_16_armv5e(opus_val32 c, opus_val16 a,
@@ -113,4 +130,22 @@ static OPUS_INLINE opus_val32 MULT16_16_armv5e(opus_val16 a, opus_val16 b)
}
#define MULT16_16(a, b) (MULT16_16_armv5e(a, b))
#ifdef OPUS_ARM_INLINE_MEDIA
#undef SIG2WORD16
static OPUS_INLINE opus_val16 SIG2WORD16_armv6(opus_val32 x)
{
celt_sig res;
__asm__(
"#SIG2WORD16\n\t"
"ssat %0, #16, %1, ASR #12\n\t"
: "=r"(res)
: "r"(x+2048)
);
return EXTRACT16(res);
}
#define SIG2WORD16(x) (SIG2WORD16_armv6(x))
#endif /* OPUS_ARM_INLINE_MEDIA */
#endif
View
@@ -0,0 +1,60 @@
/* Copyright (c) 2015 Xiph.Org Foundation
Written by Viswanath Puttagunta */
/**
@file arm_mdct.h
@brief ARM Neon Intrinsic optimizations for mdct using NE10 library
*/
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined(MDCT_ARM_H)
#define MDCT_ARM_H
#include "config.h"
#include "mdct.h"
#if defined(HAVE_ARM_NE10)
/** Compute a forward MDCT and scale by 4/N, trashes the input array */
void clt_mdct_forward_neon(const mdct_lookup *l, kiss_fft_scalar *in,
kiss_fft_scalar * OPUS_RESTRICT out,
const opus_val16 *window, int overlap,
int shift, int stride, int arch);
void clt_mdct_backward_neon(const mdct_lookup *l, kiss_fft_scalar *in,
kiss_fft_scalar * OPUS_RESTRICT out,
const opus_val16 *window, int overlap,
int shift, int stride, int arch);
#if !defined(OPUS_HAVE_RTCD)
#define OVERRIDE_OPUS_MDCT (1)
#define clt_mdct_forward(_l, _in, _out, _window, _int, _shift, _stride, _arch) \
clt_mdct_forward_neon(_l, _in, _out, _window, _int, _shift, _stride, _arch)
#define clt_mdct_backward(_l, _in, _out, _window, _int, _shift, _stride, _arch) \
clt_mdct_backward_neon(_l, _in, _out, _window, _int, _shift, _stride, _arch)
#endif /* OPUS_HAVE_RTCD */
#endif /* HAVE_ARM_NE10 */
#endif
View
@@ -46,12 +46,81 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch);
# endif
# if !defined(OPUS_HAVE_RTCD)
# if defined(OPUS_HAVE_RTCD) && \
((defined(OPUS_ARM_MAY_HAVE_NEON) && !defined(OPUS_ARM_PRESUME_NEON)) || \
(defined(OPUS_ARM_MAY_HAVE_MEDIA) && !defined(OPUS_ARM_PRESUME_MEDIA)) || \
(defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP)))
extern opus_val32
(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
const opus_val16 *, opus_val32 *, int, int);
# define OVERRIDE_PITCH_XCORR (1)
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
xcorr, len, max_pitch))
# elif defined(OPUS_ARM_PRESUME_EDSP) || \
defined(OPUS_ARM_PRESUME_MEDIA) || \
defined(OPUS_ARM_PRESUME_NEON)
# define OVERRIDE_PITCH_XCORR (1)
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch))
# endif
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
void xcorr_kernel_neon_fixed(
const opus_val16 *x,
const opus_val16 *y,
opus_val32 sum[4],
int len);
# endif
# if defined(OPUS_HAVE_RTCD) && \
(defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
const opus_val16 *y,
opus_val32 sum[4],
int len);
# define OVERRIDE_XCORR_KERNEL (1)
# define xcorr_kernel(x, y, sum, len, arch) \
((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len))
# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
# define OVERRIDE_XCORR_KERNEL (1)
# define xcorr_kernel(x, y, sum, len, arch) \
((void)arch, xcorr_kernel_neon_fixed(x, y, sum, len))
# endif
#else /* Start !FIXED_POINT */
/* Float case */
#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch);
#endif
# if defined(OPUS_HAVE_RTCD) && \
(defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
extern void
(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
const opus_val16 *, opus_val32 *, int, int);
# define OVERRIDE_PITCH_XCORR (1)
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
xcorr, len, max_pitch))
# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
# define OVERRIDE_PITCH_XCORR (1)
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, len, max_pitch))
# endif
# endif
#endif /* end !FIXED_POINT */
#endif
View

Large diffs are not rendered by default.

Oops, something went wrong.
View
@@ -41,7 +41,7 @@
* @param X Spectrum
* @param bandE Square root of the energy for each band (returned)
*/
void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int M);
void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int LM);
/*void compute_noise_energies(const CELTMode *m, const celt_sig *X, const opus_val16 *tonality, celt_ener *bandE);*/
@@ -59,14 +59,15 @@ void normalise_bands(const CELTMode *m, const celt_sig * OPUS_RESTRICT freq, cel
* @param bandE Square root of the energy for each band
*/
void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X,
celt_sig * OPUS_RESTRICT freq, const opus_val16 *bandE, int start, int end, int C, int M);
celt_sig * OPUS_RESTRICT freq, const opus_val16 *bandE, int start,
int end, int M, int downsample, int silence);
#define SPREAD_NONE (0)
#define SPREAD_LIGHT (1)
#define SPREAD_NORMAL (2)
#define SPREAD_AGGRESSIVE (3)
int spreading_decision(const CELTMode *m, celt_norm *X, int *average,
int spreading_decision(const CELTMode *m, const celt_norm *X, int *average,
int last_decision, int *hf_average, int *tapset_decision, int update_hf,
int end, int C, int M);
@@ -97,15 +98,20 @@ void haar1(celt_norm *X, int N0, int stride);
* @param LM log2() of the number of 2.5 subframes in the frame
* @param codedBands Last band to receive bits + 1
* @param seed Random generator seed
* @param arch Run-time architecture (see opus_select_arch())
*/
void quant_all_bands(int encode, const CELTMode *m, int start, int end,
celt_norm * X, celt_norm * Y, unsigned char *collapse_masks, const celt_ener *bandE, int *pulses,
int shortBlocks, int spread, int dual_stereo, int intensity, int *tf_res,
opus_int32 total_bits, opus_int32 balance, ec_ctx *ec, int M, int codedBands, opus_uint32 *seed);
void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_masks, int LM, int C, int size,
int start, int end, opus_val16 *logE, opus_val16 *prev1logE,
opus_val16 *prev2logE, int *pulses, opus_uint32 seed);
celt_norm * X, celt_norm * Y, unsigned char *collapse_masks,
const celt_ener *bandE, int *pulses, int shortBlocks, int spread,
int dual_stereo, int intensity, int *tf_res, opus_int32 total_bits,
opus_int32 balance, ec_ctx *ec, int M, int codedBands, opus_uint32 *seed,
int complexity, int arch, int disable_inv);
void anti_collapse(const CELTMode *m, celt_norm *X_,
unsigned char *collapse_masks, int LM, int C, int size, int start,
int end, const opus_val16 *logE, const opus_val16 *prev1logE,
const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed,
int arch);
opus_uint32 celt_lcg_rand(opus_uint32 seed);
View
@@ -54,6 +54,10 @@
#define PACKAGE_VERSION "unknown"
#endif
#if defined(MIPSr1_ASM)
#include "mips/celt_mipsr1.h"
#endif
int resampling_factor(opus_int32 rate)
{
@@ -85,8 +89,77 @@ int resampling_factor(opus_int32 rate)
return ret;
}
#ifndef OVERRIDE_COMB_FILTER_CONST
static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
#if !defined(OVERRIDE_COMB_FILTER_CONST) || defined(NON_STATIC_COMB_FILTER_CONST_C)
/* This version should be faster on ARM */
#ifdef OPUS_ARM_ASM
#ifndef NON_STATIC_COMB_FILTER_CONST_C
static
#endif
void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
opus_val16 g10, opus_val16 g11, opus_val16 g12)
{
opus_val32 x0, x1, x2, x3, x4;
int i;
x4 = SHL32(x[-T-2], 1);
x3 = SHL32(x[-T-1], 1);
x2 = SHL32(x[-T], 1);
x1 = SHL32(x[-T+1], 1);
for (i=0;i<N-4;i+=5)
{
opus_val32 t;
x0=SHL32(x[i-T+2],1);
t = MAC16_32_Q16(x[i], g10, x2);
t = MAC16_32_Q16(t, g11, ADD32(x1,x3));
t = MAC16_32_Q16(t, g12, ADD32(x0,x4));
t = SATURATE(t, SIG_SAT);
y[i] = t;
x4=SHL32(x[i-T+3],1);
t = MAC16_32_Q16(x[i+1], g10, x1);
t = MAC16_32_Q16(t, g11, ADD32(x0,x2));
t = MAC16_32_Q16(t, g12, ADD32(x4,x3));
t = SATURATE(t, SIG_SAT);
y[i+1] = t;
x3=SHL32(x[i-T+4],1);
t = MAC16_32_Q16(x[i+2], g10, x0);
t = MAC16_32_Q16(t, g11, ADD32(x4,x1));
t = MAC16_32_Q16(t, g12, ADD32(x3,x2));
t = SATURATE(t, SIG_SAT);
y[i+2] = t;
x2=SHL32(x[i-T+5],1);
t = MAC16_32_Q16(x[i+3], g10, x4);
t = MAC16_32_Q16(t, g11, ADD32(x3,x0));
t = MAC16_32_Q16(t, g12, ADD32(x2,x1));
t = SATURATE(t, SIG_SAT);
y[i+3] = t;
x1=SHL32(x[i-T+6],1);
t = MAC16_32_Q16(x[i+4], g10, x3);
t = MAC16_32_Q16(t, g11, ADD32(x2,x4));
t = MAC16_32_Q16(t, g12, ADD32(x1,x0));
t = SATURATE(t, SIG_SAT);
y[i+4] = t;
}
#ifdef CUSTOM_MODES
for (;i<N;i++)
{
opus_val32 t;
x0=SHL32(x[i-T+2],1);
t = MAC16_32_Q16(x[i], g10, x2);
t = MAC16_32_Q16(t, g11, ADD32(x1,x3));
t = MAC16_32_Q16(t, g12, ADD32(x0,x4));
t = SATURATE(t, SIG_SAT);
y[i] = t;
x4=x3;
x3=x2;
x2=x1;
x1=x0;
}
#endif
}
#else
#ifndef NON_STATIC_COMB_FILTER_CONST_C
static
#endif
void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
opus_val16 g10, opus_val16 g11, opus_val16 g12)
{
opus_val32 x0, x1, x2, x3, x4;
@@ -102,6 +175,7 @@ static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
+ MULT16_32_Q15(g10,x2)
+ MULT16_32_Q15(g11,ADD32(x1,x3))
+ MULT16_32_Q15(g12,ADD32(x0,x4));
y[i] = SATURATE(y[i], SIG_SAT);
x4=x3;
x3=x2;
x2=x1;
@@ -110,10 +184,12 @@ static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
}
#endif
#endif
#ifndef OVERRIDE_comb_filter
void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
const opus_val16 *window, int overlap)
const opus_val16 *window, int overlap, int arch)
{
int i;
/* printf ("%d %d %f %f\n", T0, T1, g0, g1); */
@@ -131,16 +207,23 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
OPUS_MOVE(y, x, N);
return;
}
g00 = MULT16_16_Q15(g0, gains[tapset0][0]);
g01 = MULT16_16_Q15(g0, gains[tapset0][1]);
g02 = MULT16_16_Q15(g0, gains[tapset0][2]);
g10 = MULT16_16_Q15(g1, gains[tapset1][0]);
g11 = MULT16_16_Q15(g1, gains[tapset1][1]);
g12 = MULT16_16_Q15(g1, gains[tapset1][2]);
/* When the gain is zero, T0 and/or T1 is set to zero. We need
to have then be at least 2 to avoid processing garbage data. */
T0 = IMAX(T0, COMBFILTER_MINPERIOD);
T1 = IMAX(T1, COMBFILTER_MINPERIOD);
g00 = MULT16_16_P15(g0, gains[tapset0][0]);
g01 = MULT16_16_P15(g0, gains[tapset0][1]);
g02 = MULT16_16_P15(g0, gains[tapset0][2]);
g10 = MULT16_16_P15(g1, gains[tapset1][0]);
g11 = MULT16_16_P15(g1, gains[tapset1][1]);
g12 = MULT16_16_P15(g1, gains[tapset1][2]);
x1 = x[-T1+1];
x2 = x[-T1 ];
x3 = x[-T1-1];
x4 = x[-T1-2];
/* If the filter didn't change, we don't need the overlap */
if (g0==g1 && T0==T1 && tapset0==tapset1)
overlap=0;
for (i=0;i<overlap;i++)
{
opus_val16 f;
@@ -153,6 +236,7 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
+ MULT16_32_Q15(MULT16_16_Q15(f,g10),x2)
+ MULT16_32_Q15(MULT16_16_Q15(f,g11),ADD32(x1,x3))
+ MULT16_32_Q15(MULT16_16_Q15(f,g12),ADD32(x0,x4));
y[i] = SATURATE(y[i], SIG_SAT);
x4=x3;
x3=x2;
x2=x1;
@@ -168,14 +252,20 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
}
/* Compute the part with the constant filter. */
comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12);
comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12, arch);
}
#endif /* OVERRIDE_comb_filter */
/* TF change table. Positive values mean better frequency resolution (longer
effective window), whereas negative values mean better time resolution
(shorter effective window). The second index is computed as:
4*isTransient + 2*tf_select + per_band_flag */
const signed char tf_select_table[4][8] = {
{0, -1, 0, -1, 0,-1, 0,-1},
{0, -1, 0, -2, 1, 0, 1,-1},
{0, -2, 0, -3, 2, 0, 1,-1},
{0, -2, 0, -3, 3, 0, 1,-1},
/*isTransient=0 isTransient=1 */
{0, -1, 0, -1, 0,-1, 0,-1}, /* 2.5 ms */
{0, -1, 0, -2, 1, 0, 1,-1}, /* 5 ms */
{0, -2, 0, -3, 2, 0, 1,-1}, /* 10 ms */
{0, -2, 0, -3, 3, 0, 1,-1}, /* 20 ms */
};
@@ -213,6 +303,9 @@ const char *opus_strerror(int error)
const char *opus_get_version_string(void)
{
return "libopus " PACKAGE_VERSION
/* Applications may rely on the presence of this substring in the version
string to determine if they have a fixed-point or floating-point build
at runtime. */
#ifdef FIXED_POINT
"-fixed"
#endif
View
@@ -57,13 +57,21 @@ typedef struct {
float noisiness;
float activity;
float music_prob;
int bandwidth;
}AnalysisInfo;
int bandwidth;
float activity_probability;
} AnalysisInfo;
typedef struct {
int signalType;
int offset;
} SILKInfo;
#define __celt_check_mode_ptr_ptr(ptr) ((ptr) + ((ptr) - (const CELTMode**)(ptr)))
#define __celt_check_analysis_ptr(ptr) ((ptr) + ((ptr) - (const AnalysisInfo*)(ptr)))
#define __celt_check_silkinfo_ptr(ptr) ((ptr) + ((ptr) - (const SILKInfo*)(ptr)))
/* Encoder/decoder Requests */
/* Expose this option again when variable framesize actually works */
@@ -116,6 +124,9 @@ typedef struct {
#define OPUS_SET_ENERGY_MASK_REQUEST 10026
#define OPUS_SET_ENERGY_MASK(x) OPUS_SET_ENERGY_MASK_REQUEST, __opus_check_val16_ptr(x)
#define CELT_SET_SILK_INFO_REQUEST 10028
#define CELT_SET_SILK_INFO(x) CELT_SET_SILK_INFO_REQUEST, __celt_check_silkinfo_ptr(x)
/* Encoder stuff */
int celt_encoder_get_size(int channels);
@@ -134,7 +145,8 @@ int celt_decoder_get_size(int channels);
int celt_decoder_init(CELTDecoder *st, opus_int32 sampling_rate, int channels);
int celt_decode_with_ec(OpusCustomDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec);
int celt_decode_with_ec(OpusCustomDecoder * OPUS_RESTRICT st, const unsigned char *data,
int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum);
#define celt_encoder_ctl opus_custom_encoder_ctl
#define celt_decoder_ctl opus_custom_decoder_ctl
@@ -200,15 +212,25 @@ void celt_preemphasis(const opus_val16 * OPUS_RESTRICT pcmp, celt_sig * OPUS_RES
void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
const opus_val16 *window, int overlap);
const opus_val16 *window, int overlap, int arch);
#ifdef NON_STATIC_COMB_FILTER_CONST_C
void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
opus_val16 g10, opus_val16 g11, opus_val16 g12);
#endif
#ifndef OVERRIDE_COMB_FILTER_CONST
# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
((void)(arch),comb_filter_const_c(y, x, T, N, g10, g11, g12))
#endif
void init_caps(const CELTMode *m,int *cap,int LM,int C);
#ifdef RESYNTH
void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem, celt_sig * OPUS_RESTRICT scratch);
void compute_inv_mdcts(const CELTMode *mode, int shortBlocks, celt_sig *X,
celt_sig * OPUS_RESTRICT out_mem[], int C, int LM);
void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem);
void celt_synthesis(const CELTMode *mode, celt_norm *X, celt_sig * out_syn[],
opus_val16 *oldBandE, int start, int effEnd, int C, int CC, int isTransient,
int LM, int downsample, int silence);
#endif
#ifdef __cplusplus
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View
@@ -49,8 +49,7 @@ int p
float *lpc = _lpc;
#endif
for (i = 0; i < p; i++)
lpc[i] = 0;
OPUS_CLEAR(lpc, p);
if (ac[0] != 0)
{
for (i = 0; i < p; i++) {
@@ -88,12 +87,15 @@ int p
#endif
}
void celt_fir(const opus_val16 *_x,
void celt_fir_c(
const opus_val16 *_x,
const opus_val16 *num,
opus_val16 *_y,
int N,
int ord,
opus_val16 *mem)
opus_val16 *mem,
int arch)
{
int i,j;
VARDECL(opus_val16, rnum);
@@ -111,6 +113,7 @@ void celt_fir(const opus_val16 *_x,
for(i=0;i<ord;i++)
mem[i] = _x[N-i-1];
#ifdef SMALL_FOOTPRINT
(void)arch;
for (i=0;i<N;i++)
{
opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT);
@@ -124,7 +127,7 @@ void celt_fir(const opus_val16 *_x,
for (i=0;i<N-3;i+=4)
{
opus_val32 sum[4]={0,0,0,0};
xcorr_kernel(rnum, x+i, sum, ord);
xcorr_kernel(rnum, x+i, sum, ord, arch);
_y[i ] = SATURATE16(ADD32(EXTEND32(_x[i ]), PSHR32(sum[0], SIG_SHIFT)));
_y[i+1] = SATURATE16(ADD32(EXTEND32(_x[i+1]), PSHR32(sum[1], SIG_SHIFT)));
_y[i+2] = SATURATE16(ADD32(EXTEND32(_x[i+2]), PSHR32(sum[2], SIG_SHIFT)));
@@ -146,10 +149,12 @@ void celt_iir(const opus_val32 *_x,
opus_val32 *_y,
int N,
int ord,
opus_val16 *mem)
opus_val16 *mem,
int arch)
{
#ifdef SMALL_FOOTPRINT
int i,j;
(void)arch;
for (i=0;i<N;i++)
{
opus_val32 sum = _x[i];
@@ -161,7 +166,7 @@ void celt_iir(const opus_val32 *_x,
{
mem[j]=mem[j-1];
}
mem[0] = ROUND16(sum,SIG_SHIFT);
mem[0] = SROUND16(sum, SIG_SHIFT);
_y[i] = sum;
}
#else
@@ -187,31 +192,31 @@ void celt_iir(const opus_val32 *_x,
sum[1]=_x[i+1];
sum[2]=_x[i+2];
sum[3]=_x[i+3];
xcorr_kernel(rden, y+i, sum, ord);
xcorr_kernel(rden, y+i, sum, ord, arch);
/* Patch up the result to compensate for the fact that this is an IIR */
y[i+ord ] = -ROUND16(sum[0],SIG_SHIFT);
y[i+ord ] = -SROUND16(sum[0],SIG_SHIFT);
_y[i ] = sum[0];
sum[1] = MAC16_16(sum[1], y[i+ord ], den[0]);
y[i+ord+1] = -ROUND16(sum[1],SIG_SHIFT);
y[i+ord+1] = -SROUND16(sum[1],SIG_SHIFT);
_y[i+1] = sum[1];
sum[2] = MAC16_16(sum[2], y[i+ord+1], den[0]);
sum[2] = MAC16_16(sum[2], y[i+ord ], den[1]);
y[i+ord+2] = -ROUND16(sum[2],SIG_SHIFT);
y[i+ord+2] = -SROUND16(sum[2],SIG_SHIFT);
_y[i+2] = sum[2];
sum[3] = MAC16_16(sum[3], y[i+ord+2], den[0]);
sum[3] = MAC16_16(sum[3], y[i+ord+1], den[1]);
sum[3] = MAC16_16(sum[3], y[i+ord ], den[2]);
y[i+ord+3] = -ROUND16(sum[3],SIG_SHIFT);
y[i+ord+3] = -SROUND16(sum[3],SIG_SHIFT);
_y[i+3] = sum[3];
}
for (;i<N;i++)
{
opus_val32 sum = _x[i];
for (j=0;j<ord;j++)
sum -= MULT16_16(rden[j],y[i+j]);
y[i+ord] = ROUND16(sum,SIG_SHIFT);
y[i+ord] = SROUND16(sum,SIG_SHIFT);
_y[i] = sum;
}
for(i=0;i<ord;i++)
View
@@ -29,24 +29,37 @@
#define PLC_H
#include "arch.h"
#include "cpu_support.h"
#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
#include "x86/celt_lpc_sse.h"
#endif
#define LPC_ORDER 24
void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p);
void celt_fir(const opus_val16 *x,
void celt_fir_c(
const opus_val16 *x,
const opus_val16 *num,
opus_val16 *y,
int N,
int ord,
opus_val16 *mem);
opus_val16 *mem,
int arch);
#if !defined(OVERRIDE_CELT_FIR)
#define celt_fir(x, num, y, N, ord, mem, arch) \
(celt_fir_c(x, num, y, N, ord, mem, arch))
#endif
void celt_iir(const opus_val32 *x,
const opus_val16 *den,
opus_val32 *y,
int N,
int ord,
opus_val16 *mem);
opus_val16 *mem,
int arch);
int _celt_autocorr(const opus_val16 *x, opus_val32 *ac,
const opus_val16 *window, int overlap, int lag, int n, int arch);
View
@@ -31,7 +31,8 @@
#include "opus_types.h"
#include "opus_defines.h"
#if defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_ASM)
#if defined(OPUS_HAVE_RTCD) && \
(defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
#include "arm/armcpu.h"
/* We currently support 4 ARM variants:
@@ -42,6 +43,22 @@
*/
#define OPUS_ARCHMASK 3
#elif (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
(defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
(defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
(defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))
#include "x86/x86cpu.h"
/* We currently support 5 x86 variants:
* arch[0] -> non-sse
* arch[1] -> sse
* arch[2] -> sse2
* arch[3] -> sse4.1
* arch[4] -> avx
*/
#define OPUS_ARCHMASK 7
int opus_select_arch(void);
#else
#define OPUS_ARCHMASK 0
@@ -50,5 +67,4 @@ static OPUS_INLINE int opus_select_arch(void)
return 0;
}
#endif
#endif
View
@@ -74,7 +74,7 @@ int log2_frac(opus_uint32 val, int frac)
/*Although derived separately, the pulse vector coding scheme is equivalent to
a Pyramid Vector Quantizer \cite{Fis86}.
Some additional notes about an early version appear at
http://people.xiph.org/~tterribe/notes/cwrs.html, but the codebook ordering
https://people.xiph.org/~tterribe/notes/cwrs.html, but the codebook ordering
and the definitions of some terms have evolved since that was written.
The conversion from a pulse vector to an integer index (encoding) and back
@@ -460,10 +460,12 @@ void encode_pulses(const int *_y,int _n,int _k,ec_enc *_enc){
ec_enc_uint(_enc,icwrs(_n,_y),CELT_PVQ_V(_n,_k));
}
static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y){
static opus_val32 cwrsi(int _n,int _k,opus_uint32 _i,int *_y){
opus_uint32 p;
int s;
int k0;
opus_int16 val;
opus_val32 yy=0;
celt_assert(_k>0);
celt_assert(_n>1);
while(_n>2){
@@ -487,7 +489,9 @@ static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y){
}
else for(p=row[_k];p>_i;p=row[_k])_k--;
_i-=p;
*_y++=(k0-_k+s)^s;
val=(k0-_k+s)^s;
*_y++=val;
yy=MAC16_16(yy,val,val);
}
/*Lots of dimensions case:*/
else{
@@ -507,7 +511,9 @@ static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y){
do p=CELT_PVQ_U_ROW[--_k][_n];
while(p>_i);
_i-=p;
*_y++=(k0-_k+s)^s;
val=(k0-_k+s)^s;
*_y++=val;
yy=MAC16_16(yy,val,val);
}
}
_n--;
@@ -519,14 +525,19 @@ static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y){
k0=_k;
_k=(_i+1)>>1;
if(_k)_i-=2*_k-1;
*_y++=(k0-_k+s)^s;
val=(k0-_k+s)^s;
*_y++=val;
yy=MAC16_16(yy,val,val);
/*_n==1*/
s=-(int)_i;
*_y=(_k+s)^s;
val=(_k+s)^s;
*_y=val;
yy=MAC16_16(yy,val,val);
return yy;
}
void decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){
cwrsi(_n,_k,ec_dec_uint(_dec,CELT_PVQ_V(_n,_k)),_y);
opus_val32 decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){
return cwrsi(_n,_k,ec_dec_uint(_dec,CELT_PVQ_V(_n,_k)),_y);
}
#else /* SMALL_FOOTPRINT */
@@ -591,8 +602,10 @@ static opus_uint32 ncwrs_urow(unsigned _n,unsigned _k,opus_uint32 *_u){
_y: Returns the vector of pulses.
_u: Must contain entries [0..._k+1] of row _n of U() on input.
Its contents will be destructively modified.*/
static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y,opus_uint32 *_u){
static opus_val32 cwrsi(int _n,int _k,opus_uint32 _i,int *_y,opus_uint32 *_u){
int j;
opus_int16 val;
opus_val32 yy=0;
celt_assert(_n>0);
j=0;
do{
@@ -607,10 +620,13 @@ static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y,opus_uint32 *_u){
while(p>_i)p=_u[--_k];
_i-=p;
yj-=_k;
_y[j]=(yj+s)^s;
val=(yj+s)^s;
_y[j]=val;
yy=MAC16_16(yy,val,val);
uprev(_u,_k+2,0);
}
while(++j<_n);
return yy;
}
/*Returns the index of the given combination of K elements chosen from a set
@@ -685,13 +701,15 @@ void encode_pulses(const int *_y,int _n,int _k,ec_enc *_enc){
RESTORE_STACK;
}
void decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){
opus_val32 decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){
VARDECL(opus_uint32,u);
int ret;
SAVE_STACK;
celt_assert(_k>0);
ALLOC(u,_k+2U,opus_uint32);
cwrsi(_n,_k,ec_dec_uint(_dec,ncwrs_urow(_n,_k,u)),_y,u);
ret = cwrsi(_n,_k,ec_dec_uint(_dec,ncwrs_urow(_n,_k,u)),_y,u);
RESTORE_STACK;
return ret;
}
#endif /* SMALL_FOOTPRINT */
View
@@ -43,6 +43,6 @@ void get_required_bits(opus_int16 *bits, int N, int K, int frac);
void encode_pulses(const int *_y, int N, int K, ec_enc *enc);
void decode_pulses(int *_y, int N, int K, ec_dec *dec);
opus_val32 decode_pulses(int *_y, int N, int K, ec_dec *dec);
#endif /* CWRS_H */
View
@@ -62,6 +62,27 @@ int ec_ilog(opus_uint32 _v){
}
#endif
#if 1
/* This is a faster version of ec_tell_frac() that takes advantage
of the low (1/8 bit) resolution to use just a linear function
followed by a lookup to determine the exact transition thresholds. */
opus_uint32 ec_tell_frac(ec_ctx *_this){
static const unsigned correction[8] =
{35733, 38967, 42495, 46340,
50535, 55109, 60097, 65535};
opus_uint32 nbits;
opus_uint32 r;
int l;
unsigned b;
nbits=_this->nbits_total<<BITRES;
l=EC_ILOG(_this->rng);
r=_this->rng>>(l-16);
b = (r>>12)-8;
b += r>correction[b];
l = (l<<3)+b;
return nbits-l;
}
#else
opus_uint32 ec_tell_frac(ec_ctx *_this){
opus_uint32 nbits;
opus_uint32 r;
@@ -91,3 +112,42 @@ opus_uint32 ec_tell_frac(ec_ctx *_this){
}
return nbits-l;
}
#endif
#ifdef USE_SMALL_DIV_TABLE
/* Result of 2^32/(2*i+1), except for i=0. */
const opus_uint32 SMALL_DIV_TABLE[129] = {
0xFFFFFFFF, 0x55555555, 0x33333333, 0x24924924,
0x1C71C71C, 0x1745D174, 0x13B13B13, 0x11111111,
0x0F0F0F0F, 0x0D79435E, 0x0C30C30C, 0x0B21642C,
0x0A3D70A3, 0x097B425E, 0x08D3DCB0, 0x08421084,
0x07C1F07C, 0x07507507, 0x06EB3E45, 0x06906906,
0x063E7063, 0x05F417D0, 0x05B05B05, 0x0572620A,
0x05397829, 0x05050505, 0x04D4873E, 0x04A7904A,
0x047DC11F, 0x0456C797, 0x04325C53, 0x04104104,
0x03F03F03, 0x03D22635, 0x03B5CC0E, 0x039B0AD1,
0x0381C0E0, 0x0369D036, 0x03531DEC, 0x033D91D2,
0x0329161F, 0x03159721, 0x03030303, 0x02F14990,
0x02E05C0B, 0x02D02D02, 0x02C0B02C, 0x02B1DA46,
0x02A3A0FD, 0x0295FAD4, 0x0288DF0C, 0x027C4597,
0x02702702, 0x02647C69, 0x02593F69, 0x024E6A17,
0x0243F6F0, 0x0239E0D5, 0x02302302, 0x0226B902,
0x021D9EAD, 0x0214D021, 0x020C49BA, 0x02040810,
0x01FC07F0, 0x01F44659, 0x01ECC07B, 0x01E573AC,
0x01DE5D6E, 0x01D77B65, 0x01D0CB58, 0x01CA4B30,
0x01C3F8F0, 0x01BDD2B8, 0x01B7D6C3, 0x01B20364,
0x01AC5701, 0x01A6D01A, 0x01A16D3F, 0x019C2D14,
0x01970E4F, 0x01920FB4, 0x018D3018, 0x01886E5F,
0x0183C977, 0x017F405F, 0x017AD220, 0x01767DCE,
0x01724287, 0x016E1F76, 0x016A13CD, 0x01661EC6,
0x01623FA7, 0x015E75BB, 0x015AC056, 0x01571ED3,
0x01539094, 0x01501501, 0x014CAB88, 0x0149539E,
0x01460CBC, 0x0142D662, 0x013FB013, 0x013C995A,
0x013991C2, 0x013698DF, 0x0133AE45, 0x0130D190,
0x012E025C, 0x012B404A, 0x01288B01, 0x0125E227,
0x01234567, 0x0120B470, 0x011E2EF3, 0x011BB4A4,
0x01194538, 0x0116E068, 0x011485F0, 0x0112358E,
0x010FEF01, 0x010DB20A, 0x010B7E6E, 0x010953F3,
0x01073260, 0x0105197F, 0x0103091B, 0x01010101
};
#endif
View
@@ -34,6 +34,12 @@
# include <stddef.h>
# include "ecintrin.h"
extern const opus_uint32 SMALL_DIV_TABLE[129];
#ifdef OPUS_ARM_ASM
#define USE_SMALL_DIV_TABLE
#endif
/*OPT: ec_window must be at least 32 bits, but if you have fast arithmetic on a
larger type, you can speed up the decoder by using it here.*/
typedef opus_uint32 ec_window;
@@ -114,4 +120,33 @@ static OPUS_INLINE int ec_tell(ec_ctx *_this){
rounding error is in the positive direction).*/
opus_uint32 ec_tell_frac(ec_ctx *_this);
/* Tested exhaustively for all n and for 1<=d<=256 */
static OPUS_INLINE opus_uint32 celt_udiv(opus_uint32 n, opus_uint32 d) {
celt_assert(d>0);
#ifdef USE_SMALL_DIV_TABLE
if (d>256)
return n/d;
else {
opus_uint32 t, q;
t = EC_ILOG(d&-d);
q = (opus_uint64)SMALL_DIV_TABLE[d>>t]*(n>>(t-1))>>32;
return q+(n-q*d >= d);
}
#else
return n/d;
#endif
}
static OPUS_INLINE opus_int32 celt_sudiv(opus_int32 n, opus_int32 d) {
celt_assert(d>0);
#ifdef USE_SMALL_DIV_TABLE
if (n<0)
return -(opus_int32)celt_udiv(-n, d);
else
return celt_udiv(n, d);
#else
return n/d;
#endif
}
#endif
View
@@ -138,7 +138,7 @@ void ec_dec_init(ec_dec *_this,unsigned char *_buf,opus_uint32 _storage){
unsigned ec_decode(ec_dec *_this,unsigned _ft){
unsigned s;
_this->ext=_this->rng/_ft;
_this->ext=celt_udiv(_this->rng,_ft);
s=(unsigned)(_this->val/_this->ext);
return _ft-EC_MINI(s+1,_ft);
}
View
@@ -98,7 +98,7 @@ static void ec_enc_carry_out(ec_enc *_this,int _c){
else _this->ext++;
}
static void ec_enc_normalize(ec_enc *_this){
static OPUS_INLINE void ec_enc_normalize(ec_enc *_this){
/*If the range is too small, output some bits and rescale it.*/
while(_this->rng<=EC_CODE_BOT){
ec_enc_carry_out(_this,(int)(_this->val>>EC_CODE_SHIFT));
@@ -127,7 +127,7 @@ void ec_enc_init(ec_enc *_this,unsigned char *_buf,opus_uint32 _size){
void ec_encode(ec_enc *_this,unsigned _fl,unsigned _fh,unsigned _ft){
opus_uint32 r;
r=_this->rng/_ft;
r=celt_udiv(_this->rng,_ft);
if(_fl>0){
_this->val+=_this->rng-IMUL32(r,(_ft-_fl));
_this->rng=IMUL32(r,(_fh-_fl));
View
@@ -59,6 +59,12 @@ extern opus_int64 celt_mips;
#define SHR(a,b) SHR32(a,b)
#define PSHR(a,b) PSHR32(a,b)
/** Add two 32-bit values, ignore any overflows */
#define ADD32_ovflw(a,b) (celt_mips+=2,(opus_val32)((opus_uint32)(a)+(opus_uint32)(b)))
/** Subtract two 32-bit values, ignore any overflows */
#define SUB32_ovflw(a,b) (celt_mips+=2,(opus_val32)((opus_uint32)(a)-(opus_uint32)(b)))
#define NEG32_ovflw(a) (celt_mips+=2,(opus_val32)(-(opus_uint32)(a)))
static OPUS_INLINE short NEG16(int x)
{
int res;
@@ -227,6 +233,8 @@ static OPUS_INLINE int SHL32_(opus_int64 a, int shift, char *file, int line)
#define VSHR32(a, shift) (((shift)>0) ? SHR32(a, shift) : SHL32(a, -(shift)))
#define ROUND16(x,a) (celt_mips--,EXTRACT16(PSHR32((x),(a))))
#define SROUND16(x,a) (celt_mips--,EXTRACT16(SATURATE(PSHR32(x,a), 32767)));
#define HALF16(x) (SHR16(x,1))
#define HALF32(x) (SHR32(x,1))
@@ -496,6 +504,7 @@ static OPUS_INLINE int MULT16_32_PX_(int a, opus_int64 b, int Q, char *file, int
#define MULT16_32_Q15(a,b) MULT16_32_QX(a,b,15)
#define MAC16_32_Q15(c,a,b) (celt_mips-=2,ADD32((c),MULT16_32_Q15((a),(b))))
#define MAC16_32_Q16(c,a,b) (celt_mips-=2,ADD32((c),MULT16_32_Q16((a),(b))))
static OPUS_INLINE int SATURATE(int a, int b)
{
@@ -767,6 +776,16 @@ static OPUS_INLINE int DIV32_(opus_int64 a, opus_int64 b, char *file, int line)
return res;
}
static OPUS_INLINE opus_val16 SIG2WORD16_generic(celt_sig x)
{
x = PSHR32(x, SIG_SHIFT);
x = MAX32(x, -32768);
x = MIN32(x, 32767);
return EXTRACT16(x);
}
#define SIG2WORD16(x) (SIG2WORD16_generic(x))
#undef PRINT_MIPS
#define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", celt_mips);} while (0);
Oops, something went wrong.