CMSIS-NN: Remove non-TFLM compatible functions

Non TFLM compatible functions are removed.
ARM-software · Sep 30, 2022 · 011bf32 · 011bf32
1 parent cc3e92d
commit 011bf32
Show file tree

Hide file tree

Showing 42 changed files with 30 additions and 8,337 deletions.
diff --git a/Include/arm_nn_tables.h b/Include/arm_nn_tables.h
@@ -1,15 +1,5 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS NN Library
- * Title:        arm_nn_tables.h
- * Description:  Extern declaration for NN tables
- *
- * $Date:        17. August 2021
- * $Revision:    V.1.0.2
- *
- * Target Processor:  Cortex-M cores
- * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,6 +16,17 @@
  * limitations under the License.
  */
 
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_nn_tables.h
+ * Description:  Extern declaration for NN tables
+ *
+ * $Date:        30. September 2022
+ * $Revision:    V.2.0.0
+ *
+ * Target Processor:  Cortex-M cores
+ * -------------------------------------------------------------------- */
+
 #ifndef _ARM_NN_TABLES_H
 #define _ARM_NN_TABLES_H
 
@@ -36,21 +37,5 @@
  *
  */
 
-extern const q15_t sigmoidTable_q15[256];
-extern const q7_t sigmoidTable_q7[256];
-
-extern const q7_t tanhTable_q7[256];
-extern const q15_t tanhTable_q15[256];
-
-/**
- * @brief 2-way tables for various activation functions
- *
- * 2-way table, H table for value larger than 1/4
- * L table for value smaller than 1/4, H table for remaining
- * We have this only for the q15_t version. It does not make
- * sense to have it for q7_t type
- */
-extern const q15_t sigmoidHTable_q15[192];
-extern const q15_t sigmoidLTable_q15[128];
 
 #endif /*  ARM_NN_TABLES_H */
diff --git a/Include/arm_nn_types.h b/Include/arm_nn_types.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2022 Arm Limited or its affiliates. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *

diff --git a/Include/arm_nnfunctions.h b/Include/arm_nnfunctions.h
diff --git a/Include/arm_nnsupportfunctions.h b/Include/arm_nnsupportfunctions.h
@@ -21,8 +21,8 @@
  * Title:        arm_nnsupportfunctions.h
  * Description:  Public header file of support functions for CMSIS NN Library
  *
- * $Date:        8 August 2022
- * $Revision:    V.10.0.0
+ * $Date:        30 September 2022
+ * $Revision:    V.11.0.0
  *
  * Target Processor:  Cortex-M CPUs
  * -------------------------------------------------------------------- */
@@ -105,42 +105,6 @@ union arm_nn_long_long
  *
  */
 
-/**
- * @brief Converts the elements of the q7 vector to q15 vector without left-shift
- * @param[in]       *pSrc points to the q7 input vector
- * @param[out]      *pDst points to the q15 output vector
- * @param[in]       blockSize length of the input vector
- *
- */
-void arm_q7_to_q15_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize);
-
-/**
- * @brief Non-saturating addition of elements of a q7 vector
- * @param[in]       *input Pointer to the q7 input vector
- * @param[out]      *output Pointer to the q31 output variable.
- * @param[in]       block_size length of the input vector
- * \par Description:
- *
- * 2^24 samples can be added without saturating the result.
- *
- * The equation used for the conversion process is:
- *
- * <pre>
- *  sum = input[0] + input[1] + .. + input[block_size -1]
- * </pre>
- *
- * */
-void arm_nn_add_q7(const q7_t *input, q31_t *output, uint32_t block_size);
-
-/**
- * @brief  Converts the elements of the q7 vector to reordered q15 vector without left-shift
- * @param[in]       *pSrc points to the q7 input vector
- * @param[out]      *pDst points to the q15 output vector
- * @param[in]       blockSize length of the input vector
- *
- */
-void arm_q7_to_q15_reordered_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize);
-
 /**
  * @brief Converts the elements from a q7 vector to a q15 vector with an added offset
  * @param[in]    src        pointer to the q7 input vector
@@ -159,37 +123,6 @@ void arm_q7_to_q15_reordered_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t bl
  */
 void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset);
 
-/**
- * @brief Converts the elements of the q7 vector to reordered q15 vector with an added offset
- * @param[in]       src        pointer to the q7 input vector
- * @param[out]      dst        pointer to the q15 output vector
- * @param[in]       block_size length of the input vector
- * @param[in]       offset     offset to be added to each input vector element.
- *
- * @details  This function does the q7 to q15 expansion with re-ordering of bytes. Re-ordering is a consequence of
- *           the sign extension intrinsic(DSP extension). The tail (i.e., last (N % 4) elements) retains its
- * original order.
- *
- */
-void arm_q7_to_q15_reordered_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset);
-
-/**
- * @brief Converts the elements from a q7 vector and accumulate to a q15 vector
- * @param[in]    *src       points to the q7 input vector
- * @param[out]   *dst       points to the q15 output vector
- * @param[in]    block_size length of the input vector
- *
- * \par Description:
- *
- * The equation used for the conversion process is:
- *
- * <pre>
- *  dst[n] += (q15_t) src[n] ;   0 <= n < block_size.
- * </pre>
- *
- */
-void arm_nn_accumulate_q7_to_q15(q15_t *dst, const q7_t *src, uint32_t block_size);
-
 /**
  * @brief Depthwise conv on an im2col buffer where the input channel equals output channel.
  * @param[in]    row     pointer to row
@@ -803,38 +736,6 @@ read_and_pad_reordered_with_offset(const q7_t *source, q31_t *out1, q31_t *out2,
  *
  */
 
-/**
- * @brief           q15 vector multiplication with variable output shifts
- * @param[in]       *pSrcA        pointer to the first input vector
- * @param[in]       *pSrcB        pointer to the second input vector
- * @param[out]      *pDst         pointer to the output vector
- * @param[in]       out_shift     amount of right-shift for output
- * @param[in]       blockSize     number of samples in each vector
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable q15 range [0x8000 0x7FFF] will be saturated.
- */
-
-void arm_nn_mult_q15(q15_t *pSrcA, q15_t *pSrcB, q15_t *pDst, const uint16_t out_shift, uint32_t blockSize);
-
-/**
- * @brief           q7 vector multiplication with variable output shifts
- * @param[in]       *pSrcA        pointer to the first input vector
- * @param[in]       *pSrcB        pointer to the second input vector
- * @param[out]      *pDst         pointer to the output vector
- * @param[in]       out_shift     amount of right-shift for output
- * @param[in]       blockSize     number of samples in each vector
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable q7 range [0x80 0x7F] will be saturated.
- */
-
-void arm_nn_mult_q7(q7_t *pSrcA, q7_t *pSrcB, q7_t *pDst, const uint16_t out_shift, uint32_t blockSize);
-
 /**
  * @brief Matrix-multiplication function for convolution with per-channel requantization.
  * @param[in]       input_a     pointer to operand A

diff --git a/README.md b/README.md
@@ -2,103 +2,7 @@
 CMSIS NN software library is a collection of efficient neural network kernels developed to maximize the
 performance and minimize the memory footprint of neural networks on Cortex-M processors.
 ## About
-This page gives a quick overview of the available functions and the key differences between them.
+Work in progress: the CMSIS-NN repository is being moved from https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN to
+https://github.com/ARM-software/CMSIS-NN. Related issue: https://github.com/ARM-software/CMSIS_5/issues/1564
 
-**Note:** The GitHub documentation does not follow the *develop* branch but rather the last official release in the *master* branch. Consequently, the group documentation linked to in the table might not have the listed API. Please refer to the description in the [header](https://github.com/ARM-software/CMSIS_5/blob/develop/CMSIS/NN/Include/arm_nnfunctions.h) file instead.
 
-## Support / Contact
-For any questions or to reach the CMSIS-NN team, please create a new issue in https://github.com/ARM-software/CMSIS_5/issues
-## Supported Framework
-[TensorFlow Lite for Microcontrollers](https://www.tensorflow.org/lite/microcontrollers)
-## Legacy vs TFL micro compliant APIs
-There are two kinds of APIs available in the CMSIS-NN repository; One that supports a legacy symmetric quantization scheme[1] and one that supports TFL micro's symmetric quantization scheme. One of the main differences is how the quantization is performed. The legacy APIs have a fixed point format with power of 2 scaling. This simplifies the re-quantization to a cycle efficient shift operation. No new development is done on the legacy functions and all of the new development is on the functions that support TFL micro. The table below highlights some of the differences between the two formats for convolution related functions. The TFL micro compliant APIs in most cases have a _s8 suffix and is always specified in the API header file.
-
-Operation | Legacy APIs | TFL micro compliant APIs|
-|:-----------|:---------------------|:-------------|
-Core loop | No input or filter offset | Input and/or filter offset |
-Re-quantization | Shift and saturate in one instruction. ~ 5 cycles | Greater than 200 cycles for one output element
-Quantization | Per layer quantization | Per-channel quantization
-Output offset | No | Per-layer output offset
-Fused Activation | No | Yes
-
-## TFL micro compliant APIs
-Group | API | Base Operator | Input Constraints | Additional memory required for <br/> optimizations (bytes) | DSP Optimized |  MVE Optimized | Other comments |
-|:----| :---| :------------ | :---------------- | :--------------------------------------------------------| :-------------| :------------- | :------------- |
-|[Conv](https://arm-software.github.io/CMSIS_5/NN/html/group__NNConv.html)||||| |  ||
-||arm_convolve_wrapper_s8()|CONV| None |n.a.| Yes | Yes |The additional memory required depends on the optimal convolution function called.|
-||arm_convolve_s8()|CONV| None |4 * (ker_x * ker_y * input_ch + delta)| Yes | Yes |delta - MVE only|
-||arm_convolve_1x1_s8_fast() | CONV | dilation = 1 <br/> ker_x = 1, ker_y = 1 <br/> pad = 0<br/> stride = 1<br/> input_ch % 4 = 0| No | Yes |Yes ||
-||arm_convolve_1_x_n_s8() | CONV | dilation = 1 <br/> output_y % 4 = 0 | Yes. Refer to API for details |Yes |Yes|Not all implementations require additional memory|
-||arm_depthwise_conv_wrapper_s8()| DEPTHWISE_CONV | None |n.a.| Yes| Yes| The additional memory required depends on the optimal convolution function called|
-||arm_depthwise_conv_3x3_s8() | DEPTHWISE_CONV | dilation = 1 <br/> depth_multiplier = 1 <br/> pad_x <= 1 | No|No|No| Preferred function for 3x3 kernel size for DSP extension. <br/> For MVE, use arm_depthwise_conv_s8_opt()||
-||arm_depthwise_conv_s8() | DEPTHWISE_CONV | None | No|No|No||
-||arm_depthwise_conv_s8_opt()| DEPTHWISE_CONV | dilation = 1 <br/> depth_multiplier = 1 | DSP: 2 * ker_x * ker_y * input_ch <br/> MVE: 2 * DSP + 4 | Yes| Yes| Best case is when channels are multiple of 4 or <br/>at the least >= 4 |
-||arm_convolve_wrapper_s16()|CONV|None|n.a.| Yes | No | The additional memory required depends on the optimal convolution function called |
-||arm_convolve_s16()|CONV|None|No| No | No ||
-||arm_convolve_fast_s16()|CONV|dilation = 1, <br/> ker_x * ker_y * input_ch < 512 <br/> |4 * ker_x * ker_y * input_ch| Yes | Yes ||
-||arm_depthwise_conv_wrapper_s16() | DEPTHWISE_CONV | None | n.a. | Yes | Yes | The additional memory required depends on the optimal convolution function called |
-||arm_depthwise_conv_s16() | DEPTHWISE_CONV | None | No | Yes ||
-||arm_depthwise_conv_fast_s16() | DEPTHWISE_CONV | Yes | Yes. Refer to API for details | Yes | Yes ||
-|[Fully Connected](https://arm-software.github.io/CMSIS_5/NN/html/group__FC.html)||||| |  | |
-||arm_fully_connected_s8() |FULLY CONNECTED & <br/> MAT MUL  | None | No | Yes | Yes | |
-||arm_fully_connected_s16() |FULLY CONNECTED & <br/> MAT MUL  | None | No | Yes | Yes | |
-|[Pooling](https://arm-software.github.io/CMSIS_5/NN/html/group__Pooling.html)||||| |  ||
-|| arm_avgpool_s8() | AVERAGE POOL | None | input_ch * 4<br/>(DSP only) | Yes| Yes| Best case is when channels are multiple of 4 or <br/> at the least >= 4 |
-|| arm_avgpool_s16() | AVERAGE POOL | None | input_ch * 4<br/>(DSP only) | Yes| Yes| Best case is when channels are multiple of 4 or <br/> at the least >= 4 |
-|| arm_maxpool_s8() | MAX POOL | None | None | Yes| Yes|  |
-|| arm_maxpool_s16() | MAX POOL | None | None | No| Yes|  |
-|[Softmax](https://arm-software.github.io/CMSIS_5/NN/html/group__Softmax.html)||||| |  ||
-||arm_softmax_q7()| SOFTMAX | None | None | Yes | No | Not bit exact to TFLu but can be up to 70x faster |
-||arm_softmax_s8()| SOFTMAX | None | None | No | Yes | Bit exact to TFLu |
-||arm_softmax_s8_s16()| SOFTMAX | None | None | No | No | Bit exact to TFLu |
-||arm_softmax_s16()| SOFTMAX | None | None | No | No | Bit exact to TFLu |
-||arm_softmax_u8()| SOFTMAX | None | None | No | No | Bit exact to TFLu |
-|[SVDF](https://arm-software.github.io/CMSIS_5/NN/html/group__SVDF.html)||||| |  ||
-||arm_svdf_s8()| SVDF | None | None | Yes | Yes | Bit exact to TFLu |
-||arm_svdf_state_s16_s8()| SVDF | None | None | Yes | Yes | Bit exact to TFLu |
-|[Misc](https://arm-software.github.io/CMSIS_5/NN/html/group__groupNN.html)||||| |  ||
-||arm_reshape_s8()| RESHAPE | None | None | No | No | |
-||arm_elementwise_add_s8()| ELEMENTWISE ADD | None | None | Yes| Yes| Reshape is not done in this function <br/> Only minor improvements are expected |
-||arm_elementwise_add_s16()| ELEMENTWISE ADD | None | None | Yes| No| Reshape is not done in this function <br/> Only minor improvements are expected |
-||arm_elementwise_mul_s8()| ELEMENTWISE MUL | None | None | Yes| Yes| Reshape is not done in this function <br/> Only minor improvements are expected |
-||arm_elementwise_mul_s16()| ELEMENTWISE MUL | None | None | Yes| No| Reshape is not done in this function <br/> Only minor improvements are expected |
-||arm_relu_q7() | RELU | None | None | Yes| No|
-||arm_relu6_s8() | RELU | None | None | Yes| No|
-|[Concat](https://arm-software.github.io/CMSIS_5/NN/html/group__groupNN.html)||||| |  ||
-||arm_concatenation_s8_w() | CONCAT | None | None | No| No||
-||arm_concatenation_s8_x() | CONCAT | None | None | No| No||
-||arm_concatenation_s8_y() | CONCAT | None | None | No| No||
-||arm_concatenation_s8_z() | CONCAT | None | None | No| No||
-
-
-## Building CMSIS-NN as a library
-It is recommended to use the toolchain files from the [Arm Ethos-U Core Platform](https://review.mlplatform.org/admin/repos/ml/ethos-u/ethos-u-core-platform) project. These support TARGET_CPU, which is a required argument. Note that if TARGET_CPU is not specified, these toolchains will set a default. The format must be TARGET_CPU=cortex-mXX; see the examples below.
-Clone Arm Ethos-U Core Platform project and build, for example:
-
-```
-cd </path/to/CMSIS_5>/CMSIS/NN
-mkdir build
-cd build
-cmake .. -DCMAKE_TOOLCHAIN_FILE=</path/to/ethos-u-core-platform>/cmake/toolchain/arm-none-eabi-gcc.cmake -DTARGET_CPU=cortex-m55
-make
-```
-
-Some more examples, assuming Ethos-u-core-platform is cloned into your home directory:
-
-```
-cmake .. -DCMAKE_TOOLCHAIN_FILE=~/ethos-u-core-platform/cmake/toolchain/arm-none-eabi-gcc.cmake -DTARGET_CPU=cortex-m55
-cmake .. -DCMAKE_TOOLCHAIN_FILE=~/ethos-u-core-platform/cmake/toolchain/arm-none-eabi-gcc.cmake -DTARGET_CPU=cortex-m7
-cmake .. -DCMAKE_TOOLCHAIN_FILE=~/ethos-u-core-platform/cmake/toolchain/armclang.cmake -DTARGET_CPU=cortex-m3
-```
-
-### Compiler options
-Default optimization level is Ofast. Please change according to project needs. Just bear in mind it will impact performance.
-With only optimization level -O0, ARM_MATH_AUTOVECTORIZE needs to be defined.
-
-The compiler option '-fomit-frame-pointer' is enabled by default at -O and higher. With no optimization level you may need to specify '-fomit-frame-pointer' as a minimum.
-
-The compiler option '-fno-builtin' does not utilize optimized implementations of e.g. memcpy and memset, which are heavily used by CMSIS-NN. It can significantly downgrade performance. So this should be avoided.
-The compiler option '-ffreestanding' should also be avoided as it enables '-fno-builtin' implicitly.
-
-## Reference
-[1] Legacy CMSIS-NN and how to use it https://developer.arm.com/solutions/machine-learning-on-arm/developer-material/how-to-guides/converting-a-neural-network-for-arm-cortex-m-with-cmsis-nn/single-page