Skip to content

Commit

Permalink
CMSIS-NN: Remove non-TFLM compatible functions
Browse files Browse the repository at this point in the history
Non TFLM compatible functions are removed.
  • Loading branch information
felix-johnny committed Sep 30, 2022
1 parent cc3e92d commit 011bf32
Show file tree
Hide file tree
Showing 42 changed files with 30 additions and 8,337 deletions.
39 changes: 12 additions & 27 deletions Include/arm_nn_tables.h
Original file line number Diff line number Diff line change
@@ -1,15 +1,5 @@
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_tables.h
* Description: Extern declaration for NN tables
*
* $Date: 17. August 2021
* $Revision: V.1.0.2
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
* SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
*
* SPDX-License-Identifier: Apache-2.0
*
Expand All @@ -26,6 +16,17 @@
* limitations under the License.
*/

/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_tables.h
* Description: Extern declaration for NN tables
*
* $Date: 30. September 2022
* $Revision: V.2.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */

#ifndef _ARM_NN_TABLES_H
#define _ARM_NN_TABLES_H

Expand All @@ -36,21 +37,5 @@
*
*/

extern const q15_t sigmoidTable_q15[256];
extern const q7_t sigmoidTable_q7[256];

extern const q7_t tanhTable_q7[256];
extern const q15_t tanhTable_q15[256];

/**
* @brief 2-way tables for various activation functions
*
* 2-way table: the H table is used for values larger than 1/4 and
* the L table for values smaller than 1/4; the H table also covers the remaining values.
* This exists only for the q15_t version; it does not make
* sense to have it for the q7_t type.
*/
extern const q15_t sigmoidHTable_q15[192];
extern const q15_t sigmoidLTable_q15[128];

#endif /* _ARM_NN_TABLES_H */
2 changes: 1 addition & 1 deletion Include/arm_nn_types.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2022 Arm Limited or its affiliates. All rights reserved.
* SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
*
* SPDX-License-Identifier: Apache-2.0
*
Expand Down
901 changes: 7 additions & 894 deletions Include/arm_nnfunctions.h

Large diffs are not rendered by default.

103 changes: 2 additions & 101 deletions Include/arm_nnsupportfunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
* $Date: 8 August 2022
* $Revision: V.10.0.0
* $Date: 30 September 2022
* $Revision: V.11.0.0
*
* Target Processor: Cortex-M CPUs
* -------------------------------------------------------------------- */
Expand Down Expand Up @@ -105,42 +105,6 @@ union arm_nn_long_long
*
*/

/**
* @brief Converts the elements of the q7 vector to q15 vector without left-shift
* @param[in] *pSrc points to the q7 input vector
* @param[out] *pDst points to the q15 output vector
* @param[in] blockSize length of the input vector
*
*/
void arm_q7_to_q15_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize);

/**
* @brief Non-saturating addition of elements of a q7 vector
* @param[in] *input Pointer to the q7 input vector
* @param[out] *output Pointer to the q31 output variable.
* @param[in] block_size length of the input vector
* \par Description:
*
* 2^24 samples can be added without saturating the result.
*
* The equation used for the conversion process is:
*
* <pre>
* sum = input[0] + input[1] + .. + input[block_size -1]
* </pre>
*
* */
void arm_nn_add_q7(const q7_t *input, q31_t *output, uint32_t block_size);

/**
* @brief Converts the elements of the q7 vector to reordered q15 vector without left-shift
* @param[in] *pSrc points to the q7 input vector
* @param[out] *pDst points to the q15 output vector
* @param[in] blockSize length of the input vector
*
*/
void arm_q7_to_q15_reordered_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize);

/**
* @brief Converts the elements from a q7 vector to a q15 vector with an added offset
* @param[in] src pointer to the q7 input vector
Expand All @@ -159,37 +123,6 @@ void arm_q7_to_q15_reordered_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t bl
*/
void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset);

/**
* @brief Converts the elements of the q7 vector to reordered q15 vector with an added offset
* @param[in] src pointer to the q7 input vector
* @param[out] dst pointer to the q15 output vector
* @param[in] block_size length of the input vector
* @param[in] offset offset to be added to each input vector element.
*
* @details This function does the q7 to q15 expansion with re-ordering of bytes. Re-ordering is a consequence of
* the sign extension intrinsic(DSP extension). The tail (i.e., last (N % 4) elements) retains its
* original order.
*
*/
void arm_q7_to_q15_reordered_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset);

/**
* @brief Converts the elements from a q7 vector and accumulates them into a q15 vector
* @param[in] *src points to the q7 input vector
* @param[out] *dst points to the q15 output vector
* @param[in] block_size length of the input vector
*
* \par Description:
*
* The equation used for the conversion process is:
*
* <pre>
* dst[n] += (q15_t) src[n] ; 0 <= n < block_size.
* </pre>
*
*/
void arm_nn_accumulate_q7_to_q15(q15_t *dst, const q7_t *src, uint32_t block_size);

/**
* @brief Depthwise conv on an im2col buffer where the input channel equals output channel.
* @param[in] row pointer to row
Expand Down Expand Up @@ -803,38 +736,6 @@ read_and_pad_reordered_with_offset(const q7_t *source, q31_t *out1, q31_t *out2,
*
*/

/**
* @brief q15 vector multiplication with variable output shifts
* @param[in] *pSrcA pointer to the first input vector
* @param[in] *pSrcB pointer to the second input vector
* @param[out] *pDst pointer to the output vector
* @param[in] out_shift amount of right-shift for output
* @param[in] blockSize number of samples in each vector
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable q15 range [0x8000 0x7FFF] will be saturated.
*/

void arm_nn_mult_q15(q15_t *pSrcA, q15_t *pSrcB, q15_t *pDst, const uint16_t out_shift, uint32_t blockSize);

/**
* @brief q7 vector multiplication with variable output shifts
* @param[in] *pSrcA pointer to the first input vector
* @param[in] *pSrcB pointer to the second input vector
* @param[out] *pDst pointer to the output vector
* @param[in] out_shift amount of right-shift for output
* @param[in] blockSize number of samples in each vector
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable q7 range [0x80 0x7F] will be saturated.
*/

void arm_nn_mult_q7(q7_t *pSrcA, q7_t *pSrcB, q7_t *pDst, const uint16_t out_shift, uint32_t blockSize);

/**
* @brief Matrix-multiplication function for convolution with per-channel requantization.
* @param[in] input_a pointer to operand A
Expand Down
100 changes: 2 additions & 98 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,103 +2,7 @@
CMSIS NN software library is a collection of efficient neural network kernels developed to maximize the
performance and minimize the memory footprint of neural networks on Cortex-M processors.
## About
This page gives a quick overview of the functions available and key differences between them.
Work In Progress to move CMSIS-NN repository from https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN to
https://github.com/ARM-software/CMSIS-NN. https://github.com/ARM-software/CMSIS_5/issues/1564

**Note:** The GitHub documentation does not follow the *develop* branch but rather the last official release in the *master* branch. Consequently, the group documentation linked to in the table might not have the listed API. Please refer to the description in the [header](https://github.com/ARM-software/CMSIS_5/blob/develop/CMSIS/NN/Include/arm_nnfunctions.h) file instead.

## Support / Contact
For any questions or to reach the CMSIS-NN team, please create a new issue in https://github.com/ARM-software/CMSIS_5/issues
## Supported Framework
[TensorFlow Lite for Microcontrollers](https://www.tensorflow.org/lite/microcontrollers)
## Legacy vs TFL micro compliant APIs
There are two kinds of APIs available in the CMSIS-NN repository: one that supports a legacy symmetric quantization scheme[1] and one that supports TFL micro's symmetric quantization scheme. One of the main differences is how the quantization is performed. The legacy APIs have a fixed point format with power of 2 scaling. This simplifies the re-quantization to a cycle efficient shift operation. No new development is done on the legacy functions and all of the new development is on the functions that support TFL micro. The table below highlights some of the differences between the two formats for convolution related functions. The TFL micro compliant APIs in most cases have a _s8 suffix, and this is always specified in the API header file.

Operation | Legacy APIs | TFL micro compliant APIs|
|:-----------|:---------------------|:-------------|
Core loop | No input or filter offset | Input and/or filter offset |
Re-quantization | Shift and saturate in one instruction. ~ 5 cycles | Greater than 200 cycles for one output element
Quantization | Per layer quantization | Per-channel quantization
Output offset | No | Per-layer output offset
Fused Activation | No | Yes

## TFL micro compliant APIs
Group | API | Base Operator | Input Constraints | Additional memory required for <br/> optimizations (bytes) | DSP Optimized | MVE Optimized | Other comments |
|:----| :---| :------------ | :---------------- | :--------------------------------------------------------| :-------------| :------------- | :------------- |
|[Conv](https://arm-software.github.io/CMSIS_5/NN/html/group__NNConv.html)||||| | ||
||arm_convolve_wrapper_s8()|CONV| None |n.a.| Yes | Yes |The additional memory required depends on the optimal convolution function called.|
||arm_convolve_s8()|CONV| None |4 * (ker_x * ker_y * input_ch + delta)| Yes | Yes |delta - MVE only|
||arm_convolve_1x1_s8_fast() | CONV | dilation = 1 <br/> ker_x = 1, ker_y = 1 <br/> pad = 0<br/> stride = 1<br/> input_ch % 4 = 0| No | Yes |Yes ||
||arm_convolve_1_x_n_s8() | CONV | dilation = 1 <br/> output_y % 4 = 0 | Yes. Refer to API for details |Yes |Yes|Not all implementations require additional memory|
||arm_depthwise_conv_wrapper_s8()| DEPTHWISE_CONV | None |n.a.| Yes| Yes| The additional memory required depends on the optimal convolution function called|
||arm_depthwise_conv_3x3_s8() | DEPTHWISE_CONV | dilation = 1 <br/> depth_multiplier = 1 <br/> pad_x <= 1 | No|No|No| Preferred function for 3x3 kernel size for DSP extension. <br/> For MVE, use arm_depthwise_conv_s8_opt()|
||arm_depthwise_conv_s8() | DEPTHWISE_CONV | None | No|No|No||
||arm_depthwise_conv_s8_opt()| DEPTHWISE_CONV | dilation = 1 <br/> depth_multiplier = 1 | DSP: 2 * ker_x * ker_y * input_ch <br/> MVE: 2 * DSP + 4 | Yes| Yes| Best case is when channels are multiple of 4 or <br/>at the least >= 4 |
||arm_convolve_wrapper_s16()|CONV|None|n.a.| Yes | No | The additional memory required depends on the optimal convolution function called |
||arm_convolve_s16()|CONV|None|No| No | No ||
||arm_convolve_fast_s16()|CONV|dilation = 1, <br/> ker_x * ker_y * input_ch < 512 <br/> |4 * ker_x * ker_y * input_ch| Yes | Yes ||
||arm_depthwise_conv_wrapper_s16() | DEPTHWISE_CONV | None | n.a. | Yes | Yes | The additional memory required depends on the optimal convolution function called |
||arm_depthwise_conv_s16() | DEPTHWISE_CONV | None | No | Yes ||
||arm_depthwise_conv_fast_s16() | DEPTHWISE_CONV | Yes | Yes. Refer to API for details | Yes | Yes ||
|[Fully Connected](https://arm-software.github.io/CMSIS_5/NN/html/group__FC.html)||||| | | |
||arm_fully_connected_s8() |FULLY CONNECTED & <br/> MAT MUL | None | No | Yes | Yes | |
||arm_fully_connected_s16() |FULLY CONNECTED & <br/> MAT MUL | None | No | Yes | Yes | |
|[Pooling](https://arm-software.github.io/CMSIS_5/NN/html/group__Pooling.html)||||| | ||
|| arm_avgpool_s8() | AVERAGE POOL | None | input_ch * 4<br/>(DSP only) | Yes| Yes| Best case is when channels are multiple of 4 or <br/> at the least >= 4 |
|| arm_avgpool_s16() | AVERAGE POOL | None | input_ch * 4<br/>(DSP only) | Yes| Yes| Best case is when channels are multiple of 4 or <br/> at the least >= 4 |
|| arm_maxpool_s8() | MAX POOL | None | None | Yes| Yes| |
|| arm_maxpool_s16() | MAX POOL | None | None | No| Yes| |
|[Softmax](https://arm-software.github.io/CMSIS_5/NN/html/group__Softmax.html)||||| | ||
||arm_softmax_q7()| SOFTMAX | None | None | Yes | No | Not bit exact to TFLu but can be up to 70x faster |
||arm_softmax_s8()| SOFTMAX | None | None | No | Yes | Bit exact to TFLu |
||arm_softmax_s8_s16()| SOFTMAX | None | None | No | No | Bit exact to TFLu |
||arm_softmax_s16()| SOFTMAX | None | None | No | No | Bit exact to TFLu |
||arm_softmax_u8()| SOFTMAX | None | None | No | No | Bit exact to TFLu |
|[SVDF](https://arm-software.github.io/CMSIS_5/NN/html/group__SVDF.html)||||| | ||
||arm_svdf_s8()| SVDF | None | None | Yes | Yes | Bit exact to TFLu |
||arm_svdf_state_s16_s8()| SVDF | None | None | Yes | Yes | Bit exact to TFLu |
|[Misc](https://arm-software.github.io/CMSIS_5/NN/html/group__groupNN.html)||||| | ||
||arm_reshape_s8()| RESHAPE | None | None | No | No | |
||arm_elementwise_add_s8()| ELEMENTWISE ADD | None | None | Yes| Yes| Reshape is not done in this function <br/> Only minor improvements are expected |
||arm_elementwise_add_s16()| ELEMENTWISE ADD | None | None | Yes| No| Reshape is not done in this function <br/> Only minor improvements are expected |
||arm_elementwise_mul_s8()| ELEMENTWISE MUL | None | None | Yes| Yes| Reshape is not done in this function <br/> Only minor improvements are expected |
||arm_elementwise_mul_s16()| ELEMENTWISE MUL | None | None | Yes| No| Reshape is not done in this function <br/> Only minor improvements are expected |
||arm_relu_q7() | RELU | None | None | Yes| No|
||arm_relu6_s8() | RELU | None | None | Yes| No|
|[Concat](https://arm-software.github.io/CMSIS_5/NN/html/group__groupNN.html)||||| | ||
||arm_concatenation_s8_w() | CONCAT | None | None | No| No||
||arm_concatenation_s8_x() | CONCAT | None | None | No| No||
||arm_concatenation_s8_y() | CONCAT | None | None | No| No||
||arm_concatenation_s8_z() | CONCAT | None | None | No| No||


## Building CMSIS-NN as a library
It is recommended to use toolchain files from [Arm Ethos-U Core Platform](https://review.mlplatform.org/admin/repos/ml/ethos-u/ethos-u-core-platform) project. These toolchain files support TARGET_CPU, which is a required argument. Note that if TARGET_CPU is not specified, these toolchains will set a default. The format must be TARGET_CPU=cortex-mXX, see examples below.
Clone Arm Ethos-U Core Platform project and build, for example:

```
cd </path/to/CMSIS_5>/CMSIS/NN
mkdir build
cd build
cmake .. -DCMAKE_TOOLCHAIN_FILE=</path/to/ethos-u-core-platform>/cmake/toolchain/arm-none-eabi-gcc.cmake -DTARGET_CPU=cortex-m55
make
```

Some more examples, assuming Ethos-u-core-platform is cloned into your home directory:

```
cmake .. -DCMAKE_TOOLCHAIN_FILE=~/ethos-u-core-platform/cmake/toolchain/arm-none-eabi-gcc.cmake -DTARGET_CPU=cortex-m55
cmake .. -DCMAKE_TOOLCHAIN_FILE=~/ethos-u-core-platform/cmake/toolchain/arm-none-eabi-gcc.cmake -DTARGET_CPU=cortex-m7
cmake .. -DCMAKE_TOOLCHAIN_FILE=~/ethos-u-core-platform/cmake/toolchain/armclang.cmake -DTARGET_CPU=cortex-m3
```

### Compiler options
Default optimization level is Ofast. Please change according to project needs. Just bear in mind it will impact performance.
With only optimization level -O0, ARM_MATH_AUTOVECTORIZE needs to be defined.

The compiler option '-fomit-frame-pointer' is enabled by default at -O and higher. With no optimization level you may need to specify '-fomit-frame-pointer' as a minimum.

The compiler option '-fno-builtin' does not utilize optimized implementations of e.g. memcpy and memset, which are heavily used by CMSIS-NN. It can significantly downgrade performance. So this should be avoided.
The compiler option '-ffreestanding' should also be avoided as it enables '-fno-builtin' implicitly.

## Reference
[1] Legacy CMSIS-NN and how to use it https://developer.arm.com/solutions/machine-learning-on-arm/developer-material/how-to-guides/converting-a-neural-network-for-arm-cortex-m-with-cmsis-nn/single-page
Loading

0 comments on commit 011bf32

Please sign in to comment.