diff --git a/assets/contributors.csv b/assets/contributors.csv index ac5348825b..d50bc49455 100644 --- a/assets/contributors.csv +++ b/assets/contributors.csv @@ -46,3 +46,4 @@ Alaaeddine Chakroun,Day Devs,Alaaeddine-Chakroun,alaaeddine-chakroun,,https://da Koki Mitsunami,Arm,,,, Chen Zhang,Zilliz,,,, Tianyu Li,Arm,,,, +Georgios Mermigkis,VectorCamp,gMerm,georgios-mermigkis,,https://vectorcamp.gr/ diff --git a/content/learning-paths/cross-platform/simd-info-demo/_index.md b/content/learning-paths/cross-platform/simd-info-demo/_index.md new file mode 100644 index 0000000000..2f4aad28a7 --- /dev/null +++ b/content/learning-paths/cross-platform/simd-info-demo/_index.md @@ -0,0 +1,42 @@ +--- +title: Introduction to SIMD.info + +minutes_to_complete: 30 + +who_is_this_for: This is for software developers interested in porting SIMD code across platforms. + +learning_objectives: + - Learn how to use SIMD.info’s tools and features, such as navigation, search, and comparison, to simplify the process of finding equivalent SIMD intrinsics between architectures and improving code portability. + +prerequisites: + - A basic understanding of SIMD. + - Access to an Arm platform with SIMD supported engine, with recent versions of a C compiler (Clang or GCC) installed. 
+ +author_primary: Georgios Mermigkis & Konstantinos Margaritis, VectorCamp + +### Tags +skilllevels: Advanced +subjects: Performance and Architecture +armips: + - Aarch64 + - Armv8-a + - Armv9-a +tools_software_languages: + - GCC + - Clang + - Coding + - Rust +operatingsystems: + - Linux +shared_path: true +shared_between: + - laptops-and-desktops + - servers-and-cloud-computing + - smartphones-and-mobile + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/cross-platform/simd-info-demo/_next-steps.md b/content/learning-paths/cross-platform/simd-info-demo/_next-steps.md new file mode 100644 index 0000000000..320c29c6e1 --- /dev/null +++ b/content/learning-paths/cross-platform/simd-info-demo/_next-steps.md @@ -0,0 +1,19 @@ +--- +next_step_guidance: You should explore **SIMD.info** more and find out porting opportunities between different SIMD engines. 
+ +recommended_path: /learning-paths/cross-platform/ + +further_reading: + - resource: + title: SIMD.info + link: https://simd.info + type: website + + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +weight: 21 # set to always be larger than the content in this path, and one more than 'review' +title: "Next Steps" # Always the same +layout: "learningpathall" # All files under learning paths have this same wrapper +--- diff --git a/content/learning-paths/cross-platform/simd-info-demo/_review.md b/content/learning-paths/cross-platform/simd-info-demo/_review.md new file mode 100644 index 0000000000..cc6a2a64d0 --- /dev/null +++ b/content/learning-paths/cross-platform/simd-info-demo/_review.md @@ -0,0 +1,44 @@ +--- +review: + - questions: + question: > + What is SIMD.info? + answers: + - An online resource for SIMD C intrinsics for all major architectures + - It's an online forum for SIMD developers + - A book about SIMD programming + correct_answer: 1 + explanation: > + While it allows comments in the SIMD intrinsics, SIMD.info is not really a forum. It is an online **free** resource to assist developers porting C code between popular architectures, for example, from SSE/AVX/AVX512 to Arm ASIMD. + + - questions: + question: > + What architectures are listed in SIMD.info? + answers: + - Intel SSE and Arm ASIMD + - Power VSX and Arm ASIMD/SVE + - Intel SSE4.2/AVX/AVX2/AVX512, Arm ASIMD, Power VSX + correct_answer: 3 + explanation: > + At the time of writing SIMD.info supports Intel SSE4.2/AVX/AVX2/AVX512, Arm ASIMD, Power VSX as SIMD architectures. Work is in progress to include Arm SVE/SVE2, MIPS MSA, RISC-V RVV 1.0, s390 Z and others. + + - questions: + question: > + What are SIMD.info's major features? 
+ answers: + - Hierarchical tree, Search, AI code translation + - Search, Hierarchical tree, Code examples + - Hierarchical tree, Search, Intrinsics Comparison, Code examples, Equivalents mapping, links to official documentation + correct_answer: 3 + explanation: > + SIMD.info provides multiple features, including a hierarchical tree, Search facility, Intrinsics Comparison, Code examples, Equivalents mapping, links to official documentation and others. AI code translation is not a feature of SIMD.info but will be the focus of another project, SIMD.ai. + + + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +title: "Review" # Always the same title +weight: 20 # Set to always be larger than the content in this path +layout: "learningpathall" # All files under learning paths have this same wrapper +--- diff --git a/content/learning-paths/cross-platform/simd-info-demo/conclusion.md b/content/learning-paths/cross-platform/simd-info-demo/conclusion.md new file mode 100644 index 0000000000..5ad238c264 --- /dev/null +++ b/content/learning-paths/cross-platform/simd-info-demo/conclusion.md @@ -0,0 +1,17 @@ +--- +title: Conclusion +weight: 8 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +### Conclusion and Additional Resources + +Porting SIMD code between architecture can be a daunting process, in many cases requiring many hours of studying multiple ISAs in online resources or ISA manuals of thousands pages. Our primary focus in this work was to optimize the existing algorithm directly with SIMD intrinsics, without altering the algorithm or data layout. While reordering data to align with native ARM instructions could offer performance benefits, our scope remained within the constraints of the current data layout and algorithm. 
For those interested in data layout strategies to further enhance performance on ARM, the [vectorization-friendly data layout learning path](https://learn.arm.com/learning-paths/cross-platform/vectorization-friendly-data-layout/) offers valuable insights. + +Using **[SIMD.info](https://simd.info)** can be instrumental in reducing the amount of time spent in this process, providing a centralized and user-friendly resource for finding **NEON** equivalents to intrinsics of other architectures. It saves considerable time and effort by offering detailed descriptions, prototypes, and comparisons directly, eliminating the need for extensive web searches and manual lookups. + +While porting between vectors of different sizes is more complex, work is underway -at the time of writing this guide- to complete integration of **SVE**/**SVE2** Arm extensions and allow matching them with **AVX512** intrinsics, as they are both using predicate masks. + +Please check **[SIMD.info](https://simd.info)** regularly for updates on this. diff --git a/content/learning-paths/cross-platform/simd-info-demo/intro-to-simdinfo.md b/content/learning-paths/cross-platform/simd-info-demo/intro-to-simdinfo.md new file mode 100644 index 0000000000..897a109e08 --- /dev/null +++ b/content/learning-paths/cross-platform/simd-info-demo/intro-to-simdinfo.md @@ -0,0 +1,17 @@ +--- +title: Overview & Context +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +### The Challenge of SIMD Code Portability +One of the biggest challenges developers face when working with SIMD code is making it portable across different platforms. SIMD instructions are designed to increase performance by executing the same operation on multiple data elements in parallel. However, each architecture has its own set of SIMD instructions, making it difficult to write code that works on all of them without major changes to the code and/or algorithm. 
+ +Consider you have the task of porting a software written using Intel intrinsics, like SSE/AVX/AVX512, to Arm Neon. +The differences in instruction sets and data handling require careful attention. + +This lack of portability increases development time and introduces the risk of errors during the porting process. Currently, developers rely on ISA documentation and manually search across various vendor platforms like [ARM Developer](https://developer.arm.com/architectures/instruction-sets/intrinsics/) and [Intel Intrinsics Guide](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html) to find equivalent instructions. + +[SIMD.info](https://simd.info) aims to solve this by helping you find equivalent instructions and providing a more streamlined way to adapt your code for different architectures. diff --git a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-description.md b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-description.md new file mode 100644 index 0000000000..89f25a23c7 --- /dev/null +++ b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-description.md @@ -0,0 +1,67 @@ +--- +title: SIMD.info Features +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +### Comprehensive SIMD.info Capabilities +**[SIMD.info](https://simd.info/)** offers a variety of powerful tools to help developers work more efficiently with SIMD code across different architectures. With a database of over 10,000 intrinsics, it provides detailed information to support effective SIMD development. + +For each intrinsic, SIMD.info provides comprehensive details, including: + +1. **Purpose**: A brief description of what the intrinsic does and its primary use case. +2. **Result**: Explanation of the output or result of the intrinsic. +3. **Example**: A code snippet demonstrating how to use the intrinsic. +4. **Prototypes**: Function prototypes for different programming languages (currently C/C++). +5. 
**Assembly Instruction**: The corresponding assembly instruction used by the intrinsic. +6. **Notes**: Any additional notes or caveats about the intrinsic. +7. **Architecture**: List of architectures that support the intrinsic +8. **Link(s) to Official Documentation** + +This detailed information ensures you have all the necessary resources to effectively use and port SIMD instructions across different platforms. Each feature is designed to simplify navigation, improve the search for equivalent instructions, and foster a collaborative environment for knowledge-sharing. + +- **Tree-based navigation:** **SIMD.info** uses a clear, hierarchical layout to organize instructions. It categorizes instructions into broad groups like **Arithmetic**, which are further divided into specific subcategories such as **Vector Add** and **Vector Subtract**. This organized structure makes it straightforward to browse through SIMD instruction sets across various platforms, allowing you to efficiently find and access the exact instructions you need. +An example of how the tree structure looks like: + + + - **Arithmetic** + - **Arithmetic (Complex Numbers)** + - **Boolean Logic & Bit Manipulation** + - **Boolean AND** + - **Boolean AND NOT** + - **Boolean AND NOT 128-bit vector** + - **Boolean AND NOT 16-bit signed integers** + - **Boolean AND NOT 16-bit unsigned integers** + - **Boolean AND NOT 256-bit vector** + - **Boolean AND NOT 32-bit floats** + - **Boolean AND NOT 32-bit signed integers** + - AVX512: mm512_andnot_epi32 + - NEON: vbic_s32 + - NEON: vbicq_s32 + - VSX: vec_andc + - **Bit Clear** + - **XOR** + +- **Advanced search functionality:** With its robust search engine, **SIMD.info** allows you to either search for a specific intrinsic (e.g. `vaddq_f64`) or enter more general terms (e.g. *How to add 2 vectors*), and it will return a list of the corresponding intrinsics. 
You can also filter results based on the specific engine you're working with, such as **NEON**, **SSE4.2**, **AVX**, **AVX512**, **VSX**. This functionality streamlines the process of finding the right commands tailored to your needs. + +- **Comparison tools:** This feature lets you directly compare SIMD instructions from different (or the same) platforms side by side, offering a clear view of the similarities and differences. It’s an invaluable tool for porting code across architectures, as it ensures accuracy and efficiency. + +- **Discussion forum (like StackOverflow):** The integrated discussion forum, powered by **[discuss](https://disqus.com/)** allows users to ask questions, share insights, and troubleshoot problems together. This community-driven space ensures that you’re never stuck on a complex issue without support, fostering collaboration and knowledge-sharing among SIMD developers. Imagine something like **StackOverflow** but specific to SIMD intrinsics. + +### Work in Progress & Future Development +- **Pseudo-code:** Currently under development, this feature will enable users to generate high-level pseudo-code based on specific SIMD instructions. This tool aims to enable better understanding of the SIMD instructions, in a *common language*. This will also be used in the next feature, **Intrinsics Diagrams**. + +- **Intrinsics Diagrams:** A feature under progress, creating detailed diagrams for each intrinsic to visualize how it operates on a low level using registers. These diagrams will help you grasp the mechanics of SIMD instructions more clearly, aiding in optimization and debugging. + +- **[SIMD.ai](https://simd.ai/):** SIMD.ai is an upcoming feature that promises to bring AI-assisted insights and recommendations to the SIMD development process, making it faster and more efficient to port SIMD code between architectures. 
+ +### How These Features Aid in SIMD Development +**[SIMD.info](https://simd.info/)** offers a range of features that streamline the process of porting SIMD code across different architectures. The hierarchical structure of tree-based navigation allows you to easily locate instructions within a clear framework. This organization into broad categories and specific subcategories, such as **Arithmetic** and **Boolean Logic**, makes it straightforward to identify the relevant SIMD instructions. + +When you need to port code from one architecture to another, the advanced search functionality proves invaluable. You can either search for specific intrinsics or use broader terms to find equivalent instructions across platforms. This capability ensures that you quickly find the right intrinsics for Arm, Intel or Power architectures. + +Furthermore, **SIMD.info**’s comparison tools enhance this process by enabling side-by-side comparisons of instructions from various platforms. This feature highlights the similarities and differences between instructions, which is crucial for accurately adapting your code. By understanding how similar operations are implemented across architectures, you can ensure that your ported code performs optimally. + +Let's look at an actual example. \ No newline at end of file diff --git a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-cont.md b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-cont.md new file mode 100644 index 0000000000..6a8e1c4463 --- /dev/null +++ b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-cont.md @@ -0,0 +1,33 @@ +--- +title: Porting Process +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +### Using SIMD.info to find NEON Equivalents +Now that you have a clear view of the example, you can start the process of porting the code to Arm **Neon/ASIMD**. + +This is where [SIMD.info](https://simd.info/) comes in. 
+ +In SIMD programming, the primary concern is the integrity and accuracy of the calculations. Ensuring that these calculations are done correctly is crucial. Performance almost always comes second. + +For the operations in your **SSE4.2** example, you have the following intrinsics: + +- **`_mm_cmpgt_ps`** +- **`_mm_add_ps`** +- **`_mm_mul_ps`** +- **`_mm_sqrt_ps`** + +To gain a deeper understanding of how these intrinsics work and to get detailed descriptions, you can use the search feature on **SIMD.info**. Simply enter the intrinsic's name into the search bar. You can either select from the suggested results or perform a direct search to find detailed information about each intrinsic. + +1. By searching [**`_mm_add_ps`**](https://simd.info/c_intrinsic/_mm_add_ps/) you get information about its purpose, result-type, assembly instruction, prototype and an example about it. By clicking the **engine** option **"NEON"** you can find its [equivalents](https://simd.info/eq/_mm_add_ps/NEON/) for this engine. The equivalents are: **`vaddq_f32`**, **`vadd_f32`**. [Intrinsics comparison](https://simd.info/c-intrinsics-compare?compare=vaddq_f32:vadd_f32) will help you find the right one. Based on the prototype provided, you would choose [**`vaddq_f32`**](https://simd.info/c_intrinsic/vaddq_f32/) because it works with 128-bit vectors which is the same as **SSE4.2**. + +2. Moving to the next intrinsic, **`_mm_mul_ps`**, you will use the [Intrinsics Tree](https://simd.info/tag-tree) on **SIMD.info** to find the equivalent. Start by expanding the **Arithmetic** branch and then navigate to the branch **Vector Multiply**. Since you are working with 32-bit floats, open the **Vector Multiply 32-bit floats** branch, where you will find several options. The recommended choice is [**`vmulq_f32`**](https://simd.info/c_intrinsic/vmulq_f32/), following the same reasoning as before—it operates on 128-bit vectors. + +3. 
For the third intrinsic, **`_mm_sqrt_ps`**, the easiest way to find the corresponding **NEON** intrinsic is by typing **"Square Root"** into the search bar on SIMD.info. From the [search results](https://simd.info/search?search=Square+Root&simd_engines=1&simd_engines=2&simd_engines=3&simd_engines=4&simd_engines=5), look for the float-specific version and select [**`vsqrtq_f32`**](https://simd.info/c_intrinsic/vsqrtq_f32/), which, like the others, works with 128-bit vectors. In the equivalents section regarding **SSE4.2**, you can clearly see that **`_mm_sqrt_ps`** has its place as a direct match for this operation. + +4. For the last intrinsic, **`_mm_cmpgt_ps`**, follow a similar approach as before. Inside the intrinsics tree, start by expanding the **Comparison** folder. Navigate to the subfolder **Vector Compare Greater Than**, and since you are working with 32-bit floats, proceed to **Vector Compare Greater Than 32-bit floats**. The recommended choice is again the 128-bit variant [**`vcgtq_f32`**](https://simd.info/c_intrinsic/vcgtq_f32/). + +Now that you have found the **NEON** equivalents for each **SSE4.2** intrinsic, you're ready to begin porting the code. Understanding these equivalents is key to ensuring that the code produces the correct results in the calculations as you switch between SIMD engines. diff --git a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-porting.md b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-porting.md new file mode 100644 index 0000000000..13897277f9 --- /dev/null +++ b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-porting.md @@ -0,0 +1,99 @@ +--- +title: Code Verification +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +### Step-by-Step Porting + +1. Change the loading process to follow **NEON**'s method for initializing vectors. 
The **SSE4.2** intrinsic **`_mm_set_ps`** is in reality a macro, in **NEON** you can do the same thing with curly braces **`{}`** initialization. +2. Next, you will replace the **SSE4.2** intrinsics with the **NEON** equivalents we identified earlier. The key is to ensure that the operations perform the same tasks, such as comparison, addition, multiplication, and square root calculations. +3. Finally, modify the storing process to match **NEON**’s way of moving data from vectors to memory. In **NEON**, you use functions like [**`vst1q_f32`**](https://simd.info/c_intrinsic/vst1q_f32/) for storing 128-bit floating-point vectors and [**`vst1q_u32`**](https://simd.info/c_intrinsic/vst1q_u32/) for storing 128-bit integer vectors. + +After identifying the **NEON** intrinsics you will need in the ported program, it's time to actually write the code. + +Create a new file for the ported NEON code named `calculation_neon.c` with the contents shown below: + +```C +#include <arm_neon.h> +#include <stdio.h> + +float32_t a_array[4] = {1.0f, 4.0f, 9.0f, 16.0f}; +float32_t b_array[4] = {1.0f, 2.0f, 3.0f, 4.0f}; + +int main() { + float32x4_t a = vld1q_f32(a_array); + float32x4_t b = vld1q_f32(b_array); + + uint32x4_t cmp_result = vcgtq_f32(a, b); + + float a_arr[4], b_arr[4]; + uint32_t cmp_res[4]; + + vst1q_f32(a_arr, a); + vst1q_f32(b_arr, b); + vst1q_u32(cmp_res, cmp_result); + + for (int i = 0; i < 4; i++) { + if (cmp_res[i] != 0) { + printf("Element %d: %.2f is larger than %.2f\n", i, a_arr[i], b_arr[i]); + } else { + printf("Element %d: %.2f is not larger than %.2f\n", i, a_arr[i], b_arr[i]); + } + } + printf("\n"); + + float32x4_t add_result = vaddq_f32(a, b); + float32x4_t mul_result = vmulq_f32(add_result, b); + float32x4_t sqrt_result = vsqrtq_f32(mul_result); + + float res[4]; + + vst1q_f32(res, add_result); + printf("Addition Result: %.2f %.2f %.2f %.2f\n", res[0], res[1], res[2], res[3]); + + vst1q_f32(res, mul_result); + printf("Multiplication Result: %.2f %.2f %.2f %.2f\n", res[0], 
res[1], res[2], res[3]); + + vst1q_f32(res, sqrt_result); + printf("Square Root Result: %.2f %.2f %.2f %.2f\n", res[0], res[1], res[2], res[3]); + + return 0; +} +``` + +### Verifying the Ported Code + +It's time to verify that the functionality remains the same, which means you get the same results and similar performance. + +Compile the above code as follows on an Arm system: + +```bash +gcc -O3 calculation_neon.c -o calculation_neon +``` + +Now run the program: +```bash +./calculation_neon +``` + +The output should look like the following: + +```output +Element 0: 1.00 is not larger than 1.00 +Element 1: 4.00 is larger than 2.00 +Element 2: 9.00 is larger than 3.00 +Element 3: 16.00 is larger than 4.00 + +Addition Result: 2.00 6.00 12.00 20.00 +Multiplication Result: 2.00 12.00 36.00 80.00 +Square Root Result: 1.41 3.46 6.00 8.94 +``` + +You can see that the results are the same as in the **SSE4.2** example. + +{{% notice Note %}} +We initialized the vectors in reverse order compared to the **SSE4.2** version because the array initialization and vld1q_f32 function load vectors from LSB to MSB, whereas **`_mm_set_ps`** loads elements MSB to LSB. +{{% /notice %}} \ No newline at end of file diff --git a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1.md b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1.md new file mode 100644 index 0000000000..2573f36c8e --- /dev/null +++ b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1.md @@ -0,0 +1,79 @@ +--- +title: Example Program +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +Consider the following C example that uses Intel SSE4.2 intrinsics. + +Create a file named `calculation_sse.c` with the contents shown below. 
+ +```C +#include <immintrin.h> +#include <stdio.h> + +int main() { + __m128 a = _mm_set_ps(16.0f, 9.0f, 4.0f, 1.0f); + __m128 b = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); + + __m128 cmp_result = _mm_cmpgt_ps(a, b); + + float a_arr[4], b_arr[4], cmp_arr[4]; + _mm_storeu_ps(a_arr, a); + _mm_storeu_ps(b_arr, b); + _mm_storeu_ps(cmp_arr, cmp_result); + + for (int i = 0; i < 4; i++) { + if (cmp_arr[i] != 0.0f) { + printf("Element %d: %.2f is larger than %.2f\n", i, a_arr[i], b_arr[i]); + } else { + printf("Element %d: %.2f is not larger than %.2f\n", i, a_arr[i], b_arr[i]); + } + } + printf("\n"); + + __m128 add_result = _mm_add_ps(a, b); + __m128 mul_result = _mm_mul_ps(add_result, b); + __m128 sqrt_result = _mm_sqrt_ps(mul_result); + + float res[4]; + + _mm_storeu_ps(res, add_result); + printf("Addition Result: %.2f %.2f %.2f %.2f\n", res[0], res[1], res[2], res[3]); + + _mm_storeu_ps(res, mul_result); + printf("Multiplication Result: %.2f %.2f %.2f %.2f\n", res[0], res[1], res[2], res[3]); + + _mm_storeu_ps(res, sqrt_result); + printf("Square Root Result: %.2f %.2f %.2f %.2f\n", res[0], res[1], res[2], res[3]); + + return 0; +} +``` + +The program first compares whether elements in one vector are greater than those in another vector, prints the result, and then proceeds to compute the addition of two vectors, multiplies the result with one of the vectors, and finally takes the square root of the multiplication result: + +Compile the code as follows on an Intel system that supports **SSE4.2**: +```bash +gcc -O3 calculation_sse.c -o calculation_sse -msse4.2 +``` + +Now run the program: +```bash +./calculation_sse +``` + +The output should look like the following: +```output +Element 0: 1.00 is not larger than 1.00 +Element 1: 4.00 is larger than 2.00 +Element 2: 9.00 is larger than 3.00 +Element 3: 16.00 is larger than 4.00 + +Addition Result: 2.00 6.00 12.00 20.00 +Multiplication Result: 2.00 12.00 36.00 80.00 +Square Root Result: 1.41 3.46 6.00 8.94 +``` + +It is imperative that you run the code first on the reference platform (here 
Intel), to make sure you understand how it works and what kind of results are being expected. diff --git a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example2.md b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example2.md new file mode 100644 index 0000000000..ed953c7c60 --- /dev/null +++ b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example2.md @@ -0,0 +1,131 @@ +--- +title: Intrinsics without Equivalents +weight: 7 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +### Handling intrinsics without direct equivalents + +During the porting process, you will observe that certain instructions translate seamlessly. However, there are cases where direct equivalents for some intrinsics may not be readily available across architectures. For example, the [**`_mm_madd_epi16`**](https://simd.info/c_intrinsic/_mm_madd_epi16/) intrinsic from **SSE2**, which performs multiplication of 16-bit signed integer elements in a vector and then does a pairwise addition of adjacent elements increasing the element width, does not have a direct counterpart in **NEON**. However it can be emulated using another intrinsic. Similarly its 256 and 512-bit counterparts, [**`_mm256_madd_epi16`**](https://simd.info/c_intrinsic/_mm256_madd_epi16/) and [**`_mm512_madd_epi16`**](https://simd.info/c_intrinsic/_mm512_madd_epi16/) can be emulated by a sequence of instructions, but here you will see the 128-bit variant. + +You may already know the equivalent operations for this particular intrinsic, but let's assume you don't. In this usecase, reading the **`_mm_madd_epi16`** on the **SIMD.info** might indicate that a key characteristic of the instruction involved is the *widening* of the result elements, from 16-bit to 32-bit signed integers. Unfortunately, that is not the case, as this particular instruction does not actually increase the size of the element holding the result values. You will see how that effects the result in the example. 
+ +Consider the following code for **SSE2**. Create a new file for the code named `_mm_madd_epi16_test.c` with the contents shown below: + +```C +#include <emmintrin.h> +#include <stdio.h> +#include <stdint.h> + +void print_s16x8(char *label, __m128i v) { + int16_t out[8]; + _mm_storeu_si128((__m128i*)out, v); + printf("%-*s: ", 30, label); + for (size_t i=0; i < 8; i++) printf("%4x ", (uint16_t)out[i]); + printf("\n"); +} + +int main() { + __m128i a = _mm_set_epi16(10, 30, 50, 70, 90, 110, 130, 150); + __m128i b = _mm_set_epi16(20, 40, 60, 80, 100, 120, 140, 160); + // 130 * 140 = 18200, 150 * 160 = 24000 + // adding them as 32-bit signed integers -> 42000 + // adding them as 16-bit signed integers -> -23336 (overflow!) + + __m128i res = _mm_madd_epi16(a, b); + + print_s16x8("a", a); + print_s16x8("b", b); + print_s16x8("_mm_madd_epi16(a, b)", res); + + return 0; +} +``` + +Compile the code as follows on an x86 system (no extra flags required as **SSE2** is assumed by default on all 64-bit x86 systems): +```bash +gcc -O3 _mm_madd_epi16_test.c -o _mm_madd_epi16_test +``` + +Now run the program: +```bash +./_mm_madd_epi16_test +``` + +The output should look like: +```output +a : 96 82 6e 5a 46 32 1e a +b : a0 8c 78 64 50 3c 28 14 +_mm_madd_epi16(a, b) : a4d8 0 56b8 0 2198 0 578 0 +``` + +You will note that the result of the first element is a negative number, even though we added 2 positive results (`130*140` and `150*160`). That is because the result of the addition has to occupy a 16-bit signed integer element and when the sum is larger we have the effect of a negative overflow. The result is the same in binary arithmetic, but when interpreted into a signed integer, it turns the number into a negative. + +The rest of the values are as expected. Notice how each pair has a zero element next to it. The results are correct, but they are not in the correct order. In this example, we chose to use **`vmovl`** to zero-extend values, which achieves the correct order with zero elements in place. 
While both **`vmovl`** and **`zip`** could be used for this purpose, we opted for **`vmovl`** in this implementation. For more details, see the ARM Software Optimization Guides, such as the [Neoverse V2 guide](https://developer.arm.com/documentation/109898/latest/). + +```C +#include <arm_neon.h> +#include <stdio.h> +#include <stdint.h> + +void print_s16x8(char *label, int16x8_t v) { + int16_t out[8]; + vst1q_s16(out, v); + printf("%-*s: ", 30, label); + for (size_t i = 0; i < 8; i++) printf("%4x ", (uint16_t)out[i]); + printf("\n"); +} + +int16_t a_array[8] = {150, 130, 110, 90, 70, 50, 30, 10}; +int16_t b_array[8] = {160, 140, 120, 100, 80, 60, 40, 20}; + +int main() { + int16x8_t a = vld1q_s16(a_array); + int16x8_t b = vld1q_s16(b_array); + int16x8_t zero = vdupq_n_s16(0); + // 130 * 140 = 18200, 150 * 160 = 24000 + // adding them as 32-bit signed integers -> 42000 + // adding them as 16-bit signed integers -> -23336 (overflow!) + + int16x8_t res = vmulq_s16(a, b); + + print_s16x8("a", a); + print_s16x8("b", b); + print_s16x8("vmulq_s16(a, b)", res); + res = vpaddq_s16(res, zero); + print_s16x8("vpaddq_s16(a, b)", res); + + // vmovl_s16 would sign-extend; we just want to zero-extend + // so we need to cast to uint16, vmovl_u16 and then cast back to int16 + uint16x4_t res_u16 = vget_low_u16(vreinterpretq_u16_s16(res)); + res = vreinterpretq_s16_u32(vmovl_u16(res_u16)); + print_s16x8("final", res); + + return 0; +} +``` + +Write the above program to a file called `_mm_madd_epi16_neon.c` and compile it: + +```bash +gcc -O3 _mm_madd_epi16_neon.c -o _mm_madd_epi16_neon +``` + +Now run the program: +```bash +./_mm_madd_epi16_neon +``` + +The output should look like: +```output +a : 96 82 6e 5a 46 32 1e a +b : a0 8c 78 64 50 3c 28 14 +vmulq_s16(a, b) : 5dc0 4718 3390 2328 15e0 bb8 4b0 c8 +vpaddq_s16(a, b) : a4d8 56b8 2198 578 0 0 0 0 +final : a4d8 0 56b8 0 2198 0 578 0 +``` + +As you can see the results of both match, **SIMD.info** was especially helpful in this process, providing detailed 
descriptions and examples that guided the translation of complex intrinsics between different SIMD architectures. +