diff --git a/.wordlist.txt b/.wordlist.txt index c5ade02d65..12f6888c86 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -577,7 +577,6 @@ Pmod PostgerSQL PostgrSQL Postgres -Powershell Prefetching Prereqs ProjectExplorer @@ -3336,4 +3335,49 @@ webgpufundamentals wgpuQueueSubmit wgpuQueueWriteBuffer wgpuQueueWriteTexture -wpa \ No newline at end of file +wpa +CAMs +CODENAME +COMs +Chaodong +CreateWorkload +Disqus +ExecuteNetwork +FPC +HX +Hejmadi +Himax +Kieran +LHwilBhvNU +LastWriteTime +LiteRT +OV +Seeed +WebGPU’s +WiseEye +Yolov +blp +dLayer +dawnwebgpu +dmesg +fm +himax +il +juXXgrDDaUdmi +mins +mobilenet +msvc +od +ons +scm +seeedstudio +sscript +tflm +tw +vc +videoio +wiseeye +wlcsp +xB +xmodem +yolov \ No newline at end of file diff --git a/assets/contributors.csv b/assets/contributors.csv index d50bc49455..0f5a4218d5 100644 --- a/assets/contributors.csv +++ b/assets/contributors.csv @@ -43,7 +43,7 @@ Rin Dobrescu,Arm,,,, Przemyslaw Wirkus,Arm,PrzemekWirkus,przemyslaw-wirkus-78b73352,, Nader Zouaoui,Day Devs,nader-zouaoui,nader-zouaoui,@zouaoui_nader,https://daydevs.com/ Alaaeddine Chakroun,Day Devs,Alaaeddine-Chakroun,alaaeddine-chakroun,,https://daydevs.com/ -Koki Mitsunami,Arm,,,, +Koki Mitsunami,Arm,,kmitsunami,, Chen Zhang,Zilliz,,,, Tianyu Li,Arm,,,, Georgios Mermigkis,VectorCamp,gMerm,georgios-mermigkis,,https://vectorcamp.gr/ diff --git a/content/install-guides/_images/git-woa.png b/content/install-guides/_images/git-woa.png new file mode 100644 index 0000000000..034c4e11a3 Binary files /dev/null and b/content/install-guides/_images/git-woa.png differ diff --git a/content/install-guides/_images/git2-woa.png b/content/install-guides/_images/git2-woa.png new file mode 100644 index 0000000000..5ddc6dcca4 Binary files /dev/null and b/content/install-guides/_images/git2-woa.png differ diff --git a/content/install-guides/_images/git3-woa.png b/content/install-guides/_images/git3-woa.png new file mode 100644 index 0000000000..092523c1a4 Binary files 
/dev/null and b/content/install-guides/_images/git3-woa.png differ diff --git a/content/install-guides/_images/git4-woa.png b/content/install-guides/_images/git4-woa.png new file mode 100644 index 0000000000..68076a1740 Binary files /dev/null and b/content/install-guides/_images/git4-woa.png differ diff --git a/content/install-guides/_images/git5-woa.png b/content/install-guides/_images/git5-woa.png new file mode 100644 index 0000000000..6d63e97aa7 Binary files /dev/null and b/content/install-guides/_images/git5-woa.png differ diff --git a/content/install-guides/_images/wpa-install-plugin.png b/content/install-guides/_images/wpa-install-plugin.png new file mode 100644 index 0000000000..91d072dc5e Binary files /dev/null and b/content/install-guides/_images/wpa-install-plugin.png differ diff --git a/content/install-guides/_images/wpa-installation.png b/content/install-guides/_images/wpa-installation.png index 207ce3bf8a..8a8bccdac5 100644 Binary files a/content/install-guides/_images/wpa-installation.png and b/content/install-guides/_images/wpa-installation.png differ diff --git a/content/install-guides/_images/wpa-store.png b/content/install-guides/_images/wpa-store.png new file mode 100644 index 0000000000..78d84c85a1 Binary files /dev/null and b/content/install-guides/_images/wpa-store.png differ diff --git a/content/install-guides/acfl.md b/content/install-guides/acfl.md index 383cde57a9..cf070251b5 100644 --- a/content/install-guides/acfl.md +++ b/content/install-guides/acfl.md @@ -142,6 +142,33 @@ install takes place **after** ACfL, you will no longer be able to fully uninstall ACfL. {{% /notice %}} +## Download and install using System Packages - Ubuntu Linux + +Arm Compiler for Linux is available to install with the Ubuntu system package manager `apt` command. 
+ +### Setup the ACfL package repository: + +Add the ACfL `apt` package repository to your Ubuntu 20.04 or 22.04 system: + +```bash { target="ubuntu:latest" } +sudo apt update +sudo apt install -y curl +source /etc/os-release +curl "https://developer.arm.com/packages/ACfL%3A${NAME}-${VERSION_ID/%.*/}/${VERSION_CODENAME}/Release.key" | sudo tee /etc/apt/trusted.gpg.d/developer-arm-com.asc +echo "deb https://developer.arm.com/packages/ACfL%3A${NAME}-${VERSION_ID/%.*/}/${VERSION_CODENAME}/ ./" | sudo tee /etc/apt/sources.list.d/developer-arm-com.list +sudo apt update +``` + +The ACfL Ubuntu package repository is now ready to use. + +### Install ACfL + +Download and install Arm Compiler for Linux with: + +```bash { target="ubuntu:latest" } +sudo apt install acfl +``` + ### Set up environment Arm Compiler for Linux uses environment modules to dynamically modify your user environment. Refer to the [Environment Modules documentation](https://lmod.readthedocs.io/en/latest/#id) for more information. diff --git a/content/install-guides/git-woa.md b/content/install-guides/git-woa.md new file mode 100644 index 0000000000..432f39e75b --- /dev/null +++ b/content/install-guides/git-woa.md @@ -0,0 +1,105 @@ +--- +### Title the install tools article with the name of the tool to be installed +### Include vendor name where appropriate +title: Git for Windows on Arm + +### Optional additional search terms (one per line) to assist in finding the article +additional_search_terms: +- git +- windows +- woa +- windows on arm +- open source windows on arm + +### Estimated completion time in minutes (please use integer multiple of 5) +minutes_to_complete: 10 + +### Link to official documentation +official_docs: https://git-scm.com/doc + +author_primary: Jason Andrews + +### PAGE SETUP +weight: 1 # Defines page ordering. Must be 1 for first (or only) page. 
+tool_install: true # Set to true to be listed in main selection page, else false +multi_install: false # Set to true if first page of multi-page article, else false +multitool_install_part: false # Set to true if a sub-page of a multi-page article, else false +layout: installtoolsall # DO NOT MODIFY. Always true for tool install articles +--- + +Git has native support for [Windows on Arm](https://learn.microsoft.com/en-us/windows/arm/overview). Starting with version 2.47.1, an official installer is available. + +In addition to Windows laptops, Windows on Arm instances are available with Microsoft Azure. For further information, see [Deploy a Windows on Arm virtual machine on Microsoft Azure](/learning-paths/cross-platform/woa_azure/). + +## How do I download and install Git for Windows on Arm? + +Git releases are available in [GitHub releases](https://github.com/git-for-windows/git/releases/). + +Use a browser to download the desired release file. The Git releases for Windows on Arm have `arm64.exe` in the filename. + +You can also download from a Windows PowerShell with the following command: + +```command +curl -L https://github.com/git-for-windows/git/releases/download/v2.47.1.windows.1/Git-2.47.1-arm64.exe -o Git-2.47.1-arm64.exe +``` + +Once you have downloaded Git, run the installer `.exe` file on a Windows on Arm machine. + +The installer starts. + +Click **Next** to acknowledge the GNU General Public License. + +Set the destination location or accept the default location, and click **Next**. + +Continue to click **Next** for the configuration settings. You can accept all defaults if you are unsure of specific settings. + +At the end of the install process, you see the screen below indicating setup has finished installing Git: + +![Install](/install-guides/_images/git-woa.png) + +Click the **Finish** button to complete installation. + +## How do I use Git on Windows? + +You can use Git on Windows from a Command Prompt or using Git Bash. 
+ +Git Bash is a Linux-like terminal experience which includes Git and many other Linux commands. + +{{% notice Note %}} +Git is not automatically added to your search path during installation. +{{% /notice %}} + +To use Git, click the Windows **Start** button and then click **All apps**. + +You see the Git folder in the G section. + +![Start](/install-guides/_images/git2-woa.png) + +There are menu items for multiple ways to start Git. + +## How can I use Git in a Windows Command Prompt? + +Start a Git Command Prompt by selecting **Git CMD** from the **Start** menu. + +![CMD](/install-guides/_images/git3-woa.png) + + +To see the help message, enter: + +```cmd +git help +``` + +You can use Git from this Command Prompt. + +## How can I use Git with Git Bash? + +To use Git in a Linux-like environment, select **Git Bash** from the start menu. + +![CMD](/install-guides/_images/git4-woa.png) + +Click the colored icon in the top-left corner of the Git Bash window, and then click **Options** to change the appearance of the window, including colors, fonts, and font sizes. + +![Options](/install-guides/_images/git5-woa.png) + +You are now ready to use Git on your Windows on Arm device. diff --git a/content/install-guides/windows-perf-wpa-plugin.md b/content/install-guides/windows-perf-wpa-plugin.md index 4e0fa1f432..9afda85d45 100644 --- a/content/install-guides/windows-perf-wpa-plugin.md +++ b/content/install-guides/windows-perf-wpa-plugin.md @@ -1,10 +1,6 @@ --- title: Windows Performance Analyzer (WPA) plugin -draft: true -cascade: - draft: true - minutes_to_complete: 15 official_docs: https://github.com/arm-developer-tools/windowsperf-wpa-plugin @@ -34,18 +30,18 @@ layout: installtoolsall # DO NOT MODIFY. Always true for tool install articles The Windows Performance Analyzer (WPA) plugin connects [WindowsPerf](/learning-paths/laptops-and-desktops/windowsperf/) to the Windows Performance Analyzer. 
Windows Perf is a lightweight performance profiling tool inspired by Linux Perf and designed for Windows on Arm. -Windows Performance Analyzer is a useful tool that supports developers with diagnostics and performance tuning. It generates data tables and graphs of Event Tracing for Windows (ETW) events, which are recorded in one of three ways: -- Windows Performance Recorder (WPR) -- Xperf -- or through an assessment that's run in the Assessment Platform. +Windows Performance Analyzer provides developers with diagnostics and performance tuning. It generates data tables and graphs of Event Tracing for Windows (ETW) events, which are recorded in one of three ways: +- By using Windows Performance Recorder (WPR). +- By using Xperf. +- Through an assessment that is run in the Assessment Platform. -WPA can open event trace log (ETL) files, which you can use for analysis. +WPA can open Event Trace Log (ETL) files, which you can use for analysis. -The WPA plugin is built using the [Microsoft Performance Toolkit SDK](https://github.com/microsoft/microsoft-performance-toolkit-sdk), a collection of tools to create and extend performance analysis applications. The plugin parses JSON output from Windows Perf so that it can be visualized in WPA. +The WPA plugin is built using the [Microsoft Performance Toolkit SDK](https://github.com/microsoft/microsoft-performance-toolkit-sdk), which is a collection of tools to create and extend performance analysis applications. The plugin parses JSON output from Windows Perf so that it can be visualized in WPA. -## What are some of the features of the WPA plugin? +## What are the features of the WPA plugin? -The WindowsPerf GUI extension includes features, which are designed to streamline the user experience: +The WindowsPerf GUI extension includes features that are designed to streamline the user experience, and these are detailed below. 
### Timeline view @@ -61,83 +57,87 @@ The telemetry view displays telemetry events grouped by unit: ## How do I install the WPA plugin? -Before installing the plugin, you need to make sure you have installed WPA: +Before installing the plugin, you need to install WPA. ### Install WPA -WPA is included in the Windows Assessment and Deployment Kit (Windows ADK), which you can download from [Microsoft](https://go.microsoft.com/fwlink/?linkid=2243390). +For Windows on Arm devices, you can install WPA from the Microsoft Store. -{{% notice Note %}} -The WPA plugin requires WPA version `11.0.7.2` or higher. -{{% /notice %}} +Open the Microsoft Store and search for "windows performance analyzer". -Run the downloaded `adksetup.exe` program. +![WPA store](/install-guides/_images/wpa-store.png) -Specify the default installation location and accept the license agreement. +Hover over the card, and you will see that the **Free** button becomes a **Get** button. Click the **Get** button to start the installation. -Make sure that **Windows Performance Toolkit** is checked under **Select the features you want to install**. +Wait for WPA to be installed, and then launch it from the Windows menu. -![WPA Installation](/install-guides/_images/wpa-installation.png) +![WPA installation #center](/install-guides/_images/wpa-installation.png) -Finally, click **Install**. +{{% notice Note %}} +The WPA plugin requires WPA version `11.0.7.2` or higher. You can check the version by clicking **Help** > **About Windows Performance Analyzer**. +{{% /notice %}} + +Close Windows Performance Analyzer. ### Install the WPA plugin -Now you're ready to install the plugin, which is a single `.dll` file. +You are now ready to install the WPA plugin, which is a single `.dll` file. Download the `.zip` file from the [Windows Perf WPA plugin GitHub releases page](https://github.com/arm-developer-tools/windowsperf-wpa-plugin/releases) on GitHub. 
-Alternatively, you can download the latest version using command prompt: +Alternatively, you can download the `.zip` file from a command prompt: ```console mkdir wpa-plugin cd wpa-plugin -curl -L -O https://github.com/arm-developer-tools/windowsperf-wpa-plugin/releases/download/1.0.2/wpa-plugin-1.0.2.zip +curl -L -O https://github.com/arm-developer-tools/windowsperf-wpa-plugin/releases/download/1.0.3/wpa-plugin-1.0.3.zip ``` -Now extract the `.dll` file from the downloaded `.zip` file. +Extract the `.dll` file from the downloaded `.zip` file. ```console -tar -xmf wpa-plugin-1.0.2.zip +tar -xmf wpa-plugin-1.0.3.zip ``` The file `WPAPlugin.dll` is now in your `wpa-plugin` directory. -There are three ways you can install the `WPAPlugin.dll` file: +There are three ways that you can use the `WPAPlugin.dll` file: -#### 1. Copy the .dll file to the CustomDataSources directory next to the WPA executable. +##### Option 1: Start WPA from the command line and pass the plugin directory location using a flag. -The default location is: - `C:\\Program Files (x86)\Windows Kits\10\Windows Performance Toolkit\CustomDataSources` +Use the `-addsearchdir` flag to tell `wpa` where to find plugins. -#### 2. Set an environment variable. +For example, if you downloaded the `.dll` in your `Downloads` directory, you can run `wpa` as shown below: + +```bash +wpa -addsearchdir %USERPROFILE%\Downloads\wpa-plugin-1.0.3 +``` + +##### Option 2: Set an environment variable. Set the `WPA_ADDITIONAL_SEARCH_DIRECTORIES` environment variable to the location of the `.dll` file. -#### 3. Start WPA from the command line and pass the plugin directory location using a flag. +##### Option 3: Copy the `.dll` file to the `CustomDataSources` directory next to the WPA executable. 
-Use the `-addsearchdir` flag for `wpa`: +The default location is: + `C:\\Program Files (x86)\Windows Kits\10\Windows Performance Toolkit\CustomDataSources` -```bash -wpa -addsearchdir "%USERPROFILE%\plugins" -``` - -## How can I verify the WPA plugin is installed? +## How can I verify that the WPA plugin is installed? -To verify the plugin is loaded, launch WPA and the plugin should appear under **Help > About Windows Performance Analyzer**. +To verify the plugin is loaded, launch WPA and the plugin should appear on the Installed Plugins list. -![WPA installation confirmation](/install-guides/_images/about-wpa.png) +![WPA installation confirmation](/install-guides/_images/wpa-install-plugin.png) ## How can I run the WPA plugin from the command line? To open a JSON file directly from the command line, you can use the `-i` flag to specify the file path to open. -For example: to open `timeline_long.json` in your downloads directory, run the command: +For example, to open `timeline_long.json` in your `downloads` directory, run the command: ```console -wpa -i "%USERPROFILE%\\Downloads\\timeline_long.json" +wpa -addsearchdir %USERPROFILE%\Downloads\wpa-plugin-1.0.3 -i %USERPROFILE%\Downloads\timeline_long.json ``` ## How do I uninstall the WPA plugin? -To uninstall the plugin simply delete the `WPAPlugin.dll` file. +To uninstall the plugin, simply delete the `WPAPlugin.dll` file. diff --git a/content/learning-paths/cross-platform/simd-info-demo/_index.md b/content/learning-paths/cross-platform/simd-info-demo/_index.md index cedd43e124..43a273c486 100644 --- a/content/learning-paths/cross-platform/simd-info-demo/_index.md +++ b/content/learning-paths/cross-platform/simd-info-demo/_index.md @@ -1,30 +1,30 @@ --- title: Introduction to SIMD.info -draft: true + cascade: - draft: true + minutes_to_complete: 30 -who_is_this_for: This is for advanced topic for software developers interested in porting SIMD code across Arm platforms. 
+who_is_this_for: This Learning Path is for software developers who are interested in porting SIMD code across Arm platforms. learning_objectives: - - Learn how to use SIMD.info’s tools and features, such as navigation, search, and comparison, to simplify the process of finding equivalent SIMD intrinsics between architectures and improving code portability. + - Describe how to use SIMD.info’s tools and features, such as navigation, search, and comparison, to simplify the process of finding equivalent SIMD intrinsics between architectures to improve code portability. prerequisites: - A basic understanding of SIMD. - - Access to an Arm platform with SIMD supported engine, with recent versions of a C compiler (Clang or GCC) installed. + - Access to an Arm platform with a SIMD-supported engine, installed with recent versions of a C compiler such as Clang or GCC. -author_primary: Georgios Mermigkis & Konstantinos Margaritis, VectorCamp +author_primary: Georgios Mermigkis and Konstantinos Margaritis, VectorCamp ### Tags skilllevels: Advanced subjects: Performance and Architecture armips: - - Aarch64 - - Armv8-a - - Armv9-a + - AArch64 + - Armv8-A + - Armv9-A tools_software_languages: - GCC - Clang diff --git a/content/learning-paths/cross-platform/simd-info-demo/_next-steps.md b/content/learning-paths/cross-platform/simd-info-demo/_next-steps.md index 320c29c6e1..2c6760d15e 100644 --- a/content/learning-paths/cross-platform/simd-info-demo/_next-steps.md +++ b/content/learning-paths/cross-platform/simd-info-demo/_next-steps.md @@ -1,7 +1,7 @@ --- next_step_guidance: You should explore **SIMD.info** more and find out porting opportunities between different SIMD engines. 
-recommended_path: /learning-paths/cross-platform/ +recommended_path: /learning-paths/cross-platform/vectorization-friendly-data-layout/ further_reading: - resource: diff --git a/content/learning-paths/cross-platform/simd-info-demo/_review.md b/content/learning-paths/cross-platform/simd-info-demo/_review.md index cc6a2a64d0..35df3294b6 100644 --- a/content/learning-paths/cross-platform/simd-info-demo/_review.md +++ b/content/learning-paths/cross-platform/simd-info-demo/_review.md @@ -4,34 +4,34 @@ review: question: > What is SIMD.info? answers: - - An online resource for SIMD C intrinsics for all major architectures - - It's an online forum for SIMD developers - - A book about SIMD programming + - It's an online resource for SIMD C intrinsics for all major architectures. + - It's an online forum for SIMD developers. + - It's a book about SIMD programming. correct_answer: 1 explanation: > - While it allows comments in the SIMD intrinsics, SIMD.info is not really a forum. It is an online **free** resource to assist developers porting C code between popular architectures, for example, from SSE/AVX/AVX512 to Arm ASIMD. + While it allows comments in the SIMD intrinsics, SIMD.info is not a forum. It is an online free resource to assist developers porting C code between popular architectures, for example, from SSE/AVX/AVX512 to Arm ASIMD. - questions: question: > What architectures are listed in SIMD.info? answers: - - Intel SSE and Arm ASIMD - - Power VSX and Arm ASIMD/SVE - - Intel SSE4.2/AVX/AVX2/AVX512, Arm ASIMD, Power VSX + - Intel SSE and Arm ASIMD. + - Power VSX and Arm ASIMD/SVE. + - Intel SSE4.2/AVX/AVX2/AVX512, Arm ASIMD, Power VSX. correct_answer: 3 explanation: > - At the time of writing SIMD.info supports Intel SSE4.2/AVX/AVX2/AVX512, Arm ASIMD, Power VSX as SIMD architectures. Work is in progress to include Arm SVE/SVE2, MIPS MSA, RISC-V RVV 1.0, s390 Z and others. 
+ SIMD.info supports Intel SSE4.2/AVX/AVX2/AVX512, Arm ASIMD, Power VSX as SIMD architectures. Work is in progress to include Arm SVE/SVE2, MIPS MSA, RISC-V RVV 1.0, s390 Z and others. - questions: question: > What are SIMD.info's major features? answers: - - Hierarchical tree, Search, AI code translation - - Search, Hierarchical tree, Code examples - - Hierarchical tree, Search, Intrinsics Comparison, Code examples, Equivalents mapping, links to official documentation + - Hierarchical tree, search, and AI code translation. + - Search, hierarchical tree, and code examples. + - Hierarchical tree, search, intrinsics comparison, code examples, equivalents mapping, and links to official documentation. correct_answer: 3 explanation: > - SIMD.info provides multiple features, including a hierarchical tree, Search facility, Intrinsics Comparison, Code examples, Equivalents mapping, links to official documentation and others. AI code translation is not a feature of SIMD.info but will be the focus of another project, SIMD.ai. + SIMD.info provides multiple features, including a hierarchical tree, search facility, intrinsics comparison, code examples, equivalents mapping, links to official documentation, and others. AI code translation is not a feature of SIMD.info but is the focus of another project, SIMD.ai. diff --git a/content/learning-paths/cross-platform/simd-info-demo/conclusion.md b/content/learning-paths/cross-platform/simd-info-demo/conclusion.md index bf30963645..14538a712f 100644 --- a/content/learning-paths/cross-platform/simd-info-demo/conclusion.md +++ b/content/learning-paths/cross-platform/simd-info-demo/conclusion.md @@ -6,12 +6,16 @@ weight: 8 layout: learningpathall --- -### Conclusion and Additional Resources +### Conclusion and further reading -Porting SIMD code between architecture can be a daunting process, in many cases requiring many hours of studying multiple ISAs in online resources or ISA manuals of thousands pages. 
Our primary focus in this work was to optimize the existing algorithm directly with SIMD intrinsics, without altering the algorithm or data layout. While reordering data to align with native Arm instructions could offer performance benefits, our scope remained within the constraints of the current data layout and algorithm. For those interested in data layout strategies to further enhance performance on Arm, the [vectorization-friendly data layout learning path](https://learn.arm.com/learning-paths/cross-platform/vectorization-friendly-data-layout/) offers valuable insights. +Porting SIMD code between architectures can be a daunting process, often requiring many hours of studying multiple ISAs in online resources or ISA manuals that run into thousands of pages. -Using **[SIMD.info](https://simd.info)** can be be instrumental in reducing the amount of time spent in this process, providing a centralized and user-friendly resource for finding **NEON** equivalents to intrinsics of other architectures. It saves considerable time and effort by offering detailed descriptions, prototypes, and comparisons directly, eliminating the need for extensive web searches and manual lookups. +The primary focus of this Learning Path is to optimize the existing algorithm directly with SIMD intrinsics, without altering the algorithm or data layout. While reordering data to align with native Arm instructions can offer performance benefits, this is outside the scope of this Learning Path. -While porting between vectors of different sizes is more complex, work is underway -at the time of writing this guide- to complete integration of **SVE**/**SVE2** Arm extensions and allow matching them with **AVX512** intrinsics, as they are both using predicate masks. 
+If you are interested in data layout strategies to further enhance performance on Arm, see the Learning Path *Optimize SIMD code with vectorization-friendly data layout* linked to in the **Next Steps** section at the end of this Learning Path. -Please check **[SIMD.info](https://simd.info)** regularly for updates on this. +Using SIMD.info can be instrumental in reducing the amount of time spent in this process, providing a centralized and user-friendly resource for finding NEON equivalents to intrinsics of other architectures. It saves considerable time and effort by offering detailed descriptions, prototypes, and comparisons directly, eliminating the need for extensive web searches and manual lookups. + +While porting between vectors of different sizes is more complex, work is underway to complete the integration of SVE and SVE2 Arm extensions and allow matching them with AVX512 intrinsics, as they both use predicate masks. + +You can check **[SIMD.info](https://simd.info)** for updates. diff --git a/content/learning-paths/cross-platform/simd-info-demo/intro-to-simdinfo.md b/content/learning-paths/cross-platform/simd-info-demo/intro-to-simdinfo.md index 24df6cce42..65ca70f73f 100644 --- a/content/learning-paths/cross-platform/simd-info-demo/intro-to-simdinfo.md +++ b/content/learning-paths/cross-platform/simd-info-demo/intro-to-simdinfo.md @@ -7,10 +7,12 @@ layout: learningpathall --- ### The Challenge of SIMD Code Portability -One of the biggest challenges developers face when working with SIMD code is making it portable across different platforms. SIMD instructions are designed to increase performance by executing the same operation on multiple data elements in parallel. However, each architecture has its own set of SIMD instructions, making it difficult to write code that works on all of them without major changes to the code and/or algorithm. 
+SIMD instructions are designed to improve performance by executing the same operation on multiple data elements in parallel. One of the biggest challenges developers face when working with SIMD code is making it portable across different platforms. -To port software written using Intel intrinsics, like SSE/AVX/AVX512, to Arm Neon, you have pay attention to data handling with the different instruction sets. +Each architecture has its own set of SIMD instructions, which makes it difficult to port code without major changes to either the code itself, or the algorithm, or both. -Having to port the code between architectures can increase development time and introduce the risk of errors during the porting process. Currently, developers rely on ISA documentation and manually search across various vendor platforms like [Arm Developer](https://developer.arm.com/architectures/instruction-sets/intrinsics/) and [Intel Intrinsics Guide](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html) to find equivalent instructions. +For example, to port software written using Intel intrinsics, such as SSE/AVX/AVX512, to Arm Neon, you must address issues with data handling with the different instruction sets. -[SIMD.info](https://simd.info) aims to solve this by helping you find equivalent instructions and providing a more streamlined way to adapt your code for different architectures. +Porting the code between architectures can increase development time and introduce the risk of errors. Currently, developers rely on ISA documentation and must manually search across various vendor platforms such as [Arm Developer](https://developer.arm.com/architectures/instruction-sets/intrinsics/) and [Intel Intrinsics Guide](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html) to find equivalent instructions. 
+ +[SIMD.info](https://simd.info) aims to address this challenge by enabling developers to find equivalent instructions and providing a streamlined way to adapt code for different architectures. diff --git a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-description.md b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-description.md index 678d08327c..9997a8fbf5 100644 --- a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-description.md +++ b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-description.md @@ -6,48 +6,63 @@ weight: 3 layout: learningpathall --- -### Comprehensive SIMD.info Capabilities -**[SIMD.info](https://simd.info/)** offers a variety of powerful tools to help developers work more efficiently with SIMD code across different architectures. With a database of over 10,000 intrinsics, it provides detailed information to support effective SIMD development. - -For each intrinsic, SIMD.info provides comprehensive details, including: - -1. **Purpose**: A brief description of what the intrinsic does and its primary use case. -2. **Result**: Explanation of the output or result of the intrinsic. -3. **Example**: A code snippet demonstrating how to use the intrinsic. -4. **Prototypes**: Function prototypes for different programming languages (currently C/C++). -5. **Assembly Instruction**: The corresponding assembly instruction used by the intrinsic. -6. **Notes**: Any additional notes or caveats about the intrinsic. -7. **Architecture**: List of architectures that support the intrinsic -8. **Link(s) to Official Documentation** - -This detailed information ensures you have all the necessary resources to effectively use and port SIMD instructions across different platforms. Each feature is designed to simplify navigation, improve the search for equivalent instructions, and foster a collaborative environment for knowledge-sharing. 
-- **Tree-based navigation:** **SIMD.info** uses a clear, hierarchical layout to organize instructions. It categorizes instructions into broad groups like **Arithmetic**, which are further divided into specific subcategories such as **Vector Add** and **Vector Subtract**. This organized structure makes it straightforward to browse through SIMD instruction sets across various platforms, allowing you to efficiently find and access the exact instructions you need. -An example of how the tree structure looks like: - - - - **Arithmetic** - - **Arithmetic (Complex Numbers)** - - **Boolean Logic & Bit Manipulation** - - **Boolean AND** - - **Boolean AND NOT** - - **Boolean AND NOT 128-bit vector** - - **Boolean AND NOT 16-bit signed integers** - - **Boolean AND NOT 16-bit unsigned integers** - - **Boolean AND NOT 256-bit vector** - - **Boolean AND NOT 32-bit floats** - - **Boolean AND NOT 32-bit signed integers** +#### SIMD.info categories of information +**[SIMD.info](https://simd.info/)** offers a variety of powerful tools to enable developers to work more efficiently with SIMD code across different architectures. + +With a database of over 10,000 intrinsics, it provides valuable detailed information to support effective SIMD development. + +For each intrinsic, SIMD.info provides information in the following categories: + +* **Purpose**: a brief description of what the intrinsic does and the primary use case. + +* **Result**: an explanation of the output or result of the intrinsic. + +* **Example**: a code snippet demonstrating how to use the intrinsic. + +* **Prototypes**: function prototypes for different programming languages (currently C/C++). + +* **Assembly Instruction**: the corresponding assembly instruction that the intrinsic uses. + +* **Notes**: any further information about the intrinsic, such as caveats. + +* **Architecture**: a list of architectures that support the intrinsic. + +* **Links to official documentation**. 
+ +This information ensures that you have all the necessary resources to effectively use and port SIMD instructions across different platforms. Each feature is designed to simplify navigation, improve the search for equivalent instructions, and foster a collaborative environment for knowledge-sharing. + +#### Tree-based navigation +SIMD.info uses a clear, hierarchical layout to present the instructions. It categorizes instructions into high-level groups such as **Arithmetic**, which are then further divided into specific subcategories such as **Vector Add**, and **Vector Subtract**. + +This organized structure enables you to browse through SIMD instruction sets across various platforms, allowing you to efficiently find and access the instructions that you need. Below is an example of the tree structure: + + - Arithmetic + - Arithmetic (Complex Numbers) + - Boolean Logic & Bit Manipulation + - Boolean AND + - Boolean AND NOT + - Boolean AND NOT 128-bit vector + - Boolean AND NOT 16-bit signed integers + - Boolean AND NOT 16-bit unsigned integers + - Boolean AND NOT 256-bit vector + - Boolean AND NOT 32-bit float + - Boolean AND NOT 32-bit signed integers - AVX512: mm512_andnot_epi32 - NEON: vbic_s32 - NEON: vbicq_s32 - VSX: vec_andc - - **Bit Clear** - - **XOR** + - Bit Clear + - XOR + +#### Advanced search functionality +With its robust search engine, SIMD.info allows you to either search for a specific intrinsic, for example `vaddq_f64`, or enter more general terms, for example "How to add 2 vectors," and it returns a list of the corresponding intrinsics. -- **Advanced search functionality:** With its robust search engine, **SIMD.info** allows you to either search for a specific intrinsic (e.g. `vaddq_f64`) or enter more general terms (e.g. *How to add 2 vectors*), and it will return a list of the corresponding intrinsics. You can also filter results based on the specific engine you're working with, such as **NEON**, **SSE4.2**, **AVX**, **AVX512**, **VSX**. 
This functionality streamlines the process of finding the right commands tailored to your needs. +You can also filter results based on the specific engine you're working with, such as NEON, SSE4.2, AVX, AVX512, or VSX. This functionality streamlines the process of finding the right commands tailored to your needs. -- **Comparison tools:** This feature lets you directly compare SIMD instructions from different (or the same) platforms side by side, offering a clear view of the similarities and differences. It’s a very helpful tool for porting code across architectures, as it ensures accuracy and efficiency. +#### Comparison tools +This feature lets you directly compare SIMD instructions from different, or the same, platforms side by side, offering a clear view of the similarities and differences. It’s a helpful tool for porting code across architectures, as it ensures accuracy and efficiency. -- **Discussion forum (like StackOverflow):** The integrated discussion forum, powered by **[discuss](https://disqus.com/)** allows users to ask questions, share insights, and troubleshoot problems together. This community-driven space ensures that you’re never stuck on a complex issue without support, fostering collaboration and knowledge-sharing among SIMD developers. Imagine something like **StackOverflow** but specific to SIMD intrinsics. +#### Discussion forum +The integrated discussion forum, powered by **[Disqus](https://disqus.com/)**, allows users to ask questions, share insights, and troubleshoot problems together. This community-driven space ensures that you’re never stuck on a complex issue without support. It fosters collaboration and knowledge-sharing among SIMD developers. Imagine something like **[StackOverflow](https://stackoverflow.com/)** but specific to SIMD intrinsics. -You can now learn how to use these features in the context of an actual example. +Now let's look at these features in the context of a real example. 
diff --git a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-cont.md b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-cont.md index 6a8e1c4463..495bc4cb5e 100644 --- a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-cont.md +++ b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-cont.md @@ -7,27 +7,29 @@ layout: learningpathall --- ### Using SIMD.info to find NEON Equivalents -Now that you have a clear view of the example, you can start the process of porting the code to Arm **Neon/ASIMD**. +Now that you have a clear view of the example, you can start the process of porting the code to Arm Neon/ASIMD. This is where [SIMD.info](https://simd.info/) comes in. -In SIMD programming, the primary concern is the integrity and accuracy of the calculations. Ensuring that these calculations are done correctly is crucial. Performance almost always comes second. +In SIMD programming, the primary focus is the integrity and accuracy of the calculations. Ensuring that these calculations are done correctly is crucial. Performance is almost always a secondary concern. -For the operations in your **SSE4.2** example, you have the following intrinsics: +For the operations in your SSE4.2 example, you have the following intrinsics: - **`_mm_cmpgt_ps`** - **`_mm_add_ps`** - **`_mm_mul_ps`** - **`_mm_sqrt_ps`** -To gain a deeper understanding of how these intrinsics work and to get detailed descriptions, you can use the search feature on **SIMD.info**. Simply enter the intrinsic's name into the search bar. You can either select from the suggested results or perform a direct search to find detailed information about each intrinsic. +To gain a deeper understanding of how these intrinsics work and to surface detailed descriptions, you can use the search feature on SIMD.info. Simply enter the name of the intrinsic in the search bar. 
You can either select from the suggested results or perform a direct search to retrieve information about each intrinsic. -1. By searching [**`_mm_add_ps`**](https://simd.info/c_intrinsic/_mm_add_ps/) you get information about it's purpose, result-type, assembly instruction, prototype and an example about it. By clicking the **engine** option **"NEON"** you can find it's [equivalents](https://simd.info/eq/_mm_add_ps/NEON/) for this engine. The equivalents are: **`vaddq_f32`**, **`vadd_f32`**. [Intrinsics comparison](https://simd.info/c-intrinsics-compare?compare=vaddq_f32:vadd_f32) will help you find the right one. Based on the prototype provided, you would choose [**`vaddq_f32`**](https://simd.info/c_intrinsic/vaddq_f32/) because it works with 128-bit vectors which is the same as **SSE4.2**. +1. By searching for [**`_mm_add_ps`**](https://simd.info/c_intrinsic/_mm_add_ps/) you will retrieve information about its purpose, the result type, assembly instructions, prototypes, and an example demonstration. By clicking the **engine** option **"NEON"** you can find its [equivalents](https://simd.info/eq/_mm_add_ps/NEON/) for this engine. The equivalents are: **`vaddq_f32`**, **`vadd_f32`**. [Intrinsics comparison](https://simd.info/c-intrinsics-compare?compare=vaddq_f32:vadd_f32) helps you find the right one. Based on the prototype provided, you can choose [**`vaddq_f32`**](https://simd.info/c_intrinsic/vaddq_f32/) as it works with 128-bit vectors which is the same as **SSE4.2**. -2. Moving to the next intrinsic, **`_mm_mul_ps`**, you will use the [Intrinsics Tree](https://simd.info/tag-tree) on **SIMD.info** to find the equivalent. Start by expanding the **Arithmetic** branch and then navigate to the branch **Vector Multiply**. Since you are working with 32-bit floats, open the **Vector Multiply 32-bit floats** branch, where you will find several options. 
The recommended choice is [**`vmulq_f32`**](https://simd.info/c_intrinsic/vmulq_f32/), following the same reasoning as before—it operates on 128-bit vectors. +2. Moving to the next intrinsic, **`_mm_mul_ps`**, you can use the [Intrinsics Tree](https://simd.info/tag-tree) on SIMD.info to find the equivalent. -3. For the third intrinsic, **`_mm_sqrt_ps`**, the easiest way to find the corresponding **NEON** intrinsic is by typing **"Square Root"** into the search bar on SIMD.info. From the [search results](https://simd.info/search?search=Square+Root&simd_engines=1&simd_engines=2&simd_engines=3&simd_engines=4&simd_engines=5), look for the float-specific version and select [**`vsqrtq_f32`**](https://simd.info/c_intrinsic/vsqrtq_f32/), which, like the others, works with 128-bit vectors. In the equivalents section regarding **SSE4.2**, you can clearly see that **`_mm_sqrt_ps`** has its place as a direct match for this operation. +Start by expanding the **Arithmetic** branch and then navigate to the branch **Vector Multiply**. As you are working with 32-bit floats, open the **Vector Multiply 32-bit floats** branch, where you will find several options. The recommended choice is [**`vmulq_f32`**](https://simd.info/c_intrinsic/vmulq_f32/), following the same reasoning as before; it operates on 128-bit vectors. -4. For the last intrinsic, **`_mm_cmpgt_ps`**, follow a similar approach as before. Inside the intrinsics tree, start by expanding the **Comparison** folder. Navigate to the subfolder **Vector Compare Greater Than**, and since you are working with 32-bit floats, proceed to **Vector Compare Greater Than 32-bit floats**. The recommended choice is again the 128-bit variant [**`vcgtq_f32`**](https://simd.info/c_intrinsic/vcgtq_f32/). +3. For the third intrinsic, **`_mm_sqrt_ps`**, the easiest way to find the corresponding NEON intrinsic is by typing **"Square Root"** in the search bar on SIMD.info. 
From the [search results](https://simd.info/search?search=Square+Root&simd_engines=1&simd_engines=2&simd_engines=3&simd_engines=4&simd_engines=5), look for the float-specific version and select [**`vsqrtq_f32`**](https://simd.info/c_intrinsic/vsqrtq_f32/), which, like the others, works with 128-bit vectors. In the equivalents section about **SSE4.2**, you can see that **`_mm_sqrt_ps`** has its place as a direct match for this operation. -Now that you have found the **NEON** equivalents for each **SSE4.2** intrinsic, you're ready to begin porting the code. Understanding these equivalents is key to ensuring that the code produces the correct results in the calculations as you switch between SIMD engines. +4. For the last intrinsic, **`_mm_cmpgt_ps`**, follow a similar approach as before. Inside the intrinsics tree, start by expanding the **Comparison** folder. Navigate to the subfolder **Vector Compare Greater Than**, and as you are working with 32-bit floats, proceed to **Vector Compare Greater Than 32-bit floats**. The recommended choice is again the 128-bit variant [**`vcgtq_f32`**](https://simd.info/c_intrinsic/vcgtq_f32/). + +Now that you have found the NEON equivalents for each SSE4.2 intrinsic, you're ready to begin porting the code. Understanding these equivalents is key to ensuring that the code produces the correct results in the calculations as you switch between SIMD engines. diff --git a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-porting.md b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-porting.md index f0a2d3f5bb..c38523ce30 100644 --- a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-porting.md +++ b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-porting.md @@ -8,13 +8,15 @@ layout: learningpathall --- ### Step-by-Step Porting -1. Change the loading process to follow **NEON**'s method for initializing vectors. 
The **SSE4.2** intrinsic **`_mm_set_ps`** is in reality a macro, in **NEON** you can do the same thing with curly braces **`{}`** inititialization. -2. Next, you will replace the **SSE4.2** intrinsics with the **NEON** equivalents we identified earlier. The key is to ensure that the operations perform the same tasks, such as comparison, addition, multiplication, and square root calculations. -3. Finally, modify the storing process to match **NEON**’s way of moving data from vectors to memory. In **NEON**, you use functions like [**`vst1q_f32`**](https://simd.info/c_intrinsic/vst1q_f32/) for storing 128-bit floating-point vectors and [**`vst1q_u32`**](https://simd.info/c_intrinsic/vst1q_u32/) for storing 128-bit integer vectors. +Follow this step-by-step process to port the code: -After identifying the **NEON** intrinsics you will need in the ported program, it's time to actually write the code. +1. Change the loading process to follow NEON's method for initializing vectors. The SSE4.2 intrinsic **`_mm_set_ps`** is in reality a macro; in NEON you can do the same thing with curly braces **`{}`** initialization. +2. Next, replace the SSE4.2 intrinsics with the NEON equivalents that you identified earlier. The key is to ensure that the operations perform the same tasks, such as comparison, addition, multiplication, and square root calculations. +3. Finally, modify the storing process to match NEON’s way of moving data from vectors to memory. In NEON, you use functions like [**`vst1q_f32`**](https://simd.info/c_intrinsic/vst1q_f32/) for storing 128-bit floating-point vectors and [**`vst1q_u32`**](https://simd.info/c_intrinsic/vst1q_u32/) for storing 128-bit integer vectors. -This time on your Arm Linux machine, create a new file for the ported NEON code named `calculation_neon.c` with the contents shown below: +After identifying the NEON intrinsics that you require in the ported program, it's now time to write the code. 
+ +This time on your Arm Linux machine, create a new file for the ported NEON code named `calculation_neon.c`, populating with the contents as shown below: ```C #include @@ -66,7 +68,7 @@ int main() { ### Verifying the Ported Code -It's time to verify that the functionality remains the same, which means you get the same results and similar performance. +It's time to verify that the functionality remains the same, which means that you achieve the same results and similar performance. Compile the above code as follows on your Arm Linux machine: @@ -92,8 +94,8 @@ Multiplication Result: 2.00 12.00 36.00 80.00 Square Root Result: 1.41 3.46 6.00 8.94 ``` -You can see that the results are the same as in the **SSE4.2** example. +You can see that the results are the same as in the SSE4.2 example. {{% notice Note %}} -You initialized the vectors in reverse order compared to the **SSE4.2** version because the array initialization and vld1q_f32 function load vectors from LSB to MSB, whereas **`_mm_set_ps`** loads elements MSB to LSB. +You initialized the vectors in reverse order compared to the SSE4.2 version because the array initialization and vld1q_f32 function load vectors from LSB to MSB, whereas **`_mm_set_ps`** loads elements MSB to LSB. {{% /notice %}} diff --git a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1.md b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1.md index be115692d2..2e3eb72d9d 100644 --- a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1.md +++ b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1.md @@ -6,9 +6,9 @@ weight: 4 layout: learningpathall --- -Consider the following C example that uses Intel SSE4.2 intrinsics. +Have a look at the following C example that uses Intel SSE4.2 intrinsics. 
-On an x86_64 Linux development machine, create a file named `calculation_sse.c` with the contents shown below: +On an x86_64 Linux development machine, create a file named `calculation_sse.c`, populating it with the contents as shown below: ```C #include @@ -52,9 +52,15 @@ int main() { } ``` -The program first compares whether elements in one vector are greater than those in another vector, prints the result, and then proceeds to compute the addition of two vectors, multiplies the result with one of the vectors, and finally takes the square root of the multiplication result: +The program does the following: -Compile the code on your Linux x86_64 system that supports **SSE4.2**: +* Compares whether elements in one vector are greater than those in another vector. +* Prints the result. +* Computes the addition of two vectors. +* Multiplies the result with one of the vectors. +* Takes the square root of the multiplication result. + +Compile the code on your Linux x86_64 system that supports SSE4.2: ```bash gcc -O3 calculation_sse.c -o calculation_sse -msse4.2 @@ -78,4 +84,4 @@ Multiplication Result: 2.00 12.00 36.00 80.00 Square Root Result: 1.41 3.46 6.00 8.94 ``` -It is imperative that you run the code first on an Intel x86_64 reference platform, to make sure you understand how it works and what kind of results are being expected. +It is imperative that you run the code first on an Intel x86_64 reference platform, to make sure that you understand how it works and what kind of results you can expect. 
diff --git a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example2.md b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example2.md index 32793cf3c0..75349e6289 100644 --- a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example2.md +++ b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example2.md @@ -8,11 +8,13 @@ layout: learningpathall ### Handling intrinsics without direct equivalents -During the porting process, you will observe that certain instructions translate seamlessly. However, there are cases where direct equivalents for some intrinsics may not be readily available across architectures. For example, the [**`_mm_madd_epi16`**](https://simd.info/c_intrinsic/_mm_madd_epi16/) intrinsic from **SSE2**, which performs multiplication of 16-bit signed integer elements in a vector and then does a pairwise addition of adjacent elements increasing the element width, does not have a direct counterpart in **NEON**. However it can be emulated using another intrinsic. Similarly its 256 and 512-bit counterparts, [**`_mm256_madd_epi16`**](https://simd.info/c_intrinsic/_mm256_madd_epi16/) and [**`_mm512_madd_epi16`**](https://simd.info/c_intrinsic/_mm512_madd_epi16/) can be emulated by a sequence of instructions, but here you will see the 128-bit variant. +During the porting process, you can see that certain instructions translate seamlessly. However, there are cases where direct equivalents for some intrinsics might not be readily available across architectures. -You may already know the equivalent operations for this particular intrinsic, but let's assume you don't. In this usecase, reading the **`_mm_madd_epi16`** on the **SIMD.info** might indicate that a key characteristic of the instruction involved is the *widening* of the result elements, from 16-bit to 32-bit signed integers. 
Unfortunately, that is not the case, as this particular instruction does not actually increase the size of the element holding the result values. You will see how that effects the result in the example. +For example, the [**`_mm_madd_epi16`**](https://simd.info/c_intrinsic/_mm_madd_epi16/) intrinsic from SSE2, which performs multiplication of 16-bit signed integer elements in a vector and then does a pairwise addition of adjacent elements increasing the element width, does not have a direct counterpart in NEON. However, it can be emulated using another intrinsic. Similarly, its 256 and 512-bit counterparts, [**`_mm256_madd_epi16`**](https://simd.info/c_intrinsic/_mm256_madd_epi16/) and [**`_mm512_madd_epi16`**](https://simd.info/c_intrinsic/_mm512_madd_epi16/), can be emulated by a sequence of instructions, but here you will see the 128-bit variant. -Consider the following code for **SSE2**. Create a new file on your x86_64 Linux machine named `_mm_madd_epi16_test.c` with the contents shown below: +You might already know the equivalent operations for this particular intrinsic, but let's assume that you don't. In this particular use case, reading **`_mm_madd_epi16`** on **SIMD.info** might indicate that a key characteristic of the instruction involved is the widening of the result elements, from 16-bit to 32-bit signed integers. Unfortunately, this is not the case. This particular instruction does not increase the size of the element holding the result values. You will see how this affects the result in the example. + +Consider the following code for SSE2. Create a new file on your x86_64 Linux machine named `_mm_madd_epi16_test.c`, and populate it with the contents as shown below: ```C #include @@ -44,7 +46,7 @@ int main() { } ``` -Compile the code as follows on the x86_64 system (no extra flags required as **SSE2** is assumed by default on all 64-bit x86 systems): +Compile the code as follows on the x86_64 system. 
No extra flags are required as **SSE2** is assumed by default on all 64-bit x86 systems: ```bash gcc -O3 _mm_madd_epi16_test.c -o _mm_madd_epi16_test ``` @@ -61,11 +63,11 @@ b : a0 8c 78 64 50 3c 28 14 _mm_madd_epi16(a, b) : a4d8 0 56b8 0 2198 0 578 0 ``` -You will note that the result of the first element is a negative number, even though we added 2 positive results (`130*140` and `150*160`). That is because the result of the addition has to occupy a 16-bit signed integer element and when the first is larger we have the effect of an negative overflow. The result is the same in binary arithmetic, but when interpreted into a signed integer, it turns the number into a negative. +You will note that the result of the first element is a negative number, even though you added 2 positive results (`130*140` and `150*160`). This is because the result of the addition has to occupy a 16-bit signed integer element, and when the first is larger we have the effect of a negative overflow. The result is the same in binary arithmetic, but when interpreted into a signed integer, it turns the number into a negative. -The rest of the values are as expected. Notice how each pair has a zero element next to it. The results are correct, but they are not in the correct order. In this example, you used **`vmovl`** to zero-extend values, which achieves the correct order with zero elements in place. While both **`vmovl`** and **`zip`** could be used for this purpose, **`vmovl`** was chosen in this implementation. For more details, see the Arm Software Optimization Guides, such as the [Neoverse V2 guide](https://developer.arm.com/documentation/109898/latest/). +The rest of the values are as expected. Notice how each pair has a zero element next to it. The results are correct, but they are not in the correct order. In this example, you used **`vmovl`** to zero-extend values, which achieves the correct order with zero elements in place. 
While both **`vmovl`** and **`zip`** can be used for this purpose, **`vmovl`** was chosen in this implementation. For more details, see the Arm Software Optimization Guides, such as the [Neoverse V2 guide](https://developer.arm.com/documentation/109898/latest/). -Now switch your Linux Arm machine and create a file called `_mm_madd_epi16_neon.c` with the contents below: +Now switch to your Linux Arm machine and create a file called `_mm_madd_epi16_neon.c`, populating it with the contents below: ```C #include #include @@ -128,5 +130,5 @@ vpaddq_s16(a, b) : a4d8 56b8 2198 578 0 0 0 0 final : a4d8 0 56b8 0 2198 0 578 0 ``` -As you can see the results of both executions on different architectures match. You were able to use **SIMD.info** to help with the translation of complex intrinsics between different SIMD architectures. +As you can see, the results of both executions on different architectures match. You used SIMD.info to help with the translation of complex intrinsics between different SIMD architectures. diff --git a/content/learning-paths/embedded-systems/migration/6_run_evaluate.md b/content/learning-paths/embedded-systems/migration/6_run_evaluate.md index 8a2cfe7754..256a3ea14c 100644 --- a/content/learning-paths/embedded-systems/migration/6_run_evaluate.md +++ b/content/learning-paths/embedded-systems/migration/6_run_evaluate.md @@ -45,4 +45,4 @@ Emulation does not give a representative view of how efficiently the algorithms You have now ported an `x86_64` application to `aarch64`, built and run the ported application on `aarch64` using emulation, well done! -If you have access to Arm hardware, continue to the next section [Evaluating real hardware](/learning-paths/embedded-systems/migration/7_alternative). If you don't have access to Arm hardware you can jump straight to the [Review](/learning-paths/embedded-systems/migration/_review) and test your knowledge. 
\ No newline at end of file +Continue to the next section to evaluate application performance on Arm hardware. diff --git a/content/learning-paths/laptops-and-desktops/_index.md b/content/learning-paths/laptops-and-desktops/_index.md index 7f6fe66d59..857b0f965b 100644 --- a/content/learning-paths/laptops-and-desktops/_index.md +++ b/content/learning-paths/laptops-and-desktops/_index.md @@ -13,11 +13,11 @@ operatingsystems_filter: - ChromeOS: 1 - Linux: 29 - macOS: 7 -- Windows: 36 +- Windows: 37 subjects_filter: - CI-CD: 3 - Containers and Virtualization: 6 -- Migration to Arm: 25 +- Migration to Arm: 26 - Performance and Architecture: 20 subtitle: Create and migrate apps for power efficient performance title: Laptops and Desktops @@ -31,10 +31,10 @@ tools_software_languages_filter: - Automotive: 1 - C: 2 - C#: 5 -- C++: 2 +- C++: 3 - C/C++: 4 - CCA: 1 -- Clang: 9 +- Clang: 10 - CMake: 2 - Coding: 19 - CSS: 1 @@ -56,6 +56,7 @@ tools_software_languages_filter: - Neon: 1 - Neovim: 1 - Node.js: 3 +- OpenCV: 1 - perf: 2 - Python: 2 - Qt: 2 @@ -65,7 +66,7 @@ tools_software_languages_filter: - SVE: 1 - SVE2: 1 - Trusted Firmware: 1 -- Visual Studio: 9 +- Visual Studio: 10 - Visual Studio Code: 9 - VS Code: 2 - Windows Forms: 1 diff --git a/content/learning-paths/laptops-and-desktops/intro/find-hardware.md b/content/learning-paths/laptops-and-desktops/intro/find-hardware.md index b1824897e3..88b3d7eb93 100644 --- a/content/learning-paths/laptops-and-desktops/intro/find-hardware.md +++ b/content/learning-paths/laptops-and-desktops/intro/find-hardware.md @@ -21,10 +21,12 @@ Windows on Arm laptops can also be used for software development. ### ChromeOS -Chromebooks with Arm processors can also be used for software development. 
The Lenovo [Duet 3](https://www.lenovo.com/us/en/p/laptops/lenovo/lenovo-edu-chromebooks/ideapad-duet-3-chromebook-(11-inch,-qlc)/len101i0034) and [Duet 5](https://www.lenovo.com/us/en/p/laptops/lenovo/lenovo-edu-chromebooks/ideapad-duet-5-chromebook/len101i0023) are popular detachable Chromebooks. +Chromebooks with Arm processors can also be used for software development. The Lenovo [Duet Gen 9](https://www.lenovo.com/us/en/p/laptops/lenovo/lenovo-edu-chromebooks/lenovo-chromebook-duet-gen-9-11-inch-mediatek/83hh0000us) is a popular detachable Chromebook. ### Linux +The System76 [Thelio Astra](https://www.system76.com/arm/) is a powerful Arm-based desktop computer designed by System76 and ships with Ubuntu installed. + Both Windows and ChromeOS have a Linux subsystem which can be used for software development tasks. Read [WSL for Windows on Arm](/learning-paths/laptops-and-desktops/wsl2) to learn more. Some single board computers have enough performance to implement Linux based desktop computers. The [Pinebook Pro](https://pine64.org/devices/pinebook_pro/) and the [Raspberry Pi 400](https://www.raspberrypi.com/products/raspberry-pi-400/) are examples. diff --git a/content/learning-paths/laptops-and-desktops/llvm_putty/putty_llvm.md b/content/learning-paths/laptops-and-desktops/llvm_putty/putty_llvm.md index 62a8f46aae..57caf69c84 100644 --- a/content/learning-paths/laptops-and-desktops/llvm_putty/putty_llvm.md +++ b/content/learning-paths/laptops-and-desktops/llvm_putty/putty_llvm.md @@ -27,9 +27,9 @@ You can use Clang to build a CMake application for Windows on Arm. The example a To compile PuTTY on a Windows on Arm device, follow the steps outlined below: -1. Open `Windows Powershell` on your Windows on Arm computer. +1. Open `Windows PowerShell` on your Windows on Arm computer. -Run the next two steps at the Powershell command prompt. +Run the next two steps at the PowerShell command prompt. 2. 
Create a directory to use for the build and go to the directory: diff --git a/content/learning-paths/laptops-and-desktops/win-opencv/1-opencv-compilers.md b/content/learning-paths/laptops-and-desktops/win-opencv/1-opencv-compilers.md new file mode 100644 index 0000000000..ae3c8e282c --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/win-opencv/1-opencv-compilers.md @@ -0,0 +1,80 @@ +--- +title: OpenCV and Compilers for Windows on Arm +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## What is OpenCV? + +OpenCV (Open Source Computer Vision Library) is a popular, open-source library that developers use to build computer vision applications. It provides a set of tools and functions that help you handle tasks related to images and videos without needing to write everything from scratch. + +Here’s what developers should know: + +* __Ease of Use__: OpenCV comes with pre-built functions for common tasks like reading, displaying, and processing images and videos. This saves time compared to writing algorithms from the ground up. + +* __Image Processing__: You can perform operations like changing colors, applying filters, resizing, rotating, and other transformations to images with minimal code. + +* __Video Handling__: Developers can use OpenCV to capture, modify, and analyze video frames, making it ideal for creating applications like video surveillance or video editing tools. + +* __Computer Vision Algorithms__: OpenCV includes built-in algorithms for complex tasks like object detection (e.g., face and eye recognition), edge detection, and image segmentation. + +* __Machine Learning__: It includes modules for training models using basic machine learning algorithms, which can be applied for pattern recognition and data analysis in visual data. + +* __Community and Resources__: Being open-source and widely adopted, there is a large community of developers contributing to and supporting OpenCV. 
This makes it easier to find tutorials, documentation, and answers to questions. + + +## Which compilers are available for Windows on Arm Development? + +MSVC (Microsoft Visual C++) and Clang are options for developers building Windows on Arm applications. + +* __MSVC__: A compiler developed by Microsoft that’s part of the Visual Studio IDE. It’s designed specifically for Windows and integrates well with the Windows development ecosystem. + +* __Clang__: An open-source compiler that is part of the LLVM project. It’s known for its modern design and cross-platform capabilities. + +MSVC is ideal for Windows-focused projects needing seamless integration with Visual Studio. Clang is ideal for cross-platform projects or when using modern C++ features. + +## Before you begin + +Any Windows on Arm machine which has the required tools installed can be used for this Learning Path. You will learn how to build OpenCV using both MSVC and Clang. + +Please install the following tools required for both methods. + +* [CMake](/install-guides/cmake) + +{{% notice Note %}} +The instructions were tested with the version 3.28.1 +{{% /notice %}} + +* [Git](https://git-scm.com/downloads/win) + +{{% notice Note %}} +There is currently no Arm version of Git. Install the 64-bit x86 version. +{{% /notice %}} + +Follow the link to install the required tools for a method using MSVC. + +* [Visual Studio 2022 or higher](/install-guides/vs-woa). + +{{% notice Note %}} +The instructions were tested with Visual Studio 2022. +{{% /notice %}} + +To build using Clang, please install the following. + +* [LLVM](/install-guides/llvm-woa/) + +{{% notice Note %}} +The instructions were tested with the version 18.1.8. +{{% /notice %}} + +* [Ninja]( https://github.com/ninja-build/ninja/releases) + +{{% notice Note %}} +The instructions were tested with version 1.11.1 +{{% /notice %}} + +Make sure LLVM Clang and Ninja are in your search path. 
If they are not, you can use Windows Control Panel to set the PATH environment variable. + +You now have the required development tools installed. Please proceed to the page for the compiler you want to build with. diff --git a/content/learning-paths/laptops-and-desktops/win-opencv/2-1-build-msvc.md b/content/learning-paths/laptops-and-desktops/win-opencv/2-1-build-msvc.md new file mode 100644 index 0000000000..a749ff9aa1 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/win-opencv/2-1-build-msvc.md @@ -0,0 +1,249 @@ +--- +title: Build OpenCV Applications with MSVC +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## How do I build OpenCV with MSVC? + +Follow the steps below to build OpenCV and a test application using the library with the MSVC compiler. + +### Clone the OpenCV repository + +Open up a Windows PowerShell and checkout the source tree: + +```bash +git clone https://github.com/opencv/opencv +cd opencv +git checkout tags/4.10.0 +``` + +{{% notice Note %}} +You might be able to use a later version, but these steps have been tested with the version 4.10.0. +{{% /notice %}} + +### Pre-build configuration + +You can use CMake from the command line. + +First, run the following command to run the pre-build configuration. + +```bash +mkdir build_msvc +cd build_msvc + +cmake ` +-S .. ` +-B . ` +-G "Visual Studio 17 2022" ` +-DCMAKE_BUILD_TYPE=Release ` +-DBUILD_opencv_world=ON ` +-DWITH_ITT=OFF ` +-DWITH_OPENCL=OFF ` +-DWITH_OPENCLAMDBLAS=OFF ` +-DWITH_OPENCLAMDFFT=OFF ` +-DWITH_OPENCL_D3D11_NV=OFF ` +-DWITH_DIRECTML=OFF ` +-DWITH_DIRECTX=OFF ` +-DWITH_ADE=OFF ` +-DWITH_CAROTENE=OFF +``` + +The given options specify the following: +- The source code is located one level above the current directory. +- The build will be performed in the current directory. +- The Visual Studio 2022 MSVC compiler will be used as the compiler. +- The compiled library is generated as a single file that includes all of OpenCV's functionality. 
+- Unnecessary options have been disabled, assuming processing on Arm CPUs. + +  + +If the configuration is successful, a message similar to the following should be displayed at the end of the execution: + +```output +-- General configuration for OpenCV 4.10.0 ===================================== +-- Version control: 4.10.0 +-- +-- Platform: +-- Timestamp: 2024-11-08T08:50:24Z +-- Host: Windows 10.0.22631 ARM64 +-- CMake: 3.28.1 +-- CMake generator: Visual Studio 17 2022 +-- CMake build tool: C:/Program Files/Microsoft Visual Studio/2022/Professional/MSBuild/Current/Bin/arm64/MSBuild.exe +-- MSVC: 1941 +-- Configuration: Debug Release +-- +-- CPU/HW features: +-- Baseline: NEON +-- requested: NEON FP16 +-- Dispatched code generation: NEON_DOTPROD NEON_FP16 +-- requested: NEON_FP16 NEON_BF16 NEON_DOTPROD +-- NEON_DOTPROD (1 files): + NEON_DOTPROD +-- NEON_FP16 (2 files): + NEON_FP16 +-- +-- C/C++: +-- Built as dynamic libs?: YES +-- C++ standard: 11 +-- C++ Compiler: C:/Program Files/Microsoft Visual Studio/2022/Professional/VC/Tools/MSVC/14.41.34120/bin/Hostarm64/arm64/cl.exe (ver 19.41.34123.0) +[...] +-- C Compiler: C:/Program Files/Microsoft Visual Studio/2022/Professional/VC/Tools/MSVC/14.41.34120/bin/Hostarm64/arm64/cl.exe +[...] +-- Install to: C:/Users/username/work/opencv/build_msvc/install +-- ----------------------------------------------------------------- +-- +-- Configuring done (97.5s) +-- Generating done (2.8s) +-- Build files have been written to: C:/Users/username/work/opencv/build_msvc +``` + +### Build and install + +Run the following commands to build and install OpenCV: + +```bash +cmake --build . --config Release +cmake --build . 
--target INSTALL --config Release +``` + +{{% notice Note %}} +The build takes approximately 25 mins on a Lenovo X13s +{{% /notice %}} + +  + +When the build and the install steps are complete, confirm the shared library has been created by inspecting the results in the `install/bin` directory: + +```bash { output_lines = "2-12" } +ls ./install/x64/vc17/bin + Directory: C:\Users\username\work\opencv\build_msvc\install\x64\vc17\bin +Mode LastWriteTime Length Name +---- ------------- ------ ---- +-a---- 08/11/2024 09:03 43008 opencv_annotation.exe +-a---- 08/11/2024 09:03 143872 opencv_interactive-calibration.exe +-a---- 08/11/2024 09:03 41984 opencv_model_diagnostics.exe +-a---- 08/11/2024 09:12 36864 opencv_version.exe +-a---- 08/11/2024 09:12 35328 opencv_version_win32.exe +-a---- 08/11/2024 08:50 26391552 opencv_videoio_ffmpeg4100_64.dll +-a---- 08/11/2024 09:12 56320 opencv_visualisation.exe +-a---- 08/11/2024 09:03 27179008 opencv_world4100.dll +``` + +Also inspect the `install/lib` directory: + +```bash { output_lines = "2-9" } +ls ./install/x64/vc17/lib + Directory: C:\Users\username\work\opencv\build_msvc\install\x64\vc17\lib +Mode LastWriteTime Length Name +---- ------------- ------ ---- +-a---- 08/11/2024 08:50 434 OpenCVConfig-version.cmake +-a---- 08/11/2024 08:50 15260 OpenCVConfig.cmake +-a---- 08/11/2024 08:50 972 OpenCVModules-release.cmake +-a---- 08/11/2024 08:50 3879 OpenCVModules.cmake +-a---- 08/11/2024 09:02 2849862 opencv_world4100.lib +``` + +  + +The library used in your application is `opencv_world.lib/dll`. + +Once the library files are correctly generated, run the following command to ensure there are no errors. + +```bash { output_lines = "2" } +./install/x64/vc17/bin/opencv_version.exe +4.10.0 +``` + +{{% notice Note %}} +The genereated directory name contains "x64," but there is no need to worry as the libraries and executable files will definitely run as ARM64. 
+{{% /notice %}} + +  + +## Build OpenCV Applications + +Once the OpenCV library has been successfully created, the next step is to link it to a simple application and try using it. + +### Create a new project in Visual Studio + +First, create a new project in Visual Studio. + +Launch Visual Studio, click `Create a new project` on the initial screen, then select `Empty Project` and click `Next`. + +On the next screen, set the `Project name` and `Location`. You can choose any name and location, but for this example, name the project `TestOpenCV`, as shown below. + +Click `Create` to generate the new project. + +![MSVC project #center](msvc_project.png "Create a new project") + +### Add source code + + In `Solution Explorer`, right-click the `Source Files` folder, select `Add`, and then `New Item...`. Create a file named `test_opencv.cpp`. + +![MSVC add file #center](msvc_add_file.png "Add a source file") + +  + +Once the file is created, it will open in the editor. + +Copy and paste the following program into it and save the file. + +```cpp +#include +#include +int main() { + cv::Mat image = cv::Mat::zeros(100, 100, CV_8UC3); + if (image.empty()) { + std::cout << "Failed to create an image!" << std::endl; + return -1; + } + cv::circle(image, cv::Point(50, 50), 30, cv::Scalar(255, 0, 0), -1); + cv::imwrite("test_image.png", image); + cv::waitKey(0); + return 0; +} +``` + +This program is a simple example that uses OpenCV to create a 100x100 black image, draw a blue circle on it, and save it as a file. + +### Configure build settings + +Next, select the `Configuration` dropdown menu in the center of the screen and change it from `Debug` to `Release`. At this stage, your screen should look like the example shown below. + +![MSVC screenshot #center](msvc_screen.png "MSVC screenshot") + +  + +Now, set up the compile and link settings. Select `Project` from the top menu and click on `TestOpenCV properties`. 
Edit `Include directories`, `Library directories`, and `Additional dependencies` as shown in the images below, and then click OK. + +![MSVC include dir #center](msvc_include_dir.png "Include directories: Specify the directory containing the OpenCV header files.") + +  + +![MSVC link dir #center](msvc_link_dir.png "Library directories: Specify the directory where the libraries for linking are located.") + +  + +![MSVC link lib #center](msvc_link_lib.png "Additional dependencies: Specify the names of the libraries to link") + +  + +Finally, ensure that the directory containing the dynamic libraries (DLLs) is added to the `PATH` environment variable. Set this in the Windows system settings. After setting the environment variable, restart Visual Studio to apply the changes. + +![path setting #center](set_path.png "Set the DLL dir to the PATH environment variable") + +### Build the application + +You are now ready to build the application. + +From the top menu, select `Debug` and click `Start Without Debugging` or press `Ctrl` + `F5`. + +If a console window appears showing that the program exited with code 0 and `test_image.png` is generated in the top-level directory of your Visual Studio project, you have succeeded. + +Open the image file, it should look like the example shown below. + +![test_image pic](test_image.png "test_image.png") + +Congratulations! You are now ready to create your own OpenCV applications using MSVC. + diff --git a/content/learning-paths/laptops-and-desktops/win-opencv/2-2-build-clang.md b/content/learning-paths/laptops-and-desktops/win-opencv/2-2-build-clang.md new file mode 100644 index 0000000000..f3311710da --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/win-opencv/2-2-build-clang.md @@ -0,0 +1,225 @@ +--- +title: Build OpenCV Applications with Clang +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## How do I build OpenCV with Clang? 
+ +Follow the steps below to build OpenCV and a test application using the library with the Clang compiler. + +### Clone the OpenCV repository + +Open up a Windows PowerShell and checkout the source tree: + +```bash +git clone https://github.com/opencv/opencv +cd opencv +git checkout tags/4.10.0 +``` + +{{% notice Note %}} +You might be able to use a later version, but these steps have been tested with the version 4.10.0. +{{% /notice %}} + +### Pre-build configuration + +You can use CMake from the command line. + +First, run the following command to run the pre-build configuration. + +```bash +mkdir build_clang +cd build_clang + +cmake ` +-S .. ` +-B . ` +-G "Ninja" ` +-DCMAKE_C_COMPILER=clang ` +-DCMAKE_CXX_COMPILER=clang++ ` +-DCMAKE_BUILD_TYPE=Release ` +-DBUILD_opencv_world=ON ` +-DWITH_ITT=OFF ` +-DWITH_OPENCL=OFF ` +-DWITH_OPENCLAMDBLAS=OFF ` +-DWITH_OPENCLAMDFFT=OFF ` +-DWITH_OPENCL_D3D11_NV=OFF ` +-DWITH_DIRECTML=OFF ` +-DWITH_DIRECTX=OFF ` +-DWITH_ADE=OFF ` +-DWITH_CAROTENE=OFF +``` + +The given options specify the following: +- The source code is located one level above the current directory. +- The build will be performed in the current directory. +- The Clang compiler will be used as the compiler. +- The compiled library is generated as a single file that includes all of OpenCV's functionality. +- Unnecessary options have been disabled, assuming processing on Arm CPUs. 
+ +  + +If the configuration is successful, a message similar to the following should be displayed at the end of the execution: + +``` +-- General configuration for OpenCV 4.10.0 ===================================== +-- Version control: 4.10.0 +-- +-- Platform: +-- Timestamp: 2024-11-08T09:23:57Z +-- Host: Windows 10.0.22631 ARM64 +-- CMake: 3.28.1 +-- CMake generator: Ninja +-- CMake build tool: C:/Users/username/work/venv/Scripts/ninja.exe +-- Configuration: Release +-- +-- CPU/HW features: +-- Baseline: NEON +-- requested: NEON FP16 +-- Dispatched code generation: NEON_DOTPROD NEON_FP16 NEON_BF16 +-- requested: NEON_FP16 NEON_BF16 NEON_DOTPROD +-- NEON_DOTPROD (1 files): + NEON_DOTPROD +-- NEON_FP16 (2 files): + NEON_FP16 +-- NEON_BF16 (0 files): + NEON_BF16 +-- +-- C/C++: +-- Built as dynamic libs?: YES +-- C++ standard: 11 +-- C++ Compiler: C:/Program Files/LLVM/bin/clang++.exe (ver 18.1.8) +[...] +-- C Compiler: C:/Program Files/LLVM/bin/clang.exe +[...] +-- Install to: C:/Users/username/work/opencv/build_clang/install +-- ----------------------------------------------------------------- +-- +-- Configuring done (244.5s) +-- Generating done (1.4s) +-- Build files have been written to: C:/Users/username/work/opencv/build_clang +``` + +### Build and install + +Run the following commands to build and install OpenCV: + +```bash +ninja +ninja install +``` + +{{% notice Note %}} +The build takes approximately 25 mins on a Lenovo X13s +{{% /notice %}} + +  + +When the build and the install steps are complete, confirm the shared library has been created by inspecting the results in the `install/bin` directory: + +```bash { output_lines = "2-11" } +ls ./install/bin +Mode LastWriteTime Length Name +---- ------------- ------ ---- +-a---- 08/11/2024 09:51 40448 opencv_annotation.exe +-a---- 08/11/2024 09:51 126464 opencv_interactive-calibration.exe +-a---- 08/11/2024 09:51 40448 opencv_model_diagnostics.exe +-a---- 08/11/2024 09:51 38400 opencv_version.exe +-a---- 
08/11/2024 09:51 35840 opencv_version_win32.exe +-a---- 08/11/2024 09:23 26391552 opencv_videoio_ffmpeg4100_64.dll +-a---- 08/11/2024 09:51 51712 opencv_visualisation.exe +-a---- 08/11/2024 09:50 20207104 opencv_world4100.dll +``` + +Also inspect the `install/lib` directory: + +```bash { output_lines = "2-9" } +ls ./install/lib + Directory: C:\Users\username\work\opencv\build_clang\install\lib +Mode LastWriteTime Length Name +---- ------------- ------ ---- +-a---- 08/11/2024 09:23 434 OpenCVConfig-version.cmake +-a---- 08/11/2024 09:23 15254 OpenCVConfig.cmake +-a---- 08/11/2024 09:23 936 OpenCVModules-release.cmake +-a---- 08/11/2024 09:23 3749 OpenCVModules.cmake +-a---- 08/11/2024 09:50 2862548 opencv_world4100.lib +``` + +  + +The library used in your application is `opencv_world.lib/dll`. + +Once the library files are correctly generated, run the following command to ensure there are no errors. + +```bash { output_lines = "2" } +./install/bin/opencv_version.exe +4.10.0 +``` + +  + +## Build OpenCV Applications + +Once the OpenCV library has been successfully created, you can create a simple application and try using it. + +### Prepare a application program + +First, use a text editor to save the following C++ program as `test_opencv.cpp` in the `build_clang` directory. + +```cpp +#include +#include +int main() { + cv::Mat image = cv::Mat::zeros(100, 100, CV_8UC3); + if (image.empty()) { + std::cout << "Failed to create an image!" << std::endl; + return -1; + } + cv::circle(image, cv::Point(50, 50), 30, cv::Scalar(255, 0, 0), -1); + cv::imwrite("test_image.png", image); + cv::waitKey(0); + return 0; +} +``` + +This program is a simple example that uses OpenCV to create a 100x100 black image, draw a blue circle on it, and save it as a file. 
+ +### Compile the program + +Compile the code using the command below: + +```bash +clang++ .\test_opencv.cpp -o test_opencv.exe -I.\install\include -L.\install\lib -lopencv_world4100 +``` + +The given options specify the following: +* __`-o`__: Specifies the name of the generated executable file. +* __`-I`__: Indicates the directory where the OpenCV header files to be included are located. +* __`-L`__: Specifies the directory where the libraries for linking are located. +* __`-l`__: Specifies the name of the library to link. When linking `opencv_world4100.lib`, omit the `.lib` extension and specify it as `-lopencv_world4100`. + +### Run the program + +To run the executable, you need to ensure that the directory containing the dynamic libraries (DLLs) is added to the `PATH` environment variable, or place the DLLs in the same location as the executable. + +{{% notice Note %}} +The command below adds the DLL directory to the beginning of the `PATH` environment variable. Since this is a temporary setting, the `PATH` will revert to its original state when the PowerShell session is closed. To set it permanently, you need to use the Windows system settings or the `[Environment]::SetEnvironmentVariable()` method. + +```bash +$env:PATH = "./install/bin;" + $env:PATH +``` +{{% /notice %}} + +Run the test program: + +```bash +.\test_opencv.exe +``` + +When you execute the command, it will finish quickly, and `test_image.png` is generated. If you don't have the DLL directory in your search path, the program appears to run, but no `test_image.png` is generated. + +Open the image file, it should look like the example shown below. + +![test_image pic](test_image.png "test_image.png") + +Congratulations! You are now ready to create your own OpenCV applications using Clang. 
diff --git a/content/learning-paths/laptops-and-desktops/win-opencv/_index.md b/content/learning-paths/laptops-and-desktops/win-opencv/_index.md new file mode 100644 index 0000000000..939f756cfa --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/win-opencv/_index.md @@ -0,0 +1,38 @@ +--- +title: Build OpenCV applications on Windows on Arm +draft: true +cascade: + draft: true +minutes_to_complete: 90 + +who_is_this_for: This is an advanced topic for software developers who want to use OpenCV with Windows on Arm devices. + +learning_objectives: + - Build the OpenCV library for Windows on Arm devices. + - Develop applications using OpenCV. + +prerequisites: + - A Windows on Arm computer such as Lenovo Thinkpad X13s or an [Azure virtual machine](/learning-paths/cross-platform/woa_azure/). + +author_primary: Koki Mitsunami + +### Tags +skilllevels: Introductory +subjects: Migration to Arm +armips: + - Cortex-A +tools_software_languages: + - Visual Studio + - Clang + - OpenCV + - C++ +operatingsystems: + - Windows + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/laptops-and-desktops/win-opencv/_next-steps.md b/content/learning-paths/laptops-and-desktops/win-opencv/_next-steps.md new file mode 100644 index 0000000000..e12b500d53 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/win-opencv/_next-steps.md @@ -0,0 +1,27 @@ +--- +next_step_guidance: You have successfully built OpenCV on Windows on Arm using MSVC/Clang. You might be interested in how to create Windows Forms applications on Windows on Arm. 
+ +recommended_path: /learning-paths/laptops-and-desktops/win_forms/ + +further_reading: + - resource: + title: OpenCV website + link: https://opencv.org/ + type: website + - resource: + title: Arm Kleidi Libraries + link: https://www.arm.com/products/development-tools/embedded-and-software/kleidi-libraries + type: website + - resource: + title: Evolution of SIMD architecture with SVE2 + link: https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/sve2 + type: blog + + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +weight: 21 # set to always be larger than the content in this path, and one more than 'review' +title: "Next Steps" # Always the same +layout: "learningpathall" # All files under learning paths have this same wrapper +--- diff --git a/content/learning-paths/laptops-and-desktops/win-opencv/_review.md b/content/learning-paths/laptops-and-desktops/win-opencv/_review.md new file mode 100644 index 0000000000..6db88cd2cc --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/win-opencv/_review.md @@ -0,0 +1,42 @@ +--- +review: + - questions: + question: > + What is OpenCV? + answers: + - OpenCV is a game development software that helps create 3D video games quickly. + - OpenCV is a library that helps computers see and work with images and videos. + - OpenCV is a tool used for creating music and sound effects for apps. + correct_answer: 2 + explanation: > + OpenCV is a powerful open-source library that helps computers process and understand images and videos. It is used in tasks like detecting objects and editing images. + + - questions: + question: > + MSVC is a Microsoft compiler used for building C/C++ applications, mainly on Windows. 
+ answers: + - "True" + - "False" + correct_answer: 1 + explanation: > + MSVC (Microsoft Visual C++) is a compiler provided by Microsoft that is part of Visual Studio. It is widely used for building and compiling C/C++ programs on Windows, offering good integration with Windows libraries and debugging tools. + + - questions: + question: > + Clang is a compiler that is part of the LLVM project, known for its cross-platform support. + answers: + - "True" + - "False" + correct_answer: 1 + explanation: > + Clang is an open-source compiler that is part of the LLVM project. It is known for its support for C/C++ and other languages, cross-platform capabilities, and clear error diagnostics, making it popular for modern development needs. + + + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +title: "Review" # Always the same title +weight: 20 # Set to always be larger than the content in this path +layout: "learningpathall" # All files under learning paths have this same wrapper +--- diff --git a/content/learning-paths/laptops-and-desktops/win-opencv/msvc_add_file.png b/content/learning-paths/laptops-and-desktops/win-opencv/msvc_add_file.png new file mode 100644 index 0000000000..0301fff9a0 Binary files /dev/null and b/content/learning-paths/laptops-and-desktops/win-opencv/msvc_add_file.png differ diff --git a/content/learning-paths/laptops-and-desktops/win-opencv/msvc_include_dir.png b/content/learning-paths/laptops-and-desktops/win-opencv/msvc_include_dir.png new file mode 100644 index 0000000000..8211134074 Binary files /dev/null and b/content/learning-paths/laptops-and-desktops/win-opencv/msvc_include_dir.png differ diff --git a/content/learning-paths/laptops-and-desktops/win-opencv/msvc_link_dir.png b/content/learning-paths/laptops-and-desktops/win-opencv/msvc_link_dir.png new file mode 100644 index 0000000000..0bfe5953b2 
Binary files /dev/null and b/content/learning-paths/laptops-and-desktops/win-opencv/msvc_link_dir.png differ diff --git a/content/learning-paths/laptops-and-desktops/win-opencv/msvc_link_lib.png b/content/learning-paths/laptops-and-desktops/win-opencv/msvc_link_lib.png new file mode 100644 index 0000000000..2487da5247 Binary files /dev/null and b/content/learning-paths/laptops-and-desktops/win-opencv/msvc_link_lib.png differ diff --git a/content/learning-paths/laptops-and-desktops/win-opencv/msvc_project.png b/content/learning-paths/laptops-and-desktops/win-opencv/msvc_project.png new file mode 100644 index 0000000000..91bfedcc3d Binary files /dev/null and b/content/learning-paths/laptops-and-desktops/win-opencv/msvc_project.png differ diff --git a/content/learning-paths/laptops-and-desktops/win-opencv/msvc_screen.png b/content/learning-paths/laptops-and-desktops/win-opencv/msvc_screen.png new file mode 100644 index 0000000000..9ce520802b Binary files /dev/null and b/content/learning-paths/laptops-and-desktops/win-opencv/msvc_screen.png differ diff --git a/content/learning-paths/laptops-and-desktops/win-opencv/set_path.png b/content/learning-paths/laptops-and-desktops/win-opencv/set_path.png new file mode 100644 index 0000000000..b6e2d62ab4 Binary files /dev/null and b/content/learning-paths/laptops-and-desktops/win-opencv/set_path.png differ diff --git a/content/learning-paths/laptops-and-desktops/win-opencv/test_image.png b/content/learning-paths/laptops-and-desktops/win-opencv/test_image.png new file mode 100644 index 0000000000..abfca30f56 Binary files /dev/null and b/content/learning-paths/laptops-and-desktops/win-opencv/test_image.png differ diff --git a/content/learning-paths/laptops-and-desktops/win_sandbox_dot_net_cicd/win_sandbox_net.md b/content/learning-paths/laptops-and-desktops/win_sandbox_dot_net_cicd/win_sandbox_net.md index 0ab8e7f935..4b66c37413 100644 --- a/content/learning-paths/laptops-and-desktops/win_sandbox_dot_net_cicd/win_sandbox_net.md 
+++ b/content/learning-paths/laptops-and-desktops/win_sandbox_dot_net_cicd/win_sandbox_net.md @@ -50,7 +50,7 @@ You will see instructions displayed on this page. These are commands you now nee On your host machine, click on the Search bar and enter **Windows Sandbox**. -In your running Windows Sandbox, open up Powershell. You will now copy and paste all the commands into Powershell. +In your running Windows Sandbox, open up PowerShell. You will now copy and paste all the commands into PowerShell. The commands are also shown here for your convenience. To download the runner package: diff --git a/content/learning-paths/laptops-and-desktops/windowsperf_wpa_plugin/_index.md b/content/learning-paths/laptops-and-desktops/windowsperf_wpa_plugin/_index.md index 0f33fff372..1518a9cfce 100644 --- a/content/learning-paths/laptops-and-desktops/windowsperf_wpa_plugin/_index.md +++ b/content/learning-paths/laptops-and-desktops/windowsperf_wpa_plugin/_index.md @@ -1,10 +1,6 @@ --- title: Get started with the Windows Performance Analyzer (WPA) plugin for WindowsPerf -draft: true -cascade: - draft: true - minutes_to_complete: 15 who_is_this_for: This is an introductory topic for software developers interested in using the Windows Performance Analyzer (WPA) plugin for performance analysis. 
diff --git a/content/learning-paths/laptops-and-desktops/windowsperf_wpa_plugin/windowsperf_wpa_plugin.md b/content/learning-paths/laptops-and-desktops/windowsperf_wpa_plugin/windowsperf_wpa_plugin.md index 2f1f1a04e7..abd47aaa51 100644 --- a/content/learning-paths/laptops-and-desktops/windowsperf_wpa_plugin/windowsperf_wpa_plugin.md +++ b/content/learning-paths/laptops-and-desktops/windowsperf_wpa_plugin/windowsperf_wpa_plugin.md @@ -26,8 +26,9 @@ You can save a `.json` output from WindowsPerf by using the `--output` command f To create a file named `example.json`, run the following command: ```console - wperf stat -e ld_spec --output example.json + wperf stat -e ld_spec --timeout 5 --json --output example.json ``` + Note: This command won't finish automatically. You can use Ctrl+C to terminate it after it has run for 5-10 seconds. 2. Open Windows Performance Analyzer, and see the following window: @@ -54,9 +55,11 @@ You can use WPA to visualize PMU events in the recorded data. To try the timeline feature, run the command: ```command -wperf stat -m dcache -c 0,1,2,3,4,5,6,7 -t -i 0 -n 50 --json +wperf stat -m dcache -c 0,1,2,3,4,5,6,7 -t -i 0 -n 10 --timeout 2 --json --output example2.json ``` +Note: the above command will run for ~20 seconds. + Open the generated output (`.json` file) in WPA to see the graph: ![timeline-by-core #center](figures/timeline-by-core.png) @@ -69,12 +72,14 @@ The WPA plugin also generates a graph per event note in order to provide a more To see all the generated graphs you can expand the `Counting timeline` section in the graph explorer section of WPA. -Run another `wperf` command with different options: +Before you run another `wperf` command with different options, you can use `wperf -list` to find out the supported metrics on your machine.
```console -wperf stat -t -i 0 -m imix,l1d_cache_miss_ratio,l1d_cache_mpki,l1d_tlb_miss_ratio,l1d_tlb_mpki -e inst_spec,vfp_spec,ld_spec,st_spec -c 1 --json +wperf stat -t -i 0 -n 10 -m imix,l1d_cache_miss_ratio,l1d_cache_mpki,l1d_tlb_miss_ratio,l1d_tlb_mpki -e inst_spec,vfp_spec,ld_spec,st_spec -c 1 --timeout 2 --json --output example3.json ``` +Note: the above command will run for ~20 seconds. + The graph after opening the `.json` file is shown below: ![timeline-events-by-key #center](figures/timeline-events-by-key.png) @@ -88,9 +93,11 @@ The WPA Plugin also provides visualization of [Arm telemetry metrics](https://de To visualize telemetry, run the following command: ```console -wperf stat -t -i 0 -m imix,l1d_cache_miss_ratio,l1d_cache_mpki,l1d_tlb_miss_ratio,l1d_tlb_mpki -e inst_spec,vfp_spec,ld_spec,st_spec -c 1 --json +wperf stat -t -i 0 -n 10 -m imix,l1d_cache_miss_ratio,l1d_cache_mpki,l1d_tlb_miss_ratio,l1d_tlb_mpki -e inst_spec,vfp_spec,ld_spec,st_spec -c 1 --timeout 2 --json --output example4.json ``` +Note: the above command will run for ~20 seconds. + You can also see the telemetry timeline graphs under the graph explorer level in WPA. These graphs are generated dynamically so only the relevant metrics for the given `.json` output file are visible. @@ -101,4 +108,4 @@ Once expanded, a more in-depth view is visible under the Analysis tab of WPA. ![telemetry-table #center](figures/telemetry-table.png) -You now have a basic understanding of how to use `wperf` generated data in the Windows Performance Analyzer. \ No newline at end of file +You now have a basic understanding of how to use `wperf` generated data in the Windows Performance Analyzer.
diff --git a/content/learning-paths/laptops-and-desktops/wsl2/setup.md b/content/learning-paths/laptops-and-desktops/wsl2/setup.md index 7a3253d514..f1a96fb493 100644 --- a/content/learning-paths/laptops-and-desktops/wsl2/setup.md +++ b/content/learning-paths/laptops-and-desktops/wsl2/setup.md @@ -25,6 +25,7 @@ WSL 2 replaces the system call translation layer provided in WSL 1 with the late WSL 2 can run containers for application development. WSL 2 provides much faster file I/O compared to WSL 1. ## Install WSL 2 + Installing WSL 2 requires Windows 11. It is also possible to install WSL 2 on certain Windows 10 versions, but these instructions were tested on Windows 11. Windows 11 is recommended to complete all of the examples in this Learning Path. All of the examples have been tested using WSL 2. If only WSL is specified it means WSL 2. Here is the short version on how to install WSL 2. Microsoft documentation provides a [quickstart](https://docs.microsoft.com/en-us/windows/wsl/install-win10) with full details on how to install WSL 2. There are also numerous tutorials available (for non-Arm architectures). @@ -64,7 +65,7 @@ There are other Linux distributions available in the Microsoft Store. Make sure Another way to install Linux distributions is using the WSL command. 
-Open a Windows Powershell or Command Prompt and list the distributions available: +Open a Windows PowerShell or Command Prompt and list the distributions available: ```cmd wsl --list --online @@ -73,12 +74,15 @@ wsl --list --online The output will list the available distributions: ```output -NAME FRIENDLY NAME -Ubuntu Ubuntu -Debian Debian GNU/Linux -Ubuntu-18.04 Ubuntu 18.04 LTS -Ubuntu-20.04 Ubuntu 20.04 LTS -Ubuntu-22.04 Ubuntu 22.04 LTS +NAME FRIENDLY NAME +Ubuntu Ubuntu +Debian Debian GNU/Linux +kali-linux Kali Linux Rolling +Ubuntu-18.04 Ubuntu 18.04 LTS +Ubuntu-20.04 Ubuntu 20.04 LTS +Ubuntu-22.04 Ubuntu 22.04 LTS +Ubuntu-24.04 Ubuntu 24.04 LTS +openSUSE-Tumbleweed openSUSE Tumbleweed ``` Install a distribution from this list: diff --git a/content/learning-paths/laptops-and-desktops/wsl2/ssh.md b/content/learning-paths/laptops-and-desktops/wsl2/ssh.md index 4621c441a4..e12a3e7983 100644 --- a/content/learning-paths/laptops-and-desktops/wsl2/ssh.md +++ b/content/learning-paths/laptops-and-desktops/wsl2/ssh.md @@ -22,7 +22,7 @@ Substitute your username and the filename to be copied. cp /mnt/c/Users//Downloads/ . ``` -If SSH is needed to access WSL from a different machine continue with the instructions below: +If SSH is needed to access WSL from a different machine continue with the instructions below. ## Install SSH server diff --git a/content/learning-paths/laptops-and-desktops/wsl2/systemd.md b/content/learning-paths/laptops-and-desktops/wsl2/systemd.md index 3f655ee02b..36251b091e 100644 --- a/content/learning-paths/laptops-and-desktops/wsl2/systemd.md +++ b/content/learning-paths/laptops-and-desktops/wsl2/systemd.md @@ -20,7 +20,7 @@ Add the following lines to `/etc/wsl.conf`: systemd=true ``` -Open a Windows Command Prompt or Powershell. Run the following commands to terminate and restart the distribution: +Open a Windows Command Prompt or PowerShell. 
Run the following commands to terminate and restart the distribution: ```cmd wsl --terminate Ubuntu-22.04 diff --git a/content/learning-paths/microcontrollers/_index.md b/content/learning-paths/microcontrollers/_index.md index f139834ead..caeda206bc 100644 --- a/content/learning-paths/microcontrollers/_index.md +++ b/content/learning-paths/microcontrollers/_index.md @@ -8,14 +8,14 @@ key_ip: maintopic: true operatingsystems_filter: - Baremetal: 25 -- Linux: 4 -- macOS: 2 +- Linux: 5 +- macOS: 3 - RTOS: 10 - Windows: 2 subjects_filter: - CI-CD: 3 - Libraries: 3 -- ML: 7 +- ML: 8 - Performance and Architecture: 11 - RTOS Fundamentals: 4 - Security: 2 @@ -37,6 +37,7 @@ tools_software_languages_filter: - FVP: 1 - GCC: 5 - GitHub: 2 +- Himax SDK: 1 - IP Explorer: 2 - Keil: 7 - Keil MDK: 3 @@ -44,7 +45,7 @@ tools_software_languages_filter: - MPS3: 1 - Node.js: 1 - Paddle: 1 -- Python: 1 +- Python: 2 - PyTorch: 1 - Raspberry Pi: 1 - RTX: 2 diff --git a/content/learning-paths/microcontrollers/mlek/_index.md b/content/learning-paths/microcontrollers/mlek/_index.md index 4a0460243f..14729b1732 100644 --- a/content/learning-paths/microcontrollers/mlek/_index.md +++ b/content/learning-paths/microcontrollers/mlek/_index.md @@ -1,13 +1,13 @@ --- title: Build and run the Arm Machine Learning Evaluation Kit examples -minutes_to_complete: 30 +minutes_to_complete: 30 who_is_this_for: This is an introductory topic for embedded software developers interested in learning about machine learning. 
-learning_objectives: +learning_objectives: - Build examples from Machine Learning Evaluation Kit (MLEK) - - Run the examples on Corstone-300 FVP or Virtual Hardware + - Run the examples on Corstone-320 FVP or Virtual Hardware prerequisites: - Some familiarity with embedded programming diff --git a/content/learning-paths/microcontrollers/mlek/build.md b/content/learning-paths/microcontrollers/mlek/build.md index 0872560bfd..98b0e289b8 100644 --- a/content/learning-paths/microcontrollers/mlek/build.md +++ b/content/learning-paths/microcontrollers/mlek/build.md @@ -7,69 +7,138 @@ weight: 2 # 1 is first, 2 is second, etc. # Do not modify these elements layout: "learningpathall" --- -The [Arm ML Evaluation Kit (MLEK)](https://review.mlplatform.org/plugins/gitiles/ml/ethos-u/ml-embedded-evaluation-kit) provides a number of ready-to-use ML applications. These allow you to investigate the embedded software stack and evaluate performance on the Cortex-M55 and Ethos-U55 processors. +The [Arm ML Evaluation Kit (MLEK)](https://review.mlplatform.org/plugins/gitiles/ml/ethos-u/ml-embedded-evaluation-kit) provides a number of ready-to-use ML applications. These allow you to investigate the embedded software stack and evaluate performance on the Cortex-M55 and Ethos-U85 processors. -You can use the MLEK source code to build sample applications and run them on the [Corstone reference systems](https://www.arm.com/products/silicon-ip-subsystems/), for example the [Corstone-300](https://developer.arm.com/Processors/Corstone-300) Fixed Virtual Platform (FVP). +You can use the MLEK source code to build sample applications and run them on the [Corstone reference systems](https://www.arm.com/products/silicon-ip-subsystems/), for example the [Corstone-320](https://developer.arm.com/Processors/Corstone-320) Fixed Virtual Platform (FVP). 
## Before you begin You can use your own Ubuntu Linux host machine or use [Arm Virtual Hardware (AVH)](https://www.arm.com/products/development-tools/simulation/virtual-hardware) for this Learning Path. -The Ubuntu version should be 20.04 or 22.04. The `x86_64` architecture must be used because the Corstone-300 FVP is not currently available for the Arm architecture. You will need a Linux desktop to run the FVP because it opens graphical windows for input and output from the software applications. +The Ubuntu version should be 20.04 or 22.04. These instructions have been tested on the `x86_64` architecture. You will need a way to interact visually with your machine to run the FVP, because it opens graphical windows for input and output from the software applications. If you want to use Arm Virtual Hardware the [Arm Virtual Hardware install guide](/install-guides/avh#corstone) provides setup instructions. -### Compilers +## Build the example application -The examples can be built with [Arm Compiler for Embedded](https://developer.arm.com/Tools%20and%20Software/Arm%20Compiler%20for%20Embedded) or [Arm GNU Toolchain](https://developer.arm.com/Tools%20and%20Software/GNU%20Toolchain). +### Install the dependencies -Use the install guides to install the compilers on your computer: -- [Arm Compiler for Embedded](/install-guides/armclang/) -- [Arm GNU Toolchain](/install-guides/gcc/arm-gnu) +Run the following commands to install some necessary tools. -Both compilers are pre-installed in Arm Virtual Hardware. +```bash +sudo apt update +sudo apt install unzip python3-venv python3-pip -y +``` +### Install the compiler -### Corstone-300 FVP {#fvp} +The examples can be built with [Arm Compiler for Embedded](https://developer.arm.com/Tools%20and%20Software/Arm%20Compiler%20for%20Embedded) or [Arm GNU Toolchain](https://developer.arm.com/Tools%20and%20Software/GNU%20Toolchain). 
-To install the Corstone-300 FVP on your computer refer to the [install guide for Arm Ecosystem FVPs](/install-guides/fm_fvp). -The Corstone-300 FVP is pre-installed in Arm Virtual Hardware. +```bash +wget https://developer.arm.com/-/media/Files/downloads/gnu/13.3.rel1/binrel/arm-gnu-toolchain-13.3.rel1-x86_64-arm-none-eabi.tar.xz -## Clone the repository +tar -xf arm-gnu-toolchain-13.3.rel1-x86_64-arm-none-eabi.tar.xz -1. Install `virtualenv` to create Python virtual environments: +export PATH=~/arm-gnu-toolchain-13.3.rel1-x86_64-arm-none-eabi/bin/:$PATH -```console -sudo apt update -sudo apt install python3-venv -y ``` -2. Clone the ML Evaluation Kit repository, and navigate into the new directory: +{{% notice Tip %}} +You can review the installation guides for further details. -```console + +- [Arm Compiler for Embedded](/install-guides/armclang/) +- [Arm GNU Toolchain](/install-guides/gcc/arm-gnu) + +{{% /notice %}} + + +Both compilers are pre-installed in Arm Virtual Hardware. + +### Clone the repository + +Clone the ML Evaluation Kit repository, and navigate into the new directory: + +```bash git clone "https://review.mlplatform.org/ml/ethos-u/ml-embedded-evaluation-kit" cd ml-embedded-evaluation-kit git submodule update --init ``` -## Build the example applications +### Run the build script + +The default build is Ethos-U55 and Corstone-300. The default build for Ethos-U85 is Corstone-320. Use the `npu-config-name` flag to set Ethos-U85. -The default compiler is `gcc`, but `armclang` can also be used. +The default compiler is `gcc`, but `armclang` can also be used. The number after `ethos-u85-` is the number of MACs, 128-2048 (2^n). You can select either compiler to build applications. You can also try them both and compare the results. 
- Build with Arm GNU Toolchain (`gcc`) ``` -./build_default.py +./build_default.py --npu-config-name ethos-u85-256 --toolchain gnu ``` - Build with Arm Compiler for Embedded (`armclang`) ```console -./build_default.py --toolchain arm +./build_default.py --npu-config-name ethos-u85-256 --toolchain arm ``` The build will take a few minutes. -When the build is complete, you will find the example images (`.axf` files) in the `cmake-build-*/bin` directory. The `cmake-build` directory names are specific to the compiler used and Ethos-U55 configuration. +When the build is complete, you will find the examples (`.axf` files) in the `cmake-build-*/bin` directory. The `cmake-build` directory names are specific to the compiler used and Ethos-U85 configuration. Verify that the files have been created by observing the output of the `ls` command + +```bash +ls cmake-build-mps4-sse-320-ethos-u85-256-gnu/bin/ +``` + +The next step is to install the FVP and run it with these example audio clips. + + +## Corstone-320 FVP {#fvp} + +This section describes installation of the Corstone-320 to run on your local machine. If you are using Arm Virtual Hardware, that comes with the Corstone-300 FVP pre-installed, and you can move on to the next section. You can review Arm's full FVP offer and general installation steps in the [Fast Model and Fixed Virtual Platform](/install-guides/fm_fvp) install guides. + +{{% notice Note %}} +The rest of the steps for the Corstone-320 need to be run in a new terminal window. +{{% /notice %}} + +Open a **new terminal window** and download the Corstone-320 archive. + +```bash +cd $HOME +wget https://developer.arm.com/-/cdn-downloads/permalink/FVPs-Corstone-IoT/Corstone-320/FVP_Corstone_SSE-320_11.27_25_Linux64.tgz +``` + +Unpack it with `tar`, run the setup script and export the binary paths to the `PATH` environment variable. 
+ +```bash +tar -xf FVP_Corstone_SSE-320_11.27_25_Linux64.tgz +./FVP_Corstone_SSE-320.sh --i-agree-to-the-contained-eula --no-interactive -q +export PATH=$HOME/FVP_Corstone_SSE-320/models/Linux64_GCC-9.3:$PATH +``` + +The FVP requires an additional dependency, `libpython3.9.so.1.0`, which can be installed using a script. Note that this will modify the Python installation for the current terminal window, so make sure to open a new one for the next step. + +```bash +source $HOME/FVP_Corstone_SSE-320/scripts/runtime.sh +``` + +Verify that the FVP was successfully installed by comparing your output with the output of the command below. + +```bash +FVP_Corstone_SSE-320 +``` + +```output +telnetterminal0: Listening for serial connection on port 5000 +telnetterminal1: Listening for serial connection on port 5001 +telnetterminal2: Listening for serial connection on port 5002 +telnetterminal5: Listening for serial connection on port 5003 + +``` + + +Now you are ready to test the application with the FVP. + diff --git a/content/learning-paths/microcontrollers/mlek/run.md b/content/learning-paths/microcontrollers/mlek/run.md index 69c9a83399..714002ad57 100644 --- a/content/learning-paths/microcontrollers/mlek/run.md +++ b/content/learning-paths/microcontrollers/mlek/run.md @@ -1,6 +1,6 @@ --- # User change -title: "Run the examples on Corstone-300 FVP" +title: "Run the examples on the FVP" weight: 3 # 1 is first, 2 is second, etc. @@ -9,17 +9,37 @@ layout: "learningpathall" --- ## Run an example -To run an example on the Corstone-300 FVP target, launch the FVP executable with `-a` to specify the software application. +Now you are ready to combine the FVP installation and the example application. Navigate to the evaluation kit repository. -To run the key word spotting example `ethos-u-kws.axf` compiled with `gcc` use: +```bash +cd ml-embedded-evaluation-kit/ +``` + +To run an example on the Corstone-320 FVP target, launch the FVP executable with `-a` to specify the software application. 
+ +To run the key word spotting example `ethos-u-kws.axf` compiled with `gcc` use one of the two options below. -- on your computer with the FVP installed +## Option 1: On your computer with the FVP installed + +Run the FVP. ```console -FVP_Corstone_SSE-300_Ethos-U55 -a cmake-build-mps3-sse-300-ethos-u55-128-gnu/bin/ethos-u-kws.axf +FVP_Corstone_SSE-320 \ + -C mps4_board.subsystem.ethosu.num_macs=256 \ + -C mps4_board.visualisation.disable-visualisation=1 \ + -C vis_hdlcd.disable_visualisation=1 \ + -a cmake-build-mps4-sse-320-ethos-u85-256-gnu/bin/ethos-u-kws.axf ``` -- on Arm Virtual Hardware +{{% notice Note %}} +The number of NPU MACs specified in the build MUST match the number specified in the FVP. Otherwise, an error similar to the one below is emitted. + +``` +E: NPU config mismatch. npu.macs_per_cc=E: NPU config mismatch.. +``` +{{% /notice %}} + +## Option 2: On Arm Virtual Hardware ```console VHT_Corstone_SSE-300_Ethos-U55 -a cmake-build-mps3-sse-300-ethos-u55-128-gnu/bin/ethos-u-kws.axf @@ -27,17 +47,18 @@ VHT_Corstone_SSE-300_Ethos-U55 -a cmake-build-mps3-sse-300-ethos-u55-128-gnu/bin When the example is running, a telnet instance will open allowing you to interact with the example. {{% notice Note %}} -It may take some time to initialize the terminal, please be patient. +It may take some time to initialize the terminal, please be patient. If you see warnings regarding loading the image, these can likely be ignored. {{% /notice %}} +## Interact with the application Use the menu to control the application. For the key word spotting application enter 1 to classify the next audio clip. ![terminal #center](term.png) -The results of the classification will appear in the visualization window of the FVP. +The results of the classification will appear in the visualization window of the FVP. The display shows a 98% probability that the word in the audio clip was "down". 
@@ -56,23 +77,23 @@ You can specify additional parameters to configure certain aspects of the simula List the available parameters by running the FVP executable with the `--list-params` option, for example: ```console -FVP_Corstone_SSE-300_Ethos-U55 --list-params > parameters.txt +FVP_Corstone_SSE-320 --list-params > parameters.txt ``` {{% notice Note %}} If you are running with Arm Virtual Hardware substitute `VHT_Corstone_SSE-300_Ethos-U55` as the executable name. {{% /notice %}} -Open the file `parameters.txt` to see all of the possible parameters and the default values. +Open the file `parameters.txt` to see all of the possible parameters and the default values. ### Set parameters -Individual parameters can be set with the `-C` command option. +Individual parameters can be set with the `-C` command option. For example, to put the Ethos-U component into fast execution mode: ```console -FVP_Corstone_SSE-300_Ethos-U55 -a cmake-build-mps3-sse-300-ethos-u55-128-gnu/bin/ethos-u-kws.axf -C ethosu.extra_args="--fast" +FVP_Corstone_SSE-320 -a cmake-build-mps4-sse-320-ethos-u85-256-gnu/bin/ethos-u-kws.axf -C mps4_board.subsystem.ethosu.extra_args="--fast" ``` {{% notice Note %}} Do not use fast execution mode whilst benchmarking performance. 
@@ -83,14 +104,16 @@ To set multiple parameters it may be easier to list them in a text file (without For example, use a text editor to create a file named `options.txt` with the contents: ```console -mps3_board.visualisation.disable-visualisation=1 -ethosu.extra_args="--fast" +mps4_board.visualisation.disable-visualisation=1 +mps4_board.subsystem.ethosu.extra_args="--fast" ``` Run the FVP with the `-f` option and the `options.txt` file: ```console -FVP_Corstone_SSE-300_Ethos-U55 -a cmake-build-mps3-sse-300-ethos-u55-128-gnu/bin/ethos-u-kws.axf -f options.txt +FVP_Corstone_SSE-320 -a cmake-build-mps4-sse-320-ethos-u85-256-gnu/bin/ethos-u-kws.axf -f options.txt ``` Full instructions are provided in the evaluation kit [documentation](https://review.mlplatform.org/plugins/gitiles/ml/ethos-u/ml-embedded-evaluation-kit/+/HEAD/docs/quick_start.md). + +You have now run an example application on an Arm Fixed Virtual Platform. \ No newline at end of file diff --git a/content/learning-paths/microcontrollers/yolo-on-himax/_index.md b/content/learning-paths/microcontrollers/yolo-on-himax/_index.md new file mode 100644 index 0000000000..8882ef4e61 --- /dev/null +++ b/content/learning-paths/microcontrollers/yolo-on-himax/_index.md @@ -0,0 +1,46 @@ +--- +title: Run a Computer Vision Model on a Himax Microcontroller +draft: true +cascade: + draft: true +minutes_to_complete: 90 + +who_is_this_for: This is an introduction topic explaining how to run a computer vision application on an embedded device from Himax. The example uses an off-the-shelf Himax WiseEye2 module which is based on Arm Cortex-M55 and Ethos-U55. + +learning_objectives: + - Run a you-only-look-once (YOLO) object detection model on the Himax device. + - Build the Himax Software Development Kit (SDK) and generate the firmware image file. + - Update the firmware on the Himax WiseEye2. 
+ +prerequisites: + - A [Seeed Grove Vision AI Module V2](https://www.seeedstudio.com/Grove-Vision-AI-Module-V2-p-5851.html) development board. + - An [OV5647-62 Camera Module](https://www.seeedstudio.com/OV5647-69-1-FOV-Camera-module-for-Raspberry-Pi-3B-4B-p-5484.html) and included FPC cable. + - A USB-C cable. + - An x86 Linux machine or a Mac running macOS with Apple Silicon. + +author_primary: Chaodong Gong, Alex Su, Kieran Hejmadi + +### Tags +skilllevels: Introductory +subjects: ML +armips: + - Cortex-M55 + - Ethos-U55 +tools_software_languages: + - Himax SDK + - Python +operatingsystems: + - Linux + - macOS + +draft: true +cascade: + draft: true + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. 
+--- diff --git a/content/learning-paths/microcontrollers/yolo-on-himax/_next-steps.md b/content/learning-paths/microcontrollers/yolo-on-himax/_next-steps.md new file mode 100644 index 0000000000..b3b4ba6a6e --- /dev/null +++ b/content/learning-paths/microcontrollers/yolo-on-himax/_next-steps.md @@ -0,0 +1,22 @@ +--- +next_step_guidance: Navigate Machine Learning Development with Ethos-U processors + +recommended_path: /learning-paths/microcontrollers/nav-mlek/ + +further_reading: + - resource: + title: Grove Vision AI Module V2 User Documentation + link: https://wiki.seeedstudio.com/grove_vision_ai_v2/ + type: documentation + - resource: + title: WiseEye2 HX6538 processor blog (SoC powering Grove Vision AI Module V2) + link: https://www.himax.com.tw/products/wiseeye-ai-sensing/wiseeye2-ai-processor/ + type: blog + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +weight: 21 # set to always be larger than the content in this path, and one more than 'review' +title: "Next Steps" # Always the same +layout: "learningpathall" # All files under learning paths have this same wrapper +--- diff --git a/content/learning-paths/microcontrollers/yolo-on-himax/_review.md b/content/learning-paths/microcontrollers/yolo-on-himax/_review.md new file mode 100644 index 0000000000..8049916a7e --- /dev/null +++ b/content/learning-paths/microcontrollers/yolo-on-himax/_review.md @@ -0,0 +1,31 @@ +--- +review: + - questions: + question: > + Can the Grove Vision AI V2 Module run a Yolov8 model in real time? + answers: + - True + - False + correct_answer: 1 + explanation: > + The Grove Vision AI V2 Module can run object detection in real time using the Cortex-M55 and Ethos-U55. + + - questions: + question: > + Which IP does the Grove Vision AI V2 Module integrate? 
+ answers: + - Cortex-M55 + - Ethos-U55 + - Both Cortex-M55 and Ethos-U55 + correct_answer: 3 + explanation: > + The Himax WiseEye2 module is based on the Arm Cortex-M55 and Ethos-U55. + + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +title: "Review" # Always the same title +weight: 20 # Set to always be larger than the content in this path +layout: "learningpathall" # All files under learning paths have this same wrapper +--- diff --git a/content/learning-paths/microcontrollers/yolo-on-himax/build-firmware.md b/content/learning-paths/microcontrollers/yolo-on-himax/build-firmware.md new file mode 100644 index 0000000000..b1db98e5f5 --- /dev/null +++ b/content/learning-paths/microcontrollers/yolo-on-himax/build-firmware.md @@ -0,0 +1,69 @@ +--- +title: Build the firmware +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +This section explains the process of generating a firmware image file. + +## Clone the Himax GitHub project + +Himax maintains a repository containing a few examples for the Seeed Grove Vision AI V2 board. + +It contains third-party software and scripts to build and flash the image with the object detection application. By recursively cloning the Himax examples repo, git will include the necessary sub-repositories that have been configured for the project. + +Clone the repository: + +```bash +git clone --recursive https://github.com/HimaxWiseEyePlus/Seeed_Grove_Vision_AI_Module_V2.git +cd Seeed_Grove_Vision_AI_Module_V2 +``` + +## Compile the firmware + +Use Make to compile the source code for object detection. + +This takes up to 10 minutes depending on the number of CPU cores available on your host machine. 
+ +```bash +cd EPII_CM55M_APP_S +make +``` + +When the build is complete, you have an `.elf` file at `obj_epii_evb_icv30_bdv10/gnu_epii_evb_WLCSP65/EPII_CM55M_gnu_epii_evb_WLCSP65_s.elf` + +## Generate the firmware image + +The examples repository contains scripts to generate the image file. + +Copy the `.elf` file to the `input_case1_secboot` directory. + +```bash +cd ../we2_image_gen_local/ +cp ../EPII_CM55M_APP_S/obj_epii_evb_icv30_bdv10/gnu_epii_evb_WLCSP65/EPII_CM55M_gnu_epii_evb_WLCSP65_s.elf input_case1_secboot/ +``` + +Run the script your OS as shown below. This will create a file named `output.img` in the `output_case1_sec_wlcsp` directory. + + +{{< tabpane code=true >}} + {{< tab header="Linux" language="shell">}} +./we2_local_image_gen project_case1_blp_wlcsp.json + {{< /tab >}} + {{< tab header="macOS" language="shell">}} +./we2_local_image_gen_macOS_arm64 project_case1_blp_wlcsp.json + {{< /tab >}} +{{< /tabpane >}} + +The script output ends with the following output: + +```output +Output image: output_case1_sec_wlcsp/output.img +Output image: output_case1_sec_wlcsp/output.img + +IMAGE GEN DONE +``` + +You are ready to flash the image onto the Himax development board. \ No newline at end of file diff --git a/content/learning-paths/microcontrollers/yolo-on-himax/dev-env.md b/content/learning-paths/microcontrollers/yolo-on-himax/dev-env.md new file mode 100644 index 0000000000..73cb55e14f --- /dev/null +++ b/content/learning-paths/microcontrollers/yolo-on-himax/dev-env.md @@ -0,0 +1,138 @@ +--- +title: Set up the environment +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +This Learning Path has been validated on Ubuntu 22.04 LTS and macOS. + +{{% notice %}} +If you are running Windows, you can use Ubuntu through Windows subsystem for Linux 2 (WSL2). Check out [Get started with Windows Subsystem for Linux (WSL) on Arm](https://learn.arm.com/learning-paths/laptops-and-desktops/wsl2/setup/) to learn more. 
+{{% /notice %}} + +## Install software tools + +Follow the instructions below to install the required development tools. + +### Install Python and Pip + +You will use Python to build the firmware image and pip to install additional dependencies. + +Verify Python is installed by running: + +```bash +python3 --version +``` + +You should see an output like the following: + +```output +Python 3.12.7 +``` + +On Ubuntu, you may need to install `pip` and `venv` with the following commands: + +```bash +sudo apt update +sudo apt install python3-pip python3-venv -y +``` + +Verify Pip is installed correctly: + +```bash +pip3 --version +``` + +The output is similar to: + +```output +pip 24.0 from /usr/lib/python3/dist-packages/pip (python 3.12) +``` + +It is good practice to manage Python packages through a virtual environment. + +Create one with the steps below. + +```bash +python3 -m venv $HOME/yolo-venv +source $HOME/yolo-venv/bin/activate +``` + +Your terminal displays `(yolo-venv)` in the prompt indicating the virtual environment is active. + +You also need the Git distributed version control system installed. + +Run the command below to verify that Git is installed on your system: + +```bash +git --version +``` + +If it is installed, you will see output similar to: + +```output +git version 2.39.3 +``` + +### Install Make + +Install the Make build tool, which is used to build the firmware in the next section. + +{{< tabpane code=true >}} + {{< tab header="Linux" language="shell">}} +sudo apt update +sudo apt install make -y + {{< /tab >}} + {{< tab header="macOS" language="shell">}} +brew install make + {{< /tab >}} +{{< /tabpane >}} + +After Make is installed, run it to print the version. + +```bash +make --version +``` + +The output is similar to: + +```output +GNU Make 4.3 +Built for x86_64-pc-linux-gnu +Copyright (C) 1988-2020 Free Software Foundation, Inc. 
+License GPLv3+: GNU GPL version 3 or later +This is free software: you are free to change and redistribute it. +There is NO WARRANTY, to the extent permitted by law. +``` + +{{% notice Note %}} +If you are using macOS, you need to verify that your installation is for GNU Make - not the BSD version. You should see GNU in the version output. +{{% /notice %}} + +### Install the Arm GNU toolchain + +The toolchain is used to compile code on the host for the embedded device architecture. + +{{< tabpane code=true >}} + {{< tab header="x86 Linux" language="shell">}} +cd $HOME +wget https://developer.arm.com/-/media/Files/downloads/gnu/13.2.rel1/binrel/arm-gnu-toolchain-13.2.rel1-x86_64-arm-none-eabi.tar.xz +tar -xvf arm-gnu-toolchain-13.2.rel1-x86_64-arm-none-eabi.tar.xz +export PATH="$HOME/arm-gnu-toolchain-13.2.Rel1-x86_64-arm-none-eabi/bin/:$PATH" + {{< /tab >}} + {{< tab header="macOS" language="shell">}} +cd $HOME +wget https://developer.arm.com/-/media/Files/downloads/gnu/13.3.rel1/binrel/arm-gnu-toolchain-13.3.rel1-darwin-arm64-arm-none-eabi.tar.xz +tar -xvf arm-gnu-toolchain-13.3.rel1-darwin-arm64-arm-none-eabi.tar.xz +export PATH="$HOME/arm-gnu-toolchain-13.3.rel1-darwin-arm64-arm-none-eabi/bin/:$PATH" + {{< /tab >}} +{{< /tabpane >}} + +{{% notice %}} +You can add the `export` command to your `.bashrc` or `.zshrc` file to set the search path for each new shell. +{{% /notice %}} + + +Now that your development environment is ready, move to the next section where you will generate the firmware image. 
\ No newline at end of file diff --git a/content/learning-paths/microcontrollers/yolo-on-himax/face_detection.jpg b/content/learning-paths/microcontrollers/yolo-on-himax/face_detection.jpg new file mode 100644 index 0000000000..ed70940f33 Binary files /dev/null and b/content/learning-paths/microcontrollers/yolo-on-himax/face_detection.jpg differ diff --git a/content/learning-paths/microcontrollers/yolo-on-himax/flash-and-run.md b/content/learning-paths/microcontrollers/yolo-on-himax/flash-and-run.md new file mode 100644 index 0000000000..43892699b7 --- /dev/null +++ b/content/learning-paths/microcontrollers/yolo-on-himax/flash-and-run.md @@ -0,0 +1,100 @@ +--- +title: Flash firmware onto the microcontroller +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +Now that you have generated an image file on the local host machine, you are ready to flash the microcontroller with the firmware. + +## Install xmodem + +`Xmodem` is a basic file transfer protocol which is easily installed using the Himax examples repository. + +Run the following command to install the dependency: + +```bash +cd $HOME/Seeed_Grove_Vision_AI_Module_V2 +pip install -r xmodem/requirements.txt +``` + +## Connect the module + +It's time to get the board set up. + +Insert the flexible printed circuit (FPC) into the Grove Vision AI V2 module. Lift the dark grey latch on the connector as per the image below. + +![unlatched](./unlatched.jpg) + +Slide the FPC connector in with the metal pins facing down and close the dark grey latch to fasten the connector. + +![latched](./latched.jpg) + +Now you can connect the Grove Vision AI V2 Module to your computer via the USB-C cable. + +{{% notice Note %}} +The development board may have two USB-C connectors. If you are running into issues connecting the board in the next step, make sure you are using the right one. 
+{{% /notice %}} + +## Find the COM port + +You'll need to provide the communication port (COM) which the board is connected to in order to flash the image. There are commands to list all COMs available on your machine. Once your board is connected through USB, it'll show up in this list. The COM identifier will start with **tty**, which may help you determine which one it is. You can run the command before and after plugging in the board if you are unsure. + + +{{< tabpane code=true >}} + {{< tab header="Linux" language="shell">}} +sudo grep -i 'tty' /var/log/dmesg + {{< /tab >}} + {{< tab header="MacOS" language="shell">}} +ls /dev/tty.* + {{< /tab >}} +{{< /tabpane >}} + + +{{% notice Note %}} +If the port seems unavailable, try changing the permissions temporarily using the `chmod` command. Be sure to reset them afterwards, as this may pose a computer security vulnerability. + +```bash +chmod 0777 <your COM port> +``` +{{% /notice %}} + +The full path to the port is needed in the next step, so be sure to save it. + +## Flash the firmware onto the module + +Run the Python script below to flash the firmware: + +```bash +python xmodem/xmodem_send.py --port=<your COM port> \ +--baudrate=921600 --protocol=xmodem \ +--file=we2_image_gen_local/output_case1_sec_wlcsp/output.img +``` + +{{% notice Note %}} +When you run other example models demonstrated in the later section [Run additional models in the web toolkit](/learning-paths/microcontrollers/yolo-on-himax/web-toolkit/), you need to adapt this command with the `--model` argument. +{{% /notice %}} + +After the firmware image flashing is completed, the message `Do you want to end file transmission and reboot system? (y)` is displayed. Press the reset button shown in the image below. + +![reset button](./reset_button.jpg) + +## Run the model + +After the reset button is pressed, the board will start inference with the object detection automatically. Observe the output in the terminal to verify that the image is built correctly. 
If a person is in front of the camera, you should see the `person_score` value go over `100`. + +```output +b'SENSORDPLIB_STATUS_XDMA_FRAME_READY 240' +b'write frame result 0, data size=15284,addr=0x340e04e0' +b'invoke pass' +b'person_score:113' +b'EVT event = 10' +b'SENSORDPLIB_STATUS_XDMA_FRAME_READY 241' +b'write frame result 0, data size=15296,addr=0x340e04e0' +b'invoke pass' +b'person_score:112' +b'EVT event = 10' +``` + +This means the image works correctly on the device, and the end-to-end flow is complete. \ No newline at end of file diff --git a/content/learning-paths/microcontrollers/yolo-on-himax/himax_web_ui.jpg b/content/learning-paths/microcontrollers/yolo-on-himax/himax_web_ui.jpg new file mode 100644 index 0000000000..080dd8ee96 Binary files /dev/null and b/content/learning-paths/microcontrollers/yolo-on-himax/himax_web_ui.jpg differ diff --git a/content/learning-paths/microcontrollers/yolo-on-himax/latched.jpg b/content/learning-paths/microcontrollers/yolo-on-himax/latched.jpg new file mode 100644 index 0000000000..25d9b856d0 Binary files /dev/null and b/content/learning-paths/microcontrollers/yolo-on-himax/latched.jpg differ diff --git a/content/learning-paths/microcontrollers/yolo-on-himax/object_detection.jpg b/content/learning-paths/microcontrollers/yolo-on-himax/object_detection.jpg new file mode 100644 index 0000000000..783ee9f521 Binary files /dev/null and b/content/learning-paths/microcontrollers/yolo-on-himax/object_detection.jpg differ diff --git a/content/learning-paths/microcontrollers/yolo-on-himax/pose_estimation.jpg b/content/learning-paths/microcontrollers/yolo-on-himax/pose_estimation.jpg new file mode 100644 index 0000000000..09e24fcbcf Binary files /dev/null and b/content/learning-paths/microcontrollers/yolo-on-himax/pose_estimation.jpg differ diff --git a/content/learning-paths/microcontrollers/yolo-on-himax/reset_button.jpg b/content/learning-paths/microcontrollers/yolo-on-himax/reset_button.jpg new file mode 100644 index 
0000000000..d6f7d8b82d Binary files /dev/null and b/content/learning-paths/microcontrollers/yolo-on-himax/reset_button.jpg differ diff --git a/content/learning-paths/microcontrollers/yolo-on-himax/unlatched.jpg b/content/learning-paths/microcontrollers/yolo-on-himax/unlatched.jpg new file mode 100644 index 0000000000..0a3b5b0142 Binary files /dev/null and b/content/learning-paths/microcontrollers/yolo-on-himax/unlatched.jpg differ diff --git a/content/learning-paths/microcontrollers/yolo-on-himax/web-toolkit.md b/content/learning-paths/microcontrollers/yolo-on-himax/web-toolkit.md new file mode 100644 index 0000000000..e03500d02a --- /dev/null +++ b/content/learning-paths/microcontrollers/yolo-on-himax/web-toolkit.md @@ -0,0 +1,106 @@ +--- +title: Run additional models in the web toolkit +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +In this section, you will view a live camera feed with a computer vision application running. + +## Modify the makefile + +Change to the directory where the `makefile` is located. + +```bash +cd $HOME/Seeed_Grove_Vision_AI_Module_V2/EPII_CM55M_APP_S/ +``` + +The table shows the different options available to use with the web toolkit. + +Use a text editor to open `makefile` and modify the `APP_TYPE` field to one of the values in the table. + +You will use the model argument value with the `--model` option to the `xmodem` command. + +|APP_TYPE |Description | Model argument | +|--- |--- |--- +|tflm_yolov8_od |Object detection | model_zoo\tflm_yolov8_od\yolov8n_od_192_delete_transpose_0xB7B000.tflite 0xB7B000 0x00000 | +|tflm_fd_fm |Face detection | model_zoo\tflm_fd_fm\0_fd_0x200000.tflite 0x200000 0x00000 model_zoo\tflm_fd_fm\1_fm_0x280000.tflite 0x280000 0x00000 model_zoo\tflm_fd_fm\2_il_0x32A000.tflite 0x32A000 0x00000 | + +{{% notice Note %}} +For `tflm_fd_fm`, you need to pass all three models as separate `--model` arguments. 
+{{% /notice %}} + + +## Regenerate the firmware image + +Use Make to regenerate the `.elf` file. + +```bash +make clean +make +``` + +Use the same commands from the previous section to regenerate the firmware image: + +```bash +cd ../we2_image_gen_local/ +cp ../EPII_CM55M_APP_S/obj_epii_evb_icv30_bdv10/gnu_epii_evb_WLCSP65/EPII_CM55M_gnu_epii_evb_WLCSP65_s.elf input_case1_secboot/ +``` + +Run the script corresponding to the OS of your host machine. + +{{< tabpane code=true >}} + {{< tab header="Linux" language="shell">}} +./we2_local_image_gen project_case1_blp_wlcsp.json + {{< /tab >}} + {{< tab header="MacOS" language="shell">}} +./we2_local_image_gen_macOS_arm64 project_case1_blp_wlcsp.json + {{< /tab >}} +{{< /tabpane >}} + + +Finally, use `xmodem` to flash the image. + +```bash +python xmodem\xmodem_send.py --port= \ +--baudrate=921600 --protocol=xmodem \ +--file=we2_image_gen_local\output_case1_sec_wlcsp\output.img \ +--model= +``` + +Press the reset button when prompted before moving on. + +## Download the Himax AI web toolkit + +The Himax AI web toolkit enables a browser-based graphical user interface (GUI) for the live camera feed. + +```bash +wget https://github.com/HimaxWiseEyePlus/Seeed_Grove_Vision_AI_Module_V2/releases/download/v1.1/Himax_AI_web_toolkit.zip +unzip Himax_AI_web_toolkit.zip +``` + +{{% notice Note %}} +If needed, install the unzip command: + +```bash +sudo apt install unzip -y +``` +{{% /notice %}} + +Open the file `index.html` in a browser. You can double click the file in a file browser or use the `File -> Open File...` command from the browser menu. + +## Connect to the Grove Vision AI + +Select `Grove Vision AI(V2)` in the top right-hand corner and press the `Connect` button. Follow the instructions to set up the connection. You should see a video feed with a bounding box showing identified objects, poses, or faces.
+ +![Himax web UI](./himax_web_ui.jpg) + +The images below are captured images from the models run in the toolkit. + +### Object detection +![object_detection](./object_detection.jpg) + +### Face detection +![face_detection](./face_detection.jpg) + diff --git a/content/learning-paths/servers-and-cloud-computing/intro/find-hardware.md b/content/learning-paths/servers-and-cloud-computing/intro/find-hardware.md index 2c7da50571..1a2e5e80ff 100644 --- a/content/learning-paths/servers-and-cloud-computing/intro/find-hardware.md +++ b/content/learning-paths/servers-and-cloud-computing/intro/find-hardware.md @@ -24,7 +24,6 @@ Cloud providers offer Arm instances based on Neoverse processors. For example: Free tier offers are currently available. - [Amazon EC2 t4g.small instances powered by AWS Graviton2 processors are free until Dec 31st 2024](https://aws.amazon.com/ec2/instance-types/t4/) - [Oracle free tier includes up to 4 instances of ARM Ampere A1 Compute which are always free](https://www.oracle.com/cloud/free/) -- [Until March 31, 2024 Tau T2A VMs in Google Cloud are available for a free trial](https://cloud.google.com/compute/docs/instances/create-arm-vm-instance#t2afreetrial) [Get started with Arm-based cloud service platforms](/learning-paths/servers-and-cloud-computing/csp/) explains how to create an account and start an Arm virtual machine using the cloud service providers listed above.
diff --git a/content/learning-paths/servers-and-cloud-computing/lambda_functions/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/lambda_functions/_next-steps.md index 7460a65db6..00349aa36e 100644 --- a/content/learning-paths/servers-and-cloud-computing/lambda_functions/_next-steps.md +++ b/content/learning-paths/servers-and-cloud-computing/lambda_functions/_next-steps.md @@ -26,8 +26,8 @@ further_reading: link: https://aws.amazon.com/lambda/getting-started/ type: documentation - resource: - title: A Performance Perspective for Graviton Based Lambda Functions - link: https://blog.thundra.io/a-performance-perspective-for-graviton-based-lambda-functions + title: AWS Lambda performance with Java 21 + link: https://community.aws/content/2juXXgrDDaUdmi902LHwilBhvNU/aws-lambda-performance-with-java-21-x86-vs-arm64-part-1-initial-measurements-and-comparisons?lang=en type: blog diff --git a/content/learning-paths/smartphones-and-mobile/_index.md b/content/learning-paths/smartphones-and-mobile/_index.md index 0461503058..3184b831e0 100644 --- a/content/learning-paths/smartphones-and-mobile/_index.md +++ b/content/learning-paths/smartphones-and-mobile/_index.md @@ -10,14 +10,14 @@ key_ip: - Mali maintopic: true operatingsystems_filter: -- Android: 23 -- Linux: 21 +- Android: 24 +- Linux: 22 - macOS: 10 - Windows: 10 subjects_filter: - Gaming: 6 -- Graphics: 3 -- ML: 8 +- Graphics: 4 +- ML: 9 - Performance and Architecture: 24 subtitle: Optimize Android apps and build faster games using cutting-edge Arm tech title: Smartphones and Mobile @@ -27,7 +27,7 @@ tools_software_languages_filter: - Android: 3 - Android NDK: 1 - Android SDK: 1 -- Android Studio: 7 +- Android Studio: 8 - Arm Development Studio: 1 - Arm Mobile Studio: 1 - Arm Performance Studio: 2 @@ -61,6 +61,7 @@ tools_software_languages_filter: - Rust: 2 - SDDiskTool: 1 - SVE2: 1 +- tflite: 1 - Total Compute: 1 - Trusted Firmware: 1 - Unity: 6 diff --git 
a/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/1-webgpu-fundamentals.md b/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/1-webgpu-fundamentals.md index 0ccb2f79c4..4ffcff0a43 100644 --- a/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/1-webgpu-fundamentals.md +++ b/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/1-webgpu-fundamentals.md @@ -1,5 +1,5 @@ --- -title: Introduction to WebGPU +title: Overview of WebGPU weight: 2 ### FIXED, DO NOT MODIFY @@ -8,61 +8,69 @@ layout: learningpathall ## What is WebGPU? -WebGPU is the successor to WebGL, a well adopted modern API standard for interfacing with GPUs. WebGPU provides better compatibility with modern GPUs, support for general-purpose GPU computations, faster operations, and access to more advanced GPU features. It is designed to provide a _unified access_ to GPUs, agnostic to GPU vendors and operating systems. +WebGPU is the successor to WebGL, a well-adopted modern API standard for interfacing with GPUs. -WebGPU is a Render Hardware Interface built on top of various backend APIs like Vulkan, DirectX, and Metal (depending on the operating system). +WebGPU offers the following benefits: + +* Improved compatibility with modern GPUs. +* Support for general-purpose GPU computations. +* Fast operations. +* Access to advanced GPU features. +* Unified access to GPUs, which is both GPU vendor-agnostic and OS-agnostic. + +WebGPU is a render hardware interface built on top of various backend APIs such as Vulkan, DirectX, and Metal, though this is OS-dependent. WebGPU is available through web browsers using the webgpu.h header file. 
-The high level view of WebGPU is shown below: +The high-level view of WebGPU is shown below in Figure 1: -![WebGPU high level view #center](images/webgpu_highlevel.png "WebGPU High Level View") +![Figure 1: WebGPU high level view #center](images/webgpu_highlevel.png "Figure 1: WebGPU High-Level View") ## What are the benefits of WebGPU? -WebGPU takes into account learnings from older standards like WebGL and OpenGL and provides the following benefits: +WebGPU uses lessons learned from earlier standards such as WebGL and OpenGL to develop an improved offering, and provides the following benefits: -* A reasonable level of abstraction -* Good performance -* Cross-platform -* Backed by W3C standards group -* Future-proof design +* A reasonable level of abstraction. +* Improved performance. +* Cross-platform functionality. +* Backing from the W3C standards group. +* A future-proof design. -WebGPU is a standard and not a true API, so the implementation can be adopted and developed as an interface between native applications developed in any programming language. +It is important to note that WebGPU is a standard, and not a true API, so the implementation can be adopted and developed as an interface between native applications developed in any programming language. -The performance requirements for web pages is actually the same as for native application. +The performance requirements for web pages are the same as for native applications. {{% notice Note %}} When designing an API for the Web, the two key constraints are portability and privacy. -The limitations of the API due to privacy considerations can be disabled when using WebGPU as a native API. +Any limitations of the API caused by privacy constraints can be disabled when using WebGPU as a native API. {{% /notice %}} ## What are the benefits of using C++ for WebGPU? -The initial target for WebGPU was JavaScript. The initial `webgpu.h` header file is written in C. +The initial focus for WebGPU was JavaScript. 
The initial `webgpu.h` header file is written in C. -This Learning Path uses C++ rather than JavaScript or C because for the following reasons: +This Learning Path uses C++ rather than JavaScript or C, for the following reasons: -* C++ is still the primary language used for high performance graphics applications, such as video games, render engines, and modeling tools. -* The level of abstraction and control of C++ is well suited for interacting with graphics APIs in general. -* Graphics programming is a good way to learn more C++. +* C++ remains the primary language used for high performance graphics applications, such as video games, render engines, and modeling tools. +* The level of abstraction and control of C++ is well-suited for interacting with graphics APIs in general. +* Graphics programming is a good way to improve skills in C++. ## Dawn: the Google WebGPU implementation -Since WebGPU is a standard and not an implementation, there are different implementations. +Since WebGPU is a standard and not an implementation itself, there are various possible options for implementation. -[Dawn](https://github.com/google/dawn) is an open-source, cross-platform implementation of the WebGPU standard. +[Dawn](https://github.com/google/dawn) is an open-source, cross-platform implementation of the WebGPU standard. It implements the WebGPU functionality specified in `webgpu.h`. -It implements the WebGPU functionality specified in `webgpu.h`. Dawn is meant to be integrated as part of a larger system like Chromium or a native Android Application. +Dawn is designed to be integrated as part of a larger system such as Chromium or a native Android Application. Dawn provides several WebGPU building blocks: -* WebGPU C/C++ headers that applications and other building blocks use, including a header file and C++ wrapper. -* A "native" implementation of WebGPU using appropriate APIs: D3D12, Metal, Vulkan and OpenGL. 
-* A client-server implementation of WebGPU for applications that are in a sandbox without access to native drivers. +* WebGPU C/C++ headers that applications and other building blocks use, including a header file and a C++ wrapper. +* A "native" implementation of WebGPU using appropriate APIs, such as D3D12, Metal, Vulkan, and OpenGL. +* A client-server implementation of WebGPU, for applications that are in a sandbox without access to native drivers. * Tint, a compiler for the WebGPU Shader Language (WGSL), that converts shaders to and from WGSL. -Because it is written in C++, Dawn provides better error messages and logging. Because it is open-source, it is easier to inspect stack traces when applications crash. +As it is written in C++, Dawn provides enhanced error message reporting and logging. The fact that it is open source, also means that it is easier to inspect stack traces when applications crash. -Dawn is usually ahead of `wgpu-native`, another WebGPU implementation, when it comes to new functionalities and standards changes. +Dawn is usually ahead of `wgpu-native`, another WebGPU implementation, in terms of new functionality developments and standards changes. diff --git a/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/2-env-setup.md b/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/2-env-setup.md index 070d2456c2..43211b4c33 100644 --- a/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/2-env-setup.md +++ b/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/2-env-setup.md @@ -6,19 +6,13 @@ weight: 3 layout: learningpathall --- -In this Learning Path, you will learn how to: - -* Integrate Dawn (WebGPU) in an application. -* Use the APIs to render a simple 3D object. -* Profile and analyze the application. 
- The first step is to prepare a development environment with the required software: -* [Android Studio](https://developer.android.com/studio) -* [Arm Performance Studio](https://www.arm.com/products/development-tools/graphics/arm-performance-studio) -* Python 3.10 or later +* Android Studio. +* Arm Performance Studio. +* Python 3.10 or later. -You can use any computer and operating system which supports the above software. +You can use any computer and operating system that supports this software. ## Install Android Studio and the Android NDK @@ -26,28 +20,29 @@ You can use any computer and operating system which supports the above software. 2. Start Android Studio. -3. Open the `Settings` dialog. +3. Open the **Settings** dialog. -4. Navigate to `Languages & Frameworks`, then select `Android SDK`. +4. Navigate to **Languages & Frameworks**, then select **Android SDK**. -5. In the `SDK Platforms` tab, check `Android 14.0 ("UpsideDownCake")` +5. In the **SDK Platforms** tab, select the checkbox **Android 14.0** ("UpsideDownCake"). -![SDK Platforms #center](images/sdk-platforms.png "SDK Platforms") +![SDK Platforms #center](images/sdk-platforms.png "Figure 2: SDK Platforms") -6. In the `SDK Tools` tab check the following: - * Check `Android SDK Build-Tools 35` - * Check `NDK (Side by side)` - * Check `CMake` +6. In the **SDK Tools** tab, select the following: + + * **Android SDK Build-Tools 35**. + * **NDK (Side by side)**. + * **CMake**. -![SDK Tools #center](images/sdk-tools.png "SDK Tools") +![SDK Tools #center](images/sdk-tools.png "Figure 3: SDK Tools") -Click OK to install and update the selected components. +Click **OK** to install and update the selected components. ## Install Arm Performance Studio Profiling is an important step in the Android application development cycle. -The default profiler in the Android Studio is great to profile CPU related metrics, but does not provide GPU details. 
+The default profiler in Android Studio is great to profile CPU-related metrics, but it does not provide GPU details. Arm Performance Studio is a comprehensive profiling tool to profile both CPUs and GPUs. @@ -55,13 +50,13 @@ One of the components of Performance Studio is Streamline. Streamline captures d * Program Counter (PC) samples from running application threads. * Samples from the hardware Performance Monitoring Unit (PMU) counters in Arm CPUs, Arm Mali GPUs, and Arm Immortalis GPUs. -* Thread scheduling information from the Linux kernel. +* Thread-scheduling information from the Linux kernel. * Software-generated annotations and counters from running applications. -Install Arm Performance Studio using the [install guide](/install-guides/ams/). +Install Arm Performance Studio using the [Arm Performance Studio Install Guide](/install-guides/ams/). {{% notice Tip %}} -If you want to learn more about Arm Performance Studio and Streamline before continuing, refer to [Get started with Arm Performance Studio for mobile](https://learn.arm.com/learning-paths/smartphones-and-mobile/ams/ams/) +To learn more about Arm Performance Studio and Streamline, see [Get started with Arm Performance Studio for mobile](https://learn.arm.com/learning-paths/smartphones-and-mobile/ams/ams/). {{% /notice %}} -Android Studio and Arm Performance Studio are now installed and you are ready to create a WebGPU Android application. +Android Studio and Arm Performance Studio are now installed, and you are ready to create a WebGPU Android application. 
diff --git a/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/3-integrate-dawn.md b/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/3-integrate-dawn.md index 8b487e4bb7..544058f2ac 100755 --- a/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/3-integrate-dawn.md +++ b/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/3-integrate-dawn.md @@ -1,24 +1,22 @@ --- -title: Create an application which includes Dawn +title: Create an application with Dawn weight: 4 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## Set up Android Project +## Set up your Android Studio Project -You can start by creating a new Android Studio project. +Start by creating a new Android Studio project. -Open Android studio, click `New Project` and select `Game Activity (C++)` as shown below: +Open Android Studio, click **New Project**, and select **Game Activity (C++)** as shown below: -![New Game Activity #center](./images/android_studio_new_game_activity.png "New C++ Game Activity") +![New Game Activity #center](./images/android_studio_new_game_activity.png "Figure 4: New C++ Game Activity") -Set the `Name` to be `dawnwebgpu`. - -Select `Next` to continue. - -Finish the new project creation by accepting all defaults until the project is created. +* Set the **Name** field to **dawnwebgpu**. +* Click **Next** to continue. +* Finish setting up the new project by accepting all the default selections until the project is created. The project is created in `~/AndroidStudioProjects`. 
@@ -28,23 +26,25 @@ GameActivity is a Jetpack library designed to assist Android games in processing GameActivity is a direct descendant of NativeActivity and shares a similar architecture: -![Game Activity Architecture #center](./images/GameActivityArchitecture.png "Game Activity Architecture") +![Game Activity Architecture #center](./images/GameActivityArchitecture.png "Figure 5: Game Activity Architecture") -With GameActivity, you can focus on your game development and avoid spending excessive time dealing with the Java Native Interface (JNI) code. +With GameActivity, you can focus on game development and avoid spending excessive amounts of time dealing with the Java Native Interface (JNI) code. GameActivity performs the following functions: -* Interacts with the Android framework through the Java-side component. -* Passes app cycle commands, input events, and input text to the native side. -* Renders into a SurfaceView, making it easier for games to interact with other UI components. +* It interacts with the Android framework through the Java-side component. +* It passes app cycle commands, input events, and input text to the native side. +* It renders into a SurfaceView, making it easier for games to interact with other UI components. {{% notice Tip %}} -You can find more information about Android Game Activity and its capabilities in the [Game Activity documentation](https://developer.android.com/games/agdk/game-activity). +You can find more information about Android Game Activity and its capabilities in the [Game Activity Documentation](https://developer.android.com/games/agdk/game-activity). {{% /notice %}} ## Download project source files -To create a WebGPU application, a number of files from GitHub are doing to be added to your Game Activity project. The objective is to show you how to take the Game Activity template and modify it to become a WebGPU application. 
+The process of creating a WebGPU application involves adding a number of files from GitHub to your Game Activity project. + +The objective of this part of the Learning Path is to show you how to take the Game Activity template and modify it to become a WebGPU application. To get started, open a terminal, create a new directory, and download the project files: @@ -59,27 +59,22 @@ Unzip the project files: unzip main.zip ``` -Yow now have a directory named `Android_DawnWebGPU-main` in your `webgpu-files` directory. +You now have a directory named **Android_DawnWebGPU-main** in the **webgpu-files** directory. -During the next sections you will copy some of the required files from the `Android_DawnWebGPU-main` directory to your Game Activity project to learn how to create WebGPU applications. +During the following sections, you will copy some of the required files from the **Android_DawnWebGPU-main** directory into your Game Activity project to learn how to create WebGPU applications. ## Upgrade the application to include Dawn Return to Android Studio and start work on the WebGPU application. -The Android Game Activity framework uses OpenGLES3 for graphics. - -You can remove this dependency and replace it with WebGPU. +The Android Game Activity framework uses OpenGLES3 for graphics. You can remove this dependency and replace it with WebGPU. Add WebGPU to the project using the following steps: -1. In Android Studio, navigate to the project view and find the `app` --> `cpp` folder. - -Open terminal in Android Studio. You should be in the `dawnwebgpu` directory. - -2. Create a new directory and download the WebGPU header file from GitHub - -Run the commands below to download the `webgpu.hpp` header file: +* In Android Studio, navigate to the project view, and find the **app** --> **cpp** folder. +* Open the terminal in Android Studio. You are now in the **dawnwebgpu** directory. +* Create a new directory and download the WebGPU header file from GitHub. 
+* Run the commands below to download the `webgpu.hpp` header file: ```console mkdir -p app/src/main/cpp/webgpu/include/webgpu @@ -88,7 +83,7 @@ cp ~/webgpu-files/Android_DawnWebGPU-main/app/src/main/cpp/webgpu/include/webgpu cd ../.. ``` -3. Next copy the remaining WebGPU files to your project. +Next, copy the remaining WebGPU files to your project. ```console cp ~/webgpu-files/Android_DawnWebGPU-main/app/src/main/cpp/webgpu/CMakeLists.txt . @@ -101,19 +96,19 @@ cd .. Notice that `FetchDawn.cmake` uses a stable `chromium/6536` branch of Dawn repository. {{% notice Note %}} -WebGPU is constantly evolving standard and hence its implementation, Dawn is also under active development. For sake of stability, we have chosen a stable branch for our development. Updating to latest or different branch may cause breakage. +WebGPU is a constantly evolving standard, and hence its implementation, Dawn, is also under active development. For the sake of stability, a stable branch has been chosen for development. Updating to the latest or a different branch may cause breakage. {{% /notice %}} -To add Dawn to our application, there are 2 options: +To add Dawn to our application, there are two options: * Create a shared/static library from the Dawn source and use it in application. * Download the source as a dependency and build it as part of the project build. -You will use the second option, since it provides more debug flexibility. +You will use the second option here, since it provides more flexibility for debugging. The files `webgpu/webgpu.cmake` and `CMakeLists.txt` facilitate downloading and building WebGPU with Dawn implementation and integrating Dawn into the project. -4. Add WebGPU to the project. +## Add WebGPU to the project WebGPU is added to the project in the file `CMakeLists.txt`. @@ -153,6 +148,6 @@ target_link_libraries(dawnwebgpu ``` -The `webgpu.hpp` header file acts like an interface, exposing all WebGPU functions and variables to the main Application.
+The `webgpu.hpp` header file acts like an interface, exposing all the WebGPU functions and variables to the main Application. Navigate to the next section to continue building the WebGPU application. diff --git a/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/4-using-webgpu-apis.md b/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/4-using-webgpu-apis.md index a3acd342c2..eec6f8f291 100755 --- a/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/4-using-webgpu-apis.md +++ b/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/4-using-webgpu-apis.md @@ -6,15 +6,17 @@ weight: 5 layout: learningpathall --- -## Setup project +## Project Setup -With the `webgpudawn` library integrated, you can start by removing the extra files included as part of the stock Game Activity project. +With the `webgpudawn` library integrated, you can begin removing the extra files included as part of the stock Game Activity project. -1. Delete all the files from the top `cpp` directory except `CMakeLists.txt`. +* Delete all the files from the top `cpp` directory, except `CMakeLists.txt`. - You have already reviewed `CMakeLists.txt` in the previous section. + {{% notice Note %}} +You have already reviewed `CMakeLists.txt` in the previous section. +{{% /notice %}} -2. Add the files `webgpuRenderer.cpp` and `webgpuRenderer.h` files for the WebGPU application. +* Add the files `webgpuRenderer.cpp` and `webgpuRenderer.h` files for the WebGPU application. Run the commands below to add a new `main.cpp` and WebGPU renderer files: @@ -34,7 +36,7 @@ With the `webgpudawn` library integrated, you can start by removing the extra fi There are several layers of abstraction between a device GPU and an application running the WebGPU API. 
-![WebGPU Application Interface #center](images/webgpu_app_interface.png "WebGPU Application Interface") +![WebGPU Application Interface #center](images/webgpu_app_interface.png "Figure 6: WebGPU Application Interface") It is useful to understand these layers as you begin to use WebGPU APIs in an application. @@ -48,27 +50,27 @@ It is useful to understand these layers as you begin to use WebGPU APIs in an ap ### The adapter -Before requesting access to a **device**, you need to select an **adapter**. +Before requesting access to a device, you need to select an adapter. -The same host system may expose multiple adapters if it has access to multiple physical GPUs. It may also have an adapter that represents an emulated/virtual device. Each adapter advertises a list of optional **features** and **supported limits** that it can handle. +The same host system might expose multiple adapters if it has access to multiple physical GPUs. It might also have an adapter that represents an emulated/virtual device. Each adapter offers optional features and supported limits that it can handle. -These are used to determine the overall capabilities of the system before **requesting the device**. The **adapter** is used to **access the capabilities** of the user’s hardware, which are used to select the behavior of your application among very different code paths. +You can use these features and limits to determine the overall capabilities of the system before requesting the device. The adapter is used to access the capabilities of the user’s hardware, which are used to select the behavior of your application among different code paths. -Once a code path is chosen, a device is created with the chosen capabilities. Only the capabilities selected for this device are allowed in the rest of the application. This way, it is **not** possible to inadvertently rely on capabilities specific to a device. +Once a code path is chosen, a device is created with the chosen capabilities. 
Only the capabilities selected for this device are allowed in the rest of the application. This way, it is not possible to inadvertently rely on capabilities specific to a device. -![Supported Limits #center](images/adapter_supported_limits.png "Adapter Supported Limits") +![Supported Limits #center](images/adapter_supported_limits.png "Figure 7: Adapter Supported Limits") {{% notice Tip %}} -In an advanced use of the adapter/device duality, you can set up multiple limit presets and select one depending on the adapter. +To use the adapter/device duality in an advanced way, you can set up multiple limit presets and select one depending on the adapter. -In this case, there is a single preset and abort early if it is not supported. +In this scenario, there is a single preset and an abort early option if it is not supported. {{% /notice %}} ### Requesting the adapter An adapter is not something you create, but rather something that you *request* using the function `requestAdapter()`. -Before doing that you need to create an instance using the `createInstance()` function. +Before doing this you need to create an instance using the `createInstance()` function. ```C++ wgpu::Instance instance = createInstance(InstanceDescriptor{}); @@ -78,7 +80,7 @@ In order to display something on the screen, the operating system needs to provi The Game Activity provides a *pApp* member which exposes an Android Window. WebGPU can use an Android Window for rendering. -WebGPU cannot use the *window* directly, but uses something called **a surface**, which can be easily created using the window. +WebGPU cannot use the *window* directly, but uses something called a "surface", which can be easily created using the window. 
```C++ wgpu::SurfaceDescriptorFromAndroidNativeWindow platformSurfaceDescriptor = {}; @@ -129,7 +131,7 @@ There are few options to set the limits: wgpu::RequiredLimits requiredLimits = Default; ``` -* Query the Adapter's *supported limits* and use them as *required limits*: +* Query the Adapter's *supported limits*, and use them as *required limits*: ```C++ wgpu::SupportedLimits supportedLimits; @@ -138,7 +140,7 @@ wgpu::RequiredLimits requiredLimits = Default; requireLimits.limits = supportedLimits.limits; ``` -* Query the Adapter's *supported limits* and define specific *better* limits in the *required limits*: +* Query the Adapter's *supported limits*, and define specific *better* limits in the *required limits*: ```C++ wgpu::SupportedLimits supportedLimits; @@ -153,9 +155,9 @@ requiredLimits.limits.minUniformBufferOffsetAlignment = supportedLimits.limits.m ``` {{% notice Tip %}} -Setting *better* limits may not be desirable, as doing so may have a performance impact. To improve portability across devices and implementations, applications should generally only request better limits if they are required. +Setting *better* limits might not be preferable, as doing so might have a consequential performance impact. To improve portability across devices and implementations, applications should generally only request better limits if they are required. -It is recommended to read more about ["Supported Limits"](https://developer.mozilla.org/en-US/docs/Web/API/GPUSupportedLimits) and ["limits"](https://gpuweb.github.io/gpuweb/#limits). +It is recommended that you read more about ["Supported Limits"](https://developer.mozilla.org/en-US/docs/Web/API/GPUSupportedLimits) and ["Limits"](https://gpuweb.github.io/gpuweb/#limits).
{{% /notice %}} Use the `requestDevice()` API to request device: @@ -179,7 +181,7 @@ static auto errorCallback = device.setUncapturedErrorCallback([](ErrorType type, ``` {{% notice Tip %}} -While creating a device, use a callback function `setUncapturedErrorCallback`, this helps in capturing validation and other errors with the WebGPU device. +While creating a device, use the callback function `setUncapturedErrorCallback`. This helps in capturing validation and other errors with the WebGPU device. {{% /notice %}} -Proceed to learn how to render 3D objects. \ No newline at end of file +You can now move on to learn how to render 3D objects. \ No newline at end of file diff --git a/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/5-render-a-simple-3D-object-part-1.md b/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/5-render-a-simple-3D-object-part-1.md index f807a40b92..cd8d77ac10 100644 --- a/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/5-render-a-simple-3D-object-part-1.md +++ b/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/5-render-a-simple-3D-object-part-1.md @@ -8,40 +8,40 @@ layout: learningpathall ## Command queue -Graphic applications have to deal with 2 processors, the CPU and the GPU. +Graphic applications must operate with two processors, the CPU and the GPU. -These 2 processors run on different timelines. For optimal performance, commands intended for the GPU are batched and sent through a command queue. The GPU consumes this queue whenever it is ready, and this way processors minimize the time spent idling for their sibling to respond. +These two processors run on different timelines. For optimal performance, commands intended for the GPU are batched and sent through a command queue. The GPU consumes this queue whenever it is ready, and in this way, processors minimize the time spent idling for their sibling to respond. 
A WebGPU device has a single queue, which is used to send both commands and data. You can get it with `wgpuDeviceGetQueue()`. -WebGPU offers 3 different ways to submit work to this queue: +WebGPU offers three different ways to submit work to this queue: -* wgpuQueueSubmit -* wgpuQueueWriteBuffer -* wgpuQueueWriteTexture +* wgpuQueueSubmit. +* wgpuQueueWriteBuffer. +* wgpuQueueWriteTexture. {{% notice Note %}} -Other graphics API allow you to build multiple queues per device, and future version of WebGPU might as well. +Other graphics APIs allow you to build multiple queues per device. -If you want to learn more, refer to the [Command Queue](https://eliemichel.github.io/LearnWebGPU/getting-started/the-command-queue.html) +If you would like to learn more, refer to the [Command Queue](https://eliemichel.github.io/LearnWebGPU/getting-started/the-command-queue.html). {{% /notice %}} ## Getting started to render a 3D object -WebGPU is a very simple system. All it does is run 3 types of functions on the GPU: Vertex Shaders, Fragment Shaders, and Compute Shaders. +WebGPU is a simple system. It runs three types of functions on the GPU: -* A Vertex Shader computes vertices. The shader returns vertex positions. -* A Fragment Shader computes colors, they indirectly write data to textures. That data does not have to be colors. -* A Compute Shader is more generic. It’s effectively a function you call and say “execute this function N times” +* A Vertex Shader that computes vertices. The shader returns vertex positions. +* A Fragment Shader that computes colors. When an object is drawn, for each pixel to be drawn the GPU calls the fragment shader. The fragment shader then returns a color. +* A Compute Shader that is more generic. It is effectively a function that you can call and request to execute as many times as you require. 
-Here is a simplified diagram of WebGPU setup to draw triangles by using a vertex shader and a fragment shader: +Here is a simplified diagram of a WebGPU setup to draw triangles by using a vertex shader and a fragment shader: -!["Triangle using WebGPU" #center](images/webgpu-draw-high-level.svg "Triangle using WebGPU") +!["Triangle using WebGPU" #center](images/webgpu-draw-high-level.svg "Figure 8: Triangle using WebGPU") The main things to notice in the above image are: -* There is a **Pipeline**. It contains the vertex shader and fragment shader the GPU will run. You could also have a pipeline with a compute shader. -* The shaders reference resources (buffers, textures, samplers) indirectly through **Bind Groups**. +* There is a **Pipeline** that contains the vertex shader and fragment shader the GPU runs. Alternatively, you might also have a pipeline with a compute shader. +* The shaders reference resources such as buffers, textures, and samplers, indirectly through **Bind Groups**. * The pipeline defines attributes that reference buffers indirectly through the internal state. * Attributes pull data out of buffers and feed the data into the vertex shader. * The vertex shader may feed data into the fragment shader. @@ -50,7 +50,7 @@ The main things to notice in the above image are: To execute shaders on the GPU, you need to create all of these resources and set up this state. Creation of resources is relatively straightforward. {{% notice Note %}} -Most WebGPU resources can not be changed after creation. You can change their contents but not their size, usage, and format. +Most WebGPU resources cannot be changed after creation. You can change their contents but not their size, usage, and format. If you want to change something create a new resource and destroy the old one. {{% /notice %}} @@ -61,51 +61,50 @@ In order to achieve high performance real-time 3D rendering, the GPU processes s To do so, WebGPU provides a Render Pipeline object. 
The figure below illustrates the sequence of data processing stages executed by the render pipeline. -!["Render Pipeline" #center](images/render-pipeline.svg "Render Pipeline") +!["Render Pipeline" #center](images/render-pipeline.svg "Figure 9: Render Pipeline") -The Render Pipeline has 2 main types of stages, **fixed-function** and **programmable**. +The Render Pipeline has two main types of stages, **fixed-function** and **programmable**. ### Fixed Functions stages The pipeline description consists of the following steps: -* Describe vertex pipeline state -* Describe vertex pipeline state -* Describe primitive pipeline state -* Describe fragment pipeline state -* Describe stencil/depth pipeline state -* Describe multi-sampling state -* Describe pipeline layout +* Describe the vertex pipeline state. +* Describe the primitive pipeline state. +* Describe the fragment pipeline state. +* Describe the stencil/depth pipeline state. +* Describe the multi-sampling state. +* Describe the pipeline layout. -The fixed function stages are well documented and you can refer to [code](https://github.com/varunchariArm/Android_DawnWebGPU/blob/main/app/src/main/cpp/webgpuRenderer.cpp#L256) and [further reading](https://eliemichel.github.io/LearnWebGPU/basic-3d-rendering/hello-triangle.html#lit-24) for configuring them. +The fixed function stages are well-documented, and you can refer to [code](https://github.com/varunchariArm/Android_DawnWebGPU/blob/main/app/src/main/cpp/webgpuRenderer.cpp#L256) and [further reading](https://eliemichel.github.io/LearnWebGPU/basic-3d-rendering/hello-triangle.html#lit-24) for information about configuring them. -Configuring these stages is straight forward and is similar to other graphics APIs. +Configuring these stages is straightforward and is similar to other graphics APIs. ### Programmable stage -There are two programmable stages, vertex and fragment programmable stages. Both of them uses **Shader Module**. 
+There are two programmable stages, vertex and fragment programmable stages. Both of them use the **Shader Module**. ### Shaders Both the vertex and fragment programmable stages can use the same shader module or have individual shader modules. -The Shader module is kind of a dynamic library (like a .dll, .so or .dylib file), except that it talks the binary language of your GPU rather than your CPU. +The Shader module is like a dynamic library (such as a .dll, .so, or a .dylib file), except that it uses the binary language of your GPU rather than that of your CPU. ### Shader Code The shader language officially used by WebGPU is called WebGPU Shading Language, [WGSL](https://gpuweb.github.io/gpuweb/wgsl/). -All implementations of WebGPU support it, and Dawn also offers the possibility to provide shaders written in [SPIR-V](https://www.khronos.org/spir). +All implementations of WebGPU support it, and Dawn also offers the opportunity to provide shaders written in [SPIR-V](https://www.khronos.org/spir). {{% notice Note %}} -WGSL was originally designed to be a human-editable version of SPIR-V programming model, so transpilation from SPIR-V to WGSL is in theory efficient and lossless. You can use [Naga](https://github.com/gfx-rs/naga) or [Tint](https://dawn.googlesource.com/tint) to translate. +WGSL was originally designed to be a human-editable version of the SPIR-V programming model, so transpilation from SPIR-V to WGSL is in theory efficient and lossless. You can use [Naga](https://github.com/gfx-rs/naga) or [Tint](https://dawn.googlesource.com/tint) to translate. {{% /notice %}} -It is highly recommended to understand WGSL syntax and capabilities to better program in WebGPU. +It is recommended that you learn about the WGSL syntax and capabilities to better program in WebGPU. ### Shader Module Creation -It is simple to create a Shader module in WebGPU: +It is simple to create a Shader module in WebGPU. 
Use the following code: ```C++ ShaderModuleDescriptor shaderDesc; @@ -116,15 +115,15 @@ By default the `nextInChain` member of `ShaderModuleDescriptor` is a `nullptr`. The `nextInChain` pointer is the entry point of WebGPU’s extension mechanism. It is either null or pointing to a structure of type `WGPUChainedStruct`. -It may recursively have a next element (again, either null or pointing to some `WGPUChainedStruct`). +It may recursively have a next element (again, either null or pointing to a `WGPUChainedStruct`). -Second, it has a struct type `sType`, which is an enum telling in which struct the chain element can be cast. +Secondly, it has the struct type `sType`, which is an enum reporting in which struct the chain element can be cast. To create a shader module from WGSL code, use the `ShaderModuleWGSLDescriptor` SType. In Dawn, a SPIR-V shader can similarly be created using the `WGPUShaderModuleSPIRVDescriptor`. -The field shaderCodeDesc.chain corresponds to the chained struct when cast as a simple `WGPUChainedStruct`, which must be set to the corresponding SType enum value: +The field shaderCodeDesc.chain corresponds to the chained struct when cast as a simple `WGPUChainedStruct`, which must be set to the corresponding SType enum value: ```C++ ShaderModuleWGSLDescriptor shaderCodeDesc; diff --git a/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/6-render-a-simple-3D-object-part-2.md b/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/6-render-a-simple-3D-object-part-2.md index 6378bd1a67..8cd399a404 100644 --- a/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/6-render-a-simple-3D-object-part-2.md +++ b/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/6-render-a-simple-3D-object-part-2.md @@ -8,34 +8,34 @@ layout: learningpathall --- ## 3D meshes -Once a Render Pipeline is created, you can use WebGPU APIs to create and render a 3D mesh. This is very similar to other graphics APIs. 
+Once a Render Pipeline is created, you can use WebGPU APIs to create and render a 3D mesh. This is similar to other graphics APIs. The steps are listed below: -* Create the Vertex Buffer(s) -* Create the Index Buffer(s) -* Create the Uniform Buffer(s) -* Create a Depth Buffer (Z-Buffer algorithm) -* Create the Depth Texture and TextureView -* Create a Depth Stencil -* Create the Transformation and Projection matrices +* Create the Vertex Buffer(s). +* Create the Index Buffer(s). +* Create the Uniform Buffer(s). +* Create a Depth Buffer (Z-Buffer algorithm). +* Create the Depth Texture and TextureView. +* Create a Depth Stencil. +* Create the Transformation and Projection matrices. -All these steps are common in graphics programming and WebGPU offers capability to perform all the operations. +All these steps are common in graphics programming, and WebGPU allows you to perform these same operations. -It is recommended to go through individual chapters in the [3D rendering](https://eliemichel.github.io/LearnWebGPU/basic-3d-rendering/index.html) section to learn more. +It is a good idea to read the individual chapters in the [3D Rendering](https://eliemichel.github.io/LearnWebGPU/basic-3d-rendering/index.html) section to learn more. ### Loading 3D objects -In this project you can use OBJ files to define 3D meshes. +In this Learning Path you can use OBJ files to define 3D meshes. -Instead of manually parsing OBJ files, use the [TinyOBJLoader](https://github.com/tinyobjloader/tinyobjloader) library. +Instead of manually parsing OBJ files, you can use the [TinyOBJLoader](https://github.com/tinyobjloader/tinyobjloader) library. -The file format is not complex, but parsing files is not the goal of this Learning Path. +The file format is not complex, but parsing files is out of the scope of this Learning Path. You can use open-source software such as Blender to create your own 3D objects. 
{{% notice Note %}} -Exactly one of your source files must define `TINYOBJLOADER_IMPLEMENTATION` before including it: +Only one of your source files must define `TINYOBJLOADER_IMPLEMENTATION` before including it: ```C++ #define TINYOBJLOADER_IMPLEMENTATION // add this to exactly 1 of your C++ files @@ -51,7 +51,7 @@ You are now ready to render a 3D object. You can run a rendering pass and *draw* something onto our *surface*. -To encode any commands to be issued to GPU, you need to create a `CommandEncoder`. Modern APIs record commands into command buffers,rather than issuing commands one by one, and submit all of them at once. +To encode any commands to be issued to GPU, you need to create a `CommandEncoder`. Modern APIs record commands into command buffers, rather than issuing commands one by one, and submit all of them at once. In WebGPU, this is done through a `CommandEncoder` as shown below: @@ -123,7 +123,7 @@ Make sure you release the created encoders and buffers by calling the respective {{% /notice %}} {{% notice Note %}} -By default Dawn runs callbacks only when the device “ticks”, so the error callbacks are invoked in a different call stack than where the error occurred, making the breakpoint less informative. +By default, Dawn runs callbacks only when the device “ticks”, so the error callbacks are invoked in a different call stack than where the error occurred, making the breakpoint less informative. To force Dawn to invoke error callbacks as soon as there is an error, you can enable an instance toggle: @@ -174,6 +174,6 @@ For example: Now click the **Run** icon in Android Studio, which builds the application and launches it on the connected device, producing the following output: -![Output #center](images/output.gif "Output") +![Output #center](images/output.gif "Figure 10: Output") -Congratulations! You are run a WebGPU application on an Android device. \ No newline at end of file +Congratulations! 
You have now run a WebGPU application on an Android device. \ No newline at end of file diff --git a/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/7-Profiling-App-using-Streamline.md b/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/7-Profiling-App-using-Streamline.md index 583395c6b4..1f5326e38d 100644 --- a/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/7-Profiling-App-using-Streamline.md +++ b/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/7-Profiling-App-using-Streamline.md @@ -6,54 +6,54 @@ weight: 8 layout: learningpathall --- -## Setup Arm Streamline +## Set up Arm Streamline -Follow these steps to configure Arm Streamline Performance Analyzer to capture Mali GPU related data: +Follow these steps to configure Arm Streamline Performance Analyzer to capture Mali GPU-related data: -* Confirm you Android device is connected to the development machine. -* Navigate to **Start tab**, select **Android (adb)**, select the device and then select the application to debug. -* Click on **Select Counters** +* Confirm that your Android device is connected to the development machine. +* Navigate to the **Start** tab, select **Android (adb)**, select the device, and then select the application to debug. +* Select **Select Counters**. -![Select device #center](images/streamline_select.png "Select device") +![Select device #center](images/streamline_select.png "Figure 11: Select device") -which opens a new window: +This opens a new window: -![Select counters #center](images/streamline_select_counters.png "Select counters") +![Select counters #center](images/streamline_select_counters.png "Figure 12: Select counters") -* Search for **Mali Timeline Events: Perfetto** -* Make sure it is listed in the **Events to collect** -* Click Save +* Search for **Mali Timeline Events: Perfetto**. +* Check that it is listed in **Events to collect**. +* Select **Save**. 
## Profiling the application using Streamline -Once you have selected the device, the application and metrics to be collected, click on the **start capture** button. +Once you have selected the device, the application, and the metrics to be collected, click on the **start capture** button. This automatically starts the application and begins collecting the profiling data. -Make sure the application is running as desired on your Android device. After a few seconds, you can stop the capture. +Make sure the application is running correctly on your Android device. After a few seconds, you can stop the capture process. Wait until Streamline completes processing the data. -Switch to *Mali Timeline* view as shown below: +Switch to **Mali Timeline** view as shown below: -!["Mali Timeline Streamline" #center](images/Streamline-mali-timeline.png "Mali Timeline Streamline") +!["Mali Timeline Streamline" #center](images/Streamline-mali-timeline.png "Figure 13: Mali Timeline Streamline") -You may have to zoom into the data to the maximum (`500 us`), since you are rendering a very simple 3D object. +You might have to zoom into the data up to the maximum (**500 us**), as you are rendering a simple 3D object. -You can analyze 2 consecutive frames as shown below: +You can analyze two consecutive frames as shown below: -!["Two consecutive frames" #center](./images/Streamline-mali-analysis.png "Two consecutive frames") +!["Two Consecutive Frames" #center](./images/Streamline-mali-analysis.png "Figure 14: Two Consecutive Frames") Arm has worked with the Dawn team to optimize data uploading to GPU buffers for Mali GPUs. -Arm has implemented a **Fast Path** mechanism wherein the Vertex Queue starts processing in parallel while an earlier Fragment Queue is being processed. +Arm has implemented a **Fast Path** mechanism where the Vertex Queue starts processing in parallel while an earlier Fragment Queue is simultaneously being processed. 
-As you can see from the above picture, there is some overlap between Fragment Queue of first frame and Vertex Queue of the consecutive frame. +As you can see from the above picture, there is some overlap between the Fragment Queue of the first frame and the Vertex Queue of the consecutive frame. -This shows that the application is hitting the **Fast Path** that Arm has implemented to optimize performance of Dawn for Mali GPUs. +This demonstrates that the application is hitting the **Fast Path** that Arm has implemented to optimize performance of Dawn for Mali GPUs. -The overlap is small since the application is rendering the same simple 3D object under different orientation. You can extend the application to render complex objects with multiple *Uniform Buffers*. This would show the overlap in more detail. +The overlap is small as the application is rendering the same simple 3D object under a different orientation. You can extend the application to render complex objects with multiple *Uniform Buffers*. This demonstrates the overlap in more detail. {{% notice Tip %}} -Feel free to experiment with different counters in Streamline and explore the other CPU profiling data as well. +You can experiment with different counters in Streamline and also explore other CPU profiling data. {{% /notice %}} diff --git a/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/_index.md b/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/_index.md index f1a307a26b..5fa6f5e6c9 100644 --- a/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/_index.md +++ b/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/_index.md @@ -1,34 +1,34 @@ --- title: Build and profile a simple WebGPU Android Application - -draft: true cascade: - draft: true - minutes_to_complete: 90 -who_is_this_for: This is an introductory topic for developers building GPU based Android applications and interested in trying WebGPU. 
+who_is_this_for: This is an introductory topic for developers who are building GPU-based Android applications and are interested in experimenting with WebGPU. learning_objectives: - - Understand the benefits of WebGPU and Dawn, a WebGPU implementation. + - Describe the benefits of WebGPU. + - Describe the benefits of using Dawn. - Set up a WebGPU development environment. - Integrate Dawn in an Android Application. - Use Dawn WebGPU APIs in the application. - - Understand the changes required to upgrade to WebGPU to render a simple 3D object. + - Describe the changes required to upgrade to WebGPU to render a simple 3D object. - Build and run a WebGPU Android Application. - - Profile the Application using Streamline. + - Profile the application using Streamline. - Analyze the profiling data. - + prerequisites: - - Basic knowledge of graphics APIs and experience with developing Android graphics applications. + - Basic knowledge of graphics APIs and experience in developing Android graphics applications. - A development machine with Android Studio, Blender, and Arm Streamline installed. - An Android phone in developer mode. + - Android Studio. + - Arm Performance Studio. + - Python 3.10 or later. author_primary: Varun Chari, Albin Bernhardsson ### Tags skilllevels: Advanced -subjects: GPU +subjects: Graphics armips: - Cortex-A tools_software_languages: diff --git a/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/_review.md b/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/_review.md index f12277b62d..25383d3999 100644 --- a/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/_review.md +++ b/content/learning-paths/smartphones-and-mobile/android_webgpu_dawn/_review.md @@ -4,12 +4,12 @@ review: question: > What is WebGPU? answers: - - A highly customized API for specific GPUs. + - A highly-customized API for specific GPUs. 
- APIs designed to provide unified access to GPUs whichever the GPU vendor and operating system the application runs with. - APIs designed for cloud-based applications. correct_answer: 2 explanation: > - WebGPU is a Render Hardware Interface built on top of the various APIs provided by the driver/OS depending on your platform. This duplicated development effort is made once by the web browsers and made available to us through the webgpu.h header they provide + WebGPU is a Render Hardware Interface built on top of the various APIs provided by the driver/OS depending on your platform. This duplicated development effort is made once by the web browsers and made available through the webgpu.h header that they provide. - questions: question: > @@ -20,7 +20,7 @@ review: - A new programming language to program GPUs. correct_answer: 1 explanation: > - Dawn is an open-source and cross-platform implementation of the WebGPU standard, lead by Google. More precisely it implements webgpu.h that is a one-to-one mapping with the WebGPU IDL. + Dawn is an open-source and cross-platform implementation of the WebGPU standard, led by Google. More precisely, it implements webgpu.h that is a one-to-one mapping with the WebGPU IDL. - questions: question: > @@ -28,10 +28,10 @@ review: answers: - A profiling tool to profile CPUs. - A profiling tool to profile GPUs. - - A a comprehensive profiling software to profile both CPUs and GPUs. + - A comprehensive profiling software to profile both CPUs and GPUs. correct_answer: 3 explanation: > - Streamline is an application profiler that can capture data from multiple sources, including Program Counters (PC), Samples from the hardware Performance Monitoring Unit (PMU) counters in the Arm CPU, Arm® Mali™ GPUs, and Arm Immortalis™ GPUs. 
+ Streamline is an application profiler that can capture data from multiple sources, including Program Counters (PC), samples from the hardware Performance Monitoring Unit (PMU) counters in the Arm CPU, Arm® Mali™ GPUs, and Arm Immortalis™ GPUs. diff --git a/content/learning-paths/smartphones-and-mobile/build-android-chat-app-using-onnxruntime/2-build-onnxruntime.md b/content/learning-paths/smartphones-and-mobile/build-android-chat-app-using-onnxruntime/2-build-onnxruntime.md index d6541e2bd6..93ff353ee3 100644 --- a/content/learning-paths/smartphones-and-mobile/build-android-chat-app-using-onnxruntime/2-build-onnxruntime.md +++ b/content/learning-paths/smartphones-and-mobile/build-android-chat-app-using-onnxruntime/2-build-onnxruntime.md @@ -15,7 +15,7 @@ ONNX Runtime is an open-source inference engine designed to accelerate the deplo ### Clone onnxruntime repo -Open up a Windows Powershell and checkout the source tree: +Open up a Windows PowerShell and checkout the source tree: ```bash cd C:\Users\$env:USERNAME diff --git a/content/learning-paths/smartphones-and-mobile/build-android-chat-app-using-onnxruntime/3-build-onnxruntime-generate-api.md b/content/learning-paths/smartphones-and-mobile/build-android-chat-app-using-onnxruntime/3-build-onnxruntime-generate-api.md index 4ca2983bec..0d7e417440 100644 --- a/content/learning-paths/smartphones-and-mobile/build-android-chat-app-using-onnxruntime/3-build-onnxruntime-generate-api.md +++ b/content/learning-paths/smartphones-and-mobile/build-android-chat-app-using-onnxruntime/3-build-onnxruntime-generate-api.md @@ -18,7 +18,7 @@ You can learn more by reading the [ONNX Runtime generate() API page](https://onn ### Clone onnxruntime-genai repo -Within your Windows Powershell prompt, checkout the source repo: +Within your Windows PowerShell prompt, checkout the source repo: ```bash C:\Users\$env:USERNAME diff --git a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/Streamline.png 
b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/Streamline.png new file mode 100644 index 0000000000..e02ea645ce Binary files /dev/null and b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/Streamline.png differ diff --git a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/_index.md b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/_index.md new file mode 100644 index 0000000000..09b49edbdb --- /dev/null +++ b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/_index.md @@ -0,0 +1,42 @@ +--- +title: Profile the performance of ML models on Arm + +draft: true +cascade: + draft: true + +minutes_to_complete: 60 + +who_is_this_for: This is an introductory topic for software developers who want to learn how to profile the performance of their ML models running on Arm devices. + +learning_objectives: + - Profile the execution times of ML models on Arm devices. + - Profile ML application performance on Arm devices. + +prerequisites: + - An Arm-powered Android smartphone, and USB cable to connect with it. + +author_primary: Ben Clark + +### Tags +skilllevels: Introductory +subjects: ML +armips: + - Cortex-X + - Cortex-A + - Mali + - Immortalis +tools_software_languages: + - Android Studio + - tflite +operatingsystems: + - Android + - Linux + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. 
+--- diff --git a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/_next-steps.md b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/_next-steps.md new file mode 100644 index 0000000000..f468cb1b80 --- /dev/null +++ b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/_next-steps.md @@ -0,0 +1,20 @@ +--- +next_step_guidance: You might be interested in learning how to profile your Unity apps on Android. + +recommended_path: /learning-paths/smartphones-and-mobile/profiling-unity-apps-on-android/ + +further_reading: + - resource: + title: Arm Streamline User Guide + link: https://developer.arm.com/documentation/101816/latest/ + type: documentation + + + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +weight: 21 # set to always be larger than the content in this path, and one more than 'review' +title: "Next Steps" # Always the same +layout: "learningpathall" # All files under learning paths have this same wrapper +--- diff --git a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/_review.md b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/_review.md new file mode 100644 index 0000000000..7eae5a8b1b --- /dev/null +++ b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/_review.md @@ -0,0 +1,45 @@ +--- +review: + - questions: + question: > + Streamline Profiling lets you profile: + answers: + - Arm CPU activity + - Arm GPU activity + - when your Neural Network is running + - All of the above + correct_answer: 4 + explanation: > + Streamline will show you CPU and GPU activity (and a lot more counters!), and if Custom Activity Maps are used, you can see when your Neural Network and other parts of your application are running. + + - questions: + question: > + Does Android Studio have a profiler? 
+ answers: + - "Yes" + - "No" + correct_answer: 1 + explanation: > + Yes, Android Studio has a built-in profiler that can be used to monitor the memory usage of your app among other things + + - questions: + question: > + Is there a way to profile what is happening inside your Neural Network? + answers: + - Yes, Streamline just shows you out of the box + - No. + - Yes, ArmNN's ExecuteNetwork can do this + - Yes, Android Studio Profiler can do this + correct_answer: 3 + explanation: > + Standard profilers don't have an easy way to see what is happening inside an ML framework to see a model running inside it. ArmNN's ExecuteNetwork can do this for TensorFlow Lite models, and ExecuTorch has tools that can do this for PyTorch models. + + + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +title: "Review" # Always the same title +weight: 20 # Set to always be larger than the content in this path +layout: "learningpathall" # All files under learning paths have this same wrapper +--- diff --git a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/android-profiling-version.png b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/android-profiling-version.png new file mode 100644 index 0000000000..7e058f009f Binary files /dev/null and b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/android-profiling-version.png differ diff --git a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/app-profiling-android-studio.md b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/app-profiling-android-studio.md new file mode 100644 index 0000000000..9f8508f3a8 --- /dev/null +++ b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/app-profiling-android-studio.md @@ -0,0 +1,45 @@ +--- +title: Memory Profiling with Android Studio +weight: 4 + +### FIXED, DO 
NOT MODIFY +layout: learningpathall +--- + +## Android Memory Profiling +Memory is often a problem in ML, with ever bigger models and data. For profiling an Android app's memory, Android Studio has a built-in profiler. This can be used to monitor the memory usage of your app, and to find memory leaks. + +To find the Profiler, open your project in Android Studio and click on the *View* menu, then *Tool Windows*, and then *Profiler*. This opens the Profiler window. Attach your device in Developer Mode with a USB cable, and then you should be able to select your app's process. Here there are a number of different profiling tasks available. + +Most likely with an Android ML app you'll need to look at memory both from the Java/Kotlin side and the native side. The Java/Kotlin side is where the app runs, and may be where buffers are allocated for input and output if, for example, you're using LiteRT (formerly known as TensorFlow Lite). The native side is where the ML framework will run. Looking at the memory consumption for Java/Kotlin and native is 2 separate tasks in the Profiler: *Track Memory Consumption (Java/Kotlin Allocations)* and *Track Memory Consumption (Native Allocations)*. + +Before you start either task, you have to build your app for profiling. The instructions for this and for general profiling setup can be found [here](https://developer.android.com/studio/profile). You will want to start the correct profiling version of the app depending on the task. + +![Android Studio profiling run types alt-text#center](android-profiling-version.png "Figure 1. Profiling run versions") + +For the Java/Kotlin side, you want the **debuggable** "Profile 'app' with complete data", which is based off the debug variant. For the native side, you want the **profileable** "Profile 'app' with low overhead", which is based off the release variant. 
+ +### Java/Kotlin + +If you start looking at the [Java/Kotlin side](https://developer.android.com/studio/profile/record-java-kotlin-allocations), choose *Profiler: Run 'app' as debuggable*, and then select the *Track Memory Consumption (Java/Kotlin Allocations)* task. Navigate to the part of the app you wish to profile and then you can start profiling. At the bottom of the Profiling window it should look like Figure 2 below. Click *Start Profiler Task*. + +![Android Studio Start Profile alt-text#center](start-profile-dropdown.png "Figure 2. Start Profile") + +When you're ready, *Stop* the profiling again. Now there will be a nice timeline graph of memory usage. While Android Studio has a nicer interface for the Java/Kotlin side than the native side, the key to the timeline graph may be missing. This key is shown below in Figure 3, so you can refer to the colors from this. +![Android Studio memory key alt-text#center](profiler-jk-allocations-legend.png "Figure 3. Memory key for the Java/Kotlin Memory Timeline") + +The default height of the Profiling view, as well as the timeline graph within it is usually too small, so adjust these heights to get a sensible graph. You can click at different points of the graph to see the memory allocations at that time. If you look according to the key you can see how much memory is allocated by Java, Native, Graphics, Code etc. + +Looking further down you can see the *Table* of Java/Kotlin allocations for your selected time on the timeline. With ML a lot of your allocations are likely to be byte[] for byte buffers, or possibly int[] for image data, etc. Clicking on the data type will open up the particular allocations, showing their size and when they were allocated. This will help to quickly narrow down their use, and whether they are all needed etc. + +### Native + +For the [native side](https://developer.android.com/studio/profile/record-native-allocations), the process is similar but with different options. 
Choose *Profiler: Run 'app' as profileable*, and then select the *Track Memory Consumption (Native Allocations)* task. Here you have to *Start profiler task from: Process Start*. Choose *Stop* once you've captured enough data. + +The Native view doesn't have the same nice timeline graph as the Java/Kotlin side, but it does have the *Table* and *Visualization* tabs. The *Table* tab no longer has a list of allocations, but options to *Arrange by allocation method* or *callstack*. Choose *Arrange by callstack* and then you can trace down which functions were allocating significant memory. Potentially more useful, you can also see Remaining Size. + +In the Visualization tab you can see the callstack as a graph, and once again you can look at total Allocations Size or Remaining Size. If you look at Remaining Size, you can see what is still allocated at the end of the profiling, and by looking a few steps up the stack, probably see which allocations are related to the ML model, by seeing functions that relate to the framework you are using. A lot of the memory may be allocated by that framework rather than in your code, and you may not have much control over it, but it is useful to know where the memory is going. + +## Other platforms + +On other platforms, you will need a different memory profiler. The objective of working out where the memory is being used is the same, and whether there are issues with leaks or just too much memory being used. There are often trade-offs between memory and speed, and they can be considered more sensibly if the numbers involved are known. 
diff --git a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/app-profiling-streamline.md b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/app-profiling-streamline.md
new file mode 100644
index 0000000000..e55e4e172d
--- /dev/null
+++ b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/app-profiling-streamline.md
@@ -0,0 +1,249 @@
+---
+title: Profile your application with Streamline
+weight: 3
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+## Application Profiling
+Application profiling can be split into 2 main types - *Instrumentation* and *Sampling*. [Streamline](https://developer.arm.com/Tools%20and%20Software/Streamline%20Performance%20Analyzer), for example, is a sampling profiler that takes regular samples of various counters and registers in the system to provide a detailed view of the system's performance. Sampling will only provide a statistical view, but it is less intrusive and has less processing overhead than instrumentation.
+
+The profiler can look at memory, CPU activity and cycles, cache misses, and many parts of the GPU as well as other performance metrics. It can also provide a timeline view of these counters to show the application's performance over time. This will show bottlenecks, and help you understand where to focus your optimization efforts.
+
+![Streamline image alt-text#center](Streamline.png "Figure 1. Streamline timeline view")
+
+## Example Android Application
+
+In this Learning Path, you will profile [an example Android application](https://github.com/dawidborycki/Arm.PyTorch.MNIST.Inference) using Streamline.
+Start by cloning the repository containing this example on your machine and open it in a recent Android Studio. It is generally safest to not update the Gradle version when prompted.
+
+## Streamline
+You will install Streamline and Performance Studio on your host machine and connect to your target Arm device to capture the data.
In this example, the target device is an Arm-powered Android phone. The data is captured over a USB connection, and then analyzed on your host machine. + +For more details on Streamline usage you can refer to these [tutorials and training videos](https://developer.arm.com/Tools%20and%20Software/Arm%20Performance%20Studio). While the example you are running is based on Android, you can use [the setup and capture instructions for Linux](https://developer.arm.com/documentation/101816/0903/Getting-started-with-Streamline/Profile-your-Linux-application). + +First, follow these [setup instructions](https://developer.arm.com/documentation/102477/0900/Setup-tasks?lang=en), to make sure you have `adb` (Android Debug Bridge) installed. If you have installed [Android Studio](https://developer.android.com/studio), you will have installed adb already. Otherwise, you can get it as part of the Android SDK platform tools [here](https://developer.android.com/studio/releases/platform-tools.html). + +Make sure `adb` is in your path. You can check this by running `adb` in a terminal. If it is not in your path, you can add it by installing the [Android SDK `platform-tools`](https://developer.android.com/tools/releases/platform-tools#downloads) directory to your path. + +Next, install [Arm Performance Studio](https://developer.arm.com/Tools%20and%20Software/Arm%20Performance%20Studio#Downloads), which includes Streamline. + +Connect your Android phone to your host machine through USB. Ensure that your Android phone is set to [Developer mode](https://developer.android.com/studio/debug/dev-options). + +On your phone, go to `Settings > Developer Options` and enable USB Debugging. If your phone asks you to authorize connection to your host machine, confirm this. Test the connection by running `adb devices` in a terminal. You should see your device ID listed. + +Next, you need a debuggable build of the application you want to profile. 
+- In Android Studio, ensure your *Build Variant* is set to `debug`. You can then build the application and install it on your device.
+- For a Unity app, select Development Build under File > Build Settings when building your application.
+- In Unreal Engine, open Project Settings > Project > Packaging > Project, and ensure that the For Distribution checkbox is not set.
+- In the general case, you can set `android:debuggable=true` in the application manifest file.
+
+For the example application that you cloned earlier, the Build Variant is `debug` by default, but you can verify this by going to `Build > Select Build Variant` in Android Studio. Build and install this application on your device.
+
+You can now run Streamline and [capture a profile](https://developer.arm.com/documentation/102477/0900/Capture-a-profile?lang=en) of your application. But before you do, let's add some useful annotations to your code that can help with more specific performance analysis of your application.
+
+## Custom Annotations
+
+In Streamline, it is possible to add custom annotations to the timeline view. This can be useful to mark the start and end of specific parts of your application, or to mark when a specific event occurs. This can help you understand the performance of your application in relation to these events. At the bottom of *Figure 1* above there are custom annotations to show when inference, pre-processing, and post-processing are happening.
+
+To add annotations, you will need to add some files into your project from the **gator** daemon that Streamline uses. These files are named `streamline_annotate.c`, `streamline_annotate.h` and `streamline_annotate_logging.h` and made available [here](https://github.com/ARM-software/gator/tree/main/annotate). Using these annotations, you will be able to show log strings, markers, counters and Custom Activity Maps. Within your example project, create a `cpp` folder under the `app/src/main` folder, and add these three files there.
+ +These files are written in C, so if your Android Studio project is in Java or Kotlin, you will need to add a C library to your project. This is slightly trickier than just adding a Java or Kotlin file, but it is not difficult. You can find instructions on how to do this [here](https://developer.android.com/studio/projects/add-native-code). + +Create a file in the `app/src/main/cpp/` folder under your project and name it `annotate_jni_wrapper.c`. This will be a wrapper around the gator daemon's functions, and will be called from your Kotlin code. Copy the code below into this file. You can also create very similar wrapper functions for other gator daemon functions. + +```c +#include +#include "streamline_annotate.h" + +JNIEXPORT void JNICALL Java_AnnotateStreamline_AnnotateSetup(JNIEnv* env, jobject obj) { + gator_annotate_setup(); +} + +JNIEXPORT jlong JNICALL Java_AnnotateStreamline_GetTime(JNIEnv* env, jobject obj) { + return gator_get_time(); +} +``` + +Some functions have `unsigned int`, but that needs to be a `jint` in the wrapper, with some casting required in your Kotlin code to enforce type correctness at that end. Some functions have strings as arguments, and you will need to do a small conversion as shown below: + +```c +JNIEXPORT void JNICALL Java_AnnotateStreamline_AnnotateMarkerColorStr(JNIEnv* env, jobject obj, jint color, jstring str) { + const char* nativeStr = (*env)->GetStringUTFChars(env, str, 0); + gator_annotate_marker_color(color, nativeStr); + (*env)->ReleaseStringUTFChars(env, str, nativeStr); +} +``` + +In Android Studio `cmake` is used to create your C library, so you will need a `CMakelists.txt` file in the same directory as the C files (`app/src/main/cpp/` in the example). Copy the contents shown below into `CMakelists.txt`: + +```cmake +# Sets the minimum CMake version required for this project. +cmake_minimum_required(VERSION 3.22.1) + +# Declare the project name. 
+project("StreamlineAnnotationJNI") + +# Create and name the library +add_library(${CMAKE_PROJECT_NAME} SHARED + annotate_jni_wrapper.c + streamline_annotate.c) + +# Specifies libraries CMake should link to your target library. +# Adding in the Android system log library pulls in the NDK path. +find_library( # Sets the path to the NDK library. + log-lib + log ) + +target_link_libraries( # Specifies the target library. + ${CMAKE_PROJECT_NAME} + ${log-lib} ) +``` + +Now add the code below to the `build.gradle` file of the Module you wish to profile (`:app` in the example). You will be able to call the functions from your Kotlin code: + +```gradle + externalNativeBuild { + cmake { + path file('src/main/cpp/CMakeLists.txt') + version '3.22.1' + } + } +``` + +This will create a `libStreamlineAnnotationJNI.so` library that you can load in your Kotlin code, and then you can call the functions. Here you will create a singleton `AnnotateStreamline.kt`. Place the file alongside `MainActivity.kt` in `app\src\main\java\com\arm\armpytorchmnistinference` for the example. 
Add the following code to `AnnotateStreamline.kt` to enable Kotlin calls to the gator daemon from the rest of your code: + +```kotlin +// Kotlin wrapper class for integration into Android project +class AnnotateStreamline { + init { + // Load the native library + System.loadLibrary("StreamlineAnnotationJNI") + } + + companion object { + // #defines for colors from the Streamline Annotation c code + const val ANNOTATE_RED: UInt = 0x0000ff1bu + const val ANNOTATE_BLUE: UInt = 0xff00001bu + const val ANNOTATE_GREEN: UInt = 0x00ff001bu + const val ANNOTATE_PURPLE: UInt = 0xff00ff1bu + const val ANNOTATE_YELLOW: UInt = 0x00ffff1bu + // any other constants you want from the included gator files + + // Create an instance of the AnnotateStreamline class + private val annotations = AnnotateStreamline() + + // Function to setup the Streamline Annotation - call this first + @JvmStatic + fun setup() { + annotations.AnnotateSetup() + } + + // Function to get the current time from gator + @JvmStatic + fun getTime(): Long { + return annotations.GetTime() + } + + // more functions that you want, e.g. (note UInt conversion) + @JvmStatic + fun annotateMarkerColorStr(color: UInt, str: String) { + annotations.AnnotateMarkerColorStr(color.toInt(), str) + } + // ... + } + + // externals match the last part of function names in annotate_jni_wrapper.c + external fun AnnotateSetup() + external fun GetTime(): Long + external fun AnnotateMarkerColorStr(color: Int, str: String) + // ... +} +``` + +Fill in all the function calls to match the functions you added into `annotate_jni_wrapper.c`. + +The `AnnotateStreamline` class can now be used in your Kotlin code to add annotations to the Streamline timeline view. The first thing is to make sure `AnnotateStreamline.setup()` is called before any other gator functions. For the example project, add it into the `onCreate()` function of `MainActivity.kt`. 
Then you can add annotations like this: + +```kotlin + AnnotateStreamline.annotateMarkerColorStr(AnnotateStreamline.ANNOTATE_BLUE, "Model Load") +``` + +In the example app you could add this in the `onCreate()` function of `MainActivity.kt` after the `Module.load()` call to load the `model.pth`. + +This 'colored marker with a string' annotation will add the string and time to Streamline's log view, and look like the image shown below in Streamline's timeline (in the example app ArmNN isn't used, so there are no white ArmNN markers): + +![Streamline image alt-text#center](streamline_marker.png "Figure 2. Streamline timeline markers") + +## Custom Activity Maps (CAMs) + +In addition to adding strings to the log and colored markers to the timeline, a particularly useful set of annotations is the Custom Activity Maps. These are the named colored bands you can see at the bottom of the Streamline timeline view shown in *Figure 1*. They can be used to show when specific parts of your application are running, such as the pre-processing or inference, and layered for functions within functions etc. + +To add these you will need to import the functions that start `gator_cam_` from `streamline_annotate.h` through your wrapper files in the same way as the functions above. Then you can use CAMs, but first you will need to set up the tracks the annotations will appear on and an id system for each annotation. The `baseId` code below is to ensure that if you add annotations in multiple places in your code, the ids are unique. 
+ +Here is an example setup in a class's companion object: + +```kotlin + companion object { + const val camViewId = 1u + const val trackRoot = 1u + const val trackChild = 2u + baseId = (0u..UInt.MAX_VALUE/2u-5000u).random() + currentId = baseId + + init { + AnnotateStreamline.camViewName(camViewId, "Inference") + AnnotateStreamline.camTrack(camViewId, trackRoot,0xffffffffu, "Root") // root wants -1 for parent id + AnnotateStreamline.camTrack(camViewId, trackChild, trackRoot, "Children") + } +``` + +For the example app, add this to the `MainActivity` class. + +Then it can be used like this: + +```kotlin + val preprocess = currentId++ + AnnotateStreamline.camJobStart(camViewId, preprocess, "Preprocess", trackRoot, AnnotateStreamline.getTime(), AnnotateStreamline.ANNOTATE_YELLOW) + val childjob = currentId++ + AnnotateStreamline.camJobStart(camViewId, childjob, "child job", trackChild, AnnotateStreamline.getTime(), AnnotateStreamline.ANNOTATE_CYAN) + //child job code... + AnnotateStreamline.camJobEnd(camViewId, childjob, AnnotateStreamline.getTime()) + //rest of preprocessing code... 
+ AnnotateStreamline.camJobEnd(camViewId, preprocess, AnnotateStreamline.getTime()) +``` + +In the example app, the CAM annotations are added to the `runInference()` function, which should look like this: + +```kotlin + private fun runInference(bitmap: Bitmap) { + val preprocess = currentId++ + AnnotateStreamline.camJobStart(camViewId, preprocess, "Preprocess", trackRoot, AnnotateStreamline.getTime(), AnnotateStreamline.ANNOTATE_YELLOW) + // Convert bitmap to a float array and create a tensor with shape [1, 1, 28, 28] + val inputTensor = createTensorFromBitmap(bitmap) // could add a child CAM job inside function call, but probably too simple + AnnotateStreamline.camJobEnd(camViewId, preprocess, AnnotateStreamline.getTime()) + + // Run inference and measure time + val inferenceTimeMicros = measureTimeMicros { + // Forward pass through the model + val inference = currentId++ + AnnotateStreamline.camJobStart(camViewId, inference, "Inference", trackRoot, AnnotateStreamline.getTime(), AnnotateStreamline.ANNOTATE_RED) + val outputTensor = model.forward(IValue.from(inputTensor)).toTensor() + AnnotateStreamline.camJobEnd(camViewId, inference, AnnotateStreamline.getTime()) + // and then post-processing is simplistic in this case, so not worth a CAM job + val scores = outputTensor.dataAsFloatArray + + // Get the index of the class with the highest score + val maxIndex = scores.indices.maxByOrNull { scores[it] } ?: -1 + predictedLabel.text = "Predicted Label: $maxIndex" + } + + // Update inference time TextView in microseconds + inferenceTime.text = "Inference Time: $inferenceTimeMicros µs" + } +``` + +The example application is very fast and simple, so the CAMs will not show much information. In a more complex application you could add more CAMs, including child-level ones, to give more detailed annotations to show where time is spent in your application. 
For this example app with its very fast inference, it's best to change the Streamline timeline view scale to 10µs in order to see the CAM annotations better. + +Once you've added in useful CAM annotations, you can build and deploy a debug version of your application. You can run Streamline and see the annotations and CAMs in the timeline view. See the [Streamline documentation](https://developer.arm.com/documentation/101816/latest/) for how to make a capture for profiling. After the capture is made and analyzed, you will be able to see when your application is running the inference, ML pre-processing, ML post-processing, or other parts of your application. From there you can see where the most time is spent, and how hard the CPU or GPU is working during different parts of the application. From this you can then decide if work is needed to improve performance and where that work needs doing. diff --git a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/nn-profiling-executenetwork.md b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/nn-profiling-executenetwork.md new file mode 100644 index 0000000000..f4ca26994d --- /dev/null +++ b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/nn-profiling-executenetwork.md @@ -0,0 +1,85 @@ +--- +title: ML profiling of a tflite model with ExecuteNetwork +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## ArmNN's Network Profiler +One way of running tflite models is with ArmNN. This is available as a delegate to the standard tflite interpreter. But to profile the model, ArmNN comes with a command-line utility called `ExecuteNetwork`. This program just runs the model without the rest of the app. It is able to output layer timings and other useful information to let you know where there might be bottlenecks within your model. + +If you are using tflite without ArmNN, then the output from `ExecuteNetwork` will be more of an indication than a definitive answer. 
But it can still be useful to spot any obvious problems. + +To try this out, you can download a tflite model from the [Arm Model Zoo](https://github.com/ARM-software/ML-zoo). In this Learning Path, you will download [mobilenet tflite](https://github.com/ARM-software/ML-zoo/blob/master/models/image_classification/mobilenet_v2_1.0_224/tflite_int8/mobilenet_v2_1.0_224_INT8.tflite). + +To get `ExecuteNetwork` you can download it from the [ArmNN GitHub](https://github.com/ARM-software/armnn/releases). Download the version appropriate for the Android phone you wish to test on - the Android version and the architecture of the phone. If you are unsure of the architecture, you can use a lower one, but you may miss out on some optimizations. Inside the `tar.gz` archive that you download, `ExecuteNetwork` is included. Note among the other release downloads on the ArmNN Github is the separate file for the `aar` delegate which is the easy way to include the ArmNN delegate into your app. + +To run `ExecuteNetwork` you'll need to use `adb` to push the model and the executable to your phone, and then run it from the adb shell. `adb` is included with Android Studio, but you may need to add it to your path. Android Studio normally installs it to a location like `\\AppData\Local\Android\Sdk\platform-tools`. `adb` can also be downloaded separately from the [Android Developer site](https://developer.android.com/studio/releases/platform-tools). + +Unzip the `tar.gz` folder you downloaded. From a command prompt, you can then adapt and run the following commands to push the files to your phone. The `/data/local/tmp` folder of your Android device is a place with relaxed permissions that you can use to run this profiling. 
+ +```bash +adb push mobilenet_v2_1.0_224_INT8.tflite /data/local/tmp/ +adb push ExecuteNetwork /data/local/tmp/ +adb push libarm_compute.so /data/local/tmp/ +adb push libarmnn.so /data/local/tmp/ +adb push libarmnn_support_library.so /data/local/tmp/ +# more ArmNN .so library files +``` +Push all the `.so` library files that are in the base folder of the `tar.gz` archive you downloaded, alongside `ExecuteNetwork`, and all the `.so` files in the `delegate` sub-folder. If you are using a recent version of Android Studio this copying can be done much more easily with drag and drop in the *Device Explorer > Files*. + +Then you need to set the permissions on the files: + +```bash +adb shell +cd /data/local/tmp +chmod 777 ExecuteNetwork +chmod 777 *.so +``` + +Now you can run ExecuteNetwork to profile the model. With the example tflite, you can use the following command: + +```bash +LD_LIBRARY_PATH=. ./ExecuteNetwork -m mobilenet_v2_1.0_224_INT8.tflite -c CpuAcc -T delegate --iterations 2 --do-not-print-output --enable-fast-math --fp16-turbo-mode -e --output-network-details > modelout.txt +``` + +If you are using your own tflite, replace `mobilenet_v2_1.0_224_INT8.tflite` with the name of your tflite file. + +This will run the model twice, outputting the layer timings to `modelout.txt`. The `--iterations 2` flag is the command that means it runs twice: the first run includes a lot of startup costs and one-off optimizations, so the second run is more indicative of the real performance. + +The other flags to note are the `-e` and `--output-network-details` flags which will output a lot of timeline information about the model, including the layer timings. The `--do-not-print-output` flag will stop the output of the model, which can be very large, and without sensible input it is meaningless. The `--enable-fast-math` and `--fp16-turbo-mode` flags enable some math optimizations. 
`CpuAcc` is the accelerated CPU backend; it can be replaced with `GpuAcc` for the accelerated GPU backend.
+
+After running the model, you can pull the output file back to your host machine with the following commands:
+
+```bash
+exit
+adb pull /data/local/tmp/modelout.txt
+```
+Once again, this can be done with drag and drop in Android Studio's *Device Explorer > Files*.
+
+Depending on the size of your model, the output will probably be quite large. You can use a text editor to view the file. The output is in JSON format, so you can use a JSON viewer to make it more readable. Usually some scripting can be used to extract the information you need more easily out of the very raw data in the file.
+
+At the top is the summary, with the setup time and inference time of your 2 runs, which will look something like this:
+
+```text
+Info: ArmNN v33.2.0
+Info: Initialization time: 7.20 ms.
+Info: ArmnnSubgraph creation
+Info: Parse nodes to ArmNN time: 50.99 ms
+Info: Optimize ArmnnSubgraph time: 85.94 ms
+Info: Load ArmnnSubgraph time: 91.11 ms
+Info: Overall ArmnnSubgraph creation time: 228.47 ms
+
+Info: Execution time: 721.91 ms.
+Info: Inference time: 722.02 ms
+
+Info: Execution time: 468.42 ms.
+Info: Inference time: 468.58 ms
+```
+
+After the summary comes the graph of the model, then the layers and their timings from the second run. At the start of the layers there are a few optimizations and their timings recorded before the network itself. You can skip past the graph and the optimization timings to get to the part that needs analyzing.
+
+In the mobilenet example output, the graph is from lines 18 to 1629. After this is the optimization timings, which are part of the runtime, but not the network - these go until line 1989. Next there are a few wall clock recordings for the loading of the network, before the first layer "Convolution2dLayer_CreateWorkload_#18" at line 2036. Here is where the layer info that needs analyzing starts.
+
+The layers' "Wall clock time" in microseconds shows how long they took to run. These layers and their timings can then be analyzed to see which layers, and which operators, took the most time.
diff --git a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/nn-profiling-general.md b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/nn-profiling-general.md
new file mode 100644
index 0000000000..91a35381f1
--- /dev/null
+++ b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/nn-profiling-general.md
@@ -0,0 +1,16 @@
+---
+title: Profiling the Neural Network
+weight: 5
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+## Profiling your model
+App profilers will give you a good overall view of your performance, but often you might want to look inside the model and work out bottlenecks within the network. The network is often the bulk of the time, in which case it will warrant closer analysis.
+
+With general profilers this is hard to do, as there need to be annotations inside the ML framework code to get the information. It is a large task to write the profiling annotations throughout the framework, so it is easier to use tools from a framework or inference engine that already has the required instrumentation.
+
+Depending on your model, your choice of tools will differ. For example, if you are using LiteRT (formerly TensorFlow Lite), Arm provides the ArmNN delegate that you can run with the model running on Linux or Android, CPU or GPU. ArmNN in turn provides a tool called `ExecuteNetwork` that can run the model and give you layer timings among other useful information.
+
+If you are using PyTorch, you will probably use ExecuTorch, the on-device inference runtime, for your Android phone. ExecuTorch has a profiler available alongside it.
diff --git a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/plan.txt b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/plan.txt new file mode 100644 index 0000000000..70e7667178 --- /dev/null +++ b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/plan.txt @@ -0,0 +1,20 @@ + +want the performance of your ML app +memory and compute + +how can you find that out + +different steps: +- ML network +- app around the ML network, especially pre and post processing, and the network as a whole + +for around the ML network - streamline profiler +here's how to do that... +Also Android Profiler, memory example + +Ml network, it will depend on the inference engine you are using +- here's an example for if you are using ArmNN with TFLite +- if you're not using it, it may still have some useful information, but different operators will be used and their performance will be different +can see structure with netron or google model explorer to compare operators or different versions of networks +may need to use a conversion tool to convert to TFLite (or whatever your inference engine wants) + diff --git a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/profiler-jk-allocations-legend.png b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/profiler-jk-allocations-legend.png new file mode 100644 index 0000000000..a9dfadfe0d Binary files /dev/null and b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/profiler-jk-allocations-legend.png differ diff --git a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/start-profile-dropdown.png b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/start-profile-dropdown.png new file mode 100644 index 0000000000..e7d16270f8 Binary files /dev/null and b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/start-profile-dropdown.png differ diff --git 
a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/streamline_marker.png b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/streamline_marker.png new file mode 100644 index 0000000000..e7ec90f36e Binary files /dev/null and b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/streamline_marker.png differ diff --git a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/why-profile.md b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/why-profile.md new file mode 100644 index 0000000000..7d688a4ad6 --- /dev/null +++ b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/why-profile.md @@ -0,0 +1,23 @@ +--- +title: Why do you need to profile your ML application? +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Performance +Working out what is taking the time and memory in your application is the first step to getting the performance you want. Profiling can help you identify the bottlenecks in your application and understand how to optimize it. + +With Machine Learning (ML) applications, the inference of the Neural Network (NN) itself is often the heaviest part of the application in terms of computation and memory usage. This is not guaranteed however, so it is important to profile the application as a whole to see if pre- or post-processing or other code is an issue. + +In this Learning Path, you will profile an Android example using TFLite, but most of the steps shown will also work with Linux and cover a wide range of Arm devices. The principles for profiling your application are the same for use with other inference engines and platforms, but the tools are different. + +## Tools + +You will need to use different tools to profile the ML inference or the application's performance running on your Arm device. + +For profiling the ML inference, you will use [ArmNN](https://github.com/ARM-software/armnn/releases)'s ExecuteNetwork. 
+ +For profiling the application as a whole, you will use [Arm Performance Studio](https://developer.arm.com/Tools%20and%20Software/Arm%20Performance%20Studio)'s Streamline, and the Android Studio Profiler. + diff --git a/data/stats_current_test_info.yml b/data/stats_current_test_info.yml index ecd46e69b0..3947475798 100644 --- a/data/stats_current_test_info.yml +++ b/data/stats_current_test_info.yml @@ -1,5 +1,5 @@ summary: - content_total: 301 + content_total: 305 content_with_all_tests_passing: 32 content_with_tests_enabled: 34 sw_categories: diff --git a/data/stats_weekly_data.yml b/data/stats_weekly_data.yml index 4844d39087..e9e8de919a 100644 --- a/data/stats_weekly_data.yml +++ b/data/stats_weekly_data.yml @@ -3834,3 +3834,255 @@ avg_close_time_hrs: 0 num_issues: 17 percent_closed_vs_total: 0.0 +- a_date: '2024-11-18' + content: + cross-platform: 25 + embedded-systems: 19 + install-guides: 89 + laptops-and-desktops: 32 + microcontrollers: 25 + servers-and-cloud-computing: 88 + smartphones-and-mobile: 25 + total: 303 + contributions: + external: 43 + internal: 353 + github_engagement: + num_forks: 30 + num_prs: 8 + individual_authors: + alaaeddine-chakroun: 1 + alexandros-lamprineas: 1 + annie-tallund: 1 + arm: 3 + arnaud-de-grandmaison: 1 + basma-el-gaabouri: 1 + ben-clark: 1 + bolt-liu: 2 + brenda-strech: 1 + chaodong-gong,-alex-su,-kieran-hejmadi: 1 + chen-zhang: 1 + christopher-seidl: 7 + cyril-rohr: 1 + daniel-gubay: 1 + daniel-nguyen: 1 + david-spickett: 2 + dawid-borycki: 30 + diego-russo: 1 + diego-russo-and-leandro-nunes: 1 + elham-harirpoush: 2 + florent-lebeau: 5 + "fr\xE9d\xE9ric--lefred--descamps": 2 + gabriel-peterson: 5 + gayathri-narayana-yegna-narayanan: 1 + graham-woodward: 1 + iago-calvo-lista,-arm: 1 + james-whitaker,-arm: 1 + jason-andrews: 89 + joe-stech: 1 + johanna-skinnider: 2 + jonathan-davies: 2 + jose-emilio-munoz-lopez,-arm: 1 + julie-gaskin: 4 + julio-suarez: 5 + kasper-mecklenburg: 1 + koki-mitsunami: 1 + 
konstantinos-margaritis: 7 + kristof-beyls: 1 + liliya-wu: 1 + mathias-brossard: 1 + michael-hall: 5 + nikhil-gupta,-pareena-verma,-nobel-chowdary-mandepudi,-ravi-malhotra: 1 + odin-shen: 1 + owen-wu,-arm: 2 + pareena-verma: 35 + pareena-verma,-annie-tallund: 1 + pareena-verma,-jason-andrews,-and-zach-lasiuk: 1 + pareena-verma,-joe-stech,-adnan-alsinan: 1 + pranay-bakre: 4 + przemyslaw-wirkus: 1 + rin-dobrescu: 1 + roberto-lopez-mendez: 2 + ronan-synnott: 45 + thirdai: 1 + tianyu-li: 1 + tom-pilar: 1 + uma-ramalingam: 1 + varun-chari,-pareena-verma: 1 + visualsilicon: 1 + ying-yu: 1 + ying-yu,-arm: 1 + zach-lasiuk: 1 + zhengjun-xing: 2 + issues: + avg_close_time_hrs: 0 + num_issues: 12 + percent_closed_vs_total: 0.0 +- a_date: '2024-11-25' + content: + cross-platform: 25 + embedded-systems: 19 + install-guides: 89 + laptops-and-desktops: 32 + microcontrollers: 25 + servers-and-cloud-computing: 88 + smartphones-and-mobile: 25 + total: 303 + contributions: + external: 43 + internal: 353 + github_engagement: + num_forks: 30 + num_prs: 9 + individual_authors: + alaaeddine-chakroun: 1 + alexandros-lamprineas: 1 + annie-tallund: 1 + arm: 3 + arnaud-de-grandmaison: 1 + basma-el-gaabouri: 1 + ben-clark: 1 + bolt-liu: 2 + brenda-strech: 1 + chaodong-gong,-alex-su,-kieran-hejmadi: 1 + chen-zhang: 1 + christopher-seidl: 7 + cyril-rohr: 1 + daniel-gubay: 1 + daniel-nguyen: 1 + david-spickett: 2 + dawid-borycki: 30 + diego-russo: 1 + diego-russo-and-leandro-nunes: 1 + elham-harirpoush: 2 + florent-lebeau: 5 + "fr\xE9d\xE9ric--lefred--descamps": 2 + gabriel-peterson: 5 + gayathri-narayana-yegna-narayanan: 1 + graham-woodward: 1 + iago-calvo-lista,-arm: 1 + james-whitaker,-arm: 1 + jason-andrews: 89 + joe-stech: 1 + johanna-skinnider: 2 + jonathan-davies: 2 + jose-emilio-munoz-lopez,-arm: 1 + julie-gaskin: 4 + julio-suarez: 5 + kasper-mecklenburg: 1 + koki-mitsunami: 1 + konstantinos-margaritis: 7 + kristof-beyls: 1 + liliya-wu: 1 + mathias-brossard: 1 + michael-hall: 5 + 
nikhil-gupta,-pareena-verma,-nobel-chowdary-mandepudi,-ravi-malhotra: 1 + odin-shen: 1 + owen-wu,-arm: 2 + pareena-verma: 35 + pareena-verma,-annie-tallund: 1 + pareena-verma,-jason-andrews,-and-zach-lasiuk: 1 + pareena-verma,-joe-stech,-adnan-alsinan: 1 + pranay-bakre: 4 + przemyslaw-wirkus: 1 + rin-dobrescu: 1 + roberto-lopez-mendez: 2 + ronan-synnott: 45 + thirdai: 1 + tianyu-li: 1 + tom-pilar: 1 + uma-ramalingam: 1 + varun-chari,-pareena-verma: 1 + visualsilicon: 1 + ying-yu: 1 + ying-yu,-arm: 1 + zach-lasiuk: 1 + zhengjun-xing: 2 + issues: + avg_close_time_hrs: 0 + num_issues: 13 + percent_closed_vs_total: 0.0 +- a_date: '2024-12-02' + content: + cross-platform: 25 + embedded-systems: 19 + install-guides: 90 + laptops-and-desktops: 33 + microcontrollers: 25 + servers-and-cloud-computing: 88 + smartphones-and-mobile: 25 + total: 305 + contributions: + external: 43 + internal: 355 + github_engagement: + num_forks: 30 + num_prs: 14 + individual_authors: + alaaeddine-chakroun: 1 + alexandros-lamprineas: 1 + annie-tallund: 1 + arm: 3 + arnaud-de-grandmaison: 1 + basma-el-gaabouri: 1 + ben-clark: 1 + bolt-liu: 2 + brenda-strech: 1 + chaodong-gong,-alex-su,-kieran-hejmadi: 1 + chen-zhang: 1 + christopher-seidl: 7 + cyril-rohr: 1 + daniel-gubay: 1 + daniel-nguyen: 1 + david-spickett: 2 + dawid-borycki: 30 + diego-russo: 1 + diego-russo-and-leandro-nunes: 1 + elham-harirpoush: 2 + florent-lebeau: 5 + "fr\xE9d\xE9ric--lefred--descamps": 2 + gabriel-peterson: 5 + gayathri-narayana-yegna-narayanan: 1 + graham-woodward: 1 + iago-calvo-lista,-arm: 1 + james-whitaker,-arm: 1 + jason-andrews: 90 + joe-stech: 1 + johanna-skinnider: 2 + jonathan-davies: 2 + jose-emilio-munoz-lopez,-arm: 1 + julie-gaskin: 4 + julio-suarez: 5 + kasper-mecklenburg: 1 + koki-mitsunami: 2 + konstantinos-margaritis: 7 + kristof-beyls: 1 + liliya-wu: 1 + mathias-brossard: 1 + michael-hall: 5 + nikhil-gupta,-pareena-verma,-nobel-chowdary-mandepudi,-ravi-malhotra: 1 + odin-shen: 1 + owen-wu,-arm: 2 + 
pareena-verma: 35 + pareena-verma,-annie-tallund: 1 + pareena-verma,-jason-andrews,-and-zach-lasiuk: 1 + pareena-verma,-joe-stech,-adnan-alsinan: 1 + pranay-bakre: 4 + przemyslaw-wirkus: 1 + rin-dobrescu: 1 + roberto-lopez-mendez: 2 + ronan-synnott: 45 + thirdai: 1 + tianyu-li: 1 + tom-pilar: 1 + uma-ramalingam: 1 + varun-chari,-pareena-verma: 1 + visualsilicon: 1 + ying-yu: 1 + ying-yu,-arm: 1 + zach-lasiuk: 1 + zhengjun-xing: 2 + issues: + avg_close_time_hrs: 0 + num_issues: 18 + percent_closed_vs_total: 0.0