diff --git a/.wordlist.txt b/.wordlist.txt index 96e6a2ba7a..ea9de98011 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -5156,3 +5156,108 @@ transpile tsc typescriptlang vmlinux +ATfE +ATfL +AlmaLinux +Asher +AsyncOpenAI +Bálint +CVE +CircleCI's +Couchbase +Couchbase's +DANDROID +DKLEIDICV +DataType +EdgeXpert +EleutherAI +Facter +GDDR +GEMMs +GSM +Gnuplot +HD +HellaSwag +Hiera +HwCaps +InceptionV +Infineon +Jett +KRaft's +Kiro +KleidiCV's +LangChain +LlamaIndex +MMLU +MULTIVERSION +MVT +Menuconfig +MobileNetV +NHWC +ORM +Phoronix +PointwiseConv +PyBind +QL +RecursiveCharacterTextSplitter +ResNet +SDOT +SMES +SentenceTransformer +Silabs +TPUs +UDOT +XDCR +XNNBatchMatrixMultiply +XNNConv +XNNFullyConnected +XnnpackBackend +Zhou +acc +agentless +aten +blockwise +cbc +couchbase +ctrl +datasheets +decltype +docx +etdump +etrecord +facter +faiss +fg +fibonacci +gemm +hiera +ipc +ivh +js's +kiro +kirocli +libclang +libopencv +llamacpp +llmcompressor +minmax +mse +multiversion +phoronix +pillowfight +pkl +pointwise +pqs +precisions +proto +pypdf +qb +qc +qp +rebalance +rustup +sSf +tcmalloc +tlsv +vLLM's +webp \ No newline at end of file diff --git a/archetypes/learning-path/_next-steps.md b/archetypes/learning-path/_next-steps.md index c3db0de5a2..727b395ddd 100644 --- a/archetypes/learning-path/_next-steps.md +++ b/archetypes/learning-path/_next-steps.md @@ -2,7 +2,7 @@ # ================================================================================ # FIXED, DO NOT MODIFY THIS FILE # ================================================================================ -weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +weight: 21 # The weight controls the order of the pages. _index.md always has weight 1. title: "Next Steps" # Always the same, html page title. layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. 
--- diff --git a/assets/contributors.csv b/assets/contributors.csv index fca2f42a34..bc7e464355 100644 --- a/assets/contributors.csv +++ b/assets/contributors.csv @@ -40,7 +40,7 @@ Gayathri Narayana Yegna Narayanan,Arm,,,, Alexandros Lamprineas,Arm,,,, Annie Tallund,Arm,annietllnd,annietallund,, Cyril Rohr,RunsOn,crohr,cyrilrohr,, -Rin Dobrescu,Arm,,,, +Asher Dobrescu,Arm,,,, Przemyslaw Wirkus,Arm,PrzemekWirkus,przemyslaw-wirkus-78b73352,, Nader Zouaoui,Day Devs,nader-zouaoui,nader-zouaoui,@zouaoui_nader,https://daydevs.com/ Alaaeddine Chakroun,Day Devs,Alaaeddine-Chakroun,alaaeddine-chakroun,,https://daydevs.com/ @@ -105,6 +105,9 @@ Mohamad Najem,Arm,,,, Ruifeng Wang,Arm,,,, Zenon Zhilong Xiu,Arm,,zenon-zhilong-xiu-491bb398,, Zbynek Roubalik,Kedify,,,, -Rani Chowdary Mandepudi, Arm,,,, +Rani Chowdary Mandepudi,Arm,,,, Ayoub Bourjilat,Ac6,Bourjilat,ayoub-bourjilat-a55b58165/,,https://www.ac6.fr/en/ Yahya Abouelseoud,Arm,,,, +Éliás Bálint,Arm,,,, +Steve Suzuki,Arm,,,, +Qixiang Xu,Arm,,,, diff --git a/content/install-guides/_images/q.gif b/content/install-guides/_images/q.gif deleted file mode 100644 index 401ff3081e..0000000000 Binary files a/content/install-guides/_images/q.gif and /dev/null differ diff --git a/content/install-guides/aws-q-cli.md b/content/install-guides/aws-q-cli.md deleted file mode 100644 index e7cfaa568d..0000000000 --- a/content/install-guides/aws-q-cli.md +++ /dev/null @@ -1,342 +0,0 @@ ---- -title: Amazon Q Developer CLI - -author: Jason Andrews -minutes_to_complete: 10 -official_docs: https://docs.aws.amazon.com/amazonq/latest/qdeveloper-ug/command-line.html - -test_maintenance: true -test_images: -- ubuntu:latest - -layout: installtoolsall -multi_install: false -multitool_install_part: false -tool_install: true -weight: 1 ---- - -Amazon Q Developer CLI is a command-line tool for Amazon Q, a generative AI-powered assistant. You can use it to ask questions about AWS architecture, resources, and general development tasks. 
- -It supports multiple operating systems, including Arm-based Linux distributions and macOS, supports the Arm architecture, and you can install it in several ways. - -## What should I do before installing Amazon Q Developer CLI? - -You need a Builder ID to use the Amazon Q Developer CLI. If you don't have one, visit [Do more with AWS Builder ID](https://community.aws/builderid) and click **Sign up with Builder ID** to create your AWS Builder ID. - -This guide explains how to install Amazon Q Developer CLI on macOS and Arm Linux. - -## How do I download and install Amazon Q Developer CLI? - -The CLI is invoked using the `q` command. - -### How do I install Amazon Q Developer CLI on macOS? - -Install [Homebrew](https://brew.sh/) if it's not already available on your computer. - -Then install the Q CLI: - -```console -brew install amazon-q -``` - -### How do I install the Q CLI on Arm Linux? - -The easiest way to install the Q CLI on any Arm Linux distribution is to download and run the installer. - -Before starting, ensure that `curl` and `unzip` are available on your computer. - -{{% notice Note %}} -For Debian-based distributions such as Ubuntu, use the commands below. For other Linux distributions, use the appropriate package manager to install `curl` and `unzip`. -{{% /notice %}} - -```bash { target="ubuntu:latest" } -sudo apt update -sudo apt install curl unzip -y -``` - -Download the zip file with `curl`: - -```bash { target="ubuntu:latest" } -curl --proto '=https' --tlsv1.2 -sSf "https://desktop-release.codewhisperer.us-east-1.amazonaws.com/latest/q-aarch64-linux.zip" -o "q.zip" -``` - -Extract the installer and run it: - -```console -unzip q.zip -bash ./q/install.sh -``` - -You'll then be prompted about updating your shell config: - -```output -✔ Do you want q to modify your shell config (you will have to manually do this otherwise)? -``` - -To automate the install, add the `--no-confirm` flag to the `install.sh` command. 
- -{{% notice Note %}} -If you're using a Linux distribution with an older version of the GNU C Library - or one that does not use it at all, such as Alpine - you can download an alternative package built with the musl C library and has no external dependencies. - -Substitute the `curl` command above with this one and use the same install instructions: - -```bash { target="ubuntu:latest" } -curl "https://desktop-release.codewhisperer.us-east-1.amazonaws.com/latest/q-aarch64-linux-musl.zip" -o "q.zip" -``` - -{{% /notice %}} - -### How do I confirm the Q CLI is working? - -You now have the latest version of the Amazon Q Developer CLI installed. - -Confirm the CLI is available by invoking the `q` command to print the version. - -```console -q version -``` - -The version is printed: - -```output -q 1.10.1 -``` - -## How can I configure my AWS account to get the most from the Q CLI? - -The Q CLI can answer questions and solve problems related to your AWS resources and help you develop faster on AWS. To get the maximum benefit, you can configure the AWS CLI to use your account. - -Follow the [AWS CLI Install Guide](/install-guides/aws_access_keys/) and the [AWS Credentials Install Guide](/install-guides/aws_access_keys/) to set up the AWS CLI and generate and configure access keys. - -This allows you to use the Amazon Q Developer CLI to ask questions and solve issues specific to your AWS account. - -## What is an example of using the Q CLI? - -You can use `q chat` to find information about your AWS resources. 
- -```console -q chat -``` - -When the chat session starts you see: - -```output -To learn more about MCP safety, see https://docs.aws.amazon.com/amazonq/latest/qdeveloper-ug/command-line-mcp-security.html - - - - ⢠⣶⣶⣦⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣤⣶⣿⣿⣿⣶⣦⡀⠀ - ⠀⠀⠀⣾⡿⢻⣿⡆⠀⠀⠀⢀⣄⡄⢀⣠⣤⣤⡀⢀⣠⣤⣤⡀⠀⠀⢀⣠⣤⣤⣤⣄⠀⠀⢀⣤⣤⣤⣤⣤⣤⡀⠀⠀⣀⣤⣤⣤⣀⠀⠀⠀⢠⣤⡀⣀⣤⣤⣄⡀⠀⠀⠀⠀⠀⠀⢠⣿⣿⠋⠀⠀⠀⠙⣿⣿⡆ - ⠀⠀⣼⣿⠇⠀⣿⣿⡄⠀⠀⢸⣿⣿⠛⠉⠻⣿⣿⠛⠉⠛⣿⣿⠀⠀⠘⠛⠉⠉⠻⣿⣧⠀⠈⠛⠛⠛⣻⣿⡿⠀⢀⣾⣿⠛⠉⠻⣿⣷⡀⠀⢸⣿⡟⠛⠉⢻⣿⣷⠀⠀⠀⠀⠀⠀⣼⣿⡏⠀⠀⠀⠀⠀⢸⣿⣿ - ⠀⢰⣿⣿⣤⣤⣼⣿⣷⠀⠀⢸⣿⣿⠀⠀⠀⣿⣿⠀⠀⠀⣿⣿⠀⠀⢀⣴⣶⣶⣶⣿⣿⠀⠀⠀⣠⣾⡿⠋⠀⠀⢸⣿⣿⠀⠀⠀⣿⣿⡇⠀⢸⣿⡇⠀⠀⢸⣿⣿⠀⠀⠀⠀⠀⠀⢹⣿⣇⠀⠀⠀⠀⠀⢸⣿⡿ - ⢀⣿⣿⠋⠉⠉⠉⢻⣿⣇⠀⢸⣿⣿⠀⠀⠀⣿⣿⠀⠀⠀⣿⣿⠀⠀⣿⣿⡀⠀⣠⣿⣿⠀⢀⣴⣿⣋⣀⣀⣀⡀⠘⣿⣿⣄⣀⣠⣿⣿⠃⠀⢸⣿⡇⠀⠀⢸⣿⣿⠀⠀⠀⠀⠀⠀⠈⢿⣿⣦⣀⣀⣀⣴⣿⡿⠃ - ⠚⠛⠋⠀⠀⠀⠀⠘⠛⠛⠀⠘⠛⠛⠀⠀⠀⠛⠛⠀⠀⠀⠛⠛⠀⠀⠙⠻⠿⠟⠋⠛⠛⠀⠘⠛⠛⠛⠛⠛⠛⠃⠀⠈⠛⠿⠿⠿⠛⠁⠀⠀⠘⠛⠃⠀⠀⠘⠛⠛⠀⠀⠀⠀⠀⠀⠀⠀⠙⠛⠿⢿⣿⣿⣋⠀⠀ - ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⠛⠿⢿⡧ - -╭─────────────────────────────── Did you know? ────────────────────────────────╮ -│ │ -│ You can resume the last conversation from your current directory by │ -│ launching with q chat --resume │ -│ │ -╰──────────────────────────────────────────────────────────────────────────────╯ - -/help all commands • ctrl + j new lines • ctrl + s fuzzy search -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ -> -``` - -For example, you can ask for the IP address of an EC2 instance instead of going to the AWS console or looking up the AWS CLI command to get it. - -An example is shown below: - -![Connect #center](/install-guides/_images/q.gif) - -## How can I set the Q CLI context to tailor responses? - -The Q CLI reads your context when you start it. If you provide more information about yourself, you will get tailored responses that match your development environment. - -There are multiple options to store context. - -Use the `/context` command to see the possible locations to store your context. - -```console -/context show -``` - -The help information is printed. 
- -```output - -🌍 global: - .amazonq/rules/**/*.md - README.md - AmazonQ.md - -👤 profile (default): - - -No files in the current directory matched the rules above. -``` - -For example, you can create a new file to store your context. - -```console -mkdir -p ~/.amazonq/rules/context -echo "I am an Arm Linux developer. I prefer Ubuntu and other Debian based distributions. I don't use any x86 computers so please provide all information assuming I'm working on Arm Linux. Sometimes I use macOS and Windows on Arm, but please only provide information about these operating systems when I ask for it." > ~/.amazonq/rules/context/context.md -``` - -When you invoke `q chat` you can confirm your context information was read by asking. - -```console -did you read my context information? -``` - -The response confirms the context file was read: - -```output -Yes, I've read your context information. I understand that you're an Arm Linux developer who primarily -uses Ubuntu and other Debian-based distributions. You don't use x86 computers, so I'll provide all -information assuming you're working on Arm Linux. You occasionally use macOS and Windows on Arm, but I' -ll only provide information about those operating systems when you specifically ask for it. - -I'll tailor my responses to be relevant for Arm Linux development, particularly focusing on Debian- -based distributions like Ubuntu, which is your preference. -``` - -Give it a try by asking questions like "How do I install the AWS CLI?" and verify that the answers match the provided context. - -## How do I change the model Amazon Q uses? - -When you start `q chat` the model is printed: - -```output -🤖 You are chatting with claude-3.7-sonnet -``` - -You can use the `/model` command to list other available models. - -```console -/model -``` - -The model options are displayed: - -```output -? 
Select a model for this chat session › -❯ claude-4-sonnet - claude-3.7-sonnet (active) - claude-3.5-sonnet -``` - -Use the arrow keys and select the model you want to use. - -You can ask Amazon Q to set the default model for future sessions. - -## Install an MCP server - -As an example of using MCP with Amazon Q, you can configure a local Github MCP server. - -Go to your GitHub account developer settings and create a personal access token with the following permissions: - -- repo (Full control of private repositories) -- read:org (Read organization membership) -- read:user (Read user profile data) - - -Use an editor to add the content below to the file `$HOME/.amazonq/mcp.json` - -```console -{ - "mcpServers": { - "github": { - "command": "docker", - "args": [ - "run", - "-i", - "--rm", - "-e", - "GITHUB_PERSONAL_ACCESS_TOKEN", - "ghcr.io/github/github-mcp-server" - ], - "env": { - "GITHUB_PERSONAL_ACCESS_TOKEN": "" - } - } - } -} -``` - -Replace `` with your GitHub token. - -You also need Docker running on the system. Refer to the [Docker install guide](/install-guides/docker/) for instructions. - -Restart `q` with the new MCP configuration: - -```console -q chat -``` - -You see the GitHub MCP server loaded and running: - -```output -✓ github loaded in 0.14 s -✓ 1 of 1 mcp servers initialized. - - ⢠⣶⣶⣦⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣤⣶⣿⣿⣿⣶⣦⡀⠀ - ⠀⠀⠀⣾⡿⢻⣿⡆⠀⠀⠀⢀⣄⡄⢀⣠⣤⣤⡀⢀⣠⣤⣤⡀⠀⠀⢀⣠⣤⣤⣤⣄⠀⠀⢀⣤⣤⣤⣤⣤⣤⡀⠀⠀⣀⣤⣤⣤⣀⠀⠀⠀⢠⣤⡀⣀⣤⣤⣄⡀⠀⠀⠀⠀⠀⠀⢠⣿⣿⠋⠀⠀⠀⠙⣿⣿⡆ - ⠀⠀⣼⣿⠇⠀⣿⣿⡄⠀⠀⢸⣿⣿⠛⠉⠻⣿⣿⠛⠉⠛⣿⣿⠀⠀⠘⠛⠉⠉⠻⣿⣧⠀⠈⠛⠛⠛⣻⣿⡿⠀⢀⣾⣿⠛⠉⠻⣿⣷⡀⠀⢸⣿⡟⠛⠉⢻⣿⣷⠀⠀⠀⠀⠀⠀⣼⣿⡏⠀⠀⠀⠀⠀⢸⣿⣿ - ⠀⢰⣿⣿⣤⣤⣼⣿⣷⠀⠀⢸⣿⣿⠀⠀⠀⣿⣿⠀⠀⠀⣿⣿⠀⠀⢀⣴⣶⣶⣶⣿⣿⠀⠀⠀⣠⣾⡿⠋⠀⠀⢸⣿⣿⠀⠀⠀⣿⣿⡇⠀⢸⣿⡇⠀⠀⢸⣿⣿⠀⠀⠀⠀⠀⠀⢹⣿⣇⠀⠀⠀⠀⠀⢸⣿⡿ - ⢀⣿⣿⠋⠉⠉⠉⢻⣿⣇⠀⢸⣿⣿⠀⠀⠀⣿⣿⠀⠀⠀⣿⣿⠀⠀⣿⣿⡀⠀⣠⣿⣿⠀⢀⣴⣿⣋⣀⣀⣀⡀⠘⣿⣿⣄⣀⣠⣿⣿⠃⠀⢸⣿⡇⠀⠀⢸⣿⣿⠀⠀⠀⠀⠀⠀⠈⢿⣿⣦⣀⣀⣀⣴⣿⡿⠃ - ⠚⠛⠋⠀⠀⠀⠀⠘⠛⠛⠀⠘⠛⠛⠀⠀⠀⠛⠛⠀⠀⠀⠛⠛⠀⠀⠙⠻⠿⠟⠋⠛⠛⠀⠘⠛⠛⠛⠛⠛⠛⠃⠀⠈⠛⠿⠿⠿⠛⠁⠀⠀⠘⠛⠃⠀⠀⠘⠛⠛⠀⠀⠀⠀⠀⠀⠀⠀⠙⠛⠿⢿⣿⣿⣋⠀⠀ - ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⠛⠿⢿⡧ - -╭─────────────────────────────── Did you know? 
────────────────────────────────╮ -│ │ -│ You can execute bash commands by typing ! followed by the command │ -│ │ -╰──────────────────────────────────────────────────────────────────────────────╯ - -/help all commands • ctrl + j new lines • ctrl + s fuzzy search -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ -> -``` - -You can now use the GitHub MCP server to interact with GitHub repositories and do things like: - -**Repository Management** -- Create new repositories -- Fork existing repositories -- List branches and tags -- Create new branches - -**Code Management** -- Get file contents from repositories -- Create or update files -- Delete files -- Push multiple files in a single commit -- Search code across repositories - -**Pull Requests** -- Create pull requests -- List pull requests -- Get pull request details -- Update pull requests -- Merge pull requests -- Review pull requests -- Request GitHub Copilot reviews -- Get pull request files and comments - -**Issues** -- Create issues -- List issues -- Get issue details -- Update issues -- Add comments to issues -- Search issues - -**Commits** -- List commits -- Get commit details - -You're ready to use the Q CLI. diff --git a/content/install-guides/kiro-cli.md b/content/install-guides/kiro-cli.md new file mode 100644 index 0000000000..13e9dae4b0 --- /dev/null +++ b/content/install-guides/kiro-cli.md @@ -0,0 +1,299 @@ +--- +title: Kiro CLI + +author: Jason Andrews +minutes_to_complete: 10 +official_docs: https://kiro.dev/docs/cli/ + +test_maintenance: true +test_images: +- ubuntu:latest + +layout: installtoolsall +multi_install: false +multitool_install_part: false +tool_install: true +weight: 1 +--- + +Kiro CLI is a command-line tool powered by a generative AI assistant. You can use it to ask questions about AWS architecture, resources, and general development tasks. 
+ +It supports multiple operating systems, including Arm-based Linux distributions and macOS, and you can install it in several ways. + +## What should I do before installing Kiro CLI? + +You need a Builder ID to use Kiro CLI. If you don't have one, visit [Do more with AWS Builder ID](https://community.aws/builderid) and select **Sign up with Builder ID** to create your AWS Builder ID. + +This guide explains how to install Kiro CLI on macOS and Arm Linux. + +## How do I download and install Kiro CLI? + +The CLI is invoked using the `kiro-cli` command. + +The easiest way to install Kiro CLI on Linux and macOS is with a single command: + +```console +curl -fsSL https://cli.kiro.dev/install | bash +``` + +### Can I use Homebrew to install Kiro CLI on macOS? + +Yes, you can install [Homebrew](https://brew.sh/) if it's not already available on your computer. + +Install Kiro CLI using Homebrew: + +```console +brew install kiro-cli +``` + +### Can I install Kiro CLI on Arm Linux by downloading a ZIP file? + +Yes, you can download and install Kiro CLI on any Arm Linux distribution using the installer. + +Before starting, ensure that `curl` and `unzip` are available on your computer. + +{{% notice Note %}} +For Debian-based distributions such as Ubuntu, use the commands below. For other Linux distributions, use the appropriate package manager to install `curl` and `unzip`. +{{% /notice %}} + +```bash { target="ubuntu:latest" } +sudo apt update +sudo apt install curl unzip -y +``` + +Download the ZIP file with `curl`: + +```bash { target="ubuntu:latest" } +curl --proto '=https' --tlsv1.2 -sSf 'https://desktop-release.q.us-east-1.amazonaws.com/latest/kirocli-aarch64-linux.zip' -o 'kirocli.zip' +``` + +Extract the installer and run it: + +```console +unzip kirocli.zip +bash ./kirocli/install.sh +``` + +The installer prompts you about updating your shell configuration: + +```output +✔ Do you want kiro to modify your shell config (you will have to manually do this otherwise)? 
+``` + +To automate the install, add the `--no-confirm` flag to the `install.sh` command. + +{{% notice Note %}} +If you're using a Linux distribution with an older version of the GNU C Library, or one that doesn't use it at all (such as Alpine), you can download an alternative package. This package is built with the musl C library and has no external dependencies. + +Substitute the `curl` command above with this one and use the same install instructions: + +```bash { target="ubuntu:latest" } +curl --proto '=https' --tlsv1.2 -sSf 'https://desktop-release.q.us-east-1.amazonaws.com/latest/kirocli-aarch64-linux-musl.zip' -o 'kirocli.zip' +``` + +{{% /notice %}} + +### How do I confirm Kiro CLI is working? + +You now have the latest version of Kiro CLI installed. + +Confirm the CLI is available by printing the version: + +```console +kiro-cli version +``` + +The output shows the version: + +```output +kiro-cli 1.20.0 +``` + +## How can I configure my AWS account to get the most from Kiro CLI? + +Kiro CLI can answer questions and solve problems related to your AWS resources and help you develop faster on AWS. To get the maximum benefit, you can configure the AWS CLI to use your account. + +Follow the [AWS CLI Install Guide](/install-guides/aws-cli/) and the [AWS Credentials Install Guide](/install-guides/aws_access_keys/) to set up the AWS CLI and generate and configure access keys. + +This allows you to use Kiro CLI to ask questions and solve issues specific to your AWS account. + +For example, you can ask for the IP address of an EC2 instance instead of going to the AWS console or looking up the AWS CLI command to get it. + +Kiro accesses your AWS resources and prints the information you ask for. + +## How can I set the Kiro CLI context to tailor responses? + +Kiro CLI can read your context. If you provide more information about yourself, you get tailored responses that match your development environment. + +There are multiple options to store context. 
+ +Use the `/context` command to see the possible locations to store your context. + +```console +/context show +``` + +The help information is printed. + +```output + +Agent (kiro_default) + - AmazonQ.md (no matches) + - AGENTS.md (no matches) + - README.md (no matches) + +Session (temporary) + + +No files in the current directory matched the rules above. +``` + +For example, you can create a new file to store your context as shown below: + +```console +echo "I am an Arm Linux developer. I prefer Ubuntu and other Debian based distributions. I don't use any x86 computers so please provide all information assuming I'm working on Arm Linux. Sometimes I use macOS and Windows on Arm, but please only provide information about these operating systems when I ask for it." > ~/.kiro/context.md +``` + +When you invoke `kiro-cli chat`, you can confirm your context information was read by loading it and asking about it. + +Load the context file: + +```console +/context add ~/.kiro/context.md +``` + +Confirm it was read: + +```console +did you read my context information? +``` + +The response confirms the context file was read: + +```output +Yes, I read your context information. You're an Arm Linux developer who prefers Ubuntu and other Debian-based +distributions, and you don't use x86 computers. You also sometimes use macOS and Windows on Arm, but only want +information about those when you specifically ask for them. +``` + +Ask questions like "How do I install the AWS CLI?" to verify that the answers match the provided context. + +## How do I change the model Kiro uses? 
+ +When you start `kiro-cli chat`, the model is printed: + +```output +Model: Auto (/model to change) +``` + +Use the `/model` command to list other available models: + +```console +/model +``` + +The model options are displayed: + +```output + Press (↑↓) to navigate · Enter(⏎) to select model +❯ Auto (current) | 1x credit | Models chosen by task for optimal usage and consistent quality + claude-sonnet-4.5 | 1.3x credit | The latest Claude Sonnet model + claude-sonnet-4 | 1.3x credit | Hybrid reasoning and coding for regular use + claude-haiku-4.5 | 0.4x credit | The latest Claude Haiku model +``` + +Use the arrow keys to select the model you want to use. + +You can ask Kiro to set the default model for future sessions. + +## Install an MCP server + +As an example of using MCP with Kiro, you can configure a local GitHub MCP server. + +Go to your GitHub account developer settings and create a personal access token with the following permissions: + +- `repo` (Full control of private repositories) +- `read:org` (Read organization membership) +- `read:user` (Read user profile data) + +Use an editor to add the content below to the file `$HOME/.kiro/settings/mcp.json`: + +```json +{ + "mcpServers": { + "github": { + "command": "docker", + "args": [ + "run", + "-i", + "--rm", + "-e", + "GITHUB_PERSONAL_ACCESS_TOKEN", + "ghcr.io/github/github-mcp-server" + ], + "env": { + "GITHUB_PERSONAL_ACCESS_TOKEN": "" + }, + "disabled": false, + "autoApprove": [] + } + } +} +``` + +Replace `` with your GitHub personal access token. + +You also need Docker running on the system. See the [Docker install guide](/install-guides/docker/) for instructions. + +Restart `kiro-cli` with the new MCP configuration: + +```console +kiro-cli chat +``` + +The output shows the GitHub MCP server loaded and running: + +```output +✓ github loaded in 0.16 s +✓ 1 of 1 mcp servers initialized. 
+ +``` + +You can now use the GitHub MCP server to interact with GitHub repositories and do things like: + +**Repository Management** +- Create new repositories +- Fork existing repositories +- List branches and tags +- Create new branches + +**Code Management** +- Get file contents from repositories +- Create or update files +- Delete files +- Push multiple files in a single commit +- Search code across repositories + +**Pull Requests** +- Create pull requests +- List pull requests +- Get pull request details +- Update pull requests +- Merge pull requests +- Review pull requests +- Request GitHub Copilot reviews +- Get pull request files and comments + +**Issues** +- Create issues +- List issues +- Get issue details +- Update issues +- Add comments to issues +- Search issues + +**Commits** +- List commits +- Get commit details + +You're ready to use Kiro CLI. diff --git a/content/install-guides/vnc.md b/content/install-guides/vnc.md index 05623d7fac..8982fc9ac3 100644 --- a/content/install-guides/vnc.md +++ b/content/install-guides/vnc.md @@ -73,9 +73,10 @@ Create a file at `$HOME/.vnc/xstartup` with the following contents: ```console #!/bin/sh -unset SESSION_MANAGER -unset DBUS_SESSION_BUS_ADDRESS -exec startxfce4 +# select your favorite windows manager here +/bin/bash -l < ``` -You can see by looking at the timeline view that instructions no longer depend on each other and can execute in parallel. +You can see by looking at the timeline view that instructions no longer depend on each other and can execute in parallel. -Instructions also spend less time waiting in the scheduler's queue. This explains why the performance of `sum_test2.s` is so much better than `sum_test1.s`. +Instructions also spend less time waiting in the scheduler's queue. This explains why the performance of `sum_test2.s` is so much better than `sum_test1.s`. + +Note the use of the flag `-mcpu=neoverse-v2` throughout all of those examples. 
This flag tells MCA to simulate the performance of the code in `sum_test1.s` and `sum_test2.s` on a Neoverse V2 core. This flag can be changed to any core supported in MCA. You can find what cores are supported in MCA by running `llvm-mca -mcpu=help <<<''`. You can also look at the LLVM sources in [llvm-project](https://github.com/llvm/llvm-project/tree/main/llvm/test/tools/llvm-mca/AArch64), which will give you more detailed examples. For instance, when looking at the Neoverse cores, there is currently support for the N1, N2, N3 and the V1, V2, V3 cores. In the next section, you can try running `llvm-mca` with Compiler Explorer. diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/1-get-started.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/1-get-started.md index 981fbc3c3a..0b0cda642d 100644 --- a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/1-get-started.md +++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/1-get-started.md @@ -9,7 +9,7 @@ layout: learningpathall ## Choose your SME2 setup: native or emulated {{< notice Note>}} -This Learning Path demonstrates how to use SME2 on macOS on a device with an M4 chip. It does not provide instructions for using SME2 on iPhone or iPad, even though they have SME2 support. +This Learning Path demonstrates how to use SME2 on macOS on a device with an M4 chip, or on some Android phones that have SME2 support. It does not provide instructions for using SME2 on iPhone or iPad, even though they have SME2 support. {{< /notice >}} To build or run SME2-accelerated code, first set up your development environment. 
@@ -17,14 +17,16 @@ This section walks you through the required tools and two supported setup option * [**Native SME2 hardware**](#native-sme2) - build and run directly on a system with SME2 support, see [Devices with native SME2 support](#devices) +* [**Android with SME2 hardware**](#android-sme2) - cross build and run on Android phones with SME2 support, see [Devices with native SME2 support](#devices) + * [**Docker-based emulation**](#docker-sme2) - use a container to emulate SME2 in bare metal mode (without an OS) ## Download and explore the code examples -To get started, begin by [downloading the code examples](https://gitlab.arm.com/learning-code-examples/code-examples/-/archive/d41190c0cf962f778ae71b94adf5330033019aed/code-examples-d41190c0cf962f778ae71b94adf5330033019aed.tar.gz?path=learning-paths/cross-platform/multiplying-matrices-with-sme2). +To get started, begin by [downloading the code examples](https://gitlab.arm.com/learning-code-examples/code-examples/-/archive/2632d7cae67fc1ce6b43438a38e00b9edb78f5d9/code-examples-2632d7cae67fc1ce6b43438a38e00b9edb78f5d9.tar.gz?path=learning-paths/cross-platform/multiplying-matrices-with-sme2). 
Now extract the archive, and change directory to: -``code-examples/learning-paths/cross-platform/multiplying-matrices-with-sme2.`` +`code-examples/learning-paths/cross-platform/multiplying-matrices-with-sme2.` ```BASH tar xfz code-examples-main-learning-paths-cross-platform-multiplying-matrices-with-sme2.tar.gz -s /code-examples-main-learning-paths-cross-platform-multiplying-matrices-with-sme2/code-examples/ @@ -40,7 +42,10 @@ code-examples/learning-paths/cross-platform/multiplying-matrices-with-sme2/ │ └── devcontainer.json ├── .git/ ├── .gitignore -├── Makefile +├── CMakeLists.txt +├── cmake/ +│ ├── SME2_MATMUL.cmake +│   └── baremetal-toolchain.cmake ├── README.rst ├── docker/ │ ├── assets.source_me @@ -64,48 +69,100 @@ code-examples/learning-paths/cross-platform/multiplying-matrices-with-sme2/ Among other files, it includes: - Code examples. -- A `Makefile` to build the code. +- A `CMakeLists.txt` to build the code together with a `cmake/` subdirectory that contains the toolchain file needed for baremetal builds as well as some function to hide some cmake details from the top-level `CMakeLists.txt`. - `run-fvp.sh` to run the FVP model. - A `docker` directory containing: - - `assets.source_me` to provide toolchain paths. + - `assets.source_me` to provide tools information like versions and paths. - `build-my-container.sh`, a script that automates building the Docker image from the `sme2-environment.docker` file. It runs the Docker build command with the correct arguments so you don’t have to remember them. - `sme2-environment.docker`, a custom Docker file that defines the steps to build the SME2 container image. It installs all the necessary dependencies, including the SME2-compatible compiler and Arm FVP emulator. - - `build-all-containers.sh`, a script to build multi-architecture images. + - `build-all-containers.sh`, a script to build multi-architecture images. It's the script that has been used to build the docker images for this Learning Path. 
- `.devcontainer/devcontainer.json` for VS Code container support. {{% notice Note %}} From this point, all instructions assume that your current directory is -``code-examples/learning-paths/cross-platform/multiplying-matrices-with-sme2``, so ensure that you are in the correct directory before proceeding. +`code-examples/learning-paths/cross-platform/multiplying-matrices-with-sme2/`, so ensure that you are in the correct directory before proceeding. {{% /notice %}} +The build system is `cmake` based and we will use `ninja` as the build tool in this learning path. If you don't have hardware with SME2 support available and are going to use **Docker-based emulation**, then there is nothing to do, as `cmake` and `ninja` have already been installed for you in the docker container. If you're going to target **Native SME2 hardware** or **Android with SME2 hardware**, you first have to install them if they are not already available on your system: + +{{< tabpane code=true >}} + +{{< tab header="Linux/Ubuntu" language="bash">}} +sudo apt install cmake ninja-build +{{< /tab >}} + +{{< tab header="macOS" language="bash">}} +brew install cmake ninja +{{< /tab >}} + +{{< /tabpane >}} + ## Set up a system with native SME2 support {#native-sme2} To run SME2 code natively, ensure your system includes SME2 hardware and uses a compiler version that supports SME2. -For the compiler, you can use [Clang](https://www.llvm.org/) version 18 or later, or [GCC](https://gcc.gnu.org/) version 14 or later. This Learning Path uses ``clang``. +For the compiler, you can use [Clang](https://www.llvm.org/) version 18 or later, or [GCC](https://gcc.gnu.org/) version 14 or later. This Learning Path uses `clang`. {{% notice Note %}} -At the time of writing, macOS ships with `clang` version 17.0.0, which doesn't support SME2. Use a newer version, such as 20.1.7, available through Homebrew.{{% /notice%}} +At the time of writing, macOS ships with `clang` version 17.0.0, which doesn't support SME2. 
Use a newer version, such as 21.1.4, available through Homebrew. +{{% /notice%}} + +You can check your compiler version using the command: `clang --version` if it's already installed. If not, install `clang` using the instructions below, selecting either macOS or Linux/Ubuntu, depending on your setup: + +{{< tabpane code=true >}} + +{{< tab header="Linux/Ubuntu" language="bash">}} +sudo apt install clang +{{< /tab >}} + +{{< tab header="macOS" language="bash">}} +brew install llvm +{{< /tab >}} + +{{< /tabpane >}} + +You are now all set to start hacking with SME2. + +## Set up a system to target Android phones with SME2 support {#android-sme2} + +Targeting an Android phone means you will have to cross-compile the code examples using Android's NDK. The easiest way is to install [Android Studio](https://developer.android.com/studio) on your system, then in `Android Studio` go to `Tools` > `SDK manager` in the menu, then select the `SDK Tools` tab and tick `NDK (Side by side)` as shown in the picture below: + +![NDK installation alt-text#center](ndk_install.png "Figure 1: NDK installation.") -You can check your compiler version using the command:``clang --version`` +Locate where the NDK has been installed on your machine and save the location to an `NDK` environment variable for use later when building the code examples. For example, on macOS, at the time of writing, the NDK is located in `/Users/$USER/Library/Android/sdk/ndk/29.0.14206865`: -### Install Clang +```BASH +export NDK="/Users/$USER/Library/Android/sdk/ndk/29.0.14206865" +``` -Install Clang using the instructions below, selecting either macOS or Linux/Ubuntu, depending on your setup: +You will also need the `adb` (Android Debug Bridge) tool to upload and execute the programs on a mobile phone (with SME2 support). 
{{< tabpane code=true >}}

-    {{< tab header="Linux/Ubuntu" language="bash">}}
-    sudo apt install clang
-    {{< /tab >}}
+{{< tab header="Linux/Ubuntu" language="bash">}}
+sudo apt install adb
+{{< /tab >}}

-    {{< tab header="macOS" language="bash">}}
-    brew install llvm
-    {{< /tab >}}
+{{< tab header="macOS" language="bash">}}
+brew install android-platform-tools
+{{< /tab >}}

{{< /tabpane >}}

-You are now all set to start hacking with SME2.
+The phone will have to be connected with a USB cable to your development machine in order for `adb` to work. Enable the `developer mode` on the Android phone, and connect it to your machine with the USB cable.
+
+{{% notice Note %}}
+You might have to perform some actions on the phone to enable the debug connection.
+{{% /notice %}}
+
+Now, check that `adb` can see the phone:
+
+```SH
+adb devices -l
+List of devices attached
+57251FDCH0027M device usb:32-0 product:blazer model:Pixel_10_Pro device:blazer transport_id:1
+```
+In the above case, `adb` can see a `Pixel 10 Pro` phone.

## Set up a system using SME2 emulation with Docker {#docker-sme2}

@@ -113,9 +170,9 @@ If your machine doesn't support SME2, or you want to emulate it, you can use the

The Docker container includes both a compiler and [Arm's Fixed
Virtual Platform (FVP)
model](https://developer.arm.com/Tools%20and%20Software/Fixed%20Virtual%20Platforms)
-for emulating code that uses SME2 instructions. You can either run the prebuilt container image provided in this Learning Path or build it yourself using the Docker file that is included.
+for emulating code that uses SME2 instructions. You can either run the prebuilt container image provided in this Learning Path or build it yourself using the Docker file that is included with the help of the `docker/build-my-container.sh` script.

-If building manually, follow the instructions in the ``sme2-environment.docker`` file to install the required tools on your machine. 
+If building manually, follow the instructions in the `sme2-environment.docker` file to install the required tools on your machine. ### Install and verify Docker @@ -127,10 +184,10 @@ To begin, start by checking that Docker is installed on your machine: ```BASH { output_lines="2" } docker --version -Docker version 27.3.1, build ce12230 +Docker version 28.5.1, build e180ab8 ``` -If the above command fails with an error message similar to "``docker: command not found``", then follow the steps from the [Docker install guide](/install-guides/docker/) to install Docker. +If the above command fails with an error message similar to "`docker: command not found`", then follow the steps from the [Docker install guide](/install-guides/docker/) to install Docker. {{% notice Note %}} You might need to log out and back in again or restart your machine for the changes to take @@ -182,54 +239,65 @@ When a command is executed in the Docker container environment, you must prepend it with instructions on the command line so that your shell executes it within the container. -For example, to execute ``COMMAND ARGUMENTS`` in the SME2 Docker container, the +For example, to execute `COMMAND ARGUMENTS` in the SME2 Docker container, the command line looks like this: ```BASH -docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 COMMAND ARGUMENTS +docker run --rm -v "$PWD:/work" armswdev/sme2-learning-path:sme2-environment-v3 COMMAND ARGUMENTS ``` This invokes Docker, using the -``armswdev/sme2-learning-path:sme2-environment-v2`` container image, and mounts +`armswdev/sme2-learning-path:sme2-environment-v3` container image, and mounts the current working directory (the -``code-examples/learning-paths/cross-platform/multiplying-matrices-with-sme2``) -inside the container to ``/work``, then sets ``/work`` as the working directory -and runs ``COMMAND ARGUMENTS`` in this environment. 
+`code-examples/learning-paths/cross-platform/multiplying-matrices-with-sme2`)
+inside the container to `/work` and runs `COMMAND ARGUMENTS` in this environment.

-For example, to run ``make``, you need to enter:
+For example, to run `echo Hello`, you need to enter:

-```BASH
-docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 make
+```BASH { output_lines="2" }
+docker run --rm -v "$PWD:/work" armswdev/sme2-learning-path:sme2-environment-v3 echo Hello
+Hello
```

### Use an interactive Docker shell

-The standard `docker run` commands can be long and repetitive. To streamline your workflow, you can start an interactive Docker session that allows you to run commands directly - without having to prepend docker run each time.
+The standard `docker run` commands can be long and repetitive. To streamline
+your workflow, you can start an interactive Docker session that allows you to
+run commands directly --- without having to prepend docker run each time.

To launch an interactive shell inside the container, use the `-it` flag:

```BASH
-docker run --rm -it -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2
+docker run --rm -it -v "$PWD:/work" armswdev/sme2-learning-path:sme2-environment-v3
```

You are now in the Docker container, and you can execute all commands directly. For
-example, the ``make`` command can now be simply invoked with:
+example, the `echo` command can now be simply invoked with:

-```BASH
-make
+```BASH { output_lines="2" }
+echo Hello
+Hello
```

-To exit the container, simply hit CTRL+D. Note that the container is not persistent (it was invoked with ``--rm``), so each invocation will use a container freshly built from the image. All the files reside outside the container, so changes you make to them will be persistent.
+To exit the container, simply hit CTRL+D. 
+
+{{% notice Note %}}
+The container is not persistent (it was invoked with `--rm`), so each invocation
+will use a container freshly built from the image. All files in `/work` reside
+outside the container though, so changes you make to them will be persistent
+across sessions.
+{{% /notice %}}

### Develop with Docker in Visual Studio Code

-If you are using Visual Studio Code as your IDE, the container setup is already configured with `devcontainer/devcontainer.json`.
+If you are using Visual Studio Code as your IDE, the container setup is already
+configured with `.devcontainer/devcontainer.json`.

Make sure you have the [Microsoft Dev Containers](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) extension installed. Then select the **Reopen in Container** menu entry as shown below.
-It automatically finds and uses ``.devcontainer/devcontainer.json``:
+It automatically finds and uses `.devcontainer/devcontainer.json`:

![VSCode Docker alt-text#center](VSCode.png "Figure 1: Setting up the Docker container.")

@@ -247,8 +315,6 @@ part.

These Apple devices support SME2 natively.

-
-
| Device                              | Release Date | Chip Options               |
|-------------------------------------|--------------|----------------------------|
| iPhone 16                           | 2024         | A18                        |
@@ -258,10 +324,8 @@ These Apple devices support SME2 natively.
| MacBook Pro (14-inch, 16-inch, 2024)| 2024         | M4 Pro, M4 Max             |
| MacBook Air (2025)                  | 2025         | M4                         |

-
These Android phones support SME2 natively. 
- | Device | Release Date | Chip Options | |-------------------------------------|--------------|---------------------------| | Vivo X300 | 2025 | MediaTek Dimensity 9500 featuring an 8-core Arm C1 CPU cluster and Arm G1-Ultra GPU | diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/10-going-further.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/10-going-further.md index 24a22b805b..ea5da264d2 100644 --- a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/10-going-further.md +++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/10-going-further.md @@ -8,7 +8,7 @@ layout: learningpathall ## Beyond this implementation -There are many different ways that you can extend and optimize the matrix multiplication algorithm beyond the specific SME2 implementation that you've explored in this Learning Path. While the current approach is tuned for performance on a specific hardware target, further improvements can make your code more general, more efficient, and better suited to a wider range of applications. +There are many different ways that you can extend and optimize the matrix multiplication algorithm beyond the specific SME2 implementation that you've explored in this Learning Path. While the current approach is tuned for performance on a specific hardware target, further improvements can make your code more general, more efficient, and better suited to a wider range of applications. Advanced optimization techniques are essential when adapting algorithms to real-world scenarios. These often include processing matrices of different shapes and sizes, handling mixed data types, or maximizing throughput for large batch operations. The ability to generalize and fine-tune your implementation opens the door to more scalable and reusable code that performs well across workloads. 
@@ -16,9 +16,9 @@ Whether you're targeting different data types, improving parallelism, or adaptin Some ideas of improvements that you might like to test out include: -* Generalization -* Loop unrolling -* The strategic use of matrix properties +- Generalization +- Loop unrolling +- The strategic use of matrix properties ## Generalize the algorithm for different data types @@ -26,17 +26,17 @@ So far, you've focused on multiplying floating-point matrices. In practice, matr The structure of the algorithm (the core logic - tiling, outer product, and accumulation) remains consistent across data types. It uses preprocessing with tiling and outer product–based multiplication. To adapt it for other data types, you only need to change how values are: -* Loaded from memory -* Accumulated (often with widening) -* Stored to the output +- Loaded from memory +- Accumulated (often with widening) +- Stored to the output -Languages that support [generic programming](https://en.wikipedia.org/wiki/Generic_programming), such as C++ with templates, make this easier. +Languages that support [generic programming](https://en.wikipedia.org/wiki/Generic_programming), such as C++ with templates, make this easier. Templates allow you to: -* Swap data types flexibly -* Handle accumulation in a wider format when needed -* Reuse algorithm logic across multiple matrix types +- Swap data types flexibly +- Handle accumulation in a wider format when needed +- Reuse algorithm logic across multiple matrix types By expressing the algorithm generically, you benefit from the compiler generating multiple optimized variants, allowing you the opportunity to focus on: @@ -53,7 +53,7 @@ For example, the `preprocess_l_intr` function uses: ```c svld1_x2(...); // Load two vectors at once ``` -Loading two vectors at a time enables the simultaneous computing of more tiles. Since the matrices are already laid out efficiently in memory, consecutive loading is fast. 
Implementing this approach can make improvements to the ``macc`` to load ``ratio``.
+Loading two vectors at a time enables the simultaneous computing of more tiles. Since the matrices are already laid out efficiently in memory, consecutive loading is fast. Implementing this approach can make improvements to the `macc` to load `ratio`.

In order to check your understanding of SME2, you can try to implement this unrolling yourself in the intrinsic version (the assembly version already has this optimization). You can check your work by comparing your results to the expected reference values.

@@ -65,13 +65,10 @@ By playing with the mathematical properties of matrix multiplication and the out

For example, it is common that one of the matrices is actually a vector, meaning that it has a single row or column, and then it becomes advantageous to transpose it. Can you see why?

-The answer is that as the elements are stored contiguously in memory, an ``Nx1``and ``1xN`` matrices have the exact same memory layout. The transposition becomes a no-op, and the matrix elements stay in the same place in memory.
+The answer is that as the elements are stored contiguously in memory, the `Nx1` and `1xN` matrices have the exact same memory layout. The transposition becomes a no-op, and the matrix elements stay in the same place in memory.

An even more *degenerated* case that is easy to manage is when one of the matrices is essentially a scalar, which means that it is a matrix with one row and one column. Although the current code used here handles it correctly from a results point of view, a different algorithm and use of instructions might be more efficient. Can you think of another way?
-
-In order to check your understanding of SME2, you can try to implement this unrolling yourself in the intrinsic version (the asm version already has this optimization). You can check your work by comparing your results to the expected reference values. 
-
-
+In order to check your understanding of SME2, you can try to implement this optimization for special cases yourself in the intrinsic version. You can check your work by comparing your results to the expected reference values.
diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/2-check-your-environment.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/2-check-your-environment.md
index 0d1e438649..49bfdba4ed 100644
--- a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/2-check-your-environment.md
+++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/2-check-your-environment.md
@@ -6,79 +6,146 @@ weight: 4
layout: learningpathall
---

-In this section, you'll verify that your environment is ready for SME2 development. This is your first hands-on task and confirms that the toolchain, hardware (or emulator), and compiler are set up correctly.
+In this section, you'll verify that your environment is ready for SME2
+development. This is your first hands-on task and confirms that the toolchain,
+hardware (or emulator), and compiler are set up correctly.

## Build the code examples

-Use the `make` command to compile all examples and generate assembly listings:
+Make sure your current working directory is `code-examples/learning-paths/cross-platform/multiplying-matrices-with-sme2`.
+
+Use the `cmake` command to configure the project. Note that for native builds,
+you may have to tell `cmake` (as shown in the example) which `clang` to use, as
+it would otherwise find the default one from the system (which might not be
+suitable). If your system `clang` is recent enough, omit the `CC=...`
+part of the `cmake` invocation.
+
+{{< tabpane code=true >}}
+
+{{< tab header="Native SME2 support" language="bash" output_lines="2-12">}}
+CC=/opt/homebrew/Cellar/llvm/21.1.4/bin/clang cmake -G Ninja -S . 
-B build-native -DCMAKE_BUILD_TYPE:STRING=Release +-- The C compiler identification is Clang 21.1.4 +-- The ASM compiler identification is Clang with GNU-like command-line +-- Found assembler: /opt/homebrew/Cellar/llvm/21.1.4/bin/clang +-- Detecting C compiler ABI info +-- Detecting C compiler ABI info - done +-- Check for working C compiler: /opt/homebrew/Cellar/llvm/21.1.4/bin/clang - skipped +-- Detecting C compile features +-- Detecting C compile features - done +-- Configuring done (0.8s) +-- Generating done (0.0s) +-- Build files have been written to: .../multiplying-matrices-with-sme2/build-native +{{< /tab >}} + +{{< tab header="Android phones with SME2 support" language="bash" output_lines="2-12">}} +cmake -G Ninja -S . -B build-android -DCMAKE_BUILD_TYPE:STRING=Release -DCMAKE_TOOLCHAIN_FILE:STRING="$NDK/build/cmake/android.toolchain.cmake" -DANDROID_ABI:STRING=arm64-v8a -DANDROID_PLATFORM:STRING=android-24 -DANDROID_STL:STRING=c++_static -DCMAKE_BUILD_TYPE:STRING=Release +-- The C compiler identification is Clang 21.0.0 +-- The ASM compiler identification is Clang with GNU-like command-line +-- Found assembler: .../Library/Android/sdk/ndk/29.0.14206865/toolchains/llvm/prebuilt/darwin-x86_64/bin/clang +-- Detecting C compiler ABI info +-- Detecting C compiler ABI info - done +-- Check for working C compiler: .../Library/Android/sdk/ndk/29.0.14206865/toolchains/llvm/prebuilt/darwin-x86_64/bin/clang - skipped +-- Detecting C compile features +-- Detecting C compile features - done +-- Configuring done (1.1s) +-- Generating done (0.0s) +-- Build files have been written to: .../multiplying-matrices-with-sme2/build-android +{{< /tab >}} + +{{< tab header="Emulated SME2 support" language="bash" output_lines="2-19">}} +docker run --rm -v "$PWD:/work" armswdev/sme2-learning-path:sme2-environment-v3 cmake -G Ninja -S . 
-B build-baremetal -DCMAKE_TOOLCHAIN_FILE:STRING=cmake/baremetal-toolchain.cmake -DCMAKE_BUILD_TYPE:STRING=Release +-- Using ATfE from: /tools/ATfE-21.1.1-Linux-AArch64 +-- Using ATfE from: /tools/ATfE-21.1.1-Linux-AArch64 +-- The C compiler identification is Clang 21.1.1 +-- The ASM compiler identification is Clang with GNU-like command-line +-- Found assembler: /tools/ATfE-21.1.1-Linux-AArch64/bin/clang +-- Detecting C compiler ABI info +-- Detecting C compiler ABI info - done +-- Check for working C compiler: /tools/ATfE-21.1.1-Linux-AArch64/bin/clang - skipped +-- Detecting C compile features +-- Detecting C compile features - done +-- Configuring done (0.3s) +-- Generating done (0.0s) +-- Build files have been written to: /work/build-baremetal +{{< /tab >}} + +{{< /tabpane >}} + +Then build all the examples with `ninja`: {{< tabpane code=true >}} - {{< tab header="Native SME2 support" language="bash" output_lines="2-19">}} -make -/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -march=native+sme2 -DBAREMETAL=0 -o hello hello.c -/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -march=native+sme2 -DBAREMETAL=0 -c -o sme2_check.o sme2_check.c -/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -march=native+sme2 -DBAREMETAL=0 -c -o misc.o misc.c -/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -march=native+sme2 -o sme2_check sme2_check.o misc.o -/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -march=native+sme2 -DBAREMETAL=0 -DIMPL=asm -c -o main_asm.o main.c -/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -march=native+sme2 -DBAREMETAL=0 -c -o matmul_asm.o matmul_asm.c -/opt/homebrew/opt/llvm/bin/clang -Wall -march=native+sve+sme2 -DBAREMETAL=0 -c -o matmul_asm_impl.o matmul_asm_impl.S -/opt/homebrew/opt/llvm/bin/clang -Wall -march=native+sve+sme2 -DBAREMETAL=0 -c -o preprocess_l_asm.o preprocess_l_asm.S -/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -c -o matmul_vanilla.o matmul_vanilla.c -/opt/homebrew/opt/llvm/bin/clang -O2 -Wall 
-std=c99 -c -o preprocess_vanilla.o preprocess_vanilla.c -/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -march=native+sme2 -o sme2_matmul_asm main_asm.o matmul_asm.o matmul_asm_impl.o preprocess_l_asm.o matmul_vanilla.o preprocess_vanilla.o misc.o -/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -march=native+sme2 -DBAREMETAL=0 -DIMPL=intr -c -o main_intr.o main.c -/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -march=native+sme2 -DBAREMETAL=0 -c -o matmul_intr.o matmul_intr.c -/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -march=native+sme2 -o sme2_matmul_intr main_intr.o matmul_intr.o matmul_vanilla.o preprocess_vanilla.o misc.o -/opt/homebrew/opt/llvm/bin/llvm-objdump --demangle -d hello > hello.lst -/opt/homebrew/opt/llvm/bin/llvm-objdump --demangle -d sme2_check > sme2_check.lst -/opt/homebrew/opt/llvm/bin/llvm-objdump --demangle -d sme2_matmul_asm > sme2_matmul_asm.lst -/opt/homebrew/opt/llvm/bin/llvm-objdump --demangle -d sme2_matmul_intr > sme2_matmul_intr.lst - {{< /tab >}} - - {{< tab header="Emulated SME2 support" language="bash" output_lines="2-19">}} -docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 make -clang -O2 -Wall -std=c99 --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -DBAREMETAL=1 -nostartfiles -lcrt0-semihost -lsemihost -nostartfiles -lcrt0-semihost -lsemihost -Wl,--defsym=__boot_flash=0x80000000 -Wl,--defsym=__flash=0x80001000 -Wl,--defsym=__ram=0x81000000 -T picolibc.ld -o hello hello.c -clang -O2 -Wall -std=c99 --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -DBAREMETAL=1 -c -o sme2_check.o sme2_check.c -clang -O2 -Wall -std=c99 --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -DBAREMETAL=1 -c -o misc.o misc.c -clang -O2 -Wall -std=c99 --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access 
-nostartfiles -lcrt0-semihost -lsemihost -nostartfiles -lcrt0-semihost -lsemihost -Wl,--defsym=__boot_flash=0x80000000 -Wl,--defsym=__flash=0x80001000 -Wl,--defsym=__ram=0x81000000 -T picolibc.ld -o sme2_check sme2_check.o misc.o -clang -O2 -Wall -std=c99 --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -DBAREMETAL=1 -DIMPL=asm -c -o main_asm.o main.c -clang -O2 -Wall -std=c99 --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -DBAREMETAL=1 -c -o matmul_asm.o matmul_asm.c -clang -Wall --target=aarch64-none-elf -march=armv9.4-a+sme2 -DBAREMETAL=1 -c -o matmul_asm_impl.o matmul_asm_impl.S -clang -Wall --target=aarch64-none-elf -march=armv9.4-a+sme2 -DBAREMETAL=1 -c -o preprocess_l_asm.o preprocess_l_asm.S -clang -O2 -Wall -std=c99 --target=aarch64-none-elf -fno-exceptions -fno-rtti -mno-unaligned-access -c -o matmul_vanilla.o matmul_vanilla.c -clang -O2 -Wall -std=c99 --target=aarch64-none-elf -fno-exceptions -fno-rtti -mno-unaligned-access -c -o preprocess_vanilla.o preprocess_vanilla.c -clang -O2 -Wall -std=c99 --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -nostartfiles -lcrt0-semihost -lsemihost -nostartfiles -lcrt0-semihost -lsemihost -Wl,--defsym=__boot_flash=0x80000000 -Wl,--defsym=__flash=0x80001000 -Wl,--defsym=__ram=0x81000000 -T picolibc.ld -o sme2_matmul_asm main_asm.o matmul_asm.o matmul_asm_impl.o preprocess_l_asm.o matmul_vanilla.o preprocess_vanilla.o misc.o -clang -O2 -Wall -std=c99 --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -DBAREMETAL=1 -DIMPL=intr -c -o main_intr.o main.c -clang -O2 -Wall -std=c99 --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -DBAREMETAL=1 -c -o matmul_intr.o matmul_intr.c -clang -O2 -Wall -std=c99 --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti 
-mno-unaligned-access -nostartfiles -lcrt0-semihost -lsemihost -nostartfiles -lcrt0-semihost -lsemihost -Wl,--defsym=__boot_flash=0x80000000 -Wl,--defsym=__flash=0x80001000 -Wl,--defsym=__ram=0x81000000 -T picolibc.ld -o sme2_matmul_intr main_intr.o matmul_intr.o matmul_vanilla.o preprocess_vanilla.o misc.o -llvm-objdump --demangle -d hello > hello.lst -llvm-objdump --demangle -d sme2_check > sme2_check.lst -llvm-objdump --demangle -d sme2_matmul_asm > sme2_matmul_asm.lst -llvm-objdump --demangle -d sme2_matmul_intr > sme2_matmul_intr.lst - {{< /tab >}} + +{{< tab header="Native SME2 support" language="bash" output_lines="2-3">}} +ninja -C build-native/ +ninja: Entering directory `build-native/' +[19/19] Linking C executable sme2_matmul_intr +{{< /tab >}} + +{{< tab header="Android phones with SME2 support" language="bash" output_lines="2-3">}} +ninja -C build-android/ +ninja: Entering directory `build-android/' +[19/19] Linking C executable sme2_matmul_asm +{{< /tab >}} + +{{< tab header="Emulated SME2 support" language="bash" output_lines="2-21">}} +docker run --rm -v "$PWD:/work" armswdev/sme2-learning-path:sme2-environment-v3 ninja -C build-baremetal/ +ninja: Entering directory `build-baremetal/' +[1/19] Building ASM object CMakeFiles/sme2_matmul_asm.dir/preprocess_l_asm.S.obj +[2/19] Building ASM object CMakeFiles/sme2_matmul_asm.dir/matmul_asm_impl.S.obj +[3/19] Building C object CMakeFiles/hello.dir/hello.c.obj +[4/19] Building C object CMakeFiles/sme2_matmul_asm.dir/matmul_vanilla.c.obj +[5/19] Building C object CMakeFiles/sme2_matmul_asm.dir/preprocess_vanilla.c.obj +[6/19] Building C object CMakeFiles/sme2_matmul_intr.dir/matmul_vanilla.c.obj +[7/19] Building C object CMakeFiles/sme2_matmul_intr.dir/preprocess_vanilla.c.obj +[8/19] Linking C executable hello +[9/19] Building C object CMakeFiles/sme2_matmul_asm.dir/matmul_asm.c.obj +[10/19] Building C object CMakeFiles/sme2_check.dir/sme2_check.c.obj +[11/19] Building C object 
CMakeFiles/sme2_matmul_intr.dir/main.c.obj +[12/19] Building C object CMakeFiles/sme2_matmul_asm.dir/main.c.obj +[13/19] Building C object CMakeFiles/sme2_check.dir/misc.c.obj +[14/19] Building C object CMakeFiles/sme2_matmul_asm.dir/misc.c.obj +[15/19] Building C object CMakeFiles/sme2_matmul_intr.dir/misc.c.obj +[16/19] Building C object CMakeFiles/sme2_matmul_intr.dir/matmul_intr.c.obj +[17/19] Linking C executable sme2_check +[18/19] Linking C executable sme2_matmul_asm +[19/19] Linking C executable sme2_matmul_intr +{{< /tab >}} + {{< /tabpane >}} -The `make` command performs the following tasks: +The `ninja` command performs the following tasks: - It builds four executables: `hello`, `sme2_check`, `sme2_matmul_asm`, and `sme2_matmul_intr`. - It creates the assembly listings for the four executables: `hello.lst`, `sme2_check.lst`, `sme2_matmul_asm.lst`, and `sme2_matmul_intr.lst`. - These targets compile and link all example programs and generate disassembly listings for inspection. At any point, you can clean the directory of all the files that have been built -by invoking `make clean`: +by invoking `ninja` with the `clean` target: {{< tabpane code=true >}} - {{< tab header="Native SME2 support" language="bash" output_lines="2">}} - make clean - rm hello sme2_check sme2_matmul_asm sme2_matmul_intr hello.lst sme2_check.lst sme2_matmul_asm.lst sme2_matmul_intr.lst *.o - {{< /tab >}} - - {{< tab header="Emulated SME2 support" language="bash" output_lines="2">}} - docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 make clean - rm hello sme2_check sme2_matmul_asm sme2_matmul_intr hello.lst sme2_check.lst sme2_matmul_asm.lst sme2_matmul_intr.lst *.o - {{< /tab >}} + +{{< tab header="Native SME2 support" language="bash" output_lines="2-4">}} +ninja -C build-native/ clean +ninja: Entering directory `build-native' +[1/1] Cleaning all built files... +Cleaning... 19 files. 
+{{< /tab >}} + +{{< tab header="Android phones with SME2 support" language="bash" output_lines="2-4">}} +ninja -C build-android/ clean +ninja: Entering directory `build-android/' +[1/1] Cleaning all built files... +Cleaning... 19 files. +{{< /tab >}} + +{{< tab header="Emulated SME2 support" language="bash" output_lines="2-4">}} +docker run --rm -v "$PWD:/work" armswdev/sme2-learning-path:sme2-environment-v3 ninja -C build-baremetal/ clean +ninja: Entering directory `build-baremetal/' +[1/1] Cleaning all built files... +Cleaning... 19 files. +{{< /tab >}} + {{< /tabpane >}} ## Run a Hello World program @@ -101,20 +168,30 @@ int main(int argc, char *argv[]) { Run the `hello` program with: {{< tabpane code=true >}} - {{< tab header="Native SME2 support" language="bash" output_lines="2">}} - ./hello - Hello, world ! - {{< /tab >}} - {{< tab header="Emulated SME2 support" language="bash" output_lines="2-4">}} - docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 ./run-fvp.sh hello - Hello, world ! +{{< tab header="Native SME2 support" language="bash" output_lines="2">}} +./build-native/hello +Hello, world ! +{{< /tab >}} + +{{< tab header="Android phones with SME2 support" language="bash" output_lines="2,5">}} +adb push build-android/hello /data/local/tmp +build-android/hello: 1 file pushed, 0 skipped. 14.6 MB/s (7544 bytes in 0.000s) +adb shell chmod 755 /data/local/tmp/hello +adb shell /data/local/tmp/hello +Hello, world ! +{{< /tab >}} + +{{< tab header="Emulated SME2 support" language="bash" output_lines="2-4">}} +docker run --rm -v "$PWD:/work" armswdev/sme2-learning-path:sme2-environment-v3 ./run-fvp.sh build-baremetal/hello +Hello, world ! + +Info: /OSCI/SystemC: Simulation stopped by user. +{{< /tab >}} - Info: /OSCI/SystemC: Simulation stopped by user. - {{< /tab >}} {{< /tabpane >}} -In the emulated case, you may see that the FVP prints out extra lines. The key confirmation is the presence of "Hello, world!" 
in the output. It demonstrates that the generic code can be compiled and executed. +In the emulated case, you will notice that the FVP prints out extra lines. The key confirmation is the presence of "`Hello, world!`" in the output: it demonstrates that the generic code can be compiled and executed. ## Check SME2 availability @@ -174,12 +251,13 @@ int main(int argc, char *argv[]) { } ``` -The ``__ARM_FEATURE_SME2`` macro (line 7) is provided by the compiler when it -targets an SME-capable target, which is specified with the ``+sme2`` -architectural feature in ``-march=armv9.4-a+sme2`` (emulated environment) or -``-march=native+sme2`` command line option to ``clang`` in file ``Makefile``. +The `__ARM_FEATURE_SME2` macro (line 7) is provided by the compiler when it +targets an SME-capable target, which is specified with the `+sme2` +architectural feature in `-march=armv9.4-a+sme2` (emulated environment) or +`-march=native+sme2` command line option to `clang` in the `CMakeLists.txt` +(or in `cmake/baremetal-toolchain.cmake` for the emulated SME2 case). -The ``arm_sme.h`` file included at line 8 is part of the Arm C Library Extension +The `arm_sme.h` file included at line 8 is part of the Arm C Library Extension ([ACLE](https://arm-software.github.io/acle/main/)). The ACLE provides types and function declarations to enable C/C++ programmers to make the best possible use of the Arm architecture. You can use the SME-related part of the library, but it @@ -188,66 +266,83 @@ does also provide support for Neon or other Arm architectural extensions. In order to run in a baremetal environment (like the one being used in the emulated SME2 support), where no operating system has done the setup of the processor for the user land programs, an additional step is required to turn -SME2 on. This is the purpose of the ``setup_sme_baremetal()`` call at line 21. +SME2 on. This is the purpose of the `setup_sme_baremetal()` call at line 21. 
In environments where SME2 is natively supported, nothing needs to be done, -which is why the execution of this function is conditioned by the ``BAREMETAL`` -macro. ``BAREMETAL`` is set to 1 in the ``Makefile`` when the FVP is targeted, -and set to 0 otherwise. The body of the ``setup_sme_baremetal`` function is -defined in ``misc.c``. +which is why the execution of this function is conditioned by the `BAREMETAL` +macro. `BAREMETAL` is set to 1 in the `cmake/baremetal-toolchain.cmake` when the FVP is targeted, +and set to 0 otherwise. The body of the `setup_sme_baremetal` function is +defined in `misc.c`. -The ``sme2_check`` program then displays whether SVE, SME and SME2 are supported +The `sme2_check` program then displays whether SVE, SME and SME2 are supported at line 24. The checking of SVE, SME and SME2 is done differently depending on -``BAREMETAL``. This platform specific behavior is abstracted by the -``display_cpu_features()``: -- In baremetal mode, our program has access to system registers and can inspect system registers for SME2 support. The program will print the SVE field of the ``ID_AA64PFR0_EL1`` system register and the SME field of the ``ID_AA64PFR1_EL1`` system register. +`BAREMETAL`. This platform specific behavior is abstracted by the +`display_cpu_features()`: +- In baremetal mode, our program has access to system registers and can inspect system registers for SME2 support. The program will print the SVE field of the `ID_AA64PFR0_EL1` system register and the SME field of the `ID_AA64PFR1_EL1` system register. - In non baremetal mode, on an Apple platform the program needs to use a higher level API call. -The body of the ``display_cpu_features`` function is defined in ``misc.c``. +The body of the `display_cpu_features` function is defined in `misc.c`. -If SME2 is not available, ``sme2_check`` will emit a diagnostic message (line +If SME2 is not available, `sme2_check` will emit a diagnostic message (line 25) and exit (line 26). 
-``sme2_check`` will then print the initial streaming mode state at line 29 +`sme2_check` will then print the initial streaming mode state at line 29 (which is expected to be 0), then will switch to streaming mode (line 34) when -invoking function ``function_in_streaming_mode`` to show the Streaming Vector -Length (a.k.a ``SVL``), and then switch back to non streaming mode (when -returning from ``function_in_streaming_mode``). Function -``function_in_streaming_mode`` is defined at line 13. Note that it has been -annotated with the ``__arm_locally_streaming`` attribute, which instructs the +invoking function `function_in_streaming_mode` to show the Streaming Vector +Length (a.k.a `SVL`), and then switch back to non streaming mode (when +returning from `function_in_streaming_mode`). Function +`function_in_streaming_mode` is defined at line 13. Note that it has been +annotated with the `__arm_locally_streaming` attribute, which instructs the compiler to automatically switch to streaming mode when invoking this function. Streaming mode will be discussed in more depth in the next section. Look for the following confirmation messages in the output: {{< tabpane code=true >}} - {{< tab header="Native SME2 support" language="bash" output_lines="2-9">}} - ./sme2_check - HAS_SVE: 0 - HAS_SME: 1 - HAS_SME2: 1 - Checking initial in_streaming_mode: 0 - Switching to streaming mode... - In streaming_mode: 1, SVL: 512 bits - Switching back from streaming mode... - Checking in_streaming_mode: 0 - {{< /tab >}} - - {{< tab header="Emulated SME2 support" language="bash" output_lines="2-12">}} - docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 ./run-fvp.sh sme2_check - ID_AA64PFR0_EL1 : 0x1101101131111112 - - SVE : 0x00000001 - ID_AA64PFR1_EL1 : 0x0000101002000001 - - SME : 0x00000002 - Checking has_sme: 1 - Checking initial in_streaming_mode: 0 - Switching to streaming mode... 
- In streaming_mode: 1, SVL: 512 bits - Switching back from streaming mode... - Checking in_streaming_mode: 0 - - Info: /OSCI/SystemC: Simulation stopped by user. - {{< /tab >}} + +{{< tab header="Native SME2 support" language="bash" output_lines="2-9">}} +./build-native/sme2_check +HAS_SVE: 0 +HAS_SME: 1 +HAS_SME2: 1 +Checking initial in_streaming_mode: 0 +Switching to streaming mode... +In streaming_mode: 1, SVL: 512 bits +Switching back from streaming mode... +Checking in_streaming_mode: 0 +{{< /tab >}} + +{{< tab header="Android phones with SME2 support" language="bash" output_lines="2,5-12">}} +adb push build-android/sme2_check /data/local/tmp +build-android/sme2_check: 1 file pushed, 0 skipped. 29.7 MB/s (19456 bytes in 0.001s) +adb shell chmod 755 /data/local/tmp/sme2_check +adb shell /data/local/tmp/sme2_check +HAS_SVE: 1 +HAS_SME: 1 +HAS_SME2: 1 +Checking initial in_streaming_mode: 0 +Switching to streaming mode... +In streaming_mode: 1, SVL: 512 bits +Switching back from streaming mode... +Checking in_streaming_mode: 0 +{{< /tab >}} + +{{< tab header="Emulated SME2 support" language="bash" output_lines="2-13">}} +docker run --rm -v "$PWD:/work" armswdev/sme2-learning-path:sme2-environment-v3 ./run-fvp.sh build-baremetal/sme2_check +ID_AA64PFR0_EL1 : 0x1101101131111112 + - SVE : 0x00000001 +ID_AA64PFR1_EL1 : 0x0000101002000001 + - SME : 0x00000002 +Checking has_sme: 1 +Checking initial in_streaming_mode: 0 +Switching to streaming mode... +In streaming_mode: 1, SVL: 512 bits +Switching back from streaming mode... +Checking in_streaming_mode: 0 + +Info: /OSCI/SystemC: Simulation stopped by user. +{{< /tab >}} + {{< /tabpane >}} You've now confirmed that your environment can compile and run SME2 code, and that SME2 features like streaming mode are working correctly. You're ready to continue to the next section and start working with SME2 in practice. 
diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/5-outer-product.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/5-outer-product.md index 1e28558f2d..ad40368a6e 100644 --- a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/5-outer-product.md +++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/5-outer-product.md @@ -10,9 +10,9 @@ layout: learningpathall In this section, you'll learn how to improve matrix multiplication performance using the SME engine and outer product operations. -This approach increases the number of multiply-accumulate (MACC) operations per memory load, reducing bandwidth pressure and improving overall throughput. +This approach increases the number of multiply-accumulate (MAC) operations per memory load, reducing bandwidth pressure and improving overall throughput. -## Increase MACC efficiency using outer products +## Increase MAC efficiency using outer products In the vanilla implementation, the core multiply-accumulate step looks like this: @@ -20,9 +20,9 @@ In the vanilla implementation, the core multiply-accumulate step looks like this acc += matLeft[m * K + k] * matRight[k * N + n]; ``` -This translates to one multiply-accumulate operation, known as `macc`, for two loads (`matLeft[m * K + k]` and `matRight[k * N + n]`). It therefore has a 1:2 `macc` to `load` ratio of multiply-accumulate operations (MACCs) to memory loads - one multiply-accumulate and two loads per iteration, which is inefficient. This becomes more pronounced in triple-nested loops and when matrices exceed cache capacity. +This translates to one multiply-accumulate operation, known as `mac`, for two loads (`matLeft[m * K + k]` and `matRight[k * N + n]`). It therefore has a 1:2 `mac` to `load` ratio of multiply-accumulate operations (MACs) to memory loads - one multiply-accumulate and two loads per iteration, which is inefficient. 
This becomes more pronounced in triple-nested loops and when matrices exceed cache capacity. -To improve performance, you want to increase the `macc` to `load` ratio, which means increasing the number of multiply-accumulate operations per load - you can express matrix multiplication as a sum of column-by-row outer products. +To improve performance, you want to increase the `mac` to `load` ratio, which means increasing the number of multiply-accumulate operations per load - you can express matrix multiplication as a sum of column-by-row outer products. The diagram below illustrates how the matrix multiplication of `matLeft` (3 rows, 2 columns) by `matRight` (2 rows, 3 columns) can be decomposed into a sum of column-by-row outer diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/6-sme2-matmul-asm.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/6-sme2-matmul-asm.md index e41965f946..ef1a7589ee 100644 --- a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/6-sme2-matmul-asm.md +++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/6-sme2-matmul-asm.md @@ -341,54 +341,83 @@ Benchmarking and profiling are not simple tasks. The purpose of this Learning Pa First, make sure that the `sme2_matmul_asm` executable is up-to-date: {{< tabpane code=true >}} - {{< tab header="Native SME2 support" language="bash" output_lines="2">}} - make sme2_matmul_asm - make: `sme2_matmul_asm' is up to date. - {{< /tab >}} - - {{< tab header="Emulated SME2 support" language="bash" output_lines="2">}} - docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 make sme2_matmul_asm - make: 'sme2_matmul_asm' is up to date. - {{< /tab >}} + +{{< tab header="Native SME2 support" language="bash" output_lines="2-3">}} +ninja -C build-native/ sme2_matmul_asm +ninja: Entering directory `build-native/' +ninja: no work to do. 
+{{< /tab >}} + +{{< tab header="Android phones with SME2 support" language="bash" output_lines="2-3">}} +ninja -C build-android/ sme2_matmul_asm +ninja: Entering directory `build-android/' +ninja: no work to do. +{{< /tab >}} + +{{< tab header="Emulated SME2 support" language="bash" output_lines="2-3">}} +docker run --rm -v "$PWD:/work" armswdev/sme2-learning-path:sme2-environment-v3 ninja -C build-baremetal/ sme2_matmul_asm +ninja: Entering directory `build-baremetal/' +ninja: no work to do. +{{< /tab >}} + {{< /tabpane >}} -Then execute `sme2_matmul_asm` either natively or on the FVP: +Then execute `sme2_matmul_asm` either natively, or on the FVP, or on the Android phone: {{< tabpane code=true >}} - {{< tab header="Native SME2 support" language="bash" output_lines="2-4">}} - ./sme2_matmul_asm - SME2 Matrix Multiply fp32 *asm* [verification mode] with M=125, K=70, N=35 - Matrix preprocessing: PASS ! - Matrix multiplication: PASS ! - {{< /tab >}} - - {{< tab header="Emulated SME2 support" language="bash" output_lines="2-6">}} - docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 ./run-fvp.sh sme2_matmul_asm - SME2 Matrix Multiply fp32 *asm* [verification mode] with M=125, K=70, N=35 - Matrix preprocessing: PASS ! - Matrix multiplication: PASS ! - - Info: /OSCI/SystemC: Simulation stopped by user. - {{< /tab >}} + +{{< tab header="Native SME2 support" language="bash" output_lines="2-4">}} +./build-native/sme2_matmul_asm +SME2 Matrix Multiply fp32 *asm* [verification mode] with M=125, K=70, N=35 +Matrix preprocessing: PASS ! +Matrix multiplication: PASS ! +{{< /tab >}} + +{{< tab header="Android phones with SME2 support" language="bash" output_lines="2,5-7">}} +adb push build-android/sme2_matmul_asm /data/local/tmp +build-android/sme2_matmul_asm: 1 file pushed, 0 skipped. 
29.7 MB/s (19456 bytes in 0.001s) +adb shell chmod 755 /data/local/tmp/sme2_matmul_asm +adb shell /data/local/tmp/sme2_matmul_asm +SME2 Matrix Multiply fp32 *asm* [verification mode] with M=125, K=70, N=35 +Matrix preprocessing: PASS ! +Matrix multiplication: PASS ! +{{< /tab >}} + +{{< tab header="Emulated SME2 support" language="bash" output_lines="2-6">}} +docker run --rm -v "$PWD:/work" armswdev/sme2-learning-path:sme2-environment-v3 ./run-fvp.sh build-baremetal/sme2_matmul_asm +SME2 Matrix Multiply fp32 *asm* [verification mode] with M=125, K=70, N=35 +Matrix preprocessing: PASS ! +Matrix multiplication: PASS ! + +Info: /OSCI/SystemC: Simulation stopped by user. +{{< /tab >}} + {{< /tabpane >}} `sme2_matmul_asm` prints the version of the matrix multiplication performed -(asm or intr) as well as the `M`, `K` and `N` parameters. It also prints +(`asm` or `intr`) as well as the `M`, `K` and `N` parameters. It also prints whether the preprocessing and matrix multiplication passed (`PASS`) or failed (`FAILED`) the comparison the vanilla reference implementation. {{% notice Tip %}} The example above uses the default values for the `M` (125), `K`(70) and `N`(70) -parameters. You can override this and provide your own values on the command line: +parameters. 
You can override this and provide your own values on the command line +when executing `sme2_matmul_asm`: {{< tabpane code=true >}} - {{< tab header="Native SME2 support" language="bash">}} - ./sme2_matmul_asm 7 8 9 - {{< /tab >}} - {{< tab header="Emulated SME2 support" language="bash">}} - docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 ./run-fvp.sh sme2_matmul_asm 7 8 9 - {{< /tab >}} +{{< tab header="Native SME2 support" language="bash">}} +./build-native/sme2_matmul_asm 7 8 9 +{{< /tab >}} + +{{< tab header="Android phones with SME2 support" language="bash">}} +adb shell /data/local/tmp/sme2_matmul_asm 7 8 9 +{{< /tab >}} + +{{< tab header="Emulated SME2 support" language="bash">}} +docker run --rm -v "$PWD:/work" armswdev/sme2-learning-path:sme2-environment-v3 ./run-fvp.sh build-baremetal/sme2_matmul_asm 7 8 9 +{{< /tab >}} + {{< /tabpane >}} In this example, `M=7`, `K=8`, and `N=9` are used. diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/7-sme2-matmul-intr.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/7-sme2-matmul-intr.md index f5aa9a7248..311da4396b 100644 --- a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/7-sme2-matmul-intr.md +++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/7-sme2-matmul-intr.md @@ -179,7 +179,7 @@ Note again that `matmul_intr_impl` function has been annotated at line 4 with: - `__arm_inout("za")`, because the function reuses the ZA storage from its caller -The multiplication with the outer product is performed in a double-nested loop, over the `M` (line 7) and `N` (line 11) dimensions of the input matrices `matLeft_mod` and `matRight`. Both loops have an `SVL` step increment, which corresponds to the horizontal and vertical dimensions of the ZA storage that will be used as one tile at a time will be processed. 
+The multiplication with the outer product is performed in a double-nested loop, over the `M` (line 7) and `N` (line 11) dimensions of the input matrices `matLeft_mod` and `matRight`. Both loops have an `SVL` step increment, which corresponds to the horizontal and vertical dimensions of the ZA storage that will be used as one tile at a time will be processed. The `M` and `N` dimensions of the inputs might not be perfect multiples of `SVL` so the predicates `pMDim` (line 9) (respectively `pNDim` at line 13) are computed in order to know which rows (respectively columns) are valid. @@ -217,33 +217,60 @@ with the `IMPL` macro defined to be `intr` in the `Makefile`. First, make sure that the `sme2_matmul_intr` executable is up-to-date: {{< tabpane code=true >}} - {{< tab header="Native SME2 support" language="bash" output_lines="2">}} - make sme2_matmul_intr - make: `sme2_matmul_intr' is up to date. - {{< /tab >}} - - {{< tab header="Emulated SME2 support" language="bash" output_lines="2">}} - docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 make sme2_matmul_intr - make: 'sme2_matmul_intr' is up to date. - {{< /tab >}} + +{{< tab header="Native SME2 support" language="bash" output_lines="2-3">}} +ninja -C build-native/ sme2_matmul_intr +ninja: Entering directory `build-native/' +ninja: no work to do. +{{< /tab >}} + +{{< tab header="Android phones with SME2 support" language="bash" output_lines="2-3">}} +ninja -C build-android/ sme2_matmul_intr +ninja: Entering directory `build-android/' +ninja: no work to do. +{{< /tab >}} + +{{< tab header="Emulated SME2 support" language="bash" output_lines="2-3">}} +docker run --rm -v "$PWD:/work" armswdev/sme2-learning-path:sme2-environment-v3 ninja -C build-baremetal/ sme2_matmul_intr +ninja: Entering directory `build-baremetal/' +ninja: no work to do. 
+{{< /tab >}} + {{< /tabpane >}} Then execute `sme2_matmul_intr` either natively or on the FVP: {{< tabpane code=true >}} - {{< tab header="Native SME2 support" language="bash" output_lines="2-4">}} - ./sme2_matmul_intr - SME2 Matrix Multiply fp32 *intr* [verification mode] with M=125, K=70, N=35 - Matrix preprocessing: PASS ! - Matrix multiplication: PASS ! - {{< /tab >}} - - {{< tab header="Emulated SME2 support" language="bash" output_lines="2-6">}} - docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 ./run-fvp.sh sme2_matmul_intr - SME2 Matrix Multiply fp32 *intr* [verification mode] with M=125, K=70, N=35 - Matrix preprocessing: PASS ! - Matrix multiplication: PASS ! - - Info: /OSCI/SystemC: Simulation stopped by user. - {{< /tab >}} + +{{< tab header="Native SME2 support" language="bash" output_lines="2-4">}} +./build-native/sme2_matmul_intr +SME2 Matrix Multiply fp32 *intr* [verification mode] with M=125, K=70, N=35 +Matrix preprocessing: PASS ! +Matrix multiplication: PASS ! +{{< /tab >}} + +{{< tab header="Android phones with SME2 support" language="bash" output_lines="2,5-7">}} +adb push build-android/sme2_matmul_intr /data/local/tmp +build-android/sme2_matmul_intr: 1 file pushed, 0 skipped. 29.7 MB/s (19456 bytes in 0.001s) +adb shell chmod 755 /data/local/tmp/sme2_matmul_intr +adb shell /data/local/tmp/sme2_matmul_intr +SME2 Matrix Multiply fp32 *intr* [verification mode] with M=125, K=70, N=35 +Matrix preprocessing: PASS ! +Matrix multiplication: PASS ! +{{< /tab >}} + +{{< tab header="Emulated SME2 support" language="bash" output_lines="2-6">}} +docker run --rm -v "$PWD:/work" armswdev/sme2-learning-path:sme2-environment-v3 ./run-fvp.sh build-baremetal/sme2_matmul_intr +SME2 Matrix Multiply fp32 *intr* [verification mode] with M=125, K=70, N=35 +Matrix preprocessing: PASS ! +Matrix multiplication: PASS ! + +Info: /OSCI/SystemC: Simulation stopped by user. 
+{{< /tab >}} + {{< /tabpane >}} + +{{% notice Tip %}} +As with the `sme2_matmul_asm` program, you can provide the `M`, `K` and `N` +parameters on the command line to `sme2_matmul_intr`. +{{% /notice %}} \ No newline at end of file diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/8-benchmarking.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/8-benchmarking.md index afbffecee7..1f52e6049a 100644 --- a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/8-benchmarking.md +++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/8-benchmarking.md @@ -6,69 +6,131 @@ weight: 10 layout: learningpathall --- -In this section, you'll benchmark matrix multiplication performance using SME2, if your machine supports native execution of SME2 instructions. +In this section, you'll benchmark matrix multiplication performance using SME2, +if your machine supports native execution of SME2 instructions. ## About benchmarking and emulation Emulation is generally not the best way to assess the performance of a piece of -code. Emulation focuses on correctly simulating instructions and not accurate execution timing. For example, as explained in the [outer product section](../5-outer-product/), improving performance involves increasing the `macc`-to-`load` ratio. +code. Emulation focuses on correctly simulating instructions and not accurate +execution timing. For example, as explained in the [outer product +section](../5-outer-product/), improving performance involves increasing the +`mac`-to-`load` ratio.
+Emulators, including the FVP, do not model in detail memory bandwidth, cache +behavior, or latency. At best, an emulator provides an instruction count for the +vanilla reference implementation versus the assembly-/intrinsic-based versions +of the matrix multiplication, which is useful for functional validation but not +for precise benchmarking. -## Benchmarking on a platform with native SME2 support +## Benchmarking on platforms with native SME2 support or Android phones {{% notice Note %}} -Benchmarking and profiling are complex tasks. This Learning Path provides a *simplified* framework for observing SME2-related performance improvements. +Benchmarking and profiling are complex tasks. This Learning Path provides a +*simplified* framework for observing SME2-related performance improvements. {{% /notice %}} -If your machine natively supports SME2, then benchmarking is possible. When -`sme2_matmul_asm` and `sme2_matmul_intr` were compiled with `BAREMETAL=0`, the -*benchmarking mode* is available. +If your machine natively supports SME2, or if the target Android phone supports +SME2, then benchmarking is possible. When `sme2_matmul_asm` and +`sme2_matmul_intr` were compiled with `BAREMETAL=0`, the *benchmarking mode* is +available. -*Benchmarking mode* is enabled by prepending the `M`, `K`, `N` optional parameters with an iteration count (`I`). +*Benchmarking mode* is enabled by prepending the `M`, `K`, `N` optional +parameters with an iteration count (`I`).
## Run the intrinsic version Now measure the execution time of `sme2_matmul_intr` for 1000 multiplications of matrices with the default sizes: -```BASH { output_lines="2-4"} -./sme2_matmul_intr 1000 +{{< tabpane code=true >}} + +{{< tab header="Native SME2 support" language="bash" output_lines="2-4">}} +./build-native/sme2_matmul_intr 1000 SME2 Matrix Multiply fp32 *intr* [benchmarking mode, 1000 iterations] with M=125, K=70, N=35 Reference implementation: min time = 101 us, max time = 438 us, avg time = 139.42 us SME2 implementation *intr*: min time = 1 us, max time = 8 us, avg time = 1.82 us -``` +{{< /tab >}} + +{{< tab header="Android phones with SME2 support" language="bash" output_lines="2,5-7">}} +adb push build-android/sme2_matmul_intr /data/local/tmp +build-android/sme2_matmul_intr: 1 file pushed, 0 skipped. 29.7 MB/s (19456 bytes in 0.001s) +adb shell chmod 755 /data/local/tmp/sme2_matmul_intr +adb shell /data/local/tmp/sme2_matmul_intr 1000 +SME2 Matrix Multiply fp32 *intr* [benchmarking mode, 1000 iterations] with M=125, K=70, N=35 +Reference implementation: min time = 115 us, max time = 808 us, avg time = 117.98 us +SME2 implementation *intr*: min time = 5 us, max time = 21 us, avg time = 6.56 us +{{< /tab >}} -The execution time is reported in microseconds. A wide spread between the minimum and maximum figures can be noted and is expected as the way of doing the benchmarking is simplified for the purpose of simplicity. You will, however, note that the intrinsic version of the matrix multiplication brings on average a 76x execution time reduction. +{{< /tabpane >}} + +The execution time is reported in microseconds. A wide spread between the +minimum and maximum figures can be noted and is expected as the way of doing the +benchmarking is simplified for the purpose of simplicity. You will, however, +note that the intrinsic version of the matrix multiplication brings a significant +reduction in time, from 18x to 76x depending on the device. 
{{% notice Tip %}} You can override the default values for `M` (125), `K` (25), and `N` (70) and provide your own values on the command line. For example, you can benchmark the `M=7`, `K=8`, and `N=9` case with: -```BASH { output_lines="2-4"} -./sme2_matmul_intr 1000 7 8 9 +{{< tabpane code=true >}} + +{{< tab header="Native SME2 support" language="bash" output_lines="2-4">}} +./build-native/sme2_matmul_intr 1000 7 8 9 SME2 Matrix Multiply fp32 *intr* [benchmarking mode, 1000 iterations] with M=7, K=8, N=9 Reference implementation: min time = 0 us, max time = 14 us, avg time = 0.93 us SME2 implementation *intr*: min time = 0 us, max time = 1 us, avg time = 0.61 us -``` +{{< /tab >}} + +{{< tab header="Android phones with SME2 support" language="bash" output_lines="2-4">}} +adb shell /data/local/tmp/sme2_matmul_intr 1000 7 8 9 +SME2 Matrix Multiply fp32 *intr* [benchmarking mode, 1000 iterations] with M=7, K=8, N=9 +Reference implementation: min time = 0 us, max time = 1 us, avg time = 0.37 us +SME2 implementation *intr*: min time = 0 us, max time = 8 us, avg time = 0.32 us +{{< /tab >}} + +{{< /tabpane >}} + {{% /notice %}} Now measure the execution time of `sme2_matmul_asm` for 1000 multiplications of matrices with the default sizes: -```BASH { output_lines="2-4"} -./sme2_matmul_asm 1000 +{{< tabpane code=true >}} + +{{< tab header="Native SME2 support" language="bash" output_lines="2-4">}} +./build-native/sme2_matmul_asm 1000 SME2 Matrix Multiply fp32 *asm* [benchmarking mode, 1000 iterations] with M=125, K=70, N=35 Reference implementation: min time = 101 us, max time = 373 us, avg time = 136.49 us SME2 implementation *asm*: min time = 1 us, max time = 8 us, avg time = 1.44 us -``` - -You'll notice that although the vanilla reference matrix multiplication is the same, there is some variability in the execution time. 
+{{< /tab >}} -The assembly version of the SME2 matrix multiplication runs slightly faster (1.44 us compared to 1.82 us for the intrinsic-based version). However, this should not lead you to be convinced that assembly is inherently better. The comparison here is not apples-to-apples: -- Firstly, the assembly version has specific constraints on the `K` parameter that the intrinsics version does not. -- Second, the assembly version includes an optimization that the intrinsic version, for the sake of readability in this Learning Path, does not have (see the [Going further +{{< tab header="Android phones with SME2 support" language="bash" output_lines="2,5-7">}} +adb push build-android/sme2_matmul_asm /data/local/tmp +build-android/sme2_matmul_asm: 1 file pushed, 0 skipped. 29.7 MB/s (19456 bytes in 0.001s) +adb shell chmod 755 /data/local/tmp/sme2_matmul_asm +adb shell /data/local/tmp/sme2_matmul_asm 1000 +SME2 Matrix Multiply fp32 *asm* [benchmarking mode, 1000 iterations] with M=125, K=70, N=35 +Reference implementation: min time = 114 us, max time = 754 us, avg time = 117.91 us +SME2 implementation *asm*: min time = 3 us, max time = 18 us, avg time = 4.14 us +{{< /tab >}} + +{{< /tabpane >}} + +You'll notice that although the vanilla reference matrix multiplication is the +same, there is some variability in the execution time. + +The assembly version of the SME2 matrix multiplication runs slightly faster. However, this +should not lead you to be convinced that assembly is inherently better as this is not an +apples-to-apples comparison: +- Firstly, the assembly version has specific constraints on the `K` parameter + that the intrinsics version does not. +- Second, the assembly version includes an optimization that the intrinsic + version, for the sake of readability in this Learning Path, does not have (see + the [Going further section](/learning-paths/cross-platform/multiplying-matrices-with-sme2/10-going-further/) to learn more). 
-- Most importantly, the intrinsics version is significantly more readable and maintainable. These are qualities that matter in real-world development. \ No newline at end of file +- Most importantly, the intrinsics version is significantly more readable and + maintainable. These are qualities that matter in real-world development. \ No newline at end of file diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/9-debugging.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/9-debugging.md index d05e5a7ea0..546f73357d 100644 --- a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/9-debugging.md +++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/9-debugging.md @@ -96,7 +96,7 @@ product, and the trace displays the content of the ZA storage. You can get a Tarmac trace when invoking `run-fvp.sh` by adding the `--trace` option as the *first* argument, for example: ```BASH -docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 ./run-fvp.sh --trace sme2_matmul_asm +docker run --rm -v "$PWD:/work" armswdev/sme2-learning-path:sme2-environment-v3 ./run-fvp.sh --trace build-baremetal/sme2_matmul_asm ``` {{% notice Tip %}} diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/_index.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/_index.md index 6e56e62f5a..771da15452 100644 --- a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/_index.md +++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/_index.md @@ -17,9 +17,11 @@ prerequisites: - Working knowledge of Arm’s SVE and SME2 instruction sets - Intermediate proficiency with the C programming language and the Armv9-A assembly language - A computer running Linux, macOS, or Windows - - Installations of Git and Docker for project setup and emulation + - Installations of Git, CMake and Ninja for project setup - A 
platform that supports SME2 - see the list of [devices with SME2 support](/learning-paths/cross-platform/multiplying-matrices-with-sme2/1-get-started/#devices) or an emulator to run code with SME2 instructions - - Compiler support for SME2 instructions (for example, LLVM 17+ with SME2 backend support) + - Installation of Docker for SME2 emulation (if you don't have SME2 available) + - Installation of Android Development Studio and adb (if you're targeting an Android phone with SME2 support) + - Compiler support for SME2 instructions (for example, LLVM 18 or later with SME2 backend support) author: Arnaud de Grandmaison @@ -32,7 +34,6 @@ armips: tools_software_languages: - C - Clang - - Runbook - LLVM operatingsystems: diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/ndk_install.png b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/ndk_install.png new file mode 100644 index 0000000000..dc5e816ecd Binary files /dev/null and b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/ndk_install.png differ diff --git a/content/learning-paths/embedded-and-microcontrollers/_index.md b/content/learning-paths/embedded-and-microcontrollers/_index.md index 813c94fc0e..8a55dda130 100644 --- a/content/learning-paths/embedded-and-microcontrollers/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/_index.md @@ -11,7 +11,7 @@ maintopic: true operatingsystems_filter: - Android: 1 - Baremetal: 30 -- Linux: 31 +- Linux: 32 - macOS: 7 - RTOS: 10 - Windows: 4 @@ -21,7 +21,7 @@ subjects_filter: - Embedded Linux: 4 - Libraries: 3 - ML: 17 -- Performance and Architecture: 21 +- Performance and Architecture: 22 - RTOS Fundamentals: 5 - Security: 2 - Virtual Hardware: 2 @@ -35,6 +35,8 @@ tools_software_languages_filter: - Arm Compute Library: 2 - Arm Development Studio: 8 - Arm Fast Models: 4 +- Arm Performance Studio: 1 +- Arm Streamline: 1 - Arm Virtual Hardware: 12 - Assembly: 1 - C: 5 @@ -72,6 +74,7 @@ 
tools_software_languages_filter: - Keil Studio Cloud: 1 - Kubernetes: 1 - lgpio: 1 +- Linux kernel: 1 - LLM: 2 - MCP: 1 - MPS3: 1 @@ -80,6 +83,7 @@ tools_software_languages_filter: - NumPy: 1 - Ollama: 1 - Paddle: 1 +- Performance analysis: 1 - Porcupine: 1 - Python: 8 - PyTorch: 3 diff --git a/content/learning-paths/embedded-and-microcontrollers/avh_ppocr/end-to-end_workflow.md b/content/learning-paths/embedded-and-microcontrollers/avh_ppocr/end-to-end_workflow.md index 85aeb7cad8..c4ad31b306 100644 --- a/content/learning-paths/embedded-and-microcontrollers/avh_ppocr/end-to-end_workflow.md +++ b/content/learning-paths/embedded-and-microcontrollers/avh_ppocr/end-to-end_workflow.md @@ -14,7 +14,7 @@ This section provides hands-on instructions for you to deploy pre-trained Paddle The steps involved in the model deployment are shown in the figure below: -![End-to-end workflow#center](./Figure3.png "Figure 3. End-to-end workflow") +![End-to-end workflow#center](./figure3.webp "Figure 3. End-to-end workflow") ## Deploy PaddleOCR text recognition model on the Corstone-300 FVP included with Arm Virtual Hardware diff --git a/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/greengrassinstallation.md b/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/greengrassinstallation.md index a5d2136569..59c54875cd 100644 --- a/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/greengrassinstallation.md +++ b/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/greengrassinstallation.md @@ -17,7 +17,7 @@ Log into your edge device via SSH and we'll start the process of installing/conf Prior to installing AWS IoT Greengrass, we need to create a set of AWS credentials that will be used as part of the installation process. >**_NOTE:_** ->These credentials may automatically be provided to you when you initiate the workshop has hosted by AWS Workshop Studio. 
If so, please copy the credentials as we'll need them in the next step. The credentials should look like this: +>These credentials may automatically be provided to you when you initiate the workshop hosted by AWS Workshop Studio. If so, please copy the credentials as we'll need them in the next step. The credentials should look like this: > > export AWS_ACCESS_KEY_ID= > export AWS_SECRET_ACCESS_KEY= @@ -70,10 +70,19 @@ In this example, we choose the "Linux" device type and we are going to download Lower down in the menu, you will see the specific instructions that are custom-crafted for you to download and invoke the "Nucleus Classic" installer. The basic sequence of instructions are: 1) Start with a SSH shell session into your edge device - 2) copy and paste your two AWS credentials into the shell environment - 3) copy and paste/run the installer download curl command into your shell + 2) copy and paste your two AWS credentials into the shell environment +``` + NOTE: Your "two AWS credentials" are the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY from above... +``` + 3) copy and paste/run the installer download curl command into your shell +``` + NOTE: The "installer download command" is in the "Download the installer" section in the image below... +``` 4) copy and paste/run the installer invocation command - 5) wait for the installer to complete
+``` + 5) wait for the installer to complete on your edge device ![CreateDevice](./images/GG_Install_Device2.png) diff --git a/content/learning-paths/embedded-and-microcontrollers/linux-on-fvp/debug.md b/content/learning-paths/embedded-and-microcontrollers/linux-on-fvp/debug.md index a064b3006c..485052e3e7 100644 --- a/content/learning-paths/embedded-and-microcontrollers/linux-on-fvp/debug.md +++ b/content/learning-paths/embedded-and-microcontrollers/linux-on-fvp/debug.md @@ -89,4 +89,4 @@ Ensure your FVP instance is running and matches the model and parameters selecte After these steps, you can debug the software stack as shown in the following figure: -![FVP running #center](Select_target.png "Debug interface in GUI") +![FVP running #center](select_target.webp "Debug interface in GUI") diff --git a/content/learning-paths/embedded-and-microcontrollers/linux-on-fvp/run.md b/content/learning-paths/embedded-and-microcontrollers/linux-on-fvp/run.md index cdbfda2db0..12945e535a 100644 --- a/content/learning-paths/embedded-and-microcontrollers/linux-on-fvp/run.md +++ b/content/learning-paths/embedded-and-microcontrollers/linux-on-fvp/run.md @@ -101,4 +101,4 @@ Always check the name of the CPU instance when switching between different FVP m You can also run the FVP using its graphical user interface: -![GUI #center](FVP.png "View of the FVP GUI") +![GUI #center](fvp.webp "View of the FVP GUI") diff --git a/content/learning-paths/embedded-and-microcontrollers/streamline-kernel-module/3_oot_module.md b/content/learning-paths/embedded-and-microcontrollers/streamline-kernel-module/3_oot_module.md index 578a52f9b4..d5842bb2d7 100644 --- a/content/learning-paths/embedded-and-microcontrollers/streamline-kernel-module/3_oot_module.md +++ b/content/learning-paths/embedded-and-microcontrollers/streamline-kernel-module/3_oot_module.md @@ -223,7 +223,7 @@ The module above receives the size of a 2D array as a string through the `char_d ssh root@ ``` -4. 
Execute the following commads on the target to run the module: +4. Execute the following commands on the target to run the module: ```bash insmod /root/mychardrv.ko mknod /dev/mychardrv c 42 0 diff --git a/content/learning-paths/embedded-and-microcontrollers/streamline-kernel-module/4_sl_profile_oot.md b/content/learning-paths/embedded-and-microcontrollers/streamline-kernel-module/4_sl_profile_oot.md index d10aa1d78f..be3f7edf51 100644 --- a/content/learning-paths/embedded-and-microcontrollers/streamline-kernel-module/4_sl_profile_oot.md +++ b/content/learning-paths/embedded-and-microcontrollers/streamline-kernel-module/4_sl_profile_oot.md @@ -70,7 +70,7 @@ If you are using an AArch32 target, use `arm` instead of `arm64`. ![Streamline command#center](./images/img04_streamline_cmd.png) -8. In the Capture settings dialog, select Add image, add the absolut path of your kernel module file `mychardrv.ko` and click Save. +8. In the Capture settings dialog, select Add image, add the absolute path of your kernel module file `mychardrv.ko` and click Save. ![Capture settings#center](./images/img05_capture_settings.png) 9. Start the capture and enter a name and location for the capture file. Streamline will start collecting data and the charts will show activity being captured from the target. 
diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/1_installation.md b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/1_installation.md index 55907d401b..36552ce838 100644 --- a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/1_installation.md +++ b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/1_installation.md @@ -1,5 +1,5 @@ --- -title: Install and configure Zephyr Workbench in VS Code +title: Install and configure Workbench for Zephyr in VS Code weight: 2 ### FIXED, DO NOT MODIFY @@ -10,45 +10,53 @@ layout: learningpathall Setting up a [Zephyr](https://zephyrproject.org/) RTOS development environment from scratch can be challenging, requiring you to manually install SDKs, configure toolchains, and initialize workspace directories. These steps often vary across operating systems and board vendors, leading to a fragmented and error-prone setup process. -[Zephyr Workbench](https://zephyr-workbench.com/) is an open-source Visual Studio Code extension that transforms Zephyr RTOS development into a streamlined IDE experience. Created by [Ac6](https://www.ac6.fr/en/), it automates toolchain setup, project management, and debugging, making Zephyr projects faster to start and easier to scale. +[Workbench for Zephyr](https://zephyr-workbench.com/) is an open-source Visual Studio Code [extension](https://marketplace.visualstudio.com/items?itemName=Ac6.zephyr-workbench) that transforms Zephyr RTOS development into a streamlined IDE experience. Created by [Ac6](https://www.ac6.fr/en/), it automates toolchain setup, project management, and debugging, making Zephyr projects faster to start and easier to scale. -In this Learning Path, you'll learn the essential steps to install Zephyr Workbench and configure a complete development environment on your local machine. 
Once complete, you'll be ready to create, build, and debug applications for Arm Cortex-M platforms using Zephyr RTOS. +In this Learning Path, you'll learn the essential steps to install Workbench for Zephyr and configure a complete development environment on your local machine. Once complete, you'll be ready to create, build, and debug applications for Arm Cortex-M platforms using Zephyr RTOS. -Zephyr Workbench provides one-click environment setup that automatically installs the required tools including Python, CMake, Ninja, and Git. It supports importing and managing Zephyr SDKs with version and architecture selection, while initializing west workspaces and creating board-specific applications from samples. The extension builds Zephyr applications and flashes hardware directly from the VS Code interface. It also provides breakpoint debugging and memory usage insights with hardware probe support. +Workbench for Zephyr provides one-click environment setup that automatically installs the required tools including Python, CMake, Ninja, and Git. It supports importing and managing Zephyr SDKs with version and architecture selection, while initializing west workspaces and creating board-specific applications from samples. The extension builds Zephyr applications and flashes hardware directly from the VS Code interface. It also provides breakpoint debugging and memory usage insights with hardware probe support. -## What you need before installing Zephyr Workbench +## What you need before installing Workbench for Zephyr -To get started with Zephyr Workbench you need to have Visual Studio Code downloaded, installed, and running on your computer. +To get started with Workbench for Zephyr you need to have Visual Studio Code downloaded, installed, and running on your computer. -For Windows, you need version 10 or later (64-bit), along with administrator privileges for installing tools and drivers. 
+**Windows OS:** +For Windows, you need version 10 or later (64-bit x64), along with administrator privileges for installing runners and drivers. -On macOS, the Homebrew package manager is required. To install Homebrew, run the following command: +**macOS:** +On macOS, the Homebrew package manager is required. To install Homebrew, run the following command: ```bash /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" ``` -Zephyr Workbench supports STM32 development boards (STM32 Discovery, Nucleo series), Nordic Semiconductor boards (nRF52, nRF53, nRF91 series), NXP development boards (FRDM, LPCXpresso series), Espressif boards (ESP32-based boards), and many other Zephyr-supported platforms. You need a development board to try out the code examples. +**Linux:** +- A recent 64-bit X64 distribution such as Ubuntu 20.04 or later, Fedora, Clear Linux OS, or Arch Linux +- Other distributions might work, but may require manual configuration of system packages +- After installation, use the Workbench host tools manager to verify that all required tools were installed correctly -## Configure the Zephyr Workbench extension in Visual Studio Code + +Zephyr Workbench supports STM32 development boards (STM32 Discovery, Nucleo series), Nordic Semiconductor boards (nRF52, nRF53, nRF91 series), NXP development boards (FRDM, LPCXpresso series), Espressif boards (ESP32-based boards), and many other Zephyr-supported platforms like Renesas, Silabs or Infineon. You need a development board to try out the code examples. + +## Configure the Workbench for Zephyr extension in Visual Studio Code This section covers installing the Zephyr Workbench extension and configuring your Arm development environment. ### Install the extension -To install the Zephyr Workbench extension, open Visual Studio Code and navigate to the Extensions view by selecting the Extensions icon in the Activity Bar.
+To install the Workbench for Zephyr extension, open Visual Studio Code and navigate to the Extensions view by selecting the Extensions icon in the Activity Bar. You can also use the keyboard shortcut `Ctrl+Shift+X` (Windows/Linux) or `Cmd+Shift+X` (macOS). -In the search box, type "Zephyr Workbench" and locate the official "Zephyr Workbench" extension by Ac6. Select **Install** to add the extension to VS Code. +In the search box, type "Workbench for Zephyr" and locate the official "Workbench for Zephyr" extension by Ac6. Select **Install** to add the extension to VS Code. The extension icon appears in the Activity Bar, and a welcome message may appear confirming successful installation. -Once installed, the Zephyr Workbench icon appears in the sidebar with a welcome screen. +Once installed, the Workbench for Zephyr icon appears in the sidebar with a welcome screen. ### Install the required host tools -In the Zephyr Workbench panel, select **Install Host Tools** to automatically install the required dependencies. +In the Workbench for Zephyr panel, select **Install Host Tools** to automatically install the required dependencies. This process installs Python 3.x, CMake, the Ninja build system, Git, Device Tree Compiler (DTC), and the West meta-tool. @@ -62,7 +70,7 @@ When the installation completes, select **Verify Host Tools** to check the versi ### Import and configure the toolchain -Next, download and configure the toolchain by selecting **Import Toolchain** in the Zephyr Workbench panel. Select the toolchain family (*Zephyr SDK*) and configure the SDK Type by choosing *Minimal* for basic functionality. +Next, download and configure the toolchain by selecting **Import Toolchain** in the Workbench for Zephyr panel. Select the toolchain family (*Zephyr SDK*) and configure the SDK Type by choosing *Minimal* for basic functionality. Select your desired version (such as v0.17.0 or v0.17.3) and choose the target architectures.
For this Learning Path, you only need to select *arm*. @@ -73,9 +81,9 @@ Specify the parent directory for SDK installation and select **Import** to downl ### Initialize the Zephyr project workspace -Zephyr uses a Git-based workspace manager called West to organize its source code, modules, and samples. Use Zephyr Workbench to initialize your first West workspace. +Zephyr uses a Git-based workspace manager called West to organize its source code, modules, and samples. Use Workbench for Zephyr to initialize your first West workspace. -In the Zephyr Workbench panel, select **Initialize Workspace** to set up your project environment. Configure the workspace settings by selecting "Minimal from template" for the source location and using the default path `https://github.com/zephyrproject-rtos/zephyr`. +In the Workbench for Zephyr panel, select **Initialize Workspace** to set up your project environment. Configure the workspace settings by selecting "Minimal from template" for the source location and using the default path `https://github.com/zephyrproject-rtos/zephyr`. Choose a target-specific template (such as STM32 or NXP) and select your Zephyr version (such as v3.7.0 or v4.1.0). Specify the directory for your workspace, keeping in mind that initialization takes approximately 10 minutes to complete. @@ -89,7 +97,7 @@ The workspace initialization downloads the Zephyr source code and dependencies. ### Verify setup -Test your setup by confirming that the Zephyr Workbench panel shows all components as installed successfully. Verify the host tools are installed, the SDK is imported and detected, and the West workspace is initialized. Ensure no error messages appear in the VS Code output panel. +Test your setup by confirming that the Workbench for Zephyr panel shows all components as installed successfully. Verify the host tools are installed, the SDK is imported and detected, and the West workspace is initialized. 
Ensure no error messages appear in the VS Code output panel. {{% notice Note %}} **Troubleshooting tips:** diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/2_development.md b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/2_development.md index 2bda1ffe4d..ca17c005c1 100644 --- a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/2_development.md +++ b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/2_development.md @@ -8,12 +8,12 @@ layout: learningpathall ## Create and build your first Zephyr application -In this session, you'll learn how to create and build your first Zephyr application using Zephyr Workbench. This step prepares you to customize, test, and expand real firmware projects on Arm Cortex-M boards. +In this session, you'll learn how to create and build your first Zephyr application using Workbench for Zephyr. This step prepares you to customize, test, and expand real firmware projects on Arm Cortex-M boards. For demonstration, you'll use an [NXP FRDM-MCXN947](https://www.nxp.com/design/design-center/development-boards-and-designs/FRDM-MCXN947) development board as the target device. However, the same steps apply to any Zephyr-supported Arm Cortex-M board. You can find the full list of supported boards in the [Supported Boards](https://docs.zephyrproject.org/latest/boards/#). -Depending on your board, you might need to install a different debug tool. The next module covers this setup. +Depending on your board, you might need to install a different debug tool aka `runner`. The next module covers this setup. ### Create application @@ -30,7 +30,7 @@ In the Zephyr Workbench panel: ### Build the application -Select the **Build** button in Zephyr Workbench or press `Ctrl+Shift+B`. +Select the **Build** button in Workbench for Zephyr or press `Ctrl+Shift+B`. 
The build system compiles your application and links it against the Zephyr kernel and board-specific drivers. @@ -43,11 +43,11 @@ To enable debugging on your target hardware, you might need to install additiona For the NXP FRDM-MCXN947, download and install the LinkServer debug utility: - LinkServer for Microcontrollers: [NXP LinkServer Download Page](https://www.nxp.com/design/design-center/software/development-software/mcuxpresso-software-and-tools-/linkserver-for-microcontrollers:LINKERSERVER) -Once installed, Zephyr Workbench attempts to detect it automatically during a debug session. +Once installed, Workbench for Zephyr attempts to detect it automatically during a debug session. If you're using a different board, see your vendor's documentation to install the appropriate debug utility. {{% notice Note %}} -If Zephyr Workbench doesn't automatically detect the installed debug runner, you can manually configure it. +If Workbench for Zephyr doesn't automatically detect the installed debug runner, you can manually configure it. Open the **Debug Manager** from the Zephyr sidebar, and enter the full path to the runner executable. {{% /notice %}} @@ -68,11 +68,11 @@ The following code shows a basic Zephyr application that prints a message to the ```c #include -#include +#include int main(void) { - printk("Hello World! %s\n", CONFIG_BOARD); // Prints board name to serial console + printk("Hello World! %s\n", CONFIG_BOARD_TARGET); // Prints board name to serial console return 0; } ``` @@ -83,4 +83,4 @@ int main(void) Now that the app works, try editing the message in `printk()` or changing the board target in the application settings. Then rebuild and observe the output. This helps verify that your toolchain and workspace respond correctly to code and config changes. -With your first Zephyr application successfully built, you're ready to take the next step—debugging. 
In the next module, you'll launch a debug session, set breakpoints, and perform memory analysis using Zephyr Workbench. These skills help you validate and optimize applications running on real Arm Cortex-M hardware. +With your first Zephyr application successfully built, you're ready to take the next step—debugging. In the next module, you'll launch a debug session, set breakpoints, and perform memory analysis using Workbench for Zephyr. These skills help you validate and optimize applications running on real Arm Cortex-M hardware. diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/3_debug.md b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/3_debug.md index 475055798c..909bec0fd0 100644 --- a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/3_debug.md +++ b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/3_debug.md @@ -8,21 +8,21 @@ layout: learningpathall ## Analyze and debug Zephyr applications in VS Code -In this module, you'll learn how to inspect memory usage and perform live debugging on your Zephyr applications using Zephyr Workbench. These capabilities are essential for diagnosing bugs and optimizing embedded firmware performance on Arm Cortex-M platforms. +In this module, you'll learn how to inspect memory usage and perform live debugging on your Zephyr applications using Workbench for Zephyr. These capabilities are essential for diagnosing bugs and optimizing embedded firmware performance on Arm Cortex-M platforms. ## Analyze memory usage -Understanding how your application uses memory is crucial for optimizing embedded firmware on resource-constrained Arm Cortex-M systems. Zephyr Workbench provides built-in tools to generate detailed memory usage reports after a successful build, helping you identify ROM and RAM consumption hotspots early in development. 
+Understanding how your application uses memory is crucial for optimizing embedded firmware on resource-constrained Arm Cortex-M systems. Workbench for Zephyr provides built-in tools to generate detailed memory usage reports after a successful build, helping you identify ROM and RAM consumption hotspots early in development. ### Generate memory reports -After building your Zephyr application, analyze how memory is allocated and used. Zephyr Workbench offers built-in memory reporting tools that help you visualize RAM and ROM usage, identify inefficient memory patterns, and guide optimization efforts. These insights are especially useful when working with constrained Arm Cortex-M platforms. +After building your Zephyr application, analyze how memory is allocated and used. Workbench for Zephyr offers built-in memory reporting tools that help you visualize RAM and ROM usage, identify inefficient memory patterns, and guide optimization efforts. These insights are especially useful when working with constrained Arm Cortex-M platforms. To generate memory reports, open the **Zephyr Workbench** panel and select **Memory Analysis** after a successful build. The tool generates detailed reports showing RAM usage (stack, heap, static variables), ROM usage (code size, constants), and **Puncover** analysis for binary analysis including function size, call graphs, and timing on Arm Cortex-M processors. The following steps show how to generate and review memory reports: -- Open the **Zephyr Workbench** panel +- Open the **Workbench for Zephyr** panel - Select **Memory Analysis** after a successful build - Review detailed memory reports: - **RAM usage**: stack, heap, static variables @@ -159,22 +159,22 @@ Root ``` -## Install and configure debug tools +## Install and configure debug Runners -Depending on your board, different debug utilities might be required. Zephyr Workbench integrates several common runners: - -Go to **Host Tools > Install Debug Tools** in Zephyr Workbench. 
Debug tools vary depending on your target board. +Depending on your board, different debug utilities might be required. Workbench for Zephyr integrates and discovers several common runners: - **OpenOCD**: Generic open-source debugger - **LinkServer**: For NXP targets - **STM32CubeProgrammer**: For STM32 boards - **J-Link**: For SEGGER debug probes -### Install debug utilities +Workbench for Zephyr will automatically detect these tools when they are installed in their default locations and available on your system `PATH`. If a tool is installed in a custom location, you can either update your `PATH` or configure your environment so that Workbench for Zephyr can find it. + +### Install runner utilities To install debug tools for your specific board, go to **Host Tools > Install Debug Tools** in the Zephyr Workbench panel and select the tools applicable to your board. -![Debug Tools](images/install_debug_tools.png) +![Debug Runners](images/install_runners.png) ## Configure debug settings @@ -193,7 +193,7 @@ Choose the runner from OpenOCD, J-Link, LinkServer, or PyOCD. If the system does ### Manual debug runner configuration -If Zephyr Workbench doesn't automatically detect the installed debug runner, open the **Debug Manager** from the sidebar and locate your board profile to enter the path to the runner executable manually. +If Workbench for Zephyr doesn't automatically detect the installed debug runner, open the **Debug Manager** from the sidebar and locate your board profile to enter the path to the runner executable manually. {{% notice Note %}} Manual configuration might be required on first-time setups or if using custom runner versions. @@ -201,7 +201,7 @@ Manual configuration might be required on first-time setups or if using custom r ## Launch and use the debugger -You can start debugging from Zephyr Workbench by selecting **Debug**, or from VS Code by going to **Run and Debug** (`Ctrl+Shift+D`), selecting the debug config, and selecting **Run**.
+You can start debugging from Workbench for Zephyr by selecting **Debug**, or from VS Code by going to **Run and Debug** (`Ctrl+Shift+D`), selecting the debug config, and selecting **Run**. ![Debug Application](images/debug_app.png) @@ -222,4 +222,4 @@ The debugger provides comprehensive inspection capabilities including breakpoint If using `pyocd`, target support might take a few seconds to initialize. -In this Learning Path, you explored how to analyze memory usage and debug Zephyr applications using Zephyr Workbench in VS Code. You learned to generate memory reports, install and configure debug tools, and launch interactive debug sessions. These steps help you troubleshoot and optimize embedded applications for Arm Cortex-M boards. +In this Learning Path, you explored how to analyze memory usage and debug Zephyr applications using Workbench for Zephyr. You learned to generate memory reports, install and configure debug tools, and launch interactive debug sessions. These steps help you troubleshoot and optimize embedded applications for Arm Cortex-M boards. diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/_index.md b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/_index.md index 2b964723b0..57f42d741b 100644 --- a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/_index.md @@ -1,5 +1,5 @@ --- -title: Build Zephyr projects with Zephyr Workbench in VS Code +title: Build Zephyr projects with Workbench for Zephyr in VS Code draft: true cascade: @@ -7,10 +7,10 @@ cascade: minutes_to_complete: 30 -who_is_this_for: This is an introductory topic for embedded developers targeting Arm-based platforms with the Zephyr RTOS using the Zephyr Workbench extension for VS Code. 
+who_is_this_for: This is an introductory topic for embedded developers targeting Arm-based platforms with the Zephyr RTOS using the Workbench for Zephyr extension for VS Code. learning_objectives: - - Install and configure the Zephyr Workbench extension in VS Code + - Install and configure the Workbench for Zephyr extension in VS Code - Set up a complete Zephyr development environment including the SDK and toolchain - Create, build, and debug Zephyr applications using hands-on examples - Perform memory usage analysis and apply basic optimization techniques @@ -20,6 +20,7 @@ prerequisites: - Basic familiarity with embedded C programming - Visual Studio Code installed and running - A Cortex-M development board + - Windows 10+ (64-bit), macOS with Homebrew, or Linux (preferably Ubuntu 20.04+) author: - Ayoub Bourjilat @@ -41,8 +42,8 @@ further_reading: link: https://docs.zephyrproject.org/latest/index.html type: documentation - resource: - title: Zephyr Workbench Official Website - link: https://zephyr-workbench.com/ + title: Workbench for Zephyr Official Website + link: https://z-workbench.com/ type: website - resource: title: AC6 Zephyr Training @@ -53,4 +54,4 @@ further_reading: weight: 1 # _index.md always has weight of 1 to order correctly layout: "learningpathall" # All files under learning paths have this same wrapper learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. 
---- \ No newline at end of file +--- diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/build_application.png b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/build_application.png index 53cf2449e8..4d83aa2f3f 100644 Binary files a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/build_application.png and b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/build_application.png differ diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/create_app.png b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/create_app.png index 6a129d7292..6f952a41a2 100644 Binary files a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/create_app.png and b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/create_app.png differ diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/debug_manager.png b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/debug_manager.png index b2276d64a2..28fb138605 100644 Binary files a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/debug_manager.png and b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/debug_manager.png differ diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/import_toolchain.png b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/import_toolchain.png index 8fadbc01ae..a07c621f48 100644 Binary files a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/import_toolchain.png and b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/import_toolchain.png differ diff --git 
a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/initialize_workspace.png b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/initialize_workspace.png index ea0cd693e9..ff2d95a2ec 100644 Binary files a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/initialize_workspace.png and b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/initialize_workspace.png differ diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/install_host_tools.png b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/install_host_tools.png index 672c9f0976..0179e74b29 100644 Binary files a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/install_host_tools.png and b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/install_host_tools.png differ diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/install_runners.png b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/install_runners.png new file mode 100644 index 0000000000..b871600ff3 Binary files /dev/null and b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/install_runners.png differ diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/install_host_tools.png b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/install_host_tools.png new file mode 100644 index 0000000000..0179e74b29 Binary files /dev/null and b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/install_host_tools.png differ diff --git a/content/learning-paths/laptops-and-desktops/_index.md b/content/learning-paths/laptops-and-desktops/_index.md index ef48d314d8..b3a0e010bc 100644 --- a/content/learning-paths/laptops-and-desktops/_index.md +++ 
b/content/learning-paths/laptops-and-desktops/_index.md @@ -9,15 +9,15 @@ maintopic: true operatingsystems_filter: - Android: 2 - ChromeOS: 2 -- Linux: 35 -- macOS: 9 +- Linux: 36 +- macOS: 10 - Windows: 46 subjects_filter: - CI-CD: 5 - Containers and Virtualization: 7 - Migration to Arm: 30 -- ML: 3 -- Performance and Architecture: 27 +- ML: 4 +- Performance and Architecture: 28 subtitle: Create and migrate apps for power efficient performance title: Laptops and Desktops tools_software_languages_filter: @@ -29,7 +29,7 @@ tools_software_languages_filter: - Arm64EC: 1 - Assembly: 1 - Bash: 2 -- C: 9 +- C: 10 - C#: 6 - C++: 11 - CCA: 1 @@ -45,14 +45,16 @@ tools_software_languages_filter: - GitLab: 1 - Google Test: 1 - HTML: 2 +- Hugging Face: 1 - Hyper-V: 1 - i3: 1 - Intrinsics: 1 - JavaScript: 2 +- KleidiCV: 1 - Kubernetes: 1 - KVM: 1 - Linux: 1 -- llama.cpp: 1 +- llama.cpp: 2 - LLM: 1 - LLVM: 2 - llvm-mca: 1 @@ -65,13 +67,13 @@ tools_software_languages_filter: - OpenCV: 1 - perf: 4 - PowerShell: 1 -- Python: 7 +- Python: 8 - QEMU: 1 - Qt: 2 - RDP: 1 - Remote.It: 1 - RME: 1 -- Runbook: 18 +- Runbook: 17 - Rust: 2 - SVE: 1 - SVE2: 1 diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_rag/1_rag.md b/content/learning-paths/laptops-and-desktops/dgx_spark_rag/1_rag.md new file mode 100644 index 0000000000..6eeb6d4f12 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_rag/1_rag.md @@ -0,0 +1,136 @@ +--- +title: Explore building a RAG pipeline on Arm-based Grace–Blackwell systems +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Get started + +Before getting started, you should complete the Learning Path [Unlock quantized LLM performance on Arm-based NVIDIA DGX Spark](/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/) to learn about the CPU and GPU builds of llama.cpp. This background is recommended for building the RAG solution on llama.cpp. 
+ +The NVIDIA DGX Spark is also referred to as the Grace-Blackwell platform or GB10, the name of the NVIDIA Grace-Blackwell Superchip. + +## What is RAG? + +Retrieval-Augmented Generation (RAG) combines information retrieval with language-model generation. Instead of relying solely on pre-trained weights, a RAG system retrieves relevant text from a document corpus and passes it to a language model to create factual, context-aware responses. + +Here is a typical pipeline: + +User Query ─> Embedding ─> Vector Search ─> Context ─> Generation ─> Answer + +Each stage in this pipeline plays a distinct role in transforming a question into a context-aware response: + +* Embedding model: Converts text into dense numerical vectors. An example is e5-base-v2. +* Vector database: Searches for semantically similar chunks. An example is FAISS. +* Language model: Generates an answer conditioned on retrieved context. An example is Llama 3.1 8B Instruct. + +## Why is Grace–Blackwell good for RAG pipelines? + +The Grace–Blackwell (GB10) platform combines Arm-based Grace CPUs with NVIDIA Blackwell GPUs, forming a unified architecture optimized for large-scale AI workloads. + +Its unique CPU–GPU design and unified memory enable seamless data exchange, making it an ideal foundation for RAG systems that require both fast document retrieval and high-throughput language model inference. + +The GB10 platform includes: + +- Grace CPU (Armv9.2 architecture) - 20 cores including 10 Cortex-X925 cores and 10 Cortex-A725 cores +- Blackwell GPU - CUDA 13.0 Tensor Core architecture +- Unified Memory (128 GB NVLink-C2C) - Shared address space between CPU and GPU which allows both processors to access the same 128 GB unified memory region without copy operations. + +The GB10 provides the following benefits for RAG applications: + +- Hybrid execution – Grace CPU efficiently handles embedding, indexing, and API orchestration. 
+- GPU acceleration – Blackwell GPU performs token generation with low latency. +- Unified memory – Eliminates CPU to GPU copy overhead because tensors and document vectors share the same memory region. +- Open-source friendly – Works natively with PyTorch, FAISS, Transformers, and FastAPI. + +## RAG system architecture + +Here is a diagram of the architecture: + +```console + . + ┌─────────────────────────────────────┐ + │ User Query │ + └──────────────┬──────────────────────┘ + │ + ▼ + ┌────────────────────┐ + │ Embedding (E5) │ + │ → FAISS (CPU) │ + └────────────────────┘ + │ + ▼ + ┌────────────────────┐ + │ Context Builder │ + │ (Grace CPU) │ + └────────────────────┘ + │ + ▼ + ┌───────────────────────────────────────────────┐ + │ llama.cpp (GGUF Model, Q8_0) │ + │ -ngl 40 --ctx-size 8192 │ + │ Grace CPU + Blackwell GPU (split compute) │ + └───────────────────────────────────────────────┘ + │ + ▼ + ┌────────────────────┐ + │ FastAPI Response │ + └────────────────────┘ + +``` + +## Create an engineering assistant + +You can use this architecture to create an engineering assistant. + +The assistant retrieves technical references from datasheets, programming guides, and application notes and and generates helpful explanations for software developers. + +This use case illustrates how a RAG system can provide contextual knowledge without retraining the model. + +The technology stack you will use is listed below: + +| **Stage** | **Technology / Framework** | **Hardware Execution** | **Function** | +|------------|-----------------------------|--------------------------|---------------| +| Document Processing | pypdf, text preprocessing scripts | Grace CPU | Converts PDFs and documents into plain text, performs cleanup and segmentation. | +| Embedding Generation | e5-base-v2 via sentence-transformers | Grace CPU | Transforms text into semantic vector representations for retrieval. 
| +| Semantic Retrieval | FAISS and LangChain | Grace CPU | Searches the vector index to find the most relevant text chunks for a given query. | +| Text Generation | llama.cpp REST Server (GGUF model) | Blackwell GPU and Grace CPU | Generates natural language responses using the Llama 3 model, accelerated by GPU inference. | +| Pipeline Orchestration | Python (RAG Query Script) | Grace CPU | Coordinates embedding, retrieval, and generation via REST API calls. | +| Unified Memory Architecture | Unified LPDDR5X shared memory | Grace CPU and Blackwell GPU | Enables zero-copy data sharing between CPU and GPU for improved latency and efficiency. | + + +## Check your setup + +Before starting, run the following commands to confirm your hardware is ready: + +```bash +# Check Arm CPU architecture +lscpu | grep "Architecture" +``` + +The expected result is: + +```output +Architecture: aarch64 +``` + +Print the NVIDIA GPU information: + +```bash +# Confirm visible GPU and driver version +nvidia-smi +``` + +Look for CUDA version 13.0 or later and Driver version 580.95.05 or later. + +{{% notice Note %}} +If your software versions are lower than the versions mentioned above, you should upgrade before proceeding. +{{% /notice %}} + +## Summary + +You now understand how RAG works and why Grace–Blackwell is ideal for RAG systems. The unified memory architecture allows the Grace CPU to handle document retrieval while the Blackwell GPU accelerates text generation, all without data copying overhead. + +Next, you'll set up your development environment and install the required tools to build this RAG system. 
\ No newline at end of file diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_rag/2_rag_setup.md b/content/learning-paths/laptops-and-desktops/dgx_spark_rag/2_rag_setup.md new file mode 100644 index 0000000000..16656877e6 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_rag/2_rag_setup.md @@ -0,0 +1,188 @@ +--- +title: Configure the RAG development environment and models +weight: 3 +layout: "learningpathall" +--- + +## Create the development environment + +To get started, you need to set up your development environment and prepare the embedding model and the LLM you will use in the RAG pipeline. + +The embedding model for the solution is e5-base-v2, and the LLM is Llama 3.1 8B Instruct. + +First, create a Python virtual environment to use for the project: + +```bash +cd ~ +python3 -m venv rag-venv +source rag-venv/bin/activate +``` + +Next, install the required packages: + +```bash +pip install --upgrade pip +pip install torch --index-url https://download.pytorch.org/whl/cpu +pip install transformers==4.46.2 sentence-transformers==2.7.0 faiss-cpu langchain==1.0.5 \ + langchain-community langchain-huggingface huggingface_hub \ + pypdf tqdm numpy +``` + +These packages provide the essential building blocks of the RAG system: + +- `sentence-transformers` is used for text embedding with the e5-base-v2 model. +- `faiss-cpu` enables efficient similarity search for document retrieval. +- `langchain` manages data orchestration between embedding, retrieval, and generation. +- `huggingface_hub` is used for model download and authentication. +- `pypdf` extracts and processes text content from documents. +- `tqdm` provides progress visualization. + +Since the pipeline runs on the Grace CPU, the CPU version of FAISS is sufficient and GPU acceleration is not required. 
+
+Check the installation by printing the FAISS version:
+
+```bash
+python - <<'EOF'
+import faiss, transformers
+print("FAISS version:", faiss.__version__)
+print("FAISS GPU:", faiss.get_num_gpus() > 0)
+EOF
+```
+
+The output confirms that FAISS is running in CPU mode.
+
+```output
+FAISS version: 1.13.0
+FAISS GPU: False
+```
+
+## Model preparation
+
+Download and organize the models required for the RAG pipeline.
+
+The two models are:
+
+- The Large Language Model (LLM) is Llama 3.1 8B Instruct for text generation.
+- The Embedding Model is e5-base-v2 for document vectorization.
+
+Both models will be stored locally under the `~/models` directory for offline operation.
+
+You will need a Hugging Face token to download the embedding model. When you run `hf auth login`, it prints instructions that include a link for generating a token.
+
+```bash
+mkdir -p ~/models && cd ~/models
+
+# Log in with your Hugging Face token
+hf auth login
+hf download intfloat/e5-base-v2 --local-dir ~/models/e5-base-v2
+
+# Download GGUF version of Llama 3.1 8B model to save the time for local conversion
+wget https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf -P ~/models/Llama-3.1-8B-gguf
+```
+
+## Verify the e5-base-v2 model
+
+Run a Python script to verify that the e5-base-v2 model loads correctly and can generate embeddings.
+
+Save the code below in a text file named `vector-test.py`:
+
+```python
+from sentence_transformers import SentenceTransformer
+import numpy as np
+import os
+
+model_path = os.path.expanduser("~/models/e5-base-v2")
+print(f"Loading model from: {model_path}")
+
+try:
+    model = SentenceTransformer(model_path)
+    sentences = [
+        "Arm processors are designed for high efficiency.",
+        "The Raspberry Pi uses Arm cores for its SoC."
+ ] + embeddings = model.encode(sentences) + + if isinstance(embeddings, np.ndarray) and embeddings.shape[0] == len(sentences): + print(" Model loaded and embeddings generated successfully.") + print("Embedding shape:", embeddings.shape) + print("First vector snippet:", np.round(embeddings[0][:10], 4)) + else: + print(" Model loaded, but embedding output seems incorrect.") +except Exception as e: + print(f" Model failed to load or generate embeddings: {e}") +``` + + +Run the code with: + +```bash +python ./vector-test.py +``` + +The output confirms the e5-base-v2 model can generate embeddings successfully. + +```output + Model loaded and embeddings generated successfully. +Embedding shape: (2, 768) +First vector snippet: [-0.012 -0.0062 -0.0008 -0.0014 0.026 -0.0066 -0.0173 0.026 -0.0238 + -0.0455] + ``` + +The e5-base-v2 results show: + +- Test sentences: The two example sentences are used to confirm that the model can process text input and generate embeddings correctly. If this step succeeds, the model's tokenizer, encoder, and PyTorch runtime on the Grace CPU are all working together properly. +- Embedding shape (2, 768): The two sentences were converted into two 768-dimensional embedding vectors. 768 is the hidden dimension size of this model. +- First vector snippet: Displays the first 10 values of the first embedding vector. Each number represents a learned feature extracted from the text. + +A successful output confirms that the e5-base-v2 embedding model is functional and ready for use. + +## Verify the Llama 3.1 model + +The llama.cpp runtime will be used for text generation using the Llama 3.1 model. + +Ensure that both the CPU and the GPU builds of llama.cpp have been installed. You can find the instructions in [Unlock quantized LLM performance on Arm-based NVIDIA DGX Spark](/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/). 
+Verify the `Meta-Llama-3.1-8B-Instruct-Q8_0.gguf` model is working using llama.cpp:
+
+```bash
+cd ~/llama.cpp/build-gpu
+
+./bin/llama-cli \
+  -m ~/models/Llama-3.1-8B-gguf/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf \
+  -p "Hello from RAG user" \
+  -ngl 40 --n-predict 64
+```
+
+You should see the model load successfully and print a short generated sentence, for example:
+
+```output
+Hello from this end! What brings you to this chat? Do you have any questions or topics you'd like to discuss? I'm here to help!
+```
+
+Next, check the REST Server, which is needed for the RAG pipeline:
+
+```bash
+./bin/llama-server \
+  -m ~/models/Llama-3.1-8B-gguf/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf \
+  -ngl 40 --ctx-size 8192 \
+  --port 8000 \
+  --host 0.0.0.0
+```
+
+Use another terminal on the same machine to do the health check:
+
+```bash
+curl http://127.0.0.1:8000/completion \
+  -H "Content-Type: application/json" \
+  -d '{"prompt": "Explain why unified memory improves CPU–GPU collaboration.", "n_predict": 64}'
+```
+
+You should see a short JSON payload containing a coherent explanation generated by the model.
+
+Terminate the `llama-server` using Ctrl-C.
+
+{{% notice Note %}}
+To test remote access from another machine, replace `127.0.0.1` with the IP address of the machine running `llama-server`.
+{{% /notice %}}
+
+With the development setup, tools, and models prepared, you can create the vector database and add your documents.
diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_rag/2b_rag_setup.md b/content/learning-paths/laptops-and-desktops/dgx_spark_rag/2b_rag_setup.md new file mode 100644 index 0000000000..eece472804 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_rag/2b_rag_setup.md @@ -0,0 +1,355 @@ +--- +title: Add documents to the RAG vector database +weight: 4 +layout: "learningpathall" +--- + +## Prepare a sample document corpus for RAG + +You are now ready to add your documents to the RAG database that will be used for retrieval and reasoning. + +This converts your raw knowledge documents into clean, chunked text segments that can later be vectorized and indexed by FAISS. + +## Use FAISS for efficient vector search on Arm + +FAISS (Facebook AI Similarity Search) is an open-source library developed by Meta AI for efficient similarity search and clustering of dense vectors. It's particularly well-suited for RAG applications because it can quickly find the most relevant document chunks from large collections. + +Key advantages of FAISS for this application: + +- CPU efficiency: FAISS is highly optimized for Arm CPUs, making it ideal for the Grace CPU in the GB10 platform +- Scalability: Handles millions of vectors with minimal memory overhead +- Speed: Uses advanced indexing algorithms to perform nearest-neighbor searches in milliseconds +- Flexibility: Supports multiple distance metrics (L2, cosine similarity) and index types + +## Set up your RAG workspace and data folder + +Create a directory structure for your data: + +```bash +mkdir -p ~/rag && cd ~/rag +mkdir pdf text +``` + +You can add any PDF data sources to your RAG database. + +For illustration, you can add a number of Raspberry Pi documents that you want to use to find out specific information about the Raspberry Pi products. + +Use a text editor to create a file named `datasheet.txt` listing all data source URLs that will be used for the RAG data. 
Make sure to include one URL per line. + +```console +https://datasheets.raspberrypi.com/cm/cm1-and-cm3-datasheet.pdf +https://datasheets.raspberrypi.com/cm/cm3-plus-datasheet.pdf +https://datasheets.raspberrypi.com/cm4/cm4-datasheet.pdf +https://datasheets.raspberrypi.com/cm4io/cm4io-datasheet.pdf +https://datasheets.raspberrypi.com/cm4s/cm4s-datasheet.pdf +https://datasheets.raspberrypi.com/pico/pico-2-datasheet.pdf +https://datasheets.raspberrypi.com/pico/pico-datasheet.pdf +https://datasheets.raspberrypi.com/picow/pico-2-w-datasheet.pdf +https://datasheets.raspberrypi.com/picow/pico-w-datasheet.pdf +https://datasheets.raspberrypi.com/rp2040/rp2040-datasheet.pdf +https://datasheets.raspberrypi.com/rp2350/rp2350-datasheet.pdf +https://datasheets.raspberrypi.com/rpi4/raspberry-pi-4-datasheet.pdf +``` + +Use `wget` to batch download all the PDFs into `~/rag/pdf`. + +```bash +wget -P ~/rag/pdf -i datasheet.txt +``` + +## Convert PDF documents to text files + +Then, create a Python file named `pdf2text.py` with the code below: + +```python +from pypdf import PdfReader +import glob, os + +pdf_root = os.path.expanduser("~/rag/pdf") +txt_root = os.path.expanduser("~/rag/text") +os.makedirs(txt_root, exist_ok=True) + +count = 0 +for file in glob.glob(os.path.join(pdf_root, "**/*.pdf"), recursive=True): + print(f"File processing {file}") + try: + reader = PdfReader(file) + text = "\n".join(page.extract_text() or "" for page in reader.pages) + + rel_path = os.path.relpath(file, pdf_root) + txt_path = os.path.join(txt_root, os.path.splitext(rel_path)[0] + ".txt") + os.makedirs(os.path.dirname(txt_path), exist_ok=True) + + with open(txt_path, "w", encoding="utf-8") as f: + f.write(text) + + count += 1 + print(f"Converted: {file} -> {txt_path}") + + except Exception as e: + print(f"Error processing {file}: {e}") + +print(f"\nTotal converted PDFs: {count}") +print(f"Output directory: {txt_root}") +``` + +The resulting text files will form the corpus for semantic retrieval. 
+
+Run the Python script to convert all PDFs into text files.
+
+```bash
+python pdf2text.py
+```
+
+This script converts all PDFs into text files for later embedding.
+
+At the end of the output you see:
+
+```output
+Total converted PDFs: 12
+```
+
+## Verify your document corpus
+
+You should now see a number of files in your folder. Run the command below to inspect the results:
+
+```bash
+find ~/rag/text/ -type f -name "*.txt" -exec cat {} + | wc -l
+```
+
+It shows the total number of lines across all of the text files, which is around 100,000.
+
+## Build an embedding and search index with FAISS
+
+Convert your prepared text corpus into vector embeddings and store them in a FAISS index for efficient semantic search.
+
+This stage enables your RAG pipeline to retrieve the most relevant text chunks when you ask questions.
+
+| **Component** | **Role** |
+|--------------|------------------------------|
+| SentenceTransformer (e5-base-v2) | Generates vector embeddings for each text chunk |
+| LangChain and FAISS | Stores and searches embeddings efficiently |
+| RecursiveCharacterTextSplitter | Splits long documents into manageable text chunks |
+
+Use e5-base-v2 to encode the documents and create a FAISS vector index.
+
+## Create and run the FAISS builder script
+
+
+```bash
+mkdir -p ~/rag/faiss_index
+```
+
+Create a file named `build_index.py` in `~/rag` that will perform the embedding.
+
+The embedding process (about 10 minutes on CPU) encodes the text chunks in batches of 16 and logs progress after each batch.
+ +```python +import os, glob +from tqdm import tqdm + +from langchain_huggingface import HuggingFaceEmbeddings +from langchain_community.vectorstores import FAISS +from langchain_core.documents import Document +from langchain_text_splitters import RecursiveCharacterTextSplitter + +# Paths +data_dir = os.path.expanduser("~/rag/text") +model_dir = os.path.expanduser("~/models/e5-base-v2") +index_dir = os.path.expanduser("~/rag/faiss_index") + +os.makedirs(index_dir, exist_ok=True) + +# Load embedding model (CPU only) +embedder = HuggingFaceEmbeddings( + model_name=model_dir, + model_kwargs={"device": "cpu"} +) + +print(f" Embedder loaded on: {embedder._client.device}") +print(f" Model path: {model_dir}") + +# Collect and split all text files (recursive) +docs = [] +splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100) + +print("\n Scanning and splitting text files...") +for path in glob.glob(os.path.join(data_dir, "**/*.txt"), recursive=True): + with open(path, "r", encoding="utf-8", errors="ignore") as f: + text = f.read() + if not text.strip(): + continue + rel_path = os.path.relpath(path, data_dir) + for chunk in splitter.split_text(text): + docs.append(Document(page_content=chunk, metadata={"source": rel_path})) + +print(f" Total chunks loaded: {len(docs)}") + +# Prepare inputs for embedding +texts = [d.page_content for d in docs] +metadatas = [d.metadata for d in docs] + +""" +# Full embedding with progress logging every 100 chunks +print("\n Embedding text chunks (batch log every 100)...") +embeddings = [] +for i, chunk in enumerate(texts): + embedding = embedder.embed_documents([chunk])[0] + embeddings.append(embedding) + if (i + 1) % 100 == 0 or (i + 1) == len(texts): + print(f" Embedded {i + 1} / {len(texts)} chunks") +""" +# Batch embedding +embeddings = [] +batch_size = 16 +for i in range(0, len(texts), batch_size): + batch_texts = texts[i:i+batch_size] + batch_embeddings = embedder.embed_documents(batch_texts) + 
embeddings.extend(batch_embeddings) + print(f" Embedded {i + len(batch_texts)} / {len(texts)}") + +# Pair (text, embedding) for FAISS +text_embeddings = list(zip(texts, embeddings)) + +print("\n Saving FAISS index...") +db = FAISS.from_embeddings( + text_embeddings, + embedder, + metadatas=metadatas +) +db.save_local(index_dir) +print(f"\n FAISS index saved to: {index_dir}") +``` + +Run the code to generate the embeddings: + +```bash +python build_index.py +``` + +The script will process the corpus, load approximately 6,000 text chunks, and save the resulting FAISS index to the `~/rag/faiss_index` directory. + +You will find two files inside. + +- ***index.faiss*** + - A binary file that stores the vector index built using FAISS. + - It contains the actual embeddings and data structures used for efficient similarity search. + - This file enables fast retrieval of nearest neighbors for any given query vector. + +- ***index.pkl*** + - A pickle file that stores metadata and original document chunks. + - It maps each vector in `index.faiss` back to its text content and source info, including file name. + - Used by LangChain to return human-readable results along with context. + +You can verify the FAISS index using the following script. + +Save the code below in `check_index.py`. 
+ +```python +import os +from langchain_community.vectorstores import FAISS +from langchain_huggingface import HuggingFaceEmbeddings +from langchain_core.documents import Document + +model_path = os.path.expanduser("~/models/e5-base-v2") +index_path = os.path.expanduser("~/rag/faiss_index") + +embedder = HuggingFaceEmbeddings(model_name=model_path) +db = FAISS.load_local(index_path, embedder, allow_dangerous_deserialization=True) + +query = "raspberry pi 4 power supply" +results = db.similarity_search(query, k=3) + +for i, r in enumerate(results, 1): + print(f"\nResult {i}") + print(f"Source: {r.metadata.get('source')}") + print(r.page_content[:300], "...") + +query = "Use SWD debug Raspberry Pi Pico" +results = db.similarity_search(query, k=3) + +for i, r in enumerate(results, 4): + print(f"\nResult {i}") + print(f"Source: {r.metadata.get('source')}") + print(r.page_content[:300], "...") +``` + +Run the code using: + +```bash +python check_index.py +``` + +The results will look like the following: + +```output +Result 1 +Source: cm4io-datasheet.txt +Raspberry Pi Compute Module 4 IO Board. We recommend budgeting 9W for CM4. +If you want to supply an external +5V supply to the board, e.g. via J20 or via PoE J9, then we recommend that L5 be +removed. Removing L5 will prevent the on-board +5V and +3.3V supplies from starting up and +5V coming out of ... + +Result 2 +Source: cm4io-datasheet.txt +power the CM4. There is also an on-board +12V to +3.3V DC-DC converter PSU which is only used for the PCIe slot. The ++12V input feeds the +12V PCIe slot, the external PSU connector and the fan connector directly. If these aren’t being +used then a wider input supply is possible (+7.5V to +28V). +With ... + +Result 3 +Source: cm4io-datasheet.txt +that Raspberry Pi 4 Model B has, and for general usage you should refer to the Raspberry Pi 4 Model B documentation . +The significant difference between CM4IO and Raspberry Pi 4 Model B is the addition of a single PCIe socket. 
The +CM4IO has been designed as both a reference design for CM4 or to be u ... + +Result 4 +Source: pico-datasheet.txt +mass storage device), or the standard Serial Wire Debug (SWD) port can reset the system and load and run code +without any button presses. The SWD port can also be used to interactively debug code running on the RP2040. +Raspberry Pi Pico Datasheet +Chapter 1. About Raspberry Pi Pico 4 +Getting started ... + +Result 5 +Source: pico-2-datasheet.txt +mass storage device), or the standard Serial Wire Debug (SWD) port can reset the system and load and run code +without any button presses. The SWD port can also be used to interactively debug code running on the RP2350. + TIP +Getting started with Raspberry Pi Pico-series walks through loading progra ... + +Result 6 +Source: pico-w-datasheet.txt +without any button presses. The SWD port can also be used to interactively debug code running on the RP2040. +Getting started with Pico W +The Getting started with Raspberry Pi Pico-series book walks through loading programs onto the +board, and shows how to install the C/C++ SDK and build the example ... +``` + +The execution of `check_index.py` confirms that your local FAISS vector index is functioning correctly for semantic search tasks. + +You performed two distinct queries targeting different product lines within the Raspberry Pi ecosystem: "Raspberry Pi 4 power supply" and "Raspberry Pi Pico SWD debugging". + +- For the first query, the system returned three highly relevant results, all sourced from the `cm4io-datasheet.txt` file. These passages provided technical guidance on power requirements, supply voltage ranges, and hardware configurations specific to the Compute Module 4 IO Board. This indicates that the embeddings captured the correct semantic intent and that the FAISS index correctly surfaced content even when specific keywords like "power supply" appeared in varied contexts. 
+ +- For the second query, the search retrieved top results from all three relevant datasheets in the Pico family: `pico-datasheet.txt`, `pico-2-datasheet.txt`, and `pico-w-datasheet.txt`. +The extracted passages consistently explained how the Serial Wire Debug (SWD) port allows developers to reset the system, load and run code without manual input, and perform interactive debugging on the RP2040 or RP2350 microcontrollers. This demonstrates that your chunking and indexing pipeline accurately retained embedded debugging context, and that metadata mapping correctly links each result to its original source document. + +This process validates that your system can perform semantic retrieval on technical documents, a core capability of any RAG application. + +In summary, both semantic queries were successfully answered using your local vector store, validating that the indexing, embedding, metadata, and retrieval components of your RAG backend are working correctly in a CPU-only configuration. + + +| **Stage** | **Technology** | **Hardware Execution** | **Function** | +|------------|----------------|------------------------|---------------| +| Document Processing | pypdf, python-docx | Grace CPU | Text extraction | +| Embedding | e5-base-v2 (sentence-transformers) | Grace CPU | Vectorization | +| Retrieval | FAISS + LangChain | Grace CPU | Semantic search | +| Generation | llama.cpp REST Server | Blackwell GPU + Grace CPU | Text generation | +| Orchestration | Python RAG Script | Grace CPU | Pipeline control | +| Unified Memory | NVLink-C2C | Shared | Zero-copy data exchange | + +At this point, your environment is fully configured and validated. +You have confirmed that the e5-base-v2 embedding model, FAISS index, and Llama 3.1 8B model are all functioning correctly. + +In the next section, you will integrate the validated components into a full Retrieval-Augmented Generation (RAG) pipeline, combining CPU-based retrieval and GPU-accelerated generation. 
+ diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_rag/3_rag_pipeline.md b/content/learning-paths/laptops-and-desktops/dgx_spark_rag/3_rag_pipeline.md new file mode 100644 index 0000000000..c7b743f423 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_rag/3_rag_pipeline.md @@ -0,0 +1,215 @@ +--- +title: Build and run the RAG pipeline +weight: 5 +layout: "learningpathall" +--- + +## Integrate retrieval and generation on Arm + +In the previous sections, you prepared the environment, validated the e5-base-v2 embedding model, and verified that the Llama 3.1 8B Instruct model runs successfully on the Grace–Blackwell (GB10) platform. + +In this section, you will bring all components together to build a complete Retrieval-Augmented Generation (RAG) workflow. + +This stage connects the CPU-based retrieval and indexing with GPU-accelerated language generation, creating an end-to-end system capable of answering technical questions using real documentation data. + +Building upon the previous modules, you will now: +- Connect the e5-base-v2 embedding model and FAISS vector index. +- Integrate the llama.cpp REST server for GPU-accelerated inference. +- Execute a complete Retrieval-Augmented Generation (RAG) workflow for end-to-end question answering. 
+ +## Start the llama.cpp REST server + +Before running the RAG query script, ensure the LLM server is active by running: + +```bash +cd ~/llama.cpp/build-gpu/ +./bin/llama-server \ + -m ~/models/Llama-3.1-8B-gguf/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf \ + -ngl 40 --ctx-size 8192 \ + --port 8000 --host 0.0.0.0 +``` + +Verify the server status from another terminal: + +```bash +curl http://127.0.0.1:8000/health +``` + +The output is: + +```output +{"status":"ok"} +``` + +## Create the RAG query script + +This script performs the full pipeline using the flow: + +User Query ─> Embedding ─> Vector Search ─> Context ─> Generation ─> Answer + +Save the code below in a file named `rag_query_rest.py` in the `~/rag` directory. + +```bash +import os +import requests, faiss, json, numpy as np +from sentence_transformers import SentenceTransformer +from langchain_community.vectorstores import FAISS +from langchain_huggingface import HuggingFaceEmbeddings + +# --- Paths --- +index_path = os.path.expanduser("~/rag/faiss_index") +model_path = os.path.expanduser("~/models/e5-base-v2") +LLAMA_URL = "http://127.0.0.1:8000/completion" + +# --- Load Embedding Model & FAISS Index --- +embedder = HuggingFaceEmbeddings(model_name=model_path, model_kwargs={"device": "cpu"}) +db = FAISS.load_local(index_path, embedder, allow_dangerous_deserialization=True) + +def rag_query(question, top_k=3, max_new_tokens=256): + # Step 1: Retrieve documents + results = db.similarity_search(question, k=top_k) + context = "\n\n".join([r.page_content for r in results]) + + print("\nRetrieved sources:") + for i, r in enumerate(results, 1): + print(f"{i}. {r.metadata.get('source', 'unknown')}") + + # Step 2: Construct prompt + prompt = f"""You are a helpful engineering assistant. +Use the following context to answer the question. 
+ +Context: +{context} + +Question: +{question} + +Answer:""" + + # Step 3: Call llama.cpp REST Server + payload = {"prompt": prompt, "n_predict": max_new_tokens, "temperature": 0.2} + try: + resp = requests.post(LLAMA_URL, json=payload, timeout=300) + data = resp.json() + return data.get("content", data) + except Exception as e: + print(f"llama.cpp server error or invalid response: {e}") + +if __name__ == "__main__": + answer = rag_query("How many CPU core inside the RaspberryPi 4?") +# answer = rag_query("On the Raspberry Pi 4, which GPIOs have a default pull-down (pull low) configuration? Please specify the source and the section of the datasheet where this information can be found.") + print("\n=== RAG Answer ===\n") + print(answer) +``` + +Make sure you are in the Python virtual environment in each terminal. If needed, run: + +```bash +cd ~/rag +source rag-venv/bin/activate +``` + +Run the python script to ask the question, "How many CPU core inside the RaspberryPi 4?". + +```bash +python rag_query_rest.py +``` + +You will receive an answer similar to the following. + +```output +Retrieved sources: +1. cm4-datasheet.txt +2. raspberry-pi-4-datasheet.txt +3. cm4s-datasheet.txt + +=== RAG Answer === + + 4 +The Raspberry Pi 4 has 4 CPU cores. +``` + +The retrieved context referenced three datasheets and produced the correct answer: "4". + +Try a different question. + +Comment out the first question `answer = rag_query("How many CPU core inside the RaspberryPi 4?")` +and uncomment the second question to test a more detailed query. + +Run the script again with the new question. + +```bash +python rag_query_rest.py +``` + +The output is: + +```output +Retrieved sources: +1. cm3-plus-datasheet.txt +2. raspberry-pi-4-datasheet.txt +3. cm4s-datasheet.txt + +=== RAG Answer === + + Low +Step 1: The question asks about the default pull state of GPIO12 on a Raspberry Pi 4. 
+Step 2: To answer this question, we need to refer to the provided table, which lists the default pin pull state and available alternate GPIO functions for the Raspberry Pi 4. +Step 3: Specifically, we are looking for the default pull state of GPIO12. We can find this information in the table by locating the row corresponding to GPIO12. +Step 4: The table shows that GPIO12 has a default pull state of Low. +Step 5: Therefore, the default pull of GPIO12 on a Raspberry Pi 4 is Low. + +Retrieved sources: +1. raspberry-pi-4-datasheet.txt +2. cm4-datasheet.txt +3. cm3-plus-datasheet.txt + +=== RAG Answer === + +The GPIOs with a default pull-down (pull low) configuration are: +- GPIO 9 (SPI0 MISO) +- GPIO 10 (SPI0 MOSI) +- GPIO 11 (SPI0 SCLK) +- GPIO 12 (PWM0) +- GPIO 13 (PWM1) +- GPIO 14 (TXD0) +- GPIO 15 (RXD0) +- GPIO 16 (FL0) +- GPIO 17 (FL1) +- GPIO 19 (PCM FS) + +Source: Table 5: Raspberry Pi 4 GPIO Alternate Functions, section 5.1.2 GPIO Alternate Functions. +``` + +This demonstrates that the RAG system correctly retrieved relevant sources and generated the right answer using both CPU retrieval and GPU inference. + +You can reference the section 5.1.2 on the PDF to verify the result. + +## Observe CPU and GPU utilization + +If you have installed `htop` and `nvtop`, you can observe CPU and GPU utilization. + +If you do not have them, run: + +```bash +sudo apt install -y nvtop htop +``` + +The screenshots below show `nvtop` on the left and `htop` on the right side. + +![image1 CPU–GPU Utilization screenshot](rag_utilization.jpeg) + +From the screenshots, you can see how the Grace CPU and the Blackwell GPU collaborate during RAG execution. + +On the left, the GPU utilization graph shows a clear spike reaching 96%, indicating that the llama.cpp inference engine is actively generating tokens on the GPU. + +Meanwhile, on the right, `htop` shows multiple Python processes running on the Grace CPU cores, maintaining around 93% per-core utilization. 
+ +This demonstrates the hybrid execution model of the RAG pipeline: +- The Grace CPU handles embedding computation, FAISS retrieval, and orchestration of REST API calls. +- The Blackwell GPU performs heavy matrix multiplications for LLM token generation. +- Both operate concurrently within the same Unified Memory space, eliminating data copy overhead between the CPU and GPU. + +You have now connected the components of the RAG pipeline on the GB10 platform. + +With the RAG pipeline now complete, the next section focuses on unified memory. You will learn how the CPU and GPU share data seamlessly within the same memory space. diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_rag/4_rag_memory_observation.md b/content/learning-paths/laptops-and-desktops/dgx_spark_rag/4_rag_memory_observation.md new file mode 100644 index 0000000000..680f665e05 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_rag/4_rag_memory_observation.md @@ -0,0 +1,229 @@ +--- +title: Monitor unified memory performance +weight: 6 +layout: "learningpathall" +--- + +## Observe unified memory performance + +In this section, you will learn how to monitor unified memory performance and GPU utilization on Grace–Blackwell systems during Retrieval-Augmented Generation (RAG) AI workloads. By observing real-time system memory and GPU activity, you will verify zero-copy data sharing and efficient hybrid AI inference enabled by the Grace–Blackwell unified memory architecture. + + +You will start from an idle system state, then progressively launch the RAG model server and run a query, while monitoring both system memory and GPU activity from separate terminals. This hands-on experiment demonstrates how unified memory enables both the Grace CPU and Blackwell GPU to access the same memory space without data movement, optimizing AI inference performance. 
+
+Through these real-time observations, you will verify that the Grace–Blackwell unified memory architecture enables zero-copy data sharing, allowing both processors to access the same memory space without moving data.
+
+Open two terminals on your GB10 system and use them as listed in the table below:
+
+| **Terminal** | **Observation Target** | **Purpose** |
+|----------------------|------------------------|----------------------------------------------------|
+| Monitor Terminal 1 | System memory usage | Observe memory allocation changes as processes run |
+| Monitor Terminal 2 | GPU activity | Track GPU utilization, power draw, and temperature |
+
+You should also have your original terminals open that you used to run the `llama-server` and the RAG queries in the previous section. You will run these again and use the two new terminals for observation.
+
+
+## Prepare for unified memory observation
+
+Ensure the RAG pipeline is stopped before starting the observation.
+
+### Terminal 1: system memory observation
+
+Run the Bash commands below in terminal 1 to print the free memory of the system:
+
+```bash
+while true; do
+  echo -n "$(date '+[%Y-%m-%d %H:%M:%S]') "
+  free -h | grep Mem: | awk '{printf "used=%s free=%s available=%s\n", $3, $4, $7}'
+  sleep 1
+done
+```
+
+The output is similar to the following:
+
+```output
+[2025-11-07 22:34:24] used=3.5Gi free=106Gi available=116Gi
+[2025-11-07 22:34:25] used=3.5Gi free=106Gi available=116Gi
+[2025-11-07 22:34:26] used=3.5Gi free=106Gi available=116Gi
+[2025-11-07 22:34:27] used=3.5Gi free=106Gi available=116Gi
+```
+
+The printed fields are:
+
+- `used` — Total memory currently utilized by all active processes.
+- `free` — Memory not currently allocated or reserved by the system.
+- `available` — Memory immediately available for new processes, accounting for reclaimable cache and buffers. + +### Terminal 2: GPU status observation + +Run the Bash commands below in terminal 2 to print the GPU statistics: + +```bash +stdbuf -oL nvidia-smi --loop-ms=1000 \ + --query-gpu=timestamp,utilization.gpu,utilization.memory,power.draw,temperature.gpu,memory.used \ + --format=csv,noheader,nounits +``` + +The output is similar to the following: + +```output +2025/11/07 22:38:05.114, 0, 0, 4.43, 36, [N/A] +2025/11/07 22:38:06.123, 0, 0, 4.46, 36, [N/A] +2025/11/07 22:38:07.124, 0, 0, 4.51, 36, [N/A] +2025/11/07 22:38:08.124, 0, 0, 4.51, 36, [N/A] +``` + +The format is not easy to read, but following the date and time, there are three key stats being reported: utilization, power, and temperature. The memory-related stats are not used on the GB10 system. + +Here is an explanation of the fields: + +| **Field** | **Description** | **Interpretation** | +|----------------------|---------------------------|-----------------------------------------------------------------------------| +| `timestamp` | Time of data sampling | Used to align GPU metrics with memory log timestamps | +| `utilization.gpu` | GPU compute activity | Peaks during token generation | +| `utilization.memory` | GPU DRAM controller usage | Stays at 0% — Unified Memory bypasses the GDDR controller | +| `power.draw` | GPU power consumption | Rises during inference, falls after completion | +| `temperature.gpu` | GPU temperature (°C) | Slightly increases during workload, confirming GPU activity | +| `memory.used` | GPU VRAM usage | GB10 does not include separate VRAM; all data resides within Unified Memory | + + +## Run the llama-server + +With the idle condition understood, start the `llama.cpp` REST server again in your original terminal, not the two new terminals being used for observation. 
+ +Here is the command: + +```bash +cd ~/llama.cpp/build-gpu/ +./bin/llama-server \ + -m ~/models/Llama-3.1-8B-gguf/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf \ + -ngl 40 --ctx-size 8192 \ + --port 8000 --host 0.0.0.0 +``` + +Observe both monitoring terminals: + +The output in monitor terminal 1 is similar to: + +```output +[2025-11-07 22:50:27] used=3.5Gi free=106Gi available=116Gi +[2025-11-07 22:50:28] used=3.9Gi free=106Gi available=115Gi +[2025-11-07 22:50:29] used=11Gi free=98Gi available=108Gi +[2025-11-07 22:50:30] used=11Gi free=98Gi available=108Gi +[2025-11-07 22:50:31] used=11Gi free=98Gi available=108Gi +[2025-11-07 22:50:32] used=12Gi free=97Gi available=106Gi +[2025-11-07 22:50:33] used=12Gi free=97Gi available=106Gi +``` + +The output in monitor terminal 2 is similar to: + +```output +2025/11/07 22:50:27.836, 0, 0, 4.39, 35, [N/A] +2025/11/07 22:50:28.836, 0, 0, 6.75, 36, [N/A] +2025/11/07 22:50:29.837, 6, 0, 11.47, 36, [N/A] +2025/11/07 22:50:30.837, 7, 0, 11.51, 36, [N/A] +2025/11/07 22:50:31.838, 6, 0, 11.50, 36, [N/A] +2025/11/07 22:50:32.839, 0, 0, 11.90, 36, [N/A] +2025/11/07 22:50:33.840, 0, 0, 10.85, 36, [N/A] +``` + +| **Terminal** | **Observation** | **Behavior** | +|--------------------|------------------------------------------------------|-------------------------------------------------| +| Monitor Terminal 1 | used increases by about 8 GiB | Model weights loaded into shared Unified Memory | +| Monitor Terminal 2 | GPU utilization momentarily spikes and power rises | GPU initialization and model mapping | + + +This confirms the model is resident in unified memory, which is visible by the increased system RAM usage. 
+ + +## Execute the RAG query + +With the observation code and the `llama-server` still running, run the RAG query in another terminal: + +```bash +python3 ~/rag/rag_query_rest.py +``` + +The output in monitor terminal 1 is similar to: + +```output +[2025-11-07 22:53:56] used=12Gi free=97Gi available=106Gi +[2025-11-07 22:53:57] used=12Gi free=97Gi available=106Gi +[2025-11-07 22:53:58] used=12Gi free=97Gi available=106Gi +[2025-11-07 22:53:59] used=13Gi free=96Gi available=106Gi +[2025-11-07 22:54:00] used=13Gi free=96Gi available=106Gi +[2025-11-07 22:54:01] used=13Gi free=96Gi available=106Gi +[2025-11-07 22:54:02] used=13Gi free=96Gi available=106Gi +[2025-11-07 22:54:03] used=13Gi free=96Gi available=106Gi +[2025-11-07 22:54:04] used=13Gi free=96Gi available=106Gi +[2025-11-07 22:54:05] used=13Gi free=96Gi available=106Gi +[2025-11-07 22:54:06] used=13Gi free=96Gi available=106Gi +[2025-11-07 22:54:07] used=13Gi free=96Gi available=106Gi +[2025-11-07 22:54:08] used=13Gi free=96Gi available=106Gi +[2025-11-07 22:54:09] used=13Gi free=96Gi available=106Gi +[2025-11-07 22:54:10] used=12Gi free=97Gi available=106Gi +[2025-11-07 22:54:11] used=12Gi free=97Gi available=106Gi +``` + +The output in monitor terminal 2 is similar to: + +```output +2025/11/07 22:53:56.010, 0, 0, 11.24, 41, [N/A] +2025/11/07 22:53:57.010, 0, 0, 11.22, 41, [N/A] +2025/11/07 22:53:58.011, 0, 0, 11.20, 41, [N/A] +2025/11/07 22:53:59.012, 0, 0, 11.19, 41, [N/A] +2025/11/07 22:54:00.012, 0, 0, 11.33, 41, [N/A] +2025/11/07 22:54:01.013, 0, 0, 11.89, 41, [N/A] +2025/11/07 22:54:02.014, 96, 0, 31.53, 44, [N/A] +2025/11/07 22:54:03.014, 96, 0, 31.93, 45, [N/A] +2025/11/07 22:54:04.015, 96, 0, 31.98, 45, [N/A] +2025/11/07 22:54:05.015, 96, 0, 32.11, 46, [N/A] +2025/11/07 22:54:06.016, 96, 0, 32.01, 46, [N/A] +2025/11/07 22:54:07.016, 96, 0, 32.03, 46, [N/A] +2025/11/07 22:54:08.017, 96, 0, 32.14, 47, [N/A] +2025/11/07 22:54:09.017, 95, 0, 32.17, 47, [N/A] +2025/11/07 22:54:10.018, 0, 0, 28.87, 45, 
[N/A] +2025/11/07 22:54:11.019, 0, 0, 11.83, 44, [N/A] +``` + +| **Timestamp** | **GPU Utilization** | **GPU Power** | **System Memory (used)** | **Interpretation** | +|---------------|---------------------|---------------|--------------------------|-------------------------------------------------------| +| 22:53:58 | 0% | 11 W | 12 Gi | System idle | +| 22:54:02 | 96% | 32 W | 13 Gi | GPU performing generation while CPU handles retrieval | +| 22:54:09 | 96% | 32 W | 13 Gi | Unified Memory data sharing in progress | +| 22:54:10 | 0% | 12 W | 12 Gi | Query completed, temporary buffers released | + + +The GPU executes compute kernels with GPU utilization at 96%, without reading from GDDR or PCIe. + +The `utilization.memory=0` and `memory.used=[N/A]` metrics are clear signs that data sharing, not data copying, is happening. + +## Interpret unified memory behavior + +This experiment confirms the Grace–Blackwell Unified Memory architecture in action: +- The CPU and GPU share the same address space. +- No data transfers occur via PCIe. +- Memory activity remains stable while GPU utilization spikes. + +Data does not move — computation moves to the data. + +The Grace CPU orchestrates retrieval, and the Blackwell GPU performs generation, both operating within the same Unified Memory pool. 
+ +## Summary of unified memory behavior + +| **Observation** | **Unified Memory Explanation** | +|----------------------------------------------------|----------------------------------------------------------| +| Memory increases once (during model loading) | Model weights are stored in shared Unified Memory | +| Slight memory increase during query execution | CPU temporarily stores context; GPU accesses it directly | +| GPU power increases during computation | GPU cores are actively performing inference | +| No duplicated allocation or data transfer observed | Data is successfully shared between the CPU and GPU | + + +Through this experiment, you confirmed that: +- The Grace CPU efficiently handles retrieval, embedding, and orchestration tasks. +- The Blackwell GPU accelerates generation using data directly from Unified Memory. +- The system memory and GPU activity clearly demonstrate zero-copy data sharing. + +This exercise highlights how the Grace–Blackwell architecture simplifies hybrid AI development by reducing complexity and improving efficiency for next-generation Arm-based AI systems. diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_rag/_index.md b/content/learning-paths/laptops-and-desktops/dgx_spark_rag/_index.md new file mode 100644 index 0000000000..fc29aba1a5 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_rag/_index.md @@ -0,0 +1,54 @@ +--- +title: Build a RAG pipeline on Arm-based NVIDIA DGX Spark +minutes_to_complete: 60 + +who_is_this_for: This is an advanced topic for developers who want to build a Retrieval-Augmented Generation (RAG) pipeline on the NVIDIA DGX Spark platform. You'll learn how Arm-based Grace CPUs handle document retrieval and orchestration, while Blackwell GPUs speed up large language model inference using the open-source llama.cpp REST server. This is a great fit if you're interested in combining Arm CPU management with GPU-accelerated AI workloads. 
+ +learning_objectives: + - Describe how a RAG system combines document retrieval and language model generation + - Deploy a hybrid CPU-GPU RAG pipeline on the GB10 platform using open-source tools + - Use the llama.cpp REST Server for GPU-accelerated inference with CPU-managed retrieval + - Build a reproducible RAG application that demonstrates efficient hybrid computing + +prerequisites: + - An NVIDIA DGX Spark system with at least 15 GB of available disk space + +author: Odin Shen + +### Tags +skilllevels: Advanced +subjects: ML +armips: + - Cortex-A +operatingsystems: + - Linux +tools_software_languages: + - Python + - llama.cpp + - Hugging Face + +further_reading: + - resource: + title: Nvidia DGX Spark + link: https://www.nvidia.com/en-gb/products/workstations/dgx-spark/ + type: website + - resource: + title: EdgeXpert from MSI + link: https://ipc.msi.com/product_detail/Industrial-Computer-Box-PC/AI-Supercomputer/EdgeXpert-MS-C931 + type: website + - resource: + title: Nvidia DGX Spark Playbooks + link: https://github.com/NVIDIA/dgx-spark-playbooks + type: documentation + - resource: + title: Unlock quantized LLM performance on Arm-based NVIDIA DGX Spark + link: https://learn.arm.com/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/ + type: Learning Path + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. 
+--- diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_rag/_next-steps.md b/content/learning-paths/laptops-and-desktops/dgx_spark_rag/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_rag/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_rag/rag_utilization.jpeg b/content/learning-paths/laptops-and-desktops/dgx_spark_rag/rag_utilization.jpeg new file mode 100644 index 0000000000..aba6e6d840 Binary files /dev/null and b/content/learning-paths/laptops-and-desktops/dgx_spark_rag/rag_utilization.jpeg differ diff --git a/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/_index.md b/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/_index.md new file mode 100644 index 0000000000..f479de08a5 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/_index.md @@ -0,0 +1,52 @@ +--- +title: Build and test KleidiCV on macOS + +minutes_to_complete: 30 + +who_is_this_for: This is an introductory topic for software developers who want to build and test KleidiCV on macOS. 
+ +learning_objectives: +- Install and compile KleidiCV on macOS +- Run KleidiCV example tests +- Enable Scalable Matrix Extensions (SME) and verify increased SME performance + +prerequisites: +- A Mac with Apple Silicon (M4 generation or newer) +- Xcode command line tools installed +- Basic familiarity with using the Terminal and command-line tools + +author: Jett Zhou + +### Tags +skilllevels: Introductory +subjects: Performance and Architecture +tools_software_languages: + - KleidiCV + - C +armips: + - Cortex-A +operatingsystems: + - macOS + +further_reading: + - resource: + title: KleidiCV documentation + link: https://gitlab.arm.com/kleidi/kleidicv/-/tree/0.6.0/doc?ref_type=tags + type: documentation + - resource: + title: Announcing Arm KleidiCV 0.1 + link: https://developer.arm.com/community/arm-community-blogs/b/ai-blog/posts/kleidicv + type: blog + - resource: + title: Learn about function multiversioning + link: https://learn.arm.com/learning-paths/cross-platform/function-multiversioning/ + type: website + + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. 
+--- diff --git a/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/_next-steps.md b/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/build-1.md b/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/build-1.md new file mode 100644 index 0000000000..14956d052c --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/build-1.md @@ -0,0 +1,218 @@ +--- +title: Download and build KleidiCV software +weight: 2 + +layout: learningpathall +--- + +## Introduction + +Arm KleidiCV is an open-source library that provides fast, optimized routines for Arm CPUs. You can use KleidiCV with any computer vision (CV) framework to boost performance for CV workloads on Arm systems. + +KleidiCV includes multiple optimized implementations for each function, targeting Arm Neon, SVE2 (Scalable Vector Extension 2), and SME2 (Scalable Matrix Extension 2) instruction sets. The library automatically detects your hardware and chooses the fastest available code path, so you don't need to adjust your code for different Arm CPUs. + +You can use KleidiCV as a standalone image processing library or integrate it with OpenCV for broader computer vision support. 
On Apple M4 processors, which use the Armv9.2‑A architecture and support SME, you'll see improved performance for matrix operations. In this Learning Path, you'll build and test KleidiCV to observe how it selects the best backend for your hardware. + +## Set up your environment + +To follow this example you'll need a MacBook Pro with an Apple Silicon M4 processor. + +To check your operating system version, follow these steps: + +- Select the **Apple menu ()** in the top-left corner of your screen +- Select **About This Mac** +- Alternatively, open a terminal and run: + +```console +sw_vers +``` +The output is similar to: + +```output +ProductName: macOS +ProductVersion: 15.5 +BuildVersion: 24F74 +``` +### Install CMake + +If CMake is not already installed on your host machine, you can install it using Homebrew: + +```bash +brew install cmake +``` +To check which Arm architecture features your Mac supports, run the following command in your terminal: + +```bash +sysctl -a | grep hw.optional.arm.FEAT +``` + +Look for `hw.optional.arm.FEAT_SME: 1` in the output. If you see this line, your system supports SME (Scalable Matrix Extension). If the value is `0`, SME is not available on your hardware. 
+ +The output is: + +```output +hw.optional.arm.FEAT_CRC32: 1 +hw.optional.arm.FEAT_FlagM: 1 +hw.optional.arm.FEAT_FlagM2: 1 +hw.optional.arm.FEAT_FHM: 1 +hw.optional.arm.FEAT_DotProd: 1 +hw.optional.arm.FEAT_SHA3: 1 +hw.optional.arm.FEAT_RDM: 1 +hw.optional.arm.FEAT_LSE: 1 +hw.optional.arm.FEAT_SHA256: 1 +hw.optional.arm.FEAT_SHA512: 1 +hw.optional.arm.FEAT_SHA1: 1 +hw.optional.arm.FEAT_AES: 1 +hw.optional.arm.FEAT_PMULL: 1 +hw.optional.arm.FEAT_SPECRES: 0 +hw.optional.arm.FEAT_SPECRES2: 0 +hw.optional.arm.FEAT_SB: 1 +hw.optional.arm.FEAT_FRINTTS: 1 +hw.optional.arm.FEAT_PACIMP: 1 +hw.optional.arm.FEAT_LRCPC: 1 +hw.optional.arm.FEAT_LRCPC2: 1 +hw.optional.arm.FEAT_FCMA: 1 +hw.optional.arm.FEAT_JSCVT: 1 +hw.optional.arm.FEAT_PAuth: 1 +hw.optional.arm.FEAT_PAuth2: 1 +hw.optional.arm.FEAT_FPAC: 1 +hw.optional.arm.FEAT_FPACCOMBINE: 1 +hw.optional.arm.FEAT_DPB: 1 +hw.optional.arm.FEAT_DPB2: 1 +hw.optional.arm.FEAT_BF16: 1 +hw.optional.arm.FEAT_EBF16: 0 +hw.optional.arm.FEAT_I8MM: 1 +hw.optional.arm.FEAT_WFxT: 1 +hw.optional.arm.FEAT_RPRES: 1 +hw.optional.arm.FEAT_CSSC: 0 +hw.optional.arm.FEAT_HBC: 0 +hw.optional.arm.FEAT_ECV: 1 +hw.optional.arm.FEAT_AFP: 1 +hw.optional.arm.FEAT_LSE2: 1 +hw.optional.arm.FEAT_CSV2: 1 +hw.optional.arm.FEAT_CSV3: 1 +hw.optional.arm.FEAT_DIT: 1 +hw.optional.arm.FEAT_FP16: 1 +hw.optional.arm.FEAT_SSBS: 0 +hw.optional.arm.FEAT_BTI: 1 +hw.optional.arm.FEAT_SME: 1 +hw.optional.arm.FEAT_SME2: 1 +hw.optional.arm.FEAT_SME_F64F64: 1 +hw.optional.arm.FEAT_SME_I16I64: 1 +``` + +If your Mac does not have an M4 processor, you won't see the `FEAT_SME` flags set to `1`. In that case, SME (Scalable Matrix Extension) features are not available on your hardware, and KleidiCV will use other optimized code paths instead. 
+ +## Create a workspace + +You can use an environment variable to define your workspace: + +```bash +export WORKSPACE= +``` + +For example, + +```bash +mkdir $HOME/kleidi +export WORKSPACE=$HOME/kleidi +``` + +## Download the software + +To set up KleidiCV and OpenCV, first download the source code from GitLab. + +In your $WORKSPACE directory, clone KleidiCV using the v0.6.0 release tag: + +```bash +cd $WORKSPACE +git clone -b 0.6.0 https://git.gitlab.arm.com/kleidi/kleidicv.git +``` + +Clone the OpenCV repository into $WORKSPACE using the v4.12.0 release tag: + +```bash +cd $WORKSPACE +git clone https://github.com/opencv/opencv.git +cd opencv +git checkout 4.12.0 +``` + +Apply the patch for OpenCV version 4.12: + +```bash +patch -p1 < ../kleidicv/adapters/opencv/opencv-4.12.patch +patch -p1 < ../kleidicv/adapters/opencv/extra_benchmarks/opencv-4.12.patch +``` + +## Build options + +KleidiCV provides several CMake options to control which instruction sets and features are enabled during the build. + +Here are the most important options for Arm systems: + +- KLEIDICV_ENABLE_SVE2 enables Scalable Vector Extension 2 (SVE2) code paths. This is on by default for popular compilers that support SVE2, but off otherwise. +- KLEIDICV_LIMIT_SVE2_TO_SELECTED_ALGORITHMS limits SVE2 code paths to algorithms where SVE2 is expected to outperform other options. This is on by default. It has no effect if SVE2 is disabled. +- KLEIDICV_BENCHMARK enables building KleidiCV benchmarks. The benchmarks use Google Benchmark, which is downloaded automatically. This is off by default. +- KLEIDICV_ENABLE_SME2 enables Scalable Matrix Extension 2 (SME2) and Streaming SVE code paths. This is off by default while the ACLE SME specification is in beta. +- KLEIDICV_LIMIT_SME2_TO_SELECTED_ALGORITHMS limits SME2 code paths to cases where SME2 is expected to provide a benefit. This is on by default. It has no effect if SME2 is disabled. 
+
+You can set these options when running `cmake` to customize your build for your hardware and use case.
+
+KleidiCV automatically selects the fastest available code path for your hardware. If the library detects that SVE2 (Scalable Vector Extension 2) or SME2 (Scalable Matrix Extension 2) is slower than NEON for a specific function, it defaults to NEON—unless you explicitly turn off this behavior by setting `-DKLEIDICV_LIMIT_SVE2_TO_SELECTED_ALGORITHMS=OFF` or `-DKLEIDICV_LIMIT_SME2_TO_SELECTED_ALGORITHMS=OFF`.
+
+## Build the KleidiCV standalone
+
+Use the following command to build KleidiCV natively:
+
+```bash
+cmake -S $WORKSPACE/kleidicv \
+  -B build-kleidicv-benchmark-SME \
+  -DKLEIDICV_ENABLE_SME2=ON \
+  -DKLEIDICV_LIMIT_SME2_TO_SELECTED_ALGORITHMS=OFF \
+  -DKLEIDICV_BENCHMARK=ON \
+  -DCMAKE_BUILD_TYPE=Release
+cmake --build build-kleidicv-benchmark-SME --parallel
+```
+
+Once the build completes, the KleidiCV API and framework tests appear below:
+
+```bash
+ls ./build-kleidicv-benchmark-SME/test/framework/kleidicv-framework-test
+ls ./build-kleidicv-benchmark-SME/test/api/kleidicv-api-test
+```
+
+The KleidiCV benchmark test is available as follows:
+
+```bash
+ls ./build-kleidicv-benchmark-SME/benchmark/kleidicv-benchmark
+```
+
+## Build OpenCV with KleidiCV
+
+You can use the following command to build OpenCV with KleidiCV:
+
+```bash
+cmake -S $WORKSPACE/opencv \
+  -B build-opencv-kleidicv-sme \
+  -DWITH_KLEIDICV=ON \
+  -DKLEIDICV_ENABLE_SME2=ON \
+  -DKLEIDICV_SOURCE_PATH=$WORKSPACE/kleidicv \
+  -DBUILD_LIST=imgproc,core,ts \
+  -DBUILD_SHARED_LIBS=OFF \
+  -DBUILD_TESTS=ON \
+  -DBUILD_PERF_TESTS=ON \
+  -DWITH_PNG=OFF
+cmake --build build-opencv-kleidicv-sme --parallel --target opencv_perf_imgproc opencv_perf_core
+```
+
+Upon completion of the build process, the OpenCV test binaries will be available at the following locations:
+
+```bash
+ls build-opencv-kleidicv-sme/bin/opencv_perf_core
+ls build-opencv-kleidicv-sme/bin/opencv_perf_imgproc
+```
+
+## 
What you've accomplished and what's next + +You've successfully set up your development environment, downloaded the KleidiCV and OpenCV source code, and built both libraries with SME2 support on your Apple Silicon Mac. At this point, you have all the tools you need to explore how KleidiCV optimizes for Arm architectures. + +In the next section, you'll run benchmarks to see SME in action and learn how KleidiCV automatically selects the best code paths for your hardware. This will help you understand the performance benefits of Arm's advanced instruction sets for computer vision workloads. diff --git a/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/run-test-2.md b/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/run-test-2.md new file mode 100644 index 0000000000..4f1327d74a --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/run-test-2.md @@ -0,0 +1,570 @@ +--- +title: Test KleidiCV and verify SME backend support +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Run the test + +Once the build steps are complete, you can run the KleidiCV and OpenCV tests. +The KleidiCV API test checks the public C++ API and confirms that the build is working as expected. To run the test, use the following command: + +```bash +./build-kleidicv-benchmark-SME/test/api/kleidicv-api-test +``` + +You will see output showing the number of tests run and their results. The full test log is omitted here for clarity. + +```bash +./build-kleidicv-benchmark-SME/test/api/kleidicv-api-test +``` + +The output is similar to: + +```output +Vector length is set to 16 bytes. +Seed is set to 2542467924. +[==========] Running 3703 tests from 141 test suites. +[----------] Global test environment set-up. 
+
+[----------] 9 tests from SaturatingAddAbsWithThresholdTest/0, where TypeParam = short
+[ RUN      ] SaturatingAddAbsWithThresholdTest/0.TestPositive
+[       OK ] SaturatingAddAbsWithThresholdTest/0.TestPositive (0 ms)
+[ RUN      ] SaturatingAddAbsWithThresholdTest/0.TestNegative
+[       OK ] SaturatingAddAbsWithThresholdTest/0.TestNegative (0 ms)
+[ RUN      ] SaturatingAddAbsWithThresholdTest/0.TestMin
+[       OK ] SaturatingAddAbsWithThresholdTest/0.TestMin (0 ms)
+[ RUN      ] SaturatingAddAbsWithThresholdTest/0.TestZero
+[       OK ] SaturatingAddAbsWithThresholdTest/0.TestZero (0 ms)
+[ RUN      ] SaturatingAddAbsWithThresholdTest/0.TestMax
+[       OK ] SaturatingAddAbsWithThresholdTest/0.TestMax (0 ms)
+[ RUN      ] SaturatingAddAbsWithThresholdTest/0.NullPointer
+[       OK ] SaturatingAddAbsWithThresholdTest/0.NullPointer (0 ms)
+[ RUN      ] SaturatingAddAbsWithThresholdTest/0.Misalignment
+[       OK ] SaturatingAddAbsWithThresholdTest/0.Misalignment (0 ms)
+[ RUN      ] SaturatingAddAbsWithThresholdTest/0.ZeroImageSize
+[       OK ] SaturatingAddAbsWithThresholdTest/0.ZeroImageSize (0 ms)
+[ RUN      ] SaturatingAddAbsWithThresholdTest/0.OversizeImage
+[       OK ] SaturatingAddAbsWithThresholdTest/0.OversizeImage (0 ms)
+[----------] 9 tests from SaturatingAddAbsWithThresholdTest/0 (0 ms total)
+
+[----------] 4 tests from BitwiseAnd/0, where TypeParam = unsigned char
+[ RUN      ] BitwiseAnd/0.API
+[       OK ] BitwiseAnd/0.API (0 ms)
+[ RUN      ] BitwiseAnd/0.Misalignment
+[       OK ] BitwiseAnd/0.Misalignment (0 ms)
+[ RUN      ] BitwiseAnd/0.ZeroImageSize
+[       OK ] BitwiseAnd/0.ZeroImageSize (0 ms)
+[ RUN      ] BitwiseAnd/0.OversizeImage
+[       OK ] BitwiseAnd/0.OversizeImage (0 ms)
+[----------] 4 tests from BitwiseAnd/0 (0 ms total)
+```
+{{% notice Note %}}
+Currently, Apple Xcode is built on Clang 17. Version clang-1700.3.19.1 has an SME-related code generation bug that causes float `ResizeLinear` API tests to fail.
+{{% /notice %}} + + +## Run the OpenCV test + +After building OpenCV with KleidiCV, you will find the test binaries in the `build-opencv-kleidicv-sme/bin/` directory. The main tool for benchmarking image processing performance is `opencv_perf_imgproc`. This utility measures both execution speed and throughput for the OpenCV `imgproc` module, including KleidiCV-accelerated operations. + +To focus your testing, use the `--gtest_filter` option to select specific tests and `--gtest_param_filter` to set test parameters. For example, you can run the Gaussian blur 5×5 performance test three times on a 1920x1080 grayscale image with replicated borders: + +- Image size: 1920x1080 (Full HD) +- Image type: 8UC1 (8-bit unsigned, single channel) +- Border type: BORDER_REPLICATE + +You can explore additional test cases and parameter combinations in the [benchmarks.txt](https://gitlab.arm.com/kleidi/kleidicv/-/blob/0.6.0/scripts/benchmark/benchmarks.txt?ref_type=tags) file in the KleidiCV repository. 
+
+The command for running the test is as follows:
+
+```bash
+./build-opencv-kleidicv-sme/bin/opencv_perf_imgproc \
+  --gtest_filter='*gaussianBlur5x5/*' \
+  --gtest_param_filter='(1920x1080, 8UC1, BORDER_REPLICATE)' \
+  --gtest_repeat=3
+```
+
+The expected output is:
+
+```output
+[ERROR:0@0.001] global persistence.cpp:566 open Can't open file: 'imgproc.xml' in read mode
+TEST: Skip tests with tags: 'mem_6gb', 'verylong'
+CTEST_FULL_OUTPUT
+OpenCV version: 4.12.0
+OpenCV VCS version: 4.12.0-2-g2eea907534
+Build type: Release
+Compiler: /usr/bin/c++ (ver 17.0.0.17000013)
+Algorithm hint: ALGO_HINT_ACCURATE
+HAL: YES (carotene (ver 0.0.1) KleidiCV (ver 0.6.0))
+Parallel framework: gcd (nthreads=12)
+CPU features: NEON FP16 NEON_DOTPROD NEON_FP16 *NEON_BF16
+OpenCL Platforms:
+    Apple
+        iGPU: Apple M4 Pro (OpenCL 1.2 )
+Current OpenCL device:
+    Type = iGPU
+    Name = Apple M4 Pro
+    Version = OpenCL 1.2 
+    Driver version = 1.2 1.0
+    Address bits = 64
+    Compute units = 16
+    Max work group size = 256
+    Local memory size = 32 KB
+    Max memory allocation size = 3 GB
+    Double support = No
+    Half support = No
+    Host unified memory = Yes
+    Device extensions:
+        cl_APPLE_SetMemObjectDestructor
+        cl_APPLE_ContextLoggingFunctions
+        cl_APPLE_clut
+        cl_APPLE_query_kernel_names
+        cl_APPLE_gl_sharing
+        cl_khr_gl_event
+        cl_khr_byte_addressable_store
+        cl_khr_global_int32_base_atomics
+        cl_khr_global_int32_extended_atomics
+        cl_khr_local_int32_base_atomics
+        cl_khr_local_int32_extended_atomics
+        cl_khr_3d_image_writes
+        cl_khr_image2d_from_buffer
+        cl_khr_depth_images
+    Has AMD Blas = No
+    Has AMD Fft = No
+    Preferred vector width char = 1
+    Preferred vector width short = 1
+    Preferred vector width int = 1
+    Preferred vector width long = 1
+    Preferred vector width float = 1
+    Preferred vector width double = 1
+    Preferred vector width half = 0
+
+Repeating all tests (iteration 1) . . . 
+ +Note: Google Test filter = *gaussianBlur5x5/* +Note: Google Test parameter filter = (1920x1080, 8UC1, BORDER_REPLICATE) +[==========] Running 1 test from 1 test case. +[----------] Global test environment set-up. +[----------] 1 test from Size_MatType_BorderType_gaussianBlur5x5 +[ RUN ] Size_MatType_BorderType_gaussianBlur5x5.gaussianBlur5x5/80, where GetParam() = (1920x1080, 8UC1, BORDER_REPLICATE) +[ PERFSTAT ] (samples=100 mean=0.18 median=0.18 min=0.16 stddev=0.02 (12.7%)) +[ OK ] Size_MatType_BorderType_gaussianBlur5x5.gaussianBlur5x5/80 (22 ms) +[----------] 1 test from Size_MatType_BorderType_gaussianBlur5x5 (22 ms total) + +[----------] Global test environment tear-down +[==========] 1 test from 1 test case ran. (22 ms total) +[ PASSED ] 1 test. + +Repeating all tests (iteration 2) . . . + +Note: Google Test filter = *gaussianBlur5x5/* +Note: Google Test parameter filter = (1920x1080, 8UC1, BORDER_REPLICATE) +[==========] Running 1 test from 1 test case. +[----------] Global test environment set-up. +[----------] 1 test from Size_MatType_BorderType_gaussianBlur5x5 +[ RUN ] Size_MatType_BorderType_gaussianBlur5x5.gaussianBlur5x5/80, where GetParam() = (1920x1080, 8UC1, BORDER_REPLICATE) +[ PERFSTAT ] (samples=100 mean=0.18 median=0.17 min=0.16 stddev=0.04 (23.7%)) +[ OK ] Size_MatType_BorderType_gaussianBlur5x5.gaussianBlur5x5/80 (22 ms) +[----------] 1 test from Size_MatType_BorderType_gaussianBlur5x5 (22 ms total) + +[----------] Global test environment tear-down +[==========] 1 test from 1 test case ran. (22 ms total) +[ PASSED ] 1 test. + +Repeating all tests (iteration 3) . . . + +Note: Google Test filter = *gaussianBlur5x5/* +Note: Google Test parameter filter = (1920x1080, 8UC1, BORDER_REPLICATE) +[==========] Running 1 test from 1 test case. +[----------] Global test environment set-up. 
+[----------] 1 test from Size_MatType_BorderType_gaussianBlur5x5 +[ RUN ] Size_MatType_BorderType_gaussianBlur5x5.gaussianBlur5x5/80, where GetParam() = (1920x1080, 8UC1, BORDER_REPLICATE) +[ PERFSTAT ] (samples=100 mean=0.19 median=0.17 min=0.15 stddev=0.07 (36.1%)) +[ OK ] Size_MatType_BorderType_gaussianBlur5x5.gaussianBlur5x5/80 (23 ms) +[----------] 1 test from Size_MatType_BorderType_gaussianBlur5x5 (23 ms total) + +[----------] Global test environment tear-down +[==========] 1 test from 1 test case ran. (23 ms total) +[ PASSED ] 1 test. +``` + +## Understand KleidiCV multiversion backend support + +The KleidiCV library detects the platform hardware at runtime and selects the backend implementation based on the following priority: + +- SME2 backend implementation +- SME backend implementation +- SVE backend implementation +- NEON backend implementation + +The following code shows how the library resolves which implementation to use: + +```C { line_numbers = "true" } +#define KLEIDICV_MULTIVERSION_C_API(api_name, neon_impl, sve2_impl, sme_impl, \ + sme2_impl) \ + static decltype(neon_impl) api_name##_resolver() { \ + [[maybe_unused]] KLEIDICV_TARGET_NAMESPACE::HwCaps hwcaps = \ + KLEIDICV_TARGET_NAMESPACE::get_hwcaps(); \ + KLEIDICV_SME2_RESOLVE(sme2_impl); \ + KLEIDICV_SME_RESOLVE(sme_impl); \ + KLEIDICV_SVE2_RESOLVE(sve2_impl); \ + return neon_impl; \ + } \ + extern "C" { \ + decltype(neon_impl) api_name = api_name##_resolver(); \ + } +``` +It verifies SME support using the query `hw.optional.arm.FEAT_SME` as follows: + +```C { line_numbers = "true" } +#define KLEIDICV_SME_RESOLVE(sme_impl) \ + if (!std::is_null_pointer_v<decltype(sme_impl)> && \ + KLEIDICV_TARGET_NAMESPACE::query_sysctl("hw.optional.arm.FEAT_SME")) { \ + return sme_impl; \ + } +``` +It verifies SME2 support using the query `hw.optional.arm.FEAT_SME2` as follows: + +```C { line_numbers = "true" } +#define KLEIDICV_SME2_RESOLVE(sme2_impl) \ + if (!std::is_null_pointer_v<decltype(sme2_impl)> && \ + 
KLEIDICV_TARGET_NAMESPACE::query_sysctl("hw.optional.arm.FEAT_SME2")) { \ + return sme2_impl; \ + } +``` + + +## Enable debug information for backend implementation at runtime + +To incorporate dump information for multiversion backend support during runtime testing, update `kleidicv/include/kleidicv/dispatch.h` as outlined below: + +To patch `dispatch.h`, copy the entire code below and paste it in your terminal. It will run the `patch` command to insert the print statements to identify the backend. + +```bash { line_numbers = "true" } +patch -p1 -d "$HOME/kleidi" << 'EOF' +diff --git a/kleidicv/kleidicv/include/kleidicv/dispatch.h b/kleidicv/kleidicv/include/kleidicv/dispatch.h +index cc6ee01..44c98a5 100644 +--- a/kleidicv/kleidicv/include/kleidicv/dispatch.h ++++ b/kleidicv/kleidicv/include/kleidicv/dispatch.h +@@ -1,10 +1,11 @@ +-// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates ++// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates + // + // SPDX-License-Identifier: Apache-2.0 + + #ifndef KLEIDICV_DISPATCH_H + #define KLEIDICV_DISPATCH_H + ++#include <cstdio> + #include "kleidicv/config.h" + + #if KLEIDICV_ENABLE_SME2 || KLEIDICV_ENABLE_SME || KLEIDICV_ENABLE_SVE2 +@@ -33,6 +34,7 @@ static bool query_sysctl(const char* attribute_name) { + #define KLEIDICV_SVE2_RESOLVE(sve2_impl) \ + if (!std::is_null_pointer_v<decltype(sve2_impl)> && \ + KLEIDICV_TARGET_NAMESPACE::query_sysctl("hw.optional.arm.FEAT_SVE2")) { \ ++ printf("kleidicv API:: %s,SVE2 backend. \n", __func__); \ + return sve2_impl; \ + } + #else +@@ -43,6 +45,7 @@ static bool query_sysctl(const char* attribute_name) { + #define KLEIDICV_SME_RESOLVE(sme_impl) \ + if (!std::is_null_pointer_v<decltype(sme_impl)> && \ + KLEIDICV_TARGET_NAMESPACE::query_sysctl("hw.optional.arm.FEAT_SME")) { \ ++ printf("kleidicv API:: %s,SME backend. 
\n", __func__); \ + return sme_impl; \ + } + #else +@@ -53,6 +56,7 @@ static bool query_sysctl(const char* attribute_name) { + #define KLEIDICV_SME2_RESOLVE(sme2_impl) \ + if (!std::is_null_pointer_v && \ + KLEIDICV_TARGET_NAMESPACE::query_sysctl("hw.optional.arm.FEAT_SME2")) { \ ++ printf("kleidicv API:: %s,SME2 backend. \n", __func__); \ + return sme2_impl; \ + } + #else +@@ -67,6 +71,7 @@ static bool query_sysctl(const char* attribute_name) { + KLEIDICV_SME2_RESOLVE(sme2_impl); \ + KLEIDICV_SME_RESOLVE(sme_impl); \ + KLEIDICV_SVE2_RESOLVE(sve2_impl); \ ++ printf("kleidicv API:: %s,NEON backend. \n", __func__); \ + return neon_impl; \ + } \ + extern "C" { \ +EOF +``` + +After making the change, rebuild the benchmark: + +```bash +cmake --build build-kleidicv-benchmark-SME --parallel +``` + +## Extract Neon or SME backend data on a MacBook + +After making the change and rebuilding for testing, you can display the SME backend usage summary as follows: + +```bash +./build-kleidicv-benchmark-SME/benchmark/kleidicv-benchmark +``` + +The output starts by printing the backends followed by the benchmark output: + +```output +kleidicv API:: kleidicv_min_max_u8_resolver,SME backend. +kleidicv API:: kleidicv_min_max_s8_resolver,SME backend. +kleidicv API:: kleidicv_min_max_u16_resolver,SME backend. +kleidicv API:: kleidicv_min_max_s16_resolver,SME backend. +kleidicv API:: kleidicv_min_max_s32_resolver,SME backend. +kleidicv API:: kleidicv_min_max_f32_resolver,SME backend. +kleidicv API:: kleidicv_min_max_loc_u8_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_absdiff_u8_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_absdiff_s8_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_absdiff_u16_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_absdiff_s16_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_absdiff_s32_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_add_abs_with_threshold_s16_resolver,SME backend. 
+kleidicv API:: kleidicv_saturating_add_s8_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_add_u8_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_add_s16_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_add_u16_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_add_s32_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_add_u32_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_add_s64_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_add_u64_resolver,NEON backend. +kleidicv API:: kleidicv_compare_equal_u8_resolver,NEON backend. +kleidicv API:: kleidicv_compare_greater_u8_resolver,NEON backend. +kleidicv API:: kleidicv_exp_f32_resolver,SME backend. +kleidicv API:: kleidicv_in_range_u8_resolver,NEON backend. +kleidicv API:: kleidicv_in_range_f32_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_multiply_u8_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_multiply_s8_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_multiply_u16_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_multiply_s16_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_multiply_s32_resolver,NEON backend. +kleidicv API:: kleidicv_rotate_resolver,NEON backend. +kleidicv API:: kleidicv_scale_u8_resolver,NEON backend. +kleidicv API:: kleidicv_scale_f32_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_sub_s8_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_sub_u8_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_sub_s16_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_sub_u16_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_sub_s32_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_sub_u32_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_sub_s64_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_sub_u64_resolver,NEON backend. +kleidicv API:: kleidicv_sum_f32_resolver,SME backend. 
+kleidicv API:: kleidicv_threshold_binary_u8_resolver,SME backend. +kleidicv API:: kleidicv_transpose_resolver,NEON backend. +kleidicv API:: kleidicv_f32_to_s8_resolver,SME backend. +kleidicv API:: kleidicv_f32_to_u8_resolver,SME backend. +kleidicv API:: kleidicv_s8_to_f32_resolver,SME backend. +kleidicv API:: kleidicv_u8_to_f32_resolver,SME backend. +kleidicv API:: kleidicv_gray_to_rgb_u8_resolver,SME backend. +kleidicv API:: kleidicv_gray_to_rgba_u8_resolver,SME backend. +kleidicv API:: kleidicv_merge_resolver,NEON backend. +kleidicv API:: kleidicv_rgb_to_bgr_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgba_to_bgra_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgb_to_bgra_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgb_to_rgba_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgba_to_bgr_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgba_to_rgb_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgb_to_yuv420_p_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgba_to_yuv420_p_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_bgr_to_yuv420_p_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_bgra_to_yuv420_p_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgb_to_yuv420_sp_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgba_to_yuv420_sp_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_bgr_to_yuv420_sp_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_bgra_to_yuv420_sp_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgb_to_yuv_u8_resolver,SME backend. +kleidicv API:: kleidicv_bgr_to_yuv_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgba_to_yuv_u8_resolver,SME backend. +kleidicv API:: kleidicv_bgra_to_yuv_u8_resolver,SME backend. +kleidicv API:: kleidicv_split_resolver,NEON backend. +kleidicv API:: kleidicv_yuv_p_to_rgb_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_yuv_p_to_bgr_stripe_u8_resolver,SME backend. 
+kleidicv API:: kleidicv_yuv_p_to_rgba_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_yuv_p_to_bgra_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_yuv_sp_to_rgb_u8_resolver,SME backend. +kleidicv API:: kleidicv_yuv_sp_to_bgr_u8_resolver,SME backend. +kleidicv API:: kleidicv_yuv_sp_to_rgba_u8_resolver,SME backend. +kleidicv API:: kleidicv_yuv_sp_to_bgra_u8_resolver,SME backend. +kleidicv API:: kleidicv_yuv_to_rgb_u8_resolver,SME backend. +kleidicv API:: kleidicv_yuv_to_bgr_u8_resolver,SME backend. +kleidicv API:: kleidicv_yuv_to_bgra_u8_resolver,SME backend. +kleidicv API:: kleidicv_yuv_to_rgba_u8_resolver,SME backend. +kleidicv API:: kleidicv_blur_and_downsample_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_gaussian_blur_fixed_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_gaussian_blur_arbitrary_stripe_u8_resolver,NEON backend. +kleidicv API:: kleidicv_median_blur_sorting_network_stripe_s8_resolver,SME backend. +kleidicv API:: kleidicv_median_blur_sorting_network_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_median_blur_sorting_network_stripe_u16_resolver,SME backend. +kleidicv API:: kleidicv_median_blur_sorting_network_stripe_s16_resolver,SME backend. +kleidicv API:: kleidicv_median_blur_sorting_network_stripe_u32_resolver,SME backend. +kleidicv API:: kleidicv_median_blur_sorting_network_stripe_s32_resolver,SME backend. +kleidicv API:: kleidicv_median_blur_sorting_network_stripe_f32_resolver,SME backend. +kleidicv API:: kleidicv_median_blur_small_hist_stripe_u8_resolver,NEON backend. +kleidicv API:: kleidicv_median_blur_large_hist_stripe_u8_resolver,NEON backend. +kleidicv API:: kleidicv_scharr_interleaved_stripe_s16_u8_resolver,SME backend. +kleidicv API:: kleidicv_separable_filter_2d_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_separable_filter_2d_stripe_u16_resolver,SME backend. +kleidicv API:: kleidicv_separable_filter_2d_stripe_s16_resolver,SME backend. 
+kleidicv API:: kleidicv_sobel_3x3_horizontal_stripe_s16_u8_resolver,SME backend. +kleidicv API:: kleidicv_sobel_3x3_vertical_stripe_s16_u8_resolver,SME backend. +kleidicv API:: kleidicv_bitwise_and_resolver,NEON backend. +kleidicv API:: kleidicv_dilate_u8_resolver,SME backend. +kleidicv API:: kleidicv_erode_u8_resolver,SME backend. +kleidicv API:: kleidicv_resize_to_quarter_u8_resolver,SME backend. +kleidicv API:: kleidicv_resize_linear_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_resize_linear_stripe_f32_resolver,SME backend. +kleidicv API:: kleidicv_remap_s16_u8_resolver,NEON backend. +kleidicv API:: kleidicv_remap_s16_u16_resolver,NEON backend. +kleidicv API:: kleidicv_remap_s16point5_u8_resolver,NEON backend. +kleidicv API:: kleidicv_remap_s16point5_u16_resolver,NEON backend. +kleidicv API:: kleidicv_remap_f32_u8_resolver,NEON backend. +kleidicv API:: kleidicv_remap_f32_u16_resolver,NEON backend. +kleidicv API:: kleidicv_warp_perspective_stripe_u8_resolver,NEON backend. +``` +The output is truncated for brevity, but you will see detailed performance metrics for each operation at 1280x720 resolution. Look for lines showing the operation name, sample count, mean and median times, and standard deviation. These results help you compare the performance of different backends and confirm that SME or NEON acceleration is active. + +## Use lldb to check the SME backend implementation + +To perform source-level debugging during the build process, you must change the build type from `Release` to `Debug`, as demonstrated in the following example: + +```bash +cmake -S $WORKSPACE/kleidicv \ + -B build-kleidicv-benchmark-SME \ + -DKLEIDICV_ENABLE_SME2=ON \ + -DKLEIDICV_LIMIT_SME2_TO_SELECTED_ALGORITHMS=OFF \ + -DKLEIDICV_BENCHMARK=ON \ + -DCMAKE_BUILD_TYPE=Debug +cmake --build build-kleidicv-benchmark-SME --parallel +``` + +Use the `lldb` debug tool to set breakpoints during API testing and verify if the SME backend implementation is invoked. 
To view the function call backtrace, run the `bt` command as shown below: + +```bash +lldb ./build-kleidicv-benchmark-SME/test/api/kleidicv-api-test +``` + +The interactions with the `(lldb)` command line are shown below. +Start by entering the following commands in the `lldb` debugger: + +```console +target create "./build-kleidicv-benchmark-SME/test/api/kleidicv-api-test" +b saturating_add_abs_with_threshold +run +``` + +When the program stops at your breakpoint, enter: + +```console +bt +``` + +This command displays the stack trace, showing how the function was called. + +Next, to view the assembly instructions (including SME streaming mode), enter: + +```console +disassemble --frame +``` + +After you finish inspecting the output, exit `lldb` by typing: + +```console +quit +``` + +Note: Your file paths may differ, but the sequence of commands remains the same. Enter each command as shown and review the output at each step. + +```console +target create "./build-kleidicv-benchmark-SME/test/api/kleidicv-api-test" +Current executable set to '$HOME/kleidi/opencv/build-kleidicv-benchmark-SME/test/api/kleidicv-api-test' (arm64). +(lldb) b saturating_add_abs_with_threshold +Breakpoint 1: 2 locations. +(lldb) run +Process 82381 launched: '/Users/Shared/workspace/build-kleidicv-benchmark-SME-debug/test/api/kleidicv-api-test' (arm64) +Vector length is set to 16 bytes. +Seed is set to 3168213869. +[==========] Running 3703 tests from 141 test suites. +[----------] Global test environment set-up. 
+[----------] 9 tests from SaturatingAddAbsWithThresholdTest/0, where TypeParam = short +[ RUN ] SaturatingAddAbsWithThresholdTest/0.TestPositive +Process 82381 stopped +* thread #1, queue = 'com.apple.main-thread', stop reason = breakpoint 1.2 + frame #0: 0x0000000100695554 kleidicv-api-test`kleidicv_error_t kleidicv::sme::saturating_add_abs_with_threshold(src_a=0x0000600002796762, src_a_stride=46, src_b=0x00006000027967f2, src_b_stride=46, dst=0x0000600002796912, dst_stride=46, width=23, height=3, threshold=50) at add_abs_with_threshold_sme.cpp:15:47 + 12 const T *src_b, size_t src_b_stride, T *dst, + 13 size_t dst_stride, size_t width, + 14 size_t height, T threshold) { +-> 15 return saturating_add_abs_with_threshold_sc(src_a, src_a_stride, src_b, + 16 src_b_stride, dst, dst_stride, + 17 width, height, threshold); + 18 } +(lldb) bt +* thread #1, queue = 'com.apple.main-thread', stop reason = breakpoint 1.2 + * frame #0: 0x0000000100695554 kleidicv-api-test`kleidicv_error_t kleidicv::sme::saturating_add_abs_with_threshold(src_a=0x0000600002796762, src_a_stride=46, src_b=0x00006000027967f2, src_b_stride=46, dst=0x0000600002796912, dst_stride=46, width=23, height=3, threshold=50) at add_abs_with_threshold_sme.cpp:15:47 + frame #1: 0x0000000100009930 kleidicv-api-test`SaturatingAddAbsWithThresholdTestBase::call_api(this=0x000000016fdfe670) at test_add_abs_with_threshold.cpp:17:12 + frame #2: 0x00000001000090c8 kleidicv-api-test`OperationTest::test(this=0x000000016fdfe670) at operation.h:90:11 + frame #3: 0x0000000100008870 kleidicv-api-test`SaturatingAddAbsWithThresholdTest_TestPositive_Test::TestBody(this=0x000060000179e270) at test_add_abs_with_threshold.cpp:135:58 + frame #4: 0x00000001008417cc kleidicv-api-test`void testing::internal::HandleSehExceptionsInMethodIfSupported(object=0x000060000179e270, method=0x00000000000000010000000000000020, location="the test body") at gtest.cc:2599:10 + frame #5: 0x0000000100810908 kleidicv-api-test`void 
testing::internal::HandleExceptionsInMethodIfSupported(object=0x000060000179e270, method=0x00000000000000010000000000000020, location="the test body") at gtest.cc:2635:14 + frame #6: 0x0000000100810858 kleidicv-api-test`testing::Test::Run(this=0x000060000179e270) at gtest.cc:2674:5 + frame #7: 0x000000010081163c kleidicv-api-test`testing::TestInfo::Run(this=0x000000011fe04290) at gtest.cc:2853:11 + frame #8: 0x00000001008126bc kleidicv-api-test`testing::TestSuite::Run(this=0x000000011fe049d0) at gtest.cc:3012:30 + frame #9: 0x000000010081fdec kleidicv-api-test`testing::internal::UnitTestImpl::RunAllTests(this=0x000000011fe04780) at gtest.cc:5870:44 + frame #10: 0x0000000100845750 kleidicv-api-test`bool testing::internal::HandleSehExceptionsInMethodIfSupported(object=0x000000011fe04780, method=(kleidicv-api-test`testing::internal::UnitTestImpl::RunAllTests() at gtest.cc:5748), location="auxiliary test code (environments or event listeners)") at gtest.cc:2599:10 + frame #11: 0x000000010081f804 kleidicv-api-test`bool testing::internal::HandleExceptionsInMethodIfSupported(object=0x000000011fe04780, method=(kleidicv-api-test`testing::internal::UnitTestImpl::RunAllTests() at gtest.cc:5748), location="auxiliary test code (environments or event listeners)") at gtest.cc:2635:14 + frame #12: 0x000000010081f6fc kleidicv-api-test`testing::UnitTest::Run(this=0x00000001009c92f0) at gtest.cc:5444:10 + frame #13: 0x00000001004e8600 kleidicv-api-test`RUN_ALL_TESTS() at gtest.h:2293:73 + frame #14: 0x00000001004e83a8 kleidicv-api-test`main(argc=1, argv=0x000000016fdff3b0) at test_main.cpp:82:10 + frame #15: 0x000000019f492b98 dyld`start + 6076 +(lldb) disassemble --frame +kleidicv-api-test`kleidicv::sme::saturating_add_abs_with_threshold: + 0x100695510 <+0>: sub sp, sp, #0xa0 + 0x100695514 <+4>: stp d15, d14, [sp, #0x50] + 0x100695518 <+8>: stp d13, d12, [sp, #0x60] + 0x10069551c <+12>: stp d11, d10, [sp, #0x70] + 0x100695520 <+16>: stp d9, d8, [sp, #0x80] + 0x100695524 <+20>: stp 
x29, x30, [sp, #0x90] + 0x100695528 <+24>: smstart sm + 0x10069552c <+28>: ldrsh w8, [sp, #0xa0] + 0x100695530 <+32>: str x0, [sp, #0x48] + 0x100695534 <+36>: str x1, [sp, #0x40] + 0x100695538 <+40>: str x2, [sp, #0x38] + 0x10069553c <+44>: str x3, [sp, #0x30] + 0x100695540 <+48>: str x4, [sp, #0x28] + 0x100695544 <+52>: str x5, [sp, #0x20] + 0x100695548 <+56>: str x6, [sp, #0x18] + 0x10069554c <+60>: str x7, [sp, #0x10] + 0x100695550 <+64>: strh w8, [sp, #0xe] +-> 0x100695554 <+68>: ldr x0, [sp, #0x48] + 0x100695558 <+72>: ldr x1, [sp, #0x40] + 0x10069555c <+76>: ldr x2, [sp, #0x38] + 0x100695560 <+80>: ldr x3, [sp, #0x30] + 0x100695564 <+84>: ldr x4, [sp, #0x28] + 0x100695568 <+88>: ldr x5, [sp, #0x20] + 0x10069556c <+92>: ldr x6, [sp, #0x18] + 0x100695570 <+96>: ldr x7, [sp, #0x10] + 0x100695574 <+100>: ldrh w8, [sp, #0xe] + 0x100695578 <+104>: mov x9, sp + 0x10069557c <+108>: strh w8, [x9] + 0x100695580 <+112>: bl 0x10087b8d0 ; symbol stub for: kleidicv_error_t kleidicv::sme::saturating_add_abs_with_threshold_sc(short const*, unsigned long, short const*, unsigned long, short*, unsigned long, unsigned long, unsigned long, short) + 0x100695584 <+116>: smstop sm + 0x100695588 <+120>: ldp x29, x30, [sp, #0x90] + 0x10069558c <+124>: ldp d9, d8, [sp, #0x80] + 0x100695590 <+128>: ldp d11, d10, [sp, #0x70] + 0x100695594 <+132>: ldp d13, d12, [sp, #0x60] + 0x100695598 <+136>: ldp d15, d14, [sp, #0x50] + 0x10069559c <+140>: add sp, sp, #0xa0 + 0x1006955a0 <+144>: ret +(lldb) quit +``` + +## Summary + +In this Learning Path, you tested the KleidiCV build and verified its functionality. You ran both the KleidiCV API tests and the OpenCV performance tests. You also explored how KleidiCV's multiversion support works, enabling it to select the optimal backend like SME, SVE, or NEON at runtime. Finally, you learned how to enable debug output and use the `lldb` debugger to confirm that the SME backend is being used and to inspect the assembly code. 
+ diff --git a/content/learning-paths/mobile-graphics-and-gaming/_index.md b/content/learning-paths/mobile-graphics-and-gaming/_index.md index 69c2e17be4..85a5b968cc 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/_index.md @@ -10,13 +10,13 @@ key_ip: maintopic: true operatingsystems_filter: - Android: 32 -- Linux: 31 +- Linux: 32 - macOS: 14 - Windows: 14 subjects_filter: - Gaming: 6 - Graphics: 6 -- ML: 13 +- ML: 14 - Performance and Architecture: 35 subtitle: Optimize Android apps and build faster games using cutting-edge Arm tech title: Mobile, Graphics, and Gaming @@ -26,7 +26,7 @@ tools_software_languages_filter: - Android: 4 - Android NDK: 2 - Android SDK: 1 -- Android Studio: 11 +- Android Studio: 12 - Arm Development Studio: 1 - Arm Mobile Studio: 1 - Arm Performance Studio: 3 @@ -34,24 +34,24 @@ tools_software_languages_filter: - Bazel: 1 - C: 4 - C#: 3 -- C++: 12 +- C++: 13 - CCA: 1 - Clang: 12 -- CMake: 1 -- Coding: 1 +- CMake: 2 - Docker: 1 -- ExecuTorch: 2 +- ExecuTorch: 3 - Frame Advisor: 1 - GCC: 12 - Generative AI: 2 - Godot: 1 - Google Pixel 8: 1 - Google Test: 1 +- Halide: 1 - Hugging Face: 5 - Java: 6 - Jupyter Notebook: 1 -- KleidiAI: 1 -- Kotlin: 7 +- KleidiAI: 2 +- Kotlin: 8 - LiteRT: 1 - LLM: 1 - LLVM: 1 @@ -61,12 +61,12 @@ tools_software_languages_filter: - NEON: 1 - ONNX Runtime: 1 - OpenGL ES: 1 -- Python: 4 +- Python: 5 - PyTorch: 2 - QEMU: 1 - RenderDoc: 1 - RME: 1 -- Runbook: 15 +- Runbook: 14 - Rust: 2 - SDDiskTool: 1 - SVE2: 1 @@ -77,6 +77,6 @@ tools_software_languages_filter: - Visual Studio Code: 1 - Vulkan: 5 - Vulkan SDK: 1 -- XNNPACK: 1 +- XNNPACK: 2 weight: 3 --- diff --git a/content/learning-paths/mobile-graphics-and-gaming/analyze_a_frame_with_frame_advisor/analyze_render_graph.md b/content/learning-paths/mobile-graphics-and-gaming/analyze_a_frame_with_frame_advisor/analyze_render_graph.md index 2a277009d1..afc773038e 100644 --- 
a/content/learning-paths/mobile-graphics-and-gaming/analyze_a_frame_with_frame_advisor/analyze_render_graph.md +++ b/content/learning-paths/mobile-graphics-and-gaming/analyze_a_frame_with_frame_advisor/analyze_render_graph.md @@ -19,6 +19,6 @@ Render passes flow from left to right. The render pass that outputs to the swapc 1. In this example, we can see that some render passes have no consumers at all and that they do not contribute to the final rendered output. - ![Redundant render passes in Frame Advisor's Render Graph alt-text#center](Render_graph_egypt_redundant_rps.png "Figure 4. Redundant render passes") + ![Redundant render passes in Frame Advisor's Render Graph alt-text#center](render_graph_egypt_redundant_rps.webp "Figure 4. Redundant render passes") These render passes could therefore be removed, without affecting the output, saving processing power and bandwidth. diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/_index.md b/content/learning-paths/mobile-graphics-and-gaming/android_halide/_index.md index b351d54846..59000dba25 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/android_halide/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/android_halide/_index.md @@ -1,25 +1,21 @@ --- -title: Halide Essentials From Basics to Android Integration - -draft: true -cascade: - draft: true +title: Build high-performance image processing with Halide on Android minutes_to_complete: 180 -who_is_this_for: This is an introductory topic for software developers interested in learning how to use Halide for image processing. +who_is_this_for: This is an introductory topic for developers interested in learning how to use Halide for image processing. learning_objectives: - - Understand foundational concepts of Halide and set up your development environment. - - Create a basic real-time image processing pipeline using Halide. - - Optimize image processing workflows by applying operation fusion in Halide. 
- - Integrate Halide pipelines into Android applications developed with Kotlin. + - Learn the basics of Halide and set up your development environment + - Build a simple real-time image processing pipeline with Halide + - Make your image processing faster by combining operations in Halide + - Use Halide pipelines in Android apps written with Kotlin prerequisites: - Basic C++ knowledge - Android Studio with Android Emulator -author: Dawid Borycki +author: Éliás Bálint, Dawid Borycki, Steve Suzuki ### Tags skilllevels: Introductory @@ -31,15 +27,20 @@ operatingsystems: - Android tools_software_languages: - Android Studio - - Coding + - Halide + - C++ + - Kotlin + - Android Studio + - CMake + further_reading: - resource: - title: Halide 19.0.0 + title: Halide documentation link: https://halide-lang.org/docs/index.html type: website - resource: - title: Halide GitHub + title: Halide GitHub repository link: https://github.com/halide/Halide type: repository - resource: diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/android.md b/content/learning-paths/mobile-graphics-and-gaming/android_halide/android.md index ba6eb63972..9e9bb96139 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/android_halide/android.md +++ b/content/learning-paths/mobile-graphics-and-gaming/android_halide/android.md @@ -1,34 +1,34 @@ --- # User change -title: "Integrating Halide into an Android (Kotlin) Project" +title: "Integrate Halide into an Android project with Kotlin" weight: 6 layout: "learningpathall" --- -## Objective -In this lesson, we’ll learn how to integrate a high-performance Halide image-processing pipeline into an Android application using Kotlin. +## What you'll build +In this section you'll integrate a high-performance Halide image-processing pipeline into an Android application using Kotlin. 
-## Overview of mobile integration with Halide +## Learn about mobile integration with Halide Android is the world’s most widely-used mobile operating system, powering billions of devices across diverse markets. This vast user base makes Android an ideal target platform for developers aiming to reach a broad audience, particularly in applications requiring sophisticated image and signal processing, such as augmented reality, photography, video editing, and real-time analytics. Kotlin, now the preferred programming language for Android development, combines concise syntax with robust language features, enabling developers to write maintainable, expressive, and safe code. It offers seamless interoperability with existing Java codebases and straightforward integration with native code via JNI, simplifying the development of performant mobile applications. -## Benefits of using Halide on mobile +## Explore the benefits of using Halide on mobile Integrating Halide into Android applications brings several key advantages: -1. Performance. Halide enables significant acceleration of complex image processing algorithms, often surpassing the speed of traditional Java or Kotlin implementations by leveraging optimized code generation. By generating highly optimized native code tailored for ARM CPUs or GPUs, Halide can dramatically increase frame rates and responsiveness, essential for real-time or interactive applications. -2. Efficiency. On mobile devices, resource efficiency translates directly to improved battery life and reduced thermal output. Halide's scheduling strategies (such as operation fusion, tiling, parallelization, and vectorization) minimize unnecessary memory transfers, CPU usage, and GPU overhead. This optimization substantially reduces overall power consumption, extending battery life and enhancing the user experience by preventing overheating. -3. Portability. 
Halide abstracts hardware-specific details, allowing developers to write a single high-level pipeline that easily targets different processor architectures and hardware configurations. Pipelines can seamlessly run on various ARM-based CPUs and GPUs commonly found in Android smartphones and tablets, enabling developers to support a wide range of devices with minimal platform-specific modifications. -4. Custom Algorithm Integration. Halide allows developers to easily integrate their bespoke image-processing algorithms that may not be readily available or optimized in common libraries, providing full flexibility and control over application-specific performance and functionality. +- Performance - Halide enables significant acceleration of complex image processing algorithms, often surpassing the speed of traditional Java or Kotlin implementations by leveraging optimized code generation. By generating highly optimized native code tailored for Arm CPUs or GPUs, Halide can dramatically increase frame rates and responsiveness, essential for real-time or interactive applications. +- Efficiency - on mobile devices, resource efficiency translates directly to improved battery life and reduced thermal output. Halide's scheduling strategies (such as operation fusion, tiling, parallelization, and vectorization) minimize unnecessary memory transfers, CPU usage, and GPU overhead. This optimization substantially reduces overall power consumption, extending battery life and enhancing the user experience by preventing overheating. +- Portability - Halide abstracts hardware-specific details, allowing developers to write a single high-level pipeline that easily targets different processor architectures and hardware configurations. Pipelines can seamlessly run on various Arm-based CPUs and GPUs commonly found in Android smartphones and tablets, enabling developers to support a wide range of devices with minimal platform-specific modifications. 
+- Custom Algorithm Integration - Halide allows developers to easily integrate their bespoke image-processing algorithms that may not be readily available or optimized in common libraries, providing full flexibility and control over application-specific performance and functionality. In short, Halide delivers high-performance image processing without sacrificing portability or efficiency, a balance particularly valuable on resource-constrained mobile devices. -### Android development ecosystem and challenges +### Navigate Android development challenges While Android presents abundant opportunities for developers, the mobile development ecosystem brings its own set of challenges, especially for performance-intensive applications: -1. Limited Hardware Resources. Unlike desktop or server environments, mobile devices have significant constraints on processing power, memory capacity, and battery life. Developers must optimize software meticulously to deliver smooth performance while carefully managing hardware resource consumption. Leveraging tools like Halide allows developers to overcome these constraints by optimizing computational workloads, making resource-intensive tasks feasible on constrained hardware. -2. Cross-Compilation Complexities. Developing native code for Android requires handling multiple hardware architectures (such as armv8-a, ARM64, and sometimes x86/x86_64). Cross-compilation introduces complexities due to different instruction sets, CPU features, and performance characteristics. Managing this complexity involves careful use of the Android NDK, understanding toolchains, and correctly configuring build systems (e.g., Gradle, CMake). Halide helps mitigate these issues by abstracting away many platform-specific optimizations, automatically generating code optimized for target architectures. -3. Image-Format Conversions (Bitmap ↔ Halide Buffer). 
Android typically handles images through the Bitmap class or similar platform-specific constructs, whereas Halide expects image data to be in raw, contiguous buffer formats. Developers must bridge the gap between Android-specific image representations (Bitmaps, YUV images from camera APIs, etc.) and Halide's native buffer format. Proper management of these conversions—including considerations for pixel formats, stride alignment, and memory copying overhead—can significantly impact performance and correctness, necessitating careful design and efficient implementation of buffer-handling routines. +- Limited hardware resources: unlike desktop or server environments, mobile devices have significant constraints on processing power, memory capacity, and battery life. Developers must optimize software meticulously to deliver smooth performance while carefully managing hardware resource consumption. Leveraging tools like Halide allows developers to overcome these constraints by optimizing computational workloads, making resource-intensive tasks feasible on constrained hardware. +- Cross-compilation complexities: developing native code for Android requires handling multiple hardware architectures (such as Armv8-A, ARM64, and sometimes x86/x86_64). Cross-compilation introduces complexities due to different instruction sets, CPU features, and performance characteristics. Managing this complexity involves careful use of the Android NDK, understanding toolchains, and correctly configuring build systems (e.g., Gradle, CMake). Halide helps mitigate these issues by abstracting away many platform-specific optimizations, automatically generating code optimized for target architectures. +- Image format conversions (Bitmap ↔ Halide Buffer). Android typically handles images through the Bitmap class or similar platform-specific constructs, whereas Halide expects image data to be in raw, contiguous buffer formats. 
Developers must bridge the gap between Android-specific image representations (Bitmaps, YUV images from camera APIs, etc.) and Halide's native buffer format. Proper management of these conversions—including considerations for pixel formats, stride alignment, and memory copying overhead—can significantly impact performance and correctness, necessitating careful design and efficient implementation of buffer-handling routines. ## Project requirements Before integrating Halide into your Android application, ensure you have the necessary tools and libraries. @@ -37,11 +37,11 @@ Before integrating Halide into your Android application, ensure you have the nec 1. Android Studio. [Download link](https://developer.android.com/studio). 2. Android NDK (Native Development Kit). Can be easily installed from Android Studio (Tools → SDK Manager → SDK Tools → Android NDK). -## Setting up the Android project -### Creating the project +## Set up the Android project +### Create the project 1. Open Android Studio. 2. Select New Project > Native C++. -![img4](Figures/04.webp) +![Android Studio New Project dialog showing Native C++ template selected. The dialog displays options for project name, language, and minimum SDK. The primary subject is the Native C++ template highlighted in the project creation workflow. The wider environment is a typical Android Studio interface with a neutral, technical tone. Visible text includes Native C++ and fields for configuring the new project.](Figures/04.webp) ### Configure the project 1. Set the project Name to Arm.Halide.AndroidDemo. @@ -152,8 +152,9 @@ dependencies { Click the Sync Now button at the top. To verify that everything is configured correctly, click Build > Make Project in Android Studio. -## UI -Now, you'll define the application's User Interface, consisting of two buttons and an ImageView. One button loads the image, the other processes it, and the ImageView displays both the original and processed images.
+## Define the user interface +Define the application's user interface, consisting of two buttons and an ImageView. One button loads the image, the other processes it, and the ImageView displays both the original and processed images. + 1. Open the res/layout/activity_main.xml file, and modify it as follows: ```XML @@ -204,8 +205,8 @@ Now you can run the app to view the UI: ![img7](Figures/07.webp) -## Processing -You will now implement the image processing code. First, pick up an image you want to process. Here we use the camera man. Then, under the Arm.Halide.AndroidDemo/src/main create assets folder, and save the image under that folder as img.png. +## Implement image processing +Implement the image processing code. First, pick an image you want to process. This example uses the camera man image. Under Arm.Halide.AndroidDemo/src/main, create an assets folder and save the image as img.png. Now, open MainActivity.kt and modify it as follows: ```java @@ -330,13 +331,13 @@ class MainActivity : AppCompatActivity() { } ``` -This Kotlin Android application demonstrates integrating a Halide-generated image-processing pipeline within an Android app. The main activity (MainActivity) manages loading and processing an image stored in the application’s asset folder. +This Kotlin Android application demonstrates integrating a Halide-generated image-processing pipeline within an Android app. The main activity (MainActivity) manages loading and processing an image stored in the application's asset folder. -When the app launches, the Process Image button is disabled. When a user taps Load Image, the app retrieves img.png from its assets directory and displays it within the ImageView, simultaneously enabling the Process Image button for further interaction. +When the app launches, the app disables the Process Image button. 
When you tap Load Image, the app retrieves img.png from its assets directory and displays it within the ImageView, simultaneously enabling the Process Image button for further interaction. Upon pressing the Process Image button, the following sequence occurs: 1. Background Processing. A Kotlin coroutine initiates processing on a background thread, ensuring the application’s UI remains responsive. -2. Conversion to Grayscale. The loaded bitmap image is converted into a grayscale byte array using a simple RGB-average method, preparing it for processing by the native (JNI) layer. +2. Conversion to Grayscale. The loaded bitmap image is converted into a grayscale byte array using a simple RGB (Red-Green-Blue) average method, preparing it for processing by the native (JNI) layer. 3. Native Function Invocation. This grayscale byte array, along with image dimensions, is passed to a native function (blurThresholdImage) defined via JNI. This native function is implemented using the Halide pipeline, performing operations such as blurring and thresholding directly on the image data. 4. Post-processing. After the native function completes, the resulting processed grayscale byte array is converted back into a Bitmap image. 5. UI Update. The coroutine then updates the displayed image (on the main UI thread) with this newly processed bitmap, providing the user immediate visual feedback. @@ -346,11 +347,11 @@ The code defines three utility methods: 2. extractGrayScaleBytes - converts a Bitmap into a grayscale byte array suitable for native processing. 3. createBitmapFromGrayBytes - converts a grayscale byte array back into a Bitmap for display purposes. -Note that performing the grayscale conversion in Halide allows us to exploit operator fusion, further improving performance by avoiding intermediate memory accesses. This could be done as in our examples before (processing-workflow). 
+Note that performing the grayscale conversion in Halide allows you to exploit operator fusion, further improving performance by avoiding intermediate memory accesses. You can do this as shown in the earlier processing-workflow examples. The JNI integration occurs through an external method declaration, blurThresholdImage, loaded via the companion object at app startup. The native library (armhalideandroiddemo) containing this function is compiled separately and integrated into the application (native-lib.cpp). -You will now need to create blurThresholdImage function. To do so, in Android Studio put the cursor above blurThresholdImage function, and then click Create JNI function for blurThresholdImage: +Create the blurThresholdImage function. In Android Studio, put the cursor above blurThresholdImage function, and then select Create JNI function for blurThresholdImage: ![img8](Figures/08.webp) This will generate a new function in the native-lib.cpp: @@ -404,9 +405,9 @@ This C++ function acts as a bridge between Java (Kotlin) and native code. Specif The input Java byte array (input_bytes) is accessed and pinned into native memory via GetByteArrayElements. This provides a direct pointer (inBytes) to the grayscale data sent from Kotlin. The raw grayscale byte data is wrapped into a Halide::Runtime::Buffer object (inputBuffer). This buffer structure is required by the Halide pipeline. An output buffer (outputBuffer) is created with the same dimensions as the input image. This buffer will store the result produced by the Halide pipeline. The native function invokes the Halide-generated AOT function blur_threshold, passing in both the input and output buffers. After processing, a new Java byte array (outputArray) is allocated to hold the processed grayscale data. The processed data from the Halide output buffer is copied into this Java array using SetByteArrayRegion. 
The native input buffer (inBytes) is explicitly released using ReleaseByteArrayElements, specifying JNI_ABORT as no changes were made to the input array. Finally, the processed byte array (outputArray) is returned to Kotlin. -Through this JNI bridge, Kotlin can invoke high-performance native code. You can now re-run the application. Click the Load Image button, and then Process Image. You will see the following results: +Through this JNI bridge, Kotlin can invoke high-performance native code. You can now re-run the application. Select the Load Image button, and then Process Image. You'll see the following results: -![img9](Figures/09.png) +![Android app screenshot showing the Arm Halide Android demo interface. The screen displays two buttons labeled Load Image and Process Image, with the Process Image button enabled. Below the buttons, an ImageView shows a grayscale photo of a camera man standing outdoors, holding a camera and tripod. The environment appears neutral and technical, with no visible emotional tone. The layout is centered and uses a simple vertical arrangement, making the interface easy to navigate for users with visual impairment.](Figures/09.png) ![img10](Figures/10.png) In the above code we created a new jbyteArray and copying the data explicitly, which can result in an additional overhead. To optimize performance by avoiding unnecessary memory copies, you can directly wrap Halide's buffer in a Java-accessible ByteBuffer like so @@ -416,4 +417,4 @@ jobject outputBuffer = env->NewDirectByteBuffer(output.data(), width * height); ``` ## Summary -In this lesson, we’ve successfully integrated a Halide image-processing pipeline into an Android application using Kotlin. We started by setting up an Android project configured for native development with the Android NDK, employing Kotlin as the primary language. We then integrated Halide-generated static libraries and demonstrated their usage through Java Native Interface (JNI), bridging Kotlin and native code.
This equips developers with the skills needed to harness Halide's capabilities for building sophisticated, performant mobile applications on Android. \ No newline at end of file +You've successfully integrated a Halide image-processing pipeline into an Android application using Kotlin. You started by setting up an Android project configured for native development with the Android NDK, using Kotlin as the primary language. You then integrated Halide-generated static libraries and demonstrated their usage through Java Native Interface (JNI), bridging Kotlin and native code. You now have the skills needed to harness Halide's capabilities for building sophisticated, performant mobile applications on Android. \ No newline at end of file diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/aot-and-cross-compilation.md b/content/learning-paths/mobile-graphics-and-gaming/android_halide/aot-and-cross-compilation.md index f4003f1f51..b74495cba1 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/android_halide/aot-and-cross-compilation.md +++ b/content/learning-paths/mobile-graphics-and-gaming/android_halide/aot-and-cross-compilation.md @@ -1,22 +1,25 @@ --- # User change -title: "Ahead-of-time and cross-compilation" +title: "Generate optimized Halide pipelines for Android using ahead-of-time cross-compilation" weight: 5 layout: "learningpathall" --- -## Ahead-of-time and cross-compilation -One of Halide's standout features is the ability to compile image processing pipelines ahead-of-time (AOT), enabling developers to generate optimized binary code on their host machines rather than compiling directly on target devices. This AOT compilation process allows developers to create highly efficient libraries that run effectively across diverse hardware without incurring the runtime overhead associated with just-in-time (JIT) compilation. -Halide also supports robust cross-compilation capabilities. 
Cross-compilation means using the host version of Halide, typically running on a desktop Linux or macOS system—to target different architectures, such as ARM for Android devices. Developers can thus optimize Halide pipelines on their host machine, produce libraries specifically optimized for Android, and integrate them seamlessly into Android applications. The generated pipeline code includes essential optimizations and can embed minimal runtime support, further reducing workload on the target device and ensuring responsiveness and efficiency. +## What you'll build +In this section, you'll leverage the host version of Halide to perform AOT compilation of an image processing pipeline via cross-compilation. The resulting pipeline library is specifically tailored to Android devices (targeting, for instance, arm64-v8a ABI), while the compilation itself occurs entirely on the host system. This approach significantly accelerates development by eliminating the need to build Halide or perform JIT compilation on Android devices. It also guarantees that the resulting binaries are optimized for the intended hardware, streamlining the deployment of high-performance image processing applications on mobile platforms. -## Objective -In this section, we leverage the host version of Halide to perform AOT compilation of an image processing pipeline via cross-compilation. The resulting pipeline library is specifically tailored to Android devices (targeting, for instance, arm64-v8a ABI), while the compilation itself occurs entirely on the host system. This approach significantly accelerates development by eliminating the need to build Halide or perform JIT compilation on Android devices. It also guarantees that the resulting binaries are optimized for the intended hardware, streamlining the deployment of high-performance image processing applications on mobile platforms. 
-## Prepare Pipeline for Android -The procedure implemented in the following code demonstrates how Halide's AOT compilation and cross-compilation features can be utilized to create an optimized image processing pipeline for Android. We will run Halide on our host machine (in this example, macOS) to generate a static library containing the pipeline function, which will later be invoked from an Android device. Below is a step-by-step explanation of this process. +## Learn about ahead-of-time (AOT) and cross-compilation +One of Halide's standout features is the ability to compile image processing pipelines ahead-of-time (AOT), enabling you to generate optimized binary code on your host machine rather than compiling directly on target devices. This AOT compilation process enables you to create highly efficient libraries that run effectively across diverse hardware without incurring the runtime overhead associated with just-in-time (JIT) compilation. + +Halide also supports robust cross-compilation capabilities. Cross-compilation means using the host version of Halide—typically running on a desktop Linux or macOS system—to target different architectures, such as Arm for Android devices. You can optimize Halide pipelines on your host machine, produce libraries specifically optimized for Android, and integrate them seamlessly into Android applications. The generated pipeline code includes essential optimizations and can embed minimal runtime support, further reducing workload on the target device and ensuring responsiveness and efficiency. + + +## Prepare pipeline for Android +The following code demonstrates how to use Halide's AOT compilation and cross-compilation features to create an optimized image processing pipeline for Android. Run Halide on your host machine (in this example, macOS) to generate a static library containing the pipeline function, which you'll later invoke from an Android device. Below is a step-by-step explanation of this process.
Create a new file named blur-android.cpp with the following contents: @@ -85,9 +88,9 @@ int main(int argc, char** argv) { } ``` -In the original implementation constants 128, 255, and 0 were implicitly treated as integers. Here, the threshold value (128) and output values (255, 0) are explicitly cast to uint8_t. This approach removes ambiguity and clearly specifies the types used, ensuring compatibility and clarity. Both approaches result in identical functionality, but explicitly casting helps emphasize the type correctness and may avoid subtle issues during cross-compilation or in certain environments. Additionally, explicit uint8_t casts help avoid implicit promotion to 32-bit integers (and the corresponding narrowings back to 8-bit) in the generated code, reducing redundant cast operations and potential vector widen/narrow overhead—especially on ARM/NEON +In the original implementation constants 128, 255, and 0 were implicitly treated as integers. Here, the threshold value (128) and output values (255, 0) are explicitly cast to uint8_t. This approach removes ambiguity and clearly specifies the types used, ensuring compatibility and clarity. Both approaches result in identical functionality, but explicitly casting helps emphasize the type correctness and may avoid subtle issues during cross-compilation or in certain environments. Additionally, explicit uint8_t casts help avoid implicit promotion to 32-bit integers (and the corresponding narrowings back to 8-bit) in the generated code, reducing redundant cast operations and potential vector widen/narrow overhead—especially on Arm/NEON. -The program takes at least one command-line argument, the output base name used to generate the files (e.g., “blur_threshold_android”). Here, the target architecture is explicitly set within the code to Android ARM64: +The program takes at least one command-line argument, the output base name used to generate the files (for example, "blur_threshold_android"). 
Here, the target architecture is explicitly set within the code to Android ARM64: ```cpp // Configure Halide Target for Android @@ -99,20 +102,20 @@ target.bits = 64; // Enable Halide runtime inclusion in the generated library (needed if not linking Halide runtime separately). target.set_feature(Target::NoRuntime, false); -// Optionally, enable hardware-specific optimizations to improve performance on ARM devices: -// - DotProd: Optimizes matrix multiplication and convolution-like operations on ARM. +// Optionally, enable hardware-specific optimizations to improve performance on Arm devices: +// - DotProd: Optimizes matrix multiplication and convolution-like operations on Arm. // - ARMFp16 (half-precision floating-point operations). ``` Notes: 1. NoRuntime — When set to true, Halide excludes its runtime from the generated code, and you must link the runtime manually during the linking step. When set to false, the Halide runtime is included in the generated library, which simplifies deployment. -2. ARMFp16 — Enables the use of ARM hardware support for half-precision (16-bit) floating-point operations, which can provide faster execution when reduced precision is acceptable. +2. ARMFp16 — Enables the use of Arm hardware support for half-precision (16-bit) floating-point operations, which improves execution speed when reduced precision is acceptable. 3. Why the runtime choice matters - If your app links several AOT-compiled pipelines, ensure there is exactly one Halide runtime at link time: -* Strategy A (cleanest): build all pipelines with NoRuntime ON and link a single standalone Halide runtime once (matching the union of features you need, e.g., Vulkan/OpenCL/Metal or ARM options). +* Strategy A (cleanest): build all pipelines with NoRuntime ON and link a single standalone Halide runtime once (matching the union of features you need, for example, Vulkan/OpenCL/Metal or Arm options). 
* Strategy B: embed the runtime in exactly one pipeline (leave NoRuntime OFF only there); compile all other pipelines with NoRuntime ON. * Mixing more than one runtime can cause duplicate symbols and split global state (e.g., error handlers, device interfaces). -We declare spatial variables (x, y) and an ImageParam named “input” representing the input image data. We use boundary clamping (clamp) to safely handle edge pixels. Then, we apply a 3x3 blur with a reduction domain (RDom). The accumulated sum is divided by 9 (the number of pixels in the neighborhood), producing an average blurred image. Lastly, thresholding is applied, producing a binary output: pixels above a certain brightness threshold (128) become white (255), while others become black (0). +The code declares spatial variables (x, y) and an ImageParam named "input" representing the input image data. Boundary clamping (clamp) safely handles edge pixels. A 3×3 blur with a reduction domain (RDom) is then applied. The accumulated sum is divided by 9 (the number of pixels in the neighborhood), producing an average blurred image. Lastly, thresholding is applied, producing a binary output: pixels above a certain brightness threshold (128) become white (255), while others become black (0). This section intentionally reinforces previous concepts, focusing now primarily on explicitly clarifying integration details, such as type correctness and the handling of runtime features within Halide. @@ -120,9 +123,9 @@ Simple scheduling directives (compute_root) instruct Halide to compute intermedi This strategy can simplify debugging by clearly isolating computational steps and may enhance runtime efficiency by explicitly controlling intermediate storage locations. -By clearly separating algorithm logic from scheduling, developers can easily test and compare different scheduling strategies,such as compute_inline, compute_root, compute_at, and more, without modifying their fundamental algorithmic code. 
This separation significantly accelerates iterative optimization and debugging processes, ultimately yielding better-performing code with minimal overhead. +By clearly separating algorithm logic from scheduling, you can easily test and compare different scheduling strategies, such as compute_inline, compute_root, compute_at, and more, without modifying your fundamental algorithmic code. This separation significantly accelerates iterative optimization and debugging processes, ultimately yielding better-performing code with minimal overhead. -We invoke Halide's AOT compilation function compile_to_static_library, which generates a static library (.a) containing the optimized pipeline and a corresponding header file (.h). +Halide's AOT compilation function compile_to_static_library generates a static library (.a) containing the optimized pipeline and a corresponding header file (.h). ```cpp thresholded.compile_to_static_library( @@ -134,18 +137,18 @@ thresholded.compile_to_static_library( ``` This will produce: -* A static library (blur_threshold_android.a) containing the compiled pipeline. This static library also includes Halide's runtime functions tailored specifically for the targeted architecture (arm-64-android). Thus, no separate Halide runtime needs to be provided on the Android device when linking against this library. +* A static library (blur_threshold_android.a) containing the compiled pipeline. This static library also includes Halide's runtime functions tailored specifically for the targeted architecture (arm-64-android). Thus, no separate Halide runtime needs to be provided on the Android device when linking against this library. * A header file (blur_threshold_android.h) declaring the pipeline function for use in other C++/JNI code. These generated files are then ready to integrate directly into an Android project via JNI, allowing efficient execution of the optimized pipeline on Android devices. The integration process is covered in the next section. 
-Note: JNI (Java Native Interface) is a framework that allows Java (or Kotlin) code running in a Java Virtual Machine (JVM), such as on Android, to interact with native applications and libraries written in languages like C or C++. JNI bridges the managed Java/Kotlin environment and the native, platform-specific implementations. +JNI (Java Native Interface) is a framework that allows Java (or Kotlin) code running in a Java Virtual Machine (JVM), such as on Android, to interact with native applications and libraries written in languages like C or C++. JNI bridges the managed Java/Kotlin environment and the native, platform-specific implementations. -## Compilation instructions +## Compile the pipeline To compile the pipeline-generation program on your host system, use the following commands (replace /path/to/halide with your Halide installation directory): ```console export DYLD_LIBRARY_PATH=/path/to/halide/lib/libHalide.19.dylib -g++ -std=c++17 blud-android.cpp -o blud-android \ +g++ -std=c++17 blur-android.cpp -o blur-android \ -I/path/to/halide/include -L/path/to/halide/lib -lHalide \ $(pkg-config --cflags --libs opencv4) -lpthread -ldl \ -Wl,-rpath,/path/to/halide/lib @@ -160,7 +163,7 @@ This will produce two files: * blur_threshold_android.a: The static library containing your Halide pipeline. * blur_threshold_android.h: The header file needed to invoke the generated pipeline. -We will integrate these files into our Android project in the following section. +You'll integrate these files into the Android project in the following section. ## Summary -In this section, we’ve explored Halide's powerful ahead-of-time (AOT) and cross-compilation capabilities, preparing an optimized image processing pipeline tailored specifically for Android devices. By using the host-based Halide compiler, we’ve generated a static library optimized for ARM64 Android architecture, incorporating safe boundary conditions, neighborhood-based blurring, and thresholding operations. 
This streamlined process allows seamless integration of highly optimized native code into Android applications, ensuring both development efficiency and runtime performance on mobile platforms. \ No newline at end of file +You've explored Halide's powerful ahead-of-time (AOT) and cross-compilation capabilities, preparing an optimized image processing pipeline tailored specifically for Android devices. By using the host-based Halide compiler, you generated a static library optimized for 64-bit Arm Android architecture, incorporating safe boundary conditions, neighborhood-based blurring, and thresholding operations. This streamlined process allows seamless integration of highly optimized native code into Android applications, ensuring both development efficiency and runtime performance on mobile platforms. \ No newline at end of file diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/fusion.md b/content/learning-paths/mobile-graphics-and-gaming/android_halide/fusion.md index f10442403f..84b6ee815e 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/android_halide/fusion.md +++ b/content/learning-paths/mobile-graphics-and-gaming/android_halide/fusion.md @@ -1,19 +1,24 @@ --- # User change -title: "Demonstrating Operation Fusion" +title: "Apply operator fusion in Halide for real-time image processing" weight: 4 layout: "learningpathall" --- -## Objective -In the previous section, you explored parallelization and tiling. Here, you will focus on operator fusion (inlining) in Halide i.e., letting producers be computed directly inside their consumers—versus materializing intermediates with compute_root() or compute_at(). You will learn when fusion reduces memory traffic and when materializing saves recomputation (e.g., for large stencils or multi-use intermediates). 
You will inspect loop nests with print_loop_nest(), switch among schedules (fuse-all, fuse-blur-only, materialize, tile-and-materialize-per-tile) in a live camera pipeline, and measure the impact (ms/FPS/MPix/s). +## What you'll build and learn -This section does not cover loop fusion (the fuse directive). You will focus on operator fusion, which is Halide's default behavior. +You'll explore operator fusion in Halide, where each stage is computed inside its consumer instead of storing intermediate results. This approach reduces memory traffic and improves cache efficiency. You'll also learn when it's better to materialize intermediates using `compute_root()` or `compute_at()`, such as with large filters or when results are reused by multiple stages. By the end, you'll understand how to choose between fusion and materialization for real-time image processing on Arm devices. -## Code -To demonstrate how fusion in Halide works create a new file `camera-capture-fusion.cpp`, and modify it as follows. This code uses a live camera pipeline (BGR → gray → 3×3 blur → threshold), adds a few schedule variants to toggle operator fusion vs. materialization, and print ms / FPS / MPix/s. So you can see the impact immediately. +You'll also use `print_loop_nest()` to see how Halide arranges the computation, switch between different scheduling modes (fuse all, fuse blur only, materialize, tile and materialize per tile) in a live camera pipeline, and measure the impact using ms, FPS, and MPix/s. + +{{% notice Note on scope %}} +This section doesn't cover loop fusion using the `fuse` directive. You'll focus instead on operator fusion, which is Halide's default behavior. +{{% /notice %}} + +## Explore the code +To explore how fusion in Halide works create a new file called `camera-capture-fusion.cpp`, and copy in the code below. 
This code uses a live camera pipeline (BGR → gray → 3×3 blur → threshold), adds a few schedule variants to toggle operator fusion compared to materialization, and prints ms / FPS / MPix/s, so you'll be able to see the impact immediately: ```cpp #include "Halide.h" @@ -47,11 +52,11 @@ static const char* schedule_name(Schedule s) { } // Build the BGR->Gray -> 3x3 binomial blur -> threshold pipeline. -// We clamp the *ImageParam* at the borders (Func clamp of ImageParam works in Halide 19). +// Clamp the *ImageParam* at the borders (Func clamp of ImageParam works in Halide 19). Pipeline make_pipeline(ImageParam& input, Schedule schedule) { Var x("x"), y("y"); - // Assume 3-channel BGR interleaved frames (we convert if needed). + // Assume 3-channel BGR interleaved frames (converted if needed). input.dim(0).set_stride(3); // x-stride = channels input.dim(2).set_stride(1); // c-stride = 1 input.dim(2).set_bounds(0, 3); // three channels @@ -81,7 +86,7 @@ Pipeline make_pipeline(ImageParam& input, Schedule schedule) { // Final output Func output("output"); output(x, y) = thresholded(x, y); - output.compute_root(); // we always realize 'output' + output.compute_root(); // always realize 'output' // Scheduling to demonstrate OPERATOR FUSION vs MATERIALIZATION // Default in Halide = fusion/inlining (no schedule on producers). @@ -232,12 +237,17 @@ int main(int argc, char** argv) { return 0; } ``` +The heart of this program is the `make_pipeline` function. This function builds the camera processing pipeline in Halide and lets you switch between different scheduling modes. Each mode changes how intermediate results are handled, by either fusing stages together to minimize memory use, or materializing them to avoid recomputation. By adjusting the schedule, you can see how these choices affect both the loop structure and the real-time performance of your image processing pipeline. + +Start by declaring `Var x, y` to represent pixel coordinates.
The camera frames use a 3-channel interleaved BGR format. This means: -The main part of this program is the `make_pipeline` function. It defines the camera processing pipeline in Halide and applies different scheduling choices depending on which mode we select. +- The stride along the x-axis is 3, because each step moves across all three color channels. +- The stride along the channel axis (c) is 1, so channels are stored contiguously. +- The channel bounds are set from 0 to 2, covering the three BGR channels. -You start by declaring Var x, y as our pixel coordinates. Similarly as before, the camera frames come in as 3-channel interleaved BGR, you will tell Halide how the data is laid out: the stride along x is 3 (one step moves across all three channels), the stride along c (channels) is 1, and the bounds on the channel dimension are 0–2. +These settings tell Halide exactly how the image data is organized in memory, so it can process each pixel and channel correctly. -Because you don’t want to worry about array bounds when applying filters, you will clamp the input at the borders. In Halide 19, BoundaryConditions::repeat_edge works cleanly when applied to an ImageParam, since it has .dim() information. This way, all downstream stages can assume safe access even at the edges of the image. +To avoid errors when applying filters near the edges of an image, clamp the input at the borders. In Halide 19, you can use `BoundaryConditions::repeat_edge` directly on an `ImageParam`, because it includes dimension information. This ensures that all stages in your pipeline can safely access pixels, even at the image boundaries. ```cpp Pipeline make_pipeline(ImageParam& input, Schedule schedule) { @@ -251,16 +261,38 @@ Pipeline make_pipeline(ImageParam& input, Schedule schedule) { // (b) Border handling: clamp the *ImageParam* (works cleanly in Halide 19) Func inputClamped = BoundaryConditions::repeat_edge(input); ``` +The next stage converts the image to grayscale. 
Use the Rec.601 weights for BGR to gray conversion, just like in the previous section. For the blur, apply a 3×3 binomial kernel with values: + +``` +1 2 1 +2 4 2 +1 2 1 +``` -Next comes the gray conversion. As in previous section, you will use Rec.601 weights a 3×3 binomial blur. Instead of using a reduction domain (RDom), you unroll the sum in C++ host code with a pair of loops over the kernel. The kernel values {1, 2, 1; 2, 4, 2; 1, 2, 1} approximate a Gaussian filter. Each pixel of blur is simply the weighted sum of its 3×3 neighborhood, divided by 16. +This kernel closely approximates a Gaussian filter. Instead of using Halide's reduction domain (`RDom`), unroll the sum directly in C++ using two nested loops over the kernel values. For each pixel, calculate the weighted sum of its 3×3 neighborhood and divide by 16 to get the blurred result. This approach makes the computation straightforward and easy to follow. +Now, add a threshold stage to your pipeline. This stage checks each pixel value after the blur and sets it to white (255) if it's above 128, or black (0) otherwise. This produces a binary image, making it easy to see which areas are brighter than the threshold. -You will then add a threshold stage. Pixels above 128 become white, and all others black, producing a binary image. Finally, define an output Func that wraps the thresholded result and call compute_root() on it so that it will be realized explicitly when you run the pipeline. +Here's how you define the thresholded stage and the output Func: + +```cpp +// Threshold (binary) +Func thresholded("thresholded"); +Expr T = cast(128); +thresholded(x, y) = select(blur(x, y) > T, cast(255), cast(0)); + +// Final output +Func output("output"); +output(x, y) = thresholded(x, y); +output.compute_root(); // Realize 'output' explicitly when running the pipeline +``` + +This setup ensures that the output is a binary image, and Halide will compute and store the result when you run the pipeline. 
By calling `compute_root()` on the output Func, you tell Halide to materialize the final result, making it available for display or further processing. Now comes the interesting part: the scheduling choices. Depending on the Schedule enum passed in, you instruct Halide to either fuse everything (the default), materialize some intermediates, or even tile the output. - * Simple: Here you will explicitly compute and store both gray and blur across the whole frame with compute_root(). This makes them easy to reuse or parallelize, but requires extra memory traffic. + * Simple: Here you'll explicitly compute and store both gray and blur across the whole frame with compute_root(). This makes them easy to reuse or parallelize, but requires extra memory traffic. * FuseBlurAndThreshold: You compute gray once as a planar buffer, but leave blur and thresholded fused into output. This often works well when the input is interleaved, because subsequent stages read from a planar gray. - * FuseAll: You will apply no scheduling to producers, so gray, blur, and thresholded are all inlined into output. This minimizes memory usage but can recompute gray many times inside the 3×3 stencil. - * Tile: You will split the output into 64×64 tiles. Within each tile, we materialize gray (compute_at(output, xo)), so the working set is small and stays in cache. blur remains fused within each tile. + * FuseAll: You'll apply no scheduling to producers, so gray, blur, and thresholded are all inlined into output. This minimizes memory usage but can recompute gray many times inside the 3×3 stencil. + * Tile: You'll split the output into 64×64 tiles. Within each tile, you materialize gray (compute_at(output, xo)), so the working set is small and stays in cache. blur remains fused within each tile. To help you examine what’s happening, print the loop nest Halide generates for each schedule using print_loop_nest(). 
This will give you a clear view of how fusion or materialization changes the structure of the computation. @@ -299,7 +331,7 @@ return Pipeline(output); } ``` -All the camera handling is just like before: you open the default webcam with OpenCV, normalize frames to 3-channel BGR if needed, wrap each frame as an interleaved Halide buffer, run the pipeline, and show the result. You will still time only the realize() call and print ms / FPS / MPix/s, with the first frame marked as [warm-up]. +All the camera handling is just like before: open the default webcam with OpenCV, normalize frames to 3-channel BGR if needed, wrap each frame as an interleaved Halide buffer, run the pipeline, and show the result. Time only the realize() call and print ms / FPS / MPix/s, with the first frame marked as [warm-up]. The new part is that you can toggle scheduling modes from the keyboard while the application is running: 1. Keys: @@ -310,9 +342,9 @@ The new part is that you can toggle scheduling modes from the keyboard while the * q / Esc – quit Under the hood, pressing 0–3 triggers a rebuild of the Halide pipeline with the chosen schedule: -1. You map the key to a Schedule enum value. -2. You call make_pipeline(input, next) to construct the new scheduled pipeline. -3. You reset the warm-up flag, so the next line of stats is labeled [warm-up] (that frame includes JIT). +1. Map the key to a Schedule enum value. +2. Call make_pipeline(input, next) to construct the new scheduled pipeline. +3. Reset the warm-up flag, so the next line of stats is labeled [warm-up] (that frame includes JIT). 4. The main loop keeps grabbing frames; only the Halide schedule changes. This live switching makes fusion tangible: you can watch the loop nest printout change, see the visualization update, and compare throughput numbers in real time as you move between Simple, FuseBlurAndThreshold, FuseAll, and Tile. 
@@ -326,7 +358,7 @@ g++ -std=c++17 camera-capture-fusion.cpp -o camera-capture-fusion \ ./camera-capture-fusion ``` -You will see the following output: +You'll see the following output: ```output % ./camera-capture-fusion Starting with schedule: FuseAll (press 0..3 to switch; q/Esc to quit) @@ -399,7 +431,7 @@ Simple | 6.01 ms | 166.44 FPS | 345.12 MPix/s15 MPix/s ``` The console output combines two kinds of information: -1. Loop nests – printed by print_loop_nest(). These show how Halide actually arranges the computation for the chosen schedule. They are a great “x-ray” view of fusion and materialization: +1. Loop nests – printed by print_loop_nest(). These show how Halide actually arranges the computation for the chosen schedule. They're a great "x-ray" view of fusion and materialization: * In FuseAll, the loop nest contains only output. That’s because gray, blur, and thresholded are all inlined (fused) into it. Each pixel of output recomputes its 3×3 neighborhood of gray. * In FuseBlurAndThreshold, there is an extra loop for gray, because we explicitly called gray.compute_root(). The blur and thresholded stages are still fused into output. This reduces recomputation of gray and makes downstream loops simpler to vectorize. * In Simple, both gray and blur have their own loop nests, and thresholded fuses into output. This introduces two extra buffers, but each stage is computed once and can be parallelized independently. @@ -411,7 +443,7 @@ The console output combines two kinds of information: Comparing the numbers: * FuseAll runs at ~53 FPS. It has minimal memory traffic but pays for recomputation of gray under the blur. -* FuseBlurAndThreshold jumps to over 200 FPS. By materializing gray, we avoid redundant recomputation and allow blur+threshold to stay fused. This is often the sweet spot for interleaved camera input. +* FuseBlurAndThreshold jumps to over 200 FPS. By materializing gray, redundant recomputation is avoided and blur+threshold stays fused. 
This is often the sweet spot for interleaved camera input. * Simple reaches ~166 FPS. Both gray and blur are materialized, so no recomputation occurs, but memory traffic is higher than in FuseBlurAndThreshold. * Tile achieves similar speed (~200 FPS). Producing gray per tile balances recomputation and memory traffic by keeping intermediates local to cache. @@ -422,8 +454,8 @@ By toggling schedules live, you can see and measure how operator fusion and mate This demo makes these trade-offs concrete: the loop nest diagrams explain the structure, and the live FPS/MPix/s stats show the real performance impact. -## What “fusion” means in Halide -One of Halide's defining features is that, by default, it performs operator fusion, also called inlining. This means that if a stage produces some intermediate values, those values aren’t stored in a separate buffer and then re-read later—instead, the stage is computed directly inside the consumer’s loop. In other words, unless you tell Halide otherwise, every producer Func is fused into the next stage that uses it. +## What "fusion" means in Halide +Halide's defining feature is that, by default, it performs operator fusion, also called inlining. This means that if a stage produces some intermediate values, those values aren't stored in a separate buffer and then re-read later—instead, the stage is computed directly inside the consumer's loop. In other words, unless you tell Halide otherwise, every producer Func is fused into the next stage that uses it. Why is this important? Fusion reduces memory traffic, because Halide doesn’t need to write intermediates out to RAM and read them back again. On CPUs, where memory bandwidth is often the bottleneck, this can be a major performance win. Fusion also improves cache locality, since values are computed exactly where they are needed and the working set stays small. 
The trade-off, however, is that fusion can cause recomputation: if a consumer uses a neighborhood (like a blur that reads 3×3 or 9×9 pixels), the fused producer may be recalculated multiple times for overlapping regions. Whether fusion is faster depends on the balance between compute cost and memory traffic. @@ -442,27 +474,27 @@ for y: for x: gray(x,y) = ... // write one planar gray image for y: for x: out(x,y) = threshold( sum kernel * gray(x+i,y+j) ) ``` -The fused version eliminates buffer writes but recomputes gray under the blur stencil. The materialized version performs more memory operations but avoids recomputation, and also gives us a clean point to parallelize or vectorize the gray stage. +The fused version eliminates buffer writes but recomputes gray under the blur stencil. The materialized version performs more memory operations but avoids recomputation, and also provides a clean point to parallelize or vectorize the gray stage. -It’s worth noting that Halide also supports a loop fusion directive (fuse) that merges two loop variables together. That’s a different concept and not our focus here. In this tutorial, we’re talking specifically about operator fusion—the decision of whether to inline or materialize stages. +Note that Halide also supports a loop fusion directive (fuse) that merges two loop variables together. That's a different concept and not the focus here. This tutorial focuses specifically on operator fusion—the decision of whether to inline or materialize stages. ## How this looks in the live camera demo -Our pipeline is: BGR input → gray → 3×3 blur → thresholded → output. Depending on the schedule, we see different kinds of fusion: +The pipeline is: BGR input → gray → 3×3 blur → thresholded → output. Depending on the schedule, different kinds of fusion are shown: * FuseAll. No schedules on producers. gray, blur, and thresholded are all inlined into output. This minimizes memory traffic but recomputes gray repeatedly inside the 3×3 blur. 
-* FuseBlurAndThreshold: We add gray.compute_root(), materializing gray once as a planar buffer. This avoids recomputation of gray and makes downstream blur and thresholded vectorize better. blur and thresholded remain fused. +* FuseBlurAndThreshold: Adding gray.compute_root() materializes gray once as a planar buffer. This avoids recomputation of gray and makes downstream blur and thresholded vectorize better. blur and thresholded remain fused. * Simple. Both gray and blur are materialized across the frame. This avoids recomputation entirely but increases memory traffic. -* Tile. We split the output into 64×64 tiles and compute gray per tile (compute_at(output, xo)). This keeps intermediate results local to cache while still fusing blur inside each tile. +* Tile. The output is split into 64×64 tiles and gray is computed per tile (compute_at(output, xo)). This keeps intermediate results local to cache while still fusing blur inside each tile. By toggling between these modes in the live demo, you can see how the loop nests and throughput numbers change, which makes the abstract idea of fusion much more concrete. ## When to use operator fusion -Fusion is Halide's default and usually the right place to start. It’s especially effective for: +Fusion is Halide's default and usually the right place to start. It's especially effective for: * Element-wise chains, where each pixel is transformed independently: examples include intensity scaling or offset, gamma correction, channel mixing, color-space conversions, and logical masking. * Cheap post-ops after spatial filters: -for instance, there’s no reason to materialize a blurred image just to threshold it. Fuse the threshold directly into the blur’s consumer. +for instance, there's no reason to materialize a blurred image to threshold it. Fuse the threshold directly into the blur's consumer. -In our code, FuseAll inlines gray, blur, and thresholded into output. 
FuseBlurAndThreshold materializes only gray, then keeps blur and thresholded fused—a common middle ground that balances memory use and compute reuse. +In the code, FuseAll inlines gray, blur, and thresholded into output. FuseBlurAndThreshold materializes only gray, then keeps blur and thresholded fused—a common middle ground that balances memory use and compute reuse. ## When to materialize instead of fuse Fusion isn’t always best. You’ll want to materialize an intermediate (compute_root() or compute_at()) if: @@ -471,8 +503,9 @@ Fusion isn’t always best. You’ll want to materialize an intermediate (comput * The intermediate is reused by multiple consumers. * You need a natural stage to apply parallelization or tiling. -### Profiling -The fastest way to check whether fusion helps is to measure it. Our demo prints timing and throughput per frame, but Halide also includes a built-in profiler that reports per-stage runtimes. To learn how to enable and interpret the profiler, see the official [Halide profiling tutorial](https://halide-lang.org/tutorials/tutorial_lesson_21_auto_scheduler_generate.html#profiling). +## Profiling +The fastest way to check whether fusion helps is to measure it. The demo prints timing and throughput per frame, but Halide also includes a built-in profiler that reports per-stage runtimes. To learn how to enable and interpret the profiler, see the official [Halide profiling tutorial](https://halide-lang.org/tutorials/tutorial_lesson_21_auto_scheduler_generate.html#profiling). ## Summary -In this section, you have learned about operator fusion in Halide—a powerful technique for reducing memory bandwidth and improving computational efficiency. You explored why fusion matters, looked at scenarios where it is most effective, and saw how Halide's scheduling constructs such as compute_root() and compute_at() let us control whether stages are fused or materialized. 
By experimenting with different schedules, including fusing the Gaussian blur and thresholding stages, we observed how fusion can significantly improve the performance of a real-time image processing pipeline + +You've seen how operator fusion in Halide can make your image processing pipeline faster and more efficient. Fusion means Halide computes each stage directly inside its consumer, reducing memory traffic and keeping data in cache. You learned when fusion is best—like for simple pixel operations or cheap post-processing—and when materializing intermediates with `compute_root()` or `compute_at()` can help, especially for large stencils or multi-use buffers. By switching schedules in the live demo, you saw how fusion and materialization affect both the loop structure and real-time performance. Now you know how to choose the right approach for your own Arm-based image processing tasks. diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/intro.md b/content/learning-paths/mobile-graphics-and-gaming/android_halide/intro.md index 4670270833..e2535f65a6 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/android_halide/intro.md +++ b/content/learning-paths/mobile-graphics-and-gaming/android_halide/intro.md @@ -1,32 +1,41 @@ --- # User change -title: "Background and Installation" +title: "Install and configure Halide for Arm development" weight: 2 layout: "learningpathall" --- -## Introduction -Halide is a powerful, open-source programming language specifically designed to simplify and optimize high-performance image and signal processing pipelines. Initially developed by researchers at MIT and Adobe in 2012, Halide addresses a critical challenge in computational imaging: efficiently mapping image-processing algorithms onto diverse hardware architectures without extensive manual tuning. 
It accomplishes this by clearly separating the description of an algorithm (specifying the mathematical or logical transformations applied to images or signals) from its schedule (detailing how and where those computations execute). This design enables rapid experimentation and effective optimization for various processing platforms, including CPUs, GPUs, and mobile hardware. +## What is Halide? -A key advantage of Halide lies in its innovative programming model. By clearly distinguishing between algorithmic logic and scheduling decisions—such as parallelism, vectorization, memory management, and hardware-specific optimizations, developers can first focus on ensuring the correctness of their algorithms. Performance tuning can then be handled independently, significantly accelerating development cycles. This approach often yields performance that matches or even surpasses manually optimized code. As a result, Halide has seen widespread adoption across industry and academia, powering image processing systems at organizations such as Google, Adobe, and Facebook, and enabling advanced computational photography features used by millions daily. +Halide is a powerful, open-source programming language designed to simplify and optimize high-performance image and signal processing. In 2012, researchers at MIT and Adobe developed Halide to efficiently run image-processing algorithms on different hardware architectures without extensive manual tuning. -In this learning path, you will explore Halide's foundational concepts, set up your development environment, and create your first functional Halide application. By the end, you will understand what makes Halide uniquely suited to efficient image processing, particularly on mobile and Arm-based hardware, and be ready to build your own optimized pipelines. +Halide makes it easy to write correct image-processing code by separating what your program does from how it runs. 
You first describe the algorithm, which is the steps to process each pixel, without needing to worry about performance details. You can then later choose scheduling strategies like parallelism, vectorization, and memory management to optimize for your hardware, including Arm processors. This approach helps you focus on getting the right results before tuning for speed, often matching or beating hand-optimized code. -For broader or more general use cases, please refer to the official Halide documentation and tutorials available at [halide-lang.org](https://halide-lang.org). +In this Learning Path, you'll explore Halide's foundational concepts, set up your development environment, and create your first functional Halide application. By the end, you'll understand what makes Halide uniquely suited to efficient image processing, particularly on mobile and Arm-based hardware, and be ready to build your own optimized pipelines. -The example code for this Learning Path is available in two repositories [here](https://github.com/dawidborycki/Arm.Halide.Hello-World.git) and [here](https://github.com/dawidborycki/Arm.Halide.AndroidDemo.git) +For broader use cases, see the official Halide documentation and tutorials on [the Halide website](https://halide-lang.org). + +You can find the example code for this Learning Path in two GitHub repositories: [Arm.Halide.Hello-World GitHub repository](https://github.com/dawidborycki/Arm.Halide.Hello-World.git) and [Arm.Halide.AndroidDemo GitHub repository](https://github.com/dawidborycki/Arm.Halide.AndroidDemo.git). ## Key concepts in Halide -### Separation of algorithm and schedule -At the core of Halide's design philosophy is the principle of clearly separating algorithms from schedules. Traditional image-processing programming tightly couples algorithmic logic with execution strategy, complicating optimization and portability. 
In contrast, Halide explicitly distinguishes these two components: - * Algorithm: Defines what computations are performed—for example, image filters, pixel transformations, or other mathematical operations on image data. - * Schedule: Specifies how and where these computations are executed, addressing critical details such as parallel execution, memory usage, caching strategies, and hardware-specific optimizations. -This separation allows developers to rapidly experiment and optimize their code for different hardware architectures or performance requirements without altering the core algorithmic logic. +Before you build your first Halide application, get familiar with the key ideas that make Halide powerful for image processing. Halide separates the steps of what your code does (the algorithm) from how it runs (the schedule). You'll use symbolic building blocks to describe image operations, then apply scheduling strategies to optimize performance for Arm processors. Understanding these concepts helps you write code that's both correct and fast. These concepts work together to enable high-performance code that's both readable and portable across different hardware architectures, including Arm processors. + +## Separate algorithm from schedule for optimal performance + +Halide's core design principle separates algorithms from schedules. Traditional image-processing code tightly couples algorithmic logic with execution strategy, complicating optimization and portability. + +- The algorithm defines what computations are performed, such as image filters, pixel transformations, or mathematical operations on image data. -Halide provides three key building blocks, including Functions, Vars, and Pipelines, to simplify and structure image processing algorithms. Consider the following illustrative example: +- The schedule specifies how and where these computations execute, including parallel execution, memory usage, caching strategies, and hardware-specific optimizations. 
+ +This separation enables you to experiment and optimize code for different hardware architectures without changing the core algorithmic logic. + +## Discover Halide building blocks + +Halide provides three key building blocks to structure image processing algorithms, as shown below: ```cpp Halide::Var x("x"), y("y"), c("c"); @@ -36,42 +45,60 @@ Halide::Func brighter("brighter"); brighter(x, y, c) = Halide::cast(Halide::min(input(x, y, c) + 50, 255)); ``` -Functions (Func) represent individual computational steps or image operations. Each Func encapsulates an expression applied to pixels, allowing concise definition of complex image processing tasks. Vars symbolically represent spatial coordinates or dimensions (e.g., horizontal x, vertical y, color channel c). They specify where computations are applied in the image data Pipelines are formed by interconnecting multiple Func objects, structuring a clear workflow where the output of one stage feeds into subsequent stages, enabling modular and structured image processing. +- Functions (`Func`) represent individual computational steps or image operations. Each Func encapsulates an expression applied to pixels, enabling concise definition of complex tasks. -Halide is a domain-specific language (DSL) tailored explicitly for image and signal processing tasks. It provides a concise set of predefined operations and building blocks optimized for expressing complex image processing pipelines. By abstracting common computational patterns into simple yet powerful operators, Halide allows developers to succinctly define their processing logic, facilitating readability, maintainability, and easy optimization for various hardware targets. +Halide is a domain-specific language (DSL) tailored for image and signal processing. It provides predefined operations and building blocks optimized for expressing complex pipelines. By abstracting common computational patterns, Halide lets you define processing logic concisely, which in turn facilitates readability, maintainability, and optimization across hardware targets. +- `Var` symbolically represents spatial coordinates or dimensions (for example, horizontal x, vertical y, color channel c), specifying where computations are applied.
-### Scheduling strategies (parallelism, vectorization, tiling) -Halide offers several powerful scheduling strategies designed for maximum performance: - * Parallelism: Executes computations concurrently across multiple CPU cores, significantly reducing execution time for large datasets. - * Vectorization: Enables simultaneous processing of multiple data elements using SIMD (Single Instruction, Multiple Data) instructions available on CPUs and GPUs, greatly enhancing performance. - * Tiling: Divides computations into smaller blocks (tiles) optimized for cache efficiency, thus improving memory locality and reducing overhead due to memory transfers. +- Pipelines are formed by connecting multiple `Func` objects, creating a workflow where each stage's output feeds into subsequent stages. -By combining these scheduling techniques, developers can achieve optimal performance tailored specifically to their target hardware architecture. +Halide is a domain-specific language (DSL) tailored for image and signal processing. It provides predefined operations and building blocks optimized for expressing complex pipelines. By abstracting common computational patterns, Halide lets you define processing logic concisely, which in turn facilitates readability, maintainability, and optimization across hardware targets. -Beyond manual scheduling strategies, Halide also provides an Autoscheduler, a powerful tool that automatically generates optimized schedules tailored to specific hardware architectures, further simplifying performance optimization. +## Learn about scheduling strategies -## System requirements and environment setup -To start developing with Halide, your system must meet several requirements and dependencies. 
+Halide offers several powerful scheduling strategies for maximum performance: -### Installation options -Halide can be set up using one of two main approaches: -* Installing pre-built binaries - pre-built binaries are convenient, quick to install, and suitable for most beginners or standard platforms (Windows, Linux, macOS). This approach is recommended for typical use cases. -* Building Halide from source is required when pre-built binaries are unavailable for your specific environment, or if you wish to experiment with the latest Halide features or LLVM versions still under active development. This method typically requires greater familiarity with build systems and may be more suitable for advanced users. +- Parallelism is the execution of computations concurrently across multiple CPU cores, reducing execution time for large datasets -Here, you will use pre-built binaries: - 1. Visit the official Halide releases [page](https://github.com/halide/Halide/releases). As of this writing, the latest Halide version is v19.0.0. - 2. Download and unzip the binaries to a convenient location (e.g., /usr/local/halide on Linux/macOS or C:\halide on Windows). - 3. Optionally set environment variables to simplify further usage: -```console -export HALIDE_DIR=/path/to/halide -export PATH=$HALIDE_DIR/bin:$PATH -``` +- Vectorization enables simultaneous processing of multiple data elements using SIMD (Single Instruction, Multiple Data) instructions, such as Arm NEON, enhancing performance on Arm CPUs and GPUs + +- Tiling divides computations into smaller blocks optimized for cache efficiency, improving memory locality and reducing transfer overhead + +You can combine these techniques to achieve optimal performance tailored to your target hardware architecture. + +Beyond manual scheduling, Halide provides an Autoscheduler that automatically generates optimized schedules for specific hardware architectures, including Arm-based systems, simplifying performance optimization. 
+ +## Set up your environment + +You can set up Halide using one of two approaches: + +- **Use pre-built binaries** for a fast and convenient setup on Windows, Linux, and macOS. This method is recommended for most users and standard development environments. + +- **Building from source** is required when pre-built binaries aren't available for your environment, or if you want to experiment with the latest Halide features or LLVM versions under active development. This method requires familiarity with build systems. -To proceed futher, make sure to install the following components: -1. LLVM (Halide requires LLVM to compile and execute pipelines) -2. OpenCV (for image handling in later lessons) +To use pre-built binaries, follow these steps: -Install with the commands for your OS: +To set up Halide using pre-built binaries: + +- Go to the [Halide releases page](https://github.com/halide/Halide/releases). This Learning Path uses version v19.0.0. +- Download and unzip the binaries to a convenient location, such as `/usr/local/halide` (Linux/macOS) or `C:\halide` (Windows). +- Set environment variables to make Halide easy to use: + ```console + export HALIDE_DIR=/path/to/halide + export PATH=$HALIDE_DIR/bin:$PATH + ``` + + +## Install LLVM and OpenCV + +Before you can build and run Halide pipelines, you need to install two essential components: + +- LLVM: Halide depends on LLVM to compile and execute image processing pipelines. LLVM provides the backend that turns Halide code into optimized machine instructions for Arm processors. + +- OpenCV: You'll use OpenCV for image input and output in later sections. OpenCV makes it easy to load, display, and save images, and it integrates smoothly with Halide buffers. + +Both tools are available for Arm platforms on Linux, macOS, and Windows. Make sure you install the correct versions for your operating system and architecture. 
+ +The commands below show how to install LLVM and OpenCV: {{< tabpane code=true >}} {{< tab header="Linux/Ubuntu" language="bash">}} @@ -86,8 +113,9 @@ brew install opencv pkg-config Halide examples were tested with OpenCV 4.11.0 -## Your first Halide program -Now you’re ready to build your first Halide-based application. Save the following code in a file named `hello-world.cpp`: +## Build your first Halide program + +You're now ready to build your first Halide application. Save the following code in a file named `hello-world.cpp`: ```cpp #include "Halide.h" #include @@ -102,7 +130,7 @@ int main() { // Static path for the input image. std::string imagePath = "img.png"; - // Load the input image using OpenCV (BGR by default). + // Load the input image using OpenCV (BGR format by default, which stands for Blue-Green-Red channel order). Mat input = imread(imagePath, IMREAD_COLOR); // Alternative: Halide has a built-in IO function to directly load images as Halide::Buffer. // Example: Halide::Buffer inputBuffer = Halide::Tools::load_image(imagePath); @@ -111,7 +139,7 @@ int main() { return -1; } - // Convert RGB back to BGR for correct color display in OpenCV (optional but recommended for OpenCV visualization). + // Convert from BGR to RGB (Red-Green-Blue) format for correct color display in OpenCV. cvtColor(input, input, COLOR_BGR2RGB); // Wrap the OpenCV Mat data in a Halide::Buffer. @@ -151,30 +179,32 @@ int main() { } ``` -This program demonstrates how to combine Halide's image processing capabilities with OpenCV’s image I/O and display functionality. It begins by loading an image from disk using OpenCV, specifically reading from a static file named `img.png` (here you use a Cameraman image). Since OpenCV loads images in BGR format by default, the code immediately converts the image to RGB format so that it is compatible with Halide's expectations. 
+This program demonstrates how you can combine Halide's image processing capabilities with OpenCV's image I/O and display functionality. It begins by loading an image from disk using OpenCV, specifically reading from a static file named `img.png` (here you use a Cameraman image). Since OpenCV loads images in BGR (Blue-Green-Red) format by default, the code immediately converts the image to RGB (Red-Green-Blue) format so that it's compatible with Halide. + +The program wraps the raw image data into a Halide buffer, capturing the image's width, height, and color channels. It defines the Halide pipeline using a function named `invert` to specify the computation for each pixel—subtract the original pixel value from 255 to invert the colors. -Once the image is loaded and converted, the program wraps the raw image data into a Halide buffer, capturing the image’s dimensions (width, height, and color channels). Next, the Halide pipeline is defined through a function named invert, which specifies the computations to perform on each pixel—in this case, subtracting the original pixel value from 255 to invert the colors. The pipeline definition alone does not perform any actual computation; it only describes what computations should occur and how to schedule them. +{{% notice Note %}} +Remember, the pipeline definition only describes the computations and scheduling; it doesn't perform any actual processing until you realize the pipeline. +{{% /notice %}} -The actual computation occurs when the pipeline is executed with the call to invert.realize(...). This is the step that processes the input image according to the defined pipeline and produces an output Halide buffer. The scheduling directive (invert.reorder(c, x, y)) ensures that pixel data is computed in an interleaved manner (channel-by-channel per pixel), aligning the resulting data with OpenCV’s expected memory layout for images. 
+The actual computation occurs when the pipeline is executed with the call to `invert.realize(...)`. This is the step that processes the input image according to the defined pipeline and produces an output Halide buffer. The scheduling directive `invert.reorder(c, x, y)` ensures that pixel data is computed in an interleaved manner (channel-by-channel per pixel), aligning the resulting data with OpenCV’s expected memory layout for images. 
-Finally, the processed Halide output buffer is efficiently wrapped in an OpenCV Mat header without copying pixel data. For proper display in OpenCV, which uses BGR channel ordering by default, the code converts the processed image back from RGB to BGR. The program then displays the original and inverted images in separate windows, waiting for a key press before exiting. This approach demonstrates a streamlined integration between Halide for high-performance image processing and OpenCV for convenient input and output operations. 
+Wrap the processed Halide output buffer in an OpenCV `Mat` header without copying pixel data. Convert the processed image from RGB back to BGR for proper display in OpenCV, which uses BGR channel ordering by default. Display the original and inverted images in separate windows, and wait for a key press before exiting. Use this approach to integrate Halide for high-performance image processing with OpenCV for convenient input and output operations. 
-By default, Halide orders loops based on the order of variable declaration. In this example, the original ordering (x, y, c) implies processing the image pixel-by-pixel across all horizontal positions (x), then vertical positions (y), and finally channels (c). This ordering naturally produces a planar memory layout (e.g., processing all red pixels first, then green, then blue). 
+By default, Halide orders loops based on the order of variable declaration. 
In this example, the original ordering (x, y, c) implies processing the image pixel-by-pixel across all horizontal positions (x), then vertical positions (y), and finally channels (c). This ordering naturally produces a planar memory layout (for example, processing all red pixels first, then green, then blue). However, the optimal loop order depends on your intended memory layout and compatibility with external libraries: -1. Interleaved Layout (RGBRGBRGB…): -* Commonly used by libraries such as OpenCV. -* To achieve this, the color channel (c) should be the innermost loop, followed by horizontal (x) and then vertical (y) loops -Specifically, call: +**Interleaved layout (RGBRGBRGB…)** is commonly used by libraries such as OpenCV. To achieve this, the color channel (c) should be the innermost loop, followed by horizontal (x) and then vertical (y) loops. + +Call: ```cpp invert.reorder(c, x, y); ``` -This changes the loop nesting to process each pixel’s channels together (R, G, B for the first pixel, then R, G, B for the second pixel, and so on), resulting in: -* Better memory locality and cache performance when interfacing with interleaved libraries like OpenCV. -* Reduced overhead for subsequent image-handling operations (display, saving, or further processing). -By default, OpenCV stores images in interleaved memory layout, using the HWC (Height, Width, Channel) ordering. To correctly represent this data layout in a Halide buffer, you can also explicitly use the Buffer::make_interleaved() method, which ensures the data layout is properly specified. The code snippet would look like this: +This changes the loop nesting to process each pixel's channels together (R, G, B for the first pixel, then R, G, B for the second pixel, and so on). This provides better memory locality and cache performance when interfacing with interleaved libraries like OpenCV, and reduces overhead for subsequent image-handling operations (display, saving, or further processing). 
+ +By default, OpenCV stores images in interleaved memory layout, using the HWC (Height, Width, Channel) ordering. To correctly represent this data layout in a Halide buffer, you can use the `Buffer::make_interleaved()` method, which ensures the data layout is properly specified: ```cpp // Wrap the OpenCV Mat data in a Halide buffer with interleaved HWC layout. @@ -183,28 +213,29 @@ Buffer inputBuffer = Buffer::make_interleaved( ); ``` -2. Planar Layout (RRR...GGG...BBB...): -* Preferred by certain image-processing routines or hardware accelerators (e.g., some GPU kernels or certain ML frameworks). -* Achieved naturally by Halide's default loop ordering (x, y, c). +**Planar layout (RRR...GGG...BBB...)** is preferred by certain image-processing routines or hardware accelerators (for example, some GPU kernels or ML frameworks). This is achieved naturally by Halide's default loop ordering (x, y, c). -It is essential to select loop ordering based on your specific data format requirements and integration scenario. Halide provides full flexibility, allowing you to explicitly reorder loops to match the desired memory layout efficiently. +Choose your loop ordering based on how your image data is stored and which libraries you use. Halide lets you control loop order for both performance and compatibility. -In Halide, two distinct concepts must be distinguished clearly: -1. Loop execution order (controlled by reorder). Defines the nesting order of loops during computation. For example, to make the channel dimension (c) innermost during computation: +Halide separates two important ideas: + +**Loop execution order** — Use `reorder` to set the order in which loops run during computation. For example, making the channel (`c`) the innermost loop helps match interleaved layouts like OpenCV's HWC format: ```cpp invert.reorder(c, x, y); ``` -2. Memory storage layout (controlled by reorder_storage). 
Defines the actual order in which data is stored in memory, such as interleaved or planar: + +**Memory storage layout** (controlled by `reorder_storage`) defines the actual order in which data is stored in memory, such as interleaved or planar: ```cpp invert.reorder_storage(c, x, y); ``` -Using only reorder(c, x, y) affects the computational loop order but not necessarily the memory layout. The computed data could still be stored in planar order by default. Using reorder_storage(c, x, y) explicitly defines the memory layout as interleaved. +Using only `reorder(c, x, y)` affects the computational loop order but not necessarily the memory layout. The computed data could still be stored in planar order by default. Using `reorder_storage(c, x, y)` defines the memory layout as interleaved. + +## Compile the program -## Compilation instructions -Compile the program as follows (replace /path/to/halide accordingly): +Compile the program as follows (replace `/path/to/halide` with your actual path): ```console export DYLD_LIBRARY_PATH=/path/to/halide/lib/libHalide.19.dylib g++ -std=c++17 hello-world.cpp -o hello-world \ @@ -213,24 +244,24 @@ g++ -std=c++17 hello-world.cpp -o hello-world \ -Wl,-rpath,/path/to/halide/lib ``` -Note that, on Linux, you would set LD_LIBRARY_PATH instead: +On Linux, set LD_LIBRARY_PATH instead: ```console export LD_LIBRARY_PATH=/path/to/halide/lib/ ``` -Run the executable: +To run the executable: ```console ./hello-world ``` -You will see two windows displaying the original and inverted images: -![img1](Figures/01.png) -![img2](Figures/02.png) +You'll see two windows displaying the original and inverted images: +![Original color photograph of a cameraman on the left showing a person operating a professional camera, and inverted version on the right with reversed colors where the subject appears in negative](Figures/01.png) +![Two side-by-side terminal windows showing compilation and execution of the Halide hello-world program, with the left window 
displaying g++ compilation commands and library paths, and the right window showing successful program execution with OpenCV window initialization messages](Figures/02.png) -## Summary -In this section, you have learned Halide's foundational concepts, explored the benefits of separating algorithms and schedules, set up your development environment, and created your first functional Halide application integrated with OpenCV. +## What you've accomplished and what's next -While the example introduces the core concepts of Halide pipelines (such as defining computations symbolically and realizing them), it does not yet showcase the substantial benefits of explicitly separating algorithm definition from scheduling strategies. +You've learned Halide's foundational concepts, explored the benefits of separating algorithms and schedules, set up your development environment, and created your first functional Halide application integrated with OpenCV for Arm development. -In subsequent sections, you will explore advanced Halide scheduling techniques, including parallelism, vectorization, tiling, and loop fusion, which will clearly demonstrate the practical advantages of separating algorithm logic from scheduling. These techniques enable fine-grained performance optimization tailored to specific hardware without modifying algorithmic correctness. +While the example introduces the core concepts of Halide pipelines (such as defining computations symbolically and realizing them), it doesn't yet showcase the benefits of separating algorithm definition from scheduling strategies. +In subsequent sections, you'll explore advanced Halide scheduling techniques, including parallelism, vectorization, tiling, and loop fusion, which demonstrate the practical advantages of separating algorithm logic from scheduling. These techniques enable fine-grained performance optimization tailored to Arm processors and other hardware without modifying algorithmic correctness. 
\ No newline at end of file diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/processing-workflow.md b/content/learning-paths/mobile-graphics-and-gaming/android_halide/processing-workflow.md index 6d7b9ec3d9..d1637bc222 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/android_halide/processing-workflow.md +++ b/content/learning-paths/mobile-graphics-and-gaming/android_halide/processing-workflow.md @@ -1,17 +1,25 @@ --- # User change -title: "Building a Simple Camera Image Processing Workflow" +title: "Build a simple camera image processing workflow" weight: 3 layout: "learningpathall" --- -## Objective -In this section, you will build a real-time camera processing pipeline using Halide. First, you capture video frames from a webcam using OpenCV, then implement a Gaussian (binomial) blur to smooth the captured images, followed by thresholding to create a clear binary output highlighting prominent image features. After establishing this pipeline, you will measure performance and then explore Halide's scheduling options—parallelization and tiling—to understand when they help and when they don’t. +## What you'll build + +In this section, you will build a real-time camera processing pipeline using Halide: + +- First, you will capture video frames from a webcam using OpenCV, implement a Gaussian (binomial) blur to smooth the captured images, followed by thresholding to create a clear binary output highlighting prominent image features. + +- Next, you will measure performance and explore Halide's scheduling options: parallelization and tiling. Each technique improves throughput in a different way. 
+ + +## Implement Gaussian blur and thresholding + +To get started, create a new `camera-capture.cpp` file and copy and paste in the contents below: -## Gaussian blur and thresholding -Create a new `camera-capture.cpp` file and modify it as follows: ```cpp #include "Halide.h" #include @@ -127,10 +135,11 @@ int main() { return 0; } ``` +The camera delivers interleaved BGR frames. You convert them to grayscale using Rec.601 weights, apply a 3×3 binomial blur (with 16-bit accumulation and division by 16), and then threshold to create a binary image. -The camera delivers interleaved BGR frames. Inside Halide, we convert to grayscale (Rec.601), apply a 3×3 binomial blur (sum/16 with 16-bit accumulation), then threshold to produce a binary image. We compile once (outside the capture loop) and realize per frame for real-time processing. +Compile the pipeline once before the capture loop starts, then call `realize()` each frame for real-time processing. -A 3×3 filter needs neighbors (x±1, y±1). At the image edges, some taps would fall outside the valid region. Rather than scattering manual clamps across expressions, we wrap the input once: +A 3×3 filter needs neighbors (x±1, y±1). At the image edges, some taps fall outside the valid region. Rather than scattering manual clamps across expressions, wrap the input once: ```cpp // Wrap the input so out-of-bounds reads replicate the nearest edge pixel. @@ -139,7 +148,7 @@ Func inputClamped = BoundaryConditions::repeat_edge(input); Any out-of-bounds access replicates the nearest edge pixel. This makes the boundary policy obvious, keeps expressions clean, and ensures all downstream stages behave consistently at the edges. -Grayscale conversion happens inside Halide using Rec.601 weights. We read B, G, R from the interleaved input and compute luminance: +Halide converts the image to grayscale using Rec.601 weights. 
Read B, G, R from the interleaved input and compute luminance: ```cpp // Grayscale (Rec.601) @@ -150,7 +159,7 @@ gray(x, y) = cast(0.114f * inputClamped(x, y, 0) + // B 0.299f * inputClamped(x, y, 2)); // R ``` -Next, the pipeline applies a Gaussian-approximate (binomial) blur using a fixed 3×3 kernel. For this learning path, we implement it with small loops and 16-bit accumulation for safety: +Next, the pipeline applies a Gaussian-approximate (binomial) blur using a fixed 3×3 kernel. Implement it with small loops and 16-bit accumulation for safety: ```cpp Func blur("blur"); @@ -162,12 +171,9 @@ for (int j = 0; j < 3; ++j) blur(x, y) = cast(sum / 16); ``` -Why this kernel? -* It provides effective smoothing while remaining computationally lightweight. -* The weights approximate a Gaussian distribution, which reduces noise but preserves edges better than a box filter. -* This is mathematically a binomial filter, a standard and efficient approximation of Gaussian blurring. +This binomial kernel smooths images effectively while staying lightweight. Its weights closely match a Gaussian distribution, so it reduces noise but preserves edges better than a simple box filter. This makes it a fast and practical way to approximate Gaussian blur in real-time image processing. -After the blur, the pipeline applies thresholding to produce a binary image. We explicitly cast constants to uint8_t to remove ambiguity and avoid redundant widen/narrow operations in generated code: +After the blur, the pipeline applies thresholding to produce a binary image. 
Explicitly cast constants to uint8_t to remove ambiguity and avoid redundant widen/narrow operations in generated code: ```cpp Func output("output"); @@ -175,9 +181,9 @@ Func output("output"); output(x, y) = select(blur(x, y) > T, cast(255), cast(0)); ``` -This simple but effective step emphasizes strong edges and regions of high contrast, often used as a building block in segmentation and feature extraction pipelines +This step emphasizes strong edges and regions of high contrast, providing a building block for segmentation and feature extraction pipelines. -Finally, the result is realized by Halide and displayed via OpenCV. The pipeline is built once (outside the capture loop) and then realized each frame: +Halide generates the final output, and OpenCV displays it. Build the pipeline once (outside the capture loop), and then realize each frame: ```cpp // Build the pipeline once (outside the capture loop) Buffer outBuf(width, height); @@ -192,7 +198,7 @@ imshow("Processing Workflow", view); The main loop continues capturing frames, running the Halide pipeline, and displaying the processed output in real time until a key is pressed. This illustrates how Halide integrates cleanly with OpenCV to build efficient, interactive image-processing applications. -## Compilation instructions +## Compile and run the program Compile the program as follows (replace /path/to/halide accordingly): ```console g++ -std=c++17 camera-capture.cpp -o camera-capture \ @@ -205,16 +211,19 @@ Run the executable: ```console ./camera-capture ``` +The output should look similar to the figure below: +![A camera viewport window titled Processing Workflow displaying a real-time binary threshold output from a webcam feed. 
The image shows a person's face and shoulders rendered in stark black and white, where bright areas above the threshold value appear white and darker areas appear black, creating a high-contrast silhouette effect that emphasizes edges and prominent features.](Figures/03.webp) + +## Parallelization and tiling + +In this section, you will explore two scheduling optimizations that Halide provides: parallelization and tiling. Each technique improves performance in a different way. Parallelization uses multiple CPU cores, while tiling optimizes cache efficiency through better data locality. -The output should look as in the figure below: -![img3](Figures/03.webp) +You will learn how to use each technique separately for clarity and to emphasize their distinct benefits. -## Parallelization and Tiling -In this section, you will explore two complementary scheduling optimizations provided by Halide: Parallelization and Tiling. Both techniques help enhance performance but achieve it through different mechanisms—parallelization leverages multiple CPU cores, whereas tiling improves cache efficiency by optimizing data locality. +### Establish baseline performance -Now you will learn how to use each technique separately for clarity and to emphasize their distinct benefits. +Before applying any scheduling optimizations, establish a measurable baseline. Create a second file, `camera-capture-perf-measurement.cpp`, that runs the same grayscale → blur → threshold pipeline but prints per-frame timing, FPS, and MPix/s around the Halide `realize()` call. This lets you quantify each optimization you add next (parallelization, tiling, caching). -Let’s first lock in a measurable baseline before we start changing the schedule. You will create a second file, `camera-capture-perf-measurement.cpp`, that runs the same grayscale → blur → threshold pipeline but prints per-frame timing, FPS, and MPix/s around the Halide realize() call. 
This lets you quantify each optimization you will add next (parallelization, tiling, caching). Create `camera-capture-perf-measurement.cpp` with the following code: ```cpp @@ -353,8 +362,9 @@ realize: 3.98 ms | 251.51 FPS | 521.52 MPix/s This gives an FPS of 251.51, and average throughput of 521.52 MPix/s. Now you can start measuring potential improvements from scheduling. -### Parallelization -Parallelization lets Halide run independent pieces of work at the same time on multiple CPU cores. In image pipelines, rows (or row tiles) are naturally parallel once producer data is available. By distributing work across cores, we reduce wall-clock time—crucial for real-time video. +### Apply parallelization + +Parallelization allows Halide to process different parts of the image at the same time using multiple CPU cores. In image processing pipelines, each row or block of rows can be handled independently once the input data is ready. By spreading the work across several cores, you reduce the total processing time—this is especially important for real-time video applications. With the baseline measured, apply a minimal schedule that parallelizes the loop iteration for y axis. @@ -369,10 +379,15 @@ Add these lines after defining output(x, y) (and before any realize()). In this ``` This does two important things: -* compute_root() on gray divides the entire processing into two loops, one to compute the entire gray output, and the other to compute the final output. -* parallel(y) parallelizes over the pure loop variable y (rows). The rows are computed on different CPU cores in parallel. +* `compute_root()` on gray divides the entire processing into two loops, one to compute the entire gray output, and the other to compute the final output. +* `parallel(y)` parallelizes over the pure loop variable y (rows). The rows are computed on different CPU cores in parallel. +Now rebuild and run the application. 
You should see output similar to: + +```output +realize: 1.16 ms | 864.15 FPS | 1791.90 MPix/s +``` -Now rebuild and run the application again. The results should look like: +This shows a significant speedup from parallelization. The exact numbers depend on your Arm CPU and how many cores are available. ```output % ./camera-capture-perf-measurement realize: 1.16 ms | 864.15 FPS | 1791.90 MPix/s @@ -380,17 +395,20 @@ realize: 1.16 ms | 864.15 FPS | 1791.90 MPix/s The performance gain by parallelization depends on how many CPU cores are available for this application to occupy. -### Tiling +## Apply tiling for cache efficiency + Tiling is a scheduling technique that divides computations into smaller, cache-friendly blocks or tiles. This approach significantly enhances data locality, reduces memory bandwidth usage, and leverages CPU caches more efficiently. While tiling can also use parallel execution, its primary advantage comes from optimizing intermediate data storage. -Tiling splits the image into cache-friendly blocks (tiles). Two wins: -* Partitioning: tiles are easy to parallelize across cores. -* Locality: when you cache intermediates per tile, you avoid refetching/recomputing data and hit CPU L1/L2 cache more often. +Tiling divides the image into smaller, cache-friendly blocks called tiles. This gives you two main benefits: + +* Partitioning: tiles are easy to process in parallel, so you can spread the work across multiple CPU cores. +* Locality: by caching intermediate results within each tile, you avoid repeating calculations and make better use of the CPU cache. -Now lets look at both flavors. +Try both methods to see how they improve performance. -### Tiling with explicit intermediate storage (best for cache efficiency) -Here you will cache gray once per tile so the 3×3 blur can reuse it instead of recomputing RGB -> gray up to 9× per output pixel. 
+## Cache intermediates per tile + +This approach caches gray once per tile so the 3×3 blur can reuse it instead of recomputing RGB to gray up to 9× per output pixel. This provides the best cache efficiency. ```cpp // Scheduling @@ -410,27 +428,33 @@ Here you will cache gray once per tile so the 3×3 blur can reuse it instead of ``` In this scheduling: -* tile(...) splits the image into cache-friendly blocks and makes it easy to parallelize across tiles. -* parallel(yo) distributes tiles across CPU cores where a CPU core is in charge of a row (yo) of tiles. -* gray.compute_at(...).store_at(...) materializes a tile-local planar buffer for the grayscale intermediate so blur can reuse it within the tile. +* `tile`(...) splits the image into cache-friendly blocks and makes it easy to parallelize across tiles +* `parallel(yo)` distributes tiles across CPU cores where a CPU core is in charge of a row (yo) of tiles +* `gray.compute_at(...).store_at(...)` materializes a tile-local planar buffer for the grayscale intermediate so blur can reuse it within the tile + +Recompile your application as before, then run. -Recompile your application as before, then run. What we observed on our machine: +Here's sample output: ```output realize: 0.98 ms | 1023.15 FPS | 2121.60 MPix/s ``` -This was the fastest variant here—caching a planar grayscale per tile enabled efficient reuse. +Caching the grayscale image for each tile gives the best performance. By storing the intermediate grayscale result in a tile-local buffer, Halide can reuse it efficiently during the blur step. This reduces redundant computations and makes better use of the CPU cache, resulting in faster processing. + +## Choose a scheduling strategy +There isn't a universal scheduling strategy that guarantees the best performance for every pipeline or device. The optimal approach depends on your specific image-processing workflow and the Arm architecture you're targeting. 
Halide's scheduling API gives you the flexibility to experiment with parallelization, tiling, and caching. Try different combinations to see which delivers the highest throughput and efficiency for your application. 
+
+For this example application:
+Start by parallelizing the outermost loop to use multiple CPU cores. This is usually the simplest way to boost performance.
+
+Add tiling and caching if your pipeline includes a spatial filter (such as blur or convolution), or if an intermediate result is reused by several stages. Tiling works best after converting your source data to planar format, or after precomputing a planar grayscale image. 
-### How we schedule 
-In general, there is no one-size-fits-all rule of scheduling to achieve the best performance as it depends on your pipeline characteristics and the target device architecture. So, it is recommended to explore the scheduling options and that is where Halide's scheduling API is purposed for. 
+Try parallelization first, then experiment with tiling and caching for further speedups. From there, tune tile sizes and thread count for your target. `HL_NUM_THREADS` is the environment variable that allows you to limit the number of threads in-flight. 
-For example of this application: 
-* Start with parallelizing the outer-most loop. 
-* Add tiling + caching only if: there is a spatial filter, or the intermediate is reused by multiple consumers—and preferably after converting sources to planar (or precomputing a planar gray). 
-* From there, tune tile sizes and thread count for your target. `HL_NUM_THREADS` is the environmental variable which allows you to limit the number of threads in-flight. 
+## What you've accomplished and what's next 
+You built a real-time image processing pipeline using Halide and OpenCV. The workflow included converting camera frames to grayscale, applying a 3×3 binomial blur, and thresholding to create a binary image. 
You also measured performance to see how different scheduling strategies affect throughput. -## Summary -In this section, you built a real-time Halide+OpenCV pipeline—grayscale, a 3×3 binomial blur, then thresholding—and instrumented it to measure throughput. And then, we observed that parallelization and tiling improved the performance. +- Parallelization lets Halide use multiple CPU cores, speeding up processing by dividing work across rows or tiles. +- Tiling improves cache efficiency, especially when intermediate results are reused often, such as with larger filters or multi-stage pipelines. -* Parallelization spreads independent work across CPU cores. -* Tiling for cache efficiency helps when an expensive intermediate is reused many times per output (e.g., larger kernels, separable/multi-stage pipelines, multiple consumers) and when producers read planar data. +By combining these techniques, you achieved faster and more efficient image processing on Arm systems. diff --git a/content/learning-paths/mobile-graphics-and-gaming/best-practices-for-hwrt-lumen-performance/1-ray-tracing.md b/content/learning-paths/mobile-graphics-and-gaming/best-practices-for-hwrt-lumen-performance/1-ray-tracing.md index d9fd770f06..ecd5573948 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/best-practices-for-hwrt-lumen-performance/1-ray-tracing.md +++ b/content/learning-paths/mobile-graphics-and-gaming/best-practices-for-hwrt-lumen-performance/1-ray-tracing.md @@ -15,9 +15,9 @@ The demo is named **Steel Arms**. Created with Unreal Engine 5.3, Steel Arms bri The following screenshots are from scenes in **Steel Arms** which is powered by Unreal Lumen. Several optimization tips and techniques were used in the development of **Steel Arms** for achieving the best performance with Lumen. This learning path will start with an introduction to ray tracing and then cover the best practices for hardware ray tracing in Lumen. 
-![](images/Garage.png) +![](images/garage.webp) -![](images/Garage2.png) +![](images/garage2.webp) ## What is Ray Tracing? diff --git a/content/learning-paths/mobile-graphics-and-gaming/build-android-selfie-app-using-mediapipe-multimodality/2-app-scaffolding.md b/content/learning-paths/mobile-graphics-and-gaming/build-android-selfie-app-using-mediapipe-multimodality/2-app-scaffolding.md index 635b71a766..e2f85c1756 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/build-android-selfie-app-using-mediapipe-multimodality/2-app-scaffolding.md +++ b/content/learning-paths/mobile-graphics-and-gaming/build-android-selfie-app-using-mediapipe-multimodality/2-app-scaffolding.md @@ -101,7 +101,7 @@ You can also click the **Sync Project with Gradle Files** button in the toolbar, This inflates the layout file into a view binding object, and stores it in a member variable within the view controller for easier access later. -![view binding alt-text#center](images/2/view_binding.png "Figure 5: View Binding.") +![view binding alt-text#center](images/2/view_binding.webp "Figure 5: View Binding.") ## Configure CameraX preview diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/01-env-setup.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/01-env-setup.md new file mode 100644 index 0000000000..ed041afd5e --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/01-env-setup.md @@ -0,0 +1,58 @@ +--- +title: Set up your environment +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + + +## Set up your Python environment + +Before building ExecuTorch, it is highly recommended to create an isolated Python environment. 
This prevents dependency conflicts with your system Python installation and ensures that all required build and runtime dependencies remain consistent across runs: + +```bash +sudo apt update +sudo apt install -y python3 python3.12-dev python3-venv build-essential cmake +python3 -m venv pyenv +source pyenv/bin/activate + +``` +Keep your Python virtual environment activated while you complete the next steps. This ensures all dependencies install in the correct location. + +## Download the ExecuTorch source code + +Clone the ExecuTorch repository from GitHub. The following command checks out the stable v1.0.0 release and ensures all required submodules are fetched: + +```bash +export WORKSPACE=$HOME +cd $WORKSPACE +git clone -b v1.0.0 --recurse-submodules https://github.com/pytorch/executorch.git + +``` + + {{% notice Note %}} + The instructions in this Learning Path were tested on ExecuTorch v1.0.0. Commands or configuration options might differ in later releases. + {{% /notice %}} + +## Build and install the ExecuTorch Python components + +Next, you’ll build the ExecuTorch Python bindings and install them into your active virtual environment. This process compiles the C++ runtime, links hardware-optimized backends such as KleidiAI and XNNPACK, and enables optional developer utilities for debugging and profiling. + +Run the following command from your ExecuTorch workspace: +```bash +cd $WORKSPACE/executorch +CMAKE_ARGS="-DEXECUTORCH_BUILD_DEVTOOLS=ON" ./install_executorch.sh + +``` +This builds ExecuTorch and its dependencies using cmake, enabling optional developer utilities such as ETDump and Inspector. + +## Verify the Installation +After the build completes, check that ExecuTorch is installed in your active Python environment. Run the following command: + + +```bash +python -c "import executorch; print('Executorch build and install successfully.')" +``` + +If you see the success message, your environment is ready. 
You can now move on to cross-compiling and preparing to profile KleidiAI micro-kernels. diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/02-cross-compile.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/02-cross-compile.md new file mode 100644 index 0000000000..b1d3c5a783 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/02-cross-compile.md @@ -0,0 +1,84 @@ +--- +title: Cross-Compile ExecuTorch for the AArch64 platform +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview + +In this section, you'll cross-compile ExecuTorch for an Arm64 (AArch64) target with XNNPACK and KleidiAI support. Cross-compiling builds all binaries and libraries for your Arm device, even if your development system uses x86_64. This process lets you run and test ExecuTorch on Arm hardware, taking advantage of Arm-optimized performance features. + +## Install the cross-compilation toolchain +On your x86_64 Linux host, install the GNU Arm cross-compilation toolchain along with Ninja, which is a fast build backend commonly used by CMake: +```bash +sudo apt install gcc-aarch64-linux-gnu g++-aarch64-linux-gnu ninja-build -y +``` + +## Run CMake configuration + +Use CMake to configure the ExecuTorch build for the AArch64 target. 
+ +The command below enables all key runtime extensions, developer tools, and optimized backends including XNNPACK and KleidiAI: + +```bash + +cd $WORKSPACE +mkdir -p build-arm64 +cd build-arm64 + +cmake -GNinja \ + -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_SYSTEM_NAME=Linux \ + -DCMAKE_SYSTEM_PROCESSOR=aarch64 \ + -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \ + -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \ + -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ + -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=BOTH \ + -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_LOG_LEVEL=debug \ + -DEXECUTORCH_XNNPACK_ENABLE_KLEIDI=ON \ + ../executorch + +``` + +## Key Build Options + +| **CMake Option** | **Description** | +| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `EXECUTORCH_BUILD_XNNPACK` | Builds the XNNPACK backend, which provides highly optimized CPU operators (such as GEMM and convolution) for Arm64 platforms. | +| `EXECUTORCH_XNNPACK_ENABLE_KLEIDI` | Enables Arm KleidiAI acceleration for XNNPACK kernels, providing further performance improvements on Armv8.2+ CPUs. | +| `EXECUTORCH_BUILD_DEVTOOLS` | Builds developer tools such as the ExecuTorch Inspector and diagnostic utilities for profiling and debugging. | +| `EXECUTORCH_BUILD_EXTENSION_MODULE` | Builds the Module API extension, which provides a high-level abstraction for model loading and execution using `Module` objects. 
| +| `EXECUTORCH_BUILD_EXTENSION_TENSOR` | Builds the Tensor API extension, providing convenience functions for creating, manipulating, and managing tensors in C++ runtime. | +| `EXECUTORCH_BUILD_KERNELS_OPTIMIZED` | Enables building optimized kernel implementations for better performance on supported architectures. | +| `EXECUTORCH_ENABLE_EVENT_TRACER` | Enables the event tracing feature, which records performance and operator timing information for runtime analysis. | + + + +## Build ExecuTorch +Once CMake configuration completes successfully, compile the ExecuTorch runtime and its associated developer tools: + +```bash +cmake --build . -j$(nproc) +``` +CMake invokes Ninja to perform the actual build, generating both static libraries and executables for the AArch64 target. + +## Locate the executor_runner binary +If the build completes successfully, you should see the main benchmarking and profiling utility, executor_runner, under: + +```output +build-arm64/executor_runner +``` + +You’ll use `executor_runner` in later sections to execute and profile ExecuTorch models directly from the command line on your Arm64 target. This standalone binary lets you run models using the XNNPACK backend with KleidiAI acceleration, making it easy to benchmark and analyze performance on Arm devices. 
+ diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/03-executorch-node-kai-kernel.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/03-executorch-node-kai-kernel.md new file mode 100644 index 0000000000..6a97f9d587 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/03-executorch-node-kai-kernel.md @@ -0,0 +1,59 @@ +--- +title: Accelerate ExecuTorch operators with KleidiAI micro-kernels +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- +## Understand how KleidiAI micro-kernels integrate with ExecuTorch + +ExecuTorch uses XNNPACK as its main CPU backend to run and optimize operators like convolutions, matrix multiplications, and fully connected layers. + +KleidiAI SME (Scalable Matrix Extension) micro-kernels are integrated into XNNPACK to boost performance on supported Arm platforms. These micro-kernels accelerate operators that use specific data types and quantization settings in ExecuTorch models. + +When an operator matches a supported configuration, ExecuTorch automatically uses the KleidiAI-optimized path for faster execution. If an operator is not supported by KleidiAI, ExecuTorch falls back to the standard XNNPACK implementation. This ensures your models always run correctly, even if they do not use KleidiAI acceleration. + +## Understand how KleidiAI micro-kernels integrate with ExecuTorch + +In ExecuTorch v1.0.0, the following operator types are implemented through the XNNPACK backend and can potentially benefit from KleidiAI acceleration: +- XNNFullyConnected – Fully connected (dense) layers +- XNNConv2d – Standard 2D convolution layers +- XNNBatchMatrixMultiply – Batched matrix multiplication operations + +However, not all instances of these operators are accelerated by KleidiAI. 
+ +Acceleration eligibility depends on several operator attributes and backend support, including: +- Data types (for example, float32, int8, int4) +- Quantization schemes (for example, symmetric/asymmetric, per-tensor/per-channel) +- Tensor memory layout and alignment +- Kernel dimensions and stride settings + +The following section provides detailed information on which operator configurations can benefit from KleidiAI acceleration, along with their corresponding data type and quantization support. + + +## XNNFullyConnected + +| XNNPACK GEMM Variant | Activations DataType| Weights DataType | Output DataType | +| ------------------ | ---------------------------- | --------------------------------------- | ---------------------------- | +| pf16_gemm | FP16 | FP16 | FP16 | +| pf32_gemm | FP32 | FP32 | FP32 | +| qp8_f32_qc8w_gemm | Asymmetric INT8 per-row quantization | Per-channel symmetric INT8 quantization | FP32 | +| pqs8_qc8w_gemm | Asymmetric INT8 quantization | Per-channel symmetric INT8 quantization | Asymmetric INT8 quantization | +| qp8_f32_qb4w_gemm | Asymmetric INT8 per-row quantization | INT4 (signed), shared blockwise quantization | FP32 | + + +## XNNConv2d +| XNNPACK GEMM Variant | Input DataType| Filter DataType | Output DataType | +| ------------------ | ---------------------------- | --------------------------------------- | ---------------------------- | +| pf32_gemm | FP32 | FP32, pointwise (1×1) | FP32 | +| pqs8_qc8w_gemm | Asymmetric INT8 quantization (NHWC) | Per-channel or per-tensor symmetric INT8 quantization | Asymmetric INT8 quantization(NHWC) | + + +## XNNBatchMatrixMultiply +| XNNPACK GEMM Variant | Input A DataType| Input B DataType |Output DataType | +| ------------------ | ---------------------------- | --------------------------------------- |--------------------------------------- | +| pf32_gemm | FP32 | FP32 | FP32 | +| pf16_gemm | FP16 | FP16 | FP16 | + + + diff --git 
a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/04-create-fc-model.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/04-create-fc-model.md new file mode 100644 index 0000000000..dc26e235eb --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/04-create-fc-model.md @@ -0,0 +1,241 @@ +--- +title: Create and quantize linear layer benchmark model +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview + +In the previous section, you saw that the Fully Connected operator supports multiple GEMM (General Matrix Multiplication) variants. + +To evaluate the performance of these variants across different hardware platforms, you will construct a series of benchmark models that utilize the Fully Connected operator with different GEMM implementations for comparative analysis. + +These models will be used later with executor_runner to measure throughput, latency, and ETDump traces for various KleidiAI micro-kernels. + +## Define a linear benchmark model with PyTorch for ExecuTorch +This step can be confusing at first, but building a minimal model helps you focus on the core operator performance. You’ll be able to quickly test different GEMM implementations and see how each one performs on Arm-based hardware. If you run into errors, check that your PyTorch and ExecuTorch versions are up to date and that you’re using the correct data types for your target GEMM variant. By adjusting some of the model’s input parameters, we can also simulate the behavior of nodes that appear in real-world models. 
+ + +```python +import torch +import torch.nn as nn +class DemoLinearModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(256,256) + + def forward(self, x): + y = self.linear(x) + return y + + def get_example_inputs(self,dtype=torch.float32): + return (torch.randn(1, 256, dtype=dtype),) + +``` +This model creates a single 256×256 linear layer, which can easily be exported in different data types (FP32, FP16, INT8, INT4) to match KleidiAI’s GEMM variants. + +### Export FP16 and FP32 models for pf16_gemm and pf32_gemm variants + +| XNNPACK GEMM Variant | Activations DataType| Weights DataType | Output DataType | +| ------------------ | ---------------------------- | --------------------------------------- | ---------------------------- | +| pf16_gemm | FP16 | FP16 | FP16 | +| pf32_gemm | FP32 | FP32 | FP32 | + +The following code demonstrates how to lower and export a model that leverages the pf16_gemm variant to accelerate computation: + +``` python +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType +from executorch.exir import to_edge_transform_and_lower + +def export_executorch_model(dtype: torch.dtype, model_name: str): + mode_file_name = "model/" + model_name + pte_file = mode_file_name + ".pte" + etr_file = mode_file_name + ".etrecord" + + model = DemoLinearModel().eval().to(dtype) + example_inputs = model.get_example_inputs(dtype) + + exported_program = torch.export.export(model, example_inputs) + + partitioner = XnnpackPartitioner() + edge_program = to_edge_transform_and_lower( + exported_program, + partitioner=[partitioner], + generate_etrecord=True + ) + + et_program = edge_program.to_executorch() + with open(pte_file, "wb") as f: + f.write(et_program.buffer) + + # Get and save ETRecord + etrecord = et_program.get_etrecord() + etrecord.save(etr_file) + 
+export_executorch_model(torch.float16,"linear_model_pf16_gemm") + +``` + +To generate a model that uses the pf32_gemm variant, simply change the dtype in the previous code to torch.float32, as shown below: + +```python + +export_executorch_model(torch.float32,"linear_model_pf32_gemm") + +``` + +### Export INT8 quantized models for pqs8_qc8w_gemm and qp8_f32_qc8w_gemm variants +INT8 quantized GEMMs are designed to reduce memory footprint and improve performance while maintaining acceptable accuracy. + +| XNNPACK GEMM Variant | Activations DataType| Weights DataType | Output DataType | +| ------------------ | ---------------------------- | --------------------------------------- | ---------------------------- | +| qp8_f32_qc8w_gemm | Asymmetric INT8 per-row quantization | Per-channel symmetric INT8 quantization | FP32 | +| pqs8_qc8w_gemm | Asymmetric INT8 quantization | Per-channel symmetric INT8 quantization | Asymmetric INT8 quantization | + + +The following code demonstrates how to quantize a model that leverages the pqs8_qc8w_gemm/qp8_f32_qc8w_gemm variants to accelerate computation: + +```python + +from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e +from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import ( + get_symmetric_quantization_config, + XNNPACKQuantizer, +) + +def export_int8_quantize_model(dynamic: bool, model_name: str): + mode_file_name = "model/" + model_name + pte_file = mode_file_name + ".pte" + etr_file = mode_file_name + ".etrecord" + + model = DemoLinearModel().eval().to(torch.float32) + example_inputs = model.get_example_inputs(torch.float32) + + #Quantizer model + model = torch.export.export(model, example_inputs).module() + quantizer = XNNPACKQuantizer() + operator_config = get_symmetric_quantization_config( + is_per_channel=True, + is_dynamic=dynamic + ) + + quantizer.set_global(operator_config) + quantize_model = prepare_pt2e(model, quantizer) + quantize_model(*example_inputs) + quantize_model = 
convert_pt2e(quantize_model) + + #lower and export model + exported_program = torch.export.export(quantize_model, example_inputs) + + partitioner = XnnpackPartitioner() + edge_program = to_edge_transform_and_lower( + exported_program, + partitioner=[partitioner], + generate_etrecord=True + ) + + et_program = edge_program.to_executorch() + with open(pte_file, "wb") as f: + f.write(et_program.buffer) + + # Get and save ETRecord + etrecord = et_program.get_etrecord() + etrecord.save(etr_file) + +export_int8_quantize_model(False,"linear_model_pqs8_qc8w_gemm"); +export_int8_quantize_model(True,"linear_model_qp8_f32_qc8w_gemm"); + +``` + +## Export INT4 quantized model for qp8_f32_qb4w_gemm variant +This final variant represents KleidiAI’s INT4 path, accelerated by SME2 micro-kernels. + +| XNNPACK GEMM Variant | Activations DataType| Weights DataType | Output DataType | +| ------------------ | ---------------------------- | --------------------------------------- | ---------------------------- | +| qp8_f32_qb4w_gemm | Asymmetric INT8 per-row quantization | INT4 (signed), shared blockwise quantization | FP32 | + + +The following code demonstrates how to quantize a model that leverages the qp8_f32_qb4w_gemm variant to accelerate computation: + +```python +from torchao.quantization.granularity import PerGroup, PerAxis +from torchao.quantization.quant_api import ( + IntxWeightOnlyConfig, + Int8DynamicActivationIntxWeightConfig, + quantize_, +) + +def export_int4_quantize_model(dynamic: bool, model_name: str): + mode_file_name = "model/" + model_name + pte_file = mode_file_name + ".pte" + etr_file = mode_file_name + ".etrecord" + + model = DemoLinearModel().eval().to(torch.float32) + example_inputs = model.get_example_inputs(torch.float32) + + #Quantizer model + + linear_config = Int8DynamicActivationIntxWeightConfig( + weight_dtype=torch.int4, + weight_granularity=PerGroup(32), + ) + + quantize_(model, linear_config) + + #lower and export model + exported_program = 
torch.export.export(model, example_inputs) + + partitioner = XnnpackPartitioner() + edge_program = to_edge_transform_and_lower( + exported_program, + partitioner=[partitioner], + generate_etrecord=True + ) + + et_program = edge_program.to_executorch() + with open(pte_file, "wb") as f: + f.write(et_program.buffer) + + # Get and save ETRecord + etrecord = et_program.get_etrecord() + etrecord.save(etr_file) + +export_int4_quantize_model(False,"linear_model_qp8_f32_qb4w_gemm"); +``` + +{{%notice Note%}} +When exporting models, the **generate_etrecord** option is enabled to produce the .etrecord file alongside the .pte model file. +These ETRecord files are essential for subsequent model inspection and performance analysis using the ExecuTorch Inspector API. +{{%/notice%}} + + +## Run the benchmark model export script for ExecuTorch +Instead of manually executing each code block explained above, you can download and run the full example script that builds and exports all linear-layer benchmark models (FP16, FP32, INT8, and INT4). +This script automatically performs quantization, partitioning, lowering, and export to ExecuTorch format. 
+ +```bash +wget https://raw.githubusercontent.com/ArmDeveloperEcosystem/arm-learning-paths/refs/heads/main/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-linear-model.py +chmod +x export-linear-model.py +python3 ./export-linear-model.py +``` + +## Verify exported ExecuTorch and KleidiAI model files +After successful execution, you should see both .pte (ExecuTorch model) and .etrecord (profiling metadata) files in the model/ directory: + +``` bash +$ ls model/ -1 +linear_model_pf16_gemm.etrecord +linear_model_pf16_gemm.pte +linear_model_pf32_gemm.etrecord +linear_model_pf32_gemm.pte +linear_model_pqs8_qc8w_gemm.etrecord +linear_model_pqs8_qc8w_gemm.pte +linear_model_qp8_f32_qb4w_gemm.etrecord +linear_model_qp8_f32_qb4w_gemm.pte +linear_model_qp8_f32_qc8w_gemm.etrecord +linear_model_qp8_f32_qc8w_gemm.pte +``` +Great job! You now have a complete set of benchmark models exported for multiple GEMM variants and quantization levels. You’re ready to move on and measure performance using ExecuTorch and KleidiAI micro-kernels on Arm-based hardware. diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/05-create-conv2d-model.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/05-create-conv2d-model.md new file mode 100644 index 0000000000..ec3bdaabc1 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/05-create-conv2d-model.md @@ -0,0 +1,188 @@ +--- +title: Create and quantize convolution layer benchmark model +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Understand Conv2d benchmark variants and KleidiAI acceleration + +In the previous section, you saw that both INT8-quantized Conv2d and pointwise (1×1) Conv2d operators can be accelerated using KleidiAI’s matrix-multiplication micro-kernels. 
+ + +| XNNPACK GEMM Variant | Input DataType| Filter DataType | Output DataType | +| ------------------ | ---------------------------- | --------------------------------------- | ---------------------------- | +| pqs8_qc8w_gemm | Asymmetric INT8 quantization(NHWC) | Per-channel or per-tensor symmetric INT8 quantization | Asymmetric INT8 quantization(NHWC) | +| pf32_gemm | FP32 | FP32, pointwise (1×1) | FP32 | + +To evaluate the performance of Conv2d operators across multiple hardware platforms, you will create a set of benchmark models that utilize different GEMM implementation variants within the convolution operators for systematic comparative analysis. + + +## Create an INT8-quantized Conv2d benchmark model with KleidiAI + +The following example defines a simple model to generate INT8-quantized Conv2d nodes that can be accelerated by KleidiAI. + +By adjusting some of the model’s input parameters, you can also simulate the behavior of nodes that appear in real-world models. + + +```python +import torch +import torch.nn as nn + +class DemoQInt8Conv2dModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 6, 3) + + def forward(self,x): + x = self.conv(x) + return x + + def get_example_inputs(self,dtype=torch.float32): + return (torch.randn(1, 3, 16, 16, dtype=dtype),) + +``` + +The following code can be used to quantize and export the model: + +```python +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType +from executorch.exir import to_edge_transform_and_lower +from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e +from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import ( + get_symmetric_quantization_config, + XNNPACKQuantizer, +) + +def export_int8_quantize_conv2d_model(model_name: str): + mode_file_name = "model/" + model_name + pte_file = mode_file_name 
+ ".pte" + etr_file = mode_file_name + ".etrecord" + + model = DemoQInt8Conv2dModel().eval().to(torch.float32) + example_inputs = model.get_example_inputs(torch.float32) + + #Quantizer model + model = torch.export.export(model, example_inputs).module() + quantizer = XNNPACKQuantizer() + operator_config = get_symmetric_quantization_config( + is_per_channel=False, + is_dynamic=False + ) + + quantizer.set_global(operator_config) + quantize_model = prepare_pt2e(model, quantizer) + quantize_model(*example_inputs) + quantize_model = convert_pt2e(quantize_model) + + #export model + exported_program = torch.export.export(quantize_model, example_inputs) + + partitioner = XnnpackPartitioner() + edge_program = to_edge_transform_and_lower( + exported_program, + partitioner=[partitioner], + generate_etrecord=True + ) + + et_program = edge_program.to_executorch() + with open(pte_file, "wb") as f: + f.write(et_program.buffer) + + # Get and save ETRecord + etrecord = et_program.get_etrecord() + etrecord.save(etr_file) + +export_int8_quantize_conv2d_model("qint8_conv2d_pqs8_qc8w_gemm"); + + +``` + +## Create a PointwiseConv2d benchmark model with KleidiAI + +In the following example model, you will use a simple model to generate pointwise Conv2d nodes that can be accelerated by KleidiAI. + +As before, input parameters can be adjusted to simulate real-world model behavior. 
+ + +``` python +import torch +import torch.nn as nn +class DemoConv2dModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.pointwiseconv = torch.nn.Conv2d(3, 2, 1,groups=1) + + def forward(self,x): + x = self.pointwiseconv(x) + return x + + def get_example_inputs(self,dtype=torch.float32): + return (torch.randn(1, 3, 16, 16, dtype=dtype),) + +``` + +The following code can be used to lower and export the model: + +```python +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType +from executorch.exir import to_edge_transform_and_lower + +def export_pointwise_model(model_name: str): + mode_file_name = "model/" + model_name + pte_file = mode_file_name + ".pte" + etr_file = mode_file_name + ".etrecord" + + model = DemoConv2dModel().eval().to(torch.float32) + example_inputs = model.get_example_inputs(torch.float32) + + exported_program = torch.export.export(model, example_inputs) + + partitioner = XnnpackPartitioner() + edge_program = to_edge_transform_and_lower( + exported_program, + partitioner=[partitioner], + generate_etrecord=True + ) + + et_program = edge_program.to_executorch() + with open(pte_file, "wb") as f: + f.write(et_program.buffer) + + # Get and save ETRecord + etrecord = et_program.get_etrecord() + etrecord.save(etr_file) + +export_pointwise_model("pointwise_conv2d_pf32_gemm") + +``` + +{{%notice Note%}} +When exporting models, the generate_etrecord option is enabled to produce the .etrecord file alongside the .pte model file. +These ETRecord files are essential for subsequent model analysis and performance evaluation. +{{%/notice%}} + + +## Run the benchmark model export script for ExecuTorch and KleidiAI +Rather than executing each block by hand, download and run the full export script. 
It will generate both Conv2d variants, run quantization (INT8) where applicable, partition to XNNPACK, lower, and export to ExecuTorch .pte together with .etrecord metadata. + +```bash +wget https://raw.githubusercontent.com/pareenaverma/arm-learning-paths/refs/heads/content_review/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-conv2d.py +chmod +x export-conv2d.py +python3 ./export-conv2d.py +``` +## Validate exported model files for ExecuTorch and KleidiAI + +After running this script, both the PTE model file and the etrecord file are generated. + +``` bash +$ ls model/ -1 +qint8_conv2d_pqs8_qc8w_gemm.etrecord +qint8_conv2d_pqs8_qc8w_gemm.pte +pointwise_conv2d_pf32_gemm.etrecord +pointwise_conv2d_pf32_gemm.pte +``` + diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/06-create-matrix-mul-model.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/06-create-matrix-mul-model.md new file mode 100644 index 0000000000..925d1fd415 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/06-create-matrix-mul-model.md @@ -0,0 +1,105 @@ +--- +title: Create matrix multiply layer benchmark model +weight: 7 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Learn how batch matrix multiply accelerates deep learning on Arm + +The batch matrix multiply operator (`torch.bmm`) is commonly used for efficient matrix operations in deep learning models. When running on Arm systems with XNNPACK, this operator is lowered to a general matrix multiplication (GEMM) implementation. If your input shapes and data types match supported patterns, XNNPACK can automatically dispatch these operations to KleidiAI micro-kernels, which are optimized for Arm hardware. 
+ +To compare the performance of different GEMM variants on various Arm platforms, you'll build a set of benchmark models. These models use the batch matrix multiply operator and allow you to evaluate how each GEMM implementation performs, helping you identify the best configuration for your workload. + + +## Define a matrix multiply benchmark model for KleidiAI and ExecuTorch + +The following example defines a simple model to generate nodes that can be accelerated by KleidiAI. + +By adjusting the input parameters, this model can also simulate the behavior of nodes commonly found in real-world models: + + +```python +class DemoBatchMatMulModel(nn.Module): + def forward(self, x,y): + return torch.bmm(x, y) + + def get_example_inputs(self,dtype=torch.float32): + return (torch.randn(1, 256, 256, dtype=dtype),torch.randn(1, 256, 256, dtype=dtype)) + +``` + +## Export FP16 and FP32 models for pf16_gemm and pf32_gemm variants + +| XNNPACK GEMM Variant | Input A DataType| Input B DataType |Output DataType | +| ------------------ | ---------------------------- | --------------------------------------- |--------------------------------------- | +| pf32_gemm | FP32 | FP32 | FP32 | +| pf16_gemm | FP16 | FP16 | FP16 | + +The following code snippet demonstrates how to lower the model that leverages the pf16_gemm and pf32_gemm variant to accelerate computation: + +``` python +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType +from executorch.exir import to_edge_transform_and_lower + +def export_mutrix_mul_model(dtype: torch.dtype, model_name: str): + mode_file_name = "model/" + model_name + pte_file = mode_file_name + ".pte" + etr_file = mode_file_name + ".etrecord" + + model = DemoBatchMatMulModel().eval().to(dtype) + example_inputs = model.get_example_inputs(dtype) + + exported_program = torch.export.export(model, example_inputs) + + partitioner = 
XnnpackPartitioner() + edge_program = to_edge_transform_and_lower( + exported_program, + partitioner=[partitioner], + generate_etrecord=True + ) + + et_program = edge_program.to_executorch() + with open(pte_file, "wb") as f: + f.write(et_program.buffer) + + # Get and save ETRecord + etrecord = et_program.get_etrecord() + etrecord.save(etr_file) + +export_mutrix_mul_model(torch.float16,"matrix_mul_pf16_gemm") +export_mutrix_mul_model(torch.float32,"matrix_mul_pf32_gemm") + +``` + +{{%notice Note%}} +When exporting models, the **generate_etrecord** option is enabled to produce the .etrecord file alongside the .pte model file. +These ETRecord files are essential for subsequent model analysis and performance evaluation. +{{%/notice%}} + +## Run the complete benchmark model script +Instead of executing each export block manually, you can download and run the full matrix-multiply benchmark script. +This script automatically builds and exports both FP16 and FP32 models, performing all necessary partitioning, lowering, and ETRecord generation: + +```bash +wget https://raw.githubusercontent.com/pareenaverma/arm-learning-paths/refs/heads/content_review/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-matrix-mul.py +chmod +x export-matrix-mul.py +python3 ./export-matrix-mul.py +``` + +## Verify the output + +After running this script, both the PTE model file and the etrecord file are generated. + +``` bash +$ ls model/ -1 +model/matrix_mul_pf16_gemm.etrecord +model/matrix_mul_pf16_gemm.pte +model/matrix_mul_pf32_gemm.etrecord +model/matrix_mul_pf32_gemm.pte +``` +These files are the inputs for upcoming executor_runner benchmarks, where you’ll measure and compare KleidiAI micro-kernel performance. + +The complete source code is available [here](../export-matrix-mul.py). 
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/07-run-model.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/07-run-model.md new file mode 100644 index 0000000000..63bde34ffc --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/07-run-model.md @@ -0,0 +1,57 @@ +--- +title: Run model and generate the ETDump +weight: 8 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Copy artifacts to your Arm64 target +From your x86_64 host (where you cross-compiled), copy the runner and exported models to the Arm device, replacing `user` and `target-ip` with the login name and address of your Arm64 device: + +```bash +scp $WORKSPACE/build-arm64/executor_runner user@target-ip:~/bench/ +scp -r model/ user@target-ip:~/bench/ +``` + +## Run a model and emit ETDump +Use one of the models you exported earlier (e.g., FP32 linear: linear_model_pf32_gemm.pte). +The flags below tell executor_runner where to write the ETDump and how many times to execute. + +```bash +cd ~/bench +./executor_runner -etdump_path model/linear_model_pf32_gemm.etdump -model_path model/linear_model_pf32_gemm.pte -num_executions=1 -cpu_threads 1 + +``` + +You can adjust the number of execution threads and the number of times the model is invoked. + + +You should see logs like: + +```output +D 00:00:00.015988 executorch:XNNPACKBackend.cpp:57] Creating XNN workspace +D 00:00:00.018719 executorch:XNNPACKBackend.cpp:69] Created XNN workspace: 0xaff21c2323e0 +D 00:00:00.027595 executorch:operator_registry.cpp:96] Successfully registered all kernels from shared library: NOT_SUPPORTED +I 00:00:00.035506 executorch:executor_runner.cpp:157] Resetting threadpool with num threads = 1 +I 00:00:00.048120 executorch:threadpool.cpp:48] Resetting threadpool to 1 threads. +I 00:00:00.051509 executorch:executor_runner.cpp:218] Model file model/linear_model_f32.pte is loaded. 
+I 00:00:00.051531 executorch:executor_runner.cpp:227] Using method forward +I 00:00:00.051541 executorch:executor_runner.cpp:278] Setting up planned buffer 0, size 2112. +D 00:00:00.051630 executorch:method.cpp:793] Loading method: forward. +.... + +D 00:00:00.091432 executorch:XNNExecutor.cpp:236] Resizing output tensor to a new shape +I 00:00:00.091459 executorch:executor_runner.cpp:340] Model executed successfully 1 time(s) in 2.904883 ms. +I 00:00:00.091477 executorch:executor_runner.cpp:349] 1 outputs: +OutputX 0: tensor(sizes=[1, 256], [ + 0.0106399, 0.0951964, 1.04854, -0.290168, -0.278126, -0.355151, 0.0583736, -0.431953, -0.0773305, -0.32844, + ..., + 0.553568, -0.0339369, 0.562088, -1.21021, -0.769254, 0.677771, -0.264338, 1.05453, 0.724467, 0.53182, +]) +I 00:00:00.093912 executorch:executor_runner.cpp:125] ETDump written to file 'model/linear_model_f32.etdump'. + +``` +If execution succeeds, an ETDump file is created next to your model. You will load the .etdump in the next section and analyze which operators dispatched to KleidiAI and how each micro-kernel performed. + + diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/08-analyze-etdump.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/08-analyze-etdump.md new file mode 100644 index 0000000000..2014a30dc2 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/08-analyze-etdump.md @@ -0,0 +1,68 @@ +--- +title: Analyze ETRecord and ETDump +weight: 9 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview + +In this section you will use the ExecuTorch Inspector to correlate runtime events from the .etdump with the lowered graph and backend mapping from the .etrecord. This lets you confirm that a node was delegated to XNNPACK and when eligible it was accelerated by KleidiAI micro-kernels. 
+ +The Inspector analyzes the runtime data from the ETDump file and maps it to the corresponding operators in the Edge Dialect Graph. + +## Analyze ETDump and ETRecord files with the Inspector script + +Save the following code in a file named `inspect.py` and run it with the path to a .pte model. The script auto-derives .etrecord, .etdump, and an output .csv next to it. + +```python + +import os +import sys +from executorch.devtools.inspector import Inspector + +if len(sys.argv) < 2: + print(f"Usage: python {sys.argv[0]} ") + sys.exit(1) + +pte_file = sys.argv[1] + +base = os.path.splitext(pte_file)[0] + +etrecord = f"{base}.etrecord" +etdump = f"{base}.etdump" +csvfile = f"{base}.csv" + +ins = Inspector(etrecord=etrecord, etdump_path=etdump) +ins.print_data_tabular(include_delegate_debug_data=True, include_units=False) + +with open(csvfile, "w", encoding="utf-8") as f: + ins.save_data_to_tsv(f) + +``` + +## Run the Inspector script and review performance results + +Run the script, for example with the linear_model_pf32_gemm.pte model : + +```bash +python3 inspect.py model/linear_model_pf32_gemm.pte +``` + +Next, you can examine the generated CSV file to view the execution time information for each node in the model. + +Below is an example showing the runtime data corresponding to the Fully Connected node. 
+ + +| event_block_name | event_name | p10 (ms) | p50 (ms) | p90 (ms) | avg (ms) | min (ms) | max (ms) | op_types | is_delegated_op | delegate_backend_name | +|-----------------|--------------------------------|----------------------|----------------------|----------------------|----------------------|----------------------|----------------------|--------------------------|----------------|---------------------| +| Default | Method::init | 33.277046 | 33.277046 | 33.277046 | 33.277046 | 33.277046 | 33.277046 | [] | FALSE | | +| Default | Program::load_method | 33.300006 | 33.300006 | 33.300006 | 33.300006 | 33.300006 | 33.300006 | [] | FALSE | | +| Execute | Fully Connected (NC, F32) GEMM #1 | 0.0160000000000196 | 0.0180000000000007 | 0.0190000000000055 | 0.0187449000000005 | 0.0149999999999864 | 4.244 | [] | TRUE | XnnpackBackend | +| Execute | DELEGATE_CALL | 0.04136 | 0.04464 | 0.04792 | 0.046082053 | 0.03372 | 4.390585 | ['aten.linear.default'] | FALSE | XnnpackBackend | +| Execute | Method::execute | 0.04848 | 0.0525595 | 0.05756 | 0.0540658046 | 0.03944 | 4.404385 | [] | FALSE | | + +You can now iterate over FP32 vs FP16 vs INT8 vs INT4 models, confirm the exact GEMM variant used, and quantify the latency savings attributable to KleidiAI micro-kernels on your Arm device. + +You can experiment with different models and matrix sizes to analyze various performance results. 
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_index.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_index.md new file mode 100644 index 0000000000..5cbda27058 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_index.md @@ -0,0 +1,49 @@ +--- +title: Benchmark a KleidiAI micro-kernel in ExecuTorch + +minutes_to_complete: 30 + +who_is_this_for: This is an advanced topic for developers, performance engineers, and ML framework contributors who want to benchmark and optimize KleidiAI micro-kernels within ExecuTorch to accelerate model inference on Arm64 platforms supporting SME/SME2 instructions. + +learning_objectives: + - Cross-compile ExecuTorch for Arm64 with XNNPACK and KleidiAI enabled, including SME/SME2 instructions + - Build and export ExecuTorch models that can be accelerated by KleidiAI using SME/SME2 instructions + - Use the executor_runner tool to run kernel workloads and collect ETDump profiling data. + - Inspect and analyze ETRecord and ETDump files using the ExecuTorch Inspector API to understand kernel-level performance behavior. 
+ +prerequisites: + - An x86_64 Linux host machine running Ubuntu, with at least 15 GB of free disk space + - An Arm64 target system with support for SME or SME2 - see the Learning Path [Devices with native SME2 support](https://learn.arm.com/learning-paths/cross-platform/multiplying-matrices-with-sme2/1-get-started/#devices-with-native-sme2-support) + +author: Qixiang Xu + +### Tags +skilllevels: Advanced +subjects: ML +armips: + - Cortex-A + +tools_software_languages: + - Python + - ExecuTorch + - XNNPACK + - KleidiAI + +operatingsystems: + - Linux + + +further_reading: + - resource: + title: Executorch User Guide + link: https://docs.pytorch.org/executorch/stable/intro-section.html + type: documentation + + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_next-steps.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. 
+layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-conv2d.py b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-conv2d.py new file mode 100644 index 0000000000..0e1765436f --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-conv2d.py @@ -0,0 +1,107 @@ + +import torch +import torch.nn as nn +class DemoConv2dModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.bitwiseconv = torch.nn.Conv2d(3, 2, 1,groups=1) + + def forward(self,x): + x = self.bitwiseconv(x) + return x + + def get_example_inputs(self,dtype=torch.float32): + return (torch.randn(1, 3, 16, 16, dtype=dtype),) + +class DemoQInt8Conv2dModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 6, 3) + + def forward(self,x): + x = self.conv(x) + return x + + def get_example_inputs(self,dtype=torch.float32): + return (torch.randn(1, 3, 16, 16, dtype=dtype),) + + +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType +from executorch.exir import to_edge_transform_and_lower +from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e +from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import ( + get_symmetric_quantization_config, + XNNPACKQuantizer, +) + +def export_int8_quantize_conv2d_model(model_name: str): + mode_file_name = "model/" + model_name + pte_file = mode_file_name + ".pte" + etr_file = mode_file_name + ".etrecord" + + model = DemoQInt8Conv2dModel().eval().to(torch.float32) + example_inputs = model.get_example_inputs(torch.float32) + + #Quantizer model + model = 
torch.export.export(model, example_inputs).module() + quantizer = XNNPACKQuantizer() + operator_config = get_symmetric_quantization_config( + is_per_channel=False, + is_dynamic=False + ) + + quantizer.set_global(operator_config) + quantize_model = prepare_pt2e(model, quantizer) + quantize_model(*example_inputs) + quantize_model = convert_pt2e(quantize_model) + + #export model + exported_program = torch.export.export(quantize_model, example_inputs) + + partitioner = XnnpackPartitioner() + edge_program = to_edge_transform_and_lower( + exported_program, + partitioner=[partitioner], + generate_etrecord=True + ) + + et_program = edge_program.to_executorch() + with open(pte_file, "wb") as f: + f.write(et_program.buffer) + + # Get and save ETRecord + etrecord = et_program.get_etrecord() + etrecord.save(etr_file) + +export_int8_quantize_conv2d_model("qint8_conv2d_pqs8_qc8w_gemm"); + + + +def export_pointwise_model(model_name: str): + mode_file_name = "model/" + model_name + pte_file = mode_file_name + ".pte" + etr_file = mode_file_name + ".etrecord" + + model = DemoConv2dModel().eval().to(torch.float32) + example_inputs = model.get_example_inputs(torch.float32) + + exported_program = torch.export.export(model, example_inputs) + + partitioner = XnnpackPartitioner() + edge_program = to_edge_transform_and_lower( + exported_program, + partitioner=[partitioner], + generate_etrecord=True + ) + + et_program = edge_program.to_executorch() + with open(pte_file, "wb") as f: + f.write(et_program.buffer) + + # Get and save ETRecord + etrecord = et_program.get_etrecord() + etrecord.save(etr_file) + + +export_pointwise_model("pointwise_conv2d_pf32_gemm") diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-linear-model.py b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-linear-model.py new file mode 100644 index 0000000000..0f78dab2cc --- /dev/null +++ 
b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-linear-model.py @@ -0,0 +1,145 @@ + +import torch +import torch.nn as nn + +class DemoLinearModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(256,256) + + def forward(self, x): + y = self.linear(x) + return y + + def get_example_inputs(self,dtype=torch.float32): + return (torch.randn(1, 256, dtype=dtype),) + + +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType +from executorch.exir import to_edge_transform_and_lower + +def export_executorch_model(dtype: torch.dtype, model_name: str): + mode_file_name = "model/" + model_name + pte_file = mode_file_name + ".pte" + etr_file = mode_file_name + ".etrecord" + + model = DemoLinearModel().eval().to(dtype) + example_inputs = model.get_example_inputs(dtype) + + exported_program = torch.export.export(model, example_inputs) + + partitioner = XnnpackPartitioner() + edge_program = to_edge_transform_and_lower( + exported_program, + partitioner=[partitioner], + generate_etrecord=True + ) + + et_program = edge_program.to_executorch() + with open(pte_file, "wb") as f: + f.write(et_program.buffer) + + # Get and save ETRecord + etrecord = et_program.get_etrecord() + etrecord.save(etr_file) + +export_executorch_model(torch.float16,"linear_model_pf16_gemm") +export_executorch_model(torch.float32,"linear_model_pf32_gemm") + + +from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e +from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import ( + get_symmetric_quantization_config, + XNNPACKQuantizer, +) + +def export_int8_quantize_model(dynamic: bool, model_name: str): + mode_file_name = "model/" + model_name + pte_file = mode_file_name + ".pte" + etr_file = mode_file_name + ".etrecord" + + model = 
DemoLinearModel().eval().to(torch.float32) + example_inputs = model.get_example_inputs(torch.float32) + + #Quantizer model + model = torch.export.export(model, example_inputs).module() + quantizer = XNNPACKQuantizer() + operator_config = get_symmetric_quantization_config( + is_per_channel=True, + is_dynamic=dynamic + ) + + quantizer.set_global(operator_config) + quantize_model = prepare_pt2e(model, quantizer) + quantize_model(*example_inputs) + quantize_model = convert_pt2e(quantize_model) + + #export model + exported_program = torch.export.export(quantize_model, example_inputs) + + partitioner = XnnpackPartitioner() + edge_program = to_edge_transform_and_lower( + exported_program, + partitioner=[partitioner], + generate_etrecord=True + ) + + et_program = edge_program.to_executorch() + with open(pte_file, "wb") as f: + f.write(et_program.buffer) + + # Get and save ETRecord + etrecord = et_program.get_etrecord() + etrecord.save(etr_file) + +export_int8_quantize_model(False,"linear_model_pqs8_qc8w_gemm"); +export_int8_quantize_model(True,"linear_model_qp8_f32_qc8w_gemm"); + + +from torchao.quantization.granularity import PerGroup, PerAxis +from torchao.quantization.quant_api import ( + IntxWeightOnlyConfig, + Int8DynamicActivationIntxWeightConfig, + quantize_, +) + +def export_int4_quantize_model(dynamic: bool, model_name: str): + mode_file_name = "model/" + model_name + pte_file = mode_file_name + ".pte" + etr_file = mode_file_name + ".etrecord" + + model = DemoLinearModel().eval().to(torch.float32) + example_inputs = model.get_example_inputs(torch.float32) + + #Quantizer model + + linear_config = Int8DynamicActivationIntxWeightConfig( + weight_dtype=torch.int4, + weight_granularity=PerGroup(32), + ) + + quantize_(model, linear_config) + + #export model + exported_program = torch.export.export(model, example_inputs) + + partitioner = XnnpackPartitioner() + edge_program = to_edge_transform_and_lower( + exported_program, + partitioner=[partitioner], + 
generate_etrecord=True + ) + + et_program = edge_program.to_executorch() + with open(pte_file, "wb") as f: + f.write(et_program.buffer) + + # Get and save ETRecord + etrecord = et_program.get_etrecord() + etrecord.save(etr_file) + +export_int4_quantize_model(False,"linear_model_qp8_f32_qb4w_gemm"); + + + diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-matrix-mul.py b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-matrix-mul.py new file mode 100644 index 0000000000..19eab1b356 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-matrix-mul.py @@ -0,0 +1,44 @@ + +import torch +import torch.nn as nn + +class DemoBatchMatMulModel(nn.Module): + def forward(self, x,y): + return torch.bmm(x, y) + + def get_example_inputs(self,dtype=torch.float32): + return (torch.randn(1, 256, 256, dtype=dtype),torch.randn(1, 256, 256, dtype=dtype)) + + +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType +from executorch.exir import to_edge_transform_and_lower + +def export_mutrix_mul_model(dtype: torch.dtype, model_name: str): + mode_file_name = "model/" + model_name + pte_file = mode_file_name + ".pte" + etr_file = mode_file_name + ".etrecord" + + model = DemoBatchMatMulModel().eval().to(dtype) + example_inputs = model.get_example_inputs(dtype) + + exported_program = torch.export.export(model, example_inputs) + + partitioner = XnnpackPartitioner() + edge_program = to_edge_transform_and_lower( + exported_program, + partitioner=[partitioner], + generate_etrecord=True + ) + + et_program = edge_program.to_executorch() + with open(pte_file, "wb") as f: + f.write(et_program.buffer) + + # Get and save ETRecord + etrecord = et_program.get_etrecord() + 
etrecord.save(etr_file) + +export_mutrix_mul_model(torch.float16,"matrix_mul_pf16_gemm") +export_mutrix_mul_model(torch.float32,"matrix_mul_pf32_gemm") + diff --git a/content/learning-paths/servers-and-cloud-computing/_index.md b/content/learning-paths/servers-and-cloud-computing/_index.md index 1a686b2b6d..28a0501e3a 100644 --- a/content/learning-paths/servers-and-cloud-computing/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/_index.md @@ -8,7 +8,7 @@ key_ip: maintopic: true operatingsystems_filter: - Android: 3 -- Linux: 191 +- Linux: 197 - macOS: 13 - Windows: 14 pinned_modules: @@ -18,14 +18,14 @@ pinned_modules: - providers - migration subjects_filter: -- CI-CD: 9 +- CI-CD: 10 - Containers and Virtualization: 34 -- Databases: 20 +- Databases: 21 - Libraries: 9 -- ML: 32 -- Performance and Architecture: 73 +- ML: 34 +- Performance and Architecture: 74 - Storage: 2 -- Web: 16 +- Web: 17 subtitle: Optimize cloud native apps on Arm for performance and cost title: Servers and Cloud Computing tools_software_languages_filter: @@ -37,6 +37,7 @@ tools_software_languages_filter: - Android Studio: 1 - Ansible: 2 - Apache: 1 +- Apache Bench: 1 - Apache Cassandra: 1 - Apache Spark: 2 - Apache Tomcat: 2 @@ -45,8 +46,7 @@ tools_software_languages_filter: - Arm Development Studio: 3 - Arm ISA: 1 - Arm Performance Libraries: 1 -- Arm Performance Studio: 1 -- Arm Streamline: 2 +- Arm Streamline: 1 - armclang: 1 - armie: 1 - ArmRAL: 1 @@ -63,10 +63,12 @@ tools_software_languages_filter: - AWS Elastic Kubernetes Service (EKS): 3 - AWS Graviton: 1 - AWS Lambda: 1 +- Azure: 1 - Azure CLI: 2 - Azure Portal: 1 -- bash: 2 - Bash: 1 +- bash: 2 +- Bash/Shell scripting: 1 - Bastion: 3 - BOLT: 2 - bpftool: 1 @@ -75,9 +77,10 @@ tools_software_languages_filter: - C#: 2 - C++: 12 - Capstone: 1 +- Cargo: 1 - cassandra-stress: 1 - CCA: 8 -- CircleCI: 1 +- CircleCI: 2 - Clair: 1 - Clang: 13 - ClickBench: 1 @@ -85,14 +88,17 @@ tools_software_languages_filter: - Cloud Build: 1 
- CMake: 1 - conda: 1 +- Couchbase: 1 - cqlsh: 1 +- Criterion: 1 - Daytona: 1 - Demo: 3 -- Django: 1 +- Django: 2 - Docker: 25 - Docker Buildx: 1 - Envoy: 3 - ExecuTorch: 1 +- Facter: 1 - FAISS: 1 - FlameGraph: 1 - Flink: 2 @@ -102,7 +108,8 @@ tools_software_languages_filter: - GCC: 25 - gdb: 1 - Geekbench: 1 -- Generative AI: 12 +- Generative AI: 13 +- Git: 1 - GitHub: 6 - GitHub Actions: 1 - GitHub CLI: 1 @@ -115,10 +122,12 @@ tools_software_languages_filter: - Google Benchmark: 1 - Google Cloud: 2 - Google Test: 1 +- Gunicorn: 1 - HammerDB: 1 - Helm: 1 - Herd7: 1 -- Hugging Face: 11 +- Hiera: 1 +- Hugging Face: 12 - InnoDB: 1 - Intrinsics: 1 - iPerf3: 1 @@ -127,21 +136,19 @@ tools_software_languages_filter: - JAX: 1 - JMH: 1 - Kafka: 2 -- kafka-consumer-perf-test.sh: 1 -- kafka-producer-perf-test.sh: 1 - KEDA: 1 - Kedify: 1 -- Keras: 1 -- Kubernetes: 12 +- Keras: 2 +- Kubernetes: 13 - Libamath: 1 - libbpf: 1 - Linaro Forge: 1 -- Linux kernel: 1 - Litmus7: 1 -- Llama.cpp: 2 - llama.cpp: 1 -- LLM: 10 +- Llama.cpp: 2 +- LLM: 11 - llvm-mca: 1 +- LM Evaluation Harness: 1 - LSE: 1 - MariaDB: 1 - Maven: 1 @@ -156,8 +163,8 @@ tools_software_languages_filter: - NEON: 7 - Networking: 1 - Nexmark: 1 -- NGINX: 4 - nginx: 1 +- NGINX: 4 - Node.js: 5 - node.js: 1 - npm: 3 @@ -171,22 +178,22 @@ tools_software_languages_filter: - PAPI: 1 - perf: 6 - Perf: 1 -- Performance analysis: 1 - PHP: 1 - PHPBench: 1 - PostgreSQL: 5 - Profiling: 1 -- Python: 32 -- PyTorch: 9 +- Puppet: 1 +- Python: 35 +- PyTorch: 10 - QEMU: 1 - RAG: 1 - Rails: 1 - Redis: 3 - Remote.It: 2 - RME: 8 -- Ruby: 1 +- Ruby: 2 - Runbook: 71 -- Rust: 2 +- Rust: 3 - Service Mesh: 1 - Siege: 1 - Skaffold: 1 @@ -201,7 +208,7 @@ tools_software_languages_filter: - Sysbench: 1 - Tekton: 1 - Telemetry: 1 -- TensorFlow: 2 +- TensorFlow: 3 - Terraform: 11 - ThirdAI: 1 - topdown-tool: 1 @@ -212,7 +219,7 @@ tools_software_languages_filter: - Vectorscan: 1 - Veraison: 2 - Visual Studio Code: 5 -- vLLM: 2 +- vLLM: 3 - vvenc: 1 - 
Web Server: 1 - Whisper: 1 @@ -226,8 +233,8 @@ tools_software_languages_filter: - ZooKeeper: 1 weight: 1 cloud_service_providers_filter: -- AWS: 17 -- Google Cloud: 26 +- AWS: 18 +- Google Cloud: 31 - Microsoft Azure: 19 - Oracle: 2 --- diff --git a/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/_index.md b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/_index.md new file mode 100644 index 0000000000..cdec577f5b --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/_index.md @@ -0,0 +1,72 @@ +--- +title: CircleCI Arm Native Workflows on AWS Graviton2 (EC2) + +minutes_to_complete: 45 + +draft: true +cascade: + draft: true + +who_is_this_for: This learning path is intended for software developers and DevOps engineers looking to set up and run CircleCI Arm native workflows on Linux Arm64 VMs, specifically on AWS EC2 Graviton2 instances (Neoverse N1), using self-hosted runners. + +learning_objectives: + - Provision an AWS EC2 Graviton2 Arm64 virtual machine + - Install and configure CircleCI self-hosted machine runners on Arm64 + - Verify the runner by running a simple workflow and test computation + - Define and execute CircleCI job using a machine executor + - Check CPU architecture and execute a basic script to confirm if the runner is operational + - Display CPU information and validate outputs from the sample computation + +prerequisites: + - An [AWS account](https://aws.amazon.com/free/) with billing enabled + - Basic familiarity with Linux command line + - Basic understanding of CircleCI concepts such as + [workflows](https://circleci.com/docs/guides/orchestrate/workflows/), + [jobs](https://circleci.com/docs/guides/orchestrate/jobs-steps/), + [resource classes](https://circleci.com/docs/guides/execution-managed/resource-class-overview/), and + [runners](https://circleci.com/docs/guides/execution-runner/runner-overview/) + + +author: Pareena Verma + +##### Tags +skilllevels: Introductory 
+subjects: CI-CD +cloud_service_providers: AWS + +armips: + - Neoverse + +tools_software_languages: + - CircleCI + - Bash/Shell scripting + - Git + + +operatingsystems: + - Linux + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +further_reading: + - resource: + title: AWS EC2 Documentation + link: https://docs.aws.amazon.com/ec2/index.html + type: documentation + + - resource: + title: CircleCI Self-Hosted Runner Documentation + link: https://circleci.com/docs/guides/execution-runner/install-machine-runner-3-on-linux/ + type: documentation + + - resource: + title: CircleCI CLI Documentation + link: https://circleci.com/docs/guides/toolkit/local-cli/ + type: documentation + + +weight: 1 +layout: "learningpathall" +learning_path_main_page: "yes" +--- diff --git a/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. 
+--- diff --git a/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/background.md b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/background.md new file mode 100644 index 0000000000..e4836e896c --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/background.md @@ -0,0 +1,27 @@ +--- +title: Getting Started with CircleCI on AWS Graviton2 (Arm Neoverse-N1) + +weight: 2 + +layout: "learningpathall" +--- + + +## AWS Graviton2 Arm Instances on Amazon EC2 + +**AWS Graviton2** is a family of Arm-based processors designed by AWS and built on **Arm Neoverse-N1 cores**. These instances deliver exceptional price-to-performance efficiency, making them ideal for compute-intensive workloads such as CI/CD pipelines, microservices, containerized applications, and data processing tasks. + +Graviton2-powered EC2 instances provide high performance and energy efficiency compared to traditional x86-based instances while maintaining compatibility with popular Linux distributions and open-source software stacks. + +To learn more about AWS Graviton processors, refer to the [AWS Graviton2 Processor Overview](https://aws.amazon.com/ec2/graviton/). + +## CircleCI + +**CircleCI** is a leading cloud-based **Continuous Integration and Continuous Delivery (CI/CD)** platform that automates the **building, testing, and deployment** of software projects. + +It seamlessly integrates with popular version control systems such as **GitHub**, **Bitbucket**, and **GitLab**, allowing developers to define automation workflows through a `.circleci/config.yml` file written in **YAML syntax**. + +CircleCI supports multiple execution environments, including **Docker**, **Linux**, **macOS**, and **Windows**, while providing advanced capabilities like **parallel job execution**, **build caching**, and **matrix builds** for optimized performance. 
+ +It is widely adopted by development teams to **accelerate build cycles, enforce code quality, automate testing, and streamline application delivery**. +To learn more, visit the [official CircleCI website](https://circleci.com/) and explore its [documentation](https://circleci.com/docs/). diff --git a/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/circleci-runner-installation.md b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/circleci-runner-installation.md new file mode 100644 index 0000000000..ad04b62737 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/circleci-runner-installation.md @@ -0,0 +1,93 @@ +--- +title: Install CircleCI Machine Runner on AWS Graviton2 +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Install CircleCI Machine Runner on AWS Graviton2 + +This guide provides step-by-step instructions to install and configure the **CircleCI Machine Runner** on an **AWS Graviton2 (Neoverse N1) instance**. +With this setup, your self-hosted **Arm64 environment** can efficiently execute CircleCI jobs directly on the Graviton2 architecture, enabling faster builds and improved performance for Arm-based workloads. + + +### Add CircleCI Package Repository +For **Debian/Ubuntu-based systems** running on **AWS Graviton2 (Arm64)**, first add the official CircleCI repository. +This ensures you can install the CircleCI Runner package directly using `apt`. + +```console +curl -s https://packagecloud.io/install/repositories/circleci/runner/script.deb.sh?any=true | sudo bash +``` + +- The `curl` command downloads and executes the repository setup script from CircleCI's official package server. +- It configures the repository on your system, allowing `apt` to fetch and install the CircleCI runner package. +- After successful execution, the CircleCI repository will be added under `/etc/apt/sources.list.d/`. 
### Obtain the Runner Token
+- Each self-hosted runner requires a unique authentication token generated from your Resource Class in the CircleCI Dashboard.
+- Copy the token from the CircleCI web interface and keep it available — you will apply it to the runner configuration file after installing the runner package in the next step.
+ +```output +● circleci-runner.service - Run the CircleCI self-hosted runner agent + Loaded: loaded (/usr/lib/systemd/system/circleci-runner.service; enabled; preset: enabled) + Active: active (running) since Fri 2025-10-17 05:33:20 UTC; 51min ago + Main PID: 2226 (circleci-runner) + Tasks: 9 (limit: 18717) + Memory: 53.0M (peak: 66.9M) + CPU: 1.249s + CGroup: /system.slice/circleci-runner.service + └─2226 /usr/bin/circleci-runner machine -c /etc/circleci-runner/circleci-runner-config.yaml + +Oct 17 05:41:28 ip-172-31-34-224 circleci-runner[2226]: 05:41:28 7b67e 24.210ms worker loop: claim: app.backoff_ms=5000 app.> +Oct 17 06:00:08 ip-172-31-34-224 circleci-runner[2226]: 06:00:08 a8093 22.942ms POST /api/v3/runner/claim app.loop_name=claim> +Oct 17 06:00:08 ip-172-31-34-224 circleci-runner[2226]: 06:00:08 a8093 23.028ms claim app.loop_name=claim: mode=agent result> +Oct 17 06:00:08 ip-172-31-34-224 circleci-runner[2226]: 06:00:08 a8093 23.064ms worker loop: claim: app.backoff_ms=5000 app.> +Oct 17 06:04:49 ip-172-31-34-224 circleci-runner[2226]: 06:04:49 73039 19.847ms POST /api/v3/runner/claim app.loop_name=claim> +Oct 17 06:04:49 ip-172-31-34-224 circleci-runner[2226]: 06:04:49 73039 19.936ms claim app.loop_name=claim: mode=agent result> +Oct 17 06:04:49 ip-172-31-34-224 circleci-runner[2226]: 06:04:49 73039 19.971ms worker loop: claim: app.backoff_ms=5000 app.> +Oct 17 06:19:13 ip-172-31-34-224 circleci-runner[2226]: 06:19:13 c34c1 22.392ms POST /api/v3/runner/claim app.loop_name=claim> +Oct 17 06:19:13 ip-172-31-34-224 circleci-runner[2226]: 06:19:13 c34c1 22.479ms claim app.loop_name=claim: mode=agent result> +Oct 17 06:19:13 ip-172-31-34-224 circleci-runner[2226]: 06:19:13 c34c1 22.514ms worker loop: claim: app.backoff_ms=5000 app.> +``` + +This confirms that the CircleCI Runner is actively connected to your CircleCI account and ready to accept jobs. 
You can also verify it from the dashboard:
+ +### Verify the Installation + +To ensure that the CLI is installed successfully, check its version: + +```console +circleci version +``` +You should see an output similar to: + +```output +0.1.33494+7cc6570 (release) +``` + +If this version number appears, the CircleCI CLI installation on your AWS Graviton2 instance was successful! diff --git a/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/aws1.png b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/aws1.png new file mode 100644 index 0000000000..04112848b0 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/aws1.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/aws2.png b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/aws2.png new file mode 100644 index 0000000000..776b35417d Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/aws2.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/aws3.png b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/aws3.png new file mode 100644 index 0000000000..07d0824930 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/aws3.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/computation.png b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/computation.png new file mode 100644 index 0000000000..9c5bd34391 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/computation.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/runner.png b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/runner.png new file mode 100644 index 0000000000..c14df5faa9 Binary 
files /dev/null and b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/runner.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/runnerv1.png b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/runnerv1.png new file mode 100644 index 0000000000..511cfa241c Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/runnerv1.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/shrunner0.png b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/shrunner0.png new file mode 100644 index 0000000000..927a193982 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/shrunner0.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/shrunner1.png b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/shrunner1.png new file mode 100644 index 0000000000..bb587d2723 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/shrunner1.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/shrunner2.png b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/shrunner2.png new file mode 100644 index 0000000000..f85cc7a55a Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/shrunner2.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/shrunner3.png b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/shrunner3.png new file mode 100644 index 0000000000..b1362dab6c Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/images/shrunner3.png differ diff --git 
a/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/instance.md b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/instance.md new file mode 100644 index 0000000000..cf6bf669d9 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/instance.md @@ -0,0 +1,38 @@ +--- +title: Create an AWS EC2 Arm64 Graviton2 Instance +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview + +In this section, you will learn how to provision an **AWS Graviton2 Arm64 EC2 instance** on **Amazon Web Services (AWS)** using the **m6g.xlarge** instance type (2 vCPUs, 8 GB memory) in the **AWS Management Console**. + +{{% notice Note %}} +For support on AWS setup, see the Learning Path [Getting started with AWS](https://learn.arm.com/learning-paths/servers-and-cloud-computing/csp/aws/). +{{% /notice %}} + +## Provision an AWS EC2 Arm64 Graviton2 Instance in the AWS Management Console + +To create a virtual machine based on the AWS Graviton2 Instance type: +- Navigate to the [AWS Management Console](https://aws.amazon.com/console/). +- Go to **EC2 > Instances** and select **Launch Instance**. +- Under **Instance configuration**: + - Enter an appropriate **Instance name**. + - Choose an **Amazon Machine Image (AMI)** such as **Ubuntu 24.04 ARM64**. + + ![AWS Management Console alt-text#center](images/aws1.png "Figure 1: Amazon Machine Image (AMI)") + + - Under **Instance type**, select a Graviton2-based type `m6g.xlarge`. + + ![AWS Management Console alt-text#center](images/aws2.png "Figure 2: Instance type") + + - Configure your **Key pair (login)** by either creating a new key pair or selecting an existing one to securely access your instance. + - In **Network settings**, ensure that **Allow HTTP traffic from the internet** and **Allow HTTPS traffic from the internet** are checked. 
+ + ![AWS Management Console alt-text#center](images/aws3.png "Figure 3: Network settings") + + - Adjust **Storage** settings as needed — for most setups, 30 GB of gp3 (SSD) storage is sufficient. + - Click **Launch Instance** to create your EC2 virtual machine. diff --git a/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/resource-class.md b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/resource-class.md new file mode 100644 index 0000000000..3e02ea50f4 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/resource-class.md @@ -0,0 +1,40 @@ +--- +title: Create Resource Class in CircleCI +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Create a Resource Class for Self-Hosted Runner in CircleCI +This guide describes creating a **Resource Class** in the **CircleCI Web Dashboard** for a **self-hosted runner**. +A Resource Class uniquely identifies the runner and links it to your CircleCI namespace, enabling jobs to run on your custom machine environment. + +### Steps + +1. **Go to the CircleCI Web Dashboard** + - From the left sidebar, navigate to **Self-Hosted Runners**. + - You’ll see a screen asking you to accept the **terms of use**. + - **Check the box** that says **“Yes, I agree to the terms”** to enable runners. + - Then click **Self-Hosted Runners** to continue setup. + +![Self-Hosted Runners alt-text#center](images/shrunner0.png "Figure 1: Self-Hosted Runners ") + +2. **Create a New Resource Class** + - Click **Create Resource Class**. + +![Self-Hosted Runners alt-text#center](images/shrunner1.png "Figure 2: Create Resource Class ") + +3. **Fill in the Details** + - **Namespace:** Your CircleCI username or organization (e.g., `circleci`) + - **Resource Class Name:** A descriptive name for your runner, such as `arm64` + +![Self-Hosted Runners alt-text#center](images/shrunner2.png "Figure 3: Details Resource Class & Namespace") + +4. 
**Save and Copy the Token** + - Once created, CircleCI will generate a **Resource Class Token**. + - Copy this token and store it securely — you will need it to register your runner on the AWS Arm VM. + +![Self-Hosted Runners alt-text#center](images/shrunner3.png "Figure 4: Resource Class Token") + +With your Resource Class and token ready, proceed to the next section to set up the CircleCI self-hosted runner. diff --git a/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/validation.md b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/validation.md new file mode 100644 index 0000000000..385df18f9f --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/circleci-on-aws/validation.md @@ -0,0 +1,109 @@ +--- +title: Verify CircleCI Arm64 Self-Hosted Runner +weight: 7 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Verify CircleCI Arm64 Self-Hosted Runner + +This guide demonstrates validating your **self-hosted CircleCI runner** on an **Arm64 machine** by executing a simple workflow and a test computation. This ensures your runner is correctly configured and ready to process jobs. + +### Create a Test Repository +Start by creating a GitHub repository dedicated to verifying your Arm64 runner: + +```console +git clone https://github.com/ hello.sh +chmod +x hello.sh +``` + +### Define the CircleCI Configuration +Create a `.circleci/config.yml` file to define the workflow that will run on your Arm64 runner: + +```yaml +version: 2.1 + +jobs: + test-Arm64: + machine: + enabled: true + resource_class: your-namespace/Arm64-linux # Replace with your actual resource class + steps: + - checkout + - run: + name: Verify Arm64 Runner + command: | + uname -m + lscpu | grep Architecture + ./hello.sh + - run: + name: Run sample computation + command: | + echo "Performing test on Arm64 runner" + echo "CPU Info:" + lscpu + echo "Success!" 
+ +workflows: + test-workflow: + jobs: + - test-Arm64 +``` +- Defines a single job `test-Arm64` using a machine executor on a self-hosted Arm64 runner. +- Checks CPU architecture with `uname -m` and `lscpu` to verify the runner. +- Executes a simple script `hello.sh` to confirm the runner can run commands. +- Runs a sample computation step to display CPU info and print. + +### Commit and Push to GitHub +Once all files you created (`hello.sh`, `.circleci/config.yml`) are ready, push your project to GitHub so CircleCI can build and verify the Arm64 runner automatically. + +```console +git add . +git commit -m "Initial CircleCI Arm64 test" +git branch -M main +git push -u origin main +``` + +- **Add Changes**: Stage all modified and new files using `git add .`. +- **Commit Changes**: Commit the staged files with a descriptive message. +- **Set Main Branch**: Rename the current branch to `main`. +- **Add Remote Repository**: Link your local repository to GitHub. +- **Push Changes**: Push the committed changes to the `main` branch on GitHub. + +### Start CircleCI Runner and Execute Job +Ensure that your CircleCI runner is enabled and started. This will allow your self-hosted runner to pick up jobs from CircleCI. + +```console +sudo systemctl enable circleci-runner +sudo systemctl start circleci-runner +sudo systemctl status circleci-runner +``` +- **Enable CircleCI Runner**: Ensure the CircleCI runner is set to start automatically on boot. +- **Start and Check Status**: Start the CircleCI runner and verify it is running. + +After pushing your code to GitHub, open your **CircleCI Dashboard → Projects**, and confirm that your **test-Arm64 workflow** starts running using your **self-hosted runner**. + +If the setup is correct, you’ll see your job running under the resource class you created. 
+ +### Output +Once the job starts running, CircleCI will: + +- Verify Arm64 Runner: + + ![Self-Hosted Runners alt-text#center](images/runnerv1.png "Figure 1: Self-Hosted Runners ") + +- Run sample computation: + + ![Self-Hosted Runners alt-text#center](images/computation.png "Figure 1: Self-Hosted Runners ") + +All CircleCI jobs have run successfully, the sample computation completed, and all outputs are visible in the CircleCI Dashboard. diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/_index.md b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/_index.md new file mode 100644 index 0000000000..f70f62ef38 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/_index.md @@ -0,0 +1,55 @@ +--- +title: Deploy Couchbase on Google Cloud C4A (Arm-based Axion VMs) + +draft: true +cascade: + draft: true + +minutes_to_complete: 30 + +who_is_this_for: This learning path is intended for software developers deploying and optimizing Couchbase workloads on Arm Linux environments, specifically using Google Cloud C4A virtual machines powered by Axion processors. 
+ +learning_objectives: + - Provision an Arm-based SUSE SLES virtual machine on Google Cloud (C4A with Axion processors) + - Install Couchbase Server on the SUSE Arm64 (C4A) instance + - Verify Couchbase deployment by accessing the Web Console, creating a test bucket, and confirming cluster health + - Benchmark Couchbase by measuring operations per second (ops/sec), memory utilization, and disk performance on the Arm platform + +prerequisites: + - A [Google Cloud Platform (GCP)](https://cloud.google.com/free) account with billing enabled + - Basic familiarity with [Couchbase](https://www.couchbase.com/) + +author: Pareena Verma + +##### Tags +skilllevels: Introductory +subjects: Databases +cloud_service_providers: Google Cloud + +armips: + - Neoverse + +tools_software_languages: + - Couchbase + +operatingsystems: + - Linux + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +further_reading: + - resource: + title: Google Cloud documentation + link: https://cloud.google.com/docs + type: documentation + + - resource: + title: Couchbase documentation + link: https://docs.couchbase.com/home/index.html + type: documentation + +weight: 1 +layout: "learningpathall" +learning_path_main_page: "yes" +--- diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end 
of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/background.md b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/background.md new file mode 100644 index 0000000000..2fb616d642 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/background.md @@ -0,0 +1,28 @@ +--- +title: Getting started with Couchbase on Google Axion C4A (Arm Neoverse-V2) + +weight: 2 + +layout: "learningpathall" +--- + +## Google Axion C4A Arm instances in Google Cloud + +Google Axion C4A is a family of Arm-based virtual machines built on Google’s custom Axion CPU, which is based on Arm Neoverse-V2 cores. Designed for high-performance and energy-efficient computing, these virtual machines offer strong performance for modern cloud workloads such as CI/CD pipelines, microservices, media processing, and general-purpose applications. + +The C4A series provides a cost-effective alternative to x86 virtual machines while leveraging the scalability and performance benefits of the Arm architecture in Google Cloud. + +To learn more about Google Axion, refer to the [Introducing Google Axion Processors, our new Arm-based CPUs](https://cloud.google.com/blog/products/compute/introducing-googles-new-arm-based-cpu) blog. + +## Couchbase + +Couchbase is an open-source NoSQL distributed database designed for building high-performance, scalable, and flexible modern applications. Developed by Couchbase, Inc. +It combines the capabilities of a key-value store, document database, and distributed caching system in a single unified platform. + +Couchbase provides a memory-first architecture for low-latency data access, along with a powerful query engine (N1QL) that supports SQL-like syntax for JSON data. 
It also features built-in replication, automatic sharding, and cross-datacenter synchronization (XDCR), enabling seamless scalability and high availability across clusters and regions. + +It supports both on-premises and cloud deployments (including AWS, Azure, and GCP) and integrates with modern application stacks and container platforms like Kubernetes and Docker. + +Known for its high throughput, low latency, and ease of scaling, Couchbase is ideal for use cases such as real-time analytics, session management, content delivery, IoT, and mobile synchronization through Couchbase Mobile. + +To learn more, visit the official [Couchbase website](https://www.couchbase.com/) diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/baseline.md b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/baseline.md new file mode 100644 index 0000000000..48f1a7c0b5 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/baseline.md @@ -0,0 +1,90 @@ +--- +title: Couchbase Baseline Testing on Google Axion C4A Arm Virtual Machine +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Couchbase Baseline Testing on GCP SUSE VMs +This section confirms that Couchbase is correctly installed and running on the GCP SUSE Arm64 VM. It includes initializing the cluster, verifying node status, and accessing the Web UI to create a bucket — this ensures the setup is ready for benchmarking. + +### Setup the default cluster +Once the service is running, we need to setup the default cluster for the first time. 
Open Web Console using your VM public IP address that you saved off in the last step:
+
+```console
+http://<EXTERNAL_IP>:8091
+```
+ +![Couchbase Dashboard](images/dashboard-1.png "Couchbase Dashboard") + +- On the left hand side select "Buckets" +- Press the "Add Bucket" in the upper right hand corner: + +![Create Bucket](images/create-bucket-1.png "Create Bucket") + +- Name the new bucket "benchmark" +- The bucket type will be "Couchbase" +- The Memory Quota can be set to "512 MB" + +![Create Bucket](images/create-bucket-2.png "Create Bucket") + +| **Parameter** | **Value** | +|----------------|-----------| +| **Bucket Name** | benchmark | +| **Bucket Type** | Couchbase | +| **Memory Quota** | 512 MB | + +- You should now see that your bucket has been created: + +![Created Bucket](images/create-bucket-3.png "Created Bucket") + +#### Additional notes about buckets in Couchbase + +- A **bucket** in Couchbase is like a **database** — it stores and manages your data. +- The **benchmark** bucket will be used for **load testing** and **performance benchmarking**. +- Setting the **RAM Quota** ensures Couchbase allocates sufficient memory for **in-memory data operations**, improving overall speed. + +You can now proceed to the next section for benchmarking to measure Couchbase's performance. diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/benchmarking.md b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/benchmarking.md new file mode 100644 index 0000000000..4ecff50f41 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/benchmarking.md @@ -0,0 +1,139 @@ +--- +title: Couchbase Benchmarking +weight: 7 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + + +## Couchbase Benchmark on GCP SUSE Arm64 VM +This section guides you through benchmarking Couchbase performance on a GCP SUSE Arm64 VM using the **official `cbc-pillowfight` tool** from Couchbase C SDK. +It involves installing dependencies, building the SDK, verifying the setup, and running the benchmark test. 
+ +### Install Build Tools & Dependencies +Before compiling the Couchbase SDK, install all required development tools and libraries. + +```console +sudo zypper install -y gcc gcc-c++ cmake make git openssl-devel libevent-devel cyrus-sasl-devel java +``` + +### Download and Build the Couchbase C SDK (includes cbc-pillowfight) +`cbc-pillowfight` is a Couchbase command-line benchmarking tool that simulates a workload by performing concurrent read and write operations on a bucket to test Couchbase cluster performance. + +Clone the official Couchbase C SDK repository from GitHub. This SDK includes benchmarking tools such as `cbc` and `cbc-pillowfight`. + +```console +cd ~ +git clone https://github.com/couchbase/libcouchbase.git +cd libcouchbase +``` + +**Then build and install:** + +```console +mkdir build && cd build +cmake .. -DCMAKE_BUILD_TYPE=Release +make -j$(nproc) +sudo make install +``` + +### Update the Dynamic Linker Configuration +After installation, tell the system where to find the Couchbase libraries. + +```console +echo "/usr/local/lib" | sudo tee /etc/ld.so.conf.d/libcouchbase.conf +``` + +Then refresh the linker cache to make the libraries available system-wide: + +```console +sudo ldconfig +``` + +### Verify Installation +After installation, the tools like **cbc**, **cbc-pillowfight**, etc. should be available in `/usr/local/bin`. 
+ +**Verify with:** + +```console +cbc version +cbc-pillowfight --help +``` +For the "cbc version" command, you should see an output similar to: +```output +cbc: + Runtime: Version=3.3.18, Changeset=a8e17873d167ec75338a358e54cec3994612d260 + Headers: Version=3.3.18, Changeset=a8e17873d167ec75338a358e54cec3994612d260 + Build Timestamp: 2025-11-06 04:36:42 + CMake Build Type: Release + Default plugin directory: /usr/local/lib64/libcouchbase + IO: Default=libevent, Current=libevent, Accessible=libevent,select + SSL Runtime: OpenSSL 1.1.1l-fips 24 Aug 2021 SUSE release 150500.17.40.1 + SSL Headers: OpenSSL 1.1.1l-fips 24 Aug 2021 SUSE release SUSE_OPENSSL_RELEASE + HAVE_PKCS5_PBKDF2_HMAC: yes + Snappy: 1.1.8 + Tracing: SUPPORTED + System: Linux-6.4.0-150600.23.73-default; aarch64 + CC: GNU 7.5.0; -fno-strict-aliasing -ggdb3 -pthread + CXX: GNU 7.5.0; -fno-strict-aliasing -ggdb3 -pthread +``` + +For the "cbc-pillowfight --help" command, you should see the "help" menu displayed for cbc-pillowfight. + +### Run Benchmark using cbc-pillowfight +Once Couchbase Server is running and a bucket (e.g., `benchmark`) is created, you can run a workload test using the following command (use your Couchbase administrators password): + +```console +cbc-pillowfight -U couchbase://127.0.0.1/benchmark \ +-u Administrator -P password -I 10000 -B 1000 -t 5 -c 500 +``` + +- **-U couchbase://127.0.0.1/benchmark**: Connection string to Couchbase bucket +- **-u Administrator**: Couchbase admin username (default: "Administrator") +- **-P password**: Couchbase Administrator's password +- **-I 10000**: Number of items (documents) to use +- **-B 1000**: Batch size for operations +- **-t 5**: Number of concurrent threads +- **-c 500**: Number of operation cycles to run + +You should see an output similar to: +```output +Running. Press Ctrl-C to terminate... +Thread 0 has finished populating. +Thread 1 has finished populating. +Thread 2 has finished populating. +Thread 3 has finished populating. 
+Thread 4 has finished populating. +``` + +### Monitoring During Test +While the benchmark runs, open the Couchbase Web Console in your browser: + +```bash +http://:8091 +``` + +**Navigate to**: +**Dashboard → Buckets → benchmark** + +Monitor real-time performance metrics such as: +- **Ops/sec** — should match your CLI output +- **Resident ratio** — how much data stays in memory +- **Disk write queue** — backlog of writes to disk +- **CPU and memory usage** — tells you how well ARM cores are handling load + +![Couchbase Dashboard alt-text#center](images/arm-benchmark.png "Monitor Benchmark Log") + +### Benchmark summary +Results from the earlier run on the `c4a-standard-4` (4 vCPU, 16 GB memory) Arm64 VM in GCP (SUSE): + +| **Name** | **Items** | **Resident** | **Ops/sec** | **RAM Used / Quota** | **Disk Used** | +|---------------|------------|---------------|---------------|-----------------------|---------------| +| benchmark | 10,000 | 100% | 227,981.1 | 36.8 MiB / 1 GiB | 26.7 MiB | + +- **Operations per Second:** 227,981.1 ops/sec — indicates high throughput +- **Resident Ratio:** 100% — all data served directly from memory +- **RAM Usage:** 36.8 MiB used out of 1 GiB quota — highly efficient memory utilization +- **Disk Usage:** 26.7 MiB — minimal disk consumption +- **Overall:** The Couchbase bucket performed efficiently with strong in-memory performance and low resource usage. diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/firewall_setup.md b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/firewall_setup.md new file mode 100644 index 0000000000..83cc7be2df --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/firewall_setup.md @@ -0,0 +1,42 @@ +--- +title: Create a Firewall Rule on GCP +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview + +In this section, you will learn how to create a Firewall Rule within Google Cloud Console. 
For this learning path, we need to expose TCP port 8091. + +{{% notice Note %}} +For support on GCP setup, see the Learning Path [Getting started with Google Cloud Platform](https://learn.arm.com/learning-paths/servers-and-cloud-computing/csp/google/). +{{% /notice %}} + +## Create a Firewall Rule in GCP + +For this learning path, we need to expose TCP port 8091. To accomplish this, we first need to create a firewall rule. +- Navigate to the [Google Cloud Console](https://console.cloud.google.com/). +- Go to **VPC Network > Firewall** and press **Create firewall rule**. + +![Create a firewall rule](images/firewall-rule.png "Create a firewall rule") + +- Next, we create the firewall rule that will expose TCP port 8091 for our learning path. +- Set the "Name" of the new rule to "allow-tcp-8091" +- Select your network that you intend to bind to your VM (default is "autoscaling-net" but your organization might have others that you need to use) +- Direction of traffic should be set to "Ingress" +- Allow on match should be set to "Allow" and the "Targets" should be set to "Specified target tags". +- Enter "allow-tcp-8091" to the "Target tags" text field +- Set the "Source IPv4 ranges" text value to "0.0.0.0/0" + +![Create a firewall rule](images/network-rule.png "Creating the TCP/8091 firewall rule") + +- Lastly, we select "Specified protocols and ports" under the "Protocols and ports" section +- Select the "TCP" checkbox +- Enter "8091" in the "Ports" text field +- Press "Create" + +![Specifying the TCP port to expose](images/network-port.png "Specifying the TCP port to expose") + +Our network firewall rule is now created so we can continue with the VM creation! 
\ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/arm-benchmark.png b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/arm-benchmark.png new file mode 100644 index 0000000000..c873e5e30d Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/arm-benchmark.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/cluster-setup-1.png b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/cluster-setup-1.png new file mode 100644 index 0000000000..d9156c0b50 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/cluster-setup-1.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/cluster-setup-2.png b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/cluster-setup-2.png new file mode 100644 index 0000000000..bc4813e5f4 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/cluster-setup-2.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/cluster-setup-3.png b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/cluster-setup-3.png new file mode 100644 index 0000000000..7d38ae3f03 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/cluster-setup-3.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/cluster-setup-4.png b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/cluster-setup-4.png new file mode 100644 index 0000000000..dcda544d03 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/cluster-setup-4.png differ diff --git 
a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/create-bucket-1.png b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/create-bucket-1.png new file mode 100644 index 0000000000..1641f5345b Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/create-bucket-1.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/create-bucket-2.png b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/create-bucket-2.png new file mode 100644 index 0000000000..c7f4d02d90 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/create-bucket-2.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/create-bucket-3.png b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/create-bucket-3.png new file mode 100644 index 0000000000..20f5baed7c Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/create-bucket-3.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/dashboard-1.png b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/dashboard-1.png new file mode 100644 index 0000000000..f9ad1979b9 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/dashboard-1.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/firewall-rule.png b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/firewall-rule.png new file mode 100644 index 0000000000..cb2d9bf40a Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/firewall-rule.png differ diff --git 
a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/gcp-pubip-ssh.png b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/gcp-pubip-ssh.png new file mode 100644 index 0000000000..558745de3e Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/gcp-pubip-ssh.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/gcp-shell.png b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/gcp-shell.png new file mode 100644 index 0000000000..7e2fc3d1b5 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/gcp-shell.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/gcp-vm.png b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/gcp-vm.png new file mode 100644 index 0000000000..0d1072e20d Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/gcp-vm.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/network-config.png b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/network-config.png new file mode 100644 index 0000000000..e590740480 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/network-config.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/network-port.png b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/network-port.png new file mode 100644 index 0000000000..b475935755 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/network-port.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/network-rule.png 
b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/network-rule.png new file mode 100644 index 0000000000..796262b07e Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/images/network-rule.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/installation.md b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/installation.md new file mode 100644 index 0000000000..afa13edf7d --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/installation.md @@ -0,0 +1,94 @@ +--- +title: Install Couchbase +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Install Couchbase on GCP VM +This section explains how to install and configure **Couchbase Server** on a GCP Linux VM (SUSE or RHEL-based). +Follow the steps below carefully to ensure a successful setup. + +### System Preparation +Before installing Couchbase, update the system and install the required tools. + +```console +sudo zypper refresh +sudo zypper update -y +sudo zypper install -y curl wget net-tools lsb-release +``` +### Download Couchbase Server +Download the Couchbase Server package for ARM64 architecture. + +```console +cd ~ +wget -O couchbase-server-8.0.0-linux.aarch64.rpm \ +https://packages.couchbase.com/releases/8.0.0/couchbase-server-community-8.0.0-linux.aarch64.rpm +``` +**Verify the downloaded file:** +After downloading, verify that the file exists and check its size. + +```console +ls -lh couchbase-server-8.0.0-linux.aarch64.rpm +``` +This helps confirm the file was downloaded correctly and not truncated or corrupted. + +### Install Couchbase Server +Install the downloaded Couchbase RPM package. + +```console +sudo rpm -ivh couchbase-server-8.0.0-linux.aarch64.rpm +``` +- **rpm -ivh** → Installs the RPM package, displaying verbose output and progress (v for verbose, h for hash marks). 
+- This command installs Couchbase and sets up the necessary directories, binaries, and services. + +**Confirm that Couchbase has been installed successfully:** + +```console +rpm -qa | grep couchbase +``` +You should see an output similar to: +```output +couchbase-server-community-8.0.0-3777.aarch64 +``` +### Start Couchbase Service +Start and enable the Couchbase service so that it runs automatically on startup. + +```console +sudo systemctl start couchbase-server +sudo systemctl enable couchbase-server +``` + +**Verify service status:** +```console +sudo systemctl status couchbase-server +``` + +You should see the following snippet as part of your output: +```output +Active: active(running) since YYY XXXX-XX-XX +``` + +### Check Required Ports +This command checks if those ports are open and active. If you see “LISTEN” next to these ports, it means Couchbase is ready to accept connections. + +Couchbase uses the following ports for basic operation: + +- Web Console: `8091` +- Query Service: `8093` (optional for N1QL queries) +- Data Service: `11210` + +Check if the ports are listening: + +```console +sudo ss -tuln | grep -E '8091|11210' +``` + +```output +tcp LISTEN 0 128 0.0.0.0:8091 0.0.0.0:* +tcp LISTEN 0 1024 0.0.0.0:11210 0.0.0.0:* +tcp LISTEN 0 1024 [::]:11210 [::]:* +``` + +Once the **installation and setup are complete**, you can now proceed to the **baseline testing** phase. 
diff --git a/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/instance.md b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/instance.md new file mode 100644 index 0000000000..11cd46284a --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/couchbase-on-gcp/instance.md @@ -0,0 +1,49 @@ +--- +title: Create a Google Axion C4A Arm virtual machine on GCP +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview + +In this section, you will learn how to provision a Google Axion C4A Arm virtual machine on Google Cloud Platform (GCP) using the `c4a-standard-4` (4 vCPUs, 16 GB memory) machine type in the Google Cloud Console. + +{{% notice Note %}} +For support on GCP setup, see the Learning Path [Getting started with Google Cloud Platform](https://learn.arm.com/learning-paths/servers-and-cloud-computing/csp/google/). +{{% /notice %}} + +## Provision a Google Axion C4A Arm VM in Google Cloud Console + +To create a virtual machine based on the C4A instance type: +- Navigate to the [Google Cloud Console](https://console.cloud.google.com/). +- Go to **Compute Engine > VM Instances** and select **Create Instance**. +- Under **Machine configuration**: + - Populate fields such as **Instance name**, **Region**, and **Zone**. + - Set **Series** to `C4A`. + - Select `c4a-standard-4` for machine type. + + ![Create a Google Axion C4A Arm virtual machine in the Google Cloud Console with c4a-standard-4 selected alt-text#center](images/gcp-vm.png "Creating a Google Axion C4A Arm virtual machine in Google Cloud Console") + + +- Under **OS and Storage**, select **Change**, then choose an Arm64-based OS image. For this Learning Path, use **SUSE Linux Enterprise Server**. +- If using use **SUSE Linux Enterprise Server**. Select "Pay As You Go" for the license type. +- Once appropriately selected, please Click **Select**. +- Under **Networking**, enable **Allow HTTP traffic**. 
+- Also under **Networking**, in the "Network tags" text field add "allow-tcp-8091" as an additional tag + +![Adding the TCP/8091 firewall rule to our VM](images/network-config.png "Adding the TCP/8091 firewall rule to our VM") + +- Click **Create** to launch the instance. +- Once created, you should see a "SSH" option to the right in your list of VM instances. You should also see the public IP address for your VM. +- Save off the public IP address for your VM as you will need this in the next step. +- Click on this to launch a SSH shell into your VM instance: + +![Invoke a SSH session via your browser alt-text#center](images/gcp-pubip-ssh.png "Invoke a SSH session into your running VM instance") + +- A window from your browser should come up and you should now see a shell into your VM instance: + +![Terminal Shell in your VM instance alt-text#center](images/gcp-shell.png "Terminal shell in your VM instance") + +Next, let's install Couchbase! \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/django-on-gcp/_index.md b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/_index.md new file mode 100644 index 0000000000..4c1ce1bcd8 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/_index.md @@ -0,0 +1,63 @@ +--- +title: Deploy Django on Google Cloud C4A (Arm-based Axion VMs) + +draft: true +cascade: + draft: true + +minutes_to_complete: 30 + +who_is_this_for: This learning path is intended for software developers deploying and optimizing Django-based web applications on Linux/Arm64 environments, specifically using Google Cloud C4A virtual machines powered by Axion processors. 
+ +learning_objectives: + - Provision an Arm-based SUSE SLES virtual machine on Google Cloud (C4A with Axion processors) + - Install Django on a SUSE Arm64 (C4A) instance + - Verify Django functionality by running the development server and accessing the default welcome page on the Arm64 VM + - Measure Django application performance by benchmarking request handling throughput and latency using the official ApacheBench (ab) tool with Gunicorn on Arm64 (Aarch64) + +prerequisites: + - A [Google Cloud Platform (GCP)](https://cloud.google.com/free) account with billing enabled + - Basic familiarity with [Django](https://www.djangoproject.com/) + +author: Pareena Verma + +##### Tags +skilllevels: Introductory +subjects: Web +cloud_service_providers: Google Cloud + +armips: + - Neoverse + +tools_software_languages: + - Django + - Python + - Gunicorn + - Apache Bench + +operatingsystems: + - Linux + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +further_reading: + - resource: + title: Google Cloud documentation + link: https://cloud.google.com/docs + type: documentation + + - resource: + title: Django documentation + link: https://docs.djangoproject.com/ + type: documentation + + - resource: + title: Apache-bench documentation + link: https://httpd.apache.org/docs/2.4/programs/ab.html + type: documentation + +weight: 1 +layout: "learningpathall" +learning_path_main_page: "yes" +--- diff --git a/content/learning-paths/servers-and-cloud-computing/django-on-gcp/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY 
THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/django-on-gcp/background.md b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/background.md new file mode 100644 index 0000000000..8b2a674186 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/background.md @@ -0,0 +1,27 @@ +--- +title: Getting started with Django on Google Axion C4A (Arm Neoverse-V2) + +weight: 2 + +layout: "learningpathall" +--- + +## Google Axion C4A Arm instances in Google Cloud + +Google Axion C4A is a family of Arm-based virtual machines built on Google’s custom Axion CPU, which is based on Arm Neoverse-V2 cores. Designed for high-performance and energy-efficient computing, these virtual machines offer strong performance for modern cloud workloads such as CI/CD pipelines, microservices, media processing, and general-purpose applications. + +The C4A series provides a cost-effective alternative to x86 virtual machines while leveraging the scalability and performance benefits of the Arm architecture in Google Cloud. + +To learn more about Google Axion, refer to the [Introducing Google Axion Processors, our new Arm-based CPUs](https://cloud.google.com/blog/products/compute/introducing-googles-new-arm-based-cpu) blog. + +## Django + +[Django](https://www.djangoproject.com/) is a high-level, **open-source Python web framework** that encourages **rapid development** and **clean, pragmatic design**. 
Developed and maintained by the [Django Software Foundation](https://www.djangoproject.com/foundation/), it simplifies web application development by handling much of the boilerplate and providing powerful built-in features. + +Django follows the **Model–View–Template (MVT)** architectural pattern and includes robust tools for **authentication**, **URL routing**, **form handling**, **ORM (Object Relational Mapping)**, **session management**, and **administration interface** — all out of the box. + +Django is known for its focus on **security**, **scalability**, and **maintainability**, making it suitable for everything from small projects to large-scale enterprise applications. It helps developers build secure, high-performance web applications quickly without reinventing common components. + +Common use cases include **web applications**, **content management systems**, **APIs**, **e-commerce platforms**, and **data-driven dashboards**. It integrates seamlessly with popular databases like **PostgreSQL**, **MySQL**, **SQLite**, and **Oracle**. + +To learn more, visit the [official Django website](https://www.djangoproject.com/) and explore the [Django documentation](https://docs.djangoproject.com/en/stable/). diff --git a/content/learning-paths/servers-and-cloud-computing/django-on-gcp/baseline.md b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/baseline.md new file mode 100644 index 0000000000..955ce42de5 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/baseline.md @@ -0,0 +1,216 @@ +--- +title: Django Baseline Testing on Google Axion C4A Arm Virtual Machine +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Django Baseline Testing on GCP SUSE VMs +This baseline testing guide verifies that your **Django installation**, **web server**, and **basic application routing** are functioning correctly on a **Google Cloud SUSE Linux Arm64 (Axion C4A)** virtual machine. 
+You will first run the Django development server and access it from your browser, then create a simple Django app to ensure routing works. + +### Baseline 1 — View Django Welcome Page +This test confirms that Django is installed correctly and the server runs successfully. + +#### Create a new Django project +Run the following command to create a new Django project named `myproject`: + +```console +django-admin startproject myproject +cd myproject +``` + +This generates the following directory structure: + +```markdown +myproject/ +├── manage.py +└── myproject/ + ├── settings.py + ├── urls.py + ├── asgi.py + └── wsgi.py +``` +- `manage.py` is Django’s command-line utility for project management (running server, migrations, etc.). +- The inner `myproject/` folder contains the core configuration files that define your project’s settings and URLs.- + +#### Run initial migrations +Migrations prepare your project’s database by creating the required tables for Django’s internal apps (admin, authentication, etc.): + +```console +python3 manage.py migrate +``` + +You should get output showing the Running Migrations (all of which should be "OK"). + +#### Start the Django development server +Before starting the Django development server, you must configure your ALLOWED_HOSTS setting to allow access from your VM’s external IP. +This ensures that Django accepts HTTP requests from outside the localhost (e.g., when testing in a browser or from another machine). + +**ALLOWED_HOSTS:** is a security setting in Django that defines which host/domain names your Django site can serve. + +- Navigate to Your Project Settings + Move into your Django project directory where the settings.py file is located. + + ```console + cd ~/myproject/myproject/ + ``` + +- Open settings.py File + Use any text editor (like vi or nano) to open the file ("edit" is used as an example below). 
+ + ```console + edit myproject/settings.py + ``` + +- Locate the `ALLOWED_HOSTS` Line + Inside the file, find the following line: + + ```python + ALLOWED_HOSTS = [] + ``` + This setting defines which host/domain names Django will serve. + +- Allow All Hosts (for Testing Only) + To make your Django app accessible from your VM’s external IP address, update it to: + ```python + ALLOWED_HOSTS = ['*'] + ``` +{{% notice Note %}} +Allowing all hosts `('*')` is suitable **only for development or testing**. +For production, replace `'*'` with specific domain names or IPs, such as your public IP address for your VM that you recorded earlier: +{{% /notice %}} + +```python +ALLOWED_HOSTS = ['your-external-ip', 'your-domain.com'] +``` + +**Now start the Django development server:** + +We can now start the Django development server since we have exposed TCP/8000 in our VM via firewall rules: +```console +python3 manage.py runserver 0.0.0.0:8000 +``` + +#### View in browser +Open a web browser on your local machine (Chrome, Firefox, Edge, etc.) and enter the following URL in the address bar. Please replace "YOUR_VM_EXTERNAL_IP" with the external IP address of your VM that you saved off earlier: + +```console +http://YOUR_VM_EXTERNAL_IP:8000 +``` +- Replace `YOUR_VM_EXTERNAL_IP` with the public IP of your GCP VM. + +If everything is set up correctly, you should see the default Django welcome page (“The install worked successfully!”). It looks like this: + +![Django welcome page alt-text#center](images/django-welcome-page.png "Figure 1: Django web page") + +### Baseline 2 — Create a Simple Django App +This test ensures Django’s application routing and view rendering work as expected. + +#### Stop the server +Press `Ctrl + C` to stop the Django server if running. 
+ +#### Create a new app +Within your Django project directory, create a new app named `hello`: + +```console +python3 manage.py startapp hello +``` + +**This creates the following directory:** + +```markdown +hello/ +├── admin.py +├── apps.py +├── models.py +├── tests.py +├── views.py +└── urls.py +``` + +#### Create a simple view +Edit `hello/views.py`. Replace your existing file with this: + +```python +from django.http import HttpResponse + +def home(request): + return HttpResponse("

<h1>Hello, Django on GCP SUSE ARM64!</h1>

") +``` +This defines a simple view function that sends a basic HTML message as the HTTP response. + +#### Create app URL configuration +Create a new file `hello/urls.py` and add: + +```python +from django.urls import path +from . import views + +urlpatterns = [ + path('', views.home, name='home'), +] +``` +This maps the root URL `(/)`of your app to the `home()` view function. + +#### Link the app to the main project +Replace your default `myproject/urls.py` file with this version. + +```python +"""myproject URL Configuration + +The `urlpatterns` list routes URLs to views. For more information please see: + https://docs.djangoproject.com/en/3.2/topics/http/urls/ +Examples: +Function views + 1. Add an import: from my_app import views + 2. Add a URL to urlpatterns: path('', views.home, name='home') +Class-based views + 1. Add an import: from other_app.views import Home + 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') +Including another URLconf + 1. Import the include() function: from django.urls import include, path + 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) +""" + +from django.contrib import admin +from django.urls import path, include + +urlpatterns = [ + path('admin/', admin.site.urls), + path('', include('hello.urls')), +] +``` +This tells Django to delegate routing for the root path (`''`) to the `hello` app’s URLs. + +#### Add the app to settings +This makes Django aware of your new app so it can load its configuration and routes. +Edit `myproject/settings.py` → add `'hello'` to INSTALLED_APPS: + +```python +INSTALLED_APPS = [ + 'django.contrib.admin', + 'django.contrib.auth', + 'django.contrib.contenttypes', + 'django.contrib.sessions', + 'django.contrib.messages', + 'django.contrib.staticfiles', + 'hello', +] +``` +#### Run the server again + +```console +python3 manage.py runserver 0.0.0.0:8000 +``` + +#### Test your app +Open in browser: + +```console +http://:8000 +``` +You should see the Django app. 
It looks like this: + +![Django App alt-text#center](images/django-app.png "Figure 2: Django App") diff --git a/content/learning-paths/servers-and-cloud-computing/django-on-gcp/benchmarking.md b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/benchmarking.md new file mode 100644 index 0000000000..230ecad863 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/benchmarking.md @@ -0,0 +1,165 @@ +--- +title: Django Benchmarking +weight: 7 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + + +## Django Benchmarking using ApacheBench +This section describes how to benchmark a Django web application deployed with **Gunicorn** using **ApacheBench (ab)** — a lightweight HTTP benchmarking tool. +You will measure **throughput (requests per second)** and **latency (response time)** to evaluate the performance of your Django app on an Arm-based GCP SUSE VM. + +### Stop the server +Press `Ctrl + C` to stop the Django server if running. + +### Ensure ApacheBench is installed +**ApacheBench (ab)** is a command-line tool used to benchmark web servers by simulating multiple HTTP requests. + +Install it using following command: + +```console +sudo zypper install -y apache2-utils +``` +**Verify installation:** + +This confirms ApacheBench is correctly installed and available system-wide. + +```console +ab -V +``` + +**Ensure Django and Gunicorn are installed:** + +```console +python3 -m pip install django gunicorn +``` +- **Django** is the Python web framework you’re benchmarking. +- **Gunicorn** is a high-performance WSGI HTTP server for deploying Django apps in production-like environments. 
+ +### Run Django with Gunicorn +Use Gunicorn to serve your Django application for benchmarking (run in the background): + +```console +gunicorn myproject.wsgi:application --bind 0.0.0.0:8000 --workers 4 & +``` + +- `--workers 4`: number of worker processes +- `--bind 0.0.0.0:8000`: binds to all interfaces on port 8000 +- `myproject.wsgi:application` your Django project name ("myproject" used in this example). + +{{% notice Note %}} +Keep this terminal running during the benchmark. If you’re testing remotely, ensure port 8000 is open in your VM firewall settings. +{{% /notice %}} + +### Benchmark with ApacheBench (ab) +Run ApacheBench to simulate multiple clients hitting your Django server. + +```console +ab -n 1000 -c 10 http://127.0.0.1:8000/ +``` +- `-n 1000`: total number of requests +- `-c 10`: concurrency (simultaneous requests) + +You should see an output similar to: + +```output +This is ApacheBench, Version 2.3 <$Revision: 1903618 $> +Copyright 1996 Adam Twiss, Zeus Technology Ltd, http://www.zeustech.net/ +Licensed to The Apache Software Foundation, http://www.apache.org/ + +Benchmarking 127.0.0.1 (be patient) +Completed 100 requests +Completed 200 requests +Completed 300 requests +Completed 400 requests +Completed 500 requests +Completed 600 requests +Completed 700 requests +Completed 800 requests +Completed 900 requests +Completed 1000 requests +Finished 1000 requests + + +Server Software: gunicorn +Server Hostname: 127.0.0.1 +Server Port: 8000 + +Document Path: / +Document Length: 41 bytes + +Concurrency Level: 10 +Time taken for tests: 0.104 seconds +Complete requests: 1000 +Failed requests: 0 +Total transferred: 280000 bytes +HTML transferred: 41000 bytes +Requests per second: 9651.21 [#/sec] (mean) +Time per request: 1.036 [ms] (mean) +Time per request: 0.104 [ms] (mean, across all concurrent requests) +Transfer rate: 2639.00 [Kbytes/sec] received + +Connection Times (ms) + min mean[+/-sd] median max +Connect: 0 0 0.1 0 1 +Processing: 0 1 0.3 1 4 
+Waiting: 0 1 0.3 1 3 +Total: 0 1 0.4 1 5 + +Percentage of the requests served within a certain time (ms) + 50% 1 + 66% 1 + 75% 1 + 80% 1 + 90% 1 + 95% 2 + 98% 2 + 99% 3 + 100% 5 (longest request) +``` + +### Cleanup + +With the following output (above) seen, you can type "fg" followed by "ctrl-c" to exit the gunicorn server that is running. + +### Benchmark Metrics Explanation + +- **Concurrency Level:** Number of requests executed simultaneously during the test. +- **Time Taken for Tests:** Total time required to complete all HTTP requests. +- **Complete Requests:** Total number of successful requests processed. +- **Failed Requests:** Number of requests that failed or returned errors. +- **Total Transferred:** Total amount of data (including headers) sent and received. +- **HTML Transferred:** Amount of actual response content transferred. +- **Requests per Second:** Average number of requests handled by the server per second. +- **Time per Request (mean):** Average time taken to process a single request. +- **Time per Request (across concurrent):** Mean time per request across all concurrent clients. +- **Transfer Rate:** Average network data throughput during the benchmark. 
+ +### Benchmark summary +Results from the earlier run on the `c4a-standard-4` (4 vCPU, 16 GB memory) Arm64 VM in GCP (SUSE): + +| **Parameter** | **Description** | **Value** | +|----------------|------------------|-----------| +| **Server Software** | Web server used for serving Django | gunicorn | +| **Server Hostname** | Host address tested | 127.0.0.1 | +| **Server Port** | Port number for benchmark | 8000 | +| **Document Path** | Endpoint used for testing | / | +| **Document Length** | Size of each response | 41 bytes | +| **Concurrency Level** | Number of concurrent requests | 10 | +| **Time Taken for Tests** | Total time to complete all requests | 0.104 seconds | +| **Complete Requests** | Total number of successful requests | 1000 | +| **Failed Requests** | Number of failed requests | 0 | +| **Total Transferred** | Total bytes transferred (including headers) | 280000 bytes | +| **HTML Transferred** | Total HTML body bytes transferred | 41000 bytes | +| **Requests per Second (mean)** | Throughput — higher is better | **9651.21 req/sec** | +| **Time per Request (mean)** | Average time for each request | **1.036 ms** | +| **Time per Request (across all concurrent requests)** | Average latency considering concurrency | **0.104 ms** | +| **Transfer Rate** | Network throughput rate | **2639.00 KB/sec** | + +- **Exceptional Throughput:** The Arm64 VM efficiently handled nearly 10K requests per second, showcasing excellent concurrency handling. +- **Low Latency:** Average response time stayed around 1 ms, indicating rapid request processing even under load. +- **High Efficiency:** Zero failed requests demonstrate stable and reliable performance under benchmark conditions. +- **Optimized Networking:** Strong data transfer rate highlights Arm64’s efficient network I/O capabilities. +- **Ideal for Scalable Apps:** The consistent and predictable response times make Arm64 VMs well-suited for high-performance web workloads. 
diff --git a/content/learning-paths/servers-and-cloud-computing/django-on-gcp/firewall_setup.md b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/firewall_setup.md new file mode 100644 index 0000000000..983fcdb15c --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/firewall_setup.md @@ -0,0 +1,42 @@ +--- +title: Create a Firewall Rule on GCP +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview + +In this section, you will learn how to create a Firewall Rule within Google Cloud Console. For this learning path, we need to expose TCP port 8000. + +{{% notice Note %}} +For support on GCP setup, see the Learning Path [Getting started with Google Cloud Platform](https://learn.arm.com/learning-paths/servers-and-cloud-computing/csp/google/). +{{% /notice %}} + +## Create a Firewall Rule in GCP + +For this learning path, we need to expose TCP port 8000. To accomplish this, we first need to create a firewall rule. +- Navigate to the [Google Cloud Console](https://console.cloud.google.com/). +- Go to **VPC Network > Firewall** and press **Create firewall rule**. + +![Create a firewall rule](images/firewall-rule.png "Create a firewall rule") + +- Next, we create the firewall rule that will expose TCP port 8000 for our learning path. +- Set the "Name" of the new rule to "allow-tcp-8000" +- Select your network that you intend to bind to your VM (default is "autoscaling-net" but your organization might have others that you need to use) +- Direction of traffic should be set to "Ingress" +- Allow on match should be set to "Allow" and the "Targets" should be set to "Specified target tags". 
+- Enter "allow-tcp-8000" to the "Target tags" text field +- Set the "Source IPv4 ranges" text value to "0.0.0.0/0" + +![Create a firewall rule](images/network-rule.png "Creating the TCP/8000 firewall rule") + +- Lastly, we select "Specified protocols and ports" under the "Protocols and ports" section +- Select the "TCP" checkbox +- Enter "8000" in the "Ports" text field +- Press "Create" + +![Specifying the TCP port to expose](images/network-port.png "Specifying the TCP port to expose") + +Our network firewall rule is now created so we can continue with the VM creation! \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/django-app.png b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/django-app.png new file mode 100644 index 0000000000..327ca97e3f Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/django-app.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/django-welcome-page.png b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/django-welcome-page.png new file mode 100644 index 0000000000..ce0e415a33 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/django-welcome-page.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/firewall-rule.png b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/firewall-rule.png new file mode 100644 index 0000000000..cb2d9bf40a Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/firewall-rule.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/firewall_setup.md b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/firewall_setup.md new file mode 100644 index 0000000000..1322281f04 --- /dev/null +++ 
b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/firewall_setup.md @@ -0,0 +1,42 @@ +--- +title: Create a Firewall Rule on GCP +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview + +In this section, you will learn how to create a Firewall Rule within Google Cloud Console. For this learning path, we need to expose TCP port 8000. + +{{% notice Note %}} +For support on GCP setup, see the Learning Path [Getting started with Google Cloud Platform](https://learn.arm.com/learning-paths/servers-and-cloud-computing/csp/google/). +{{% /notice %}} + +## Create a Firewall Rule in GCP + +For this learning path, we need to expose TCP port 8000. To accomplish this, we first need to create a firewall rule. +- Navigate to the [Google Cloud Console](https://console.cloud.google.com/). +- Go to **VPC Network > Firewall** and press **Create firewall rule**. + +![Create a firewall rule](images/firewall-rule.png "Create a firewall rule") + +- Next, we create the firewall rule that will expose TCP port 8000 for our learning path. +- Set the "Name" of the new rule to "allow-tcp-8000" +- Select your network that you intend to bind to your VM (default is "autoscaling-net" but your organization might have others that you need to use) +- Direction of traffic should be set to "Ingress" +- Allow on match should be set to "Allow" and the "Targets" should be set to "Specified target tags". 
+- Enter "allow-tcp-8000" to the "Target tags" text field +- Set the "Source IPv4 ranges" text value to "0.0.0.0/0" + +![Create a firewall rule](images/network-rule.png "Creating the TCP/8000 firewall rule") + +- Lastly, we select "Specified protocols and ports" under the "Protocols and ports" section +- Select the "TCP" checkbox +- Enter "8000" in the "Ports" text field +- Press "Create" + +![Specifying the TCP port to expose](images/network-port.png "Specifying the TCP port to expose") + +Our network firewall rule is now created so we can continue with the VM creation! \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/gcp-pubip-ssh.png b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/gcp-pubip-ssh.png new file mode 100644 index 0000000000..558745de3e Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/gcp-pubip-ssh.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/gcp-shell.png b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/gcp-shell.png new file mode 100644 index 0000000000..7e2fc3d1b5 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/gcp-shell.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/gcp-vm.png b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/gcp-vm.png new file mode 100644 index 0000000000..0d1072e20d Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/gcp-vm.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/network-config.png b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/network-config.png new file mode 100644 index 0000000000..007340eaf5 Binary files /dev/null and 
b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/network-config.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/network-port.png b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/network-port.png new file mode 100644 index 0000000000..49e43f0577 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/network-port.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/network-rule.png b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/network-rule.png new file mode 100644 index 0000000000..9a073d4df5 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/images/network-rule.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/django-on-gcp/installation.md b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/installation.md new file mode 100644 index 0000000000..1271b33f9b --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/installation.md @@ -0,0 +1,71 @@ +--- +title: Install Django +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Install Django on GCP VM +This guide walks you through installing Django on a **Google Cloud Platform (GCP) SUSE Linux Arm64 VM**, including all dependencies, Python setup, and environment preparation. + +### Update Your System +Before installing Django, it’s good practice to update your package list and upgrade installed software to ensure you have the latest versions and security patches. + +```console +sudo zypper refresh +sudo zypper update -y +``` + +### Install Python and Tools +**Django** requires **Python 3.10+**. We will use Python 3.11, which is compatible with Django 5. 
+You will also install `pip` for package management, and basic developer tools (`git`, `gcc`, and `make`) to build Python packages and work with Django projects. +```console +sudo zypper install -y python311 python311-pip python311-devel +sudo zypper install -y git gcc make +``` + +**Ensure that both Python and pip are installed correctly:** + +```console +python3.11 --version +pip3 --version +``` + +You should see an output similar to: +```output +Python 3.11.10 +pip 22.3.1 from /usr/lib/python3.11/site-packages/pip (python 3.11) +``` + +### Create a Project Folder and Virtual Environment +It’s recommended to create a dedicated project directory and use a **virtual environment** to isolate project dependencies. + +```console +mkdir ~/myproject && cd ~/myproject +python3.11 -m venv venv +source venv/bin/activate +``` +- `python3.11 -m venv venv` — creates a virtual environment named venv inside your project folder. +- `source venv/bin/activate` — activates the virtual environment. +Once activated, your command prompt will show (venv) at the beginning, indicating that you’re working inside an isolated Python environment. + +### Upgrade Pip and Install Django +With your virtual environment active, upgrade pip and install Django using the following commands: + +```console +python3 -m pip install --upgrade pip +python3 -m pip install django +``` + +**Confirm that Django is installed correctly by checking its version:** + +```console +django-admin --version +``` + +You should see an output similar to: +```output +5.2.8 +``` +Django is installed successfully and ready for project setup. 
diff --git a/content/learning-paths/servers-and-cloud-computing/django-on-gcp/instance.md b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/instance.md new file mode 100644 index 0000000000..3291e8d4bb --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/django-on-gcp/instance.md @@ -0,0 +1,49 @@ +--- +title: Create a Google Axion C4A Arm virtual machine on GCP +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview + +In this section, you will learn how to provision a Google Axion C4A Arm virtual machine on Google Cloud Platform (GCP) using the `c4a-standard-4` (4 vCPUs, 16 GB memory) machine type in the Google Cloud Console. + +{{% notice Note %}} +For support on GCP setup, see the Learning Path [Getting started with Google Cloud Platform](https://learn.arm.com/learning-paths/servers-and-cloud-computing/csp/google/). +{{% /notice %}} + +## Provision a Google Axion C4A Arm VM in Google Cloud Console + +To create a virtual machine based on the C4A instance type: +- Navigate to the [Google Cloud Console](https://console.cloud.google.com/). +- Go to **Compute Engine > VM Instances** and select **Create Instance**. +- Under **Machine configuration**: + - Populate fields such as **Instance name**, **Region**, and **Zone**. + - Set **Series** to `C4A`. + - Select `c4a-standard-4` for machine type. + + ![Create a Google Axion C4A Arm virtual machine in the Google Cloud Console with c4a-standard-4 selected alt-text#center](images/gcp-vm.png "Creating a Google Axion C4A Arm virtual machine in Google Cloud Console") + + +- Under **OS and Storage**, select **Change**, then choose an Arm64-based OS image. For this Learning Path, use **SUSE Linux Enterprise Server**. +- If using **SUSE Linux Enterprise Server**, select "Pay As You Go" for the license type. +- Once appropriately selected, click **Select**. +- Under **Networking**, enable **Allow HTTP traffic**. 
+- Also under **Networking**, in the "Network tags" text field add "allow-tcp-8000" as an additional tag + +![Adding the TCP/8000 firewall rule to our VM](images/network-config.png "Adding the TCP/8000 firewall rule to our VM") + +- Click **Create** to launch the instance. +- Once created, you should see a "SSH" option to the right in your list of VM instances. You should also see the public IP address for your VM. +- Save off the public IP address for your VM as you will need this in the next step. +- Click on this to launch a SSH shell into your VM instance: + +![Invoke a SSH session via your browser alt-text#center](images/gcp-pubip-ssh.png "Invoke a SSH session into your running VM instance") + +- A window from your browser should come up and you should now see a shell into your VM instance: + +![Terminal Shell in your VM instance alt-text#center](images/gcp-shell.png "Terminal shell in your VM instance") + +Next, let's install Django! \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/envoy_tune/tune_envoy_thp.md b/content/learning-paths/servers-and-cloud-computing/envoy_tune/tune_envoy_thp.md index 9514bea900..2b2b21845e 100644 --- a/content/learning-paths/servers-and-cloud-computing/envoy_tune/tune_envoy_thp.md +++ b/content/learning-paths/servers-and-cloud-computing/envoy_tune/tune_envoy_thp.md @@ -32,7 +32,7 @@ sudo apt-get update sudo apt-get install libhugetlbfs-dev libhugetlbfs-bin ``` -### Enable `hugetlbfs` and THP +### Enable HugeTLB filesystem and THP Use the commands shown below to enable `hugetlbfs` and THP: diff --git a/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/baseline.md b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/baseline.md index 1555729c3f..6256c4284f 100644 --- a/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/baseline.md +++ b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/baseline.md @@ -33,6 +33,17 @@ Verify the Maven 
installation: ```console mvn -version ``` + +The output should look like: + +```output +Apache Maven 3.8.6 (84538c9988a25aec085021c365c560670ad80f63) +Maven home: /opt/maven +Java version: 17.0.13, vendor: N/A, runtime: /usr/lib64/jvm/java-17-openjdk-17 +Default locale: en, platform encoding: UTF-8 +OS name: "linux", version: "5.14.21-150500.55.124-default", arch: "aarch64", family: "unix" +``` + At this point, both Java and Maven are installed and ready to use. ### Start the Flink Cluster diff --git a/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/installation.md b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/installation.md index d61f86be03..abb06b48ec 100644 --- a/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/installation.md +++ b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/installation.md @@ -24,7 +24,7 @@ Next, download the pre-built binary package for **Apache Flink** from the offici ```console cd /opt -sudo wget https://dlcdn.apache.org/flink/flink-2.1.0/flink-2.1.0-bin-scala_2.12.tgz +sudo wget https://dlcdn.apache.org/flink/flink-2.1.1/flink-2.1.1-bin-scala_2.12.tgz ``` This command retrieves the official Flink binary distribution for installation on your VM. @@ -39,15 +39,15 @@ The [Arm Ecosystem Dashboard](https://developer.arm.com/ecosystem-dashboard/) re Extract the downloaded `.tgz` archive to make the Flink files accessible for configuration. ```console -sudo tar -xvzf flink-2.1.0-bin-scala_2.12.tgz +sudo tar -xvzf flink-2.1.1-bin-scala_2.12.tgz ``` -After extraction, you will have a directory named `flink-2.1.0` under `/opt`. +After extraction, you will have a directory named `flink-2.1.1` under `/opt`. **Rename the extracted directory for convenience:** For easier access and management, rename the extracted Flink directory to a simple name like `/opt/flink`. 
```console -sudo mv flink-2.1.0 /opt/flink +sudo mv flink-2.1.1 /opt/flink ``` This makes future references to your Flink installation path simpler and more consistent. @@ -82,6 +82,6 @@ flink -v You should see an output similar to: ```output -Version: 2.1.0, Commit ID: 4cb6bd3 +Version: 2.1.1, Commit ID: 074f8c5 ``` This confirms that Apache Flink has been installed and is ready for use. diff --git a/content/learning-paths/servers-and-cloud-computing/gh-copilot-simple/copilot-test.md b/content/learning-paths/servers-and-cloud-computing/gh-copilot-simple/copilot-test.md index 1492731b22..8b8598490e 100644 --- a/content/learning-paths/servers-and-cloud-computing/gh-copilot-simple/copilot-test.md +++ b/content/learning-paths/servers-and-cloud-computing/gh-copilot-simple/copilot-test.md @@ -45,7 +45,7 @@ Lastly, the chat output from your extension is printed. Here it is in VS Code: -![#Copilot output](_images/output.webp) +![#Copilot output](_images/output.png) Your GitHub Copilot Extension is now responding to chat prompts. diff --git a/content/learning-paths/servers-and-cloud-computing/kafka-azure/_index.md b/content/learning-paths/servers-and-cloud-computing/kafka-azure/_index.md index 99685a606c..0952b8dda0 100644 --- a/content/learning-paths/servers-and-cloud-computing/kafka-azure/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/kafka-azure/_index.md @@ -1,23 +1,19 @@ --- -title: Deploy Kafka on the Microsoft Azure Cobalt 100 processors - -draft: true -cascade: - draft: true +title: Deploy Apache Kafka on Arm-based Microsoft Azure Cobalt 100 virtual machines minutes_to_complete: 30 -who_is_this_for: This is an advanced topic designed for software developers looking to migrate their Kafka workloads from x86_64 to Arm-based platforms, specifically on the Microsoft Azure Cobalt 100 processors. 
+who_is_this_for: This is an advanced topic for developers looking to migrate their Apache Kafka workloads from x86_64 to Arm-based platforms, specifically on Microsoft Azure Cobalt 100 (arm64) virtual machines. learning_objectives: - - Provision an Azure Arm64 virtual machine using Azure console, with Ubuntu Pro 24.04 LTS as the base image. - - Deploy Kafka on the Ubuntu virtual machine. - - Perform Kafka baseline testing and benchmarking on Arm64 virtual machines. + - Provision an Azure Arm64 virtual machine using Azure console, with Ubuntu Pro 24.04 LTS as the base image + - Deploy Kafka on an Ubuntu virtual machine + - Perform Kafka baseline testing and benchmarking on Arm64 virtual machines prerequisites: - - A [Microsoft Azure](https://azure.microsoft.com/) account with access to Cobalt 100 based instances (Dpsv6). - - Basic understanding of Linux command line. - - Familiarity with the [Apache Kafka architecture](https://kafka.apache.org/) and deployment practices on Arm64 platforms. + - A [Microsoft Azure](https://azure.microsoft.com/) account with access to Cobalt 100 based instances (Dpsv6) + - Basic understanding of Linux command line + - Familiarity with the [Apache Kafka architecture](https://kafka.apache.org/) and deployment practices on Arm64 platforms author: Pareena Verma diff --git a/content/learning-paths/servers-and-cloud-computing/kafka-azure/background.md b/content/learning-paths/servers-and-cloud-computing/kafka-azure/background.md index 48990a4d0a..74ffcdbd19 100644 --- a/content/learning-paths/servers-and-cloud-computing/kafka-azure/background.md +++ b/content/learning-paths/servers-and-cloud-computing/kafka-azure/background.md @@ -8,7 +8,7 @@ layout: "learningpathall" ## Cobalt 100 Arm-based processor -Azure’s Cobalt 100 is built on Microsoft's first-generation, in-house Arm-based processor: the Cobalt 100. 
Designed entirely by Microsoft and based on Arm’s Neoverse N2 architecture, this 64-bit CPU delivers improved performance and energy efficiency across a broad spectrum of cloud-native, scale-out Linux workloads. These include web and application servers, data analytics, open-source databases, caching systems, and more. Running at 3.4 GHz, the Cobalt 100 processor allocates a dedicated physical core for each vCPU, ensuring consistent and predictable performance. +Azure’s Cobalt 100 is built on Microsoft's first-generation, in-house Arm-based processor: the Cobalt 100. Designed entirely by Microsoft and based on Arm’s Neoverse N2 architecture, this 64-bit CPU delivers improved performance and energy efficiency across a broad spectrum of cloud-native, scale-out Linux workloads. These include web and application servers, data analytics, open-source databases, caching systems, and more. Running at 3.4 GHz, the Cobalt 100 processor allocates a dedicated physical core for each virtual CPU (vCPU), ensuring consistent and predictable performance. To learn more about Cobalt 100, refer to the blog [Announcing the preview of new Azure virtual machine based on the Azure Cobalt 100 processor](https://techcommunity.microsoft.com/blog/azurecompute/announcing-the-preview-of-new-azure-vms-based-on-the-azure-cobalt-100-processor/4146353). @@ -17,4 +17,4 @@ Apache Kafka is a high-performance, open-source distributed event streaming plat It allows you to publish, subscribe to, store, and process streams of records in a fault-tolerant and scalable manner. Kafka stores data in topics, which are partitioned and replicated across a cluster to ensure durability and high availability. -Kafka is widely used for messaging, log aggregation, event sourcing, real-time analytics, and integrating large-scale data systems. Learn more from the [Apache Kafka official website](https://kafka.apache.org/) and its [official documentation](https://kafka.apache.org/documentation). 
+Kafka is widely used for messaging, log aggregation, event sourcing, real-time analytics, and integrating large-scale data systems. Learn more from the [Apache Kafka official website](https://kafka.apache.org/) and the [Apache Kafka documentation](https://kafka.apache.org/documentation). diff --git a/content/learning-paths/servers-and-cloud-computing/kafka-azure/baseline.md b/content/learning-paths/servers-and-cloud-computing/kafka-azure/baseline.md index e53e41266a..a2aa5af488 100644 --- a/content/learning-paths/servers-and-cloud-computing/kafka-azure/baseline.md +++ b/content/learning-paths/servers-and-cloud-computing/kafka-azure/baseline.md @@ -1,36 +1,37 @@ --- -title: Baseline Testing +title: Run baseline testing with Kafka on Azure Arm VM weight: 5 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## Run a Baseline test with Kafka +## Run a baseline test with Kafka After installing Apache Kafka 4.1.0 on your Azure Cobalt 100 Arm64 virtual machine, you can perform a baseline test to verify that Kafka runs correctly and that messages can be produced and consumed end-to-end. Kafka 4.1.0 introduces KRaft mode (Kafka Raft Metadata mode), which integrates the control and data planes, eliminating the need for ZooKeeper. + This simplifies deployment, reduces latency, and provides a unified, self-managed Kafka cluster architecture. -To perform this baseline test, you will use four terminal sessions: -Terminal 1: Start the Kafka broker (in KRaft mode). -Terminal 2: Create a topic. -Terminal 3: Send messages (Producer). -Terminal 4: Read messages (Consumer). +To run this baseline test, open four terminal sessions: -### Initial Setup: Configure & Format KRaft -KRaft (Kafka Raft) replaces ZooKeeper by embedding metadata management directly into the Kafka broker. -This improves scalability, reduces external dependencies, and speeds up controller failover in distributed clusters. -Before starting Kafka in KRaft mode, configure and initialize the storage directory. 
These steps are required only once per broker. +- **Terminal 1:** Start the Kafka broker in KRaft mode. +- **Terminal 2:** Create a topic. +- **Terminal 3:** Send messages as the producer. +- **Terminal 4:** Read messages as the consumer. -1. Edit the Configuration File -Open the Kafka configuration file in an editor: +Each terminal has a specific role, helping you verify that Kafka works end-to-end on your Arm64 VM. -```console -vi /opt/kafka/config/server.properties -``` +## Configure and format KRaft + +KRaft (Kafka Raft) mode replaces ZooKeeper by managing metadata directly within the Kafka broker. This change improves scalability, reduces external dependencies, and speeds up controller failover in distributed clusters. + +Before you start Kafka in KRaft mode, you need to configure the broker and initialize the storage directory. You only need to do this once for each broker. + +## Edit the configuration file to update KRaft properties + +Use an editor to open the Kafka configuration file at `/opt/kafka/config/server.properties`. Use `sudo` so that you can save the file. -2. Add or Modify KRaft Properties Ensure the following configuration entries are present for a single-node KRaft setup: ```java @@ -41,91 +42,120 @@ listeners=PLAINTEXT://:9092,CONTROLLER://:9093 advertised.listeners=PLAINTEXT://localhost:9092 log.dirs=/tmp/kraft-combined-logs ``` + This configuration file sets up a single Kafka server to act as both a controller (managing cluster metadata) and a broker (handling data), running in KRaft mode. It defines the node's unique ID and specifies the local host as the sole participant in the controller quorum. -3. Format the Storage Directory -Format the metadata storage directory using the kafka-storage.sh tool. This initializes KRaft’s internal Raft logs with a unique cluster ID. +## Format the storage directory + +Format the metadata storage directory using the kafka-storage.sh tool. 
This initializes KRaft's internal Raft logs with a unique cluster ID. ```console bin/kafka-storage.sh format -t $(bin/kafka-storage.sh random-uuid) -c config/server.properties ``` + You should see output similar to: ```output Formatting metadata directory /tmp/kraft-combined-logs with metadata.version 4.1-IV1. ``` + This confirms that the Kafka storage directory has been successfully formatted and that the broker is ready to start in KRaft mode. -## Perform the Baseline Test +## Perform the baseline test + With Kafka 4.1.0 installed and configured in KRaft mode, you’re now ready to run a baseline test to verify that the Kafka broker starts correctly, topics can be created, and message flow works as expected. You’ll use multiple terminals for this test: -Terminal 1: Start the Kafka broker. -Terminal 2: Create and verify a topic. -Terminal 3: Send messages (Producer). -Terminal 4: Read messages (Consumer). +Terminal 1: start the Kafka broker +Terminal 2: create and verify a topic +Terminal 3: send messages (Producer) +Terminal 4: read messages (Consumer) + +## Terminal 1 - start Kafka broker -### Terminal 1 – Start Kafka Broker Start the Kafka broker (the main server process responsible for managing topics and handling messages) in KRaft mode: ```console cd /opt/kafka bin/kafka-server-start.sh config/server.properties ``` + Keep this terminal open and running. The broker process must stay active for all subsequent commands. -### Terminal 2 – Create a Topic +## Terminal 2 - create a topic + Open a new terminal window. Create a topic named test-topic-kafka, which acts as a logical channel where producers send and consumers receive messages: ```console cd /opt/kafka bin/kafka-topics.sh --create --topic test-topic-kafka --bootstrap-server localhost:9092 --partitions 1 --replication-factor 1 ``` + You should see output similar to: ```output Created topic test-topic-kafka. 
``` -**Verify Topic Creation** -List available topics to confirm that your new topic was created successfully: +## Verify topic creation + +List available topics to confirm that your new topic was created successfully. Run the following command: ```console bin/kafka-topics.sh --list --bootstrap-server localhost:9092 ``` + +The expected output is: + +```output +__consumer_offsets +test-topic-kafka +``` + +If you see `test-topic-kafka` in the list, your topic was created and is ready for use. + +```console +bin/kafka-topics.sh --list --bootstrap-server localhost:9092 +``` + You should see output similar to: ```output __consumer_offsets test-topic-kafka ``` + Kafka is now running, and you’ve successfully created and verified a topic. Next, you’ll use Terminal 3 to produce messages and Terminal 4 to consume messages, completing the baseline functional test on your Arm64 environment. -### Terminal 3 – Console Producer (Write Message) +## Terminal 3 - console producer (write message) + In this step, you’ll start the Kafka Producer, which publishes messages to the topic test-topic-kafka. The producer acts as the data source, sending messages to the Kafka broker. ```console cd /opt/kafka bin/kafka-console-producer.sh --topic test-topic-kafka --bootstrap-server localhost:9092 ``` -After running the command, you’ll see an empty prompt. This means the producer is ready to send data. -Type the following message and press Enter: + +After running the command, you'll see an empty prompt. This means the producer is ready to send data. Type the following message and press Enter: ```output hello from azure arm vm ``` + Each line you type is sent as a message to the Kafka topic and stored on disk by the broker. -### Terminal 4 – Console Consumer (Read Message) -Next, open another terminal and start the Kafka Consumer, which subscribes to the same topic (test-topic-kafka) and reads messages from the beginning of the log. 
+## Terminal 4 - console consumer (read message) + +Next, open another terminal and start the Kafka Consumer, which subscribes to the same topic (test-topic-kafka) and reads messages from the beginning of the log: ```console cd /opt/kafka bin/kafka-console-consumer.sh --topic test-topic-kafka --from-beginning --bootstrap-server localhost:9092 ``` -If Kafka is working correctly, you should immediately see your message `hello from azure arm vm` displayed: -You’ve now completed a full end-to-end Kafka validation test on your Azure Cobalt 100 Arm64 VM, verifying producer, broker, and consumer communication. +If Kafka is working correctly, you should immediately see your message `hello from azure arm vm` displayed. + +You've now completed a full end-to-end Kafka validation test on your Azure Cobalt 100 Arm64 VM, verifying producer, broker, and consumer communication. -Now you can proceed to benchmarking Kafka’s performance on the Azure Cobalt 100 Arm virtual machine. +Now you can proceed to benchmarking Kafka's performance on the Azure Cobalt 100 Arm virtual machine. diff --git a/content/learning-paths/servers-and-cloud-computing/kafka-azure/benchmarking.md b/content/learning-paths/servers-and-cloud-computing/kafka-azure/benchmarking.md index dde683c1bb..c4597c5989 100644 --- a/content/learning-paths/servers-and-cloud-computing/kafka-azure/benchmarking.md +++ b/content/learning-paths/servers-and-cloud-computing/kafka-azure/benchmarking.md @@ -1,5 +1,5 @@ --- -title: Benchmarking with Official Kafka Tools +title: Benchmark with official Kafka tools weight: 6 ### FIXED, DO NOT MODIFY @@ -10,12 +10,12 @@ layout: learningpathall Apache Kafka includes official performance testing utilities that allow you to measure throughput, latency, and end-to-end efficiency of your messaging system. 
These tools`kafka-producer-perf-test.sh` and `kafka-consumer-perf-test.sh` are bundled with Kafka’s standard installation and are designed for realistic performance evaluation of producers and consumers. -## Steps for Kafka Benchmarking +## Steps for Kafka benchmarking Before running the benchmarks, make sure your Kafka broker is already active in a separate terminal (as configured in the previous section). -Now open two new terminal sessions — one for running the producer benchmark and another for the consumer benchmark. +Now open two new terminal sessions; one for running the producer benchmark, and the other for the consumer benchmark. -### Terminal A - Producer Benchmark +### Terminal 1 - Producer Benchmark The Producer Performance Test measures how quickly Kafka can publish messages to a topic and reports key performance metrics such as throughput, average latency, and percentile latencies. @@ -44,7 +44,7 @@ You should see output similar to: | **Max latency** | The longest single message send time recorded. | | **50th / 95th / 99th percentiles** | Distribution of message send times. For example, 95% of messages completed under 1,184 ms in the sample output. | -### Terminal B - Consumer benchmark +### Terminal 2 - Consumer benchmark The Consumer Performance Test measures how efficiently Kafka can read and process messages from a topic. It reports metrics such as total messages consumed, data throughput, and fetch rates, helping validate overall consumer-side performance on your Azure Cobalt 100 (Arm64) VM. @@ -107,9 +107,6 @@ The producer sustained a throughput of ~257,500 records/sec (~24.5 MB/sec) with The 95th percentile latency (1168 ms) and 99th percentile (1220 ms) show predictable network and I/O performance. Kafka maintained consistent throughput, even under full-speed production, with no message loss or broker errors reported. 
-### Benchmark Comparison Insights -When analyzing performance on Azure Cobalt 100 Arm64 virtual machines: - **Producer efficiency**: The producer reached ~23–25 MB/sec throughput with average latencies below 900 ms, demonstrating stable delivery rates for high-volume workloads. - **Consumer scalability**: The consumer maintained ~262K messages/sec throughput with near-linear scaling of fetch performance — exceeding 1.85M messages/sec internally. - **Performance stability**: Both producer and consumer benchmarks showed low jitter and consistent latency distribution across iterations, confirming Kafka’s predictable behavior on Arm-based VMs. +### Benchmark comparison insights +When analyzing performance on Azure Cobalt 100 Arm64 virtual machines, you’ll notice that Kafka delivers stable and predictable results for both producers and consumers. The producer consistently achieves throughput between 23 MB/sec and 25 MB/sec, with average latencies below 900 ms. This means you can rely on efficient message delivery, even when handling high-volume workloads. On the consumer side, throughput remains strong at around 262,000 messages per second, and fetch performance scales nearly linearly, often exceeding 1.85 million messages per second internally. Throughout multiple benchmark runs, both producer and consumer tests demonstrate low jitter and consistent latency distribution, confirming that Kafka maintains reliable performance on Arm-based virtual machines. 
diff --git a/content/learning-paths/servers-and-cloud-computing/kafka-azure/create-instance.md b/content/learning-paths/servers-and-cloud-computing/kafka-azure/create-instance.md index 9571395aa2..f9cfe7acd8 100644 --- a/content/learning-paths/servers-and-cloud-computing/kafka-azure/create-instance.md +++ b/content/learning-paths/servers-and-cloud-computing/kafka-azure/create-instance.md @@ -1,5 +1,5 @@ --- -title: Create an Arm based cloud virtual machine using Microsoft Cobalt 100 CPU +title: Create an Arm-based cloud virtual machine using Microsoft Cobalt 100 CPU weight: 3 ### FIXED, DO NOT MODIFY @@ -8,43 +8,44 @@ layout: learningpathall ## Introduction -There are several ways to create an Arm-based Cobalt 100 virtual machine : the Microsoft Azure console, the Azure CLI tool, or using your choice of IaC (Infrastructure as Code). This guide will use the Azure console to create a virtual machine with Arm-based Cobalt 100 Processor. +You can create an Arm-based Cobalt 100 virtual machine in several ways: using the Microsoft Azure portal, the Azure CLI, or Infrastructure as Code (IaC) tools. This Learning Path uses the Azure portal to walk you through creating a virtual machine with an Arm-based Cobalt 100 processor, which is from the D-Series v6 general-purpose virtual machines. -This learning path focuses on the general-purpose virtual machine of the D series. Please read the guide on [Dpsv6 size series](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes/general-purpose/dpsv6-series) offered by Microsoft Azure. +These VMs are designed for a wide range of workloads and offer Arm-based performance with the Cobalt 100 CPU. To learn more about the Dpsv6 size series, see the official [Dpsv6 size series guide](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes/general-purpose/dpsv6-series) from Microsoft Azure. 
-If you have never used the Microsoft Cloud Platform before, please review the microsoft [guide to Create a Linux virtual machine in the Azure portal](https://learn.microsoft.com/en-us/azure/virtual-machines/linux/quick-create-portal?tabs=ubuntu). +If you have never used the Microsoft Cloud Platform before, see the Microsoft guide on how to [Create a Linux virtual machine in the Azure portal](https://learn.microsoft.com/en-us/azure/virtual-machines/linux/quick-create-portal?tabs=ubuntu). -#### Create an Arm-based Azure Virtual Machine +## Create an Arm-based Azure virtual machine -Creating a virtual machine based on Azure Cobalt 100 is no different from creating any other virtual machine in Azure. To create an Azure virtual machine, launch the Azure portal and navigate to "Virtual Machines". -1. Select "Create", and click on "Virtual Machine" from the drop-down list. -2. Inside the "Basic" tab, fill in the Instance details such as "Virtual machine name" and "Region". -3. Choose the image for your virtual machine (for example, Ubuntu Pro 24.04 LTS) and select “Arm64” as the VM architecture. -4. In the “Size” field, click on “See all sizes” and select the D-Series v6 family of virtual machines. Select “D4ps_v6” from the list. +Creating a virtual machine based on Azure Cobalt 100 is no different from creating any other virtual machine in Azure. To create an Azure virtual machine, launch the Azure portal and navigate to **Virtual Machines**. -![Azure portal VM creation — Azure Cobalt 100 Arm64 virtual machine (D4ps_v6) alt-text#center](images/instance.png "Figure 1: Select the D-Series v6 family of virtual machines") +- Select **Create**, and click on **Virtual Machine** from the drop-down list. +- Inside the **Basic** tab, fill in the Instance details such as **Virtual machine name** and **Region**. +- Choose the image for your virtual machine (for example, Ubuntu Pro 24.04 LTS) and select **Arm64** as the VM architecture. 
+- In the **Size** field, click on **See all sizes** and select the D-Series v6 family of virtual machines. Select **D4ps_v6** from the list. -5. Select "SSH public key" as an Authentication type. Azure will automatically generate an SSH key pair for you and allow you to store it for future use. It is a fast, simple, and secure way to connect to your virtual machine. -6. Fill in the Administrator username for your VM. -7. Select "Generate new key pair", and select "RSA SSH Format" as the SSH Key Type. RSA could offer better security with keys longer than 3072 bits. Give a Key pair name to your SSH key. -8. In the "Inbound port rules", select HTTP (80) and SSH (22) as the inbound ports. +![Azure portal showing the selection of the D-Series v6 family of virtual machines, with D4ps_v6 highlighted as the chosen size. The interface displays a list of available VM sizes, including CPU, memory, and pricing details. The wider environment is the Azure portal's virtual machine creation workflow, with a clean and organized layout. The tone is neutral and informative, focused on guiding users through the selection process. Visible text includes D-Series v6, D4ps_v6, CPU, memory, and price columns. alt-text#center](images/instance.png "Selecting the D-Series v6 family of virtual machines") -![Azure portal VM creation — Azure Cobalt 100 Arm64 virtual machine (D4ps_v6) alt-text#center](images/instance1.png "Figure 2: Allow inbound port rules") +- Select **SSH public key** as an authentication type. Azure automatically generates an SSH key pair for you and allows you to store it for future use. It is a fast, simple, and secure way to connect to your virtual machine. +- Fill in the administrator username for your VM. +- Select **Generate new key pair**, and select **RSA SSH Format** as the SSH key type. RSA can offer better security with keys longer than 3072 bits. Give a key pair name to your SSH key. 
+- In the **Inbound port rules**, select **HTTP (80)** and **SSH (22)** as the inbound ports. -9. Click on the "Review + Create" tab and review the configuration for your virtual machine. It should look like the following: +![Azure portal interface displaying the Inbound port rules configuration step for an Azure Cobalt 100 Arm64 virtual machine (D4ps_v6). The main focus is on selecting HTTP port 80 and SSH port 22 as allowed inbound ports. The wider environment is the Azure portal's virtual machine creation workflow, with a clean and organized layout. Visible text includes Inbound port rules, HTTP 80, SSH 22, and options to add or remove ports. The tone is neutral and instructional, guiding users through network security settings for the VM. alt-text#center](images/instance1.png "Allow inbound port rules") -![Azure portal VM creation — Azure Cobalt 100 Arm64 virtual machine (D4ps_v6) alt-text#center](images/ubuntu-pro.png "Figure 3: Review and Create an Azure Cobalt 100 Arm64 VM") +- Click on the **Review + Create** tab and review the configuration for your virtual machine. It should look like the following: -10. Finally, when you are confident about your selection, click on the "Create" button, and click on the "Download Private key and Create Resources" button. +![Azure portal interface displaying the Review and Create step for an Azure Cobalt 100 Arm64 virtual machine. The primary subject is the summary panel showing selected configuration details, including Ubuntu Pro 24.04 LTS as the operating system, D4ps_v6 as the VM size, Arm64 architecture, and SSH public key authentication. The wider environment is the Azure portal's virtual machine creation workflow, with a clean and organized layout. Visible text includes Review and Create, Ubuntu Pro 24.04 LTS, D4ps_v6, Arm64, SSH public key, and configuration summary fields. The tone is neutral and informative, guiding users through the final review before VM deployment. 
alt-text#center](images/ubuntu-pro.png "Review and create an Azure Cobalt 100 Arm64 VM") -![Azure portal VM creation — Azure Cobalt 100 Arm64 virtual machine (D4ps_v6) alt-text#center](images/instance4.png "Figure 4: Download Private key and Create Resources") +- Finally, when you are confident about your selection, click on the **Create** button, and click on the **Download Private key and Create Resources** button. -11. Your virtual machine should be ready and running within no time. You can SSH into the virtual machine using the private key, along with the Public IP details. +![Azure portal VM creation — Azure Cobalt 100 Arm64 virtual machine (D4ps_v6) alt-text#center](images/instance4.png "Download private key and create resources") -![Azure portal VM creation — Azure Cobalt 100 Arm64 virtual machine (D4ps_v6) alt-text#center](images/final-vm.png "Figure 5: VM deployment confirmation in Azure portal") +Your virtual machine is now ready and running. To connect, use SSH with your private key and the VM's public IP address. + +![Azure portal VM creation — Azure Cobalt 100 Arm64 virtual machine (D4ps_v6) alt-text#center](images/final-vm.png "VM deployment confirmation in Azure portal") {{% notice Note %}} -To learn more about Arm-based virtual machine in Azure, refer to “Getting Started with Microsoft Azure” in [Get started with Arm-based cloud instances](/learning-paths/servers-and-cloud-computing/csp/azure). +To learn more about Arm-based virtual machines in Azure, refer to *Getting Started with Microsoft Azure* in [Get started with Arm-based cloud instances](/learning-paths/servers-and-cloud-computing/csp/azure).
{{% /notice %}} diff --git a/content/learning-paths/servers-and-cloud-computing/kafka-azure/deploy.md b/content/learning-paths/servers-and-cloud-computing/kafka-azure/deploy.md index b8b7a0627c..0b1020f121 100644 --- a/content/learning-paths/servers-and-cloud-computing/kafka-azure/deploy.md +++ b/content/learning-paths/servers-and-cloud-computing/kafka-azure/deploy.md @@ -10,7 +10,7 @@ layout: learningpathall This section guides you through installing the latest version of Apache Kafka on an Ubuntu Pro 24.04 (Arm64) virtual machine running on Azure Cobalt 100. Kafka is a high-throughput, distributed event streaming platform used for real-time data pipelines and messaging applications. -### Install Java +## Install Java Apache Kafka runs on the Java Virtual Machine (JVM), so Java must be installed before setting up Kafka. Use the following commands to update your package index and install the default JDK: ```console @@ -19,7 +19,7 @@ sudo apt install -y default-jdk ``` This installs the Java Development Kit (JDK), which includes the JVM, compiler, and standard libraries required for running Kafka services. -### Download and Install Kafka +## Download and install Kafka Use the following commands to download and install Apache Kafka 4.1.0 in the /opt directory, extract the archive, and set appropriate permissions for your user. This prepares your system to run Kafka without requiring elevated privileges later. @@ -35,7 +35,7 @@ Kafka [3.5.0 release announcement](https://kafka.apache.org/blog#apache_kafka_35 The [Arm Ecosystem Dashboard](https://developer.arm.com/ecosystem-dashboard/) recommends Apache Kafka version 3.5.0 as the minimum recommended on Arm platforms. 
{{% /notice %}} -### Check installed Kafka version +## Check installed Kafka version After extraction, verify that Kafka was installed successfully by checking the version: diff --git a/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/_index.md b/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/_index.md new file mode 100644 index 0000000000..c509ef77f2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/_index.md @@ -0,0 +1,58 @@ +--- +title: Deploy Puppet on Google Cloud C4A (Arm-based Axion VMs) + +draft: true +cascade: + draft: true + +minutes_to_complete: 30 + +who_is_this_for: This is an introductory topic for software developers deploying and optimizing Puppet workloads on Arm Linux environments, specifically using Google Cloud C4A virtual machines powered by Axion processors. + +learning_objectives: + - Provision an Arm-based SUSE SLES virtual machine on Google Cloud (C4A with Axion processors) + - Install Puppet on a SUSE Arm64 (C4A) instance + - Verify Puppet by applying a test manifest and confirming successful resource creation on Arm64 + - Benchmark Puppet by measuring catalog compile time, apply speed, and resource usage on Arm64 + +prerequisites: + - A [Google Cloud Platform (GCP)](https://cloud.google.com/free) account with billing enabled + - Basic familiarity with [Puppet](https://www.puppet.com/) + +author: Pareena Verma + +##### Tags +skilllevels: Introductory +subjects: Performance and Architecture +cloud_service_providers: Google Cloud + +armips: + - Neoverse + +tools_software_languages: + - Puppet + - Ruby + - Facter + - Hiera + +operatingsystems: + - Linux + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +further_reading: + - resource: + title: Google Cloud documentation + link: https://cloud.google.com/docs + type: documentation + + - 
resource: + title: Puppet documentation + link: https://www.puppet.com/docs/index.html + type: documentation + +weight: 1 +layout: "learningpathall" +learning_path_main_page: "yes" +--- diff --git a/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/background.md b/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/background.md new file mode 100644 index 0000000000..a83fa26d68 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/background.md @@ -0,0 +1,27 @@ +--- +title: Getting started with Puppet on Google Axion C4A (Arm Neoverse-V2) + +weight: 2 + +layout: "learningpathall" +--- + +## Google Axion C4A Arm instances in Google Cloud + +Google Axion C4A is a family of Arm-based virtual machines built on Google’s custom Axion CPU, which is based on Arm Neoverse-V2 cores. Designed for high-performance and energy-efficient computing, these virtual machines offer strong performance for modern cloud workloads such as CI/CD pipelines, microservices, media processing, and general-purpose applications. 
+ +The C4A series provides a cost-effective alternative to x86 virtual machines while leveraging the scalability and performance benefits of the Arm architecture in Google Cloud. + +To learn more about Google Axion, refer to the [Introducing Google Axion Processors, our new Arm-based CPUs](https://cloud.google.com/blog/products/compute/introducing-googles-new-arm-based-cpu) blog. + +## Puppet + +[Puppet](https://puppet.com/) is an **open-source configuration management and automation tool** designed to help system administrators and DevOps teams **manage infrastructure as code**. Developed by [Puppet Labs](https://puppet.com/company/), it automates the provisioning, configuration, and management of servers and services across large-scale environments. + +Puppet uses a **declarative language** to define system configurations, ensuring that every machine’s state matches the desired setup described in its manifests. It supports both **agent-based** and **agentless** architectures, making it flexible for diverse deployment needs. + +Known for its **scalability**, **reliability**, and **idempotent behavior**, Puppet continuously enforces configurations, reducing manual effort and configuration drift. It integrates well with major platforms like **Linux**, **Windows**, **macOS**, and cloud providers such as **AWS**, **Azure**, and **GCP**. + +Common use cases include **automating server configuration**, **applying security policies**, **software installation**, and **infrastructure auditing**. Puppet is widely used in enterprises for managing **hybrid and multi-cloud environments** efficiently. + +To learn more, visit the [official Puppet website](https://puppet.com/). 
diff --git a/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/baseline.md b/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/baseline.md new file mode 100644 index 0000000000..8d2b43a6a3 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/baseline.md @@ -0,0 +1,210 @@ +--- +title: Puppet Baseline Testing on Google Axion C4A Arm Virtual Machine +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Puppet baseline testing on GCP SUSE VMs + +You can perform baseline testing of Puppet on a GCP SUSE Arm64 VM to verify that the installation works correctly. You will check Puppet and Facter versions, run basic Puppet commands, apply a simple manifest, and confirm that system facts are collected accurately. + +### Verify the Puppet installation + +Verify that Puppet and Facter are correctly installed and respond to version checks. + +Check the Puppet version: + +```console +puppet --version +``` + +The output shows the installed version: + +```output +8.10.0 +``` + +Check the Facter version: +```console +facter --version +``` + +The output shows the Facter version: + +```output +4.10.0 +``` + +Check the Ruby version, which is a dependency for Puppet: + +```console +ruby -v +``` + +The output confirms the Ruby version and architecture: + +```output +ruby 3.1.4p223 (2023-03-30 revision 957bb7cb81) [aarch64-linux] +``` + +### Run a simple Puppet command + +Check that Puppet responds to commands by running `puppet help`. If the help menu appears, Puppet is working correctly. + +Run the `puppet help` command: + +```console +puppet help +``` + +The output displays the help menu, confirming Puppet is operational: + +```output +Usage: puppet [options] [options] + +Available subcommands: + + Common: + agent The puppet agent daemon + apply Apply Puppet manifests locally + config Interact with Puppet's settings. + help Display Puppet help. 
+ lookup Interactive Hiera lookup + module Creates, installs and searches for modules on the Puppet Forge. + resource The resource abstraction layer shell + + + Specialized: + catalog Compile, save, view, and convert catalogs. + describe Display help about resource types + device Manage remote network devices + doc Generate Puppet references + epp Interact directly with the EPP template parser/renderer. + facts Retrieve and store facts. + filebucket Store and retrieve files in a filebucket + generate Generates Puppet code from Ruby definitions. + node View and manage node definitions. + parser Interact directly with the parser. + plugin Interact with the Puppet plugin system. + script Run a puppet manifests as a script without compiling a catalog + ssl Manage SSL keys and certificates for puppet SSL clients + +See 'puppet help ' for help on a specific subcommand action. +See 'puppet help ' for help on a specific subcommand. +Puppet v8.10.0 +``` + +### Test a simple Puppet manifest + +Create a basic Puppet script to make sure Puppet can apply configurations. If it successfully creates the test file, your Puppet agent functions as expected. + +```bash +cd ~ +cat <<EOF > test.pp +file { '/tmp/puppet_test.txt': + ensure => file, + content => "Hello from Puppet on SUSE ARM64!\n", +} +EOF +``` + +Run the script: + +```console +puppet apply test.pp +``` + +You should see an output similar to: + +```output +Notice: Compiled catalog for danson-puppet-2.c.arm-deveco-stedvsl-prd.internal in environment production in 0.01 seconds +Notice: /Stage[main]/Main/File[/tmp/puppet_test.txt]/ensure: defined content as '{sha256}bcf972b61979afe69626549b3f3f30798aeb50b359e76603a36e96b2abbe73c0' +Notice: Applied catalog in 0.01 seconds +``` + +Open the file created by Puppet to confirm the content matches your script. This step validates that Puppet executed your manifest correctly. + +```console +cat /tmp/puppet_test.txt +``` + +Output: +```output +Hello from Puppet on SUSE ARM64!
+``` + +### Check Facter integration + +Run `facter` commands to verify that it collects accurate system details, such as the OS and CPU type. This ensures Puppet can gather the facts it needs for automation decisions. + +Check the OS: +```console +facter os +``` +The output is similar to the following: +```output +{ + architecture => "aarch64", + distro => { + codename => "n/a", + description => "SUSE Linux Enterprise Server 15 SP6", + id => "SUSE", + release => { + full => "15.6", + major => "15", + minor => "6" + } + }, + family => "Suse", + hardware => "aarch64", + name => "SLES", + release => { + full => "15.6", + major => "15", + minor => "6" + }, + selinux => { + enabled => false + } +} +``` + +Check the architecture: + +```console +facter architecture +``` + +The output is: + +```output +aarch64 +``` + +Check the processors: + +```console +facter processors +``` + +The output is similar to the following: + +```output +{ + cores => 4, + count => 4, + extensions => [ + "aarch64" + ], + isa => "aarch64", + models => [ + + ], + physicalcount => 1, + threads => 1 +} +``` + +With these checks complete, proceed to the Puppet benchmarking section to run workload-focused tests on the GCP SUSE VMs. diff --git a/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/benchmarking.md b/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/benchmarking.md new file mode 100644 index 0000000000..134d987dad --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/benchmarking.md @@ -0,0 +1,96 @@ +--- +title: Puppet Benchmarking +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + + +## Puppet Benchmark on GCP SUSE Arm64 VM + +This guide explains how to perform a **Puppet standalone benchmark** on a **Google Cloud Platform (GCP) SUSE Linux Arm64 VM**. +It measures Puppet’s local execution performance without requiring a Puppet Master. 
+ + +### Prerequisites +Ensure that Puppet is installed and functioning correctly: + +```console +puppet --version +``` +Output: +```output +8.10.0 +``` + +### Create a Benchmark Manifest +Create a directory and a simple manifest file: + +```console +cd ~ +mkdir -p ~/puppet-benchmark +cd ~/puppet-benchmark +``` + +Use an editor to create the following content in a file named `benchmark.pp`: + +```puppet +notify { 'Benchmark Test': + message => 'Running Puppet standalone benchmark.', +} +``` + +- **notify** is a built-in Puppet resource type that displays a message during catalog application (like a print or log message). +- **'Benchmark Test'** is the title of the resource — a unique identifier for this notify action. +- **message => 'Running Puppet standalone benchmark.'** specifies the text message Puppet will print when applying the manifest. + +### Run the Benchmark Command +This step runs Puppet in standalone mode using the `apply` command to execute the benchmark manifest locally while measuring execution time and performance statistics. + +```console +time puppet apply benchmark.pp --verbose +``` +This executes the manifest locally and outputs timing statistics. + +You should see an output similar to: +```output +Notice: Compiled catalog for danson-puppet-2.c.arm-deveco-stedvsl-prd.internal in environment production in 0.01 seconds +Info: Using environment 'production' +Info: Applying configuration version '1763407825' +Notice: Running Puppet standalone benchmark. +Notice: /Stage[main]/Main/Notify[Benchmark Test]/message: defined 'message' as 'Running Puppet standalone benchmark.' +Notice: Applied catalog in 0.01 seconds + +real 0m1.054s +user 0m0.676s +sys 0m0.367s +``` + +### Benchmark Metrics Explanation + +- **Compiled catalog** → Puppet compiled your manifest into an execution plan. +- **Applied catalog** → Puppet executed the plan on your system. +- **real** → Total elapsed wall time (includes CPU + I/O). +- **user** → CPU time spent in user-space. 
+- **sys** → CPU time spent in system calls. + +### Benchmark results +The above results were executed on a `c4a-standard-4` (4 vCPU, 16 GB memory) Axion Arm64 VM in GCP running SUSE: + +| **Metric / Log** | **Output** | +|-------------------|------------| +| Compiled catalog | 0.01 seconds | +| Environment | production | +| Applied catalog | 0.01 seconds | +| real | 0m1.054s | +| user | 0m0.676s | +| sys | 0m0.367s | + +### Puppet benchmarking summary + +- **Catalog compilation:** Completed in just **0.01 seconds**, showing excellent processing speed on **Arm64**. +- **Environment:** Executed smoothly under the **production** environment. +- **Configuration version:** Recorded as **1763407825**, confirming successful version tracking. +- **Catalog application:** Finished in **0.01 seconds**, demonstrating very low execution latency. +- **Real time:** Total runtime of **1.054 seconds**, reflecting efficient end-to-end execution. diff --git a/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/images/gcp-shell.png b/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/images/gcp-shell.png new file mode 100644 index 0000000000..7e2fc3d1b5 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/images/gcp-shell.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/images/gcp-ssh.png b/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/images/gcp-ssh.png new file mode 100644 index 0000000000..597ccd7fea Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/images/gcp-ssh.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/images/gcp-vm.png b/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/images/gcp-vm.png new file mode 100644 index 0000000000..0d1072e20d Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/images/gcp-vm.png differ diff --git
a/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/installation.md b/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/installation.md new file mode 100644 index 0000000000..5e5d65cfcc --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/installation.md @@ -0,0 +1,95 @@ +--- +title: Install Puppet +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Install Puppet on GCP VM +This guide walks you through installing Puppet on a **Google Cloud Platform (GCP) SUSE Linux Arm64 VM**, including all dependencies, Ruby setup, and environment preparation. + +### Install build dependencies and Ruby from source +Installs all required tools and builds Ruby 3.1.4 from source to ensure compatibility with Puppet. + +First we install the prerequisites for ruby: +```console +sudo zypper install git curl gcc make patch libyaml-devel libffi-devel libopenssl-devel readline-devel zlib-devel gdbm-devel bzip2 bzip2-devel +``` + +NOTE: +```note +Due to changing version dependencies, you may receive a message in the "zypper" +command above that ncurses-devel is not the correct version. If so, please select the +option that permits downgrading of the installed ncurses-devel package to the required +version (normally "Solution 1"), followed by confirmation with "y". +``` +Then, we will install ruby itself: +```console +cd ~ +sudo wget https://cache.ruby-lang.org/pub/ruby/3.1/ruby-3.1.4.tar.gz +sudo tar -xzf ruby-3.1.4.tar.gz +cd ruby-3.1.4 +sudo ./configure +sudo make && sudo make install +``` + +### Verify Ruby +Checks that Ruby is correctly installed and available in your system path. + +```console +ruby -v +which ruby +``` + +```output +ruby 3.1.4p223 (2023-03-30 revision 957bb7cb81) [aarch64-linux] +/usr/local/bin/ruby +``` + +### Install Puppet dependencies +Installs essential Puppet libraries (`semantic_puppet, facter, hiera`) needed for automation tasks. 
+ +- **semantic_puppet** – Provides tools for handling Puppet-specific versioning, modules, and dependency constraints. +- **facter** – Collects system information (facts) such as OS, IP, and hardware details for Puppet to use in configuration decisions. +- **hiera** – Key-value lookup tool that manages configuration data outside of Puppet manifests for flexible data separation. + +```console +cd ~ +sudo wget https://github.com/puppetlabs/puppet/archive/refs/tags/8.10.0.tar.gz +sudo tar -xvf 8.10.0.tar.gz +cd ~/puppet-8.10.0 +sudo /usr/local/bin/gem install semantic_puppet -v "~> 1.0" +sudo gem install facter -v "~> 4.0" +sudo gem install hiera +``` + +{{% notice Note %}} +Puppet 8.8.1 version expands official support for Arm and AArch64, with new agent compatibility for AlmaLinux 9 (AARCH64), Rocky Linux 9 (AARCH64), and Ubuntu 24.04 (ARM). The release ensures compatibility with Ruby 3.3 and resolves multiple agent and catalog-related issues. Security is enhanced with an OpenSSL 3.0.14 upgrade, addressing CVE-2024-4603 and CVE-2024-2511 vulnerabilities. +You can view [this release note](https://help.puppet.com/osp/current/Content/PuppetCore/PuppetReleaseNotes/release_notes_puppet_x-8-8-1.htm) + +The [Arm Ecosystem Dashboard](https://developer.arm.com/ecosystem-dashboard/) recommends Puppet version 8.8.1, the minimum recommended on the Arm platforms. +{{% /notice %}} + +### Build and install the Puppet gem +The **Puppet gem** provides the core Puppet framework, including its CLI, manifest parser, and resource management engine. + +Build and install the Puppet 8.10.0 package from source into your Ruby environment. + +```console +sudo gem build puppet.gemspec +sudo /usr/local/bin/gem install puppet-8.10.0.gem +``` + +### Verification +Confirm Puppet is successfully installed and ready to use on the system. + +```console +puppet --version +``` + +Output: +```output +8.10.0 +``` +Puppet installation is complete. 
You can now go ahead with the baseline testing of Puppet in the next section. diff --git a/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/instance.md b/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/instance.md new file mode 100644 index 0000000000..a8a819d241 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/puppet-on-gcp/instance.md @@ -0,0 +1,43 @@ +--- +title: Create a Google Axion C4A Arm virtual machine on GCP +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview + +In this section, you will learn how to provision a Google Axion C4A Arm virtual machine on Google Cloud Platform (GCP) using the `c4a-standard-4` (4 vCPUs, 16 GB memory) machine type in the Google Cloud Console. + +{{% notice Note %}} +For support on GCP setup, see the Learning Path [Getting started with Google Cloud Platform](https://learn.arm.com/learning-paths/servers-and-cloud-computing/csp/google/). +{{% /notice %}} + +## Provision a Google Axion C4A Arm VM in Google Cloud Console + +To create a virtual machine based on the C4A instance type: +- Navigate to the [Google Cloud Console](https://console.cloud.google.com/). +- Go to **Compute Engine > VM Instances** and select **Create Instance**. +- Under **Machine configuration**: + - Populate fields such as **Instance name**, **Region**, and **Zone**. + - Set **Series** to `C4A`. + - Select `c4a-standard-4` for machine type. + + ![Create a Google Axion C4A Arm virtual machine in the Google Cloud Console with c4a-standard-4 selected alt-text#center](images/gcp-vm.png "Creating a Google Axion C4A Arm virtual machine in Google Cloud Console") + + +- Under **OS and Storage**, select **Change**, then choose an Arm64-based OS image. For this Learning Path, use **SUSE Linux Enterprise Server**. +- If using **SUSE Linux Enterprise Server**, select "Pay As You Go" for the license type. +- Once appropriately selected, click **Select**.
+- Under **Networking**, enable **Allow HTTP traffic**. +- Click **Create** to launch the instance. +- Once created, you should see a "SSH" option to the right in your list of VM instances. Click on this to launch a SSH shell into your VM instance: + +![Invoke a SSH session via your browser alt-text#center](images/gcp-ssh.png "Invoke a SSH session into your running VM instance") + +- A window from your browser should come up and you should now see a shell into your VM instance: + +![Terminal Shell in your VM instance alt-text#center](images/gcp-shell.png "Terminal shell in your VM instance") + +Next, let's install puppet! \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/_index.md b/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/_index.md new file mode 100644 index 0000000000..65a5bf5a4f --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/_index.md @@ -0,0 +1,62 @@ +--- +title: Deploy Rust on Google Cloud C4A (Arm-based Axion VMs) + +draft: true +cascade: + draft: true + +minutes_to_complete: 30 + +who_is_this_for: This learning path is intended for software developers deploying and optimizing Rust workloads on Linux/Arm64 environments, specifically using Google Cloud C4A virtual machines powered by Axion processors. 
+ +learning_objectives: + - Provision an Arm-based SUSE SLES virtual machine on Google Cloud (C4A with Axion processors) + - Install Rust and configure the development environment on a SUSE Arm64 (C4A) instance + - Verify Rust setup by compiling and running a sample program to ensure toolchain functionality + - Benchmark Rust using cargo bench with Criterion to measure execution speed, stability, and performance on Arm64 systems + +prerequisites: + - A [Google Cloud Platform (GCP)](https://cloud.google.com/free) account with billing enabled + - Basic familiarity with [Rust](https://www.rust-lang.org/) + +author: Pareena Verma + +##### Tags +skilllevels: Introductory +subjects: Performance and Architecture +cloud_service_providers: Google Cloud + +armips: + - Neoverse + +tools_software_languages: + - Rust + - Cargo + - Criterion + +operatingsystems: + - Linux + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +further_reading: + - resource: + title: Google Cloud documentation + link: https://cloud.google.com/docs + type: documentation + + - resource: + title: Rust documentation + link: https://doc.rust-lang.org/stable/ + type: documentation + + - resource: + title: Cargo bench documentation + link: https://doc.rust-lang.org/cargo/commands/cargo-bench.html + type: documentation + +weight: 1 +layout: "learningpathall" +learning_path_main_page: "yes" +--- diff --git a/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# 
================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/background.md b/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/background.md new file mode 100644 index 0000000000..f777c8c4d5 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/background.md @@ -0,0 +1,24 @@ +--- +title: Getting started with Rust on Google Axion C4A (Arm Neoverse-V2) + +weight: 2 + +layout: "learningpathall" +--- + +## Google Axion C4A Arm instances in Google Cloud + +Google Axion C4A is a family of Arm-based virtual machines built on Google’s custom Axion CPU, which is based on Arm Neoverse-V2 cores. Designed for high-performance and energy-efficient computing, these virtual machines offer strong performance for modern cloud workloads such as CI/CD pipelines, microservices, media processing, and general-purpose applications. + +The C4A series provides a cost-effective alternative to x86 virtual machines while leveraging the scalability and performance benefits of the Arm architecture in Google Cloud. + +To learn more about Google Axion, refer to the [Introducing Google Axion Processors, our new Arm-based CPUs](https://cloud.google.com/blog/products/compute/introducing-googles-new-arm-based-cpu) blog. + +## Rust + +[Rust](https://www.rust-lang.org/) is a modern, high-performance systems programming language designed for safety, speed, and concurrency. It provides memory safety without garbage collection, making it ideal for building reliable and efficient software. 
+ +Developed by Mozilla, Rust is widely used in system-level programming, web assembly, embedded systems, and performance-critical applications. +Its strong type system and ownership model help prevent common bugs like data races and memory leaks. + +To learn more, visit the [official Rust website](https://www.rust-lang.org/). diff --git a/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/baseline.md b/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/baseline.md new file mode 100644 index 0000000000..9d74aaeaeb --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/baseline.md @@ -0,0 +1,61 @@ +--- +title: Test Rust baseline performance on Google Axion C4A Arm virtual machines +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Perform baseline testing + +You can perform baseline testing of Rust on GCP SUSE aarch64 VMs to verify installation, build functionality, and compilation performance on the Arm-based Axion C4A platform. + +### Create a sample Rust program + +Create and build a simple "Hello, World" application to verify that Rust is working correctly: + +```console +mkdir rust-baseline +cd rust-baseline +cargo new hello +cd hello +cargo run +``` + +This creates a new Rust project and runs it immediately. The `cargo new hello` command generates a default Rust project with the necessary files including `main.rs` and `Cargo.toml`. + +The output is similar to: + +```output + Compiling hello v0.1.0 (/home/gcpuser/rust-baseline/hello) + Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.19s + Running `target/debug/hello` +Hello, world! +``` + +This confirms that Rust and Cargo are properly configured on your aarch64 VM. 
+ +### Measure compilation performance + +Use the `time` command to measure compilation performance on the Arm64 processor: + +```console +cargo clean +time cargo build +``` + +The `cargo clean` command removes all build artifacts, ensuring you measure a complete compilation from scratch. + +The output is similar to: + +```output +Removed 21 files, 7.7MiB total + Compiling hello v0.1.0 (/home/gcpuser/rust-baseline/hello) + Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.14s + +real 0m0.186s +user 0m0.118s +sys 0m0.071s +``` + +The timing results show that Rust compilation performs well on the Arm64 architecture, with the "real" time indicating the total elapsed time for the build process. diff --git a/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/benchmarking.md b/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/benchmarking.md new file mode 100644 index 0000000000..b99aaf2547 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/benchmarking.md @@ -0,0 +1,120 @@ +--- +title: Benchmark Rust performance using Criterion +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Benchmark Rust performance + +This section demonstrates how to benchmark Rust performance using `cargo bench` and the Criterion library to measure code execution speed and performance consistency on aarch64 hardware. + +### Create a benchmark project + +Create a new Rust project specifically for benchmarking: + +```console +cargo new rust-benchmark +cd rust-benchmark +``` + +### Configure Criterion as a dependency + +Criterion is the recommended benchmarking crate for Rust. Edit the `Cargo.toml` file in your project root directory and replace the existing content with: + +```toml +[dependencies] +criterion = "0.5" + +[[bench]] +name = "my_benchmark" +harness = false +``` + +This configuration enables Criterion for high-precision benchmarking and disables the default test harness. 
+ +### Create the benchmark directory and file + +Create the benchmark structure that Cargo expects: + +```console +mkdir benches +``` + +Create a new benchmark file in the `benches/` directory: + +```console +edit benches/my_benchmark.rs +``` + +Add the following benchmark code to measure Fibonacci number calculation performance: + +```rust +use criterion::{black_box, Criterion, criterion_group, criterion_main}; + +// Example benchmark function +fn fibonacci(n: u64) -> u64 { + match n { + 0 => 0, + 1 => 1, + n => fibonacci(n - 1) + fibonacci(n - 2), + } +} + +fn benchmark_fibonacci(c: &mut Criterion) { + c.bench_function("fibonacci 20", |b| b.iter(|| fibonacci(black_box(20)))); +} + +criterion_group!(benches, benchmark_fibonacci); +criterion_main!(benches); +``` + +This code implements a recursive Fibonacci function and measures how efficiently Rust computes the 20th Fibonacci number. The `black_box` function prevents the compiler from optimizing away the benchmark. + +### Run the benchmark + +Execute the benchmark using Cargo: + +```console +cargo bench +``` + +Cargo compiles your code with optimizations enabled and runs the Criterion benchmarks, providing detailed performance metrics. + +The output is similar to: + +```output +Running benches/my_benchmark.rs (target/release/deps/my_benchmark-f40a307ef9cad515) +Gnuplot not found, using plotters backend +fibonacci 20 time: [12.026 µs 12.028 µs 12.030 µs] +Found 1 outliers among 100 measurements (1.00%) + 1 (1.00%) low mild +``` + +### Benchmark Metrics Explanation + +- **Average Time:** Mean execution time across benchmark runs. +- **Outliers:** Represent runs significantly slower or faster than average. +- **Plotting Backend:** Used `plotters` since Gnuplot was not found. +- The results show **consistent performance** with only slight variation across 100 measurements. 
+ +### Understand the results + +The benchmark output provides several key metrics: + +- **Average time**: Mean execution time across benchmark runs +- **Outliers**: Runs significantly slower or faster than average +- **Plotting backend**: Uses plotters since Gnuplot wasn't found + +The results show consistent performance with only slight variation across 100 measurements. + +### Performance summary + +The following table shows results from running the benchmark on a `c4a-standard-4` (4 vCPU, 16 GB memory) aarch64 VM in GCP using SUSE: + +| Benchmark | Average Time (µs) | Min (µs) | Max (µs) | Outliers (%) | Remarks | +|---------------|------------------:|---------:|---------:|-------------:|---------| +| fibonacci 20 | 12.028 | 12.026 | 12.030 | 1.00% | Stable performance with minimal variation | + +The Fibonacci benchmark demonstrates consistent performance on the aarch64 platform. The average execution time of 12.028 µs indicates efficient CPU computation, while only 1% of measurements were outliers. This low variance confirms Rust's reliable execution speed and performance stability on aarch64 architecture. 
diff --git a/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/images/gcp-shell.png b/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/images/gcp-shell.png new file mode 100644 index 0000000000..7e2fc3d1b5 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/images/gcp-shell.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/images/gcp-ssh.png b/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/images/gcp-ssh.png new file mode 100644 index 0000000000..597ccd7fea Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/images/gcp-ssh.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/images/gcp-vm.png b/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/images/gcp-vm.png new file mode 100644 index 0000000000..0d1072e20d Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/images/gcp-vm.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/installation.md b/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/installation.md new file mode 100644 index 0000000000..959917ca11 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/installation.md @@ -0,0 +1,61 @@ +--- +title: Install Rust +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Install Rust + +This section explains how to install and configure Rust on a GCP SUSE aarch64 VM, preparing your environment for building and benchmarking Rust applications. + +### Update your system + +Update the system and install essential build tools required for compiling Rust programs: + +```console +sudo zypper refresh +sudo zypper update -y +sudo zypper install -y curl gcc make +``` + +This ensures your system has the latest packages and the necessary compilation tools. 
+ +### Install Rust using rustup + +Rust provides an official installer script via `rustup` that handles the setup automatically: + +```console +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh +``` + +When prompted, select option 1 for the default installation. This installs the latest stable version of Rust along with Cargo, Rust's package manager and build system. + +### Configure your environment + +Activate Rust's environment variables for your current shell session: + +```console +source $HOME/.cargo/env +``` + +This command adds the Rust toolchain to your PATH, making the `rustc` compiler and `cargo` commands available. + +### Verify the installation + +Confirm that Rust and Cargo installed successfully by checking their versions: + +```console +rustc --version +cargo --version +``` + +The output is similar to: + +```output +rustc 1.91.0 (f8297e351 2025-10-28) +cargo 1.91.0 (ea2d97820 2025-10-10) +``` + +Your Rust installation is now complete and ready for development on the aarch64 platform. diff --git a/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/instance.md b/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/instance.md new file mode 100644 index 0000000000..f531adfdd6 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/rust-on-gcp/instance.md @@ -0,0 +1,43 @@ +--- +title: Create a Google Axion C4A Arm virtual machine on GCP +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview + +In this section, you will learn how to provision a Google Axion C4A Arm virtual machine on Google Cloud Platform (GCP) using the `c4a-standard-4` (4 vCPUs, 16 GB memory) machine type in the Google Cloud Console. + +{{% notice Note %}} +For support on GCP setup, see the Learning Path [Getting started with Google Cloud Platform](https://learn.arm.com/learning-paths/servers-and-cloud-computing/csp/google/). 
+{{% /notice %}}
+
+## Provision a Google Axion C4A Arm VM in Google Cloud Console
+
+To create a virtual machine based on the C4A instance type:
+- Navigate to the [Google Cloud Console](https://console.cloud.google.com/).
+- Go to **Compute Engine > VM Instances** and select **Create Instance**.
+- Under **Machine configuration**:
+  - Populate fields such as **Instance name**, **Region**, and **Zone**.
+  - Set **Series** to `C4A`.
+  - Select `c4a-standard-4` for machine type.
+
+  ![Create a Google Axion C4A Arm virtual machine in the Google Cloud Console with c4a-standard-4 selected alt-text#center](images/gcp-vm.png "Creating a Google Axion C4A Arm virtual machine in Google Cloud Console")
+
+
+- Under **OS and Storage**, select **Change**, then choose an Arm64-based OS image. For this Learning Path, use **SUSE Linux Enterprise Server**.
+- If you are using **SUSE Linux Enterprise Server**, select "Pay As You Go" for the license type.
+- When you have made your selections, click **Select**.
+- Under **Networking**, enable **Allow HTTP traffic**.
+- Click **Create** to launch the instance.
+- Once created, you should see an "SSH" option to the right in your list of VM instances. Click on this to launch an SSH shell into your VM instance:
+
+![Invoke a SSH session via your browser alt-text#center](images/gcp-ssh.png "Invoke a SSH session into your running VM instance")
+
+- A browser window opens, giving you a shell into your VM instance:
+
+![Terminal Shell in your VM instance alt-text#center](images/gcp-shell.png "Terminal shell in your VM instance")
+
+Next, let's install Rust!
\ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/_index.md b/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/_index.md new file mode 100644 index 0000000000..5e875cfde1 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/_index.md @@ -0,0 +1,62 @@ +--- +title: Deploy TensorFlow on Google Cloud C4A (Arm-based Axion VMs) + +draft: true +cascade: + draft: true + +minutes_to_complete: 30 + +who_is_this_for: This is an introductory topic for software developers deploying and optimizing TensorFlow workloads on Arm64 Linux environments, specifically using Google Cloud C4A virtual machines powered by Axion processors. + +learning_objectives: + - Provision an Arm-based SUSE SLES virtual machine on Google Cloud (C4A with Axion processors) + - Install TensorFlow on a SUSE Arm64 (C4A) instance + - Verify TensorFlow by running basic computation and model training tests on Arm64 + - Benchmark TensorFlow using tf.keras to evaluate inference speed and model performance on Arm64 systems. 
+ +prerequisites: + - A [Google Cloud Platform (GCP)](https://cloud.google.com/free) account with billing enabled + - Basic familiarity with [TensorFlow](https://www.tensorflow.org/) + +author: Pareena Verma + +##### Tags +skilllevels: Introductory +subjects: ML +cloud_service_providers: Google Cloud + +armips: + - Neoverse + +tools_software_languages: + - TensorFlow + - Python + - Keras + +operatingsystems: + - Linux + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +further_reading: + - resource: + title: Google Cloud documentation + link: https://cloud.google.com/docs + type: documentation + + - resource: + title: TensorFlow documentation + link: https://www.tensorflow.org/learn + type: documentation + + - resource: + title: Phoronix Test Suite (PTS) documentation + link: https://www.phoronix-test-suite.com/ + type: documentation + +weight: 1 +layout: "learningpathall" +learning_path_main_page: "yes" +--- diff --git a/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. 
+--- diff --git a/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/background.md b/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/background.md new file mode 100644 index 0000000000..bb3cf5b347 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/background.md @@ -0,0 +1,24 @@ +--- +title: Getting started with TensorFlow on Google Axion C4A (Arm Neoverse-V2) + +weight: 2 + +layout: "learningpathall" +--- + +## Google Axion C4A Arm instances in Google Cloud + +Google Axion C4A is a family of Arm-based virtual machines built on Google’s custom Axion CPU, which is based on Arm Neoverse-V2 cores. Designed for high-performance and energy-efficient computing, these virtual machines offer strong performance for modern cloud workloads such as CI/CD pipelines, microservices, media processing, and general-purpose applications. + +The C4A series provides a cost-effective alternative to x86 virtual machines while leveraging the scalability and performance benefits of the Arm architecture in Google Cloud. + +To learn more about Google Axion, refer to the [Introducing Google Axion Processors, our new Arm-based CPUs](https://cloud.google.com/blog/products/compute/introducing-googles-new-arm-based-cpu) blog. + +## TensorFlow + +[TensorFlow](https://www.tensorflow.org/) is an **open-source machine learning and deep learning framework** developed by **Google**. It helps developers and researchers **build, train, and deploy AI models** efficiently across **CPUs, GPUs, and TPUs**. + +With support for **neural networks**, **natural language processing (NLP)**, and **computer vision**, TensorFlow is widely used for **AI research and production**. +Its **flexibility** and **scalability** make it ideal for both **cloud** and **edge environments**. + +To learn more, visit the [official TensorFlow website](https://www.tensorflow.org/). 
diff --git a/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/baseline.md b/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/baseline.md new file mode 100644 index 0000000000..ce9bae727e --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/baseline.md @@ -0,0 +1,90 @@ +--- +title: Test TensorFlow baseline performance on Google Axion C4A Arm virtual machines +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Perform baseline testing + +This section helps you verify that TensorFlow is properly installed and working on your Google Axion C4A VM. You'll run tests to confirm that your CPU can perform TensorFlow operations correctly. + +### Check available devices + +This command shows which hardware devices TensorFlow can use, such as CPU or GPU. On most VMs, you'll see only CPU listed: + +```console +python -c "import tensorflow as tf; print(tf.config.list_physical_devices())" +``` + +The output is similar to: + +```output +[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')] +``` + +### Run a computation test + +This test multiplies two large matrices to verify that TensorFlow computations work correctly on your CPU and measures execution time: + +```console +python -c "import tensorflow as tf; import time; +a = tf.random.uniform((1000,1000)); b = tf.random.uniform((1000,1000)); +start = time.time(); c = tf.matmul(a,b); end = time.time(); +print('Computation time:', end - start, 'seconds')" +``` + +This checks CPU performance for basic operations and provides a baseline measurement. + +The output is similar to: + +```output +Computation time: 0.008263111114501953 seconds +``` + +### Test neural network execution + +Use a text editor to create a new file named `test_nn.py` for testing a simple neural network. 
+ +Add the following code to create and train a basic neural network using random data: + +```python +import keras +from keras import layers +import numpy as np + +# Dummy data +x = np.random.rand(1000, 20) +y = np.random.rand(1000, 1) + +# Define the model +model = keras.Sequential() +model.add(keras.Input(shape=(20,))) +model.add(layers.Dense(64,activation="relu")) +model.add(layers.Dense(1)) + +# Compile the model +model.compile(optimizer='adam', loss='mse') + +# Train for 1 epoch +model.fit(x, y, epochs=1, batch_size=32) +``` + +This script creates a simple neural network to verify that TensorFlow's deep learning functions work properly on the Arm platform. + +### Run the neural network test + +Execute the script: + +```console +python test_nn.py +``` + +TensorFlow displays training progress similar to: + +```output +32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - loss: 0.1024 +``` + +This confirms that TensorFlow is working correctly on your Arm VM and can perform both basic computations and neural network training. diff --git a/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/benchmarking.md b/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/benchmarking.md new file mode 100644 index 0000000000..93a0a0e084 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/benchmarking.md @@ -0,0 +1,115 @@ +--- +title: Benchmark TensorFlow model performance using tf.keras +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Benchmark TensorFlow models + +This section benchmarks multiple TensorFlow models (ResNet50, MobileNetV2, and InceptionV3) using dummy input data. You'll measure average inference time and throughput for each model running on the CPU. + +tf.keras is TensorFlow's high-level API for building, training, and benchmarking deep learning models. 
It provides access to predefined architectures such as ResNet, MobileNet, and Inception, making it easy to evaluate model performance on different hardware setups. + +### Activate your virtual environment + +Enable your isolated Python environment where TensorFlow is installed: + +```console +source ~/tf-venv/bin/activate +python -c "import tensorflow as tf; print(tf.__version__)" +``` + +This ensures that all TensorFlow-related packages run in a clean, controlled setup without affecting system-wide Python installations. + +### Install required packages + +Install TensorFlow and NumPy for model creation and benchmarking: + +```console +pip install tensorflow==2.20.0 numpy +``` + +These packages are likely already installed from the previous installation steps. NumPy supports efficient numerical operations, while TensorFlow handles deep learning workloads. + +### Create the benchmark script + +Use an editor to create a Python script named `tf_cpu_benchmark.py` that will run TensorFlow model benchmarking tests. 
+ +Add the following code to benchmark three different model architectures: + +```python +import tensorflow as tf +import time + +# List of models to benchmark +models = { + "ResNet50": tf.keras.applications.ResNet50, + "MobileNetV2": tf.keras.applications.MobileNetV2, + "InceptionV3": tf.keras.applications.InceptionV3 +} + +batch_size = 32 +num_runs = 50 + +for name, constructor in models.items(): + print(f"\nBenchmarking {name}...") + # Create model without pretrained weights + model = constructor(weights=None, input_shape=(224,224,3)) + # Generate dummy input + dummy_input = tf.random.uniform([batch_size, 224, 224, 3]) + # Warm-up + _ = model(dummy_input) + # Benchmark + start = time.time() + for _ in range(num_runs): + _ = model(dummy_input) + end = time.time() + avg_time = (end - start) / num_runs + throughput = batch_size / avg_time + print(f"{name} average inference time per batch: {avg_time:.4f} seconds") + print(f"{name} throughput: {throughput:.2f} images/sec") +``` + +This script creates model instances without pretrained weights for fair CPU testing, generates random image data for inference, includes a warm-up phase to stabilize model performance, and measures inference time over 50 runs to calculate average performance and throughput. + +### Run the benchmark + +Execute the benchmarking script: + +```console +python tf_cpu_benchmark.py +``` + +The output is similar to: + +```output +Benchmarking ResNet50... +ResNet50 average inference time per batch: 1.2051 seconds +ResNet50 throughput: 26.55 images/sec + +Benchmarking MobileNetV2... +MobileNetV2 average inference time per batch: 0.2909 seconds +MobileNetV2 throughput: 110.02 images/sec + +Benchmarking InceptionV3... +InceptionV3 average inference time per batch: 0.8971 seconds +InceptionV3 throughput: 35.67 images/sec +``` + +### Understand the results + +The benchmark provides key performance metrics. 
Average inference time per batch measures how long it takes to process one batch of input data, with lower values indicating faster performance. Throughput shows how many images the model can process per second, with higher values indicating better efficiency. + +### Performance summary + +The following table shows results from running the benchmark on a `c4a-standard-4` (4 vCPU, 16 GB memory) aarch64 VM in GCP using SUSE: + +| Model | Average Inference Time per Batch (seconds) | Throughput (images/sec) | +|-------------|-------------------------------------------:|------------------------:| +| ResNet50 | 1.2051 | 26.55 | +| MobileNetV2 | 0.2909 | 110.02 | +| InceptionV3 | 0.8971 | 35.67 | + +The results demonstrate strong performance for lightweight CNNs like MobileNetV2, achieving over 110 images/sec on the aarch64 platform. Medium-depth models like InceptionV3 maintain balanced performance between accuracy and latency. Heavier architectures such as ResNet50 show longer inference times but deliver stable throughput, confirming that TensorFlow workloads run efficiently on Arm processors and provide a cost-effective alternative for AI inference tasks. 
diff --git a/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/images/gcp-shell.png b/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/images/gcp-shell.png new file mode 100644 index 0000000000..7e2fc3d1b5 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/images/gcp-shell.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/images/gcp-ssh.png b/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/images/gcp-ssh.png new file mode 100644 index 0000000000..597ccd7fea Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/images/gcp-ssh.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/images/gcp-vm.png b/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/images/gcp-vm.png new file mode 100644 index 0000000000..0d1072e20d Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/images/gcp-vm.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/installation.md b/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/installation.md new file mode 100644 index 0000000000..9d19b13fef --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/installation.md @@ -0,0 +1,86 @@ +--- +title: Install TensorFlow +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Install TensorFlow on Google Axion C4A + +TensorFlow is an open-source machine learning library developed by Google for building and deploying ML models efficiently. On aarch64 SUSE VMs, TensorFlow runs natively on CPU or GPU if available. 
+ +### Update your system + +Update the system and install Python 3.11 with pip and virtual environment support: + +```console +sudo zypper refresh +sudo zypper install python311 python311-pip python311-venv +``` + +Enter "y" when prompted to confirm the installation. This ensures your system has the essential tools required for TensorFlow setup. + +### Verify Python installation + +Confirm that Python and pip are correctly installed: + +```console +python3.11 --version +pip3 --version +``` + +The output is similar to: + +```output +Python 3.11.10 +pip 22.3.1 from /usr/lib/python3.11/site-packages/pip (python 3.11) +``` + +### Create a virtual environment + +Set up an isolated Python environment to keep TensorFlow dependencies separate from system packages: + +```console +python3.11 -m venv tf-venv +source tf-venv/bin/activate +``` + +This creates and activates a virtual environment named `tf-venv` that prevents package conflicts. + +### Upgrade pip + +Upgrade pip to the latest version for reliable package installation: + +```console +pip3 install --upgrade pip +``` + +### Install TensorFlow +Install the latest stable TensorFlow version for Arm64: + +```console +pip3 install tensorflow==2.20.0 +``` + +{{% notice Note %}} +TensorFlow 2.18.0 introduced compatibility with NumPy 2.0, incorporating its updated type promotion rules and improved numerical precision. You can review [What's new in TensorFlow 2.18](https://blog.tensorflow.org/2024/10/whats-new-in-tensorflow-218.html) for more information. + +The [Arm Ecosystem Dashboard](https://developer.arm.com/ecosystem-dashboard/) recommends TensorFlow version 2.18.0 as the minimum recommended version on Arm platforms. 
+{{% /notice %}} + +### Verify the installation + +Check that TensorFlow installed successfully and display the version: + +```console +python -c "import tensorflow as tf; print(tf.__version__)" +``` + +The output is similar to: + +```output +2.20.0 +``` + +Your TensorFlow installation is now complete and ready for use. diff --git a/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/instance.md b/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/instance.md new file mode 100644 index 0000000000..ea1d56e7f1 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/tensorflow-gcp/instance.md @@ -0,0 +1,43 @@ +--- +title: Create a Google Axion C4A Arm virtual machine on GCP +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview + +In this section, you will learn how to provision a Google Axion C4A Arm virtual machine on Google Cloud Platform (GCP) using the `c4a-standard-4` (4 vCPUs, 16 GB memory) machine type in the Google Cloud Console. + +{{% notice Note %}} +For support on GCP setup, see the Learning Path [Getting started with Google Cloud Platform](https://learn.arm.com/learning-paths/servers-and-cloud-computing/csp/google/). +{{% /notice %}} + +## Provision a Google Axion C4A Arm VM in Google Cloud Console + +To create a virtual machine based on the C4A instance type: +- Navigate to the [Google Cloud Console](https://console.cloud.google.com/). +- Go to **Compute Engine > VM Instances** and select **Create Instance**. +- Under **Machine configuration**: + - Populate fields such as **Instance name**, **Region**, and **Zone**. + - Set **Series** to `C4A`. + - Select `c4a-standard-4` for machine type. 
+
+   ![Create a Google Axion C4A Arm virtual machine in the Google Cloud Console with c4a-standard-4 selected alt-text#center](images/gcp-vm.png "Creating a Google Axion C4A Arm virtual machine in Google Cloud Console")
+
+
+- Under **OS and Storage**, select **Change**, then choose an Arm64-based OS image. For this Learning Path, use **SUSE Linux Enterprise Server**.
+- If you are using **SUSE Linux Enterprise Server**, select "Pay As You Go" for the license type.
+- Once you have made your selections, click **Select**.
+- Under **Networking**, enable **Allow HTTP traffic**.
+- Click **Create** to launch the instance.
+- Once created, you should see an "SSH" option to the right in your list of VM instances. Click on this to launch an SSH shell into your VM instance:
+
+![Invoke an SSH session via your browser alt-text#center](images/gcp-ssh.png "Invoke an SSH session into your running VM instance")
+
+- A window from your browser should come up and you should now see a shell into your VM instance:
+
+![Terminal Shell in your VM instance alt-text#center](images/gcp-shell.png "Terminal shell in your VM instance")
+
+Next, let's install TensorFlow!
\ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/_index.md b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/_index.md index 5b4f2f0c0f..8a1292870a 100644 --- a/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/_index.md @@ -1,19 +1,15 @@ --- -title: Deploy TypeScript on Google Cloud C4A (Arm-based Axion VMs) - -draft: true -cascade: - draft: true +title: Deploy TypeScript on Google Cloud C4A virtual machines minutes_to_complete: 30 -who_is_this_for: This is an introductory topic for software developers deploying and optimizing TypeScript workloads on Arm64 Linux environments, specifically using Google Cloud C4A virtual machines powered by Axion processors. +who_is_this_for: This is an introductory topic for developers deploying and optimizing TypeScript workloads on Arm64 Linux environments, specifically using Google Cloud C4A virtual machines powered by Axion processors. 
 learning_objectives: 
-    - Provision an Arm-based SUSE SLES virtual machine on Google Cloud (C4A with Axion processors)
-    - Install TypeScript on a SUSE Arm64 (C4A) instance
-    - Validate TypeScript functionality by creating, compiling, and running a simple TypeScript script on the Arm64 VM
-    - Benchmark TypeScript performance using a JMH-style custom benchmark with perf_hooks on Arm64 architecture
+    - Provision an Arm-based SUSE Linux Enterprise Server (SLES) virtual machine (VM) on Google Cloud
+    - Install TypeScript on a SUSE Arm64 C4A instance
+    - Validate TypeScript functionality by creating, compiling, and running a simple TypeScript script on an Arm64 VM
+    - Benchmark TypeScript performance using a JMH-style custom benchmark with the perf_hooks module on Arm64 architecture
 
 prerequisites:
     - A [Google Cloud Platform (GCP)](https://cloud.google.com/free) account with billing enabled
diff --git a/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/background.md b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/background.md
index 8c07d02012..e4b8876664 100644
--- a/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/background.md
+++ b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/background.md
@@ -1,23 +1,28 @@
 ---
-title: Getting started with TypeScript on Google Axion C4A (Arm Neoverse-V2)
+title: Get started with TypeScript on Google Axion C4A instances
 weight: 2
 
 layout: "learningpathall"
 ---
 
+## Introduction
+
+In this Learning Path, you'll deploy and benchmark TypeScript applications on Arm-based Google Cloud C4A instances powered by Axion processors. You'll provision a SUSE Linux Enterprise Server (SLES) virtual machine (VM), install and configure TypeScript, and measure performance using a JMH-style custom benchmark. This process shows you how to use TypeScript with Arm-based cloud infrastructure and helps you evaluate performance and compatibility for cloud-native workloads.
+ + ## Google Axion C4A Arm instances in Google Cloud -Google Axion C4A is a family of Arm-based virtual machines built on Google’s custom Axion CPU, which is based on Arm Neoverse-V2 cores. Designed for high-performance and energy-efficient computing, these virtual machines offer strong performance for modern cloud workloads such as CI/CD pipelines, microservices, media processing, and general-purpose applications. +Google Axion C4A is a family of Arm-based virtual machines built on Google’s custom Axion CPU, which is based on Arm Neoverse-V2 cores. Designed for high-performance and energy-efficient computing, they offer strong performance for modern cloud workloads such as CI/CD pipelines, microservices, media processing, and general-purpose applications. -The C4A series provides a cost-effective alternative to x86 virtual machines while leveraging the scalability and performance benefits of the Arm architecture in Google Cloud. +The C4A series provides a cost-effective alternative to x86 virtual machines while leveraging the scalability and performance benefits of the Arm architecture on Google Cloud. -To learn more about Google Axion, refer to the [Introducing Google Axion Processors, our new Arm-based CPUs](https://cloud.google.com/blog/products/compute/introducing-googles-new-arm-based-cpu) blog. +To learn more about Google Axion, see the Google blog [Introducing Google Axion Processors, our new Arm-based CPUs](https://cloud.google.com/blog/products/compute/introducing-googles-new-arm-based-cpu). ## TypeScript -TypeScript is an open-source, strongly typed programming language developed and maintained by Microsoft. +TypeScript is an open-source, strongly-typed programming language developed and maintained by Microsoft. -It is a superset of JavaScript, which means all valid JavaScript code is also valid TypeScript, but TypeScript adds static typing, interfaces, and advanced tooling to help developers write more reliable and maintainable code. 
+TypeScript builds on JavaScript by adding features like static typing and interfaces. Any valid JavaScript code works in TypeScript, but TypeScript gives you extra tools to write code that is easier to maintain and less prone to errors. -TypeScript is widely used for web applications, server-side development (Node.js), and large-scale JavaScript projects** where type safety and code quality are important. Learn more from the [TypeScript official website](https://www.typescriptlang.org/) and its [handbook and documentation](https://www.typescriptlang.org/docs/). +TypeScript is widely used for web applications, server-side development (Node.js), and large-scale JavaScript projects where type safety and code quality are important. Learn more by visiting the [TypeScript official website](https://www.typescriptlang.org/) and the [TypeScript handbook and documentation](https://www.typescriptlang.org/docs/). diff --git a/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/baseline.md b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/baseline.md index dbc57adcd9..a3b94d7b14 100644 --- a/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/baseline.md +++ b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/baseline.md @@ -1,19 +1,17 @@ --- -title: TypeScript Baseline Testing on Google Axion C4A Arm Virtual Machine +title: Establish a TypeScript performance baseline weight: 5 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## Baseline Setup for TypeScript -This section walks you through the baseline setup and validation of TypeScript on a Google Cloud C4A (Axion Arm64) virtual machine running SUSE Linux. -The goal is to confirm that your TypeScript environment is functioning correctly, from initializing a project to compiling and executing a simple TypeScript file, ensuring a solid foundation before performance or benchmarking steps. 
+## Overview +This section walks you through the baseline setup and validation of TypeScript on a Google Cloud C4A (Axion Arm64) virtual machine running SUSE Linux. The goal is to confirm that your TypeScript environment is functioning correctly, from initializing a project to compiling and executing a simple TypeScript file, ensuring a solid foundation before performance or benchmarking steps. -### Set Up a TypeScript Project -Before running any tests, you’ll create a dedicated project directory and initialize a minimal TypeScript environment. +## Create project folder -1. Create project folder +Before running any tests, you’ll create a dedicated project directory and initialize a minimal TypeScript environment. Start by creating a new folder to hold your TypeScript project files: @@ -23,15 +21,15 @@ cd ~/typescript-benchmark ``` This creates a workspace named `typescript-benchmark` in your home directory, ensuring all TypeScript configuration and source files are organized separately from system files and global modules. -2. Initialize npm project +## Initialize npm project -Next, initialize a new Node.js project. This creates a `package.json` file that defines your project metadata, dependencies, and scripts. +Next, initialize a new Node.js project. This creates a `package.json` file that defines your project metadata, dependencies, and scripts: ```console npm init -y ``` -3. Install Node.js type definitions +## Install Node.js type definitions To enable TypeScript to properly recognize Node.js built-in APIs (like fs, path, and process), install the Node.js type definitions package: @@ -55,10 +53,10 @@ You should see output similar to: } ``` -### Baseline Testing +## Perform baseline testing With the TypeScript environment configured, you’ll now perform a baseline functionality test to confirm that TypeScript compilation and execution work correctly on your Google Cloud SUSE Arm64 VM. -1. 
Create a Simple TypeScript File +## Create a simple TypeScript file Create a file named `hello.ts` with the following content: @@ -71,7 +69,7 @@ console.log(greet("GCP SUSE ARM64")); ``` This simple function demonstrates TypeScript syntax, type annotations, and basic console output. -2. Compile TypeScript +## Compile TypeScript Use the TypeScript compiler (tsc) to transpile the .ts file into JavaScript: @@ -80,7 +78,7 @@ tsc hello.ts ``` This generates a new file named `hello.js` in the same directory. -3. Run compiled JavaScript +## Run compiled JavaScript Now, execute the compiled JavaScript using Node.js. This step verifies that: diff --git a/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/benchmarking.md b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/benchmarking.md index 567f959845..cdc7c0252f 100644 --- a/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/benchmarking.md +++ b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/benchmarking.md @@ -1,5 +1,5 @@ --- -title: TypeScript Benchmarking +title: Benchmark TypeScript performance weight: 6 ### FIXED, DO NOT MODIFY @@ -7,12 +7,12 @@ layout: learningpathall --- -## JMH-style Custom Benchmarking +## Create a custom JMH-style benchmark for TypeScript on Arm This section demonstrates how to benchmark TypeScript functions using a JMH-style (Java Microbenchmark Harness) methodology implemented with Node.js's built-in `perf_hooks` module. Unlike basic `console.time()` measurements, this approach executes multiple iterations, computes the average runtime, and produces stable and repeatable performance data, useful for evaluating workloads on your Google Cloud C4A (Axion Arm64) VM running SUSE Linux. 
-### Create the Benchmark Script +## Implement benchmarking with Node.js perf_hooks on Arm Create a file named `benchmark_jmh.ts` inside your project directory with the content below: ```typescript @@ -56,7 +56,7 @@ Code explanation: This JMH-style benchmarking approach provides more accurate and repeatable performance metrics than a single execution, making it ideal for performance testing on Arm-based systems. -### Compile the TypeScript Benchmark +## Compile the TypeScript Benchmark First, compile the benchmark file from TypeScript to JavaScript using the TypeScript compiler (tsc): ```console @@ -65,7 +65,7 @@ tsc benchmark_jmh.ts This command transpiles your TypeScript code into standard JavaScript, generating a file named `benchmark_jmh.js` in the same directory. The resulting JavaScript can be executed by Node.js, allowing you to measure performance on your Google Cloud C4A (Arm64) virtual machine. -### Run the Benchmark +## Run the benchmark Now, execute the compiled JavaScript file with Node.js: ```console @@ -87,33 +87,24 @@ Iteration 10: 0.673 ms Average execution time over 10 iterations: 0.888 ms ``` +## Interpret your TypeScript performance data -### Benchmark Metrics Explained +Each iteration measures how long it takes to run the benchmarked function once, while the average execution time is calculated by dividing the total time for all runs by the number of iterations. Running the benchmark multiple times helps smooth out fluctuations caused by factors like CPU scheduling, garbage collection, or memory caching. This approach produces more consistent and meaningful performance data, similar to the methodology used by Java’s JMH benchmarking framework. - * Iteration times → Each iteration represents the time taken for one complete execution of the benchmarked function. - * Average execution time → Calculated as the total of all iteration times divided by the number of iterations. This gives a stable measure of real-world performance. 
- * Why multiple iterations?
-   A single run can be affected by transient factors such as CPU scheduling, garbage collection, or memory caching.
-   Running multiple iterations and averaging the results smooths out variability, producing more repeatable and statistically meaningful data, similar to Java’s JMH benchmarking methodology.
-
-### Interpretation
+The average execution time reflects how efficiently the function executes under steady-state conditions. The first iteration often shows higher latency because Node.js performs initial JIT (Just-In-Time) compilation and optimization, a common warm-up behavior in JavaScript/TypeScript benchmarks.
 
-The average execution time reflects how efficiently the function executes under steady-state conditions.
-The first iteration often shows higher latency because Node.js performing initial JIT (Just-In-Time) compilation and optimization, a common warm-up behavior in JavaScript/TypeScript benchmarks.
-
-### Benchmark summary on Arm64
+## Benchmark summary
 
 Results from the earlier run on the `c4a-standard-4` (4 vCPU, 16 GB memory) Arm64 VM in GCP (SUSE):
 
 | Iteration | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Average |
 |-----------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|---------|
 | Time (ms) | 2.286 | 0.749 | 1.145 | 0.674 | 0.671 | 0.671 | 0.672 | 0.667 | 0.667 | 0.673 | 0.888 |
 
+## Summarize TypeScript benchmarking results on Arm64
 
-### TypeScript performance benchmarking summary on Arm64
-
-When you look at the benchmarking results, you will notice that on the Google Axion C4A Arm-based instances:
+Here’s what the benchmark results show for Google Axion C4A Arm-based instances:
 
-- The average execution time on Arm64 (~0.888 ms) shows that CPU-bound TypeScript operations run efficiently on Arm-based VMs.
-- Initial iterations may show slightly higher times due to runtime warm-up and optimization overhead, which is common across architectures.
-- Arm64 demonstrates stable iteration times after the first run, indicating consistent performance for repeated workloads. +- The average execution time on Arm64 is about 0.888 ms, which means TypeScript code runs efficiently on Arm-based VMs. +- The first run is usually a bit slower because Node.js is warming up and optimizing the code. This is normal for all architectures. +- After the first run, the times are very consistent, showing that Arm64 delivers stable performance for repeated tasks. -This demonstrates that Google Cloud C4A Arm64 virtual machines provide production-grade stability and throughput for TypeScript workloads, whether used for application logic, scripting, or performance-critical services. +These results confirm that Google Cloud C4A Arm64 virtual machines are reliable and fast for running TypeScript workloads, whether you’re building application logic, scripts, or performance-sensitive services. diff --git a/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/installation.md b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/installation.md index a48a49cbf8..28880c7353 100644 --- a/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/installation.md +++ b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/installation.md @@ -6,22 +6,22 @@ weight: 4 layout: learningpathall --- -## Install TypeScript on GCP VM +## Overview This section walks you through installing TypeScript and its dependencies on a Google Cloud Platform (GCP) SUSE Arm64 virtual machine. You’ll install Node.js, npm, TypeScript, and ts-node, and verify that everything works correctly. -Running TypeScript on Google Cloud C4A instances, powered by Axion Arm64 processors, provides a high-performance and energy-efficient platform for Node.js-based workloads. 
+Running TypeScript on Google Cloud C4A instances, powered by Axion Arm64 processors, provides a high-performance and energy-efficient platform for Node.js-based workloads.
 
-### Update SUSE System
+## Update SUSE system
 Before installing new packages, refresh the repositories and update existing ones to ensure your environment is current and secure:
 ```console
 sudo zypper refresh
 sudo zypper update -y
 ```
-Keeping your system up to date ensures that dependencies, libraries, and compilers required for Node.js and TypeScript work seamlessly on the Arm64 architecture.
+Updating your system helps make sure all the tools and libraries you need for Node.js and TypeScript work smoothly on Arm64.
 
-### Install Node.js and npm
-Node.js provides the JavaScript runtime that powers TypeScript execution, while npm (Node Package Manager) manages project dependencies and global tools.
+## Install Node.js and npm
+Node.js is the JavaScript runtime that runs your TypeScript code. npm is the tool you use to install and manage packages and tools for your projects.
 
 Install both packages using SUSE’s repositories:
 
@@ -30,7 +30,7 @@
 sudo zypper install -y nodejs npm
 ```
 This command installs the Node.js runtime and npm package manager on your Google Cloud SUSE Arm64 VM.
 
-### Install TypeScript globally
+## Install TypeScript globally
 TypeScript (tsc) is the compiler that converts .ts files into JavaScript.
 
 `ts-node` lets you run TypeScript files directly without pre-compiling them. It is useful for testing, scripting, and lightweight development workflows.
 
@@ -43,7 +43,7 @@ The `-g` flag installs packages globally, making tsc and ts-node available syste
 This approach simplifies workflows for developers running multiple TypeScript projects on the same VM.
-### Verify installations +## Verify installation Check that Node.js, npm, TypeScript, and ts-node are all installed correctly: ```console @@ -66,5 +66,4 @@ Version 5.9.3 v10.9.2 ``` -Node.js, npm, and TypeScript are now successfully installed and verified on your Google Cloud C4A (Arm64) virtual machine. -You’re ready to create and execute TypeScript scripts for testing, deployment, or performance benchmarking. +You’ve now installed and verified Node.js, npm, and TypeScript on your Google Cloud C4A (Arm64) virtual machine. You’re ready to start creating and running TypeScript scripts for testing, deployment, or performance checks. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/instance.md b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/instance.md index 2b93bc950d..9a2aa5bf4a 100644 --- a/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/instance.md +++ b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/instance.md @@ -8,15 +8,15 @@ layout: learningpathall ## Overview -In this section, you will learn how to provision a Google Axion C4A Arm virtual machine on Google Cloud Platform (GCP) using the `c4a-standard-4` (4 vCPUs, 16 GB memory) machine type in the Google Cloud Console. +In this section, you'll set up a Google Axion C4A Arm virtual machine on Google Cloud Platform (GCP) using the `c4a-standard-4` machine type. This instance gives you four virtual CPUs and 16 GB of memory. You'll use the Google Cloud Console to complete each step. {{% notice Note %}} For support on GCP setup, see the Learning Path [Getting started with Google Cloud Platform](https://learn.arm.com/learning-paths/servers-and-cloud-computing/csp/google/). 
{{% /notice %}} -## Provision a Google Axion C4A Arm VM in Google Cloud Console +## Create the virtual machine -To create a virtual machine based on the C4A instance type: +To create the virtual machine, follow these steps: - Navigate to the [Google Cloud Console](https://console.cloud.google.com/). - Go to **Compute Engine > VM Instances** and select **Create Instance**. - Under **Machine configuration**: @@ -26,6 +26,6 @@ To create a virtual machine based on the C4A instance type: ![Create a Google Axion C4A Arm virtual machine in the Google Cloud Console with c4a-standard-4 selected alt-text#center](images/gcp-vm.png "Creating a Google Axion C4A Arm virtual machine in Google Cloud Console") -- Under **OS and Storage**, select **Change**, then choose an Arm64-based OS image. For this Learning Path, use **SUSE Linux Enterprise Server**. Pick the preferred version for your Operating System. Ensure you select the **Arm image** variant. Click **Select**. +- Under **OS and Storage**, select **Change**, then choose an Arm64-based OS image. For this Learning Path, use **SUSE Linux Enterprise Server**. Pick the preferred version for your operating system. Ensure you select the **Arm image** variant. Click **Select**. - Under **Networking**, enable **Allow HTTP traffic**. - Click **Create** to launch the instance. diff --git a/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/1-overview-and-build.md b/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/1-overview-and-build.md index a206484375..27405413fd 100644 --- a/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/1-overview-and-build.md +++ b/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/1-overview-and-build.md @@ -1,5 +1,5 @@ --- -title: Overview and Optimized Build +title: Build and validate vLLM for inference on Azure Cobalt 100 weight: 2 ### FIXED, DO NOT MODIFY @@ -8,50 +8,57 @@ layout: learningpathall ## What is vLLM? 
-vLLM is an open-source, high-throughput inference and serving engine for large language models (LLMs). -It’s designed to make LLM inference faster, more memory-efficient, and scalable, particularly during the prefill (context processing) and decode (token generation) phases of inference. +vLLM is an open-source, high-throughput inference and serving engine for large language models (LLMs). It’s designed to make LLM inference faster, more memory-efficient, and scalable, particularly during the prefill (context processing) and decode (token generation) phases of inference. -### Key Features - * Continuous Batching – Dynamically combines incoming inference requests into a single large batch, maximizing CPU/GPU utilization and throughput. - * KV Cache Management – Efficiently stores and reuses key-value attention states, sustaining concurrency across multiple active sessions while minimizing memory overhead. - * Token Streaming – Streams generated tokens as they are produced, enabling real-time responses for chat or API scenarios. -### Interaction Modes +## Key features +* Continuous batching: dynamically merges incoming inference requests into larger batches, maximizing Arm CPU utilization and overall throughput +* KV cache management: efficiently stores and reuses key-value attention states, sustaining concurrency across multiple active sessions while minimizing memory overhead +* Token streaming: streams generated tokens as they are produced, enabling real-time responses for chat or API scenarios +## Interaction modes You can use vLLM in two main ways: - * OpenAI-Compatible REST Server: - vLLM provides a /v1/chat/completions endpoint compatible with the OpenAI API schema, making it drop-in ready for tools like LangChain, LlamaIndex, and the official OpenAI Python SDK. - * Python API: - Load and serve models programmatically within your own Python scripts for flexible local inference and evaluation. 
+- Using an OpenAI-Compatible REST Server: vLLM provides a /v1/chat/completions endpoint compatible with the OpenAI API schema, making it drop-in ready for tools like LangChain, LlamaIndex, and the official OpenAI Python SDK +- Using a Python API: load and serve models programmatically within your own Python scripts for flexible local inference and evaluation vLLM supports Hugging Face Transformer models out-of-the-box and scales seamlessly from single-prompt testing to production batch inference. -## What you build +## What you will build -In this learning path, you will build a CPU-optimized version of vLLM targeting the Arm64 architecture, integrated with oneDNN and the Arm Compute Library (ACL). +In this Learning Path, you'll build a CPU-optimized version of vLLM targeting the Arm64 architecture, integrated with oneDNN and the Arm Compute Library (ACL). This build enables high-performance LLM inference on Arm servers, leveraging specialized Arm math libraries and kernel optimizations. After compiling, you’ll validate your build by running a local chat example to confirm functionality and measure baseline inference speed. ## Why this is fast on Arm -vLLM’s performance on Arm servers is driven by both software optimization and hardware-level acceleration. +vLLM achieves high performance on Arm servers by combining software and hardware optimizations. Here’s why your build runs fast: + +- Arm-optimized kernels: vLLM uses oneDNN and the Arm Compute Library to accelerate matrix multiplications, normalization, and activation functions. These libraries are tuned for Arm’s aarch64 architecture. +- Efficient quantization: INT4 quantized models run faster on Arm because KleidiAI microkernels use DOT-product instructions (SDOT/UDOT) available on Arm CPUs. +- Paged attention tuning: the paged attention mechanism is optimized for Arm’s NEON and SVE pipelines, improving token reuse and throughput during long-sequence generation. 
+- MoE fusion: for Mixture-of-Experts models, vLLM fuses INT4 expert layers to reduce memory transfers and bandwidth bottlenecks.
+- Thread affinity and memory allocation: setting thread affinity ensures balanced CPU core usage, while tcmalloc reduces memory fragmentation and allocator contention.
+
+These optimizations work together to deliver higher throughput and lower latency for LLM inference on Arm servers.
+
+vLLM's performance on Arm servers is driven by both software optimization and hardware-level acceleration.
 Each component of this optimized build contributes to higher throughput and lower latency during inference:
-- Optimized kernels: The aarch64 vLLM build uses direct oneDNN with the Arm Compute Library for key operations.
+- Optimized kernels: the aarch64 vLLM build uses direct oneDNN with the Arm Compute Library for key operations.
 - 4‑bit weight quantization: vLLM supports INT4 quantized models, and Arm accelerates this using KleidiAI microkernels, which take advantage of DOT-product (SDOT/UDOT) instructions.
-- Efficient MoE execution: For Mixture-of-Experts (MoE) models, vLLM fuses INT4 quantized expert layers to reduce intermediate memory transfers, which minimizes bandwidth bottlenecks
-- Optimized Paged attention: The paged attention mechanism, which handles token reuse during long-sequence generation, is SIMD-tuned for Arm’s NEON and SVE (Scalable Vector Extension) pipelines.
-- System tuning: Using thread affinity ensures efficient CPU core pinning and balanced thread scheduling across Arm clusters.
+- Efficient MoE execution: for Mixture-of-Experts (MoE) models, vLLM fuses INT4 quantized expert layers to reduce intermediate memory transfers, which minimizes bandwidth bottlenecks.
+- Optimized paged attention: the paged attention mechanism, which handles token reuse during long-sequence generation, is SIMD-tuned for Arm’s NEON and SVE (Scalable Vector Extension) pipelines.
+- System tuning: using thread affinity ensures efficient CPU core pinning and balanced thread scheduling across Arm clusters. Additionally, enabling tcmalloc (Thread-Caching Malloc) reduces allocator contention and memory fragmentation under high-throughput serving loads. -## Before you begin +## Set up your environment -Verify that your environment meets the following requirements: +Before you begin, make sure your environment meets these requirements: -Python version: Use Python 3.12 on Ubuntu 22.04 LTS or later. -Hardware requirements: At least 32 vCPUs, 64 GB RAM, and 64 GB of free disk space. +- Python 3.12 on Ubuntu 22.04 LTS or newer +- At least 32 vCPUs, 64 GB RAM, and 64 GB of free disk space -This Learning Path was validated on an AWS Graviton4 c8g.12xlarge instance with 64 GB of attached storage. +This Learning Path was tested on an AWS Graviton4 c8g.12xlarge instance with 64 GB of attached storage. -### Install Build Dependencies +## Install build dependencies Install the following packages required for compiling vLLM and its dependencies on Arm64: ```bash @@ -67,14 +74,14 @@ sudo apt-get install -y libtcmalloc-minimal4 ``` {{% notice Note %}} -On aarch64, vLLM’s CPU backend automatically builds with the Arm Compute Library (ACL) through oneDNN. +On aarch64, vLLM's CPU backend automatically builds with the Arm Compute Library (ACL) through oneDNN. This ensures optimized Arm kernels are used for matrix multiplications, layer normalization, and activation functions without additional configuration. {{% /notice %}} ## Build vLLM for Arm64 CPU You’ll now build vLLM optimized for Arm (aarch64) servers with oneDNN and the Arm Compute Library (ACL) automatically enabled in the CPU backend. -1. 
Create and Activate a Python Virtual Environment +## Create and activate a Python virtual environment It’s best practice to build vLLM inside an isolated environment to prevent conflicts between system and project dependencies: ```bash @@ -83,7 +90,7 @@ source vllm_env/bin/activate python3 -m pip install --upgrade pip ``` -2. Clone vLLM and Install Build Requirements +## Clone vLLM and install build requirements Download the official vLLM source code and install its CPU-specific build dependencies: ```bash @@ -94,7 +101,7 @@ pip install -r requirements/cpu.txt -r requirements/cpu-build.txt ``` The specific commit (5fb4137) pins a verified version of vLLM that officially adds Arm CPUs to the list of supported build targets, ensuring full compatibility and optimized performance for Arm-based systems. -3. Build the vLLM Wheel for CPU +## Build the vLLM wheel for CPU Run the following command to compile and package vLLM as a Python wheel optimized for CPU inference: ```bash @@ -102,7 +109,7 @@ VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel ``` The output wheel will appear under dist/ and include all compiled C++/PyBind modules. -4. Install the Wheel +## Install the wheel Install the freshly built wheel into your active environment: ```bash @@ -115,7 +122,31 @@ Do not delete the local vLLM source directory. The repository contains C++ extensions and runtime libraries required for correct CPU inference on aarch64 after wheel installation. {{% /notice %}} -## Quick validation via Offline Inferencing +## Validate your build with offline inference + +Run a quick test to confirm your Arm-optimized vLLM build works as expected. Use the built-in chat example to perform offline inference and verify that oneDNN and Arm Compute Library optimizations are active. 
+ +```bash +python examples/offline_inference/basic/chat.py \ + --dtype=bfloat16 \ + --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 +``` + +This command runs a small Hugging Face model in bfloat16 precision, streaming generated tokens to the console. You should see output similar to: + +```output +Generated Outputs: +-------------------------------------------------------------------------------- +Prompt: None + +Generated text: 'The Importance of Higher Education\n\nHigher education is a fundamental right' +-------------------------------------------------------------------------------- +Adding requests: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 9552.05it/s] +Processed prompts: 100%|████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00, 6.78it/s, est. speed input: 474.32 toks/s, output: 108.42 toks/s] +... +``` + +If you see token streaming and generated text, your vLLM build is correctly configured for Arm64 inference. Once your Arm-optimized vLLM build completes, you can validate it by running a small offline inference example. This ensures that the CPU-specific backend and oneDNN and ACL optimizations were correctly compiled into your build. Run the built-in chat example included in the vLLM repository: @@ -129,7 +160,7 @@ python examples/offline_inference/basic/chat.py \ Explanation: --dtype=bfloat16 runs inference in bfloat16 precision. Recent Arm processors support the BFloat16 (BF16) number format in PyTorch. For example, AWS Graviton3 and Graviton3 processors support BFloat16. --model specifies a small Hugging Face model for testing (TinyLlama-1.1B-Chat), ideal for functional validation before deploying larger models. -You should see token streaming in the console, followed by a generated output confirming that vLLM’s inference pipeline is working correctly. 
+You should see token streaming in the console, followed by a generated output confirming that vLLM's inference pipeline is working correctly. ```output Generated Outputs: @@ -144,7 +175,7 @@ Processed prompts: 100%|██████████████████ ``` {{% notice Note %}} -As CPU support in vLLM continues to mature, these manual build steps will eventually be replaced by a streamlined pip install workflow for aarch64, simplifying future deployments on Arm servers. +As CPU support in vLLM continues to mature, these manual build steps will eventually be replaced by a streamlined `pip` install workflow for aarch64, simplifying future deployments on Arm servers. {{% /notice %}} You have now verified that your vLLM Arm64 build runs correctly and performs inference using Arm-optimized kernels. diff --git a/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/2-quantize-model.md b/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/2-quantize-model.md index 102ea00e04..bfa9435c14 100644 --- a/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/2-quantize-model.md +++ b/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/2-quantize-model.md @@ -1,14 +1,14 @@ --- -title: Quantize an LLM to INT4 for Arm Platform +title: Quantize an LLM to INT4 weight: 3 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## Accelerating LLMs with 4-bit Quantization +## Accelerate LLMs with 4-bit quantization You can accelerate many LLMs on Arm CPUs with 4‑bit quantization. In this section, you’ll quantize the deepseek-ai/DeepSeek-V2-Lite model to 4-bit integer (INT4) weights. -The quantized model runs efficiently through vLLM’s INT4 inference path, which is accelerated by Arm KleidiAI microkernels. +The quantized model runs efficiently through vLLM's INT4 inference path, which is accelerated by Arm KleidiAI microkernels. 
## Install quantization tools @@ -35,7 +35,7 @@ If the model you plan to quantize is gated on Hugging Face (e.g., DeepSeek or pr huggingface-cli login ``` -## INT4 Quantization Recipe +## Apply the INT4 quantization recipe Using a file editor of your choice, save the following code into a file named `quantize_vllm_models.py`: @@ -134,12 +134,16 @@ This script creates a Arm KleidiAI INT4 quantized copy of the vLLM model and sav ## Quantize DeepSeek‑V2‑Lite model -### Quantization parameter tuning -Quantization parameters determine how the model’s floating-point weights and activations are converted into lower-precision integer formats. Choosing the right combination is essential for balancing model accuracy, memory footprint, and runtime throughput on Arm CPUs. +Quantizing your model to INT4 format significantly reduces memory usage and improves inference speed on Arm CPUs. In this section, you'll apply the quantization script to the DeepSeek‑V2‑Lite model, tuning key parameters for optimal performance and accuracy. This process prepares your model for efficient deployment with vLLM on Arm-based servers. -1. You can choose `minmax` (faster model quantization) or `mse` (more accurate but slower model quantization) method. -2. `channelwise` is a good default for most models. -3. `groupwise` can improve accuracy further; `--groupsize 32` is common. +## Tune quantization parameters +Quantization parameters control how the model’s floating-point weights and activations are converted to lower-precision integer formats. The right settings help you balance accuracy, memory usage, and performance on Arm CPUs. + +- Use `minmax` for faster quantization, or `mse` for higher accuracy (but slower) +- Choose `channelwise` for most models; it’s a reliable default +- Try `groupwise` for potentially better accuracy; `--groupsize 32` is a common choice + +Pick the combination that fits your accuracy and speed needs. 
Execute the following command to quantize the DeepSeek-V2-Lite model: diff --git a/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/3-run-inference-and-serve.md b/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/3-run-inference-and-serve.md index 0e208af88e..911b38b3ec 100644 --- a/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/3-run-inference-and-serve.md +++ b/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/3-run-inference-and-serve.md @@ -9,9 +9,9 @@ layout: learningpathall ## Batch Sizing in vLLM vLLM uses dynamic continuous batching to maximize hardware utilization. Two key parameters govern this process: - * `max_model_len` — The maximum sequence length (number of tokens per request). + * `max_model_len`, which is the maximum sequence length (number of tokens per request). No single prompt or generated sequence can exceed this limit. - * `max_num_batched_tokens` — The total number of tokens processed in one batch across all requests. + * `max_num_batched_tokens`, which is the total number of tokens processed in one batch across all requests. The sum of input and output tokens from all concurrent requests must stay within this limit. Together, these parameters determine how much memory the model can use and how effectively CPU threads are saturated. @@ -19,7 +19,7 @@ On Arm-based servers, tuning them helps achieve stable throughput while avoiding ## Serve an OpenAI‑compatible API -Start vLLM’s OpenAI-compatible API server using the quantized INT4 model and environment variables optimized for performance. 
+Start vLLM's OpenAI-compatible API server using the quantized INT4 model and environment variables optimized for performance: ```bash export VLLM_TARGET_DEVICE=cpu @@ -39,7 +39,7 @@ The server now exposes the standard OpenAI-compatible /v1/chat/completions endpo You can test it using any OpenAI-style client library to measure tokens-per-second throughput and response latency on your Arm-based server. ## Run multi‑request batch -After verifying a single request in the previous section, simulate concurrent load against the OpenAI-compatible server to exercise vLLM’s continuous batching scheduler. +After verifying a single request in the previous section, simulate concurrent load against the OpenAI-compatible server to exercise vLLM's continuous batching scheduler. About the client: Uses AsyncOpenAI with base_url="http://localhost:8000/v1" to target the vLLM server. @@ -125,9 +125,9 @@ This validates multi‑request behavior and shows aggregate throughput in the se (APIServer pid=4474) INFO: 127.0.0.1:44120 - "POST /v1/chat/completions HTTP/1.1" 200 OK (APIServer pid=4474) INFO 11-10 01:01:06 [loggers.py:221] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 57.5 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0% ``` -## Optional: Serve a BF16 (Non-Quantized) Model +## Serve a BF16 (non-quantized) model (optional) -For a non-quantized path, vLLM on Arm can run BF16 end-to-end using its oneDNN integration (which routes to Arm-optimized kernels via ACL under aarch64). +For a non-quantized path, vLLM on Arm can run BF16 end-to-end using its oneDNN integration (which routes to Arm-optimized kernels using ACL under aarch64). 
```bash vllm serve deepseek-ai/DeepSeek-V2-Lite \ @@ -136,17 +136,18 @@ vllm serve deepseek-ai/DeepSeek-V2-Lite \ ``` Use this BF16 setup to establish a quality reference baseline, then compare throughput and latency against your INT4 deployment to quantify the performance/accuracy trade-offs on your Arm system. -## Go Beyond: Power Up Your vLLM Workflow +## Go beyond: power up your vLLM workflow Now that you’ve successfully quantized, served, and benchmarked a model using vLLM on Arm, you can build on what you’ve learned to push performance, scalability, and usability even further. -**Try Different Models** -Extend your workflow to other models on Hugging Face that are compatible with vLLM and can benefit from Arm acceleration: - * Meta Llama 2 / Llama 3 – Strong general-purpose baselines; excellent for comparing BF16 vs INT4 performance. - * Qwen / Qwen-Chat – High-quality multilingual and instruction-tuned models. - * Gemma (Google) – Compact and efficient architecture; ideal for edge or cost-optimized serving. - -You can quantize and serve them using the same `quantize_vllm_models.py` recipe, just update the model name. +## Try different models +Explore other Hugging Face models that work well with vLLM and take advantage of Arm acceleration: -**Connect a chat client:** Link your server with OpenAI-compatible UIs like [Open WebUI](https://github.com/open-webui/open-webui) +- Meta Llama 2 and Llama 3: these versatile models work well for general tasks, and you can try them to compare BF16 and INT4 performance +- Qwen and Qwen-Chat: these models support multiple languages and are tuned for instructions, giving you high-quality results +- Gemma (Google): this compact and efficient model is a good choice for edge devices or deployments where cost matters -You can continue exploring how Arm’s efficiency, oneDNN+ACL acceleration, and vLLM’s dynamic batching combine to deliver fast, sustainable, and scalable AI inference on modern Arm architectures. 
+You can quantize and serve any of these models using the same `quantize_vllm_models.py` script. Just update the model name in the script. + +You can also try connecting a chat client by linking your server with OpenAI-compatible user interfaces such as [Open WebUI](https://github.com/open-webui/open-webui). + +Continue exploring how Arm efficiency, oneDNN and ACL acceleration, and vLLM dynamic batching work together to provide fast, sustainable, and scalable AI inference on modern Arm architectures. diff --git a/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/4-accuracy-benchmarking.md b/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/4-accuracy-benchmarking.md index 3b53e5ddb4..db43cdce54 100644 --- a/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/4-accuracy-benchmarking.md +++ b/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/4-accuracy-benchmarking.md @@ -1,5 +1,5 @@ --- -title: Evaluate Accuracy with LM Evaluation Harness +title: Evaluate accuracy with LM Evaluation Harness weight: 5 ### FIXED, DO NOT MODIFY @@ -8,7 +8,7 @@ layout: learningpathall ## Why accuracy benchmarking -The lm-evaluation-harness is the standard way to measure model accuracy across common academic benchmarks (for example, MMLU, HellaSwag, GSM8K) and runtimes (Hugging Face, vLLM, llama.cpp, etc.). In this module, you will run accuracy tests for both BF16 and INT4 deployments of your model served by vLLM on Arm-based servers. +The lm-evaluation-harness is the standard way to measure model accuracy across common academic benchmarks (for example, MMLU, HellaSwag, GSM8K) and runtimes (such as Hugging Face, vLLM, and llama.cpp). In this Learning Path, you'll run accuracy tests for both BF16 and INT4 deployments of your model served by vLLM on Arm-based servers. 
You will: * Install lm-eval-harness with vLLM support @@ -16,9 +16,10 @@ You will: * Interpret key metrics and compare quality across precisions {{% notice Note %}} -Results depend on CPU, dataset versions, and model choice. Use the same tasks and few-shot settings when comparing BF16 and INT4 to ensure a fair comparison. +Results vary based on your CPU, dataset version, and model selection. For a fair comparison between BF16 and INT4, always use the same tasks and few-shot settings. {{% /notice %}} + ## Prerequisites Before you start: @@ -70,9 +71,23 @@ lm_eval \ --output_path results ``` -## Accuracy Benchmarking INT4 quantized model +## Benchmark INT4 quantized model accuracy -Use the INT4 quantization recipe & script from previous steps to quantize `meta-llama/Meta-Llama-3.1-8B-Instruct` model +Run accuracy tests on your INT4 quantized model using the same tasks and settings as the BF16 baseline. Replace the model path with your quantized output directory. + +```bash +lm_eval \ + --model vllm \ + --model_args \ + pretrained=Meta-Llama-3.1-8B-Instruct-w4a8dyn-mse-channelwise,dtype=float32,max_model_len=4096,enforce_eager=True \ + --tasks mmlu,hellaswag \ + --batch_size auto \ + --output_path results +``` + +The expected output includes per-task accuracy metrics. Compare these results to your BF16 baseline to evaluate the impact of INT4 quantization on model quality. + +Use the INT4 quantization recipe & script from previous steps to quantize `meta-llama/Meta-Llama-3.1-8B-Instruct` model. Channelwise INT4 (MSE): @@ -86,7 +101,7 @@ lm_eval \ --output_path results ``` -## Interpreting results +## Interpret the results The harness prints per-task and aggregate scores (for example, `acc`, `acc_norm`, `exact_match`). Higher is generally better. Compare BF16 vs INT4 on the same tasks to assess quality impact. @@ -94,7 +109,7 @@ Practical tips: * Use the same tasks and few-shot settings across runs. * For quick iteration, you can add `--limit 200` to run on a subset. 
-## Example results for Meta‑Llama‑3.1‑8B‑Instruct model +## Explore example results for Meta‑Llama‑3.1‑8B‑Instruct model These illustrative results are representative; actual scores may vary across hardware, dataset versions, and harness releases. Higher values indicate better accuracy. @@ -108,6 +123,10 @@ Use these as ballpark expectations to check whether your runs are in a reasonabl ## Next steps - * Try additional tasks to match your usecase: `gsm8k`, `winogrande`, `arc_easy`, `arc_challenge`. - * Sweep quantization recipes (minmax vs mse; channelwise vs groupwise, group size) to find a better accuracy/performance balance. - * Record both throughput and accuracy to choose the best configuration for your workload. +Now that you've completed accuracy benchmarking for both BF16 and INT4 models on Arm-based servers, you're ready to deepen your evaluation and optimize for your specific use case. Expanding your benchmarks to additional tasks helps you understand model performance across a wider range of scenarios. Experimenting with different quantization recipes lets you balance accuracy and throughput for your workload. + +- Try additional tasks to match your use case: `gsm8k`, `winogrande`, `arc_easy`, `arc_challenge`. +- Sweep quantization recipes (minmax vs mse; channelwise vs groupwise, group size) to find a better accuracy/performance balance. +- Record both throughput and accuracy to choose the best configuration for your workload. + +You've learned how to set up lm-evaluation-harness, run benchmarks for BF16 and INT4 models, and interpret key accuracy metrics on Arm platforms. Great job reaching this milestone—your results will help you make informed decisions about model deployment and optimization! 
diff --git a/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/_index.md b/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/_index.md index 9975529515..081c3048cf 100644 --- a/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/_index.md @@ -1,25 +1,21 @@ --- -title: Optimized LLM Inference with vLLM on Arm-Based Servers - -draft: true -cascade: - draft: true +title: Accelerate vLLM inference on Azure Cobalt 100 virtual machines minutes_to_complete: 60 -who_is_this_for: This learning path is designed for software developers and AI engineers who want to build and optimize vLLM for Arm-based servers, quantize large language models (LLMs) to INT4, serve them efficiently through an OpenAI-compatible API, and benchmark model accuracy using the LM Evaluation Harness. +who_is_this_for: This is an introductory topic for developers interested in building and optimizing vLLM for Arm-based servers. This Learning Path shows you how to quantize large language models (LLMs) to INT4, serve them efficiently using an OpenAI-compatible API, and benchmark model accuracy with the LM Evaluation Harness. learning_objectives: - - Build an optimized vLLM for aarch64 with oneDNN and the Arm Compute Library(ACL). - - Set up all runtime dependencies including PyTorch, llmcompressor, and Arm-optimized libraries. - - Quantize an LLM (DeepSeek‑V2‑Lite) to 4-bit integer (INT4) precision. - - Run and serve both quantized and BF16 (non-quantized) variants using vLLM. - - Use OpenAI‑compatible endpoints and understand sequence and batch limits. - - Evaluate accuracy using the LM Evaluation Harness on BF16 and INT4 models with vLLM. 
+ - Build an optimized vLLM for aarch64 with oneDNN and the Arm Compute Library (ACL) + - Set up all runtime dependencies including PyTorch, llmcompressor, and Arm-optimized libraries + - Quantize an LLM (DeepSeek‑V2‑Lite) to 4-bit integer (INT4) precision + - Run and serve both quantized and BF16 (non-quantized) variants using vLLM + - Use OpenAI‑compatible endpoints and understand sequence and batch limits + - Evaluate accuracy using the LM Evaluation Harness on BF16 and INT4 models with vLLM prerequisites: - - An Arm-based Linux server (Ubuntu 22.04+ recommended) with a minimum of 32 vCPUs, 64 GB RAM, and 64 GB free disk space. - - Python 3.12 and basic familiarity with Hugging Face Transformers and quantization. + - An Arm-based Linux server (Ubuntu 22.04+ recommended) with a minimum of 32 vCPUs, 64 GB RAM, and 64 GB free disk space + - Python 3.12 and basic familiarity with Hugging Face Transformers and quantization author: - Nikhil Gupta @@ -38,6 +34,7 @@ tools_software_languages: - Generative AI - Python - PyTorch + - Hugging Face further_reading: - resource: @@ -47,7 +44,7 @@ further_reading: - resource: title: vLLM GitHub Repository link: https://github.com/vllm-project/vllm - type: github + type: website - resource: title: Hugging Face Model Hub link: https://huggingface.co/models @@ -59,7 +56,7 @@ further_reading: - resource: title: LM Evaluation Harness (GitHub) link: https://github.com/EleutherAI/lm-evaluation-harness - type: github + type: website