From f4a91abb348b96f3b5afae1472661a1dd1fa5af3 Mon Sep 17 00:00:00 2001 From: yinyu01 Date: Wed, 27 Aug 2025 18:47:43 +0800 Subject: [PATCH] update tune-network-workloads-on-bare-metal result with AWS c8g.metal-48xl instance --- .../1_setup.md | 62 +++---- .../2_baseline.md | 168 ++++++++++++------ .../3_nic-queue.md | 38 ++-- .../{4_local-numa1.md => 4_local-numa.md} | 57 +++--- .../5_iommu.md | 46 ++--- .../6_summary.md | 11 +- .../_index.md | 4 +- 7 files changed, 215 insertions(+), 171 deletions(-) rename content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/{4_local-numa1.md => 4_local-numa.md} (50%) diff --git a/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/1_setup.md b/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/1_setup.md index 737df21707..60ebc8d111 100644 --- a/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/1_setup.md +++ b/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/1_setup.md @@ -11,9 +11,9 @@ layout: learningpathall There are numerous client-server and network-based workloads, and Tomcat is a typical example of such applications, which provide services via HTTP/HTTPS network requests. -In this section, you'll set up a benchmark environment using Apache Tomcat and `wrk2` to simulate HTTP load and evaluate performance on an Arm-based bare-metal (**__`Nvidia-Grace`__**). +In this section, you'll set up a benchmark environment using `Apache Tomcat` and `wrk2` to simulate HTTP load and evaluate performance on an Arm-based bare-metal instance, such as **__`AWS c8g.metal-48xl`__**. 
-## Set up the Tomcat benchmark server on **Nvidia Grace** +## Set up the Tomcat benchmark server on **AWS c8g.metal-48xl** [Apache Tomcat](https://tomcat.apache.org/) is an open-source Java Servlet container that runs Java web applications, handles HTTP requests, and serves dynamic content. It supports technologies such as Servlet, JSP, and WebSocket. ## Install the Java Development Kit (JDK) @@ -30,8 +30,8 @@ sudo apt install -y openjdk-21-jdk Download and extract Tomcat: ```bash -wget -c https://dlcdn.apache.org/tomcat/tomcat-11/v11.0.9/bin/apache-tomcat-11.0.9.tar.gz -tar xzf apache-tomcat-11.0.9.tar.gz +wget -c https://dlcdn.apache.org/tomcat/tomcat-11/v11.0.10/bin/apache-tomcat-11.0.10.tar.gz +tar xzf apache-tomcat-11.0.10.tar.gz ``` Alternatively, you can build Tomcat [from source](https://github.com/apache/tomcat). @@ -41,7 +41,7 @@ To access the built-in examples from your local network or external IP, use a te The file is at: ```bash -apache-tomcat-11.0.9/webapps/examples/META-INF/context.xml +~/apache-tomcat-11.0.10/webapps/examples/META-INF/context.xml ``` ```xml @@ -60,17 +60,17 @@ To achieve maximum performance of Tomcat, the maximum number of file descriptors Start the server: ```bash -ulimit -n 65535 && ./apache-tomcat-11.0.9/bin/startup.sh +ulimit -n 65535 && ~/apache-tomcat-11.0.10/bin/startup.sh ``` You should see output like: ```output -Using CATALINA_BASE: /home/ubuntu/apache-tomcat-11.0.9 -Using CATALINA_HOME: /home/ubuntu/apache-tomcat-11.0.9 -Using CATALINA_TMPDIR: /home/ubuntu/apache-tomcat-11.0.9/temp +Using CATALINA_BASE: /home/ubuntu/apache-tomcat-11.0.10 +Using CATALINA_HOME: /home/ubuntu/apache-tomcat-11.0.10 +Using CATALINA_TMPDIR: /home/ubuntu/apache-tomcat-11.0.10/temp Using JRE_HOME: /usr -Using CLASSPATH: /home/ubuntu/apache-tomcat-11.0.9/bin/bootstrap.jar:/home/ubuntu/apache-tomcat-11.0.9/bin/tomcat-juli.jar +Using CLASSPATH: 
/home/ubuntu/apache-tomcat-11.0.10/bin/bootstrap.jar:/home/ubuntu/apache-tomcat-11.0.10/bin/tomcat-juli.jar Using CATALINA_OPTS: Tomcat started. ``` @@ -132,28 +132,28 @@ ulimit -n 65535 && wrk -c32 -t16 -R50000 -d60 http://${tomcat_ip}:8080/examples/ You should see output similar to: ```console -Running 1m test @ http://172.26.203.139:8080/examples/servlets/servlet/HelloWorldExample +Running 1m test @ http://172.31.46.193:8080/examples/servlets/servlet/HelloWorldExample 16 threads and 32 connections - Thread calibration: mean lat.: 0.986ms, rate sampling interval: 10ms - Thread calibration: mean lat.: 0.984ms, rate sampling interval: 10ms - Thread calibration: mean lat.: 0.999ms, rate sampling interval: 10ms - Thread calibration: mean lat.: 0.994ms, rate sampling interval: 10ms - Thread calibration: mean lat.: 0.983ms, rate sampling interval: 10ms - Thread calibration: mean lat.: 0.989ms, rate sampling interval: 10ms - Thread calibration: mean lat.: 0.991ms, rate sampling interval: 10ms - Thread calibration: mean lat.: 0.993ms, rate sampling interval: 10ms - Thread calibration: mean lat.: 0.985ms, rate sampling interval: 10ms - Thread calibration: mean lat.: 0.990ms, rate sampling interval: 10ms - Thread calibration: mean lat.: 0.987ms, rate sampling interval: 10ms - Thread calibration: mean lat.: 0.990ms, rate sampling interval: 10ms - Thread calibration: mean lat.: 0.984ms, rate sampling interval: 10ms - Thread calibration: mean lat.: 0.991ms, rate sampling interval: 10ms - Thread calibration: mean lat.: 0.978ms, rate sampling interval: 10ms - Thread calibration: mean lat.: 0.976ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 3.381ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 3.626ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 3.020ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 3.578ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 3.166ms, rate sampling interval: 10ms + 
Thread calibration: mean lat.: 3.275ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 3.454ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 3.655ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 3.334ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 3.089ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 3.365ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 3.382ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 3.342ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 3.349ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 3.023ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 3.275ms, rate sampling interval: 10ms Thread Stats Avg Stdev Max +/- Stdev - Latency 1.00ms 454.90us 5.09ms 63.98% - Req/Sec 3.31k 241.68 4.89k 63.83% - 2999817 requests in 1.00m, 1.56GB read -Requests/sec: 49997.08 + Latency 1.02ms 398.88us 4.24ms 66.77% + Req/Sec 3.30k 210.16 4.44k 70.04% + 2999776 requests in 1.00m, 1.56GB read +Requests/sec: 49996.87 Transfer/sec: 26.57MB ``` diff --git a/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/2_baseline.md b/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/2_baseline.md index 970ee2e10a..fe0dffcb5c 100644 --- a/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/2_baseline.md +++ b/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/2_baseline.md @@ -11,18 +11,52 @@ To achieve maximum performance, ulimit -n 65535 must be executed on both server {{% /notice %}} ## Optimal baseline before tuning -- Baseline on Grace bare-metal (default configuration) -- Baseline on Grace bare-metal (access logging disabled) -- Baseline on Grace bare-metal (optimal thread count) +- Align the IOMMU settings with default Ubuntu +- Baseline on Arm Neoverse 
bare-metal (default configuration) +- Baseline on Arm Neoverse bare-metal (access logging disabled) +- Baseline on Arm Neoverse bare-metal (optimal thread count) + +### Align the IOMMU settings with default Ubuntu + +{{% notice Note %}} +Due to the customized Ubuntu distribution on AWS, you first need to align the IOMMU settings with default Ubuntu: iommu.strict=1 and iommu.passthrough=0. +{{% /notice %}} + +1. To restore the default IOMMU settings, use a text editor to modify the `grub` file by adding or updating the `GRUB_CMDLINE_LINUX` configuration. + +```bash +sudo vi /etc/default/grub +``` +Then add or update: +```bash +GRUB_CMDLINE_LINUX="iommu.strict=1 iommu.passthrough=0" +``` + +2. Update GRUB and reboot to apply the default settings. +```bash +sudo update-grub && sudo reboot +``` + +3. Verify whether the default settings have been successfully applied. +```bash +sudo dmesg | grep iommu +``` +It can be observed that under the default configuration, iommu.strict is enabled, and iommu.passthrough is disabled. +```bash +[ 0.877401] iommu: Default domain type: Translated (set via kernel command line) +[ 0.877404] iommu: DMA domain TLB invalidation policy: strict mode (set via kernel command line) +... +``` + +### Baseline on Arm Neoverse bare-metal (default configuration) + -### Baseline on Grace bare-metal (default configuration) {{% notice Note %}} To align with the typical deployment scenario of Tomcat, reserve 8 cores online and set all other cores offline {{% /notice %}} 1. You can offline the CPU cores using the command below. ```bash -for no in {8..143}; do sudo bash -c "echo 0 > /sys/devices/system/cpu/cpu${no}/online"; done +for no in {8..191}; do sudo bash -c "echo 0 > /sys/devices/system/cpu/cpu${no}/online"; done ``` 2. Use the following commands to verify that cores 0-7 are online and the remaining cores are offline. 
```bash @@ -30,26 +64,26 @@ lscpu ``` You can check the following information: ```bash -Architecture: aarch64 - CPU op-mode(s): 64-bit - Byte Order: Little Endian -CPU(s): 144 - On-line CPU(s) list: 0-7 - Off-line CPU(s) list: 8-143 -Vendor ID: ARM - Model name: Neoverse-V2 +Architecture: aarch64 + CPU op-mode(s): 64-bit + Byte Order: Little Endian +CPU(s): 192 + On-line CPU(s) list: 0-7 + Off-line CPU(s) list: 8-191 +Vendor ID: ARM + Model name: Neoverse-V2 ... ``` -3. Use the following command on the Grace bare-metal where `Tomcat` is on +3. Use the following command on the Arm Neoverse bare-metal where `Tomcat` is on ```bash -~/apache-tomcat-11.0.9/bin/shutdown.sh 2>/dev/null -ulimit -n 65535 && ~/apache-tomcat-11.0.9/bin/startup.sh +~/apache-tomcat-11.0.10/bin/shutdown.sh 2>/dev/null +ulimit -n 65535 && ~/apache-tomcat-11.0.10/bin/startup.sh ``` 4. And use the following command on the `x86_64` bare-metal where `wrk2` is on ```bash -tomcat_ip=10.169.226.181 +tomcat_ip=172.31.46.193 ``` ```bash ulimit -n 65535 && wrk -c1280 -t128 -R500000 -d60 http://${tomcat_ip}:8080/examples/servlets/servlet/HelloWorldExample @@ -58,20 +92,20 @@ ulimit -n 65535 && wrk -c1280 -t128 -R500000 -d60 http://${tomcat_ip}:8080/examp The result of default configuration is: ```bash Thread Stats Avg Stdev Max +/- Stdev - Latency 13.29s 3.25s 19.07s 57.79% - Req/Sec 347.59 430.94 0.97k 66.67% - 3035300 requests in 1.00m, 1.58GB read - Socket errors: connect 1280, read 0, write 0, timeout 21760 -Requests/sec: 50517.09 -Transfer/sec: 26.84MB + Latency 16.76s 6.59s 27.56s 56.98% + Req/Sec 1.97k 165.05 2.33k 89.90% + 14680146 requests in 1.00m, 7.62GB read + Socket errors: connect 1264, read 0, write 0, timeout 1748 +Requests/sec: 244449.62 +Transfer/sec: 129.90MB ``` -### Baseline on Grace bare-metal (access logging disabled) +### Baseline on Arm Neoverse bare-metal (access logging disabled) To disable the access logging, use a text editor to modify the `server.xml` file by commenting out or 
removing the **`org.apache.catalina.valves.AccessLogValve`** configuration. The file is at: ```bash -vi ~/apache-tomcat-11.0.9/conf/server.xml +vi ~/apache-tomcat-11.0.10/conf/server.xml ``` The configuration is at the end of the file; comment it out or remove it. @@ -83,10 +117,10 @@ The configuration is at the end of the file; comment it out or remove it. --> ``` -1. Use the following command on the Grace bare-metal where `Tomcat` is on +1. Use the following command on the Arm Neoverse bare-metal where `Tomcat` runs: ```bash -~/apache-tomcat-11.0.9/bin/shutdown.sh 2>/dev/null -ulimit -n 65535 && ~/apache-tomcat-11.0.9/bin/startup.sh +~/apache-tomcat-11.0.10/bin/shutdown.sh 2>/dev/null +ulimit -n 65535 && ~/apache-tomcat-11.0.10/bin/startup.sh ``` 2. And use the following command on the `x86_64` bare-metal where `wrk2` is on @@ -97,15 +131,15 @@ ulimit -n 65535 && wrk -c1280 -t128 -R500000 -d60 http://${tomcat_ip}:8080/examp The result of access logging disabled is: ```bash Thread Stats Avg Stdev Max +/- Stdev - Latency 12.66s 3.05s 17.87s 57.47% - Req/Sec 433.69 524.91 1.18k 66.67% - 3572200 requests in 1.00m, 1.85GB read - Socket errors: connect 1280, read 0, write 0, timeout 21760 -Requests/sec: 59451.85 -Transfer/sec: 31.59MB + Latency 16.16s 6.45s 28.26s 57.85% + Req/Sec 2.16k 5.91 2.17k 77.50% + 16291136 requests in 1.00m, 8.45GB read + Socket errors: connect 0, read 0, write 0, timeout 75 +Requests/sec: 271675.12 +Transfer/sec: 144.36MB ``` -### Baseline on Grace bare-metal (optimal thread count) +### Baseline on Arm Neoverse bare-metal (optimal thread count) To minimize resource contention between threads and overhead from thread context switching, the number of CPU-intensive threads in Tomcat should be aligned with the number of CPU cores. 1. 
When using `wrk` to perform pressure testing on `Tomcat`: @@ -115,23 +149,39 @@ top -H -p$(pgrep java) You can see the below information ```bash -top - 12:12:45 up 1 day, 7:04, 5 users, load average: 7.22, 3.46, 1.75 -Threads: 79 total, 8 running, 71 sleeping, 0 stopped, 0 zombie -%Cpu(s): 3.4 us, 1.9 sy, 0.0 ni, 94.1 id, 0.0 wa, 0.0 hi, 0.5 si, 0.0 st -MiB Mem : 964975.5 total, 602205.6 free, 12189.5 used, 356708.3 buff/cache -MiB Swap: 0.0 total, 0.0 free, 0.0 used. 952786.0 avail Mem +top - 08:57:29 up 20 min, 1 user, load average: 4.17, 2.35, 1.22 +Threads: 231 total, 8 running, 223 sleeping, 0 stopped, 0 zombie +%Cpu(s): 31.7 us, 20.2 sy, 0.0 ni, 31.0 id, 0.0 wa, 0.0 hi, 17.2 si, 0.0 st +MiB Mem : 386127.8 total, 380676.0 free, 4040.7 used, 2801.1 buff/cache +MiB Swap: 0.0 total, 0.0 free, 0.0 used. 382087.0 avail Mem PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND - 53254 yinyu01 20 0 38.0g 1.4g 28288 R 96.7 0.1 2:30.70 http-nio-8080-e - 53255 yinyu01 20 0 38.0g 1.4g 28288 R 96.7 0.1 2:30.62 http-nio-8080-e - 53256 yinyu01 20 0 38.0g 1.4g 28288 R 96.7 0.1 2:30.64 http-nio-8080-e - 53258 yinyu01 20 0 38.0g 1.4g 28288 R 96.7 0.1 2:30.62 http-nio-8080-e - 53260 yinyu01 20 0 38.0g 1.4g 28288 R 96.7 0.1 2:30.69 http-nio-8080-e - 53257 yinyu01 20 0 38.0g 1.4g 28288 R 96.3 0.1 2:30.59 http-nio-8080-e - 53259 yinyu01 20 0 38.0g 1.4g 28288 R 96.3 0.1 2:30.63 http-nio-8080-e - 53309 yinyu01 20 0 38.0g 1.4g 28288 R 95.3 0.1 2:29.69 http-nio-8080-P - 53231 yinyu01 20 0 38.0g 1.4g 28288 S 0.3 0.1 0:00.10 VM Thread - 53262 yinyu01 20 0 38.0g 1.4g 28288 S 0.3 0.1 0:00.12 GC Thread#2 + 4677 ubuntu 20 0 36.0g 1.4g 24452 R 89.0 0.4 1:18.71 http-nio-8080-P + 4685 ubuntu 20 0 36.0g 1.4g 24452 R 4.7 0.4 0:04.42 http-nio-8080-A + 4893 ubuntu 20 0 36.0g 1.4g 24452 S 3.3 0.4 0:00.60 http-nio-8080-e + 4963 ubuntu 20 0 36.0g 1.4g 24452 S 3.3 0.4 0:00.66 http-nio-8080-e + 4924 ubuntu 20 0 36.0g 1.4g 24452 S 3.0 0.4 0:00.59 http-nio-8080-e + 4955 ubuntu 20 0 36.0g 1.4g 24452 S 3.0 
0.4 0:00.60 http-nio-8080-e + 5061 ubuntu 20 0 36.0g 1.4g 24452 S 3.0 0.4 0:00.61 http-nio-8080-e + 4895 ubuntu 20 0 36.0g 1.4g 24452 S 2.7 0.4 0:00.58 http-nio-8080-e + 4907 ubuntu 20 0 36.0g 1.4g 24452 S 2.7 0.4 0:00.59 http-nio-8080-e + 4940 ubuntu 20 0 36.0g 1.4g 24452 S 2.7 0.4 0:00.58 http-nio-8080-e + 4946 ubuntu 20 0 36.0g 1.4g 24452 S 2.7 0.4 0:00.59 http-nio-8080-e + 4956 ubuntu 20 0 36.0g 1.4g 24452 S 2.7 0.4 0:00.65 http-nio-8080-e + 4959 ubuntu 20 0 36.0g 1.4g 24452 S 2.7 0.4 0:00.59 http-nio-8080-e + 4960 ubuntu 20 0 36.0g 1.4g 24452 R 2.7 0.4 0:00.60 http-nio-8080-e + 4962 ubuntu 20 0 36.0g 1.4g 24452 S 2.7 0.4 0:00.57 http-nio-8080-e + 4982 ubuntu 20 0 36.0g 1.4g 24452 S 2.7 0.4 0:00.63 http-nio-8080-e + 4983 ubuntu 20 0 36.0g 1.4g 24452 S 2.7 0.4 0:00.58 http-nio-8080-e + 4996 ubuntu 20 0 36.0g 1.4g 24452 S 2.7 0.4 0:00.60 http-nio-8080-e + 5033 ubuntu 20 0 36.0g 1.4g 24452 S 2.7 0.4 0:00.59 http-nio-8080-e + 5036 ubuntu 20 0 36.0g 1.4g 24452 S 2.7 0.4 0:00.66 http-nio-8080-e + 5056 ubuntu 20 0 36.0g 1.4g 24452 S 2.7 0.4 0:00.61 http-nio-8080-e + 5065 ubuntu 20 0 36.0g 1.4g 24452 S 2.7 0.4 0:00.56 http-nio-8080-e + 5068 ubuntu 20 0 36.0g 1.4g 24452 S 2.7 0.4 0:00.61 http-nio-8080-e + 5070 ubuntu 20 0 36.0g 1.4g 24452 S 2.7 0.4 0:00.60 http-nio-8080-e + 5071 ubuntu 20 0 36.0g 1.4g 24452 S 2.7 0.4 0:00.61 http-nio-8080-e +... ``` It can be observed that **`http-nio-8080-e`** and **`http-nio-8080-P`** threads are CPU-intensive. @@ -141,7 +191,7 @@ To configure the `http-nio-8080-e` thread count, use a text editor to modify the The file is at: ```bash -vi ~/apache-tomcat-11.0.9/conf/server.xml +vi ~/apache-tomcat-11.0.10/conf/server.xml ``` @@ -164,10 +214,10 @@ vi ~/apache-tomcat-11.0.9/conf/server.xml /> ``` -2. Use the following command on the Grace bare-metal where `Tomcat` is on +2. 
Use the following command on the Arm Neoverse bare-metal where `Tomcat` is on ```bash -~/apache-tomcat-11.0.9/bin/shutdown.sh 2>/dev/null -ulimit -n 65535 && ~/apache-tomcat-11.0.9/bin/startup.sh +~/apache-tomcat-11.0.10/bin/shutdown.sh 2>/dev/null +ulimit -n 65535 && ~/apache-tomcat-11.0.10/bin/startup.sh ``` 3. And use the following command on the `x86_64` bare-metal where `wrk2` is on @@ -178,9 +228,9 @@ ulimit -n 65535 && wrk -c1280 -t128 -R500000 -d60 http://${tomcat_ip}:8080/examp The result of optimal thread count is: ```bash Thread Stats Avg Stdev Max +/- Stdev - Latency 24.34s 9.91s 41.81s 57.77% - Req/Sec 1.22k 4.29 1.23k 71.09% - 9255672 requests in 1.00m, 4.80GB read -Requests/sec: 154479.07 -Transfer/sec: 82.06MB + Latency 10.26s 4.55s 19.81s 62.51% + Req/Sec 2.86k 89.49 3.51k 77.06% + 21458421 requests in 1.00m, 11.13GB read +Requests/sec: 357835.75 +Transfer/sec: 190.08MB ``` diff --git a/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/3_nic-queue.md b/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/3_nic-queue.md index a2ead0e5fc..9266ad6b02 100644 --- a/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/3_nic-queue.md +++ b/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/3_nic-queue.md @@ -10,7 +10,7 @@ layout: learningpathall - Setting NIC queue count - The result after tuning NIC queue count -Typically, the number of transmit/receive queues for network cards in bare-metal environments is relatively large, reaching 63 on Grace. Each transmit/receive queue corresponds to one interrupt number. Before CPU cores are taken offline, there are sufficient cores to handle these interrupt numbers. However, when only 8 cores are retained, it results in a single core having to handle multiple interrupt numbers, thereby triggering more context switches. 
+Typically, the number of transmit/receive queues for network cards in bare-metal environments is relatively large, reaching 32 on the AWS c8g.metal-48xl. Each transmit/receive queue corresponds to one interrupt number. Before CPU cores are taken offline, there are sufficient cores to handle these interrupt numbers. However, when only 8 cores are retained, it results in a single core having to handle multiple interrupt numbers, thereby triggering more context switches. ### Setting NIC queue count @@ -26,11 +26,11 @@ It can be observed that the NIC name `enp1s0f0np0` corresponds to the IP address valid_lft forever preferred_lft forever inet6 ::1/128 scope host noprefixroute valid_lft forever preferred_lft forever -2: enp1s0f0np0: mtu 1500 qdisc mq state UP group default qlen 1000 - link/ether b8:e9:24:67:d5:3a brd ff:ff:ff:ff:ff:ff - inet 10.169.226.181/24 brd 10.169.226.255 scope global enp1s0f0np0 - valid_lft forever preferred_lft forever - inet6 fe80::bae9:24ff:fe67:d53a/64 scope link +2: enP11p4s0: mtu 9001 qdisc mq state UP group default qlen 1000 + link/ether 0e:cc:0b:ff:f6:57 brd ff:ff:ff:ff:ff:ff + inet 172.31.46.193/20 metric 100 brd 172.31.47.255 scope global dynamic enP11p4s0 + valid_lft 1938sec preferred_lft 1938sec + inet6 fe80::ccc:bff:feff:f657/64 scope link valid_lft forever preferred_lft forever ``` @@ -45,17 +45,17 @@ sudo ethtool -l ${net} ``` -It can be observed that the number of transmit/receive queues for the ${net} network interface is currently 63. +It can be observed that the number of transmit/receive queues for the ${net} network interface is currently 32. ```bash -Channel parameters for enp1s0f0np0: +Channel parameters for enP11p4s0: Pre-set maximums: RX: n/a TX: n/a Other: n/a -Combined: 63 +Combined: 32 Current hardware settings: RX: n/a TX: n/a Other: n/a -Combined: 63 +Combined: 32 ``` 4. Use the following command to reset the number of transmit/receive queues for the ${net} to match the number of CPUs, which is 8. @@ -68,12 +68,12 @@ sudo ethtool -l ${net} ``` It can be observed that the number of combined Rx/Tx queues has been updated to 8. 
```bash -Channel parameters for enp1s0f0np0: +Channel parameters for enP11p4s0: Pre-set maximums: RX: n/a TX: n/a Other: n/a -Combined: 63 +Combined: 32 Current hardware settings: RX: n/a TX: n/a @@ -83,10 +83,10 @@ Combined: 8 ### The result after tuning NIC queue count -1. Use the following command on the Grace bare-metal where `Tomcat` is on +1. Use the following command on the Arm Neoverse bare-metal where `Tomcat` is on ```bash -~/apache-tomcat-11.0.9/bin/shutdown.sh 2>/dev/null -ulimit -n 65535 && ~/apache-tomcat-11.0.9/bin/startup.sh +~/apache-tomcat-11.0.10/bin/shutdown.sh 2>/dev/null +ulimit -n 65535 && ~/apache-tomcat-11.0.10/bin/startup.sh ``` 2. And use the following command on the `x86_64` bare-metal where `wrk2` is on @@ -97,9 +97,9 @@ ulimit -n 65535 && wrk -c1280 -t128 -R500000 -d60 http://${tomcat_ip}:8080/examp The result after NIC queue count tuned: ```bash Thread Stats Avg Stdev Max +/- Stdev - Latency 21.64s 8.71s 37.22s 57.82% - Req/Sec 1.53k 5.70 1.55k 77.15% - 11562557 requests in 1.00m, 6.00GB read -Requests/sec: 192932.92 -Transfer/sec: 102.49MB + Latency 8.35s 4.14s 16.33s 61.16% + Req/Sec 2.96k 73.02 3.24k 89.16% + 22712999 requests in 1.00m, 11.78GB read +Requests/sec: 378782.37 +Transfer/sec: 201.21MB ``` diff --git a/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/4_local-numa1.md b/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/4_local-numa.md similarity index 50% rename from content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/4_local-numa1.md rename to content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/4_local-numa.md index f657d11456..4486d818f2 100644 --- a/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/4_local-numa1.md +++ b/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/4_local-numa.md @@ -22,16 
+22,16 @@ numactl -H -It can be observed that the cross-NUMA latency to intra-NUMA latency ratio is 10:40. +It can be observed that the intra-NUMA to cross-NUMA distance ratio is 10:100. ```bash available: 2 nodes (0-1) -node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 -node 0 size: 483129 MB -node 0 free: 462395 MB -node 1 cpus: 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 -node 1 size: 481845 MB -node 1 free: 472013 MB +node 0 cpus: 0 1 2 3 4 5 6 7 +node 0 size: 193502 MB +node 0 free: 188478 MB +node 1 cpus: +node 1 size: 192625 MB +node 1 free: 192338 MB node distances: node 0 1 - 0: 10 40 - 1: 40 10 + 0: 10 100 + 1: 100 10 ``` 2. Use the following command to check the NUMA node where the ${net} network interface resides. @@ -45,8 +45,8 @@ It can be observed that the NUMA node where the ${net} network interface resides 3. Therefore, allocate the reserved 8 cores to NUMA node 1. ```bash -for no in {72..79}; do sudo bash -c "echo 1 > /sys/devices/system/cpu/cpu${no}/online"; done -for no in {0..71} {80..143}; do sudo bash -c "echo 0 > /sys/devices/system/cpu/cpu${no}/online"; done +for no in {96..103}; do sudo bash -c "echo 1 > /sys/devices/system/cpu/cpu${no}/online"; done +for no in {0..95} {104..191}; do sudo bash -c "echo 0 > /sys/devices/system/cpu/cpu${no}/online"; done ``` 4. Verify whether the settings have been successfully applied. @@ -56,28 +56,28 @@ lscpu -It can be observed that the only online CPUs are 72-79 on NUMA node 1. +It can be observed that the only online CPUs are 96-103 on NUMA node 1. 
```bash -Architecture: aarch64 - CPU op-mode(s): 64-bit - Byte Order: Little Endian -CPU(s): 144 - On-line CPU(s) list: 72-79 - Off-line CPU(s) list: 0-71,80-143 -Vendor ID: ARM - Model name: Neoverse-V2 +Architecture: aarch64 + CPU op-mode(s): 64-bit + Byte Order: Little Endian +CPU(s): 192 + On-line CPU(s) list: 96-103 + Off-line CPU(s) list: 0-95,104-191 +Vendor ID: ARM + Model name: Neoverse-V2 ... NUMA: - NUMA node(s): 2 + NUMA node(s): 2 NUMA node0 CPU(s): - NUMA node1 CPU(s): 72-79 + NUMA node1 CPU(s): 96-103 ... ``` ### The result after tuning local NUMA -1. Use the following command on the Grace bare-metal where `Tomcat` is on +1. Use the following command on the Arm Neoverse bare-metal where `Tomcat` is on ```bash -~/apache-tomcat-11.0.9/bin/shutdown.sh 2>/dev/null -ulimit -n 65535 && ~/apache-tomcat-11.0.9/bin/startup.sh +~/apache-tomcat-11.0.10/bin/shutdown.sh 2>/dev/null +ulimit -n 65535 && ~/apache-tomcat-11.0.10/bin/startup.sh ``` 2. And use the following command on the `x86_64` bare-metal where `wrk2` is on @@ -88,10 +88,9 @@ ulimit -n 65535 && wrk -c1280 -t128 -R500000 -d60 http://${tomcat_ip}:8080/examp The result after NUMA node tuned: ```bash Thread Stats Avg Stdev Max +/- Stdev - Latency 18.72s 7.76s 33.78s 57.93% - Req/Sec 1.87k 59.38 2.08k 58.75% - 14111369 requests in 1.00m, 7.32GB read - Socket errors: connect 0, read 0, write 0, timeout 64 -Requests/sec: 235505.32 -Transfer/sec: 125.10MB + Latency 9.41s 4.71s 18.02s 61.07% + Req/Sec 2.84k 76.55 3.06k 72.37% + 21814220 requests in 1.00m, 11.32GB read +Requests/sec: 363744.39 +Transfer/sec: 193.22MB ``` diff --git a/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/5_iommu.md b/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/5_iommu.md index 7de6c80961..32fc6a9f22 100644 --- a/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/5_iommu.md +++ 
b/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/5_iommu.md @@ -14,17 +14,7 @@ In cloud environments, SmartNICs are typically used to offload the IOMMU workloa ### Setting iommu -1. Use the following command to verify the default IOMMU status on the bare-metal -```bash -sudo dmesg | grep iommu -``` -It can be observed that under the default configuration, iommu.strict is enabled, and iommu.passthrough is disabled. -```bash -[ 11.558455] iommu: Default domain type: Translated -[ 11.563355] iommu: DMA domain TLB invalidation policy: strict mode -``` - -2. To set IOMMU status, use a text editor to modify the `grub` file by adding or updating the `GRUB_CMDLINE_LINUX` configuration. +1. To set IOMMU status, use a text editor to modify the `grub` file by adding or updating the `GRUB_CMDLINE_LINUX` configuration. ```bash sudo vi /etc/default/grub @@ -34,29 +24,32 @@ then add or update GRUB_CMDLINE_LINUX="iommu.strict=0 iommu.passthrough=1" ``` -3. Update GRUB and reboot to apply the settings. +2. Update GRUB and reboot to apply the settings. ```bash -sudo update-grub -sudo reboot +sudo update-grub && sudo reboot ``` -4. Verify whether the settings have been successfully applied. +3. Verify whether the settings have been successfully applied. ```bash sudo dmesg | grep iommu ``` It can be observed that the IOMMU is already in passthrough mode. 
```bash -[ 0.000000] Kernel command line: BOOT_IMAGE=/boot/vmlinuz-6.8.0-71-generic root=UUID=a9adbbfa-892b-473d-906f-8bc0250bf544 ro iommu.strict=0 iommu.passthrough=1 -[ 11.565539] iommu: Default domain type: Passthrough (set via kernel command line) +[ 0.000000] Kernel command line: BOOT_IMAGE=/vmlinuz-6.14.0-1011-aws root=PARTUUID=1c3f3c20-db6b-497c-8727-f6702f73a5b2 ro iommu.strict=0 iommu.passthrough=1 console=tty1 console=ttyS0 nvme_core.io_timeout=4294967295 panic=-1 +[ 0.855658] iommu: Default domain type: Passthrough (set via kernel command line) ``` -### The result after tuning local NUMA +### The result after tuning IOMMU -1. Use the following command on the Grace bare-metal where `Tomcat` is on +1. Use the following command on the Arm Neoverse bare-metal where `Tomcat` is on ```bash -~/apache-tomcat-11.0.9/bin/shutdown.sh 2>/dev/null -ulimit -n 65535 && ~/apache-tomcat-11.0.9/bin/startup.sh +for no in {96..103}; do sudo bash -c "echo 1 > /sys/devices/system/cpu/cpu${no}/online"; done +for no in {0..95} {104..191}; do sudo bash -c "echo 0 > /sys/devices/system/cpu/cpu${no}/online"; done +net=$(ls /sys/class/net/ | grep 'en') +sudo ethtool -L ${net} combined 8 +~/apache-tomcat-11.0.10/bin/shutdown.sh 2>/dev/null +ulimit -n 65535 && ~/apache-tomcat-11.0.10/bin/startup.sh ``` 2. 
And use the following command on the `x86_64` bare-metal where `wrk2` is on @@ -67,10 +60,9 @@ ulimit -n 65535 && wrk -c1280 -t128 -R500000 -d60 http://${tomcat_ip}:8080/examp The result after iommu tuned: ```bash Thread Stats Avg Stdev Max +/- Stdev - Latency 10.52s 4.83s 22.43s 61.31% - Req/Sec 2.75k 67.27 2.97k 70.85% - 20917980 requests in 1.00m, 10.85GB read - Socket errors: connect 0, read 0, write 0, timeout 16 -Requests/sec: 349085.30 -Transfer/sec: 185.43MB + Latency 4.92s 2.49s 10.08s 62.27% + Req/Sec 3.36k 56.23 3.58k 69.64% + 25703668 requests in 1.00m, 13.33GB read +Requests/sec: 428628.50 +Transfer/sec: 227.69MB ``` diff --git a/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/6_summary.md b/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/6_summary.md index 5fecaf9bf1..50dce96177 100644 --- a/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/6_summary.md +++ b/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/6_summary.md @@ -11,9 +11,12 @@ It can be observed that each step of the tuning method can bring significant per | Method | Requests/sec | Latency-Avg | |:----------------|:-------------|:------------| -| default | 154479.07 | 24.34s | -| NIC-Rx/Tx-Queue | 192932.92 | 21.64s | -| NUMA-local | 235505.32 | 18.72s | -| IOMMU | 349085.30 | 10.52s | +| default | 357835.75 | 10.26s | +| NIC-Queue | 378782.37 | 8.35s | +| NUMA-Local | 363744.39 | 9.41s | +| IOMMU | 428628.50 | 4.92s | +{{% notice Note %}} +Under normal circumstances, keeping the workload on the NIC-local NUMA node improves the performance of network-intensive workloads on bare-metal servers. However, why this improvement is not achieved on the AWS c8g.metal-48xl bare-metal cloud instance requires further investigation. 
+{{% /notice %}} diff --git a/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/_index.md b/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/_index.md index 843fbcc314..69d46dd11c 100644 --- a/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/_index.md @@ -14,10 +14,10 @@ learning_objectives: - Baseline of optimal performance configuration before tuning - Tune network workloads performance with NIC queue - Tune network workloads performance with local NUMA - - Tune network workloads performance with iommu.strict and iommu.passthrough + - Tune network workloads performance with IOMMU prerequisites: - - Access to an Arm-based bare-metal running Ubuntu-24 (you can use a Grace) (for Tomcat) + - Access to an Arm Neoverse-based bare-metal running Ubuntu-24 (you can use an AWS c8g.metal-48xl) (for Tomcat) - Access to an x86-based bare-metal running Ubuntu-24 (you can use any x86_64 bare-metal) (for wrk2) - Basic familiarity with Java applications - Basic familiarity with computer system, network communication, etc.
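As a quick sanity check on the summary table in 6_summary.md, the relative gain of each tuning step can be derived from the Requests/sec column. This is a hypothetical helper, not part of the patch; the baseline value 357835.75 req/s is the "default" row of the table, and the percentages are computed, not measured:

```shell
# Relative gain of each tuning step over the tuned "default" baseline,
# using the Requests/sec values from the summary table in 6_summary.md.
awk 'BEGIN {
  base = 357835.75                      # "default" row (optimal thread count)
  printf "NIC-Queue:  %+.1f%%\n", (378782.37 / base - 1) * 100
  printf "NUMA-Local: %+.1f%%\n", (363744.39 / base - 1) * 100
  printf "IOMMU:      %+.1f%%\n", (428628.50 / base - 1) * 100
}'
```

This prints roughly +5.9%, +1.7%, and +19.8%, which matches the note above: on this instance the NUMA-local step contributes far less than the NIC-queue and IOMMU steps.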