diff --git a/.github/workflows/pr-check.yml b/.github/workflows/pr-check.yml
index 3a39cd7..ede193b 100644
--- a/.github/workflows/pr-check.yml
+++ b/.github/workflows/pr-check.yml
@@ -22,7 +22,12 @@ jobs:
           - os: ubuntu-latest
             configure_args: -DCMAKE_BUILD_TYPE=Release
             build_args: --parallel
-            artifact_name: tenbox-build-linux
+            artifact_name: tenbox-build-linux-x64
+            artifact_path: build/tenbox-vm-runtime
+          - os: ubuntu-24.04-arm
+            configure_args: -DCMAKE_BUILD_TYPE=Release
+            build_args: --parallel
+            artifact_name: tenbox-build-linux-arm64
             artifact_path: build/tenbox-vm-runtime
     steps:
       - uses: actions/checkout@v4
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 75fd789..8f141a9 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -1,6 +1,8 @@
 # Common sources shared by all platforms
 set(TENBOX_CORE_SOURCES
     ${CMAKE_SOURCE_DIR}/src/core/vmm/vm.cpp
+    ${CMAKE_SOURCE_DIR}/src/core/vmm/vm_io_loop.cpp
+    ${CMAKE_SOURCE_DIR}/src/core/vmm/console_tx_batcher.cpp
     ${CMAKE_SOURCE_DIR}/src/core/vmm/address_space.cpp
     ${CMAKE_SOURCE_DIR}/src/core/device/virtio/virtqueue.cpp
     ${CMAKE_SOURCE_DIR}/src/core/device/virtio/virtio_mmio.cpp
diff --git a/src/core/arch/aarch64/aarch64_machine.cpp b/src/core/arch/aarch64/aarch64_machine.cpp
index 265297f..a6f18d7 100644
--- a/src/core/arch/aarch64/aarch64_machine.cpp
+++ b/src/core/arch/aarch64/aarch64_machine.cpp
@@ -10,6 +10,9 @@
 #ifdef __APPLE__
 #include "platform/macos/hypervisor/aarch64/hvf_vcpu.h"
 #include "platform/macos/hypervisor/aarch64/hvf_vm.h"
+#elif defined(__linux__) && defined(__aarch64__)
+#include "platform/linux/hypervisor/aarch64/kvm_vcpu.h"
+#include "platform/linux/hypervisor/aarch64/kvm_vm.h"
 #endif
 
 bool Aarch64Machine::SetupPlatformDevices(
@@ -17,6 +20,7 @@ bool Aarch64Machine::SetupPlatformDevices(
     GuestMemMap& /*mem*/,
     HypervisorVm* hv_vm,
     std::shared_ptr<ConsolePort> console_port,
+    VmIoLoop* io_loop,
     std::function<void()> shutdown_cb,
     std::function<void()> reboot_cb) {
 
@@ -30,9 +34,15 @@ bool Aarch64Machine::SetupPlatformDevices(
     uart_.SetIrqLevelCallback([this](bool asserted) {
         SetIrqLevel(hv_vm_, kUartIrq, asserted);
     });
-    uart_.SetTxCallback([console_port](uint8_t byte) {
-        if (!console_port) return;
-        console_port->Write(&byte, 1);
+    // Thread the per-byte UART stream through a batcher so the downstream
+    // ConsolePort sees larger chunks instead of N * 1-byte writes.
+    tx_batcher_ = std::make_unique<ConsoleTxBatcher>(
+        [console_port](const uint8_t* data, size_t size) {
+            if (console_port) console_port->Write(data, size);
+        });
+    tx_batcher_->AttachIoLoop(io_loop);
+    uart_.SetTxCallback([this](uint8_t byte) {
+        tx_batcher_->Append(&byte, 1);
     });
     addr_space.AddMmioDevice(kUartBase, Pl011::kMmioSize, &uart_);
 
@@ -206,20 +216,21 @@ bool Aarch64Machine::LoadKernel(
         fdt.AddPropertyString("device_type", "cpu");
         fdt.AddPropertyString("compatible", "arm,arm-v8");
         fdt.AddPropertyU32("reg", i);
-        if (config.cpu_count > 1) {
-            fdt.AddPropertyString("enable-method", "psci");
-        }
+        // PSCI is always available (in-kernel PSCI on KVM, userspace
+        // emulation on HVF) so every CPU — including a single-core config —
+        // uses "psci" as its enable-method. This also lets the guest use
+        // PSCI SYSTEM_OFF / SYSTEM_RESET for shutdown/reboot.
+        fdt.AddPropertyString("enable-method", "psci");
         fdt.EndNode();
     }
     fdt.EndNode();
 
-    // PSCI node (for multi-core)
-    if (config.cpu_count > 1) {
-        fdt.BeginNode("psci");
-        fdt.AddPropertyString("compatible", "arm,psci-1.0");
-        fdt.AddPropertyString("method", "hvc");
-        fdt.EndNode();
-    }
+    // PSCI node (always present so the guest can issue SYSTEM_OFF /
+    // SYSTEM_RESET, even in single-CPU configurations).
+    fdt.BeginNode("psci");
+    fdt.AddPropertyString("compatible", "arm,psci-1.0");
+    fdt.AddPropertyString("method", "hvc");
+    fdt.EndNode();
 
     // /timer (ARM generic timer)
     fdt.BeginNode("timer");
@@ -235,10 +246,13 @@ bool Aarch64Machine::LoadKernel(
     fdt.AddPropertyEmpty("always-on");
     fdt.EndNode();
 
-    // /intc (GICv3)
-    // Use actual redistributor addresses from the hypervisor
+    // /intc — GICv3 by default, with a GICv2 fallback for hosts where the
+    // in-kernel VGICv3 is unavailable (e.g. Raspberry Pi 5 with GIC-400).
     GPA actual_redist_base = kGicRedistBase;
     uint32_t redist_total_size = static_cast<uint32_t>(config.cpu_count * 0x20000);
+    bool use_gic_v2 = false;
+    GPA gic_v2_cpu_base = 0x08010000ULL;
+    uint32_t gic_v2_cpu_size = 0x10000;
 #ifdef __APPLE__
     if (hv_vm_) {
         auto* hvf = dynamic_cast<hvf::HvfVm*>(hv_vm_);
@@ -247,24 +261,48 @@ bool Aarch64Machine::LoadKernel(
             redist_total_size = static_cast<uint32_t>(hvf->GetRedistSizePerCpu()) * config.cpu_count;
         }
     }
+#elif defined(__linux__) && defined(__aarch64__)
+    if (hv_vm_) {
+        auto* kvm_vm = dynamic_cast<kvm::KvmVm*>(hv_vm_);
+        if (kvm_vm && kvm_vm->UsesGicV2()) {
+            use_gic_v2 = true;
+            gic_v2_cpu_base = kvm::KvmVm::kGicV2CpuBase;
+            gic_v2_cpu_size = static_cast<uint32_t>(kvm::KvmVm::kGicV2CpuSize);
+        }
+    }
 #endif
 
     char gic_name[64];
     snprintf(gic_name, sizeof(gic_name), "intc@%" PRIx64,
              (uint64_t)kGicDistBase);
     fdt.BeginNode(gic_name);
-    fdt.AddPropertyString("compatible", "arm,gic-v3");
+    if (use_gic_v2) {
+        fdt.AddPropertyString("compatible", "arm,cortex-a15-gic");
+    } else {
+        fdt.AddPropertyString("compatible", "arm,gic-v3");
+    }
     fdt.AddPropertyU32("#interrupt-cells", 3);
     fdt.AddPropertyEmpty("interrupt-controller");
     fdt.AddPropertyU32("phandle", gic_phandle);
-    fdt.AddPropertyCells("reg", {
-        static_cast<uint32_t>(kGicDistBase >> 32),
-        static_cast<uint32_t>(kGicDistBase & 0xFFFFFFFF),
-        0, 0x10000,    // Distributor: 64 KiB
-        static_cast<uint32_t>(actual_redist_base >> 32),
-        static_cast<uint32_t>(actual_redist_base & 0xFFFFFFFF),
-        0, redist_total_size,
-    });
+    if (use_gic_v2) {
+        fdt.AddPropertyCells("reg", {
+            static_cast<uint32_t>(kGicDistBase >> 32),
+            static_cast<uint32_t>(kGicDistBase & 0xFFFFFFFF),
+            0, 0x10000,    // Distributor: 64 KiB (v2 only uses first 4 KiB)
+            static_cast<uint32_t>(gic_v2_cpu_base >> 32),
+            static_cast<uint32_t>(gic_v2_cpu_base & 0xFFFFFFFF),
+            0, gic_v2_cpu_size,  // CPU interface (GICC)
+        });
+    } else {
+        fdt.AddPropertyCells("reg", {
+            static_cast<uint32_t>(kGicDistBase >> 32),
+            static_cast<uint32_t>(kGicDistBase & 0xFFFFFFFF),
+            0, 0x10000,    // Distributor: 64 KiB
+            static_cast<uint32_t>(actual_redist_base >> 32),
+            static_cast<uint32_t>(actual_redist_base & 0xFFFFFFFF),
+            0, redist_total_size,
+        });
+    }
     fdt.EndNode();
 
     // Fixed clock for AMBA peripherals (PL011 requires clocks property)
@@ -380,9 +418,16 @@ bool Aarch64Machine::SetupBootVCpu(HypervisorVCpu* vcpu, uint8_t* /*ram*/) {
         return false;
     }
     return hvf_vcpu->SetupAarch64Boot(kernel_entry_, fdt_gpa_);
+#elif defined(__linux__) && defined(__aarch64__)
+    auto* kvm_vcpu = dynamic_cast<kvm::KvmVCpu*>(vcpu);
+    if (!kvm_vcpu) {
+        LOG_ERROR("aarch64: SetupBootVCpu requires KvmVCpu on Linux");
+        return false;
+    }
+    return kvm_vcpu->SetupAarch64Boot(kernel_entry_, fdt_gpa_);
 #else
     (void)vcpu;
-    LOG_ERROR("aarch64: SetupBootVCpu called on non-Apple platform");
+    LOG_ERROR("aarch64: SetupBootVCpu called on unsupported platform");
     return false;
 #endif
 }
diff --git a/src/core/arch/aarch64/aarch64_machine.h b/src/core/arch/aarch64/aarch64_machine.h
index a410752..ef85e73 100644
--- a/src/core/arch/aarch64/aarch64_machine.h
+++ b/src/core/arch/aarch64/aarch64_machine.h
@@ -1,10 +1,13 @@
 #pragma once
 
 #include "core/vmm/machine_model.h"
+#include "core/vmm/console_tx_batcher.h"
 #include "core/arch/aarch64/pl011.h"
 #include "core/arch/aarch64/boot.h"
 #include "core/device/rtc/pl031_rtc.h"
 
+#include <memory>
+
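-// ARM64 virt machine model (Apple Hypervisor.framework).
-// Uses GICv3, PL011 UART, FDT boot, and VirtIO MMIO.
+// ARM64 virt machine model (Apple Hypervisor.framework or Linux KVM).
+// Uses GICv3 (GICv2 fallback on KVM), PL011 UART, FDT boot, and VirtIO MMIO.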
 class Aarch64Machine final : public MachineModel {
@@ -16,6 +19,7 @@ class Aarch64Machine final : public MachineModel {
         GuestMemMap& mem,
         HypervisorVm* hv_vm,
         std::shared_ptr<ConsolePort> console_port,
+        VmIoLoop* io_loop,
         std::function<void()> shutdown_cb,
         std::function<void()> reboot_cb) override;
 
@@ -42,6 +46,10 @@ class Aarch64Machine final : public MachineModel {
 private:
     Pl011 uart_;
     Pl031Rtc rtc_;
+    // Coalesces per-byte UART tx writes into larger chunks before they
+    // reach the ConsolePort. unique_ptr so the object is created only
+    // once SetupPlatformDevices captures the downstream writer.
+    std::unique_ptr<ConsoleTxBatcher> tx_batcher_;
     GPA kernel_entry_ = 0;
     GPA fdt_gpa_ = 0;
 
diff --git a/src/core/arch/x86_64/x86_machine.cpp b/src/core/arch/x86_64/x86_machine.cpp
index 1849d3a..0e3dc04 100644
--- a/src/core/arch/x86_64/x86_machine.cpp
+++ b/src/core/arch/x86_64/x86_machine.cpp
@@ -21,6 +21,7 @@ bool X86Machine::SetupPlatformDevices(
     GuestMemMap& /*mem*/,
     HypervisorVm* hv_vm,
     std::shared_ptr<ConsolePort> console_port,
+    VmIoLoop* io_loop,
     std::function<void()> shutdown_cb,
     std::function<void()> reboot_cb) {
 
@@ -29,9 +30,15 @@ bool X86Machine::SetupPlatformDevices(
     };
 
     uart_.SetIrqCallback([this]() { irq_injector_(4); });
-    uart_.SetTxCallback([console_port](uint8_t byte) {
-        if (!console_port) return;
-        console_port->Write(&byte, 1);
+    // Thread the per-byte UART stream through a batcher so the downstream
+    // ConsolePort sees larger chunks instead of N * 1-byte writes.
+    tx_batcher_ = std::make_unique<ConsoleTxBatcher>(
+        [console_port](const uint8_t* data, size_t size) {
+            if (console_port) console_port->Write(data, size);
+        });
+    tx_batcher_->AttachIoLoop(io_loop);
+    uart_.SetTxCallback([this](uint8_t byte) {
+        tx_batcher_->Append(&byte, 1);
     });
     addr_space.AddPioDevice(
         Uart16550::kCom1Base, Uart16550::kRegCount, &uart_);
diff --git a/src/core/arch/x86_64/x86_machine.h b/src/core/arch/x86_64/x86_machine.h
index c11d818..dd15ed6 100644
--- a/src/core/arch/x86_64/x86_machine.h
+++ b/src/core/arch/x86_64/x86_machine.h
@@ -1,7 +1,10 @@
 #pragma once
 
 #include "core/vmm/machine_model.h"
+#include "core/vmm/console_tx_batcher.h"
 #include "core/device/serial/uart_16550.h"
+
+#include <memory>
 #include "core/device/timer/i8254_pit.h"
 #include "core/device/rtc/cmos_rtc.h"
 #include "core/device/irq/ioapic.h"
@@ -22,6 +25,7 @@ class X86Machine final : public MachineModel {
         GuestMemMap& mem,
         HypervisorVm* hv_vm,
         std::shared_ptr<ConsolePort> console_port,
+        VmIoLoop* io_loop,
         std::function<void()> shutdown_cb,
         std::function<void()> reboot_cb) override;
 
@@ -57,6 +61,8 @@ class X86Machine final : public MachineModel {
 
 private:
     Uart16550 uart_;
+    // Coalesces per-byte UART tx writes before they reach the ConsolePort.
+    std::unique_ptr<ConsoleTxBatcher> tx_batcher_;
     I8254Pit pit_;
     SystemControlB sys_ctrl_b_;
     CmosRtc rtc_;
diff --git a/src/core/device/virtio/virtio_mmio.cpp b/src/core/device/virtio/virtio_mmio.cpp
index 07a027d..8c522e0 100644
--- a/src/core/device/virtio/virtio_mmio.cpp
+++ b/src/core/device/virtio/virtio_mmio.cpp
@@ -1,8 +1,28 @@
 #include "core/device/virtio/virtio_mmio.h"
 
+#if defined(__linux__) || defined(__APPLE__)
+#include <unistd.h>  // write() for eventfd in IRQFD mode
+#endif
+
 static constexpr uint64_t VIRTIO_RING_F_INDIRECT_DESC = (1ULL << 28);
 static constexpr uint64_t VIRTIO_F_EVENT_IDX = (1ULL << 29);
 
+namespace {
+
+inline void SignalIrqEventFd(int fd) {  // writes a 64-bit 1 to `fd`
+#if defined(__linux__) || defined(__APPLE__)
+    const uint64_t one = 1;
+    // Best effort: EAGAIN (saturated counter) and EBADF cannot be fixed here.
+    // Keep the result in a local: GCC ignores (void) on warn_unused_result.
+    const auto written = ::write(fd, &one, sizeof(one));
+    (void)written;
+#else
+    (void)fd;
+#endif
+}
+
+}  // namespace
+
 void VirtioMmioDevice::Init(VirtioDeviceOps* ops, const GuestMemMap& mem) {
     ops_ = ops;
     mem_ = mem;
@@ -149,7 +169,10 @@ void VirtioMmioDevice::MmioWrite(uint64_t offset, uint8_t size,
         break;
     case kInterruptACK: {
         uint32_t prev = interrupt_status_.fetch_and(~val, std::memory_order_acq_rel);
-        if ((prev & ~val) == 0 && irq_level_callback_) {
+        // In IRQFD mode, deassert is handled by the in-kernel irqchip via
+        // the EOI + resample path — do not fire the level callback here
+        // (that would race with the kernel and double-toggle the line).
+        if (irq_eventfd_ < 0 && (prev & ~val) == 0 && irq_level_callback_) {
             irq_level_callback_(false);
         }
         break;
@@ -221,6 +244,10 @@ void VirtioMmioDevice::NotifyUsedBuffer(int queue_idx) {
     }
 
     interrupt_status_.fetch_or(1, std::memory_order_release);  // VIRTIO_MMIO_INT_VRING
+    if (irq_eventfd_ >= 0) {
+        SignalIrqEventFd(irq_eventfd_);
+        return;
+    }
     if (irq_level_callback_)
         irq_level_callback_(true);
     else if (irq_callback_)
@@ -230,6 +257,10 @@ void VirtioMmioDevice::NotifyUsedBuffer(int queue_idx) {
 void VirtioMmioDevice::NotifyConfigChange() {
     config_generation_++;
     interrupt_status_.fetch_or(2, std::memory_order_release);  // VIRTIO_MMIO_INT_CONFIG
+    if (irq_eventfd_ >= 0) {
+        SignalIrqEventFd(irq_eventfd_);
+        return;
+    }
     if (irq_level_callback_)
         irq_level_callback_(true);
     else if (irq_callback_)
diff --git a/src/core/device/virtio/virtio_mmio.h b/src/core/device/virtio/virtio_mmio.h
index 77ce5ab..4ef39a1 100644
--- a/src/core/device/virtio/virtio_mmio.h
+++ b/src/core/device/virtio/virtio_mmio.h
@@ -36,6 +36,22 @@ class VirtioMmioDevice : public Device {
     void SetIrqCallback(IrqCallback cb) { irq_callback_ = std::move(cb); }
     void SetIrqLevelCallback(IrqLevelCallback cb) { irq_level_callback_ = std::move(cb); }
 
+    // Switch the device to IRQFD mode: instead of invoking the callbacks on
+    // every notify, write a single 64-bit value to irq_eventfd, letting the
+    // hypervisor's in-kernel irqchip assert the line directly. In this mode
+    // the explicit deassert on InterruptACK is skipped as well — deassertion
+    // is handled by the irqchip EOI + resample path.
+    //
+    // Ownership of the fd stays with the caller; it must outlive this device.
+    void SetIrqEventFd(int fd) { irq_eventfd_ = fd; }
+
+    // Snapshot of the internal interrupt_status register. Used by the irqfd
+    // resample poller to decide whether the device still has a pending
+    // condition and needs to be re-asserted.
+    uint32_t GetInterruptStatus() const {
+        return interrupt_status_.load(std::memory_order_acquire);
+    }
+
     void MmioRead(uint64_t offset, uint8_t size, uint64_t* value) override;
     void MmioWrite(uint64_t offset, uint8_t size, uint64_t value) override;
 
@@ -91,6 +107,7 @@ class VirtioMmioDevice : public Device {
     GuestMemMap mem_;
     IrqCallback irq_callback_;
     IrqLevelCallback irq_level_callback_;
+    int irq_eventfd_ = -1;  // IRQFD mode: write to assert; -1 disables.
 
     // Transport state
     uint32_t status_ = 0;
diff --git a/src/core/device/virtio/virtio_snd.cpp b/src/core/device/virtio/virtio_snd.cpp
index 2fe3bbe..a2a0ba7 100644
--- a/src/core/device/virtio/virtio_snd.cpp
+++ b/src/core/device/virtio/virtio_snd.cpp
@@ -1,5 +1,6 @@
 #include "core/device/virtio/virtio_snd.h"
 #include "core/vmm/types.h"
+#include "core/vmm/vm_io_loop.h"
 #include <algorithm>
 #include <chrono>
 #include <cstring>
@@ -366,8 +367,16 @@ void VirtioSndDevice::HandleChmapInfo(const VirtioSndQueryInfo* query,
 
 void VirtioSndDevice::StartPeriodTimer() {
     StopPeriodTimer();
-    period_running_ = true;
-    period_thread_ = std::thread(&VirtioSndDevice::PeriodTimerThread, this);
+    if (!io_loop_) return;  // no loop => no pacing (dev effectively silent)
+    period_start_time_ = std::chrono::steady_clock::now();
+    period_bytes_processed_ = 0;
+    period_running_.store(true);
+    period_timer_id_ = io_loop_->AddTimer(0, [this]() -> uint64_t {
+        if (!period_running_.load()) return 0;  // self-destruct
+        uint64_t next_ms = PeriodTick();
+        if (!period_running_.load()) return 0;
+        return next_ms ? next_ms : 1;  // never return 0 while running
+    });
 }
 
 void VirtioSndDevice::FlushPendingTxBuffers() {
@@ -392,106 +401,85 @@ void VirtioSndDevice::FlushPendingTxBuffers() {
 }
 
 void VirtioSndDevice::StopPeriodTimer() {
-    if (period_running_) {
-        period_running_ = false;
-        period_cv_.notify_all();
-        if (period_thread_.joinable()) {
-            period_thread_.join();
-        }
+    if (!period_running_.exchange(false)) return;
+    if (io_loop_ && period_timer_id_) {
+        io_loop_->RemoveTimer(period_timer_id_);
+        period_timer_id_ = 0;
     }
 }
 
-void VirtioSndDevice::PeriodTimerThread() {
-    auto start_time = std::chrono::steady_clock::now();
-    uint64_t bytes_processed = 0;  // Track audio position in bytes
+uint64_t VirtioSndDevice::PeriodTick() {
+    // Get current stream parameters
+    uint32_t sample_rate, period_bytes;
+    uint8_t channels;
+    {
+        std::lock_guard<std::mutex> lock(period_mutex_);
+        sample_rate = pcm_sample_rate_;
+        period_bytes = pcm_period_bytes_;
+        channels = pcm_channels_;
+    }
 
-    while (period_running_) {
-        // Get current stream parameters
-        uint32_t sample_rate, period_bytes;
-        uint8_t channels;
-        {
-            std::lock_guard<std::mutex> lock(period_mutex_);
-            sample_rate = pcm_sample_rate_;
-            period_bytes = pcm_period_bytes_;
-            channels = pcm_channels_;
-        }
+    if (sample_rate == 0 || period_bytes == 0 || channels == 0) {
+        return 10;  // stream not yet set up; retry later
+    }
 
-        if (sample_rate == 0 || period_bytes == 0 || channels == 0) {
-            std::unique_lock<std::mutex> lock(period_mutex_);
-            period_cv_.wait_for(lock, std::chrono::milliseconds(10),
-                                [this]() { return !period_running_.load(); });
-            continue;
-        }
+    uint32_t bytes_per_second = sample_rate * channels * 2;  // S16
 
-        uint32_t bytes_per_second = sample_rate * channels * 2; // S16
-
-        // Calculate timing: how far ahead/behind are we?
-        auto now = std::chrono::steady_clock::now();
-        auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(now - start_time).count();
-        int64_t audio_ms = static_cast<int64_t>(bytes_processed) * 1000 / bytes_per_second;
-        int64_t drift_ms = audio_ms - elapsed_ms;  // positive = ahead, negative = behind
-
-        // If we're behind, process buffers immediately
-        // If we're ahead, sleep until we need to process
-        if (drift_ms > 0) {
-            // We're ahead of real-time, sleep a bit
-            int64_t sleep_ms = (std::min)(drift_ms, (int64_t)10);
-            std::unique_lock<std::mutex> lock(period_mutex_);
-            period_cv_.wait_for(lock, std::chrono::milliseconds(sleep_ms),
-                                [this]() { return !period_running_.load(); });
-            continue;
-        }
+    auto now = std::chrono::steady_clock::now();
+    auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
+                          now - period_start_time_).count();
+    int64_t audio_ms = static_cast<int64_t>(period_bytes_processed_) * 1000 /
+                       bytes_per_second;
+    int64_t drift_ms = audio_ms - elapsed_ms;  // +ahead / -behind
 
-        // If we're way behind (> 200ms), reset timing
-        if (drift_ms < -200) {
-            start_time = std::chrono::steady_clock::now();
-            bytes_processed = 0;
-            continue;
-        }
+    if (drift_ms > 0) {
+        // Ahead of real time — wait until we need more samples.
+        return static_cast<uint64_t>((std::min)(drift_ms, (int64_t)10));
+    }
 
-        // Process one buffer
-        PendingTxBuffer buf{};
-        bool have_buf = false;
-        {
-            std::lock_guard<std::mutex> lock(tx_mutex_);
-            if (!pending_tx_buffers_.empty()) {
-                buf = std::move(pending_tx_buffers_.front());
-                pending_tx_buffers_.pop_front();
-                have_buf = true;
-            }
-        }
+    if (drift_ms < -200) {
+        // Way behind (suspend/resume?); resync the clock instead of
+        // burning through every queued buffer.
+        period_start_time_ = std::chrono::steady_clock::now();
+        period_bytes_processed_ = 0;
+        return 1;
+    }
 
-        if (!have_buf) {
-            // No buffers available, wait briefly
-            std::unique_lock<std::mutex> lock(period_mutex_);
-            period_cv_.wait_for(lock, std::chrono::milliseconds(1),
-                                [this]() { return !period_running_.load(); });
-            continue;
+    PendingTxBuffer buf{};
+    bool have_buf = false;
+    {
+        std::lock_guard<std::mutex> lock(tx_mutex_);
+        if (!pending_tx_buffers_.empty()) {
+            buf = std::move(pending_tx_buffers_.front());
+            pending_tx_buffers_.pop_front();
+            have_buf = true;
         }
+    }
 
-        // Send PCM data to manager
-        size_t pcm_bytes = 0;
-        if (!buf.pcm_data.empty() && audio_port_) {
-            AudioChunk chunk;
-            chunk.sample_rate = sample_rate;
-            chunk.channels = channels;
-            pcm_bytes = buf.pcm_data.size() * sizeof(int16_t);
-            chunk.pcm = std::move(buf.pcm_data);
-            audio_port_->SubmitPcm(std::move(chunk));
-        }
+    if (!have_buf) {
+        return 1;  // spin gently until the guest queues more data
+    }
 
-        // Track audio position
-        bytes_processed += (pcm_bytes > 0) ? pcm_bytes : period_bytes;
+    size_t pcm_bytes = 0;
+    if (!buf.pcm_data.empty() && audio_port_) {
+        AudioChunk chunk;
+        chunk.sample_rate = sample_rate;
+        chunk.channels = channels;
+        pcm_bytes = buf.pcm_data.size() * sizeof(int16_t);
+        chunk.pcm = std::move(buf.pcm_data);
+        audio_port_->SubmitPcm(std::move(chunk));
+    }
 
-        // Return buffer to guest
-        if (mmio_) {
-            auto* txq = mmio_->GetQueue(VIRTIO_SND_VQ_TX);
-            if (txq) {
-                txq->PushUsed(buf.head, buf.status_len);
-                mmio_->NotifyUsedBuffer(VIRTIO_SND_VQ_TX);
-            }
+    period_bytes_processed_ += (pcm_bytes > 0) ? pcm_bytes : period_bytes;
+
+    if (mmio_) {
+        auto* txq = mmio_->GetQueue(VIRTIO_SND_VQ_TX);
+        if (txq) {
+            txq->PushUsed(buf.head, buf.status_len);
+            mmio_->NotifyUsedBuffer(VIRTIO_SND_VQ_TX);
         }
     }
+    return 1;  // immediately try the next buffer; drift calc paces us
 }
 
 uint32_t VirtioSndDevice::RateEnumToHz(uint8_t rate_enum) {
diff --git a/src/core/device/virtio/virtio_snd.h b/src/core/device/virtio/virtio_snd.h
index dd63be6..936d57c 100644
--- a/src/core/device/virtio/virtio_snd.h
+++ b/src/core/device/virtio/virtio_snd.h
@@ -3,14 +3,15 @@
 #include "common/ports.h"
 #include "core/device/virtio/virtio_mmio.h"
 #include <atomic>
-#include <condition_variable>
+#include <chrono>
 #include <cstdint>
 #include <deque>
 #include <functional>
 #include <mutex>
-#include <thread>
 #include <vector>
 
+class VmIoLoop;
+
 // virtio-snd device ID (spec 5.14)
 constexpr uint32_t VIRTIO_SND_DEVICE_ID = 25;
 
@@ -149,6 +150,10 @@ class VirtioSndDevice : public VirtioDeviceOps {
     void SetMmioDevice(VirtioMmioDevice* mmio) { mmio_ = mmio; }
     void SetMemMap(const GuestMemMap& mem) { mem_ = mem; }
     void SetAudioPort(std::shared_ptr<AudioPort> port) { audio_port_ = std::move(port); }
+    // The io_loop hosts our period timer. Must be set before the guest
+    // starts a stream; a nullptr falls back to "no audio pacing" (playback
+    // effectively stalls, matching a stream-less config).
+    void SetIoLoop(VmIoLoop* loop) { io_loop_ = loop; }
 
     uint32_t GetDeviceId() const override { return VIRTIO_SND_DEVICE_ID; }
     uint64_t GetDeviceFeatures() const override;
@@ -173,7 +178,9 @@ class VirtioSndDevice : public VirtioDeviceOps {
     void HandleChmapInfo(const VirtioSndQueryInfo* query,
                          uint8_t* resp, uint32_t* resp_len);
 
-    void PeriodTimerThread();
+    // One tick of the period-driven playback loop. Runs on io_loop_'s
+    // thread; returns the delay (ms) until the next tick.
+    uint64_t PeriodTick();
     void StartPeriodTimer();
     void StopPeriodTimer();
     void FlushPendingTxBuffers();
@@ -197,11 +204,15 @@ class VirtioSndDevice : public VirtioDeviceOps {
     uint32_t pcm_buffer_bytes_ = 0;
     uint32_t pcm_period_bytes_ = 0;
 
-    // Period timer: releases TX buffers at real audio rate to throttle guest
-    std::thread period_thread_;
+    // Period timer: releases TX buffers at real audio rate to throttle guest.
+    // The timer itself is owned by io_loop_; we only keep the id and state
+    // used by PeriodTick.
+    VmIoLoop* io_loop_ = nullptr;
     std::mutex period_mutex_;
-    std::condition_variable period_cv_;
     std::atomic<bool> period_running_{false};
+    uint64_t period_timer_id_ = 0;
+    std::chrono::steady_clock::time_point period_start_time_{};
+    uint64_t period_bytes_processed_ = 0;
 
     // Pending TX buffers waiting to be returned to guest
     struct PendingTxBuffer {
diff --git a/src/core/vmm/console_tx_batcher.cpp b/src/core/vmm/console_tx_batcher.cpp
new file mode 100644
index 0000000..6b4e403
--- /dev/null
+++ b/src/core/vmm/console_tx_batcher.cpp
@@ -0,0 +1,75 @@
+#include "core/vmm/console_tx_batcher.h"
+
+#include "core/vmm/vm_io_loop.h"
+
+#include <utility>
+
+ConsoleTxBatcher::ConsoleTxBatcher(RawWriter writer) : writer_(std::move(writer)) {}
+
+ConsoleTxBatcher::~ConsoleTxBatcher() {
+    // Precondition (not checked here): the owner (Vm::~Vm) has already
+    // stopped the io loop, so no timer callback capturing `this` can run.
+    // Drain bytes appended since the last flush so tail-end console output
+    // is not lost; this invokes the writer from inside the destructor.
+    Flush();
+}
+
+void ConsoleTxBatcher::AttachIoLoop(VmIoLoop* loop) {
+    const std::lock_guard<std::mutex> guard(mu_);  // Append() reads under mu_
+    io_loop_ = loop;
+}
+
+void ConsoleTxBatcher::Append(const uint8_t* data, size_t size) {
+    if (!data || size == 0) return;
+    std::unique_lock<std::mutex> lock(mu_);
+
+    // Without a running loop no delayed flush can be scheduled: drain older
+    // bytes (keeps order) and write through. A stopped loop has closed any
+    // armed timer, so clear the stale state or a restart would never re-arm.
+    if (!io_loop_ || !io_loop_->running()) {
+        timer_armed_ = false;
+        timer_id_ = 0;
+        if (!buf_.empty()) FlushLocked(lock);
+        RawWriter w = writer_;
+        lock.unlock();
+        if (w) w(data, size);
+        return;
+    }
+
+    buf_.append(reinterpret_cast<const char*>(data), size);
+    if (buf_.size() >= kFlushThreshold) {
+        FlushLocked(lock);
+        return;
+    }
+    if (!timer_armed_) {
+        // Assumes AddTimer returns 0 on failure -- TODO confirm. Staying
+        // unarmed in that case lets the next Append() retry the arm.
+        timer_id_ = io_loop_->AddTimer(
+            kFlushDelayMs, [this]() -> uint64_t { return OnTimerFire(); });
+        timer_armed_ = (timer_id_ != 0);
+    }
+}
+
+void ConsoleTxBatcher::Flush() {
+    std::unique_lock<std::mutex> guard(mu_);
+    if (buf_.size() > 0) FlushLocked(guard);  // skip the writer when idle
+}
+
+void ConsoleTxBatcher::FlushLocked(std::unique_lock<std::mutex>& lock) {
+    // Take the buffer and copy the sink while locked; write while unlocked.
+    std::string drained;
+    drained.swap(buf_);
+    RawWriter sink = writer_;
+    lock.unlock();
+    const auto* bytes = reinterpret_cast<const uint8_t*>(drained.data());
+    if (sink && !drained.empty()) sink(bytes, drained.size());
+    lock.lock();
+}
+
+uint64_t ConsoleTxBatcher::OnTimerFire() {
+    std::unique_lock<std::mutex> guard(mu_);
+    timer_id_ = 0;
+    timer_armed_ = false;  // one-shot: the next Append() arms a new timer
+    if (!buf_.empty()) FlushLocked(guard);
+    return 0;  // a zero delay ends this timer instead of rescheduling it
+}
diff --git a/src/core/vmm/console_tx_batcher.h b/src/core/vmm/console_tx_batcher.h
new file mode 100644
index 0000000..c523ef1
--- /dev/null
+++ b/src/core/vmm/console_tx_batcher.h
@@ -0,0 +1,73 @@
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <mutex>
+#include <string>
+
+class VmIoLoop;
+
+// Coalesces the 1-byte-at-a-time UART tx stream into larger chunks before
+// handing them to the downstream sink (ConsolePort -> stdout / IPC pipe).
+//
+// Guest UARTs (pl011, 16550) call into us via a TxCallback that fires once
+// per MMIO/PIO write. Boot log alone can push thousands of bytes this way,
+// each currently turning into one ::write()/WriteFile() syscall on the host.
+// This class buffers writes and flushes them in two scenarios:
+//   - The pending buffer reaches kFlushThreshold bytes: synchronously flush
+//     on the caller (vCPU) thread. Bounds worst-case latency for bursts.
+//   - An io_loop timer fires kFlushDelayMs after the first byte landed.
+//     Handles the slow interactive-echo case where bytes dribble in below
+//     the threshold.
+//
+// Thread-safety: all public methods are safe to call from any thread.
+// The RawWriter is invoked without holding the internal mutex so that slow
+// syscalls don't serialize Append() calls from different vCPUs.
+//
+// Lifetime: the loop must outlive this object. In the Vm ownership chain
+// that's guaranteed because Vm::~Vm explicitly stops io_loop_ (which joins
+// its thread and closes any armed timer) before machine_ -- and therefore
+// this batcher -- is destroyed.
+class ConsoleTxBatcher {
+public:
+    using RawWriter = std::function<void(const uint8_t*, size_t)>;
+
+    explicit ConsoleTxBatcher(RawWriter writer);
+    ~ConsoleTxBatcher();
+
+    ConsoleTxBatcher(const ConsoleTxBatcher&) = delete;
+    ConsoleTxBatcher& operator=(const ConsoleTxBatcher&) = delete;
+
+    // Attach the io loop used for delayed flushes. Safe to call before
+    // the loop is started; buffering begins when the loop becomes running.
+    // Passing nullptr detaches and forces Append() to go synchronous.
+    void AttachIoLoop(VmIoLoop* loop);
+
+    // Append bytes to the tx buffer. If the loop is unavailable or not
+    // running, bytes are written through synchronously to preserve output
+    // across Vm startup/shutdown edges. Null or empty input is ignored.
+    void Append(const uint8_t* data, size_t size);
+
+    // Synchronous flush of whatever is currently buffered.
+    void Flush();
+
+private:
+    // Hands buf_ contents to writer_, leaving buf_ empty. `lock` must be
+    // held on entry; it is dropped around the writer call, then re-taken.
+    void FlushLocked(std::unique_lock<std::mutex>& lock);
+
+    // Runs on io_thread_ when the coalesce timer fires. Returns 0 so the
+    // timer self-destructs; the next Append() will re-arm it.
+    uint64_t OnTimerFire();
+
+    RawWriter writer_;             // downstream sink; invoked without mu_ held
+    VmIoLoop* io_loop_ = nullptr;  // not owned; read and written under mu_
+
+    std::mutex mu_;
+    std::string buf_;           // bytes awaiting flush; guarded by mu_
+    bool timer_armed_ = false;  // true while a flush timer is pending
+    uint64_t timer_id_ = 0;     // id from AddTimer; 0 once the timer has fired
+
+    static constexpr size_t kFlushThreshold = 1024;  // bytes; flush at/above
+    static constexpr uint64_t kFlushDelayMs = 16;    // delay given to AddTimer
+};
diff --git a/src/core/vmm/hypervisor_vm.h b/src/core/vmm/hypervisor_vm.h
index edf9e4d..1912c6e 100644
--- a/src/core/vmm/hypervisor_vm.h
+++ b/src/core/vmm/hypervisor_vm.h
@@ -32,6 +32,27 @@ class HypervisorVm {
     // skipped. Default returns false so HVF / WHVP keep their current path.
     virtual bool AssertIrq(uint32_t /*gsi*/, bool /*level*/) { return false; }
 
+    // Register an eventfd (or platform equivalent) as an IRQFD for a
+    // level-triggered GSI. When trigger_fd is signalled, the hypervisor
+    // asserts the line directly in kernel space, bypassing the userspace
+    // RequestInterrupt / AssertIrq ioctl path.
+    //
+    // gsi is the hypervisor-absolute interrupt number:
+    //   - arm64 KVM: SPI absolute INTID (>= 32)
+    //   - x86_64 KVM: IOAPIC pin (0..23 with the default routing)
+    // The caller is responsible for computing the arch-specific offset.
+    //
+    // resample_fd (may be -1) is signalled by the hypervisor after the
+    // guest EOIs the interrupt so the caller can re-assert if the device
+    // still has a pending condition. Required for level-triggered lines.
+    //
+    // Default returns false; macOS HVF and any other backend without irqfd
+    // support fall back to the RequestInterrupt / SetIrqLevelCallback
+    // path automatically.
+    virtual bool RegisterLevelIrqFd(uint32_t /*gsi*/, int /*trigger_fd*/,
+                                    int /*resample_fd*/) { return false; }
+    virtual bool UnregisterIrqFd(uint32_t /*gsi*/, int /*trigger_fd*/) { return false; }
+
     virtual void SetGuestMemMap(const GuestMemMap*) {}
 
     virtual void QueueInterrupt(uint32_t vector, uint32_t dest_vcpu) {
diff --git a/src/core/vmm/machine_model.h b/src/core/vmm/machine_model.h
index 744d430..96c31f3 100644
--- a/src/core/vmm/machine_model.h
+++ b/src/core/vmm/machine_model.h
@@ -13,6 +13,7 @@
 
 struct VmSharedFolder;
 struct VmConfig;
+class VmIoLoop;
 
 // Describes a VirtIO MMIO device slot with its base address and IRQ number.
 struct VirtioDeviceSlot {
@@ -36,6 +37,7 @@ class MachineModel {
         GuestMemMap& mem,
         HypervisorVm* hv_vm,
         std::shared_ptr<ConsolePort> console_port,
+        VmIoLoop* io_loop,
         std::function<void()> shutdown_cb,
         std::function<void()> reboot_cb) = 0;
 
diff --git a/src/core/vmm/vm.cpp b/src/core/vmm/vm.cpp
index 8783987..0b4f9df 100644
--- a/src/core/vmm/vm.cpp
+++ b/src/core/vmm/vm.cpp
@@ -2,6 +2,13 @@
 #include "core/vmm/vm_platform.h"
 #include <algorithm>
 
+#if defined(__linux__)
+#include <cerrno>
+#include <cstring>
+#include <sys/eventfd.h>
+#include <unistd.h>
+#endif
+
 #if defined(__APPLE__) && defined(__x86_64__)
 #include "core/arch/x86_64/x86_machine.h"
 #include "platform/macos/hypervisor/x86_64/hvf_vcpu.h"
@@ -13,12 +20,16 @@
 #include "core/arch/x86_64/x86_machine.h"
 #elif defined(__linux__) && defined(__x86_64__)
 #include "core/arch/x86_64/x86_machine.h"
+#elif defined(__linux__) && defined(__aarch64__)
+#include "core/arch/aarch64/aarch64_machine.h"
+#include "platform/linux/hypervisor/aarch64/kvm_vcpu.h"
+#include "platform/linux/hypervisor/aarch64/kvm_vm.h"
 #endif
 
 static std::unique_ptr<MachineModel> CreateMachineModel() {
 #if defined(_WIN32) || (defined(__APPLE__) && defined(__x86_64__)) || (defined(__linux__) && defined(__x86_64__))
     return std::make_unique<X86Machine>();
-#elif defined(__APPLE__) && defined(__aarch64__)
+#elif (defined(__APPLE__) && defined(__aarch64__)) || (defined(__linux__) && defined(__aarch64__))
     return std::make_unique<Aarch64Machine>();
 #else
     LOG_ERROR("No machine model available for this platform/architecture");
@@ -51,6 +62,11 @@ Vm::~Vm() {
         if (t.joinable()) t.join();
     }
 
+    // Tear down irqfds (detach uv_poll, unregister with kvm, close fds, stop
+    // io_loop_) before destroying the hypervisor VM.
+    ShutdownIrqFds();
+    io_loop_.Stop();
+
     if (vdagent_handler_) {
         vdagent_handler_->SetClipboardCallback(nullptr);
     }
@@ -108,6 +124,7 @@ std::unique_ptr<Vm> Vm::Create(const VmConfig& config) {
     if (!vm->machine_->SetupPlatformDevices(
             vm->addr_space_, vm->mem_, vm->hv_vm_.get(),
             vm->console_port_,
+            &vm->io_loop_,
             [&vm_ref = *vm]() { vm_ref.RequestStop(); },
             [&vm_ref = *vm]() { vm_ref.RequestReboot(); })) {
         LOG_ERROR("Failed to set up platform devices");
@@ -231,6 +248,14 @@ void Vm::SetupVCpuCallbacks(uint32_t vcpu_index) {
         hvf_vcpu->SetPsciShutdownCallback([this]() { RequestStop(); });
         hvf_vcpu->SetPsciRebootCallback([this]() { RequestReboot(); });
     }
+#elif defined(__linux__) && defined(__aarch64__)
+    // In-kernel PSCI handles CPU_ON; only SYSTEM_OFF / SYSTEM_RESET bubble
+    // up to userspace as KVM_EXIT_SYSTEM_EVENT.
+    auto* kvm_vcpu = dynamic_cast<kvm::KvmVCpu*>(vcpus_[vcpu_index].get());
+    if (kvm_vcpu) {
+        kvm_vcpu->SetShutdownCallback([this]() { RequestStop(); });
+        kvm_vcpu->SetRebootCallback([this]() { RequestReboot(); });
+    }
 #else
     (void)vcpu_index;
 #endif
@@ -326,6 +351,92 @@ void Vm::SetIrqLevel(uint8_t irq, bool asserted) {
     machine_->SetIrqLevel(hv_vm_.get(), irq, asserted);
 }
 
+bool Vm::TryEnableIrqFd(VirtioMmioDevice* dev, uint8_t slot_irq) {
+#if defined(__linux__) && (defined(__aarch64__) || defined(__x86_64__))
+    // Slot IRQ -> hypervisor-absolute GSI: arm64 SPIs start at INTID 32,
+    // while on x86_64 the IOAPIC pin is used unchanged.
+    #if defined(__aarch64__)
+        constexpr uint32_t kGsiBase = 32;
+    #else
+        constexpr uint32_t kGsiBase = 0;
+    #endif
+    IrqFdSlot candidate;
+    candidate.gsi = kGsiBase + static_cast<uint32_t>(slot_irq);
+    candidate.dev = dev;
+    irqfd_slots_.push_back(candidate);
+    return true;
+#else
+    (void)dev;
+    (void)slot_irq;
+    return false;
+#endif
+}
+
+void Vm::InstallIrqFds() {
+#if defined(__linux__)
+    if (irqfd_slots_.empty() || !hv_vm_) return;
+    // Resample events are serviced on io_loop_; without it a level line is
+    // never re-asserted after EOI, so leave every device on the callbacks.
+    if (!io_loop_.running()) return;
+
+    // Allocate trigger + resample eventfds per slot and register them with
+    // the hypervisor. Failed slots are dropped (callback fallback remains).
+    size_t write_idx = 0;
+    for (size_t read_idx = 0; read_idx < irqfd_slots_.size(); ++read_idx) {
+        IrqFdSlot& s = irqfd_slots_[read_idx];
+
+        int trig = ::eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
+        // Skip the second call on failure so errno still names the cause.
+        int resamp = trig >= 0 ? ::eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK) : -1;
+        if (resamp < 0) {
+            LOG_WARN("irqfd: eventfd() failed: %s", strerror(errno));
+            if (trig >= 0) ::close(trig);
+            continue;
+        }
+        if (!hv_vm_->RegisterLevelIrqFd(s.gsi, trig, resamp)) {
+            ::close(trig);
+            ::close(resamp);
+            continue;
+        }
+        s.trigger_fd = trig;
+        s.resample_fd = resamp;
+        s.dev->SetIrqEventFd(trig);
+        io_loop_.AttachIrqFd(s.dev, trig, resamp);
+        irqfd_slots_[write_idx++] = s;
+    }
+    irqfd_slots_.resize(write_idx);
+
+    if (!irqfd_slots_.empty()) {
+        LOG_INFO("irqfd: %zu slots attached to io_loop", irqfd_slots_.size());
+    }
+#endif
+}
+
+void Vm::ShutdownIrqFds() {
+#if defined(__linux__)
+    // DetachIrqFd only posts to io_thread_, so stop the loop (closing every
+    // poll handle and joining the thread) before any fd is closed; libuv
+    // must never poll a closed or recycled descriptor. Stop() is idempotent.
+    for (auto& s : irqfd_slots_) {
+        if (s.dev) io_loop_.DetachIrqFd(s.dev);
+    }
+    io_loop_.Stop();
+    for (auto& s : irqfd_slots_) {
+        // Revert the device to the callback path before its fd goes away.
+        if (s.dev) s.dev->SetIrqEventFd(-1);
+        if (s.trigger_fd >= 0) {
+            if (hv_vm_) hv_vm_->UnregisterIrqFd(s.gsi, s.trigger_fd);
+            ::close(s.trigger_fd);
+            s.trigger_fd = -1;
+        }
+        if (s.resample_fd >= 0) {
+            ::close(s.resample_fd);
+            s.resample_fd = -1;
+        }
+    }
+    irqfd_slots_.clear();
+#endif
+}
+
 bool Vm::SetupVirtioBlk(const std::string& disk_path, const VirtioDeviceSlot& slot) {
     virtio_blk_ = std::make_unique<VirtioBlkDevice>();
     if (!virtio_blk_->Open(disk_path)) return false;
@@ -334,6 +445,7 @@ bool Vm::SetupVirtioBlk(const std::string& disk_path, const VirtioDeviceSlot& sl
     virtio_mmio_->Init(virtio_blk_.get(), mem_);
     virtio_mmio_->SetIrqCallback([this, irq = slot.irq]() { InjectIrq(irq); });
     virtio_mmio_->SetIrqLevelCallback([this, irq = slot.irq](bool a) { SetIrqLevel(irq, a); });
+    TryEnableIrqFd(virtio_mmio_.get(), slot.irq);
     virtio_blk_->SetMmioDevice(virtio_mmio_.get());
 
     addr_space_.AddMmioDevice(
@@ -353,6 +465,7 @@ bool Vm::SetupVirtioNet(bool link_up, const std::vector<PortForward>& forwards,
     virtio_mmio_net_->Init(virtio_net_.get(), mem_);
     virtio_mmio_net_->SetIrqCallback([this, irq = slot.irq]() { InjectIrq(irq); });
     virtio_mmio_net_->SetIrqLevelCallback([this, irq = slot.irq](bool a) { SetIrqLevel(irq, a); });
+    TryEnableIrqFd(virtio_mmio_net_.get(), slot.irq);
     virtio_net_->SetMmioDevice(virtio_mmio_net_.get());
 
     virtio_net_->SetTxCallback([this](const uint8_t* frame, uint32_t len) {
@@ -379,6 +492,7 @@ bool Vm::SetupVirtioInput(const VirtioDeviceSlot& kbd_slot,
     virtio_mmio_kbd_->Init(virtio_kbd_.get(), mem_);
     virtio_mmio_kbd_->SetIrqCallback([this, irq = kbd_slot.irq]() { InjectIrq(irq); });
     virtio_mmio_kbd_->SetIrqLevelCallback([this, irq = kbd_slot.irq](bool a) { SetIrqLevel(irq, a); });
+    TryEnableIrqFd(virtio_mmio_kbd_.get(), kbd_slot.irq);
     virtio_kbd_->SetMmioDevice(virtio_mmio_kbd_.get());
     addr_space_.AddMmioDevice(
         kbd_slot.mmio_base, VirtioMmioDevice::kMmioSize, virtio_mmio_kbd_.get());
@@ -389,6 +503,7 @@ bool Vm::SetupVirtioInput(const VirtioDeviceSlot& kbd_slot,
     virtio_mmio_tablet_->Init(virtio_tablet_.get(), mem_);
     virtio_mmio_tablet_->SetIrqCallback([this, irq = tablet_slot.irq]() { InjectIrq(irq); });
     virtio_mmio_tablet_->SetIrqLevelCallback([this, irq = tablet_slot.irq](bool a) { SetIrqLevel(irq, a); });
+    TryEnableIrqFd(virtio_mmio_tablet_.get(), tablet_slot.irq);
     virtio_tablet_->SetMmioDevice(virtio_mmio_tablet_.get());
     addr_space_.AddMmioDevice(
         tablet_slot.mmio_base, VirtioMmioDevice::kMmioSize, virtio_mmio_tablet_.get());
@@ -417,6 +532,7 @@ bool Vm::SetupVirtioGpu(uint32_t width, uint32_t height, const VirtioDeviceSlot&
     virtio_mmio_gpu_->Init(virtio_gpu_.get(), mem_);
     virtio_mmio_gpu_->SetIrqCallback([this, irq = slot.irq]() { InjectIrq(irq); });
     virtio_mmio_gpu_->SetIrqLevelCallback([this, irq = slot.irq](bool a) { SetIrqLevel(irq, a); });
+    TryEnableIrqFd(virtio_mmio_gpu_.get(), slot.irq);
     virtio_gpu_->SetMmioDevice(virtio_mmio_gpu_.get());
     addr_space_.AddMmioDevice(
         slot.mmio_base, VirtioMmioDevice::kMmioSize, virtio_mmio_gpu_.get());
@@ -461,6 +577,7 @@ bool Vm::SetupVirtioSerial(const VirtioDeviceSlot& slot) {
     virtio_mmio_serial_->Init(virtio_serial_.get(), mem_);
     virtio_mmio_serial_->SetIrqCallback([this, irq = slot.irq]() { InjectIrq(irq); });
     virtio_mmio_serial_->SetIrqLevelCallback([this, irq = slot.irq](bool a) { SetIrqLevel(irq, a); });
+    TryEnableIrqFd(virtio_mmio_serial_.get(), slot.irq);
     virtio_serial_->SetMmioDevice(virtio_mmio_serial_.get());
     addr_space_.AddMmioDevice(
         slot.mmio_base, VirtioMmioDevice::kMmioSize, virtio_mmio_serial_.get());
@@ -478,6 +595,7 @@ bool Vm::SetupVirtioFs(const std::vector<VmSharedFolder>& initial_folders,
     virtio_mmio_fs_->Init(virtio_fs_.get(), mem_);
     virtio_mmio_fs_->SetIrqCallback([this, irq = slot.irq]() { InjectIrq(irq); });
     virtio_mmio_fs_->SetIrqLevelCallback([this, irq = slot.irq](bool a) { SetIrqLevel(irq, a); });
+    TryEnableIrqFd(virtio_mmio_fs_.get(), slot.irq);
     virtio_fs_->SetMmioDevice(virtio_mmio_fs_.get());
 
     addr_space_.AddMmioDevice(slot.mmio_base, VirtioMmioDevice::kMmioSize, virtio_mmio_fs_.get());
@@ -496,6 +614,7 @@ bool Vm::SetupVirtioFs(const std::vector<VmSharedFolder>& initial_folders,
 bool Vm::SetupVirtioSnd(const VirtioDeviceSlot& slot) {
     virtio_snd_ = std::make_unique<VirtioSndDevice>();
     virtio_snd_->SetMemMap(mem_);
+    virtio_snd_->SetIoLoop(&io_loop_);
 
     if (audio_port_) {
         virtio_snd_->SetAudioPort(audio_port_);
@@ -505,6 +624,7 @@ bool Vm::SetupVirtioSnd(const VirtioDeviceSlot& slot) {
     virtio_mmio_snd_->Init(virtio_snd_.get(), mem_);
     virtio_mmio_snd_->SetIrqCallback([this, irq = slot.irq]() { InjectIrq(irq); });
     virtio_mmio_snd_->SetIrqLevelCallback([this, irq = slot.irq](bool a) { SetIrqLevel(irq, a); });
+    TryEnableIrqFd(virtio_mmio_snd_.get(), slot.irq);
     virtio_snd_->SetMmioDevice(virtio_mmio_snd_.get());
     addr_space_.AddMmioDevice(
         slot.mmio_base, VirtioMmioDevice::kMmioSize, virtio_mmio_snd_.get());
@@ -629,6 +749,24 @@ int Vm::Run() {
         FinalizeBoot(boot_config_);
     }
 
+    if (running_) {
+#if defined(__linux__) && defined(__aarch64__)
+        // KVM_IRQFD on arm64 requires the in-kernel VGIC to have had its
+        // KVM_DEV_ARM_VGIC_CTRL_INIT issued. SetupAarch64Boot normally drives
+        // that, but it runs on the BSP thread after boot_complete_ — i.e.
+        // after we would try to register irqfds here. Force-finalize from
+        // this (main) thread; FinalizeVgicInit is idempotent.
+        if (auto* kvm_vm = dynamic_cast<kvm::KvmVm*>(hv_vm_.get())) {
+            kvm_vm->FinalizeVgicInit();
+        }
+#endif
+        // Bring up the central device I/O loop, then register each virtio
+        // slot's irqfd with it. Slots that fail to register stay on the
+        // classic KVM_IRQ_LINE fallback.
+        io_loop_.Start();
+        InstallIrqFds();
+    }
+
     // Phase 2: release all threads into their run loops.
     {
         std::lock_guard<std::mutex> lock(boot_mutex_);
diff --git a/src/core/vmm/vm.h b/src/core/vmm/vm.h
index a48cd72..2964b03 100644
--- a/src/core/vmm/vm.h
+++ b/src/core/vmm/vm.h
@@ -5,6 +5,7 @@
 #include "core/vmm/hypervisor_vm.h"
 #include "core/vmm/machine_model.h"
 #include "core/vmm/vcpu_startup_state.h"
+#include "core/vmm/vm_io_loop.h"
 #include "core/device/virtio/virtio_mmio.h"
 #include "core/device/virtio/virtio_blk.h"
 #include "core/device/virtio/virtio_net.h"
@@ -109,6 +110,17 @@ class Vm {
     void InjectIrq(uint8_t irq);
     void SetIrqLevel(uint8_t irq, bool asserted);
 
+    // Record a virtio-mmio slot as an IRQFD candidate. The actual KVM_IRQFD
+    // registration happens inside Run() once vCPUs (and, on arm64, the VGIC)
+    // are up. The classic SetIrqLevelCallback path stays wired as a fallback;
+    // when the real fd is installed the device transparently switches over.
+    bool TryEnableIrqFd(VirtioMmioDevice* dev, uint8_t slot_irq);
+
+    // Register all recorded candidate slots with the hypervisor + io_loop_.
+    // Slots that fail stay in the callback-driven path. Linux-only.
+    void InstallIrqFds();
+    void ShutdownIrqFds();
+
     uint32_t cpu_count_ = 1;
     std::unique_ptr<MachineModel> machine_;
     std::unique_ptr<HypervisorVm> hv_vm_;
@@ -148,6 +160,15 @@ class Vm {
     // Active virtio slot list (populated during setup, used for kernel loading)
     std::vector<VirtioDeviceSlot> active_virtio_slots_;
 
+    struct IrqFdSlot {
+        uint32_t gsi = 0;             // absolute hypervisor GSI
+        int trigger_fd = -1;          // write-to-assert eventfd
+        int resample_fd = -1;         // signalled on EOI (may be -1)
+        VirtioMmioDevice* dev = nullptr;  // for pending-status re-check
+    };
+    std::vector<IrqFdSlot> irqfd_slots_;
+    VmIoLoop io_loop_;
+
     std::atomic<bool> running_{false};
     std::atomic<bool> reboot_requested_{false};
 
diff --git a/src/core/vmm/vm_io_loop.cpp b/src/core/vmm/vm_io_loop.cpp
new file mode 100644
index 0000000..4102dd4
--- /dev/null
+++ b/src/core/vmm/vm_io_loop.cpp
@@ -0,0 +1,252 @@
+#include "core/vmm/vm_io_loop.h"
+
+#include "core/device/virtio/virtio_mmio.h"
+#include "core/vmm/types.h"
+
+#if defined(__linux__)
+#include <unistd.h>
+#endif
+
+VmIoLoop::VmIoLoop() = default;
+
+VmIoLoop::~VmIoLoop() {
+    Stop();  // no-op when the loop was never started or is already stopped
+}
+
+bool VmIoLoop::Start() {
+    {
+        std::lock_guard<std::mutex> lock(post_mutex_);
+        if (accepting_) return true;  // already started: no-op
+    }
+
+    int rc = uv_loop_init(&loop_);
+    if (rc != 0) {
+        LOG_ERROR("VmIoLoop: uv_loop_init failed: %s", uv_strerror(rc));
+        return false;
+    }
+    async_post_.data = this;
+    rc = uv_async_init(&loop_, &async_post_, OnAsyncPost);
+    if (rc != 0) {
+        LOG_ERROR("VmIoLoop: uv_async_init(post) failed: %s", uv_strerror(rc));
+        (void)uv_loop_close(&loop_);
+        return false;
+    }
+    async_stop_.data = this;
+    rc = uv_async_init(&loop_, &async_stop_, OnAsyncStop);
+    if (rc != 0) {
+        LOG_ERROR("VmIoLoop: uv_async_init(stop) failed: %s", uv_strerror(rc));
+        uv_close(reinterpret_cast<uv_handle_t*>(&async_post_), nullptr);
+        while (uv_run(&loop_, UV_RUN_NOWAIT) != 0) {}  // finish that close
+        (void)uv_loop_close(&loop_);
+        return false;
+    }
+
+    // Clear the flag left by a previous Stop() so a restarted loop runs tasks.
+    io_stopped_ = false;
+    {
+        std::lock_guard<std::mutex> lock(post_mutex_);
+        accepting_ = true;
+    }
+    running_.store(true, std::memory_order_release);
+    io_thread_ = std::thread(&VmIoLoop::ThreadMain, this);
+    return true;
+}
+
+void VmIoLoop::Stop() {
+    {
+        std::lock_guard<std::mutex> lock(post_mutex_);
+        if (!accepting_) return;
+        accepting_ = false;
+        // Post() is now a no-op, so report not-running right away: callers
+        // use running() to choose between Post and a synchronous fallback.
+        running_.store(false, std::memory_order_release);
+    }
+
+    uv_async_send(&async_stop_);
+    if (io_thread_.joinable()) io_thread_.join();
+
+    // Drop pending Post tasks without running them; captures destruct.
+    std::deque<Task> drained;
+    {
+        std::lock_guard<std::mutex> lock(post_mutex_);
+        drained.swap(post_queue_);
+    }
+}
+
+void VmIoLoop::ThreadMain() {
+    uv_run(&loop_, UV_RUN_DEFAULT);  // blocks while any handle is active
+    while (uv_run(&loop_, UV_RUN_NOWAIT) != 0) {}  // drain pending close callbacks
+    (void)uv_loop_close(&loop_);  // result deliberately ignored
+}
+
+void VmIoLoop::Post(Task fn) {
+    std::lock_guard<std::mutex> lock(post_mutex_);
+    if (!accepting_) return;  // before Start() / after Stop(): fn is dropped
+    post_queue_.push_back(std::move(fn));
+    uv_async_send(&async_post_);  // wake io_thread_; sent while holding the lock
+}
+
+void VmIoLoop::OnAsyncPost(uv_async_t* h) {
+    VmIoLoop& loop = *static_cast<VmIoLoop*>(h->data);
+    // One wakeup may stand for many Post() calls (uv_async coalesces them),
+    // so take the whole queue; tasks then run without post_mutex_ held.
+    std::deque<Task> batch;
+    {
+        std::lock_guard<std::mutex> guard(loop.post_mutex_);
+        batch.swap(loop.post_queue_);
+    }
+
+    // After OnAsyncStop nothing may run: a task could create uv handles that
+    // nobody would close before uv_loop_close. `batch` is simply destroyed.
+    if (!loop.io_stopped_) {
+        for (Task& task : batch) {
+            if (task) task();
+        }
+    }
+}
+
+void VmIoLoop::OnAsyncStop(uv_async_t* h) {
+    auto* self = static_cast<VmIoLoop*>(h->data);
+    self->io_stopped_ = true;  // from now on posted tasks are dropped, not run
+
+    // Destroy outstanding timers. The ctx is freed by the close callback,
+    // which libuv invokes on a later loop iteration (uv_close is async).
+    for (auto& kv : self->timers_) {
+        auto* ctx = kv.second;
+        uv_timer_stop(&ctx->handle);
+        uv_close(reinterpret_cast<uv_handle_t*>(&ctx->handle),
+                 [](uv_handle_t* h2) {
+                     delete static_cast<VmIoLoop::TimerCtx*>(h2->data);
+                 });
+    }
+    self->timers_.clear();
+
+    for (auto& kv : self->irqfds_) {  // same treatment for irqfd poll handles
+        auto* ctx = kv.second;
+        uv_poll_stop(&ctx->handle);
+        uv_close(reinterpret_cast<uv_handle_t*>(&ctx->handle),
+                 [](uv_handle_t* h2) {
+                     delete static_cast<VmIoLoop::IrqFdCtx*>(h2->data);
+                 });
+    }
+    self->irqfds_.clear();
+
+    uv_close(reinterpret_cast<uv_handle_t*>(&self->async_post_), nullptr);  // members: nothing to free
+    uv_close(reinterpret_cast<uv_handle_t*>(h), nullptr);  // h is the stop handle itself
+}
+
+uint64_t VmIoLoop::AddTimer(uint64_t initial_ms, TimerCallback cb) {
+    uint64_t id = next_timer_id_.fetch_add(1, std::memory_order_relaxed);  // allocated on the caller's thread
+    Post([this, id, initial_ms, cb = std::move(cb)]() mutable {  // the uv_timer_t is created on io_thread_
+        if (io_stopped_) return;
+        auto* ctx = new TimerCtx{};
+        ctx->owner = this;
+        ctx->id = id;
+        ctx->cb = std::move(cb);
+        ctx->handle.data = ctx;
+        if (uv_timer_init(&loop_, &ctx->handle) != 0) {
+            delete ctx;  // never registered with libuv, so a plain delete is enough
+            return;
+        }
+        timers_[id] = ctx;
+        uv_timer_start(&ctx->handle, OnTimerFire, initial_ms, 0);  // repeat 0: OnTimerFire re-arms
+    });
+    return id;  // returned even if the post was dropped (loop not accepting)
+}
+
+void VmIoLoop::RemoveTimer(uint64_t id) {
+    Post([this, id]() {  // asynchronous: the timer may still fire before this runs
+        auto it = timers_.find(id);
+        if (it == timers_.end()) return;  // already self-destructed or never installed
+        auto* ctx = it->second;
+        timers_.erase(it);
+        uv_timer_stop(&ctx->handle);
+        uv_close(reinterpret_cast<uv_handle_t*>(&ctx->handle),
+                 [](uv_handle_t* h) {
+                     delete static_cast<TimerCtx*>(h->data);  // freed in the close callback
+                 });
+    });
+}
+
+void VmIoLoop::OnTimerFire(uv_timer_t* t) {
+    auto* ctx = static_cast<TimerCtx*>(t->data);
+    uint64_t next_ms = ctx->cb ? ctx->cb() : 0;  // an empty callback counts as "stop"
+    if (next_ms == 0) {  // stop requested: unregister and destroy the timer
+        auto* owner = ctx->owner;
+        owner->timers_.erase(ctx->id);
+        uv_timer_stop(&ctx->handle);
+        uv_close(reinterpret_cast<uv_handle_t*>(&ctx->handle),
+                 [](uv_handle_t* h) {
+                     delete static_cast<TimerCtx*>(h->data);  // freed in the close callback
+                 });
+    } else {  // re-arm as a one-shot using the delay the callback returned
+        uv_timer_start(&ctx->handle, OnTimerFire, next_ms, 0);
+    }
+}
+
+void VmIoLoop::AttachIrqFd(VirtioMmioDevice* dev, int trigger_fd, int resample_fd) {
+#if defined(__linux__)
+    if (!dev || trigger_fd < 0 || resample_fd < 0) return;  // invalid arguments are ignored
+    Post([this, dev, trigger_fd, resample_fd]() {  // dropped silently if the loop is not accepting
+        if (io_stopped_) return;
+        if (irqfds_.count(dev)) return;  // idempotent: at most one poll handle per device
+        auto* ctx = new IrqFdCtx{};
+        ctx->owner = this;
+        ctx->dev = dev;
+        ctx->trigger_fd = trigger_fd;
+        ctx->resample_fd = resample_fd;
+        ctx->handle.data = ctx;
+        if (uv_poll_init(&loop_, &ctx->handle, resample_fd) != 0) {  // only resample_fd is polled
+            LOG_WARN("VmIoLoop: uv_poll_init(irqfd) failed");
+            delete ctx;
+            return;
+        }
+        irqfds_[dev] = ctx;
+        uv_poll_start(&ctx->handle, UV_READABLE, OnIrqFdReadable);
+    });
+#else
+    (void)dev;
+    (void)trigger_fd;
+    (void)resample_fd;
+#endif
+}
+
+void VmIoLoop::DetachIrqFd(VirtioMmioDevice* dev) {
+#if defined(__linux__)
+    if (!dev) return;
+    Post([this, dev]() {  // asynchronous: the poll handle may still be live when this returns
+        auto it = irqfds_.find(dev);
+        if (it == irqfds_.end()) return;  // never attached, or already detached
+        auto* ctx = it->second;
+        irqfds_.erase(it);
+        uv_poll_stop(&ctx->handle);
+        uv_close(reinterpret_cast<uv_handle_t*>(&ctx->handle),
+                 [](uv_handle_t* h) {
+                     delete static_cast<IrqFdCtx*>(h->data);  // freed in the close callback
+                 });
+    });
+#else
+    (void)dev;
+#endif
+}
+
+void VmIoLoop::OnIrqFdReadable(uv_poll_t* p, int status, int events) {
+#if defined(__linux__)
+    (void)events;
+    auto* ctx = static_cast<IrqFdCtx*>(p->data);
+    if (status < 0) {
+        LOG_WARN("VmIoLoop: irqfd poll error: %s", uv_strerror(status));
+        return;  // the poll handle is left as-is; only the error is reported
+    }
+    uint64_t v = 0;
+    (void)::read(ctx->resample_fd, &v, sizeof(v));  // drain the eventfd counter; result ignored
+    if (ctx->dev && ctx->dev->GetInterruptStatus() != 0 && ctx->trigger_fd >= 0) {
+        uint64_t one = 1;  // device still has pending bits: re-assert via trigger_fd
+        (void)::write(ctx->trigger_fd, &one, sizeof(one));
+    }
+#else
+    (void)p;
+    (void)status;
+    (void)events;
+#endif
+}
diff --git a/src/core/vmm/vm_io_loop.h b/src/core/vmm/vm_io_loop.h
new file mode 100644
index 0000000..82e02a5
--- /dev/null
+++ b/src/core/vmm/vm_io_loop.h
@@ -0,0 +1,122 @@
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+#include <deque>
+#include <functional>
+#include <mutex>
+#include <thread>
+#include <unordered_map>
+
+#include <uv.h>
+
+class VirtioMmioDevice;
+
+// Per-Vm device I/O event loop. Owns a single libuv loop running on its own
+// io_thread_, and is the central place to:
+//   - Drive the irqfd resample path (Linux): uv_poll on a resample eventfd,
+//     re-assert the trigger eventfd if the device still has pending bits.
+//   - Host timers for virtio devices (e.g. virtio_snd period tick) so that
+//     epoll_wait's timeout naturally folds in the next timer deadline.
+//   - Serve as a single point for other components to Post work to a
+//     known-safe thread (libuv handles are single-threaded).
+//
+// Concurrency contract:
+//   * All public methods are thread-safe. Call them from any thread.
+//   * Internally, every uv_* call other than uv_async_send happens on
+//     io_thread_. Cross-thread entry points Post a closure onto a queue and
+//     wake the loop with uv_async_send.
+//   * Post is FIFO within a single caller thread; cross-thread ordering is
+//     by the moment the task acquires the queue mutex.
+//   * Stop() does NOT execute the remaining posted tasks: captures are
+//     destroyed (releasing shared_ptr etc.), but the functions are not
+//     called. Devices must not rely on "all work drains" semantics.
+class VmIoLoop {
+public:
+    // Returned by callbacks in AddTimer: number of ms until the next fire,
+    // or 0 to stop (and destroy) the timer.
+    using TimerCallback = std::function<uint64_t()>;
+    using Task = std::function<void()>;
+
+    VmIoLoop();
+    ~VmIoLoop();
+
+    VmIoLoop(const VmIoLoop&) = delete;
+    VmIoLoop& operator=(const VmIoLoop&) = delete;
+
+    // Spawn io_thread_ and bring up the uv_loop. Until this succeeds, Post /
+    // AddTimer / AttachIrqFd are silently dropped. Repeat calls return true.
+    bool Start();
+
+    // Close all handles, join io_thread_, drop pending Post tasks without
+    // running them. Idempotent.
+    void Stop();
+
+    bool running() const { return running_.load(std::memory_order_acquire); }
+
+    // Submit fn to run on io_thread_ (FIFO per caller).
+    void Post(Task fn);
+
+    // Schedule a timer. `initial_ms` is the delay to the first fire;
+    // subsequent fires use whatever ms the callback returns (0 = stop and
+    // destroy). For a classic fixed-interval timer, have the callback always
+    // return the interval.
+    // Returns an opaque id usable with RemoveTimer (allocated eagerly on the
+    // caller's thread; the underlying uv_timer_t is created asynchronously
+    // on io_thread_).
+    uint64_t AddTimer(uint64_t initial_ms, TimerCallback cb);
+
+    // Cancel a timer scheduled by AddTimer. Safe to call even if the timer
+    // has already self-destructed (returned 0 from its callback) or was
+    // never actually installed (e.g. cancelled before AddTimer's post ran).
+    void RemoveTimer(uint64_t id);
+
+    // Attach a Linux eventfd pair to this loop for irqfd resample handling.
+    // When `resample_fd` becomes readable (kernel signalled it on guest EOI),
+    // we drain the counter and, if the device still has pending interrupt
+    // bits, write(trigger_fd) to re-assert the GIC/IOAPIC line. The fds'
+    // lifetime is the caller's responsibility; call DetachIrqFd before
+    // closing them. No-op on non-Linux.
+    void AttachIrqFd(VirtioMmioDevice* dev, int trigger_fd, int resample_fd);
+    void DetachIrqFd(VirtioMmioDevice* dev);
+
+public:
+    // Public for static-callback access; treat as implementation detail.
+    struct TimerCtx {
+        uv_timer_t handle{};
+        VmIoLoop* owner = nullptr;
+        uint64_t id = 0;
+        TimerCallback cb;
+    };
+    struct IrqFdCtx {
+        uv_poll_t handle{};
+        VmIoLoop* owner = nullptr;
+        VirtioMmioDevice* dev = nullptr;
+        int trigger_fd = -1;
+        int resample_fd = -1;
+    };
+
+private:
+    void ThreadMain();
+    static void OnAsyncPost(uv_async_t* h);
+    static void OnAsyncStop(uv_async_t* h);
+    static void OnTimerFire(uv_timer_t* t);
+    static void OnIrqFdReadable(uv_poll_t* p, int status, int events);
+
+    uv_loop_t loop_{};
+    uv_async_t async_post_{};
+    uv_async_t async_stop_{};
+    std::thread io_thread_;
+    std::atomic<bool> running_{false};
+
+    std::mutex post_mutex_;
+    std::deque<Task> post_queue_;
+    bool accepting_ = false;
+
+    std::atomic<uint64_t> next_timer_id_{1};
+
+    // Accessed only from io_thread_.
+    std::unordered_map<uint64_t, TimerCtx*> timers_;
+    std::unordered_map<VirtioMmioDevice*, IrqFdCtx*> irqfds_;
+    bool io_stopped_ = false;
+};
diff --git a/src/platform/CMakeLists.txt b/src/platform/CMakeLists.txt
index d611fe3..dc616ae 100644
--- a/src/platform/CMakeLists.txt
+++ b/src/platform/CMakeLists.txt
@@ -54,15 +54,25 @@ elseif(APPLE)
             "-framework Hypervisor"
     )
 elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
-    set(KVM_SOURCES
+    set(KVM_COMMON_SOURCES
         ${CMAKE_SOURCE_DIR}/src/platform/linux/hypervisor/kvm_platform.cpp
-        ${CMAKE_SOURCE_DIR}/src/platform/linux/hypervisor/x86_64/kvm_vm.cpp
-        ${CMAKE_SOURCE_DIR}/src/platform/linux/hypervisor/x86_64/kvm_vcpu.cpp
         ${CMAKE_SOURCE_DIR}/src/platform/linux/vm_platform_linux.cpp
         ${CMAKE_SOURCE_DIR}/src/platform/posix/console/posix_console_port.cpp
     )
 
-    add_library(tenbox_platform STATIC ${KVM_SOURCES})
+    if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
+        set(KVM_ARCH_SOURCES
+            ${CMAKE_SOURCE_DIR}/src/platform/linux/hypervisor/aarch64/kvm_vm.cpp
+            ${CMAKE_SOURCE_DIR}/src/platform/linux/hypervisor/aarch64/kvm_vcpu.cpp
+        )
+    else()
+        set(KVM_ARCH_SOURCES
+            ${CMAKE_SOURCE_DIR}/src/platform/linux/hypervisor/x86_64/kvm_vm.cpp
+            ${CMAKE_SOURCE_DIR}/src/platform/linux/hypervisor/x86_64/kvm_vcpu.cpp
+        )
+    endif()
+
+    add_library(tenbox_platform STATIC ${KVM_COMMON_SOURCES} ${KVM_ARCH_SOURCES})
 
     target_include_directories(tenbox_platform
         PUBLIC
diff --git a/src/platform/linux/hypervisor/aarch64/kvm_vcpu.cpp b/src/platform/linux/hypervisor/aarch64/kvm_vcpu.cpp
new file mode 100644
index 0000000..7430a0e
--- /dev/null
+++ b/src/platform/linux/hypervisor/aarch64/kvm_vcpu.cpp
@@ -0,0 +1,297 @@
+#include "platform/linux/hypervisor/aarch64/kvm_vcpu.h"
+#include "platform/linux/hypervisor/aarch64/kvm_vm.h"
+#include "platform/linux/hypervisor/kvm_platform.h"
+#include "core/vmm/types.h"
+
+#include <cerrno>
+#include <cstddef>
+#include <csignal>
+#include <cstring>
+#include <linux/kvm.h>
+#include <asm/kvm.h>
+#include <pthread.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+namespace kvm {
+
+// Signal used to kick a vCPU out of KVM_RUN. Handler intentionally empty —
+// arriving in userspace with a pending signal is enough for KVM_RUN to
+// return with -EINTR / KVM_EXIT_INTR.
+static constexpr int kCancelSignal = SIGUSR1;
+
+static void CancelSignalHandler(int /*sig*/) {}
+
+static void InstallCancelSignalHandler() {
+    // Function-local static initialisation is thread-safe and runs once, so
+    // every vCPU thread may call this and only the first installs the handler.
+    static const bool installed = [] {
+        struct sigaction action{};
+        action.sa_handler = CancelSignalHandler;
+        sigemptyset(&action.sa_mask);
+        action.sa_flags = 0;  // no SA_RESTART: we want KVM_RUN to return EINTR
+        ::sigaction(kCancelSignal, &action, nullptr);
+        return true;
+    }();
+    (void)installed;
+}
+
+// Build the KVM_REG_ARM64 id of a 64-bit core register from its byte offset
+// inside struct kvm_regs (which starts with user_pt_regs "regs"). The id's
+// low bits hold that offset in 32-bit words, per the KVM API convention.
+static constexpr uint64_t CoreRegId(uint64_t byte_offset) {
+    const uint64_t word_index = byte_offset / sizeof(uint32_t);
+    return KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | word_index;
+}
+
+// Offsets within struct kvm_regs { user_pt_regs regs; ... } on aarch64.
+// user_pt_regs = { u64 regs[31]; u64 sp; u64 pc; u64 pstate; }.
+static constexpr uint64_t kOffX(uint32_t i) { return i * 8u; }
+static constexpr uint64_t kOffSp     = 31u * 8u;          // 0xF8
+static constexpr uint64_t kOffPc     = 32u * 8u;          // 0x100
+static constexpr uint64_t kOffPstate = 33u * 8u;          // 0x108
+
+// MPIDR_EL1 encoding for KVM_REG_ARM64 system register access.
+//   op0=3 op1=0 CRn=0 CRm=0 op2=5
+static constexpr uint64_t kSysRegMpidrEl1 = ARM64_SYS_REG(3, 0, 0, 0, 5);
+
+static bool SetOneReg(int fd, uint64_t id, uint64_t value) {
+    // KVM_SET_ONE_REG takes the value indirectly: `addr` is a user-space
+    // pointer to it, so the address of the by-value parameter is passed.
+    struct kvm_one_reg request{id, reinterpret_cast<uint64_t>(&value)};
+    return 0 == ::ioctl(fd, KVM_SET_ONE_REG, &request);
+}
+
+KvmVCpu::~KvmVCpu() {
+    // Release the shared kvm_run mapping first, then the vCPU fd behind it.
+    // Both members are reset so the object never holds stale handles.
+    if (run_ != nullptr) ::munmap(run_, run_size_);
+    run_ = nullptr;
+
+    // A negative fd means KVM_CREATE_VCPU never succeeded for this object.
+    if (vcpu_fd_ >= 0) ::close(vcpu_fd_);
+    vcpu_fd_ = -1;
+}
+
+std::unique_ptr<KvmVCpu> KvmVCpu::Create(KvmVm& vm, uint32_t index,
+                                         AddressSpace* addr_space) {
+    auto vcpu = std::unique_ptr<KvmVCpu>(new KvmVCpu());  // ctor is private
+    vcpu->index_ = index;
+    vcpu->vm_ = &vm;
+    vcpu->addr_space_ = addr_space;
+
+    // arm64 KVM requires KVM_CREATE_VCPU / KVM_ARM_VCPU_INIT to be fully
+    // serialised across vCPUs: once any vcpu has been INIT'd the kernel
+    // returns -EBUSY on further KVM_CREATE_VCPU. Our vcpu worker threads
+    // run in parallel, so guard the whole create + init sequence with a
+    // process-wide mutex. (Single-process use of /dev/kvm is the norm for
+    // this runtime, so a static lock is fine.)
+    static std::mutex create_mutex;
+    std::lock_guard<std::mutex> create_guard(create_mutex);
+
+    vcpu->vcpu_fd_ = ::ioctl(vm.VmFd(), KVM_CREATE_VCPU, (unsigned long)index);
+    if (vcpu->vcpu_fd_ < 0) {
+        LOG_ERROR("kvm: KVM_CREATE_VCPU(%u) failed: %s", index, strerror(errno));
+        return nullptr;
+    }
+
+    vcpu->run_size_ = vm.VcpuMmapSize();
+    void* run = ::mmap(nullptr, vcpu->run_size_, PROT_READ | PROT_WRITE,
+                       MAP_SHARED, vcpu->vcpu_fd_, 0);
+    if (run == MAP_FAILED) {
+        LOG_ERROR("kvm: mmap kvm_run for vCPU %u failed: %s",
+                  index, strerror(errno));
+        return nullptr;  // ~KvmVCpu closes the vCPU fd opened above
+    }
+    vcpu->run_ = static_cast<struct kvm_run*>(run);
+
+    // Secondary vCPUs start in POWER_OFF so the in-kernel PSCI layer blocks
+    // them inside KVM_RUN until the BSP issues PSCI_CPU_ON.
+    const bool power_off = (index != 0);
+    if (!vcpu->InitVcpu(power_off)) {
+        return nullptr;  // ~KvmVCpu unmaps run_ and closes the vCPU fd
+    }
+
+    // Program MPIDR_EL1 explicitly (Aff0 = index & 0xFF) instead of relying on
+    // KVM's derived default; a failure here is logged but not fatal.
+    if (!SetOneReg(vcpu->vcpu_fd_, kSysRegMpidrEl1,
+                   static_cast<uint64_t>(index) & 0xFFu)) {
+        LOG_WARN("kvm: set MPIDR_EL1 for vCPU %u failed: %s",
+                 index, strerror(errno));
+    }
+
+    LOG_INFO("kvm: aarch64 vCPU %u created (%s)",
+             index, power_off ? "POWER_OFF" : "running");
+    return vcpu;
+}
+
+bool KvmVCpu::InitVcpu(bool power_off) {
+    // Start from the vCPU target the host kernel reports as preferred.
+    struct kvm_vcpu_init init{};
+    if (::ioctl(vm_->VmFd(), KVM_ARM_PREFERRED_TARGET, &init) < 0) {
+        LOG_ERROR("kvm: KVM_ARM_PREFERRED_TARGET failed: %s", strerror(errno));
+        return false;
+    }
+
+    // features[] is a bitmap of 32-bit words: feature N lives in word N / 32
+    // at bit position N % 32.
+    const auto enable_feature = [&init](unsigned feature) {
+        const unsigned word = feature / 32;
+        init.features[word] |= (1u << (feature % 32));
+    };
+
+    // PSCI v0.2 is handled in-kernel (CPU_ON / SYSTEM_OFF / SYSTEM_RESET);
+    // lifecycle events are then exposed via KVM_EXIT_SYSTEM_EVENT.
+    enable_feature(KVM_ARM_VCPU_PSCI_0_2);
+    if (power_off) enable_feature(KVM_ARM_VCPU_POWER_OFF);
+
+    if (::ioctl(vcpu_fd_, KVM_ARM_VCPU_INIT, &init) >= 0) return true;
+
+    LOG_ERROR("kvm: KVM_ARM_VCPU_INIT(%u) failed: %s",
+              index_, strerror(errno));
+    return false;
+}
+
+void KvmVCpu::OnThreadInit() {
+    // Install the handler and unblock the signal before publishing the id.
+    InstallCancelSignalHandler();
+
+    // The mask may have been inherited with the cancel signal blocked.
+    sigset_t cancel_only;
+    sigemptyset(&cancel_only);
+    sigaddset(&cancel_only, kCancelSignal);
+    pthread_sigmask(SIG_UNBLOCK, &cancel_only, nullptr);
+
+    const auto self = static_cast<unsigned long>(pthread_self());
+    thread_id_.store(self, std::memory_order_release);
+}
+
+bool KvmVCpu::SetupBootRegisters(uint8_t* /*ram*/) {
+    // Intentionally a no-op that reports success: the aarch64 BSP boot state
+    // (entry PC, FDT address, PSTATE) is programmed by SetupAarch64Boot.
+    return true;
+}
+
+bool KvmVCpu::SetupAarch64Boot(uint64_t entry_pc, uint64_t fdt_addr) {
+    // The in-kernel VGIC is finalized here: by now all vCPUs have been created
+    // (Vm::Run waits for them before FinalizeBoot, which calls us).
+    if (!vm_->FinalizeVgicInit()) return false;
+
+    // PSTATE: EL1h with D/A/I/F masked = 0x3C5, same as the HVF path.
+    constexpr uint64_t kPstateEl1h = 0x3C5ULL;
+
+    // Boot register file: PC = entry point, X0 = FDT address, X1..X3 = 0.
+    const struct { uint64_t offset; uint64_t value; } boot_regs[] = {
+        {kOffPc, entry_pc},  {kOffX(0), fdt_addr}, {kOffX(1), 0},
+        {kOffX(2), 0},       {kOffX(3), 0},        {kOffPstate, kPstateEl1h},
+    };
+
+    bool all_set = true;
+    for (const auto& reg : boot_regs) {  // no early exit: every write is attempted
+        all_set &= SetOneReg(vcpu_fd_, CoreRegId(reg.offset), reg.value);
+    }
+    if (!all_set) {
+        LOG_ERROR("kvm: vCPU %u SetupAarch64Boot: KVM_SET_ONE_REG failed: %s",
+                  index_, strerror(errno));
+        return false;
+    }
+
+    LOG_INFO("kvm: vCPU %u ARM64 boot: PC=0x%" PRIx64 ", X0(FDT)=0x%" PRIx64,
+             index_, entry_pc, fdt_addr);
+    return true;
+}
+
+VCpuExitAction KvmVCpu::RunOnce() {
+    int rc = ::ioctl(vcpu_fd_, KVM_RUN, 0);
+    if (rc < 0) {
+        if (errno == EINTR || errno == EAGAIN) {
+            run_->immediate_exit = 0;  // consume the kick set by CancelRun
+            return VCpuExitAction::kContinue;
+        }
+        LOG_ERROR("kvm: KVM_RUN(%u) failed: %s", index_, strerror(errno));
+        return VCpuExitAction::kError;
+    }
+
+    switch (run_->exit_reason) {
+    case KVM_EXIT_MMIO: {
+        auto& mmio = run_->mmio;
+        if (mmio.is_write) {
+            uint64_t val = 0;  // zero-fills bytes beyond mmio.len
+            ::memcpy(&val, mmio.data, mmio.len);
+            addr_space_->HandleMmioWrite(mmio.phys_addr, mmio.len, val);
+        } else {
+            uint64_t val = 0;  // stays 0 if the handler does not write it
+            addr_space_->HandleMmioRead(mmio.phys_addr, mmio.len, &val);
+            ::memcpy(mmio.data, &val, mmio.len);
+        }
+        return VCpuExitAction::kContinue;
+    }
+
+    case KVM_EXIT_HLT:
+        return VCpuExitAction::kHalt;
+
+    case KVM_EXIT_INTR:
+        return VCpuExitAction::kContinue;
+
+    case KVM_EXIT_SHUTDOWN:
+        LOG_INFO("kvm: vCPU %u KVM_EXIT_SHUTDOWN", index_);
+        if (shutdown_cb_) shutdown_cb_();
+        return VCpuExitAction::kShutdown;
+
+    case KVM_EXIT_SYSTEM_EVENT: {
+        uint32_t type = run_->system_event.type;
+        LOG_INFO("kvm: vCPU %u KVM_EXIT_SYSTEM_EVENT type=%u", index_, type);
+        // PSCI SYSTEM_OFF / SYSTEM_RESET / etc. are delivered here by the
+        // in-kernel PSCI emulator. Translate to the generic Vm lifecycle
+        // callbacks so RequestReboot() can actually recycle the VM.
+        if (type == KVM_SYSTEM_EVENT_RESET) {
+            if (reboot_cb_) reboot_cb_();
+        } else {
+            if (shutdown_cb_) shutdown_cb_();  // every non-RESET event type
+        }
+        return VCpuExitAction::kShutdown;  // returned for RESET as well
+    }
+
+    case KVM_EXIT_FAIL_ENTRY:
+        LOG_ERROR("kvm: KVM_EXIT_FAIL_ENTRY reason=0x%" PRIx64 " cpu=%u",
+                  (uint64_t)run_->fail_entry.hardware_entry_failure_reason,
+                  run_->fail_entry.cpu);
+        return VCpuExitAction::kError;
+
+    case KVM_EXIT_INTERNAL_ERROR:
+        LOG_ERROR("kvm: KVM_EXIT_INTERNAL_ERROR suberror=%u",
+                  run_->internal.suberror);
+        return VCpuExitAction::kError;
+
+    case KVM_EXIT_UNKNOWN:
+    case KVM_EXIT_IRQ_WINDOW_OPEN:
+        return VCpuExitAction::kContinue;
+
+    default:
+        LOG_WARN("kvm: vCPU %u unhandled exit reason %u",
+                 index_, run_->exit_reason);
+        return VCpuExitAction::kContinue;  // unknown exits are logged, not fatal
+    }
+}
+
+void KvmVCpu::CancelRun() {
+    // Flag the shared kvm_run page so the next KVM_RUN entry bails out.
+    if (run_ != nullptr) run_->immediate_exit = 1;
+
+    // thread_id_ is 0 until OnThreadInit has run; nothing to signal before.
+    const unsigned long worker = thread_id_.load(std::memory_order_acquire);
+    if (worker == 0) return;
+    ::pthread_kill(static_cast<pthread_t>(worker), kCancelSignal);
+}
+
+bool KvmVCpu::WaitForInterrupt(uint32_t timeout_ms) {
+    // WFI is normally handled inside KVM (in-kernel VGIC); if we get here just
+    // nap (>= 1 ms). nanosleep has no 32-bit usec overflow / 1 s cap (usleep).
+    if (timeout_ms == 0) timeout_ms = 1;
+    const struct timespec nap{timeout_ms / 1000, (timeout_ms % 1000) * 1000000L};
+    ::nanosleep(&nap, nullptr);  // returns early on a signal, e.g. CancelRun
+    return false;
+}
+
+} // namespace kvm
diff --git a/src/platform/linux/hypervisor/aarch64/kvm_vcpu.h b/src/platform/linux/hypervisor/aarch64/kvm_vcpu.h
new file mode 100644
index 0000000..6dee403
--- /dev/null
+++ b/src/platform/linux/hypervisor/aarch64/kvm_vcpu.h
@@ -0,0 +1,80 @@
+#pragma once
+
+#include "core/vmm/address_space.h"
+#include "core/vmm/hypervisor_vcpu.h"
+
+#include <atomic>
+#include <cstdint>
+#include <functional>
+#include <memory>
+
+struct kvm_run;
+
+namespace kvm {
+
+class KvmVm;
+
+class KvmVCpu final : public HypervisorVCpu {
+public:
+    ~KvmVCpu() override;
+
+    static std::unique_ptr<KvmVCpu> Create(KvmVm& vm, uint32_t index,
+                                           AddressSpace* addr_space);
+
+    VCpuExitAction RunOnce() override;
+    void CancelRun() override;
+    uint32_t Index() const override { return index_; }
+
+    // Core HypervisorVCpu interface requires SetupBootRegisters; on aarch64
+    // the real work happens through SetupAarch64Boot, invoked by
+    // Aarch64Machine::SetupBootVCpu. Keep SetupBootRegisters as a no-op for
+    // symmetry with the HVF backend.
+    bool SetupBootRegisters(uint8_t* ram) override;
+
+    // BSP startup: set PC, X0=FDT, PSTATE=0x3C5 (EL1h, DAIF masked). Called
+    // once from the Aarch64Machine on the BSP thread *before* it enters
+    // RunOnce. Must run on the vCPU's own worker thread.
+    // Matches the HVF signature so aarch64_machine.cpp can share code paths.
+    bool SetupAarch64Boot(uint64_t entry_pc, uint64_t fdt_addr);
+
+    void OnThreadInit() override;
+
+    bool WaitForInterrupt(uint32_t timeout_ms) override;
+
+    // KVM's in-kernel PSCI handles AP bring-up entirely in the kernel: APs
+    // are created in POWER_OFF state and KVM_RUN blocks until a
+    // PSCI_CPU_ON HVC is dispatched. Userspace SIPI/PSCI callbacks never
+    // fire, so the generic startup-wait would deadlock.
+    bool NeedsStartupWait() const override { return false; }
+
+    // Shutdown/reset callbacks invoked when KVM surfaces a PSCI
+    // SYSTEM_OFF / SYSTEM_RESET via KVM_EXIT_SYSTEM_EVENT. Wired by
+    // Vm::SetupVCpuCallbacks so the Vm can RequestStop/RequestReboot.
+    using ShutdownCallback = std::function<void()>;
+    using RebootCallback = std::function<void()>;
+    void SetShutdownCallback(ShutdownCallback cb) { shutdown_cb_ = std::move(cb); }
+    void SetRebootCallback(RebootCallback cb) { reboot_cb_ = std::move(cb); }
+
+private:
+    KvmVCpu() = default;  // instances are built through Create()
+
+    bool InitVcpu(bool power_off);
+
+    uint32_t index_ = 0;
+    int vcpu_fd_ = -1;                // fd from KVM_CREATE_VCPU; -1 when unset
+    struct kvm_run* run_ = nullptr;   // mmap of vcpu_fd_; nullptr when unmapped
+    size_t run_size_ = 0;             // byte length of the run_ mapping
+
+    KvmVm* vm_ = nullptr;                 // not owned
+    AddressSpace* addr_space_ = nullptr;  // not owned; target of MMIO exits
+
+    // CancelRun writes immediate_exit = 1 and raises SIGUSR1 on the vCPU
+    // thread. OnThreadInit stashes the pthread id so CancelRun can deliver
+    // the signal to the right thread.
+    std::atomic<unsigned long> thread_id_{0};
+
+    ShutdownCallback shutdown_cb_;
+    RebootCallback reboot_cb_;
+};
+
+} // namespace kvm
diff --git a/src/platform/linux/hypervisor/aarch64/kvm_vm.cpp b/src/platform/linux/hypervisor/aarch64/kvm_vm.cpp
new file mode 100644
index 0000000..24d827b
--- /dev/null
+++ b/src/platform/linux/hypervisor/aarch64/kvm_vm.cpp
@@ -0,0 +1,326 @@
+#include "platform/linux/hypervisor/aarch64/kvm_vm.h"
+#include "platform/linux/hypervisor/aarch64/kvm_vcpu.h"
+#include "platform/linux/hypervisor/kvm_platform.h"
+
+#include <cerrno>
+#include <cstring>
+#include <vector>
+#include <linux/kvm.h>
+#include <asm/kvm.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+
+namespace kvm {
+
+KvmVm::~KvmVm() {
+    // Close the VGIC device fd before the VM fd it was created from; each
+    // member is reset to -1 afterwards.
+    if (vgic_fd_ >= 0) ::close(vgic_fd_);
+    vgic_fd_ = -1;
+
+    if (vm_fd_ >= 0) ::close(vm_fd_);
+    vm_fd_ = -1;
+
+    // kvm_fd_ is owned by kvm_platform.cpp; do not close.
+}
+
+std::unique_ptr<KvmVm> KvmVm::Create(uint32_t cpu_count) {
+    std::unique_ptr<KvmVm> vm(new KvmVm());
+    vm->cpu_count_ = cpu_count;
+
+    vm->kvm_fd_ = GetKvmFd();
+    if (vm->kvm_fd_ < 0) {
+        LOG_ERROR("kvm: /dev/kvm not available");
+        return nullptr;
+    }
+
+    // Anything smaller than struct kvm_run (including the -1 error return)
+    // cannot be a usable mapping size.
+    const int mmap_bytes = ::ioctl(vm->kvm_fd_, KVM_GET_VCPU_MMAP_SIZE, 0);
+    if (mmap_bytes < static_cast<int>(sizeof(struct kvm_run))) {
+        LOG_ERROR("kvm: KVM_GET_VCPU_MMAP_SIZE failed (%d): %s",
+                  mmap_bytes, strerror(errno));
+        return nullptr;
+    }
+    vm->vcpu_mmap_size_ = static_cast<size_t>(mmap_bytes);
+
+    // KVM_CREATE_VM takes an IPA size (in bits) on arm64. 0 means "use the
+    // KVM default" (40 bits on most hosts).
+    vm->vm_fd_ = ::ioctl(vm->kvm_fd_, KVM_CREATE_VM, 0UL);
+    if (vm->vm_fd_ < 0) {
+        LOG_ERROR("kvm: KVM_CREATE_VM failed: %s", strerror(errno));
+        return nullptr;
+    }
+
+    if (!vm->CreateInKernelVgic()) return nullptr;
+
+    const char* vgic_name = vm->uses_gic_v2_ ? "v2" : "v3";
+    LOG_INFO("kvm: aarch64 VM created (%u vCPUs, mmap_size=%zu, vgic=%s)",
+             cpu_count, vm->vcpu_mmap_size_, vgic_name);
+    return vm;
+}
+
+bool KvmVm::CreateInKernelVgic() {
+    // GICv3 is preferred (it matches our default FDT + arm64 ABI). Hosts
+    // whose kernel cannot emulate VGICv3 on a physical GICv2 (e.g. Raspberry
+    // Pi 5's GIC-400 under certain kernel configs) fail KVM_CREATE_DEVICE
+    // with ENODEV, in which case VGICv2 is tried instead.
+    const bool have_v3 = TryCreateVgicV3();
+    if (!have_v3) {
+        LOG_WARN("kvm: VGICv3 unavailable, falling back to VGICv2");
+        if (!TryCreateVgicV2()) {
+            LOG_ERROR("kvm: neither VGICv3 nor VGICv2 could be created");
+            return false;
+        }
+    }
+
+    uses_gic_v2_ = !have_v3;
+    return true;
+}
+
+bool KvmVm::TryCreateVgicV3() {
+    // Creates the VGICv3 device and sets its DIST/REDIST bases; false on error.
+    struct kvm_create_device cd{};  // fd and flags stay zero-initialised
+    cd.type = KVM_DEV_TYPE_ARM_VGIC_V3;
+    if (::ioctl(vm_fd_, KVM_CREATE_DEVICE, &cd) < 0) {
+        LOG_INFO("kvm: KVM_CREATE_DEVICE(VGIC_V3) unavailable: %s",
+                 strerror(errno));
+        return false;
+    }
+    vgic_fd_ = static_cast<int>(cd.fd);
+
+    auto SetAddr = [this](uint64_t attr, uint64_t addr) -> bool {
+        struct kvm_device_attr da{};
+        da.group = KVM_DEV_ARM_VGIC_GRP_ADDR;
+        da.attr = attr;
+        da.addr = reinterpret_cast<uint64_t>(&addr);
+        if (::ioctl(vgic_fd_, KVM_SET_DEVICE_ATTR, &da) < 0) {
+            LOG_ERROR("kvm: VGIC_V3 SET_ADDR(attr=%" PRIu64 ") failed: %s",
+                      attr, strerror(errno));
+            return false;
+        }
+        return true;
+    };
+
+    const bool ok = SetAddr(KVM_VGIC_V3_ADDR_TYPE_DIST, kGicDistBase) &&
+                    SetAddr(KVM_VGIC_V3_ADDR_TYPE_REDIST, kGicRedistBase);
+    if (!ok) { ::close(vgic_fd_); vgic_fd_ = -1; }  // drop the half-set-up fd
+    return ok;
+}
+
+bool KvmVm::TryCreateVgicV2() {
+    // Creates the VGICv2 device and sets its DIST/CPU bases; false on error.
+    struct kvm_create_device cd{};  // fd and flags stay zero-initialised
+    cd.type = KVM_DEV_TYPE_ARM_VGIC_V2;
+    if (::ioctl(vm_fd_, KVM_CREATE_DEVICE, &cd) < 0) {
+        LOG_ERROR("kvm: KVM_CREATE_DEVICE(VGIC_V2) failed: %s", strerror(errno));
+        return false;
+    }
+    vgic_fd_ = static_cast<int>(cd.fd);
+
+    auto SetAddr = [this](uint64_t attr, uint64_t addr) -> bool {
+        struct kvm_device_attr da{};
+        da.group = KVM_DEV_ARM_VGIC_GRP_ADDR;
+        da.attr = attr;
+        da.addr = reinterpret_cast<uint64_t>(&addr);
+        if (::ioctl(vgic_fd_, KVM_SET_DEVICE_ATTR, &da) < 0) {
+            LOG_ERROR("kvm: VGIC_V2 SET_ADDR(attr=%" PRIu64 ") failed: %s",
+                      attr, strerror(errno));
+            return false;
+        }
+        return true;
+    };
+
+    // GICv2 needs DIST and CPU interface addresses. The distributor reuses
+    // the same 64 KiB slot as v3; the virtual CPU interface sits at
+    // 0x08010000, directly after that distributor window.
+    const bool ok = SetAddr(KVM_VGIC_V2_ADDR_TYPE_DIST, kGicDistBase) &&
+                    SetAddr(KVM_VGIC_V2_ADDR_TYPE_CPU, kGicV2CpuBase);
+    if (!ok) { ::close(vgic_fd_); vgic_fd_ = -1; }  // drop the half-set-up fd
+    return ok;
+}
+
+bool KvmVm::FinalizeVgicInit() {
+    // Serialised and idempotent: after one success later calls return true.
+    std::lock_guard<std::mutex> guard(vgic_init_mutex_);
+    if (vgic_initialized_) return true;
+    if (vgic_fd_ < 0) {
+        LOG_ERROR("kvm: FinalizeVgicInit called without a VGIC device");
+        return false;
+    }
+    struct kvm_device_attr ctl{};
+    ctl.group = KVM_DEV_ARM_VGIC_GRP_CTRL;
+    ctl.attr = KVM_DEV_ARM_VGIC_CTRL_INIT;
+    vgic_initialized_ = ::ioctl(vgic_fd_, KVM_SET_DEVICE_ATTR, &ctl) >= 0;
+    if (!vgic_initialized_) {
+        LOG_ERROR("kvm: VGIC CTRL_INIT failed: %s", strerror(errno));
+        return false;
+    }
+    LOG_INFO("kvm: in-kernel VGIC%s initialized", uses_gic_v2_ ? "v2" : "v3");
+    return true;
+}
+
+bool KvmVm::MapMemory(GPA gpa, void* hva, uint64_t size, bool writable) {
+    // Every call consumes a fresh memslot index, even if the ioctl fails.
+    uint32_t slot_id;
+    {
+        std::lock_guard<std::mutex> guard(slot_mutex_);
+        slot_id = next_slot_++;
+    }
+
+    struct kvm_userspace_memory_region mem{};
+    mem.slot = slot_id;
+    mem.guest_phys_addr = gpa;
+    mem.memory_size = size;
+    mem.userspace_addr = reinterpret_cast<uint64_t>(hva);
+    if (!writable) mem.flags = KVM_MEM_READONLY;
+
+    if (::ioctl(vm_fd_, KVM_SET_USER_MEMORY_REGION, &mem) < 0) {
+        LOG_ERROR("kvm: KVM_SET_USER_MEMORY_REGION(slot=%u gpa=0x%" PRIx64
+                  " size=0x%" PRIx64 ") failed: %s",
+                  slot_id, gpa, size, strerror(errno));
+        return false;
+    }
+    LOG_INFO("kvm: mapped slot=%u GPA=0x%" PRIx64 " size=0x%" PRIx64 " HVA=%p%s",
+             slot_id, gpa, size, hva, writable ? "" : " [RO]");
+    return true;
+}
+
+bool KvmVm::UnmapMemory(GPA /*gpa*/, uint64_t /*size*/) {
+    // Stub (warns, returns false): RAM is torn down with the process, so the
+    // current VM lifecycle never unmaps; doing so needs per-GPA slot tracking.
+    LOG_WARN("kvm: UnmapMemory not implemented");
+    return false;
+}
+
+std::unique_ptr<HypervisorVCpu> KvmVm::CreateVCpu(
+    uint32_t index, AddressSpace* addr_space) {
+    return KvmVCpu::Create(*this, index, addr_space);  // forwards; result may be null
+}
+
+void KvmVm::RequestInterrupt(const InterruptRequest& req) {
+    // Aarch64Machine::SetIrqLevel encodes SPIs as (hw_irq + 32), i.e. the
+    // architectural GIC INTID. KVM's in-kernel VGIC expects the absolute
+    // INTID (32..1019) as irq_id in the (type|vcpu|num) encoding, so we pass
+    // it through unchanged. SGIs/PPIs do not flow through RequestInterrupt
+    // in our codebase.
+    if (req.vector < 32 || req.vector > 1019) {
+        LOG_WARN("kvm: RequestInterrupt for out-of-range SPI vector %u ignored",
+                 req.vector);
+        return;
+    }
+    uint32_t encoded = (static_cast<uint32_t>(KVM_ARM_IRQ_TYPE_SPI) << 24) |
+                       (0u << 16) |
+                       (req.vector & 0xffffu);
+
+    struct kvm_irq_level il{};
+    il.irq = encoded;
+    il.level = req.level_triggered ? 1 : 0;  // NOTE(review): level_triggered is used as the line level — confirm
+    if (::ioctl(vm_fd_, KVM_IRQ_LINE, &il) < 0) {
+        LOG_WARN("kvm: KVM_IRQ_LINE(intid=%u level=%d) failed: %s",
+                 req.vector, (int)req.level_triggered, strerror(errno));
+    }
+}
+
+bool KvmVm::AssertIrq(uint32_t gsi, bool level) {
+    // gsi is the absolute architectural INTID (>= 32 for SPIs).
+    if (gsi < 32 || gsi > 1019) return false;  // not an SPI INTID: nothing sent
+    uint32_t encoded = (static_cast<uint32_t>(KVM_ARM_IRQ_TYPE_SPI) << 24) |
+                       (gsi & 0xffffu);
+
+    struct kvm_irq_level il{};
+    il.irq = encoded;
+    il.level = level ? 1 : 0;
+    if (::ioctl(vm_fd_, KVM_IRQ_LINE, &il) < 0) {
+        LOG_WARN("kvm: AssertIrq KVM_IRQ_LINE(intid=%u level=%d) failed: %s",
+                 gsi, (int)level, strerror(errno));
+    }
+    return true;  // also returned when KVM_IRQ_LINE failed (only logged above)
+}
+
+bool KvmVm::UpdateIrqRoutingLocked() {
+    // arm64 KVM has NO default GSI routing — we must install one entry per
+    // SPI we want to drive through KVM_IRQFD (or KVM_IRQ_LINE with routing).
+    // The whole table is rebuilt from routed_gsis_ and replaced in one ioctl.
+    const size_t count = routed_gsis_.size();
+    const size_t bytes = sizeof(struct kvm_irq_routing) +
+                         count * sizeof(struct kvm_irq_routing_entry);
+    std::vector<uint8_t> storage(bytes, 0);
+
+    auto* table = reinterpret_cast<struct kvm_irq_routing*>(storage.data());
+    table->nr = static_cast<uint32_t>(count);
+
+    auto* entry = reinterpret_cast<struct kvm_irq_routing_entry*>(
+        storage.data() + sizeof(struct kvm_irq_routing));
+    for (const uint32_t intid : routed_gsis_) {
+        entry->gsi = intid;
+        entry->type = KVM_IRQ_ROUTING_IRQCHIP;
+        entry->u.irqchip.irqchip = 0;         // only VGIC
+        entry->u.irqchip.pin = intid - 32;    // SPI pin is INTID - 32
+        ++entry;
+    }
+
+    if (::ioctl(vm_fd_, KVM_SET_GSI_ROUTING, table) < 0) {
+        LOG_WARN("kvm: KVM_SET_GSI_ROUTING(n=%zu) failed: %s",
+                 count, strerror(errno));
+        return false;
+    }
+    return true;
+}
+
+bool KvmVm::RegisterLevelIrqFd(uint32_t gsi, int trigger_fd, int resample_fd) {
+    // The arm64 GSI for KVM_IRQFD is the absolute SPI INTID (>= 32). A
+    // KVM_IRQ_ROUTING_IRQCHIP entry mapping gsi -> SPI pin must exist before
+    // KVM_IRQFD, or the kernel accepts the irqfd but never delivers (no route).
+    if (gsi < 32 || gsi > 1019 || trigger_fd < 0) return false;
+
+    // Roll back only a route this call added; never one that already existed.
+    bool added_route = false;
+    {
+        std::lock_guard<std::mutex> lock(irqfd_route_mutex_);
+        added_route = routed_gsis_.insert(gsi).second;
+        if (!UpdateIrqRoutingLocked()) {
+            if (added_route) routed_gsis_.erase(gsi);
+            return false;
+        }
+    }
+
+    struct kvm_irqfd ifd{};
+    ifd.fd = static_cast<uint32_t>(trigger_fd);
+    ifd.gsi = gsi;
+    if (resample_fd >= 0) {
+        ifd.flags = KVM_IRQFD_FLAG_RESAMPLE;
+        ifd.resamplefd = static_cast<uint32_t>(resample_fd);
+    }
+    if (::ioctl(vm_fd_, KVM_IRQFD, &ifd) < 0) {
+        LOG_WARN("kvm: KVM_IRQFD(gsi=%u trigger=%d resample=%d) failed: %s",
+                 gsi, trigger_fd, resample_fd, strerror(errno));
+        std::lock_guard<std::mutex> lock(irqfd_route_mutex_);
+        if (added_route) { routed_gsis_.erase(gsi); UpdateIrqRoutingLocked(); }
+        return false;
+    }
+    LOG_INFO("kvm: irqfd registered gsi=%u trigger=%d resample=%d",
+             gsi, trigger_fd, resample_fd);
+    return true;
+}
+
+bool KvmVm::UnregisterIrqFd(uint32_t gsi, int trigger_fd) {
+    if (trigger_fd < 0) return false;
+
+    struct kvm_irqfd deassign{};
+    deassign.fd = static_cast<uint32_t>(trigger_fd);
+    deassign.gsi = gsi;
+    deassign.flags = KVM_IRQFD_FLAG_DEASSIGN;
+    const bool deassigned = (0 == ::ioctl(vm_fd_, KVM_IRQFD, &deassign));
+    if (!deassigned) {
+        LOG_WARN("kvm: KVM_IRQFD DEASSIGN(gsi=%u trigger=%d) failed: %s",
+                 gsi, trigger_fd, strerror(errno));
+    }
+
+    std::lock_guard<std::mutex> guard(irqfd_route_mutex_);
+    routed_gsis_.erase(gsi);   // the route is dropped even if DEASSIGN failed
+    UpdateIrqRoutingLocked();  // best-effort; ignore result on teardown
+    return deassigned;
+}
+
+} // namespace kvm
diff --git a/src/platform/linux/hypervisor/aarch64/kvm_vm.h b/src/platform/linux/hypervisor/aarch64/kvm_vm.h
new file mode 100644
index 0000000..5c5e744
--- /dev/null
+++ b/src/platform/linux/hypervisor/aarch64/kvm_vm.h
@@ -0,0 +1,102 @@
+#pragma once
+
+#include "core/vmm/hypervisor_vm.h"
+#include <cstdint>
+#include <memory>
+#include <mutex>
+#include <set>
+
+namespace kvm {
+
+class KvmVCpu;
+
+// ARM64 KVM VM backend.
+// - Uses an in-kernel VGIC created via KVM_CREATE_DEVICE: VGICv3 when the
+//   host supports it, otherwise VGICv2 (see UsesGicV2()). GICv3 layout:
+//       GICD at 0x08000000 (64 KiB)
+//       GICR at 0x080A0000 (2 * 64 KiB per vCPU)
+// - Relies on in-kernel PSCI v0.2 for SYSTEM_OFF/RESET and secondary CPU
+//   startup; no userspace PSCI dispatch is needed.
+class KvmVm final : public HypervisorVm {
+public:
+    ~KvmVm() override;
+
+    static std::unique_ptr<KvmVm> Create(uint32_t cpu_count);
+
+    bool MapMemory(GPA gpa, void* hva, uint64_t size, bool writable) override;
+    bool UnmapMemory(GPA gpa, uint64_t size) override;
+
+    std::unique_ptr<HypervisorVCpu> CreateVCpu(
+        uint32_t index, AddressSpace* addr_space) override;
+
+    void RequestInterrupt(const InterruptRequest& req) override;
+
+    // KVM has an in-kernel VGIC: SPI IRQ lines go through KVM_IRQ_LINE.
+    bool AssertIrq(uint32_t gsi, bool level) override;
+
+    // Register / unregister a KVM_IRQFD for a level-triggered SPI.
+    // gsi is the absolute INTID (>= 32). resample_fd may be -1 to fall
+    // back to edge semantics, but virtio-mmio requires a resample fd.
+    bool RegisterLevelIrqFd(uint32_t gsi, int trigger_fd, int resample_fd) override;
+    bool UnregisterIrqFd(uint32_t gsi, int trigger_fd) override;
+
+    void SetGuestMemMap(const GuestMemMap* mem) override { guest_mem_ = mem; }
+
+    // Issue KVM_DEV_ARM_VGIC_CTRL_INIT on the in-kernel VGIC. Call it only
+    // after *all* vCPUs have been created via KVM_CREATE_VCPU (KVM rejects
+    // INIT otherwise). Idempotent and mutex-guarded: once one call has
+    // succeeded, later calls return true without touching KVM again.
+    bool FinalizeVgicInit();
+
+    int VmFd() const { return vm_fd_; }
+    int KvmFd() const { return kvm_fd_; }
+    size_t VcpuMmapSize() const { return vcpu_mmap_size_; }
+    uint32_t CpuCount() const { return cpu_count_; }
+
+    // True when the fallback VGICv2 path was used (host GIC is v2 and the
+    // kernel doesn't emulate v3 on top of it — common on Raspberry Pi 5 with
+    // GIC-400). The machine model needs this to pick the right FDT compat.
+    bool UsesGicV2() const { return uses_gic_v2_; }
+
+    // GIC layout (shared between v2 and v3 wherever possible).
+    static constexpr uint64_t kGicDistBase    = 0x08000000ULL;
+    static constexpr uint64_t kGicDistSize    = 0x00010000ULL;  // 64 KiB
+    // GICv3: redistributor region (2 * 64 KiB per vCPU).
+    static constexpr uint64_t kGicRedistBase  = 0x080A0000ULL;
+    static constexpr uint64_t kGicRedistStride = 0x00020000ULL;
+    // GICv2 CPU interface (directly above the 64 KiB distributor window).
+    static constexpr uint64_t kGicV2CpuBase   = 0x08010000ULL;
+    static constexpr uint64_t kGicV2CpuSize   = 0x00010000ULL;
+
+private:
+    KvmVm() = default;
+
+    bool CreateInKernelVgic();
+    bool TryCreateVgicV3();
+    bool TryCreateVgicV2();
+
+    bool UpdateIrqRoutingLocked();
+
+    int kvm_fd_ = -1;
+    int vm_fd_ = -1;
+    int vgic_fd_ = -1;
+    bool vgic_initialized_ = false;
+    bool uses_gic_v2_ = false;
+    uint32_t cpu_count_ = 0;
+    size_t vcpu_mmap_size_ = 0;
+    std::mutex vgic_init_mutex_;
+
+    // GSIs (absolute SPI INTIDs) with an active irqfd. KVM requires us to
+    // program explicit GSI routing on arm64 — there is no default routing
+    // installed by VGIC creation. We rewrite the full route table every time
+    // a slot is added/removed under irqfd_route_mutex_.
+    std::mutex irqfd_route_mutex_;
+    std::set<uint32_t> routed_gsis_;
+
+    const GuestMemMap* guest_mem_ = nullptr;
+
+    std::mutex slot_mutex_;
+    uint32_t next_slot_ = 0;
+};
+
+} // namespace kvm
diff --git a/src/platform/linux/hypervisor/x86_64/kvm_vm.cpp b/src/platform/linux/hypervisor/x86_64/kvm_vm.cpp
index 6f9996a..fa7ff44 100644
--- a/src/platform/linux/hypervisor/x86_64/kvm_vm.cpp
+++ b/src/platform/linux/hypervisor/x86_64/kvm_vm.cpp
@@ -130,4 +130,42 @@ bool KvmVm::AssertIrq(uint32_t gsi, bool level) {
     return true;
 }
 
+bool KvmVm::RegisterLevelIrqFd(uint32_t gsi, int trigger_fd, int resample_fd) {
+    // On x86 KVM, the default GSI routing created alongside KVM_CREATE_IRQCHIP
+    // maps GSI 0..23 onto IOAPIC pins, so no explicit KVM_SET_GSI_ROUTING is
+    // required for level-triggered virtio-mmio lines.
+    if (trigger_fd < 0) return false;
+
+    struct kvm_irqfd request{};
+    request.fd = static_cast<uint32_t>(trigger_fd);
+    request.gsi = gsi;
+    if (resample_fd >= 0) {
+        request.flags = KVM_IRQFD_FLAG_RESAMPLE;
+        request.resamplefd = static_cast<uint32_t>(resample_fd);
+    }
+    if (::ioctl(vm_fd_, KVM_IRQFD, &request) >= 0) {
+        LOG_INFO("kvm: irqfd registered gsi=%u trigger=%d resample=%d",
+                 gsi, trigger_fd, resample_fd);
+        return true;
+    }
+    LOG_WARN("kvm: KVM_IRQFD(gsi=%u trigger=%d resample=%d) failed: %s",
+             gsi, trigger_fd, resample_fd, strerror(errno));
+    return false;
+}
+
+bool KvmVm::UnregisterIrqFd(uint32_t gsi, int trigger_fd) {
+    if (trigger_fd < 0) return false;
+
+    // Only fd, gsi and the DEASSIGN flag are set; everything else stays zero.
+    struct kvm_irqfd request{};
+    request.fd = static_cast<uint32_t>(trigger_fd);
+    request.gsi = gsi;
+    request.flags = KVM_IRQFD_FLAG_DEASSIGN;
+    if (::ioctl(vm_fd_, KVM_IRQFD, &request) >= 0) return true;
+
+    LOG_WARN("kvm: KVM_IRQFD DEASSIGN(gsi=%u trigger=%d) failed: %s",
+             gsi, trigger_fd, strerror(errno));
+    return false;
+}
+
 } // namespace kvm
diff --git a/src/platform/linux/hypervisor/x86_64/kvm_vm.h b/src/platform/linux/hypervisor/x86_64/kvm_vm.h
index 042f17b..d276215 100644
--- a/src/platform/linux/hypervisor/x86_64/kvm_vm.h
+++ b/src/platform/linux/hypervisor/x86_64/kvm_vm.h
@@ -27,6 +27,10 @@ class KvmVm final : public HypervisorVm {
     // KVM has an in-kernel irqchip: IRQ lines go through KVM_IRQ_LINE.
     bool AssertIrq(uint32_t gsi, bool level) override;
 
+    // Register / unregister a KVM_IRQFD for a level-triggered GSI (IOAPIC pin).
+    bool RegisterLevelIrqFd(uint32_t gsi, int trigger_fd, int resample_fd) override;
+    bool UnregisterIrqFd(uint32_t gsi, int trigger_fd) override;
+
     void SetGuestMemMap(const GuestMemMap* mem) override { guest_mem_ = mem; }
 
     int VmFd() const { return vm_fd_; }
diff --git a/src/platform/linux/vm_platform_linux.cpp b/src/platform/linux/vm_platform_linux.cpp
index 38f72b3..6f7f1d1 100644
--- a/src/platform/linux/vm_platform_linux.cpp
+++ b/src/platform/linux/vm_platform_linux.cpp
@@ -1,6 +1,12 @@
 #include "core/vmm/vm_platform.h"
 #include "platform/linux/hypervisor/kvm_platform.h"
+#if defined(__x86_64__)
 #include "platform/linux/hypervisor/x86_64/kvm_vm.h"
+#elif defined(__aarch64__)
+#include "platform/linux/hypervisor/aarch64/kvm_vm.h"
+#else
+#error "Unsupported Linux architecture for KVM backend"
+#endif
 #include "platform/posix/console/posix_console_port.h"
 
 #include <sched.h>
@@ -20,6 +26,14 @@ uint8_t* VmPlatform::AllocateRam(uint64_t size) {
                        PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
     if (ptr == MAP_FAILED) return nullptr;
+
+    // Hint the kernel to back guest RAM with 2 MiB transparent huge pages.
+    // Only a hint — kernel silently falls back to 4 KiB pages if the mapping
+    // edges aren't 2 MiB aligned or if contiguous memory is unavailable, so
+    // there's no failure path to handle. Reduces stage-2 TLB pressure on
+    // arm64 and x86 alike.
+    ::madvise(ptr, size, MADV_HUGEPAGE);
+
     return static_cast<uint8_t*>(ptr);
 }