diff --git a/.github/workflows/pr-check.yml b/.github/workflows/pr-check.yml index 3a39cd7..ede193b 100644 --- a/.github/workflows/pr-check.yml +++ b/.github/workflows/pr-check.yml @@ -22,7 +22,12 @@ jobs: - os: ubuntu-latest configure_args: -DCMAKE_BUILD_TYPE=Release build_args: --parallel - artifact_name: tenbox-build-linux + artifact_name: tenbox-build-linux-x64 + artifact_path: build/tenbox-vm-runtime + - os: ubuntu-24.04-arm + configure_args: -DCMAKE_BUILD_TYPE=Release + build_args: --parallel + artifact_name: tenbox-build-linux-arm64 artifact_path: build/tenbox-vm-runtime steps: - uses: actions/checkout@v4 diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 75fd789..8f141a9 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -1,6 +1,8 @@ # Common sources shared by all platforms set(TENBOX_CORE_SOURCES ${CMAKE_SOURCE_DIR}/src/core/vmm/vm.cpp + ${CMAKE_SOURCE_DIR}/src/core/vmm/vm_io_loop.cpp + ${CMAKE_SOURCE_DIR}/src/core/vmm/console_tx_batcher.cpp ${CMAKE_SOURCE_DIR}/src/core/vmm/address_space.cpp ${CMAKE_SOURCE_DIR}/src/core/device/virtio/virtqueue.cpp ${CMAKE_SOURCE_DIR}/src/core/device/virtio/virtio_mmio.cpp diff --git a/src/core/arch/aarch64/aarch64_machine.cpp b/src/core/arch/aarch64/aarch64_machine.cpp index 265297f..a6f18d7 100644 --- a/src/core/arch/aarch64/aarch64_machine.cpp +++ b/src/core/arch/aarch64/aarch64_machine.cpp @@ -10,6 +10,9 @@ #ifdef __APPLE__ #include "platform/macos/hypervisor/aarch64/hvf_vcpu.h" #include "platform/macos/hypervisor/aarch64/hvf_vm.h" +#elif defined(__linux__) && defined(__aarch64__) +#include "platform/linux/hypervisor/aarch64/kvm_vcpu.h" +#include "platform/linux/hypervisor/aarch64/kvm_vm.h" #endif bool Aarch64Machine::SetupPlatformDevices( @@ -17,6 +20,7 @@ bool Aarch64Machine::SetupPlatformDevices( GuestMemMap& /*mem*/, HypervisorVm* hv_vm, std::shared_ptr console_port, + VmIoLoop* io_loop, std::function shutdown_cb, std::function reboot_cb) { @@ -30,9 +34,15 @@ bool 
Aarch64Machine::SetupPlatformDevices( uart_.SetIrqLevelCallback([this](bool asserted) { SetIrqLevel(hv_vm_, kUartIrq, asserted); }); - uart_.SetTxCallback([console_port](uint8_t byte) { - if (!console_port) return; - console_port->Write(&byte, 1); + // Thread the per-byte UART stream through a batcher so the downstream + // ConsolePort sees larger chunks instead of N * 1-byte writes. + tx_batcher_ = std::make_unique( + [console_port](const uint8_t* data, size_t size) { + if (console_port) console_port->Write(data, size); + }); + tx_batcher_->AttachIoLoop(io_loop); + uart_.SetTxCallback([this](uint8_t byte) { + tx_batcher_->Append(&byte, 1); }); addr_space.AddMmioDevice(kUartBase, Pl011::kMmioSize, &uart_); @@ -206,20 +216,21 @@ bool Aarch64Machine::LoadKernel( fdt.AddPropertyString("device_type", "cpu"); fdt.AddPropertyString("compatible", "arm,arm-v8"); fdt.AddPropertyU32("reg", i); - if (config.cpu_count > 1) { - fdt.AddPropertyString("enable-method", "psci"); - } + // PSCI is always available (in-kernel PSCI on KVM, userspace + // emulation on HVF) so every CPU — including a single-core config — + // uses "psci" as its enable-method. This also lets the guest use + // PSCI SYSTEM_OFF / SYSTEM_RESET for shutdown/reboot. + fdt.AddPropertyString("enable-method", "psci"); fdt.EndNode(); } fdt.EndNode(); - // PSCI node (for multi-core) - if (config.cpu_count > 1) { - fdt.BeginNode("psci"); - fdt.AddPropertyString("compatible", "arm,psci-1.0"); - fdt.AddPropertyString("method", "hvc"); - fdt.EndNode(); - } + // PSCI node (always present so the guest can issue SYSTEM_OFF / + // SYSTEM_RESET, even in single-CPU configurations). 
+ fdt.BeginNode("psci"); + fdt.AddPropertyString("compatible", "arm,psci-1.0"); + fdt.AddPropertyString("method", "hvc"); + fdt.EndNode(); // /timer (ARM generic timer) fdt.BeginNode("timer"); @@ -235,10 +246,13 @@ bool Aarch64Machine::LoadKernel( fdt.AddPropertyEmpty("always-on"); fdt.EndNode(); - // /intc (GICv3) - // Use actual redistributor addresses from the hypervisor + // /intc — GICv3 by default, with a GICv2 fallback for hosts where the + // in-kernel VGICv3 is unavailable (e.g. Raspberry Pi 5 with GIC-400). GPA actual_redist_base = kGicRedistBase; uint32_t redist_total_size = static_cast(config.cpu_count * 0x20000); + bool use_gic_v2 = false; + GPA gic_v2_cpu_base = 0x08010000ULL; + uint32_t gic_v2_cpu_size = 0x10000; #ifdef __APPLE__ if (hv_vm_) { auto* hvf = dynamic_cast(hv_vm_); @@ -247,24 +261,48 @@ bool Aarch64Machine::LoadKernel( redist_total_size = static_cast(hvf->GetRedistSizePerCpu()) * config.cpu_count; } } +#elif defined(__linux__) && defined(__aarch64__) + if (hv_vm_) { + auto* kvm_vm = dynamic_cast(hv_vm_); + if (kvm_vm && kvm_vm->UsesGicV2()) { + use_gic_v2 = true; + gic_v2_cpu_base = kvm::KvmVm::kGicV2CpuBase; + gic_v2_cpu_size = static_cast(kvm::KvmVm::kGicV2CpuSize); + } + } #endif char gic_name[64]; snprintf(gic_name, sizeof(gic_name), "intc@%" PRIx64, (uint64_t)kGicDistBase); fdt.BeginNode(gic_name); - fdt.AddPropertyString("compatible", "arm,gic-v3"); + if (use_gic_v2) { + fdt.AddPropertyString("compatible", "arm,cortex-a15-gic"); + } else { + fdt.AddPropertyString("compatible", "arm,gic-v3"); + } fdt.AddPropertyU32("#interrupt-cells", 3); fdt.AddPropertyEmpty("interrupt-controller"); fdt.AddPropertyU32("phandle", gic_phandle); - fdt.AddPropertyCells("reg", { - static_cast(kGicDistBase >> 32), - static_cast(kGicDistBase & 0xFFFFFFFF), - 0, 0x10000, // Distributor: 64 KiB - static_cast(actual_redist_base >> 32), - static_cast(actual_redist_base & 0xFFFFFFFF), - 0, redist_total_size, - }); + if (use_gic_v2) { + 
fdt.AddPropertyCells("reg", { + static_cast(kGicDistBase >> 32), + static_cast(kGicDistBase & 0xFFFFFFFF), + 0, 0x10000, // Distributor: 64 KiB (v2 only uses first 4 KiB) + static_cast(gic_v2_cpu_base >> 32), + static_cast(gic_v2_cpu_base & 0xFFFFFFFF), + 0, gic_v2_cpu_size, // CPU interface (GICC) + }); + } else { + fdt.AddPropertyCells("reg", { + static_cast(kGicDistBase >> 32), + static_cast(kGicDistBase & 0xFFFFFFFF), + 0, 0x10000, // Distributor: 64 KiB + static_cast(actual_redist_base >> 32), + static_cast(actual_redist_base & 0xFFFFFFFF), + 0, redist_total_size, + }); + } fdt.EndNode(); // Fixed clock for AMBA peripherals (PL011 requires clocks property) @@ -380,9 +418,16 @@ bool Aarch64Machine::SetupBootVCpu(HypervisorVCpu* vcpu, uint8_t* /*ram*/) { return false; } return hvf_vcpu->SetupAarch64Boot(kernel_entry_, fdt_gpa_); +#elif defined(__linux__) && defined(__aarch64__) + auto* kvm_vcpu = dynamic_cast(vcpu); + if (!kvm_vcpu) { + LOG_ERROR("aarch64: SetupBootVCpu requires KvmVCpu on Linux"); + return false; + } + return kvm_vcpu->SetupAarch64Boot(kernel_entry_, fdt_gpa_); #else (void)vcpu; - LOG_ERROR("aarch64: SetupBootVCpu called on non-Apple platform"); + LOG_ERROR("aarch64: SetupBootVCpu called on unsupported platform"); return false; #endif } diff --git a/src/core/arch/aarch64/aarch64_machine.h b/src/core/arch/aarch64/aarch64_machine.h index a410752..ef85e73 100644 --- a/src/core/arch/aarch64/aarch64_machine.h +++ b/src/core/arch/aarch64/aarch64_machine.h @@ -1,10 +1,13 @@ #pragma once #include "core/vmm/machine_model.h" +#include "core/vmm/console_tx_batcher.h" #include "core/arch/aarch64/pl011.h" #include "core/arch/aarch64/boot.h" #include "core/device/rtc/pl031_rtc.h" +#include + // ARM64 virt machine model (Apple Hypervisor.framework). // Uses GICv3, PL011 UART, FDT boot, and VirtIO MMIO. 
class Aarch64Machine final : public MachineModel { @@ -16,6 +19,7 @@ class Aarch64Machine final : public MachineModel { GuestMemMap& mem, HypervisorVm* hv_vm, std::shared_ptr console_port, + VmIoLoop* io_loop, std::function shutdown_cb, std::function reboot_cb) override; @@ -42,6 +46,10 @@ class Aarch64Machine final : public MachineModel { private: Pl011 uart_; Pl031Rtc rtc_; + // Coalesces per-byte UART tx writes into larger chunks before they + // reach the ConsolePort. unique_ptr so the object is created only + // once SetupPlatformDevices captures the downstream writer. + std::unique_ptr tx_batcher_; GPA kernel_entry_ = 0; GPA fdt_gpa_ = 0; diff --git a/src/core/arch/x86_64/x86_machine.cpp b/src/core/arch/x86_64/x86_machine.cpp index 1849d3a..0e3dc04 100644 --- a/src/core/arch/x86_64/x86_machine.cpp +++ b/src/core/arch/x86_64/x86_machine.cpp @@ -21,6 +21,7 @@ bool X86Machine::SetupPlatformDevices( GuestMemMap& /*mem*/, HypervisorVm* hv_vm, std::shared_ptr console_port, + VmIoLoop* io_loop, std::function shutdown_cb, std::function reboot_cb) { @@ -29,9 +30,15 @@ bool X86Machine::SetupPlatformDevices( }; uart_.SetIrqCallback([this]() { irq_injector_(4); }); - uart_.SetTxCallback([console_port](uint8_t byte) { - if (!console_port) return; - console_port->Write(&byte, 1); + // Thread the per-byte UART stream through a batcher so the downstream + // ConsolePort sees larger chunks instead of N * 1-byte writes. 
+ tx_batcher_ = std::make_unique( + [console_port](const uint8_t* data, size_t size) { + if (console_port) console_port->Write(data, size); + }); + tx_batcher_->AttachIoLoop(io_loop); + uart_.SetTxCallback([this](uint8_t byte) { + tx_batcher_->Append(&byte, 1); }); addr_space.AddPioDevice( Uart16550::kCom1Base, Uart16550::kRegCount, &uart_); diff --git a/src/core/arch/x86_64/x86_machine.h b/src/core/arch/x86_64/x86_machine.h index c11d818..dd15ed6 100644 --- a/src/core/arch/x86_64/x86_machine.h +++ b/src/core/arch/x86_64/x86_machine.h @@ -1,7 +1,10 @@ #pragma once #include "core/vmm/machine_model.h" +#include "core/vmm/console_tx_batcher.h" #include "core/device/serial/uart_16550.h" + +#include #include "core/device/timer/i8254_pit.h" #include "core/device/rtc/cmos_rtc.h" #include "core/device/irq/ioapic.h" @@ -22,6 +25,7 @@ class X86Machine final : public MachineModel { GuestMemMap& mem, HypervisorVm* hv_vm, std::shared_ptr console_port, + VmIoLoop* io_loop, std::function shutdown_cb, std::function reboot_cb) override; @@ -57,6 +61,8 @@ class X86Machine final : public MachineModel { private: Uart16550 uart_; + // Coalesces per-byte UART tx writes before they reach the ConsolePort. 
+ std::unique_ptr tx_batcher_; I8254Pit pit_; SystemControlB sys_ctrl_b_; CmosRtc rtc_; diff --git a/src/core/device/virtio/virtio_mmio.cpp b/src/core/device/virtio/virtio_mmio.cpp index 07a027d..8c522e0 100644 --- a/src/core/device/virtio/virtio_mmio.cpp +++ b/src/core/device/virtio/virtio_mmio.cpp @@ -1,8 +1,28 @@ #include "core/device/virtio/virtio_mmio.h" +#if defined(__linux__) || defined(__APPLE__) +#include // write() for eventfd in IRQFD mode +#endif + static constexpr uint64_t VIRTIO_RING_F_INDIRECT_DESC = (1ULL << 28); static constexpr uint64_t VIRTIO_F_EVENT_IDX = (1ULL << 29); +namespace { + +inline void SignalIrqEventFd(int fd) { +#if defined(__linux__) || defined(__APPLE__) + uint64_t one = 1; + // EFD_NONBLOCK fds may return EAGAIN if the counter saturates (2^64-2 + // accumulated unhandled writes) — impossible in practice and harmless. + // Ignore the return value: the only interesting failure would be EBADF. + (void)::write(fd, &one, sizeof(one)); +#else + (void)fd; +#endif +} + +} // namespace + void VirtioMmioDevice::Init(VirtioDeviceOps* ops, const GuestMemMap& mem) { ops_ = ops; mem_ = mem; @@ -149,7 +169,10 @@ void VirtioMmioDevice::MmioWrite(uint64_t offset, uint8_t size, break; case kInterruptACK: { uint32_t prev = interrupt_status_.fetch_and(~val, std::memory_order_acq_rel); - if ((prev & ~val) == 0 && irq_level_callback_) { + // In IRQFD mode, deassert is handled by the in-kernel irqchip via + // the EOI + resample path — do not fire the level callback here + // (that would race with the kernel and double-toggle the line). 
+ if (irq_eventfd_ < 0 && (prev & ~val) == 0 && irq_level_callback_) { irq_level_callback_(false); } break; @@ -221,6 +244,10 @@ void VirtioMmioDevice::NotifyUsedBuffer(int queue_idx) { } interrupt_status_.fetch_or(1, std::memory_order_release); // VIRTIO_MMIO_INT_VRING + if (irq_eventfd_ >= 0) { + SignalIrqEventFd(irq_eventfd_); + return; + } if (irq_level_callback_) irq_level_callback_(true); else if (irq_callback_) @@ -230,6 +257,10 @@ void VirtioMmioDevice::NotifyUsedBuffer(int queue_idx) { void VirtioMmioDevice::NotifyConfigChange() { config_generation_++; interrupt_status_.fetch_or(2, std::memory_order_release); // VIRTIO_MMIO_INT_CONFIG + if (irq_eventfd_ >= 0) { + SignalIrqEventFd(irq_eventfd_); + return; + } if (irq_level_callback_) irq_level_callback_(true); else if (irq_callback_) diff --git a/src/core/device/virtio/virtio_mmio.h b/src/core/device/virtio/virtio_mmio.h index 77ce5ab..4ef39a1 100644 --- a/src/core/device/virtio/virtio_mmio.h +++ b/src/core/device/virtio/virtio_mmio.h @@ -36,6 +36,22 @@ class VirtioMmioDevice : public Device { void SetIrqCallback(IrqCallback cb) { irq_callback_ = std::move(cb); } void SetIrqLevelCallback(IrqLevelCallback cb) { irq_level_callback_ = std::move(cb); } + // Switch the device to IRQFD mode: instead of invoking the callbacks on + // every notify, write a single 64-bit value to irq_eventfd, letting the + // hypervisor's in-kernel irqchip assert the line directly. In this mode + // the explicit deassert on InterruptACK is skipped as well — deassertion + // is handled by the irqchip EOI + resample path. + // + // Ownership of the fd stays with the caller; it must outlive this device. + void SetIrqEventFd(int fd) { irq_eventfd_ = fd; } + + // Snapshot of the internal interrupt_status register. Used by the irqfd + // resample poller to decide whether the device still has a pending + // condition and needs to be re-asserted. 
+ uint32_t GetInterruptStatus() const { + return interrupt_status_.load(std::memory_order_acquire); + } + void MmioRead(uint64_t offset, uint8_t size, uint64_t* value) override; void MmioWrite(uint64_t offset, uint8_t size, uint64_t value) override; @@ -91,6 +107,7 @@ class VirtioMmioDevice : public Device { GuestMemMap mem_; IrqCallback irq_callback_; IrqLevelCallback irq_level_callback_; + int irq_eventfd_ = -1; // IRQFD mode: write to assert; -1 disables. // Transport state uint32_t status_ = 0; diff --git a/src/core/device/virtio/virtio_snd.cpp b/src/core/device/virtio/virtio_snd.cpp index 2fe3bbe..a2a0ba7 100644 --- a/src/core/device/virtio/virtio_snd.cpp +++ b/src/core/device/virtio/virtio_snd.cpp @@ -1,5 +1,6 @@ #include "core/device/virtio/virtio_snd.h" #include "core/vmm/types.h" +#include "core/vmm/vm_io_loop.h" #include #include #include @@ -366,8 +367,16 @@ void VirtioSndDevice::HandleChmapInfo(const VirtioSndQueryInfo* query, void VirtioSndDevice::StartPeriodTimer() { StopPeriodTimer(); - period_running_ = true; - period_thread_ = std::thread(&VirtioSndDevice::PeriodTimerThread, this); + if (!io_loop_) return; // no loop => no pacing (dev effectively silent) + period_start_time_ = std::chrono::steady_clock::now(); + period_bytes_processed_ = 0; + period_running_.store(true); + period_timer_id_ = io_loop_->AddTimer(0, [this]() -> uint64_t { + if (!period_running_.load()) return 0; // self-destruct + uint64_t next_ms = PeriodTick(); + if (!period_running_.load()) return 0; + return next_ms ? 
next_ms : 1; // never return 0 while running + }); } void VirtioSndDevice::FlushPendingTxBuffers() { @@ -392,106 +401,85 @@ void VirtioSndDevice::FlushPendingTxBuffers() { } void VirtioSndDevice::StopPeriodTimer() { - if (period_running_) { - period_running_ = false; - period_cv_.notify_all(); - if (period_thread_.joinable()) { - period_thread_.join(); - } + if (!period_running_.exchange(false)) return; + if (io_loop_ && period_timer_id_) { + io_loop_->RemoveTimer(period_timer_id_); + period_timer_id_ = 0; } } -void VirtioSndDevice::PeriodTimerThread() { - auto start_time = std::chrono::steady_clock::now(); - uint64_t bytes_processed = 0; // Track audio position in bytes +uint64_t VirtioSndDevice::PeriodTick() { + // Get current stream parameters + uint32_t sample_rate, period_bytes; + uint8_t channels; + { + std::lock_guard lock(period_mutex_); + sample_rate = pcm_sample_rate_; + period_bytes = pcm_period_bytes_; + channels = pcm_channels_; + } - while (period_running_) { - // Get current stream parameters - uint32_t sample_rate, period_bytes; - uint8_t channels; - { - std::lock_guard lock(period_mutex_); - sample_rate = pcm_sample_rate_; - period_bytes = pcm_period_bytes_; - channels = pcm_channels_; - } + if (sample_rate == 0 || period_bytes == 0 || channels == 0) { + return 10; // stream not yet set up; retry later + } - if (sample_rate == 0 || period_bytes == 0 || channels == 0) { - std::unique_lock lock(period_mutex_); - period_cv_.wait_for(lock, std::chrono::milliseconds(10), - [this]() { return !period_running_.load(); }); - continue; - } + uint32_t bytes_per_second = sample_rate * channels * 2; // S16 - uint32_t bytes_per_second = sample_rate * channels * 2; // S16 - - // Calculate timing: how far ahead/behind are we? 
- auto now = std::chrono::steady_clock::now(); - auto elapsed_ms = std::chrono::duration_cast(now - start_time).count(); - int64_t audio_ms = static_cast(bytes_processed) * 1000 / bytes_per_second; - int64_t drift_ms = audio_ms - elapsed_ms; // positive = ahead, negative = behind - - // If we're behind, process buffers immediately - // If we're ahead, sleep until we need to process - if (drift_ms > 0) { - // We're ahead of real-time, sleep a bit - int64_t sleep_ms = (std::min)(drift_ms, (int64_t)10); - std::unique_lock lock(period_mutex_); - period_cv_.wait_for(lock, std::chrono::milliseconds(sleep_ms), - [this]() { return !period_running_.load(); }); - continue; - } + auto now = std::chrono::steady_clock::now(); + auto elapsed_ms = std::chrono::duration_cast( + now - period_start_time_).count(); + int64_t audio_ms = static_cast(period_bytes_processed_) * 1000 / + bytes_per_second; + int64_t drift_ms = audio_ms - elapsed_ms; // +ahead / -behind - // If we're way behind (> 200ms), reset timing - if (drift_ms < -200) { - start_time = std::chrono::steady_clock::now(); - bytes_processed = 0; - continue; - } + if (drift_ms > 0) { + // Ahead of real time — wait until we need more samples. + return static_cast((std::min)(drift_ms, (int64_t)10)); + } - // Process one buffer - PendingTxBuffer buf{}; - bool have_buf = false; - { - std::lock_guard lock(tx_mutex_); - if (!pending_tx_buffers_.empty()) { - buf = std::move(pending_tx_buffers_.front()); - pending_tx_buffers_.pop_front(); - have_buf = true; - } - } + if (drift_ms < -200) { + // Way behind (suspend/resume?); resync the clock instead of + // burning through every queued buffer. 
+ period_start_time_ = std::chrono::steady_clock::now(); + period_bytes_processed_ = 0; + return 1; + } - if (!have_buf) { - // No buffers available, wait briefly - std::unique_lock lock(period_mutex_); - period_cv_.wait_for(lock, std::chrono::milliseconds(1), - [this]() { return !period_running_.load(); }); - continue; + PendingTxBuffer buf{}; + bool have_buf = false; + { + std::lock_guard lock(tx_mutex_); + if (!pending_tx_buffers_.empty()) { + buf = std::move(pending_tx_buffers_.front()); + pending_tx_buffers_.pop_front(); + have_buf = true; } + } - // Send PCM data to manager - size_t pcm_bytes = 0; - if (!buf.pcm_data.empty() && audio_port_) { - AudioChunk chunk; - chunk.sample_rate = sample_rate; - chunk.channels = channels; - pcm_bytes = buf.pcm_data.size() * sizeof(int16_t); - chunk.pcm = std::move(buf.pcm_data); - audio_port_->SubmitPcm(std::move(chunk)); - } + if (!have_buf) { + return 1; // spin gently until the guest queues more data + } - // Track audio position - bytes_processed += (pcm_bytes > 0) ? pcm_bytes : period_bytes; + size_t pcm_bytes = 0; + if (!buf.pcm_data.empty() && audio_port_) { + AudioChunk chunk; + chunk.sample_rate = sample_rate; + chunk.channels = channels; + pcm_bytes = buf.pcm_data.size() * sizeof(int16_t); + chunk.pcm = std::move(buf.pcm_data); + audio_port_->SubmitPcm(std::move(chunk)); + } - // Return buffer to guest - if (mmio_) { - auto* txq = mmio_->GetQueue(VIRTIO_SND_VQ_TX); - if (txq) { - txq->PushUsed(buf.head, buf.status_len); - mmio_->NotifyUsedBuffer(VIRTIO_SND_VQ_TX); - } + period_bytes_processed_ += (pcm_bytes > 0) ? 
pcm_bytes : period_bytes; + + if (mmio_) { + auto* txq = mmio_->GetQueue(VIRTIO_SND_VQ_TX); + if (txq) { + txq->PushUsed(buf.head, buf.status_len); + mmio_->NotifyUsedBuffer(VIRTIO_SND_VQ_TX); } } + return 1; // immediately try the next buffer; drift calc paces us } uint32_t VirtioSndDevice::RateEnumToHz(uint8_t rate_enum) { diff --git a/src/core/device/virtio/virtio_snd.h b/src/core/device/virtio/virtio_snd.h index dd63be6..936d57c 100644 --- a/src/core/device/virtio/virtio_snd.h +++ b/src/core/device/virtio/virtio_snd.h @@ -3,14 +3,15 @@ #include "common/ports.h" #include "core/device/virtio/virtio_mmio.h" #include -#include +#include #include #include #include #include -#include #include +class VmIoLoop; + // virtio-snd device ID (spec 5.14) constexpr uint32_t VIRTIO_SND_DEVICE_ID = 25; @@ -149,6 +150,10 @@ class VirtioSndDevice : public VirtioDeviceOps { void SetMmioDevice(VirtioMmioDevice* mmio) { mmio_ = mmio; } void SetMemMap(const GuestMemMap& mem) { mem_ = mem; } void SetAudioPort(std::shared_ptr port) { audio_port_ = std::move(port); } + // The io_loop hosts our period timer. Must be set before the guest + // starts a stream; a nullptr falls back to "no audio pacing" (playback + // effectively stalls, matching a stream-less config). + void SetIoLoop(VmIoLoop* loop) { io_loop_ = loop; } uint32_t GetDeviceId() const override { return VIRTIO_SND_DEVICE_ID; } uint64_t GetDeviceFeatures() const override; @@ -173,7 +178,9 @@ class VirtioSndDevice : public VirtioDeviceOps { void HandleChmapInfo(const VirtioSndQueryInfo* query, uint8_t* resp, uint32_t* resp_len); - void PeriodTimerThread(); + // One tick of the period-driven playback loop. Runs on io_loop_'s + // thread; returns the delay (ms) until the next tick. 
+ uint64_t PeriodTick(); void StartPeriodTimer(); void StopPeriodTimer(); void FlushPendingTxBuffers(); @@ -197,11 +204,15 @@ class VirtioSndDevice : public VirtioDeviceOps { uint32_t pcm_buffer_bytes_ = 0; uint32_t pcm_period_bytes_ = 0; - // Period timer: releases TX buffers at real audio rate to throttle guest - std::thread period_thread_; + // Period timer: releases TX buffers at real audio rate to throttle guest. + // The timer itself is owned by io_loop_; we only keep the id and state + // used by PeriodTick. + VmIoLoop* io_loop_ = nullptr; std::mutex period_mutex_; - std::condition_variable period_cv_; std::atomic period_running_{false}; + uint64_t period_timer_id_ = 0; + std::chrono::steady_clock::time_point period_start_time_{}; + uint64_t period_bytes_processed_ = 0; // Pending TX buffers waiting to be returned to guest struct PendingTxBuffer { diff --git a/src/core/vmm/console_tx_batcher.cpp b/src/core/vmm/console_tx_batcher.cpp new file mode 100644 index 0000000..6b4e403 --- /dev/null +++ b/src/core/vmm/console_tx_batcher.cpp @@ -0,0 +1,75 @@ +#include "core/vmm/console_tx_batcher.h" + +#include "core/vmm/vm_io_loop.h" + +#include + +ConsoleTxBatcher::ConsoleTxBatcher(RawWriter writer) : writer_(std::move(writer)) {} + +ConsoleTxBatcher::~ConsoleTxBatcher() { + // By contract the io_loop has already been stopped by whoever owns us + // (Vm::~Vm), so any armed timer has been closed and its capture of + // `this` released. We just need to drain whatever the guest wrote + // after the last timer flush so tail-end console output isn't lost. + Flush(); +} + +void ConsoleTxBatcher::AttachIoLoop(VmIoLoop* loop) { + std::lock_guard lock(mu_); + io_loop_ = loop; +} + +void ConsoleTxBatcher::Append(const uint8_t* data, size_t size) { + if (!data || size == 0) return; + + std::unique_lock lock(mu_); + + // If no loop is attached yet, or it's been torn down, we can't + // schedule a delayed flush. 
Drain any previously-buffered bytes + // first (preserve order) and write this chunk through synchronously. + if (!io_loop_ || !io_loop_->running()) { + if (!buf_.empty()) FlushLocked(lock); + RawWriter w = writer_; + lock.unlock(); + if (w) w(data, size); + return; + } + + buf_.append(reinterpret_cast(data), size); + + if (buf_.size() >= kFlushThreshold) { + FlushLocked(lock); + return; + } + + if (!timer_armed_) { + timer_armed_ = true; + timer_id_ = io_loop_->AddTimer(kFlushDelayMs, [this]() -> uint64_t { + return OnTimerFire(); + }); + } +} + +void ConsoleTxBatcher::Flush() { + std::unique_lock lock(mu_); + if (!buf_.empty()) FlushLocked(lock); +} + +void ConsoleTxBatcher::FlushLocked(std::unique_lock& lock) { + std::string pending; + pending.swap(buf_); + RawWriter w = writer_; + lock.unlock(); + if (w && !pending.empty()) { + w(reinterpret_cast(pending.data()), pending.size()); + } + lock.lock(); +} + +uint64_t ConsoleTxBatcher::OnTimerFire() { + std::unique_lock lock(mu_); + timer_armed_ = false; + timer_id_ = 0; + if (!buf_.empty()) FlushLocked(lock); + return 0; // self-destruct; next Append() re-arms +} diff --git a/src/core/vmm/console_tx_batcher.h b/src/core/vmm/console_tx_batcher.h new file mode 100644 index 0000000..c523ef1 --- /dev/null +++ b/src/core/vmm/console_tx_batcher.h @@ -0,0 +1,73 @@ +#pragma once + +#include +#include +#include +#include + +class VmIoLoop; + +// Coalesces the 1-byte-at-a-time UART tx stream into larger chunks before +// handing them to the downstream sink (ConsolePort -> stdout / IPC pipe). +// +// Guest UARTs (pl011, 16550) call into us via a TxCallback that fires once +// per MMIO/PIO write. Boot log alone can push thousands of bytes this way, +// each currently turning into one ::write()/WriteFile() syscall on the host. +// This class buffers writes and flushes them in two scenarios: +// - The pending buffer reaches kFlushThreshold bytes: synchronously flush +// on the caller (vCPU) thread. 
Bounds worst-case latency for bursts. +// - An io_loop timer fires kFlushDelayMs after the first byte landed. +// Handles the slow interactive-echo case where bytes dribble in below +// the threshold. +// +// Thread-safety: all public methods are safe to call from any thread. +// The RawWriter is invoked without holding the internal mutex so that slow +// syscalls don't serialize Append() calls from different vCPUs. +// +// Lifetime: the loop must outlive this object. In the Vm ownership chain +// that's guaranteed because Vm::~Vm explicitly stops io_loop_ (which joins +// its thread and closes any armed timer) before machine_ -- and therefore +// this batcher -- is destroyed. +class ConsoleTxBatcher { +public: + using RawWriter = std::function; + + explicit ConsoleTxBatcher(RawWriter writer); + ~ConsoleTxBatcher(); + + ConsoleTxBatcher(const ConsoleTxBatcher&) = delete; + ConsoleTxBatcher& operator=(const ConsoleTxBatcher&) = delete; + + // Attach the io loop used for delayed flushes. Safe to call before + // the loop is started; buffering begins when the loop becomes running. + // Passing nullptr detaches and forces Append() to go synchronous. + void AttachIoLoop(VmIoLoop* loop); + + // Append bytes to the tx buffer. If the loop is unavailable or not + // running, bytes are written through synchronously to preserve output + // across Vm startup/shutdown edges. + void Append(const uint8_t* data, size_t size); + + // Synchronous flush of whatever is currently buffered. + void Flush(); + +private: + // Hands buf_ contents to writer_. Releases `lock` around the writer + // call and re-acquires it before returning. Leaves buf_ empty. + void FlushLocked(std::unique_lock& lock); + + // Runs on io_thread_ when the coalesce timer fires. Returns 0 so the + // timer self-destructs; the next Append() will re-arm it. 
+ uint64_t OnTimerFire(); + + RawWriter writer_; + VmIoLoop* io_loop_ = nullptr; + + std::mutex mu_; + std::string buf_; + bool timer_armed_ = false; + uint64_t timer_id_ = 0; + + static constexpr size_t kFlushThreshold = 1024; + static constexpr uint64_t kFlushDelayMs = 16; +}; diff --git a/src/core/vmm/hypervisor_vm.h b/src/core/vmm/hypervisor_vm.h index edf9e4d..1912c6e 100644 --- a/src/core/vmm/hypervisor_vm.h +++ b/src/core/vmm/hypervisor_vm.h @@ -32,6 +32,27 @@ class HypervisorVm { // skipped. Default returns false so HVF / WHVP keep their current path. virtual bool AssertIrq(uint32_t /*gsi*/, bool /*level*/) { return false; } + // Register an eventfd (or platform equivalent) as an IRQFD for a + // level-triggered GSI. When trigger_fd is signalled, the hypervisor + // asserts the line directly in kernel space, bypassing the userspace + // RequestInterrupt / AssertIrq ioctl path. + // + // gsi is the hypervisor-absolute interrupt number: + // - arm64 KVM: SPI absolute INTID (>= 32) + // - x86_64 KVM: IOAPIC pin (0..23 with the default routing) + // The caller is responsible for computing the arch-specific offset. + // + // resample_fd (may be -1) is signalled by the hypervisor after the + // guest EOIs the interrupt so the caller can re-assert if the device + // still has a pending condition. Required for level-triggered lines. + // + // Default returns false; macOS HVF and any backend without irqfd + // support falls back to the RequestInterrupt / SetIrqLevelCallback + // path automatically. 
+ virtual bool RegisterLevelIrqFd(uint32_t /*gsi*/, int /*trigger_fd*/, + int /*resample_fd*/) { return false; } + virtual bool UnregisterIrqFd(uint32_t /*gsi*/, int /*trigger_fd*/) { return false; } + virtual void SetGuestMemMap(const GuestMemMap*) {} virtual void QueueInterrupt(uint32_t vector, uint32_t dest_vcpu) { diff --git a/src/core/vmm/machine_model.h b/src/core/vmm/machine_model.h index 744d430..96c31f3 100644 --- a/src/core/vmm/machine_model.h +++ b/src/core/vmm/machine_model.h @@ -13,6 +13,7 @@ struct VmSharedFolder; struct VmConfig; +class VmIoLoop; // Describes a VirtIO MMIO device slot with its base address and IRQ number. struct VirtioDeviceSlot { @@ -36,6 +37,7 @@ class MachineModel { GuestMemMap& mem, HypervisorVm* hv_vm, std::shared_ptr console_port, + VmIoLoop* io_loop, std::function shutdown_cb, std::function reboot_cb) = 0; diff --git a/src/core/vmm/vm.cpp b/src/core/vmm/vm.cpp index 8783987..0b4f9df 100644 --- a/src/core/vmm/vm.cpp +++ b/src/core/vmm/vm.cpp @@ -2,6 +2,13 @@ #include "core/vmm/vm_platform.h" #include +#if defined(__linux__) +#include +#include +#include +#include +#endif + #if defined(__APPLE__) && defined(__x86_64__) #include "core/arch/x86_64/x86_machine.h" #include "platform/macos/hypervisor/x86_64/hvf_vcpu.h" @@ -13,12 +20,16 @@ #include "core/arch/x86_64/x86_machine.h" #elif defined(__linux__) && defined(__x86_64__) #include "core/arch/x86_64/x86_machine.h" +#elif defined(__linux__) && defined(__aarch64__) +#include "core/arch/aarch64/aarch64_machine.h" +#include "platform/linux/hypervisor/aarch64/kvm_vcpu.h" +#include "platform/linux/hypervisor/aarch64/kvm_vm.h" #endif static std::unique_ptr CreateMachineModel() { #if defined(_WIN32) || (defined(__APPLE__) && defined(__x86_64__)) || (defined(__linux__) && defined(__x86_64__)) return std::make_unique(); -#elif defined(__APPLE__) && defined(__aarch64__) +#elif (defined(__APPLE__) && defined(__aarch64__)) || (defined(__linux__) && defined(__aarch64__)) return 
std::make_unique(); #else LOG_ERROR("No machine model available for this platform/architecture"); @@ -51,6 +62,11 @@ Vm::~Vm() { if (t.joinable()) t.join(); } + // Tear down irqfds (detach uv_poll, unregister with kvm, close fds, stop + // io_loop_) before destroying the hypervisor VM. + ShutdownIrqFds(); + io_loop_.Stop(); + if (vdagent_handler_) { vdagent_handler_->SetClipboardCallback(nullptr); } @@ -108,6 +124,7 @@ std::unique_ptr Vm::Create(const VmConfig& config) { if (!vm->machine_->SetupPlatformDevices( vm->addr_space_, vm->mem_, vm->hv_vm_.get(), vm->console_port_, + &vm->io_loop_, [&vm_ref = *vm]() { vm_ref.RequestStop(); }, [&vm_ref = *vm]() { vm_ref.RequestReboot(); })) { LOG_ERROR("Failed to set up platform devices"); @@ -231,6 +248,14 @@ void Vm::SetupVCpuCallbacks(uint32_t vcpu_index) { hvf_vcpu->SetPsciShutdownCallback([this]() { RequestStop(); }); hvf_vcpu->SetPsciRebootCallback([this]() { RequestReboot(); }); } +#elif defined(__linux__) && defined(__aarch64__) + // In-kernel PSCI handles CPU_ON; only SYSTEM_OFF / SYSTEM_RESET bubble + // up to userspace as KVM_EXIT_SYSTEM_EVENT. 
+ auto* kvm_vcpu = dynamic_cast(vcpus_[vcpu_index].get()); + if (kvm_vcpu) { + kvm_vcpu->SetShutdownCallback([this]() { RequestStop(); }); + kvm_vcpu->SetRebootCallback([this]() { RequestReboot(); }); + } #else (void)vcpu_index; #endif @@ -326,6 +351,92 @@ void Vm::SetIrqLevel(uint8_t irq, bool asserted) { machine_->SetIrqLevel(hv_vm_.get(), irq, asserted); } +bool Vm::TryEnableIrqFd(VirtioMmioDevice* dev, uint8_t slot_irq) { +#if defined(__linux__) + IrqFdSlot slot; + #if defined(__aarch64__) + slot.gsi = static_cast(slot_irq) + 32; // absolute SPI INTID + #elif defined(__x86_64__) + slot.gsi = static_cast(slot_irq); // IOAPIC pin + #else + (void)slot_irq; + return false; + #endif + slot.dev = dev; + irqfd_slots_.push_back(slot); + return true; +#else + (void)dev; + (void)slot_irq; + return false; +#endif +} + +void Vm::InstallIrqFds() { +#if defined(__linux__) + if (irqfd_slots_.empty() || !hv_vm_) return; + + // Allocate trigger + resample eventfds per slot, then ask the hypervisor + // to register each one. On any failure, drop the slot from the list + // (its virtio device keeps using the SetIrqLevelCallback fallback). 
+ size_t write_idx = 0; + for (size_t read_idx = 0; read_idx < irqfd_slots_.size(); ++read_idx) { + IrqFdSlot& s = irqfd_slots_[read_idx]; + + int trig = ::eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK); + int resamp = ::eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK); + if (trig < 0 || resamp < 0) { + LOG_WARN("irqfd: eventfd() failed: %s", strerror(errno)); + if (trig >= 0) ::close(trig); + if (resamp >= 0) ::close(resamp); + continue; + } + if (!hv_vm_->RegisterLevelIrqFd(s.gsi, trig, resamp)) { + ::close(trig); + ::close(resamp); + continue; + } + + s.trigger_fd = trig; + s.resample_fd = resamp; + s.dev->SetIrqEventFd(trig); + io_loop_.AttachIrqFd(s.dev, trig, resamp); + irqfd_slots_[write_idx++] = s; + } + irqfd_slots_.resize(write_idx); + + if (!irqfd_slots_.empty()) { + LOG_INFO("irqfd: %zu slots attached to io_loop", irqfd_slots_.size()); + } +#endif +} + +void Vm::ShutdownIrqFds() { +#if defined(__linux__) + // Detach uv_poll on the io_loop first (synchronously-ish: Post returns + // immediately but the detach closure runs in the io_thread before Stop + // completes). Then unregister with the kernel and close fds. 
+ for (auto& s : irqfd_slots_) { + if (s.dev) io_loop_.DetachIrqFd(s.dev); + } + for (auto& s : irqfd_slots_) { + if (s.trigger_fd >= 0) { + if (hv_vm_) hv_vm_->UnregisterIrqFd(s.gsi, s.trigger_fd); + ::close(s.trigger_fd); + s.trigger_fd = -1; + } + if (s.resample_fd >= 0) { + ::close(s.resample_fd); + s.resample_fd = -1; + } + if (s.dev) { + s.dev->SetIrqEventFd(-1); // revert to callback path on teardown + } + } + irqfd_slots_.clear(); +#endif +} + bool Vm::SetupVirtioBlk(const std::string& disk_path, const VirtioDeviceSlot& slot) { virtio_blk_ = std::make_unique(); if (!virtio_blk_->Open(disk_path)) return false; @@ -334,6 +445,7 @@ bool Vm::SetupVirtioBlk(const std::string& disk_path, const VirtioDeviceSlot& sl virtio_mmio_->Init(virtio_blk_.get(), mem_); virtio_mmio_->SetIrqCallback([this, irq = slot.irq]() { InjectIrq(irq); }); virtio_mmio_->SetIrqLevelCallback([this, irq = slot.irq](bool a) { SetIrqLevel(irq, a); }); + TryEnableIrqFd(virtio_mmio_.get(), slot.irq); virtio_blk_->SetMmioDevice(virtio_mmio_.get()); addr_space_.AddMmioDevice( @@ -353,6 +465,7 @@ bool Vm::SetupVirtioNet(bool link_up, const std::vector& forwards, virtio_mmio_net_->Init(virtio_net_.get(), mem_); virtio_mmio_net_->SetIrqCallback([this, irq = slot.irq]() { InjectIrq(irq); }); virtio_mmio_net_->SetIrqLevelCallback([this, irq = slot.irq](bool a) { SetIrqLevel(irq, a); }); + TryEnableIrqFd(virtio_mmio_net_.get(), slot.irq); virtio_net_->SetMmioDevice(virtio_mmio_net_.get()); virtio_net_->SetTxCallback([this](const uint8_t* frame, uint32_t len) { @@ -379,6 +492,7 @@ bool Vm::SetupVirtioInput(const VirtioDeviceSlot& kbd_slot, virtio_mmio_kbd_->Init(virtio_kbd_.get(), mem_); virtio_mmio_kbd_->SetIrqCallback([this, irq = kbd_slot.irq]() { InjectIrq(irq); }); virtio_mmio_kbd_->SetIrqLevelCallback([this, irq = kbd_slot.irq](bool a) { SetIrqLevel(irq, a); }); + TryEnableIrqFd(virtio_mmio_kbd_.get(), kbd_slot.irq); virtio_kbd_->SetMmioDevice(virtio_mmio_kbd_.get()); addr_space_.AddMmioDevice( 
kbd_slot.mmio_base, VirtioMmioDevice::kMmioSize, virtio_mmio_kbd_.get()); @@ -389,6 +503,7 @@ bool Vm::SetupVirtioInput(const VirtioDeviceSlot& kbd_slot, virtio_mmio_tablet_->Init(virtio_tablet_.get(), mem_); virtio_mmio_tablet_->SetIrqCallback([this, irq = tablet_slot.irq]() { InjectIrq(irq); }); virtio_mmio_tablet_->SetIrqLevelCallback([this, irq = tablet_slot.irq](bool a) { SetIrqLevel(irq, a); }); + TryEnableIrqFd(virtio_mmio_tablet_.get(), tablet_slot.irq); virtio_tablet_->SetMmioDevice(virtio_mmio_tablet_.get()); addr_space_.AddMmioDevice( tablet_slot.mmio_base, VirtioMmioDevice::kMmioSize, virtio_mmio_tablet_.get()); @@ -417,6 +532,7 @@ bool Vm::SetupVirtioGpu(uint32_t width, uint32_t height, const VirtioDeviceSlot& virtio_mmio_gpu_->Init(virtio_gpu_.get(), mem_); virtio_mmio_gpu_->SetIrqCallback([this, irq = slot.irq]() { InjectIrq(irq); }); virtio_mmio_gpu_->SetIrqLevelCallback([this, irq = slot.irq](bool a) { SetIrqLevel(irq, a); }); + TryEnableIrqFd(virtio_mmio_gpu_.get(), slot.irq); virtio_gpu_->SetMmioDevice(virtio_mmio_gpu_.get()); addr_space_.AddMmioDevice( slot.mmio_base, VirtioMmioDevice::kMmioSize, virtio_mmio_gpu_.get()); @@ -461,6 +577,7 @@ bool Vm::SetupVirtioSerial(const VirtioDeviceSlot& slot) { virtio_mmio_serial_->Init(virtio_serial_.get(), mem_); virtio_mmio_serial_->SetIrqCallback([this, irq = slot.irq]() { InjectIrq(irq); }); virtio_mmio_serial_->SetIrqLevelCallback([this, irq = slot.irq](bool a) { SetIrqLevel(irq, a); }); + TryEnableIrqFd(virtio_mmio_serial_.get(), slot.irq); virtio_serial_->SetMmioDevice(virtio_mmio_serial_.get()); addr_space_.AddMmioDevice( slot.mmio_base, VirtioMmioDevice::kMmioSize, virtio_mmio_serial_.get()); @@ -478,6 +595,7 @@ bool Vm::SetupVirtioFs(const std::vector& initial_folders, virtio_mmio_fs_->Init(virtio_fs_.get(), mem_); virtio_mmio_fs_->SetIrqCallback([this, irq = slot.irq]() { InjectIrq(irq); }); virtio_mmio_fs_->SetIrqLevelCallback([this, irq = slot.irq](bool a) { SetIrqLevel(irq, a); }); + 
TryEnableIrqFd(virtio_mmio_fs_.get(), slot.irq); virtio_fs_->SetMmioDevice(virtio_mmio_fs_.get()); addr_space_.AddMmioDevice(slot.mmio_base, VirtioMmioDevice::kMmioSize, virtio_mmio_fs_.get()); @@ -496,6 +614,7 @@ bool Vm::SetupVirtioFs(const std::vector& initial_folders, bool Vm::SetupVirtioSnd(const VirtioDeviceSlot& slot) { virtio_snd_ = std::make_unique(); virtio_snd_->SetMemMap(mem_); + virtio_snd_->SetIoLoop(&io_loop_); if (audio_port_) { virtio_snd_->SetAudioPort(audio_port_); @@ -505,6 +624,7 @@ bool Vm::SetupVirtioSnd(const VirtioDeviceSlot& slot) { virtio_mmio_snd_->Init(virtio_snd_.get(), mem_); virtio_mmio_snd_->SetIrqCallback([this, irq = slot.irq]() { InjectIrq(irq); }); virtio_mmio_snd_->SetIrqLevelCallback([this, irq = slot.irq](bool a) { SetIrqLevel(irq, a); }); + TryEnableIrqFd(virtio_mmio_snd_.get(), slot.irq); virtio_snd_->SetMmioDevice(virtio_mmio_snd_.get()); addr_space_.AddMmioDevice( slot.mmio_base, VirtioMmioDevice::kMmioSize, virtio_mmio_snd_.get()); @@ -629,6 +749,24 @@ int Vm::Run() { FinalizeBoot(boot_config_); } + if (running_) { +#if defined(__linux__) && defined(__aarch64__) + // KVM_IRQFD on arm64 requires the in-kernel VGIC to have had its + // KVM_DEV_ARM_VGIC_CTRL_INIT issued. SetupAarch64Boot normally drives + // that, but it runs on the BSP thread after boot_complete_ — i.e. + // after we would try to register irqfds here. Force-finalize from + // this (main) thread; FinalizeVgicInit is idempotent. + if (auto* kvm_vm = dynamic_cast(hv_vm_.get())) { + kvm_vm->FinalizeVgicInit(); + } +#endif + // Bring up the central device I/O loop, then register each virtio + // slot's irqfd with it. Slots that fail to register stay on the + // classic KVM_IRQ_LINE fallback. + io_loop_.Start(); + InstallIrqFds(); + } + // Phase 2: release all threads into their run loops. 
{ std::lock_guard lock(boot_mutex_); diff --git a/src/core/vmm/vm.h b/src/core/vmm/vm.h index a48cd72..2964b03 100644 --- a/src/core/vmm/vm.h +++ b/src/core/vmm/vm.h @@ -5,6 +5,7 @@ #include "core/vmm/hypervisor_vm.h" #include "core/vmm/machine_model.h" #include "core/vmm/vcpu_startup_state.h" +#include "core/vmm/vm_io_loop.h" #include "core/device/virtio/virtio_mmio.h" #include "core/device/virtio/virtio_blk.h" #include "core/device/virtio/virtio_net.h" @@ -109,6 +110,17 @@ class Vm { void InjectIrq(uint8_t irq); void SetIrqLevel(uint8_t irq, bool asserted); + // Record a virtio-mmio slot as an IRQFD candidate. The actual KVM_IRQFD + // registration happens inside Run() once vCPUs (and, on arm64, the VGIC) + // are up. The classic SetIrqLevelCallback path stays wired as a fallback; + // when the real fd is installed the device transparently switches over. + bool TryEnableIrqFd(VirtioMmioDevice* dev, uint8_t slot_irq); + + // Register all recorded candidate slots with the hypervisor + io_loop_. + // Slots that fail stay in the callback-driven path. Linux-only. 
+ void InstallIrqFds(); + void ShutdownIrqFds(); + uint32_t cpu_count_ = 1; std::unique_ptr machine_; std::unique_ptr hv_vm_; @@ -148,6 +160,15 @@ class Vm { // Active virtio slot list (populated during setup, used for kernel loading) std::vector active_virtio_slots_; + struct IrqFdSlot { + uint32_t gsi = 0; // absolute hypervisor GSI + int trigger_fd = -1; // write-to-assert eventfd + int resample_fd = -1; // signalled on EOI (may be -1) + VirtioMmioDevice* dev = nullptr; // for pending-status re-check + }; + std::vector irqfd_slots_; + VmIoLoop io_loop_; + std::atomic running_{false}; std::atomic reboot_requested_{false}; diff --git a/src/core/vmm/vm_io_loop.cpp b/src/core/vmm/vm_io_loop.cpp new file mode 100644 index 0000000..4102dd4 --- /dev/null +++ b/src/core/vmm/vm_io_loop.cpp @@ -0,0 +1,252 @@ +#include "core/vmm/vm_io_loop.h" + +#include "core/device/virtio/virtio_mmio.h" +#include "core/vmm/types.h" + +#if defined(__linux__) +#include +#endif + +VmIoLoop::VmIoLoop() = default; + +VmIoLoop::~VmIoLoop() { + Stop(); +} + +bool VmIoLoop::Start() { + { + std::lock_guard lock(post_mutex_); + if (accepting_) return true; + } + + int rc = uv_loop_init(&loop_); + if (rc != 0) { + LOG_ERROR("VmIoLoop: uv_loop_init failed: %s", uv_strerror(rc)); + return false; + } + + async_post_.data = this; + rc = uv_async_init(&loop_, &async_post_, OnAsyncPost); + if (rc != 0) { + LOG_ERROR("VmIoLoop: uv_async_init(post) failed: %s", uv_strerror(rc)); + (void)uv_loop_close(&loop_); + return false; + } + + async_stop_.data = this; + rc = uv_async_init(&loop_, &async_stop_, OnAsyncStop); + if (rc != 0) { + LOG_ERROR("VmIoLoop: uv_async_init(stop) failed: %s", uv_strerror(rc)); + uv_close(reinterpret_cast(&async_post_), nullptr); + while (uv_run(&loop_, UV_RUN_NOWAIT) != 0) {} + (void)uv_loop_close(&loop_); + return false; + } + + { + std::lock_guard lock(post_mutex_); + accepting_ = true; + } + running_.store(true, std::memory_order_release); + io_thread_ = 
std::thread(&VmIoLoop::ThreadMain, this); + return true; +} + +void VmIoLoop::Stop() { + { + std::lock_guard lock(post_mutex_); + if (!accepting_) return; + accepting_ = false; + } + + uv_async_send(&async_stop_); + + if (io_thread_.joinable()) io_thread_.join(); + + running_.store(false, std::memory_order_release); + + // Drop pending Post tasks without running them; captures destruct. + std::deque drained; + { + std::lock_guard lock(post_mutex_); + drained.swap(post_queue_); + } +} + +void VmIoLoop::ThreadMain() { + uv_run(&loop_, UV_RUN_DEFAULT); + while (uv_run(&loop_, UV_RUN_NOWAIT) != 0) {} + (void)uv_loop_close(&loop_); +} + +void VmIoLoop::Post(Task fn) { + std::lock_guard lock(post_mutex_); + if (!accepting_) return; + post_queue_.push_back(std::move(fn)); + uv_async_send(&async_post_); +} + +void VmIoLoop::OnAsyncPost(uv_async_t* h) { + auto* self = static_cast(h->data); + + // uv_async coalesces wakeups, so drain everything in one shot. + std::deque drained; + { + std::lock_guard lock(self->post_mutex_); + drained.swap(self->post_queue_); + } + + // If the stop callback ran first this iteration, drop everything: newly + // created uv handles here would never be closed before uv_loop_close. + if (self->io_stopped_) return; + + for (auto& fn : drained) { + if (fn) fn(); + } +} + +void VmIoLoop::OnAsyncStop(uv_async_t* h) { + auto* self = static_cast(h->data); + self->io_stopped_ = true; + + // Destroy outstanding timers. Close callbacks delete the ctx on the + // next iteration (uv_close is async). 
+ for (auto& kv : self->timers_) { + auto* ctx = kv.second; + uv_timer_stop(&ctx->handle); + uv_close(reinterpret_cast(&ctx->handle), + [](uv_handle_t* h2) { + delete static_cast(h2->data); + }); + } + self->timers_.clear(); + + for (auto& kv : self->irqfds_) { + auto* ctx = kv.second; + uv_poll_stop(&ctx->handle); + uv_close(reinterpret_cast(&ctx->handle), + [](uv_handle_t* h2) { + delete static_cast(h2->data); + }); + } + self->irqfds_.clear(); + + uv_close(reinterpret_cast(&self->async_post_), nullptr); + uv_close(reinterpret_cast(h), nullptr); +} + +uint64_t VmIoLoop::AddTimer(uint64_t initial_ms, TimerCallback cb) { + uint64_t id = next_timer_id_.fetch_add(1, std::memory_order_relaxed); + Post([this, id, initial_ms, cb = std::move(cb)]() mutable { + if (io_stopped_) return; + auto* ctx = new TimerCtx{}; + ctx->owner = this; + ctx->id = id; + ctx->cb = std::move(cb); + ctx->handle.data = ctx; + if (uv_timer_init(&loop_, &ctx->handle) != 0) { + delete ctx; + return; + } + timers_[id] = ctx; + uv_timer_start(&ctx->handle, OnTimerFire, initial_ms, 0); + }); + return id; +} + +void VmIoLoop::RemoveTimer(uint64_t id) { + Post([this, id]() { + auto it = timers_.find(id); + if (it == timers_.end()) return; + auto* ctx = it->second; + timers_.erase(it); + uv_timer_stop(&ctx->handle); + uv_close(reinterpret_cast(&ctx->handle), + [](uv_handle_t* h) { + delete static_cast(h->data); + }); + }); +} + +void VmIoLoop::OnTimerFire(uv_timer_t* t) { + auto* ctx = static_cast(t->data); + uint64_t next_ms = ctx->cb ? 
ctx->cb() : 0; + if (next_ms == 0) { + auto* owner = ctx->owner; + owner->timers_.erase(ctx->id); + uv_timer_stop(&ctx->handle); + uv_close(reinterpret_cast(&ctx->handle), + [](uv_handle_t* h) { + delete static_cast(h->data); + }); + } else { + uv_timer_start(&ctx->handle, OnTimerFire, next_ms, 0); + } +} + +void VmIoLoop::AttachIrqFd(VirtioMmioDevice* dev, int trigger_fd, int resample_fd) { +#if defined(__linux__) + if (!dev || trigger_fd < 0 || resample_fd < 0) return; + Post([this, dev, trigger_fd, resample_fd]() { + if (io_stopped_) return; + if (irqfds_.count(dev)) return; // idempotent + auto* ctx = new IrqFdCtx{}; + ctx->owner = this; + ctx->dev = dev; + ctx->trigger_fd = trigger_fd; + ctx->resample_fd = resample_fd; + ctx->handle.data = ctx; + if (uv_poll_init(&loop_, &ctx->handle, resample_fd) != 0) { + LOG_WARN("VmIoLoop: uv_poll_init(irqfd) failed"); + delete ctx; + return; + } + irqfds_[dev] = ctx; + uv_poll_start(&ctx->handle, UV_READABLE, OnIrqFdReadable); + }); +#else + (void)dev; + (void)trigger_fd; + (void)resample_fd; +#endif +} + +void VmIoLoop::DetachIrqFd(VirtioMmioDevice* dev) { +#if defined(__linux__) + if (!dev) return; + Post([this, dev]() { + auto it = irqfds_.find(dev); + if (it == irqfds_.end()) return; + auto* ctx = it->second; + irqfds_.erase(it); + uv_poll_stop(&ctx->handle); + uv_close(reinterpret_cast(&ctx->handle), + [](uv_handle_t* h) { + delete static_cast(h->data); + }); + }); +#else + (void)dev; +#endif +} + +void VmIoLoop::OnIrqFdReadable(uv_poll_t* p, int status, int events) { +#if defined(__linux__) + (void)events; + auto* ctx = static_cast(p->data); + if (status < 0) { + LOG_WARN("VmIoLoop: irqfd poll error: %s", uv_strerror(status)); + return; + } + uint64_t v = 0; + (void)::read(ctx->resample_fd, &v, sizeof(v)); + if (ctx->dev && ctx->dev->GetInterruptStatus() != 0 && ctx->trigger_fd >= 0) { + uint64_t one = 1; + (void)::write(ctx->trigger_fd, &one, sizeof(one)); + } +#else + (void)p; + (void)status; + (void)events; 
+#endif +} diff --git a/src/core/vmm/vm_io_loop.h b/src/core/vmm/vm_io_loop.h new file mode 100644 index 0000000..82e02a5 --- /dev/null +++ b/src/core/vmm/vm_io_loop.h @@ -0,0 +1,122 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +class VirtioMmioDevice; + +// Per-Vm device I/O event loop. Owns a single libuv loop running on its own +// io_thread_, and is the central place to: +// - Drive the irqfd resample path (Linux): uv_poll on a resample eventfd, +// re-assert the trigger eventfd if the device still has pending bits. +// - Host timers for virtio devices (e.g. virtio_snd period tick) so that +// epoll_wait's timeout naturally folds in the next timer deadline. +// - Serve as a single point for other components to Post work to a +// known-safe thread (libuv handles are single-threaded). +// +// Concurrency contract (see plan): +// * All public methods are thread-safe. Call them from any thread. +// * Internally, every uv_* call other than uv_async_send happens on +// io_thread_. Cross-thread entry points Post a closure onto a queue and +// wake the loop with uv_async_send. +// * Post is FIFO within a single caller thread; cross-thread ordering is +// by the moment the task acquires the queue mutex. +// * Stop() does NOT execute the remaining posted tasks: captures are +// destroyed (releasing shared_ptr etc.), but the functions are not +// called. Devices must not rely on "all work drains" semantics. +class VmIoLoop { +public: + // Returned by callbacks in AddTimer: number of ms until the next fire, + // or 0 to stop (and destroy) the timer. + using TimerCallback = std::function; + using Task = std::function; + + VmIoLoop(); + ~VmIoLoop(); + + VmIoLoop(const VmIoLoop&) = delete; + VmIoLoop& operator=(const VmIoLoop&) = delete; + + // Spawn io_thread_ and bring up the uv_loop. Must be called once before + // any other method. Subsequent calls are a no-op and return true. 
+ bool Start(); + + // Close all handles, join io_thread_, drop pending Post tasks without + // running them. Idempotent. + void Stop(); + + bool running() const { return running_.load(std::memory_order_acquire); } + + // Submit fn to run on io_thread_ (FIFO per caller). + void Post(Task fn); + + // Schedule a timer. `initial_ms` is the delay to the first fire; + // subsequent fires use whatever ms the callback returns (0 = stop and + // destroy). For a classic fixed-interval timer, have the callback always + // return the interval. + // Returns an opaque id usable with RemoveTimer (allocated eagerly on the + // caller's thread; the underlying uv_timer_t is created asynchronously + // on io_thread_). + uint64_t AddTimer(uint64_t initial_ms, TimerCallback cb); + + // Cancel a timer scheduled by AddTimer. Safe to call even if the timer + // has already self-destructed (returned 0 from its callback) or was + // never actually installed (e.g. cancelled before AddTimer's post ran). + void RemoveTimer(uint64_t id); + + // Attach a Linux eventfd pair to this loop for irqfd resample handling. + // When `resample_fd` becomes readable (kernel signalled it on guest EOI), + // we drain the counter and, if the device still has pending interrupt + // bits, write(trigger_fd) to re-assert the GIC/IOAPIC line. The fds' + // lifetime is the caller's responsibility; call DetachIrqFd before + // closing them. No-op on non-Linux. + void AttachIrqFd(VirtioMmioDevice* dev, int trigger_fd, int resample_fd); + void DetachIrqFd(VirtioMmioDevice* dev); + +public: + // Public for static-callback access; treat as implementation detail. 
+ struct TimerCtx { + uv_timer_t handle{}; + VmIoLoop* owner = nullptr; + uint64_t id = 0; + TimerCallback cb; + }; + struct IrqFdCtx { + uv_poll_t handle{}; + VmIoLoop* owner = nullptr; + VirtioMmioDevice* dev = nullptr; + int trigger_fd = -1; + int resample_fd = -1; + }; + +private: + void ThreadMain(); + static void OnAsyncPost(uv_async_t* h); + static void OnAsyncStop(uv_async_t* h); + static void OnTimerFire(uv_timer_t* t); + static void OnIrqFdReadable(uv_poll_t* p, int status, int events); + + uv_loop_t loop_{}; + uv_async_t async_post_{}; + uv_async_t async_stop_{}; + std::thread io_thread_; + std::atomic running_{false}; + + std::mutex post_mutex_; + std::deque post_queue_; + bool accepting_ = false; + + std::atomic next_timer_id_{1}; + + // Accessed only from io_thread_. + std::unordered_map timers_; + std::unordered_map irqfds_; + bool io_stopped_ = false; +}; diff --git a/src/platform/CMakeLists.txt b/src/platform/CMakeLists.txt index d611fe3..dc616ae 100644 --- a/src/platform/CMakeLists.txt +++ b/src/platform/CMakeLists.txt @@ -54,15 +54,25 @@ elseif(APPLE) "-framework Hypervisor" ) elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux") - set(KVM_SOURCES + set(KVM_COMMON_SOURCES ${CMAKE_SOURCE_DIR}/src/platform/linux/hypervisor/kvm_platform.cpp - ${CMAKE_SOURCE_DIR}/src/platform/linux/hypervisor/x86_64/kvm_vm.cpp - ${CMAKE_SOURCE_DIR}/src/platform/linux/hypervisor/x86_64/kvm_vcpu.cpp ${CMAKE_SOURCE_DIR}/src/platform/linux/vm_platform_linux.cpp ${CMAKE_SOURCE_DIR}/src/platform/posix/console/posix_console_port.cpp ) - add_library(tenbox_platform STATIC ${KVM_SOURCES}) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64") + set(KVM_ARCH_SOURCES + ${CMAKE_SOURCE_DIR}/src/platform/linux/hypervisor/aarch64/kvm_vm.cpp + ${CMAKE_SOURCE_DIR}/src/platform/linux/hypervisor/aarch64/kvm_vcpu.cpp + ) + else() + set(KVM_ARCH_SOURCES + ${CMAKE_SOURCE_DIR}/src/platform/linux/hypervisor/x86_64/kvm_vm.cpp + ${CMAKE_SOURCE_DIR}/src/platform/linux/hypervisor/x86_64/kvm_vcpu.cpp + ) + 
endif() + + add_library(tenbox_platform STATIC ${KVM_COMMON_SOURCES} ${KVM_ARCH_SOURCES}) target_include_directories(tenbox_platform PUBLIC diff --git a/src/platform/linux/hypervisor/aarch64/kvm_vcpu.cpp b/src/platform/linux/hypervisor/aarch64/kvm_vcpu.cpp new file mode 100644 index 0000000..7430a0e --- /dev/null +++ b/src/platform/linux/hypervisor/aarch64/kvm_vcpu.cpp @@ -0,0 +1,297 @@ +#include "platform/linux/hypervisor/aarch64/kvm_vcpu.h" +#include "platform/linux/hypervisor/aarch64/kvm_vm.h" +#include "platform/linux/hypervisor/kvm_platform.h" +#include "core/vmm/types.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace kvm { + +// Signal used to kick a vCPU out of KVM_RUN. Handler intentionally empty — +// arriving in userspace with a pending signal is enough for KVM_RUN to +// return with -EINTR / KVM_EXIT_INTR. +static constexpr int kCancelSignal = SIGUSR1; + +static void CancelSignalHandler(int /*sig*/) {} + +static void InstallCancelSignalHandler() { + static bool installed = false; + static std::mutex m; + std::lock_guard lock(m); + if (installed) return; + + struct sigaction sa{}; + sa.sa_handler = CancelSignalHandler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; // no SA_RESTART: we want KVM_RUN to return EINTR + ::sigaction(kCancelSignal, &sa, nullptr); + installed = true; +} + +// Build a KVM_REG_ARM64 core register id from a field name inside struct +// kvm_regs (which starts with user_pt_regs "regs"). Offsets are expressed +// in 32-bit words per the KVM API convention. +static constexpr uint64_t CoreRegId(uint64_t byte_offset) { + return KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | + (byte_offset / sizeof(uint32_t)); +} + +// Offsets within struct kvm_regs { user_pt_regs regs; ... } on aarch64. +// user_pt_regs = { u64 regs[31]; u64 sp; u64 pc; u64 pstate; }. 
+static constexpr uint64_t kOffX(uint32_t i) { return i * 8u; } +static constexpr uint64_t kOffSp = 31u * 8u; // 0xF8 +static constexpr uint64_t kOffPc = 32u * 8u; // 0x100 +static constexpr uint64_t kOffPstate = 33u * 8u; // 0x108 + +// MPIDR_EL1 encoding for KVM_REG_ARM64 system register access. +// op0=3 op1=0 CRn=0 CRm=0 op2=5 +static constexpr uint64_t kSysRegMpidrEl1 = ARM64_SYS_REG(3, 0, 0, 0, 5); + +static bool SetOneReg(int fd, uint64_t id, uint64_t value) { + struct kvm_one_reg r{}; + r.id = id; + r.addr = reinterpret_cast(&value); + return ::ioctl(fd, KVM_SET_ONE_REG, &r) == 0; +} + +KvmVCpu::~KvmVCpu() { + if (run_) { + ::munmap(run_, run_size_); + run_ = nullptr; + } + if (vcpu_fd_ >= 0) { + ::close(vcpu_fd_); + vcpu_fd_ = -1; + } +} + +std::unique_ptr KvmVCpu::Create(KvmVm& vm, uint32_t index, + AddressSpace* addr_space) { + auto vcpu = std::unique_ptr(new KvmVCpu()); + vcpu->index_ = index; + vcpu->vm_ = &vm; + vcpu->addr_space_ = addr_space; + + // arm64 KVM requires KVM_CREATE_VCPU / KVM_ARM_VCPU_INIT to be fully + // serialised across vCPUs: once any vcpu has been INIT'd the kernel + // returns -EBUSY on further KVM_CREATE_VCPU. Our vcpu worker threads + // run in parallel, so guard the whole create + init sequence with a + // process-wide mutex. (Single-process use of /dev/kvm is the norm for + // this runtime, so a static lock is fine.) 
+ static std::mutex create_mutex; + std::lock_guard create_guard(create_mutex); + + vcpu->vcpu_fd_ = ::ioctl(vm.VmFd(), KVM_CREATE_VCPU, (unsigned long)index); + if (vcpu->vcpu_fd_ < 0) { + LOG_ERROR("kvm: KVM_CREATE_VCPU(%u) failed: %s", index, strerror(errno)); + return nullptr; + } + + vcpu->run_size_ = vm.VcpuMmapSize(); + void* run = ::mmap(nullptr, vcpu->run_size_, PROT_READ | PROT_WRITE, + MAP_SHARED, vcpu->vcpu_fd_, 0); + if (run == MAP_FAILED) { + LOG_ERROR("kvm: mmap kvm_run for vCPU %u failed: %s", + index, strerror(errno)); + return nullptr; + } + vcpu->run_ = static_cast(run); + + // Secondary vCPUs start in POWER_OFF so the in-kernel PSCI layer blocks + // them inside KVM_RUN until the BSP issues PSCI_CPU_ON. + const bool power_off = (index != 0); + if (!vcpu->InitVcpu(power_off)) { + return nullptr; + } + + // Program MPIDR_EL1 with a unique affinity value (Aff0 = index). KVM + // defaults to derived affinity but being explicit matches QEMU/HVF. + if (!SetOneReg(vcpu->vcpu_fd_, kSysRegMpidrEl1, + static_cast(index) & 0xFFu)) { + LOG_WARN("kvm: set MPIDR_EL1 for vCPU %u failed: %s", + index, strerror(errno)); + } + + LOG_INFO("kvm: aarch64 vCPU %u created (%s)", + index, power_off ? "POWER_OFF" : "running"); + return vcpu; +} + +bool KvmVCpu::InitVcpu(bool power_off) { + // Query the preferred target from the host. + struct kvm_vcpu_init init{}; + if (::ioctl(vm_->VmFd(), KVM_ARM_PREFERRED_TARGET, &init) < 0) { + LOG_ERROR("kvm: KVM_ARM_PREFERRED_TARGET failed: %s", strerror(errno)); + return false; + } + + // Enable in-kernel PSCI v0.2 handling. KVM will parse HVC PSCI calls + // (CPU_ON / SYSTEM_OFF / SYSTEM_RESET) entirely in the kernel and expose + // lifecycle events via KVM_EXIT_SYSTEM_EVENT. 
+ auto SetFeature = [&init](unsigned bit) { + init.features[bit / 32] |= (1u << (bit % 32)); + }; + SetFeature(KVM_ARM_VCPU_PSCI_0_2); + if (power_off) { + SetFeature(KVM_ARM_VCPU_POWER_OFF); + } + + if (::ioctl(vcpu_fd_, KVM_ARM_VCPU_INIT, &init) < 0) { + LOG_ERROR("kvm: KVM_ARM_VCPU_INIT(%u) failed: %s", + index_, strerror(errno)); + return false; + } + return true; +} + +void KvmVCpu::OnThreadInit() { + InstallCancelSignalHandler(); + + // Unblock the cancel signal on this (vCPU worker) thread, in case it was + // inherited-blocked. + sigset_t set; + sigemptyset(&set); + sigaddset(&set, kCancelSignal); + pthread_sigmask(SIG_UNBLOCK, &set, nullptr); + + thread_id_.store(static_cast(pthread_self()), + std::memory_order_release); +} + +bool KvmVCpu::SetupBootRegisters(uint8_t* /*ram*/) { + // aarch64 BSP boot state is configured through SetupAarch64Boot, which + // receives the (entry_pc, fdt_addr) pair from Aarch64Machine. + return true; +} + +bool KvmVCpu::SetupAarch64Boot(uint64_t entry_pc, uint64_t fdt_addr) { + // Finalize the in-kernel VGIC now — by the time the BSP reaches this + // point all vCPUs have been created (Vm::Run waits for all vCPUs ready + // before invoking FinalizeBoot, which calls us). + if (!vm_->FinalizeVgicInit()) { + return false; + } + + // PSTATE: EL1h with D/A/I/F masked = 0x3C5, same as the HVF path. 
+ constexpr uint64_t kPstateEl1h = 0x3C5ULL; + + bool ok = true; + ok &= SetOneReg(vcpu_fd_, CoreRegId(kOffPc), entry_pc); + ok &= SetOneReg(vcpu_fd_, CoreRegId(kOffX(0)), fdt_addr); + ok &= SetOneReg(vcpu_fd_, CoreRegId(kOffX(1)), 0); + ok &= SetOneReg(vcpu_fd_, CoreRegId(kOffX(2)), 0); + ok &= SetOneReg(vcpu_fd_, CoreRegId(kOffX(3)), 0); + ok &= SetOneReg(vcpu_fd_, CoreRegId(kOffPstate), kPstateEl1h); + if (!ok) { + LOG_ERROR("kvm: vCPU %u SetupAarch64Boot: KVM_SET_ONE_REG failed: %s", + index_, strerror(errno)); + return false; + } + + LOG_INFO("kvm: vCPU %u ARM64 boot: PC=0x%" PRIx64 ", X0(FDT)=0x%" PRIx64, + index_, entry_pc, fdt_addr); + return true; +} + +VCpuExitAction KvmVCpu::RunOnce() { + int rc = ::ioctl(vcpu_fd_, KVM_RUN, 0); + if (rc < 0) { + if (errno == EINTR || errno == EAGAIN) { + run_->immediate_exit = 0; + return VCpuExitAction::kContinue; + } + LOG_ERROR("kvm: KVM_RUN(%u) failed: %s", index_, strerror(errno)); + return VCpuExitAction::kError; + } + + switch (run_->exit_reason) { + case KVM_EXIT_MMIO: { + auto& mmio = run_->mmio; + if (mmio.is_write) { + uint64_t val = 0; + ::memcpy(&val, mmio.data, mmio.len); + addr_space_->HandleMmioWrite(mmio.phys_addr, mmio.len, val); + } else { + uint64_t val = 0; + addr_space_->HandleMmioRead(mmio.phys_addr, mmio.len, &val); + ::memcpy(mmio.data, &val, mmio.len); + } + return VCpuExitAction::kContinue; + } + + case KVM_EXIT_HLT: + return VCpuExitAction::kHalt; + + case KVM_EXIT_INTR: + return VCpuExitAction::kContinue; + + case KVM_EXIT_SHUTDOWN: + LOG_INFO("kvm: vCPU %u KVM_EXIT_SHUTDOWN", index_); + if (shutdown_cb_) shutdown_cb_(); + return VCpuExitAction::kShutdown; + + case KVM_EXIT_SYSTEM_EVENT: { + uint32_t type = run_->system_event.type; + LOG_INFO("kvm: vCPU %u KVM_EXIT_SYSTEM_EVENT type=%u", index_, type); + // PSCI SYSTEM_OFF / SYSTEM_RESET / etc. are delivered here by the + // in-kernel PSCI emulator. 
Translate to the generic Vm lifecycle + // callbacks so RequestReboot() can actually recycle the VM. + if (type == KVM_SYSTEM_EVENT_RESET) { + if (reboot_cb_) reboot_cb_(); + } else { + if (shutdown_cb_) shutdown_cb_(); + } + return VCpuExitAction::kShutdown; + } + + case KVM_EXIT_FAIL_ENTRY: + LOG_ERROR("kvm: KVM_EXIT_FAIL_ENTRY reason=0x%" PRIx64 " cpu=%u", + (uint64_t)run_->fail_entry.hardware_entry_failure_reason, + run_->fail_entry.cpu); + return VCpuExitAction::kError; + + case KVM_EXIT_INTERNAL_ERROR: + LOG_ERROR("kvm: KVM_EXIT_INTERNAL_ERROR suberror=%u", + run_->internal.suberror); + return VCpuExitAction::kError; + + case KVM_EXIT_UNKNOWN: + case KVM_EXIT_IRQ_WINDOW_OPEN: + return VCpuExitAction::kContinue; + + default: + LOG_WARN("kvm: vCPU %u unhandled exit reason %u", + index_, run_->exit_reason); + return VCpuExitAction::kContinue; + } +} + +void KvmVCpu::CancelRun() { + if (run_) { + run_->immediate_exit = 1; + } + unsigned long tid = thread_id_.load(std::memory_order_acquire); + if (tid) { + ::pthread_kill(static_cast(tid), kCancelSignal); + } +} + +bool KvmVCpu::WaitForInterrupt(uint32_t timeout_ms) { + // With an in-kernel VGIC, WFI is normally handled inside KVM and we do + // not surface KVM_EXIT_HLT. If we do get here, just sleep briefly so the + // run loop stays responsive to CancelRun; always returns false. 
+ if (timeout_ms == 0) timeout_ms = 1; + ::usleep(static_cast(timeout_ms) * 1000); + return false; +} + +} // namespace kvm diff --git a/src/platform/linux/hypervisor/aarch64/kvm_vcpu.h b/src/platform/linux/hypervisor/aarch64/kvm_vcpu.h new file mode 100644 index 0000000..6dee403 --- /dev/null +++ b/src/platform/linux/hypervisor/aarch64/kvm_vcpu.h @@ -0,0 +1,80 @@ +#pragma once + +#include "core/vmm/address_space.h" +#include "core/vmm/hypervisor_vcpu.h" + +#include +#include +#include +#include + +struct kvm_run; + +namespace kvm { + +class KvmVm; + +class KvmVCpu final : public HypervisorVCpu { +public: + ~KvmVCpu() override; + + static std::unique_ptr Create(KvmVm& vm, uint32_t index, + AddressSpace* addr_space); + + VCpuExitAction RunOnce() override; + void CancelRun() override; + uint32_t Index() const override { return index_; } + + // Core HypervisorVCpu interface requires SetupBootRegisters; on aarch64 + // the real work happens through SetupAarch64Boot, invoked by + // Aarch64Machine::SetupBootVCpu. Keep SetupBootRegisters as a no-op for + // symmetry with the HVF backend. + bool SetupBootRegisters(uint8_t* ram) override; + + // BSP startup: set PC, X0=FDT, PSTATE=0x3C5 (EL1h, DAIF masked). Called + // once from the Aarch64Machine on the BSP thread *before* it enters + // RunOnce. Must run on the vCPU's own worker thread. + // Matches the HVF signature so aarch64_machine.cpp can share code paths. + bool SetupAarch64Boot(uint64_t entry_pc, uint64_t fdt_addr); + + void OnThreadInit() override; + + bool WaitForInterrupt(uint32_t timeout_ms) override; + + // KVM's in-kernel PSCI handles AP bring-up entirely in the kernel: APs + // are created in POWER_OFF state and KVM_RUN blocks until a + // PSCI_CPU_ON HVC is dispatched. Userspace SIPI/PSCI callbacks never + // fire, so the generic startup-wait would deadlock. 
+ bool NeedsStartupWait() const override { return false; } + + // Shutdown/reset callbacks invoked when KVM surfaces a PSCI + // SYSTEM_OFF / SYSTEM_RESET via KVM_EXIT_SYSTEM_EVENT. Wired by + // Vm::SetupVCpuCallbacks so the Vm can RequestStop/RequestReboot. + using ShutdownCallback = std::function; + using RebootCallback = std::function; + void SetShutdownCallback(ShutdownCallback cb) { shutdown_cb_ = std::move(cb); } + void SetRebootCallback(RebootCallback cb) { reboot_cb_ = std::move(cb); } + +private: + KvmVCpu() = default; + + bool InitVcpu(bool power_off); + + uint32_t index_ = 0; + int vcpu_fd_ = -1; + struct kvm_run* run_ = nullptr; + size_t run_size_ = 0; + + KvmVm* vm_ = nullptr; + AddressSpace* addr_space_ = nullptr; + + // CancelRun writes immediate_exit = 1 and raises kCancelSignal on the + // vCPU thread. OnThreadInit stashes the pthread id so CancelRun can + // deliver the signal to the right thread; while it is 0 no signal is sent. + std::atomic thread_id_{0}; + + ShutdownCallback shutdown_cb_; + RebootCallback reboot_cb_; +}; + +} // namespace kvm diff --git a/src/platform/linux/hypervisor/aarch64/kvm_vm.cpp b/src/platform/linux/hypervisor/aarch64/kvm_vm.cpp new file mode 100644 index 0000000..24d827b --- /dev/null +++ b/src/platform/linux/hypervisor/aarch64/kvm_vm.cpp @@ -0,0 +1,326 @@ +#include "platform/linux/hypervisor/aarch64/kvm_vm.h" +#include "platform/linux/hypervisor/aarch64/kvm_vcpu.h" +#include "platform/linux/hypervisor/kvm_platform.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace kvm { + +KvmVm::~KvmVm() { + if (vgic_fd_ >= 0) { + ::close(vgic_fd_); + vgic_fd_ = -1; + } + if (vm_fd_ >= 0) { + ::close(vm_fd_); + vm_fd_ = -1; + } + // kvm_fd_ is owned by kvm_platform.cpp; do not close. 
+} + +std::unique_ptr KvmVm::Create(uint32_t cpu_count) { + auto vm = std::unique_ptr(new KvmVm()); + vm->cpu_count_ = cpu_count; + + vm->kvm_fd_ = GetKvmFd(); + if (vm->kvm_fd_ < 0) { + LOG_ERROR("kvm: /dev/kvm not available"); + return nullptr; + } + + int vcpu_mmap_size = ::ioctl(vm->kvm_fd_, KVM_GET_VCPU_MMAP_SIZE, 0); + if (vcpu_mmap_size < (int)sizeof(struct kvm_run)) { + LOG_ERROR("kvm: KVM_GET_VCPU_MMAP_SIZE failed (%d): %s", + vcpu_mmap_size, strerror(errno)); + return nullptr; + } + vm->vcpu_mmap_size_ = static_cast(vcpu_mmap_size); + + // KVM_CREATE_VM takes an IPA size (in bits) on arm64. 0 means "use the + // KVM default" (40 bits on most hosts). + vm->vm_fd_ = ::ioctl(vm->kvm_fd_, KVM_CREATE_VM, 0UL); + if (vm->vm_fd_ < 0) { + LOG_ERROR("kvm: KVM_CREATE_VM failed: %s", strerror(errno)); + return nullptr; + } + + if (!vm->CreateInKernelVgic()) { + return nullptr; + } + + LOG_INFO("kvm: aarch64 VM created (%u vCPUs, mmap_size=%zu, vgic=%s)", + cpu_count, vm->vcpu_mmap_size_, + vm->uses_gic_v2_ ? "v2" : "v3"); + return vm; +} + +bool KvmVm::CreateInKernelVgic() { + // Prefer GICv3 (matches our default FDT + arm64 ABI). On hosts whose + // kernel cannot emulate VGICv3 over a physical GICv2 (e.g. Raspberry Pi + // 5's GIC-400 under certain kernel configs), KVM_CREATE_DEVICE returns + // ENODEV: fall back to VGICv2. 
+ if (TryCreateVgicV3()) { + uses_gic_v2_ = false; + return true; + } + LOG_WARN("kvm: VGICv3 unavailable, falling back to VGICv2"); + if (TryCreateVgicV2()) { + uses_gic_v2_ = true; + return true; + } + LOG_ERROR("kvm: neither VGICv3 nor VGICv2 could be created"); + return false; +} + +bool KvmVm::TryCreateVgicV3() { + struct kvm_create_device cd{}; + cd.type = KVM_DEV_TYPE_ARM_VGIC_V3; + cd.fd = 0; + cd.flags = 0; + if (::ioctl(vm_fd_, KVM_CREATE_DEVICE, &cd) < 0) { + LOG_INFO("kvm: KVM_CREATE_DEVICE(VGIC_V3) unavailable: %s", + strerror(errno)); + return false; + } + vgic_fd_ = static_cast(cd.fd); + + auto SetAddr = [this](uint64_t attr, uint64_t addr) -> bool { + struct kvm_device_attr da{}; + da.group = KVM_DEV_ARM_VGIC_GRP_ADDR; + da.attr = attr; + da.addr = reinterpret_cast(&addr); + if (::ioctl(vgic_fd_, KVM_SET_DEVICE_ATTR, &da) < 0) { + LOG_ERROR("kvm: VGIC_V3 SET_ADDR(attr=%" PRIu64 ") failed: %s", + attr, strerror(errno)); + return false; + } + return true; + }; + + if (!SetAddr(KVM_VGIC_V3_ADDR_TYPE_DIST, kGicDistBase)) return false; + if (!SetAddr(KVM_VGIC_V3_ADDR_TYPE_REDIST, kGicRedistBase)) return false; + return true; +} + +bool KvmVm::TryCreateVgicV2() { + struct kvm_create_device cd{}; + cd.type = KVM_DEV_TYPE_ARM_VGIC_V2; + cd.fd = 0; + cd.flags = 0; + if (::ioctl(vm_fd_, KVM_CREATE_DEVICE, &cd) < 0) { + LOG_ERROR("kvm: KVM_CREATE_DEVICE(VGIC_V2) failed: %s", strerror(errno)); + return false; + } + vgic_fd_ = static_cast(cd.fd); + + auto SetAddr = [this](uint64_t attr, uint64_t addr) -> bool { + struct kvm_device_attr da{}; + da.group = KVM_DEV_ARM_VGIC_GRP_ADDR; + da.attr = attr; + da.addr = reinterpret_cast(&addr); + if (::ioctl(vgic_fd_, KVM_SET_DEVICE_ATTR, &da) < 0) { + LOG_ERROR("kvm: VGIC_V2 SET_ADDR(attr=%" PRIu64 ") failed: %s", + attr, strerror(errno)); + return false; + } + return true; + }; + + // GICv2 needs DIST and CPU interface addresses. 
We reuse the same 64 KiB + // distributor slot as v3 and place the virtual CPU interface at + // 0x08010000 (directly above it, below the v3 redistributor base). + if (!SetAddr(KVM_VGIC_V2_ADDR_TYPE_DIST, kGicDistBase)) return false; + if (!SetAddr(KVM_VGIC_V2_ADDR_TYPE_CPU, kGicV2CpuBase)) return false; + return true; +} + +bool KvmVm::FinalizeVgicInit() { + std::lock_guard lock(vgic_init_mutex_); + if (vgic_initialized_) return true; + if (vgic_fd_ < 0) { + LOG_ERROR("kvm: FinalizeVgicInit called without a VGIC device"); + return false; + } + + struct kvm_device_attr da{}; + da.group = KVM_DEV_ARM_VGIC_GRP_CTRL; + da.attr = KVM_DEV_ARM_VGIC_CTRL_INIT; + if (::ioctl(vgic_fd_, KVM_SET_DEVICE_ATTR, &da) < 0) { + LOG_ERROR("kvm: VGIC CTRL_INIT failed: %s", strerror(errno)); + return false; + } + vgic_initialized_ = true; + LOG_INFO("kvm: in-kernel VGIC%s initialized", uses_gic_v2_ ? "v2" : "v3"); + return true; +} + +bool KvmVm::MapMemory(GPA gpa, void* hva, uint64_t size, bool writable) { + uint32_t slot; + { + std::lock_guard lock(slot_mutex_); + slot = next_slot_++; + } + + struct kvm_userspace_memory_region region{}; + region.slot = slot; + region.flags = writable ? 0 : KVM_MEM_READONLY; + region.guest_phys_addr = gpa; + region.memory_size = size; + region.userspace_addr = reinterpret_cast(hva); + + if (::ioctl(vm_fd_, KVM_SET_USER_MEMORY_REGION, ®ion) < 0) { + LOG_ERROR("kvm: KVM_SET_USER_MEMORY_REGION(slot=%u gpa=0x%" PRIx64 + " size=0x%" PRIx64 ") failed: %s", + slot, gpa, size, strerror(errno)); + return false; + } + + LOG_INFO("kvm: mapped slot=%u GPA=0x%" PRIx64 " size=0x%" PRIx64 " HVA=%p%s", + slot, gpa, size, hva, writable ? "" : " [RO]"); + return true; +} + +bool KvmVm::UnmapMemory(GPA /*gpa*/, uint64_t /*size*/) { + // Not exercised by the current VM lifecycle (RAM is torn down with the + // process). Implementing this cleanly requires tracking slot IDs per GPA. 
+ LOG_WARN("kvm: UnmapMemory not implemented"); + return false; +} + +std::unique_ptr KvmVm::CreateVCpu( + uint32_t index, AddressSpace* addr_space) { + return KvmVCpu::Create(*this, index, addr_space); +} + +void KvmVm::RequestInterrupt(const InterruptRequest& req) { + // Aarch64Machine::SetIrqLevel encodes SPIs as (hw_irq + 32), i.e. the + // architectural GIC INTID. KVM's in-kernel VGIC expects the absolute + // INTID (32..1019) as irq_id in the (type|vcpu|num) encoding, so we pass + // it through unchanged; SGIs/PPIs never reach RequestInterrupt here. + // NOTE(review): the line level is taken from req.level_triggered - confirm. + if (req.vector < 32 || req.vector > 1019) { + LOG_WARN("kvm: RequestInterrupt for out-of-range SPI vector %u ignored", + req.vector); + return; + } + uint32_t encoded = (static_cast(KVM_ARM_IRQ_TYPE_SPI) << 24) | + (0u << 16) | + (req.vector & 0xffffu); + + struct kvm_irq_level il{}; + il.irq = encoded; + il.level = req.level_triggered ? 1 : 0; + if (::ioctl(vm_fd_, KVM_IRQ_LINE, &il) < 0) { + LOG_WARN("kvm: KVM_IRQ_LINE(intid=%u level=%d) failed: %s", + req.vector, (int)req.level_triggered, strerror(errno)); + } +} + +bool KvmVm::AssertIrq(uint32_t gsi, bool level) { + // gsi is the absolute INTID (SPI, 32..1019); a failed ioctl is only logged. + if (gsi < 32 || gsi > 1019) return false; + uint32_t encoded = (static_cast(KVM_ARM_IRQ_TYPE_SPI) << 24) | + (gsi & 0xffffu); + + struct kvm_irq_level il{}; + il.irq = encoded; + il.level = level ? 1 : 0; + if (::ioctl(vm_fd_, KVM_IRQ_LINE, &il) < 0) { + LOG_WARN("kvm: AssertIrq KVM_IRQ_LINE(intid=%u level=%d) failed: %s", + gsi, (int)level, strerror(errno)); + } + return true; +} + +bool KvmVm::UpdateIrqRoutingLocked() { + // arm64 KVM has NO default GSI routing — we must install one entry per + // SPI we want to drive through KVM_IRQFD (or KVM_IRQ_LINE with routing). + // Build a routing table from routed_gsis_ and send it wholesale. 
+ size_t n = routed_gsis_.size(); + std::vector buf( + sizeof(struct kvm_irq_routing) + + n * sizeof(struct kvm_irq_routing_entry), 0); + auto* routing = reinterpret_cast(buf.data()); + routing->nr = static_cast(n); + + auto* entries = reinterpret_cast( + buf.data() + sizeof(struct kvm_irq_routing)); + size_t i = 0; + for (uint32_t gsi : routed_gsis_) { + entries[i].gsi = gsi; + entries[i].type = KVM_IRQ_ROUTING_IRQCHIP; + entries[i].u.irqchip.irqchip = 0; // only VGIC + entries[i].u.irqchip.pin = gsi - 32; // SPI pin is INTID - 32 + ++i; + } + + if (::ioctl(vm_fd_, KVM_SET_GSI_ROUTING, routing) < 0) { + LOG_WARN("kvm: KVM_SET_GSI_ROUTING(n=%zu) failed: %s", + n, strerror(errno)); + return false; + } + return true; +} + +bool KvmVm::RegisterLevelIrqFd(uint32_t gsi, int trigger_fd, int resample_fd) { + // arm64 GSI for KVM_IRQFD is the absolute SPI INTID (>= 32). We must + // explicitly install a KVM_IRQ_ROUTING_IRQCHIP entry mapping gsi -> SPI + // pin before KVM_IRQFD; otherwise the kernel happily accepts the irqfd + // but never delivers the interrupt (no route found). 
+ if (gsi < 32 || gsi > 1019 || trigger_fd < 0) return false; + + { + std::lock_guard lock(irqfd_route_mutex_); + routed_gsis_.insert(gsi); + if (!UpdateIrqRoutingLocked()) { + routed_gsis_.erase(gsi); + return false; + } + } + + struct kvm_irqfd ifd{}; + ifd.fd = static_cast(trigger_fd); + ifd.gsi = gsi; + if (resample_fd >= 0) { + ifd.flags = KVM_IRQFD_FLAG_RESAMPLE; + ifd.resamplefd = static_cast(resample_fd); + } + if (::ioctl(vm_fd_, KVM_IRQFD, &ifd) < 0) { + LOG_WARN("kvm: KVM_IRQFD(gsi=%u trigger=%d resample=%d) failed: %s", + gsi, trigger_fd, resample_fd, strerror(errno)); + std::lock_guard lock(irqfd_route_mutex_); + routed_gsis_.erase(gsi); + UpdateIrqRoutingLocked(); + return false; + } + LOG_INFO("kvm: irqfd registered gsi=%u trigger=%d resample=%d", + gsi, trigger_fd, resample_fd); + return true; +} + +bool KvmVm::UnregisterIrqFd(uint32_t gsi, int trigger_fd) { + if (trigger_fd < 0) return false; + + struct kvm_irqfd ifd{}; + ifd.fd = static_cast(trigger_fd); + ifd.gsi = gsi; + ifd.flags = KVM_IRQFD_FLAG_DEASSIGN; + bool ok = (::ioctl(vm_fd_, KVM_IRQFD, &ifd) == 0); + if (!ok) { + LOG_WARN("kvm: KVM_IRQFD DEASSIGN(gsi=%u trigger=%d) failed: %s", + gsi, trigger_fd, strerror(errno)); + } + + std::lock_guard lock(irqfd_route_mutex_); + routed_gsis_.erase(gsi); + UpdateIrqRoutingLocked(); // best-effort; ignore result on teardown + return ok; +} + +} // namespace kvm diff --git a/src/platform/linux/hypervisor/aarch64/kvm_vm.h b/src/platform/linux/hypervisor/aarch64/kvm_vm.h new file mode 100644 index 0000000..5c5e744 --- /dev/null +++ b/src/platform/linux/hypervisor/aarch64/kvm_vm.h @@ -0,0 +1,102 @@ +#pragma once + +#include "core/vmm/hypervisor_vm.h" +#include +#include +#include +#include + +namespace kvm { + +class KvmVCpu; + +// ARM64 KVM VM backend. 
+// - Uses an in-kernel VGIC created via KVM_CREATE_DEVICE (VGICv3, or the +// VGICv2 fallback reported by UsesGicV2) with the fixed guest layout: +// GICD at 0x08000000 (64 KiB) +// GICR at 0x080A0000 (2 * 64 KiB per vCPU, v3); GICC at 0x08010000 (v2) +// - Relies on in-kernel PSCI v0.2 for SYSTEM_OFF/RESET and secondary CPU +// startup; no userspace PSCI dispatch is needed. +class KvmVm final : public HypervisorVm { +public: + ~KvmVm() override; + + static std::unique_ptr Create(uint32_t cpu_count); + + bool MapMemory(GPA gpa, void* hva, uint64_t size, bool writable) override; + bool UnmapMemory(GPA gpa, uint64_t size) override; + + std::unique_ptr CreateVCpu( + uint32_t index, AddressSpace* addr_space) override; + + void RequestInterrupt(const InterruptRequest& req) override; + + // KVM has an in-kernel VGIC: SPI IRQ lines go through KVM_IRQ_LINE. + bool AssertIrq(uint32_t gsi, bool level) override; + + // Register / unregister a KVM_IRQFD for a level-triggered SPI. + // gsi is the absolute INTID (>= 32). resample_fd may be -1 to fall + // back to edge semantics, but virtio-mmio requires a resample fd. + bool RegisterLevelIrqFd(uint32_t gsi, int trigger_fd, int resample_fd) override; + bool UnregisterIrqFd(uint32_t gsi, int trigger_fd) override; + + void SetGuestMemMap(const GuestMemMap* mem) override { guest_mem_ = mem; } + + // Issue KVM_DEV_ARM_VGIC_CTRL_INIT on the in-kernel VGIC. Must be called + // only after *all* vCPUs have been created via KVM_CREATE_VCPU (KVM + // rejects INIT otherwise). Safe to call multiple times: calls after the + // first success are no-ops that return true. + bool FinalizeVgicInit(); + + int VmFd() const { return vm_fd_; } + int KvmFd() const { return kvm_fd_; } + size_t VcpuMmapSize() const { return vcpu_mmap_size_; } + uint32_t CpuCount() const { return cpu_count_; } + + // True when the fallback VGICv2 path was used (host GIC is v2 and the + // kernel doesn't emulate v3 on top of it — common on Raspberry Pi 5 with + // GIC-400). The machine model needs this to pick the right FDT compat. 
+ bool UsesGicV2() const { return uses_gic_v2_; } + + // GIC layout (shared between v2 and v3 wherever possible). + static constexpr uint64_t kGicDistBase = 0x08000000ULL; + static constexpr uint64_t kGicDistSize = 0x00010000ULL; // 64 KiB + // GICv3: redistributor region (2 * 64 KiB per vCPU). + static constexpr uint64_t kGicRedistBase = 0x080A0000ULL; + static constexpr uint64_t kGicRedistStride = 0x00020000ULL; + // GICv2 CPU interface (directly above GICD, below the v3 redist base). + static constexpr uint64_t kGicV2CpuBase = 0x08010000ULL; + static constexpr uint64_t kGicV2CpuSize = 0x00010000ULL; + +private: + KvmVm() = default; + + bool CreateInKernelVgic(); + bool TryCreateVgicV3(); + bool TryCreateVgicV2(); + + bool UpdateIrqRoutingLocked(); + + int kvm_fd_ = -1; + int vm_fd_ = -1; + int vgic_fd_ = -1; + bool vgic_initialized_ = false; + bool uses_gic_v2_ = false; + uint32_t cpu_count_ = 0; + size_t vcpu_mmap_size_ = 0; + std::mutex vgic_init_mutex_; + + // GSIs (absolute SPI INTIDs) with an active irqfd. KVM requires us to + // program explicit GSI routing on arm64 — there is no default routing + // installed by VGIC creation. We rewrite the full route table every time + // a GSI is added/removed under irqfd_route_mutex_. 
+ std::mutex irqfd_route_mutex_; + std::set routed_gsis_; + + const GuestMemMap* guest_mem_ = nullptr; + + std::mutex slot_mutex_; + uint32_t next_slot_ = 0; +}; + +} // namespace kvm diff --git a/src/platform/linux/hypervisor/x86_64/kvm_vm.cpp b/src/platform/linux/hypervisor/x86_64/kvm_vm.cpp index 6f9996a..fa7ff44 100644 --- a/src/platform/linux/hypervisor/x86_64/kvm_vm.cpp +++ b/src/platform/linux/hypervisor/x86_64/kvm_vm.cpp @@ -130,4 +130,42 @@ bool KvmVm::AssertIrq(uint32_t gsi, bool level) { return true; } +bool KvmVm::RegisterLevelIrqFd(uint32_t gsi, int trigger_fd, int resample_fd) { + // On x86 KVM, the default GSI routing created alongside KVM_CREATE_IRQCHIP + // maps GSI 0..23 onto IOAPIC pins, so no explicit KVM_SET_GSI_ROUTING is + // required for level-triggered virtio-mmio lines. + if (trigger_fd < 0) return false; + + struct kvm_irqfd ifd{}; + ifd.fd = static_cast(trigger_fd); + ifd.gsi = gsi; + if (resample_fd >= 0) { + ifd.flags = KVM_IRQFD_FLAG_RESAMPLE; + ifd.resamplefd = static_cast(resample_fd); + } + if (::ioctl(vm_fd_, KVM_IRQFD, &ifd) < 0) { + LOG_WARN("kvm: KVM_IRQFD(gsi=%u trigger=%d resample=%d) failed: %s", + gsi, trigger_fd, resample_fd, strerror(errno)); + return false; + } + LOG_INFO("kvm: irqfd registered gsi=%u trigger=%d resample=%d", + gsi, trigger_fd, resample_fd); + return true; +} + +bool KvmVm::UnregisterIrqFd(uint32_t gsi, int trigger_fd) { + if (trigger_fd < 0) return false; + + struct kvm_irqfd ifd{}; + ifd.fd = static_cast(trigger_fd); + ifd.gsi = gsi; + ifd.flags = KVM_IRQFD_FLAG_DEASSIGN; + if (::ioctl(vm_fd_, KVM_IRQFD, &ifd) < 0) { + LOG_WARN("kvm: KVM_IRQFD DEASSIGN(gsi=%u trigger=%d) failed: %s", + gsi, trigger_fd, strerror(errno)); + return false; + } + return true; +} + } // namespace kvm diff --git a/src/platform/linux/hypervisor/x86_64/kvm_vm.h b/src/platform/linux/hypervisor/x86_64/kvm_vm.h index 042f17b..d276215 100644 --- a/src/platform/linux/hypervisor/x86_64/kvm_vm.h +++ 
b/src/platform/linux/hypervisor/x86_64/kvm_vm.h @@ -27,6 +27,10 @@ class KvmVm final : public HypervisorVm { // KVM has an in-kernel irqchip: IRQ lines go through KVM_IRQ_LINE. bool AssertIrq(uint32_t gsi, bool level) override; + // Register / unregister a KVM_IRQFD for a level-triggered GSI (IOAPIC pin). + bool RegisterLevelIrqFd(uint32_t gsi, int trigger_fd, int resample_fd) override; + bool UnregisterIrqFd(uint32_t gsi, int trigger_fd) override; + void SetGuestMemMap(const GuestMemMap* mem) override { guest_mem_ = mem; } int VmFd() const { return vm_fd_; } diff --git a/src/platform/linux/vm_platform_linux.cpp b/src/platform/linux/vm_platform_linux.cpp index 38f72b3..6f7f1d1 100644 --- a/src/platform/linux/vm_platform_linux.cpp +++ b/src/platform/linux/vm_platform_linux.cpp @@ -1,6 +1,12 @@ #include "core/vmm/vm_platform.h" #include "platform/linux/hypervisor/kvm_platform.h" +#if defined(__x86_64__) #include "platform/linux/hypervisor/x86_64/kvm_vm.h" +#elif defined(__aarch64__) +#include "platform/linux/hypervisor/aarch64/kvm_vm.h" +#else +#error "Unsupported Linux architecture for KVM backend" +#endif #include "platform/posix/console/posix_console_port.h" #include @@ -20,6 +26,14 @@ uint8_t* VmPlatform::AllocateRam(uint64_t size) { PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (ptr == MAP_FAILED) return nullptr; + + // Hint the kernel to back guest RAM with 2 MiB transparent huge pages. + // Only a hint — kernel silently falls back to 4 KiB pages if the mapping + // edges aren't 2 MiB aligned or if contiguous memory is unavailable, so + // there's no failure path to handle. Reduces stage-2 TLB pressure on + // arm64 and x86 alike. + ::madvise(ptr, size, MADV_HUGEPAGE); + return static_cast(ptr); }