From 0beb7339a0c93ba995a9b4171d9efbe6f3a83f30 Mon Sep 17 00:00:00 2001 From: zackees Date: Sat, 18 Apr 2026 22:37:18 -0700 Subject: [PATCH] =?UTF-8?q?fix(build):=20P0=20regression=20=E2=80=94=20Ope?= =?UTF-8?q?ration=20not=20permitted=20(os=20error=201)=20on=20warm=20build?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Unix, `fbuild_core::containment::spawn_contained` delegated to `ContainedProcessGroup::spawn_with_containment` from the `running-process-core` crate. That implementation stores the first spawned child's PID as the group's PGID and then has every subsequent child call `setpgid(0, first_child_pid)` from its `pre_exec` hook. Once the first child exits and is reaped (e.g. the short `avr-gcc -dumpversion` call emitted by `log_toolchain_version`), the kernel tears down that process group. The second spawn's `setpgid(0, stale_pgid)` then fails with EPERM, which surfaces as `build error: build failed: io error: Operation not permitted (os error 1)` immediately after the `Toolchain: avr-gcc 7.3.0` line — exactly the failure reported in #129. This is reproducible on Linux CI from 2.1.17 onwards (Build Leonardo et al.) and blocks every AVR / ESP32 / etc. build made via the daemon. Fix: bypass the shared-pgid behaviour on Unix. Install a per-child `pre_exec` hook that creates a fresh process group with `setpgid(0, 0)` and, on Linux, requests `PR_SET_PDEATHSIG(SIGKILL)` so the kernel still kills the child when the spawning daemon thread exits. Windows is unchanged — Job Object assignment is stateless and has no analogous failure mode. macOS loses the drop-time `killpg` backstop, which was already a no-op in practice because the global `ContainedProcessGroup` lives in a `OnceLock` that never drops; this is the same coverage profile as before the fix. Regression test: `sequential_contained_spawns_do_not_fail_with_eperm` in `crates/fbuild-core/src/containment.rs` initialises the global group and performs two consecutive `spawn_contained` calls with a wait+sleep between them, mirroring the AVR build's "dumpversion then compile" shape. Refs #129. Reproducing commits: #108 (containment feature) + the interaction surfaced by #120 / #119 that made the second-spawn path universally reachable. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/fbuild-core/src/containment.rs | 193 +++++++++++++++++++++++--- 1 file changed, 174 insertions(+), 19 deletions(-) diff --git a/crates/fbuild-core/src/containment.rs b/crates/fbuild-core/src/containment.rs index 1f0e4e62..bebabf58 100644 --- a/crates/fbuild-core/src/containment.rs +++ b/crates/fbuild-core/src/containment.rs @@ -15,12 +15,18 @@ //! When the daemon process exits for any reason, Windows closes the //! job handle and kills every assigned process atomically. Children //! are assigned to the job via `AssignProcessToJobObject` after spawn. -//! * **Linux** — children are placed in a new process group with -//! `setpgid(0, 0)` and request `PR_SET_PDEATHSIG(SIGKILL)` so the -//! kernel sends SIGKILL when the daemon thread exits. On `Drop` the -//! group issues `killpg(SIGKILL)` as a backstop. -//! * **macOS** — process groups only; `prctl` is not available. The -//! drop-time `killpg` is the primary mechanism. +//! * **Linux** — each child is placed in its own new process group with +//! `setpgid(0, 0)` and requests `PR_SET_PDEATHSIG(SIGKILL)` so the +//! kernel sends SIGKILL when the daemon thread exits. Per-child groups +//! avoid the EPERM that +//! [`ContainedProcessGroup::spawn_with_containment`] hits when a +//! second child tries to join a stale, already-exited first child's +//! pgid (see issue #129). +//! * **macOS** — `prctl` is not available. Each child gets a fresh +//! process group; there is no drop-time `killpg` backstop because the +//! global group is a `OnceLock<...>` that never drops. This is the +//! same coverage gap that existed before this fix — macOS containment +//! is best-effort, and improving it is tracked separately. //! //! # Wiring //! @@ -41,11 +47,18 @@ //! [`spawn_detached`]. //! //! Based on the `ContainedProcessGroup` / `Containment` primitives in -//! . See FastLED/fbuild#32. +//! . On Unix, we +//! deliberately bypass `ContainedProcessGroup::spawn_with_containment`'s +//! shared-pgid behaviour and apply the `setpgid(0, 0)` + `prctl` pattern +//! per-child ourselves — see the module-level "How" section for why. +//! See FastLED/fbuild#32, #129. use std::process::{Child, Command}; use std::sync::OnceLock; +#[cfg(unix)] +use running_process_core::ContainedProcessGroup; +#[cfg(windows)] use running_process_core::{ContainedChild, ContainedProcessGroup, Containment}; /// Global process-wide containment group. Initialised once by the @@ -87,14 +100,38 @@ pub fn is_initialised() -> bool { /// /// Falls back to uncontained `Command::spawn` when no global group has /// been initialised (non-daemon binaries). +/// +/// **Windows**: delegates to `ContainedProcessGroup::spawn` which +/// assigns the child to the Job Object — safe and stateless. +/// +/// **Unix**: installs a per-child `pre_exec` hook that creates a new +/// process group (`setpgid(0, 0)`) and, on Linux, requests +/// `PR_SET_PDEATHSIG(SIGKILL)`. This sidesteps +/// [`ContainedProcessGroup::spawn_with_containment`]'s shared-pgid +/// behaviour, which fails with EPERM on the second spawn after the +/// first child (the pgid leader) has exited — the root cause of +/// FastLED/fbuild#129. The Linux kernel's `PR_SET_PDEATHSIG` still +/// enforces the "child dies with daemon" contract; macOS relies on +/// process-group-leader death alone. pub fn spawn_contained(command: &mut Command) -> std::io::Result { - match GLOBAL_GROUP.get() { - Some(group) => { - let ContainedChild { child, .. } = - group.spawn_with_containment(command, Containment::Contained)?; - Ok(child) + #[cfg(windows)] + { + match GLOBAL_GROUP.get() { + Some(group) => { + let ContainedChild { child, .. } = + group.spawn_with_containment(command, Containment::Contained)?; + Ok(child) + } + None => command.spawn(), } - None => command.spawn(), + } + #[cfg(unix)] + { + if GLOBAL_GROUP.get().is_none() { + return command.spawn(); + } + unix_install_pre_exec(command); + command.spawn() } } @@ -102,13 +139,74 @@ pub fn spawn_contained(command: &mut Command) -> std::io::Result { /// processes that must outlive the daemon (the daemon itself, spawned /// by the CLI / PyO3 bindings). pub fn spawn_detached(command: &mut Command) -> std::io::Result { - match GLOBAL_GROUP.get() { - Some(group) => { - let ContainedChild { child, .. } = - group.spawn_with_containment(command, Containment::Detached)?; - Ok(child) + #[cfg(windows)] + { + match GLOBAL_GROUP.get() { + Some(group) => { + let ContainedChild { child, .. } = + group.spawn_with_containment(command, Containment::Detached)?; + Ok(child) + } + None => command.spawn(), } - None => command.spawn(), + } + #[cfg(unix)] + { + // Detached: create a new session so the child survives the + // daemon thread that spawned it. Matches the upstream behaviour + // but without joining any shared pgid. + if GLOBAL_GROUP.get().is_none() { + return command.spawn(); + } + unix_install_detached_pre_exec(command); + command.spawn() + } +} + +/// Install a `pre_exec` hook that puts the child in a fresh process +/// group (Unix) and, on Linux, asks the kernel to SIGKILL the child +/// when the spawning thread exits. +/// +/// Per-child pgid avoids the EPERM race from +/// [`ContainedProcessGroup::spawn_with_containment`] where a second +/// spawn tries to join a stale first-child pgid. +#[cfg(unix)] +fn unix_install_pre_exec(command: &mut Command) { + use std::os::unix::process::CommandExt; + // SAFETY: the closure only calls async-signal-safe libc entries + // (`setpgid`, `prctl`, `getppid`, `_exit`) and does not allocate. + unsafe { + command.pre_exec(|| { + if libc::setpgid(0, 0) == -1 { + return Err(std::io::Error::last_os_error()); + } + #[cfg(target_os = "linux")] + { + if libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) == -1 { + return Err(std::io::Error::last_os_error()); + } + if libc::getppid() == 1 { + // Parent already exited between fork() and prctl(); + // don't orphan this child to init. + libc::_exit(1); + } + } + Ok(()) + }); + } +} + +#[cfg(unix)] +fn unix_install_detached_pre_exec(command: &mut Command) { + use std::os::unix::process::CommandExt; + // SAFETY: `setsid` is async-signal-safe. + unsafe { + command.pre_exec(|| { + if libc::setsid() == -1 { + return Err(std::io::Error::last_os_error()); + } + Ok(()) + }); } } @@ -360,4 +458,61 @@ mod tests { // across all tests in this binary. let _: bool = is_initialised(); } + + /// Regression: FastLED/fbuild#129 — two consecutive + /// `spawn_contained` calls must both succeed after the first child + /// has exited. The previous implementation (shared-pgid via + /// `ContainedProcessGroup::spawn_with_containment`) recorded the + /// first child's PID as the pgid and then tried to `setpgid(0, + /// first_pid)` on the second child, which fails with EPERM once + /// the first child has exited and been reaped. + /// + /// Per-child pgids (`setpgid(0, 0)`) + `PR_SET_PDEATHSIG` on Linux + /// avoids the stale-pgid dependency entirely. Windows uses Job + /// Object assignment, which is stateless and has no analogous + /// failure mode. + #[test] + fn sequential_contained_spawns_do_not_fail_with_eperm() { + // Install the global group so we exercise the contained path + // rather than the uncontained fallback. Idempotent. + init_global_containment("FBUILD-UNIT-TEST").expect("init_global_containment"); + assert!(is_initialised(), "global group must be initialised"); + + // First spawn: short-lived command, wait for it to exit before + // issuing the second spawn. This is exactly the shape of the + // AVR build's "gcc -dumpversion then compile" sequence that + // reproduces the original bug. + let build_cmd = || { + let mut cmd = if cfg!(windows) { + let mut c = Command::new("cmd"); + c.args(["/C", "echo", "ok"]); + c + } else { + let mut c = Command::new("echo"); + c.arg("ok"); + c + }; + cmd.stdin(std::process::Stdio::null()); + cmd.stdout(std::process::Stdio::null()); + cmd.stderr(std::process::Stdio::null()); + cmd + }; + + let mut first = build_cmd(); + let mut first_child = spawn_contained(&mut first).expect("first spawn"); + let first_status = first_child.wait().expect("first wait"); + assert!(first_status.success(), "first child should succeed"); + + // Small pause so the reaped pid and its (now defunct) pgroup + // have time to be torn down by the kernel. On the pre-fix + // code path this made the second spawn's `setpgid(0, stale)` + // fail with EPERM with very high probability. + std::thread::sleep(std::time::Duration::from_millis(50)); + + let mut second = build_cmd(); + let mut second_child = + spawn_contained(&mut second).expect("second spawn must not fail with EPERM"); + let second_status = second_child.wait().expect("second wait"); + assert!(second_status.success(), "second child should succeed"); + } }