diff --git a/Cargo.lock b/Cargo.lock index e7d1fa86..e18a6c80 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -336,6 +336,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "cfg_aliases" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e" + [[package]] name = "cfg_aliases" version = "0.2.1" @@ -443,6 +449,25 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.21" @@ -635,6 +660,18 @@ dependencies = [ "syn", ] +[[package]] +name = "downcast-rs" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + [[package]] name = "equivalent" version = "1.0.2" @@ -775,11 +812,14 @@ dependencies = [ name = "fbuild-core" version = "2.1.16" dependencies = [ + "libc", + "running-process-core", "serde", "serde_json", "sha2", "tempfile", "thiserror 2.0.18", + "tokio", "tracing", ] @@ -924,6 +964,17 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "filedescriptor" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e40758ed24c9b2eeb76c35fb0aebc66c626084edd827e07e1552279814c6682d" +dependencies = [ + "libc", + "thiserror 1.0.69", + "winapi", +] + [[package]] name = "filetime" version = "0.2.27" @@ -1719,6 +1770,18 @@ dependencies = [ "libc", ] +[[package]] +name = "nix" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab2156c4fce2f8df6c499cc1c763e4394b7482525bf2a9701c9d79d215f519e4" +dependencies = [ + "bitflags 2.11.0", + "cfg-if", + "cfg_aliases 0.1.1", + "libc", +] + [[package]] name = "nix" version = "0.30.1" @@ -1727,7 +1790,7 @@ checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" dependencies = [ "bitflags 2.11.0", "cfg-if", - "cfg_aliases", + "cfg_aliases 0.2.1", "libc", ] @@ -1739,7 +1802,7 @@ checksum = "5d6d0705320c1e6ba1d912b5e37cf18071b6c2e9b7fa8215a1e8a7651966f5d3" dependencies = [ "bitflags 2.11.0", "cfg-if", - "cfg_aliases", + "cfg_aliases 0.2.1", "libc", ] @@ -1752,6 +1815,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "ntapi" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae" +dependencies = [ + "winapi", +] + [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -1871,6 +1943,27 @@ version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" +[[package]] +name = "portable-pty" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4a596a2b3d2752d94f51fac2d4a96737b8705dddd311a32b9af47211f08671e" +dependencies = [ + "anyhow", + "bitflags 1.3.2", + "downcast-rs", + "filedescriptor", + "lazy_static", + "libc", + "log", + "nix 0.28.0", + "serial2", + "shared_library", + "shell-words", + "winapi", + "winreg", +] + [[package]] name = "potential_utf" version = "0.1.5" @@ -2000,7 +2093,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" dependencies = [ "bytes", - "cfg_aliases", + "cfg_aliases 0.2.1", "pin-project-lite", "quinn-proto", "quinn-udp", @@ -2040,7 +2133,7 @@ version = "0.5.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" dependencies = [ - "cfg_aliases", + "cfg_aliases 0.2.1", "libc", "once_cell", "socket2", @@ -2134,6 +2227,26 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rayon" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ -2243,6 +2356,18 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "running-process-core" +version = "3.1.0" +source = "git+https://github.com/zackees/running-process?rev=ff9d7972504f7a0dcee0f410274daf3b02e4fcc2#ff9d7972504f7a0dcee0f410274daf3b02e4fcc2" +dependencies = [ + "libc", + "portable-pty", + "sysinfo", + "thiserror 2.0.18", + "winapi", +] + [[package]] name = "rusqlite" version = "0.31.0" @@ -2446,6 +2571,17 @@ dependencies = [ "serde", ] +[[package]] +name = "serial2" +version = "0.2.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcdbc46aa3882ec3d48ec2b5abcb4f0d863a13d7599265f3faa6d851f23c12f3" +dependencies = [ + "cfg-if", + "libc", + "winapi", +] + [[package]] name = "serialport" version = "4.9.0" @@ -2495,6 +2631,22 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "shared_library" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a9e7e0f2bfae24d8a5b5a66c5b257a83c7412304311512a0c054cd5e619da11" +dependencies = [ + "lazy_static", + "libc", +] + +[[package]] +name = "shell-words" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc6fe69c597f9c37bfeeeeeb33da3530379845f10be461a66d16d03eca2ded77" + [[package]] name = "shlex" version = "1.3.0" @@ -2642,6 +2794,21 @@ dependencies = [ "syn", ] +[[package]] +name = "sysinfo" +version = "0.30.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a5b4ddaee55fb2bea2bf0e5000747e5f5c0de765e5a5ff87f4cd106439f4bb3" +dependencies = [ + "cfg-if", + "core-foundation-sys", + "libc", + "ntapi", + "once_cell", + "rayon", + "windows", +] + [[package]] name = "tap" version = "1.0.1" @@ -3308,6 +3475,22 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + [[package]] name = "winapi-util" version = "0.1.11" @@ -3317,6 +3500,31 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" +dependencies = [ + "windows-core", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-link" version = "0.2.1" @@ -3488,6 +3696,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "winreg" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80d0f4e272c85def139476380b12f9ac60926689dd2e01d4923222f40580869d" +dependencies = [ + "winapi", +] + [[package]] name = "wit-bindgen" version = "0.51.0" diff --git a/Cargo.toml b/Cargo.toml index 8dd2364b..b95b1b57 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -58,3 +58,9 @@ blake3 = "1" mimalloc = "0.1" object = "0.36" rusqlite = { version = "0.31", features = ["bundled"] } + +# Process containment: all subprocess spawns the daemon performs (compilers, +# esptool, qemu, simavr, node, npm, …) and any grandchildren they fork must +# die with the daemon. Published to crates.io is pending; pin a git rev. +# See FastLED/fbuild#32. +running-process-core = { git = "https://github.com/zackees/running-process", rev = "ff9d7972504f7a0dcee0f410274daf3b02e4fcc2", package = "running-process-core" } diff --git a/crates/fbuild-build/src/script_runtime.rs b/crates/fbuild-build/src/script_runtime.rs index f609384a..7b7935c4 100644 --- a/crates/fbuild-build/src/script_runtime.rs +++ b/crates/fbuild-build/src/script_runtime.rs @@ -100,13 +100,22 @@ pub fn resolve_extra_script_overlay( .stdin(std::process::Stdio::null()) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()); - let output = command.output().map_err(|e| { + // Route the spawn through the daemon's containment group so a + // daemon crash mid-evaluation doesn't leave a python child running + // in the background. See FastLED/fbuild#32. + let child = fbuild_core::containment::spawn_contained(&mut command).map_err(|e| { fbuild_core::FbuildError::BuildFailed(format!( "failed to run extra_scripts runtime via '{}': {}", python.join(" "), e )) })?; + let output = child.wait_with_output().map_err(|e| { + fbuild_core::FbuildError::BuildFailed(format!( + "failed to collect extra_scripts runtime output: {}", + e + )) + })?; if !output.status.success() { let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string(); diff --git a/crates/fbuild-build/src/zccache.rs b/crates/fbuild-build/src/zccache.rs index 460d0692..72851083 100644 --- a/crates/fbuild-build/src/zccache.rs +++ b/crates/fbuild-build/src/zccache.rs @@ -127,6 +127,11 @@ fn find_zccache_in_venv(start: &Path, exe_name: &str) -> Option { /// /// This is idempotent — `zccache start` is a no-op when the daemon is up. pub fn ensure_running(zccache: &Path) { + // INTENTIONALLY DETACHED (FastLED/fbuild#32): zccache is itself a + // long-running daemon with independent lifecycle management. `start` + // is a no-op when it's already running, and either way the zccache + // daemon must survive the fbuild daemon — so this spawn stays out + // of the containment group. let mut cmd = std::process::Command::new(zccache); cmd.arg("start") .stdout(std::process::Stdio::null()) diff --git a/crates/fbuild-cli/src/daemon_client.rs b/crates/fbuild-cli/src/daemon_client.rs index 4ecfc5c2..57245495 100644 --- a/crates/fbuild-cli/src/daemon_client.rs +++ b/crates/fbuild-cli/src/daemon_client.rs @@ -907,6 +907,15 @@ async fn spawn_daemon_process() -> fbuild_core::Result<()> { #[cfg(windows)] strip_std_handle_inheritance(); + // INTENTIONALLY DETACHED (FastLED/fbuild#32): the CLI spawns the + // daemon and then exits — the daemon must survive the CLI. The + // daemon in turn installs its own global `ContainedProcessGroup` + // (see fbuild-daemon/src/main.rs) so every descendant it spawns + // dies with *it*. The CLI binary itself has no global containment + // group installed, so this `spawn()` is already uncontained; the + // comment is here so a future refactor doesn't accidentally reroute + // it through `containment::spawn_contained`, which would make the + // daemon die the instant the CLI exits. cmd.spawn().map_err(|e| { fbuild_core::FbuildError::DaemonError(format!( "failed to spawn daemon (is fbuild-daemon in PATH?): {}", diff --git a/crates/fbuild-core/Cargo.toml b/crates/fbuild-core/Cargo.toml index c0c5705d..224710f0 100644 --- a/crates/fbuild-core/Cargo.toml +++ b/crates/fbuild-core/Cargo.toml @@ -12,6 +12,20 @@ tracing = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } sha2 = { workspace = true } +# Process containment primitive (Job Objects on Windows; process groups + +# PR_SET_PDEATHSIG on Linux; process groups on macOS). The single global +# `ContainedProcessGroup` owned by the daemon ensures every child process +# spawned through `fbuild_core::subprocess::run_command` (and the emulator +# helpers) dies with the daemon. See FastLED/fbuild#32. +running-process-core = { workspace = true } +# Async helpers for the tokio integration in `containment::tokio_spawn` — +# we need the `process` feature so `tokio::process::Command` is in scope. +tokio = { workspace = true } + +# `libc::setpgid` / `libc::prctl` for the Unix `pre_exec` hook used by +# `containment::tokio_spawn::configure`. See FastLED/fbuild#32. +[target.'cfg(unix)'.dependencies] +libc = "0.2" [dev-dependencies] tempfile = { workspace = true } diff --git a/crates/fbuild-core/src/containment.rs b/crates/fbuild-core/src/containment.rs new file mode 100644 index 00000000..1f0e4e62 --- /dev/null +++ b/crates/fbuild-core/src/containment.rs @@ -0,0 +1,363 @@ +//! Process containment: every child process spawned by the daemon dies +//! with the daemon, including grandchildren forked by the child. +//! +//! # Why +//! +//! Without containment, a daemon that is SIGKILLed (or whose console +//! window is closed on Windows) leaves behind orphaned compiler / +//! linker / esptool / qemu / simavr / node / npm processes. On Windows +//! those orphans also leak their `bash.exe` / `conhost.exe` / +//! `OpenConsole.exe` wrappers; on Linux/macOS they re-parent to init. +//! +//! # How +//! +//! * **Windows** — a single Job Object with `JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE`. +//! When the daemon process exits for any reason, Windows closes the +//! job handle and kills every assigned process atomically. Children +//! are assigned to the job via `AssignProcessToJobObject` after spawn. +//! * **Linux** — children are placed in a new process group with +//! `setpgid(0, 0)` and request `PR_SET_PDEATHSIG(SIGKILL)` so the +//! kernel sends SIGKILL when the daemon thread exits. On `Drop` the +//! group issues `killpg(SIGKILL)` as a backstop. +//! * **macOS** — process groups only; `prctl` is not available. The +//! drop-time `killpg` is the primary mechanism. +//! +//! # Wiring +//! +//! The daemon binary initialises the process-wide group at startup via +//! [`init_global_containment`]. Every subprocess the daemon spawns goes +//! through: +//! +//! * [`fbuild_core::subprocess::run_command`](super::subprocess::run_command) +//! — the central blocking helper used by compilers, linkers, esptool, +//! avrdude, addr2line, and most emulator setup code. +//! * [`spawn_contained`] — direct `std::process::Command` spawns that +//! don't go through `run_command` (e.g. zccache daemon startup). +//! * [`tokio_spawn::spawn_contained`] — long-running async spawns in +//! the emulator handlers (QEMU, simavr, node/avr8js). +//! +//! Processes that must intentionally outlive the daemon — currently +//! only the daemon itself, spawned by the CLI and PyO3 bindings — use +//! [`spawn_detached`]. +//! +//! Based on the `ContainedProcessGroup` / `Containment` primitives in +//! . See FastLED/fbuild#32. + +use std::process::{Child, Command}; +use std::sync::OnceLock; + +use running_process_core::{ContainedChild, ContainedProcessGroup, Containment}; + +/// Global process-wide containment group. Initialised once by the +/// daemon; remains `None` in non-daemon contexts (CLI binary, tests). +/// +/// When `None`, spawn helpers fall back to uncontained spawning — the +/// same behaviour as before this feature landed. That keeps the CLI +/// and tests working without forcing every binary to manage a job +/// object. +static GLOBAL_GROUP: OnceLock = OnceLock::new(); + +/// Initialise the process-wide containment group. +/// +/// Idempotent: a second call from the same process is a no-op. Call +/// this as early as possible from the daemon's `main` so the group +/// outlives every subprocess the daemon could possibly spawn. +/// +/// The `originator` tag is propagated to child processes via the +/// `RUNNING_PROCESS_ORIGINATOR` env var so orphaned processes can be +/// correlated back to a specific daemon instance after a crash. +pub fn init_global_containment(originator: &str) -> std::io::Result<()> { + if GLOBAL_GROUP.get().is_some() { + return Ok(()); + } + let group = ContainedProcessGroup::with_originator(originator)?; + // OnceLock::set returns Err if another thread raced and set it + // first; in that case the other value is equivalent — silently OK. + let _ = GLOBAL_GROUP.set(group); + Ok(()) +} + +/// True if the global containment group has been initialised in this +/// process. Primarily useful for tests. +pub fn is_initialised() -> bool { + GLOBAL_GROUP.get().is_some() +} + +/// Spawn a `std::process::Command` inside the global containment group. +/// +/// Falls back to uncontained `Command::spawn` when no global group has +/// been initialised (non-daemon binaries). +pub fn spawn_contained(command: &mut Command) -> std::io::Result { + match GLOBAL_GROUP.get() { + Some(group) => { + let ContainedChild { child, .. } = + group.spawn_with_containment(command, Containment::Contained)?; + Ok(child) + } + None => command.spawn(), + } +} + +/// Spawn a `std::process::Command` without containment. Intended for +/// processes that must outlive the daemon (the daemon itself, spawned +/// by the CLI / PyO3 bindings). +pub fn spawn_detached(command: &mut Command) -> std::io::Result { + match GLOBAL_GROUP.get() { + Some(group) => { + let ContainedChild { child, .. } = + group.spawn_with_containment(command, Containment::Detached)?; + Ok(child) + } + None => command.spawn(), + } +} + +// --------------------------------------------------------------------------- +// tokio integration +// --------------------------------------------------------------------------- + +/// Tokio-compatible containment helpers. +/// +/// `tokio::process::Command` doesn't expose its inner +/// `std::process::Command`, so `ContainedProcessGroup::spawn` can't be +/// used directly. The helpers in this module reproduce the same +/// behaviour: +/// +/// * On Unix the `pre_exec` hook (`CommandExt`, which tokio re-exposes) +/// performs the same `setpgid` + `PR_SET_PDEATHSIG` dance the core +/// crate does. +/// * On Windows the child is assigned to a parallel Job Object owned by +/// this module, configured with the same +/// `JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE` flag, so daemon death still +/// kills every tokio-spawned child. +/// +/// Both job handles (the std-path one inside `running-process-core`, +/// and the one here) live for the lifetime of the daemon process and +/// are closed automatically when the daemon exits, triggering +/// kill-on-close for their respective children. +pub mod tokio_spawn { + /// Spawn a `tokio::process::Command` inside the global containment + /// group. Falls back to an uncontained spawn when no group is + /// initialised. + pub fn spawn_contained( + command: &mut tokio::process::Command, + ) -> std::io::Result { + if !super::is_initialised() { + return command.spawn(); + } + configure(command); + let child = command.spawn()?; + post_spawn(&child)?; + Ok(child) + } + + /// Configure a tokio command with the containment pre-spawn hooks. + /// + /// On Unix this installs a `pre_exec` closure. On Windows this is a + /// no-op — the Job Object assignment happens post-spawn in + /// [`post_spawn`]. + pub fn configure(command: &mut tokio::process::Command) { + #[cfg(unix)] + { + if !super::is_initialised() { + return; + } + // SAFETY: the closure only calls async-signal-safe libc + // entries (`setpgid`, `prctl`, `getppid`, `_exit`) and does + // not allocate. + unsafe { + command.pre_exec(|| { + if libc::setpgid(0, 0) == -1 { + return Err(std::io::Error::last_os_error()); + } + #[cfg(target_os = "linux")] + { + if libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) == -1 { + return Err(std::io::Error::last_os_error()); + } + if libc::getppid() == 1 { + // Parent already exited between fork() and + // prctl(); don't orphan this child to init. + libc::_exit(1); + } + } + Ok(()) + }); + } + } + #[cfg(not(unix))] + { + let _ = command; + } + } + + /// Assign the already-spawned child to the containment Job Object + /// (Windows) or no-op (Unix, where `pre_exec` did the work). + pub fn post_spawn(child: &tokio::process::Child) -> std::io::Result<()> { + #[cfg(windows)] + { + if !super::is_initialised() { + return Ok(()); + } + let Some(raw) = child.raw_handle() else { + // Already reaped — nothing to contain. + return Ok(()); + }; + super::windows_job::assign(raw) + } + #[cfg(not(windows))] + { + let _ = child; + Ok(()) + } + } +} + +// --------------------------------------------------------------------------- +// Windows-only: parallel Job Object used for tokio spawns. +// --------------------------------------------------------------------------- +#[cfg(windows)] +mod windows_job { + use std::sync::OnceLock; + + type Handle = *mut std::ffi::c_void; + + /// Wrapper around a raw `HANDLE` that is `Send + Sync`. The handle + /// is owned for the lifetime of the process — we deliberately do + /// not implement `Drop`, since closing the job handle would kill + /// every assigned child. The daemon process exit is what triggers + /// the kill-on-close semantics we want. + #[derive(Debug, Clone, Copy)] + struct JobHandle(Handle); + + unsafe impl Send for JobHandle {} + unsafe impl Sync for JobHandle {} + + static JOB: OnceLock = OnceLock::new(); + + const JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE: u32 = 0x2000; + const JOB_OBJECT_LIMIT_BREAKAWAY_OK: u32 = 0x0800; + const JOB_OBJECT_EXTENDED_LIMIT_INFORMATION: i32 = 9; + + #[repr(C)] + #[derive(Default, Clone, Copy)] + struct IoCounters { + read_operation_count: u64, + write_operation_count: u64, + other_operation_count: u64, + read_transfer_count: u64, + write_transfer_count: u64, + other_transfer_count: u64, + } + + #[repr(C)] + #[derive(Default, Clone, Copy)] + struct JobObjectBasicLimitInformation { + per_process_user_time_limit: i64, + per_job_user_time_limit: i64, + limit_flags: u32, + minimum_working_set_size: usize, + maximum_working_set_size: usize, + active_process_limit: u32, + affinity: usize, + priority_class: u32, + scheduling_class: u32, + } + + #[repr(C)] + #[derive(Default, Clone, Copy)] + struct JobObjectExtendedLimitInformation { + basic_limit_information: JobObjectBasicLimitInformation, + io_info: IoCounters, + process_memory_limit: usize, + job_memory_limit: usize, + peak_process_memory_used: usize, + peak_job_memory_used: usize, + } + + #[link(name = "kernel32")] + extern "system" { + fn CreateJobObjectW(security_attrs: *mut std::ffi::c_void, name: *const u16) -> Handle; + fn SetInformationJobObject( + job: Handle, + info_class: i32, + info: *mut std::ffi::c_void, + info_len: u32, + ) -> i32; + fn AssignProcessToJobObject(job: Handle, process: Handle) -> i32; + } + + fn ensure_job() -> std::io::Result { + if let Some(h) = JOB.get() { + return Ok(h.0); + } + let job = unsafe { CreateJobObjectW(std::ptr::null_mut(), std::ptr::null()) }; + if job.is_null() { + return Err(std::io::Error::last_os_error()); + } + let mut info = JobObjectExtendedLimitInformation::default(); + info.basic_limit_information.limit_flags = + JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE | JOB_OBJECT_LIMIT_BREAKAWAY_OK; + let ok = unsafe { + SetInformationJobObject( + job, + JOB_OBJECT_EXTENDED_LIMIT_INFORMATION, + &mut info as *mut _ as *mut std::ffi::c_void, + std::mem::size_of::() as u32, + ) + }; + if ok == 0 { + return Err(std::io::Error::last_os_error()); + } + let _ = JOB.set(JobHandle(job)); + Ok(JOB.get().expect("just set").0) + } + + pub(super) fn assign(process_handle: Handle) -> std::io::Result<()> { + let job = ensure_job()?; + let ok = unsafe { AssignProcessToJobObject(job, process_handle) }; + if ok == 0 { + Err(std::io::Error::last_os_error()) + } else { + Ok(()) + } + } +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn spawn_contained_without_init_falls_back_to_uncontained() { + // When no global group is installed, the helper should still be + // able to spawn processes — this preserves behaviour for the + // CLI binary and for unit tests. + let mut cmd = if cfg!(windows) { + let mut c = Command::new("cmd"); + c.args(["/C", "echo", "hello"]); + c + } else { + let mut c = Command::new("echo"); + c.arg("hello"); + c + }; + cmd.stdout(std::process::Stdio::null()); + cmd.stderr(std::process::Stdio::null()); + let mut child = spawn_contained(&mut cmd).expect("spawn"); + let _ = child.wait(); + } + + #[test] + fn is_initialised_is_bool() { + // Compile-time sanity check that the accessor exists and returns + // a `bool`. We intentionally do not call + // `init_global_containment` here because the global is shared + // across all tests in this binary. + let _: bool = is_initialised(); + } +} diff --git a/crates/fbuild-core/src/lib.rs b/crates/fbuild-core/src/lib.rs index 37c09e7a..c9472af3 100644 --- a/crates/fbuild-core/src/lib.rs +++ b/crates/fbuild-core/src/lib.rs @@ -8,6 +8,7 @@ pub mod build_log; pub mod compiler_flags; +pub mod containment; pub mod elapsed; pub mod emulator; pub mod response_file; diff --git a/crates/fbuild-core/src/subprocess.rs b/crates/fbuild-core/src/subprocess.rs index 95a20cfc..5db68524 100644 --- a/crates/fbuild-core/src/subprocess.rs +++ b/crates/fbuild-core/src/subprocess.rs @@ -165,12 +165,41 @@ fn strip_msys_env(cmd: &mut Command) { } fn run_no_timeout(mut cmd: Command) -> Result { - let output: Output = cmd.output()?; + // Route the spawn through the process-wide containment group so the + // resulting child (and any grandchildren it forks) dies with the + // daemon. Falls back to an uncontained spawn when the global group + // is not initialised (CLI binary, unit tests). See FastLED/fbuild#32. + let mut child = crate::containment::spawn_contained(&mut cmd)?; + let stdout = child + .stdout + .take() + .map(|mut s| { + let mut buf = Vec::new(); + std::io::Read::read_to_end(&mut s, &mut buf).ok(); + buf + }) + .unwrap_or_default(); + let stderr = child + .stderr + .take() + .map(|mut s| { + let mut buf = Vec::new(); + std::io::Read::read_to_end(&mut s, &mut buf).ok(); + buf + }) + .unwrap_or_default(); + let status = child.wait()?; + let output = Output { + status, + stdout, + stderr, + }; Ok(output_to_tool_output(output)) } fn run_with_timeout(mut cmd: Command, timeout: Duration) -> Result { - let mut child = cmd.spawn()?; + // See `run_no_timeout`: route through containment. + let mut child = crate::containment::spawn_contained(&mut cmd)?; let timeout_ms = timeout.as_millis() as u64; let start = std::time::Instant::now(); diff --git a/crates/fbuild-daemon/Cargo.toml b/crates/fbuild-daemon/Cargo.toml index 2c6050a8..86a409bf 100644 --- a/crates/fbuild-daemon/Cargo.toml +++ b/crates/fbuild-daemon/Cargo.toml @@ -58,3 +58,8 @@ workspace = true [dev-dependencies] fbuild-test-support = { path = "../fbuild-test-support" } + +# libc::kill(pid, SIGKILL) is used by the process-containment integration +# test (tests/process_containment.rs). Unix-only; Windows uses taskkill /F. +[target.'cfg(unix)'.dev-dependencies] +libc = "0.2" diff --git a/crates/fbuild-daemon/src/bin/README.md b/crates/fbuild-daemon/src/bin/README.md new file mode 100644 index 00000000..ce677019 --- /dev/null +++ b/crates/fbuild-daemon/src/bin/README.md @@ -0,0 +1,14 @@ +# Test helper binaries + +Auxiliary binaries used only by the fbuild-daemon integration test suite. +These are never shipped — they exist only so the test driver in +`tests/` can spawn real OS processes and exercise platform-level +behaviour (process containment, Job Objects, process groups) rather +than mocking it. + +## Binaries + +- `containment_harness.rs` — drives the process-containment integration + test (FastLED/fbuild#32). Acts as a parent, child, or grandchild + depending on the role argument passed on the command line; see the + module-level doc-comment in the source. diff --git a/crates/fbuild-daemon/src/bin/containment_harness.rs b/crates/fbuild-daemon/src/bin/containment_harness.rs new file mode 100644 index 00000000..c5596105 --- /dev/null +++ b/crates/fbuild-daemon/src/bin/containment_harness.rs @@ -0,0 +1,120 @@ +//! Integration-test harness for process containment (FastLED/fbuild#32). +//! +//! This binary is only compiled for the `fbuild-daemon` test suite — +//! it is not shipped. It has three roles, selected by `argv[1]`: +//! +//! | role | behaviour | +//! |------------|-------------------------------------------------------------------| +//! | `parent` | Install the global containment group, spawn a contained `child` | +//! | | process, print ` ` to | +//! | | stdout, then sleep forever so the test can hard-kill the parent. | +//! | `child` | Spawn a contained `grandchild`, print its own PID and the | +//! | | grandchild's PID to stdout, then sleep forever. | +//! | `grandchild` | Sleep forever. Exists to prove grandchild-level containment. | +//! +//! The test driver (`tests/process_containment.rs`) parses the PIDs +//! printed by each role and uses OS-specific probes to verify every PID +//! is gone after the parent is killed. + +use std::io::Write; +use std::time::Duration; + +fn main() { + // Slurp the role argument; panic loudly if missing because the test + // harness should always pass one. + let args: Vec = std::env::args().collect(); + let role = args + .get(1) + .map(|s| s.as_str()) + .expect("containment_harness requires a role argument (parent|child|grandchild)"); + + match role { + "parent" => run_parent(), + "child" => run_child(), + "grandchild" => run_grandchild(), + other => panic!("unknown containment_harness role: {other}"), + } +} + +fn run_parent() { + // Install the process-wide containment group. In production this is + // done by the fbuild-daemon binary; here we reproduce the same call + // site so the test exercises real behaviour, not a mock. + fbuild_core::containment::init_global_containment("FBUILD-TEST") + .expect("init_global_containment"); + + // Spawn the child via the contained-spawn helper. + let self_exe = std::env::current_exe().expect("current_exe"); + let mut cmd = std::process::Command::new(&self_exe); + cmd.arg("child") + .stdin(std::process::Stdio::null()) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::null()); + let mut child = fbuild_core::containment::spawn_contained(&mut cmd).expect("spawn child"); + + // The child prints ` \n` to stdout on + // startup. Wait for that single line before announcing our own PIDs. + let mut stdout = child.stdout.take().expect("child stdout"); + let mut buf = Vec::::new(); + let mut byte = [0u8; 1]; + loop { + match std::io::Read::read(&mut stdout, &mut byte) { + Ok(0) => break, + Ok(_) if byte[0] == b'\n' => break, + Ok(_) => buf.push(byte[0]), + Err(_) => break, + } + } + let child_line = String::from_utf8_lossy(&buf).trim().to_string(); + + // Emit ` \n` on stdout — this is the + // protocol the test driver parses. + let line = format!("{} {}\n", std::process::id(), child_line); + std::io::stdout() + .write_all(line.as_bytes()) + .expect("write line"); + std::io::stdout().flush().ok(); + + // Drop the child handle without waiting — containment keeps it + // alive, and we want the handle dropped so the Windows job object + // is the only thing keeping track. + std::mem::forget(child); + + // Sleep forever (capped at 2 minutes so a leaked process dies on + // its own in pathological failure modes). + std::thread::sleep(Duration::from_secs(120)); +} + +fn run_child() { + // Spawn the grandchild inside the same containment group that the + // parent installed — but we haven't installed anything here. The + // grandchild inherits containment transparently on Unix (process + // group membership is inherited) and on Windows (Job Objects + // auto-include descendants because we set BREAKAWAY_OK on the job + // but do not use `CREATE_BREAKAWAY_FROM_JOB` when spawning). + // + // So here we use a plain `spawn()` and the grandchild still ends + // up in the parent's containment group. + let self_exe = std::env::current_exe().expect("current_exe"); + let mut cmd = std::process::Command::new(&self_exe); + cmd.arg("grandchild") + .stdin(std::process::Stdio::null()) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()); + let child = cmd.spawn().expect("spawn grandchild"); + let grandchild_pid = child.id(); + + let line = format!("{} {}\n", std::process::id(), grandchild_pid); + std::io::stdout() + .write_all(line.as_bytes()) + .expect("write line"); + std::io::stdout().flush().ok(); + + // Drop the handle and sleep forever. + std::mem::forget(child); + std::thread::sleep(Duration::from_secs(120)); +} + +fn run_grandchild() { + std::thread::sleep(Duration::from_secs(120)); +} diff --git a/crates/fbuild-daemon/src/handlers/emulator.rs b/crates/fbuild-daemon/src/handlers/emulator.rs index 4bbf06e9..ad7bde81 100644 --- a/crates/fbuild-daemon/src/handlers/emulator.rs +++ b/crates/fbuild-daemon/src/handlers/emulator.rs @@ -538,13 +538,12 @@ pub async fn deploy_avr8js( fn find_node() -> fbuild_core::Result { let node = if cfg!(windows) { "node.exe" } else { "node" }; - match std::process::Command::new(node) - .arg("--version") - .stdout(Stdio::piped()) - .stderr(Stdio::null()) - .output() - { - Ok(output) if output.status.success() => Ok(PathBuf::from(node)), + // Route through fbuild-core's `run_command` so the probe spawn is + // captured by the daemon's containment group (issue #32). The probe + // is short-lived (`node --version`) but a missing binary should + // still bubble up the same way. + match fbuild_core::subprocess::run_command(&[node, "--version"], None, None, None) { + Ok(output) if output.success() => Ok(PathBuf::from(node)), _ => Err(fbuild_core::FbuildError::DeployFailed( "Node.js is required for headless avr8js emulation but 'node' was not found on PATH. \ Install Node.js 18+ from https://nodejs.org/" @@ -662,38 +661,42 @@ fn ensure_avr8js_npm_in(cache_dir: &Path, force_refresh: bool) -> fbuild_core::R })?; let npm = if cfg!(windows) { "npm.cmd" } else { "npm" }; - let output = std::process::Command::new(npm) - .args(["install", "--save", "avr8js@0.21.0"]) - .arg("--prefix") - .arg(cache_dir) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .output() - .map_err(|e| { - fbuild_core::FbuildError::DeployFailed(format!( - "failed to launch 'npm' (for `npm install avr8js@0.21.0 --prefix {}`): {}. \ - Ensure `npm` is installed alongside Node.js and on PATH \ - (https://nodejs.org/). If npm is installed, set \ - {}=1 to force a clean reinstall.", - cache_dir.display(), - e, - REFRESH_EMU_CACHE_ENV - )) - })?; - if !output.status.success() { - let stdout = String::from_utf8_lossy(&output.stdout); - let stderr = String::from_utf8_lossy(&output.stderr); + // Route through `run_command` (which spawns via the daemon's + // containment group) so an `npm install` killed mid-flight doesn't + // leak node processes after the daemon dies. See FastLED/fbuild#32. + let cache_dir_str = cache_dir.to_string_lossy().to_string(); + let output = fbuild_core::subprocess::run_command( + &[ + npm, + "install", + "--save", + "avr8js@0.21.0", + "--prefix", + &cache_dir_str, + ], + None, + None, + None, + ) + .map_err(|e| { + fbuild_core::FbuildError::DeployFailed(format!( + "failed to launch 'npm' (for `npm install avr8js@0.21.0 --prefix {}`): {}. \ + Ensure `npm` is installed alongside Node.js and on PATH \ + (https://nodejs.org/). If npm is installed, set \ + {}=1 to force a clean reinstall.", + cache_dir.display(), + e, + REFRESH_EMU_CACHE_ENV + )) + })?; + if !output.success() { return Err(fbuild_core::FbuildError::DeployFailed(format!( "`npm install avr8js@0.21.0 --prefix {}` exited with status {}.\n\ --- stdout ---\n{}\n--- stderr ---\n{}", cache_dir.display(), - output - .status - .code() - .map(|c| c.to_string()) - .unwrap_or_else(|| "?".to_string()), - stdout.trim_end(), - stderr.trim_end() + output.exit_code, + output.stdout.trim_end(), + output.stderr.trim_end() ))); } @@ -784,12 +787,15 @@ async fn run_avr8js_headless( ); } - let mut child = cmd.spawn().map_err(|e| { - fbuild_core::FbuildError::DeployFailed(format!( - "failed to launch Node.js for avr8js: {}", - e - )) - })?; + // Route through containment (#32) so a daemon crash mid-emulation + // takes node.exe and any helper processes it spawned with it. + let mut child = + fbuild_core::containment::tokio_spawn::spawn_contained(&mut cmd).map_err(|e| { + fbuild_core::FbuildError::DeployFailed(format!( + "failed to launch Node.js for avr8js: {}", + e + )) + })?; let stdout = child.stdout.take().ok_or_else(|| { fbuild_core::FbuildError::DeployFailed("failed to capture avr8js stdout".to_string()) @@ -1014,14 +1020,18 @@ async fn run_qemu_process( tracing::info!("{}: {} {}", label, qemu_path.display(), args.join(" ")); } - let mut child = cmd.spawn().map_err(|e| { - fbuild_core::FbuildError::DeployFailed(build_linux_macos_qemu_hint(&format!( - "failed to launch {} at {}: {}", - label, - qemu_path.display(), - e - ))) - })?; + // Route through containment (#32). QEMU is a long-running process + // — a daemon hard-kill mid-emulation must not leave `qemu-system-*` + // or its `conhost.exe` wrapper behind. + let mut child = + fbuild_core::containment::tokio_spawn::spawn_contained(&mut cmd).map_err(|e| { + fbuild_core::FbuildError::DeployFailed(build_linux_macos_qemu_hint(&format!( + "failed to launch {} at {}: {}", + label, + qemu_path.display(), + e + ))) + })?; let stdout = child.stdout.take().ok_or_else(|| { fbuild_core::FbuildError::DeployFailed(format!("failed to capture {} stdout", label)) @@ -1746,13 +1756,10 @@ fn find_simavr() -> fbuild_core::Result { } else { "simavr" }; - // Try running simavr to verify it exists - match std::process::Command::new(simavr) - .arg("--help") - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .output() - { + // Try running simavr to verify it exists; route through containment + // (issue #32). This is a short-lived probe so the containment + // difference is purely consistency. + match fbuild_core::subprocess::run_command(&[simavr, "--help"], None, None, None) { Ok(_) => Ok(PathBuf::from(simavr)), Err(_) => { let install_hint = if cfg!(target_os = "linux") { diff --git a/crates/fbuild-daemon/src/main.rs b/crates/fbuild-daemon/src/main.rs index 24513f54..828c1c01 100644 --- a/crates/fbuild-daemon/src/main.rs +++ b/crates/fbuild-daemon/src/main.rs @@ -34,6 +34,17 @@ async fn main() { unsafe { std::env::set_var("FBUILD_DEV_MODE", "1") }; } + // Install the process-wide containment group as early as possible so + // every subprocess the daemon spawns (compilers, linkers, esptool, + // avrdude, qemu, simavr, node, npm, …) is born inside a Windows Job + // Object (JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE) or a Unix process + // group with PR_SET_PDEATHSIG. When the daemon dies for any reason + // — SIGKILL, power loss, crash, console window close — the OS + // reaps every descendant in the group. See FastLED/fbuild#32. + if let Err(e) = fbuild_core::containment::init_global_containment("FBUILD-DAEMON") { + eprintln!("warning: failed to install process containment: {}", e); + } + let port = args.port.unwrap_or_else(fbuild_paths::get_daemon_port); tracing_subscriber::fmt() diff --git a/crates/fbuild-daemon/tests/process_containment.rs b/crates/fbuild-daemon/tests/process_containment.rs new file mode 100644 index 00000000..4dd24a85 --- /dev/null +++ b/crates/fbuild-daemon/tests/process_containment.rs @@ -0,0 +1,182 @@ +//! Integration test for process containment (FastLED/fbuild#32). +//! +//! Spawns the `containment_harness` binary in `parent` mode. The parent +//! installs the global `ContainedProcessGroup`, spawns a contained +//! child, and the child spawns a grandchild. The parent writes +//! ` \n` to stdout, then sleeps. +//! +//! The test driver then hard-kills **only** the parent. Thanks to +//! containment: +//! +//! * On **Windows** the Job Object's `JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE` +//! semantics fire the moment the parent's process handle is reaped and +//! the job handle goes away, killing every assigned descendant. +//! * On **Linux** the kernel's `PR_SET_PDEATHSIG(SIGKILL)` on each child +//! and the drop-time `killpg(SIGKILL)` backstop kill the group. +//! * On **macOS** the drop-time `killpg(SIGKILL)` kills the group. +//! +//! The test polls for the child and grandchild PIDs and asserts both +//! are gone within a few seconds. +//! +//! This test is marked `#[ignore]` because it: +//! 1. Hard-kills processes, which CI runners can flag as noisy. +//! 2. Requires the `containment_harness` binary, which is built on +//! demand by `CARGO_BIN_EXE_*`. +//! +//! Run explicitly with: +//! ```bash +//! uv run cargo test -p fbuild-daemon --test process_containment -- --ignored +//! ``` + +use std::io::{BufRead, BufReader}; +use std::process::{Command, Stdio}; +use std::time::{Duration, Instant}; + +#[test] +#[ignore = "spawns real subprocesses and issues hard-kills; run with --ignored"] +fn daemon_children_die_when_daemon_dies() { + let harness = env!("CARGO_BIN_EXE_containment_harness"); + + // Start the parent role. + let mut parent = Command::new(harness) + .arg("parent") + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::null()) + .spawn() + .expect("spawn parent"); + + // Read one line of ` \n` from the + // parent's stdout. The line is emitted only after the grandchild + // has been spawned, so when we have the three PIDs we know the + // whole tree is live. + let stdout = parent.stdout.take().expect("parent stdout"); + let mut reader = BufReader::new(stdout); + let mut line = String::new(); + let read = reader.read_line(&mut line).expect("read parent line"); + assert!(read > 0, "parent did not emit PID line"); + + let pids: Vec = line + .split_whitespace() + .map(|s| s.parse::().expect("pid parse")) + .collect(); + assert_eq!( + pids.len(), + 3, + "expected three PIDs (parent child grandchild), got {:?}", + pids + ); + let parent_pid = pids[0]; + let child_pid = pids[1]; + let grandchild_pid = pids[2]; + + // Sanity: every pid must be alive *right now*. + assert!( + pid_alive(child_pid), + "child {} is not alive before kill", + child_pid + ); + assert!( + pid_alive(grandchild_pid), + "grandchild {} is not alive before kill", + grandchild_pid + ); + + // Hard-kill the parent. + kill_hard(parent_pid).expect("hard-kill parent"); + + // Wait for the parent to be reaped. This is necessary on Windows + // because the Job Object's kill-on-close only fires after the job + // handle goes away, which requires the parent process to have fully + // exited and its HANDLE to be closed by the test driver's `Child`. + let _ = parent.wait(); + + // Poll for up to 10 s: after containment fires, both child and + // grandchild must be gone. + let deadline = Instant::now() + Duration::from_secs(10); + loop { + let child_gone = !pid_alive(child_pid); + let grand_gone = !pid_alive(grandchild_pid); + if child_gone && grand_gone { + return; // success + } + if Instant::now() >= deadline { + panic!( + "containment failed: child {} alive={}, grandchild {} alive={}", + child_pid, + pid_alive(child_pid), + grandchild_pid, + pid_alive(grandchild_pid), + ); + } + std::thread::sleep(Duration::from_millis(100)); + } +} + +// --------------------------------------------------------------------------- +// OS-specific PID probes and hard-kill +// --------------------------------------------------------------------------- + +#[cfg(unix)] +fn pid_alive(pid: u32) -> bool { + // `kill(pid, 0)` is a probe — returns 0 if the pid exists and we + // have permission, -1/ESRCH otherwise. + unsafe { libc::kill(pid as i32, 0) == 0 } +} + +#[cfg(windows)] +fn pid_alive(pid: u32) -> bool { + // OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION) succeeds for any + // running process; fails for a dead / non-existent PID. Also check + // the exit code — a handle to a process that has exited but not + // yet been reaped will still open successfully but report exited. + type Handle = *mut std::ffi::c_void; + const PROCESS_QUERY_LIMITED_INFORMATION: u32 = 0x1000; + const STILL_ACTIVE: u32 = 259; + #[link(name = "kernel32")] + extern "system" { + fn OpenProcess(desired_access: u32, inherit_handle: i32, process_id: u32) -> Handle; + fn CloseHandle(handle: Handle) -> i32; + fn GetExitCodeProcess(handle: Handle, exit_code: *mut u32) -> i32; + } + unsafe { + let h = OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION, 0, pid); + if h.is_null() { + return false; + } + let mut code: u32 = 0; + let ok = GetExitCodeProcess(h, &mut code as *mut u32); + CloseHandle(h); + ok != 0 && code == STILL_ACTIVE + } +} + +#[cfg(unix)] +fn kill_hard(pid: u32) -> std::io::Result<()> { + let rc = unsafe { libc::kill(pid as i32, libc::SIGKILL) }; + if rc == 0 { + Ok(()) + } else { + Err(std::io::Error::last_os_error()) + } +} + +#[cfg(windows)] +fn kill_hard(pid: u32) -> std::io::Result<()> { + // `taskkill /F` is the standard Windows hard-kill and works for + // arbitrary PIDs without DLL shenanigans. + let status = Command::new("taskkill") + .args(["/F", "/PID", &pid.to_string()]) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .status()?; + if status.success() { + Ok(()) + } else { + Err(std::io::Error::other(format!( + "taskkill /F /PID {} exited with {:?}", + pid, + status.code() + ))) + } +} diff --git a/crates/fbuild-packages/src/disk_cache/budget.rs b/crates/fbuild-packages/src/disk_cache/budget.rs index 2d9d9651..0cda75c0 100644 --- a/crates/fbuild-packages/src/disk_cache/budget.rs +++ b/crates/fbuild-packages/src/disk_cache/budget.rs @@ -119,48 +119,45 @@ fn get_total_disk_space(path: &Path) -> u64 { #[cfg(windows)] fn get_total_disk_space_windows(path: &Path) -> u64 { - use std::process::Command; - // Use PowerShell to get disk info + // Use PowerShell to get disk info. Route through fbuild-core's + // `run_command` so the probe spawn is captured by the daemon's + // containment group (issue #32) — a daemon crash mid-GC must not + // leak a `powershell.exe` or its `conhost.exe` wrapper. let path_str = path.to_string_lossy(); let drive = if path_str.len() >= 2 && path_str.as_bytes()[1] == b':' { &path_str[..2] } else { "C:" }; - - Command::new("powershell") - .args([ - "-NoProfile", - "-Command", - &format!( - "(Get-PSDrive -Name '{}').Used + (Get-PSDrive -Name '{}').Free", - &drive[..1], - &drive[..1] - ), - ]) - .output() - .ok() - .and_then(|o| { - String::from_utf8_lossy(&o.stdout) - .trim() - .parse::() - .ok() - }) - .unwrap_or(500 * 1024 * 1024 * 1024) // fallback 500 GB + let ps_cmd = format!( + "(Get-PSDrive -Name '{}').Used + (Get-PSDrive -Name '{}').Free", + &drive[..1], + &drive[..1] + ); + fbuild_core::subprocess::run_command( + &["powershell", "-NoProfile", "-Command", &ps_cmd], + None, + None, + None, + ) + .ok() + .and_then(|o| o.stdout.trim().parse::().ok()) + .unwrap_or(500 * 1024 * 1024 * 1024) // fallback 500 GB } #[cfg(unix)] fn get_total_disk_space_unix(path: &Path) -> u64 { - use std::process::Command; // Use `df -Pk` for portable 1024-byte block output. - // `-P` gives POSIX format, `-k` forces 1024-byte blocks on all platforms. - let output = Command::new("df").args(["-P", "-k"]).arg(path).output(); + // `-P` gives POSIX format, `-k` forces 1024-byte blocks on all + // platforms. Route through the containment group (issue #32). + let path_str = path.to_string_lossy(); + let output = + fbuild_core::subprocess::run_command(&["df", "-P", "-k", &path_str], None, None, None); output .ok() .and_then(|o| { - let stdout = String::from_utf8_lossy(&o.stdout); - stdout + o.stdout .lines() .nth(1) // skip header .and_then(|line| { diff --git a/crates/fbuild-python/src/lib.rs b/crates/fbuild-python/src/lib.rs index f843250e..de732f24 100644 --- a/crates/fbuild-python/src/lib.rs +++ b/crates/fbuild-python/src/lib.rs @@ -616,6 +616,12 @@ impl Daemon { } } + // INTENTIONALLY DETACHED (FastLED/fbuild#32): the Python host + // spawns the daemon and then the Python interpreter may exit — + // the daemon must survive. This PyO3 binding runs inside the + // Python interpreter process, which has no global containment + // group, so `spawn()` is already uncontained; see the matching + // comment in fbuild-cli/src/daemon_client.rs. let mut cmd = std::process::Command::new("fbuild-daemon"); if fbuild_paths::is_dev_mode() { cmd.arg("--dev"); @@ -1557,6 +1563,8 @@ impl AsyncDaemon { } } + // INTENTIONALLY DETACHED (FastLED/fbuild#32): see the + // matching comment in `Daemon::ensure_running` above. let mut cmd = std::process::Command::new("fbuild-daemon"); if dev_mode { cmd.arg("--dev");