From 5bf790b0e649d0a0bd012be7581a7cb58697150a Mon Sep 17 00:00:00 2001 From: zackees Date: Fri, 17 Apr 2026 20:38:24 -0700 Subject: [PATCH] fix(daemon): set SO_LINGER 0 on accepted sockets to avoid CLOSE_WAIT leaks (partial #32) Set SO_LINGER=0 on the daemon's listening socket before `listen(2)` so that accepted client sockets inherit a zero linger and force an immediate RST on close instead of going through FIN / CLOSE_WAIT / TIME_WAIT. After a hard-kill of the daemon, this prevents the kernel from leaking dangling CLOSE_WAIT state on client sockets that outlives the daemon itself and would otherwise block a fresh instance from re-binding the port. SO_LINGER is inherited from the listener by accept(2) on Linux, macOS, and Windows (AFD.sys), so setting it once on the listener covers every accepted connection without needing to hook axum 0.7's internal accept loop (axum 0.7 hands accept off to hyper-util inside `serve` with no per-connection extension point). Verified with the pre-existing regression test at crates/fbuild-daemon/tests/port_recovery.rs (run with `--ignored`), which hard-kills a daemon holding an open TCP connection and asserts a second daemon can re-bind the port cleanly. Remaining on #32: process containment via the `running-process` crate (Windows Job Object + POSIX process group parent-death). The SetConsoleCtrlHandler piece landed earlier in PR #61. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/fbuild-daemon/src/main.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/crates/fbuild-daemon/src/main.rs b/crates/fbuild-daemon/src/main.rs index c29346ed..24513f54 100644 --- a/crates/fbuild-daemon/src/main.rs +++ b/crates/fbuild-daemon/src/main.rs @@ -407,6 +407,17 @@ fn is_pid_alive(pid: u32) -> bool { /// so it's safe there but not on Windows). See ISSUES.md "Issue B5a". /// * **Unix** — sets `SO_REUSEADDR`, which only permits `TIME_WAIT` /// recovery on this OS family. +/// * **All platforms** — sets `SO_LINGER = 0` on the listening socket so +/// that accepted client sockets inherit a zero linger and force an +/// immediate `RST` on close instead of going through the +/// `FIN / CLOSE_WAIT / TIME_WAIT` dance. After a hard-kill of the +/// daemon, this prevents the kernel from leaking dangling `CLOSE_WAIT` +/// state on client sockets that outlives the daemon itself and would +/// otherwise block a fresh instance from re-binding the port. SO_LINGER +/// is inherited from the listener by `accept(2)` on Linux, macOS, and +/// Windows (AFD.sys), so setting it once on the listener covers every +/// subsequently accepted connection without needing to hook axum 0.7's +/// internal accept loop. See FastLED/fbuild#32. /// /// Bind is retried up to 3 times with 500 ms backoff to handle the brief /// window where a hard-killed previous instance still has kernel TCP @@ -443,6 +454,13 @@ fn bind_listener_with_retry(addr: &str) -> tokio::net::TcpListener { } } + // Force RST on close for accepted client sockets — inherited via + // `accept(2)` on Linux/macOS/Windows. See doc comment above and + // FastLED/fbuild#32. + if let Err(e) = sock.set_linger(Some(std::time::Duration::ZERO)) { + tracing::warn!("failed to set SO_LINGER=0 on listener: {}", e); + } + if let Err(e) = sock.set_nonblocking(true) { eprintln!("failed to set non-blocking: {}", e); std::process::exit(1);