From b374438cef23d440bff3cf92718054094793b14c Mon Sep 17 00:00:00 2001 From: zackees Date: Sat, 18 Apr 2026 20:01:40 -0700 Subject: [PATCH] perf(daemon): watch-set cache hit/miss telemetry (closes #123) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `AtomicU64` counters on `DaemonWatchSetCache` (hits / misses / stale_evictions / puts) that drive two observability surfaces: 1. **`tracing::debug!(target: "fbuild_daemon::watch_set_cache", …)`** on every `get` — one line per lookup tagged `outcome=hit|miss|stale` with key, age, and configured max_age. Filter-ready for anyone trying to diagnose a suspected stale-hit or cache-cold scenario without having to rebuild with a debug macro. 2. **`DaemonInfoResponse::watch_set_cache: WatchSetCacheStats`** on `/api/daemon/info` — aggregate snapshot of the counters plus the currently-configured window. A misconfigured `FBUILD_WATCH_SET_CACHE_SECS` (#122) is otherwise invisible at the CLI because a cache-miss looks identical to a pre-cache cold run. `stale_evictions` is reported separately from `misses` so operators can distinguish "window is too short" from "cache is cold / key is new" — the two failure modes warrant different responses and collapsing them throws away the signal. Counters are lock-free reads/writes on the fast path (single atomic fetch-add per get/put) so the observability cost is negligible. ## Tests - `stats_track_each_outcome` — table-driven scenario walks through miss, put, hit, and stale-eviction, asserting each counter advances exactly once without bleeding into a sibling. - `stats_reports_configured_window` — round-trips `max_age_secs` through `with_max_age` so operators can confirm which window is actually in effect. All 128 `fbuild-daemon` lib tests pass; `cargo clippy --workspace --all-targets -- -D warnings` clean. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/fbuild-daemon/src/handlers/health.rs | 1 + crates/fbuild-daemon/src/models.rs | 9 ++ crates/fbuild-daemon/src/watch_set_cache.rs | 125 +++++++++++++++++++- 3 files changed, 132 insertions(+), 3 deletions(-) diff --git a/crates/fbuild-daemon/src/handlers/health.rs b/crates/fbuild-daemon/src/handlers/health.rs index 80fac86b..3065fbc8 100644 --- a/crates/fbuild-daemon/src/handlers/health.rs +++ b/crates/fbuild-daemon/src/handlers/health.rs @@ -58,6 +58,7 @@ pub async fn daemon_info(State(ctx): State>) -> Json, } /// GET / (root endpoint) @@ -687,6 +693,7 @@ mod tests { source_mtime: 1700000000.0, spawner_cwd: "/home/user/project".into(), mcp_url: "http://127.0.0.1:8765/mcp".into(), + watch_set_cache: None, }; let json = serde_json::to_string(&resp).unwrap(); assert!(json.contains("\"started_at\"")); @@ -725,6 +732,7 @@ mod tests { source_mtime: 0.0, spawner_cwd: "unknown".into(), mcp_url: "http://127.0.0.1:8765/mcp".into(), + watch_set_cache: None, }; let json = serde_json::to_string(&resp).unwrap(); assert!(!json.contains("current_operation")); @@ -750,6 +758,7 @@ mod tests { source_mtime: 0.0, spawner_cwd: "unknown".into(), mcp_url: "http://127.0.0.1:8765/mcp".into(), + watch_set_cache: None, }; let json = serde_json::to_string(&resp).unwrap(); assert!(json.contains("\"current_operation\"")); diff --git a/crates/fbuild-daemon/src/watch_set_cache.rs b/crates/fbuild-daemon/src/watch_set_cache.rs index cb32ac5f..23f5bf62 100644 --- a/crates/fbuild-daemon/src/watch_set_cache.rs +++ b/crates/fbuild-daemon/src/watch_set_cache.rs @@ -33,12 +33,32 @@ use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; +use std::sync::atomic::{AtomicU64, Ordering}; use std::time::{Duration, Instant}; use dashmap::DashMap; use fbuild_build::build_fingerprint::WatchSetStampCache; use fbuild_build::zccache::FingerprintWatch; +/// Aggregate counters the daemon exposes on `/api/daemon/info` so +/// operators can 
verify the watch-set cache is actually serving +/// hits in production — the sub-1 s warm-deploy budget leans on the +/// hit path, and a silently-misconfigured cache looks identical to +/// the slow path at the CLI. +#[derive(Debug, Clone, Copy, serde::Serialize)] +pub struct WatchSetCacheStats { + /// Freshness window currently in effect (seconds). + pub max_age_secs: u64, + /// `get` calls that returned a fresh entry. + pub hits: u64, + /// `get` calls that found no entry for the key. + pub misses: u64, + /// `get` calls that found an entry but it was past `max_age`. + pub stale_evictions: u64, + /// `put` calls made (a successful fast-path walk + store). + pub puts: u64, +} + /// Default freshness window for cache entries. Short enough that a /// user editing a file and immediately re-building still triggers /// the real walk (modulo edit speed), long enough to cover the @@ -47,10 +67,15 @@ use fbuild_build::zccache::FingerprintWatch; pub const DEFAULT_FRESHNESS: Duration = Duration::from_secs(2); /// In-memory cache. Cheap to clone via `Arc` because the only -/// state is a `DashMap`. +/// state is a `DashMap`. Counter fields are `AtomicU64` so the +/// `get` / `put` calls stay lock-free on the fast path. pub struct DaemonWatchSetCache { inner: DashMap, max_age: Duration, + hits: AtomicU64, + misses: AtomicU64, + stale_evictions: AtomicU64, + puts: AtomicU64, } impl Default for DaemonWatchSetCache { @@ -68,6 +93,23 @@ impl DaemonWatchSetCache { Self { inner: DashMap::new(), max_age, + hits: AtomicU64::new(0), + misses: AtomicU64::new(0), + stale_evictions: AtomicU64::new(0), + puts: AtomicU64::new(0), + } + } + + /// Snapshot the live counters — used by `/api/daemon/info` to + /// expose cache observability without holding any lock on the + /// hot path (each load is a single atomic read). 
+ pub fn stats(&self) -> WatchSetCacheStats { + WatchSetCacheStats { + max_age_secs: self.max_age.as_secs(), + hits: self.hits.load(Ordering::Relaxed), + misses: self.misses.load(Ordering::Relaxed), + stale_evictions: self.stale_evictions.load(Ordering::Relaxed), + puts: self.puts.load(Ordering::Relaxed), } } @@ -90,23 +132,51 @@ impl DaemonWatchSetCache { impl WatchSetStampCache for DaemonWatchSetCache { fn get(&self, watches: &[FingerprintWatch]) -> Option { let key = key_for(watches); - let entry = self.inner.get(&key)?; + let Some(entry) = self.inner.get(&key) else { + self.misses.fetch_add(1, Ordering::Relaxed); + tracing::debug!( + target: "fbuild_daemon::watch_set_cache", + outcome = "miss", + key, + "watch-set cache lookup" + ); + return None; + }; let (hash, set_at) = (entry.0.clone(), entry.1); drop(entry); - if set_at.elapsed() >= self.max_age { + let age = set_at.elapsed(); + if age >= self.max_age { // Lazy eviction so a stale entry doesn't keep memory // pinned indefinitely; the next put would have replaced // it anyway, but explicit removal helps a long-idle // daemon. 
self.inner.remove(&key); + self.stale_evictions.fetch_add(1, Ordering::Relaxed); + tracing::debug!( + target: "fbuild_daemon::watch_set_cache", + outcome = "stale", + key, + age_ms = age.as_millis() as u64, + max_age_ms = self.max_age.as_millis() as u64, + "watch-set cache lookup" + ); return None; } + self.hits.fetch_add(1, Ordering::Relaxed); + tracing::debug!( + target: "fbuild_daemon::watch_set_cache", + outcome = "hit", + key, + age_ms = age.as_millis() as u64, + "watch-set cache lookup" + ); Some(hash) } fn put(&self, watches: &[FingerprintWatch], hash: String) { let key = key_for(watches); self.inner.insert(key, (hash, Instant::now())); + self.puts.fetch_add(1, Ordering::Relaxed); } } @@ -181,4 +251,53 @@ mod tests { let ws = vec![watch("/never-stored")]; assert!(cache.get(&ws).is_none()); } + + /// Counters advance in lock-step with the `get` / `put` outcomes + /// the daemon exposes on `/api/daemon/info` (#123). A single + /// scenario hits every code path: unknown-key miss, fresh put, + /// fresh hit, stale eviction. Any regression in the counter + /// wiring surfaces here instead of silently serving the wrong + /// observability numbers in production. + #[test] + fn stats_track_each_outcome() { + let cache = DaemonWatchSetCache::with_max_age(Duration::from_millis(5)); + let ws = vec![watch("/tracked")]; + + // Unknown key → miss. + assert!(cache.get(&ws).is_none()); + let s = cache.stats(); + assert_eq!(s.misses, 1); + assert_eq!(s.hits, 0); + assert_eq!(s.puts, 0); + assert_eq!(s.stale_evictions, 0); + + // Fresh put → counter bumps. + cache.put(&ws, "h".to_string()); + assert_eq!(cache.stats().puts, 1); + + // Fresh get (inside the 5 ms window) → hit. + assert_eq!(cache.get(&ws).as_deref(), Some("h")); + assert_eq!(cache.stats().hits, 1); + + // Wait past the window → next get is counted as a stale + // eviction, not a hit and not a plain miss. 
+ std::thread::sleep(Duration::from_millis(10)); + assert!(cache.get(&ws).is_none()); + let s = cache.stats(); + assert_eq!(s.stale_evictions, 1); + // The stale get must NOT roll up as a plain miss — operators + // read the two counters separately to distinguish "window is + // too short" (high stale) from "cache is cold" (high miss). + assert_eq!(s.misses, 1); + } + + /// `max_age_secs` on the stats snapshot reflects the cache's + /// configured window so operators can tell which value is + /// actually in effect (e.g. after `FBUILD_WATCH_SET_CACHE_SECS` + /// override from #122). + #[test] + fn stats_reports_configured_window() { + let cache = DaemonWatchSetCache::with_max_age(Duration::from_secs(9)); + assert_eq!(cache.stats().max_age_secs, 9); + } }