CryptoLabInc · couragehong · Jun 2, 2026 · Jun 2, 2026
diff --git a/gen/runed/v1/runed.pb.go b/gen/runed/v1/runed.pb.go
diff --git a/internal/backend/llama.go b/internal/backend/llama.go
@@ -27,14 +27,43 @@ type Config struct {
 	CtxSize    int    // default 2048; --ctx-size in tokens = max input length (llama-server rejects longer input with HTTP 400)
 }
 
+// ServingState is the backend's current ability to serve embeddings, as
+// reported through the daemon's Health RPC.
+type ServingState int
+
+const (
+	// ServingOK: the llama-server child is up and answering /health.
+	ServingOK ServingState = iota
+	// ServingIdle: the child was intentionally stopped after an idle period
+	// to free model memory. The next Embed RPC resurrects it (a one-off
+	// cold-start latency). This is expected, not a fault.
+	ServingIdle
+	// ServingDegraded: the child is up but failing /health, or it exited
+	// unexpectedly. A genuine problem.
+	ServingDegraded
+)
+
+// childState records why the llama-server child is not currently running,
+// so Serving can tell an intentional idle-suspend apart from a crash.
+type childState int
+
+const (
+	childRunning   childState = iota // child up (or starting)
+	childSuspended                   // intentionally stopped (idle suspend / restart)
+	childFailed                      // exited unexpectedly
+)
+
 type LlamaBackend struct {
 	cfg Config
 
 	cmd      *exec.Cmd
 	port     int
 	cmdDone  chan struct{} // closed by watchChild when current cmd.Wait returns
 	stopping bool          // true while stopLocked is intentionally terminating the child
-	mu       sync.Mutex    // protects cmd, port, cmdDone, stopping — short critical sections only
+	// state records why the child is down (childSuspended vs childFailed) so
+	// Serving maps it to ServingIdle vs ServingDegraded. Protected by mu.
+	state childState
+	mu    sync.Mutex // protects cmd, port, cmdDone, stopping, state — short critical sections only
 
 	// lifecycleMu serializes Start / Stop / EnsureStarted. RPC handlers
 	// call EnsureStarted on every request so the contention here must
@@ -166,6 +195,7 @@ func (b *LlamaBackend) startLocked(ctx context.Context) error {
 	b.cmd = cmd
 	b.port = 0
 	b.cmdDone = done
+	b.state = childRunning
 	b.mu.Unlock()
 	go b.watchChild(cmd, done)
 
@@ -263,6 +293,11 @@ func (b *LlamaBackend) watchChild(cmd *exec.Cmd, done chan struct{}) {
 	if b.cmd == cmd {
 		b.cmd = nil
 		b.port = 0
+		// Any exit leaves the child down; default to failed so a crash or a
+		// failed (re)start reports DEGRADED. Stop() (the idle-suspend path)
+		// overrides to childSuspended afterward — it waits on done, so its
+		// write lands strictly after this one.
+		b.state = childFailed
 	}
 	b.mu.Unlock()
 	// Log before close(done) so readers of done can rely on the log
@@ -302,6 +337,27 @@ func (b *LlamaBackend) IsHealthy(ctx context.Context) bool {
 	return resp.StatusCode == 200
 }
 
+// Serving reports whether the backend can currently serve embeddings, and if
+// not, why. Health maps the result onto the proto Status enum:
+// ServingOK→OK, ServingIdle→IDLE (intentional idle-suspend, resumes on the
+// next RPC), ServingDegraded→DEGRADED (process up-but-unhealthy, or crashed).
+func (b *LlamaBackend) Serving(ctx context.Context) ServingState {
+	b.mu.Lock()
+	haveCmd := b.cmd != nil
+	st := b.state
+	b.mu.Unlock()
+	if haveCmd {
+		if b.IsHealthy(ctx) {
+			return ServingOK
+		}
+		return ServingDegraded
+	}
+	if st == childSuspended {
+		return ServingIdle
+	}
+	return ServingDegraded
+}
+
 // Stop terminates the llama-server child if running. After Stop returns,
 // cmd is nil and port is 0 — a subsequent EnsureStarted re-launches the
 // child cleanly.
@@ -316,7 +372,22 @@ func (b *LlamaBackend) Stop(ctx context.Context) error {
 	defer b.inflightMu.Unlock()
 	b.lifecycleMu.Lock()
 	defer b.lifecycleMu.Unlock()
-	return b.stopLocked(ctx)
+	b.mu.Lock()
+	wasRunning := b.cmd != nil
+	b.mu.Unlock()
+	err := b.stopLocked(ctx)
+	// Stop is the intentional idle-suspend path (the idle ticker). Mark the
+	// child suspended so Serving reports IDLE, not DEGRADED — but only if we
+	// actually stopped a running child, so a Stop on an already-crashed
+	// backend doesn't mask the failure. stopLocked waits on done, so
+	// watchChild's childFailed write has already landed; this override runs
+	// after it.
+	if wasRunning {
+		b.mu.Lock()
+		b.state = childSuspended
+		b.mu.Unlock()
+	}
+	return err
 }
 
 func (b *LlamaBackend) stopLocked(ctx context.Context) error {

diff --git a/internal/backend/serving_test.go b/internal/backend/serving_test.go
@@ -0,0 +1,121 @@
+//go:build !windows
+
+package backend
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"os/exec"
+	"testing"
+	"time"
+)
+
+// TestServing_DistinguishesIdleFromDegraded checks the Serving mapping given a
+// known child state: an intentional idle-suspend is ServingIdle (not a fault)
+// while a crash or an up-but-unhealthy child is ServingDegraded. This is what
+// lets Health surface STATUS_IDLE instead of misleading users with
+// STATUS_DEGRADED after the idle ticker stops llama-server to free memory.
+func TestServing_DistinguishesIdleFromDegraded(t *testing.T) {
+	t.Run("child down after intentional suspend → idle", func(t *testing.T) {
+		b := NewLlamaBackend(Config{})
+		b.mu.Lock()
+		b.cmd = nil
+		b.state = childSuspended
+		b.mu.Unlock()
+		if got := b.Serving(context.Background()); got != ServingIdle {
+			t.Fatalf("Serving() = %v, want ServingIdle", got)
+		}
+	})
+
+	t.Run("child down after unexpected exit → degraded", func(t *testing.T) {
+		b := NewLlamaBackend(Config{})
+		b.mu.Lock()
+		b.cmd = nil
+		b.state = childFailed
+		b.mu.Unlock()
+		if got := b.Serving(context.Background()); got != ServingDegraded {
+			t.Fatalf("Serving() = %v, want ServingDegraded", got)
+		}
+	})
+
+	t.Run("child up and healthy → ok", func(t *testing.T) {
+		fake := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+			w.WriteHeader(http.StatusOK)
+		}))
+		defer fake.Close()
+		host, port := splitHostPort(t, fake.URL)
+		b := NewLlamaBackend(Config{Host: host})
+		b.mu.Lock()
+		b.cmd = &exec.Cmd{} // non-nil → child considered up
+		b.port = port
+		b.state = childRunning
+		b.mu.Unlock()
+		if got := b.Serving(context.Background()); got != ServingOK {
+			t.Fatalf("Serving() = %v, want ServingOK", got)
+		}
+	})
+
+	t.Run("child up but unhealthy → degraded", func(t *testing.T) {
+		fake := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+			w.WriteHeader(http.StatusServiceUnavailable)
+		}))
+		defer fake.Close()
+		host, port := splitHostPort(t, fake.URL)
+		b := NewLlamaBackend(Config{Host: host})
+		b.mu.Lock()
+		b.cmd = &exec.Cmd{} // non-nil → child considered up
+		b.port = port
+		b.state = childRunning
+		b.mu.Unlock()
+		if got := b.Serving(context.Background()); got != ServingDegraded {
+			t.Fatalf("Serving() = %v, want ServingDegraded", got)
+		}
+	})
+}
+
+// TestServing_LifecycleTransitions drives the real child lifecycle (via a
+// placeholder `sleep` process) to verify which teardown paths produce IDLE vs
+// DEGRADED. The key regression: only Stop() (the idle ticker) yields IDLE — an
+// internal stopLocked from a failed (re)start, or a crash, must stay DEGRADED.
+func TestServing_LifecycleTransitions(t *testing.T) {
+	t.Run("Stop (idle-suspend) → idle", func(t *testing.T) {
+		b := NewLlamaBackend(Config{})
+		cmd := attachPlaceholderChild(t, b, 1)
+		defer cleanupPlaceholderChild(cmd)
+		if err := b.Stop(context.Background()); err != nil {
+			t.Fatalf("Stop: %v", err)
+		}
+		if got := b.Serving(context.Background()); got != ServingIdle {
+			t.Fatalf("after Stop: got %v, want ServingIdle", got)
+		}
+	})
+
+	t.Run("internal stopLocked (failed start/restart) → degraded, not idle", func(t *testing.T) {
+		b := NewLlamaBackend(Config{})
+		cmd := attachPlaceholderChild(t, b, 1)
+		defer cleanupPlaceholderChild(cmd)
+		// startLocked / EnsureStarted stop the child via stopLocked (not Stop)
+		// when a (re)start fails. That must NOT be reported as an idle-suspend.
+		b.lifecycleMu.Lock()
+		_ = b.stopLocked(context.Background())
+		b.lifecycleMu.Unlock()
+		if got := b.Serving(context.Background()); got != ServingDegraded {
+			t.Fatalf("internal stop should be DEGRADED, got %v", got)
+		}
+	})
+
+	t.Run("unexpected exit (crash) → degraded", func(t *testing.T) {
+		b := NewLlamaBackend(Config{})
+		cmd := attachPlaceholderChild(t, b, 1)
+		defer cleanupPlaceholderChild(cmd)
+		_ = cmd.Process.Kill() // crash: no Stop/stopLocked involved
+		deadline := time.Now().Add(2 * time.Second)
+		for b.getCmd() != nil && time.Now().Before(deadline) {
+			time.Sleep(10 * time.Millisecond)
+		}
+		if got := b.Serving(context.Background()); got != ServingDegraded {
+			t.Fatalf("after crash: got %v, want ServingDegraded", got)
+		}
+	})
+}
diff --git a/internal/server/server.go b/internal/server/server.go
@@ -250,7 +250,7 @@ func (s *Server) Info(ctx context.Context, _ *runedv1.InfoRequest) (*runedv1.Inf
 }
 
 // Health maps backend state to the proto Status enum. SHUTTING_DOWN
-// outranks LOADING/DEGRADED/OK so a drain-in-progress daemon doesn't
+// outranks LOADING/IDLE/DEGRADED/OK so a drain-in-progress daemon doesn't
 // advertise itself as ready (the GracefulStop race would otherwise
 // surface Unavailable right after the OK response).
 //
@@ -278,11 +278,15 @@ func (s *Server) Health(ctx context.Context, _ *runedv1.HealthRequest) (*runedv1
 		}
 		return resp, nil
 	}
-	if !b.IsHealthy(ctx) {
+	switch b.Serving(ctx) {
+	case backend.ServingIdle:
+		resp.Status = runedv1.HealthResponse_STATUS_IDLE
+		resp.Message = "embedder suspended after idle to free memory; the next request resumes it automatically"
+	case backend.ServingDegraded:
 		resp.Status = runedv1.HealthResponse_STATUS_DEGRADED
-		return resp, nil
+	default:
+		resp.Status = runedv1.HealthResponse_STATUS_OK
 	}
-	resp.Status = runedv1.HealthResponse_STATUS_OK
 	return resp, nil
 }
 

diff --git a/proto/runed/v1/runed.proto b/proto/runed/v1/runed.proto
@@ -81,6 +81,7 @@ message HealthResponse {
     STATUS_LOADING = 2; // bootstrap: fetching llama-server, downloading embedding model
     STATUS_DEGRADED = 3;
     STATUS_SHUTTING_DOWN = 4;
+    STATUS_IDLE = 5; // backend intentionally suspended after idle to free memory; resumes on next request
   }
 
   // STATUS_LOADING sub-phase