diff --git a/gen/runed/v1/runed.pb.go b/gen/runed/v1/runed.pb.go
index 856c461..7743c39 100644
--- a/gen/runed/v1/runed.pb.go
+++ b/gen/runed/v1/runed.pb.go
@@ -29,6 +29,7 @@ const (
 	HealthResponse_STATUS_LOADING       HealthResponse_Status = 2 // bootstrap: fetching llama-server, downloading embedding model
 	HealthResponse_STATUS_DEGRADED      HealthResponse_Status = 3
 	HealthResponse_STATUS_SHUTTING_DOWN HealthResponse_Status = 4
+	HealthResponse_STATUS_IDLE          HealthResponse_Status = 5 // backend intentionally suspended after idle to free memory; resumes on next request
 )
 
 // Enum value maps for HealthResponse_Status.
@@ -39,6 +40,7 @@ var (
 		2: "STATUS_LOADING",
 		3: "STATUS_DEGRADED",
 		4: "STATUS_SHUTTING_DOWN",
+		5: "STATUS_IDLE",
 	}
 	HealthResponse_Status_value = map[string]int32{
 		"STATUS_UNSPECIFIED":   0,
@@ -46,6 +48,7 @@ var (
 		"STATUS_LOADING":       2,
 		"STATUS_DEGRADED":      3,
 		"STATUS_SHUTTING_DOWN": 4,
+		"STATUS_IDLE":          5,
 	}
 )
 
@@ -678,7 +681,7 @@ const file_runed_v1_runed_proto_rawDesc = "" +
 	"vector_dim\x18\x03 \x01(\x05R\tvectorDim\x12&\n" +
 	"\x0fmax_text_length\x18\x04 \x01(\x05R\rmaxTextLength\x12$\n" +
 	"\x0emax_batch_size\x18\x05 \x01(\x05R\fmaxBatchSize\"\x0f\n" +
-	"\rHealthRequest\"\x97\x04\n" +
+	"\rHealthRequest\"\xa9\x04\n" +
 	"\x0eHealthResponse\x127\n" +
 	"\x06status\x18\x01 \x01(\x0e2\x1f.runed.v1.HealthResponse.StatusR\x06status\x12%\n" +
 	"\x0euptime_seconds\x18\x02 \x01(\x03R\ruptimeSeconds\x12%\n" +
@@ -688,13 +691,14 @@ const file_runed_v1_runed_proto_rawDesc = "" +
 	"bytes_done\x18\x05 \x01(\x03R\tbytesDone\x12\x1f\n" +
 	"\vbytes_total\x18\x06 \x01(\x03R\n" +
 	"bytesTotal\x12\x18\n" +
-	"\amessage\x18\a \x01(\tR\amessage\"r\n" +
+	"\amessage\x18\a \x01(\tR\amessage\"\x83\x01\n" +
 	"\x06Status\x12\x16\n" +
 	"\x12STATUS_UNSPECIFIED\x10\x00\x12\r\n" +
 	"\tSTATUS_OK\x10\x01\x12\x12\n" +
 	"\x0eSTATUS_LOADING\x10\x02\x12\x13\n" +
 	"\x0fSTATUS_DEGRADED\x10\x03\x12\x18\n" +
-	"\x14STATUS_SHUTTING_DOWN\x10\x04\"z\n" +
+	"\x14STATUS_SHUTTING_DOWN\x10\x04\x12\x0f\n" +
+	"\vSTATUS_IDLE\x10\x05\"z\n" +
 	"\x05Phase\x12\x15\n" +
 	"\x11PHASE_UNSPECIFIED\x10\x00\x12\x1f\n" +
 	"\x1bPHASE_FETCHING_LLAMA_SERVER\x10\x01\x12\x18\n" +
diff --git a/internal/backend/llama.go b/internal/backend/llama.go
index 996ee5b..b96e988 100644
--- a/internal/backend/llama.go
+++ b/internal/backend/llama.go
@@ -27,6 +27,32 @@ type Config struct {
 	CtxSize    int    // default 2048; --ctx-size in tokens = max input length (llama-server rejects longer input with HTTP 400)
 }
 
+// ServingState is the backend's current ability to serve embeddings, as
+// computed by LlamaBackend.Serving and reported through the Health RPC.
+type ServingState int
+
+const (
+	// ServingOK: the llama-server child is up and answering /health with HTTP 200.
+	ServingOK ServingState = iota
+	// ServingIdle: the child was intentionally stopped via Stop (the idle-suspend
+	// path) to free model memory. The next Embed RPC resurrects it (a one-off
+	// cold-start latency). This is expected, not a fault.
+	ServingIdle
+	// ServingDegraded: the child is up but failing /health, or it is down
+	// without having been suspended by Stop (e.g. a crash). A genuine problem.
+	ServingDegraded
+)
+
+// childState records why the llama-server child is not currently running, so
+// Serving can tell an intentional idle-suspend (Stop) apart from any other exit.
+type childState int
+
+const (
+	childRunning   childState = iota // child up (or starting); the zero value
+	childSuspended                   // stopped on purpose by Stop (idle suspend); the only state Serving reports as idle
+	childFailed                      // watchChild's default for any exit (e.g. a crash) unless Stop overrides it
+)
+
 type LlamaBackend struct {
 	cfg Config
 
@@ -34,7 +60,10 @@ type LlamaBackend struct {
 	port     int
 	cmdDone  chan struct{} // closed by watchChild when current cmd.Wait returns
 	stopping bool          // true while stopLocked is intentionally terminating the child
-	mu       sync.Mutex    // protects cmd, port, cmdDone, stopping — short critical sections only
+	// state records why the child is down (childSuspended vs childFailed) so
+	// Serving maps it to ServingIdle vs ServingDegraded. Protected by mu.
+	state childState
+	mu    sync.Mutex // protects cmd, port, cmdDone, stopping, state — short critical sections only
 
 	// lifecycleMu serializes Start / Stop / EnsureStarted. RPC handlers
 	// call EnsureStarted on every request so the contention here must
@@ -166,6 +195,7 @@ func (b *LlamaBackend) startLocked(ctx context.Context) error {
 	b.cmd = cmd
 	b.port = 0
 	b.cmdDone = done
+	b.state = childRunning
 	b.mu.Unlock()
 	go b.watchChild(cmd, done)
 
@@ -263,6 +293,11 @@ func (b *LlamaBackend) watchChild(cmd *exec.Cmd, done chan struct{}) {
 	if b.cmd == cmd {
 		b.cmd = nil
 		b.port = 0
+		// Any exit leaves the child down; default to failed so a crash or a
+		// failed (re)start reports DEGRADED. Stop() (the idle-suspend path)
+		// overrides to childSuspended afterward — it waits on done, so its
+		// write lands strictly after this one.
+		b.state = childFailed
 	}
 	b.mu.Unlock()
 	// Log before close(done) so readers of done can rely on the log
@@ -302,6 +337,27 @@ func (b *LlamaBackend) IsHealthy(ctx context.Context) bool {
 	return resp.StatusCode == 200
 }
 
+// Serving classifies the backend's readiness for the Health RPC. It snapshots
+// cmd and state under mu and releases the lock before probing IsHealthy.
+// An attached child yields ServingOK when IsHealthy reports true and
+// ServingDegraded otherwise. With no child attached, state decides:
+// childSuspended yields ServingIdle (an intentional idle-suspend); anything
+// else yields ServingDegraded. Health maps these to OK, IDLE and DEGRADED.
+func (b *LlamaBackend) Serving(ctx context.Context) ServingState {
+	b.mu.Lock()
+	childUp, reason := b.cmd != nil, b.state
+	b.mu.Unlock()
+
+	switch {
+	case childUp && b.IsHealthy(ctx):
+		return ServingOK
+	case !childUp && reason == childSuspended:
+		return ServingIdle
+	default:
+		return ServingDegraded
+	}
+}
+
 // Stop terminates the llama-server child if running. After Stop returns,
 // cmd is nil and port is 0 — a subsequent EnsureStarted re-launches the
 // child cleanly.
@@ -316,7 +372,22 @@ func (b *LlamaBackend) Stop(ctx context.Context) error {
 	defer b.inflightMu.Unlock()
 	b.lifecycleMu.Lock()
 	defer b.lifecycleMu.Unlock()
-	return b.stopLocked(ctx)
+	b.mu.Lock()
+	wasRunning := b.cmd != nil
+	b.mu.Unlock()
+	err := b.stopLocked(ctx)
+	// Stop is the intentional idle-suspend path (the idle ticker). Mark the
+	// child suspended so Serving reports IDLE, not DEGRADED — but only if we
+	// actually stopped a running child, so a Stop on an already-crashed
+	// backend doesn't mask the failure. stopLocked waits on done, so
+	// watchChild's childFailed write has already landed; this override runs
+	// after it.
+	if wasRunning {
+		b.mu.Lock()
+		b.state = childSuspended
+		b.mu.Unlock()
+	}
+	return err
 }
 
 func (b *LlamaBackend) stopLocked(ctx context.Context) error {
diff --git a/internal/backend/serving_test.go b/internal/backend/serving_test.go
new file mode 100644
index 0000000..0d7d0fc
--- /dev/null
+++ b/internal/backend/serving_test.go
@@ -0,0 +1,121 @@
+//go:build !windows
+
+package backend
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"os/exec"
+	"testing"
+	"time"
+)
+
+// TestServing_DistinguishesIdleFromDegraded pins the Serving mapping for each
+// hand-set (cmd, state) combination, without launching a real child: no child
+// plus childSuspended is ServingIdle (not a fault), no child plus childFailed
+// or an attached child answering 503 is ServingDegraded, and an attached child
+// answering 200 is ServingOK. Health relies on this to report STATUS_IDLE.
+func TestServing_DistinguishesIdleFromDegraded(t *testing.T) {
+	t.Run("child down after intentional suspend → idle", func(t *testing.T) {
+		b := NewLlamaBackend(Config{})
+		b.mu.Lock()
+		b.cmd = nil
+		b.state = childSuspended
+		b.mu.Unlock()
+		if got := b.Serving(context.Background()); got != ServingIdle {
+			t.Fatalf("Serving() = %v, want ServingIdle", got)
+		}
+	})
+
+	t.Run("child down after unexpected exit → degraded", func(t *testing.T) {
+		b := NewLlamaBackend(Config{})
+		b.mu.Lock()
+		b.cmd = nil
+		b.state = childFailed
+		b.mu.Unlock()
+		if got := b.Serving(context.Background()); got != ServingDegraded {
+			t.Fatalf("Serving() = %v, want ServingDegraded", got)
+		}
+	})
+
+	t.Run("child up and healthy → ok", func(t *testing.T) {
+		fake := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+			w.WriteHeader(http.StatusOK) // 200 is the status IsHealthy accepts
+		}))
+		defer fake.Close()
+		host, port := splitHostPort(t, fake.URL) // fake's address feeds Config.Host and b.port below
+		b := NewLlamaBackend(Config{Host: host})
+		b.mu.Lock()
+		b.cmd = &exec.Cmd{} // non-nil → child considered up
+		b.port = port
+		b.state = childRunning
+		b.mu.Unlock()
+		if got := b.Serving(context.Background()); got != ServingOK {
+			t.Fatalf("Serving() = %v, want ServingOK", got)
+		}
+	})
+
+	t.Run("child up but unhealthy → degraded", func(t *testing.T) {
+		fake := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+			w.WriteHeader(http.StatusServiceUnavailable) // any non-200 status fails IsHealthy
+		}))
+		defer fake.Close()
+		host, port := splitHostPort(t, fake.URL) // fake's address feeds Config.Host and b.port below
+		b := NewLlamaBackend(Config{Host: host})
+		b.mu.Lock()
+		b.cmd = &exec.Cmd{} // non-nil → child considered up
+		b.port = port
+		b.state = childRunning
+		b.mu.Unlock()
+		if got := b.Serving(context.Background()); got != ServingDegraded {
+			t.Fatalf("Serving() = %v, want ServingDegraded", got)
+		}
+	})
+}
+
+// TestServing_LifecycleTransitions drives the real child lifecycle (via a
+// placeholder `sleep` process) to verify which teardown paths produce IDLE vs
+// DEGRADED. Only Stop() (the idle ticker) may yield IDLE; an internal stopLocked
+// from a failed (re)start, or a crash that watchChild has reaped, stays DEGRADED.
+func TestServing_LifecycleTransitions(t *testing.T) {
+	t.Run("Stop (idle-suspend) → idle", func(t *testing.T) {
+		b := NewLlamaBackend(Config{})
+		cmd := attachPlaceholderChild(t, b, 1)
+		defer cleanupPlaceholderChild(cmd)
+		if err := b.Stop(context.Background()); err != nil {
+			t.Fatalf("Stop: %v", err)
+		}
+		if got := b.Serving(context.Background()); got != ServingIdle {
+			t.Fatalf("after Stop: got %v, want ServingIdle", got)
+		}
+	})
+
+	t.Run("internal stopLocked (failed start/restart) → degraded, not idle", func(t *testing.T) {
+		b := NewLlamaBackend(Config{})
+		cmd := attachPlaceholderChild(t, b, 1)
+		defer cleanupPlaceholderChild(cmd)
+		// A failed (re)start tears the child down via stopLocked, not Stop: not idle.
+		b.lifecycleMu.Lock()
+		_ = b.stopLocked(context.Background())
+		b.lifecycleMu.Unlock()
+		if got := b.Serving(context.Background()); got != ServingDegraded {
+			t.Fatalf("internal stop should be DEGRADED, got %v", got)
+		}
+	})
+
+	t.Run("unexpected exit (crash) → degraded", func(t *testing.T) {
+		b := NewLlamaBackend(Config{})
+		cmd := attachPlaceholderChild(t, b, 1)
+		defer cleanupPlaceholderChild(cmd)
+		_ = cmd.Process.Kill() // crash: no Stop/stopLocked involved
+		for end := time.Now().Add(2 * time.Second); b.getCmd() != nil; time.Sleep(10 * time.Millisecond) {
+			if time.Now().After(end) {
+				t.Fatal("watchChild did not clear cmd within 2s of the kill; crash state never observed")
+			}
+		}
+		if got := b.Serving(context.Background()); got != ServingDegraded {
+			t.Fatalf("after crash: got %v, want ServingDegraded", got)
+		}
+	})
+}
diff --git a/internal/server/server.go b/internal/server/server.go
index d55cc0d..c056bbb 100644
--- a/internal/server/server.go
+++ b/internal/server/server.go
@@ -250,7 +250,7 @@ func (s *Server) Info(ctx context.Context, _ *runedv1.InfoRequest) (*runedv1.Inf
 }
 
 // Health maps backend state to the proto Status enum. SHUTTING_DOWN
-// outranks LOADING/DEGRADED/OK so a drain-in-progress daemon doesn't
+// outranks LOADING/IDLE/DEGRADED/OK so a drain-in-progress daemon doesn't
 // advertise itself as ready (the GracefulStop race would otherwise
 // surface Unavailable right after the OK response).
 //
@@ -278,11 +278,15 @@ func (s *Server) Health(ctx context.Context, _ *runedv1.HealthRequest) (*runedv1
 		}
 		return resp, nil
 	}
-	if !b.IsHealthy(ctx) {
+	switch b.Serving(ctx) {
+	case backend.ServingIdle:
+		resp.Status = runedv1.HealthResponse_STATUS_IDLE
+		resp.Message = "embedder suspended after idle to free memory; the next request resumes it automatically"
+	case backend.ServingDegraded:
 		resp.Status = runedv1.HealthResponse_STATUS_DEGRADED
-		return resp, nil
+	default:
+		resp.Status = runedv1.HealthResponse_STATUS_OK
 	}
-	resp.Status = runedv1.HealthResponse_STATUS_OK
 	return resp, nil
 }
 
diff --git a/proto/runed/v1/runed.proto b/proto/runed/v1/runed.proto
index 23660f6..a5e2b38 100644
--- a/proto/runed/v1/runed.proto
+++ b/proto/runed/v1/runed.proto
@@ -81,6 +81,7 @@ message HealthResponse {
     STATUS_LOADING = 2; // bootstrap: fetching llama-server, downloading embedding model
     STATUS_DEGRADED = 3;
     STATUS_SHUTTING_DOWN = 4;
+    STATUS_IDLE = 5; // backend intentionally suspended after idle to free memory; resumes on next request
   }
 
   // STATUS_LOADING sub-phase