diff --git a/gen/runed/v1/runed.pb.go b/gen/runed/v1/runed.pb.go index 856c461..7743c39 100644 --- a/gen/runed/v1/runed.pb.go +++ b/gen/runed/v1/runed.pb.go @@ -29,6 +29,7 @@ const ( HealthResponse_STATUS_LOADING HealthResponse_Status = 2 // bootstrap: fetching llama-server, downloading embedding model HealthResponse_STATUS_DEGRADED HealthResponse_Status = 3 HealthResponse_STATUS_SHUTTING_DOWN HealthResponse_Status = 4 + HealthResponse_STATUS_IDLE HealthResponse_Status = 5 // backend intentionally suspended after idle to free memory; resumes on next request ) // Enum value maps for HealthResponse_Status. @@ -39,6 +40,7 @@ var ( 2: "STATUS_LOADING", 3: "STATUS_DEGRADED", 4: "STATUS_SHUTTING_DOWN", + 5: "STATUS_IDLE", } HealthResponse_Status_value = map[string]int32{ "STATUS_UNSPECIFIED": 0, @@ -46,6 +48,7 @@ var ( "STATUS_LOADING": 2, "STATUS_DEGRADED": 3, "STATUS_SHUTTING_DOWN": 4, + "STATUS_IDLE": 5, } ) @@ -678,7 +681,7 @@ const file_runed_v1_runed_proto_rawDesc = "" + "vector_dim\x18\x03 \x01(\x05R\tvectorDim\x12&\n" + "\x0fmax_text_length\x18\x04 \x01(\x05R\rmaxTextLength\x12$\n" + "\x0emax_batch_size\x18\x05 \x01(\x05R\fmaxBatchSize\"\x0f\n" + - "\rHealthRequest\"\x97\x04\n" + + "\rHealthRequest\"\xa9\x04\n" + "\x0eHealthResponse\x127\n" + "\x06status\x18\x01 \x01(\x0e2\x1f.runed.v1.HealthResponse.StatusR\x06status\x12%\n" + "\x0euptime_seconds\x18\x02 \x01(\x03R\ruptimeSeconds\x12%\n" + @@ -688,13 +691,14 @@ const file_runed_v1_runed_proto_rawDesc = "" + "bytes_done\x18\x05 \x01(\x03R\tbytesDone\x12\x1f\n" + "\vbytes_total\x18\x06 \x01(\x03R\n" + "bytesTotal\x12\x18\n" + - "\amessage\x18\a \x01(\tR\amessage\"r\n" + + "\amessage\x18\a \x01(\tR\amessage\"\x83\x01\n" + "\x06Status\x12\x16\n" + "\x12STATUS_UNSPECIFIED\x10\x00\x12\r\n" + "\tSTATUS_OK\x10\x01\x12\x12\n" + "\x0eSTATUS_LOADING\x10\x02\x12\x13\n" + "\x0fSTATUS_DEGRADED\x10\x03\x12\x18\n" + - "\x14STATUS_SHUTTING_DOWN\x10\x04\"z\n" + + "\x14STATUS_SHUTTING_DOWN\x10\x04\x12\x0f\n" + + "\vSTATUS_IDLE\x10\x05\"z\n" + "\x05Phase\x12\x15\n" + "\x11PHASE_UNSPECIFIED\x10\x00\x12\x1f\n" + "\x1bPHASE_FETCHING_LLAMA_SERVER\x10\x01\x12\x18\n" + diff --git a/internal/backend/llama.go b/internal/backend/llama.go index 996ee5b..b96e988 100644 --- a/internal/backend/llama.go +++ b/internal/backend/llama.go @@ -27,6 +27,32 @@ type Config struct { CtxSize int // default 2048; --ctx-size in tokens = max input length (llama-server rejects longer input with HTTP 400) } +// ServingState is the backend's current ability to serve embeddings, as +// reported through the daemon's Health RPC. +type ServingState int + +const ( + // ServingOK: the llama-server child is up and answering /health. + ServingOK ServingState = iota + // ServingIdle: the child was intentionally stopped after an idle period + // to free model memory. The next Embed RPC resurrects it (a one-off + // cold-start latency). This is expected, not a fault. + ServingIdle + // ServingDegraded: the child is up but failing /health, or it exited + // unexpectedly. A genuine problem. + ServingDegraded +) + +// childState records why the llama-server child is not currently running, +// so Serving can tell an intentional idle-suspend apart from a crash. +type childState int + +const ( + childRunning childState = iota // child up (or starting) + childSuspended // intentionally stopped (idle suspend / restart) + childFailed // exited unexpectedly +) + type LlamaBackend struct { cfg Config @@ -34,7 +60,10 @@ type LlamaBackend struct { port int cmdDone chan struct{} // closed by watchChild when current cmd.Wait returns stopping bool // true while stopLocked is intentionally terminating the child - mu sync.Mutex // protects cmd, port, cmdDone, stopping — short critical sections only + // state records why the child is down (childSuspended vs childFailed) so + // Serving maps it to ServingIdle vs ServingDegraded. Protected by mu. + state childState + mu sync.Mutex // protects cmd, port, cmdDone, stopping, state — short critical sections only // lifecycleMu serializes Start / Stop / EnsureStarted. RPC handlers // call EnsureStarted on every request so the contention here must @@ -166,6 +195,7 @@ func (b *LlamaBackend) startLocked(ctx context.Context) error { b.cmd = cmd b.port = 0 b.cmdDone = done + b.state = childRunning b.mu.Unlock() go b.watchChild(cmd, done) @@ -263,6 +293,11 @@ func (b *LlamaBackend) watchChild(cmd *exec.Cmd, done chan struct{}) { if b.cmd == cmd { b.cmd = nil b.port = 0 + // Any exit leaves the child down; default to failed so a crash or a + // failed (re)start reports DEGRADED. Stop() (the idle-suspend path) + // overrides to childSuspended afterward — it waits on done, so its + // write lands strictly after this one. + b.state = childFailed } b.mu.Unlock() // Log before close(done) so readers of done can rely on the log @@ -302,6 +337,27 @@ func (b *LlamaBackend) IsHealthy(ctx context.Context) bool { return resp.StatusCode == 200 } +// Serving reports whether the backend can currently serve embeddings, and if +// not, why. Health maps the result onto the proto Status enum: +// ServingOK→OK, ServingIdle→IDLE (intentional idle-suspend, resumes on the +// next RPC), ServingDegraded→DEGRADED (process up-but-unhealthy, or crashed). +func (b *LlamaBackend) Serving(ctx context.Context) ServingState { + b.mu.Lock() + haveCmd := b.cmd != nil + st := b.state + b.mu.Unlock() + if haveCmd { + if b.IsHealthy(ctx) { + return ServingOK + } + return ServingDegraded + } + if st == childSuspended { + return ServingIdle + } + return ServingDegraded +} + // Stop terminates the llama-server child if running. After Stop returns, // cmd is nil and port is 0 — a subsequent EnsureStarted re-launches the // child cleanly. @@ -316,7 +372,22 @@ func (b *LlamaBackend) Stop(ctx context.Context) error { defer b.inflightMu.Unlock() b.lifecycleMu.Lock() defer b.lifecycleMu.Unlock() - return b.stopLocked(ctx) + b.mu.Lock() + wasRunning := b.cmd != nil + b.mu.Unlock() + err := b.stopLocked(ctx) + // Stop is the intentional idle-suspend path (the idle ticker). Mark the + // child suspended so Serving reports IDLE, not DEGRADED — but only if we + // actually stopped a running child, so a Stop on an already-crashed + // backend doesn't mask the failure. stopLocked waits on done, so + // watchChild's childFailed write has already landed; this override runs + // after it. + if wasRunning { + b.mu.Lock() + b.state = childSuspended + b.mu.Unlock() + } + return err } func (b *LlamaBackend) stopLocked(ctx context.Context) error { diff --git a/internal/backend/serving_test.go b/internal/backend/serving_test.go new file mode 100644 index 0000000..0d7d0fc --- /dev/null +++ b/internal/backend/serving_test.go @@ -0,0 +1,121 @@ +//go:build !windows + +package backend + +import ( + "context" + "net/http" + "net/http/httptest" + "os/exec" + "testing" + "time" +) + +// TestServing_DistinguishesIdleFromDegraded checks the Serving mapping given a +// known child state: an intentional idle-suspend is ServingIdle (not a fault) +// while a crash or an up-but-unhealthy child is ServingDegraded. This is what +// lets Health surface STATUS_IDLE instead of misleading users with +// STATUS_DEGRADED after the idle ticker stops llama-server to free memory. +func TestServing_DistinguishesIdleFromDegraded(t *testing.T) { + t.Run("child down after intentional suspend → idle", func(t *testing.T) { + b := NewLlamaBackend(Config{}) + b.mu.Lock() + b.cmd = nil + b.state = childSuspended + b.mu.Unlock() + if got := b.Serving(context.Background()); got != ServingIdle { + t.Fatalf("Serving() = %v, want ServingIdle", got) + } + }) + + t.Run("child down after unexpected exit → degraded", func(t *testing.T) { + b := NewLlamaBackend(Config{}) + b.mu.Lock() + b.cmd = nil + b.state = childFailed + b.mu.Unlock() + if got := b.Serving(context.Background()); got != ServingDegraded { + t.Fatalf("Serving() = %v, want ServingDegraded", got) + } + }) + + t.Run("child up and healthy → ok", func(t *testing.T) { + fake := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer fake.Close() + host, port := splitHostPort(t, fake.URL) + b := NewLlamaBackend(Config{Host: host}) + b.mu.Lock() + b.cmd = &exec.Cmd{} // non-nil → child considered up + b.port = port + b.state = childRunning + b.mu.Unlock() + if got := b.Serving(context.Background()); got != ServingOK { + t.Fatalf("Serving() = %v, want ServingOK", got) + } + }) + + t.Run("child up but unhealthy → degraded", func(t *testing.T) { + fake := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusServiceUnavailable) + })) + defer fake.Close() + host, port := splitHostPort(t, fake.URL) + b := NewLlamaBackend(Config{Host: host}) + b.mu.Lock() + b.cmd = &exec.Cmd{} // non-nil → child considered up + b.port = port + b.state = childRunning + b.mu.Unlock() + if got := b.Serving(context.Background()); got != ServingDegraded { + t.Fatalf("Serving() = %v, want ServingDegraded", got) + } + }) +} + +// TestServing_LifecycleTransitions drives the real child lifecycle (via a +// placeholder `sleep` process) to verify which teardown paths produce IDLE vs +// DEGRADED. The key regression: only Stop() (the idle ticker) yields IDLE — an +// internal stopLocked from a failed (re)start, or a crash, must stay DEGRADED. +func TestServing_LifecycleTransitions(t *testing.T) { + t.Run("Stop (idle-suspend) → idle", func(t *testing.T) { + b := NewLlamaBackend(Config{}) + cmd := attachPlaceholderChild(t, b, 1) + defer cleanupPlaceholderChild(cmd) + if err := b.Stop(context.Background()); err != nil { + t.Fatalf("Stop: %v", err) + } + if got := b.Serving(context.Background()); got != ServingIdle { + t.Fatalf("after Stop: got %v, want ServingIdle", got) + } + }) + + t.Run("internal stopLocked (failed start/restart) → degraded, not idle", func(t *testing.T) { + b := NewLlamaBackend(Config{}) + cmd := attachPlaceholderChild(t, b, 1) + defer cleanupPlaceholderChild(cmd) + // startLocked / EnsureStarted stop the child via stopLocked (not Stop) + // when a (re)start fails. That must NOT be reported as an idle-suspend. + b.lifecycleMu.Lock() + _ = b.stopLocked(context.Background()) + b.lifecycleMu.Unlock() + if got := b.Serving(context.Background()); got != ServingDegraded { + t.Fatalf("internal stop should be DEGRADED, got %v", got) + } + }) + + t.Run("unexpected exit (crash) → degraded", func(t *testing.T) { + b := NewLlamaBackend(Config{}) + cmd := attachPlaceholderChild(t, b, 1) + defer cleanupPlaceholderChild(cmd) + _ = cmd.Process.Kill() // crash: no Stop/stopLocked involved + deadline := time.Now().Add(2 * time.Second) + for b.getCmd() != nil && time.Now().Before(deadline) { + time.Sleep(10 * time.Millisecond) + } + if got := b.Serving(context.Background()); got != ServingDegraded { + t.Fatalf("after crash: got %v, want ServingDegraded", got) + } + }) +} diff --git a/internal/server/server.go b/internal/server/server.go index d55cc0d..c056bbb 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -250,7 +250,7 @@ func (s *Server) Info(ctx context.Context, _ *runedv1.InfoRequest) (*runedv1.Inf } // Health maps backend state to the proto Status enum. SHUTTING_DOWN -// outranks LOADING/DEGRADED/OK so a drain-in-progress daemon doesn't +// outranks LOADING/IDLE/DEGRADED/OK so a drain-in-progress daemon doesn't // advertise itself as ready (the GracefulStop race would otherwise // surface Unavailable right after the OK response). // @@ -278,11 +278,15 @@ func (s *Server) Health(ctx context.Context, _ *runedv1.HealthRequest) (*runedv1 } return resp, nil } - if !b.IsHealthy(ctx) { + switch b.Serving(ctx) { + case backend.ServingIdle: + resp.Status = runedv1.HealthResponse_STATUS_IDLE + resp.Message = "embedder suspended after idle to free memory; the next request resumes it automatically" + case backend.ServingDegraded: resp.Status = runedv1.HealthResponse_STATUS_DEGRADED - return resp, nil + default: + resp.Status = runedv1.HealthResponse_STATUS_OK } - resp.Status = runedv1.HealthResponse_STATUS_OK return resp, nil } diff --git a/proto/runed/v1/runed.proto b/proto/runed/v1/runed.proto index 23660f6..a5e2b38 100644 --- a/proto/runed/v1/runed.proto +++ b/proto/runed/v1/runed.proto @@ -81,6 +81,7 @@ message HealthResponse { STATUS_LOADING = 2; // bootstrap: fetching llama-server, downloading embedding model STATUS_DEGRADED = 3; STATUS_SHUTTING_DOWN = 4; + STATUS_IDLE = 5; // backend intentionally suspended after idle to free memory; resumes on next request } // STATUS_LOADING sub-phase